utf8: punctuation
parent
0eeb607ffd
commit
8e1c27d129
|
@ -11,6 +11,11 @@
|
||||||
**********************************************************************/
|
**********************************************************************/
|
||||||
module utf8
|
module utf8
|
||||||
|
|
||||||
|
/**********************************************************************
|
||||||
|
*
|
||||||
|
* Utility functions
|
||||||
|
*
|
||||||
|
**********************************************************************/
|
||||||
|
|
||||||
// len return the leght as number of unicode chars from a string
|
// len returns the length of a string as a number of unicode chars
|
||||||
pub fn len(s string) int {
|
pub fn len(s string) int {
|
||||||
|
@ -33,6 +38,54 @@ pub fn u_len(s ustring) int {
|
||||||
return len(s.s)
|
return len(s.s)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// get_uchar converts the UTF-8 encoded glyph that starts at byte `index` of `s`
// into its unicode code point.
//
// s     - the UTF-8 encoded string to read from
// index - byte offset of the first (lead) byte of the glyph
//
// Returns the decoded code point, or 0 when `s` is empty, `index` is outside
// the string, or the sequence announced by the lead byte does not fit in `s`.
pub fn get_uchar(s string, index int) int {
	// Refuse offsets outside the string instead of reading foreign memory.
	if index < 0 || index >= s.len {
		return 0
	}

	ch_len := utf8util_char_len(s.str[index])

	// 1 byte utf-8: the byte at `index` is the code point itself
	// (this used to read s.str[0], whatever the requested index was).
	if ch_len == 1 {
		return int(s.str[index])
	}

	// A truncated multi byte sequence cannot be decoded.
	if index + ch_len > s.len {
		return 0
	}

	// Pack the bytes of the sequence, most significant first, into one word
	// so that the payload bits can be extracted with fixed masks.
	mut lword := 0
	for i := 0; i < ch_len; i++ {
		lword = (lword << 8) | int(s.str[index + i])
	}

	// 2 byte utf-8
	// byte format: 110xxxxx 10xxxxxx
	if ch_len == 2 {
		return (lword & 0x1f00) >> 2 | (lword & 0x3f)
	}

	// 3 byte utf-8
	// byte format: 1110xxxx 10xxxxxx 10xxxxxx
	if ch_len == 3 {
		return (lword & 0x0f0000) >> 4 | (lword & 0x3f00) >> 2 | (lword & 0x3f)
	}

	// 4 byte utf-8
	// byte format: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	if ch_len == 4 {
		return ((lword & 0x07000000) >> 6) | ((lword & 0x003f0000) >> 4) |
			((lword & 0x00003f00) >> 2) | (lword & 0x0000003f)
	}

	return 0
}
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************
|
||||||
|
*
|
||||||
|
* Conversion functions
|
||||||
|
*
|
||||||
|
**********************************************************************/
|
||||||
|
|
||||||
// to_upper return an uppercase string from a string
|
// to_upper return an uppercase string from a string
|
||||||
pub fn to_upper(s string) string {
|
pub fn to_upper(s string) string {
|
||||||
|
@ -59,16 +112,52 @@ pub fn u_to_lower(s ustring) ustring {
|
||||||
|
|
||||||
/**********************************************************************
|
/**********************************************************************
|
||||||
*
|
*
|
||||||
* Private functions
|
* Punctuation functions
|
||||||
|
*
|
||||||
|
* The "western" functions search a small table, which is quicker than
|
||||||
|
* searching the global unicode table. **Use them only for western chars**.
|
||||||
*
|
*
|
||||||
**********************************************************************/
|
**********************************************************************/
|
||||||
|
|
||||||
// utf8util_char_len calculate the lenght in bytes of a utf8 rune
|
//
|
||||||
|
// Western
|
||||||
|
//
|
||||||
|
|
||||||
|
// is_punct reports whether the glyph starting at byte `index` of `s`
// is a western unicode punctuation mark.
pub fn is_punct( s string , index int) bool {
	uchar := get_uchar(s, index)
	return is_uchar_punct(uchar)
}
|
||||||
|
|
||||||
|
// is_uchar_punct returns true if the unicode code point `uchar` is listed
// in the western punctuation table (unicode_punct_western).
pub fn is_uchar_punct( uchar int ) bool {
	// find_punct_in_table answers with the index of the match and uses 0 for
	// "not found", so a match on the first table entry cannot be told apart
	// from a miss. Check that entry here.
	if unicode_punct_western.len > 0 && uchar == unicode_punct_western[0] {
		return true
	}
	return find_punct_in_table(uchar, unicode_punct_western ) != 0
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Global
|
||||||
|
//
|
||||||
|
|
||||||
|
// is_global_punct reports whether the glyph starting at byte `index` of `s`
// is a punctuation mark of the global unicode punctuation table.
pub fn is_global_punct( s string , index int) bool {
	uchar := get_uchar(s, index)
	return is_uchar_global_punct(uchar)
}
|
||||||
|
|
||||||
|
// is_uchar_global_punct returns true if the unicode code point `uchar` is
// listed in the global punctuation table (unicode_punct).
pub fn is_uchar_global_punct( uchar int ) bool {
	// find_punct_in_table answers with the index of the match and uses 0 for
	// "not found", so a match on the first table entry cannot be told apart
	// from a miss. Check that entry here.
	if unicode_punct.len > 0 && uchar == unicode_punct[0] {
		return true
	}
	return find_punct_in_table( uchar , unicode_punct ) != 0
}
|
||||||
|
|
||||||
|
|
||||||
|
/**********************************************************************
|
||||||
|
*
|
||||||
|
* Private functions
|
||||||
|
*
|
||||||
|
**********************************************************************/
|
||||||
|
// utf8util_char_len returns the length in bytes of the utf8 char introduced
// by the lead byte `b`.
// The high bits of `b` pick a 2 bit field out of the packed constant
// 0xe5000000; that field stores the length minus one. Every byte below 0xC0
// (ASCII as well as continuation bytes) yields 1.
fn utf8util_char_len(b byte) int {
	shift := (b >> 3) & 0x1e
	len_minus_one := (0xe5000000 >> shift) & 3
	return len_minus_one + 1
}
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// if upper_flag == true then make low ==> upper conversion
|
// if upper_flag == true then make low ==> upper conversion
|
||||||
// if upper_flag == false then make upper ==> low conversion
|
// if upper_flag == false then make upper ==> low conversion
|
||||||
|
@ -76,7 +165,6 @@ fn utf8util_char_len(b byte) int {
|
||||||
// up_low make the dirt job
|
// up_low does the dirty work
|
||||||
fn up_low(s string, upper_flag bool) string {
|
fn up_low(s string, upper_flag bool) string {
|
||||||
mut _index := 0
|
mut _index := 0
|
||||||
mut old_index := 0
|
|
||||||
mut str_res := malloc(s.len + 1)
|
mut str_res := malloc(s.len + 1)
|
||||||
|
|
||||||
for {
|
for {
|
||||||
|
@ -98,7 +186,7 @@ fn up_low(s string, upper_flag bool) string {
|
||||||
|
|
||||||
//C.printf(" #%d (%x) ", _index, lword)
|
//C.printf(" #%d (%x) ", _index, lword)
|
||||||
|
|
||||||
mut res := int(0)
|
mut res := 0
|
||||||
|
|
||||||
// 2 byte utf-8
|
// 2 byte utf-8
|
||||||
// byte format: 110xxxxx 10xxxxxx
|
// byte format: 110xxxxx 10xxxxxx
|
||||||
|
@ -131,7 +219,7 @@ fn up_low(s string, upper_flag bool) string {
|
||||||
}
|
}
|
||||||
//C.printf("\n")
|
//C.printf("\n")
|
||||||
}else{
|
}else{
|
||||||
tab_char := u16(unicode_con_table_up_to_low[ch_index])
|
tab_char := unicode_con_table_up_to_low[ch_index]
|
||||||
//C.printf("tab_char: %04x ",tab_char)
|
//C.printf("tab_char: %04x ",tab_char)
|
||||||
|
|
||||||
if ch_len == 2 {
|
if ch_len == 2 {
|
||||||
|
@ -176,7 +264,6 @@ fn up_low(s string, upper_flag bool) string {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
old_index = _index
|
|
||||||
_index += ch_len
|
_index += ch_len
|
||||||
|
|
||||||
// we are done, exit the loop
|
// we are done, exit the loop
|
||||||
|
@ -199,13 +286,13 @@ fn find_char_in_table( in_code u16, upper_flag bool) int {
|
||||||
// We will use a simple binary search
|
// We will use a simple binary search
|
||||||
//
|
//
|
||||||
|
|
||||||
mut first_index := int(0) // first index of our utf8 char range
|
mut first_index := 0 // first index of our utf8 char range
|
||||||
mut last_index := int(unicode_con_table_up_to_low.len >> 1) // last+1 index of our utf8 char range
|
mut last_index := (unicode_con_table_up_to_low.len >> 1) // last+1 index of our utf8 char range
|
||||||
mut index := int(0)
|
mut index := 0
|
||||||
mut x := u16(0)
|
mut x := u16(0)
|
||||||
|
|
||||||
mut offset:=int(0) // up to low
|
mut offset:=0 // up to low
|
||||||
mut i_step:=int(1) // up to low
|
mut i_step:=1 // up to low
|
||||||
if upper_flag==true {
|
if upper_flag==true {
|
||||||
offset=1 // low to up
|
offset=1 // low to up
|
||||||
i_step=0 // low to up
|
i_step=0 // low to up
|
||||||
|
@ -220,11 +307,10 @@ fn find_char_in_table( in_code u16, upper_flag bool) int {
|
||||||
|
|
||||||
if x == in_code {
|
if x == in_code {
|
||||||
//C.printf(" Found!\n")
|
//C.printf(" Found!\n")
|
||||||
return int( (index<<1) + i_step)
|
return ( (index<<1) + i_step)
|
||||||
}
|
}
|
||||||
else if x>in_code {
|
else if x>in_code {
|
||||||
last_index=index
|
last_index=index
|
||||||
|
|
||||||
}else {
|
}else {
|
||||||
first_index=index
|
first_index=index
|
||||||
}
|
}
|
||||||
|
@ -234,7 +320,40 @@ fn find_char_in_table( in_code u16, upper_flag bool) int {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//C.printf("not found.\n")
|
//C.printf("not found.\n")
|
||||||
return int(0)
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// find_punct_in_table looks up `in_code` in the lookup table `in_table`
// with a binary search.
//
// in_code  - unicode code point to search for
// in_table - table of code points, sorted in ascending order
//
// Returns the index of `in_code` inside `in_table`, or 0 when it is not found.
// NOTE: 0 doubles as the "not found" value, so a match on the first entry of
// the table is reported exactly like a miss; callers that care about that
// entry have to test it themselves.
fn find_punct_in_table( in_code int , in_table []int ) int {
	// An empty table has nothing to probe; in_table[0] would be out of range.
	if in_table.len == 0 {
		return 0
	}

	mut lo := 0            // lowest index still in the search range
	mut hi := in_table.len // one past the highest index still in the range

	for {
		mid := (lo + hi) >> 1
		probe := in_table[mid]
		//C.printf("(%d..%d) index:%d base[%08x]==>[%08x]\n",lo,hi,mid,in_code,probe)

		if probe == in_code {
			return mid
		}
		if probe > in_code {
			hi = mid
		} else {
			lo = mid
		}

		// the range cannot shrink any further, give up
		if (hi - lo) <= 1 {
			break
		}
	}
	//C.printf("not found.\n")
	return 0
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -927,3 +1046,640 @@ u16(0x0041), 0x0061, //LATIN CAPITAL LETTER A LATIN SMALL LETTER A
|
||||||
0xFF3A, 0xFF5A, //FULLWIDTH LATIN CAPITAL LETTER Z FULLWIDTH LATIN SMALL LETTER Z
|
0xFF3A, 0xFF5A, //FULLWIDTH LATIN CAPITAL LETTER Z FULLWIDTH LATIN SMALL LETTER Z
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
/*****************************************************************************
|
||||||
|
*
|
||||||
|
* Unicode punctuation chars
|
||||||
|
*
|
||||||
|
* source: http://www.unicode.org/faq/punctuation_symbols.html
|
||||||
|
*
|
||||||
|
*****************************************************************************/
|
||||||
|
const(
|
||||||
|
|
||||||
|
// Western punctuation mark
|
||||||
|
// Character Name Browser Image
|
||||||
|
unicode_punct_western=[
|
||||||
|
0x0021, // EXCLAMATION MARK !
|
||||||
|
0x0022, // QUOTATION MARK "
|
||||||
|
0x0027, // APOSTROPHE '
|
||||||
|
0x002A, // ASTERISK *
|
||||||
|
0x002C, // COMMA ,
|
||||||
|
0x002E, // FULL STOP .
|
||||||
|
0x002F, // SOLIDUS /
|
||||||
|
0x003A, // COLON :
|
||||||
|
0x003B, // SEMICOLON ;
|
||||||
|
0x003F, // QUESTION MARK ?
|
||||||
|
0x00A1, // INVERTED EXCLAMATION MARK ¡
|
||||||
|
0x00A7, // SECTION SIGN §
|
||||||
|
0x00B6, // PILCROW SIGN ¶
|
||||||
|
0x00B7, // MIDDLE DOT ·
|
||||||
|
0x00BF, // INVERTED QUESTION MARK ¿
|
||||||
|
0x037E, // GREEK QUESTION MARK ;
|
||||||
|
0x0387, // GREEK ANO TELEIA ·
|
||||||
|
0x055A, // ARMENIAN APOSTROPHE ՚
|
||||||
|
0x055B, // ARMENIAN EMPHASIS MARK ՛
|
||||||
|
0x055C, // ARMENIAN EXCLAMATION MARK ՜
|
||||||
|
0x055D, // ARMENIAN COMMA ՝
|
||||||
|
0x055E, // ARMENIAN QUESTION MARK ՞
|
||||||
|
0x055F, // ARMENIAN ABBREVIATION MARK ՟
|
||||||
|
0x0589, // ARMENIAN FULL STOP ։
|
||||||
|
0x05C0, // HEBREW PUNCTUATION PASEQ ׀
|
||||||
|
0x05C3, // HEBREW PUNCTUATION SOF PASUQ ׃
|
||||||
|
0x05C6, // HEBREW PUNCTUATION NUN HAFUKHA ׆
|
||||||
|
0x05F3, // HEBREW PUNCTUATION GERESH ׳
|
||||||
|
0x05F4, // HEBREW PUNCTUATION GERSHAYIM ״
|
||||||
|
]
|
||||||
|
|
||||||
|
// Unicode Characters in the 'Punctuation, Other' Category
|
||||||
|
// Character Name Browser Image
|
||||||
|
unicode_punct=[
|
||||||
|
0x0021, // EXCLAMATION MARK !
|
||||||
|
0x0022, // QUOTATION MARK "
|
||||||
|
0x0023, // NUMBER SIGN #
|
||||||
|
0x0025, // PERCENT SIGN %
|
||||||
|
0x0026, // AMPERSAND &
|
||||||
|
0x0027, // APOSTROPHE '
|
||||||
|
0x002A, // ASTERISK *
|
||||||
|
0x002C, // COMMA ,
|
||||||
|
0x002E, // FULL STOP .
|
||||||
|
0x002F, // SOLIDUS /
|
||||||
|
0x003A, // COLON :
|
||||||
|
0x003B, // SEMICOLON ;
|
||||||
|
0x003F, // QUESTION MARK ?
|
||||||
|
0x0040, // COMMERCIAL AT @
|
||||||
|
0x005C, // REVERSE SOLIDUS \
|
||||||
|
0x00A1, // INVERTED EXCLAMATION MARK ¡
|
||||||
|
0x00A7, // SECTION SIGN §
|
||||||
|
0x00B6, // PILCROW SIGN ¶
|
||||||
|
0x00B7, // MIDDLE DOT ·
|
||||||
|
0x00BF, // INVERTED QUESTION MARK ¿
|
||||||
|
0x037E, // GREEK QUESTION MARK ;
|
||||||
|
0x0387, // GREEK ANO TELEIA ·
|
||||||
|
0x055A, // ARMENIAN APOSTROPHE ՚
|
||||||
|
0x055B, // ARMENIAN EMPHASIS MARK ՛
|
||||||
|
0x055C, // ARMENIAN EXCLAMATION MARK ՜
|
||||||
|
0x055D, // ARMENIAN COMMA ՝
|
||||||
|
0x055E, // ARMENIAN QUESTION MARK ՞
|
||||||
|
0x055F, // ARMENIAN ABBREVIATION MARK ՟
|
||||||
|
0x0589, // ARMENIAN FULL STOP ։
|
||||||
|
0x05C0, // HEBREW PUNCTUATION PASEQ ׀
|
||||||
|
0x05C3, // HEBREW PUNCTUATION SOF PASUQ ׃
|
||||||
|
0x05C6, // HEBREW PUNCTUATION NUN HAFUKHA ׆
|
||||||
|
0x05F3, // HEBREW PUNCTUATION GERESH ׳
|
||||||
|
0x05F4, // HEBREW PUNCTUATION GERSHAYIM ״
|
||||||
|
0x0609, // ARABIC-INDIC PER MILLE SIGN ؉
|
||||||
|
0x060A, // ARABIC-INDIC PER TEN THOUSAND SIGN ؊
|
||||||
|
0x060C, // ARABIC COMMA ،
|
||||||
|
0x060D, // ARABIC DATE SEPARATOR ؍
|
||||||
|
0x061B, // ARABIC SEMICOLON ؛
|
||||||
|
0x061E, // ARABIC TRIPLE DOT PUNCTUATION MARK ؞
|
||||||
|
0x061F, // ARABIC QUESTION MARK ؟
|
||||||
|
0x066A, // ARABIC PERCENT SIGN ٪
|
||||||
|
0x066B, // ARABIC DECIMAL SEPARATOR ٫
|
||||||
|
0x066C, // ARABIC THOUSANDS SEPARATOR ٬
|
||||||
|
0x066D, // ARABIC FIVE POINTED STAR ٭
|
||||||
|
0x06D4, // ARABIC FULL STOP ۔
|
||||||
|
0x0700, // SYRIAC END OF PARAGRAPH ܀
|
||||||
|
0x0701, // SYRIAC SUPRALINEAR FULL STOP ܁
|
||||||
|
0x0702, // SYRIAC SUBLINEAR FULL STOP ܂
|
||||||
|
0x0703, // SYRIAC SUPRALINEAR COLON ܃
|
||||||
|
0x0704, // SYRIAC SUBLINEAR COLON ܄
|
||||||
|
0x0705, // SYRIAC HORIZONTAL COLON ܅
|
||||||
|
0x0706, // SYRIAC COLON SKEWED LEFT ܆
|
||||||
|
0x0707, // SYRIAC COLON SKEWED RIGHT ܇
|
||||||
|
0x0708, // SYRIAC SUPRALINEAR COLON SKEWED LEFT ܈
|
||||||
|
0x0709, // SYRIAC SUBLINEAR COLON SKEWED RIGHT ܉
|
||||||
|
0x070A, // SYRIAC CONTRACTION ܊
|
||||||
|
0x070B, // SYRIAC HARKLEAN OBELUS ܋
|
||||||
|
0x070C, // SYRIAC HARKLEAN METOBELUS ܌
|
||||||
|
0x070D, // SYRIAC HARKLEAN ASTERISCUS ܍
|
||||||
|
0x07F7, // NKO SYMBOL GBAKURUNEN ߷
|
||||||
|
0x07F8, // NKO COMMA ߸
|
||||||
|
0x07F9, // NKO EXCLAMATION MARK ߹
|
||||||
|
0x0830, // SAMARITAN PUNCTUATION NEQUDAA ࠰
|
||||||
|
0x0831, // SAMARITAN PUNCTUATION AFSAAQ ࠱
|
||||||
|
0x0832, // SAMARITAN PUNCTUATION ANGED ࠲
|
||||||
|
0x0833, // SAMARITAN PUNCTUATION BAU ࠳
|
||||||
|
0x0834, // SAMARITAN PUNCTUATION ATMAAU ࠴
|
||||||
|
0x0835, // SAMARITAN PUNCTUATION SHIYYAALAA ࠵
|
||||||
|
0x0836, // SAMARITAN ABBREVIATION MARK ࠶
|
||||||
|
0x0837, // SAMARITAN PUNCTUATION MELODIC QITSA ࠷
|
||||||
|
0x0838, // SAMARITAN PUNCTUATION ZIQAA ࠸
|
||||||
|
0x0839, // SAMARITAN PUNCTUATION QITSA ࠹
|
||||||
|
0x083A, // SAMARITAN PUNCTUATION ZAEF ࠺
|
||||||
|
0x083B, // SAMARITAN PUNCTUATION TURU ࠻
|
||||||
|
0x083C, // SAMARITAN PUNCTUATION ARKAANU ࠼
|
||||||
|
0x083D, // SAMARITAN PUNCTUATION SOF MASHFAAT ࠽
|
||||||
|
0x083E, // SAMARITAN PUNCTUATION ANNAAU ࠾
|
||||||
|
0x085E, // MANDAIC PUNCTUATION ࡞
|
||||||
|
0x0964, // DEVANAGARI DANDA ।
|
||||||
|
0x0965, // DEVANAGARI DOUBLE DANDA ॥
|
||||||
|
0x0970, // DEVANAGARI ABBREVIATION SIGN ॰
|
||||||
|
0x09FD, // BENGALI ABBREVIATION SIGN ৽
|
||||||
|
0x0A76, // GURMUKHI ABBREVIATION SIGN ੶
|
||||||
|
0x0AF0, // GUJARATI ABBREVIATION SIGN ૰
|
||||||
|
0x0C77, // TELUGU SIGN SIDDHAM ౷
|
||||||
|
0x0C84, // KANNADA SIGN SIDDHAM ಄
|
||||||
|
0x0DF4, // SINHALA PUNCTUATION KUNDDALIYA ෴
|
||||||
|
0x0E4F, // THAI CHARACTER FONGMAN ๏
|
||||||
|
0x0E5A, // THAI CHARACTER ANGKHANKHU ๚
|
||||||
|
0x0E5B, // THAI CHARACTER KHOMUT ๛
|
||||||
|
0x0F04, // TIBETAN MARK INITIAL YIG MGO MDUN MA ༄
|
||||||
|
0x0F05, // TIBETAN MARK CLOSING YIG MGO SGAB MA ༅
|
||||||
|
0x0F06, // TIBETAN MARK CARET YIG MGO PHUR SHAD MA ༆
|
||||||
|
0x0F07, // TIBETAN MARK YIG MGO TSHEG SHAD MA ༇
|
||||||
|
0x0F08, // TIBETAN MARK SBRUL SHAD ༈
|
||||||
|
0x0F09, // TIBETAN MARK BSKUR YIG MGO ༉
|
||||||
|
0x0F0A, // TIBETAN MARK BKA- SHOG YIG MGO ༊
|
||||||
|
0x0F0B, // TIBETAN MARK INTERSYLLABIC TSHEG ་
|
||||||
|
0x0F0C, // TIBETAN MARK DELIMITER TSHEG BSTAR ༌
|
||||||
|
0x0F0D, // TIBETAN MARK SHAD །
|
||||||
|
0x0F0E, // TIBETAN MARK NYIS SHAD ༎
|
||||||
|
0x0F0F, // TIBETAN MARK TSHEG SHAD ༏
|
||||||
|
0x0F10, // TIBETAN MARK NYIS TSHEG SHAD ༐
|
||||||
|
0x0F11, // TIBETAN MARK RIN CHEN SPUNGS SHAD ༑
|
||||||
|
0x0F12, // TIBETAN MARK RGYA GRAM SHAD ༒
|
||||||
|
0x0F14, // TIBETAN MARK GTER TSHEG ༔
|
||||||
|
0x0F85, // TIBETAN MARK PALUTA ྅
|
||||||
|
0x0FD0, // TIBETAN MARK BSKA- SHOG GI MGO RGYAN ࿐
|
||||||
|
0x0FD1, // TIBETAN MARK MNYAM YIG GI MGO RGYAN ࿑
|
||||||
|
0x0FD2, // TIBETAN MARK NYIS TSHEG ࿒
|
||||||
|
0x0FD3, // TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA ࿓
|
||||||
|
0x0FD4, // TIBETAN MARK CLOSING BRDA RNYING YIG MGO SGAB MA ࿔
|
||||||
|
0x0FD9, // TIBETAN MARK LEADING MCHAN RTAGS ࿙
|
||||||
|
0x0FDA, // TIBETAN MARK TRAILING MCHAN RTAGS ࿚
|
||||||
|
0x104A, // MYANMAR SIGN LITTLE SECTION ၊
|
||||||
|
0x104B, // MYANMAR SIGN SECTION ။
|
||||||
|
0x104C, // MYANMAR SYMBOL LOCATIVE ၌
|
||||||
|
0x104D, // MYANMAR SYMBOL COMPLETED ၍
|
||||||
|
0x104E, // MYANMAR SYMBOL AFOREMENTIONED ၎
|
||||||
|
0x104F, // MYANMAR SYMBOL GENITIVE ၏
|
||||||
|
0x10FB, // GEORGIAN PARAGRAPH SEPARATOR ჻
|
||||||
|
0x1360, // ETHIOPIC SECTION MARK ፠
|
||||||
|
0x1361, // ETHIOPIC WORDSPACE ፡
|
||||||
|
0x1362, // ETHIOPIC FULL STOP ።
|
||||||
|
0x1363, // ETHIOPIC COMMA ፣
|
||||||
|
0x1364, // ETHIOPIC SEMICOLON ፤
|
||||||
|
0x1365, // ETHIOPIC COLON ፥
|
||||||
|
0x1366, // ETHIOPIC PREFACE COLON ፦
|
||||||
|
0x1367, // ETHIOPIC QUESTION MARK ፧
|
||||||
|
0x1368, // ETHIOPIC PARAGRAPH SEPARATOR ፨
|
||||||
|
0x166E, // CANADIAN SYLLABICS FULL STOP ᙮
|
||||||
|
0x16EB, // RUNIC SINGLE PUNCTUATION ᛫
|
||||||
|
0x16EC, // RUNIC MULTIPLE PUNCTUATION ᛬
|
||||||
|
0x16ED, // RUNIC CROSS PUNCTUATION ᛭
|
||||||
|
0x1735, // PHILIPPINE SINGLE PUNCTUATION ᜵
|
||||||
|
0x1736, // PHILIPPINE DOUBLE PUNCTUATION ᜶
|
||||||
|
0x17D4, // KHMER SIGN KHAN ។
|
||||||
|
0x17D5, // KHMER SIGN BARIYOOSAN ៕
|
||||||
|
0x17D6, // KHMER SIGN CAMNUC PII KUUH ៖
|
||||||
|
0x17D8, // KHMER SIGN BEYYAL ៘
|
||||||
|
0x17D9, // KHMER SIGN PHNAEK MUAN ៙
|
||||||
|
0x17DA, // KHMER SIGN KOOMUUT ៚
|
||||||
|
0x1800, // MONGOLIAN BIRGA ᠀
|
||||||
|
0x1801, // MONGOLIAN ELLIPSIS ᠁
|
||||||
|
0x1802, // MONGOLIAN COMMA ᠂
|
||||||
|
0x1803, // MONGOLIAN FULL STOP ᠃
|
||||||
|
0x1804, // MONGOLIAN COLON ᠄
|
||||||
|
0x1805, // MONGOLIAN FOUR DOTS ᠅
|
||||||
|
0x1807, // MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER ᠇
|
||||||
|
0x1808, // MONGOLIAN MANCHU COMMA ᠈
|
||||||
|
0x1809, // MONGOLIAN MANCHU FULL STOP ᠉
|
||||||
|
0x180A, // MONGOLIAN NIRUGU ᠊
|
||||||
|
0x1944, // LIMBU EXCLAMATION MARK ᥄
|
||||||
|
0x1945, // LIMBU QUESTION MARK ᥅
|
||||||
|
0x1A1E, // BUGINESE PALLAWA ᨞
|
||||||
|
0x1A1F, // BUGINESE END OF SECTION ᨟
|
||||||
|
0x1AA0, // TAI THAM SIGN WIANG ᪠
|
||||||
|
0x1AA1, // TAI THAM SIGN WIANGWAAK ᪡
|
||||||
|
0x1AA2, // TAI THAM SIGN SAWAN ᪢
|
||||||
|
0x1AA3, // TAI THAM SIGN KEOW ᪣
|
||||||
|
0x1AA4, // TAI THAM SIGN HOY ᪤
|
||||||
|
0x1AA5, // TAI THAM SIGN DOKMAI ᪥
|
||||||
|
0x1AA6, // TAI THAM SIGN REVERSED ROTATED RANA ᪦
|
||||||
|
0x1AA8, // TAI THAM SIGN KAAN ᪨
|
||||||
|
0x1AA9, // TAI THAM SIGN KAANKUU ᪩
|
||||||
|
0x1AAA, // TAI THAM SIGN SATKAAN ᪪
|
||||||
|
0x1AAB, // TAI THAM SIGN SATKAANKUU ᪫
|
||||||
|
0x1AAC, // TAI THAM SIGN HANG ᪬
|
||||||
|
0x1AAD, // TAI THAM SIGN CAANG ᪭
|
||||||
|
0x1B5A, // BALINESE PANTI ᭚
|
||||||
|
0x1B5B, // BALINESE PAMADA ᭛
|
||||||
|
0x1B5C, // BALINESE WINDU ᭜
|
||||||
|
0x1B5D, // BALINESE CARIK PAMUNGKAH ᭝
|
||||||
|
0x1B5E, // BALINESE CARIK SIKI ᭞
|
||||||
|
0x1B5F, // BALINESE CARIK PAREREN ᭟
|
||||||
|
0x1B60, // BALINESE PAMENENG ᭠
|
||||||
|
0x1BFC, // BATAK SYMBOL BINDU NA METEK ᯼
|
||||||
|
0x1BFD, // BATAK SYMBOL BINDU PINARBORAS ᯽
|
||||||
|
0x1BFE, // BATAK SYMBOL BINDU JUDUL ᯾
|
||||||
|
0x1BFF, // BATAK SYMBOL BINDU PANGOLAT ᯿
|
||||||
|
0x1C3B, // LEPCHA PUNCTUATION TA-ROL ᰻
|
||||||
|
0x1C3C, // LEPCHA PUNCTUATION NYET THYOOM TA-ROL ᰼
|
||||||
|
0x1C3D, // LEPCHA PUNCTUATION CER-WA ᰽
|
||||||
|
0x1C3E, // LEPCHA PUNCTUATION TSHOOK CER-WA ᰾
|
||||||
|
0x1C3F, // LEPCHA PUNCTUATION TSHOOK ᰿
|
||||||
|
0x1C7E, // OL CHIKI PUNCTUATION MUCAAD ᱾
|
||||||
|
0x1C7F, // OL CHIKI PUNCTUATION DOUBLE MUCAAD ᱿
|
||||||
|
0x1CC0, // SUNDANESE PUNCTUATION BINDU SURYA ᳀
|
||||||
|
0x1CC1, // SUNDANESE PUNCTUATION BINDU PANGLONG ᳁
|
||||||
|
0x1CC2, // SUNDANESE PUNCTUATION BINDU PURNAMA ᳂
|
||||||
|
0x1CC3, // SUNDANESE PUNCTUATION BINDU CAKRA ᳃
|
||||||
|
0x1CC4, // SUNDANESE PUNCTUATION BINDU LEU SATANGA ᳄
|
||||||
|
0x1CC5, // SUNDANESE PUNCTUATION BINDU KA SATANGA ᳅
|
||||||
|
0x1CC6, // SUNDANESE PUNCTUATION BINDU DA SATANGA ᳆
|
||||||
|
0x1CC7, // SUNDANESE PUNCTUATION BINDU BA SATANGA ᳇
|
||||||
|
0x1CD3, // VEDIC SIGN NIHSHVASA ᳓
|
||||||
|
0x2016, // DOUBLE VERTICAL LINE ‖
|
||||||
|
0x2017, // DOUBLE LOW LINE ‗
|
||||||
|
0x2020, // DAGGER †
|
||||||
|
0x2021, // DOUBLE DAGGER ‡
|
||||||
|
0x2022, // BULLET •
|
||||||
|
0x2023, // TRIANGULAR BULLET ‣
|
||||||
|
0x2024, // ONE DOT LEADER ․
|
||||||
|
0x2025, // TWO DOT LEADER ‥
|
||||||
|
0x2026, // HORIZONTAL ELLIPSIS …
|
||||||
|
0x2027, // HYPHENATION POINT ‧
|
||||||
|
0x2030, // PER MILLE SIGN ‰
|
||||||
|
0x2031, // PER TEN THOUSAND SIGN ‱
|
||||||
|
0x2032, // PRIME ′
|
||||||
|
0x2033, // DOUBLE PRIME ″
|
||||||
|
0x2034, // TRIPLE PRIME ‴
|
||||||
|
0x2035, // REVERSED PRIME ‵
|
||||||
|
0x2036, // REVERSED DOUBLE PRIME ‶
|
||||||
|
0x2037, // REVERSED TRIPLE PRIME ‷
|
||||||
|
0x2038, // CARET ‸
|
||||||
|
0x203B, // REFERENCE MARK ※
|
||||||
|
0x203C, // DOUBLE EXCLAMATION MARK ‼
|
||||||
|
0x203D, // INTERROBANG ‽
|
||||||
|
0x203E, // OVERLINE ‾
|
||||||
|
0x2041, // CARET INSERTION POINT ⁁
|
||||||
|
0x2042, // ASTERISM ⁂
|
||||||
|
0x2043, // HYPHEN BULLET ⁃
|
||||||
|
0x2047, // DOUBLE QUESTION MARK ⁇
|
||||||
|
0x2048, // QUESTION EXCLAMATION MARK ⁈
|
||||||
|
0x2049, // EXCLAMATION QUESTION MARK ⁉
|
||||||
|
0x204A, // TIRONIAN SIGN ET ⁊
|
||||||
|
0x204B, // REVERSED PILCROW SIGN ⁋
|
||||||
|
0x204C, // BLACK LEFTWARDS BULLET ⁌
|
||||||
|
0x204D, // BLACK RIGHTWARDS BULLET ⁍
|
||||||
|
0x204E, // LOW ASTERISK ⁎
|
||||||
|
0x204F, // REVERSED SEMICOLON ⁏
|
||||||
|
0x2050, // CLOSE UP ⁐
|
||||||
|
0x2051, // TWO ASTERISKS ALIGNED VERTICALLY ⁑
|
||||||
|
0x2053, // SWUNG DASH ⁓
|
||||||
|
0x2055, // FLOWER PUNCTUATION MARK ⁕
|
||||||
|
0x2056, // THREE DOT PUNCTUATION ⁖
|
||||||
|
0x2057, // QUADRUPLE PRIME ⁗
|
||||||
|
0x2058, // FOUR DOT PUNCTUATION ⁘
|
||||||
|
0x2059, // FIVE DOT PUNCTUATION ⁙
|
||||||
|
0x205A, // TWO DOT PUNCTUATION ⁚
|
||||||
|
0x205B, // FOUR DOT MARK ⁛
|
||||||
|
0x205C, // DOTTED CROSS ⁜
|
||||||
|
0x205D, // TRICOLON ⁝
|
||||||
|
0x205E, // VERTICAL FOUR DOTS ⁞
|
||||||
|
0x2CF9, // COPTIC OLD NUBIAN FULL STOP ⳹
|
||||||
|
0x2CFA, // COPTIC OLD NUBIAN DIRECT QUESTION MARK ⳺
|
||||||
|
0x2CFB, // COPTIC OLD NUBIAN INDIRECT QUESTION MARK ⳻
|
||||||
|
0x2CFC, // COPTIC OLD NUBIAN VERSE DIVIDER ⳼
|
||||||
|
0x2CFE, // COPTIC FULL STOP ⳾
|
||||||
|
0x2CFF, // COPTIC MORPHOLOGICAL DIVIDER ⳿
|
||||||
|
0x2D70, // TIFINAGH SEPARATOR MARK ⵰
|
||||||
|
0x2E00, // RIGHT ANGLE SUBSTITUTION MARKER ⸀
|
||||||
|
0x2E01, // RIGHT ANGLE DOTTED SUBSTITUTION MARKER ⸁
|
||||||
|
0x2E06, // RAISED INTERPOLATION MARKER ⸆
|
||||||
|
0x2E07, // RAISED DOTTED INTERPOLATION MARKER ⸇
|
||||||
|
0x2E08, // DOTTED TRANSPOSITION MARKER ⸈
|
||||||
|
0x2E0B, // RAISED SQUARE ⸋
|
||||||
|
0x2E0E, // EDITORIAL CORONIS ⸎
|
||||||
|
0x2E0F, // PARAGRAPHOS ⸏
|
||||||
|
0x2E10, // FORKED PARAGRAPHOS ⸐
|
||||||
|
0x2E11, // REVERSED FORKED PARAGRAPHOS ⸑
|
||||||
|
0x2E12, // HYPODIASTOLE ⸒
|
||||||
|
0x2E13, // DOTTED OBELOS ⸓
|
||||||
|
0x2E14, // DOWNWARDS ANCORA ⸔
|
||||||
|
0x2E15, // UPWARDS ANCORA ⸕
|
||||||
|
0x2E16, // DOTTED RIGHT-POINTING ANGLE ⸖
|
||||||
|
0x2E18, // INVERTED INTERROBANG ⸘
|
||||||
|
0x2E19, // PALM BRANCH ⸙
|
||||||
|
0x2E1B, // TILDE WITH RING ABOVE ⸛
|
||||||
|
0x2E1E, // TILDE WITH DOT ABOVE ⸞
|
||||||
|
0x2E1F, // TILDE WITH DOT BELOW ⸟
|
||||||
|
0x2E2A, // TWO DOTS OVER ONE DOT PUNCTUATION ⸪
|
||||||
|
0x2E2B, // ONE DOT OVER TWO DOTS PUNCTUATION ⸫
|
||||||
|
0x2E2C, // SQUARED FOUR DOT PUNCTUATION ⸬
|
||||||
|
0x2E2D, // FIVE DOT MARK ⸭
|
||||||
|
0x2E2E, // REVERSED QUESTION MARK ⸮
|
||||||
|
0x2E30, // RING POINT ⸰
|
||||||
|
0x2E31, // WORD SEPARATOR MIDDLE DOT ⸱
|
||||||
|
0x2E32, // TURNED COMMA ⸲
|
||||||
|
0x2E33, // RAISED DOT ⸳
|
||||||
|
0x2E34, // RAISED COMMA ⸴
|
||||||
|
0x2E35, // TURNED SEMICOLON ⸵
|
||||||
|
0x2E36, // DAGGER WITH LEFT GUARD ⸶
|
||||||
|
0x2E37, // DAGGER WITH RIGHT GUARD ⸷
|
||||||
|
0x2E38, // TURNED DAGGER ⸸
|
||||||
|
0x2E39, // TOP HALF SECTION SIGN ⸹
|
||||||
|
0x2E3C, // STENOGRAPHIC FULL STOP ⸼
|
||||||
|
0x2E3D, // VERTICAL SIX DOTS ⸽
|
||||||
|
0x2E3E, // WIGGLY VERTICAL LINE ⸾
|
||||||
|
0x2E3F, // CAPITULUM ⸿
|
||||||
|
0x2E41, // REVERSED COMMA ⹁
|
||||||
|
0x2E43, // DASH WITH LEFT UPTURN ⹃
|
||||||
|
0x2E44, // DOUBLE SUSPENSION MARK ⹄
|
||||||
|
0x2E45, // INVERTED LOW KAVYKA ⹅
|
||||||
|
0x2E46, // INVERTED LOW KAVYKA WITH KAVYKA ABOVE ⹆
|
||||||
|
0x2E47, // LOW KAVYKA ⹇
|
||||||
|
0x2E48, // LOW KAVYKA WITH DOT ⹈
|
||||||
|
0x2E49, // DOUBLE STACKED COMMA ⹉
|
||||||
|
0x2E4A, // DOTTED SOLIDUS ⹊
|
||||||
|
0x2E4B, // TRIPLE DAGGER ⹋
|
||||||
|
0x2E4C, // MEDIEVAL COMMA ⹌
|
||||||
|
0x2E4D, // PARAGRAPHUS MARK ⹍
|
||||||
|
0x2E4E, // PUNCTUS ELEVATUS MARK ⹎
|
||||||
|
0x2E4F, // CORNISH VERSE DIVIDER ⹏
|
||||||
|
0x3001, // IDEOGRAPHIC COMMA 、
|
||||||
|
0x3002, // IDEOGRAPHIC FULL STOP 。
|
||||||
|
0x3003, // DITTO MARK 〃
|
||||||
|
0x303D, // PART ALTERNATION MARK 〽
|
||||||
|
0x30FB, // KATAKANA MIDDLE DOT ・
|
||||||
|
0xA4FE, // LISU PUNCTUATION COMMA ꓾
|
||||||
|
0xA4FF, // LISU PUNCTUATION FULL STOP ꓿
|
||||||
|
0xA60D, // VAI COMMA ꘍
|
||||||
|
0xA60E, // VAI FULL STOP ꘎
|
||||||
|
0xA60F, // VAI QUESTION MARK ꘏
|
||||||
|
0xA673, // SLAVONIC ASTERISK ꙳
|
||||||
|
0xA67E, // CYRILLIC KAVYKA ꙾
|
||||||
|
0xA6F2, // BAMUM NJAEMLI ꛲
|
||||||
|
0xA6F3, // BAMUM FULL STOP ꛳
|
||||||
|
0xA6F4, // BAMUM COLON ꛴
|
||||||
|
0xA6F5, // BAMUM COMMA ꛵
|
||||||
|
0xA6F6, // BAMUM SEMICOLON ꛶
|
||||||
|
0xA6F7, // BAMUM QUESTION MARK ꛷
|
||||||
|
0xA874, // PHAGS-PA SINGLE HEAD MARK ꡴
|
||||||
|
0xA875, // PHAGS-PA DOUBLE HEAD MARK ꡵
|
||||||
|
0xA876, // PHAGS-PA MARK SHAD ꡶
|
||||||
|
0xA877, // PHAGS-PA MARK DOUBLE SHAD ꡷
|
||||||
|
0xA8CE, // SAURASHTRA DANDA ꣎
|
||||||
|
0xA8CF, // SAURASHTRA DOUBLE DANDA ꣏
|
||||||
|
0xA8F8, // DEVANAGARI SIGN PUSHPIKA ꣸
|
||||||
|
0xA8F9, // DEVANAGARI GAP FILLER ꣹
|
||||||
|
0xA8FA, // DEVANAGARI CARET ꣺
|
||||||
|
0xA8FC, // DEVANAGARI SIGN SIDDHAM ꣼
|
||||||
|
0xA92E, // KAYAH LI SIGN CWI ꤮
|
||||||
|
0xA92F, // KAYAH LI SIGN SHYA ꤯
|
||||||
|
0xA95F, // REJANG SECTION MARK ꥟
|
||||||
|
0xA9C1, // JAVANESE LEFT RERENGGAN ꧁
|
||||||
|
0xA9C2, // JAVANESE RIGHT RERENGGAN ꧂
|
||||||
|
0xA9C3, // JAVANESE PADA ANDAP ꧃
|
||||||
|
0xA9C4, // JAVANESE PADA MADYA ꧄
|
||||||
|
0xA9C5, // JAVANESE PADA LUHUR ꧅
|
||||||
|
0xA9C6, // JAVANESE PADA WINDU ꧆
|
||||||
|
0xA9C7, // JAVANESE PADA PANGKAT ꧇
|
||||||
|
0xA9C8, // JAVANESE PADA LINGSA ꧈
|
||||||
|
0xA9C9, // JAVANESE PADA LUNGSI ꧉
|
||||||
|
0xA9CA, // JAVANESE PADA ADEG ꧊
|
||||||
|
0xA9CB, // JAVANESE PADA ADEG ADEG ꧋
|
||||||
|
0xA9CC, // JAVANESE PADA PISELEH ꧌
|
||||||
|
0xA9CD, // JAVANESE TURNED PADA PISELEH ꧍
|
||||||
|
0xA9DE, // JAVANESE PADA TIRTA TUMETES ꧞
|
||||||
|
0xA9DF, // JAVANESE PADA ISEN-ISEN ꧟
|
||||||
|
0xAA5C, // CHAM PUNCTUATION SPIRAL ꩜
|
||||||
|
0xAA5D, // CHAM PUNCTUATION DANDA ꩝
|
||||||
|
0xAA5E, // CHAM PUNCTUATION DOUBLE DANDA ꩞
|
||||||
|
0xAA5F, // CHAM PUNCTUATION TRIPLE DANDA ꩟
|
||||||
|
0xAADE, // TAI VIET SYMBOL HO HOI ꫞
|
||||||
|
0xAADF, // TAI VIET SYMBOL KOI KOI ꫟
|
||||||
|
0xAAF0, // MEETEI MAYEK CHEIKHAN ꫰
|
||||||
|
0xAAF1, // MEETEI MAYEK AHANG KHUDAM ꫱
|
||||||
|
0xABEB, // MEETEI MAYEK CHEIKHEI ꯫
|
||||||
|
0xFE10, // PRESENTATION FORM FOR VERTICAL COMMA ︐
|
||||||
|
0xFE11, // PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA ︑
|
||||||
|
0xFE12, // PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP ︒
|
||||||
|
0xFE13, // PRESENTATION FORM FOR VERTICAL COLON ︓
|
||||||
|
0xFE14, // PRESENTATION FORM FOR VERTICAL SEMICOLON ︔
|
||||||
|
0xFE15, // PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK ︕
|
||||||
|
0xFE16, // PRESENTATION FORM FOR VERTICAL QUESTION MARK ︖
|
||||||
|
0xFE19, // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS ︙
|
||||||
|
0xFE30, // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER ︰
|
||||||
|
0xFE45, // SESAME DOT ﹅
|
||||||
|
0xFE46, // WHITE SESAME DOT ﹆
|
||||||
|
0xFE49, // DASHED OVERLINE ﹉
|
||||||
|
0xFE4A, // CENTRELINE OVERLINE ﹊
|
||||||
|
0xFE4B, // WAVY OVERLINE ﹋
|
||||||
|
0xFE4C, // DOUBLE WAVY OVERLINE ﹌
|
||||||
|
0xFE50, // SMALL COMMA ﹐
|
||||||
|
0xFE51, // SMALL IDEOGRAPHIC COMMA ﹑
|
||||||
|
0xFE52, // SMALL FULL STOP ﹒
|
||||||
|
0xFE54, // SMALL SEMICOLON ﹔
|
||||||
|
0xFE55, // SMALL COLON ﹕
|
||||||
|
0xFE56, // SMALL QUESTION MARK ﹖
|
||||||
|
0xFE57, // SMALL EXCLAMATION MARK ﹗
|
||||||
|
0xFE5F, // SMALL NUMBER SIGN ﹟
|
||||||
|
0xFE60, // SMALL AMPERSAND ﹠
|
||||||
|
0xFE61, // SMALL ASTERISK ﹡
|
||||||
|
0xFE68, // SMALL REVERSE SOLIDUS ﹨
|
||||||
|
0xFE6A, // SMALL PERCENT SIGN ﹪
|
||||||
|
0xFE6B, // SMALL COMMERCIAL AT ﹫
|
||||||
|
0xFF01, // FULLWIDTH EXCLAMATION MARK !
|
||||||
|
0xFF02, // FULLWIDTH QUOTATION MARK "
|
||||||
|
0xFF03, // FULLWIDTH NUMBER SIGN #
|
||||||
|
0xFF05, // FULLWIDTH PERCENT SIGN %
|
||||||
|
0xFF06, // FULLWIDTH AMPERSAND &
|
||||||
|
0xFF07, // FULLWIDTH APOSTROPHE '
|
||||||
|
0xFF0A, // FULLWIDTH ASTERISK *
|
||||||
|
0xFF0C, // FULLWIDTH COMMA ,
|
||||||
|
0xFF0E, // FULLWIDTH FULL STOP .
|
||||||
|
0xFF0F, // FULLWIDTH SOLIDUS /
|
||||||
|
0xFF1A, // FULLWIDTH COLON :
|
||||||
|
0xFF1B, // FULLWIDTH SEMICOLON ;
|
||||||
|
0xFF1F, // FULLWIDTH QUESTION MARK ?
|
||||||
|
0xFF20, // FULLWIDTH COMMERCIAL AT @
|
||||||
|
0xFF3C, // FULLWIDTH REVERSE SOLIDUS \
|
||||||
|
0xFF61, // HALFWIDTH IDEOGRAPHIC FULL STOP 。
|
||||||
|
0xFF64, // HALFWIDTH IDEOGRAPHIC COMMA 、
|
||||||
|
0xFF65, // HALFWIDTH KATAKANA MIDDLE DOT ・
|
||||||
|
0x10100, // AEGEAN WORD SEPARATOR LINE 𐄀
|
||||||
|
0x10101, // AEGEAN WORD SEPARATOR DOT 𐄁
|
||||||
|
0x10102, // AEGEAN CHECK MARK 𐄂
|
||||||
|
0x1039F, // UGARITIC WORD DIVIDER 𐎟
|
||||||
|
0x103D0, // OLD PERSIAN WORD DIVIDER 𐏐
|
||||||
|
0x1056F, // CAUCASIAN ALBANIAN CITATION MARK 𐕯
|
||||||
|
0x10857, // IMPERIAL ARAMAIC SECTION SIGN 𐡗
|
||||||
|
0x1091F, // PHOENICIAN WORD SEPARATOR 𐤟
|
||||||
|
0x1093F, // LYDIAN TRIANGULAR MARK 𐤿
|
||||||
|
0x10A50, // KHAROSHTHI PUNCTUATION DOT 𐩐
|
||||||
|
0x10A51, // KHAROSHTHI PUNCTUATION SMALL CIRCLE 𐩑
|
||||||
|
0x10A52, // KHAROSHTHI PUNCTUATION CIRCLE 𐩒
|
||||||
|
0x10A53, // KHAROSHTHI PUNCTUATION CRESCENT BAR 𐩓
|
||||||
|
0x10A54, // KHAROSHTHI PUNCTUATION MANGALAM 𐩔
|
||||||
|
0x10A55, // KHAROSHTHI PUNCTUATION LOTUS 𐩕
|
||||||
|
0x10A56, // KHAROSHTHI PUNCTUATION DANDA 𐩖
|
||||||
|
0x10A57, // KHAROSHTHI PUNCTUATION DOUBLE DANDA 𐩗
|
||||||
|
0x10A58, // KHAROSHTHI PUNCTUATION LINES 𐩘
|
||||||
|
0x10A7F, // OLD SOUTH ARABIAN NUMERIC INDICATOR 𐩿
|
||||||
|
0x10AF0, // MANICHAEAN PUNCTUATION STAR 𐫰
|
||||||
|
0x10AF1, // MANICHAEAN PUNCTUATION FLEURON 𐫱
|
||||||
|
0x10AF2, // MANICHAEAN PUNCTUATION DOUBLE DOT WITHIN DOT 𐫲
|
||||||
|
0x10AF3, // MANICHAEAN PUNCTUATION DOT WITHIN DOT 𐫳
|
||||||
|
0x10AF4, // MANICHAEAN PUNCTUATION DOT 𐫴
|
||||||
|
0x10AF5, // MANICHAEAN PUNCTUATION TWO DOTS 𐫵
|
||||||
|
0x10AF6, // MANICHAEAN PUNCTUATION LINE FILLER 𐫶
|
||||||
|
0x10B39, // AVESTAN ABBREVIATION MARK 𐬹
|
||||||
|
0x10B3A, // TINY TWO DOTS OVER ONE DOT PUNCTUATION 𐬺
|
||||||
|
0x10B3B, // SMALL TWO DOTS OVER ONE DOT PUNCTUATION 𐬻
|
||||||
|
0x10B3C, // LARGE TWO DOTS OVER ONE DOT PUNCTUATION 𐬼
|
||||||
|
0x10B3D, // LARGE ONE DOT OVER TWO DOTS PUNCTUATION 𐬽
|
||||||
|
0x10B3E, // LARGE TWO RINGS OVER ONE RING PUNCTUATION 𐬾
|
||||||
|
0x10B3F, // LARGE ONE RING OVER TWO RINGS PUNCTUATION 𐬿
|
||||||
|
0x10B99, // PSALTER PAHLAVI SECTION MARK 𐮙
|
||||||
|
0x10B9A, // PSALTER PAHLAVI TURNED SECTION MARK 𐮚
|
||||||
|
0x10B9B, // PSALTER PAHLAVI FOUR DOTS WITH CROSS 𐮛
|
||||||
|
0x10B9C, // PSALTER PAHLAVI FOUR DOTS WITH DOT 𐮜
|
||||||
|
0x10F55, // SOGDIAN PUNCTUATION TWO VERTICAL BARS 𐽕
|
||||||
|
0x10F56, // SOGDIAN PUNCTUATION TWO VERTICAL BARS WITH DOTS 𐽖
|
||||||
|
0x10F57, // SOGDIAN PUNCTUATION CIRCLE WITH DOT 𐽗
|
||||||
|
0x10F58, // SOGDIAN PUNCTUATION TWO CIRCLES WITH DOTS 𐽘
|
||||||
|
0x10F59, // SOGDIAN PUNCTUATION HALF CIRCLE WITH DOT 𐽙
|
||||||
|
0x11047, // BRAHMI DANDA 𑁇
|
||||||
|
0x11048, // BRAHMI DOUBLE DANDA 𑁈
|
||||||
|
0x11049, // BRAHMI PUNCTUATION DOT 𑁉
|
||||||
|
0x1104A, // BRAHMI PUNCTUATION DOUBLE DOT 𑁊
|
||||||
|
0x1104B, // BRAHMI PUNCTUATION LINE 𑁋
|
||||||
|
0x1104C, // BRAHMI PUNCTUATION CRESCENT BAR 𑁌
|
||||||
|
0x1104D, // BRAHMI PUNCTUATION LOTUS 𑁍
|
||||||
|
0x110BB, // KAITHI ABBREVIATION SIGN 𑂻
|
||||||
|
0x110BC, // KAITHI ENUMERATION SIGN 𑂼
|
||||||
|
0x110BE, // KAITHI SECTION MARK 𑂾
|
||||||
|
0x110BF, // KAITHI DOUBLE SECTION MARK 𑂿
|
||||||
|
0x110C0, // KAITHI DANDA 𑃀
|
||||||
|
0x110C1, // KAITHI DOUBLE DANDA 𑃁
|
||||||
|
0x11140, // CHAKMA SECTION MARK 𑅀
|
||||||
|
0x11141, // CHAKMA DANDA 𑅁
|
||||||
|
0x11142, // CHAKMA DOUBLE DANDA 𑅂
|
||||||
|
0x11143, // CHAKMA QUESTION MARK 𑅃
|
||||||
|
0x11174, // MAHAJANI ABBREVIATION SIGN 𑅴
|
||||||
|
0x11175, // MAHAJANI SECTION MARK 𑅵
|
||||||
|
0x111C5, // SHARADA DANDA 𑇅
|
||||||
|
0x111C6, // SHARADA DOUBLE DANDA 𑇆
|
||||||
|
0x111C7, // SHARADA ABBREVIATION SIGN 𑇇
|
||||||
|
0x111C8, // SHARADA SEPARATOR 𑇈
|
||||||
|
0x111CD, // SHARADA SUTRA MARK 𑇍
|
||||||
|
0x111DB, // SHARADA SIGN SIDDHAM 𑇛
|
||||||
|
0x111DD, // SHARADA CONTINUATION SIGN 𑇝
|
||||||
|
0x111DE, // SHARADA SECTION MARK-1 𑇞
|
||||||
|
0x111DF, // SHARADA SECTION MARK-2 𑇟
|
||||||
|
0x11238, // KHOJKI DANDA 𑈸
|
||||||
|
0x11239, // KHOJKI DOUBLE DANDA 𑈹
|
||||||
|
0x1123A, // KHOJKI WORD SEPARATOR 𑈺
|
||||||
|
0x1123B, // KHOJKI SECTION MARK 𑈻
|
||||||
|
0x1123C, // KHOJKI DOUBLE SECTION MARK 𑈼
|
||||||
|
0x1123D, // KHOJKI ABBREVIATION SIGN 𑈽
|
||||||
|
0x112A9, // MULTANI SECTION MARK 𑊩
|
||||||
|
0x1144B, // NEWA DANDA 𑑋
|
||||||
|
0x1144C, // NEWA DOUBLE DANDA 𑑌
|
||||||
|
0x1144D, // NEWA COMMA 𑑍
|
||||||
|
0x1144E, // NEWA GAP FILLER 𑑎
|
||||||
|
0x1144F, // NEWA ABBREVIATION SIGN 𑑏
|
||||||
|
0x1145B, // NEWA PLACEHOLDER MARK 𑑛
|
||||||
|
0x1145D, // NEWA INSERTION SIGN 𑑝
|
||||||
|
0x114C6, // TIRHUTA ABBREVIATION SIGN 𑓆
|
||||||
|
0x115C1, // SIDDHAM SIGN SIDDHAM 𑗁
|
||||||
|
0x115C2, // SIDDHAM DANDA 𑗂
|
||||||
|
0x115C3, // SIDDHAM DOUBLE DANDA 𑗃
|
||||||
|
0x115C4, // SIDDHAM SEPARATOR DOT 𑗄
|
||||||
|
0x115C5, // SIDDHAM SEPARATOR BAR 𑗅
|
||||||
|
0x115C6, // SIDDHAM REPETITION MARK-1 𑗆
|
||||||
|
0x115C7, // SIDDHAM REPETITION MARK-2 𑗇
|
||||||
|
0x115C8, // SIDDHAM REPETITION MARK-3 𑗈
|
||||||
|
0x115C9, // SIDDHAM END OF TEXT MARK 𑗉
|
||||||
|
0x115CA, // SIDDHAM SECTION MARK WITH TRIDENT AND U-SHAPED ORNAMENTS 𑗊
|
||||||
|
0x115CB, // SIDDHAM SECTION MARK WITH TRIDENT AND DOTTED CRESCENTS 𑗋
|
||||||
|
0x115CC, // SIDDHAM SECTION MARK WITH RAYS AND DOTTED CRESCENTS 𑗌
|
||||||
|
0x115CD, // SIDDHAM SECTION MARK WITH RAYS AND DOTTED DOUBLE CRESCENTS 𑗍
|
||||||
|
0x115CE, // SIDDHAM SECTION MARK WITH RAYS AND DOTTED TRIPLE CRESCENTS 𑗎
|
||||||
|
0x115CF, // SIDDHAM SECTION MARK DOUBLE RING 𑗏
|
||||||
|
0x115D0, // SIDDHAM SECTION MARK DOUBLE RING WITH RAYS 𑗐
|
||||||
|
0x115D1, // SIDDHAM SECTION MARK WITH DOUBLE CRESCENTS 𑗑
|
||||||
|
0x115D2, // SIDDHAM SECTION MARK WITH TRIPLE CRESCENTS 𑗒
|
||||||
|
0x115D3, // SIDDHAM SECTION MARK WITH QUADRUPLE CRESCENTS 𑗓
|
||||||
|
0x115D4, // SIDDHAM SECTION MARK WITH SEPTUPLE CRESCENTS 𑗔
|
||||||
|
0x115D5, // SIDDHAM SECTION MARK WITH CIRCLES AND RAYS 𑗕
|
||||||
|
0x115D6, // SIDDHAM SECTION MARK WITH CIRCLES AND TWO ENCLOSURES 𑗖
|
||||||
|
0x115D7, // SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES 𑗗
|
||||||
|
0x11641, // MODI DANDA 𑙁
|
||||||
|
0x11642, // MODI DOUBLE DANDA 𑙂
|
||||||
|
0x11643, // MODI ABBREVIATION SIGN 𑙃
|
||||||
|
0x11660, // MONGOLIAN BIRGA WITH ORNAMENT 𑙠
|
||||||
|
0x11661, // MONGOLIAN ROTATED BIRGA 𑙡
|
||||||
|
0x11662, // MONGOLIAN DOUBLE BIRGA WITH ORNAMENT 𑙢
|
||||||
|
0x11663, // MONGOLIAN TRIPLE BIRGA WITH ORNAMENT 𑙣
|
||||||
|
0x11664, // MONGOLIAN BIRGA WITH DOUBLE ORNAMENT 𑙤
|
||||||
|
0x11665, // MONGOLIAN ROTATED BIRGA WITH ORNAMENT 𑙥
|
||||||
|
0x11666, // MONGOLIAN ROTATED BIRGA WITH DOUBLE ORNAMENT 𑙦
|
||||||
|
0x11667, // MONGOLIAN INVERTED BIRGA 𑙧
|
||||||
|
0x11668, // MONGOLIAN INVERTED BIRGA WITH DOUBLE ORNAMENT 𑙨
|
||||||
|
0x11669, // MONGOLIAN SWIRL BIRGA 𑙩
|
||||||
|
0x1166A, // MONGOLIAN SWIRL BIRGA WITH ORNAMENT 𑙪
|
||||||
|
0x1166B, // MONGOLIAN SWIRL BIRGA WITH DOUBLE ORNAMENT 𑙫
|
||||||
|
0x1166C, // MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT 𑙬
|
||||||
|
0x1173C, // AHOM SIGN SMALL SECTION 𑜼
|
||||||
|
0x1173D, // AHOM SIGN SECTION 𑜽
|
||||||
|
0x1173E, // AHOM SIGN RULAI 𑜾
|
||||||
|
0x1183B, // DOGRA ABBREVIATION SIGN 𑠻
|
||||||
|
0x119E2, // NANDINAGARI SIGN SIDDHAM 𑧢
|
||||||
|
0x11A3F, // ZANABAZAR SQUARE INITIAL HEAD MARK 𑨿
|
||||||
|
0x11A40, // ZANABAZAR SQUARE CLOSING HEAD MARK 𑩀
|
||||||
|
0x11A41, // ZANABAZAR SQUARE MARK TSHEG 𑩁
|
||||||
|
0x11A42, // ZANABAZAR SQUARE MARK SHAD 𑩂
|
||||||
|
0x11A43, // ZANABAZAR SQUARE MARK DOUBLE SHAD 𑩃
|
||||||
|
0x11A44, // ZANABAZAR SQUARE MARK LONG TSHEG 𑩄
|
||||||
|
0x11A45, // ZANABAZAR SQUARE INITIAL DOUBLE-LINED HEAD MARK 𑩅
|
||||||
|
0x11A46, // ZANABAZAR SQUARE CLOSING DOUBLE-LINED HEAD MARK 𑩆
|
||||||
|
0x11A9A, // SOYOMBO MARK TSHEG 𑪚
|
||||||
|
0x11A9B, // SOYOMBO MARK SHAD 𑪛
|
||||||
|
0x11A9C, // SOYOMBO MARK DOUBLE SHAD 𑪜
|
||||||
|
0x11A9E, // SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME 𑪞
|
||||||
|
0x11A9F, // SOYOMBO HEAD MARK WITH MOON AND SUN AND FLAME 𑪟
|
||||||
|
0x11AA0, // SOYOMBO HEAD MARK WITH MOON AND SUN 𑪠
|
||||||
|
0x11AA1, // SOYOMBO TERMINAL MARK-1 𑪡
|
||||||
|
0x11AA2, // SOYOMBO TERMINAL MARK-2 𑪢
|
||||||
|
0x11C41, // BHAIKSUKI DANDA 𑱁
|
||||||
|
0x11C42, // BHAIKSUKI DOUBLE DANDA 𑱂
|
||||||
|
0x11C43, // BHAIKSUKI WORD SEPARATOR 𑱃
|
||||||
|
0x11C44, // BHAIKSUKI GAP FILLER-1 𑱄
|
||||||
|
0x11C45, // BHAIKSUKI GAP FILLER-2 𑱅
|
||||||
|
0x11C70, // MARCHEN HEAD MARK 𑱰
|
||||||
|
0x11C71, // MARCHEN MARK SHAD 𑱱
|
||||||
|
0x11EF7, // MAKASAR PASSIMBANG 𑻷
|
||||||
|
0x11EF8, // MAKASAR END OF SECTION 𑻸
|
||||||
|
0x11FFF, // TAMIL PUNCTUATION END OF TEXT 𑿿
|
||||||
|
0x12470, // CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER 𒑰
|
||||||
|
0x12471, // CUNEIFORM PUNCTUATION SIGN VERTICAL COLON 𒑱
|
||||||
|
0x12472, // CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON 𒑲
|
||||||
|
0x12473, // CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON 𒑳
|
||||||
|
0x12474, // CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON 𒑴
|
||||||
|
0x16A6E, // MRO DANDA 𖩮
|
||||||
|
0x16A6F, // MRO DOUBLE DANDA 𖩯
|
||||||
|
0x16AF5, // BASSA VAH FULL STOP 𖫵
|
||||||
|
0x16B37, // PAHAWH HMONG SIGN VOS THOM 𖬷
|
||||||
|
0x16B38, // PAHAWH HMONG SIGN VOS TSHAB CEEB 𖬸
|
||||||
|
0x16B39, // PAHAWH HMONG SIGN CIM CHEEM 𖬹
|
||||||
|
0x16B3A, // PAHAWH HMONG SIGN VOS THIAB 𖬺
|
||||||
|
0x16B3B, // PAHAWH HMONG SIGN VOS FEEM 𖬻
|
||||||
|
0x16B44, // PAHAWH HMONG SIGN XAUS 𖭄
|
||||||
|
0x16E97, // MEDEFAIDRIN COMMA 𖺗
|
||||||
|
0x16E98, // MEDEFAIDRIN FULL STOP 𖺘
|
||||||
|
0x16E99, // MEDEFAIDRIN SYMBOL AIVA 𖺙
|
||||||
|
0x16E9A, // MEDEFAIDRIN EXCLAMATION OH 𖺚
|
||||||
|
0x16FE2, // OLD CHINESE HOOK MARK 𖿢
|
||||||
|
0x1BC9F, // DUPLOYAN PUNCTUATION CHINOOK FULL STOP 𛲟
|
||||||
|
0x1DA87, // SIGNWRITING COMMA 𝪇
|
||||||
|
0x1DA88, // SIGNWRITING FULL STOP 𝪈
|
||||||
|
0x1DA89, // SIGNWRITING SEMICOLON 𝪉
|
||||||
|
0x1DA8A, // SIGNWRITING COLON 𝪊
|
||||||
|
0x1DA8B, // SIGNWRITING PARENTHESIS 𝪋
|
||||||
|
0x1E95E, // ADLAM INITIAL EXCLAMATION MARK 𞥞
|
||||||
|
0x1E95F, // ADLAM INITIAL QUESTION MARK 𞥟
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
|
@ -25,4 +25,25 @@ fn test_utf8_util() {
|
||||||
// test u_len function
|
// test u_len function
|
||||||
assert utf8.u_len(src1)==15 //29
|
assert utf8.u_len(src1)==15 //29
|
||||||
assert utf8.u_len("pippo".ustring())==5
|
assert utf8.u_len("pippo".ustring())==5
|
||||||
|
|
||||||
|
// western punctuation
|
||||||
|
a := '.abc?abcòàè.'
|
||||||
|
assert utf8.is_punct(a,0)==true
|
||||||
|
assert utf8.is_punct('b',0)==false
|
||||||
|
assert utf8.is_uchar_punct(0x002E)==true
|
||||||
|
assert utf8.is_punct(a,4)==true // ?
|
||||||
|
assert utf8.is_punct(a,14)==true // last .
|
||||||
|
assert utf8.is_punct(a,12)==false // è
|
||||||
|
println("OK western")
|
||||||
|
|
||||||
|
// global punctuation
|
||||||
|
b := '.ĂĂa. ÔÔ TESTO Æ€'
|
||||||
|
assert utf8.is_global_punct(b,0)==true
|
||||||
|
assert utf8.is_global_punct('.',0)==true
|
||||||
|
assert utf8.is_uchar_punct(0x002E)==true
|
||||||
|
assert utf8.is_global_punct(b,6)==true // .
|
||||||
|
assert utf8.is_global_punct(b,1)==false // Ă (byte index 1 is the first byte of the 2-byte glyph)
|
||||||
|
|
||||||
|
// test utility functions
|
||||||
|
assert utf8.get_uchar(b,0)==0x002E
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue