utf8: punctuation

pull/3067/head
penguindark 2019-12-12 19:08:44 +01:00 committed by Alexander Medvednikov
parent 0eeb607ffd
commit 8e1c27d129
2 changed files with 792 additions and 15 deletions

View File

@ -11,6 +11,11 @@
**********************************************************************/ **********************************************************************/
module utf8 module utf8
/**********************************************************************
*
* Utility functions
*
**********************************************************************/
// len return the leght as number of unicode chars from a string // len return the leght as number of unicode chars from a string
pub fn len(s string) int { pub fn len(s string) int {
@ -33,6 +38,54 @@ pub fn u_len(s ustring) int {
return len(s.s) return len(s.s)
} }
// get_uchar decodes the utf-8 encoded char that starts at byte `index` of `s`
// and returns its unicode code point as an int.
//
// `index` is expected to point at the first byte of a char.
// 0 is returned when `s` is empty, when `index` is outside of `s`, or when
// the encoded char would run past the end of `s`.
pub fn get_uchar(s string, index int) int {
	// nothing to decode: empty string or index out of range
	if index < 0 || index >= s.len {
		return 0
	}
	ch_len := utf8util_char_len(s.str[index])
	// 1 byte utf-8: the byte at `index` is already the code point
	if ch_len == 1 {
		return int(s.str[index])
	}
	// truncated multi-byte sequence: never read past the end of the string
	if index + ch_len > s.len {
		return 0
	}
	// pack the bytes of the sequence in one int, first byte in the most significant position
	mut lword := 0
	for i := 0; i < ch_len; i++ {
		lword = (lword << 8) | int(s.str[index + i])
	}
	mut res := 0
	if ch_len == 2 {
		// 2 byte utf-8
		// byte format: 110xxxxx 10xxxxxx
		res = ((lword & 0x1f00) >> 2) | (lword & 0x3f)
	} else if ch_len == 3 {
		// 3 byte utf-8
		// byte format: 1110xxxx 10xxxxxx 10xxxxxx
		res = ((lword & 0x0f0000) >> 4) | ((lword & 0x3f00) >> 2) | (lword & 0x3f)
	} else if ch_len == 4 {
		// 4 byte utf-8
		// byte format: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		res = ((lword & 0x07000000) >> 6) | ((lword & 0x003f0000) >> 4) | ((lword & 0x00003f00) >> 2) | (lword & 0x0000003f)
	}
	return res
}
/**********************************************************************
*
* Conversion functions
*
**********************************************************************/
// to_upper return an uppercase string from a string // to_upper return an uppercase string from a string
pub fn to_upper(s string) string { pub fn to_upper(s string) string {
@ -59,16 +112,52 @@ pub fn u_to_lower(s ustring) ustring {
/********************************************************************** /**********************************************************************
* *
* Private functions * Punctuation functions
*
* The "western" functions search a small table, which is quicker than
* searching the global unicode table. **Use them only for western chars**.
* *
**********************************************************************/ **********************************************************************/
// utf8util_char_len calculate the lenght in bytes of a utf8 rune //
// Western
//
// is_punct returns true if the byte at string[index] is the start of a
// unicode char that is a western punctuation mark
pub fn is_punct(s string, index int) bool {
	uchar := get_uchar(s, index)
	return is_uchar_punct(uchar)
}
// is_uchar_punct returns true if the input unicode char is a western
// punctuation mark, i.e. if find_punct_in_table gives a non-zero result
// for it on the unicode_punct_western table
pub fn is_uchar_punct(uchar int) bool {
	found := find_punct_in_table(uchar, unicode_punct_western)
	return found != 0
}
//
// Global
//
// is_global_punct returns true if the byte at string[index] is the start of a
// unicode char that is a global punctuation mark
pub fn is_global_punct(s string, index int) bool {
	uchar := get_uchar(s, index)
	return is_uchar_global_punct(uchar)
}
// is_uchar_global_punct returns true if the input unicode char is a global
// punctuation mark, i.e. if find_punct_in_table gives a non-zero result
// for it on the unicode_punct table
pub fn is_uchar_global_punct(uchar int) bool {
	found := find_punct_in_table(uchar, unicode_punct)
	return found != 0
}
/**********************************************************************
*
* Private functions
*
**********************************************************************/
// utf8util_char_len calculates the length in bytes of a utf8 char from its first byte `b`.
// The constant 0xe5000000 is used as a packed table of 2-bit entries, selected by the
// high nibble of `b` ( (b >> 3) & 0x1e is twice that nibble ):
// high nibble 0xF -> 4, 0xE -> 3, 0xC and 0xD -> 2, any other value -> 1.
fn utf8util_char_len(b byte) int { fn utf8util_char_len(b byte) int {
return (( 0xe5000000 >> (( b >> 3 ) & 0x1e )) & 3 ) + 1 return (( 0xe5000000 >> (( b >> 3 ) & 0x1e )) & 3 ) + 1
} }
// //
// if upper_flag == true then make low ==> upper conversion // if upper_flag == true then make low ==> upper conversion
// if upper_flag == false then make upper ==> low conversion // if upper_flag == false then make upper ==> low conversion
@ -76,7 +165,6 @@ fn utf8util_char_len(b byte) int {
// up_low make the dirt job // up_low make the dirt job
fn up_low(s string, upper_flag bool) string { fn up_low(s string, upper_flag bool) string {
mut _index := 0 mut _index := 0
mut old_index := 0
mut str_res := malloc(s.len + 1) mut str_res := malloc(s.len + 1)
for { for {
@ -98,7 +186,7 @@ fn up_low(s string, upper_flag bool) string {
//C.printf(" #%d (%x) ", _index, lword) //C.printf(" #%d (%x) ", _index, lword)
mut res := int(0) mut res := 0
// 2 byte utf-8 // 2 byte utf-8
// byte format: 110xxxxx 10xxxxxx // byte format: 110xxxxx 10xxxxxx
@ -131,7 +219,7 @@ fn up_low(s string, upper_flag bool) string {
} }
//C.printf("\n") //C.printf("\n")
}else{ }else{
tab_char := u16(unicode_con_table_up_to_low[ch_index]) tab_char := unicode_con_table_up_to_low[ch_index]
//C.printf("tab_char: %04x ",tab_char) //C.printf("tab_char: %04x ",tab_char)
if ch_len == 2 { if ch_len == 2 {
@ -176,7 +264,6 @@ fn up_low(s string, upper_flag bool) string {
} }
} }
old_index = _index
_index += ch_len _index += ch_len
// we are done, exit the loop // we are done, exit the loop
@ -199,13 +286,13 @@ fn find_char_in_table( in_code u16, upper_flag bool) int {
// We will use a simple binary search // We will use a simple binary search
// //
mut first_index := int(0) // first index of our utf8 char range mut first_index := 0 // first index of our utf8 char range
mut last_index := int(unicode_con_table_up_to_low.len >> 1) // last+1 index of our utf8 char range mut last_index := (unicode_con_table_up_to_low.len >> 1) // last+1 index of our utf8 char range
mut index := int(0) mut index := 0
mut x := u16(0) mut x := u16(0)
mut offset:=int(0) // up to low mut offset:=0 // up to low
mut i_step:=int(1) // up to low mut i_step:=1 // up to low
if upper_flag==true { if upper_flag==true {
offset=1 // low to up offset=1 // low to up
i_step=0 // low to up i_step=0 // low to up
@ -220,11 +307,10 @@ fn find_char_in_table( in_code u16, upper_flag bool) int {
if x == in_code { if x == in_code {
//C.printf(" Found!\n") //C.printf(" Found!\n")
return int( (index<<1) + i_step) return ( (index<<1) + i_step)
} }
else if x>in_code { else if x>in_code {
last_index=index last_index=index
}else { }else {
first_index=index first_index=index
} }
@ -234,7 +320,40 @@ fn find_char_in_table( in_code u16, upper_flag bool) int {
} }
} }
//C.printf("not found.\n") //C.printf("not found.\n")
return int(0) return 0
}
// find_punct_in_table looks up `in_code` in the lookup table `in_table`
// with a binary search; `in_table` must be sorted in ascending order.
//
// It returns 0 when `in_code` is not in the table (an empty table included),
// otherwise the 1-based position of the entry (array index + 1). The position
// is 1-based so that a hit on the first entry is never confused with the
// "not found" value 0 that the callers test against.
fn find_punct_in_table(in_code int, in_table []int) int {
	mut first_index := 0 // first index of the range still to search
	mut last_index := in_table.len // last+1 index of the range still to search
	for first_index < last_index {
		index := (first_index + last_index) >> 1
		x := in_table[index]
		if x == in_code {
			return index + 1
		} else if x > in_code {
			// the entry, if present, is before index
			last_index = index
		} else {
			// the entry, if present, is after index
			first_index = index + 1
		}
	}
	return 0
}
@ -927,3 +1046,640 @@ u16(0x0041), 0x0061, //LATIN CAPITAL LETTER A LATIN SMALL LETTER A
0xFF3A, 0xFF5A, //FULLWIDTH LATIN CAPITAL LETTER Z FULLWIDTH LATIN SMALL LETTER Z 0xFF3A, 0xFF5A, //FULLWIDTH LATIN CAPITAL LETTER Z FULLWIDTH LATIN SMALL LETTER Z
] ]
) )
/*****************************************************************************
*
* Unicode punctuation chars
*
* source: http://www.unicode.org/faq/punctuation_symbols.html
*
*****************************************************************************/
const(
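// Western punctuation marks (the small table used by is_uchar_punct).
// The entries must stay sorted in ascending order: find_punct_in_table does a binary search.
// Columns: code point, character name, glyph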
unicode_punct_western=[
0x0021, // EXCLAMATION MARK !
0x0022, // QUOTATION MARK "
0x0027, // APOSTROPHE '
0x002A, // ASTERISK *
0x002C, // COMMA ,
0x002E, // FULL STOP .
0x002F, // SOLIDUS /
0x003A, // COLON :
0x003B, // SEMICOLON ;
0x003F, // QUESTION MARK ?
0x00A1, // INVERTED EXCLAMATION MARK ¡
0x00A7, // SECTION SIGN §
0x00B6, // PILCROW SIGN ¶
0x00B7, // MIDDLE DOT ·
0x00BF, // INVERTED QUESTION MARK ¿
0x037E, // GREEK QUESTION MARK ;
0x0387, // GREEK ANO TELEIA ·
0x055A, // ARMENIAN APOSTROPHE ՚
0x055B, // ARMENIAN EMPHASIS MARK ՛
0x055C, // ARMENIAN EXCLAMATION MARK ՜
0x055D, // ARMENIAN COMMA ՝
0x055E, // ARMENIAN QUESTION MARK ՞
0x055F, // ARMENIAN ABBREVIATION MARK ՟
0x0589, // ARMENIAN FULL STOP ։
0x05C0, // HEBREW PUNCTUATION PASEQ ׀
0x05C3, // HEBREW PUNCTUATION SOF PASUQ ׃
0x05C6, // HEBREW PUNCTUATION NUN HAFUKHA ׆
0x05F3, // HEBREW PUNCTUATION GERESH ׳
0x05F4, // HEBREW PUNCTUATION GERSHAYIM ״
]
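// Unicode characters in the 'Punctuation, Other' category (the table used by is_uchar_global_punct).
// The entries must stay sorted in ascending order: find_punct_in_table does a binary search.
// Columns: code point, character name, glyph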
unicode_punct=[
0x0021, // EXCLAMATION MARK !
0x0022, // QUOTATION MARK "
0x0023, // NUMBER SIGN #
0x0025, // PERCENT SIGN %
0x0026, // AMPERSAND &
0x0027, // APOSTROPHE '
0x002A, // ASTERISK *
0x002C, // COMMA ,
0x002E, // FULL STOP .
0x002F, // SOLIDUS /
0x003A, // COLON :
0x003B, // SEMICOLON ;
0x003F, // QUESTION MARK ?
0x0040, // COMMERCIAL AT @
0x005C, // REVERSE SOLIDUS \
0x00A1, // INVERTED EXCLAMATION MARK ¡
0x00A7, // SECTION SIGN §
0x00B6, // PILCROW SIGN ¶
0x00B7, // MIDDLE DOT ·
0x00BF, // INVERTED QUESTION MARK ¿
0x037E, // GREEK QUESTION MARK ;
0x0387, // GREEK ANO TELEIA ·
0x055A, // ARMENIAN APOSTROPHE ՚
0x055B, // ARMENIAN EMPHASIS MARK ՛
0x055C, // ARMENIAN EXCLAMATION MARK ՜
0x055D, // ARMENIAN COMMA ՝
0x055E, // ARMENIAN QUESTION MARK ՞
0x055F, // ARMENIAN ABBREVIATION MARK ՟
0x0589, // ARMENIAN FULL STOP ։
0x05C0, // HEBREW PUNCTUATION PASEQ ׀
0x05C3, // HEBREW PUNCTUATION SOF PASUQ ׃
0x05C6, // HEBREW PUNCTUATION NUN HAFUKHA ׆
0x05F3, // HEBREW PUNCTUATION GERESH ׳
0x05F4, // HEBREW PUNCTUATION GERSHAYIM ״
0x0609, // ARABIC-INDIC PER MILLE SIGN ؉
0x060A, // ARABIC-INDIC PER TEN THOUSAND SIGN ؊
0x060C, // ARABIC COMMA ،
0x060D, // ARABIC DATE SEPARATOR ؍
0x061B, // ARABIC SEMICOLON ؛
0x061E, // ARABIC TRIPLE DOT PUNCTUATION MARK ؞
0x061F, // ARABIC QUESTION MARK ؟
0x066A, // ARABIC PERCENT SIGN ٪
0x066B, // ARABIC DECIMAL SEPARATOR ٫
0x066C, // ARABIC THOUSANDS SEPARATOR ٬
0x066D, // ARABIC FIVE POINTED STAR ٭
0x06D4, // ARABIC FULL STOP ۔
0x0700, // SYRIAC END OF PARAGRAPH ܀
0x0701, // SYRIAC SUPRALINEAR FULL STOP ܁
0x0702, // SYRIAC SUBLINEAR FULL STOP ܂
0x0703, // SYRIAC SUPRALINEAR COLON ܃
0x0704, // SYRIAC SUBLINEAR COLON ܄
0x0705, // SYRIAC HORIZONTAL COLON ܅
0x0706, // SYRIAC COLON SKEWED LEFT ܆
0x0707, // SYRIAC COLON SKEWED RIGHT ܇
0x0708, // SYRIAC SUPRALINEAR COLON SKEWED LEFT ܈
0x0709, // SYRIAC SUBLINEAR COLON SKEWED RIGHT ܉
0x070A, // SYRIAC CONTRACTION ܊
0x070B, // SYRIAC HARKLEAN OBELUS ܋
0x070C, // SYRIAC HARKLEAN METOBELUS ܌
0x070D, // SYRIAC HARKLEAN ASTERISCUS ܍
0x07F7, // NKO SYMBOL GBAKURUNEN ߷
0x07F8, // NKO COMMA ߸
0x07F9, // NKO EXCLAMATION MARK ߹
0x0830, // SAMARITAN PUNCTUATION NEQUDAA ࠰
0x0831, // SAMARITAN PUNCTUATION AFSAAQ ࠱
0x0832, // SAMARITAN PUNCTUATION ANGED ࠲
0x0833, // SAMARITAN PUNCTUATION BAU ࠳
0x0834, // SAMARITAN PUNCTUATION ATMAAU ࠴
0x0835, // SAMARITAN PUNCTUATION SHIYYAALAA ࠵
0x0836, // SAMARITAN ABBREVIATION MARK ࠶
0x0837, // SAMARITAN PUNCTUATION MELODIC QITSA ࠷
0x0838, // SAMARITAN PUNCTUATION ZIQAA ࠸
0x0839, // SAMARITAN PUNCTUATION QITSA ࠹
0x083A, // SAMARITAN PUNCTUATION ZAEF ࠺
0x083B, // SAMARITAN PUNCTUATION TURU ࠻
0x083C, // SAMARITAN PUNCTUATION ARKAANU ࠼
0x083D, // SAMARITAN PUNCTUATION SOF MASHFAAT ࠽
0x083E, // SAMARITAN PUNCTUATION ANNAAU ࠾
0x085E, // MANDAIC PUNCTUATION ࡞
0x0964, // DEVANAGARI DANDA ।
0x0965, // DEVANAGARI DOUBLE DANDA ॥
0x0970, // DEVANAGARI ABBREVIATION SIGN ॰
0x09FD, // BENGALI ABBREVIATION SIGN ৽
0x0A76, // GURMUKHI ABBREVIATION SIGN ੶
0x0AF0, // GUJARATI ABBREVIATION SIGN ૰
0x0C77, // TELUGU SIGN SIDDHAM ౷
0x0C84, // KANNADA SIGN SIDDHAM ಄
0x0DF4, // SINHALA PUNCTUATION KUNDDALIYA ෴
0x0E4F, // THAI CHARACTER FONGMAN ๏
0x0E5A, // THAI CHARACTER ANGKHANKHU ๚
0x0E5B, // THAI CHARACTER KHOMUT ๛
0x0F04, // TIBETAN MARK INITIAL YIG MGO MDUN MA ༄
0x0F05, // TIBETAN MARK CLOSING YIG MGO SGAB MA ༅
0x0F06, // TIBETAN MARK CARET YIG MGO PHUR SHAD MA ༆
0x0F07, // TIBETAN MARK YIG MGO TSHEG SHAD MA ༇
0x0F08, // TIBETAN MARK SBRUL SHAD ༈
0x0F09, // TIBETAN MARK BSKUR YIG MGO ༉
0x0F0A, // TIBETAN MARK BKA- SHOG YIG MGO ༊
0x0F0B, // TIBETAN MARK INTERSYLLABIC TSHEG ་
0x0F0C, // TIBETAN MARK DELIMITER TSHEG BSTAR ༌
0x0F0D, // TIBETAN MARK SHAD །
0x0F0E, // TIBETAN MARK NYIS SHAD ༎
0x0F0F, // TIBETAN MARK TSHEG SHAD ༏
0x0F10, // TIBETAN MARK NYIS TSHEG SHAD ༐
0x0F11, // TIBETAN MARK RIN CHEN SPUNGS SHAD ༑
0x0F12, // TIBETAN MARK RGYA GRAM SHAD ༒
0x0F14, // TIBETAN MARK GTER TSHEG ༔
0x0F85, // TIBETAN MARK PALUTA ྅
0x0FD0, // TIBETAN MARK BSKA- SHOG GI MGO RGYAN ࿐
0x0FD1, // TIBETAN MARK MNYAM YIG GI MGO RGYAN ࿑
0x0FD2, // TIBETAN MARK NYIS TSHEG ࿒
0x0FD3, // TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA ࿓
0x0FD4, // TIBETAN MARK CLOSING BRDA RNYING YIG MGO SGAB MA ࿔
0x0FD9, // TIBETAN MARK LEADING MCHAN RTAGS ࿙
0x0FDA, // TIBETAN MARK TRAILING MCHAN RTAGS ࿚
0x104A, // MYANMAR SIGN LITTLE SECTION ၊
0x104B, // MYANMAR SIGN SECTION ။
0x104C, // MYANMAR SYMBOL LOCATIVE ၌
0x104D, // MYANMAR SYMBOL COMPLETED ၍
0x104E, // MYANMAR SYMBOL AFOREMENTIONED ၎
0x104F, // MYANMAR SYMBOL GENITIVE ၏
0x10FB, // GEORGIAN PARAGRAPH SEPARATOR ჻
0x1360, // ETHIOPIC SECTION MARK ፠
0x1361, // ETHIOPIC WORDSPACE ፡
0x1362, // ETHIOPIC FULL STOP ።
0x1363, // ETHIOPIC COMMA ፣
0x1364, // ETHIOPIC SEMICOLON ፤
0x1365, // ETHIOPIC COLON ፥
0x1366, // ETHIOPIC PREFACE COLON ፦
0x1367, // ETHIOPIC QUESTION MARK ፧
0x1368, // ETHIOPIC PARAGRAPH SEPARATOR ፨
0x166E, // CANADIAN SYLLABICS FULL STOP
0x16EB, // RUNIC SINGLE PUNCTUATION ᛫
0x16EC, // RUNIC MULTIPLE PUNCTUATION
0x16ED, // RUNIC CROSS PUNCTUATION
0x1735, // PHILIPPINE SINGLE PUNCTUATION
0x1736, // PHILIPPINE DOUBLE PUNCTUATION ᜶
0x17D4, // KHMER SIGN KHAN ។
0x17D5, // KHMER SIGN BARIYOOSAN ៕
0x17D6, // KHMER SIGN CAMNUC PII KUUH ៖
0x17D8, // KHMER SIGN BEYYAL ៘
0x17D9, // KHMER SIGN PHNAEK MUAN ៙
0x17DA, // KHMER SIGN KOOMUUT ៚
0x1800, // MONGOLIAN BIRGA ᠀
0x1801, // MONGOLIAN ELLIPSIS ᠁
0x1802, // MONGOLIAN COMMA ᠂
0x1803, // MONGOLIAN FULL STOP
0x1804, // MONGOLIAN COLON ᠄
0x1805, // MONGOLIAN FOUR DOTS ᠅
0x1807, // MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER ᠇
0x1808, // MONGOLIAN MANCHU COMMA ᠈
0x1809, // MONGOLIAN MANCHU FULL STOP
0x180A, // MONGOLIAN NIRUGU ᠊
0x1944, // LIMBU EXCLAMATION MARK ᥄
0x1945, // LIMBU QUESTION MARK ᥅
0x1A1E, // BUGINESE PALLAWA ᨞
0x1A1F, // BUGINESE END OF SECTION ᨟
0x1AA0, // TAI THAM SIGN WIANG ᪠
0x1AA1, // TAI THAM SIGN WIANGWAAK ᪡
0x1AA2, // TAI THAM SIGN SAWAN ᪢
0x1AA3, // TAI THAM SIGN KEOW ᪣
0x1AA4, // TAI THAM SIGN HOY ᪤
0x1AA5, // TAI THAM SIGN DOKMAI ᪥
0x1AA6, // TAI THAM SIGN REVERSED ROTATED RANA ᪦
0x1AA8, // TAI THAM SIGN KAAN ᪨
0x1AA9, // TAI THAM SIGN KAANKUU ᪩
0x1AAA, // TAI THAM SIGN SATKAAN ᪪
0x1AAB, // TAI THAM SIGN SATKAANKUU ᪫
0x1AAC, // TAI THAM SIGN HANG ᪬
0x1AAD, // TAI THAM SIGN CAANG ᪭
0x1B5A, // BALINESE PANTI ᭚
0x1B5B, // BALINESE PAMADA ᭛
0x1B5C, // BALINESE WINDU ᭜
0x1B5D, // BALINESE CARIK PAMUNGKAH ᭝
0x1B5E, // BALINESE CARIK SIKI ᭞
0x1B5F, // BALINESE CARIK PAREREN ᭟
0x1B60, // BALINESE PAMENENG ᭠
0x1BFC, // BATAK SYMBOL BINDU NA METEK ᯼
0x1BFD, // BATAK SYMBOL BINDU PINARBORAS ᯽
0x1BFE, // BATAK SYMBOL BINDU JUDUL ᯾
0x1BFF, // BATAK SYMBOL BINDU PANGOLAT ᯿
0x1C3B, // LEPCHA PUNCTUATION TA-ROL ᰻
0x1C3C, // LEPCHA PUNCTUATION NYET THYOOM TA-ROL ᰼
0x1C3D, // LEPCHA PUNCTUATION CER-WA ᰽
0x1C3E, // LEPCHA PUNCTUATION TSHOOK CER-WA ᰾
0x1C3F, // LEPCHA PUNCTUATION TSHOOK ᰿
0x1C7E, // OL CHIKI PUNCTUATION MUCAAD ᱾
0x1C7F, // OL CHIKI PUNCTUATION DOUBLE MUCAAD ᱿
0x1CC0, // SUNDANESE PUNCTUATION BINDU SURYA ᳀
0x1CC1, // SUNDANESE PUNCTUATION BINDU PANGLONG ᳁
0x1CC2, // SUNDANESE PUNCTUATION BINDU PURNAMA ᳂
0x1CC3, // SUNDANESE PUNCTUATION BINDU CAKRA ᳃
0x1CC4, // SUNDANESE PUNCTUATION BINDU LEU SATANGA ᳄
0x1CC5, // SUNDANESE PUNCTUATION BINDU KA SATANGA ᳅
0x1CC6, // SUNDANESE PUNCTUATION BINDU DA SATANGA ᳆
0x1CC7, // SUNDANESE PUNCTUATION BINDU BA SATANGA ᳇
0x1CD3, // VEDIC SIGN NIHSHVASA ᳓
0x2016, // DOUBLE VERTICAL LINE ‖
0x2017, // DOUBLE LOW LINE ‗
0x2020, // DAGGER †
0x2021, // DOUBLE DAGGER ‡
0x2022, // BULLET •
0x2023, // TRIANGULAR BULLET ‣
0x2024, // ONE DOT LEADER
0x2025, // TWO DOT LEADER ‥
0x2026, // HORIZONTAL ELLIPSIS …
0x2027, // HYPHENATION POINT ‧
0x2030, // PER MILLE SIGN ‰
0x2031, // PER TEN THOUSAND SIGN ‱
0x2032, // PRIME
0x2033, // DOUBLE PRIME ″
0x2034, // TRIPLE PRIME ‴
0x2035, // REVERSED PRIME
0x2036, // REVERSED DOUBLE PRIME ‶
0x2037, // REVERSED TRIPLE PRIME ‷
0x2038, // CARET ‸
0x203B, // REFERENCE MARK ※
0x203C, // DOUBLE EXCLAMATION MARK ‼
0x203D, // INTERROBANG ‽
0x203E, // OVERLINE ‾
0x2041, // CARET INSERTION POINT
0x2042, // ASTERISM ⁂
0x2043, // HYPHEN BULLET
0x2047, // DOUBLE QUESTION MARK ⁇
0x2048, // QUESTION EXCLAMATION MARK ⁈
0x2049, // EXCLAMATION QUESTION MARK ⁉
0x204A, // TIRONIAN SIGN ET ⁊
0x204B, // REVERSED PILCROW SIGN ⁋
0x204C, // BLACK LEFTWARDS BULLET ⁌
0x204D, // BLACK RIGHTWARDS BULLET ⁍
0x204E, // LOW ASTERISK
0x204F, // REVERSED SEMICOLON ⁏
0x2050, // CLOSE UP ⁐
0x2051, // TWO ASTERISKS ALIGNED VERTICALLY ⁑
0x2053, // SWUNG DASH
0x2055, // FLOWER PUNCTUATION MARK ⁕
0x2056, // THREE DOT PUNCTUATION ⁖
0x2057, // QUADRUPLE PRIME ⁗
0x2058, // FOUR DOT PUNCTUATION ⁘
0x2059, // FIVE DOT PUNCTUATION ⁙
0x205A, // TWO DOT PUNCTUATION
0x205B, // FOUR DOT MARK ⁛
0x205C, // DOTTED CROSS ⁜
0x205D, // TRICOLON ⁝
0x205E, // VERTICAL FOUR DOTS ⁞
0x2CF9, // COPTIC OLD NUBIAN FULL STOP ⳹
0x2CFA, // COPTIC OLD NUBIAN DIRECT QUESTION MARK ⳺
0x2CFB, // COPTIC OLD NUBIAN INDIRECT QUESTION MARK ⳻
0x2CFC, // COPTIC OLD NUBIAN VERSE DIVIDER ⳼
0x2CFE, // COPTIC FULL STOP ⳾
0x2CFF, // COPTIC MORPHOLOGICAL DIVIDER ⳿
0x2D70, // TIFINAGH SEPARATOR MARK ⵰
0x2E00, // RIGHT ANGLE SUBSTITUTION MARKER ⸀
0x2E01, // RIGHT ANGLE DOTTED SUBSTITUTION MARKER ⸁
0x2E06, // RAISED INTERPOLATION MARKER ⸆
0x2E07, // RAISED DOTTED INTERPOLATION MARKER ⸇
0x2E08, // DOTTED TRANSPOSITION MARKER ⸈
0x2E0B, // RAISED SQUARE ⸋
0x2E0E, // EDITORIAL CORONIS ⸎
0x2E0F, // PARAGRAPHOS ⸏
0x2E10, // FORKED PARAGRAPHOS ⸐
0x2E11, // REVERSED FORKED PARAGRAPHOS ⸑
0x2E12, // HYPODIASTOLE ⸒
0x2E13, // DOTTED OBELOS ⸓
0x2E14, // DOWNWARDS ANCORA ⸔
0x2E15, // UPWARDS ANCORA ⸕
0x2E16, // DOTTED RIGHT-POINTING ANGLE ⸖
0x2E18, // INVERTED INTERROBANG ⸘
0x2E19, // PALM BRANCH ⸙
0x2E1B, // TILDE WITH RING ABOVE ⸛
0x2E1E, // TILDE WITH DOT ABOVE ⸞
0x2E1F, // TILDE WITH DOT BELOW ⸟
0x2E2A, // TWO DOTS OVER ONE DOT PUNCTUATION ⸪
0x2E2B, // ONE DOT OVER TWO DOTS PUNCTUATION ⸫
0x2E2C, // SQUARED FOUR DOT PUNCTUATION ⸬
0x2E2D, // FIVE DOT MARK ⸭
0x2E2E, // REVERSED QUESTION MARK ⸮
0x2E30, // RING POINT ⸰
0x2E31, // WORD SEPARATOR MIDDLE DOT ⸱
0x2E32, // TURNED COMMA ⸲
0x2E33, // RAISED DOT ⸳
0x2E34, // RAISED COMMA ⸴
0x2E35, // TURNED SEMICOLON ⸵
0x2E36, // DAGGER WITH LEFT GUARD ⸶
0x2E37, // DAGGER WITH RIGHT GUARD ⸷
0x2E38, // TURNED DAGGER ⸸
0x2E39, // TOP HALF SECTION SIGN ⸹
0x2E3C, // STENOGRAPHIC FULL STOP ⸼
0x2E3D, // VERTICAL SIX DOTS ⸽
0x2E3E, // WIGGLY VERTICAL LINE ⸾
0x2E3F, // CAPITULUM ⸿
0x2E41, // REVERSED COMMA ⹁
0x2E43, // DASH WITH LEFT UPTURN ⹃
0x2E44, // DOUBLE SUSPENSION MARK ⹄
0x2E45, // INVERTED LOW KAVYKA ⹅
0x2E46, // INVERTED LOW KAVYKA WITH KAVYKA ABOVE ⹆
0x2E47, // LOW KAVYKA ⹇
0x2E48, // LOW KAVYKA WITH DOT ⹈
0x2E49, // DOUBLE STACKED COMMA ⹉
0x2E4A, // DOTTED SOLIDUS ⹊
0x2E4B, // TRIPLE DAGGER ⹋
0x2E4C, // MEDIEVAL COMMA ⹌
0x2E4D, // PARAGRAPHUS MARK ⹍
0x2E4E, // PUNCTUS ELEVATUS MARK ⹎
0x2E4F, // CORNISH VERSE DIVIDER ⹏
0x3001, // IDEOGRAPHIC COMMA 、
0x3002, // IDEOGRAPHIC FULL STOP 。
0x3003, // DITTO MARK 〃
0x303D, // PART ALTERNATION MARK 〽
0x30FB, // KATAKANA MIDDLE DOT ・
0xA4FE, // LISU PUNCTUATION COMMA ꓾
0xA4FF, // LISU PUNCTUATION FULL STOP
0xA60D, // VAI COMMA ꘍
0xA60E, // VAI FULL STOP
0xA60F, // VAI QUESTION MARK ꘏
0xA673, // SLAVONIC ASTERISK ꙳
0xA67E, // CYRILLIC KAVYKA ꙾
0xA6F2, // BAMUM NJAEMLI ꛲
0xA6F3, // BAMUM FULL STOP ꛳
0xA6F4, // BAMUM COLON ꛴
0xA6F5, // BAMUM COMMA ꛵
0xA6F6, // BAMUM SEMICOLON ꛶
0xA6F7, // BAMUM QUESTION MARK ꛷
0xA874, // PHAGS-PA SINGLE HEAD MARK ꡴
0xA875, // PHAGS-PA DOUBLE HEAD MARK ꡵
0xA876, // PHAGS-PA MARK SHAD ꡶
0xA877, // PHAGS-PA MARK DOUBLE SHAD ꡷
0xA8CE, // SAURASHTRA DANDA ꣎
0xA8CF, // SAURASHTRA DOUBLE DANDA ꣏
0xA8F8, // DEVANAGARI SIGN PUSHPIKA ꣸
0xA8F9, // DEVANAGARI GAP FILLER ꣹
0xA8FA, // DEVANAGARI CARET ꣺
0xA8FC, // DEVANAGARI SIGN SIDDHAM ꣼
0xA92E, // KAYAH LI SIGN CWI ꤮
0xA92F, // KAYAH LI SIGN SHYA ꤯
0xA95F, // REJANG SECTION MARK ꥟
0xA9C1, // JAVANESE LEFT RERENGGAN ꧁
0xA9C2, // JAVANESE RIGHT RERENGGAN ꧂
0xA9C3, // JAVANESE PADA ANDAP ꧃
0xA9C4, // JAVANESE PADA MADYA ꧄
0xA9C5, // JAVANESE PADA LUHUR ꧅
0xA9C6, // JAVANESE PADA WINDU ꧆
0xA9C7, // JAVANESE PADA PANGKAT ꧇
0xA9C8, // JAVANESE PADA LINGSA ꧈
0xA9C9, // JAVANESE PADA LUNGSI ꧉
0xA9CA, // JAVANESE PADA ADEG ꧊
0xA9CB, // JAVANESE PADA ADEG ADEG ꧋
0xA9CC, // JAVANESE PADA PISELEH ꧌
0xA9CD, // JAVANESE TURNED PADA PISELEH ꧍
0xA9DE, // JAVANESE PADA TIRTA TUMETES ꧞
0xA9DF, // JAVANESE PADA ISEN-ISEN ꧟
0xAA5C, // CHAM PUNCTUATION SPIRAL ꩜
0xAA5D, // CHAM PUNCTUATION DANDA ꩝
0xAA5E, // CHAM PUNCTUATION DOUBLE DANDA ꩞
0xAA5F, // CHAM PUNCTUATION TRIPLE DANDA ꩟
0xAADE, // TAI VIET SYMBOL HO HOI ꫞
0xAADF, // TAI VIET SYMBOL KOI KOI ꫟
0xAAF0, // MEETEI MAYEK CHEIKHAN ꫰
0xAAF1, // MEETEI MAYEK AHANG KHUDAM ꫱
0xABEB, // MEETEI MAYEK CHEIKHEI ꯫
0xFE10, // PRESENTATION FORM FOR VERTICAL COMMA ︐
0xFE11, // PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA ︑
0xFE12, // PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP ︒
0xFE13, // PRESENTATION FORM FOR VERTICAL COLON ︓
0xFE14, // PRESENTATION FORM FOR VERTICAL SEMICOLON ︔
0xFE15, // PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK ︕
0xFE16, // PRESENTATION FORM FOR VERTICAL QUESTION MARK ︖
0xFE19, // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS ︙
0xFE30, // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
0xFE45, // SESAME DOT ﹅
0xFE46, // WHITE SESAME DOT ﹆
0xFE49, // DASHED OVERLINE ﹉
0xFE4A, // CENTRELINE OVERLINE ﹊
0xFE4B, // WAVY OVERLINE ﹋
0xFE4C, // DOUBLE WAVY OVERLINE ﹌
0xFE50, // SMALL COMMA ﹐
0xFE51, // SMALL IDEOGRAPHIC COMMA ﹑
0xFE52, // SMALL FULL STOP ﹒
0xFE54, // SMALL SEMICOLON ﹔
0xFE55, // SMALL COLON ﹕
0xFE56, // SMALL QUESTION MARK ﹖
0xFE57, // SMALL EXCLAMATION MARK ﹗
0xFE5F, // SMALL NUMBER SIGN ﹟
0xFE60, // SMALL AMPERSAND ﹠
0xFE61, // SMALL ASTERISK ﹡
0xFE68, // SMALL REVERSE SOLIDUS
0xFE6A, // SMALL PERCENT SIGN ﹪
0xFE6B, // SMALL COMMERCIAL AT ﹫
0xFF01, // FULLWIDTH EXCLAMATION MARK
0xFF02, // FULLWIDTH QUOTATION MARK
0xFF03, // FULLWIDTH NUMBER SIGN
0xFF05, // FULLWIDTH PERCENT SIGN
0xFF06, // FULLWIDTH AMPERSAND
0xFF07, // FULLWIDTH APOSTROPHE
0xFF0A, // FULLWIDTH ASTERISK
0xFF0C, // FULLWIDTH COMMA
0xFF0E, // FULLWIDTH FULL STOP
0xFF0F, // FULLWIDTH SOLIDUS
0xFF1A, // FULLWIDTH COLON
0xFF1B, // FULLWIDTH SEMICOLON
0xFF1F, // FULLWIDTH QUESTION MARK
0xFF20, // FULLWIDTH COMMERCIAL AT
0xFF3C, // FULLWIDTH REVERSE SOLIDUS
0xFF61, // HALFWIDTH IDEOGRAPHIC FULL STOP 。
0xFF64, // HALFWIDTH IDEOGRAPHIC COMMA 、
0xFF65, // HALFWIDTH KATAKANA MIDDLE DOT ・
0x10100, // AEGEAN WORD SEPARATOR LINE 𐄀
0x10101, // AEGEAN WORD SEPARATOR DOT 𐄁
0x10102, // AEGEAN CHECK MARK 𐄂
0x1039F, // UGARITIC WORD DIVIDER 𐎟
0x103D0, // OLD PERSIAN WORD DIVIDER 𐏐
0x1056F, // CAUCASIAN ALBANIAN CITATION MARK 𐕯
0x10857, // IMPERIAL ARAMAIC SECTION SIGN 𐡗
0x1091F, // PHOENICIAN WORD SEPARATOR 𐤟
0x1093F, // LYDIAN TRIANGULAR MARK 𐤿
0x10A50, // KHAROSHTHI PUNCTUATION DOT 𐩐
0x10A51, // KHAROSHTHI PUNCTUATION SMALL CIRCLE 𐩑
0x10A52, // KHAROSHTHI PUNCTUATION CIRCLE 𐩒
0x10A53, // KHAROSHTHI PUNCTUATION CRESCENT BAR 𐩓
0x10A54, // KHAROSHTHI PUNCTUATION MANGALAM 𐩔
0x10A55, // KHAROSHTHI PUNCTUATION LOTUS 𐩕
0x10A56, // KHAROSHTHI PUNCTUATION DANDA 𐩖
0x10A57, // KHAROSHTHI PUNCTUATION DOUBLE DANDA 𐩗
0x10A58, // KHAROSHTHI PUNCTUATION LINES 𐩘
0x10A7F, // OLD SOUTH ARABIAN NUMERIC INDICATOR 𐩿
0x10AF0, // MANICHAEAN PUNCTUATION STAR 𐫰
0x10AF1, // MANICHAEAN PUNCTUATION FLEURON 𐫱
0x10AF2, // MANICHAEAN PUNCTUATION DOUBLE DOT WITHIN DOT 𐫲
0x10AF3, // MANICHAEAN PUNCTUATION DOT WITHIN DOT 𐫳
0x10AF4, // MANICHAEAN PUNCTUATION DOT 𐫴
0x10AF5, // MANICHAEAN PUNCTUATION TWO DOTS 𐫵
0x10AF6, // MANICHAEAN PUNCTUATION LINE FILLER 𐫶
0x10B39, // AVESTAN ABBREVIATION MARK 𐬹
0x10B3A, // TINY TWO DOTS OVER ONE DOT PUNCTUATION 𐬺
0x10B3B, // SMALL TWO DOTS OVER ONE DOT PUNCTUATION 𐬻
0x10B3C, // LARGE TWO DOTS OVER ONE DOT PUNCTUATION 𐬼
0x10B3D, // LARGE ONE DOT OVER TWO DOTS PUNCTUATION 𐬽
0x10B3E, // LARGE TWO RINGS OVER ONE RING PUNCTUATION 𐬾
0x10B3F, // LARGE ONE RING OVER TWO RINGS PUNCTUATION 𐬿
0x10B99, // PSALTER PAHLAVI SECTION MARK 𐮙
0x10B9A, // PSALTER PAHLAVI TURNED SECTION MARK 𐮚
0x10B9B, // PSALTER PAHLAVI FOUR DOTS WITH CROSS 𐮛
0x10B9C, // PSALTER PAHLAVI FOUR DOTS WITH DOT 𐮜
0x10F55, // SOGDIAN PUNCTUATION TWO VERTICAL BARS 𐽕
0x10F56, // SOGDIAN PUNCTUATION TWO VERTICAL BARS WITH DOTS 𐽖
0x10F57, // SOGDIAN PUNCTUATION CIRCLE WITH DOT 𐽗
0x10F58, // SOGDIAN PUNCTUATION TWO CIRCLES WITH DOTS 𐽘
0x10F59, // SOGDIAN PUNCTUATION HALF CIRCLE WITH DOT 𐽙
0x11047, // BRAHMI DANDA 𑁇
0x11048, // BRAHMI DOUBLE DANDA 𑁈
0x11049, // BRAHMI PUNCTUATION DOT 𑁉
0x1104A, // BRAHMI PUNCTUATION DOUBLE DOT 𑁊
0x1104B, // BRAHMI PUNCTUATION LINE 𑁋
0x1104C, // BRAHMI PUNCTUATION CRESCENT BAR 𑁌
0x1104D, // BRAHMI PUNCTUATION LOTUS 𑁍
0x110BB, // KAITHI ABBREVIATION SIGN 𑂻
0x110BC, // KAITHI ENUMERATION SIGN 𑂼
0x110BE, // KAITHI SECTION MARK 𑂾
0x110BF, // KAITHI DOUBLE SECTION MARK 𑂿
0x110C0, // KAITHI DANDA 𑃀
0x110C1, // KAITHI DOUBLE DANDA 𑃁
0x11140, // CHAKMA SECTION MARK 𑅀
0x11141, // CHAKMA DANDA 𑅁
0x11142, // CHAKMA DOUBLE DANDA 𑅂
0x11143, // CHAKMA QUESTION MARK 𑅃
0x11174, // MAHAJANI ABBREVIATION SIGN 𑅴
0x11175, // MAHAJANI SECTION MARK 𑅵
0x111C5, // SHARADA DANDA 𑇅
0x111C6, // SHARADA DOUBLE DANDA 𑇆
0x111C7, // SHARADA ABBREVIATION SIGN 𑇇
0x111C8, // SHARADA SEPARATOR 𑇈
0x111CD, // SHARADA SUTRA MARK 𑇍
0x111DB, // SHARADA SIGN SIDDHAM 𑇛
0x111DD, // SHARADA CONTINUATION SIGN 𑇝
0x111DE, // SHARADA SECTION MARK-1 𑇞
0x111DF, // SHARADA SECTION MARK-2 𑇟
0x11238, // KHOJKI DANDA 𑈸
0x11239, // KHOJKI DOUBLE DANDA 𑈹
0x1123A, // KHOJKI WORD SEPARATOR 𑈺
0x1123B, // KHOJKI SECTION MARK 𑈻
0x1123C, // KHOJKI DOUBLE SECTION MARK 𑈼
0x1123D, // KHOJKI ABBREVIATION SIGN 𑈽
0x112A9, // MULTANI SECTION MARK 𑊩
0x1144B, // NEWA DANDA 𑑋
0x1144C, // NEWA DOUBLE DANDA 𑑌
0x1144D, // NEWA COMMA 𑑍
0x1144E, // NEWA GAP FILLER 𑑎
0x1144F, // NEWA ABBREVIATION SIGN 𑑏
0x1145B, // NEWA PLACEHOLDER MARK 𑑛
0x1145D, // NEWA INSERTION SIGN 𑑝
0x114C6, // TIRHUTA ABBREVIATION SIGN 𑓆
0x115C1, // SIDDHAM SIGN SIDDHAM 𑗁
0x115C2, // SIDDHAM DANDA 𑗂
0x115C3, // SIDDHAM DOUBLE DANDA 𑗃
0x115C4, // SIDDHAM SEPARATOR DOT 𑗄
0x115C5, // SIDDHAM SEPARATOR BAR 𑗅
0x115C6, // SIDDHAM REPETITION MARK-1 𑗆
0x115C7, // SIDDHAM REPETITION MARK-2 𑗇
0x115C8, // SIDDHAM REPETITION MARK-3 𑗈
0x115C9, // SIDDHAM END OF TEXT MARK 𑗉
0x115CA, // SIDDHAM SECTION MARK WITH TRIDENT AND U-SHAPED ORNAMENTS 𑗊
0x115CB, // SIDDHAM SECTION MARK WITH TRIDENT AND DOTTED CRESCENTS 𑗋
0x115CC, // SIDDHAM SECTION MARK WITH RAYS AND DOTTED CRESCENTS 𑗌
0x115CD, // SIDDHAM SECTION MARK WITH RAYS AND DOTTED DOUBLE CRESCENTS 𑗍
0x115CE, // SIDDHAM SECTION MARK WITH RAYS AND DOTTED TRIPLE CRESCENTS 𑗎
0x115CF, // SIDDHAM SECTION MARK DOUBLE RING 𑗏
0x115D0, // SIDDHAM SECTION MARK DOUBLE RING WITH RAYS 𑗐
0x115D1, // SIDDHAM SECTION MARK WITH DOUBLE CRESCENTS 𑗑
0x115D2, // SIDDHAM SECTION MARK WITH TRIPLE CRESCENTS 𑗒
0x115D3, // SIDDHAM SECTION MARK WITH QUADRUPLE CRESCENTS 𑗓
0x115D4, // SIDDHAM SECTION MARK WITH SEPTUPLE CRESCENTS 𑗔
0x115D5, // SIDDHAM SECTION MARK WITH CIRCLES AND RAYS 𑗕
0x115D6, // SIDDHAM SECTION MARK WITH CIRCLES AND TWO ENCLOSURES 𑗖
0x115D7, // SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES 𑗗
0x11641, // MODI DANDA 𑙁
0x11642, // MODI DOUBLE DANDA 𑙂
0x11643, // MODI ABBREVIATION SIGN 𑙃
0x11660, // MONGOLIAN BIRGA WITH ORNAMENT 𑙠
0x11661, // MONGOLIAN ROTATED BIRGA 𑙡
0x11662, // MONGOLIAN DOUBLE BIRGA WITH ORNAMENT 𑙢
0x11663, // MONGOLIAN TRIPLE BIRGA WITH ORNAMENT 𑙣
0x11664, // MONGOLIAN BIRGA WITH DOUBLE ORNAMENT 𑙤
0x11665, // MONGOLIAN ROTATED BIRGA WITH ORNAMENT 𑙥
0x11666, // MONGOLIAN ROTATED BIRGA WITH DOUBLE ORNAMENT 𑙦
0x11667, // MONGOLIAN INVERTED BIRGA 𑙧
0x11668, // MONGOLIAN INVERTED BIRGA WITH DOUBLE ORNAMENT 𑙨
0x11669, // MONGOLIAN SWIRL BIRGA 𑙩
0x1166A, // MONGOLIAN SWIRL BIRGA WITH ORNAMENT 𑙪
0x1166B, // MONGOLIAN SWIRL BIRGA WITH DOUBLE ORNAMENT 𑙫
0x1166C, // MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT 𑙬
0x1173C, // AHOM SIGN SMALL SECTION 𑜼
0x1173D, // AHOM SIGN SECTION 𑜽
0x1173E, // AHOM SIGN RULAI 𑜾
0x1183B, // DOGRA ABBREVIATION SIGN 𑠻
0x119E2, // NANDINAGARI SIGN SIDDHAM 𑧢
0x11A3F, // ZANABAZAR SQUARE INITIAL HEAD MARK 𑨿
0x11A40, // ZANABAZAR SQUARE CLOSING HEAD MARK 𑩀
0x11A41, // ZANABAZAR SQUARE MARK TSHEG 𑩁
0x11A42, // ZANABAZAR SQUARE MARK SHAD 𑩂
0x11A43, // ZANABAZAR SQUARE MARK DOUBLE SHAD 𑩃
0x11A44, // ZANABAZAR SQUARE MARK LONG TSHEG 𑩄
0x11A45, // ZANABAZAR SQUARE INITIAL DOUBLE-LINED HEAD MARK 𑩅
0x11A46, // ZANABAZAR SQUARE CLOSING DOUBLE-LINED HEAD MARK 𑩆
0x11A9A, // SOYOMBO MARK TSHEG 𑪚
0x11A9B, // SOYOMBO MARK SHAD 𑪛
0x11A9C, // SOYOMBO MARK DOUBLE SHAD 𑪜
0x11A9E, // SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME 𑪞
0x11A9F, // SOYOMBO HEAD MARK WITH MOON AND SUN AND FLAME 𑪟
0x11AA0, // SOYOMBO HEAD MARK WITH MOON AND SUN 𑪠
0x11AA1, // SOYOMBO TERMINAL MARK-1 𑪡
0x11AA2, // SOYOMBO TERMINAL MARK-2 𑪢
0x11C41, // BHAIKSUKI DANDA 𑱁
0x11C42, // BHAIKSUKI DOUBLE DANDA 𑱂
0x11C43, // BHAIKSUKI WORD SEPARATOR 𑱃
0x11C44, // BHAIKSUKI GAP FILLER-1 𑱄
0x11C45, // BHAIKSUKI GAP FILLER-2 𑱅
0x11C70, // MARCHEN HEAD MARK 𑱰
0x11C71, // MARCHEN MARK SHAD 𑱱
0x11EF7, // MAKASAR PASSIMBANG 𑻷
0x11EF8, // MAKASAR END OF SECTION 𑻸
0x11FFF, // TAMIL PUNCTUATION END OF TEXT 𑿿
0x12470, // CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER 𒑰
0x12471, // CUNEIFORM PUNCTUATION SIGN VERTICAL COLON 𒑱
0x12472, // CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON 𒑲
0x12473, // CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON 𒑳
0x12474, // CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON 𒑴
0x16A6E, // MRO DANDA 𖩮
0x16A6F, // MRO DOUBLE DANDA 𖩯
0x16AF5, // BASSA VAH FULL STOP 𖫵
0x16B37, // PAHAWH HMONG SIGN VOS THOM 𖬷
0x16B38, // PAHAWH HMONG SIGN VOS TSHAB CEEB 𖬸
0x16B39, // PAHAWH HMONG SIGN CIM CHEEM 𖬹
0x16B3A, // PAHAWH HMONG SIGN VOS THIAB 𖬺
0x16B3B, // PAHAWH HMONG SIGN VOS FEEM 𖬻
0x16B44, // PAHAWH HMONG SIGN XAUS 𖭄
0x16E97, // MEDEFAIDRIN COMMA 𖺗
0x16E98, // MEDEFAIDRIN FULL STOP 𖺘
0x16E99, // MEDEFAIDRIN SYMBOL AIVA 𖺙
0x16E9A, // MEDEFAIDRIN EXCLAMATION OH 𖺚
0x16FE2, // OLD CHINESE HOOK MARK 𖿢
0x1BC9F, // DUPLOYAN PUNCTUATION CHINOOK FULL STOP 𛲟
0x1DA87, // SIGNWRITING COMMA 𝪇
0x1DA88, // SIGNWRITING FULL STOP 𝪈
0x1DA89, // SIGNWRITING SEMICOLON 𝪉
0x1DA8A, // SIGNWRITING COLON 𝪊
0x1DA8B, // SIGNWRITING PARENTHESIS 𝪋
0x1E95E, // ADLAM INITIAL EXCLAMATION MARK 𞥞
0x1E95F, // ADLAM INITIAL QUESTION MARK
]
)

View File

@ -25,4 +25,25 @@ fn test_utf8_util() {
// test u_len function // test u_len function
assert utf8.u_len(src1)==15 //29 assert utf8.u_len(src1)==15 //29
assert utf8.u_len("pippo".ustring())==5 assert utf8.u_len("pippo".ustring())==5
// western punctuation
a := '.abc?abcòàè.'
assert utf8.is_punct(a,0)==true
assert utf8.is_punct('b',0)==false
assert utf8.is_uchar_punct(0x002E)==true
assert utf8.is_punct(a,4)==true // ?
assert utf8.is_punct(a,14)==true // last .
assert utf8.is_punct(a,12)==false // è
println("OK western")
// global punctuation
b := '.ĂĂa. ÔÔ TESTO Æ'
assert utf8.is_global_punct(b,0)==true
assert utf8.is_global_punct('.',0)==true
assert utf8.is_uchar_punct(0x002E)==true
assert utf8.is_global_punct(b,6)==true // .
assert utf8.is_global_punct(b,1)==false // a
// test utility functions
assert utf8.get_uchar(b,0)==0x002E
} }