utf: fix string length calculation for combining characters
parent
a72f3ed0c7
commit
f831910c5c
|
@ -174,7 +174,7 @@ fn utf8_len(c byte) int {
|
||||||
return b
|
return b
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate string length for formatting, i.e. number of "characters"
|
// Calculate string length in number of codepoints
|
||||||
fn utf8_str_len(s string) int {
|
fn utf8_str_len(s string) int {
|
||||||
mut l := 0
|
mut l := 0
|
||||||
for i := 0; i < s.len; i++ {
|
for i := 0; i < s.len; i++ {
|
||||||
|
@ -189,6 +189,41 @@ fn utf8_str_len(s string) int {
|
||||||
return l
|
return l
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Calculate string length for formatting, i.e. number of visible "characters"
// in `s`: every UTF-8 sequence counts as one, except combining marks from the
// byte ranges recognized below, which count as zero. Counting stops early and
// returns the count so far when the last sequence is incomplete.
|
||||||
|
fn utf8_str_visible_length(s string) int {
|
||||||
|
mut l := 0
|
||||||
|
mut ul := 1
|
||||||
|
for i := 0; i < s.len; i+=ul {
|
||||||
|
// ul is the byte length of the sequence starting at i (1 unless the high bit is set)
ul = 1
|
||||||
|
c := s.str[i]
|
||||||
|
if (c & (1 << 7)) != 0 {
|
||||||
|
// each further leading 1-bit after the high bit adds one byte to the sequence
for t := byte(1 << 6); (c & t) != 0; t >>= 1 {
|
||||||
|
ul++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if i + ul > s.len { // incomplete UTF-8 sequence
|
||||||
|
return l
|
||||||
|
}
|
||||||
|
l++
|
||||||
|
// recognize combining characters: they were counted by the l++ above,
// so take that back; r packs the raw UTF-8 bytes for the range comparison
|
||||||
|
if c == 0xcc || c == 0xcd {
|
||||||
|
r := (u16(c) << 8) | s.str[i+1]
|
||||||
|
if r >= 0xcc80 && r < 0xcdb0 { // diacritical marks (U+0300..U+036F)
|
||||||
|
l--
|
||||||
|
}
|
||||||
|
} else if c == 0xe1 || c == 0xe2 || c == 0xef {
|
||||||
|
r := (u32(c) << 16) | (u32(s.str[i+1]) << 8) | s.str[i+2]
|
||||||
|
if r >= 0xe1aab0 && r < 0xe1ac80 // diacritical marks extended (U+1AB0..U+1AFF)
|
||||||
|
|| r >= 0xe1b780 && r < 0xe1b880 // diacritical marks supplement (U+1DC0..U+1DFF)
|
||||||
|
|| r >= 0xe28390 && r < 0xe28480 // diacritical marks for symbols (U+20D0..U+20FF)
|
||||||
|
|| r >= 0xefb8a0 && r < 0xefb8b0 { // half marks (U+FE20..U+FE2F)
|
||||||
|
l--
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return l
|
||||||
|
}
|
||||||
|
|
||||||
// Reads a UTF-8 character from standard input
|
// Reads a UTF-8 character from standard input
|
||||||
pub fn utf8_getchar() int {
|
pub fn utf8_getchar() int {
|
||||||
c := C.getchar()
|
c := C.getchar()
|
||||||
|
|
|
@ -57,9 +57,9 @@ string _STR(const char *fmt, int nfmts, ...) {
|
||||||
if (fmt[k-4] == '*') { // %*.*s
|
if (fmt[k-4] == '*') { // %*.*s
|
||||||
int fwidth = va_arg(argptr, int);
|
int fwidth = va_arg(argptr, int);
|
||||||
if (fwidth < 0)
|
if (fwidth < 0)
|
||||||
fwidth -= (s.len - utf8_str_len(s));
|
fwidth -= (s.len - utf8_str_visible_length(s));
|
||||||
else
|
else
|
||||||
fwidth += (s.len - utf8_str_len(s));
|
fwidth += (s.len - utf8_str_visible_length(s));
|
||||||
_STR_PRINT_ARG(fmt, &buf, &nbytes, &memsize, k+fwidth-4, fwidth, s.len, s.str);
|
_STR_PRINT_ARG(fmt, &buf, &nbytes, &memsize, k+fwidth-4, fwidth, s.len, s.str);
|
||||||
} else { // %.*s
|
} else { // %.*s
|
||||||
_STR_PRINT_ARG(fmt, &buf, &nbytes, &memsize, k+s.len-4, s.len, s.str);
|
_STR_PRINT_ARG(fmt, &buf, &nbytes, &memsize, k+s.len-4, s.len, s.str);
|
||||||
|
|
|
@ -102,6 +102,11 @@ fn test_utf8_string_interpolation() {
|
||||||
e := '\u20AC' // Eurosign
|
e := '\u20AC' // Eurosign
|
||||||
// TODO: this fails with MSVC and tcc
|
// TODO: this fails with MSVC and tcc
|
||||||
// assert '100.00 $e' == '100.00 €'
|
// assert '100.00 $e' == '100.00 €'
|
||||||
|
m2 := 'Москва́' // cyrillic а́: combination of U+0430 and U+0301, UTF-8: d0 b0 cc 81
|
||||||
|
d := 'Antonín Dvořák' // latin á: U+00E1, UTF-8: c3 a1
|
||||||
|
assert ':${m2:7}:${d:-15}:' == ': Москва́:Antonín Dvořák :'
|
||||||
|
g := 'Πελοπόννησος'
|
||||||
|
assert '>${g:-13}<' == '>Πελοπόννησος <'
|
||||||
}
|
}
|
||||||
|
|
||||||
struct S {
|
struct S {
|
||||||
|
|
Loading…
Reference in New Issue