From f831910c5cc536d3f68aa2f919d29935a187c47d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kr=C3=BCger?= <45282134+UweKrueger@users.noreply.github.com> Date: Mon, 4 May 2020 13:21:11 +0200 Subject: [PATCH] utf: fix string length calculation for combining characters --- vlib/builtin/utf8.v | 37 +++++++++++++++++++++++- vlib/v/gen/str.v | 4 +-- vlib/v/tests/string_interpolation_test.v | 5 ++++ 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/vlib/builtin/utf8.v b/vlib/builtin/utf8.v index dbf6fb349b..545c99c23c 100644 --- a/vlib/builtin/utf8.v +++ b/vlib/builtin/utf8.v @@ -174,7 +174,7 @@ fn utf8_len(c byte) int { return b } -// Calculate string length for formatting, i.e. number of "characters" +// Calculate string length in number of codepoints fn utf8_str_len(s string) int { mut l := 0 for i := 0; i < s.len; i++ { @@ -189,6 +189,41 @@ fn utf8_str_len(s string) int { return l } +// Calculate string length for formatting, i.e. number of "characters" +fn utf8_str_visible_length(s string) int { + mut l := 0 + mut ul := 1 + for i := 0; i < s.len; i+=ul { + ul = 1 + c := s.str[i] + if (c & (1 << 7)) != 0 { + for t := byte(1 << 6); (c & t) != 0; t >>= 1 { + ul++ + } + } + if i + ul > s.len { // incomplete UTF-8 sequence + return l + } + l++ + // recognize combining characters + if c == 0xcc || c == 0xcd { + r := (u16(c) << 8) | s.str[i+1] + if r >= 0xcc80 && r < 0xcdb0 { // diacritical marks + l-- + } + } else if c == 0xe1 || c == 0xe2 || c == 0xef { + r := (u32(c) << 16) | (u32(s.str[i+1]) << 8) | s.str[i+2] + if r >= 0xe1aab0 && r < 0xe1ac80 // diacritical marks extended + || r >= 0xe1b780 && r < 0xe1b880 // diacritical marks supplement + || r >= 0xe28390 && r < 0xe28480 // diacritical marks for symbols + || r >= 0xefb8a0 && r < 0xefb8b0 { // half marks + l-- + } + } + } + return l +} + // Reads an utf8 character from standard input pub fn utf8_getchar() int { c := C.getchar() diff --git a/vlib/v/gen/str.v b/vlib/v/gen/str.v index 
9e7a5d662a..820e4377d7 100644 --- a/vlib/v/gen/str.v +++ b/vlib/v/gen/str.v @@ -57,9 +57,9 @@ string _STR(const char *fmt, int nfmts, ...) { if (fmt[k-4] == '*') { // %*.*s int fwidth = va_arg(argptr, int); if (fwidth < 0) - fwidth -= (s.len - utf8_str_len(s)); + fwidth -= (s.len - utf8_str_visible_length(s)); else - fwidth += (s.len - utf8_str_len(s)); + fwidth += (s.len - utf8_str_visible_length(s)); _STR_PRINT_ARG(fmt, &buf, &nbytes, &memsize, k+fwidth-4, fwidth, s.len, s.str); } else { // %.*s _STR_PRINT_ARG(fmt, &buf, &nbytes, &memsize, k+s.len-4, s.len, s.str); diff --git a/vlib/v/tests/string_interpolation_test.v b/vlib/v/tests/string_interpolation_test.v index 4a14b35c64..e62fcefb77 100644 --- a/vlib/v/tests/string_interpolation_test.v +++ b/vlib/v/tests/string_interpolation_test.v @@ -102,6 +102,11 @@ fn test_utf8_string_interpolation() { e := '\u20AC' // Eurosign // TODO: this fails with MSVC and tcc // assert '100.00 $e' == '100.00 €' + m2 := 'Москва́' // cyrillic а́: combination of U+0430 and U+0301, UTF-8: d0 b0 cc 81 + d := 'Antonín Dvořák' // latin á: U+00E1, UTF-8: c3 a1 + assert ':${m2:7}:${d:-15}:' == ': Москва́:Antonín Dvořák :' + g := 'Πελοπόννησος' + assert '>${g:-13}<' == '>Πελοπόννησος <' } struct S {