From f8ed4ab9d0b222198d16f7021763ce98b2e20688 Mon Sep 17 00:00:00 2001 From: StunxFS <56417208+StunxFS@users.noreply.github.com> Date: Fri, 29 Apr 2022 03:23:57 -0400 Subject: [PATCH] builtin: add `string.len_utf8()` method (#14208) --- vlib/builtin/string.v | 11 +++++++++++ vlib/builtin/string_test.v | 8 ++++++++ vlib/builtin/utf8.v | 24 +++++++++++++----------- vlib/v/gen/c/cgen.v | 2 +- 4 files changed, 33 insertions(+), 12 deletions(-) diff --git a/vlib/builtin/string.v b/vlib/builtin/string.v index eec1194638..5b10dd283b 100644 --- a/vlib/builtin/string.v +++ b/vlib/builtin/string.v @@ -295,6 +295,17 @@ pub fn (cp &char) vstring_literal_with_len(len int) string { } } +// len_utf8 returns the number of runes contained in the string `s`. +pub fn (s string) len_utf8() int { + mut l := 0 + mut i := 0 + for i < s.len { + l++ + i += ((0xe5000000 >> ((unsafe { s.str[i] } >> 3) & 0x1e)) & 3) + 1 + } + return l +} + // clone_static returns an independent copy of a given array. // It should be used only in -autofree generated code. fn (a string) clone_static() string { diff --git a/vlib/builtin/string_test.v b/vlib/builtin/string_test.v index 38c0f17202..c22c7fb66b 100644 --- a/vlib/builtin/string_test.v +++ b/vlib/builtin/string_test.v @@ -24,6 +24,14 @@ fn test_add() { assert a.ends_with('3') } +fn test_len_utf8() { + assert 'Vlang'.len_utf8() == 5 + assert 'María'.len_utf8() == 5 + assert '姓名'.len_utf8() == 2 + assert 'Слово'.len_utf8() == 5 + assert 'Λέξη'.len_utf8() == 4 +} + fn test_ends_with() { a := 'browser.v' assert a.ends_with('.v') diff --git a/vlib/builtin/utf8.v b/vlib/builtin/utf8.v index dfd3269264..750f0403ab 100644 --- a/vlib/builtin/utf8.v +++ b/vlib/builtin/utf8.v @@ -66,6 +66,19 @@ pub fn utf32_decode_to_buffer(code u32, buf &u8) int { return 0 } +// utf8_str_len returns the number of runes contained in the string. +[deprecated: 'use `string.len_utf8()` instead'] +[deprecated_after: '2022-05-28'] +pub fn utf8_str_len(s string) int { + mut l := 0 + mut i := 0 + for i < s.len { + l++ + i += ((0xe5000000 >> ((unsafe { s.str[i] } >> 3) & 0x1e)) & 3) + 1 + } + return l +} + // Convert utf8 to utf32 // the original implementation did not check for // valid utf8 in the string, and could result in @@ -134,17 +147,6 @@ fn utf8_len(c u8) int { return b } -// Calculate string length for in number of codepoints -pub fn utf8_str_len(s string) int { - mut l := 0 - mut i := 0 - for i < s.len { - l++ - i += ((0xe5000000 >> ((unsafe { s.str[i] } >> 3) & 0x1e)) & 3) + 1 - } - return l -} - // Calculate string length for formatting, i.e. number of "characters" // This is simplified implementation. if you need specification compliant width, // use utf8.east_asian.display_width. diff --git a/vlib/v/gen/c/cgen.v b/vlib/v/gen/c/cgen.v index cba58fe7c4..a4b20fe684 100644 --- a/vlib/v/gen/c/cgen.v +++ b/vlib/v/gen/c/cgen.v @@ -3143,7 +3143,7 @@ fn (mut g Gen) char_literal(node ast.CharLiteral) { return } // TODO: optimize use L-char instead of u32 when possible - if utf8_str_len(node.val) < node.val.len { + if node.val.len_utf8() < node.val.len { g.write('((rune)0x$node.val.utf32_code().hex() /* `$node.val` */)') return }