From f8ed4ab9d0b222198d16f7021763ce98b2e20688 Mon Sep 17 00:00:00 2001
From: StunxFS <56417208+StunxFS@users.noreply.github.com>
Date: Fri, 29 Apr 2022 03:23:57 -0400
Subject: [PATCH] builtin: add `string.len_utf8()` method (#14208)

---
 vlib/builtin/string.v      | 11 +++++++++++
 vlib/builtin/string_test.v |  8 ++++++++
 vlib/builtin/utf8.v        | 24 +++++++++++++-----------
 vlib/v/gen/c/cgen.v        |  2 +-
 4 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/vlib/builtin/string.v b/vlib/builtin/string.v
index eec1194638..5b10dd283b 100644
--- a/vlib/builtin/string.v
+++ b/vlib/builtin/string.v
@@ -295,6 +295,17 @@ pub fn (cp &char) vstring_literal_with_len(len int) string {
 	}
 }
 
+// len_utf8 counts the runes in the string `s`, using each rune's first byte to find its width.
+pub fn (s string) len_utf8() int {
+	mut count := 0
+	mut pos := 0
+	for pos < s.len {
+		pos += ((0xe5000000 >> ((unsafe { s.str[pos] } >> 3) & 0x1e)) & 3) + 1
+		count++
+	}
+	return count
+}
+
 // clone_static returns an independent copy of a given array.
 // It should be used only in -autofree generated code.
 fn (a string) clone_static() string {
diff --git a/vlib/builtin/string_test.v b/vlib/builtin/string_test.v
index 38c0f17202..c22c7fb66b 100644
--- a/vlib/builtin/string_test.v
+++ b/vlib/builtin/string_test.v
@@ -24,6 +24,14 @@ fn test_add() {
 	assert a.ends_with('3')
 }
 
+fn test_len_utf8() {
+	assert 'Vlang'.len_utf8() == 5 // ASCII only: rune count equals byte count
+	assert 'María'.len_utf8() == 5 // 'í' is a single multi-byte rune
+	assert '姓名'.len_utf8() == 2 // CJK: each rune takes 3 bytes
+	assert 'Слово'.len_utf8() == 5 // Cyrillic: multi-byte runes
+	assert 'Λέξη'.len_utf8() == 4 // Greek, including an accented letter
+}
+
 fn test_ends_with() {
 	a := 'browser.v'
 	assert a.ends_with('.v')
diff --git a/vlib/builtin/utf8.v b/vlib/builtin/utf8.v
index dfd3269264..750f0403ab 100644
--- a/vlib/builtin/utf8.v
+++ b/vlib/builtin/utf8.v
@@ -66,6 +66,19 @@ pub fn utf32_decode_to_buffer(code u32, buf &u8) int {
 	return 0
 }
 
+// utf8_str_len returns the number of runes contained in the string `s`.
+// It gives exactly the same result as `s.len_utf8()`, to which it forwards.
+// It is kept only so that existing callers keep compiling during the
+// deprecation period; new code should call `s.len_utf8()` directly.
+[deprecated: 'use `string.len_utf8()` instead']
+[deprecated_after: '2022-05-28']
+pub fn utf8_str_len(s string) int {
+	// Forward to the method instead of repeating its loop here, so that
+	// the rune counting logic has a single implementation and the two
+	// entry points can not drift apart.
+	return s.len_utf8()
+}
+
 // Convert utf8 to utf32
 // the original implementation did not check for
 // valid utf8 in the string, and could result in
@@ -134,17 +147,6 @@ fn utf8_len(c u8) int {
 	return b
 }
 
-// Calculate string length for in number of codepoints
-pub fn utf8_str_len(s string) int {
-	mut l := 0
-	mut i := 0
-	for i < s.len {
-		l++
-		i += ((0xe5000000 >> ((unsafe { s.str[i] } >> 3) & 0x1e)) & 3) + 1
-	}
-	return l
-}
-
 // Calculate string length for formatting, i.e. number of "characters"
 // This is simplified implementation. if you need specification compliant width,
 // use utf8.east_asian.display_width.
diff --git a/vlib/v/gen/c/cgen.v b/vlib/v/gen/c/cgen.v
index cba58fe7c4..a4b20fe684 100644
--- a/vlib/v/gen/c/cgen.v
+++ b/vlib/v/gen/c/cgen.v
@@ -3143,7 +3143,7 @@ fn (mut g Gen) char_literal(node ast.CharLiteral) {
 		return
 	}
 	// TODO: optimize use L-char instead of u32 when possible
-	if utf8_str_len(node.val) < node.val.len {
+	if node.val.len_utf8() < node.val.len {
 		g.write('((rune)0x$node.val.utf32_code().hex() /* `$node.val` */)')
 		return
 	}