builtin: add a non-allocating rune.length_in_bytes() method and tests for it

pull/12647/head
Delyan Angelov 2021-12-01 16:35:13 +02:00
parent b3aedff3f8
commit d7bc2a88f7
No known key found for this signature in database
GPG Key ID: 66886C0F12D595ED
2 changed files with 42 additions and 0 deletions
vlib/builtin

View File

@ -53,3 +53,21 @@ pub fn (c rune) repeat(count int) string {
res := unsafe { utf32_to_str_no_malloc(u32(c), &buffer[0]) } res := unsafe { utf32_to_str_no_malloc(u32(c), &buffer[0]) }
return res.repeat(count) return res.repeat(count)
} }
// length_in_bytes returns how many bytes (1 to 4) the rune `c` occupies in its UTF-8 encoding.
// It returns -1 when `c` is a surrogate (0xD800..0xDFFF) or lies above 0x10FFFF,
// the maximum valid unicode code point.
pub fn (c rune) length_in_bytes() int {
	cp := u32(c)
	// Reject everything that has no valid encoding first.
	is_surrogate := cp >= 0xD800 && cp <= 0xDFFF
	if is_surrogate || cp > 0x10FFFF {
		return -1
	}
	// From here on, `cp` is a valid code point; pick the width by range.
	if cp < 0x80 {
		return 1
	}
	if cp < 0x800 {
		return 2
	}
	if cp < 0x10000 {
		return 3
	}
	return 4
}

View File

@ -11,3 +11,27 @@ fn test_repeat() {
assert r1.repeat(0) == '' assert r1.repeat(0) == ''
assert r2.repeat(0) == '' assert r2.repeat(0) == ''
} }
// test_length_in_bytes checks rune.length_in_bytes() at the lower and upper boundary
// of every encoded width, for the surrogate range, and for a value past 0x10FFFF.
fn test_length_in_bytes() {
	// 1 byte: 0x0 ..= 0x7F
	assert rune(0x0).length_in_bytes() == 1
	assert `A`.length_in_bytes() == 1 // latin letter
	assert rune(0x7F).length_in_bytes() == 1
	// 2 bytes: 0x80 ..= 0x7FF
	assert rune(0x80).length_in_bytes() == 2
	assert `Д`.length_in_bytes() == 2 // cyrillic letter
	assert rune(0x7FF).length_in_bytes() == 2
	// 3 bytes: 0x800 ..= 0xFFFF (surrogates excluded, see below)
	assert rune(0x800).length_in_bytes() == 3
	assert `喂`.length_in_bytes() == 3 // hey (U+5582)
	assert rune(0xFFFF).length_in_bytes() == 3
	// surrogates have no valid encoding: -1
	assert rune(0xD800).length_in_bytes() == -1 // min for surrogates
	assert rune(0xD866).length_in_bytes() == -1 // invalid
	assert rune(0xDFFF).length_in_bytes() == -1 // max for surrogates
	// 4 bytes: up to 0x10FFFF
	assert rune(0x100000).length_in_bytes() == 4
	assert rune(0x10FFD7).length_in_bytes() == 4 // "Supplementary Private Use Area-B" ¯\_(ツ)_/¯
	assert rune(0x10FFFF).length_in_bytes() == 4
	// past the maximum code point: -1
	assert rune(0x110000).length_in_bytes() == -1
}