diff --git a/vlib/builtin/rune.v b/vlib/builtin/rune.v
index d08f1f6f30..28e8b4ba66 100644
--- a/vlib/builtin/rune.v
+++ b/vlib/builtin/rune.v
@@ -53,3 +53,24 @@ pub fn (c rune) repeat(count int) string {
 	res := unsafe { utf32_to_str_no_malloc(u32(c), &buffer[0]) }
 	return res.repeat(count)
 }
+
+// length_in_bytes returns the number of bytes needed to encode the code point `c` in UTF-8.
+// It returns -1 when `c` is not a valid code point to encode, i.e. when it lies in the
+// surrogate range 0xD800..0xDFFF, or when it is greater than 0x10FFFF.
+pub fn (c rune) length_in_bytes() int {
+	code := u32(c)
+	if code <= 0x7F {
+		return 1
+	} else if code <= 0x7FF {
+		return 2
+	} else if 0xD800 <= code && code <= 0xDFFF {
+		// between min and max for surrogates; must be checked before the 3 byte range
+		return -1
+	} else if code <= 0xFFFF {
+		return 3
+	} else if code <= 0x10FFFF {
+		// 0x10FFFF is the maximum valid unicode code point
+		return 4
+	}
+	return -1
+}
diff --git a/vlib/builtin/rune_test.v b/vlib/builtin/rune_test.v
index 6b7e98786f..271c01eb76 100644
--- a/vlib/builtin/rune_test.v
+++ b/vlib/builtin/rune_test.v
@@ -11,3 +11,31 @@ fn test_repeat() {
 	assert r1.repeat(0) == ''
 	assert r2.repeat(0) == ''
 }
+
+fn test_length_in_bytes() {
+	// 1 byte: 0x0..0x7F
+	assert rune(0x0).length_in_bytes() == 1
+	assert `A`.length_in_bytes() == 1 // latin letter
+	assert rune(0x7F).length_in_bytes() == 1
+	// 2 bytes: 0x80..0x7FF
+	assert rune(0x80).length_in_bytes() == 2
+	assert `Д`.length_in_bytes() == 2 // cyrillic letter
+	assert rune(0x7FF).length_in_bytes() == 2
+	// 3 bytes: 0x800..0xFFFF, excluding the surrogates
+	assert rune(0x800).length_in_bytes() == 3
+	assert `喂`.length_in_bytes() == 3 // hey
+	assert rune(0xD7FF).length_in_bytes() == 3 // last code point before the surrogates
+	assert rune(0xE000).length_in_bytes() == 3 // first code point after the surrogates
+	assert rune(0xFFFF).length_in_bytes() == 3
+	// surrogates: 0xD800..0xDFFF
+	assert rune(0xD800).length_in_bytes() == -1 // min for surrogates
+	assert rune(0xD866).length_in_bytes() == -1 // invalid
+	assert rune(0xDFFF).length_in_bytes() == -1 // max for surrogates
+	// 4 bytes: 0x10000..0x10FFFF
+	assert rune(0x10000).length_in_bytes() == 4 // first code point that needs 4 bytes
+	assert rune(0x100000).length_in_bytes() == 4
+	assert rune(0x10FFD7).length_in_bytes() == 4 // "Supplementary Private Use Area-B" ¯\_(ツ)_/¯
+	assert rune(0x10FFFF).length_in_bytes() == 4
+	// above the maximum valid unicode code point
+	assert rune(0x110000).length_in_bytes() == -1
+}