From 9aabf222feec7034493f6ad231964da408b85ca7 Mon Sep 17 00:00:00 2001 From: penguindark <57967770+penguindark@users.noreply.github.com> Date: Mon, 5 Apr 2021 20:28:21 +0200 Subject: [PATCH] encoding.utf8: fix a bug in up_low (#9610) --- vlib/encoding/utf8/utf8_util.v | 32 ++++++++++++++++++++++++----- vlib/encoding/utf8/utf8_util_test.v | 3 +++ 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/vlib/encoding/utf8/utf8_util.v b/vlib/encoding/utf8/utf8_util.v index 7c94686a57..fe9e99a7a2 100644 --- a/vlib/encoding/utf8/utf8_util.v +++ b/vlib/encoding/utf8/utf8_util.v @@ -216,7 +216,7 @@ fn up_low(s string, upper_flag bool) string { //C.printf("len: %d code: %04x ",ch_len,res) ch_index := find_char_in_table(u16(res), upper_flag) - //C.printf(" utf8 index: %d ",ch_index) + //C.printf(" utf8 index: %d \n",ch_index) // char not in table, no need of conversion if ch_index == 0 { @@ -307,11 +307,11 @@ fn find_char_in_table( in_code u16, upper_flag bool) int { mut index := 0 mut x := u16(0) - mut offset:=0 // up to low + mut offset:=0 // up to low mut i_step:=1 // up to low if upper_flag==true { - offset=1 // low to up - i_step=0 // low to up + offset=1 // low to up + i_step=0 // low to up } //C.printf("looking for [%04x] in (%d..%d).\n",in_code,first_index,last_index) @@ -335,7 +335,29 @@ fn find_char_in_table( in_code u16, upper_flag bool) int { break } } - //C.printf("not found.\n") + //C.printf("not found.\n %d %04x",index, unicode_con_table_up_to_low[ (index<<1)+offset ] ) + // the low to up is not full sorted for different reasons, + // we must try a linear search in the surroundings + if upper_flag { + search_radius := 30 * 2 + max_index := unicode_con_table_up_to_low.len >> 1 + mut index1 := index + search_radius + + if index1 > max_index { + index1 = max_index + } + index = index - search_radius + if index < 0 { + index = 0 + } + for index < index1 { + if unicode_con_table_up_to_low[ (index << 1) + 1 ] == in_code { + return (index << 1) + } + index++ + } + } + //eprintln("NOT FOUND!!") return 0 } diff --git a/vlib/encoding/utf8/utf8_util_test.v b/vlib/encoding/utf8/utf8_util_test.v index 208fdc8674..646bd69a7f 100644 --- a/vlib/encoding/utf8/utf8_util_test.v +++ b/vlib/encoding/utf8/utf8_util_test.v @@ -10,6 +10,9 @@ fn test_utf8_util() { lower:=utf8.to_lower(src) assert upper==src_upper assert lower==src_lower + + assert utf8.to_upper('абвёabc12{') == 'АБВЁABC12{' + assert utf8.to_lower('АБВЁABC12{') == 'абвёabc12{' // ustring test src1:=src.ustring()