encoding.utf8: fix a bug in up_low (#9610)

2021-04-05 20:28:21 +02:00 · 2021-04-05 20:28:21 +02:00 · 9aabf222fe
parent d11fb8497a
commit 9aabf222fe
2 changed files with 30 additions and 5 deletions
--- a/vlib/encoding/utf8/utf8_util.v
+++ b/vlib/encoding/utf8/utf8_util.v
@ -216,7 +216,7 @@ fn up_low(s string, upper_flag bool) string {

 			//C.printf("len: %d code: %04x ",ch_len,res)
 			ch_index := find_char_in_table(u16(res), upper_flag)
-			//C.printf(" utf8 index: %d ",ch_index)
+			//C.printf(" utf8 index: %d \n",ch_index)

 			// char not in table, no need of conversion
 			if ch_index == 0 {
@ -335,7 +335,29 @@ fn find_char_in_table( in_code u16, upper_flag bool) int {
 			break
 		}
 	}
-	//C.printf("not found.\n")
+	//C.printf("not found.\n %d %04x",index, unicode_con_table_up_to_low[ (index<<1)+offset ] )
+	// The low-to-up mapping is not fully sorted (for various reasons), so the
+	// lookup above can miss an entry: fall back to a linear search of the surroundings.
+	if upper_flag {
+		search_radius := 30 * 2
+		max_index     := unicode_con_table_up_to_low.len >> 1
+		mut index1    := index + search_radius
+		
+		if index1 > max_index {
+			index1 = max_index
+		}
+		index = index - search_radius
+		if index < 0 {
+			index = 0
+		}
+		for index < index1 {
+			if unicode_con_table_up_to_low[ (index << 1) + 1 ] == in_code {
+				return (index << 1)
+			}
+			index++
+		}
+	}
+	//eprintln("NOT FOUND!!")
 	return 0
 }

--- a/vlib/encoding/utf8/utf8_util_test.v
+++ b/vlib/encoding/utf8/utf8_util_test.v
@ -11,6 +11,9 @@ fn test_utf8_util() {
 	assert upper==src_upper
 	assert lower==src_lower
 	
+	assert utf8.to_upper('абвёabc12｛') == 'АБВЁABC12｛'
+	assert utf8.to_lower('АБВЁABC12｛') == 'абвёabc12｛'
+
 	// ustring test
 	src1:=src.ustring()
 	upper1:=utf8.u_to_upper(src1)