From ce115dcbe07b794a923fa53a22902f6960c7b450 Mon Sep 17 00:00:00 2001 From: zakuro Date: Mon, 1 Mar 2021 08:18:02 +0900 Subject: [PATCH] builtin: correct error underline for unicode wide chars (#9010) --- vlib/builtin/utf8.v | 120 +++++++++++++------- vlib/v/checker/checker.v | 13 ++- vlib/v/checker/tests/error_with_unicode.out | 55 +++++++++ vlib/v/checker/tests/error_with_unicode.vv | 14 +++ vlib/v/util/errors.v | 23 ++-- 5 files changed, 172 insertions(+), 53 deletions(-) create mode 100644 vlib/v/checker/tests/error_with_unicode.out create mode 100644 vlib/v/checker/tests/error_with_unicode.vv diff --git a/vlib/builtin/utf8.v b/vlib/builtin/utf8.v index 6901a2becf..f886060aa5 100644 --- a/vlib/builtin/utf8.v +++ b/vlib/builtin/utf8.v @@ -4,7 +4,7 @@ module builtin pub fn utf8_char_len(b byte) int { - return ((0xe5000000>>((b>>3) & 0x1e)) & 3) + 1 + return ((0xe5000000 >> ((b >> 3) & 0x1e)) & 3) + 1 } // Convert utf32 to utf8 @@ -22,26 +22,28 @@ pub fn utf32_to_str_no_malloc(code u32, buf voidptr) string { mut res := '' unsafe { mut buffer := byteptr(buf) - if icode <= 127 { /* 0x7F */ + if icode <= 127 { + // 0x7F buffer[0] = byte(icode) res = tos(buffer, 1) - } - else if icode <= 2047 { /* 0x7FF */ - buffer[0] = 192 | byte(icode>>6) /* 0xC0 - 110xxxxx */ - buffer[1] = 128 | byte(icode & 63) /* 0x80 - 0x3F - 10xxxxxx */ + } else if icode <= 2047 { + // 0x7FF + buffer[0] = 192 | byte(icode >> 6) // 0xC0 - 110xxxxx + buffer[1] = 128 | byte(icode & 63) // 0x80 - 0x3F - 10xxxxxx res = tos(buffer, 2) - } - else if icode <= 65535 { /* 0xFFFF */ - buffer[0] = 224 | byte(icode>>12)/* 0xE0 - 1110xxxx */ - buffer[1] = 128 | (byte(icode>>6) & 63) /* 0x80 - 0x3F - 10xxxxxx */ - buffer[2] = 128 | byte(icode & 63) /* 0x80 - 0x3F - 10xxxxxx */ + } else if icode <= 65535 { + // 0xFFFF + buffer[0] = 224 | byte(icode >> 12) // 0xE0 - 1110xxxx + buffer[1] = 128 | (byte(icode >> 6) & 63) // 0x80 - 0x3F - 10xxxxxx + buffer[2] = 128 | byte(icode & 63) // 0x80 - 0x3F - 10xxxxxx res 
= tos(buffer, 3) } - else if icode <= 1114111/* 0x10FFFF */ { - buffer[0] = 240 | byte(icode>>18) /* 0xF0 - 11110xxx */ - buffer[1] = 128 | (byte(icode>>12) & 63) /* 0x80 - 0x3F - 10xxxxxx */ - buffer[2] = 128 | (byte(icode>>6) & 63) /* 0x80 - 0x3F - 10xxxxxx */ - buffer[3] = 128 | byte(icode & 63) /* 0x80 - 0x3F - 10xxxxxx */ + // 0x10FFFF + else if icode <= 1114111 { + buffer[0] = 240 | byte(icode >> 18) // 0xF0 - 11110xxx + buffer[1] = 128 | (byte(icode >> 12) & 63) // 0x80 - 0x3F - 10xxxxxx + buffer[2] = 128 | (byte(icode >> 6) & 63) // 0x80 - 0x3F - 10xxxxxx + buffer[3] = 128 | byte(icode & 63) // 0x80 - 0x3F - 10xxxxxx res = tos(buffer, 4) } } @@ -61,12 +63,12 @@ pub fn (_rune string) utf32_code() int { mut b := byte(int(_rune[0])) // TODO should be // res := int( rune[0] << rune.len) - b = b<<_rune.len + b = b << _rune.len mut res := int(b) mut shift := 6 - _rune.len for i := 1; i < _rune.len; i++ { c := int(_rune[i]) - res = res<>= 4 - } - else { + } else { b += 4 } if (x & 12) != 0 { // 0x0C x >>= 2 - } - else { + } else { b += 2 } if (x & 2) == 0 { @@ -103,7 +103,7 @@ fn utf8_str_len(s string) int { mut l := 0 for i := 0; i < s.len; i++ { l++ - c := unsafe {s.str[i]} + c := unsafe { s.str[i] } if (c & (1 << 7)) != 0 { for t := byte(1 << 6); (c & t) != 0; t >>= 1 { i++ @@ -114,12 +114,14 @@ fn utf8_str_len(s string) int { } // Calculate string length for formatting, i.e. number of "characters" -fn utf8_str_visible_length(s string) int { +// This is a simplified implementation. If you need specification-compliant width, +// use utf8.east_asian.display_width. 
+pub fn utf8_str_visible_length(s string) int { mut l := 0 mut ul := 1 - for i := 0; i < s.len; i+=ul { + for i := 0; i < s.len; i += ul { ul = 1 - c := unsafe {s.str[i]} + c := unsafe { s.str[i] } if (c & (1 << 7)) != 0 { for t := byte(1 << 6); (c & t) != 0; t >>= 1 { ul++ @@ -129,24 +131,58 @@ fn utf8_str_visible_length(s string) int { return l } l++ - // recognize combining characters - if c == 0xcc || c == 0xcd { - r := (u16(c) << 8) | unsafe {s.str[i+1]} - if r >= 0xcc80 && r < 0xcdb0 { // diacritical marks - l-- + // recognize combining characters and wide characters + match ul { + 2 { + r := u64((u16(c) << 8) | unsafe { s.str[i + 1] }) + if r >= 0xcc80 && r < 0xcdb0 { + // diacritical marks + l-- + } } - } else if c == 0xe1 || c == 0xe2 || c == 0xef { - r := (u32(c) << 16) | unsafe {(u32(s.str[i+1]) << 8) | s.str[i+2]} - // diacritical marks extended 0xe1aab0 - 0xe1ac80 - // diacritical marks supplement 0xe1b780 - 0xe1b880 - // diacritical marks for symbols 0xe28390 - 0xe28480 - // half marks 0xefb8a0 - 0xefb8b0 - if (r >= 0xe1aab0 && r < 0xe1ac80) - || (r >= 0xe1b780 && r < 0xe1b880) - || (r >= 0xe28390 && r < 0xe28480) - || (r >= 0xefb8a0 && r < 0xefb8b0) { - l-- + 3 { + r := u64((u32(c) << 16) | unsafe { (u32(s.str[i + 1]) << 8) | s.str[i + 2] }) + // diacritical marks extended + // diacritical marks supplement + // diacritical marks for symbols + if (r >= 0xe1aab0 && r <= 0xe1ac7f) + || (r >= 0xe1b780 && r <= 0xe1b87f) + || (r >= 0xe28390 && r <= 0xe2847f) + || (r >= 0xefb8a0 && r <= 0xefb8af) { + // diacritical marks + l-- + } + // Hangul + // CJK Unified Ideographs + // Hangul + // CJK + else if (r >= 0xe18480 && r <= 0xe1859f) + || (r >= 0xe2ba80 && r <= 0xe2bf95) + || (r >= 0xe38080 && r <= 0xe4b77f) + || (r >= 0xe4b880 && r <= 0xea807f) + || (r >= 0xeaa5a0 && r <= 0xeaa79f) + || (r >= 0xeab080 && r <= 0xed9eaf) + || (r >= 0xefa480 && r <= 0xefac7f) + || (r >= 0xefb8b8 && r <= 0xefb9af) { + // wide characters + l++ + } } + 4 { + r := u64((u32(c) << 24) 
| unsafe { + (u32(s.str[i + 1]) << 16) | (u32(s.str[i + 2]) << 8) | s.str[i + 3] + }) + // Enclosed Ideographic Supplement + // Emoji + // CJK Unified Ideographs Extension B-G + if (r >= 0x0f9f8880 && r <= 0xf09f8a8f) + || (r >= 0xf09f8c80 && r <= 0xf09f9c90) + || (r >= 0xf09fa490 && r <= 0xf09fa7af) + || (r >= 0xff0a08080 && r <= 0xf180807f) { + l++ + } + } + else {} } } return l diff --git a/vlib/v/checker/checker.v b/vlib/v/checker/checker.v index c5728bea8e..ae749ce22f 100644 --- a/vlib/v/checker/checker.v +++ b/vlib/v/checker/checker.v @@ -4407,15 +4407,15 @@ fn (mut c Checker) match_exprs(mut node ast.MatchExpr, cond_type_sym table.TypeS for expr in branch.exprs { mut key := '' if expr is ast.RangeExpr { - mut low := 0 - mut high := 0 + mut low := i64(0) + mut high := i64(0) c.expected_type = node.expected_type low_expr := expr.low high_expr := expr.high if low_expr is ast.IntegerLiteral { if high_expr is ast.IntegerLiteral { - low = low_expr.val.int() - high = high_expr.val.int() + low = low_expr.val.i64() + high = high_expr.val.i64() } else { c.error('mismatched range types', low_expr.pos) } @@ -4430,6 +4430,11 @@ fn (mut c Checker) match_exprs(mut node ast.MatchExpr, cond_type_sym table.TypeS typ := c.table.type_to_str(c.expr(expr.low)) c.error('cannot use type `$typ` in match range', branch.pos) } + high_low_cutoff := 1000 + if high - low > high_low_cutoff { + c.warn('more than $high_low_cutoff possibilities ($low ... $high) in match range', + branch.pos) + } for i in low .. 
high + 1 { key = i.str() val := if key in branch_exprs { branch_exprs[key] } else { 0 } diff --git a/vlib/v/checker/tests/error_with_unicode.out b/vlib/v/checker/tests/error_with_unicode.out new file mode 100644 index 0000000000..a521ceff66 --- /dev/null +++ b/vlib/v/checker/tests/error_with_unicode.out @@ -0,0 +1,55 @@ +vlib/v/checker/tests/error_with_unicode.vv:5:17: error: cannot use `int literal` as `string` in argument 2 to `f1` + 3 | + 4 | fn main() { + 5 | f1('🐀🐈', 0) + | ^ + 6 | f2(0, '🐟🐧') + 7 | mut n := 0 +vlib/v/checker/tests/error_with_unicode.vv:6:8: error: cannot use `string` as `int` in argument 2 to `f2` + 4 | fn main() { + 5 | f1('🐀🐈', 0) + 6 | f2(0, '🐟🐧') + | ~~~~~~ + 7 | mut n := 0 + 8 | n = '漢字' +vlib/v/checker/tests/error_with_unicode.vv:8:6: error: cannot assign to `n`: expected `int`, not `string` + 6 | f2(0, '🐟🐧') + 7 | mut n := 0 + 8 | n = '漢字' + | ~~~~~~ + 9 | n = 'ひらがな' + 10 | n = '简体字' +vlib/v/checker/tests/error_with_unicode.vv:9:6: error: cannot assign to `n`: expected `int`, not `string` + 7 | mut n := 0 + 8 | n = '漢字' + 9 | n = 'ひらがな' + | ~~~~~~~~~~ + 10 | n = '简体字' + 11 | n = '繁體字' +vlib/v/checker/tests/error_with_unicode.vv:10:6: error: cannot assign to `n`: expected `int`, not `string` + 8 | n = '漢字' + 9 | n = 'ひらがな' + 10 | n = '简体字' + | ~~~~~~~~ + 11 | n = '繁體字' + 12 | n = '한글' +vlib/v/checker/tests/error_with_unicode.vv:11:6: error: cannot assign to `n`: expected `int`, not `string` + 9 | n = 'ひらがな' + 10 | n = '简体字' + 11 | n = '繁體字' + | ~~~~~~~~ + 12 | n = '한글' + 13 | n = 'Кириллица' +vlib/v/checker/tests/error_with_unicode.vv:12:6: error: cannot assign to `n`: expected `int`, not `string` + 10 | n = '简体字' + 11 | n = '繁體字' + 12 | n = '한글' + | ~~~~~~ + 13 | n = 'Кириллица' + 14 | } +vlib/v/checker/tests/error_with_unicode.vv:13:6: error: cannot assign to `n`: expected `int`, not `string` + 11 | n = '繁體字' + 12 | n = '한글' + 13 | n = 'Кириллица' + | ~~~~~~~~~~~ + 14 | } diff --git a/vlib/v/checker/tests/error_with_unicode.vv 
b/vlib/v/checker/tests/error_with_unicode.vv new file mode 100644 index 0000000000..af3edc4469 --- /dev/null +++ b/vlib/v/checker/tests/error_with_unicode.vv @@ -0,0 +1,14 @@ +fn f1(_ string, _ string) {} +fn f2(_ int, _ int) {} + +fn main() { + f1('🐀🐈', 0) + f2(0, '🐟🐧') + mut n := 0 + n = '漢字' + n = 'ひらがな' + n = '简体字' + n = '繁體字' + n = '한글' + n = 'Кириллица' +} diff --git a/vlib/v/util/errors.v b/vlib/v/util/errors.v index 3b6df3e8f5..ee86fd6383 100644 --- a/vlib/v/util/errors.v +++ b/vlib/v/util/errors.v @@ -4,6 +4,7 @@ module util import os +import strings import term import v.token @@ -132,14 +133,22 @@ pub fn source_context(kind string, source string, column int, pos token.Position // line, so that it prints the ^ character exactly on the *same spot* // where it is needed. That is the reason we can not just // use strings.repeat(` `, col) to form it. - mut pointerline := '' - for bchar in sline[..start_column] { - x := if bchar.is_space() { bchar } else { ` ` } - pointerline += x.ascii_str() + mut pointerline_builder := strings.new_builder(sline.len) + for i := 0; i < start_column; { + if sline[i].is_space() { + pointerline_builder.write_b(sline[i]) + i++ + } else { + char_len := utf8_char_len(sline[i]) + spaces := ' '.repeat(utf8_str_visible_length(sline[i..i + char_len])) + pointerline_builder.write_string(spaces) + i += char_len + } } - underline := if pos.len > 1 { '~'.repeat(end_column - start_column) } else { '^' } - pointerline += bold(color(kind, underline)) - clines << ' | ' + pointerline.replace('\t', tab_spaces) + underline_len := utf8_str_visible_length(sline[start_column..end_column]) + underline := if underline_len > 1 { '~'.repeat(underline_len) } else { '^' } + pointerline_builder.write_string(bold(color(kind, underline))) + clines << ' | ' + pointerline_builder.str().replace('\t', tab_spaces) } } return clines