builtin: correct error underline for unicode wide chars (#9010)

pull/9046/head
zakuro 2021-03-01 08:18:02 +09:00 committed by GitHub
parent e937d6249c
commit ce115dcbe0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 172 additions and 53 deletions

View File

@ -4,7 +4,7 @@
module builtin
pub fn utf8_char_len(b byte) int {
return ((0xe5000000>>((b>>3) & 0x1e)) & 3) + 1
return ((0xe5000000 >> ((b >> 3) & 0x1e)) & 3) + 1
}
// Convert utf32 to utf8
@ -22,26 +22,28 @@ pub fn utf32_to_str_no_malloc(code u32, buf voidptr) string {
mut res := ''
unsafe {
mut buffer := byteptr(buf)
if icode <= 127 { /* 0x7F */
if icode <= 127 {
// 0x7F
buffer[0] = byte(icode)
res = tos(buffer, 1)
}
else if icode <= 2047 { /* 0x7FF */
buffer[0] = 192 | byte(icode>>6) /* 0xC0 - 110xxxxx */
buffer[1] = 128 | byte(icode & 63) /* 0x80 - 0x3F - 10xxxxxx */
} else if icode <= 2047 {
// 0x7FF
buffer[0] = 192 | byte(icode >> 6) // 0xC0 - 110xxxxx
buffer[1] = 128 | byte(icode & 63) // 0x80 - 0x3F - 10xxxxxx
res = tos(buffer, 2)
}
else if icode <= 65535 { /* 0xFFFF */
buffer[0] = 224 | byte(icode>>12)/* 0xE0 - 1110xxxx */
buffer[1] = 128 | (byte(icode>>6) & 63) /* 0x80 - 0x3F - 10xxxxxx */
buffer[2] = 128 | byte(icode & 63) /* 0x80 - 0x3F - 10xxxxxx */
} else if icode <= 65535 {
// 0xFFFF
buffer[0] = 224 | byte(icode >> 12) // 0xE0 - 1110xxxx
buffer[1] = 128 | (byte(icode >> 6) & 63) // 0x80 - 0x3F - 10xxxxxx
buffer[2] = 128 | byte(icode & 63) // 0x80 - 0x3F - 10xxxxxx
res = tos(buffer, 3)
}
else if icode <= 1114111/* 0x10FFFF */ {
buffer[0] = 240 | byte(icode>>18) /* 0xF0 - 11110xxx */
buffer[1] = 128 | (byte(icode>>12) & 63) /* 0x80 - 0x3F - 10xxxxxx */
buffer[2] = 128 | (byte(icode>>6) & 63) /* 0x80 - 0x3F - 10xxxxxx */
buffer[3] = 128 | byte(icode & 63) /* 0x80 - 0x3F - 10xxxxxx */
// 0x10FFFF
else if icode <= 1114111 {
buffer[0] = 240 | byte(icode >> 18) // 0xF0 - 11110xxx
buffer[1] = 128 | (byte(icode >> 12) & 63) // 0x80 - 0x3F - 10xxxxxx
buffer[2] = 128 | (byte(icode >> 6) & 63) // 0x80 - 0x3F - 10xxxxxx
buffer[3] = 128 | byte(icode & 63) // 0x80 - 0x3F - 10xxxxxx
res = tos(buffer, 4)
}
}
@ -61,12 +63,12 @@ pub fn (_rune string) utf32_code() int {
mut b := byte(int(_rune[0]))
// TODO should be
// res := int( rune[0] << rune.len)
b = b<<_rune.len
b = b << _rune.len
mut res := int(b)
mut shift := 6 - _rune.len
for i := 1; i < _rune.len; i++ {
c := int(_rune[i])
res = res<<shift
res = res << shift
res |= c & 63 // 0x3f
shift = 6
}
@ -80,15 +82,13 @@ fn utf8_len(c byte) int {
if (x & 240) != 0 {
// 0xF0
x >>= 4
}
else {
} else {
b += 4
}
if (x & 12) != 0 {
// 0x0C
x >>= 2
}
else {
} else {
b += 2
}
if (x & 2) == 0 {
@ -103,7 +103,7 @@ fn utf8_str_len(s string) int {
mut l := 0
for i := 0; i < s.len; i++ {
l++
c := unsafe {s.str[i]}
c := unsafe { s.str[i] }
if (c & (1 << 7)) != 0 {
for t := byte(1 << 6); (c & t) != 0; t >>= 1 {
i++
@ -114,12 +114,14 @@ fn utf8_str_len(s string) int {
}
// Calculate string length for formatting, i.e. number of "characters"
fn utf8_str_visible_length(s string) int {
// This is a simplified implementation. If you need a specification-compliant width,
// use utf8.east_asian.display_width.
pub fn utf8_str_visible_length(s string) int {
mut l := 0
mut ul := 1
for i := 0; i < s.len; i+=ul {
for i := 0; i < s.len; i += ul {
ul = 1
c := unsafe {s.str[i]}
c := unsafe { s.str[i] }
if (c & (1 << 7)) != 0 {
for t := byte(1 << 6); (c & t) != 0; t >>= 1 {
ul++
@ -129,24 +131,58 @@ fn utf8_str_visible_length(s string) int {
return l
}
l++
// recognize combining characters
if c == 0xcc || c == 0xcd {
r := (u16(c) << 8) | unsafe {s.str[i+1]}
if r >= 0xcc80 && r < 0xcdb0 { // diacritical marks
l--
// recognize combining characters and wide characters
match ul {
2 {
r := u64((u16(c) << 8) | unsafe { s.str[i + 1] })
if r >= 0xcc80 && r < 0xcdb0 {
// diacritical marks
l--
}
}
} else if c == 0xe1 || c == 0xe2 || c == 0xef {
r := (u32(c) << 16) | unsafe {(u32(s.str[i+1]) << 8) | s.str[i+2]}
// diacritical marks extended 0xe1aab0 - 0xe1ac80
// diacritical marks supplement 0xe1b780 - 0xe1b880
// diacritical marks for symbols 0xe28390 - 0xe28480
// half marks 0xefb8a0 - 0xefb8b0
if (r >= 0xe1aab0 && r < 0xe1ac80)
|| (r >= 0xe1b780 && r < 0xe1b880)
|| (r >= 0xe28390 && r < 0xe28480)
|| (r >= 0xefb8a0 && r < 0xefb8b0) {
l--
3 {
// Pack the 3-byte sequence into a single integer (lead byte in bits 16..23)
// so the checks below can compare against encoded byte ranges directly.
r := u64((u32(c) << 16) | unsafe { (u32(s.str[i + 1]) << 8) | s.str[i + 2] })
// Combining characters: undo the `l++` above so that they add no width.
//   0xe1aab0..0xe1ac7f  diacritical marks extended
//   0xe1b780..0xe1b87f  diacritical marks supplement
//   0xe28390..0xe2847f  diacritical marks for symbols
//   0xefb8a0..0xefb8af  half marks
if (r >= 0xe1aab0 && r <= 0xe1ac7f)
|| (r >= 0xe1b780 && r <= 0xe1b87f)
|| (r >= 0xe28390 && r <= 0xe2847f)
|| (r >= 0xefb8a0 && r <= 0xefb8af) {
l--
}
else if (r >= 0xe18480 && r <= 0xe1859f)
|| (r >= 0xe2ba80 && r <= 0xe2bf95)
|| (r >= 0xe38080 && r <= 0xe4b77f)
|| (r >= 0xe4b880 && r <= 0xea807f)
|| (r >= 0xeaa5a0 && r <= 0xeaa79f)
|| (r >= 0xeab080 && r <= 0xed9eaf)
|| (r >= 0xefa480 && r <= 0xefac7f)
|| (r >= 0xefb8b8 && r <= 0xefb9af) {
// Wide characters (Hangul and CJK ranges, including CJK Unified
// Ideographs): count one extra column in addition to the `l++` above.
// NOTE(review): which literal range corresponds to which named block
// is not spelled out here; confirm against the Unicode block tables.
l++
}
}
4 {
	// Pack the 4-byte sequence into a single integer (lead byte in bits 24..31)
	// so the checks below can compare against encoded byte ranges directly.
	r := u64((u32(c) << 24) | unsafe {
		(u32(s.str[i + 1]) << 16) | (u32(s.str[i + 2]) << 8) | s.str[i + 3]
	})
	// Wide characters: count one extra column in addition to the `l++` above.
	//   Enclosed Ideographic Supplement
	//   Emoji
	//   CJK Unified Ideographs Extension B-G
	// Every bound must be an 8-hex-digit value starting with the 4-byte lead
	// byte (0xf0../0xf1..). The previous first lower bound had its leading
	// nibbles transposed (0x0f...), which made the range match every 4-byte
	// sequence up to 0xf09f8a8f, and the previous last lower bound had nine
	// hex digits, which made that range unreachable.
	if (r >= 0xf09f8880 && r <= 0xf09f8a8f)
		|| (r >= 0xf09f8c80 && r <= 0xf09f9c90)
		|| (r >= 0xf09fa490 && r <= 0xf09fa7af)
		|| (r >= 0xf0a08080 && r <= 0xf180807f) {
		l++
	}
}
else {}
}
}
return l

View File

@ -4407,15 +4407,15 @@ fn (mut c Checker) match_exprs(mut node ast.MatchExpr, cond_type_sym table.TypeS
for expr in branch.exprs {
mut key := ''
if expr is ast.RangeExpr {
mut low := 0
mut high := 0
mut low := i64(0)
mut high := i64(0)
c.expected_type = node.expected_type
low_expr := expr.low
high_expr := expr.high
if low_expr is ast.IntegerLiteral {
if high_expr is ast.IntegerLiteral {
low = low_expr.val.int()
high = high_expr.val.int()
low = low_expr.val.i64()
high = high_expr.val.i64()
} else {
c.error('mismatched range types', low_expr.pos)
}
@ -4430,6 +4430,11 @@ fn (mut c Checker) match_exprs(mut node ast.MatchExpr, cond_type_sym table.TypeS
typ := c.table.type_to_str(c.expr(expr.low))
c.error('cannot use type `$typ` in match range', branch.pos)
}
high_low_cutoff := 1000
if high - low > high_low_cutoff {
c.warn('more than $high_low_cutoff possibilities ($low ... $high) in match range',
branch.pos)
}
for i in low .. high + 1 {
key = i.str()
val := if key in branch_exprs { branch_exprs[key] } else { 0 }

View File

@ -0,0 +1,55 @@
vlib/v/checker/tests/error_with_unicode.vv:5:17: error: cannot use `int literal` as `string` in argument 2 to `f1`
3 |
4 | fn main() {
5 | f1('🐀🐈', 0)
| ^
6 | f2(0, '🐟🐧')
7 | mut n := 0
vlib/v/checker/tests/error_with_unicode.vv:6:8: error: cannot use `string` as `int` in argument 2 to `f2`
4 | fn main() {
5 | f1('🐀🐈', 0)
6 | f2(0, '🐟🐧')
| ~~~~~~
7 | mut n := 0
8 | n = '漢字'
vlib/v/checker/tests/error_with_unicode.vv:8:6: error: cannot assign to `n`: expected `int`, not `string`
6 | f2(0, '🐟🐧')
7 | mut n := 0
8 | n = '漢字'
| ~~~~~~
9 | n = 'ひらがな'
10 | n = '简体字'
vlib/v/checker/tests/error_with_unicode.vv:9:6: error: cannot assign to `n`: expected `int`, not `string`
7 | mut n := 0
8 | n = '漢字'
9 | n = 'ひらがな'
| ~~~~~~~~~~
10 | n = '简体字'
11 | n = '繁體字'
vlib/v/checker/tests/error_with_unicode.vv:10:6: error: cannot assign to `n`: expected `int`, not `string`
8 | n = '漢字'
9 | n = 'ひらがな'
10 | n = '简体字'
| ~~~~~~~~
11 | n = '繁體字'
12 | n = '한글'
vlib/v/checker/tests/error_with_unicode.vv:11:6: error: cannot assign to `n`: expected `int`, not `string`
9 | n = 'ひらがな'
10 | n = '简体字'
11 | n = '繁體字'
| ~~~~~~~~
12 | n = '한글'
13 | n = 'Кириллица'
vlib/v/checker/tests/error_with_unicode.vv:12:6: error: cannot assign to `n`: expected `int`, not `string`
10 | n = '简体字'
11 | n = '繁體字'
12 | n = '한글'
| ~~~~~~
13 | n = 'Кириллица'
14 | }
vlib/v/checker/tests/error_with_unicode.vv:13:6: error: cannot assign to `n`: expected `int`, not `string`
11 | n = '繁體字'
12 | n = '한글'
13 | n = 'Кириллица'
| ~~~~~~~~~~~
14 | }

View File

@ -0,0 +1,14 @@
fn f1(_ string, _ string) {}
fn f2(_ int, _ int) {}
// Checker fixture: every statement below is a deliberate type error whose
// offending expression contains multi-byte (and mostly double-width)
// characters, so the expected output can pin where the underline is drawn.
fn main() {
	f1('🐀🐈', 0)
	f2(0, '🐟🐧')
	mut n := 0
	n = '漢字'
	n = 'ひらがな'
	n = '简体字'
	n = '繁體字'
	n = '한글'
	n = 'Кириллица'
}

View File

@ -4,6 +4,7 @@
module util
import os
import strings
import term
import v.token
@ -132,14 +133,22 @@ pub fn source_context(kind string, source string, column int, pos token.Position
// line, so that it prints the ^ character exactly on the *same spot*
// where it is needed. That is the reason we can not just
// use strings.repeat(` `, col) to form it.
mut pointerline := ''
for bchar in sline[..start_column] {
x := if bchar.is_space() { bchar } else { ` ` }
pointerline += x.ascii_str()
mut pointerline_builder := strings.new_builder(sline.len)
for i := 0; i < start_column; {
if sline[i].is_space() {
pointerline_builder.write_b(sline[i])
i++
} else {
char_len := utf8_char_len(sline[i])
spaces := ' '.repeat(utf8_str_visible_length(sline[i..i + char_len]))
pointerline_builder.write_string(spaces)
i += char_len
}
}
underline := if pos.len > 1 { '~'.repeat(end_column - start_column) } else { '^' }
pointerline += bold(color(kind, underline))
clines << ' | ' + pointerline.replace('\t', tab_spaces)
underline_len := utf8_str_visible_length(sline[start_column..end_column])
underline := if underline_len > 1 { '~'.repeat(underline_len) } else { '^' }
pointerline_builder.write_string(bold(color(kind, underline)))
clines << ' | ' + pointerline_builder.str().replace('\t', tab_spaces)
}
}
return clines