builtin: correct error underline for unicode wide chars (#9010)
parent
e937d6249c
commit
ce115dcbe0
|
@ -4,7 +4,7 @@
|
|||
module builtin
|
||||
|
||||
pub fn utf8_char_len(b byte) int {
|
||||
return ((0xe5000000>>((b>>3) & 0x1e)) & 3) + 1
|
||||
return ((0xe5000000 >> ((b >> 3) & 0x1e)) & 3) + 1
|
||||
}
|
||||
|
||||
// Convert utf32 to utf8
|
||||
|
@ -22,26 +22,28 @@ pub fn utf32_to_str_no_malloc(code u32, buf voidptr) string {
|
|||
mut res := ''
|
||||
unsafe {
|
||||
mut buffer := byteptr(buf)
|
||||
if icode <= 127 { /* 0x7F */
|
||||
if icode <= 127 {
|
||||
// 0x7F
|
||||
buffer[0] = byte(icode)
|
||||
res = tos(buffer, 1)
|
||||
}
|
||||
else if icode <= 2047 { /* 0x7FF */
|
||||
buffer[0] = 192 | byte(icode>>6) /* 0xC0 - 110xxxxx */
|
||||
buffer[1] = 128 | byte(icode & 63) /* 0x80 - 0x3F - 10xxxxxx */
|
||||
} else if icode <= 2047 {
|
||||
// 0x7FF
|
||||
buffer[0] = 192 | byte(icode >> 6) // 0xC0 - 110xxxxx
|
||||
buffer[1] = 128 | byte(icode & 63) // 0x80 - 0x3F - 10xxxxxx
|
||||
res = tos(buffer, 2)
|
||||
}
|
||||
else if icode <= 65535 { /* 0xFFFF */
|
||||
buffer[0] = 224 | byte(icode>>12)/* 0xE0 - 1110xxxx */
|
||||
buffer[1] = 128 | (byte(icode>>6) & 63) /* 0x80 - 0x3F - 10xxxxxx */
|
||||
buffer[2] = 128 | byte(icode & 63) /* 0x80 - 0x3F - 10xxxxxx */
|
||||
} else if icode <= 65535 {
|
||||
// 0xFFFF
|
||||
buffer[0] = 224 | byte(icode >> 12) // 0xE0 - 1110xxxx
|
||||
buffer[1] = 128 | (byte(icode >> 6) & 63) // 0x80 - 0x3F - 10xxxxxx
|
||||
buffer[2] = 128 | byte(icode & 63) // 0x80 - 0x3F - 10xxxxxx
|
||||
res = tos(buffer, 3)
|
||||
}
|
||||
else if icode <= 1114111/* 0x10FFFF */ {
|
||||
buffer[0] = 240 | byte(icode>>18) /* 0xF0 - 11110xxx */
|
||||
buffer[1] = 128 | (byte(icode>>12) & 63) /* 0x80 - 0x3F - 10xxxxxx */
|
||||
buffer[2] = 128 | (byte(icode>>6) & 63) /* 0x80 - 0x3F - 10xxxxxx */
|
||||
buffer[3] = 128 | byte(icode & 63) /* 0x80 - 0x3F - 10xxxxxx */
|
||||
// 0x10FFFF
|
||||
else if icode <= 1114111 {
|
||||
buffer[0] = 240 | byte(icode >> 18) // 0xF0 - 11110xxx
|
||||
buffer[1] = 128 | (byte(icode >> 12) & 63) // 0x80 - 0x3F - 10xxxxxx
|
||||
buffer[2] = 128 | (byte(icode >> 6) & 63) // 0x80 - 0x3F - 10xxxxxx
|
||||
buffer[3] = 128 | byte(icode & 63) // 0x80 - 0x3F - 10xxxxxx
|
||||
res = tos(buffer, 4)
|
||||
}
|
||||
}
|
||||
|
@ -61,12 +63,12 @@ pub fn (_rune string) utf32_code() int {
|
|||
mut b := byte(int(_rune[0]))
|
||||
// TODO should be
|
||||
// res := int( rune[0] << rune.len)
|
||||
b = b<<_rune.len
|
||||
b = b << _rune.len
|
||||
mut res := int(b)
|
||||
mut shift := 6 - _rune.len
|
||||
for i := 1; i < _rune.len; i++ {
|
||||
c := int(_rune[i])
|
||||
res = res<<shift
|
||||
res = res << shift
|
||||
res |= c & 63 // 0x3f
|
||||
shift = 6
|
||||
}
|
||||
|
@ -80,15 +82,13 @@ fn utf8_len(c byte) int {
|
|||
if (x & 240) != 0 {
|
||||
// 0xF0
|
||||
x >>= 4
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
b += 4
|
||||
}
|
||||
if (x & 12) != 0 {
|
||||
// 0x0C
|
||||
x >>= 2
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
b += 2
|
||||
}
|
||||
if (x & 2) == 0 {
|
||||
|
@ -103,7 +103,7 @@ fn utf8_str_len(s string) int {
|
|||
mut l := 0
|
||||
for i := 0; i < s.len; i++ {
|
||||
l++
|
||||
c := unsafe {s.str[i]}
|
||||
c := unsafe { s.str[i] }
|
||||
if (c & (1 << 7)) != 0 {
|
||||
for t := byte(1 << 6); (c & t) != 0; t >>= 1 {
|
||||
i++
|
||||
|
@ -114,12 +114,14 @@ fn utf8_str_len(s string) int {
|
|||
}
|
||||
|
||||
// Calculate string length for formatting, i.e. number of "characters"
|
||||
fn utf8_str_visible_length(s string) int {
|
||||
// This is a simplified implementation. If you need a specification-compliant width,
|
||||
// use utf8.east_asian.display_width.
|
||||
pub fn utf8_str_visible_length(s string) int {
|
||||
mut l := 0
|
||||
mut ul := 1
|
||||
for i := 0; i < s.len; i+=ul {
|
||||
for i := 0; i < s.len; i += ul {
|
||||
ul = 1
|
||||
c := unsafe {s.str[i]}
|
||||
c := unsafe { s.str[i] }
|
||||
if (c & (1 << 7)) != 0 {
|
||||
for t := byte(1 << 6); (c & t) != 0; t >>= 1 {
|
||||
ul++
|
||||
|
@ -129,24 +131,58 @@ fn utf8_str_visible_length(s string) int {
|
|||
return l
|
||||
}
|
||||
l++
|
||||
// recognize combining characters
|
||||
if c == 0xcc || c == 0xcd {
|
||||
r := (u16(c) << 8) | unsafe {s.str[i+1]}
|
||||
if r >= 0xcc80 && r < 0xcdb0 { // diacritical marks
|
||||
l--
|
||||
// recognize combining characters and wide characters
|
||||
match ul {
|
||||
2 {
|
||||
r := u64((u16(c) << 8) | unsafe { s.str[i + 1] })
|
||||
if r >= 0xcc80 && r < 0xcdb0 {
|
||||
// diacritical marks
|
||||
l--
|
||||
}
|
||||
}
|
||||
} else if c == 0xe1 || c == 0xe2 || c == 0xef {
|
||||
r := (u32(c) << 16) | unsafe {(u32(s.str[i+1]) << 8) | s.str[i+2]}
|
||||
// diacritical marks extended 0xe1aab0 - 0xe1ac80
|
||||
// diacritical marks supplement 0xe1b780 - 0xe1b880
|
||||
// diacritical marks for symbols 0xe28390 - 0xe28480
|
||||
// half marks 0xefb8a0 - 0xefb8b0
|
||||
if (r >= 0xe1aab0 && r < 0xe1ac80)
|
||||
|| (r >= 0xe1b780 && r < 0xe1b880)
|
||||
|| (r >= 0xe28390 && r < 0xe28480)
|
||||
|| (r >= 0xefb8a0 && r < 0xefb8b0) {
|
||||
l--
|
||||
3 {
|
||||
r := u64((u32(c) << 16) | unsafe { (u32(s.str[i + 1]) << 8) | s.str[i + 2] })
|
||||
// diacritical marks extended
|
||||
// diacritical marks supplement
|
||||
// diacritical marks for symbols
|
||||
if (r >= 0xe1aab0 && r <= 0xe1ac7f)
|
||||
|| (r >= 0xe1b780 && r <= 0xe1b87f)
|
||||
|| (r >= 0xe28390 && r <= 0xe2847f)
|
||||
|| (r >= 0xefb8a0 && r <= 0xefb8af) {
|
||||
// diacritical marks
|
||||
l--
|
||||
}
|
||||
// Hangul
|
||||
// CJK Unified Ideographs
|
||||
// Hangul
|
||||
// CJK
|
||||
else if (r >= 0xe18480 && r <= 0xe1859f)
|
||||
|| (r >= 0xe2ba80 && r <= 0xe2bf95)
|
||||
|| (r >= 0xe38080 && r <= 0xe4b77f)
|
||||
|| (r >= 0xe4b880 && r <= 0xea807f)
|
||||
|| (r >= 0xeaa5a0 && r <= 0xeaa79f)
|
||||
|| (r >= 0xeab080 && r <= 0xed9eaf)
|
||||
|| (r >= 0xefa480 && r <= 0xefac7f)
|
||||
|| (r >= 0xefb8b8 && r <= 0xefb9af) {
|
||||
// half marks
|
||||
l++
|
||||
}
|
||||
}
|
||||
4 {
|
||||
r := u64((u32(c) << 24) | unsafe {
|
||||
(u32(s.str[i + 1]) << 16) | (u32(s.str[i + 2]) << 8) | s.str[i + 3]
|
||||
})
|
||||
// Enclosed Ideographic Supplement
|
||||
// Emoji
|
||||
// CJK Unified Ideographs Extension B-G
|
||||
if (r >= 0x0f9f8880 && r <= 0xf09f8a8f)
|
||||
|| (r >= 0xf09f8c80 && r <= 0xf09f9c90)
|
||||
|| (r >= 0xf09fa490 && r <= 0xf09fa7af)
|
||||
|| (r >= 0xff0a08080 && r <= 0xf180807f) {
|
||||
l++
|
||||
}
|
||||
}
|
||||
else {}
|
||||
}
|
||||
}
|
||||
return l
|
||||
|
|
|
@ -4407,15 +4407,15 @@ fn (mut c Checker) match_exprs(mut node ast.MatchExpr, cond_type_sym table.TypeS
|
|||
for expr in branch.exprs {
|
||||
mut key := ''
|
||||
if expr is ast.RangeExpr {
|
||||
mut low := 0
|
||||
mut high := 0
|
||||
mut low := i64(0)
|
||||
mut high := i64(0)
|
||||
c.expected_type = node.expected_type
|
||||
low_expr := expr.low
|
||||
high_expr := expr.high
|
||||
if low_expr is ast.IntegerLiteral {
|
||||
if high_expr is ast.IntegerLiteral {
|
||||
low = low_expr.val.int()
|
||||
high = high_expr.val.int()
|
||||
low = low_expr.val.i64()
|
||||
high = high_expr.val.i64()
|
||||
} else {
|
||||
c.error('mismatched range types', low_expr.pos)
|
||||
}
|
||||
|
@ -4430,6 +4430,11 @@ fn (mut c Checker) match_exprs(mut node ast.MatchExpr, cond_type_sym table.TypeS
|
|||
typ := c.table.type_to_str(c.expr(expr.low))
|
||||
c.error('cannot use type `$typ` in match range', branch.pos)
|
||||
}
|
||||
high_low_cutoff := 1000
|
||||
if high - low > high_low_cutoff {
|
||||
c.warn('more than $high_low_cutoff possibilities ($low ... $high) in match range',
|
||||
branch.pos)
|
||||
}
|
||||
for i in low .. high + 1 {
|
||||
key = i.str()
|
||||
val := if key in branch_exprs { branch_exprs[key] } else { 0 }
|
||||
|
|
|
@ -0,0 +1,55 @@
|
|||
vlib/v/checker/tests/error_with_unicode.vv:5:17: error: cannot use `int literal` as `string` in argument 2 to `f1`
|
||||
3 |
|
||||
4 | fn main() {
|
||||
5 | f1('🐀🐈', 0)
|
||||
| ^
|
||||
6 | f2(0, '🐟🐧')
|
||||
7 | mut n := 0
|
||||
vlib/v/checker/tests/error_with_unicode.vv:6:8: error: cannot use `string` as `int` in argument 2 to `f2`
|
||||
4 | fn main() {
|
||||
5 | f1('🐀🐈', 0)
|
||||
6 | f2(0, '🐟🐧')
|
||||
| ~~~~~~
|
||||
7 | mut n := 0
|
||||
8 | n = '漢字'
|
||||
vlib/v/checker/tests/error_with_unicode.vv:8:6: error: cannot assign to `n`: expected `int`, not `string`
|
||||
6 | f2(0, '🐟🐧')
|
||||
7 | mut n := 0
|
||||
8 | n = '漢字'
|
||||
| ~~~~~~
|
||||
9 | n = 'ひらがな'
|
||||
10 | n = '简体字'
|
||||
vlib/v/checker/tests/error_with_unicode.vv:9:6: error: cannot assign to `n`: expected `int`, not `string`
|
||||
7 | mut n := 0
|
||||
8 | n = '漢字'
|
||||
9 | n = 'ひらがな'
|
||||
| ~~~~~~~~~~
|
||||
10 | n = '简体字'
|
||||
11 | n = '繁體字'
|
||||
vlib/v/checker/tests/error_with_unicode.vv:10:6: error: cannot assign to `n`: expected `int`, not `string`
|
||||
8 | n = '漢字'
|
||||
9 | n = 'ひらがな'
|
||||
10 | n = '简体字'
|
||||
| ~~~~~~~~
|
||||
11 | n = '繁體字'
|
||||
12 | n = '한글'
|
||||
vlib/v/checker/tests/error_with_unicode.vv:11:6: error: cannot assign to `n`: expected `int`, not `string`
|
||||
9 | n = 'ひらがな'
|
||||
10 | n = '简体字'
|
||||
11 | n = '繁體字'
|
||||
| ~~~~~~~~
|
||||
12 | n = '한글'
|
||||
13 | n = 'Кириллица'
|
||||
vlib/v/checker/tests/error_with_unicode.vv:12:6: error: cannot assign to `n`: expected `int`, not `string`
|
||||
10 | n = '简体字'
|
||||
11 | n = '繁體字'
|
||||
12 | n = '한글'
|
||||
| ~~~~~~
|
||||
13 | n = 'Кириллица'
|
||||
14 | }
|
||||
vlib/v/checker/tests/error_with_unicode.vv:13:6: error: cannot assign to `n`: expected `int`, not `string`
|
||||
11 | n = '繁體字'
|
||||
12 | n = '한글'
|
||||
13 | n = 'Кириллица'
|
||||
| ~~~~~~~~~~~
|
||||
14 | }
|
|
@ -0,0 +1,14 @@
|
|||
fn f1(_ string, _ string) {}
|
||||
fn f2(_ int, _ int) {}
|
||||
|
||||
fn main() {
|
||||
f1('🐀🐈', 0)
|
||||
f2(0, '🐟🐧')
|
||||
mut n := 0
|
||||
n = '漢字'
|
||||
n = 'ひらがな'
|
||||
n = '简体字'
|
||||
n = '繁體字'
|
||||
n = '한글'
|
||||
n = 'Кириллица'
|
||||
}
|
|
@ -4,6 +4,7 @@
|
|||
module util
|
||||
|
||||
import os
|
||||
import strings
|
||||
import term
|
||||
import v.token
|
||||
|
||||
|
@ -132,14 +133,22 @@ pub fn source_context(kind string, source string, column int, pos token.Position
|
|||
// line, so that it prints the ^ character exactly on the *same spot*
|
||||
// where it is needed. That is the reason we can not just
|
||||
// use strings.repeat(` `, col) to form it.
|
||||
mut pointerline := ''
|
||||
for bchar in sline[..start_column] {
|
||||
x := if bchar.is_space() { bchar } else { ` ` }
|
||||
pointerline += x.ascii_str()
|
||||
mut pointerline_builder := strings.new_builder(sline.len)
|
||||
for i := 0; i < start_column; {
|
||||
if sline[i].is_space() {
|
||||
pointerline_builder.write_b(sline[i])
|
||||
i++
|
||||
} else {
|
||||
char_len := utf8_char_len(sline[i])
|
||||
spaces := ' '.repeat(utf8_str_visible_length(sline[i..i + char_len]))
|
||||
pointerline_builder.write_string(spaces)
|
||||
i += char_len
|
||||
}
|
||||
}
|
||||
underline := if pos.len > 1 { '~'.repeat(end_column - start_column) } else { '^' }
|
||||
pointerline += bold(color(kind, underline))
|
||||
clines << ' | ' + pointerline.replace('\t', tab_spaces)
|
||||
underline_len := utf8_str_visible_length(sline[start_column..end_column])
|
||||
underline := if underline_len > 1 { '~'.repeat(underline_len) } else { '^' }
|
||||
pointerline_builder.write_string(bold(color(kind, underline)))
|
||||
clines << ' | ' + pointerline_builder.str().replace('\t', tab_spaces)
|
||||
}
|
||||
}
|
||||
return clines
|
||||
|
|
Loading…
Reference in New Issue