scanner, cgen: improve support for escape codes in backticks/runes (#13127)

pull/13142/head
jeffmikels 2022-01-11 15:36:18 -05:00 committed by GitHub
parent ea660315e0
commit ab642cac43
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 249 additions and 48 deletions

View File

@ -617,8 +617,10 @@ pub fn (mut c Checker) string_inter_lit(mut node ast.StringInterLiteral) ast.Typ
return ast.string_type
}
const hex_lit_overflow_message = 'hex character literal overflows string'
const unicode_lit_overflow_message = 'unicode character exceeds max allowed value of 0x10ffff, consider using a unicode literal (\\u####)'
// unicode character literals are limited to a maximum value of 0x10ffff
// https://stackoverflow.com/questions/52203351/why-unicode-is-restricted-to-0x10ffff
pub fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type {
mut idx := 0
for idx < node.val.len {
@ -631,7 +633,7 @@ pub fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type {
start_idx := idx
idx++
next_ch := node.val[idx] or { return ast.string_type }
if next_ch == `x` {
if next_ch == `u` {
idx++
mut ch := node.val[idx] or { return ast.string_type }
mut hex_char_count := 0
@ -647,13 +649,13 @@ pub fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type {
first_digit := node.val[idx - 5] - 48
second_digit := node.val[idx - 4] - 48
if first_digit > 1 {
c.error(checker.hex_lit_overflow_message, end_pos)
c.error(checker.unicode_lit_overflow_message, end_pos)
} else if first_digit == 1 && second_digit > 0 {
c.error(checker.hex_lit_overflow_message, end_pos)
c.error(checker.unicode_lit_overflow_message, end_pos)
}
}
else {
c.error(checker.hex_lit_overflow_message, end_pos)
c.error(checker.unicode_lit_overflow_message, end_pos)
}
}
idx++

View File

@ -1,18 +0,0 @@
vlib/v/checker/tests/hex_literal_overflow.vv:1:7: error: hex character literal overflows string
1 | a := '\x11ffff'
| ~~~~~~~~
2 | b := '\x20ffff'
3 | c := '\x10fffff'
vlib/v/checker/tests/hex_literal_overflow.vv:2:7: error: hex character literal overflows string
1 | a := '\x11ffff'
2 | b := '\x20ffff'
| ~~~~~~~~
3 | c := '\x10fffff'
4 | println(a)
vlib/v/checker/tests/hex_literal_overflow.vv:3:7: error: hex character literal overflows string
1 | a := '\x11ffff'
2 | b := '\x20ffff'
3 | c := '\x10fffff'
| ~~~~~~~~~
4 | println(a)
5 | println(b)

View File

@ -1,5 +1,5 @@
vlib/v/checker/tests/import_mod_sub_as_sub_err.vv:1:25: error: import alias `encoding.utf8 as utf8` is redundant
1 | import encoding.utf8 as utf8
| ~~~~
2 |
3 | fn main() {
vlib/v/checker/tests/import_mod_sub_as_sub_err.vv:1:25: error: import alias `encoding.utf8 as utf8` is redundant
1 | import encoding.utf8 as utf8
| ~~~~
2 |
3 | fn main() {

View File

@ -1,5 +1,5 @@
vlib/v/checker/tests/minus_op_wrong_type_err.vv:10:10: error: mismatched types `Aaa` and `int literal`
8 |
8 |
9 | fn main() {
10 | println(Aaa{} - 10)
| ~~~~~~~~~~

View File

@ -1,7 +1,7 @@
vlib/v/checker/tests/mut_array_get_element_address_err.vv:3:20: error: cannot take the address of mutable array elements outside unsafe blocks
1 | fn main() {
2 | mut arr_int := [int(23), 45, 7, 8]
3 | ele := &arr_int[1]
| ~~~
4 | println(ele)
5 | }
vlib/v/checker/tests/mut_array_get_element_address_err.vv:3:20: error: cannot take the address of mutable array elements outside unsafe blocks
1 | fn main() {
2 | mut arr_int := [int(23), 45, 7, 8]
3 | ele := &arr_int[1]
| ~~~
4 | println(ele)
5 | }

View File

@ -1,4 +1,4 @@
vlib/v/checker/tests/string_escape_x_err_a.vv:2:15: error: `\x` used with no following hex digits
vlib/v/checker/tests/string_escape_x_err_a.vv:2:15: error: `\x` used without two following hex digits
1 | fn main() {
2 | println('\x')
| ^

View File

@ -1,4 +1,4 @@
vlib/v/checker/tests/string_escape_x_err_b.vv:2:15: error: `\x` used with no following hex digits
vlib/v/checker/tests/string_escape_x_err_b.vv:2:15: error: `\x` used without two following hex digits
1 | fn main() {
2 | println('\xhh')
| ^

View File

@ -1,5 +1,5 @@
vlib/v/checker/tests/unknown_function.vv:4:15: error: unknown function: math.max_i64
2 |
2 |
3 | fn main() {
4 | println(math.max_i64())
| ~~~~~~~~~

View File

@ -2510,6 +2510,19 @@ fn (mut g Gen) expr_with_cast(expr ast.Expr, got_type_raw ast.Type, expected_typ
g.expr(expr)
}
// cescape_nonascii returns a copy of `original` in which every byte outside the
// printable ASCII range (32..126) is replaced by a backslash followed by the
// byte's value written as three zero-padded octal digits. All other bytes are
// copied through unchanged.
fn cescape_nonascii(original string) string {
	mut escaped := strings.new_builder(original.len)
	for c in original {
		if c >= 32 && c <= 126 {
			// printable ASCII: emit as is
			escaped.write_b(c)
		} else {
			// always exactly three octal digits; a C octal escape is at most three
			// digits long, so a digit character following it is not absorbed
			escaped.write_string('\\${c:03o}')
		}
	}
	return escaped.str()
}
// cestring returns a V string, properly escaped for embedding in a C string literal.
fn cestring(s string) string {
return s.replace('\\', '\\\\').replace('"', "'")
@ -2517,7 +2530,7 @@ fn cestring(s string) string {
// ctoslit returns a '_SLIT("$s")' call, where s is properly escaped.
fn ctoslit(s string) string {
return '_SLIT("' + cestring(s) + '")'
return '_SLIT("' + cescape_nonascii(cestring(s)) + '")'
}
fn (mut g Gen) gen_attrs(attrs []ast.Attr) {

View File

@ -6,7 +6,7 @@ import v.ast
import v.util
fn (mut g Gen) string_literal(node ast.StringLiteral) {
escaped_val := util.smart_quote(node.val, node.is_raw)
escaped_val := cescape_nonascii(util.smart_quote(node.val, node.is_raw))
if node.language == .c {
g.write('"$escaped_val"')
} else {
@ -25,7 +25,7 @@ fn (mut g Gen) string_inter_literal_sb_optimized(call_expr ast.CallExpr) {
is_nl := call_expr.name == 'writeln'
// println('optimize sb $call_expr.name')
for i, val in node.vals {
escaped_val := util.smart_quote(val, false)
escaped_val := cescape_nonascii(util.smart_quote(val, false))
// if val == '' {
// break
// continue

View File

@ -1174,6 +1174,7 @@ fn (mut s Scanner) ident_string() string {
}
s.is_inside_string = false
mut u_escapes_pos := []int{} // pos list of \uXXXX
mut h_escapes_pos := []int{} // pos list of \xXX
mut backslash_count := if start_char == scanner.backslash { 1 } else { 0 }
for {
s.pos++
@ -1221,8 +1222,12 @@ fn (mut s Scanner) ident_string() string {
// Escape `\x` `\u`
if backslash_count % 2 == 1 && !is_raw && !is_cstr {
// Escape `\x`
if c == `x` && (s.text[s.pos + 1] == s.quote || !s.text[s.pos + 1].is_hex_digit()) {
s.error(r'`\x` used with no following hex digits')
if c == `x` {
if s.text[s.pos + 1] == s.quote || !(s.text[s.pos + 1].is_hex_digit()
&& s.text[s.pos + 2].is_hex_digit()) {
s.error(r'`\x` used without two following hex digits')
}
h_escapes_pos << s.pos - 1
}
// Escape `\u`
if c == `u` {
@ -1266,6 +1271,9 @@ fn (mut s Scanner) ident_string() string {
if !s.is_fmt && u_escapes_pos.len > 0 {
string_so_far = decode_u_escapes(string_so_far, start, u_escapes_pos)
}
if !s.is_fmt && h_escapes_pos.len > 0 {
string_so_far = decode_h_escapes(string_so_far, start, h_escapes_pos)
}
if n_cr_chars > 0 {
string_so_far = string_so_far.replace('\r', '')
}
@ -1278,6 +1286,27 @@ fn (mut s Scanner) ident_string() string {
return lit
}
// decode_h_escapes replaces the single-byte inline hex escapes (like '\xc0')
// found in `s` with the raw byte each one names, and returns the new string.
// `escapes_pos` lists the position of the backslash of every escape, in the same
// coordinate space as `start`, so `pos - start` is the index into `s`.
// The positions are expected in ascending order.
// An escape that would run past the end of `s`, or that overlaps the previously
// decoded one, is left in the output verbatim instead of causing an
// out-of-range slice.
fn decode_h_escapes(s string, start int, escapes_pos []int) string {
	if escapes_pos.len == 0 {
		return s
	}
	mut ss := []string{cap: escapes_pos.len * 2 + 1}
	mut copied_to := 0 // index in `s` up to which everything is already in `ss`
	for pos in escapes_pos {
		idx := pos - start
		end_idx := idx + 4 // "\xXX".len == 4
		if idx < copied_to || end_idx > s.len {
			// truncated or overlapping escape: keep its text untouched
			continue
		}
		ss << s[copied_to..idx]
		// notice this function doesn't do any decoding... it just replaces '\xc0' with the byte 0xc0
		// (digits that fail to parse yield the byte 0)
		ss << [byte(strconv.parse_uint(s[idx + 2..end_idx], 16, 8) or { 0 })].bytestr()
		copied_to = end_idx
	}
	ss << s[copied_to..]
	return ss.join('')
}
fn decode_u_escapes(s string, start int, escapes_pos []int) string {
if escapes_pos.len == 0 {
return s
@ -1312,10 +1341,32 @@ fn trim_slash_line_break(s string) string {
return ret_str
}
/// ident_char is called when a backtick "single-char" is parsed from the code
/// it is needed because some runes (chars) are written with escape sequences
/// the string it returns should be a standardized, simplified version of the character
/// as it would appear in source code
/// possibilities:
/// single chars like `a`, `b` => 'a', 'b'
/// escaped single chars like `\\`, `\``, `\n` => '\\', '`', '\n'
/// escaped hex bytes like `\x01`, `\x61` => '\x01', 'a'
/// escaped multibyte runes like `\xe29885` => (★)
/// escaped unicode literals like `\u2605`
fn (mut s Scanner) ident_char() string {
start := s.pos
lspos := token.Position{
line_nr: s.line_nr
pos: s.pos
col: s.pos - s.last_nl_pos - 1
}
start := s.pos // the string position of the first backtick char
slash := `\\`
mut len := 0
// set flags for advanced escapes first
escaped_hex := s.expect('\\x', start + 1)
escaped_unicode := s.expect('\\u', start + 1)
// walk the string to get characters up to the next backtick
for {
s.pos++
if s.pos >= s.text.len {
@ -1334,12 +1385,68 @@ fn (mut s Scanner) ident_char() string {
}
}
len--
c := s.text[start + 1..s.pos]
mut c := s.text[start + 1..s.pos]
if len != 1 {
// if the content expresses an escape code, it will have an even number of characters
// e.g. \x61 or \u2605
if (c.len % 2 == 0) && (escaped_hex || escaped_unicode) {
if escaped_unicode {
c = decode_u_escapes(c, 0, [0])
} else {
// we have to handle hex ourselves
ascii_0 := byte(0x30)
ascii_a := byte(0x61)
mut accumulated := []byte{}
val := c[2..c.len].to_lower() // 0A -> 0a
mut offset := 0
// take two characters at a time, parse as hex and add to bytes
for {
if offset >= val.len - 1 {
break
}
mut byteval := byte(0)
big := val[offset]
little := val[offset + 1]
if !big.is_hex_digit() {
accumulated.clear()
break
}
if !little.is_hex_digit() {
accumulated.clear()
break
}
if big.is_digit() {
byteval |= (big - ascii_0) << 4
} else {
byteval |= (big - ascii_a + 10) << 4
}
if little.is_digit() {
byteval |= (little - ascii_0)
} else {
byteval |= (little - ascii_a + 10)
}
accumulated << byteval
offset += 2
}
if accumulated.len > 0 {
c = accumulated.bytestr()
}
}
}
// the string inside the backticks is longer than one character,
// but it may still encode a single rune (e.g. a multi-byte UTF-8 character or a decoded escape sequence)
u := c.runes()
if u.len != 1 {
s.error('invalid character literal (more than one character)\n' +
'use quotes for strings, backticks for characters')
if escaped_hex || escaped_unicode {
s.error('invalid character literal (escape sequence did not refer to a singular rune)')
} else {
s.add_error_detail_with_pos('use quotes for strings, backticks for characters',
lspos)
s.error('invalid character literal (more than one character)')
}
}
}
// Escapes a `'` character

View File

@ -19,6 +19,19 @@ fn scan_kinds(text string) []token.Kind {
return token_kinds
}
// scan_tokens runs a scanner (in `.parse_comments` mode, with default preferences)
// over `text` and returns every token it yields, stopping at - and not
// including - the end-of-file token.
fn scan_tokens(text string) []token.Token {
	mut sc := new_scanner(text, .parse_comments, &pref.Preferences{})
	mut collected := []token.Token{}
	mut current := sc.scan()
	for current.kind != .eof {
		collected << current
		current = sc.scan()
	}
	return collected
}
fn test_scan() {
token_kinds := scan_kinds('println(2 + 3)')
assert token_kinds.len == 6
@ -138,6 +151,90 @@ fn test_ref_ref_array_ref_ref_foo() {
}
// test_escape_string checks the literal the scanner produces for escape
// sequences, first inside backtick character literals and then inside quoted strings.
fn test_escape_string() {
	// NOTE: the next two assertions are evaluated by the string handling already
	// built into the running compiler, so they do not exercise this module's
	// scanner; the scan_tokens based checks below do.
	assert '\x61' == 'a'
	assert '\x62' == 'b'
	// assert `\x61` == `a` // will work after pull request goes through

	// ---- character literals (backticks) ----
	// apostrophe
	char_apostrophe := scan_tokens(r"`'`")
	assert char_apostrophe[0].kind == .chartoken
	assert char_apostrophe[0].lit == r"\'"
	// escaped backtick
	char_backtick := scan_tokens(r'`\``')
	assert char_backtick[0].kind == .chartoken
	assert char_backtick[0].lit == r'\`'
	// escaped backslash
	char_slash := scan_tokens(r'`\\`')
	assert char_slash[0].kind == .chartoken
	assert char_slash[0].lit == r'\\'
	// unicode escape
	char_unicode := scan_tokens(r'`\u2605`')
	assert char_unicode[0].kind == .chartoken
	assert char_unicode[0].lit == r'★'
	// hex escape naming an ASCII character
	char_hex_ascii := scan_tokens(r'`\x61`')
	assert char_hex_ascii[0].kind == .chartoken
	assert char_hex_ascii[0].lit == r'a'
	// two escapes in one character literal:
	// scan_tokens(r'`\x61\x61`') should always result in an error
	// hex escape spelling out a multi-byte UTF-8 sequence
	// Compilation blocked by vlib/v/checker/check_types.v, but works in the repl
	char_hex_multibyte := scan_tokens(r'`\xe29885`')
	assert char_hex_multibyte[0].lit == r'★'

	// ---- string literals ----
	// escaped apostrophe
	str_apostrophe := scan_tokens(r"'\''")
	assert str_apostrophe[0].kind == .string
	assert str_apostrophe[0].lit == r"\'"
	// escaped backtick
	str_backtick := scan_tokens(r"'\`'")
	assert str_backtick[0].kind == .string
	assert str_backtick[0].lit == r'\`'
	// escaped backslash
	str_slash := scan_tokens(r"'\\'")
	assert str_slash[0].kind == .string
	assert str_slash[0].lit == r'\\'
	// unicode escape
	str_unicode := scan_tokens(r"'\u2605'")
	assert str_unicode[0].kind == .string
	assert str_unicode[0].lit == r'★'
	// hex escape naming an ASCII character
	str_hex_ascii := scan_tokens(r"'\x61'")
	assert str_hex_ascii[0].kind == .string
	assert str_hex_ascii[0].lit == r'a'
	// hex escape followed by more hex digits: only the first two digits form
	// the byte, the rest stay literal (should not be converted to unicode)
	str_hex_extended := scan_tokens(r"'\xe29885'")
	assert str_hex_extended[0].kind == .string
	assert str_hex_extended[0].lit.bytes() == [byte(0xe2), `9`, `8`, `8`, `5`]

	// inputs that should always result in an error (not asserted here):
	// scan_tokens(r'`\x61\x61`')
	// scan_tokens(r"'\x'")
	// scan_tokens(r'`hello`')
}
// test_comment_string checks that a single line comment is scanned into a
// comment token whose literal starts with the byte 0x01.
fn test_comment_string() {
	comment_tokens := scan_tokens('// single line comment will get an \\x01 prepended')
	first_token := comment_tokens[0]
	assert first_token.kind == .comment
	assert first_token.lit[0] == byte(1) // \x01
	// further cases, not asserted yet:
	// scan_tokens('/// doc comment will keep third / at beginning')
	// scan_tokens('/* block comment will be stripped of whitespace */')
	// scan_tokens('a := 0 // line end comment also gets \\x01 prepended')
}