scanner, cgen: improve support for escape codes in backticks/runes (#13127)
parent
ea660315e0
commit
ab642cac43
|
@ -617,8 +617,10 @@ pub fn (mut c Checker) string_inter_lit(mut node ast.StringInterLiteral) ast.Typ
|
|||
return ast.string_type
|
||||
}
|
||||
|
||||
const hex_lit_overflow_message = 'hex character literal overflows string'
|
||||
const unicode_lit_overflow_message = 'unicode character exceeds max allowed value of 0x10ffff, consider using a unicode literal (\\u####)'
|
||||
|
||||
// unicode character literals are limited to a maximum value of 0x10ffff
|
||||
// https://stackoverflow.com/questions/52203351/why-unicode-is-restricted-to-0x10ffff
|
||||
pub fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type {
|
||||
mut idx := 0
|
||||
for idx < node.val.len {
|
||||
|
@ -631,7 +633,7 @@ pub fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type {
|
|||
start_idx := idx
|
||||
idx++
|
||||
next_ch := node.val[idx] or { return ast.string_type }
|
||||
if next_ch == `x` {
|
||||
if next_ch == `u` {
|
||||
idx++
|
||||
mut ch := node.val[idx] or { return ast.string_type }
|
||||
mut hex_char_count := 0
|
||||
|
@ -647,13 +649,13 @@ pub fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type {
|
|||
first_digit := node.val[idx - 5] - 48
|
||||
second_digit := node.val[idx - 4] - 48
|
||||
if first_digit > 1 {
|
||||
c.error(checker.hex_lit_overflow_message, end_pos)
|
||||
c.error(checker.unicode_lit_overflow_message, end_pos)
|
||||
} else if first_digit == 1 && second_digit > 0 {
|
||||
c.error(checker.hex_lit_overflow_message, end_pos)
|
||||
c.error(checker.unicode_lit_overflow_message, end_pos)
|
||||
}
|
||||
}
|
||||
else {
|
||||
c.error(checker.hex_lit_overflow_message, end_pos)
|
||||
c.error(checker.unicode_lit_overflow_message, end_pos)
|
||||
}
|
||||
}
|
||||
idx++
|
||||
|
|
|
@ -1,18 +0,0 @@
|
|||
vlib/v/checker/tests/hex_literal_overflow.vv:1:7: error: hex character literal overflows string
|
||||
1 | a := '\x11ffff'
|
||||
| ~~~~~~~~
|
||||
2 | b := '\x20ffff'
|
||||
3 | c := '\x10fffff'
|
||||
vlib/v/checker/tests/hex_literal_overflow.vv:2:7: error: hex character literal overflows string
|
||||
1 | a := '\x11ffff'
|
||||
2 | b := '\x20ffff'
|
||||
| ~~~~~~~~
|
||||
3 | c := '\x10fffff'
|
||||
4 | println(a)
|
||||
vlib/v/checker/tests/hex_literal_overflow.vv:3:7: error: hex character literal overflows string
|
||||
1 | a := '\x11ffff'
|
||||
2 | b := '\x20ffff'
|
||||
3 | c := '\x10fffff'
|
||||
| ~~~~~~~~~
|
||||
4 | println(a)
|
||||
5 | println(b)
|
|
@ -1,4 +1,4 @@
|
|||
vlib/v/checker/tests/string_escape_x_err_a.vv:2:15: error: `\x` used with no following hex digits
|
||||
vlib/v/checker/tests/string_escape_x_err_a.vv:2:15: error: `\x` used without two following hex digits
|
||||
1 | fn main() {
|
||||
2 | println('\x')
|
||||
| ^
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
vlib/v/checker/tests/string_escape_x_err_b.vv:2:15: error: `\x` used with no following hex digits
|
||||
vlib/v/checker/tests/string_escape_x_err_b.vv:2:15: error: `\x` used without two following hex digits
|
||||
1 | fn main() {
|
||||
2 | println('\xhh')
|
||||
| ^
|
||||
|
|
|
@ -2510,6 +2510,19 @@ fn (mut g Gen) expr_with_cast(expr ast.Expr, got_type_raw ast.Type, expected_typ
|
|||
g.expr(expr)
|
||||
}
|
||||
|
||||
// cescape_nonascii returns a copy of `original` in which every byte outside
// the printable ASCII range (32..126 inclusive) is replaced by a three digit,
// zero padded octal escape (`\ooo`). All other bytes are copied unchanged.
fn cescape_nonascii(original string) string {
	mut out := strings.new_builder(original.len)
	for ch in original {
		is_printable_ascii := ch >= 32 && ch <= 126
		if is_printable_ascii {
			out.write_b(ch)
		} else {
			// control characters, DEL and all bytes >= 0x80 become octal escapes
			out.write_string('\\${ch:03o}')
		}
	}
	return out.str()
}
|
||||
|
||||
// cestring returns a V string, properly escaped for embedding in a C string literal.
|
||||
fn cestring(s string) string {
|
||||
return s.replace('\\', '\\\\').replace('"', "'")
|
||||
|
@ -2517,7 +2530,7 @@ fn cestring(s string) string {
|
|||
|
||||
// ctoslit returns a '_SLIT("$s")' call, where s is properly escaped.
|
||||
fn ctoslit(s string) string {
	// Escape the V string for C first, then rewrite every byte outside the
	// printable ASCII range as an octal escape. The stale `return` that skipped
	// cescape_nonascii (and made this one unreachable) has been removed.
	return '_SLIT("' + cescape_nonascii(cestring(s)) + '")'
}
|
||||
|
||||
fn (mut g Gen) gen_attrs(attrs []ast.Attr) {
|
||||
|
|
|
@ -6,7 +6,7 @@ import v.ast
|
|||
import v.util
|
||||
|
||||
fn (mut g Gen) string_literal(node ast.StringLiteral) {
|
||||
escaped_val := util.smart_quote(node.val, node.is_raw)
|
||||
escaped_val := cescape_nonascii(util.smart_quote(node.val, node.is_raw))
|
||||
if node.language == .c {
|
||||
g.write('"$escaped_val"')
|
||||
} else {
|
||||
|
@ -25,7 +25,7 @@ fn (mut g Gen) string_inter_literal_sb_optimized(call_expr ast.CallExpr) {
|
|||
is_nl := call_expr.name == 'writeln'
|
||||
// println('optimize sb $call_expr.name')
|
||||
for i, val in node.vals {
|
||||
escaped_val := util.smart_quote(val, false)
|
||||
escaped_val := cescape_nonascii(util.smart_quote(val, false))
|
||||
// if val == '' {
|
||||
// break
|
||||
// continue
|
||||
|
|
|
@ -1174,6 +1174,7 @@ fn (mut s Scanner) ident_string() string {
|
|||
}
|
||||
s.is_inside_string = false
|
||||
mut u_escapes_pos := []int{} // pos list of \uXXXX
|
||||
mut h_escapes_pos := []int{} // pos list of \xXX
|
||||
mut backslash_count := if start_char == scanner.backslash { 1 } else { 0 }
|
||||
for {
|
||||
s.pos++
|
||||
|
@ -1221,8 +1222,12 @@ fn (mut s Scanner) ident_string() string {
|
|||
// Escape `\x` `\u`
|
||||
if backslash_count % 2 == 1 && !is_raw && !is_cstr {
|
||||
// Escape `\x`
|
||||
if c == `x` && (s.text[s.pos + 1] == s.quote || !s.text[s.pos + 1].is_hex_digit()) {
|
||||
s.error(r'`\x` used with no following hex digits')
|
||||
if c == `x` {
|
||||
if s.text[s.pos + 1] == s.quote || !(s.text[s.pos + 1].is_hex_digit()
|
||||
&& s.text[s.pos + 2].is_hex_digit()) {
|
||||
s.error(r'`\x` used without two following hex digits')
|
||||
}
|
||||
h_escapes_pos << s.pos - 1
|
||||
}
|
||||
// Escape `\u`
|
||||
if c == `u` {
|
||||
|
@ -1266,6 +1271,9 @@ fn (mut s Scanner) ident_string() string {
|
|||
if !s.is_fmt && u_escapes_pos.len > 0 {
|
||||
string_so_far = decode_u_escapes(string_so_far, start, u_escapes_pos)
|
||||
}
|
||||
if !s.is_fmt && h_escapes_pos.len > 0 {
|
||||
string_so_far = decode_h_escapes(string_so_far, start, h_escapes_pos)
|
||||
}
|
||||
if n_cr_chars > 0 {
|
||||
string_so_far = string_so_far.replace('\r', '')
|
||||
}
|
||||
|
@ -1278,6 +1286,27 @@ fn (mut s Scanner) ident_string() string {
|
|||
return lit
|
||||
}
|
||||
|
||||
// decode_h_escapes replaces every single-byte inline escape of the form `\xXX`
// (for example '\xc0') in `s` with the raw byte it denotes. No character set
// decoding is done: the byte is inserted verbatim.
//
// `escapes_pos` holds the position of the backslash of each escape, expected in
// ascending order; `start` is subtracted from each position to get an index into `s`.
// If the two characters after `\x` do not parse as a hex number, a 0 byte is inserted.
// An escape that would run past the end of `s`, or that overlaps a previously
// decoded escape, is left in the output unchanged instead of causing an
// out of range slice.
fn decode_h_escapes(s string, start int, escapes_pos []int) string {
	if escapes_pos.len == 0 {
		return s
	}
	mut ss := []string{cap: escapes_pos.len * 2 + 1}
	mut copied_to := 0 // index in `s` up to which the input has already been emitted
	for pos in escapes_pos {
		idx := pos - start
		end_idx := idx + 4 // "\xXX".len == 4
		if idx < copied_to || end_idx > s.len {
			// truncated or overlapping escape: keep the source text as it is
			continue
		}
		ss << s[copied_to..idx]
		ss << [byte(strconv.parse_uint(s[idx + 2..end_idx], 16, 8) or { 0 })].bytestr()
		copied_to = end_idx
	}
	ss << s[copied_to..]
	return ss.join('')
}
|
||||
|
||||
fn decode_u_escapes(s string, start int, escapes_pos []int) string {
|
||||
if escapes_pos.len == 0 {
|
||||
return s
|
||||
|
@ -1312,10 +1341,32 @@ fn trim_slash_line_break(s string) string {
|
|||
return ret_str
|
||||
}
|
||||
|
||||
/// ident_char is called when a backtick "single-char" is parsed from the code
|
||||
/// it is needed because some runes (chars) are written with escape sequences
|
||||
/// the string it returns should be a standardized, simplified version of the character
|
||||
/// as it would appear in source code
|
||||
/// possibilities:
|
||||
/// single chars like `a`, `b` => 'a', 'b'
|
||||
/// escaped single chars like `\\`, `\``, `\n` => '\\', '`', '\n'
|
||||
/// escaped hex bytes like `\x01`, `\x61` => '\x01', 'a'
|
||||
/// escaped multibyte runes like `\xe29885` => (★)
|
||||
/// escaped unicode literals like `\u2605`
|
||||
fn (mut s Scanner) ident_char() string {
|
||||
start := s.pos
|
||||
lspos := token.Position{
|
||||
line_nr: s.line_nr
|
||||
pos: s.pos
|
||||
col: s.pos - s.last_nl_pos - 1
|
||||
}
|
||||
|
||||
start := s.pos // the string position of the first backtick char
|
||||
slash := `\\`
|
||||
mut len := 0
|
||||
|
||||
// set flags for advanced escapes first
|
||||
escaped_hex := s.expect('\\x', start + 1)
|
||||
escaped_unicode := s.expect('\\u', start + 1)
|
||||
|
||||
// walk the string to get characters up to the next backtick
|
||||
for {
|
||||
s.pos++
|
||||
if s.pos >= s.text.len {
|
||||
|
@ -1334,12 +1385,68 @@ fn (mut s Scanner) ident_char() string {
|
|||
}
|
||||
}
|
||||
len--
|
||||
c := s.text[start + 1..s.pos]
|
||||
mut c := s.text[start + 1..s.pos]
|
||||
if len != 1 {
|
||||
// if the content expresses an escape code, it will have an even number of characters
|
||||
// e.g. \x61 or \u2605
|
||||
if (c.len % 2 == 0) && (escaped_hex || escaped_unicode) {
|
||||
if escaped_unicode {
|
||||
c = decode_u_escapes(c, 0, [0])
|
||||
} else {
|
||||
// we have to handle hex ourselves
|
||||
ascii_0 := byte(0x30)
|
||||
ascii_a := byte(0x61)
|
||||
mut accumulated := []byte{}
|
||||
val := c[2..c.len].to_lower() // 0A -> 0a
|
||||
mut offset := 0
|
||||
// take two characters at a time, parse as hex and add to bytes
|
||||
for {
|
||||
if offset >= val.len - 1 {
|
||||
break
|
||||
}
|
||||
mut byteval := byte(0)
|
||||
big := val[offset]
|
||||
little := val[offset + 1]
|
||||
if !big.is_hex_digit() {
|
||||
accumulated.clear()
|
||||
break
|
||||
}
|
||||
if !little.is_hex_digit() {
|
||||
accumulated.clear()
|
||||
break
|
||||
}
|
||||
|
||||
if big.is_digit() {
|
||||
byteval |= (big - ascii_0) << 4
|
||||
} else {
|
||||
byteval |= (big - ascii_a + 10) << 4
|
||||
}
|
||||
if little.is_digit() {
|
||||
byteval |= (little - ascii_0)
|
||||
} else {
|
||||
byteval |= (little - ascii_a + 10)
|
||||
}
|
||||
|
||||
accumulated << byteval
|
||||
offset += 2
|
||||
}
|
||||
if accumulated.len > 0 {
|
||||
c = accumulated.bytestr()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// the string inside the backticks is longer than one character
|
||||
// but it may still hold exactly one rune, e.g. a multi-byte character or a decoded escape sequence
|
||||
u := c.runes()
|
||||
if u.len != 1 {
|
||||
s.error('invalid character literal (more than one character)\n' +
|
||||
'use quotes for strings, backticks for characters')
|
||||
if escaped_hex || escaped_unicode {
|
||||
s.error('invalid character literal (escape sequence did not refer to a singular rune)')
|
||||
} else {
|
||||
s.add_error_detail_with_pos('use quotes for strings, backticks for characters',
|
||||
lspos)
|
||||
s.error('invalid character literal (more than one character)')
|
||||
}
|
||||
}
|
||||
}
|
||||
// Escapes a `'` character
|
||||
|
|
|
@ -19,6 +19,19 @@ fn scan_kinds(text string) []token.Kind {
|
|||
return token_kinds
|
||||
}
|
||||
|
||||
// scan_tokens scans `text` with a new scanner (created in `.parse_comments`
// mode with default preferences) and returns all tokens that come before `.eof`.
// The `.eof` token itself is not included.
fn scan_tokens(text string) []token.Token {
	mut sc := new_scanner(text, .parse_comments, &pref.Preferences{})
	mut collected := []token.Token{}
	mut current := sc.scan()
	for current.kind != .eof {
		collected << current
		current = sc.scan()
	}
	return collected
}
|
||||
|
||||
fn test_scan() {
|
||||
token_kinds := scan_kinds('println(2 + 3)')
|
||||
assert token_kinds.len == 6
|
||||
|
@ -138,6 +151,90 @@ fn test_ref_ref_array_ref_ref_foo() {
|
|||
}
|
||||
|
||||
// test_escape_string checks the token kind and literal that scan_tokens yields
// for escape sequences written inside rune (backtick) and string literals.
fn test_escape_string() {
	// These two assertions only exercise the vlib that is built into the
	// compiler running the test; they do not exercise this module's scanner.
	assert '\x61' == 'a'
	assert '\x62' == 'b'
	// The rune form is not asserted yet:
	// assert `\x61` == `a` // will work after pull request goes through

	// ===== rune literals (backticks) =====

	// apostrophe
	rune_apostrophe := scan_tokens(r"`'`")
	assert rune_apostrophe[0].kind == .chartoken
	assert rune_apostrophe[0].lit == r"\'"

	// escaped backtick
	rune_backtick := scan_tokens(r'`\``')
	assert rune_backtick[0].kind == .chartoken
	assert rune_backtick[0].lit == r'\`'

	// escaped backslash
	rune_slash := scan_tokens(r'`\\`')
	assert rune_slash[0].kind == .chartoken
	assert rune_slash[0].lit == r'\\'

	// unicode escape
	rune_unicode := scan_tokens(r'`\u2605`')
	assert rune_unicode[0].kind == .chartoken
	assert rune_unicode[0].lit == r'★'

	// hex escape of an ASCII character
	rune_hex_ascii := scan_tokens(r'`\x61`')
	assert rune_hex_ascii[0].kind == .chartoken
	assert rune_hex_ascii[0].lit == r'a'

	// multi-byte UTF-8 given as hex bytes
	// Compilation blocked by vlib/v/checker/check_types.v, but works in the repl
	rune_hex_utf8 := scan_tokens(r'`\xe29885`')
	assert rune_hex_utf8[0].lit == r'★'

	// ===== string literals =====

	// apostrophe
	str_apostrophe := scan_tokens(r"'\''")
	assert str_apostrophe[0].kind == .string
	assert str_apostrophe[0].lit == r"\'"

	// backtick
	str_backtick := scan_tokens(r"'\`'")
	assert str_backtick[0].kind == .string
	assert str_backtick[0].lit == r'\`'

	// backslash
	str_slash := scan_tokens(r"'\\'")
	assert str_slash[0].kind == .string
	assert str_slash[0].lit == r'\\'

	// unicode escape
	str_unicode := scan_tokens(r"'\u2605'")
	assert str_unicode[0].kind == .string
	assert str_unicode[0].lit == r'★'

	// hex escape of an ASCII character
	str_hex_ascii := scan_tokens(r"'\x61'")
	assert str_hex_ascii[0].kind == .string
	assert str_hex_ascii[0].lit == r'a'

	// hex escape above the ASCII range: only `\xe2` is replaced by a single
	// byte, the following digits stay literal (no conversion to unicode)
	str_hex_extended := scan_tokens(r"'\xe29885'")
	assert str_hex_extended[0].kind == .string
	assert str_hex_extended[0].lit.bytes() == [byte(0xe2), `9`, `8`, `8`, `5`]

	// Inputs that should always result in a scanner error (not asserted here):
	// scan_tokens(r'`\x61\x61`')
	// scan_tokens(r"'\x'")
	// scan_tokens(r'`hello`')
}
|
||||
|
||||
// test_comment_string checks that a single line comment is scanned as a
// `.comment` token whose literal starts with the byte 0x01.
fn test_comment_string() {
	line_comment := scan_tokens('// single line comment will get an \\x01 prepended')
	first := line_comment[0]
	assert first.kind == .comment
	assert first.lit[0] == byte(1) // \x01
	// Cases that are not checked yet:
	// scan_tokens('/// doc comment will keep third / at beginning')
	// scan_tokens('/* block comment will be stripped of whitespace */')
	// scan_tokens('a := 0 // line end comment also gets \\x01 prepended')
}
|
||||
|
|
Loading…
Reference in New Issue