scanner, cgen: improve support for escape codes in backticks/runes (#13127)

2022-01-11 15:36:18 -05:00 · 2022-01-11 15:36:18 -05:00 · ab642cac43
parent ea660315e0
commit ab642cac43
12 changed files with 249 additions and 48 deletions
--- a/vlib/v/checker/check_types.v
+++ b/vlib/v/checker/check_types.v
@ -617,8 +617,10 @@ pub fn (mut c Checker) string_inter_lit(mut node ast.StringInterLiteral) ast.Typ
 	return ast.string_type
 }
-const hex_lit_overflow_message = 'hex character literal overflows string'
+const unicode_lit_overflow_message = 'unicode character exceeds max allowed value of 0x10ffff, consider using a unicode literal (\\u####)'
 // unicode character literals are limited to a maximum value of 0x10ffff
 // https://stackoverflow.com/questions/52203351/why-unicode-is-restricted-to-0x10ffff
 pub fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type {
 	mut idx := 0
 	for idx < node.val.len {
@ -631,7 +633,7 @@ pub fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type {
 				start_idx := idx
 				idx++
 				next_ch := node.val[idx] or { return ast.string_type }
-				if next_ch == `x` {
+				if next_ch == `u` {
 					idx++
 					mut ch := node.val[idx] or { return ast.string_type }
 					mut hex_char_count := 0
@ -647,13 +649,13 @@ pub fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type {
 								first_digit := node.val[idx - 5] - 48
 								second_digit := node.val[idx - 4] - 48
 								if first_digit > 1 {
-									c.error(checker.hex_lit_overflow_message, end_pos)
+									c.error(checker.unicode_lit_overflow_message, end_pos)
 								} else if first_digit == 1 && second_digit > 0 {
-									c.error(checker.hex_lit_overflow_message, end_pos)
+									c.error(checker.unicode_lit_overflow_message, end_pos)
 								}
 							}
 							else {
-								c.error(checker.hex_lit_overflow_message, end_pos)
+								c.error(checker.unicode_lit_overflow_message, end_pos)
 							}
 						}
 						idx++
--- a/vlib/v/checker/tests/hex_literal_overflow.out
+++ b/vlib/v/checker/tests/hex_literal_overflow.out
@ -1,18 +0,0 @@
 vlib/v/checker/tests/hex_literal_overflow.vv:1:7: error: hex character literal overflows string
    1 | a := '\x11ffff'
      |       ~~~~~~~~
    2 | b := '\x20ffff'
    3 | c := '\x10fffff'
 vlib/v/checker/tests/hex_literal_overflow.vv:2:7: error: hex character literal overflows string
    1 | a := '\x11ffff'
    2 | b := '\x20ffff'
      |       ~~~~~~~~
    3 | c := '\x10fffff'
    4 | println(a)
 vlib/v/checker/tests/hex_literal_overflow.vv:3:7: error: hex character literal overflows string
    1 | a := '\x11ffff'
    2 | b := '\x20ffff'
    3 | c := '\x10fffff'
      |       ~~~~~~~~~
    4 | println(a)
    5 | println(b)
--- a/vlib/v/checker/tests/import_mod_sub_as_sub_err.out
+++ b/vlib/v/checker/tests/import_mod_sub_as_sub_err.out
@ -1,5 +1,5 @@
-vlib/v/checker/tests/import_mod_sub_as_sub_err.vv:1:25: error: import alias `encoding.utf8 as utf8` is redundant
+vlib/v/checker/tests/import_mod_sub_as_sub_err.vv:1:25: error: import alias `encoding.utf8 as utf8` is redundant
-    1 | import encoding.utf8 as utf8
+    1 | import encoding.utf8 as utf8
-      |                         ~~~~
+      |                         ~~~~
-    2 |
+    2 | 
-    3 | fn main() {
+    3 | fn main() {
--- a/vlib/v/checker/tests/minus_op_wrong_type_err.out
+++ b/vlib/v/checker/tests/minus_op_wrong_type_err.out
@ -1,5 +1,5 @@
 vlib/v/checker/tests/minus_op_wrong_type_err.vv:10:10: error: mismatched types `Aaa` and `int literal`
-    8 |
+    8 | 
    9 | fn main() {
   10 |     println(Aaa{} - 10)
      |             ~~~~~~~~~~
--- a/vlib/v/checker/tests/mut_array_get_element_address_err.out
+++ b/vlib/v/checker/tests/mut_array_get_element_address_err.out
@ -1,7 +1,7 @@
-vlib/v/checker/tests/mut_array_get_element_address_err.vv:3:20: error: cannot take the address of mutable array elements outside unsafe blocks
+vlib/v/checker/tests/mut_array_get_element_address_err.vv:3:20: error: cannot take the address of mutable array elements outside unsafe blocks
-    1 | fn main() {
+    1 | fn main() {
-    2 |     mut arr_int := [int(23), 45, 7, 8]
+    2 |     mut arr_int := [int(23), 45, 7, 8]
-    3 |     ele := &arr_int[1]
+    3 |     ele := &arr_int[1]
-      |                    ~~~
+      |                    ~~~
-    4 |     println(ele)
+    4 |     println(ele)
-    5 | }
+    5 | }
--- a/vlib/v/checker/tests/string_escape_x_err_a.out
+++ b/vlib/v/checker/tests/string_escape_x_err_a.out
@ -1,4 +1,4 @@
-vlib/v/checker/tests/string_escape_x_err_a.vv:2:15: error: `\x` used with no following hex digits
+vlib/v/checker/tests/string_escape_x_err_a.vv:2:15: error: `\x` used without two following hex digits
    1 | fn main() {
    2 |     println('\x')
      |               ^
--- a/vlib/v/checker/tests/string_escape_x_err_b.out
+++ b/vlib/v/checker/tests/string_escape_x_err_b.out
@ -1,4 +1,4 @@
-vlib/v/checker/tests/string_escape_x_err_b.vv:2:15: error: `\x` used with no following hex digits
+vlib/v/checker/tests/string_escape_x_err_b.vv:2:15: error: `\x` used without two following hex digits
    1 | fn main() {
    2 |     println('\xhh')
      |               ^
--- a/vlib/v/checker/tests/unknown_function.out
+++ b/vlib/v/checker/tests/unknown_function.out
@ -1,5 +1,5 @@
 vlib/v/checker/tests/unknown_function.vv:4:15: error: unknown function: math.max_i64
-    2 |
+    2 | 
    3 | fn main() {
    4 |     println(math.max_i64())
      |                  ~~~~~~~~~
--- a/vlib/v/gen/c/cgen.v
+++ b/vlib/v/gen/c/cgen.v
@ -2510,6 +2510,19 @@ fn (mut g Gen) expr_with_cast(expr ast.Expr, got_type_raw ast.Type, expected_typ
 	g.expr(expr)
 }
 // cescape_nonascii returns a copy of `original`, in which every byte outside
 // of the printable ASCII range (32..126 inclusive) is replaced by a backslash,
 // followed by the value of that byte, written as 3 octal digits (`\ooo`).
 // Bytes inside that range are copied unchanged.
 fn cescape_nonascii(original string) string {
 	mut sb := strings.new_builder(original.len)
 	for ch in original {
 		if ch >= 32 && ch <= 126 {
 			// printable ASCII: emit as is
 			sb.write_b(ch)
 		} else {
 			// control or non ASCII byte: emit a zero padded octal escape
 			sb.write_string('\\${ch:03o}')
 		}
 	}
 	return sb.str()
 }
 // cestring returns a V string, properly escaped for embedding in a C string literal.
 fn cestring(s string) string {
 	return s.replace('\\', '\\\\').replace('"', "'")
@ -2517,7 +2530,7 @@ fn cestring(s string) string {
 // ctoslit returns a '_SLIT("$s")' call, where s is properly escaped.
 // The removed (`-`) line escaped s with cestring only; the added (`+`) line
 // passes the result of cestring through cescape_nonascii as well.
 fn ctoslit(s string) string {
-	return '_SLIT("' + cestring(s) + '")'
+	return '_SLIT("' + cescape_nonascii(cestring(s)) + '")'
 }
 fn (mut g Gen) gen_attrs(attrs []ast.Attr) {
--- a/vlib/v/gen/c/str.v
+++ b/vlib/v/gen/c/str.v
@ -6,7 +6,7 @@ import v.ast
 import v.util
 fn (mut g Gen) string_literal(node ast.StringLiteral) {
-	escaped_val := util.smart_quote(node.val, node.is_raw)
+	escaped_val := cescape_nonascii(util.smart_quote(node.val, node.is_raw))
 	if node.language == .c {
 		g.write('"$escaped_val"')
 	} else {
@ -25,7 +25,7 @@ fn (mut g Gen) string_inter_literal_sb_optimized(call_expr ast.CallExpr) {
 	is_nl := call_expr.name == 'writeln'
 	// println('optimize sb $call_expr.name')
 	for i, val in node.vals {
-		escaped_val := util.smart_quote(val, false)
+		escaped_val := cescape_nonascii(util.smart_quote(val, false))
 		// if val == '' {
 		// break
 		// continue
--- a/vlib/v/scanner/scanner.v
+++ b/vlib/v/scanner/scanner.v
@ -1174,6 +1174,7 @@ fn (mut s Scanner) ident_string() string {
 	}
 	s.is_inside_string = false
 	mut u_escapes_pos := []int{} // pos list of \uXXXX
 	mut h_escapes_pos := []int{} // pos list of \xXX
 	mut backslash_count := if start_char == scanner.backslash { 1 } else { 0 }
 	for {
 		s.pos++
@ -1221,8 +1222,12 @@ fn (mut s Scanner) ident_string() string {
 		// Escape `\x` `\u`
 		if backslash_count % 2 == 1 && !is_raw && !is_cstr {
 			// Escape `\x`
-			if c == `x` && (s.text[s.pos + 1] == s.quote || !s.text[s.pos + 1].is_hex_digit()) {
+			if c == `x` {
-				s.error(r'`\x` used with no following hex digits')
+				if s.text[s.pos + 1] == s.quote || !(s.text[s.pos + 1].is_hex_digit()
 					&& s.text[s.pos + 2].is_hex_digit()) {
 					s.error(r'`\x` used without two following hex digits')
 				}
 				h_escapes_pos << s.pos - 1
 			}
 			// Escape `\u`
 			if c == `u` {
@ -1266,6 +1271,9 @@ fn (mut s Scanner) ident_string() string {
 		if !s.is_fmt && u_escapes_pos.len > 0 {
 			string_so_far = decode_u_escapes(string_so_far, start, u_escapes_pos)
 		}
 		if !s.is_fmt && h_escapes_pos.len > 0 {
 			string_so_far = decode_h_escapes(string_so_far, start, h_escapes_pos)
 		}
 		if n_cr_chars > 0 {
 			string_so_far = string_so_far.replace('\r', '')
 		}
@ -1278,6 +1286,27 @@ fn (mut s Scanner) ident_string() string {
 	return lit
 }
 // decode_h_escapes replaces each 4 character `\xXX` escape in `s` with the single byte 0xXX.
 // It only handles single-byte inline escapes like '\xc0'.
 // `escapes_pos` holds the positions of the backslashes that start the escapes,
 // and `start` is subtracted from each of them, to get an index into `s`.
 // The positions are assumed to be in ascending order (each text segment is sliced
 // from the end of one escape to the start of the next one).
 // An escape that is truncated, either by the end of `s` or by the start of the next
 // escape, is copied verbatim, instead of triggering an out of bounds slice panic.
 fn decode_h_escapes(s string, start int, escapes_pos []int) string {
 	if escapes_pos.len == 0 {
 		return s
 	}
 	mut ss := []string{cap: escapes_pos.len * 2 + 1}
 	ss << s[..escapes_pos.first() - start]
 	for i, pos in escapes_pos {
 		idx := pos - start
 		// the plain text after this escape runs up to the next escape, or to the end of s
 		next_idx := if i + 1 < escapes_pos.len { escapes_pos[i + 1] - start } else { s.len }
 		end_idx := idx + 4 // "\xXX".len == 4
 		if end_idx > next_idx {
 			// fewer than 2 characters follow `\x` here; keep the malformed escape unchanged
 			ss << s[idx..next_idx]
 			continue
 		}
 		// notice this function doesn't do any decoding... it just replaces '\xc0' with the byte 0xc0
 		// (characters that do not parse as hex produce the byte 0)
 		ss << [byte(strconv.parse_uint(s[idx + 2..end_idx], 16, 8) or { 0 })].bytestr()
 		ss << s[end_idx..next_idx]
 	}
 	return ss.join('')
 }
 fn decode_u_escapes(s string, start int, escapes_pos []int) string {
 	if escapes_pos.len == 0 {
 		return s
@ -1312,10 +1341,32 @@ fn trim_slash_line_break(s string) string {
 	return ret_str
 }
 /// ident_char is called when a backtick "single-char" is parsed from the code
 /// it is needed because some runes (chars) are written with escape sequences
 /// the string it returns should be a standardized, simplified version of the character
 /// as it would appear in source code
 /// possibilities:
 ///   single chars like `a`, `b` => 'a', 'b'
 ///   escaped single chars like `\\`, `\``, `\n` => '\\', '`', '\n'
 ///   escaped hex bytes like `\x01`, `\x61` => '\x01', 'a'
 ///   escaped multibyte runes like `\xe29885` => (★)
 ///   escaped unicode literals like `\u2605`
 fn (mut s Scanner) ident_char() string {
-	start := s.pos
+	lspos := token.Position{
 		line_nr: s.line_nr
 		pos: s.pos
 		col: s.pos - s.last_nl_pos - 1
 	}
 	start := s.pos // the string position of the first backtick char
 	slash := `\\`
 	mut len := 0
 	// set flags for advanced escapes first
 	escaped_hex := s.expect('\\x', start + 1)
 	escaped_unicode := s.expect('\\u', start + 1)
 	// walk the string to get characters up to the next backtick
 	for {
 		s.pos++
 		if s.pos >= s.text.len {
@ -1334,12 +1385,68 @@ fn (mut s Scanner) ident_char() string {
 		}
 	}
 	len--
-	c := s.text[start + 1..s.pos]
+	mut c := s.text[start + 1..s.pos]
 	if len != 1 {
 		// if the content expresses an escape code, it will have an even number of characters
 		// e.g. \x61 or \u2605
 		if (c.len % 2 == 0) && (escaped_hex || escaped_unicode) {
 			if escaped_unicode {
 				c = decode_u_escapes(c, 0, [0])
 			} else {
 				// we have to handle hex ourselves
 				ascii_0 := byte(0x30)
 				ascii_a := byte(0x61)
 				mut accumulated := []byte{}
 				val := c[2..c.len].to_lower() // 0A -> 0a
 				mut offset := 0
 				// take two characters at a time, parse as hex and add to bytes
 				for {
 					if offset >= val.len - 1 {
 						break
 					}
 					mut byteval := byte(0)
 					big := val[offset]
 					little := val[offset + 1]
 					if !big.is_hex_digit() {
 						accumulated.clear()
 						break
 					}
 					if !little.is_hex_digit() {
 						accumulated.clear()
 						break
 					}
 					if big.is_digit() {
 						byteval |= (big - ascii_0) << 4
 					} else {
 						byteval |= (big - ascii_a + 10) << 4
 					}
 					if little.is_digit() {
 						byteval |= (little - ascii_0)
 					} else {
 						byteval |= (little - ascii_a + 10)
 					}
 					accumulated << byteval
 					offset += 2
 				}
 				if accumulated.len > 0 {
 					c = accumulated.bytestr()
 				}
 			}
 		}
 		// the string inside the backticks is longer than one character,
 		// but it may still be a single rune, e.g. in the case of a decoded
 		// escape sequence, or of a multi-byte UTF-8 character
 		u := c.runes()
 		if u.len != 1 {
-			s.error('invalid character literal (more than one character)\n' +
+			if escaped_hex || escaped_unicode {
-				'use quotes for strings, backticks for characters')
+				s.error('invalid character literal (escape sequence did not refer to a singular rune)')
 			} else {
 				s.add_error_detail_with_pos('use quotes for strings, backticks for characters',
 					lspos)
 				s.error('invalid character literal (more than one character)')
 			}
 		}
 	}
 	// Escapes a `'` character
--- a/vlib/v/scanner/scanner_test.v
+++ b/vlib/v/scanner/scanner_test.v
@ -19,6 +19,19 @@ fn scan_kinds(text string) []token.Kind {
 	return token_kinds
 }
 // scan_tokens creates a new scanner over `text` (in .parse_comments mode, with a
 // default initialised pref.Preferences), and returns all the tokens that it
 // produces, in order, up to but not including the first .eof token.
 fn scan_tokens(text string) []token.Token {
 	mut sc := new_scanner(text, .parse_comments, &pref.Preferences{})
 	mut collected := []token.Token{}
 	for {
 		current := sc.scan()
 		if tok_is_eof := current.kind == .eof {
 			_ = tok_is_eof
 			break
 		}
 		collected << current
 	}
 	return collected
 }
 fn test_scan() {
 	token_kinds := scan_kinds('println(2 + 3)')
 	assert token_kinds.len == 6
@ -138,6 +151,90 @@ fn test_ref_ref_array_ref_ref_foo() {
 }
 // test_escape_string scans small source snippets with scan_tokens, and checks
 // the kind and the literal text of the first resulting token, for both rune
 // (backtick) literals and string literals that contain escape sequences.
 fn test_escape_string() {
 	// these assertions aren't helpful...
 	// they test the vlib built-in to the compiler,
 	// but we want to test this module before compilation
 	assert '\x61' == 'a'
 	assert '\x62' == 'b'
 	// assert `\x61` == `a` // will work after pull request goes through
 	// SINGLE CHAR ESCAPES ============
 	// SINGLE CHAR APOSTROPHE
 	// (the expected literal is the apostrophe, preceded by a backslash)
 	mut result := scan_tokens(r"`'`")
 	assert result[0].kind == .chartoken
 	assert result[0].lit == r"\'"
 	// SINGLE CHAR BACKTICK
 	result = scan_tokens(r'`\``')
 	assert result[0].kind == .chartoken
 	assert result[0].lit == r'\`'
 	// SINGLE CHAR SLASH
 	result = scan_tokens(r'`\\`')
 	assert result[0].kind == .chartoken
 	assert result[0].lit == r'\\'
 	// SINGLE CHAR UNICODE ESCAPE
 	result = scan_tokens(r'`\u2605`')
 	assert result[0].kind == .chartoken
 	assert result[0].lit == r'★'
 	// SINGLE CHAR ESCAPED ASCII
 	result = scan_tokens(r'`\x61`')
 	assert result[0].kind == .chartoken
 	assert result[0].lit == r'a'
 	// SINGLE CHAR INCORRECT ESCAPE
 	// result = scan_tokens(r'`\x61\x61`') // should always result in an error
 	// SINGLE CHAR MULTI-BYTE UTF-8
 	// Compilation blocked by vlib/v/checker/check_types.v, but works in the repl
 	// (only the literal is checked here, not the token kind)
 	result = scan_tokens(r'`\xe29885`')
 	assert result[0].lit == r'★'
 	// STRING ESCAPES =================
 	// STRING APOSTROPHE
 	result = scan_tokens(r"'\''")
 	assert result[0].kind == .string
 	assert result[0].lit == r"\'"
 	// STRING BACKTICK
 	result = scan_tokens(r"'\`'")
 	assert result[0].kind == .string
 	assert result[0].lit == r'\`'
 	// STRING SLASH
 	result = scan_tokens(r"'\\'")
 	assert result[0].kind == .string
 	assert result[0].lit == r'\\'
 	// STRING UNICODE ESCAPE
 	result = scan_tokens(r"'\u2605'")
 	assert result[0].kind == .string
 	assert result[0].lit == r'★'
 	// STRING ESCAPED ASCII
 	result = scan_tokens(r"'\x61'")
 	assert result[0].kind == .string
 	assert result[0].lit == r'a'
 	// STRING ESCAPED EXTENDED ASCII
 	// (should not be converted to unicode)
 	// only the two hex digits right after `\x` become a byte (0xe2),
 	// the remaining digits are expected to stay as ordinary characters
 	result = scan_tokens(r"'\xe29885'")
 	assert result[0].kind == .string
 	assert result[0].lit.bytes() == [byte(0xe2), `9`, `8`, `8`, `5`]
 	// SHOULD RESULT IN ERRORS
 	// (left disabled; no error case is asserted by this test)
 	// result = scan_tokens(r'`\x61\x61`') // should always result in an error
 	// result = scan_tokens(r"'\x'") // should always result in an error
 	// result = scan_tokens(r'`hello`') // should always result in an error
 }
 // test_comment_string checks that scanning a single line comment yields a
 // .comment token, whose literal starts with the byte 0x01.
 fn test_comment_string() {
 	mut tokens := scan_tokens('// single line comment will get an \\x01 prepended')
 	assert tokens[0].kind == .comment
 	assert tokens[0].lit[0] == byte(1) // \x01
 	// cases that are not checked yet:
 	// tokens = scan_tokens('/// doc comment will keep third / at beginning')
 	// tokens = scan_tokens('/* block comment will be stripped of whitespace */')
 	// tokens = scan_tokens('a := 0 // line end comment also gets \\x01 prepended')
 }