scanner, cgen: improve support for escape codes in backticks/runes (#13127)

2022-01-11 15:36:18 -05:00 · 2022-01-11 15:36:18 -05:00 · ab642cac43
parent ea660315e0
commit ab642cac43
12 changed files with 249 additions and 48 deletions
--- a/vlib/v/checker/check_types.v
+++ b/vlib/v/checker/check_types.v
@ -617,8 +617,10 @@ pub fn (mut c Checker) string_inter_lit(mut node ast.StringInterLiteral) ast.Typ
 	return ast.string_type
 }

-const hex_lit_overflow_message = 'hex character literal overflows string'
+const unicode_lit_overflow_message = 'unicode character exceeds max allowed value of 0x10ffff, consider using a unicode literal (\\u####)'

+// unicode character literals are limited to a maximum value of 0x10ffff
+// https://stackoverflow.com/questions/52203351/why-unicode-is-restricted-to-0x10ffff
 pub fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type {
 	mut idx := 0
 	for idx < node.val.len {
@ -631,7 +633,7 @@ pub fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type {
 				start_idx := idx
 				idx++
 				next_ch := node.val[idx] or { return ast.string_type }
-				if next_ch == `x` {
+				if next_ch == `u` {
 					idx++
 					mut ch := node.val[idx] or { return ast.string_type }
 					mut hex_char_count := 0
@ -647,13 +649,13 @@ pub fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type {
 								first_digit := node.val[idx - 5] - 48
 								second_digit := node.val[idx - 4] - 48
 								if first_digit > 1 {
-									c.error(checker.hex_lit_overflow_message, end_pos)
+									c.error(checker.unicode_lit_overflow_message, end_pos)
 								} else if first_digit == 1 && second_digit > 0 {
-									c.error(checker.hex_lit_overflow_message, end_pos)
+									c.error(checker.unicode_lit_overflow_message, end_pos)
 								}
 							}
 							else {
-								c.error(checker.hex_lit_overflow_message, end_pos)
+								c.error(checker.unicode_lit_overflow_message, end_pos)
 							}
 						}
 						idx++
--- a/vlib/v/checker/tests/hex_literal_overflow.out
+++ b/vlib/v/checker/tests/hex_literal_overflow.out
@ -1,18 +0,0 @@
-vlib/v/checker/tests/hex_literal_overflow.vv:1:7: error: hex character literal overflows string
-    1 | a := '\x11ffff'
-      |       ~~~~~~~~
-    2 | b := '\x20ffff'
-    3 | c := '\x10fffff'
-vlib/v/checker/tests/hex_literal_overflow.vv:2:7: error: hex character literal overflows string
-    1 | a := '\x11ffff'
-    2 | b := '\x20ffff'
-      |       ~~~~~~~~
-    3 | c := '\x10fffff'
-    4 | println(a)
-vlib/v/checker/tests/hex_literal_overflow.vv:3:7: error: hex character literal overflows string
-    1 | a := '\x11ffff'
-    2 | b := '\x20ffff'
-    3 | c := '\x10fffff'
-      |       ~~~~~~~~~
-    4 | println(a)
-    5 | println(b)
--- a/vlib/v/checker/tests/string_escape_x_err_a.out
+++ b/vlib/v/checker/tests/string_escape_x_err_a.out
@ -1,4 +1,4 @@
-vlib/v/checker/tests/string_escape_x_err_a.vv:2:15: error: `\x` used with no following hex digits
+vlib/v/checker/tests/string_escape_x_err_a.vv:2:15: error: `\x` used without two following hex digits
    1 | fn main() {
    2 |     println('\x')
      |               ^
--- a/vlib/v/checker/tests/string_escape_x_err_b.out
+++ b/vlib/v/checker/tests/string_escape_x_err_b.out
@ -1,4 +1,4 @@
-vlib/v/checker/tests/string_escape_x_err_b.vv:2:15: error: `\x` used with no following hex digits
+vlib/v/checker/tests/string_escape_x_err_b.vv:2:15: error: `\x` used without two following hex digits
    1 | fn main() {
    2 |     println('\xhh')
      |               ^
--- a/vlib/v/gen/c/cgen.v
+++ b/vlib/v/gen/c/cgen.v
@ -2510,6 +2510,19 @@ fn (mut g Gen) expr_with_cast(expr ast.Expr, got_type_raw ast.Type, expected_typ
 	g.expr(expr)
 }

+fn cescape_nonascii(original string) string {
+	mut b := strings.new_builder(original.len)
+	for c in original {
+		if c < 32 || c > 126 {
+			b.write_string('\\${c:03o}')
+			continue
+		}
+		b.write_b(c)
+	}
+	res := b.str()
+	return res
+}
+
 // cestring returns a V string, properly escaped for embeddeding in a C string literal.
 fn cestring(s string) string {
 	return s.replace('\\', '\\\\').replace('"', "'")
@ -2517,7 +2530,7 @@ fn cestring(s string) string {

 // ctoslit returns a '_SLIT("$s")' call, where s is properly escaped.
 fn ctoslit(s string) string {
-	return '_SLIT("' + cestring(s) + '")'
+	return '_SLIT("' + cescape_nonascii(cestring(s)) + '")'
 }

 fn (mut g Gen) gen_attrs(attrs []ast.Attr) {
--- a/vlib/v/gen/c/str.v
+++ b/vlib/v/gen/c/str.v
@ -6,7 +6,7 @@ import v.ast
 import v.util

 fn (mut g Gen) string_literal(node ast.StringLiteral) {
-	escaped_val := util.smart_quote(node.val, node.is_raw)
+	escaped_val := cescape_nonascii(util.smart_quote(node.val, node.is_raw))
 	if node.language == .c {
 		g.write('"$escaped_val"')
 	} else {
@ -25,7 +25,7 @@ fn (mut g Gen) string_inter_literal_sb_optimized(call_expr ast.CallExpr) {
 	is_nl := call_expr.name == 'writeln'
 	// println('optimize sb $call_expr.name')
 	for i, val in node.vals {
-		escaped_val := util.smart_quote(val, false)
+		escaped_val := cescape_nonascii(util.smart_quote(val, false))
 		// if val == '' {
 		// break
 		// continue
--- a/vlib/v/scanner/scanner.v
+++ b/vlib/v/scanner/scanner.v
@ -1174,6 +1174,7 @@ fn (mut s Scanner) ident_string() string {
 	}
 	s.is_inside_string = false
 	mut u_escapes_pos := []int{} // pos list of \uXXXX
+	mut h_escapes_pos := []int{} // pos list of \xXX
 	mut backslash_count := if start_char == scanner.backslash { 1 } else { 0 }
 	for {
 		s.pos++
@ -1221,8 +1222,12 @@ fn (mut s Scanner) ident_string() string {
 		// Escape `\x` `\u`
 		if backslash_count % 2 == 1 && !is_raw && !is_cstr {
 			// Escape `\x`
-			if c == `x` && (s.text[s.pos + 1] == s.quote || !s.text[s.pos + 1].is_hex_digit()) {
-				s.error(r'`\x` used with no following hex digits')
+			if c == `x` {
+				if s.text[s.pos + 1] == s.quote || !(s.text[s.pos + 1].is_hex_digit()
+					&& s.text[s.pos + 2].is_hex_digit()) {
+					s.error(r'`\x` used without two following hex digits')
+				}
+				h_escapes_pos << s.pos - 1
 			}
 			// Escape `\u`
 			if c == `u` {
@ -1266,6 +1271,9 @@ fn (mut s Scanner) ident_string() string {
 		if !s.is_fmt && u_escapes_pos.len > 0 {
 			string_so_far = decode_u_escapes(string_so_far, start, u_escapes_pos)
 		}
+		if !s.is_fmt && h_escapes_pos.len > 0 {
+			string_so_far = decode_h_escapes(string_so_far, start, h_escapes_pos)
+		}
 		if n_cr_chars > 0 {
 			string_so_far = string_so_far.replace('\r', '')
 		}
@ -1278,6 +1286,27 @@ fn (mut s Scanner) ident_string() string {
 	return lit
 }

+// only handle single-byte inline escapes like '\xc0'
+fn decode_h_escapes(s string, start int, escapes_pos []int) string {
+	if escapes_pos.len == 0 {
+		return s
+	}
+	mut ss := []string{cap: escapes_pos.len * 2 + 1}
+	ss << s[..escapes_pos.first() - start]
+	for i, pos in escapes_pos {
+		idx := pos - start
+		end_idx := idx + 4 // "\xXX".len == 4
+		// notice this function doesn't do any decoding... it just replaces '\xc0' with the byte 0xc0
+		ss << [byte(strconv.parse_uint(s[idx + 2..end_idx], 16, 8) or { 0 })].bytestr()
+		if i + 1 < escapes_pos.len {
+			ss << s[end_idx..escapes_pos[i + 1] - start]
+		} else {
+			ss << s[end_idx..]
+		}
+	}
+	return ss.join('')
+}
+
 fn decode_u_escapes(s string, start int, escapes_pos []int) string {
 	if escapes_pos.len == 0 {
 		return s
@ -1312,10 +1341,32 @@ fn trim_slash_line_break(s string) string {
 	return ret_str
 }

+/// ident_char is called when a backtick "single-char" is parsed from the code
+/// it is needed because some runes (chars) are written with escape sequences
+/// the string it returns should be a standardized, simplified version of the character
+/// as it would appear in source code
+/// possibilities:
+///   single chars like `a`, `b` => 'a', 'b'
+///   escaped single chars like `\\`, `\``, `\n` => '\\', '`', '\n'
+///   escaped hex bytes like `\x01`, `\x61` => '\x01', 'a'
+///   escaped multibyte runes like `\xe29885` => (★)
+///   escaped unicode literals like `\u2605`
 fn (mut s Scanner) ident_char() string {
-	start := s.pos
+	lspos := token.Position{
+		line_nr: s.line_nr
+		pos: s.pos
+		col: s.pos - s.last_nl_pos - 1
+	}
+
+	start := s.pos // the string position of the first backtick char
 	slash := `\\`
 	mut len := 0
+
+	// set flags for advanced escapes first
+	escaped_hex := s.expect('\\x', start + 1)
+	escaped_unicode := s.expect('\\u', start + 1)
+
+	// walk the string to get characters up to the next backtick
 	for {
 		s.pos++
 		if s.pos >= s.text.len {
@ -1334,12 +1385,68 @@ fn (mut s Scanner) ident_char() string {
 		}
 	}
 	len--
-	c := s.text[start + 1..s.pos]
+	mut c := s.text[start + 1..s.pos]
 	if len != 1 {
+		// if the content expresses an escape code, it will have an even number of characters
+		// e.g. \x61 or \u2605
+		if (c.len % 2 == 0) && (escaped_hex || escaped_unicode) {
+			if escaped_unicode {
+				c = decode_u_escapes(c, 0, [0])
+			} else {
+				// we have to handle hex ourselves
+				ascii_0 := byte(0x30)
+				ascii_a := byte(0x61)
+				mut accumulated := []byte{}
+				val := c[2..c.len].to_lower() // 0A -> 0a
+				mut offset := 0
+				// take two characters at a time, parse as hex and add to bytes
+				for {
+					if offset >= val.len - 1 {
+						break
+					}
+					mut byteval := byte(0)
+					big := val[offset]
+					little := val[offset + 1]
+					if !big.is_hex_digit() {
+						accumulated.clear()
+						break
+					}
+					if !little.is_hex_digit() {
+						accumulated.clear()
+						break
+					}
+
+					if big.is_digit() {
+						byteval |= (big - ascii_0) << 4
+					} else {
+						byteval |= (big - ascii_a + 10) << 4
+					}
+					if little.is_digit() {
+						byteval |= (little - ascii_0)
+					} else {
+						byteval |= (little - ascii_a + 10)
+					}
+
+					accumulated << byteval
+					offset += 2
+				}
+				if accumulated.len > 0 {
+					c = accumulated.bytestr()
+				}
+			}
+		}
+
+		// the string inside the backticks is longer than one character
+		// but we might only have one rune, say in the case
 		u := c.runes()
 		if u.len != 1 {
-			s.error('invalid character literal (more than one character)\n' +
-				'use quotes for strings, backticks for characters')
+			if escaped_hex || escaped_unicode {
+				s.error('invalid character literal (escape sequence did not refer to a singular rune)')
+			} else {
+				s.add_error_detail_with_pos('use quotes for strings, backticks for characters',
+					lspos)
+				s.error('invalid character literal (more than one character)')
+			}
 		}
 	}
 	// Escapes a `'` character
--- a/vlib/v/scanner/scanner_test.v
+++ b/vlib/v/scanner/scanner_test.v
@ -19,6 +19,19 @@ fn scan_kinds(text string) []token.Kind {
 	return token_kinds
 }

+fn scan_tokens(text string) []token.Token {
+	mut scanner := new_scanner(text, .parse_comments, &pref.Preferences{})
+	mut tokens := []token.Token{}
+	for {
+		tok := scanner.scan()
+		if tok.kind == .eof {
+			break
+		}
+		tokens << tok
+	}
+	return tokens
+}
+
 fn test_scan() {
 	token_kinds := scan_kinds('println(2 + 3)')
 	assert token_kinds.len == 6
@ -138,6 +151,90 @@ fn test_ref_ref_array_ref_ref_foo() {
 }

 fn test_escape_string() {
+	// these assertions aren't helpful...
+	// they test the vlib built-in to the compiler,
+	// but we want to test this module before compilation
 	assert '\x61' == 'a'
 	assert '\x62' == 'b'
+	// assert `\x61` == `a` // will work after pull request goes through
+
+	// SINGLE CHAR ESCAPES
+	// SINGLE CHAR APOSTROPHE
+	mut result := scan_tokens(r"`'`")
+	assert result[0].kind == .chartoken
+	assert result[0].lit == r"\'"
+
+	// SINGLE CHAR BACKTICK
+	result = scan_tokens(r'`\``')
+	assert result[0].kind == .chartoken
+	assert result[0].lit == r'\`'
+
+	// SINGLE CHAR SLASH
+	result = scan_tokens(r'`\\`')
+	assert result[0].kind == .chartoken
+	assert result[0].lit == r'\\'
+
+	// SINGLE CHAR UNICODE ESCAPE
+	result = scan_tokens(r'`\u2605`')
+	assert result[0].kind == .chartoken
+	assert result[0].lit == r'★'
+
+	// SINGLE CHAR ESCAPED ASCII
+	result = scan_tokens(r'`\x61`')
+	assert result[0].kind == .chartoken
+	assert result[0].lit == r'a'
+
+	// SINGLE CHAR INCORRECT ESCAPE
+	// result = scan_tokens(r'`\x61\x61`') // should always result in an error
+
+	// SINGLE CHAR MULTI-BYTE UTF-8
+	// Compilation blocked by vlib/v/checker/check_types.v, but works in the repl
+	result = scan_tokens(r'`\xe29885`')
+	assert result[0].lit == r'★'
+
+	// STRING ESCAPES =================
+	// STRING APOSTROPHE
+	result = scan_tokens(r"'\''")
+	assert result[0].kind == .string
+	assert result[0].lit == r"\'"
+
+	// STRING BACKTICK
+	result = scan_tokens(r"'\`'")
+	assert result[0].kind == .string
+	assert result[0].lit == r'\`'
+
+	// STRING SLASH
+	result = scan_tokens(r"'\\'")
+	assert result[0].kind == .string
+	assert result[0].lit == r'\\'
+
+	// STRING UNICODE ESCAPE
+	result = scan_tokens(r"'\u2605'")
+	assert result[0].kind == .string
+	assert result[0].lit == r'★'
+
+	// STRING ESCAPED ASCII
+	result = scan_tokens(r"'\x61'")
+	assert result[0].kind == .string
+	assert result[0].lit == r'a'
+
+	// STRING ESCAPED EXTENDED ASCII
+	// (should not be converted to unicode)
+	result = scan_tokens(r"'\xe29885'")
+	assert result[0].kind == .string
+	assert result[0].lit.bytes() == [byte(0xe2), `9`, `8`, `8`, `5`]
+
+	// SHOULD RESULT IN ERRORS
+	// result = scan_tokens(r'`\x61\x61`') // should always result in an error
+	// result = scan_tokens(r"'\x'") // should always result in an error
+	// result = scan_tokens(r'`hello`') // should always result in an error
+}
+
+fn test_comment_string() {
+	mut result := scan_tokens('// single line comment will get an \\x01 prepended')
+	assert result[0].kind == .comment
+	assert result[0].lit[0] == byte(1) // \x01
+	// result = scan_tokens('/// doc comment will keep third / at beginning')
+	// result = scan_tokens('/* block comment will be stripped of whitespace */')
+	// result = scan_tokens('a := 0 // line end comment also gets \\x01 prepended')
 }