toml: fix 7 escape tests (#12017)

2021-09-29 18:28:09 +02:00 · 2021-09-29 18:28:09 +02:00 · e3d3727c0c
parent c2f535fee1
commit e3d3727c0c
3 changed files with 98 additions and 25 deletions
--- a/vlib/toml/checker/checker.v
+++ b/vlib/toml/checker/checker.v
@ -9,6 +9,8 @@ import toml.ast.walker
 import toml.token
 import toml.scanner

+pub const allowed_basic_escape_chars = [`u`, `U`, `b`, `t`, `n`, `f`, `r`, `"`, `\\`] // characters accepted directly after a backslash in *basic* strings
+
 // Checker checks a tree of TOML `ast.Value`'s for common errors.
 pub struct Checker {
 	scanner &scanner.Scanner
@ -172,12 +174,68 @@ fn (c Checker) check_boolean(b ast.Bool) ? {
 		' boolean values like "$lit" can only be `true` or `false` literals, not `$lit` in ...${c.excerpt(b.pos)}...')
 }

-fn (c Checker) check_quoted(b ast.Quoted) ? {
-	lit := b.text
-	quote := b.quote.ascii_str()
+// check_quoted returns an error if the quoted string `q` is a multi-line
+// string whose text still ends with its own triple-quote delimiter, or if
+// `check_quoted_escapes` rejects one of its escape sequences.
+fn (c Checker) check_quoted(q ast.Quoted) ? {
+	lit := q.text
+	quote := q.quote.ascii_str()
 	triple_quote := quote + quote + quote
-	if b.is_multiline && lit.ends_with(triple_quote) {
+	if q.is_multiline && lit.ends_with(triple_quote) {
 		return error(@MOD + '.' + @STRUCT + '.' + @FN +
-			' string values like "$lit" is has unbalanced quote literals `b.quote` in ...${c.excerpt(b.pos)}...')
+			' string values like "$lit" has unbalanced quote literals `$quote` in ...${c.excerpt(q.pos)}...')
+	}
+	c.check_quoted_escapes(q) ?
+}
+
+// check_quoted_escapes returns an error for the first disallowed escape sequence found in `q.text`.
+// Delimiters in TOML have significant meaning:
+// '/''' delimit *literal* strings (WYSIWYG / What-you-see-is-what-you-get); their escapes are not validated here
+// "/""" delimit *basic* strings
+// Allowed escapes in *basic* strings are:
+// \b         - backspace       (U+0008)
+// \t         - tab             (U+0009)
+// \n         - linefeed        (U+000A)
+// \f         - form feed       (U+000C)
+// \r         - carriage return (U+000D)
+// \"         - quote           (U+0022)
+// \\         - backslash       (U+005C)
+// \uXXXX     - unicode         (U+XXXX)
+// \UXXXXXXXX - unicode         (U+XXXXXXXX)
+fn (c Checker) check_quoted_escapes(q ast.Quoted) ? {
+	// Set up a scanner in stack memory for easier navigation.
+	mut s := scanner.new_simple(q.text) ?
+
+	is_basic := q.quote == `\"` // only double-quoted (basic) strings are validated below
+	for {
+		ch := s.next()
+		if ch == -1 { // nothing left to read (presumably end of text - confirm against `Scanner.next`)
+			break
+		}
+		ch_byte := byte(ch)
+		if ch == `\\` {
+			next_ch := byte(s.at())
+
+			if next_ch == `\\` { // escaped backslash: consume its second half so it is not treated as the start of another escape
+				s.next()
+				continue
+			}
+			escape := ch_byte.ascii_str() + next_ch.ascii_str()
+			if is_basic {
+				if q.is_multiline {
+					if next_ch == byte(32) && s.peek(1) == byte(92) { // backslash, space (32), backslash (92)
+						st := s.state()
+						return error(@MOD + '.' + @STRUCT + '.' + @FN +
+							' can not escape whitespaces before escapes in multi-line strings (`\\ \\`) at `$escape` ($st.line_nr,$st.col) in ...${c.excerpt(q.pos)}...')
+					}
+					if next_ch in [`\t`, `\n`, ` `] { // a backslash before tab, newline or space is accepted in multi-line basic strings
+						s.next()
+						continue
+					}
+				}
+				if next_ch !in checker.allowed_basic_escape_chars {
+					st := s.state()
+					return error(@MOD + '.' + @STRUCT + '.' + @FN +
+						' unknown basic string escape character `$next_ch.ascii_str()` in `$escape` ($st.line_nr,$st.col) in ...${c.excerpt(q.pos)}...')
+				}
+			}
+		}
 	}
 }
--- a/vlib/toml/scanner/scanner.v
+++ b/vlib/toml/scanner/scanner.v
@ -25,6 +25,16 @@ mut:
 	mode    Mode // sub-mode of the scanner
 }

+// State is a read-only copy of the scanner's `col`, `line_nr`, `pos` and `mode` fields.
+// Values of this type are returned by `Scanner.state()`.
+pub struct State {
+pub:
+	col     int  // current column number (x coordinate)
+	line_nr int = 1 // current line number (y coordinate), defaults to 1
+	pos     int  // current flat/index position in the scanner's `text` field
+	mode    Mode // sub-mode of the scanner
+}
+
 enum Mode {
 	normal
 	inside_string
@ -426,6 +436,8 @@ fn (mut s Scanner) extract_multiline_string() ?string {
 		}

 		c := s.at()
+		util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'c: `$c.ascii_str()` / $c (quote type: $quote/$quote.ascii_str())')
+
 		if c == `\n` {
 			s.inc_line_number()
 			lit += c.ascii_str()
@ -443,8 +455,6 @@ fn (mut s Scanner) extract_multiline_string() ?string {
 			}
 		}

-		util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'c: `$c.ascii_str()` / $c')
-
 		if c == quote {
 			if s.peek(1) == quote && s.peek(2) == quote {
 				if s.peek(3) == -1 {
@ -469,14 +479,16 @@ fn (mut s Scanner) extract_multiline_string() ?string {
 	return lit
 }

-// handle_escapes
+// handle_escapes returns any escape character sequence.
+// For escape sequence validation see `Checker.check_quoted_escapes`.
 fn (mut s Scanner) handle_escapes(quote byte, is_multiline bool) (string, int) {
 	c := s.at()
 	mut lit := c.ascii_str()
-	if s.peek(1) == byte(92) {
-		lit += lit
-		util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'gulp escaped `$lit`')
-		return lit, 1
+	if s.peek(1) == `u` && byte(s.peek(2)).is_hex_digit() && byte(s.peek(3)).is_hex_digit()
+		&& byte(s.peek(4)).is_hex_digit() && byte(s.peek(5)).is_hex_digit() {
+		lit += s.text[s.pos + 1..s.pos + 6] //.ascii_str()
+		util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'gulp escaped unicode `$lit`')
+		return lit, 4
 	} else if s.peek(1) == quote {
 		if (!is_multiline && s.peek(2) == `\n`)
 			|| (is_multiline && s.peek(2) == quote && s.peek(3) == quote && s.peek(4) == `\n`) {
@ -486,13 +498,9 @@ fn (mut s Scanner) handle_escapes(quote byte, is_multiline bool) (string, int) {
 		lit += quote.ascii_str()
 		util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'gulp escaped `$lit`')
 		return lit, 1
-	} else if s.peek(1) == `u` && byte(s.peek(2)).is_hex_digit() && byte(s.peek(3)).is_hex_digit()
-		&& byte(s.peek(4)).is_hex_digit() && byte(s.peek(5)).is_hex_digit() {
-		lit += s.text[s.pos + 1..s.pos + 6] //.ascii_str()
-		util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'gulp escaped `$lit`')
-		return lit, 4
 	}
-	return '', 0
+	lit += byte(s.peek(1)).ascii_str()
+	return lit, 1
 }

 // extract_number collects and returns a string containing
@ -542,3 +550,13 @@ pub fn (s Scanner) excerpt(pos int, margin int) string {
 	end := if pos + margin < s.text.len { pos + margin } else { s.text.len }
 	return s.text[start..end].replace('\n', r'\n')
 }
+
+// state copies the scanner's current `col`, `line_nr`, `pos` and `mode`
+// into a `State` value and returns that copy.
+pub fn (s Scanner) state() State {
+	snapshot := State{
+		mode: s.mode
+		pos: s.pos
+		line_nr: s.line_nr
+		col: s.col
+	}
+	return snapshot
+}
--- a/vlib/toml/tests/burntsushi.toml-test_test.v
+++ b/vlib/toml/tests/burntsushi.toml-test_test.v
@ -15,19 +15,12 @@ const (
 	invalid_exceptions = [
 		// String
 		'string/basic-multiline-out-of-range-unicode-escape-1.toml',
-		'string/basic-byte-escapes.toml',
-		'string/multiline-escape-space.toml',
 		'string/bad-codepoint.toml',
 		'string/basic-multiline-out-of-range-unicode-escape-2.toml',
-		'string/bad-slash-escape.toml',
 		'string/basic-out-of-range-unicode-escape-1.toml',
 		'string/basic-out-of-range-unicode-escape-2.toml',
 		'string/bad-uni-esc.toml',
-		'string/bad-escape.toml',
-		'string/basic-multiline-unknown-escape.toml',
 		'string/missing-quotes.toml',
-		'string/bad-byte-escape.toml',
-		'string/basic-unknown-escape.toml',
 		// Integer
 		'integer/capital-bin.toml',
 		'integer/invalid-bin.toml',
@ -155,6 +148,10 @@ fn test_burnt_sushi_tomltest() {
 			if relative !in invalid_exceptions {
 				println('OK   [$i/$invalid_test_files.len] "$invalid_test_file"...')
 				if toml_doc := toml.parse_file(invalid_test_file) {
+					content_that_should_have_failed := os.read_file(invalid_test_file) or {
+						panic(err)
+					}
+					println('     This TOML should have failed:\n${'-'.repeat(40)}\n$content_that_should_have_failed\n${'-'.repeat(40)}')
 					assert false
 				} else {
 					println('     $err.msg')