toml: add value decoding (#12521)

2021-11-20 18:48:44 +01:00 · 2021-11-20 18:48:44 +01:00 · f1dd0e3355
parent 4b9e8e243c
commit f1dd0e3355
8 changed files with 206 additions and 64 deletions
--- a/vlib/toml/ast/types.v
+++ b/vlib/toml/ast/types.v
@ -97,8 +97,9 @@ pub fn (n Null) str() string {
 // Quoted is the data representation of a TOML quoted type (`"quoted-key" = "I'm a quoted value"`).
 // Quoted types can appear both as keys and values in TOML documents.
 pub struct Quoted {
+pub mut:
+	text string
 pub:
-	text         string
 	pos          token.Position
 	is_multiline bool
 	quote        byte
--- a/vlib/toml/ast/walker/walker.v
+++ b/vlib/toml/ast/walker/walker.v
@ -2,11 +2,16 @@ module walker

 import toml.ast

-// Visitor defines a visit method which is invoked by the walker in each Value node it encounters.
+// Visitor defines a visit method which is invoked by the walker on each Value node it encounters.
 pub interface Visitor {
 	visit(value &ast.Value) ?
 }

+// Modifier defines a modify method which is invoked by the walker on each Value node it encounters.
+// The value is handed over as `mut`, so an implementation may change it in place.
+// The method is declared with an optional (`?`) result, so it can report an error.
+pub interface Modifier {
+	modify(mut value ast.Value) ?
+}
+
 pub type InspectorFn = fn (value &ast.Value, data voidptr) ?

 struct Inspector {
@ -31,7 +36,32 @@ pub fn walk(visitor Visitor, value &ast.Value) ? {
 		for _, val in value_map {
 			walk(visitor, &val) ?
 		}
+	}
+	if value is []ast.Value {
+		value_array := value as []ast.Value
+		for val in value_array {
+			walk(visitor, &val) ?
+		}
 	} else {
 		visitor.visit(value) ?
 	}
 }
+
+// walk_and_modify traverses the AST using the given modifier and lets the modifier
+// modify the contents.
+// Map values and array elements are walked recursively. Every value that is not
+// an array is finally handed to `modifier.modify`.
+// Any error from a nested walk or from `modify` is propagated to the caller.
+pub fn walk_and_modify(modifier Modifier, mut value ast.Value) ? {
+	if value is map[string]ast.Value {
+		mut value_map := value as map[string]ast.Value
+		for _, mut val in value_map {
+			walk_and_modify(modifier, mut &val) ?
+		}
+	}
+	// NOTE(review): this is a separate `if`, not an `else if`, so a map value also
+	// reaches the `else` branch below and is passed to `modify` after its children,
+	// while an array value itself never is - confirm that this asymmetry is intended.
+	if value is []ast.Value {
+		mut value_array := value as []ast.Value
+		for mut val in value_array {
+			walk_and_modify(modifier, mut &val) ?
+		}
+	} else {
+		modifier.modify(mut value) ?
+	}
+}
--- a/vlib/toml/checker/checker.v
+++ b/vlib/toml/checker/checker.v
@ -400,7 +400,7 @@ fn (c Checker) check_quoted_escapes(q ast.Quoted) ? {
 	is_basic := q.quote == `\"`
 	for {
 		ch := s.next()
-		if ch == -1 {
+		if ch == scanner.end_of_text {
 			break
 		}
 		ch_byte := byte(ch)
--- a/vlib/toml/decoder/decoder.v
+++ b/vlib/toml/decoder/decoder.v
@ -0,0 +1,148 @@
+// Copyright (c) 2021 Lars Pontoppidan. All rights reserved.
+// Use of this source code is governed by an MIT license
+// that can be found in the LICENSE file.
+module decoder
+
+import toml.ast
+import toml.ast.walker
+import toml.token
+import toml.scanner
+import strconv
+
+// Decoder decodes special sequences in a tree of TOML `ast.Value`'s.
+pub struct Decoder {
+	// scanner is used by `excerpt` to build the source excerpts shown in error messages.
+	scanner &scanner.Scanner
+}
+
+// decode decodes certain `ast.Value`'s and all its children.
+// The tree rooted at `n` is traversed with `walker.walk_and_modify`, using the
+// decoder itself as the modifier. Any error from the walk is propagated.
+pub fn (d Decoder) decode(mut n ast.Value) ? {
+	walker.walk_and_modify(d, mut n) ?
+}
+
+// modify is the method the walker calls for the values it hands to the decoder.
+// `ast.Quoted` values are passed on to `decode_quoted`; every other kind of
+// value is left untouched.
+fn (d Decoder) modify(mut value ast.Value) ? {
+	match value {
+		ast.Quoted {
+			// NOTE(review): presumably this references the Quoted stored in `value`
+			// rather than a copy, so that the decoded text ends up in the tree - confirm.
+			mut v := &(value as ast.Quoted)
+			d.decode_quoted(mut v) ?
+		}
+		else {}
+	}
+}
+
+// excerpt returns a string of the token's surroundings.
+// It delegates to the scanner's `excerpt`, passing `tp.pos` and the fixed value 10
+// (presumably the number of characters of context - confirm against `scanner.excerpt`).
+fn (d Decoder) excerpt(tp token.Position) string {
+	return d.scanner.excerpt(tp.pos, 10)
+}
+
+// decode_quoted decodes the quoted TOML string `q` in place by handing it to
+// `decode_quoted_escapes`. Any error from that call is propagated.
+fn (d Decoder) decode_quoted(mut q ast.Quoted) ? {
+	d.decode_quoted_escapes(mut q) ?
+}
+
+// decode_quoted_escapes decodes the escape sequences in `q.text` in place.
+// Delimiters in TOML have significant meaning:
+// '/''' delimits *literal* strings (WYSIWYG / What-you-see-is-what-you-get)
+// "/""" delimits *basic* strings
+// Literal strings are returned untouched, only *basic* strings are decoded.
+// Escapes decoded in *basic* strings are:
+// \b         - backspace       (U+0008)
+// \t         - tab             (U+0009)
+// \n         - linefeed        (U+000A)
+// \f         - form feed       (U+000C)
+// \r         - carriage return (U+000D)
+// \"         - quote           (U+0022)
+// \\         - backslash       (U+005C)
+// \uXXXX     - Unicode         (U+XXXX)
+// \UXXXXXXXX - Unicode         (U+XXXXXXXX)
+// A backslash followed by anything else is copied to the result unchanged.
+// An error is returned if a Unicode escape is truncated or can not be decoded.
+fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? {
+	// See https://toml.io/en/v1.0.0#string for more info on string types.
+	is_basic := q.quote == `\"`
+	if !is_basic {
+		// Literal strings have no escapes; their text must not be modified at all.
+		return
+	}
+
+	// Setup a scanner in stack memory for easier navigation.
+	mut s := scanner.new_simple(q.text) ?
+
+	// TODO use string builder
+	mut decoded_s := ''
+	for {
+		ch := s.next()
+		if ch == scanner.end_of_text {
+			break
+		}
+		ch_byte := byte(ch)
+
+		if ch == `\\` {
+			ch_next := byte(s.at())
+
+			// Escapes that stand for exactly one character.
+			// 0 is used as the "no such escape" marker, none of the escapes decode to it.
+			simple := match ch_next {
+				`b` { byte(0x08) }
+				`t` { byte(0x09) }
+				`n` { byte(0x0A) }
+				`f` { byte(0x0C) }
+				`r` { byte(0x0D) }
+				`"` { byte(0x22) }
+				`\\` { byte(0x5C) }
+				else { byte(0) }
+			}
+			if simple != 0 {
+				decoded_s += simple.ascii_str()
+				// Consume the character following the backslash.
+				s.next()
+				continue
+			}
+
+			// Decode unicode escapes
+			if ch_next == `u` || ch_next == `U` {
+				// The scanner is positioned at the `u`/`U`. The escape spans that
+				// character plus exactly 4 (`u`) or 8 (`U`) hex digits.
+				esc_len := if ch_next == `U` { 9 } else { 5 }
+				st := s.state()
+				pos := st.pos
+				if s.text.len - pos < esc_len {
+					// Slicing past the end of the text would panic; report it instead.
+					return error(@MOD + '.' + @STRUCT + '.' + @FN +
+						' escaped Unicode is invalid. Too few hex digits ($st.line_nr,$st.col) in ...${d.excerpt(q.pos)}...')
+				}
+				decoded := d.decode_unicode_escape(s.text[pos..pos + esc_len]) or {
+					return error(@MOD + '.' + @STRUCT + '.' + @FN +
+						' escaped Unicode is invalid. $err.msg.capitalize() ($st.line_nr,$st.col) in ...${d.excerpt(q.pos)}...')
+				}
+				decoded_s += decoded
+				// Skip only the escape itself so the text following it is kept.
+				s.skip_n(esc_len)
+				continue
+			}
+		}
+		decoded_s += ch_byte.ascii_str()
+	}
+	q.text = decoded_s
+}
+
+// decode_unicode_escape returns the character encoded by the Unicode escape
+// sequence `esc_unicode` as a string. `esc_unicode` is expected to be
+// prefixed with either `u` or `U`, followed by 4 (`u`) or 8 (`U`) hex digits.
+// Anything after those digits is ignored.
+// An error is returned if `esc_unicode` holds too few characters or if the
+// digits can not be parsed as a hexadecimal number.
+fn (d Decoder) decode_unicode_escape(esc_unicode string) ?string {
+	is_long_esc_type := esc_unicode.starts_with('U')
+	hex_digits_len := if is_long_esc_type { 8 } else { 4 }
+
+	// One prefix character plus the hex digits. Slicing a shorter string would
+	// panic, so it is reported as an error instead.
+	if esc_unicode.len < 1 + hex_digits_len {
+		return error('too few hex digits in escape, expected $hex_digits_len')
+	}
+	sequence := esc_unicode[1..1 + hex_digits_len]
+
+	rn := rune(strconv.parse_int(sequence, 16, 0) ?)
+	return '$rn'
+}
--- a/vlib/toml/parser/parser.v
+++ b/vlib/toml/parser/parser.v
@ -5,6 +5,7 @@ module parser

 import toml.ast
 import toml.checker
+import toml.decoder
 import toml.util
 import toml.token
 import toml.scanner
@ -69,10 +70,12 @@ mut:

 // Config is used to configure a Parser instance.
 // `run_checks` is used to en- or disable running of the strict `checker.Checker` type checks.
+// `decode_values` is used to en- or disable decoding of values with the `decoder.Decoder`.
 pub struct Config {
 pub:
-	scanner    &scanner.Scanner
-	run_checks bool = true
+	scanner       &scanner.Scanner
+	run_checks    bool = true
+	decode_values bool = true
 }

 // new_parser returns a new, stack allocated, `Parser`.
@ -104,12 +107,24 @@ fn (mut p Parser) run_checker() ? {
 	}
 }

+// run_decoder decodes values in the parsed `ast.Value` nodes in
+// the generated AST.
+// Nothing is done when `decode_values` is disabled in the parser's config.
+// Any error from the decoder is propagated.
+fn (mut p Parser) run_decoder() ? {
+	if p.config.decode_values {
+		// The decoder gets the parser's scanner so it can produce excerpts for its errors.
+		dcoder := decoder.Decoder{
+			scanner: p.scanner
+		}
+		dcoder.decode(mut p.root_map) ?
+	}
+}
+
 // parse starts parsing the input and returns the root
 // of the generated AST.
 pub fn (mut p Parser) parse() ?&ast.Root {
 	p.init() ?
 	p.root_table() ?
 	p.run_checker() ?
+	p.run_decoder() ?
 	p.ast_root.table = p.root_map
 	return p.ast_root
 }
--- a/vlib/toml/scanner/scanner.v
+++ b/vlib/toml/scanner/scanner.v
@ -9,9 +9,10 @@ import toml.input
 import toml.token
 import toml.util

-pub const digit_extras = [`_`, `.`, `x`, `o`, `b`, `e`, `E`]
-
-const end_of_text = -1
+pub const (
+	// digit_extras lists non-digit characters; presumably the ones the scanner
+	// accepts inside number literals - confirm against its use in the scanner.
+	digit_extras = [`_`, `.`, `x`, `o`, `b`, `e`, `E`]
+	// end_of_text is the value callers compare the result of `Scanner.next()`
+	// against to detect that there is no more text to read.
+	end_of_text  = -1
+)

 // Scanner contains the necessary fields for the state of the scan process.
 // the task the scanner does is also refered to as "lexing" or "tokenizing".
--- a/vlib/toml/tests/burntsushi.toml-test_test.v
+++ b/vlib/toml/tests/burntsushi.toml-test_test.v
@ -19,7 +19,6 @@ const (
 	valid_value_exceptions = [
 		// String
 		'string/escapes.toml',
-		'string/escape-tricky.toml',
 		'string/multiline.toml',
 		// Integer
 		'integer/long.toml',
@ -199,13 +198,7 @@ fn test_burnt_sushi_tomltest() {
 fn to_burntsushi(value ast.Value) string {
 	match value {
 		ast.Quoted {
-			mut json_text := ''
-			if value.quote == `"` {
-				json_text = toml_to_json_escapes(value) or { '<error>' }
-			} else {
-				json_text = json2.Any(value.text).json_str()
-			}
-
+			json_text := json2.Any(value.text).json_str()
 			return '{ "type": "string", "value": "$json_text" }'
 		}
 		ast.DateTime {
@ -271,49 +264,3 @@ fn to_burntsushi(value ast.Value) string {
 	}
 	return '<error>'
 }
-
-// toml_to_json_escapes is a utility function for normalizing
-// TOML basic string to JSON string
-fn toml_to_json_escapes(q ast.Quoted) ?string {
-	mut s := scanner.new_simple(q.text) ?
-	mut r := ''
-	for {
-		ch := s.next()
-		if ch == scanner.end_of_text {
-			break
-		}
-		ch_byte := byte(ch)
-
-		if ch == `"` {
-			if byte(s.peek(-1)) != `\\` {
-				r += '\\'
-			}
-		}
-
-		if ch == `\\` {
-			next_ch := byte(s.at())
-
-			escape := ch_byte.ascii_str() + next_ch.ascii_str()
-			if escape.to_lower() == '\\u' {
-				mut b := s.next()
-				mut unicode_point := ''
-				for {
-					b = s.next()
-					if b != ` ` && b != scanner.end_of_text {
-						unicode_point += byte(b).ascii_str()
-					} else {
-						break
-					}
-				}
-				if unicode_point.len < 8 {
-					unicode_point = '0'.repeat(8 - unicode_point.len) + unicode_point
-				}
-				rn := rune(strconv.parse_int(unicode_point, 16, 0) ?)
-				r += '$rn'
-				continue
-			}
-		}
-		r += ch_byte.ascii_str()
-	}
-	return r
-}
--- a/vlib/toml/tests/strings_test.v
+++ b/vlib/toml/tests/strings_test.v
@ -72,9 +72,9 @@ fn test_unicode_escapes() {
 	mut toml_doc := toml.parse(toml_unicode_escapes) or { panic(err) }

 	mut value := toml_doc.value('short')
-	assert value.string() == r'\u03B4'
+	assert value.string() == '\u03B4' // <- This escape is handled by V
 	value = toml_doc.value('long')
-	assert value.string() == r'\U000003B4'
+	assert value.string() == 'δ' // <- for the long escape we compare with the unicode point
 }

 fn test_literal_strings() {