toml: add value decoding (#12521)

2021-11-20 18:48:44 +01:00 · 2021-11-20 18:48:44 +01:00 · f1dd0e3355
parent 4b9e8e243c
commit f1dd0e3355
8 changed files with 206 additions and 64 deletions
--- a/vlib/toml/ast/types.v
+++ b/vlib/toml/ast/types.v
@ -97,8 +97,9 @@ pub fn (n Null) str() string {
 // Quoted is the data representation of a TOML quoted type (`"quoted-key" = "I'm a quoted value"`).
 // Quoted types can appear both as keys and values in TOML documents.
 pub struct Quoted {
-pub:
+pub mut:
 	text string
 pub:
 	pos          token.Position
 	is_multiline bool
 	quote        byte
--- a/vlib/toml/ast/walker/walker.v
+++ b/vlib/toml/ast/walker/walker.v
@ -2,11 +2,16 @@ module walker
 import toml.ast
-// Visitor defines a visit method which is invoked by the walker in each Value node it encounters.
+// Visitor defines a visit method which is invoked by the walker on each Value node it encounters.
 pub interface Visitor {
 	// visit receives a value reached by `walk`; an error returned here is propagated by `walk`.
 	visit(value &ast.Value) ?
 }
 // Modifier defines a modify method which is invoked by the walker on each Value node it encounters.
 // The value is passed as `mut`, so an implementation may change it in place.
 pub interface Modifier {
 	// modify receives a value reached by `walk_and_modify`; an error returned here is propagated by the walker.
 	modify(mut value ast.Value) ?
 }
 pub type InspectorFn = fn (value &ast.Value, data voidptr) ?
 struct Inspector {
@ -31,7 +36,32 @@ pub fn walk(visitor Visitor, value &ast.Value) ? {
 		for _, val in value_map {
 			walk(visitor, &val) ?
 		}
 	}
 	if value is []ast.Value {
 		value_array := value as []ast.Value
 		for val in value_array {
 			walk(visitor, &val) ?
 		}
 	} else {
 		visitor.visit(value) ?
 	}
 }
 // walk_and_modify traverses the AST using the given modifier and lets the modifier
 // modify the contents.
 // Map and array values are descended into recursively. Every value that is not an
 // array - including a map, after its entries have been walked - is handed to
 // `modifier.modify`. Any error from the modifier is propagated to the caller.
 pub fn walk_and_modify(modifier Modifier, mut value ast.Value) ? {
 	if value is map[string]ast.Value {
 		mut value_map := value as map[string]ast.Value
 		for _, mut val in value_map {
 			walk_and_modify(modifier, mut &val) ?
 		}
 	}
 	// NOTE: this is a separate `if`, not an `else if`, so a map value also reaches the `else` branch below.
 	if value is []ast.Value {
 		mut value_array := value as []ast.Value
 		for mut val in value_array {
 			walk_and_modify(modifier, mut &val) ?
 		}
 	} else {
 		modifier.modify(mut value) ?
 	}
 }
--- a/vlib/toml/checker/checker.v
+++ b/vlib/toml/checker/checker.v
@ -400,7 +400,7 @@ fn (c Checker) check_quoted_escapes(q ast.Quoted) ? {
 	is_basic := q.quote == `\"`
 	for {
 		ch := s.next()
-		if ch == -1 {
+		if ch == scanner.end_of_text {
 			break
 		}
 		ch_byte := byte(ch)
--- a/vlib/toml/decoder/decoder.v
+++ b/vlib/toml/decoder/decoder.v
@ -0,0 +1,148 @@
 // Copyright (c) 2021 Lars Pontoppidan. All rights reserved.
 // Use of this source code is governed by an MIT license
 // that can be found in the LICENSE file.
 module decoder
 import toml.ast
 import toml.ast.walker
 import toml.token
 import toml.scanner
 import strconv
 // Decoder decodes special sequences in a tree of TOML `ast.Value`'s.
 pub struct Decoder {
 	scanner &scanner.Scanner // used by `excerpt` to quote the surrounding source text in error messages
 }
 // decode decodes certain `ast.Value`'s and all its children.
 // It walks `n` with `walker.walk_and_modify`, passing the decoder itself as the
 // modifier, and propagates any error the walk returns.
 pub fn (d Decoder) decode(mut n ast.Value) ? {
 	walker.walk_and_modify(d, mut n) ?
 }
 // modify is the callback the walker invokes when the decoder is used as a modifier.
 // Only `ast.Quoted` values are decoded; every other value type is left untouched.
 fn (d Decoder) modify(mut value ast.Value) ? {
 	match value {
 		ast.Quoted {
 			// Take a reference so that `decode_quoted` can rewrite the text in place.
 			mut v := &(value as ast.Quoted)
 			d.decode_quoted(mut v) ?
 		}
 		else {}
 	}
 }
 // excerpt returns a string of the token's surroundings.
 // It delegates to the decoder's scanner, passing `tp.pos` and 10
 // (presumably the amount of surrounding text to include - confirm against `Scanner.excerpt`).
 fn (d Decoder) excerpt(tp token.Position) string {
 	return d.scanner.excerpt(tp.pos, 10)
 }
 // decode_quoted decodes the contents of `q` in place.
 // It currently only decodes escape sequences, and returns an error
 // if `decode_quoted_escapes` fails.
 fn (d Decoder) decode_quoted(mut q ast.Quoted) ? {
 	d.decode_quoted_escapes(mut q) ?
 }
 // decode_quoted_escapes decodes the escape sequences of `q.text` in place.
 // It returns an error for any invalid or truncated Unicode escape sequence.
 // Delimiters in TOML have significant meaning:
 // '/''' delimits *literal* strings (WYSIWYG / What-you-see-is-what-you-get)
 // "/""" delimits *basic* strings
 // Literal strings have no escapes at all, so their text is left untouched.
 // Allowed escapes in *basic* strings are:
 // \b         - backspace       (U+0008)
 // \t         - tab             (U+0009)
 // \n         - linefeed        (U+000A)
 // \f         - form feed       (U+000C)
 // \r         - carriage return (U+000D)
 // \"         - quote           (U+0022)
 // \\         - backslash       (U+005C)
 // \uXXXX     - Unicode         (U+XXXX)
 // \UXXXXXXXX - Unicode         (U+XXXXXXXX)
 // Any other backslash sequence is copied to the output unchanged.
 fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? {
 	// See https://toml.io/en/v1.0.0#string for more info on string types.
 	is_basic := q.quote == `\"`
 	if !is_basic {
 		// Literal strings must not be modified in any way - return before touching `q.text`.
 		return
 	}
 	// Setup a scanner in stack memory for easier navigation.
 	mut s := scanner.new_simple(q.text) ?
 	// TODO use string builder
 	mut decoded_s := ''
 	for {
 		ch := s.next()
 		if ch == scanner.end_of_text {
 			break
 		}
 		ch_byte := byte(ch)
 		if ch == `\\` {
 			// `s.next()` has consumed the backslash, `s.at()` is the character that follows it.
 			ch_next := byte(s.at())
 			// Single character escapes.
 			mut simple_escape := ''
 			if ch_next == `\\` {
 				simple_escape = ch_next.ascii_str()
 			} else if ch_next == `"` {
 				simple_escape = '"'
 			} else if ch_next == `n` {
 				simple_escape = '\n'
 			} else if ch_next == `t` {
 				simple_escape = '\t'
 			} else if ch_next == `r` {
 				simple_escape = '\r'
 			} else if ch_next == `b` {
 				simple_escape = byte(0x08).ascii_str()
 			} else if ch_next == `f` {
 				simple_escape = byte(0x0C).ascii_str()
 			}
 			if simple_escape != '' {
 				decoded_s += simple_escape
 				s.next()
 				continue
 			}
 			// Decode unicode escapes
 			if ch_next == `u` || ch_next == `U` {
 				// Short type Unicode (\uXXXX) has 4 hex digits, long type (\UXXXXXXXX) has 8.
 				hex_digits_len := if ch_next == `U` { 8 } else { 4 }
 				// Length of the sequence counted from, and including, the `u`/`U`.
 				sequence_len := 1 + hex_digits_len
 				st := s.state()
 				pos := st.pos
 				if s.text.len - pos < sequence_len {
 					// Not enough characters left - report it instead of slicing out of range.
 					return error(@MOD + '.' + @STRUCT + '.' + @FN +
 						' escaped Unicode is invalid. Expected $hex_digits_len hexadecimal digits ($st.line_nr,$st.col) in ...${d.excerpt(q.pos)}...')
 				}
 				decoded := d.decode_unicode_escape(s.text[pos..pos + sequence_len]) or {
 					return error(@MOD + '.' + @STRUCT + '.' + @FN +
 						' escaped Unicode is invalid. $err.msg.capitalize() ($st.line_nr,$st.col) in ...${d.excerpt(q.pos)}...')
 				}
 				decoded_s += decoded
 				// Skip exactly the escape sequence, so the text following it is preserved.
 				s.skip_n(sequence_len)
 				continue
 			}
 		}
 		decoded_s += ch_byte.ascii_str()
 	}
 	q.text = decoded_s
 }
 // decode_unicode_escape returns the character encoded by `esc_unicode` as a string.
 // `esc_unicode` is expected to be prefixed with either `u` or `U`, followed by
 // 4 (`u`) or 8 (`U`) hexadecimal digits. Anything after those digits is ignored.
 // An error is returned if `esc_unicode` is too short or if the digits can not be parsed.
 fn (d Decoder) decode_unicode_escape(esc_unicode string) ?string {
 	is_long_esc_type := esc_unicode.starts_with('U')
 	hex_digits_len := if is_long_esc_type { 8 } else { 4 }
 	// Guard the slicing below: an empty or truncated sequence would otherwise
 	// be an out-of-range slice instead of a regular error.
 	if esc_unicode.len < 1 + hex_digits_len {
 		return error('unicode escape sequence is too short, expected $hex_digits_len hexadecimal digits')
 	}
 	// Drop the `u`/`U` prefix and keep only the hex digits.
 	sequence := esc_unicode[1..1 + hex_digits_len]
 	mut unicode_point := sequence
 	if unicode_point.len < 8 {
 		// Left-pad short sequences with zeros to the width of the long form.
 		unicode_point = '0'.repeat(8 - unicode_point.len) + unicode_point
 	}
 	rn := rune(strconv.parse_int(unicode_point, 16, 0) ?)
 	return '$rn'
 }
--- a/vlib/toml/parser/parser.v
+++ b/vlib/toml/parser/parser.v
@ -5,6 +5,7 @@ module parser
 import toml.ast
 import toml.checker
 import toml.decoder
 import toml.util
 import toml.token
 import toml.scanner
@ -69,10 +70,12 @@ mut:
 // Config is used to configure a Parser instance.
 // `run_checks` is used to en- or disable running of the strict `checker.Checker` type checks.
 // `decode_values` is used to en- or disable decoding of values with the `decoder.Decoder`.
 pub struct Config {
 pub:
 	scanner       &scanner.Scanner
 	run_checks    bool = true // checks are enabled by default
 	decode_values bool = true // value decoding is enabled by default
 }
 // new_parser returns a new, stack allocated, `Parser`.
@ -104,12 +107,24 @@ fn (mut p Parser) run_checker() ? {
 	}
 }
 // run_decoder decodes the values of the parsed `ast.Value` nodes in
 // the generated AST. It does nothing when `decode_values` is disabled in the config.
 fn (mut p Parser) run_decoder() ? {
 	if !p.config.decode_values {
 		return
 	}
 	value_decoder := decoder.Decoder{
 		scanner: p.scanner
 	}
 	value_decoder.decode(mut p.root_map) ?
 }
 // parse starts parsing the input and returns the root
 // of the generated AST.
 // The steps run in this order: init, root table parsing, checker, decoder.
 // An error from any of the steps is propagated to the caller.
 pub fn (mut p Parser) parse() ?&ast.Root {
 	p.init() ?
 	p.root_table() ?
 	p.run_checker() ?
 	p.run_decoder() ?
 	// The root map is attached to the AST root only after the checker and decoder have run.
 	p.ast_root.table = p.root_map
 	return p.ast_root
 }
--- a/vlib/toml/scanner/scanner.v
+++ b/vlib/toml/scanner/scanner.v
@ -9,9 +9,10 @@ import toml.input
 import toml.token
 import toml.util
-pub const digit_extras = [`_`, `.`, `x`, `o`, `b`, `e`, `E`]
+pub const (
-
+	digit_extras = [`_`, `.`, `x`, `o`, `b`, `e`, `E`]
-const end_of_text = -1
+	end_of_text  = -1
 )
 // Scanner contains the necessary fields for the state of the scan process.
 // The task the scanner does is also referred to as "lexing" or "tokenizing".
--- a/vlib/toml/tests/burntsushi.toml-test_test.v
+++ b/vlib/toml/tests/burntsushi.toml-test_test.v
@ -19,7 +19,6 @@ const (
 	valid_value_exceptions = [
 		// String
 		'string/escapes.toml',
 		'string/escape-tricky.toml',
 		'string/multiline.toml',
 		// Integer
 		'integer/long.toml',
@ -199,13 +198,7 @@ fn test_burnt_sushi_tomltest() {
 fn to_burntsushi(value ast.Value) string {
 	match value {
 		ast.Quoted {
-			mut json_text := ''
+			json_text := json2.Any(value.text).json_str()
 			if value.quote == `"` {
 				json_text = toml_to_json_escapes(value) or { '<error>' }
 			} else {
 				json_text = json2.Any(value.text).json_str()
 			}
 			return '{ "type": "string", "value": "$json_text" }'
 		}
 		ast.DateTime {
@ -271,49 +264,3 @@ fn to_burntsushi(value ast.Value) string {
 	}
 	return '<error>'
 }
 // toml_to_json_escapes is a utility function for normalizing
 // a TOML basic string to a JSON string.
 // It scans `q.text` byte by byte, may put a backslash in front of a `"`,
 // and replaces `\u`/`\U` escapes with the rune they encode.
 // All other bytes are copied to the result unchanged.
 fn toml_to_json_escapes(q ast.Quoted) ?string {
 	mut s := scanner.new_simple(q.text) ?
 	mut r := ''
 	for {
 		ch := s.next()
 		if ch == scanner.end_of_text {
 			break
 		}
 		ch_byte := byte(ch)
 		if ch == `"` {
 			// NOTE(review): presumably checks whether the quote is already escaped -
 			// confirm which byte `s.peek(-1)` refers to after `s.next()`.
 			if byte(s.peek(-1)) != `\\` {
 				r += '\\'
 			}
 		}
 		if ch == `\\` {
 			next_ch := byte(s.at())
 			escape := ch_byte.ascii_str() + next_ch.ascii_str()
 			if escape.to_lower() == '\\u' {
 				// Consume the `u`/`U` itself.
 				mut b := s.next()
 				mut unicode_point := ''
 				// Collect everything up to the next space, or the end of the text,
 				// as the digits of the code point.
 				for {
 					b = s.next()
 					if b != ` ` && b != scanner.end_of_text {
 						unicode_point += byte(b).ascii_str()
 					} else {
 						break
 					}
 				}
 				// Left-pad with zeros to 8 digits before parsing as hexadecimal.
 				if unicode_point.len < 8 {
 					unicode_point = '0'.repeat(8 - unicode_point.len) + unicode_point
 				}
 				rn := rune(strconv.parse_int(unicode_point, 16, 0) ?)
 				r += '$rn'
 				continue
 			}
 		}
 		r += ch_byte.ascii_str()
 	}
 	return r
 }
--- a/vlib/toml/tests/strings_test.v
+++ b/vlib/toml/tests/strings_test.v
@ -72,9 +72,9 @@ fn test_unicode_escapes() {
 	mut toml_doc := toml.parse(toml_unicode_escapes) or { panic(err) }
 	mut value := toml_doc.value('short')
-	assert value.string() == r'\u03B4'
+	assert value.string() == '\u03B4' // <- This escape is handled by V
 	value = toml_doc.value('long')
-	assert value.string() == r'\U000003B4'
+	assert value.string() == 'δ' // <- for the long escape we compare with the unicode point
 }
 fn test_literal_strings() {