toml: add value decoding (#12521)
parent
4b9e8e243c
commit
f1dd0e3355
|
@ -97,8 +97,9 @@ pub fn (n Null) str() string {
|
||||||
// Quoted is the data representation of a TOML quoted type (`"quoted-key" = "I'm a quoted value"`).
|
// Quoted is the data representation of a TOML quoted type (`"quoted-key" = "I'm a quoted value"`).
|
||||||
// Quoted types can appear both as keys and values in TOML documents.
|
// Quoted types can appear both as keys and values in TOML documents.
|
||||||
pub struct Quoted {
|
pub struct Quoted {
|
||||||
pub:
|
pub mut:
|
||||||
text string
|
text string
|
||||||
|
pub:
|
||||||
pos token.Position
|
pos token.Position
|
||||||
is_multiline bool
|
is_multiline bool
|
||||||
quote byte
|
quote byte
|
||||||
|
|
|
@ -2,11 +2,16 @@ module walker
|
||||||
|
|
||||||
import toml.ast
|
import toml.ast
|
||||||
|
|
||||||
// Visitor defines a visit method which is invoked by the walker in each Value node it encounters.
|
// Visitor defines a visit method which is invoked by the walker on each Value node it encounters.
|
||||||
pub interface Visitor {
|
pub interface Visitor {
|
||||||
visit(value &ast.Value) ?
|
visit(value &ast.Value) ?
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Modifier defines a modify method which is invoked by the walker on each Value node it encounters.
|
||||||
|
pub interface Modifier {
|
||||||
|
modify(mut value ast.Value) ?
|
||||||
|
}
|
||||||
|
|
||||||
pub type InspectorFn = fn (value &ast.Value, data voidptr) ?
|
pub type InspectorFn = fn (value &ast.Value, data voidptr) ?
|
||||||
|
|
||||||
struct Inspector {
|
struct Inspector {
|
||||||
|
@ -31,7 +36,32 @@ pub fn walk(visitor Visitor, value &ast.Value) ? {
|
||||||
for _, val in value_map {
|
for _, val in value_map {
|
||||||
walk(visitor, &val) ?
|
walk(visitor, &val) ?
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
if value is []ast.Value {
|
||||||
|
value_array := value as []ast.Value
|
||||||
|
for val in value_array {
|
||||||
|
walk(visitor, &val) ?
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
visitor.visit(value) ?
|
visitor.visit(value) ?
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// walk_and_modify traverses the AST using the given modifier and lets the modifier
|
||||||
|
// modify the contents.
|
||||||
|
pub fn walk_and_modify(modifier Modifier, mut value ast.Value) ? {
|
||||||
|
if value is map[string]ast.Value {
|
||||||
|
mut value_map := value as map[string]ast.Value
|
||||||
|
for _, mut val in value_map {
|
||||||
|
walk_and_modify(modifier, mut &val) ?
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if value is []ast.Value {
|
||||||
|
mut value_array := value as []ast.Value
|
||||||
|
for mut val in value_array {
|
||||||
|
walk_and_modify(modifier, mut &val) ?
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
modifier.modify(mut value) ?
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -400,7 +400,7 @@ fn (c Checker) check_quoted_escapes(q ast.Quoted) ? {
|
||||||
is_basic := q.quote == `\"`
|
is_basic := q.quote == `\"`
|
||||||
for {
|
for {
|
||||||
ch := s.next()
|
ch := s.next()
|
||||||
if ch == -1 {
|
if ch == scanner.end_of_text {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
ch_byte := byte(ch)
|
ch_byte := byte(ch)
|
||||||
|
|
|
@ -0,0 +1,148 @@
|
||||||
|
// Copyright (c) 2021 Lars Pontoppidan. All rights reserved.
|
||||||
|
// Use of this source code is governed by an MIT license
|
||||||
|
// that can be found in the LICENSE file.
|
||||||
|
module decoder
|
||||||
|
|
||||||
|
import toml.ast
|
||||||
|
import toml.ast.walker
|
||||||
|
import toml.token
|
||||||
|
import toml.scanner
|
||||||
|
import strconv
|
||||||
|
|
||||||
|
// Decoder decodes special sequences in a tree of TOML `ast.Value`s.
|
||||||
|
pub struct Decoder {
|
||||||
|
scanner &scanner.Scanner
|
||||||
|
}
|
||||||
|
|
||||||
|
// decode decodes certain `ast.Value`s and all of their children.
|
||||||
|
pub fn (d Decoder) decode(mut n ast.Value) ? {
|
||||||
|
walker.walk_and_modify(d, mut n) ?
|
||||||
|
}
|
||||||
|
|
||||||
|
fn (d Decoder) modify(mut value ast.Value) ? {
|
||||||
|
match value {
|
||||||
|
ast.Quoted {
|
||||||
|
mut v := &(value as ast.Quoted)
|
||||||
|
d.decode_quoted(mut v) ?
|
||||||
|
}
|
||||||
|
else {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// excerpt returns a string of the token's surroundings
|
||||||
|
fn (d Decoder) excerpt(tp token.Position) string {
|
||||||
|
return d.scanner.excerpt(tp.pos, 10)
|
||||||
|
}
|
||||||
|
|
||||||
|
// decode_quoted decodes the escape sequences of `q` in place and returns an error if decoding fails.
|
||||||
|
fn (d Decoder) decode_quoted(mut q ast.Quoted) ? {
|
||||||
|
d.decode_quoted_escapes(mut q) ?
|
||||||
|
}
|
||||||
|
|
||||||
|
// decode_quoted_escapes decodes the escape sequences of a *basic* string `q` in place and returns an error if a Unicode escape is invalid.
|
||||||
|
// Delimiters in TOML have significant meaning:
|
||||||
|
// '/''' delimits *literal* strings (WYSIWYG / What-you-see-is-what-you-get)
|
||||||
|
// "/""" delimits *basic* strings
|
||||||
|
// Allowed escapes in *basic* strings are:
|
||||||
|
// \b - backspace (U+0008)
|
||||||
|
// \t - tab (U+0009)
|
||||||
|
// \n - linefeed (U+000A)
|
||||||
|
// \f - form feed (U+000C)
|
||||||
|
// \r - carriage return (U+000D)
|
||||||
|
// \" - quote (U+0022)
|
||||||
|
// \\ - backslash (U+005C)
|
||||||
|
// \uXXXX - Unicode (U+XXXX)
|
||||||
|
// \UXXXXXXXX - Unicode (U+XXXXXXXX)
|
||||||
|
fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? {
|
||||||
|
// Set up a scanner in stack memory for easier navigation.
|
||||||
|
mut s := scanner.new_simple(q.text) ?
|
||||||
|
|
||||||
|
q.text = q.text.replace('\\"', '"')
|
||||||
|
|
||||||
|
// TODO use string builder
|
||||||
|
mut decoded_s := ''
|
||||||
|
// See https://toml.io/en/v1.0.0#string for more info on string types.
|
||||||
|
is_basic := q.quote == `\"`
|
||||||
|
if !is_basic {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for {
|
||||||
|
ch := s.next()
|
||||||
|
if ch == scanner.end_of_text {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
ch_byte := byte(ch)
|
||||||
|
|
||||||
|
if ch == `\\` {
|
||||||
|
ch_next := byte(s.at())
|
||||||
|
|
||||||
|
if ch_next == `\\` {
|
||||||
|
decoded_s += ch_next.ascii_str()
|
||||||
|
s.next()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if ch_next == `"` {
|
||||||
|
decoded_s += '"'
|
||||||
|
s.next()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if ch_next == `n` {
|
||||||
|
decoded_s += '\n'
|
||||||
|
s.next()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
escape := ch_byte.ascii_str() + ch_next.ascii_str()
|
||||||
|
// Decode unicode escapes
|
||||||
|
if escape.to_lower() == '\\u' {
|
||||||
|
// Long type Unicode (\UXXXXXXXX) is a maximum of 10 chars: '\' + 'U' + 8 hex characters
|
||||||
|
// we pass in 10 characters from the `u`/`U` which is the longest possible sequence
|
||||||
|
// of 9 chars plus one extra.
|
||||||
|
mut decoded := ''
|
||||||
|
if s.remaining() >= 10 {
|
||||||
|
pos := s.state().pos
|
||||||
|
decoded = d.decode_unicode_escape(s.text[pos..pos + 11]) or {
|
||||||
|
st := s.state()
|
||||||
|
return error(@MOD + '.' + @STRUCT + '.' + @FN +
|
||||||
|
' escaped Unicode is invalid. $err.msg.capitalize() ($st.line_nr,$st.col) in ...${d.excerpt(q.pos)}...')
|
||||||
|
}
|
||||||
|
decoded_s += decoded
|
||||||
|
s.skip_n(s.text[pos..pos + 11].len)
|
||||||
|
continue
|
||||||
|
} else {
|
||||||
|
pos := s.state().pos
|
||||||
|
decoded = d.decode_unicode_escape(s.text[pos..]) or {
|
||||||
|
st := s.state()
|
||||||
|
return error(@MOD + '.' + @STRUCT + '.' + @FN +
|
||||||
|
' escaped Unicode is invalid. $err.msg.capitalize() ($st.line_nr,$st.col) in ...${d.excerpt(q.pos)}...')
|
||||||
|
}
|
||||||
|
decoded_s += decoded
|
||||||
|
s.skip_n(s.text[pos..].len)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
decoded_s += ch_byte.ascii_str()
|
||||||
|
}
|
||||||
|
q.text = decoded_s
|
||||||
|
}
|
||||||
|
|
||||||
|
// decode_unicode_escape decodes `esc_unicode` to a string, or returns an error if it is not
|
||||||
|
// a valid Unicode escape sequence. `esc_unicode` is expected to be
|
||||||
|
// prefixed with either `u` or `U`.
|
||||||
|
fn (d Decoder) decode_unicode_escape(esc_unicode string) ?string {
|
||||||
|
is_long_esc_type := esc_unicode.starts_with('U')
|
||||||
|
mut sequence := esc_unicode[1..]
|
||||||
|
hex_digits_len := if is_long_esc_type { 8 } else { 4 }
|
||||||
|
|
||||||
|
sequence = sequence[..hex_digits_len]
|
||||||
|
|
||||||
|
mut unicode_point := sequence
|
||||||
|
if unicode_point.len < 8 {
|
||||||
|
unicode_point = '0'.repeat(8 - unicode_point.len) + unicode_point
|
||||||
|
}
|
||||||
|
rn := rune(strconv.parse_int(unicode_point, 16, 0) ?)
|
||||||
|
return '$rn'
|
||||||
|
}
|
|
@ -5,6 +5,7 @@ module parser
|
||||||
|
|
||||||
import toml.ast
|
import toml.ast
|
||||||
import toml.checker
|
import toml.checker
|
||||||
|
import toml.decoder
|
||||||
import toml.util
|
import toml.util
|
||||||
import toml.token
|
import toml.token
|
||||||
import toml.scanner
|
import toml.scanner
|
||||||
|
@ -69,10 +70,12 @@ mut:
|
||||||
|
|
||||||
// Config is used to configure a Parser instance.
|
// Config is used to configure a Parser instance.
|
||||||
// `run_checks` is used to en- or disable running of the strict `checker.Checker` type checks.
|
// `run_checks` is used to en- or disable running of the strict `checker.Checker` type checks.
|
||||||
|
// `decode_values` is used to en- or disable decoding of values with the `decoder.Decoder`.
|
||||||
pub struct Config {
|
pub struct Config {
|
||||||
pub:
|
pub:
|
||||||
scanner &scanner.Scanner
|
scanner &scanner.Scanner
|
||||||
run_checks bool = true
|
run_checks bool = true
|
||||||
|
decode_values bool = true
|
||||||
}
|
}
|
||||||
|
|
||||||
// new_parser returns a new, stack allocated, `Parser`.
|
// new_parser returns a new, stack allocated, `Parser`.
|
||||||
|
@ -104,12 +107,24 @@ fn (mut p Parser) run_checker() ? {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// run_decoder decodes values in the parsed `ast.Value` nodes in the
|
||||||
|
// generated AST.
|
||||||
|
fn (mut p Parser) run_decoder() ? {
|
||||||
|
if p.config.decode_values {
|
||||||
|
dcoder := decoder.Decoder{
|
||||||
|
scanner: p.scanner
|
||||||
|
}
|
||||||
|
dcoder.decode(mut p.root_map) ?
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// parse starts parsing the input and returns the root
|
// parse starts parsing the input and returns the root
|
||||||
// of the generated AST.
|
// of the generated AST.
|
||||||
pub fn (mut p Parser) parse() ?&ast.Root {
|
pub fn (mut p Parser) parse() ?&ast.Root {
|
||||||
p.init() ?
|
p.init() ?
|
||||||
p.root_table() ?
|
p.root_table() ?
|
||||||
p.run_checker() ?
|
p.run_checker() ?
|
||||||
|
p.run_decoder() ?
|
||||||
p.ast_root.table = p.root_map
|
p.ast_root.table = p.root_map
|
||||||
return p.ast_root
|
return p.ast_root
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,9 +9,10 @@ import toml.input
|
||||||
import toml.token
|
import toml.token
|
||||||
import toml.util
|
import toml.util
|
||||||
|
|
||||||
pub const digit_extras = [`_`, `.`, `x`, `o`, `b`, `e`, `E`]
|
pub const (
|
||||||
|
digit_extras = [`_`, `.`, `x`, `o`, `b`, `e`, `E`]
|
||||||
const end_of_text = -1
|
end_of_text = -1
|
||||||
|
)
|
||||||
|
|
||||||
// Scanner contains the necessary fields for the state of the scan process.
|
// Scanner contains the necessary fields for the state of the scan process.
|
||||||
// the task the scanner does is also referred to as "lexing" or "tokenizing".
|
// the task the scanner does is also referred to as "lexing" or "tokenizing".
|
||||||
|
|
|
@ -19,7 +19,6 @@ const (
|
||||||
valid_value_exceptions = [
|
valid_value_exceptions = [
|
||||||
// String
|
// String
|
||||||
'string/escapes.toml',
|
'string/escapes.toml',
|
||||||
'string/escape-tricky.toml',
|
|
||||||
'string/multiline.toml',
|
'string/multiline.toml',
|
||||||
// Integer
|
// Integer
|
||||||
'integer/long.toml',
|
'integer/long.toml',
|
||||||
|
@ -199,13 +198,7 @@ fn test_burnt_sushi_tomltest() {
|
||||||
fn to_burntsushi(value ast.Value) string {
|
fn to_burntsushi(value ast.Value) string {
|
||||||
match value {
|
match value {
|
||||||
ast.Quoted {
|
ast.Quoted {
|
||||||
mut json_text := ''
|
json_text := json2.Any(value.text).json_str()
|
||||||
if value.quote == `"` {
|
|
||||||
json_text = toml_to_json_escapes(value) or { '<error>' }
|
|
||||||
} else {
|
|
||||||
json_text = json2.Any(value.text).json_str()
|
|
||||||
}
|
|
||||||
|
|
||||||
return '{ "type": "string", "value": "$json_text" }'
|
return '{ "type": "string", "value": "$json_text" }'
|
||||||
}
|
}
|
||||||
ast.DateTime {
|
ast.DateTime {
|
||||||
|
@ -271,49 +264,3 @@ fn to_burntsushi(value ast.Value) string {
|
||||||
}
|
}
|
||||||
return '<error>'
|
return '<error>'
|
||||||
}
|
}
|
||||||
|
|
||||||
// toml_to_json_escapes is a utility function for normalizing
|
|
||||||
// TOML basic string to JSON string
|
|
||||||
fn toml_to_json_escapes(q ast.Quoted) ?string {
|
|
||||||
mut s := scanner.new_simple(q.text) ?
|
|
||||||
mut r := ''
|
|
||||||
for {
|
|
||||||
ch := s.next()
|
|
||||||
if ch == scanner.end_of_text {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
ch_byte := byte(ch)
|
|
||||||
|
|
||||||
if ch == `"` {
|
|
||||||
if byte(s.peek(-1)) != `\\` {
|
|
||||||
r += '\\'
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ch == `\\` {
|
|
||||||
next_ch := byte(s.at())
|
|
||||||
|
|
||||||
escape := ch_byte.ascii_str() + next_ch.ascii_str()
|
|
||||||
if escape.to_lower() == '\\u' {
|
|
||||||
mut b := s.next()
|
|
||||||
mut unicode_point := ''
|
|
||||||
for {
|
|
||||||
b = s.next()
|
|
||||||
if b != ` ` && b != scanner.end_of_text {
|
|
||||||
unicode_point += byte(b).ascii_str()
|
|
||||||
} else {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if unicode_point.len < 8 {
|
|
||||||
unicode_point = '0'.repeat(8 - unicode_point.len) + unicode_point
|
|
||||||
}
|
|
||||||
rn := rune(strconv.parse_int(unicode_point, 16, 0) ?)
|
|
||||||
r += '$rn'
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
r += ch_byte.ascii_str()
|
|
||||||
}
|
|
||||||
return r
|
|
||||||
}
|
|
||||||
|
|
|
@ -72,9 +72,9 @@ fn test_unicode_escapes() {
|
||||||
mut toml_doc := toml.parse(toml_unicode_escapes) or { panic(err) }
|
mut toml_doc := toml.parse(toml_unicode_escapes) or { panic(err) }
|
||||||
|
|
||||||
mut value := toml_doc.value('short')
|
mut value := toml_doc.value('short')
|
||||||
assert value.string() == r'\u03B4'
|
assert value.string() == '\u03B4' // <- This escape is handled by V
|
||||||
value = toml_doc.value('long')
|
value = toml_doc.value('long')
|
||||||
assert value.string() == r'\U000003B4'
|
assert value.string() == 'δ' // <- for the long escape we compare with the unicode point
|
||||||
}
|
}
|
||||||
|
|
||||||
fn test_literal_strings() {
|
fn test_literal_strings() {
|
||||||
|
|
Loading…
Reference in New Issue