toml: fix unicode and escape value decoding (#12534)
parent
13a2d547b4
commit
3f0e532660
|
@ -9,6 +9,11 @@ import toml.token
|
||||||
import toml.scanner
|
import toml.scanner
|
||||||
import strconv
|
import strconv
|
||||||
|
|
||||||
|
const (
|
||||||
|
// utf8_max is the largest inclusive value of the Unicode scalar value ranges.
|
||||||
|
utf8_max = 0x10FFFF
|
||||||
|
)
|
||||||
|
|
||||||
// Decoder decodes special sequences in a tree of TOML `ast.Value`'s.
|
// Decoder decodes special sequences in a tree of TOML `ast.Value`'s.
|
||||||
pub struct Decoder {
|
pub struct Decoder {
|
||||||
scanner &scanner.Scanner
|
scanner &scanner.Scanner
|
||||||
|
@ -36,7 +41,7 @@ fn (d Decoder) excerpt(tp token.Position) string {
|
||||||
|
|
||||||
// decode_quoted returns an error if `q` is not a valid quoted TOML string.
|
// decode_quoted returns an error if `q` is not a valid quoted TOML string.
|
||||||
fn (d Decoder) decode_quoted(mut q ast.Quoted) ? {
|
fn (d Decoder) decode_quoted(mut q ast.Quoted) ? {
|
||||||
d.decode_quoted_escapes(mut q) ?
|
decode_quoted_escapes(mut q) ?
|
||||||
}
|
}
|
||||||
|
|
||||||
// decode_quoted_escapes returns an error for any disallowed escape sequences.
|
// decode_quoted_escapes returns an error for any disallowed escape sequences.
|
||||||
|
@ -53,12 +58,9 @@ fn (d Decoder) decode_quoted(mut q ast.Quoted) ? {
|
||||||
// \\ - backslash (U+005C)
|
// \\ - backslash (U+005C)
|
||||||
// \uXXXX - Unicode (U+XXXX)
|
// \uXXXX - Unicode (U+XXXX)
|
||||||
// \UXXXXXXXX - Unicode (U+XXXXXXXX)
|
// \UXXXXXXXX - Unicode (U+XXXXXXXX)
|
||||||
fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? {
|
pub fn decode_quoted_escapes(mut q ast.Quoted) ? {
|
||||||
// Setup a scanner in stack memory for easier navigation.
|
// Setup a scanner in stack memory for easier navigation.
|
||||||
mut s := scanner.new_simple(q.text) ?
|
mut eat_whitespace := false
|
||||||
|
|
||||||
q.text = q.text.replace('\\"', '"')
|
|
||||||
|
|
||||||
// TODO use string builder
|
// TODO use string builder
|
||||||
mut decoded_s := ''
|
mut decoded_s := ''
|
||||||
// See https://toml.io/en/v1.0.0#string for more info on string types.
|
// See https://toml.io/en/v1.0.0#string for more info on string types.
|
||||||
|
@ -66,6 +68,10 @@ fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? {
|
||||||
if !is_basic {
|
if !is_basic {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mut s := scanner.new_simple(q.text) ?
|
||||||
|
q.text = q.text.replace('\\"', '"')
|
||||||
|
|
||||||
for {
|
for {
|
||||||
ch := s.next()
|
ch := s.next()
|
||||||
if ch == scanner.end_of_text {
|
if ch == scanner.end_of_text {
|
||||||
|
@ -73,15 +79,28 @@ fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? {
|
||||||
}
|
}
|
||||||
ch_byte := byte(ch)
|
ch_byte := byte(ch)
|
||||||
|
|
||||||
|
if eat_whitespace && ch_byte.is_space() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
eat_whitespace = false
|
||||||
|
|
||||||
if ch == `\\` {
|
if ch == `\\` {
|
||||||
ch_next := byte(s.at())
|
ch_next := s.at()
|
||||||
|
ch_next_byte := byte(ch_next)
|
||||||
|
|
||||||
if ch_next == `\\` {
|
if ch_next == `\\` {
|
||||||
decoded_s += ch_next.ascii_str()
|
decoded_s += ch_next_byte.ascii_str()
|
||||||
s.next()
|
s.next()
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if q.is_multiline {
|
||||||
|
if ch_next_byte.is_space() {
|
||||||
|
eat_whitespace = true
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if ch_next == `"` {
|
if ch_next == `"` {
|
||||||
decoded_s += '"'
|
decoded_s += '"'
|
||||||
s.next()
|
s.next()
|
||||||
|
@ -94,29 +113,74 @@ fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
escape := ch_byte.ascii_str() + ch_next.ascii_str()
|
if ch_next == `t` {
|
||||||
|
decoded_s += '\t'
|
||||||
|
s.next()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if ch_next == `b` {
|
||||||
|
decoded_s += '\b'
|
||||||
|
s.next()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if ch_next == `r` {
|
||||||
|
decoded_s += '\r'
|
||||||
|
s.next()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if ch_next == `f` {
|
||||||
|
decoded_s += '\f'
|
||||||
|
s.next()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
escape := ch_byte.ascii_str() + ch_next_byte.ascii_str()
|
||||||
// Decode unicode escapes
|
// Decode unicode escapes
|
||||||
if escape.to_lower() == '\\u' {
|
if escape.to_lower() == '\\u' {
|
||||||
|
is_valid_short := byte(s.peek(1)).is_hex_digit() && byte(s.peek(2)).is_hex_digit()
|
||||||
|
&& byte(s.peek(3)).is_hex_digit() && byte(s.peek(4)).is_hex_digit()
|
||||||
|
|
||||||
|
if is_valid_short {
|
||||||
|
// is_valid_long := byte(s.peek(5)).is_hex_digit() && byte(s.peek(6)).is_hex_digit() && byte(s.peek(7)).is_hex_digit() && byte(s.peek(8)).is_hex_digit()
|
||||||
// Long type Unicode (\UXXXXXXXX) is a maximum of 10 chars: '\' + 'U' + 8 hex characters
|
// Long type Unicode (\UXXXXXXXX) is a maximum of 10 chars: '\' + 'U' + 8 hex characters
|
||||||
// we pass in 10 characters from the `u`/`U` which is the longest possible sequence
|
// we pass in 10 characters from the `u`/`U` which is the longest possible sequence
|
||||||
// of 9 chars plus one extra.
|
// of 9 chars plus one extra.
|
||||||
mut decoded := ''
|
mut decoded := ''
|
||||||
|
mut sequence_length := 0
|
||||||
|
mut unicode_val := 0
|
||||||
if s.remaining() >= 10 {
|
if s.remaining() >= 10 {
|
||||||
pos := s.state().pos
|
pos := s.state().pos
|
||||||
decoded = d.decode_unicode_escape(s.text[pos..pos + 11]) or {
|
sequence := s.text[pos..pos + 11]
|
||||||
st := s.state()
|
|
||||||
return error(@MOD + '.' + @STRUCT + '.' + @FN +
|
decoded, unicode_val, sequence_length = decode_unicode_escape(sequence) or {
|
||||||
' escaped Unicode is invalid. $err.msg.capitalize() ($st.line_nr,$st.col) in ...${d.excerpt(q.pos)}...')
|
decoded_s += escape
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if unicode_val > decoder.utf8_max || unicode_val < 0 {
|
||||||
|
decoded_s += escape
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Check if the Unicode value is actually in the valid Unicode scalar value ranges.
|
||||||
|
if !((unicode_val >= 0x0000 && unicode_val <= 0xD7FF)
|
||||||
|
|| (unicode_val >= 0xE000 && unicode_val <= decoder.utf8_max)) {
|
||||||
|
decoded_s += escape
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if unicode_val in [0x7F, 0x1F, 0x5C, 0x75] {
|
||||||
|
sequence_length -= 2
|
||||||
}
|
}
|
||||||
decoded_s += decoded
|
decoded_s += decoded
|
||||||
s.skip_n(s.text[pos..pos + 11].len)
|
s.skip_n(s.text[pos..pos + 2 + sequence_length + 1].len)
|
||||||
continue
|
continue
|
||||||
} else {
|
} else {
|
||||||
pos := s.state().pos
|
pos := s.state().pos
|
||||||
decoded = d.decode_unicode_escape(s.text[pos..]) or {
|
sequence := s.text[pos..]
|
||||||
st := s.state()
|
decoded, _, _ = decode_unicode_escape(sequence) or {
|
||||||
return error(@MOD + '.' + @STRUCT + '.' + @FN +
|
decoded_s += escape
|
||||||
' escaped Unicode is invalid. $err.msg.capitalize() ($st.line_nr,$st.col) in ...${d.excerpt(q.pos)}...')
|
continue
|
||||||
}
|
}
|
||||||
decoded_s += decoded
|
decoded_s += decoded
|
||||||
s.skip_n(s.text[pos..].len)
|
s.skip_n(s.text[pos..].len)
|
||||||
|
@ -124,6 +188,7 @@ fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
decoded_s += ch_byte.ascii_str()
|
decoded_s += ch_byte.ascii_str()
|
||||||
}
|
}
|
||||||
q.text = decoded_s
|
q.text = decoded_s
|
||||||
|
@ -132,10 +197,11 @@ fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? {
|
||||||
// decode_unicode_escape returns an error if `esc_unicode` is not
|
// decode_unicode_escape returns an error if `esc_unicode` is not
|
||||||
// a valid Unicode escape sequence. `esc_unicode` is expected to be
|
// a valid Unicode escape sequence. `esc_unicode` is expected to be
|
||||||
// prefixed with either `u` or `U`.
|
// prefixed with either `u` or `U`.
|
||||||
fn (d Decoder) decode_unicode_escape(esc_unicode string) ?string {
|
fn decode_unicode_escape(esc_unicode string) ?(string, int, int) {
|
||||||
is_long_esc_type := esc_unicode.starts_with('U')
|
is_long_esc_type := esc_unicode.starts_with('U')
|
||||||
mut sequence := esc_unicode[1..]
|
mut sequence := esc_unicode[1..]
|
||||||
hex_digits_len := if is_long_esc_type { 8 } else { 4 }
|
hex_digits_len := if is_long_esc_type { 8 } else { 4 }
|
||||||
|
mut sequence_len := hex_digits_len
|
||||||
|
|
||||||
sequence = sequence[..hex_digits_len]
|
sequence = sequence[..hex_digits_len]
|
||||||
|
|
||||||
|
@ -143,6 +209,7 @@ fn (d Decoder) decode_unicode_escape(esc_unicode string) ?string {
|
||||||
if unicode_point.len < 8 {
|
if unicode_point.len < 8 {
|
||||||
unicode_point = '0'.repeat(8 - unicode_point.len) + unicode_point
|
unicode_point = '0'.repeat(8 - unicode_point.len) + unicode_point
|
||||||
}
|
}
|
||||||
rn := rune(strconv.parse_int(unicode_point, 16, 0) ?)
|
i64_val := strconv.parse_int(unicode_point, 16, 0) ?
|
||||||
return '$rn'
|
rn := rune(i64_val)
|
||||||
|
return '$rn', int(i64_val), sequence_len
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,9 +1,7 @@
|
||||||
import os
|
import os
|
||||||
import toml
|
import toml
|
||||||
import toml.ast
|
import toml.ast
|
||||||
import toml.scanner
|
|
||||||
import x.json2
|
import x.json2
|
||||||
import strconv
|
|
||||||
|
|
||||||
// Instructions for developers:
|
// Instructions for developers:
|
||||||
// The actual tests and data can be obtained by doing:
|
// The actual tests and data can be obtained by doing:
|
||||||
|
@ -17,22 +15,14 @@ const (
|
||||||
invalid_exceptions = []string{}
|
invalid_exceptions = []string{}
|
||||||
|
|
||||||
valid_value_exceptions = [
|
valid_value_exceptions = [
|
||||||
// String
|
|
||||||
'string/escapes.toml',
|
|
||||||
'string/multiline.toml',
|
|
||||||
// Integer
|
// Integer
|
||||||
'integer/long.toml',
|
'integer/long.toml',
|
||||||
// Float
|
// Float
|
||||||
'float/inf-and-nan.toml',
|
'float/inf-and-nan.toml',
|
||||||
// Comment
|
|
||||||
'comment/tricky.toml',
|
|
||||||
// Table
|
// Table
|
||||||
'table/array-implicit.toml',
|
'table/array-implicit.toml',
|
||||||
'table/names.toml',
|
|
||||||
// Date-time
|
// Date-time
|
||||||
'datetime/milliseconds.toml',
|
'datetime/milliseconds.toml',
|
||||||
// Inline-table
|
|
||||||
'inline-table/multiline.toml',
|
|
||||||
// Key
|
// Key
|
||||||
'key/escapes.toml',
|
'key/escapes.toml',
|
||||||
]
|
]
|
||||||
|
|
Loading…
Reference in New Issue