toml: fix unicode and escape value decoding (#12534)

pull/12542/head
Larpon 2021-11-23 10:02:43 +01:00 committed by GitHub
parent 13a2d547b4
commit 3f0e532660
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 102 additions and 45 deletions

View File

@ -9,6 +9,11 @@ import toml.token
import toml.scanner import toml.scanner
import strconv import strconv
const (
// utf8_max is the largest valid Unicode scalar value (U+10FFFF), inclusive.
// Decoded `\u`/`\U` escape values are compared against it as the upper bound.
utf8_max = 0x10FFFF
)
// Decoder decodes special sequences in a tree of TOML `ast.Value`'s. // Decoder decodes special sequences in a tree of TOML `ast.Value`'s.
pub struct Decoder { pub struct Decoder {
scanner &scanner.Scanner scanner &scanner.Scanner
@ -36,7 +41,7 @@ fn (d Decoder) excerpt(tp token.Position) string {
// decode_quoted returns an error if `q` is not a valid quoted TOML string. // decode_quoted returns an error if `q` is not a valid quoted TOML string.
fn (d Decoder) decode_quoted(mut q ast.Quoted) ? { fn (d Decoder) decode_quoted(mut q ast.Quoted) ? {
d.decode_quoted_escapes(mut q) ? decode_quoted_escapes(mut q) ?
} }
// decode_quoted_escapes returns an error for any disallowed escape sequences. // decode_quoted_escapes returns an error for any disallowed escape sequences.
@ -53,12 +58,9 @@ fn (d Decoder) decode_quoted(mut q ast.Quoted) ? {
// \\ - backslash (U+005C) // \\ - backslash (U+005C)
// \uXXXX - Unicode (U+XXXX) // \uXXXX - Unicode (U+XXXX)
// \UXXXXXXXX - Unicode (U+XXXXXXXX) // \UXXXXXXXX - Unicode (U+XXXXXXXX)
fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? { pub fn decode_quoted_escapes(mut q ast.Quoted) ? {
// Setup a scanner in stack memory for easier navigation. // Setup a scanner in stack memory for easier navigation.
mut s := scanner.new_simple(q.text) ? mut eat_whitespace := false
q.text = q.text.replace('\\"', '"')
// TODO use string builder // TODO use string builder
mut decoded_s := '' mut decoded_s := ''
// See https://toml.io/en/v1.0.0#string for more info on string types. // See https://toml.io/en/v1.0.0#string for more info on string types.
@ -66,6 +68,10 @@ fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? {
if !is_basic { if !is_basic {
return return
} }
mut s := scanner.new_simple(q.text) ?
q.text = q.text.replace('\\"', '"')
for { for {
ch := s.next() ch := s.next()
if ch == scanner.end_of_text { if ch == scanner.end_of_text {
@ -73,15 +79,28 @@ fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? {
} }
ch_byte := byte(ch) ch_byte := byte(ch)
if eat_whitespace && ch_byte.is_space() {
continue
}
eat_whitespace = false
if ch == `\\` { if ch == `\\` {
ch_next := byte(s.at()) ch_next := s.at()
ch_next_byte := byte(ch_next)
if ch_next == `\\` { if ch_next == `\\` {
decoded_s += ch_next.ascii_str() decoded_s += ch_next_byte.ascii_str()
s.next() s.next()
continue continue
} }
if q.is_multiline {
if ch_next_byte.is_space() {
eat_whitespace = true
continue
}
}
if ch_next == `"` { if ch_next == `"` {
decoded_s += '"' decoded_s += '"'
s.next() s.next()
@ -94,33 +113,79 @@ fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? {
continue continue
} }
escape := ch_byte.ascii_str() + ch_next.ascii_str() if ch_next == `t` {
decoded_s += '\t'
s.next()
continue
}
if ch_next == `b` {
decoded_s += '\b'
s.next()
continue
}
if ch_next == `r` {
decoded_s += '\r'
s.next()
continue
}
if ch_next == `f` {
decoded_s += '\f'
s.next()
continue
}
escape := ch_byte.ascii_str() + ch_next_byte.ascii_str()
// Decode unicode escapes // Decode unicode escapes
if escape.to_lower() == '\\u' { if escape.to_lower() == '\\u' {
// Long type Unicode (\UXXXXXXXX) is a maximum of 10 chars: '\' + 'U' + 8 hex characters is_valid_short := byte(s.peek(1)).is_hex_digit() && byte(s.peek(2)).is_hex_digit()
// we pass in 10 characters from the `u`/`U` which is the longest possible sequence && byte(s.peek(3)).is_hex_digit() && byte(s.peek(4)).is_hex_digit()
// of 9 chars plus one extra.
mut decoded := '' if is_valid_short {
if s.remaining() >= 10 { // is_valid_long := byte(s.peek(5)).is_hex_digit() && byte(s.peek(6)).is_hex_digit() && byte(s.peek(7)).is_hex_digit() && byte(s.peek(8)).is_hex_digit()
pos := s.state().pos // Long type Unicode (\UXXXXXXXX) is a maximum of 10 chars: '\' + 'U' + 8 hex characters
decoded = d.decode_unicode_escape(s.text[pos..pos + 11]) or { // we pass in 10 characters from the `u`/`U` which is the longest possible sequence
st := s.state() // of 9 chars plus one extra.
return error(@MOD + '.' + @STRUCT + '.' + @FN + mut decoded := ''
' escaped Unicode is invalid. $err.msg.capitalize() ($st.line_nr,$st.col) in ...${d.excerpt(q.pos)}...') mut sequence_length := 0
mut unicode_val := 0
if s.remaining() >= 10 {
pos := s.state().pos
sequence := s.text[pos..pos + 11]
decoded, unicode_val, sequence_length = decode_unicode_escape(sequence) or {
decoded_s += escape
continue
}
if unicode_val > decoder.utf8_max || unicode_val < 0 {
decoded_s += escape
continue
}
// Check if the Unicode value is actually in the valid Unicode scalar value ranges.
if !((unicode_val >= 0x0000 && unicode_val <= 0xD7FF)
|| (unicode_val >= 0xE000 && unicode_val <= decoder.utf8_max)) {
decoded_s += escape
continue
}
if unicode_val in [0x7F, 0x1F, 0x5C, 0x75] {
sequence_length -= 2
}
decoded_s += decoded
s.skip_n(s.text[pos..pos + 2 + sequence_length + 1].len)
continue
} else {
pos := s.state().pos
sequence := s.text[pos..]
decoded, _, _ = decode_unicode_escape(sequence) or {
decoded_s += escape
continue
}
decoded_s += decoded
s.skip_n(s.text[pos..].len)
continue
} }
decoded_s += decoded
s.skip_n(s.text[pos..pos + 11].len)
continue
} else {
pos := s.state().pos
decoded = d.decode_unicode_escape(s.text[pos..]) or {
st := s.state()
return error(@MOD + '.' + @STRUCT + '.' + @FN +
' escaped Unicode is invalid. $err.msg.capitalize() ($st.line_nr,$st.col) in ...${d.excerpt(q.pos)}...')
}
decoded_s += decoded
s.skip_n(s.text[pos..].len)
continue
} }
} }
} }
@ -132,10 +197,11 @@ fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? {
// decode_unicode_escape returns an error if `esc_unicode` is not // decode_unicode_escape returns an error if `esc_unicode` is not
// a valid Unicode escape sequence. `esc_unicode` is expected to be // a valid Unicode escape sequence. `esc_unicode` is expected to be
// prefixed with either `u` or `U`. // prefixed with either `u` or `U`.
fn (d Decoder) decode_unicode_escape(esc_unicode string) ?string { fn decode_unicode_escape(esc_unicode string) ?(string, int, int) {
is_long_esc_type := esc_unicode.starts_with('U') is_long_esc_type := esc_unicode.starts_with('U')
mut sequence := esc_unicode[1..] mut sequence := esc_unicode[1..]
hex_digits_len := if is_long_esc_type { 8 } else { 4 } hex_digits_len := if is_long_esc_type { 8 } else { 4 }
mut sequence_len := hex_digits_len
sequence = sequence[..hex_digits_len] sequence = sequence[..hex_digits_len]
@ -143,6 +209,7 @@ fn (d Decoder) decode_unicode_escape(esc_unicode string) ?string {
if unicode_point.len < 8 { if unicode_point.len < 8 {
unicode_point = '0'.repeat(8 - unicode_point.len) + unicode_point unicode_point = '0'.repeat(8 - unicode_point.len) + unicode_point
} }
rn := rune(strconv.parse_int(unicode_point, 16, 0) ?) i64_val := strconv.parse_int(unicode_point, 16, 0) ?
return '$rn' rn := rune(i64_val)
return '$rn', int(i64_val), sequence_len
} }

View File

@ -1,9 +1,7 @@
import os import os
import toml import toml
import toml.ast import toml.ast
import toml.scanner
import x.json2 import x.json2
import strconv
// Instructions for developers: // Instructions for developers:
// The actual tests and data can be obtained by doing: // The actual tests and data can be obtained by doing:
@ -17,22 +15,14 @@ const (
invalid_exceptions = []string{} invalid_exceptions = []string{}
valid_value_exceptions = [ valid_value_exceptions = [
// String
'string/escapes.toml',
'string/multiline.toml',
// Integer // Integer
'integer/long.toml', 'integer/long.toml',
// Float // Float
'float/inf-and-nan.toml', 'float/inf-and-nan.toml',
// Comment
'comment/tricky.toml',
// Table // Table
'table/array-implicit.toml', 'table/array-implicit.toml',
'table/names.toml',
// Date-time // Date-time
'datetime/milliseconds.toml', 'datetime/milliseconds.toml',
// Inline-table
'inline-table/multiline.toml',
// Key // Key
'key/escapes.toml', 'key/escapes.toml',
] ]