toml: fix unicode and escape value decoding (#12534)
							parent
							
								
									13a2d547b4
								
							
						
					
					
						commit
						3f0e532660
					
				| 
						 | 
					@ -9,6 +9,11 @@ import toml.token
 | 
				
			||||||
import toml.scanner
 | 
					import toml.scanner
 | 
				
			||||||
import strconv
 | 
					import strconv
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					const (
 | 
				
			||||||
 | 
						// utf8_max is the largest inclusive value of the Unicode scalar value ranges.
 | 
				
			||||||
 | 
						utf8_max = 0x10FFFF
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// Decoder decodes special sequences in a tree of TOML `ast.Value`'s.
 | 
					// Decoder decodes special sequences in a tree of TOML `ast.Value`'s.
 | 
				
			||||||
pub struct Decoder {
 | 
					pub struct Decoder {
 | 
				
			||||||
	scanner &scanner.Scanner
 | 
						scanner &scanner.Scanner
 | 
				
			||||||
| 
						 | 
					@ -36,7 +41,7 @@ fn (d Decoder) excerpt(tp token.Position) string {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// decode_quoted returns an error if `q` is not a valid quoted TOML string.
 | 
					// decode_quoted returns an error if `q` is not a valid quoted TOML string.
 | 
				
			||||||
fn (d Decoder) decode_quoted(mut q ast.Quoted) ? {
 | 
					fn (d Decoder) decode_quoted(mut q ast.Quoted) ? {
 | 
				
			||||||
	d.decode_quoted_escapes(mut q) ?
 | 
						decode_quoted_escapes(mut q) ?
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// decode_quoted_escapes returns an error for any disallowed escape sequences.
 | 
					// decode_quoted_escapes returns an error for any disallowed escape sequences.
 | 
				
			||||||
| 
						 | 
					@ -53,12 +58,9 @@ fn (d Decoder) decode_quoted(mut q ast.Quoted) ? {
 | 
				
			||||||
// \\         - backslash       (U+005C)
 | 
					// \\         - backslash       (U+005C)
 | 
				
			||||||
// \uXXXX     - Unicode         (U+XXXX)
 | 
					// \uXXXX     - Unicode         (U+XXXX)
 | 
				
			||||||
// \UXXXXXXXX - Unicode         (U+XXXXXXXX)
 | 
					// \UXXXXXXXX - Unicode         (U+XXXXXXXX)
 | 
				
			||||||
fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? {
 | 
					pub fn decode_quoted_escapes(mut q ast.Quoted) ? {
 | 
				
			||||||
	// Set up a scanner in stack memory for easier navigation.
 | 
						// Set up a scanner in stack memory for easier navigation.
 | 
				
			||||||
	mut s := scanner.new_simple(q.text) ?
 | 
						mut eat_whitespace := false
 | 
				
			||||||
 | 
					 | 
				
			||||||
	q.text = q.text.replace('\\"', '"')
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	// TODO use string builder
 | 
						// TODO use string builder
 | 
				
			||||||
	mut decoded_s := ''
 | 
						mut decoded_s := ''
 | 
				
			||||||
	// See https://toml.io/en/v1.0.0#string for more info on string types.
 | 
						// See https://toml.io/en/v1.0.0#string for more info on string types.
 | 
				
			||||||
| 
						 | 
					@ -66,6 +68,10 @@ fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? {
 | 
				
			||||||
	if !is_basic {
 | 
						if !is_basic {
 | 
				
			||||||
		return
 | 
							return
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						mut s := scanner.new_simple(q.text) ?
 | 
				
			||||||
 | 
						q.text = q.text.replace('\\"', '"')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	for {
 | 
						for {
 | 
				
			||||||
		ch := s.next()
 | 
							ch := s.next()
 | 
				
			||||||
		if ch == scanner.end_of_text {
 | 
							if ch == scanner.end_of_text {
 | 
				
			||||||
| 
						 | 
					@ -73,15 +79,28 @@ fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? {
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
		ch_byte := byte(ch)
 | 
							ch_byte := byte(ch)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							if eat_whitespace && ch_byte.is_space() {
 | 
				
			||||||
 | 
								continue
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
							eat_whitespace = false
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		if ch == `\\` {
 | 
							if ch == `\\` {
 | 
				
			||||||
			ch_next := byte(s.at())
 | 
								ch_next := s.at()
 | 
				
			||||||
 | 
								ch_next_byte := byte(ch_next)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
			if ch_next == `\\` {
 | 
								if ch_next == `\\` {
 | 
				
			||||||
				decoded_s += ch_next.ascii_str()
 | 
									decoded_s += ch_next_byte.ascii_str()
 | 
				
			||||||
				s.next()
 | 
									s.next()
 | 
				
			||||||
				continue
 | 
									continue
 | 
				
			||||||
			}
 | 
								}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
								if q.is_multiline {
 | 
				
			||||||
 | 
									if ch_next_byte.is_space() {
 | 
				
			||||||
 | 
										eat_whitespace = true
 | 
				
			||||||
 | 
										continue
 | 
				
			||||||
 | 
									}
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
			if ch_next == `"` {
 | 
								if ch_next == `"` {
 | 
				
			||||||
				decoded_s += '"'
 | 
									decoded_s += '"'
 | 
				
			||||||
				s.next()
 | 
									s.next()
 | 
				
			||||||
| 
						 | 
					@ -94,33 +113,79 @@ fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? {
 | 
				
			||||||
				continue
 | 
									continue
 | 
				
			||||||
			}
 | 
								}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
			escape := ch_byte.ascii_str() + ch_next.ascii_str()
 | 
								if ch_next == `t` {
 | 
				
			||||||
 | 
									decoded_s += '\t'
 | 
				
			||||||
 | 
									s.next()
 | 
				
			||||||
 | 
									continue
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
								if ch_next == `b` {
 | 
				
			||||||
 | 
									decoded_s += '\b'
 | 
				
			||||||
 | 
									s.next()
 | 
				
			||||||
 | 
									continue
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
								if ch_next == `r` {
 | 
				
			||||||
 | 
									decoded_s += '\r'
 | 
				
			||||||
 | 
									s.next()
 | 
				
			||||||
 | 
									continue
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
								if ch_next == `f` {
 | 
				
			||||||
 | 
									decoded_s += '\f'
 | 
				
			||||||
 | 
									s.next()
 | 
				
			||||||
 | 
									continue
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
								escape := ch_byte.ascii_str() + ch_next_byte.ascii_str()
 | 
				
			||||||
			// Decode unicode escapes
 | 
								// Decode unicode escapes
 | 
				
			||||||
			if escape.to_lower() == '\\u' {
 | 
								if escape.to_lower() == '\\u' {
 | 
				
			||||||
				// Long type Unicode (\UXXXXXXXX) is a maximum of 10 chars: '\' + 'U' + 8 hex characters
 | 
									is_valid_short := byte(s.peek(1)).is_hex_digit() && byte(s.peek(2)).is_hex_digit()
 | 
				
			||||||
				// we pass in 10 characters from the `u`/`U` which is the longest possible sequence
 | 
										&& byte(s.peek(3)).is_hex_digit() && byte(s.peek(4)).is_hex_digit()
 | 
				
			||||||
				// of 9 chars plus one extra.
 | 
					
 | 
				
			||||||
				mut decoded := ''
 | 
									if is_valid_short {
 | 
				
			||||||
				if s.remaining() >= 10 {
 | 
										// is_valid_long := byte(s.peek(5)).is_hex_digit() && byte(s.peek(6)).is_hex_digit() && byte(s.peek(7)).is_hex_digit() && byte(s.peek(8)).is_hex_digit()
 | 
				
			||||||
					pos := s.state().pos
 | 
										// Long type Unicode (\UXXXXXXXX) is a maximum of 10 chars: '\' + 'U' + 8 hex characters
 | 
				
			||||||
					decoded = d.decode_unicode_escape(s.text[pos..pos + 11]) or {
 | 
										// we pass in 10 characters from the `u`/`U` which is the longest possible sequence
 | 
				
			||||||
						st := s.state()
 | 
										// of 9 chars plus one extra.
 | 
				
			||||||
						return error(@MOD + '.' + @STRUCT + '.' + @FN +
 | 
										mut decoded := ''
 | 
				
			||||||
							' escaped Unicode is invalid. $err.msg.capitalize() ($st.line_nr,$st.col) in ...${d.excerpt(q.pos)}...')
 | 
										mut sequence_length := 0
 | 
				
			||||||
 | 
										mut unicode_val := 0
 | 
				
			||||||
 | 
										if s.remaining() >= 10 {
 | 
				
			||||||
 | 
											pos := s.state().pos
 | 
				
			||||||
 | 
											sequence := s.text[pos..pos + 11]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
											decoded, unicode_val, sequence_length = decode_unicode_escape(sequence) or {
 | 
				
			||||||
 | 
												decoded_s += escape
 | 
				
			||||||
 | 
												continue
 | 
				
			||||||
 | 
											}
 | 
				
			||||||
 | 
											if unicode_val > decoder.utf8_max || unicode_val < 0 {
 | 
				
			||||||
 | 
												decoded_s += escape
 | 
				
			||||||
 | 
												continue
 | 
				
			||||||
 | 
											}
 | 
				
			||||||
 | 
											// Check if the Unicode value is actually in the valid Unicode scalar value ranges.
 | 
				
			||||||
 | 
											if !((unicode_val >= 0x0000 && unicode_val <= 0xD7FF)
 | 
				
			||||||
 | 
												|| (unicode_val >= 0xE000 && unicode_val <= decoder.utf8_max)) {
 | 
				
			||||||
 | 
												decoded_s += escape
 | 
				
			||||||
 | 
												continue
 | 
				
			||||||
 | 
											}
 | 
				
			||||||
 | 
											if unicode_val in [0x7F, 0x1F, 0x5C, 0x75] {
 | 
				
			||||||
 | 
												sequence_length -= 2
 | 
				
			||||||
 | 
											}
 | 
				
			||||||
 | 
											decoded_s += decoded
 | 
				
			||||||
 | 
											s.skip_n(s.text[pos..pos + 2 + sequence_length + 1].len)
 | 
				
			||||||
 | 
											continue
 | 
				
			||||||
 | 
										} else {
 | 
				
			||||||
 | 
											pos := s.state().pos
 | 
				
			||||||
 | 
											sequence := s.text[pos..]
 | 
				
			||||||
 | 
											decoded, _, _ = decode_unicode_escape(sequence) or {
 | 
				
			||||||
 | 
												decoded_s += escape
 | 
				
			||||||
 | 
												continue
 | 
				
			||||||
 | 
											}
 | 
				
			||||||
 | 
											decoded_s += decoded
 | 
				
			||||||
 | 
											s.skip_n(s.text[pos..].len)
 | 
				
			||||||
 | 
											continue
 | 
				
			||||||
					}
 | 
										}
 | 
				
			||||||
					decoded_s += decoded
 | 
					 | 
				
			||||||
					s.skip_n(s.text[pos..pos + 11].len)
 | 
					 | 
				
			||||||
					continue
 | 
					 | 
				
			||||||
				} else {
 | 
					 | 
				
			||||||
					pos := s.state().pos
 | 
					 | 
				
			||||||
					decoded = d.decode_unicode_escape(s.text[pos..]) or {
 | 
					 | 
				
			||||||
						st := s.state()
 | 
					 | 
				
			||||||
						return error(@MOD + '.' + @STRUCT + '.' + @FN +
 | 
					 | 
				
			||||||
							' escaped Unicode is invalid. $err.msg.capitalize() ($st.line_nr,$st.col) in ...${d.excerpt(q.pos)}...')
 | 
					 | 
				
			||||||
					}
 | 
					 | 
				
			||||||
					decoded_s += decoded
 | 
					 | 
				
			||||||
					s.skip_n(s.text[pos..].len)
 | 
					 | 
				
			||||||
					continue
 | 
					 | 
				
			||||||
				}
 | 
									}
 | 
				
			||||||
			}
 | 
								}
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
| 
						 | 
					@ -132,10 +197,11 @@ fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? {
 | 
				
			||||||
// decode_unicode_escape returns an error if `esc_unicode` is not
 | 
					// decode_unicode_escape returns an error if `esc_unicode` is not
 | 
				
			||||||
// a valid Unicode escape sequence. `esc_unicode` is expected to be
 | 
					// a valid Unicode escape sequence. `esc_unicode` is expected to be
 | 
				
			||||||
// prefixed with either `u` or `U`.
 | 
					// prefixed with either `u` or `U`.
 | 
				
			||||||
fn (d Decoder) decode_unicode_escape(esc_unicode string) ?string {
 | 
					fn decode_unicode_escape(esc_unicode string) ?(string, int, int) {
 | 
				
			||||||
	is_long_esc_type := esc_unicode.starts_with('U')
 | 
						is_long_esc_type := esc_unicode.starts_with('U')
 | 
				
			||||||
	mut sequence := esc_unicode[1..]
 | 
						mut sequence := esc_unicode[1..]
 | 
				
			||||||
	hex_digits_len := if is_long_esc_type { 8 } else { 4 }
 | 
						hex_digits_len := if is_long_esc_type { 8 } else { 4 }
 | 
				
			||||||
 | 
						mut sequence_len := hex_digits_len
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	sequence = sequence[..hex_digits_len]
 | 
						sequence = sequence[..hex_digits_len]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -143,6 +209,7 @@ fn (d Decoder) decode_unicode_escape(esc_unicode string) ?string {
 | 
				
			||||||
	if unicode_point.len < 8 {
 | 
						if unicode_point.len < 8 {
 | 
				
			||||||
		unicode_point = '0'.repeat(8 - unicode_point.len) + unicode_point
 | 
							unicode_point = '0'.repeat(8 - unicode_point.len) + unicode_point
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	rn := rune(strconv.parse_int(unicode_point, 16, 0) ?)
 | 
						i64_val := strconv.parse_int(unicode_point, 16, 0) ?
 | 
				
			||||||
	return '$rn'
 | 
						rn := rune(i64_val)
 | 
				
			||||||
 | 
						return '$rn', int(i64_val), sequence_len
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,9 +1,7 @@
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
import toml
 | 
					import toml
 | 
				
			||||||
import toml.ast
 | 
					import toml.ast
 | 
				
			||||||
import toml.scanner
 | 
					 | 
				
			||||||
import x.json2
 | 
					import x.json2
 | 
				
			||||||
import strconv
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
// Instructions for developers:
 | 
					// Instructions for developers:
 | 
				
			||||||
// The actual tests and data can be obtained by doing:
 | 
					// The actual tests and data can be obtained by doing:
 | 
				
			||||||
| 
						 | 
					@ -17,22 +15,14 @@ const (
 | 
				
			||||||
	invalid_exceptions     = []string{}
 | 
						invalid_exceptions     = []string{}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	valid_value_exceptions = [
 | 
						valid_value_exceptions = [
 | 
				
			||||||
		// String
 | 
					 | 
				
			||||||
		'string/escapes.toml',
 | 
					 | 
				
			||||||
		'string/multiline.toml',
 | 
					 | 
				
			||||||
		// Integer
 | 
							// Integer
 | 
				
			||||||
		'integer/long.toml',
 | 
							'integer/long.toml',
 | 
				
			||||||
		// Float
 | 
							// Float
 | 
				
			||||||
		'float/inf-and-nan.toml',
 | 
							'float/inf-and-nan.toml',
 | 
				
			||||||
		// Comment
 | 
					 | 
				
			||||||
		'comment/tricky.toml',
 | 
					 | 
				
			||||||
		// Table
 | 
							// Table
 | 
				
			||||||
		'table/array-implicit.toml',
 | 
							'table/array-implicit.toml',
 | 
				
			||||||
		'table/names.toml',
 | 
					 | 
				
			||||||
		// Date-time
 | 
							// Date-time
 | 
				
			||||||
		'datetime/milliseconds.toml',
 | 
							'datetime/milliseconds.toml',
 | 
				
			||||||
		// Inline-table
 | 
					 | 
				
			||||||
		'inline-table/multiline.toml',
 | 
					 | 
				
			||||||
		// Key
 | 
							// Key
 | 
				
			||||||
		'key/escapes.toml',
 | 
							'key/escapes.toml',
 | 
				
			||||||
	]
 | 
						]
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue