From d53bb54c0a1e6c62365df7e011c1bb3260c7ca96 Mon Sep 17 00:00:00 2001 From: Larpon Date: Wed, 27 Oct 2021 14:28:46 +0200 Subject: [PATCH] toml: implement checks for UTF-8 validity (#12313) --- vlib/toml/checker/checker.v | 101 +++++++++++++++++++- vlib/toml/tests/burntsushi.toml-test_test.v | 7 -- 2 files changed, 96 insertions(+), 12 deletions(-) diff --git a/vlib/toml/checker/checker.v b/vlib/toml/checker/checker.v index 9880ad1ee4..b76fc24f7d 100644 --- a/vlib/toml/checker/checker.v +++ b/vlib/toml/checker/checker.v @@ -12,6 +12,9 @@ import encoding.utf8 pub const allowed_basic_escape_chars = [`u`, `U`, `b`, `t`, `n`, `f`, `r`, `"`, `\\`] +// utf8_max is the highest code point accepted by the Unicode checks (U+10FFFF), i.e. the value of hex2int('10FFFF') +const utf8_max = 1114111 + // Checker checks a tree of TOML `ast.Value`'s for common errors. pub struct Checker { scanner &scanner.Scanner @@ -288,12 +291,13 @@ fn (c Checker) check_quoted(q ast.Quoted) ? { // \r - carriage return (U+000D) // \" - quote (U+0022) // \\ - backslash (U+005C) -// \uXXXX - unicode (U+XXXX) -// \UXXXXXXXX - unicode (U+XXXXXXXX) +// \uXXXX - Unicode (U+XXXX) +// \UXXXXXXXX - Unicode (U+XXXXXXXX) fn (c Checker) check_quoted_escapes(q ast.Quoted) ? { // Setup a scanner in stack memory for easier navigation. mut s := scanner.new_simple(q.text) ? + // See https://toml.io/en/v1.0.0#string for more info on string types. is_basic := q.quote == `\"` for { ch := s.next() @@ -308,6 +312,7 @@ fn (c Checker) check_quoted_escapes(q ast.Quoted) ? { s.next() continue } + escape := ch_byte.ascii_str() + next_ch.ascii_str() if is_basic { if q.is_multiline { @@ -327,6 +332,27 @@ fn (c Checker) check_quoted_escapes(q ast.Quoted) ? 
{ ' unknown basic string escape character `$next_ch.ascii_str()` in `$escape` ($st.line_nr,$st.col) in ...${c.excerpt(q.pos)}...') } } + // Check Unicode escapes + if is_basic && escape.to_lower() == '\\u' { + // Long type Unicode (\UXXXXXXXX) is a maximum of 10 chars: '\' + 'U' + 8 hex characters + // we pass in 10 characters from the `u`/`U` which is the longest possible sequence + // of 9 chars plus one extra. NOTE(review): the slice below is `pos..pos + 11` (11 bytes), not 10 - confirm which length is intended. + if s.remaining() >= 10 { + pos := s.state().pos + c.check_unicode_escape(s.text[pos..pos + 11]) or { + st := s.state() + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' escaped Unicode is invalid. $err.msg.capitalize() ($st.line_nr,$st.col) in ...${c.excerpt(q.pos)}...') + } + } else { + pos := s.state().pos + c.check_unicode_escape(s.text[pos..]) or { + st := s.state() + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' escaped Unicode is invalid. $err.msg.capitalize() ($st.line_nr,$st.col) in ...${c.excerpt(q.pos)}...') + } + } + } } } } @@ -340,8 +366,73 @@ fn (c Checker) check_utf8_validity(q ast.Quoted) ? { } } -pub fn (c Checker) check_comment(cmt ast.Comment) ? { - lit := cmt.text +// hex2int returns the value of `hex` as `int`. +// NOTE that the code assumes `hex` to be in uppercase A-F. +// It does not work if the length of the input string is beyond the max value of `int`. +// Also, there is no error trapping for illegal hex characters. +fn hex2int(hex string) int { + // Adapted from https://stackoverflow.com/a/130552/1904615 + mut val := 0 + for i := 0; i < hex.len; i++ { + if hex[i] <= 57 { + val += (hex[i] - 48) * (1 << (4 * (hex.len - 1 - i))) + } else { + val += (hex[i] - 55) * (1 << (4 * (hex.len - 1 - i))) + } + } + return val +} + +// validate_utf8_codepoint_string returns an error if `str` is not a valid Unicode code point. +// `str` is expected to be a `string` containing *only* hex values. +// Any preludes or prefixes like `0x` could potentially yield wrong results. +fn validate_utf8_codepoint_string(str string) ? 
{ + int_val := hex2int(str) + if int_val > checker.utf8_max || int_val < 0 { + return error('Unicode code point `$str` is outside the valid Unicode scalar value ranges.') + } + // Check if the Unicode value is actually in the valid Unicode scalar value ranges (this excludes U+D800..U+DFFF). + // TODO should probably be transferred / implemented in `utf8.validate(...)` also? + if !((int_val >= 0x0000 && int_val <= 0xD7FF) || (int_val >= 0xE000 && int_val <= 0x10FFFF)) { + return error('Unicode code point `$str` is not a valid Unicode scalar value.') + } + bytes := str.bytes() + if !utf8.validate(bytes.data, bytes.len) { + return error('Unicode code point `$str` is not a valid UTF-8 code point.') + } +} + +// check_unicode_escape returns an error if `esc_unicode` is not +// a valid Unicode escape sequence. `esc_unicode` is expected to be +// prefixed with either `u` or `U`. +fn (c Checker) check_unicode_escape(esc_unicode string) ? { + if esc_unicode.len < 5 || !esc_unicode.to_lower().starts_with('u') { + // Makes sure the input to this function is actually valid. + return error('`$esc_unicode` is not a valid escaped Unicode sequence.') + } + is_long_esc_type := esc_unicode.starts_with('U') + mut sequence := esc_unicode[1..] + hex_digits_len := if is_long_esc_type { 8 } else { 4 } + if sequence.len < hex_digits_len { + return error('Unicode escape sequence `$esc_unicode` should be at least $hex_digits_len in length.') + } + sequence = sequence[..hex_digits_len] + // TODO not enforced in BurntSushi test suite?? + // if !sequence.is_upper() { + // return error('Unicode escape sequence `$esc_unicode` is not in all uppercase.') + //} + validate_utf8_codepoint_string(sequence.to_upper()) ? + if is_long_esc_type { + // Long escape type checks (placeholder, none implemented here) + } else { + // Short escape type checks (placeholder, none implemented here) + } +} + +// check_comment returns an error if the contents of `comment` isn't +// a valid TOML comment. +pub fn (c Checker) check_comment(comment ast.Comment) ? 
{ + lit := comment.text // Setup a scanner in stack memory for easier navigation. mut s := scanner.new_simple(lit) ? for { @@ -361,6 +452,6 @@ pub fn (c Checker) check_comment(cmt ast.Comment) ? { // Check for bad UTF-8 encoding if !utf8.validate_str(lit) { return error(@MOD + '.' + @STRUCT + '.' + @FN + - ' comment "$lit" is not valid UTF-8 in ...${c.excerpt(cmt.pos)}...') + ' comment "$lit" is not valid UTF-8 in ...${c.excerpt(comment.pos)}...') } } diff --git a/vlib/toml/tests/burntsushi.toml-test_test.v b/vlib/toml/tests/burntsushi.toml-test_test.v index c65aff1f43..3967ba36c3 100644 --- a/vlib/toml/tests/burntsushi.toml-test_test.v +++ b/vlib/toml/tests/burntsushi.toml-test_test.v @@ -12,13 +12,6 @@ const ( 'table/array-table-array.toml', ] invalid_exceptions = [ - // String - 'string/basic-multiline-out-of-range-unicode-escape-1.toml', - 'string/bad-codepoint.toml', - 'string/basic-multiline-out-of-range-unicode-escape-2.toml', - 'string/basic-out-of-range-unicode-escape-1.toml', - 'string/basic-out-of-range-unicode-escape-2.toml', - 'string/bad-uni-esc.toml', // Table 'table/rrbrace.toml', 'table/duplicate-table-array2.toml',