From 8273c0582b7f7d5b8459b56a56358048c29470c5 Mon Sep 17 00:00:00 2001 From: Larpon Date: Fri, 22 Oct 2021 18:57:32 +0200 Subject: [PATCH] toml: check for illegal characters, fix all related skipped tests (#12270) --- vlib/toml/scanner/scanner.v | 19 +++++++++++++++-- vlib/toml/tests/burntsushi.toml-test_test.v | 23 +-------------------- vlib/toml/util/util.v | 13 ++++++++++++ 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/vlib/toml/scanner/scanner.v b/vlib/toml/scanner/scanner.v index e2ae4c1a09..b35b887160 100644 --- a/vlib/toml/scanner/scanner.v +++ b/vlib/toml/scanner/scanner.v @@ -175,7 +175,7 @@ pub fn (mut s Scanner) scan() ?token.Token { } `#` { start := s.pos //+ 1 - s.ignore_line() + s.ignore_line() ? hash := s.text[start..s.pos] util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified comment hash "$hash" ($hash.len)') return s.new_token(.hash, hash, hash.len + 1) @@ -318,9 +318,14 @@ fn (mut s Scanner) new_token(kind token.Kind, lit string, len int) token.Token { // ignore_line forwards the scanner to the end of the current line. [direct_array_access; inline] -fn (mut s Scanner) ignore_line() { +fn (mut s Scanner) ignore_line() ? { util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, ' ignoring until EOL') for c := s.at(); c != -1 && c != `\n`; c = s.at() { + // Check for control characters (allow TAB) + if util.is_illegal_ascii_control_character(c) { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' control character `$c.hex()` is not allowed ($s.line_nr,$s.col) "${byte(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...') + } s.next() util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping "${byte(c).ascii_str()}"') continue @@ -394,6 +399,11 @@ fn (mut s Scanner) extract_string() ?string { continue } } + // Check for control characters (allow TAB) + if util.is_illegal_ascii_control_character(c) { + return error(@MOD + '.' + @STRUCT + '.' 
+ @FN + + ' control character `$c.hex()` is not allowed at $start ($s.line_nr,$s.col) "${byte(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...') + } if c == quote { s.pos++ @@ -457,6 +467,11 @@ fn (mut s Scanner) extract_multiline_string() ?string { continue } } + // Check for control characters (allow TAB) + if util.is_illegal_ascii_control_character(c) { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' control character `$c.hex()` is not allowed at $start ($s.line_nr,$s.col) "${byte(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...') + } if c == quote { if s.peek(1) == quote && s.peek(2) == quote { diff --git a/vlib/toml/tests/burntsushi.toml-test_test.v b/vlib/toml/tests/burntsushi.toml-test_test.v index 9423cbdb6a..ca01cb9361 100644 --- a/vlib/toml/tests/burntsushi.toml-test_test.v +++ b/vlib/toml/tests/burntsushi.toml-test_test.v @@ -42,28 +42,7 @@ const ( 'datetime/impossible-date.toml', 'datetime/no-leads-with-milli.toml', 'datetime/no-leads.toml', - // Control - 'control/string-us.toml', - 'control/comment-lf.toml', - 'control/multi-us.toml', - 'control/rawstring-del.toml', - 'control/rawmulti-del.toml', - 'control/rawstring-us.toml', - 'control/string-bs.toml', - 'control/multi-null.toml', - 'control/rawstring-lf.toml', - 'control/rawmulti-null.toml', - 'control/comment-null.toml', - 'control/multi-lf.toml', - 'control/comment-del.toml', - 'control/rawstring-null.toml', - 'control/rawmulti-lf.toml', - 'control/multi-del.toml', - 'control/string-del.toml', - 'control/rawmulti-us.toml', - 'control/comment-us.toml', - 'control/string-lf.toml', - 'control/string-null.toml', + // Inline table 'inline-table/empty.toml', 'inline-table/double-comma.toml', 'inline-table/trailing-comma.toml', diff --git a/vlib/toml/util/util.v b/vlib/toml/util/util.v index 274a9e4723..60f42c8b05 100644 --- a/vlib/toml/util/util.v +++ b/vlib/toml/util/util.v @@ -8,6 +8,19 @@ pub fn is_key_char(c byte) bool { return (c >= `a` && c <= `z`) || (c >= `A` && c <= `Z`) // 
|| c == `_` || c == `-` <- these are identified when tokenizing } +// is_ascii_control_character returns true if `byte_char` is an ASCII control character. +[inline] +pub fn is_ascii_control_character(byte_char byte) bool { + return (byte_char >= 0 && byte_char <= 0x1f) || byte_char == 0x7f +} + +// is_illegal_ascii_control_character returns true if a `byte_char` ASCII control character +// is considered "illegal" in TOML. +[inline] +pub fn is_illegal_ascii_control_character(byte_char byte) bool { + return byte_char != 0x09 && is_ascii_control_character(byte_char) +} + [if trace_toml ?] pub fn printdbg(id string, message string) { eprintln(id + ' ' + message)