toml: check for illegal characters, fix all related skipped tests (#12270)

pull/12274/head
Larpon 2021-10-22 18:57:32 +02:00 committed by GitHub
parent eb364f0301
commit 8273c0582b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 31 additions and 24 deletions

View File

@ -175,7 +175,7 @@ pub fn (mut s Scanner) scan() ?token.Token {
}
`#` {
start := s.pos //+ 1
s.ignore_line()
s.ignore_line() ?
hash := s.text[start..s.pos]
util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified comment hash "$hash" ($hash.len)')
return s.new_token(.hash, hash, hash.len + 1)
@ -318,9 +318,14 @@ fn (mut s Scanner) new_token(kind token.Kind, lit string, len int) token.Token {
// ignore_line forwards the scanner to the end of the current line.
[direct_array_access; inline]
fn (mut s Scanner) ignore_line() {
fn (mut s Scanner) ignore_line() ? {
util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, ' ignoring until EOL')
for c := s.at(); c != -1 && c != `\n`; c = s.at() {
// Check for control characters (allow TAB)
if util.is_illegal_ascii_control_character(c) {
return error(@MOD + '.' + @STRUCT + '.' + @FN +
' control character `$c.hex()` is not allowed ($s.line_nr,$s.col) "${byte(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
}
s.next()
util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping "${byte(c).ascii_str()}"')
continue
@ -394,6 +399,11 @@ fn (mut s Scanner) extract_string() ?string {
continue
}
}
// Check for control characters (allow TAB)
if util.is_illegal_ascii_control_character(c) {
return error(@MOD + '.' + @STRUCT + '.' + @FN +
' control character `$c.hex()` is not allowed at $start ($s.line_nr,$s.col) "${byte(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
}
if c == quote {
s.pos++
@ -457,6 +467,11 @@ fn (mut s Scanner) extract_multiline_string() ?string {
continue
}
}
// Check for control characters (allow TAB)
if util.is_illegal_ascii_control_character(c) {
return error(@MOD + '.' + @STRUCT + '.' + @FN +
' control character `$c.hex()` is not allowed at $start ($s.line_nr,$s.col) "${byte(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
}
if c == quote {
if s.peek(1) == quote && s.peek(2) == quote {

View File

@ -42,28 +42,7 @@ const (
'datetime/impossible-date.toml',
'datetime/no-leads-with-milli.toml',
'datetime/no-leads.toml',
// Control
'control/string-us.toml',
'control/comment-lf.toml',
'control/multi-us.toml',
'control/rawstring-del.toml',
'control/rawmulti-del.toml',
'control/rawstring-us.toml',
'control/string-bs.toml',
'control/multi-null.toml',
'control/rawstring-lf.toml',
'control/rawmulti-null.toml',
'control/comment-null.toml',
'control/multi-lf.toml',
'control/comment-del.toml',
'control/rawstring-null.toml',
'control/rawmulti-lf.toml',
'control/multi-del.toml',
'control/string-del.toml',
'control/rawmulti-us.toml',
'control/comment-us.toml',
'control/string-lf.toml',
'control/string-null.toml',
// Inline table
'inline-table/empty.toml',
'inline-table/double-comma.toml',
'inline-table/trailing-comma.toml',

View File

@ -8,6 +8,19 @@ pub fn is_key_char(c byte) bool {
return (c >= `a` && c <= `z`) || (c >= `A` && c <= `Z`) // || c == `_` || c == `-` <- these are identified when tokenizing
}
// is_ascii_control_character returns true if `byte_char` is an ASCII control character,
// i.e. it lies in the range 0x00-0x1f (inclusive) or is DEL (0x7f).
[inline]
pub fn is_ascii_control_character(byte_char byte) bool {
	// `byte` is unsigned, so a lower-bound check against 0 would always be true
	// and is intentionally omitted.
	return byte_char <= 0x1f || byte_char == 0x7f
}
// is_illegal_ascii_control_character reports whether `byte_char` is an ASCII control
// character that is treated as "illegal" in TOML. Horizontal TAB (0x09) is the single
// control character that is exempted.
[inline]
pub fn is_illegal_ascii_control_character(byte_char byte) bool {
	// TAB is explicitly allowed, so it is never reported as illegal.
	if byte_char == 0x09 {
		return false
	}
	// Every other control character is rejected.
	return is_ascii_control_character(byte_char)
}
[if trace_toml ?]
pub fn printdbg(id string, message string) {
eprintln(id + ' ' + message)