toml: add UTF header support, add BOM tests (#12326)

pull/12298/head
Larpon 2021-10-28 15:38:49 +02:00 committed by GitHub
parent 99e71d0868
commit a987440e2f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 129 additions and 7 deletions

View File

@ -22,6 +22,7 @@ mut:
col int // current column number (x coordinate) col int // current column number (x coordinate)
line_nr int = 1 // current line number (y coordinate) line_nr int = 1 // current line number (y coordinate)
pos int // current flat/index position in the `text` field pos int // current flat/index position in the `text` field
header_len int // Length, how many bytes of header was found
} }
// State is a read-only copy of the scanner's internal state. // State is a read-only copy of the scanner's internal state.
@ -73,6 +74,8 @@ pub fn new_simple(toml_input string) ?Scanner {
// scan returns the next token from the input. // scan returns the next token from the input.
[direct_array_access] [direct_array_access]
pub fn (mut s Scanner) scan() ?token.Token { pub fn (mut s Scanner) scan() ?token.Token {
s.validate_and_skip_headers() ?
for { for {
c := s.next() c := s.next()
byte_c := byte(c) byte_c := byte(c)
@ -290,19 +293,23 @@ pub fn (mut s Scanner) reset() {
s.pos = 0 s.pos = 0
s.col = 0 s.col = 0
s.line_nr = 1 s.line_nr = 1
s.header_len = 0
} }
// new_token builds the `token.Token` of `kind` with literal `lit` that ends at
// the scanner's current position and spans `len` bytes.
// The header length (`s.header_len`) is subtracted from the flat position, and
// from the column while `s.line_nr` is still 1, so the reported coordinates do
// not count the skipped header bytes.
[inline]
fn (mut s Scanner) new_token(kind token.Kind, lit string, len int) token.Token {
	// println('new_token($lit)')
	// The header only shifts columns on the line it was found on.
	header_shift := if s.line_nr == 1 { s.header_len } else { 0 }
	return token.Token{
		kind: kind
		lit: lit
		col: mathutil.max(1, s.col - len + 1 - header_shift)
		line_nr: s.line_nr + 1
		pos: s.pos - s.header_len - len + 1
		len: len
	}
}
@ -605,3 +612,36 @@ pub fn (s Scanner) state() State {
pos: s.pos pos: s.pos
} }
} }
// validate_and_skip_headers rejects UTF-16/UTF-32 byte order marks and skips an
// optional UTF-8 byte order mark (BOM), recording the number of skipped bytes
// in `s.header_len`.
// Returns an error (propagated from `check_utf16_or_32_bom`) when a UTF-16 or
// UTF-32 BOM is found, either at the very start or right after a UTF-8 BOM.
//
// The function only acts while the scanner is at the start of the input
// (`s.pos == 0`). `scan()` invokes it before every token; without this guard a
// BOM-like byte sequence further into the document would be silently skipped
// and would overwrite `header_len`, shifting the position of every token
// created afterwards.
fn (mut s Scanner) validate_and_skip_headers() ? {
	// Headers can only appear at the very beginning of the input.
	if s.pos != 0 {
		return
	}
	// UTF-16 / UTF-32 headers (BE/LE)
	s.check_utf16_or_32_bom() ?
	// NICE-TO-HAVE-TODO Check other types of (UTF-?) headers and yield an error. TOML is UTF-8 only.
	// Skip optional UTF-8 header, if any.
	if s.at() == 0xEF && s.peek(1) == 0xBB && s.peek(2) == 0xBF {
		util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping UTF-8 byte order mark (BOM)')
		s.header_len = 3
		s.skip_n(s.header_len)
	}
	// Check again after the UTF-8 BOM has been skipped.
	s.check_utf16_or_32_bom() ?
}
// check_utf16_or_32_bom returns an error when the bytes at the scanner's
// current position form a UTF-32 or UTF-16 byte order mark (little or big
// endian). Before returning the error it stores the mark's length in
// `s.header_len` and passes that length to `skip_n`.
// UTF-32 is tested first because the UTF-32 LE mark (FF FE 00 00) begins with
// the UTF-16 LE mark (FF FE).
fn (mut s Scanner) check_utf16_or_32_bom() ? {
	utf32_le := s.at() == 0xFF && s.peek(1) == 0xFE && s.peek(2) == 0x00 && s.peek(3) == 0x00
	utf32_be := s.at() == 0x00 && s.peek(1) == 0x00 && s.peek(2) == 0xFE && s.peek(3) == 0xFF
	if utf32_le || utf32_be {
		s.header_len = 4
		s.skip_n(4)
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' UTF-32 is not a valid TOML encoding at $s.pos ($s.line_nr,$s.col) near ...${s.excerpt(s.pos, 5)}...')
	}
	utf16_be := s.at() == 0xFE && s.peek(1) == 0xFF
	utf16_le := s.at() == 0xFF && s.peek(1) == 0xFE
	if utf16_be || utf16_le {
		s.header_len = 2
		s.skip_n(2)
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' UTF-16 is not a valid TOML encoding at $s.pos ($s.line_nr,$s.col) near ...${s.excerpt(s.pos, 5)}...')
	}
}

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,33 @@
# This is a TOML document with an UTF-8 BOM header.
title = "TOML Example"
[owner]
name = "Tom Preston-Werner"
dob = 1979-05-27T07:32:00-08:00 # First class dates
[database]
server = "192.168.1.1"
ports = [ 8000, 8001, 8002 ]
connection_max = 5000
enabled = true
[servers]
# Indentation (tabs and/or spaces) is allowed but not required
[servers.alpha]
ip = "10.0.0.1"
dc = "eqdc10"
[servers.beta]
ip = "10.0.0.2"
dc = "eqdc10"
[clients]
data = [ ["gamma", "delta"], [1, 2] ]
# Line breaks are OK when inside arrays
hosts = [
"alpha",
"omega"
]

View File

@ -0,0 +1,49 @@
import os
import toml
import toml.ast
// read_testdata_file returns the content of `file_name` located in the
// `testdata` folder next to this test file. It panics when the file cannot be
// read.
fn read_testdata_file(file_name string) string {
	full_path := os.real_path(os.join_path(os.dir(@FILE), 'testdata', file_name))
	return os.read_file(full_path) or { panic(err) }
}

const (
	// Placeholder document used as the initial and fallback value when a
	// parse is expected to fail.
	empty_toml_document      = toml.Doc{
		ast: &ast.Root(0)
	}
	// Raw fixture contents, one per byte order mark flavour.
	toml_text_with_utf8_bom  = read_testdata_file('toml_with_utf8_bom' + '.toml')
	toml_text_with_utf16_bom = read_testdata_file('toml_with_utf16_bom' + '.toml')
	toml_text_with_utf32_bom = read_testdata_file('toml_with_utf32_bom' + '.toml')
)
// test_toml_with_bom checks that a document starting with a UTF-8 byte order
// mark parses and yields the expected values, and that documents starting with
// a UTF-16 or UTF-32 byte order mark are rejected by `toml.parse`.
fn test_toml_with_bom() {
	toml_doc := toml.parse(toml_text_with_utf8_bom) or { panic(err) }
	// Exercise JSON conversion as well; the resulting value is not inspected here.
	toml_json := toml_doc.to_json()
	title := toml_doc.value('title')
	assert title == toml.Any('TOML Example')
	assert title as string == 'TOML Example'
	owner := toml_doc.value('owner') as map[string]toml.Any
	any_name := owner.value('name') or { panic(err) }
	assert any_name.string() == 'Tom Preston-Werner'
	database := toml_doc.value('database') as map[string]toml.Any
	db_serv := database['server'] or {
		panic('could not access "server" index in "database" variable')
	}
	assert db_serv as string == '192.168.1.1'
	// Re-cycle bad_toml_doc
	mut bad_toml_doc := empty_toml_document
	// The `or` blocks only run when parsing fails. Record that they ran and
	// assert it afterwards, so the test fails if one of these inputs is
	// unexpectedly accepted.
	mut utf16_rejected := false
	bad_toml_doc = toml.parse(toml_text_with_utf16_bom) or {
		println('     $err.msg')
		utf16_rejected = true
		empty_toml_document
	}
	assert utf16_rejected
	mut utf32_rejected := false
	bad_toml_doc = toml.parse(toml_text_with_utf32_bom) or {
		println('     $err.msg')
		utf32_rejected = true
		empty_toml_document
	}
	assert utf32_rejected
}