toml: add UTF header support, add BOM tests (#12326)
parent
99e71d0868
commit
a987440e2f
|
@ -22,6 +22,7 @@ mut:
|
|||
col int // current column number (x coordinate)
|
||||
line_nr int = 1 // current line number (y coordinate)
|
||||
pos int // current flat/index position in the `text` field
|
||||
header_len int // Length, how many bytes of header was found
|
||||
}
|
||||
|
||||
// State is a read-only copy of the scanner's internal state.
|
||||
|
@ -73,6 +74,8 @@ pub fn new_simple(toml_input string) ?Scanner {
|
|||
// scan returns the next token from the input.
|
||||
[direct_array_access]
|
||||
pub fn (mut s Scanner) scan() ?token.Token {
|
||||
s.validate_and_skip_headers() ?
|
||||
|
||||
for {
|
||||
c := s.next()
|
||||
byte_c := byte(c)
|
||||
|
@ -290,19 +293,23 @@ pub fn (mut s Scanner) reset() {
|
|||
s.pos = 0
|
||||
s.col = 0
|
||||
s.line_nr = 1
|
||||
s.header_len = 0
|
||||
}
|
||||
|
||||
// new_token returns a new `token.Token`.
|
||||
[inline]
|
||||
fn (mut s Scanner) new_token(kind token.Kind, lit string, len int) token.Token {
|
||||
// line_offset := 1
|
||||
// println('new_token($lit)')
|
||||
mut col := s.col - len + 1
|
||||
if s.line_nr == 1 {
|
||||
col -= s.header_len
|
||||
}
|
||||
return token.Token{
|
||||
kind: kind
|
||||
lit: lit
|
||||
col: mathutil.max(1, s.col - len + 1)
|
||||
line_nr: s.line_nr + 1 //+ line_offset
|
||||
pos: s.pos - len + 1
|
||||
col: mathutil.max(1, col)
|
||||
line_nr: s.line_nr + 1
|
||||
pos: s.pos - s.header_len - len + 1
|
||||
len: len
|
||||
}
|
||||
}
|
||||
|
@ -605,3 +612,36 @@ pub fn (s Scanner) state() State {
|
|||
pos: s.pos
|
||||
}
|
||||
}
|
||||
|
||||
fn (mut s Scanner) validate_and_skip_headers() ? {
|
||||
// UTF-16 / UTF-32 headers (BE/LE)
|
||||
s.check_utf16_or_32_bom() ?
|
||||
|
||||
// NICE-TO-HAVE-TODO Check other types of (UTF-?) headers and yield an error. TOML is UTF-8 only.
|
||||
|
||||
// Skip optional UTF-8 heaser, if any.
|
||||
if s.at() == 0xEF && s.peek(1) == 0xBB && s.peek(2) == 0xBF {
|
||||
util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping UTF-8 byte order mark (BOM)')
|
||||
s.header_len = 3
|
||||
s.skip_n(s.header_len)
|
||||
}
|
||||
|
||||
// Check after we've skipped UTF-8 BOM
|
||||
s.check_utf16_or_32_bom() ?
|
||||
}
|
||||
|
||||
fn (mut s Scanner) check_utf16_or_32_bom() ? {
|
||||
if (s.at() == 0xFF && s.peek(1) == 0xFE && s.peek(2) == 0x00 && s.peek(3) == 0x00)
|
||||
|| (s.at() == 0x00 && s.peek(1) == 0x00 && s.peek(2) == 0xFE && s.peek(3) == 0xFF) {
|
||||
s.header_len = 4
|
||||
s.skip_n(s.header_len)
|
||||
return error(@MOD + '.' + @STRUCT + '.' + @FN +
|
||||
' UTF-32 is not a valid TOML encoding at $s.pos ($s.line_nr,$s.col) near ...${s.excerpt(s.pos, 5)}...')
|
||||
}
|
||||
if (s.at() == 0xFE && s.peek(1) == 0xFF) || (s.at() == 0xFF && s.peek(1) == 0xFE) {
|
||||
s.header_len = 2
|
||||
s.skip_n(s.header_len)
|
||||
return error(@MOD + '.' + @STRUCT + '.' + @FN +
|
||||
' UTF-16 is not a valid TOML encoding at $s.pos ($s.line_nr,$s.col) near ...${s.excerpt(s.pos, 5)}...')
|
||||
}
|
||||
}
|
||||
|
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,33 @@
|
|||
# This is a TOML document with an UTF-8 BOM header.
|
||||
|
||||
title = "TOML Example"
|
||||
|
||||
[owner]
|
||||
name = "Tom Preston-Werner"
|
||||
dob = 1979-05-27T07:32:00-08:00 # First class dates
|
||||
|
||||
[database]
|
||||
server = "192.168.1.1"
|
||||
ports = [ 8000, 8001, 8002 ]
|
||||
connection_max = 5000
|
||||
enabled = true
|
||||
|
||||
[servers]
|
||||
|
||||
# Indentation (tabs and/or spaces) is allowed but not required
|
||||
[servers.alpha]
|
||||
ip = "10.0.0.1"
|
||||
dc = "eqdc10"
|
||||
|
||||
[servers.beta]
|
||||
ip = "10.0.0.2"
|
||||
dc = "eqdc10"
|
||||
|
||||
[clients]
|
||||
data = [ ["gamma", "delta"], [1, 2] ]
|
||||
|
||||
# Line breaks are OK when inside arrays
|
||||
hosts = [
|
||||
"alpha",
|
||||
"omega"
|
||||
]
|
|
@ -0,0 +1,49 @@
|
|||
import os
|
||||
import toml
|
||||
import toml.ast
|
||||
|
||||
const empty_toml_document = toml.Doc{
|
||||
ast: &ast.Root(0)
|
||||
}
|
||||
|
||||
const (
|
||||
toml_text_with_utf8_bom = os.read_file(os.real_path(os.join_path(os.dir(@FILE), 'testdata',
|
||||
'toml_with_utf8_bom' + '.toml'))) or { panic(err) }
|
||||
toml_text_with_utf16_bom = os.read_file(os.real_path(os.join_path(os.dir(@FILE), 'testdata',
|
||||
'toml_with_utf16_bom' + '.toml'))) or { panic(err) }
|
||||
toml_text_with_utf32_bom = os.read_file(os.real_path(os.join_path(os.dir(@FILE), 'testdata',
|
||||
'toml_with_utf32_bom' + '.toml'))) or { panic(err) }
|
||||
)
|
||||
|
||||
fn test_toml_with_bom() {
|
||||
toml_doc := toml.parse(toml_text_with_utf8_bom) or { panic(err) }
|
||||
toml_json := toml_doc.to_json()
|
||||
|
||||
title := toml_doc.value('title')
|
||||
assert title == toml.Any('TOML Example')
|
||||
assert title as string == 'TOML Example'
|
||||
|
||||
owner := toml_doc.value('owner') as map[string]toml.Any
|
||||
any_name := owner.value('name') or { panic(err) }
|
||||
assert any_name.string() == 'Tom Preston-Werner'
|
||||
|
||||
database := toml_doc.value('database') as map[string]toml.Any
|
||||
db_serv := database['server'] or {
|
||||
panic('could not access "server" index in "database" variable')
|
||||
}
|
||||
assert db_serv as string == '192.168.1.1'
|
||||
|
||||
// Re-cycle bad_toml_doc
|
||||
mut bad_toml_doc := empty_toml_document
|
||||
bad_toml_doc = toml.parse(toml_text_with_utf16_bom) or {
|
||||
println(' $err.msg')
|
||||
assert true
|
||||
empty_toml_document
|
||||
}
|
||||
|
||||
bad_toml_doc = toml.parse(toml_text_with_utf32_bom) or {
|
||||
println(' $err.msg')
|
||||
assert true
|
||||
empty_toml_document
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue