toml: add UTF header support, add BOM tests (#12326)
parent
99e71d0868
commit
a987440e2f
|
@ -22,6 +22,7 @@ mut:
|
||||||
col int // current column number (x coordinate)
|
col int // current column number (x coordinate)
|
||||||
line_nr int = 1 // current line number (y coordinate)
|
line_nr int = 1 // current line number (y coordinate)
|
||||||
pos int // current flat/index position in the `text` field
|
pos int // current flat/index position in the `text` field
|
||||||
|
header_len int // length in bytes of the header (BOM) that was found
|
||||||
}
|
}
|
||||||
|
|
||||||
// State is a read-only copy of the scanner's internal state.
|
// State is a read-only copy of the scanner's internal state.
|
||||||
|
@ -73,6 +74,8 @@ pub fn new_simple(toml_input string) ?Scanner {
|
||||||
// scan returns the next token from the input.
|
// scan returns the next token from the input.
|
||||||
[direct_array_access]
|
[direct_array_access]
|
||||||
pub fn (mut s Scanner) scan() ?token.Token {
|
pub fn (mut s Scanner) scan() ?token.Token {
|
||||||
|
s.validate_and_skip_headers() ?
|
||||||
|
|
||||||
for {
|
for {
|
||||||
c := s.next()
|
c := s.next()
|
||||||
byte_c := byte(c)
|
byte_c := byte(c)
|
||||||
|
@ -290,19 +293,23 @@ pub fn (mut s Scanner) reset() {
|
||||||
s.pos = 0
|
s.pos = 0
|
||||||
s.col = 0
|
s.col = 0
|
||||||
s.line_nr = 1
|
s.line_nr = 1
|
||||||
|
s.header_len = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
// new_token returns a new `token.Token`.
|
// new_token returns a new `token.Token`.
|
||||||
[inline]
|
[inline]
|
||||||
fn (mut s Scanner) new_token(kind token.Kind, lit string, len int) token.Token {
|
fn (mut s Scanner) new_token(kind token.Kind, lit string, len int) token.Token {
|
||||||
// line_offset := 1
|
|
||||||
// println('new_token($lit)')
|
// println('new_token($lit)')
|
||||||
|
mut col := s.col - len + 1
|
||||||
|
if s.line_nr == 1 {
|
||||||
|
col -= s.header_len
|
||||||
|
}
|
||||||
return token.Token{
|
return token.Token{
|
||||||
kind: kind
|
kind: kind
|
||||||
lit: lit
|
lit: lit
|
||||||
col: mathutil.max(1, s.col - len + 1)
|
col: mathutil.max(1, col)
|
||||||
line_nr: s.line_nr + 1 //+ line_offset
|
line_nr: s.line_nr + 1
|
||||||
pos: s.pos - len + 1
|
pos: s.pos - s.header_len - len + 1
|
||||||
len: len
|
len: len
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -605,3 +612,36 @@ pub fn (s Scanner) state() State {
|
||||||
pos: s.pos
|
pos: s.pos
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn (mut s Scanner) validate_and_skip_headers() ? {
|
||||||
|
// UTF-16 / UTF-32 headers (BE/LE)
|
||||||
|
s.check_utf16_or_32_bom() ?
|
||||||
|
|
||||||
|
// NICE-TO-HAVE-TODO Check other types of (UTF-?) headers and yield an error. TOML is UTF-8 only.
|
||||||
|
|
||||||
|
// Skip optional UTF-8 header, if any.
|
||||||
|
if s.at() == 0xEF && s.peek(1) == 0xBB && s.peek(2) == 0xBF {
|
||||||
|
util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping UTF-8 byte order mark (BOM)')
|
||||||
|
s.header_len = 3
|
||||||
|
s.skip_n(s.header_len)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check after we've skipped UTF-8 BOM
|
||||||
|
s.check_utf16_or_32_bom() ?
|
||||||
|
}
|
||||||
|
|
||||||
|
fn (mut s Scanner) check_utf16_or_32_bom() ? {
|
||||||
|
if (s.at() == 0xFF && s.peek(1) == 0xFE && s.peek(2) == 0x00 && s.peek(3) == 0x00)
|
||||||
|
|| (s.at() == 0x00 && s.peek(1) == 0x00 && s.peek(2) == 0xFE && s.peek(3) == 0xFF) {
|
||||||
|
s.header_len = 4
|
||||||
|
s.skip_n(s.header_len)
|
||||||
|
return error(@MOD + '.' + @STRUCT + '.' + @FN +
|
||||||
|
' UTF-32 is not a valid TOML encoding at $s.pos ($s.line_nr,$s.col) near ...${s.excerpt(s.pos, 5)}...')
|
||||||
|
}
|
||||||
|
if (s.at() == 0xFE && s.peek(1) == 0xFF) || (s.at() == 0xFF && s.peek(1) == 0xFE) {
|
||||||
|
s.header_len = 2
|
||||||
|
s.skip_n(s.header_len)
|
||||||
|
return error(@MOD + '.' + @STRUCT + '.' + @FN +
|
||||||
|
' UTF-16 is not a valid TOML encoding at $s.pos ($s.line_nr,$s.col) near ...${s.excerpt(s.pos, 5)}...')
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,33 @@
|
||||||
|
# This is a TOML document with a UTF-8 BOM header.
|
||||||
|
|
||||||
|
title = "TOML Example"
|
||||||
|
|
||||||
|
[owner]
|
||||||
|
name = "Tom Preston-Werner"
|
||||||
|
dob = 1979-05-27T07:32:00-08:00 # First class dates
|
||||||
|
|
||||||
|
[database]
|
||||||
|
server = "192.168.1.1"
|
||||||
|
ports = [ 8000, 8001, 8002 ]
|
||||||
|
connection_max = 5000
|
||||||
|
enabled = true
|
||||||
|
|
||||||
|
[servers]
|
||||||
|
|
||||||
|
# Indentation (tabs and/or spaces) is allowed but not required
|
||||||
|
[servers.alpha]
|
||||||
|
ip = "10.0.0.1"
|
||||||
|
dc = "eqdc10"
|
||||||
|
|
||||||
|
[servers.beta]
|
||||||
|
ip = "10.0.0.2"
|
||||||
|
dc = "eqdc10"
|
||||||
|
|
||||||
|
[clients]
|
||||||
|
data = [ ["gamma", "delta"], [1, 2] ]
|
||||||
|
|
||||||
|
# Line breaks are OK when inside arrays
|
||||||
|
hosts = [
|
||||||
|
"alpha",
|
||||||
|
"omega"
|
||||||
|
]
|
|
@ -0,0 +1,49 @@
|
||||||
|
import os
|
||||||
|
import toml
|
||||||
|
import toml.ast
|
||||||
|
|
||||||
|
const empty_toml_document = toml.Doc{
|
||||||
|
ast: &ast.Root(0)
|
||||||
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
toml_text_with_utf8_bom = os.read_file(os.real_path(os.join_path(os.dir(@FILE), 'testdata',
|
||||||
|
'toml_with_utf8_bom' + '.toml'))) or { panic(err) }
|
||||||
|
toml_text_with_utf16_bom = os.read_file(os.real_path(os.join_path(os.dir(@FILE), 'testdata',
|
||||||
|
'toml_with_utf16_bom' + '.toml'))) or { panic(err) }
|
||||||
|
toml_text_with_utf32_bom = os.read_file(os.real_path(os.join_path(os.dir(@FILE), 'testdata',
|
||||||
|
'toml_with_utf32_bom' + '.toml'))) or { panic(err) }
|
||||||
|
)
|
||||||
|
|
||||||
|
fn test_toml_with_bom() {
|
||||||
|
toml_doc := toml.parse(toml_text_with_utf8_bom) or { panic(err) }
|
||||||
|
toml_json := toml_doc.to_json()
|
||||||
|
|
||||||
|
title := toml_doc.value('title')
|
||||||
|
assert title == toml.Any('TOML Example')
|
||||||
|
assert title as string == 'TOML Example'
|
||||||
|
|
||||||
|
owner := toml_doc.value('owner') as map[string]toml.Any
|
||||||
|
any_name := owner.value('name') or { panic(err) }
|
||||||
|
assert any_name.string() == 'Tom Preston-Werner'
|
||||||
|
|
||||||
|
database := toml_doc.value('database') as map[string]toml.Any
|
||||||
|
db_serv := database['server'] or {
|
||||||
|
panic('could not access "server" index in "database" variable')
|
||||||
|
}
|
||||||
|
assert db_serv as string == '192.168.1.1'
|
||||||
|
|
||||||
|
// Re-cycle bad_toml_doc
|
||||||
|
mut bad_toml_doc := empty_toml_document
|
||||||
|
bad_toml_doc = toml.parse(toml_text_with_utf16_bom) or {
|
||||||
|
println(' $err.msg')
|
||||||
|
assert true
|
||||||
|
empty_toml_document
|
||||||
|
}
|
||||||
|
|
||||||
|
bad_toml_doc = toml.parse(toml_text_with_utf32_bom) or {
|
||||||
|
println(' $err.msg')
|
||||||
|
assert true
|
||||||
|
empty_toml_document
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue