From 5541ec8670fb2c95a70f14d54a918651df9d550d Mon Sep 17 00:00:00 2001 From: Larpon Date: Fri, 24 Sep 2021 20:13:52 +0200 Subject: [PATCH] vlib: add toml module + tests (#11964) --- .github/workflows/toml_ci.yml | 27 + examples/toml.v | 48 ++ vlib/time/parse.c.v | 55 ++ vlib/toml/README.md | 55 ++ vlib/toml/any.v | 213 +++++ vlib/toml/ast/ast.v | 28 + vlib/toml/ast/types.v | 241 ++++++ vlib/toml/ast/walker/walker.v | 37 + vlib/toml/checker/checker.v | 168 ++++ vlib/toml/input/input.v | 22 + vlib/toml/parser/parser.v | 849 ++++++++++++++++++++ vlib/toml/scanner/scanner.v | 526 ++++++++++++ vlib/toml/scanner/scanner_test.v | 82 ++ vlib/toml/tests/burntsushi.toml-test_test.v | 185 +++++ vlib/toml/tests/compact_test.v | 83 ++ vlib/toml/tests/datetime_test.v | 73 ++ vlib/toml/tests/json_test.v | 19 + vlib/toml/tests/nested_test.v | 43 + vlib/toml/tests/strings_test.v | 67 ++ vlib/toml/tests/table_test.v | 87 ++ vlib/toml/tests/testdata/json_test.out | 1 + vlib/toml/tests/testdata/json_test.toml | 25 + vlib/toml/tests/testdata/strings_test.toml | 15 + vlib/toml/tests/testdata/toml_test.out | 1 + vlib/toml/tests/testdata/toml_test.toml | 33 + vlib/toml/tests/toml_test.v | 110 +++ vlib/toml/tests/types_test.v | 70 ++ vlib/toml/token/position.v | 13 + vlib/toml/token/token.v | 52 ++ vlib/toml/toml.v | 217 +++++ vlib/toml/util/util.v | 14 + 31 files changed, 3459 insertions(+) create mode 100644 .github/workflows/toml_ci.yml create mode 100644 examples/toml.v create mode 100644 vlib/toml/README.md create mode 100644 vlib/toml/any.v create mode 100644 vlib/toml/ast/ast.v create mode 100644 vlib/toml/ast/types.v create mode 100644 vlib/toml/ast/walker/walker.v create mode 100644 vlib/toml/checker/checker.v create mode 100644 vlib/toml/input/input.v create mode 100644 vlib/toml/parser/parser.v create mode 100644 vlib/toml/scanner/scanner.v create mode 100644 vlib/toml/scanner/scanner_test.v create mode 100644 vlib/toml/tests/burntsushi.toml-test_test.v create mode 100644 
vlib/toml/tests/compact_test.v create mode 100644 vlib/toml/tests/datetime_test.v create mode 100644 vlib/toml/tests/json_test.v create mode 100644 vlib/toml/tests/nested_test.v create mode 100644 vlib/toml/tests/strings_test.v create mode 100644 vlib/toml/tests/table_test.v create mode 100644 vlib/toml/tests/testdata/json_test.out create mode 100644 vlib/toml/tests/testdata/json_test.toml create mode 100644 vlib/toml/tests/testdata/strings_test.toml create mode 100644 vlib/toml/tests/testdata/toml_test.out create mode 100644 vlib/toml/tests/testdata/toml_test.toml create mode 100644 vlib/toml/tests/toml_test.v create mode 100644 vlib/toml/tests/types_test.v create mode 100644 vlib/toml/token/position.v create mode 100644 vlib/toml/token/token.v create mode 100644 vlib/toml/toml.v create mode 100644 vlib/toml/util/util.v diff --git a/.github/workflows/toml_ci.yml b/.github/workflows/toml_ci.yml new file mode 100644 index 0000000000..2a0f0fe653 --- /dev/null +++ b/.github/workflows/toml_ci.yml @@ -0,0 +1,27 @@ +name: toml CI + +on: + push: + paths-ignore: + - "**.md" + pull_request: + paths-ignore: + - "**.md" + +jobs: + toml-module-pass-external-test-suites: + runs-on: ubuntu-20.04 + timeout-minutes: 121 + steps: + + - uses: actions/checkout@v2 + - name: Build V + run: make -j2 && ./v -cc gcc -o v cmd/v + + - name: Clone BurntSushi/toml-test + run: | + cd vlib/toml/tests/testdata + git clone --depth 1 https://github.com/BurntSushi/toml-test.git burntsushi/toml-test + + - name: Run tests + run: ./v -stats test vlib/toml diff --git a/examples/toml.v b/examples/toml.v new file mode 100644 index 0000000000..1d90d8d08c --- /dev/null +++ b/examples/toml.v @@ -0,0 +1,48 @@ +import toml + +// Complete text from the example in the README.md: +// https://github.com/toml-lang/toml/blob/3b11f6921da7b6f5db37af039aa021fee450c091/README.md#Example +const toml_text = '# This is a TOML document. 
+ +title = "TOML Example" + +[owner] +name = "Tom Preston-Werner" +dob = 1979-05-27T07:32:00-08:00 # First class dates + +[database] +server = "192.168.1.1" +ports = [ 8000, 8001, 8002 ] +connection_max = 5000 +enabled = true + +[servers] + + # Indentation (tabs and/or spaces) is allowed but not required + [servers.alpha] + ip = "10.0.0.1" + dc = "eqdc10" + + [servers.beta] + ip = "10.0.0.2" + dc = "eqdc10" + +[clients] +data = [ ["gamma", "delta"], [1, 2] ] + +# Line breaks are OK when inside arrays +hosts = [ + "alpha", + "omega" +]' + +fn main() { + doc := toml.parse(toml_text) or { panic(err) } + title := doc.value('title').string() + println('title: "$title"') + ip := doc.value('servers.alpha.ip').string() + println('Server IP: "$ip"') + + toml_json := doc.to_json() + println(toml_json) +} diff --git a/vlib/time/parse.c.v b/vlib/time/parse.c.v index b74cd412c7..b6fcb88eb8 100644 --- a/vlib/time/parse.c.v +++ b/vlib/time/parse.c.v @@ -43,6 +43,61 @@ pub fn parse_rfc2822(s string) ?Time { } } +// ----- rfc3339 ----- +const ( + err_invalid_3339 = 'Invalid 3339 format' +) + +// parse_rfc3339 returns time from a date string in RFC 3339 datetime format. +pub fn parse_rfc3339(s string) ?Time { + if s == '' { + return error(time.err_invalid_3339 + ' cannot parse empty string') + } + mut t := parse_iso8601(s) or { Time{} } + // If parse_iso8601 DID NOT result in default values (i.e. date was parsed correctly) + if t != Time{} { + return t + } + + t_i := s.index('T') or { -1 } + parts := if t_i != -1 { [s[..t_i], s[t_i + 1..]] } else { s.split(' ') } + + // Check if s is date only + if !parts[0].contains_any(' Z') && parts[0].contains('-') { + year, month, day := parse_iso8601_date(s) ? 
+ t = new_time(Time{ + year: year + month: month + day: day + }) + return t + } + // Check if s is time only + if !parts[0].contains('-') && parts[0].contains(':') { + mut hour_, mut minute_, mut second_, mut microsecond_, mut unix_offset, mut is_local_time := 0, 0, 0, 0, i64(0), true + hour_, minute_, second_, microsecond_, unix_offset, is_local_time = parse_iso8601_time(parts[0]) ? + t = new_time(Time{ + hour: hour_ + minute: minute_ + second: second_ + microsecond: microsecond_ + }) + if is_local_time { + return t // Time is already local time + } + mut unix_time := t.unix + if unix_offset < 0 { + unix_time -= (-unix_offset) + } else if unix_offset > 0 { + unix_time += unix_offset + } + t = unix2(i64(unix_time), t.microsecond) + return t + } + + return error(time.err_invalid_3339 + '. Could not parse "$s"') +} + // ----- iso8601 ----- const ( err_invalid_8601 = 'Invalid 8601 Format' diff --git a/vlib/toml/README.md b/vlib/toml/README.md new file mode 100644 index 0000000000..5540e4cf4a --- /dev/null +++ b/vlib/toml/README.md @@ -0,0 +1,55 @@ +# TOML module +`toml` is a fully fledged TOML v1.0.0 compatible parser written in pure V. + +## Usage + +```v +import toml + +// Complete text from the example in the README.md: +// https://github.com/toml-lang/toml/blob/3b11f6921da7b6f5db37af039aa021fee450c091/README.md#Example +const toml_text = '# This is a TOML document. 
+ +title = "TOML Example" + +[owner] +name = "Tom Preston-Werner" +dob = 1979-05-27T07:32:00-08:00 # First class dates + +[database] +server = "192.168.1.1" +ports = [ 8000, 8001, 8002 ] +connection_max = 5000 +enabled = true + +[servers] + + # Indentation (tabs and/or spaces) is allowed but not required + [servers.alpha] + ip = "10.0.0.1" + dc = "eqdc10" + + [servers.beta] + ip = "10.0.0.2" + dc = "eqdc10" + +[clients] +data = [ ["gamma", "delta"], [1, 2] ] + +# Line breaks are OK when inside arrays +hosts = [ + "alpha", + "omega" +]' + +fn main() { + doc := toml.parse(toml_text) or { panic(err) } + title := doc.value('title').string() + println('title: "$title"') + ip := doc.value('servers.alpha.ip').string() + println('Server IP: "$ip"') + + toml_json := doc.to_json() + println(toml_json) +} +``` diff --git a/vlib/toml/any.v b/vlib/toml/any.v new file mode 100644 index 0000000000..5782243540 --- /dev/null +++ b/vlib/toml/any.v @@ -0,0 +1,213 @@ +// Copyright (c) 2021 Lars Pontoppidan. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module toml + +import time + +// Pretty much all json2 types plus time.Time +pub type Any = Null + | []Any + | bool + | f32 + | f64 + | i64 + | int + | map[string]Any + | string + | time.Time + | u64 + +// string returns `Any` as a string. +pub fn (a Any) string() string { + match a { + string { return a as string } + time.Time { return a.format_ss_micro() } + else { return a.str() } + } +} + +// int returns `Any` as an 32-bit integer. +pub fn (a Any) int() int { + match a { + int { return a } + i64, f32, f64, bool { return int(a) } + // time.Time { return int(0) } // TODO + else { return 0 } + } +} + +// i64 returns `Any` as a 64-bit integer. +pub fn (a Any) i64() i64 { + match a { + i64 { return a } + int, f32, f64, bool { return i64(a) } + // time.Time { return i64(0) } // TODO + else { return 0 } + } +} + +// u64 returns `Any` as a 64-bit unsigned integer. 
+pub fn (a Any) u64() u64 { + match a { + u64 { return a } + int, i64, f32, f64, bool { return u64(a) } + // time.Time { return u64(0) } // TODO + else { return 0 } + } +} + +// f32 returns `Any` as a 32-bit float. +pub fn (a Any) f32() f32 { + match a { + f32 { return a } + int, i64, f64 { return f32(a) } + // time.Time { return f32(0) } // TODO + else { return 0.0 } + } +} + +// f64 returns `Any` as a 64-bit float. +pub fn (a Any) f64() f64 { + match a { + f64 { return a } + int, i64, f32 { return f64(a) } + // time.Time { return f64(0) } // TODO + else { return 0.0 } + } +} + +// array returns `Any` as an array. +pub fn (a Any) array() []Any { + if a is []Any { + return a + } else if a is map[string]Any { + mut arr := []Any{} + for _, v in a { + arr << v + } + return arr + } + return [a] +} + +// as_map returns `Any` as a map (TOML table). +pub fn (a Any) as_map() map[string]Any { + if a is map[string]Any { + return a + } else if a is []Any { + mut mp := map[string]Any{} + for i, fi in a { + mp['$i'] = fi + } + return mp + } + return { + '0': a + } +} + +// bool returns `Any` as a boolean. +pub fn (a Any) bool() bool { + match a { + bool { return a } + string { return a.bool() } + else { return false } + } +} + +// date returns `Any` as a date encoded in a `time.Time` struct. +pub fn (a Any) date() time.Time { + mut time := time.Time{} + match a { + // string { } // TODO + time.Time { return a } + else { return time } + } +} + +// date returns `Any` as a time encoded in a `time.Time` struct. +pub fn (a Any) time() time.Time { + mut time := time.Time{} + match a { + // string { } // TODO + time.Time { return a } + else { return time } + } +} + +// date returns `Any` as a date+time encoded in a `time.Time` struct. +pub fn (a Any) datetime() time.Time { + mut time := time.Time{} + match a { + // string { } // TODO + time.Time { return a } + else { return time } + } +} + +pub fn (m map[string]Any) value(key string) ?Any { + // return m[key] ? 
+ key_split := key.split('.') + // util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, ' getting "${key_split[0]}"') + if key_split[0] in m.keys() { + value := m[key_split[0]] or { + return error(@MOD + '.' + @STRUCT + '.' + @FN + ' key "$key" does not exist') + } + // `match` isn't currently very suitable for these types of sum type constructs... + if value is map[string]Any { + nm := (value as map[string]Any) + next_key := key_split[1..].join('.') + if next_key == '' { + return value + } + return nm.value(next_key) + } + return value + } + return error(@MOD + '.' + @STRUCT + '.' + @FN + ' key "$key" does not exist') +} + +pub fn (a []Any) as_strings() []string { + mut sa := []string{} + for any in a { + sa << any.string() + } + return sa +} + +// to_json returns `Any` as a JSON encoded string. +pub fn (a Any) to_json() string { + match a { + Null { + return 'null' + } + string { + return '"$a.str()"' + } + bool, f32, f64, i64, int, u64 { + return a.str() + } + map[string]Any { + mut str := '{' + for key, val in a { + str += ' "$key": $val.to_json(),' + } + str = str.trim_right(',') + str += ' }' + return str + } + []Any { + mut str := '[' + for val in a { + str += ' $val.to_json(),' + } + str = str.trim_right(',') + str += ' ]' + return str + } + time.Time { + return '"$a.format_ss_micro()"' + } + } +} diff --git a/vlib/toml/ast/ast.v b/vlib/toml/ast/ast.v new file mode 100644 index 0000000000..a20bab9b3f --- /dev/null +++ b/vlib/toml/ast/ast.v @@ -0,0 +1,28 @@ +// Copyright (c) 2021 Lars Pontoppidan. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module ast + +import toml.input + +// Root represents the root structure of any parsed TOML text snippet or file. 
+[heap] +pub struct Root { +pub: + input input.Config // User input configuration +pub mut: + table Node + // errors []errors.Error // all the checker errors in the file +} + +pub fn (r Root) str() string { + mut s := typeof(r).name + '{\n' + s += ' input: $r.input\n' + s += ' table: $r.table\n' + s += '}' + return s +} + +pub fn (r Root) to_json() string { + return r.table.to_json() +} diff --git a/vlib/toml/ast/types.v b/vlib/toml/ast/types.v new file mode 100644 index 0000000000..4237950535 --- /dev/null +++ b/vlib/toml/ast/types.v @@ -0,0 +1,241 @@ +// Copyright (c) 2021 Lars Pontoppidan. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module ast + +import toml.token + +// Key is a sumtype representing all types of keys that +// can be found in a TOML document. +pub type Key = Bare | Bool | Null | Number | Quoted + +pub fn (k Key) str() string { + return k.text +} + +// Node is a sumtype representing all possible value types +// found in a TOML document. +pub type Node = Bool | Date | DateTime | Null | Number | Quoted | Time | []Node | map[string]Node + +pub fn (v Node) to_json() string { + match v { + Quoted, Date, DateTime, Time { + return '"$v.text"' + } + Bool, Null, Number { + return v.text + } + map[string]Node { + mut str := '{' + for key, val in v { + str += ' "$key": $val.to_json(),' + } + str = str.trim_right(',') + str += ' }' + return str + } + []Node { + mut str := '[' + for val in v { + str += ' $val.to_json(),' + } + str = str.trim_right(',') + str += ' ]' + return str + } + } +} + +// DateTimeType is a sumtype representing all possible date types +// found in a TOML document. +pub type DateTimeType = Date | DateTime | Time + +pub fn (dtt DateTimeType) str() string { + return dtt.text +} + +// value queries a value from the map. 
+// `key` should be in "dotted" form e.g.: `"a.b.c.d"` +pub fn (v map[string]Node) value(key string) &Node { + null := &Node(Null{}) + key_split := key.split('.') + // util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, ' retreiving value at "$key"') + if key_split[0] in v.keys() { + value := v[key_split[0]] or { + return null + // TODO return error(@MOD + '.' + @STRUCT + '.' + @FN + ' key "$key" does not exist') + } + // `match` isn't currently very suitable for these types of sum type constructs... + if value is map[string]Node { + m := (value as map[string]Node) + next_key := key_split[1..].join('.') + if next_key == '' { + return &value + } + return m.value(next_key) + } + return &value + } + return null + // TODO return error(@MOD + '.' + @STRUCT + '.' + @FN + ' key "$key" does not exist') +} + +// value queries a value from the map. +pub fn (v map[string]Node) exists(key string) bool { + key_split := key.split('.') + if key_split[0] in v.keys() { + value := v[key_split[0]] or { return false } + // `match` isn't currently very suitable for these types of sum type constructs... + if value is map[string]Node { + m := (value as map[string]Node) + next_key := key_split[1..].join('.') + if next_key == '' { + return true + } + return m.exists(next_key) + } + return true + } + return false +} + +pub struct Comment { +pub: + text string + pos token.Position +} + +pub fn (c Comment) str() string { + mut s := typeof(c).name + '{\n' + s += ' text: \'$c.text\'\n' + s += ' pos: $c.pos\n' + s += '}' + return s +} + +// Null is used in sumtype checks as a "default" value when nothing else is possible. 
+pub struct Null { +pub: + text string + pos token.Position +} + +pub fn (n Null) str() string { + return n.text +} + +pub struct Quoted { +pub: + text string + pos token.Position +} + +pub fn (q Quoted) str() string { + mut str := typeof(q).name + '{\n' + str += ' text: \'$q.text\'\n' + str += ' pos: $q.pos\n' + str += '}' + return str +} + +pub struct Bare { +pub: + text string + pos token.Position +} + +pub fn (b Bare) str() string { + mut str := typeof(b).name + '{\n' + str += ' text: \'$b.text\'\n' + str += ' pos: $b.pos\n' + str += '}' + return str +} + +pub struct Bool { +pub: + text string + pos token.Position +} + +pub fn (b Bool) str() string { + mut str := typeof(b).name + '{\n' + str += ' text: \'$b.text\'\n' + str += ' pos: $b.pos\n' + str += '}' + return str +} + +pub struct Number { +pub: + text string + pos token.Position +} + +pub fn (n Number) str() string { + mut str := typeof(n).name + '{\n' + str += ' text: \'$n.text\'\n' + str += ' pos: $n.pos\n' + str += '}' + return str +} + +pub struct Date { +pub: + text string + pos token.Position +} + +pub fn (d Date) str() string { + mut str := typeof(d).name + '{\n' + str += ' text: \'$d.text\'\n' + str += ' pos: $d.pos\n' + str += '}' + return str +} + +pub struct Time { +pub: + text string + offset int + pos token.Position +} + +pub fn (t Time) str() string { + mut str := typeof(t).name + '{\n' + str += ' text: \'$t.text\'\n' + str += ' offset: \'$t.offset\'\n' + str += ' pos: $t.pos\n' + str += '}' + return str +} + +pub struct DateTime { +pub: + text string + pos token.Position + date Date + time Time +} + +pub fn (dt DateTime) str() string { + mut str := typeof(dt).name + '{\n' + str += ' text: \'$dt.text\'\n' + str += ' date: \'$dt.date\'\n' + str += ' time: \'$dt.time\'\n' + str += ' pos: $dt.pos\n' + str += '}' + return str +} + +pub struct EOF { +pub: + pos token.Position +} + +pub fn (e EOF) str() string { + mut str := typeof(e).name + '{\n' + str += ' pos: $e.pos\n' + str += '}' + return str 
+} diff --git a/vlib/toml/ast/walker/walker.v b/vlib/toml/ast/walker/walker.v new file mode 100644 index 0000000000..5f41fedacf --- /dev/null +++ b/vlib/toml/ast/walker/walker.v @@ -0,0 +1,37 @@ +module walker + +import toml.ast + +// Visitor defines a visit method which is invoked by the walker in each node it encounters. +pub interface Visitor { + visit(node &ast.Node) ? +} + +pub type InspectorFn = fn (node &ast.Node, data voidptr) ? + +struct Inspector { + inspector_callback InspectorFn +mut: + data voidptr +} + +pub fn (i &Inspector) visit(node &ast.Node) ? { + i.inspector_callback(node, i.data) or { return err } +} + +// inspect traverses and checks the AST node on a depth-first order and based on the data given +pub fn inspect(node &ast.Node, data voidptr, inspector_callback InspectorFn) ? { + walk(Inspector{inspector_callback, data}, node) ? +} + +// walk traverses the AST using the given visitor +pub fn walk(visitor Visitor, node &ast.Node) ? { + if node is map[string]ast.Node { + n := node as map[string]ast.Node + for _, nn in n { + walk(visitor, &nn) ? + } + } else { + visitor.visit(node) ? + } +} diff --git a/vlib/toml/checker/checker.v b/vlib/toml/checker/checker.v new file mode 100644 index 0000000000..dc91bfb8f7 --- /dev/null +++ b/vlib/toml/checker/checker.v @@ -0,0 +1,168 @@ +// Copyright (c) 2021 Lars Pontoppidan. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module checker + +import toml.ast +import toml.ast.walker +// import toml.util +import toml.token +import toml.scanner + +// Checker checks a tree of TOML `ast.Node`'s for common errors. +pub struct Checker { + scanner &scanner.Scanner +} + +pub fn (c Checker) check(n &ast.Node) ? { + walker.walk(c, n) ? +} + +fn (c Checker) visit(node &ast.Node) ? { + match node { + ast.Number { + c.check_number(node) ? + } + ast.Bool { + c.check_boolean(node) ? 
+ } + else { + // TODO add more checks to make BurntSushi/toml-test invalid TOML pass + } + } +} + +// excerpt returns a string of the characters surrounding` +fn (c Checker) excerpt(tp token.Position) string { + return c.scanner.excerpt(tp.pos, 10) +} + +fn is_hex_bin_oct(hbo string) bool { + return hbo.len > 2 && (hbo.starts_with('0x') || hbo.starts_with('0o') || hbo.starts_with('0b')) +} + +fn has_repeating(str string, repeats []rune) bool { + for i, r in str { + if r in repeats && i + 1 < str.len { + if r == str[i + 1] { + return true + } + } + } + return false +} + +fn (c Checker) check_number(num ast.Number) ? { + lit := num.text + if lit in ['0', '0.0', '+0', '-0', '+0.0', '-0.0', '0e0', '+0e0', '-0e0', '0e00'] { + return + } + + if lit.contains('_') { + if lit.starts_with('_') || lit.ends_with('_') { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' numbers like "$lit" can not start or end with `_` in ...${c.excerpt(num.pos)}...') + } + if lit.contains('__') { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' numbers like "$lit" can not have more than one underscore (`_`) in ...${c.excerpt(num.pos)}...') + } + } + + mut hex_bin_oct := is_hex_bin_oct(lit) + is_hex := lit.contains('0x') + is_float := lit.to_lower().all_before('e').contains('.') + has_exponent_notation := lit.to_lower().contains('e') + float_decimal_index := lit.index('.') or { -1 } + // mut is_first_digit := byte(lit[0]).is_digit() + mut ascii := byte(lit[0]).ascii_str() + is_sign_prefixed := lit[0] in [`+`, `-`] + if is_sign_prefixed { // +/- ... + n := lit[1..] + hex_bin_oct = is_hex_bin_oct(n) + if hex_bin_oct { + ascii = byte(lit[0]).ascii_str() + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' numbers like "$lit" (hex, octal and binary) can not start with `$ascii` in ...${c.excerpt(num.pos)}...') + } + // is_first_digit = byte(n[0]).is_digit() + if lit.len > 1 && n.starts_with('0') { + ascii = byte(n[0]).ascii_str() + return error(@MOD + '.' + @STRUCT + '.' 
+ @FN + + ' numbers like "$lit" can not start with `$ascii` in ...${c.excerpt(num.pos)}...') + } + } else { + if !hex_bin_oct { + if !is_float && lit[0] == `0` { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' numbers like "$lit" can not start with a zero in ...${c.excerpt(num.pos)}...') + } + + if is_float && lit[0] == `0` && float_decimal_index > 1 { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' numbers like "$lit" can not start with a zero in ...${c.excerpt(num.pos)}...') + } + } + } + + if has_repeating(lit, [`_`, `.`, `x`, `o`, `b`]) { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' numbers like "$lit" can not have $scanner.digit_extras as repeating characters in ...${c.excerpt(num.pos)}...') + } + + if hex_bin_oct { + third := lit[2] + if third in scanner.digit_extras { + ascii = byte(third).ascii_str() + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' numbers like "$lit" (hex, octal and binary) can not have `$ascii` in ...${c.excerpt(num.pos)}...') + } + } + + if has_exponent_notation { + if lit.to_lower().all_after('e').contains('.') { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' numbers like "$lit" (with exponent) can not have a decimal point in ...${c.excerpt(num.pos)}...') + } + if !is_hex && lit.to_lower().count('e') > 1 { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' numbers like "$lit" (with exponent) can only have one exponent in ...${c.excerpt(num.pos)}...') + } + } + + if is_float { + if lit.count('.') > 1 { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' numbers like "$lit" (float) can only have one decimal point in ...${c.excerpt(num.pos)}...') + } + last := lit[lit.len - 1] + if last in scanner.digit_extras { + ascii = byte(last).ascii_str() + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' numbers like "$lit" (float) can not start with `$ascii` in ...${c.excerpt(num.pos)}...') + } + if lit.contains('_.') || lit.contains('._') { + return error(@MOD + '.' + @STRUCT + '.' 
+ @FN + + ' numbers like "$lit" (float) can not have underscores before or after the decimal point in ...${c.excerpt(num.pos)}...') + } + if lit.contains('e.') || lit.contains('.e') || lit.contains('E.') || lit.contains('.E') { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' numbers like "$lit" (float) can not have underscores before or after the decimal point in ...${c.excerpt(num.pos)}...') + } + } else { + if lit.len > 1 && lit.starts_with('0') && lit[1] !in [`x`, `o`, `b`] { + ascii = byte(lit[0]).ascii_str() + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' numbers like "$lit" can not start with `$ascii` in ...${c.excerpt(num.pos)}...') + } + } +} + +fn (c Checker) check_boolean(b ast.Bool) ? { + lit := b.text + if lit in ['true', 'false'] { + return + } + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' boolean values like "$lit" can only be `true` or `false` literals, not `$lit` in ...${c.excerpt(b.pos)}...') +} diff --git a/vlib/toml/input/input.v b/vlib/toml/input/input.v new file mode 100644 index 0000000000..2b7f199a53 --- /dev/null +++ b/vlib/toml/input/input.v @@ -0,0 +1,22 @@ +// Copyright (c) 2021 Lars Pontoppidan. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module input + +// Config is used to configure input to the toml module. +// Only one of the fields `text` and `file_path` is allowed to be set at time of configuration. +pub struct Config { +pub: + text string // TOML text + file_path string // '/path/to/file.toml' +} + +pub fn (c Config) validate() ? { + if c.file_path != '' && c.text != '' { + error(@MOD + '.' + @FN + + ' ${typeof(c).name} should contain only one of the fields `file_path` OR `text` filled out') + } else if c.file_path == '' && c.text == '' { + error(@MOD + '.' 
+ @FN + + ' ${typeof(c).name} must either contain a valid `file_path` OR a non-empty `text` field') + } +} diff --git a/vlib/toml/parser/parser.v b/vlib/toml/parser/parser.v new file mode 100644 index 0000000000..7d9aead1c5 --- /dev/null +++ b/vlib/toml/parser/parser.v @@ -0,0 +1,849 @@ +// Copyright (c) 2021 Lars Pontoppidan. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module parser + +import toml.ast +import toml.checker +import toml.util +import toml.token +import toml.scanner + +// Scanner contains the necessary fields for the state of the scan process. +// the task the scanner does is also refered to as "lexing" or "tokenizing". +// The Scanner methods are based on much of the work in `vlib/strings/textscanner`. +pub struct Parser { +pub: + config Config +mut: + scanner &scanner.Scanner + prev_tok token.Token + tok token.Token + peek_tok token.Token + skip_next bool + // The root map (map is called table in TOML world) + root_map map[string]ast.Node + root_map_key string + // Array of Tables state + last_aot string + last_aot_index int + // Root of the tree + ast_root &ast.Root = &ast.Root{} +} + +// Config is used to configure a Scanner instance. +// Only one of the fields `text` and `file_path` is allowed to be set at time of configuration. +pub struct Config { +pub: + scanner &scanner.Scanner + run_checks bool = true +} + +// new_parser returns a new, stack allocated, `Parser`. +pub fn new_parser(config Config) Parser { + return Parser{ + config: config + scanner: config.scanner + } +} + +// init initializes the parser. +pub fn (mut p Parser) init() ? { + p.root_map = map[string]ast.Node{} + p.next() ? +} + +// run_checker validates the parsed `ast.Node` nodes in the +// the generated AST. +fn (mut p Parser) run_checker() ? { + if p.config.run_checks { + chckr := checker.Checker{ + scanner: p.scanner + } + chckr.check(p.root_map) ? 
+ } +} + +// parse starts parsing the input and returns the root +// of the generated AST. +pub fn (mut p Parser) parse() ?&ast.Root { + p.init() ? + p.root_table() ? + p.run_checker() ? + p.ast_root.table = p.root_map + return p.ast_root +} + +// next forwards the parser to the next token. +fn (mut p Parser) next() ? { + p.prev_tok = p.tok + p.tok = p.peek_tok + p.peek_tok = p.scanner.scan() ? +} + +// check returns true if the current token's `Kind` is equal that of `expected_token`. +fn (mut p Parser) check(check_token token.Kind) ? { + if p.tok.kind == check_token { + p.next() ? + } else { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' expected token "$check_token" but found "$p.tok.kind" in this (excerpt): "...${p.excerpt()}..."') + } +} + +// check_one_of returns true if the current token's `Kind` is equal that of `expected_token`. +fn (mut p Parser) check_one_of(tokens []token.Kind) ? { + if p.tok.kind in tokens { + p.next() ? + } else { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' expected one of $tokens but found "$p.tok.kind" in this (excerpt): "...${p.excerpt()}..."') + } +} + +// is_at returns true if the token kind is equal to `expected_token`. +fn (mut p Parser) is_at(expected_token token.Kind) bool { + return p.tok.kind == expected_token +} + +// expect will error if the token kind is not equal to `expected_token`. +fn (mut p Parser) expect(expected_token token.Kind) ? { + if p.tok.kind == expected_token { + return + } else { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' expected token "$expected_token" but found "$p.tok.kind" in this text "...${p.excerpt()}..."') + } +} + +// find_table returns a reference to a map if found in the root table given a "dotted" key ('a.b.c'). +// If some segments of the key does not exist in the root table find_table will +// allocate a new map for each segment. This behavior is needed because you can +// reference maps by multiple keys "dotted" (separated by "." periods) in TOML documents. 
+pub fn (mut p Parser) find_table() ?&map[string]ast.Node { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'locating "$p.root_map_key" in map ${ptr_str(p.root_map)}') + mut t := &map[string]ast.Node{} + unsafe { + t = &p.root_map + } + if p.root_map_key == '' { + return t + } + + return p.find_in_table(mut t, p.root_map_key) +} + +pub fn (mut p Parser) sub_table_key(key string) (string, string) { + mut ks := key.split('.') + last := ks.last() + ks.delete_last() + return ks.join('.'), last +} + +// find_sub_table returns a reference to a map if found in `table` given a "dotted" key ('aa.bb.cc'). +// If some segments of the key does not exist in the input map find_in_table will +// allocate a new map for the segment. This behavior is needed because you can +// reference maps by multiple keys "dotted" (separated by "." periods) in TOML documents. +pub fn (mut p Parser) find_sub_table(key string) ?&map[string]ast.Node { + mut ky := p.root_map_key + '.' + key + if p.root_map_key == '' { + ky = key + } + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'locating "$ky" in map ${ptr_str(p.root_map)}') + mut t := &map[string]ast.Node{} + unsafe { + t = &p.root_map + } + if ky == '' { + return t + } + + return p.find_in_table(mut t, ky) +} + +// find_in_table returns a reference to a map if found in `table` given a "dotted" key ('aa.bb.cc'). +// If some segments of the key does not exist in the input map find_in_table will +// allocate a new map for the segment. This behavior is needed because you can +// reference maps by multiple keys "dotted" (separated by "." periods) in TOML documents. +pub fn (mut p Parser) find_in_table(mut table map[string]ast.Node, key string) ?&map[string]ast.Node { + // NOTE This code is the result of much trial and error. + // I'm still not quite sure *exactly* why it works. All I can leave here is a hope + // that this kind of minefield someday will be easier in V :) + util.printdbg(@MOD + '.' + @STRUCT + '.' 
+ @FN, 'locating "$key" in map ${ptr_str(table)}') + mut t := &map[string]ast.Node{} + unsafe { + t = &table + } + ks := key.split('.') + unsafe { + for k in ks { + if k in t.keys() { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'found key "$k" in $t.keys()') + if val := t[k] or { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' this should never happen. Key "$k" was checked before access') + } + { + if val is map[string]ast.Node { + // unsafe { + t = &(t[k] as map[string]ast.Node) + //} + } else { + return error(@MOD + '.' + @STRUCT + '.' + @FN + ' "$k" is not a map') + } + } + } else { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'no key "$k" found, allocating new map "$k" in map ${ptr_str(t)}"') + // unsafe { + t[k] = map[string]ast.Node{} + t = &(t[k] as map[string]ast.Node) + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'allocated new map ${ptr_str(t)}"') + //} + } + } + } + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'returning map ${ptr_str(t)}"') + return t +} + +pub fn (mut p Parser) sub_key() ?string { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsing nested key...') + key := p.key() ? + mut text := key.str() + for p.peek_tok.kind == .period { + p.next() ? // . + p.check(.period) ? + next_key := p.key() ? + text += '.' + next_key.text + } + p.next() ? + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsed nested key `$text` now at "$p.tok.kind" "$p.tok.lit"') + return text +} + +// root_table parses next tokens into the root map of `ast.Node`s. +// The V `map` type is corresponding to a "table" in TOML. +pub fn (mut p Parser) root_table() ? { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsing root table...') + + for p.tok.kind != .eof { + if !p.skip_next { + p.next() ? + } else { + p.skip_next = false + } + + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsing token "$p.tok.kind" "$p.tok.lit"') + match p.tok.kind { + .hash { + // TODO table.comments << p.comment() + c := p.comment() + util.printdbg(@MOD + '.' 
+ @STRUCT + '.' + @FN, 'skipping comment "$c.text"') + } + //.whitespace, .tab, .nl { + // util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping "$p.tok.kind "$p.tok.lit"') + //} + .bare, .quoted, .boolean, .number, .underscore { // NOTE .boolean allows for use of "true" and "false" as table keys + if p.peek_tok.kind == .assign + || (p.tok.kind == .number && p.peek_tok.kind == .minus) { + key, val := p.key_value() ? + + t := p.find_table() ? + unsafe { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'setting "$key.str()" = $val.to_json() in table ${ptr_str(t)}') + t[key.str()] = val + } + } else if p.peek_tok.kind == .period { + subkey := p.sub_key() ? + + p.check(.assign) ? + val := p.value() ? + + sub_table, key := p.sub_table_key(subkey) + + t := p.find_sub_table(sub_table) ? + unsafe { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'setting "$key" = $val.to_json() in table ${ptr_str(t)}') + t[key] = val + } + } else { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' dead end at "$p.tok.kind" "$p.tok.lit"') + } + } + .lsbr { + p.check(.lsbr) ? // '[' bracket + + if p.tok.kind == .lsbr { + p.array_of_tables(mut &p.root_map) ? + p.skip_next = true // skip calling p.next() in coming iteration + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'leaving double bracket at "$p.tok.kind "$p.tok.lit". NEXT is "$p.peek_tok.kind "$p.peek_tok.lit"') + } else if p.peek_tok.kind == .period { + p.root_map_key = p.sub_key() ? + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'setting root map key to `$p.root_map_key` at "$p.tok.kind" "$p.tok.lit"') + p.expect(.rsbr) ? + } else { + key := p.key() ? + p.root_map_key = key.str() + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'setting root map key to `$p.root_map_key` at "$p.tok.kind" "$p.tok.lit"') + p.next() ? + p.expect(.rsbr) ? + } + } + .eof { + return + } + else { + return error(@MOD + '.' + @STRUCT + '.' 
+ @FN + + ' could not parse "$p.tok.kind" "$p.tok.lit" in this (excerpt): "...${p.excerpt()}..."') + } + } + } +} + +// excerpt returns a string of the characters surrounding `Parser.tok.pos` +fn (p Parser) excerpt() string { + return p.scanner.excerpt(p.tok.pos, 10) +} + +// inline_table parses next tokens into a map of `ast.Node`s. +// The V map type is corresponding to a "table" in TOML. +pub fn (mut p Parser) inline_table(mut tbl map[string]ast.Node) ? { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsing inline table into ${ptr_str(tbl)}...') + + for p.tok.kind != .eof { + p.next() ? + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsing token "$p.tok.kind"') + match p.tok.kind { + .hash { + // TODO table.comments << p.comment() + c := p.comment() + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping comment "$c.text"') + } + //.whitespace, .tab, .nl { + // util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping "$p.tok.kind "$p.tok.lit"') + //} + .comma { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping comma table value seperator "$p.tok.lit"') + continue + } + .rcbr { + // ']' bracket + return + } + .bare, .quoted, .boolean, .number, .underscore { + if p.peek_tok.kind == .assign { + key, val := p.key_value() ? + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'inserting @5 "$key.str()" = $val.to_json() into ${ptr_str(tbl)}') + tbl[key.str()] = val + } else if p.peek_tok.kind == .period { + subkey := p.sub_key() ? + p.check(.assign) ? + val := p.value() ? + + sub_table, key := p.sub_table_key(subkey) + + mut t := p.find_in_table(mut tbl, sub_table) ? + unsafe { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'inserting @6 "$key" = $val.to_json() into ${ptr_str(t)}') + t[key] = val + } + } else { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' dead end at "$p.tok.kind" "$p.tok.lit"') + } + } + .lsbr { + return error(@MOD + '.' + @STRUCT + '.' 
+ @FN + + ' unexpected "$p.tok.kind" "$p.tok.lit" at this (excerpt): "...${p.excerpt()}..."') + } + .eof { + return + } + else { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' could not parse $p.tok.kind ("$p.tok.lit") in this (excerpt): "...${p.excerpt()}..." token \n$p.tok') + } + } + if p.peek_tok.kind == .lsbr { + return + } + } +} + +// array_of_tables parses next tokens into an array of `ast.Node`s. +pub fn (mut p Parser) array_of_tables(mut table map[string]ast.Node) ? { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsing array of tables "$p.tok.kind" "$p.tok.lit"') + // NOTE this is starting to get ugly. TOML isn't simple at this point + p.check(.lsbr) ? // '[' bracket + + // [[key.key]] horror + if p.peek_tok.kind == .period { + p.double_array_of_tables(mut table) ? + return + } + + key := p.key() ? + p.next() ? + p.check(.rsbr) ? + p.check(.rsbr) ? + + key_str := key.str() + unsafe { + if key_str in table.keys() { + if val := table[key_str] or { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' this should never happen. Key "$key_str" was checked before access') + } + { + if val is []ast.Node { + arr := &(table[key_str] as []ast.Node) + arr << p.double_bracket_array() ? + table[key_str] = arr + } else { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' table[$key_str] is not an array. (excerpt): "...${p.excerpt()}..."') + } + } + } else { + table[key_str] = p.double_bracket_array() ? + } + } + p.last_aot = key_str + p.last_aot_index = 0 +} + +// double_array_of_tables parses next tokens into an array of tables of arrays of `ast.Node`s... +pub fn (mut p Parser) double_array_of_tables(mut table map[string]ast.Node) ? { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsing array of tables of arrays "$p.tok.kind" "$p.tok.lit"') + + key := p.key() ? + mut key_str := key.str() + for p.peek_tok.kind == .period { + p.next() ? // . + p.check(.period) ? + next_key := p.key() ? + key_str += '.' + next_key.text + } + + p.next() ? 
+ p.check(.rsbr) ? + p.check(.rsbr) ? + + ks := key_str.split('.') + + if ks.len != 2 { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' nested array of tables does not support more than 2 levels. (excerpt): "...${p.excerpt()}..."') + } + + first := ks[0] + last := ks[1] + + unsafe { + // NOTE this is starting to get EVEN uglier. TOML is not at all simple at this point... + if p.last_aot != first { + table[first] = []ast.Node{} + p.last_aot = first + mut t_arr := &(table[p.last_aot] as []ast.Node) + t_arr << map[string]ast.Node{} + p.last_aot_index = 0 + } + + mut t_arr := &(table[p.last_aot] as []ast.Node) + mut t_map := t_arr[p.last_aot_index] + mut t := &(t_map as map[string]ast.Node) + + if last in t.keys() { + if val := t[last] or { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' this should never happen. Key "$last" was checked before access') + } + { + if val is []ast.Node { + arr := &(val as []ast.Node) + arr << p.double_bracket_array() ? + t[last] = arr + } else { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' t[$last] is not an array. (excerpt): "...${p.excerpt()}..."') + } + } + } else { + t[last] = p.double_bracket_array() ? + } + } +} + +// array parses next tokens into an array of `ast.Node`s. +pub fn (mut p Parser) double_bracket_array() ?[]ast.Node { + mut arr := []ast.Node{} + for p.tok.kind in [.bare, .quoted, .boolean, .number] && p.peek_tok.kind == .assign { + mut tbl := map[string]ast.Node{} + key, val := p.key_value() ? + tbl[key.str()] = val + arr << tbl + p.next() ? + } + return arr +} + +// array parses next tokens into an array of `ast.Node`s. +pub fn (mut p Parser) array() ?[]ast.Node { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsing array...') + mut arr := []ast.Node{} + p.expect(.lsbr) ? // '[' bracket + for p.tok.kind != .eof { + p.next() ? + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsing token "$p.tok.kind" "$p.tok.lit"') + match p.tok.kind { + .boolean { + arr << ast.Node(p.boolean() ?) 
+ } + .comma { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping comma array value seperator "$p.tok.lit"') + continue + } + .eof { + // End Of File + return arr + } + .hash { + // TODO array.comments << p.comment() + c := p.comment() + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping comment "$c.text"') + } + .lcbr { + mut t := map[string]ast.Node{} + p.inline_table(mut t) ? + ast.Node(t) + } + .number { + val := p.number_or_date() ? + arr << val + } + .quoted { + arr << ast.Node(p.quoted()) + } + .lsbr { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsing array in array "$p.tok.kind" "$p.tok.lit"') + arr << ast.Node(p.array() ?) + } + .rsbr { + break + } + else { + error(@MOD + '.' + @STRUCT + '.' + @FN + + ' could not parse "$p.tok.kind" "$p.tok.lit" ("$p.tok.lit") in this (excerpt): "...${p.excerpt()}..."') + } + } + } + p.expect(.rsbr) ? // ']' bracket + $if debug { + flat := arr.str().replace('\n', r'\n') + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsed array: $flat . Currently @ token "$p.tok.kind"') + } + return arr +} + +// comment returns an `ast.Comment` type. +pub fn (mut p Parser) comment() ast.Comment { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsed hash comment "#$p.tok.lit"') + return ast.Comment{ + text: p.tok.lit + pos: p.tok.position() + } +} + +// key parse and returns an `ast.Key` type. +// Keys are the token(s) appearing before an assignment operator (=). +pub fn (mut p Parser) key() ?ast.Key { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsing key from "$p.tok.lit" ...') + + mut key := ast.Key(ast.Null{}) + if p.tok.kind == .number { + if p.peek_tok.kind == .minus { + mut lits := p.tok.lit + pos := p.tok.position() + for p.peek_tok.kind != .assign { + p.next() ? 
+ lits += p.tok.lit + } + return ast.Key(ast.Bare{ + text: lits + pos: pos + }) + } + // number := p.number() as ast.Number + key = ast.Key(p.number()) + } else { + key = match p.tok.kind { + .bare, .underscore { + ast.Key(p.bare()) + } + .boolean { + ast.Key(p.boolean() ?) + } + .quoted { + ast.Key(p.quoted()) + } + else { + error(@MOD + '.' + @STRUCT + '.' + @FN + + ' key expected .bare, .number, .quoted or .boolean but got "$p.tok.kind"') + ast.Key(ast.Bare{}) // TODO workaround bug + } + } + } + + // NOTE kept for eased debugging + // util.printdbg(@MOD +'.' + @STRUCT + '.' + @FN, 'parsed key "$p.tok.lit"') + // panic(@MOD + '.' + @STRUCT + '.' + @FN + ' could not parse ${p.tok.kind} ("${p.tok.lit}") token \n$p.tok') + // return ast.Key(ast.Bare{}) + + return key +} + +// key_value parse and returns a pair `ast.Key` and `ast.Node` type. +// see also `key()` and `value()` +pub fn (mut p Parser) key_value() ?(ast.Key, ast.Node) { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsing key value pair...') + key := p.key() ? + p.next() ? + p.check(.assign) ? // Assignment operator + value := p.value() ? + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsed key value pair. "$key" = $value.to_json()') + return key, value +} + +// value parse and returns an `ast.Node` type. +// values are the token(s) appearing after an assignment operator (=). +pub fn (mut p Parser) value() ?ast.Node { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsing value...') + // println('parsed comment "${p.tok.lit}"') + + mut value := ast.Node(ast.Null{}) + + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsing token "$p.tok.kind" "$p.tok.lit"') + // mut value := ast.Node{} + if p.tok.kind == .number { + number_or_date := p.number_or_date() ? + value = number_or_date + } else { + value = match p.tok.kind { + .quoted { + ast.Node(p.quoted()) + } + .boolean { + ast.Node(p.boolean() ?) + } + .lsbr { + ast.Node(p.array() ?) 
+ } + .lcbr { + mut t := map[string]ast.Node{} + p.inline_table(mut t) ? + // table[key_str] = ast.Node(t) + ast.Node(t) + } + else { + error(@MOD + '.' + @STRUCT + '.' + @FN + + ' value expected .boolean, .quoted, .lsbr, .lcbr or .number got "$p.tok.kind" "$p.tok.lit"') + ast.Node(ast.Null{}) // TODO workaround bug + } + } + } + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsed value $value.to_json()') + return value +} + +// number_or_date parse and returns an `ast.Node` type as +// one of [`ast.Date`, `ast.Time`, `ast.DateTime`, `ast.Number`] +pub fn (mut p Parser) number_or_date() ?ast.Node { + // Handle Date/Time + if p.peek_tok.kind == .minus || p.peek_tok.kind == .colon { + date_time_type := p.date_time() ? + match date_time_type { + ast.Date { + return ast.Node(date_time_type as ast.Date) + } + ast.Time { + return ast.Node(date_time_type as ast.Time) + } + ast.DateTime { + return ast.Node(date_time_type as ast.DateTime) + } + } + } + return ast.Node(p.number()) +} + +// bare parse and returns an `ast.Bare` type. +pub fn (mut p Parser) bare() ast.Bare { + return ast.Bare{ + text: p.tok.lit + pos: p.tok.position() + } +} + +// quoted parse and returns an `ast.Quoted` type. +pub fn (mut p Parser) quoted() ast.Quoted { + return ast.Quoted{ + text: p.tok.lit + pos: p.tok.position() + } +} + +// boolean parse and returns an `ast.Bool` type. +pub fn (mut p Parser) boolean() ?ast.Bool { + if p.tok.lit !in ['true', 'false'] { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' expected literal to be either `true` or `false` got "$p.tok.kind"') + } + return ast.Bool{ + text: p.tok.lit + pos: p.tok.position() + } +} + +// number parse and returns an `ast.Number` type. +pub fn (mut p Parser) number() ast.Number { + return ast.Number{ + text: p.tok.lit + pos: p.tok.position() + } +} + +// date_time parses dates and time in RFC 3339 format. 
+// https://datatracker.ietf.org/doc/html/rfc3339 +pub fn (mut p Parser) date_time() ?ast.DateTimeType { + // Date and/or Time + mut lit := '' + pos := p.tok.position() + mut date := ast.Date{} + mut time := ast.Time{} + + if p.peek_tok.kind == .minus { + date = p.date() ? + lit += date.text + // Look for any THH:MM:SS or HH:MM:SS + if (p.peek_tok.kind == .bare && (p.peek_tok.lit.starts_with('T') + || p.peek_tok.lit.starts_with('t'))) || p.peek_tok.kind == .whitespace { + p.next() ? // Advance to token with Txx or whitespace special case + if p.tok.lit.starts_with('T') || p.tok.lit.starts_with('t') { + lit += p.tok.lit[0].ascii_str() //'T' or 't' + } else { + lit += p.tok.lit + p.next() ? + } + time = p.time() ? + lit += time.text + + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsed date-time: "$lit"') + return ast.DateTime{ + text: lit + pos: pos + date: date + time: time + } + } + } else if p.peek_tok.kind == .colon { + time = p.time() ? + return time + } + + return ast.Date{ + text: lit + pos: pos + } +} + +// date parse and returns an `ast.Date` type. +pub fn (mut p Parser) date() ?ast.Date { + // Date + mut lit := p.tok.lit + pos := p.tok.position() + + p.check(.number) ? + lit += p.tok.lit + p.check(.minus) ? + lit += p.tok.lit + p.check(.number) ? + lit += p.tok.lit + p.check(.minus) ? + lit += p.tok.lit + p.expect(.number) ? + + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsed date: "$lit"') + return ast.Date{ + text: lit + pos: pos + } +} + +// time parse and returns an `ast.Time` type. +pub fn (mut p Parser) time() ?ast.Time { + // Time + mut lit := p.tok.lit + pos := p.tok.position() + + if p.is_at(.bare) && (lit.starts_with('T') || lit.starts_with('t')) { + if p.tok.lit.starts_with('T') { + lit = lit.all_after('T') + } else if p.tok.lit.starts_with('t') { + lit = lit.all_after('t') + } + p.next() ? + } else { + p.check(.number) ? + } + lit += p.tok.lit + p.check(.colon) ? + lit += p.tok.lit + p.check(.number) ? 
+ lit += p.tok.lit + // TODO does TOML even have optional seconds? + // if p.peek_tok.kind == .colon { + p.check(.colon) ? + lit += p.tok.lit + p.expect(.number) ? + //} + + // Optional milliseconds + if p.peek_tok.kind == .period { + p.next() ? + lit += p.tok.lit // lit += '.' + p.check(.period) ? + lit += p.tok.lit + p.expect(.number) ? + } + + // Parse offset + if p.peek_tok.kind == .minus || p.peek_tok.kind == .plus { + p.next() ? + lit += p.tok.lit // lit += '-' + p.check_one_of([.minus, .plus]) ? + lit += p.tok.lit + p.check(.number) ? + lit += p.tok.lit + p.check(.colon) ? + lit += p.tok.lit + p.expect(.number) ? + } else if p.peek_tok.kind == .bare && (p.peek_tok.lit == 'Z' || p.peek_tok.lit == 'z') { + p.next() ? + lit += p.tok.lit + p.expect(.bare) ? + } + + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsed time: "$lit"') + return ast.Time{ + text: lit + pos: pos + } +} + +// eof returns an `ast.EOF` type. +pub fn (mut p Parser) eof() ast.EOF { + return ast.EOF{ + pos: p.tok.position() + } +} diff --git a/vlib/toml/scanner/scanner.v b/vlib/toml/scanner/scanner.v new file mode 100644 index 0000000000..ed309358ca --- /dev/null +++ b/vlib/toml/scanner/scanner.v @@ -0,0 +1,526 @@ +// Copyright (c) 2021 Lars Pontoppidan. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module scanner + +import os +import math.mathutil +import toml.input +import toml.token +import toml.util + +pub const digit_extras = [`_`, `.`, `x`, `o`, `b`, `e`, `E`] + +// Scanner contains the necessary fields for the state of the scan process. +// the task the scanner does is also refered to as "lexing" or "tokenizing". +// The Scanner methods are based on much of the work in `vlib/strings/textscanner`. 
+pub struct Scanner { +pub: + config Config + text string // the input TOML text +mut: + col int // current column number (x coordinate) + line_nr int = 1 // current line number (y coordinate) + pos int // current flat/index position in the `text` field + mode Mode // sub-mode of the scanner +} + +enum Mode { + normal + inside_string +} + +// Config is used to configure a Scanner instance. +// Only one of the fields `text` and `file_path` is allowed to be set at time of configuration. +pub struct Config { +pub: + input input.Config + tokenize_formating bool // if true, generate tokens for `\n`, ` `, `\t`, `\r` etc. +} + +// new_scanner returns a new heap allocated `Scanner` instance. +pub fn new_scanner(config Config) ?&Scanner { + config.input.validate() ? + mut text := config.input.text + file_path := config.input.file_path + if os.is_file(file_path) { + text = os.read_file(file_path) or { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' Could not read "$file_path": "$err.msg"') + } + } + mut s := &Scanner{ + config: config + text: text + } + return s +} + +// scan returns the next token from the input. +[direct_array_access] +pub fn (mut s Scanner) scan() ?token.Token { + for { + c := s.next() + byte_c := byte(c) + if c == -1 { + s.inc_line_number() + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'reached EOF') + return s.new_token(.eof, '', 1) + } + + ascii := byte_c.ascii_str() + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'current char "$ascii"') + + is_sign := byte_c in [`+`, `-`] + is_signed_number := is_sign && byte(s.at()).is_digit() && !byte(s.peek(-1)).is_digit() + + // TODO (+/-)nan & (+/-)inf + /* + mut is_nan := s.peek(1) == `n` && s.peek(2) == `a` && s.peek(3) == `n` + mut is_inf := s.peek(1) == `i` && s.peek(2) == `n` && s.peek(3) == `f` + if is_nan || is_inf { + util.printdbg(@MOD + '.' + @STRUCT + '.' 
+ @FN, 'identified a special number "$key" ($key.len)') + return s.new_token(.number, key, key.len) + } + */ + + is_digit := byte_c.is_digit() + if is_digit || is_signed_number { + num := s.extract_number() ? + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified a number "$num" ($num.len)') + return s.new_token(.number, num, num.len) + } + + if util.is_key_char(byte_c) { + key := s.extract_key() + if key.to_lower() in ['true', 'false'] { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified a boolean "$key" ($key.len)') + return s.new_token(.boolean, key, key.len) + } + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified a bare key "$key" ($key.len)') + return s.new_token(.bare, key, key.len) + } + + match rune(c) { + ` `, `\t`, `\n`, `\r` { + if c == `\n` { + s.inc_line_number() + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'incremented line nr to $s.line_nr') + } + // Date-Time in RFC 3339 is allowed to have a space between the date and time in supplement to the 'T' + // so we allow space characters to slip through to the parser if the space is between two digits... + // util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, '"'+byte(s.peek(-1)).ascii_str()+'" < "$ascii" > "'+byte(s.at()).ascii_str()+'"') + if c == ` ` && byte(s.peek(-1)).is_digit() && byte(s.at()).is_digit() { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified, what could be, a space between a RFC 3339 date and time ("$ascii") ($ascii.len)') + return s.new_token(token.Kind.whitespace, ascii, ascii.len) + } + if s.config.tokenize_formating { + mut kind := token.Kind.whitespace + if c == `\t` { + kind = token.Kind.tab + } else if c == `\n` { + kind = token.Kind.nl + } + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified one of " ", "\\t" or "\\n" ("$ascii") ($ascii.len)') + return s.new_token(kind, ascii, ascii.len) + } else { + util.printdbg(@MOD + '.' + @STRUCT + '.' 
+ @FN, 'skipping " ", "\\t" or "\\n" ("$ascii") ($ascii.len)') + } + continue + } + `-` { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified minus "$ascii" ($ascii.len)') + return s.new_token(.minus, ascii, ascii.len) + } + `_` { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified underscore "$ascii" ($ascii.len)') + return s.new_token(.underscore, ascii, ascii.len) + } + `+` { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified plus "$ascii" ($ascii.len)') + return s.new_token(.plus, ascii, ascii.len) + } + `=` { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified assignment "$ascii" ($ascii.len)') + return s.new_token(.assign, ascii, ascii.len) + } + `"`, `'` { // ... some string "/' + ident_string, is_multiline := s.extract_string() ? + token_length := if is_multiline { 2 * 3 } else { 2 } + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified quoted string (multiline: $is_multiline) `$ident_string`') + return s.new_token(.quoted, ident_string, ident_string.len + token_length) // + quote length + } + `#` { + start := s.pos //+ 1 + s.ignore_line() + hash := s.text[start..s.pos] + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified comment hash "$hash" ($hash.len)') + return s.new_token(.hash, hash, hash.len + 1) + } + `{` { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified left curly bracket "$ascii" ($ascii.len)') + return s.new_token(.lcbr, ascii, ascii.len) + } + `}` { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified right curly bracket "$ascii" ($ascii.len)') + return s.new_token(.rcbr, ascii, ascii.len) + } + `[` { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified left square bracket "$ascii" ($ascii.len)') + return s.new_token(.lsbr, ascii, ascii.len) + } + `]` { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified right square bracket "$ascii" ($ascii.len)') + return s.new_token(.rsbr, ascii, ascii.len) + } + `:` { + util.printdbg(@MOD + '.' + @STRUCT + '.' 
+ @FN, 'identified colon "$ascii" ($ascii.len)') + return s.new_token(.colon, ascii, ascii.len) + } + `,` { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified comma "$ascii" ($ascii.len)') + return s.new_token(.comma, ascii, ascii.len) + } + `.` { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified period "$ascii" ($ascii.len)') + return s.new_token(.period, ascii, ascii.len) + } + else { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' could not scan character `$ascii` / $c at $s.pos ($s.line_nr,$s.col) near ...${s.excerpt(s.pos, 5)}...') + } + } + } + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'unknown character code at $s.pos ($s.line_nr,$s.col) near ...${s.excerpt(s.pos, + 5)}...') + return s.new_token(.unknown, '', 0) +} + +// free frees all allocated resources. +[unsafe] +pub fn (mut s Scanner) free() { + unsafe { + s.text.free() + } +} + +// remaining returns how many characters remain in the text input. +[inline] +pub fn (s &Scanner) remaining() int { + return s.text.len - s.pos +} + +// next returns the next character code from the input text. +// next returns `-1` if it can't reach the next character. +[direct_array_access; inline] +pub fn (mut s Scanner) next() int { + if s.pos < s.text.len { + opos := s.pos + s.pos++ + s.col++ + c := s.text[opos] + return c + } + return -1 +} + +// skip skips one character ahead. +[inline] +pub fn (mut s Scanner) skip() { + if s.pos + 1 < s.text.len { + s.pos++ + s.col++ + } +} + +// skip_n skips ahead `n` characters. +// If the skip goes out of bounds from the length of `Scanner.text`, +// the scanner position will be sat to the last character possible. +[inline] +pub fn (mut s Scanner) skip_n(n int) { + s.pos += n + if s.pos > s.text.len { + s.pos = s.text.len + } + s.col = s.pos +} + +// at returns the *current* character code from the input text. +// at returns `-1` if it can't get the current character. +// unlike `next()`, `at()` does not change the state of the scanner. 
+[direct_array_access; inline] +pub fn (s &Scanner) at() byte { + if s.pos < s.text.len { + return s.text[s.pos] + } + return byte(-1) +} + +// peek returns the character code from the input text at position + `n`. +// peek returns `-1` if it can't peek `n` characters ahead. +[direct_array_access; inline] +pub fn (s &Scanner) peek(n int) int { + if s.pos + n < s.text.len { + // Allow peeking back - needed for spaces between date and time in RFC 3339 format :/ + if n - 1 < 0 && s.pos + n - 1 >= 0 { + // util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'LOOKING BAAAA-AACK - OOOVER MY SHOOOOULDEEEER "${s.text[s.pos + n-1]}"') + return s.text[s.pos + n - 1] + } + return s.text[s.pos + n] + } + return -1 +} + +// reset resets the internal state of the scanner. +pub fn (mut s Scanner) reset() { + s.pos = 0 + s.col = 0 + s.line_nr = 1 +} + +// new_token returns a new `token.Token`. +[inline] +fn (mut s Scanner) new_token(kind token.Kind, lit string, len int) token.Token { + // line_offset := 1 + // println('new_token($lit)') + return token.Token{ + kind: kind + lit: lit + col: mathutil.max(1, s.col - len + 1) + line_nr: s.line_nr + 1 //+ line_offset + pos: s.pos - len + 1 + len: len + } +} + +// ignore_line forwards the scanner to the end of the current line. +[direct_array_access; inline] +fn (mut s Scanner) ignore_line() { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, ' ignoring until EOL') + for c := s.at(); c != -1 && c != `\n`; c = s.at() { + s.next() + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping "${byte(c).ascii_str()}"') + continue + } +} + +// inc_line_number increases the internal line number. +[inline] +fn (mut s Scanner) inc_line_number() { + s.col = 0 + s.line_nr++ +} + +// extract_key parses and returns a TOML key as a string. 
+[direct_array_access; inline] +fn (mut s Scanner) extract_key() string { + s.pos-- + s.col-- + start := s.pos + for s.pos < s.text.len { + c := s.at() + if !(util.is_key_char(c) || c.is_digit() || c in [`_`, `-`]) { + break + } + s.pos++ + s.col++ + } + key := s.text[start..s.pos] + return key +} + +// extract_string collects and returns a string containing +// any bytes recognized as a TOML string. +// TOML strings are everything found between two double or single quotation marks (`"`/`'`). +[direct_array_access; inline] +fn (mut s Scanner) extract_string() ?(string, bool) { + // extract_string is called when the scanner has already reached + // a byte that is the start of a string so we rewind it to start at the correct + s.pos-- + s.col-- + quote := s.at() + start := s.pos + mut lit := '' + + is_multiline := s.text[s.pos + 1] == quote && s.text[s.pos + 2] == quote + // Check for escaped multiline quote + if is_multiline { + mls := s.extract_multiline_string() ? + return mls, is_multiline + } + + for { + s.pos++ + s.col++ + + if s.pos >= s.text.len { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' unfinished string literal `$quote.ascii_str()` started at $start ($s.line_nr,$s.col) "${byte(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...') + } + + c := s.at() + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'c: `$c.ascii_str()` / $c (quote type: $quote/$quote.ascii_str())') + + // Check for escaped chars + if c == byte(92) { + esc, skip := s.handle_escapes(quote, is_multiline) + lit += esc + if skip > 0 { + s.pos += skip + s.col += skip + continue + } + } + + if c == quote { + s.pos++ + s.col++ + return lit, is_multiline + } + + lit += c.ascii_str() + } + return lit, is_multiline +} + +// extract_multiline_string collects and returns a string containing +// any bytes recognized as a TOML string. +// TOML strings are everything found between two double or single quotation marks (`"`/`'`). 
+[direct_array_access; inline] +fn (mut s Scanner) extract_multiline_string() ?string { + // extract_multiline_string is called from extract_string so we know the 3 first + // characters is the quotes + quote := s.at() + start := s.pos + mut lit := '' + + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'multiline `$quote.ascii_str()${s.text[s.pos + 1].ascii_str()}${s.text[ + s.pos + 2].ascii_str()}` string started at pos $start ($s.line_nr,$s.col) (quote type: $quote.ascii_str() / $quote)') + + s.pos += 2 + s.col += 2 + + for { + s.pos++ + s.col++ + + if s.pos >= s.text.len { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' unfinished multiline string literal ($quote.ascii_str()$quote.ascii_str()$quote.ascii_str()) started at $start ($s.line_nr,$s.col) "${byte(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...') + } + + c := s.at() + if c == `\n` { + s.inc_line_number() + lit += c.ascii_str() + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'c: `\\n` / $c') + continue + } + // Check for escaped chars + if c == byte(92) { + esc, skip := s.handle_escapes(quote, true) + lit += esc + if skip > 0 { + s.pos += skip + s.col += skip + continue + } + } + + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'c: `$c.ascii_str()` / $c') + + if c == quote { + if s.peek(1) == quote && s.peek(2) == quote { + if s.peek(3) == -1 { + s.pos += 3 + s.col += 3 + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'returning at $c.ascii_str() `$lit`') + return lit + } else if s.peek(3) != quote { + // lit += c.ascii_str() + // lit += quote.ascii_str() + s.pos += 3 + s.col += 3 + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'returning at $c.ascii_str() `$lit`') + return lit + } + } + } + lit += c.ascii_str() + } + return lit +} + +// handle_escapes +fn (mut s Scanner) handle_escapes(quote byte, is_multiline bool) (string, int) { + c := s.at() + mut lit := c.ascii_str() + if s.peek(1) == byte(92) { + lit += lit + util.printdbg(@MOD + '.' + @STRUCT + '.' 
+ @FN, 'gulp escaped `$lit`') + return lit, 1 + } else if s.peek(1) == quote { + if (!is_multiline && s.peek(2) == `\n`) + || (is_multiline && s.peek(2) == quote && s.peek(3) == quote && s.peek(4) == `\n`) { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'ignore special case escaped `$lit` at end of string') + return '', 0 + } + lit += quote.ascii_str() + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'gulp escaped `$lit`') + return lit, 1 + } else if s.peek(1) == `u` && byte(s.peek(2)).is_hex_digit() && byte(s.peek(3)).is_hex_digit() + && byte(s.peek(4)).is_hex_digit() && byte(s.peek(5)).is_hex_digit() { + lit += s.text[s.pos + 1..s.pos + 6] //.ascii_str() + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'gulp escaped `$lit`') + return lit, 4 + } + return '', 0 +} + +// extract_number collects and returns a string containing +// any bytes recognized as a TOML number. +// TOML numbers can include digits 0-9 and `_`. +[direct_array_access; inline] +fn (mut s Scanner) extract_number() ?string { + // extract_number is called when the scanner has already reached + // a byte that is a number or +/- - so we rewind it to start at the correct + // position to get the complete number. Even if it's only one digit + s.pos-- + s.col-- + start := s.pos + + mut c := s.at() + is_digit := byte(c).is_digit() + if !(is_digit || c in [`+`, `-`]) { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' ${byte(c).ascii_str()} is not a number at ${s.excerpt(s.pos, 10)}') + } + s.pos++ + s.col++ + for s.pos < s.text.len { + c = s.at() + // Handle signed exponent notation. I.e.: 3e2, 3E2, 3e-2, 3E+2, 3e0, 3.1e2, 3.1E2, -1E-1 + if c in [`e`, `E`] && s.peek(1) in [`+`, `-`] && byte(s.peek(2)).is_digit() { + s.pos += 2 + s.col += 2 + } + c = s.at() + if !(byte(c).is_hex_digit() || c in scanner.digit_extras) { + break + } + s.pos++ + s.col++ + } + key := s.text[start..s.pos] + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified number "$key" in range [$start .. 
$s.pos]') + return key +} + +// excerpt returns a string excerpt of the input text centered +// at `pos`. The `margin` argument defines how many chacters +// on each side of `pos` is returned +pub fn (s Scanner) excerpt(pos int, margin int) string { + start := if pos > 0 && pos >= margin { pos - margin } else { 0 } + end := if pos + margin < s.text.len { pos + margin } else { s.text.len } + return s.text[start..end].replace('\n', r'\n') +} diff --git a/vlib/toml/scanner/scanner_test.v b/vlib/toml/scanner/scanner_test.v new file mode 100644 index 0000000000..1ec75e80eb --- /dev/null +++ b/vlib/toml/scanner/scanner_test.v @@ -0,0 +1,82 @@ +import toml.input +import toml.scanner + +const scan_input = input.Config{ + text: 'abc' +} + +fn test_remaining() { + mut s := scanner.new_scanner(input: scan_input) or { panic(err) } + assert s.remaining() == 3 + s.next() + s.next() + assert s.remaining() == 1 + s.next() + assert s.remaining() == 0 + s.next() + s.next() + assert s.remaining() == 0 + s.reset() + assert s.remaining() == 3 +} + +fn test_next() { + mut s := scanner.new_scanner(input: scan_input) or { panic(err) } + assert s.next() == `a` + assert s.next() == `b` + assert s.next() == `c` + assert s.next() == -1 + assert s.next() == -1 + assert s.next() == -1 +} + +fn test_skip() { + mut s := scanner.new_scanner(input: scan_input) or { panic(err) } + assert s.next() == `a` + s.skip() + assert s.next() == `c` + assert s.next() == -1 +} + +fn test_skip_n() { + mut s := scanner.new_scanner(input: scan_input) or { panic(err) } + s.skip_n(2) + assert s.next() == `c` + assert s.next() == -1 +} + +fn test_at() { + mut s := scanner.new_scanner(input: scan_input) or { panic(err) } + assert s.at() == `a` + assert s.at() == `a` + assert s.at() == `a` + // + assert s.next() == `a` + assert s.next() == `b` + assert s.next() == `c` + assert s.next() == -1 +} + +fn test_peek() { + mut s := scanner.new_scanner(input: scan_input) or { panic(err) } + assert s.peek(0) == `a` + assert 
s.peek(1) == `b` + assert s.peek(2) == `c` + assert s.peek(3) == -1 + assert s.peek(4) == -1 + // + assert s.next() == `a` + assert s.next() == `b` + assert s.next() == `c` + assert s.next() == -1 +} + +fn test_reset() { + mut s := scanner.new_scanner(input: scan_input) or { panic(err) } + assert s.next() == `a` + s.next() + s.next() + assert s.next() == -1 + s.reset() + assert s.next() == `a` +} diff --git a/vlib/toml/tests/burntsushi.toml-test_test.v b/vlib/toml/tests/burntsushi.toml-test_test.v new file mode 100644 index 0000000000..3e2e3e0458 --- /dev/null +++ b/vlib/toml/tests/burntsushi.toml-test_test.v @@ -0,0 +1,185 @@ +import os +import toml + +// Instructions for developers: +// The actual tests and data can be obtained by doing: +// `cd vlib/toml/tests/testdata` +// `git clone --depth 1 https://github.com/BurntSushi/toml-test.git burntsushi/toml-test` +// See also the CI toml tests +// TODO Goal: make parsing AND value retrieval of all of https://github.com/BurntSushi/toml-test/test/ pass +const ( + valid_exceptions = [ + 'float/inf-and-nan.toml', + 'table/array-table-array.toml', + ] + invalid_exceptions = [ + // String + 'string/basic-multiline-out-of-range-unicode-escape-1.toml', + 'string/basic-byte-escapes.toml', + 'string/bad-multiline.toml', + 'string/multiline-escape-space.toml', + 'string/bad-codepoint.toml', + 'string/literal-multiline-quotes-1.toml', + 'string/literal-multiline-quotes-2.toml', + 'string/multiline-quotes-1.toml', + 'string/basic-multiline-out-of-range-unicode-escape-2.toml', + 'string/bad-slash-escape.toml', + 'string/basic-out-of-range-unicode-escape-1.toml', + 'string/basic-out-of-range-unicode-escape-2.toml', + 'string/multiline-quotes-2.toml', + 'string/bad-uni-esc.toml', + 'string/bad-escape.toml', + 'string/basic-multiline-unknown-escape.toml', + 'string/missing-quotes.toml', + 'string/bad-byte-escape.toml', + 'string/basic-unknown-escape.toml', + // Integer + 'integer/capital-bin.toml', + 'integer/invalid-bin.toml', + 
'integer/invalid-oct.toml', + // Encoding + 'encoding/bad-utf8-in-comment.toml', + 'encoding/bad-utf8-in-string.toml', + // Float + 'float/exp-double-us.toml', + 'float/exp-leading-us.toml', + 'float/nan_underscore.toml', + 'float/nan-incomplete-1.toml', + 'invalid/float/exp-point-1.toml', + 'float/trailing-us.toml', + 'float/us-after-point.toml', + 'float/exp-double-e-1.toml', + 'float/inf-incomplete-1.toml', + 'float/inf_underscore.toml', + // Table + 'table/rrbrace.toml', + 'table/duplicate-table-array2.toml', + 'table/duplicate.toml', + 'table/array-implicit.toml', + 'table/injection-2.toml', + 'table/llbrace.toml', + 'table/injection-1.toml', + 'table/duplicate-table-array.toml', + // Array + 'array/tables-1.toml', + 'array/no-close-2.toml', + 'array/missing-separator.toml', + 'array/text-after-array-entries.toml', + 'array/no-close.toml', + 'array/text-before-array-separator.toml', + // Date / Time + 'datetime/impossible-date.toml', + 'datetime/no-leads-with-milli.toml', + 'datetime/no-leads.toml', + // Control + 'control/string-us.toml', + 'control/comment-lf.toml', + 'control/multi-us.toml', + 'control/rawstring-del.toml', + 'control/rawmulti-del.toml', + 'control/rawstring-us.toml', + 'control/string-bs.toml', + 'control/multi-null.toml', + 'control/rawstring-lf.toml', + 'control/rawmulti-null.toml', + 'control/comment-null.toml', + 'control/multi-lf.toml', + 'control/comment-del.toml', + 'control/rawstring-null.toml', + 'control/rawmulti-lf.toml', + 'control/multi-del.toml', + 'control/string-del.toml', + 'control/rawmulti-us.toml', + 'control/comment-us.toml', + 'control/string-lf.toml', + 'control/string-null.toml', + 'inline-table/empty.toml', + 'inline-table/double-comma.toml', + 'inline-table/trailing-comma.toml', + 'inline-table/linebreak-4.toml', + 'inline-table/linebreak-3.toml', + 'inline-table/linebreak-1.toml', + 'inline-table/linebreak-2.toml', + 'inline-table/no-comma.toml', + // Key + 'key/duplicate.toml', + 'key/after-table.toml', + 
'key/duplicate-keys.toml', + 'key/after-value.toml', + 'key/newline.toml', + 'key/without-value-2.toml', + 'key/no-eol.toml', + 'key/after-array.toml', + 'key/multiline.toml', + ] +) + +// test_burnt_sushi_tomltest run though 'testdata/burntsushi/toml-test/*' if found. +fn test_burnt_sushi_tomltest() { + this_file := @FILE + test_root := os.join_path(os.dir(this_file), 'testdata', 'burntsushi', 'toml-test', + 'tests') + if os.is_dir(test_root) { + valid_test_files := os.walk_ext(os.join_path(test_root, 'valid'), '.toml') + println('Testing $valid_test_files.len valid TOML files...') + mut valid := 0 + mut e := 0 + for i, valid_test_file in valid_test_files { + relative := valid_test_file.all_after(os.join_path('toml-test', 'tests', 'valid')).trim_left(os.path_separator) + if relative !in valid_exceptions { + println('OK [$i/$valid_test_files.len] "$valid_test_file"...') + toml_doc := toml.parse_file(valid_test_file) or { panic(err) } + + // parsed_json := toml_doc.to_json().replace(' ','') + // mut test_suite_json := os.read_file(valid_test_file.all_before_last('.')+'.json') or { panic(err) } + // test_suite_json = test_suite_json.replace('\n ','').replace(' ','') + // println(test_suite_json.replace('\n ','').replace(' ','')) + // assert parsed_json == test_suite_json + valid++ + } else { + e++ + println('SKIP [$i/$valid_test_files.len] "$valid_test_file" EXCEPTION [$e/$valid_exceptions.len]...') + } + } + println('$valid/$valid_test_files.len TOML files was parsed correctly') + if valid_exceptions.len > 0 { + println('TODO Skipped parsing of $valid_exceptions.len valid TOML files...') + } + + // NOTE uncomment to see list of skipped files + // assert false + + // TODO test cases where the parser should fail + invalid_test_files := os.walk_ext(os.join_path(test_root, 'invalid'), '.toml') + println('Testing $invalid_test_files.len invalid TOML files...') + mut invalid := 0 + e = 0 + for i, invalid_test_file in invalid_test_files { + relative := 
invalid_test_file.all_after(os.join_path('toml-test', 'tests', + 'invalid')).trim_left(os.path_separator) + if relative !in invalid_exceptions { + println('OK [$i/$invalid_test_files.len] "$invalid_test_file"...') + if toml_doc := toml.parse_file(invalid_test_file) { + assert false + } else { + println(' $err.msg') + assert true // err.msg == 'your error' + } + invalid++ + } else { + e++ + println('SKIP [$i/$invalid_test_files.len] "$invalid_test_file" EXCEPTION [$e/$invalid_exceptions.len]...') + } + } + println('$invalid/$invalid_test_files.len TOML files was parsed correctly') + if invalid_exceptions.len > 0 { + println('TODO Skipped parsing of $invalid_exceptions.len invalid TOML files...') + } + + // NOTE uncomment to see list of skipped files + // assert false + } else { + println('No test data directory found in "$test_root"') + assert true + } +} diff --git a/vlib/toml/tests/compact_test.v b/vlib/toml/tests/compact_test.v new file mode 100644 index 0000000000..a5267280a6 --- /dev/null +++ b/vlib/toml/tests/compact_test.v @@ -0,0 +1,83 @@ +import toml + +// Complete text from the example in the README.md: +// https://github.com/toml-lang/toml/blob/3b11f6921da7b6f5db37af039aa021fee450c091/README.md#Example +const toml_text = '# This is a TOML document. 
+title = "TOML Example" +[owner] +name = "Tom Preston-Werner" +dob = 1979-05-27T07:32:00-08:00 # First class dates +[database] +server = "192.168.1.1" +ports = [ 8000, 8001, 8002 ] +connection_max = 5000 +enabled = true +[servers] +# Indentation (tabs and/or spaces) is allowed but not required +[servers.alpha] +ip = "10.0.0.1" +dc = "eqdc10" +[servers.beta] +ip = "10.0.0.2" +dc = "eqdc10" +[clients] +data=[["gamma","delta"],[1,2]] +# Line breaks are OK when inside arrays +hosts = [ +"alpha", +"omega" +]' + +fn test_parse_compact_text() { + toml_doc := toml.parse(toml_text) or { panic(err) } + + title := toml_doc.value('title') + assert title == toml.Any('TOML Example') + assert title as string == 'TOML Example' + + owner := toml_doc.value('owner') as map[string]toml.Any + any_name := owner.value('name') or { panic(err) } + assert any_name.string() == 'Tom Preston-Werner' + + database := toml_doc.value('database') as map[string]toml.Any + db_serv := database['server'] or { + panic('could not access "server" index in "database" variable') + } + assert db_serv as string == '192.168.1.1' + + // TODO BUG depending on WHAT directory the tests is run from, this one assert sometimes fail?!?! 
+ // assert toml_doc.value('owner.name') as string == 'Tom Preston-Werner' + + assert toml_doc.value('database.server') as string == '192.168.1.1' + + database_ports := toml_doc.value('database.ports') as []toml.Any + assert database_ports[0] as i64 == 8000 + assert database_ports[1] as i64 == 8001 + assert database_ports[2] as i64 == 8002 + assert database_ports[0].int() == 8000 + assert database_ports[1].int() == 8001 + assert database_ports[2].int() == 8002 + + assert toml_doc.value('database.connection_max') as i64 == 5000 + assert toml_doc.value('database.enabled') as bool == true + + assert toml_doc.value('servers.alpha.ip').string() == '10.0.0.1' + assert toml_doc.value('servers.alpha.dc').string() == 'eqdc10' + + assert toml_doc.value('servers.beta.ip').string() == '10.0.0.2' + assert toml_doc.value('servers.beta.dc').string() == 'eqdc10' + + clients_data := (toml_doc.value('clients.data') as []toml.Any) + // dump(clients_data) + // assert false + gamma_delta_array := clients_data[0] as []toml.Any + digits_array := clients_data[1] as []toml.Any + assert gamma_delta_array[0].string() == 'gamma' + assert gamma_delta_array[1].string() == 'delta' + assert digits_array[0].int() == 1 + assert digits_array[1].int() == 2 + + clients_hosts := (toml_doc.value('clients.hosts') as []toml.Any).as_strings() + assert clients_hosts[0] == 'alpha' + assert clients_hosts[1] == 'omega' +} diff --git a/vlib/toml/tests/datetime_test.v b/vlib/toml/tests/datetime_test.v new file mode 100644 index 0000000000..4ec7517acc --- /dev/null +++ b/vlib/toml/tests/datetime_test.v @@ -0,0 +1,73 @@ +import toml +import time + +fn test_dates() { + toml_txt := ' + # Offset Date-Time + odt1 = 1979-05-27T07:32:00Z + odt2 = 1979-05-27T00:32:00-07:00 + odt3 = 1979-05-27T00:32:00.999999-07:00 + odt4 = 1979-05-27 07:32:00Z + # Local Date-Time + ldt1 = 1979-05-27T07:32:00 + ldt2 = 1979-05-27T00:32:00.999999 + # Local Date + ld1 = 1979-05-27 + # Local Time + lt1 = 07:32:00 + lt2 = 00:32:00.999999 +' + 
toml_doc := toml.parse(toml_txt) or { panic(err) } + + // Re-use vars + mut odt_time := time.parse_rfc3339('1979-05-27T07:32:00Z') or { panic(err) } + mut odt_str := toml_doc.value('odt1').string() + + // odt1 test section + assert odt_str == '1979-05-26 07:32:00.000000' // W00t?! why 26th? Z=UTC? + odt1 := toml_doc.value('odt1') + assert odt1.datetime() == odt_time + + // odt2 test section + odt_time = time.parse_rfc3339('1979-05-27T00:32:00-07:00') or { panic(err) } + odt2 := toml_doc.value('odt2') + assert odt2.datetime() == odt_time + + // odt3 test section + odt_time = time.parse_rfc3339('1979-05-27T00:32:00.999999-07:00') or { panic(err) } + odt3 := toml_doc.value('odt3') + assert odt3.datetime() == odt_time + + // odt4 test section + odt_time = time.parse_rfc3339('1979-05-27 07:32:00Z') or { panic(err) } + odt4 := toml_doc.value('odt4') + assert odt4.datetime() == odt_time + + // ldt1 test section + odt_time = time.parse_rfc3339('1979-05-27T07:32:00') or { panic(err) } + ldt1 := toml_doc.value('ldt1') + assert ldt1.datetime() == odt_time + + // ldt2 test section + odt_time = time.parse_rfc3339('1979-05-27T00:32:00.999999') or { panic(err) } + ldt2 := toml_doc.value('ldt2') + assert ldt2.datetime() == odt_time + + // ld1 test section + odt_time = time.parse_rfc3339('1979-05-27') or { panic(err) } + ld1 := toml_doc.value('ld1') + assert ld1.datetime() == odt_time + assert ld1.string() == '1979-05-27 00:00:00.000000' + + // lt1 test section + odt_time = time.parse_rfc3339('07:32:00') or { panic(err) } + lt1 := toml_doc.value('lt1') + assert lt1.datetime() == odt_time + assert lt1.string() == '0000-00-00 07:32:00.000000' + + // lt2 test section + odt_time = time.parse_rfc3339('00:32:00.999999') or { panic(err) } + lt2 := toml_doc.value('lt2') + assert lt2.datetime() == odt_time + assert lt2.string() == '0000-00-00 00:32:00.999999' +} diff --git a/vlib/toml/tests/json_test.v b/vlib/toml/tests/json_test.v new file mode 100644 index 0000000000..b38cc2de4c --- 
/dev/null +++ b/vlib/toml/tests/json_test.v @@ -0,0 +1,19 @@ +import os +import toml + +fn test_parse() { + toml_file := + os.real_path(os.join_path(os.dir(@FILE), 'testdata', os.file_name(@FILE).all_before_last('.'))) + + '.toml' + toml_doc := toml.parse(toml_file) or { panic(err) } + + toml_json := toml_doc.to_json() + out_file := + os.real_path(os.join_path(os.dir(@FILE), 'testdata', os.file_name(@FILE).all_before_last('.'))) + + '.out' + out_file_json := os.read_file(out_file) or { panic(err) } + println(toml_json) + assert toml_json == out_file_json + + // assert false +} diff --git a/vlib/toml/tests/nested_test.v b/vlib/toml/tests/nested_test.v new file mode 100644 index 0000000000..dd0390e415 --- /dev/null +++ b/vlib/toml/tests/nested_test.v @@ -0,0 +1,43 @@ +import toml + +const toml_text = ' +[db] +enabled = true + +[servers] + # Indentation (tabs and/or spaces) is allowed but not required + [servers.alpha] + ip = "10.0.0.1" + dc = "eqdc10" + + [servers.beta] + ip = "10.0.0.2" + dc = "eqdc10" + + [servers.alpha.tricky] + ip = "10.0.0.100" + +[firewall.rules.limit] + ip = "10.0.0.101" + + [firewall.rules] + block = true +' + +fn test_parse() { + toml_doc := toml.parse(toml_text) or { panic(err) } + // dump(toml_doc.ast) + // assert false + + assert toml_doc.value('db.enabled').bool() + // TODO make this work + assert toml_doc.value('servers.alpha.ip').string() == '10.0.0.1' + assert toml_doc.value('servers.alpha.dc').string() == 'eqdc10' + + assert toml_doc.value('servers.beta.ip').string() == '10.0.0.2' + assert toml_doc.value('servers.beta.dc').string() == 'eqdc10' + + assert toml_doc.value('servers.alpha.tricky.ip').string() == '10.0.0.100' + assert toml_doc.value('firewall.rules.limit.ip').string() == '10.0.0.101' + assert toml_doc.value('firewall.rules.block').bool() == true +} diff --git a/vlib/toml/tests/strings_test.v b/vlib/toml/tests/strings_test.v new file mode 100644 index 0000000000..c1be2abb05 --- /dev/null +++ b/vlib/toml/tests/strings_test.v 
@@ -0,0 +1,67 @@ +import os +import toml + +const ( + toml_multiline_text_1 = 'multi1 = """one""" +multi2 = """one +two""" +multi3 = """ +one +two +three""" +multi4 = """ +one +two +three +four +"""' + toml_multiline_text_2 = "multi1 = '''one''' +multi2 = '''one +two''' +multi3 = ''' +one +two +three''' +multi4 = ''' +one +two +three +four +'''" +) + +fn test_multiline_strings() { + mut toml_doc := toml.parse(toml_multiline_text_1) or { panic(err) } + + mut value := toml_doc.value('multi1') + assert value.string() == 'one' + value = toml_doc.value('multi2') + assert value.string() == 'one\ntwo' + value = toml_doc.value('multi3') + assert value.string() == '\none\ntwo\nthree' + value = toml_doc.value('multi4') + assert value.string() == '\none\ntwo\nthree\nfour\n' + + toml_doc = toml.parse(toml_multiline_text_2) or { panic(err) } + value = toml_doc.value('multi1') + assert value.string() == 'one' + value = toml_doc.value('multi2') + assert value.string() == 'one\ntwo' + value = toml_doc.value('multi3') + assert value.string() == '\none\ntwo\nthree' + value = toml_doc.value('multi4') + assert value.string() == '\none\ntwo\nthree\nfour\n' + + toml_file := + os.real_path(os.join_path(os.dir(@FILE), 'testdata', os.file_name(@FILE).all_before_last('.'))) + + '.toml' + toml_doc = toml.parse(toml_file) or { panic(err) } + value = toml_doc.value('lit_one') + assert value.string() == "'one quote'" + value = toml_doc.value('lit_two') + assert value.string() == "''two quotes''" + value = toml_doc.value('mismatch1') + assert value.string() == 'aaa' + "'''" + 'bbb' + value = toml_doc.value('mismatch2') + assert value.string() == 'aaa' + '"""' + 'bbb' +} diff --git a/vlib/toml/tests/table_test.v b/vlib/toml/tests/table_test.v new file mode 100644 index 0000000000..13b1d8f47b --- /dev/null +++ b/vlib/toml/tests/table_test.v @@ -0,0 +1,87 @@ +import toml + +const ( + toml_table_text = 'inline = {a.b = 42} + +many.dots.here.dot.dot.dot = {a.b.c = 1, a.b.d = 2} + +a = { a.b = 1 } +b 
= { "a"."b" = 1 } +c = { a . b = 1 } +d = { \'a\' . "b" = 1 } +e = {a.b=1} + +[tbl] +a.b.c = {d.e=1} + +[tbl.x] +a.b.c = {d.e=1} + +[[arr]] +t = {a.b=1} +T = {a.b=1} + +[[arr]] +t = {a.b=2} +T = {a.b=2}' +) + +fn test_tables() { + mut toml_doc := toml.parse(toml_table_text) or { panic(err) } + + mut value := toml_doc.value('inline.a.b') + assert value.int() == 42 + + value = toml_doc.value('many.dots.here.dot.dot.dot.a.b.c') + assert value.int() == 1 + + value = toml_doc.value('many.dots.here.dot.dot.dot.a.b.d') + assert value.int() == 2 + + value = toml_doc.value('a.a.b') + assert value.int() == 1 + + value = toml_doc.value('b.a.b') + assert value.int() == 1 + + value = toml_doc.value('c.a.b') + assert value.int() == 1 + + value = toml_doc.value('d.a.b') + assert value.int() == 1 + + value = toml_doc.value('e.a.b') + assert value.int() == 1 + + value = toml_doc.value('tbl.a.b.c.d.e') + assert value.int() == 1 + + value = toml_doc.value('tbl.x.a.b.c.d.e') + assert value.int() == 1 + + mut m := toml_doc.value('tbl') as map[string]toml.Any + + value = m.value('a.b.c.d.e') or { panic(err) } + assert value.int() == 1 + + value = m.value('x.a.b.c.d.e') or { panic(err) } + assert value.int() == 1 + + arr := toml_doc.value('arr') as []toml.Any + + arr0 := arr[0] as map[string]toml.Any + value = arr0.value('t.a.b') or { panic(err) } + assert value.int() == 1 + + arr1 := arr[1] as map[string]toml.Any + value = arr1.value('T.a.b') or { panic(err) } + assert value.int() == 1 + + arr2 := arr[2] as map[string]toml.Any + value = arr2.value('t.a.b') or { panic(err) } + assert value.int() == 2 + + arr3 := arr[3] as map[string]toml.Any + value = arr3.value('T.a.b') or { panic(err) } + assert value.int() == 2 +} diff --git a/vlib/toml/tests/testdata/json_test.out b/vlib/toml/tests/testdata/json_test.out new file mode 100644 index 0000000000..5361df007a --- /dev/null +++ b/vlib/toml/tests/testdata/json_test.out @@ -0,0 +1 @@ +{ "v": true, "animal": { "type": { "name": "pug" } }, 
"inline": { "a": 4, "b.c": 6, "b": { "c": 7 } }, "db": { "t": true }, "ij": { "a": { "i": 1, "j": 2 }, "b": { "i": "3", "j": "4" } }, "fruit": { "apple": { "color": "red", "taste": { "sweet": true }, "texture": { "smooth": true } } } } \ No newline at end of file diff --git a/vlib/toml/tests/testdata/json_test.toml b/vlib/toml/tests/testdata/json_test.toml new file mode 100644 index 0000000000..b8ad817f95 --- /dev/null +++ b/vlib/toml/tests/testdata/json_test.toml @@ -0,0 +1,25 @@ + +v = true + +animal = { type.name = "pug" } + +inline = { "a" = 4, "b.c" = 6, b.c = 7 } + +[db] +t = true + +[ij] + [ij.a] + i = 1 + j = 2 + + [ij.b] + i = "3" + j = "4" + +[fruit] +apple.color = "red" +apple.taste.sweet = true + +[fruit.apple.texture] +smooth = true diff --git a/vlib/toml/tests/testdata/strings_test.toml b/vlib/toml/tests/testdata/strings_test.toml new file mode 100644 index 0000000000..e138e2fa04 --- /dev/null +++ b/vlib/toml/tests/testdata/strings_test.toml @@ -0,0 +1,15 @@ +# Make sure that quotes inside multiline strings are allowed, including right +# after the opening '''/""" and before the closing '''/""" + +lit_one = ''''one quote'''' +lit_two = '''''two quotes''''' +lit_one_space = ''' 'one quote' ''' +lit_two_space = ''' ''two quotes'' ''' + +one = """"one quote"""" +two = """""two quotes""""" +one_space = """ "one quote" """ +two_space = """ ""two quotes"" """ + +mismatch1 = """aaa'''bbb""" +mismatch2 = '''aaa"""bbb''' diff --git a/vlib/toml/tests/testdata/toml_test.out b/vlib/toml/tests/testdata/toml_test.out new file mode 100644 index 0000000000..ca444b1461 --- /dev/null +++ b/vlib/toml/tests/testdata/toml_test.out @@ -0,0 +1 @@ +{ "title": "TOML Example", "owner": { "name": "Tom Preston-Werner", "dob": "1979-05-27T07:32:00-08:00" }, "database": { "server": "192.168.1.1", "ports": [ 8000, 8001, 8002 ], "connection_max": 5000, "enabled": true }, "servers": { "alpha": { "ip": "10.0.0.1", "dc": "eqdc10" }, "beta": { "ip": "10.0.0.2", "dc": "eqdc10" } }, 
"clients": { "data": [ [ "gamma", "delta" ], [ 1, 2 ] ], "hosts": [ "alpha", "omega" ] } } \ No newline at end of file diff --git a/vlib/toml/tests/testdata/toml_test.toml b/vlib/toml/tests/testdata/toml_test.toml new file mode 100644 index 0000000000..175515ad8f --- /dev/null +++ b/vlib/toml/tests/testdata/toml_test.toml @@ -0,0 +1,33 @@ +# This is a TOML document. + +title = "TOML Example" + +[owner] +name = "Tom Preston-Werner" +dob = 1979-05-27T07:32:00-08:00 # First class dates + +[database] +server = "192.168.1.1" +ports = [ 8000, 8001, 8002 ] +connection_max = 5000 +enabled = true + +[servers] + + # Indentation (tabs and/or spaces) is allowed but not required + [servers.alpha] + ip = "10.0.0.1" + dc = "eqdc10" + + [servers.beta] + ip = "10.0.0.2" + dc = "eqdc10" + +[clients] +data = [ ["gamma", "delta"], [1, 2] ] + +# Line breaks are OK when inside arrays +hosts = [ + "alpha", + "omega" +] diff --git a/vlib/toml/tests/toml_test.v b/vlib/toml/tests/toml_test.v new file mode 100644 index 0000000000..fb1ebe4786 --- /dev/null +++ b/vlib/toml/tests/toml_test.v @@ -0,0 +1,110 @@ +import os +import toml + +const toml_text = os.read_file( + os.real_path(os.join_path(os.dir(@FILE), 'testdata', os.file_name(@FILE).all_before_last('.'))) + + '.toml') or { panic(err) } + +fn test_toml() { + // File containing the complete text from the example in the official TOML project README.md: + // https://github.com/toml-lang/toml/blob/3b11f6921da7b6f5db37af039aa021fee450c091/README.md#Example + toml_doc := toml.parse(toml_text) or { panic(err) } + toml_json := toml_doc.to_json() + + // NOTE Kept for easier debugging: + // dump(toml_doc.ast) + // println(toml_json) + // assert false + + assert toml_json == os.read_file( + os.real_path(os.join_path(os.dir(@FILE), 'testdata', os.file_name(@FILE).all_before_last('.'))) + + '.out') or { panic(err) } + + title := toml_doc.value('title') + assert title == toml.Any('TOML Example') + assert title as string == 'TOML Example' + + owner := 
toml_doc.value('owner') as map[string]toml.Any + any_name := owner.value('name') or { panic(err) } + assert any_name.string() == 'Tom Preston-Werner' + + database := toml_doc.value('database') as map[string]toml.Any + db_serv := database['server'] or { + panic('could not access "server" index in "database" variable') + } + assert db_serv as string == '192.168.1.1' + + // TODO BUG depending on WHAT directory the tests is run from, this one assert sometimes fail?!?! + // assert toml_doc.value('owner.name') as string == 'Tom Preston-Werner' + + assert toml_doc.value('database.server') as string == '192.168.1.1' + + database_ports := toml_doc.value('database.ports') as []toml.Any + assert database_ports[0] as i64 == 8000 + assert database_ports[1] as i64 == 8001 + assert database_ports[2] as i64 == 8002 + assert database_ports[0].int() == 8000 + assert database_ports[1].int() == 8001 + assert database_ports[2].int() == 8002 + + assert toml_doc.value('database.connection_max') as i64 == 5000 + assert toml_doc.value('database.enabled') as bool == true + + assert toml_doc.value('servers.alpha.ip').string() == '10.0.0.1' + assert toml_doc.value('servers.alpha.dc').string() == 'eqdc10' + + assert toml_doc.value('servers.beta.ip').string() == '10.0.0.2' + assert toml_doc.value('servers.beta.dc').string() == 'eqdc10' + + clients_data := (toml_doc.value('clients.data') as []toml.Any) + // dump(clients_data) + // assert false + gamma_delta_array := clients_data[0] as []toml.Any + digits_array := clients_data[1] as []toml.Any + assert gamma_delta_array[0].string() == 'gamma' + assert gamma_delta_array[1].string() == 'delta' + assert digits_array[0].int() == 1 + assert digits_array[1].int() == 2 + + clients_hosts := (toml_doc.value('clients.hosts') as []toml.Any).as_strings() + assert clients_hosts[0] == 'alpha' + assert clients_hosts[1] == 'omega' +} + +fn test_toml_file() { + out_path := os.join_path(os.temp_dir(), 'v_toml_tests') + test_file := os.join_path(out_path, 
'toml_example.toml') + os.mkdir_all(out_path) or { assert false } + defer { + os.rmdir_all(out_path) or {} + } + os.write_file(test_file, toml_text) or { assert false } + toml_doc := toml.parse_file(test_file) or { panic(err) } + + toml_json := toml_doc.to_json() + + // NOTE Kept for easier debugging: + // dump(toml_doc.ast) + // println(toml_json) + // assert false + + assert toml_json == os.read_file( + os.real_path(os.join_path(os.dir(@FILE), 'testdata', os.file_name(@FILE).all_before_last('.'))) + + '.out') or { panic(err) } +} + +fn test_toml_parse_text() { + toml_doc := toml.parse_text(toml_text) or { panic(err) } + toml_json := toml_doc.to_json() + assert toml_json == os.read_file( + os.real_path(os.join_path(os.dir(@FILE), 'testdata', os.file_name(@FILE).all_before_last('.'))) + + '.out') or { panic(err) } +} + +fn test_toml_parse() { + toml_doc := toml.parse(toml_text) or { panic(err) } + toml_json := toml_doc.to_json() + assert toml_json == os.read_file( + os.real_path(os.join_path(os.dir(@FILE), 'testdata', os.file_name(@FILE).all_before_last('.'))) + + '.out') or { panic(err) } +} diff --git a/vlib/toml/tests/types_test.v b/vlib/toml/tests/types_test.v new file mode 100644 index 0000000000..524a51bdc3 --- /dev/null +++ b/vlib/toml/tests/types_test.v @@ -0,0 +1,70 @@ +import toml + +fn test_string() { + str_value := 'test string' + toml_txt := 'string = "test string"' + toml_doc := toml.parse(toml_txt) or { panic(err) } + + value := toml_doc.value('string') + assert value == toml.Any(str_value) + assert value as string == str_value + assert value.string() == str_value +} + +fn test_i64() { + toml_txt := 'i64 = 120' + toml_doc := toml.parse(toml_txt) or { panic(err) } + + value := toml_doc.value('i64') + assert value == toml.Any(i64(120)) + assert value as i64 == 120 + assert value.i64() == i64(120) +} + +fn test_bool() { + toml_txt := ' +bool_true = true +bool_false = false' + toml_doc := toml.parse(toml_txt) or { panic(err) } + + value_true := 
toml_doc.value('bool_true') + assert value_true == toml.Any(true) + assert value_true as bool == true + assert value_true != toml.Any(false) + assert value_true as bool != false + assert value_true.bool() == true + + value_false := toml_doc.value('bool_false') + assert value_false == toml.Any(false) + assert value_false as bool == false + assert value_false != toml.Any(true) + assert value_false as bool != true + assert value_false.bool() == false +} + +fn test_bool_key_is_not_value() { + toml_txt := 'true = true +false = false' + toml_doc := toml.parse(toml_txt) or { panic(err) } + + value_true := toml_doc.value('true') + assert value_true == toml.Any(true) + assert value_true as bool == true + assert value_true != toml.Any(false) + assert value_true as bool != false + + value_false := toml_doc.value('false') + assert value_false == toml.Any(false) + assert value_false as bool == false + assert value_false != toml.Any(true) + assert value_false as bool != true +} + +fn test_single_letter_key() { + toml_txt := '[v] +open_sourced = "Jun 22 2019 20:20:28"' + toml_doc := toml.parse(toml_txt) or { panic(err) } + + value := toml_doc.value('v.open_sourced').string() + assert value == 'Jun 22 2019 20:20:28' +} diff --git a/vlib/toml/token/position.v b/vlib/toml/token/position.v new file mode 100644 index 0000000000..478dfee025 --- /dev/null +++ b/vlib/toml/token/position.v @@ -0,0 +1,13 @@ +// Copyright (c) 2021 Lars Pontoppidan. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module token + +// Position represents a position in a TOML document. 
+pub struct Position { +pub: + len int // length of the literal in the source + line_nr int // the line number in the source where the token occured + pos int // the position of the token in scanner text + col int // the column in the source where the token occured +} diff --git a/vlib/toml/token/token.v b/vlib/toml/token/token.v new file mode 100644 index 0000000000..6438f0c985 --- /dev/null +++ b/vlib/toml/token/token.v @@ -0,0 +1,52 @@ +// Copyright (c) 2021 Lars Pontoppidan. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module token + +// Token holds information about the current scan of bytes. +pub struct Token { +pub: + kind Kind // the token number/enum; for quick comparisons + lit string // literal representation of the token + col int // the column in the source where the token occured + line_nr int // the line number in the source where the token occured + pos int // the position of the token in scanner text + len int // length of the literal +} + +// Kind represents a logical type of entity found in any given TOML document. +pub enum Kind { + unknown + eof + bare // user + boolean // true or false + number // 123 + quoted // 'foo', "foo", """foo""" or '''foo''' + plus // + + minus // - + underscore // _ + comma // , + colon // : + hash // # comment + assign // = + lcbr // { + rcbr // } + lsbr // [ + rsbr // ] + nl // \n linefeed / newline character + cr // \r carrige return + tab // \t character + whitespace // ` ` + period // . + _end_ +} + +[inline] +pub fn (tok &Token) position() Position { + return Position{ + len: tok.len + line_nr: tok.line_nr - 1 + pos: tok.pos + col: tok.col - 1 + } +} diff --git a/vlib/toml/toml.v b/vlib/toml/toml.v new file mode 100644 index 0000000000..b94f08bee8 --- /dev/null +++ b/vlib/toml/toml.v @@ -0,0 +1,217 @@ +// Copyright (c) 2021 Lars Pontoppidan. All rights reserved. 
+// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module toml + +import os +import toml.ast +import toml.util +import toml.input +import toml.scanner +import toml.parser +import time + +// Null is used in sumtype checks as a "default" value when nothing else is possible. +pub struct Null { +} + +// Config is used to configure the toml parser. +// Only one of the fields `text` or `file_path`, is allowed to be set at time of configuration. +pub struct Config { +pub: + text string // TOML text + file_path string // '/path/to/file.toml' + parse_comments bool +} + +// Doc is a representation of a TOML document. +// A document can be constructed from a `string` buffer or from a file path +pub struct Doc { +pub: + ast &ast.Root +} + +// parse_file parses the TOML file in `path`. +pub fn parse_file(path string) ?Doc { + input_config := input.Config{ + file_path: path + } + scanner_config := scanner.Config{ + input: input_config + } + parser_config := parser.Config{ + scanner: scanner.new_scanner(scanner_config) ? + } + mut p := parser.new_parser(parser_config) + ast := p.parse() ? + return Doc{ + ast: ast + } +} + +// parse_text parses the TOML document provided in `text`. +pub fn parse_text(text string) ?Doc { + input_config := input.Config{ + text: text + } + scanner_config := scanner.Config{ + input: input_config + } + parser_config := parser.Config{ + scanner: scanner.new_scanner(scanner_config) ? + } + mut p := parser.new_parser(parser_config) + ast := p.parse() ? + return Doc{ + ast: ast + } +} + +// parse parses the TOML document provided in `input`. +// parse automatically try to determine if the type of `input` is a file or text. +// For explicit parsing of input see `parse_file` or `parse_text`. 
+pub fn parse(toml string) ?Doc { + mut input_config := input.Config{} + if !toml.contains('\n') && os.is_file(toml) { + input_config = input.Config{ + file_path: toml + } + } else { + input_config = input.Config{ + text: toml + } + } + + scanner_config := scanner.Config{ + input: input_config + } + parser_config := parser.Config{ + scanner: scanner.new_scanner(scanner_config) ? + } + mut p := parser.new_parser(parser_config) + ast := p.parse() ? + return Doc{ + ast: ast + } +} + +// to_json returns a compact json string of the complete document +pub fn (d Doc) to_json() string { + return d.ast.to_json() +} + +// value queries a value from the TOML document. +pub fn (d Doc) value(key string) Any { + values := d.ast.table as map[string]ast.Node + // any_values := d.ast_to_any(values) as map[string]Any + return d.get_map_value_as_any(values, key) +} + +// ast_to_any converts the `value` ast.Node to a toml.Any value. +fn (d Doc) ast_to_any(value ast.Node) Any { + // `match` isn't currently very suitable for further unwrapping sumtypes in the if's... + if value is ast.Date || value is ast.Time || value is ast.DateTime { + mut tim := time.Time{} + if value is ast.Date { + date_str := (value as ast.Date).text + + tim = time.parse_rfc3339(date_str) or { + return Any(Null{}) + // TODO decide this + // panic(@MOD + '.' + @STRUCT + '.' + @FN + + // ' failed converting "$date_str" to rfc3339: $err') + } + } else if value is ast.Time { + time_str := (value as ast.Time).text + + tim = time.parse_rfc3339(time_str) or { + return Any(Null{}) + // TODO decide this + // panic(@MOD + '.' + @STRUCT + '.' + + // ' failed converting "$time_str" to rfc3339: $err') + } + } else { + // value is ast.DateTime + datetime_str := (value as ast.DateTime).text + + tim = time.parse_rfc3339(datetime_str) or { + return Any(Null{}) + // TODO decide this + // panic(@MOD + '.' + @STRUCT + '.' 
+ @FN + + // ' failed converting "$datetime_str" to rfc3339: $err') + } + } + return Any(tim) + } + + match value { + ast.Quoted { + return Any((value as ast.Quoted).text) + } + ast.Number { + str := (value as ast.Number).text + if str.contains('.') { + return Any(str.f64()) + } + return Any(str.i64()) + } + ast.Bool { + str := (value as ast.Bool).text + if str == 'true' { + return Any(true) + } + return Any(false) + } + map[string]ast.Node { + m := (value as map[string]ast.Node) + mut am := map[string]Any{} + for k, v in m { + am[k] = d.ast_to_any(v) + } + return am + // return d.get_map_value(m, key_split[1..].join('.')) + } + []ast.Node { + a := (value as []ast.Node) + mut aa := []Any{} + for val in a { + aa << d.ast_to_any(val) + } + return aa + } + else { + return Any(Null{}) + } + } + + return Any(Null{}) + // TODO decide this + // panic(@MOD + '.' + @STRUCT + '.' + @FN + ' can\'t convert "$value"') + // return Any('') +} + +// get_map_value_as_any returns the value found at `key` in the map `values` as an `Any` type. +fn (d Doc) get_map_value_as_any(values map[string]ast.Node, key string) Any { + key_split := key.split('.') + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, ' getting "${key_split[0]}"') + if key_split[0] in values.keys() { + value := values[key_split[0]] or { + return Any(Null{}) + // TODO decide this + // panic(@MOD + '.' + @STRUCT + '.' + @FN + ' key "$key" does not exist') + } + // `match` isn't currently very suitable for these types of sum type constructs... + if value is map[string]ast.Node { + m := (value as map[string]ast.Node) + next_key := key_split[1..].join('.') + if next_key == '' { + return d.ast_to_any(value) + } + return d.get_map_value_as_any(m, next_key) + } + return d.ast_to_any(value) + } + return Any(Null{}) + // TODO decide this + // panic(@MOD + '.' + @STRUCT + '.' 
+ @FN + ' key "$key" does not exist') +} diff --git a/vlib/toml/util/util.v b/vlib/toml/util/util.v new file mode 100644 index 0000000000..274a9e4723 --- /dev/null +++ b/vlib/toml/util/util.v @@ -0,0 +1,14 @@ +// Copyright (c) 2021 Lars Pontoppidan. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module util + +[inline] +pub fn is_key_char(c byte) bool { + return (c >= `a` && c <= `z`) || (c >= `A` && c <= `Z`) // || c == `_` || c == `-` <- these are scanned as separate tokens +} + +[if trace_toml ?] +pub fn printdbg(id string, message string) { + eprintln(id + ' ' + message) +}