x.json2: proper string encoding + minor fixes (#9026)
parent
506041a15b
commit
ee879f3e41
|
@ -6,7 +6,7 @@ module json2
|
|||
import strings
|
||||
|
||||
fn write_value(v Any, i int, len int, mut wr strings.Builder) {
|
||||
str := v.str()
|
||||
str := v.json_str()
|
||||
if v is string {
|
||||
wr.write_string('"$str"')
|
||||
} else {
|
||||
|
@ -51,11 +51,21 @@ pub fn (flds []Any) str() string {
|
|||
return res
|
||||
}
|
||||
|
||||
// str returns the string representation of the `Any` type.
|
||||
// str returns the plain string representation of the `Any` type.
// A `string` value is handed back unchanged (no JSON escaping); every
// other variant is rendered through `json_str`. Use the `json_str`
// method if you want the escaped version of a string value.
pub fn (f Any) str() string {
	match f {
		string {
			// already a string: return it verbatim
			return f
		}
		else {
			return f.json_str()
		}
	}
}
|
||||
|
||||
// json_str returns the JSON string representation of the `Any` type.
|
||||
pub fn (f Any) json_str() string {
|
||||
match f {
|
||||
string {
|
||||
return f
|
||||
return json_string(f)
|
||||
}
|
||||
int {
|
||||
return f.str()
|
||||
|
@ -85,3 +95,76 @@ pub fn (f Any) str() string {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// char_len_list is a modified version of builtin.utf8_str_len
// that returns an array holding the byte length of every character
// in `s`, in order (e.g. "t✔" => [1, 3], because `✔` takes 3 bytes).
//
// The length of a multi-byte character is read from the number of
// leading 1 bits of its first byte. A byte with the high bit set but
// bit 6 clear is reported as length 1. If the input ends in the middle
// of a sequence, the reported length is clamped to the bytes that are
// actually left, so the lengths always add up to `s.len`.
fn char_len_list(s string) []int {
	mut lens := []int{cap: s.len}
	mut i := 0
	for i < s.len {
		c := s[i]
		mut l := 1
		if (c & (1 << 7)) != 0 {
			// every further leading 1 bit announces one more byte
			for t := byte(1 << 6); (c & t) != 0; t >>= 1 {
				l++
			}
		}
		// never claim more bytes than the string still has; callers
		// slice `s` with these lengths and would go out of range
		remaining := s.len - i
		if l > remaining {
			l = remaining
		}
		lens << l
		i += l
	}
	return lens
}
|
||||
|
||||
// escaped_chars holds the two-character JSON escape sequences written by
// `json_string`. It is indexed with the position of the matching byte in
// `important_escapable_chars`, so both lists must keep the same order.
const escaped_chars = [r'\b', r'\f', r'\n', r'\r', r'\t']
|
||||
|
||||
// json_string returns the JSON spec-compliant version of the string.
//
// Single-byte characters:
//   - bytes listed in `important_escapable_chars` are written as the
//     matching entry of `escaped_chars` (e.g. a newline becomes `\n`),
//   - `"`, `/` and `\` are written with a leading backslash,
//   - any other byte below 0x20 is written as `\u00XX`,
//   - everything else is copied unchanged.
// Multi-byte characters whose code point needs at most 4 hex digits are
// written as `\uXXXX`, left-padded with zeros. Larger code points are
// still replaced by a single space (see the TODO below).
[manualfree]
fn json_string(s string) string {
	// not the best implementation but will revisit it soon
	hex_digits := '0123456789abcdef'
	char_lens := char_len_list(s)
	mut sb := strings.new_builder(s.len)
	mut i := 0
	defer {
		unsafe {
			char_lens.free()
			// freeing string builder on defer after
			// returning .str() still isn't working :(
			// sb.free()
		}
	}
	for char_len in char_lens {
		if i >= s.len {
			// malformed input made the lengths overshoot the string
			break
		}
		if char_len == 1 {
			chr := s[i]
			if chr in json2.important_escapable_chars {
				for j := 0; j < json2.important_escapable_chars.len; j++ {
					if chr == json2.important_escapable_chars[j] {
						sb.write_string(escaped_chars[j])
						break
					}
				}
			} else if chr == `"` || chr == `/` || chr == `\\` {
				// written byte by byte to avoid a temporary string
				sb.write_b(`\\`)
				sb.write_b(chr)
			} else if chr < 0x20 {
				// remaining control characters have no short escape,
				// but JSON does not allow them raw inside a string
				sb.write_string('\\u00')
				sb.write_b(hex_digits[int(chr >> 4)])
				sb.write_b(hex_digits[int(chr & 0xf)])
			} else {
				sb.write_b(chr)
			}
		} else {
			// clamp so a truncated sequence cannot slice out of range
			mut end := i + char_len
			if end > s.len {
				end = s.len
			}
			slice := s[i..end]
			hex_code := slice.utf32_code().hex()
			if hex_code.len <= 4 {
				// hex() does not pad, so code points below U+1000
				// need leading zeros to form a valid \uXXXX escape
				sb.write_string('\\u')
				for _ in hex_code.len .. 4 {
					sb.write_b(`0`)
				}
				sb.write_string(hex_code)
			} else {
				// TODO: still figuring out what
				// to do with more than 4 chars
				sb.write_b(` `)
			}
			unsafe {
				slice.free()
				hex_code.free()
			}
		}
		i += char_len
	}
	str := sb.str()
	unsafe { sb.free() }
	return str
}
|
||||
|
|
|
@ -0,0 +1,21 @@
|
|||
import x.json2
|
||||
|
||||
// test_json_string_characters decodes a string made only of escape
// sequences and checks that `json_str` writes the same escapes back.
fn test_json_string_characters() {
	decoded := json2.raw_decode(r'"\n\r\b\f\t\\\"\/"') or { '' }
	expected := '\\n\\r\\b\\f\\t\\\\\\"\\/'
	assert decoded.json_str() == expected
}
|
||||
|
||||
// test_json_string checks that a non-ASCII character inside an
// otherwise plain string is written as a `\uXXXX` escape.
fn test_json_string() {
	value := json2.Any('te✔st')
	encoded := value.json_str()
	assert encoded == r'te\u2714st'
}
|
||||
|
||||
// test_json_string_emoji pins the current placeholder behaviour: a
// character whose code point needs more than 4 hex digits is written
// as a single space.
fn test_json_string_emoji() {
	value := json2.Any('🐈')
	encoded := value.json_str()
	assert encoded == r' '
}
|
||||
|
||||
// test_json_string_non_ascii checks that every character of a
// non-ASCII string is written as its own `\uXXXX` escape.
fn test_json_string_non_ascii() {
	value := json2.Any('ひらがな')
	encoded := value.json_str()
	assert encoded == r'\u3072\u3089\u304c\u306a'
}
|
|
@ -41,12 +41,12 @@ const (
|
|||
// list of characters commonly used in JSON.
|
||||
char_list = [`{`, `}`, `[`, `]`, `,`, `:`]
|
||||
// list of newlines to check when moving to a new position.
|
||||
newlines = [`\r`, `\n`, byte(9), `\t`]
|
||||
newlines = [`\r`, `\n`, `\t`]
|
||||
// list of characters that need to be escaped inside a JSON string.
|
||||
// double quotes and forward slashes are excluded intentionally since
|
||||
// they have their own separate checks for it in order to pass the
|
||||
// JSON test suite (https://github.com/nst/JSONTestSuite/).
|
||||
important_escapable_chars = [byte(9), 10, 0, `\b`, `\f`, `\n`, `\r`, `\t`]
|
||||
important_escapable_chars = [`\b`, `\f`, `\n`, `\r`, `\t`]
|
||||
// list of valid unicode escapes aside from \u{4-hex digits}
|
||||
valid_unicode_escapes = [`b`, `f`, `n`, `r`, `t`, `\\`, `"`, `/`]
|
||||
// used for transforming escapes into valid unicode (eg. n => \n)
|
||||
|
@ -129,7 +129,7 @@ fn (mut s Scanner) text_scan() Token {
|
|||
} else if (s.pos - 1 >= 0 && s.text[s.pos - 1] != `\\`)
|
||||
&& ch in json2.important_escapable_chars {
|
||||
return s.error('character must be escaped with a backslash')
|
||||
} else if s.pos == s.text.len - 1 && ch == `\\` {
|
||||
} else if (s.pos == s.text.len - 1 && ch == `\\`) || ch == byte(0) {
|
||||
return s.error('invalid backslash escape')
|
||||
} else if s.pos + 1 < s.text.len && ch == `\\` {
|
||||
peek := s.text[s.pos + 1]
|
||||
|
@ -154,8 +154,15 @@ fn (mut s Scanner) text_scan() Token {
|
|||
if codepoint.len != 4 {
|
||||
return s.error('unicode escape must have 4 hex digits')
|
||||
}
|
||||
chrs << byte(strconv.parse_uint(codepoint.bytestr(), 16, 32))
|
||||
unsafe { codepoint.free() }
|
||||
val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32))
|
||||
converted := utf32_to_str(val)
|
||||
converted_bytes := converted.bytes()
|
||||
chrs << converted_bytes
|
||||
unsafe {
|
||||
converted.free()
|
||||
converted_bytes.free()
|
||||
codepoint.free()
|
||||
}
|
||||
continue
|
||||
} else {
|
||||
return s.error('incomplete unicode escape')
|
||||
|
|
|
@ -20,6 +20,16 @@ fn test_str_valid_unicode_escape() {
|
|||
assert tok.lit.bytestr() == 'H'
|
||||
}
|
||||
|
||||
// test_str_valid_unicode_escape_2 scans a string token holding the
// escape `\u2714` and checks that it is turned into the 3 bytes of `✔`.
fn test_str_valid_unicode_escape_2() {
	source := r'"\u2714"'.bytes()
	mut scanner := Scanner{
		text: source
	}
	token := scanner.scan()
	assert token.kind == .str_
	assert token.lit.len == 3
	assert token.lit.bytestr() == '✔'
}
|
||||
|
||||
fn test_str_invalid_escape() {
|
||||
mut sc := Scanner{
|
||||
text: r'"\z"'.bytes()
|
||||
|
|
Loading…
Reference in New Issue