x.json2: proper string encoding + minor fixes (#9026)

pull/9036/head
Ned Palacios 2021-03-01 17:22:36 +08:00 committed by GitHub
parent 506041a15b
commit ee879f3e41
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 129 additions and 8 deletions

View File

@ -6,7 +6,7 @@ module json2
import strings
fn write_value(v Any, i int, len int, mut wr strings.Builder) {
str := v.str()
str := v.json_str()
if v is string {
wr.write_string('"$str"')
} else {
@ -51,11 +51,21 @@ pub fn (flds []Any) str() string {
return res
}
// str returns the string representation of the `Any` type.
// str renders the `Any` value as a plain string. A string value is returned
// verbatim; every other variant is delegated to `json_str`. Call `json_str`
// directly when the JSON-escaped form of a string is required.
pub fn (f Any) str() string {
	match f {
		string {
			return f
		}
		else {
			return f.json_str()
		}
	}
}
// json_str returns the JSON string representation of the `Any` type.
pub fn (f Any) json_str() string {
match f {
string {
return f
return json_string(f)
}
int {
return f.str()
@ -85,3 +95,76 @@ pub fn (f Any) str() string {
}
}
}
// char_len_list is a modified version of builtin.utf8_str_len
// that returns the byte length of every character in `s`, in order
// (e.g. "t✔" => [1,3]).
//
// The length of a character is derived from the number of leading one-bits
// in its first byte. A sequence that is cut short by the end of the string
// is clamped to the bytes that are actually available, so the returned
// lengths always add up to exactly `s.len` and are safe to slice with.
fn char_len_list(s string) []int {
	mut ls := []int{}
	mut i := 0
	for i < s.len {
		c := s[i]
		mut l := 1
		if (c & (1 << 7)) != 0 {
			// every additional leading one-bit announces one more byte
			for t := byte(1 << 6); (c & t) != 0; t >>= 1 {
				l++
			}
		}
		// never report more bytes than the string still has
		remaining := s.len - i
		if l > remaining {
			l = remaining
		}
		ls << l
		i += l
	}
	return ls
}
// escaped_chars holds the JSON escape sequences written by `json_string`.
// An entry is selected with the index at which the byte was found in
// `important_escapable_chars`, so both lists must keep the same order.
const escaped_chars = [r'\b', r'\f', r'\n', r'\r', r'\t']
// json_string returns the JSON spec-compliant version of the string.
//
// Single-byte characters:
//   - bytes listed in `important_escapable_chars` are written as their short
//     escape taken from `escaped_chars` (same index in both lists),
//   - `"`, `/` and `\` are prefixed with a backslash,
//   - any other control byte below 0x20 is written as `\u00XX`, since JSON
//     does not allow raw control characters inside a string,
//   - everything else is copied as is.
// Multi-byte characters are written as `\uXXXX`, zero-padded to four hex
// digits. Code points that need more than four hex digits are still replaced
// by a single space (see the TODO below).
[manualfree]
fn json_string(s string) string {
	// not the best implementation but will revisit it soon
	char_lens := char_len_list(s)
	mut sb := strings.new_builder(s.len)
	mut i := 0
	defer {
		unsafe {
			char_lens.free()
			// freeing string builder on defer after
			// returning .str() still isn't working :(
			// sb.free()
		}
	}
	for char_len in char_lens {
		if char_len == 1 {
			chr := s[i]
			if chr in json2.important_escapable_chars {
				for j := 0; j < json2.important_escapable_chars.len; j++ {
					if chr == json2.important_escapable_chars[j] {
						sb.write_string(escaped_chars[j])
						break
					}
				}
			} else if chr == `"` || chr == `/` || chr == `\\` {
				// written byte by byte to avoid building a temporary string
				sb.write_b(`\\`)
				sb.write_b(chr)
			} else if chr < 0x20 {
				// remaining control characters have no short escape
				hex_code := int(chr).hex()
				write_unicode_escape(mut sb, hex_code)
				unsafe { hex_code.free() }
			} else {
				sb.write_b(chr)
			}
		} else {
			slice := s[i..i + char_len]
			hex_code := slice.utf32_code().hex()
			if hex_code.len <= 4 {
				// hex() may return fewer than four digits (e.g. U+00E9 => "e9"),
				// the escape itself must always carry exactly four
				write_unicode_escape(mut sb, hex_code)
			} else {
				// TODO: still figuring out what
				// to do with more than 4 chars
				sb.write_b(` `)
			}
			unsafe {
				slice.free()
				hex_code.free()
			}
		}
		i += char_len
	}
	str := sb.str()
	unsafe { sb.free() }
	return str
}

// write_unicode_escape writes `\u` followed by `hex_code`, left-padded with
// zeros to four digits. `hex_code` must not be longer than four characters.
fn write_unicode_escape(mut sb strings.Builder, hex_code string) {
	sb.write_string('\\u')
	for _ in hex_code.len .. 4 {
		sb.write_b(`0`)
	}
	sb.write_string(hex_code)
}

View File

@ -0,0 +1,21 @@
import x.json2
// Decodes a JSON string made of every short escape sequence and asserts
// that json_str() yields the escaped form again.
fn test_json_string_characters() {
	decoded := json2.raw_decode(r'"\n\r\b\f\t\\\"\/"') or { '' }
	escaped := decoded.json_str()
	assert escaped == '\\n\\r\\b\\f\\t\\\\\\"\\/'
}
// A non-ASCII character in the middle of ASCII text must come out as a
// \uXXXX escape while the surrounding characters stay untouched.
fn test_json_string() {
	// U+2714 (heavy check mark) between plain ASCII letters
	text := json2.Any('te✔st')
	assert text.json_str() == r'te\u2714st'
}
// Pins the current handling of code points whose hex form is longer than
// four digits: json_str() writes a single space in their place.
fn test_json_string_emoji() {
	emoji := json2.Any('🐈')
	rendered := emoji.json_str()
	assert rendered == r' '
}
// Every character of a purely non-ASCII string must be written as its own
// \uXXXX escape.
fn test_json_string_non_ascii() {
	// hiragana: U+3072 U+3089 U+304C U+306A
	text := json2.Any('ひらがな')
	assert text.json_str() == r'\u3072\u3089\u304c\u306a'
}

View File

@ -41,12 +41,12 @@ const (
// list of characters commonly used in JSON.
char_list = [`{`, `}`, `[`, `]`, `,`, `:`]
// list of newlines to check when moving to a new position.
newlines = [`\r`, `\n`, byte(9), `\t`]
newlines = [`\r`, `\n`, `\t`]
// list of escapable characters that need to be escaped inside a JSON string.
// double quotes and forward slashes are excluded intentionally since
// they have their own separate checks for it in order to pass the
// JSON test suite (https://github.com/nst/JSONTestSuite/).
important_escapable_chars = [byte(9), 10, 0, `\b`, `\f`, `\n`, `\r`, `\t`]
important_escapable_chars = [`\b`, `\f`, `\n`, `\r`, `\t`]
// list of valid unicode escapes aside from \u{4-hex digits}
valid_unicode_escapes = [`b`, `f`, `n`, `r`, `t`, `\\`, `"`, `/`]
// used for transforming escapes into valid unicode (eg. n => \n)
@ -129,7 +129,7 @@ fn (mut s Scanner) text_scan() Token {
} else if (s.pos - 1 >= 0 && s.text[s.pos - 1] != `\\`)
&& ch in json2.important_escapable_chars {
return s.error('character must be escaped with a backslash')
} else if s.pos == s.text.len - 1 && ch == `\\` {
} else if (s.pos == s.text.len - 1 && ch == `\\`) || ch == byte(0) {
return s.error('invalid backslash escape')
} else if s.pos + 1 < s.text.len && ch == `\\` {
peek := s.text[s.pos + 1]
@ -154,8 +154,15 @@ fn (mut s Scanner) text_scan() Token {
if codepoint.len != 4 {
return s.error('unicode escape must have 4 hex digits')
}
chrs << byte(strconv.parse_uint(codepoint.bytestr(), 16, 32))
unsafe { codepoint.free() }
val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32))
converted := utf32_to_str(val)
converted_bytes := converted.bytes()
chrs << converted_bytes
unsafe {
converted.free()
converted_bytes.free()
codepoint.free()
}
continue
} else {
return s.error('incomplete unicode escape')

View File

@ -20,6 +20,16 @@ fn test_str_valid_unicode_escape() {
assert tok.lit.bytestr() == 'H'
}
// A \uXXXX escape above U+007F must be decoded into its multi-byte UTF-8
// form: U+2714 is encoded as the three bytes E2 9C 94.
fn test_str_valid_unicode_escape_2() {
	mut sc := Scanner{
		text: r'"\u2714"'.bytes()
	}
	tok := sc.scan()
	assert tok.kind == .str_
	assert tok.lit.len == 3
	assert tok.lit.bytestr() == '✔'
}
fn test_str_invalid_escape() {
mut sc := Scanner{
text: r'"\z"'.bytes()