From ee879f3e41c6552730b526868f30b2dfacaf3715 Mon Sep 17 00:00:00 2001 From: Ned Palacios <7358345+nedpals@users.noreply.github.com> Date: Mon, 1 Mar 2021 17:22:36 +0800 Subject: [PATCH] x.json2: proper string encoding + minor fixes (#9026) --- vlib/x/json2/encoder.v | 89 +++++++++++++++++++++++++++++++++++-- vlib/x/json2/encoder_test.v | 21 +++++++++ vlib/x/json2/scanner.v | 17 ++++--- vlib/x/json2/scanner_test.v | 10 +++++ 4 files changed, 129 insertions(+), 8 deletions(-) create mode 100644 vlib/x/json2/encoder_test.v diff --git a/vlib/x/json2/encoder.v b/vlib/x/json2/encoder.v index 0476231a03..cb3f51f2b2 100644 --- a/vlib/x/json2/encoder.v +++ b/vlib/x/json2/encoder.v @@ -6,7 +6,7 @@ module json2 import strings fn write_value(v Any, i int, len int, mut wr strings.Builder) { - str := v.str() + str := v.json_str() if v is string { wr.write_string('"$str"') } else { @@ -51,11 +51,21 @@ pub fn (flds []Any) str() string { return res } -// str returns the string representation of the `Any` type. +// str returns the string representation of the `Any` type. Use the `json_str` method +// if you want to use the escaped str() version of the `Any` type. pub fn (f Any) str() string { + if f is string { + return f + } else { + return f.json_str() + } +} + +// json_str returns the JSON string representation of the `Any` type. +pub fn (f Any) json_str() string { match f { string { - return f + return json_string(f) } int { return f.str() @@ -85,3 +95,76 @@ pub fn (f Any) str() string { } } } + +// char_len_list is a modified version of builtin.utf8_str_len +// that returns an array of character lengths. 
(e.g "t✔" => [1,2]) +fn char_len_list(s string) []int { + mut l := 1 + mut ls := []int{} + for i := 0; i < s.len; i++ { + c := s[i] + if (c & (1 << 7)) != 0 { + for t := byte(1 << 6); (c & t) != 0; t >>= 1 { + l++ + i++ + } + } + ls << l + l = 1 + } + return ls +} + +const escaped_chars = [r'\b', r'\f', r'\n', r'\r', r'\t'] + +// json_string returns the JSON spec-compliant version of the string. +[manualfree] +fn json_string(s string) string { + // not the best implementation but will revisit it soon + char_lens := char_len_list(s) + mut sb := strings.new_builder(s.len) + mut i := 0 + defer { + unsafe { + char_lens.free() + // freeing string builder on defer after + // returning .str() still isn't working :( + // sb.free() + } + } + for char_len in char_lens { + if char_len == 1 { + chr := s[i] + if chr in json2.important_escapable_chars { + for j := 0 ; j < json2.important_escapable_chars.len; j++ { + if chr == json2.important_escapable_chars[j] { + sb.write_string(escaped_chars[j]) + break + } + } + } else if chr == `"` || chr == `/` || chr == `\\` { + sb.write_string('\\' + chr.ascii_str()) + } else { + sb.write_b(chr) + } + } else { + slice := s[i .. 
i + char_len] + hex_code := slice.utf32_code().hex() + if hex_code.len == 4 { + sb.write_string('\\u$hex_code') + } else { + // TODO: still figuring out what + // to do with more than 4 chars + sb.write_b(` `) + } + unsafe { + slice.free() + hex_code.free() + } + } + i += char_len + } + str := sb.str() + unsafe { sb.free() } + return str +} diff --git a/vlib/x/json2/encoder_test.v b/vlib/x/json2/encoder_test.v new file mode 100644 index 0000000000..bb506752a8 --- /dev/null +++ b/vlib/x/json2/encoder_test.v @@ -0,0 +1,21 @@ +import x.json2 + +fn test_json_string_characters() { + text := json2.raw_decode(r'"\n\r\b\f\t\\\"\/"') or { '' } + assert text.json_str() == '\\n\\r\\b\\f\\t\\\\\\"\\/' +} + +fn test_json_string() { + text := json2.Any('te✔st') + assert text.json_str() == r'te\u2714st' +} + +fn test_json_string_emoji() { + text := json2.Any('🐈') + assert text.json_str() == r' ' +} + +fn test_json_string_non_ascii() { + text := json2.Any('ひらがな') + assert text.json_str() == r'\u3072\u3089\u304c\u306a' +} diff --git a/vlib/x/json2/scanner.v b/vlib/x/json2/scanner.v index 20aad798e8..82144753b0 100644 --- a/vlib/x/json2/scanner.v +++ b/vlib/x/json2/scanner.v @@ -41,12 +41,12 @@ const ( // list of characters commonly used in JSON. char_list = [`{`, `}`, `[`, `]`, `,`, `:`] // list of newlines to check when moving to a new position. - newlines = [`\r`, `\n`, byte(9), `\t`] + newlines = [`\r`, `\n`, `\t`] // list of escapable that needs to be escaped inside a JSON string. // double quotes and forward slashes are excluded intentionally since // they have their own separate checks for it in order to pass the // JSON test suite (https://github.com/nst/JSONTestSuite/). 
- important_escapable_chars = [byte(9), 10, 0, `\b`, `\f`, `\n`, `\r`, `\t`] + important_escapable_chars = [`\b`, `\f`, `\n`, `\r`, `\t`] // list of valid unicode escapes aside from \u{4-hex digits} valid_unicode_escapes = [`b`, `f`, `n`, `r`, `t`, `\\`, `"`, `/`] // used for transforming escapes into valid unicode (eg. n => \n) @@ -129,7 +129,7 @@ fn (mut s Scanner) text_scan() Token { } else if (s.pos - 1 >= 0 && s.text[s.pos - 1] != `\\`) && ch in json2.important_escapable_chars { return s.error('character must be escaped with a backslash') - } else if s.pos == s.text.len - 1 && ch == `\\` { + } else if (s.pos == s.text.len - 1 && ch == `\\`) || ch == byte(0) { return s.error('invalid backslash escape') } else if s.pos + 1 < s.text.len && ch == `\\` { peek := s.text[s.pos + 1] @@ -154,8 +154,15 @@ fn (mut s Scanner) text_scan() Token { if codepoint.len != 4 { return s.error('unicode escape must have 4 hex digits') } - chrs << byte(strconv.parse_uint(codepoint.bytestr(), 16, 32)) - unsafe { codepoint.free() } + val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32)) + converted := utf32_to_str(val) + converted_bytes := converted.bytes() + chrs << converted_bytes + unsafe { + converted.free() + converted_bytes.free() + codepoint.free() + } continue } else { return s.error('incomplete unicode escape') diff --git a/vlib/x/json2/scanner_test.v b/vlib/x/json2/scanner_test.v index 5e1686b760..935f3be8fc 100644 --- a/vlib/x/json2/scanner_test.v +++ b/vlib/x/json2/scanner_test.v @@ -20,6 +20,16 @@ fn test_str_valid_unicode_escape() { assert tok.lit.bytestr() == 'H' } +fn test_str_valid_unicode_escape_2() { + mut sc := Scanner{ + text: r'"\u2714"'.bytes() + } + tok := sc.scan() + assert tok.kind == .str_ + assert tok.lit.len == 3 + assert tok.lit.bytestr() == '✔' +} + fn test_str_invalid_escape() { mut sc := Scanner{ text: r'"\z"'.bytes()