x.json2: proper string encoding + minor fixes (#9026)

2021-03-01 17:22:36 +08:00 · 2021-03-01 17:22:36 +08:00 · ee879f3e41
parent 506041a15b
commit ee879f3e41
4 changed files with 129 additions and 8 deletions
--- a/vlib/x/json2/encoder.v
+++ b/vlib/x/json2/encoder.v
@ -6,7 +6,7 @@ module json2
 import strings
 fn write_value(v Any, i int, len int, mut wr strings.Builder) {
-	str := v.str()
+	str := v.json_str()
 	if v is string {
 		wr.write_string('"$str"')
 	} else {
@ -51,11 +51,21 @@ pub fn (flds []Any) str() string {
 	return res
 }
-// str returns the string representation of the `Any` type.
+// str returns the string representation of the `Any` type. Use the `json_str` method
 // if you want to use the escaped str() version of the `Any` type.
 pub fn (f Any) str() string {
 	// A string value is returned unchanged (no JSON escaping); every other
 	// variant is rendered through json_str.
 	match f {
 		string {
 			return f
 		}
 		else {
 			return f.json_str()
 		}
 	}
 }
 // json_str returns the JSON string representation of the `Any` type.
 pub fn (f Any) json_str() string {
 	match f {
 		string {
-			return f
+			return json_string(f)
 		}
 		int {
 			return f.str()
@ -85,3 +95,76 @@ pub fn (f Any) str() string {
 		}
 	}
 }
 // char_len_list is a modified version of builtin.utf8_str_len
 // that returns an array of character lengths in bytes. (e.g "t✔" => [1,3])
 // The length of each character is taken from the high bits of its first byte.
 // A length that would run past the end of `s` (truncated or invalid input)
 // is clamped to the bytes that remain, so the lengths always sum to s.len.
 fn char_len_list(s string) []int {
 	mut ls := []int{}
 	mut i := 0
 	for i < s.len {
 		c := s[i]
 		mut l := 1
 		if (c & (1 << 7)) != 0 {
 			// count one extra byte for every consecutive set bit after the top bit
 			for t := byte(1 << 6); (c & t) != 0; t >>= 1 {
 				l++
 			}
 		}
 		remaining := s.len - i
 		if l > remaining {
 			// never report more bytes than the string actually holds
 			l = remaining
 		}
 		ls << l
 		i += l
 	}
 	return ls
 }
 // escaped_chars holds the escape sequences written by json_string. Entry j is
 // the replacement for json2.important_escapable_chars[j], so both lists must
 // keep the same order.
 const escaped_chars = [r'\b', r'\f', r'\n', r'\r', r'\t']
 // json_string returns the JSON spec-compliant version of the string.
 // Single-byte characters:
 //   - characters listed in json2.important_escapable_chars are replaced by
 //     the matching entry of escaped_chars,
 //   - `"`, `/` and `\` are prefixed with a backslash,
 //   - any other control character below 0x20 is written as `\u00XX`,
 //   - everything else is copied as-is.
 // Multi-byte characters are written as `\uXXXX`, zero-padded to 4 hex digits.
 // Code points that need more than 4 hex digits are still replaced by a
 // single space.
 [manualfree]
 fn json_string(s string) string {
 	// not the best implementation but will revisit it soon
 	char_lens := char_len_list(s)
 	mut sb := strings.new_builder(s.len)
 	mut i := 0
 	defer {
 		unsafe {
 			char_lens.free()
 			// freeing string builder on defer after
 			// returning .str() still isn't working :(
 			// sb.free()
 		}
 	}
 	for char_len in char_lens {
 		if char_len == 1 {
 			chr := s[i]
 			if chr in json2.important_escapable_chars {
 				for j := 0; j < json2.important_escapable_chars.len; j++ {
 					if chr == json2.important_escapable_chars[j] {
 						sb.write_string(escaped_chars[j])
 						break
 					}
 				}
 			} else if chr == `"` || chr == `/` || chr == `\\` {
 				sb.write_string('\\' + chr.ascii_str())
 			} else if chr < 0x20 {
 				// JSON forbids raw control characters inside strings
 				ctrl_hex := chr.hex()
 				sb.write_string('\\u00')
 				sb.write_string(ctrl_hex)
 				unsafe { ctrl_hex.free() }
 			} else {
 				sb.write_b(chr)
 			}
 		} else {
 			// clamp so a truncated trailing sequence can not slice out of bounds
 			end := if i + char_len > s.len { s.len } else { i + char_len }
 			slice := s[i..end]
 			hex_code := slice.utf32_code().hex()
 			if hex_code.len <= 4 {
 				// hex() does not pad, so U+0080..U+0FFF need leading zeroes
 				sb.write_string('\\u')
 				for _ in hex_code.len .. 4 {
 					sb.write_b(`0`)
 				}
 				sb.write_string(hex_code)
 			} else {
 				// TODO: still figuring out what
 				// to do with more than 4 chars
 				sb.write_b(` `)
 			}
 			unsafe {
 				slice.free()
 				hex_code.free()
 			}
 		}
 		i += char_len
 	}
 	str := sb.str()
 	unsafe { sb.free() }
 	return str
 }
--- a/vlib/x/json2/encoder_test.v
+++ b/vlib/x/json2/encoder_test.v
@ -0,0 +1,21 @@
 import x.json2
 // Decodes a JSON string made of every short escape and checks that
 // json_str escapes each character again.
 fn test_json_string_characters() {
 	decoded := json2.raw_decode(r'"\n\r\b\f\t\\\"\/"') or { '' }
 	encoded := decoded.json_str()
 	assert encoded == '\\n\\r\\b\\f\\t\\\\\\"\\/'
 }
 // A 3-byte character inside ASCII text is written as a \u escape.
 fn test_json_string() {
 	value := json2.Any('te✔st')
 	encoded := value.json_str()
 	assert encoded == r'te\u2714st'
 }
 // A character whose code point needs more than 4 hex digits is currently
 // replaced by a single space.
 fn test_json_string_emoji() {
 	value := json2.Any('🐈')
 	encoded := value.json_str()
 	assert encoded == r' '
 }
 // Every character of a non-ASCII string gets its own \u escape.
 fn test_json_string_non_ascii() {
 	value := json2.Any('ひらがな')
 	encoded := value.json_str()
 	assert encoded == r'\u3072\u3089\u304c\u306a'
 }
--- a/vlib/x/json2/scanner.v
+++ b/vlib/x/json2/scanner.v
@ -41,12 +41,12 @@ const (
 	// list of characters commonly used in JSON.
 	char_list                 = [`{`, `}`, `[`, `]`, `,`, `:`]
 	// list of newlines to check when moving to a new position.
-	newlines                  = [`\r`, `\n`, byte(9), `\t`]
+	newlines                  = [`\r`, `\n`, `\t`]
 	// list of characters that need to be escaped inside a JSON string.
 	// double quotes and forward slashes are excluded intentionally since
 	// they have their own separate checks for it in order to pass the
 	// JSON test suite (https://github.com/nst/JSONTestSuite/).
-	important_escapable_chars = [byte(9), 10, 0, `\b`, `\f`, `\n`, `\r`, `\t`]
+	important_escapable_chars = [`\b`, `\f`, `\n`, `\r`, `\t`]
 	// list of valid unicode escapes aside from \u{4-hex digits}
 	valid_unicode_escapes     = [`b`, `f`, `n`, `r`, `t`, `\\`, `"`, `/`]
 	// used for transforming escapes into valid unicode (eg. n => \n)
@ -129,7 +129,7 @@ fn (mut s Scanner) text_scan() Token {
 		} else if (s.pos - 1 >= 0 && s.text[s.pos - 1] != `\\`)
 			&& ch in json2.important_escapable_chars {
 			return s.error('character must be escaped with a backslash')
-		} else if s.pos == s.text.len - 1 && ch == `\\` {
+		} else if (s.pos == s.text.len - 1 && ch == `\\`) || ch == byte(0) {
 			return s.error('invalid backslash escape')
 		} else if s.pos + 1 < s.text.len && ch == `\\` {
 			peek := s.text[s.pos + 1]
@ -154,8 +154,15 @@ fn (mut s Scanner) text_scan() Token {
 					if codepoint.len != 4 {
 						return s.error('unicode escape must have 4 hex digits')
 					}
-					chrs << byte(strconv.parse_uint(codepoint.bytestr(), 16, 32))
+					val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32))
-					unsafe { codepoint.free() }
+					converted := utf32_to_str(val)
 					converted_bytes := converted.bytes()
 					chrs << converted_bytes
 					unsafe {
 						converted.free()
 						converted_bytes.free()
 						codepoint.free()
 					}
 					continue
 				} else {
 					return s.error('incomplete unicode escape')
--- a/vlib/x/json2/scanner_test.v
+++ b/vlib/x/json2/scanner_test.v
@ -20,6 +20,16 @@ fn test_str_valid_unicode_escape() {
 	assert tok.lit.bytestr() == 'H'
 }
 // A \u escape outside the single-byte range is scanned into the
 // 3 bytes of the matching character.
 fn test_str_valid_unicode_escape_2() {
 	mut scanner := Scanner{
 		text: r'"\u2714"'.bytes()
 	}
 	token := scanner.scan()
 	assert token.kind == .str_
 	assert token.lit.len == 3
 	assert token.lit.bytestr() == '✔'
 }
 fn test_str_invalid_escape() {
 	mut sc := Scanner{
 		text: r'"\z"'.bytes()