// Copyright (c) 2019-2020 Alexander Medvednikov. All rights reserved.
// Use of this source code is governed by an MIT license
// that can be found in the LICENSE file.
module json2

import strings
import strconv
import v.scanner
import v.token
import v.util
import v.pref

// `Any` is a sum type that lists the possible types to be decoded and used.
pub type Any = string | int | i64 | f32 | f64 | any_int | any_float | bool | Null | []Any | map[string]Any
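
// For example, decoding the input `{"name":"Bob","age":20}` with type
// conversion enabled is expected to yield a `map[string]Any` whose `name`
// value is a `string` and whose `age` value is an `i64`
// (see `decode_object` and `decode_number` below).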

// `Null` struct is a simple representation of the `null` value in JSON.
pub struct Null {
}

enum ParseMode {
	array
	bool
	invalid
	null
	number
	object
	string
}

const (
	formfeed_err = 'formfeed not allowed.'
	eof_err      = 'reached eof. data not closed properly.'
)

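// Parser keeps a three-token window over the scanner output: the previous
// token (`p_tok`), the current token (`tok`), and the next token (`n_tok`),
// so look-behind and look-ahead checks can be done without re-scanning.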
struct Parser {
mut:
	scanner      &scanner.Scanner
	p_tok        token.Token
	tok          token.Token
	n_tok        token.Token
	mode         ParseMode = .invalid
	n_level      int
	convert_type bool = true
}

fn (mut p Parser) next() {
	p.p_tok = p.tok
	p.tok = p.n_tok
	p.n_tok = p.scanner.scan()
}

fn (p Parser) emit_error(msg string) string {
	source := p.scanner.text
	cur := p.tok
	mut pp := util.imax(0, util.imin(source.len - 1, cur.pos))
	if source.len > 0 {
		for pp >= 0 {
			if source[pp] !in [`\r`, `\n`] {
				pp--
				continue
			}
			break
		}
	}
	column := util.imax(0, cur.pos - pp + cur.len - 1)
	line := cur.line_nr
	return '[json] $msg ($line:$column)'
}

fn new_parser(srce string, convert_type bool) Parser {
	mut src := srce
	// from v/util/util.v
	if src.len >= 3 {
		c_text := src.str
		unsafe {
			if c_text[0] == 0xEF && c_text[1] == 0xBB && c_text[2] == 0xBF {
				// skip three BOM bytes
				offset_from_begin := 3
				src = tos(c_text[offset_from_begin], vstrlen(c_text) - offset_from_begin)
			}
		}
	}
	return Parser{
		scanner: scanner.new_scanner(src, .parse_comments, &pref.Preferences{})
		convert_type: convert_type
	}
}
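
// As a rough usage sketch (the `example_decode` helper below is hypothetical
// and not part of the module API): a parser is built with `new_parser` and
// driven with `decode`, returning an `Any` value or an error.
fn example_decode(src string) ?Any {
	mut p := new_parser(src, true)
	res := p.decode() ?
	return res
}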

fn check_valid_hex(str string) ? {
	if str.len != 4 {
		return error('hex string must be 4 characters.')
	}
	for l in str {
		if l.is_hex_digit() {
			continue
		}
		return error('provided string is not a hex digit.')
	}
}

fn (mut p Parser) decode() ?Any {
	p.detect_parse_mode()
	if p.mode == .invalid {
		return error(p.emit_error('invalid JSON.'))
	}
	fi := p.decode_value() or {
		return error(p.emit_error(err))
	}
	if p.tok.kind != .eof {
		return error(p.emit_error('unknown token `$p.tok.kind`.'))
	}
	return fi
}

fn (p Parser) is_formfeed() bool {
	prev_tok_pos := p.p_tok.pos + p.p_tok.len - 2
	if prev_tok_pos < p.scanner.text.len && p.scanner.text[prev_tok_pos] == 0x0c {
		return true
	}
	return false
}

fn (p Parser) is_singlequote() bool {
	src := p.scanner.text
	prev_tok_pos := p.p_tok.pos + p.p_tok.len
	return src[prev_tok_pos] == `\'`
}

fn (mut p Parser) detect_parse_mode() {
	src := p.scanner.text
	if src.len > 1 && src[0].is_digit() && !src[1].is_digit() {
		p.mode = .invalid
		return
	}
	p.tok = p.scanner.scan()
	p.n_tok = p.scanner.scan()
	if src.len == 1 && p.tok.kind == .string && p.n_tok.kind == .eof {
		p.mode = .invalid
		return
	}
	match p.tok.kind {
		.lcbr {
			p.mode = .object
		}
		.lsbr {
			p.mode = .array
		}
		.number {
			p.mode = .number
		}
		.key_true, .key_false {
			p.mode = .bool
		}
		.string {
			p.mode = .string
		}
		.name {
			if p.tok.lit == 'null' {
				p.mode = .null
			}
		}
		.minus {
			if p.n_tok.kind == .number {
				p.mode = .number
			}
		}
		else {}
	}
}

fn (mut p Parser) decode_value() ?Any {
	if p.n_level == 500 {
		return error('reached maximum nesting level of 500.')
	}
	if (p.tok.kind == .lsbr && p.n_tok.kind == .lcbr) ||
		(p.p_tok.kind == p.tok.kind && p.tok.kind == .lsbr) {
		p.n_level++
	}
	match p.tok.kind {
		.lsbr {
			return p.decode_array()
		}
		.lcbr {
			return p.decode_object()
		}
		.number {
			return p.decode_number()
		}
		.key_true {
			p.next()
			return if p.convert_type {
				Any(true)
			} else {
				Any('true')
			}
		}
		.key_false {
			p.next()
			return if p.convert_type {
				Any(false)
			} else {
				Any('false')
			}
		}
		.name {
			if p.tok.lit != 'null' {
				return error('unknown identifier `$p.tok.lit`')
			}
			p.next()
			return if p.convert_type {
				Any(Null{})
			} else {
				Any('null')
			}
		}
		.string {
			if p.is_singlequote() {
				return error('strings must be in double-quotes.')
			}
			return p.decode_string()
		}
		else {
			if p.tok.kind == .minus && p.n_tok.kind == .number && p.n_tok.pos == p.tok.pos + 1 {
				p.next()
				d_num := p.decode_number() ?
				return d_num
			}
			return error("unknown token '$p.tok.lit' when decoding value")
		}
	}
	if p.is_formfeed() {
		return error(formfeed_err)
	}
	return Any{}
}

fn (mut p Parser) decode_string() ?Any {
	mut strwr := strings.new_builder(200)
	for i := 0; i < p.tok.lit.len; i++ {
		if ((i - 1 >= 0 && p.tok.lit[i - 1] != `/`) || i == 0) && int(p.tok.lit[i]) in [9, 10, 0] {
			return error('character must be escaped with a backslash.')
		}
		if i == p.tok.lit.len - 1 && p.tok.lit[i] == 92 {
			return error('invalid backslash escape.')
		}
		if i + 1 < p.tok.lit.len && p.tok.lit[i] == 92 {
			peek := p.tok.lit[i + 1]
			match peek {
				`b` {
					i++
					strwr.write_b(`\b`)
					continue
				}
				`f` {
					i++
					strwr.write_b(`\f`)
					continue
				}
				`n` {
					i++
					strwr.write_b(`\n`)
					continue
				}
				`r` {
					i++
					strwr.write_b(`\r`)
					continue
				}
				`t` {
					i++
					strwr.write_b(`\t`)
					continue
				}
				`u` {
					if i + 5 < p.tok.lit.len {
						codepoint := p.tok.lit[i + 2..i + 6]
						check_valid_hex(codepoint) ?
						hex_val := strconv.parse_int(codepoint, 16, 0)
						strwr.write_b(byte(hex_val))
						i += 5
						continue
					} else {
						return error('incomplete unicode escape.')
					}
				}
				`\\` {
					i++
					strwr.write_b(`\\`)
					continue
				}
				`"` {
					i++
					strwr.write_b(`\"`)
					continue
				}
				`/` {
					i++
					strwr.write_b(`/`)
					continue
				}
				else { return error('invalid backslash escape.') }
			}
			if int(peek) == 85 {
				return error('unicode endpoints must be in lowercase `u`.')
			}
			if int(peek) in [9, 229] {
				return error('unicode endpoint not allowed.')
			}
		}
		strwr.write_b(p.tok.lit[i])
	}
	p.next()
	defer {
		strwr.free()
	}
	str := strwr.str()
	return Any(str)
}
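
// For instance, a `\t` escape in the JSON source is expected to be written
// out as a real tab byte, while a string ending in a lone backslash is
// rejected with the 'invalid backslash escape.' error above.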

// decode_number returns the number as a string when `convert_type` is
// disabled, otherwise it converts the literal to an `i64` or `f64`.
fn (mut p Parser) decode_number() ?Any {
	src := p.scanner.text
	mut tl := p.tok.lit
	mut is_fl := false
	sep_by_dot := tl.to_lower().split('.')
	if tl.starts_with('0x') && tl.all_after('0x').len <= 2 {
		return error('hex numbers should not be less than or equal to two digits.')
	}
	if src[p.p_tok.pos + p.p_tok.len] == `0` && src[p.p_tok.pos + p.p_tok.len + 1].is_digit() {
		return error('leading zeroes in integers are not allowed.')
	}
	if tl.starts_with('.') {
		return error('decimals must start with a digit followed by a dot.')
	}
	if tl.ends_with('+') || tl.ends_with('-') {
		return error('exponents must have a digit before the sign.')
	}
	if sep_by_dot.len > 1 {
		// analyze json number structure
		// -[digit][dot][digit][E/e][-/+][digit]
		// float number
		is_fl = true
		last := sep_by_dot.last()
		if last.starts_with('e') {
			return error('exponents must have a digit before the exponent notation.')
		}
	}
	if p.p_tok.kind == .minus && p.tok.pos == p.p_tok.pos + 1 {
		tl = '-$tl'
	}
	p.next()
	if p.convert_type {
		return if is_fl {
			Any(tl.f64())
		} else {
			Any(tl.i64())
		}
	}
	return Any(tl)
}
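
// As a rough illustration of `convert_type`: decoding `42` is expected to
// produce Any(i64(42)) and `3.14` to produce Any(f64(3.14)) when conversion
// is enabled, while both stay Any('42') and Any('3.14') when it is disabled.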

fn (mut p Parser) decode_array() ?Any {
	mut items := []Any{}
	p.next()
	for p.tok.kind != .rsbr {
		if p.tok.kind == .eof {
			return error(eof_err)
		}
		item := p.decode_value() ?
		items << item
		if p.tok.kind == .comma && p.n_tok.kind !in [.rsbr, .comma] {
			p.next()
			continue
		}
		if p.tok.kind == .rsbr {
			break
		}
		return error("unknown token '$p.tok.lit' when decoding arrays.")
	}
	p.next()
	return Any(items)
}

fn (mut p Parser) decode_object() ?Any {
	mut fields := map[string]Any{}
	mut cur_key := ''
	p.next()
	for p.tok.kind != .rcbr {
		is_key := p.tok.kind == .string && p.n_tok.kind == .colon
		// todo
		// if p.is_formfeed() {
		// return error(formfeed_err)
		// }
		if p.tok.kind == .eof {
			return error(eof_err)
		}
		if p.is_singlequote() {
			return error('object keys must be in double quotes.')
		}
		if !is_key {
			return error("invalid token `$p.tok.lit`, expected \'string\'")
		}
		cur_key = p.tok.lit
		p.next()
		p.next()
		fields[cur_key] = p.decode_value() ?
		if p.tok.kind == .comma && p.n_tok.kind !in [.rcbr, .comma] {
			p.next()
			continue
		} else if p.tok.kind == .rcbr {
			break
		}
		return error("unknown token '$p.tok.lit' when decoding object.")
	}
	p.next()
	return Any(fields)
}