312 lines
		
	
	
		
			7.9 KiB
		
	
	
	
		
			V
		
	
	
			
		
		
	
	
			312 lines
		
	
	
		
			7.9 KiB
		
	
	
	
		
			V
		
	
	
// Copyright (c) 2019-2022 Alexander Medvednikov. All rights reserved.
 | 
						|
// Use of this source code is governed by an MIT license
 | 
						|
// that can be found in the LICENSE file.
 | 
						|
module json2
 | 
						|
 | 
						|
import strconv
 | 
						|
 | 
						|
// Scanner holds the raw JSON input together with the cursor state that
// the scanning methods advance while producing tokens.
struct Scanner {
mut:
	// raw bytes of the document being scanned
	text []byte
	// index into `text` of the byte currently under the cursor
	pos int
	// line counter; move_pos bumps it when it lands on a `newlines` byte
	line int
	// column counter; move_pos resets it to 0 on a `newlines` byte
	col int
}
 | 
						|
 | 
						|
// TokenKind tells what a scanned token represents. The punctuation kinds
// are numbered with the ASCII code of their character, which lets scan
// cast the byte it reads directly into the matching kind.
enum TokenKind {
	none_
	// the token's literal carries the error description
	error
	str_
	float
	int_
	null
	bool_
	eof
	// `,`
	comma = 44
	// `:`
	colon = 58
	// `[`
	lsbr = 91
	// `]`
	rsbr = 93
	// `{`
	lcbr = 123
	// `}`
	rcbr = 125
}
 | 
						|
 | 
						|
// Token is a single lexical unit produced by the Scanner.
pub struct Token {
	// literal bytes of the token (the message text for `.error` tokens)
	lit []byte
	// category of the token
	kind TokenKind
	// scanner line counter at the moment the token was created
	line int
	// scanner column counter at the moment the token was created
	col int
}
 | 
						|
 | 
						|
// full_col reports the token's column offset by the byte length of its
// literal, i.e. `col + lit.len`.
pub fn (t Token) full_col() int {
	lit_width := t.lit.len
	return t.col + lit_width
}
 | 
						|
 | 
						|
const (
	// structural characters of JSON; scan turns each one into the
	// TokenKind that has the same numeric value.
	char_list                 = [`{`, `}`, `[`, `]`, `,`, `:`]
	// characters that move_pos treats as a line break. Note that the
	// horizontal tab is part of this list as well.
	newlines                  = [`\r`, `\n`, `\t`]
	// control characters that may not appear unescaped inside a JSON
	// string. Double quotes and forward slashes are left out on purpose:
	// they are checked separately so that the scanner passes the
	// JSON test suite (https://github.com/nst/JSONTestSuite/).
	important_escapable_chars = [`\b`, `\f`, `\n`, `\r`, `\t`]
	// characters accepted right after a backslash, in addition to the
	// `u` that introduces a 4-hex-digit unicode escape.
	valid_unicode_escapes     = [`b`, `f`, `n`, `r`, `t`, `\\`, `"`, `/`]
	// maps the character code that follows a backslash to the character
	// the escape stands for (eg. n => \n).
	unicode_transform_escapes = {
		98:  `\b`
		102: `\f`
		110: `\n`
		114: `\r`
		116: `\t`
		92:  `\\`
		34:  `"`
		47:  `/`
	}
	// signs that may follow the `e`/`E` of a number's exponent.
	exp_signs = [byte(`-`), `+`]
)
 | 
						|
 | 
						|
// move proceeds to the next position. It is a shorthand for move_pos with
// both space skipping and newline checking enabled.
fn (mut s Scanner) move() {
	s.move_pos(true, true)
}
 | 
						|
 | 
						|
// move_pos_with_newlines advances like move_pos with newline checking
// turned on and space skipping turned off.
fn (mut s Scanner) move_pos_with_newlines() {
	s.move_pos(false, true)
}
 | 
						|
 | 
						|
// move_pos advances the cursor by one byte and then reacts to the byte it
// landed on:
// - past the end of the input: only the column counter is bumped;
// - a `newlines` byte (when `include_newlines` is set): the line counter
//   is bumped, the column is reset, the `\n` of a `\r\n` pair is stepped
//   over and move is called for as long as the cursor rests on a
//   `newlines` byte;
// - a space (when `include_space` is set): the space is stepped over, the
//   column is bumped and move is called for as long as the cursor rests
//   on a space.
fn (mut s Scanner) move_pos(include_space bool, include_newlines bool) {
	s.pos++
	if s.pos >= s.text.len {
		s.col++
		return
	}
	landed := s.text[s.pos]
	if include_newlines && landed in json2.newlines {
		s.line++
		s.col = 0
		// treat `\r\n` as one line break
		if landed == `\r` && s.pos + 1 < s.text.len && s.text[s.pos + 1] == `\n` {
			s.pos++
		}
		for s.pos < s.text.len && s.text[s.pos] in json2.newlines {
			s.move()
		}
		return
	}
	if include_space && landed == ` ` {
		s.pos++
		s.col++
		for s.pos < s.text.len && s.text[s.pos] == ` ` {
			s.move()
		}
	}
}
 | 
						|
 | 
						|
// error builds an `.error` token whose literal is the given description,
// stamped with the scanner's current line and column.
fn (s Scanner) error(description string) Token {
	message := description.bytes()
	return s.tokenize(message, .error)
}
 | 
						|
 | 
						|
// tokenize wraps `lit` and `kind` into a Token and records the scanner's
// current line and column on it.
fn (s Scanner) tokenize(lit []byte, kind TokenKind) Token {
	tok := Token{
		lit: lit
		kind: kind
		line: s.line
		col: s.col
	}
	return tok
}
 | 
						|
 | 
						|
// text_scan scans a JSON string and returns a `.str_` token whose literal
// holds the decoded bytes (escape sequences already replaced by the
// characters they stand for). The cursor is expected to be on the opening
// double quote. An `.error` token is returned when the string contains an
// unescaped control character, an invalid escape, or is never closed.
[manualfree]
fn (mut s Scanner) text_scan() Token {
	mut has_closed := false
	mut chrs := []byte{}
	for {
		s.pos++
		s.col++
		if s.pos >= s.text.len {
			break
		}
		ch := s.text[s.pos]
		// Escape sequences are consumed as a whole by the backslash branch
		// below, so `ch` is never the second half of an escape at this
		// point. Inspecting the previous byte instead would mistake the
		// closing quote of a string that ends in `\\` for an escaped quote
		// and would let a raw control character slip through after `\\`.
		if ch == `"` {
			has_closed = true
			break
		} else if ch in json2.important_escapable_chars {
			return s.error('character must be escaped with a backslash')
		} else if (s.pos == s.text.len - 1 && ch == `\\`) || ch == byte(0) {
			return s.error('invalid backslash escape')
		} else if s.pos + 1 < s.text.len && ch == `\\` {
			peek := s.text[s.pos + 1]
			if peek in json2.valid_unicode_escapes {
				// single character escape: emit the character it stands for
				// and step over the escaped byte.
				chrs << json2.unicode_transform_escapes[int(peek)]
				s.pos++
				s.col++
				continue
			} else if peek == `u` {
				// `\uXXXX`: the backslash, the `u`, four hex digits and at
				// least one more byte must still be available.
				if s.pos + 5 < s.text.len {
					s.pos++
					s.col++
					mut codepoint := []byte{}
					codepoint_start := s.pos
					for s.pos < s.text.len && s.pos < codepoint_start + 4 {
						s.pos++
						s.col++
						if s.text[s.pos] == `"` {
							break
						} else if !s.text[s.pos].is_hex_digit() {
							x := s.text[s.pos].ascii_str()
							return s.error('`$x` is not a hex digit')
						}
						codepoint << s.text[s.pos]
					}
					if codepoint.len != 4 {
						return s.error('unicode escape must have 4 hex digits')
					}
					val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32) or { 0 })
					converted := utf32_to_str(val)
					converted_bytes := converted.bytes()
					chrs << converted_bytes
					// the function is [manualfree]: release the temporaries
					// now that their bytes were copied into `chrs`.
					unsafe {
						converted.free()
						converted_bytes.free()
						codepoint.free()
					}
					continue
				} else {
					return s.error('incomplete unicode escape')
				}
			} else if peek == `U` {
				return s.error('unicode endpoints must be in lowercase `u`')
			} else if peek == byte(229) {
				return s.error('unicode endpoint not allowed')
			} else {
				return s.error('invalid backslash escape')
			}
		}
		chrs << ch
	}
	// the token is created before moving on so that it carries the
	// position of the end of the string.
	tok := s.tokenize(chrs, .str_)
	s.move()
	if !has_closed {
		return s.error('missing double quotes in string closing')
	}
	return tok
}
 | 
						|
 | 
						|
// num_scan scans a JSON number and returns an `.int_` token, or a `.float`
// token when the number contains a decimal point. The cursor is expected
// to be on the first character of the number (a digit or `-`). An `.error`
// token is returned for malformed numbers.
fn (mut s Scanner) num_scan() Token {
	// analyze json number structure
	// -[digit][?[dot][digit]][?[E/e][?-/+][digit]]
	mut is_fl := false
	mut dot_index := -1
	mut digits := []byte{}
	if s.text[s.pos] == `-` {
		digits << `-`
		// a minus sign must be followed by a digit; the length check keeps
		// a lone `-` at the end of the input from indexing past the text.
		if s.pos + 1 >= s.text.len || !s.text[s.pos + 1].is_digit() {
			return s.invalid_token()
		}
		s.move_pos_with_newlines()
	}
	if s.text[s.pos] == `0` && (s.pos + 1 < s.text.len && s.text[s.pos + 1].is_digit()) {
		return s.error('leading zeroes in a number are not allowed')
	}
	// integer part and, at most once, a dot followed by the fraction
	for s.pos < s.text.len && (s.text[s.pos].is_digit() || (!is_fl && s.text[s.pos] == `.`)) {
		digits << s.text[s.pos]
		if s.text[s.pos] == `.` {
			is_fl = true
			dot_index = digits.len - 1
		}
		s.move_pos_with_newlines()
	}
	// a dot must be followed by at least one digit. The check looks only
	// at the collected digits so that it also fires when the dot is the
	// very last byte of the input.
	if is_fl && dot_index + 1 == digits.len {
		return s.error('invalid float')
	}
	if s.pos < s.text.len && (s.text[s.pos] == `e` || s.text[s.pos] == `E`) {
		digits << s.text[s.pos]
		s.move_pos_with_newlines()
		if s.pos < s.text.len && s.text[s.pos] in json2.exp_signs {
			digits << s.text[s.pos]
			s.move_pos_with_newlines()
		}
		mut exp_digits_count := 0
		for s.pos < s.text.len && s.text[s.pos].is_digit() {
			digits << s.text[s.pos]
			exp_digits_count++
			s.move_pos_with_newlines()
		}
		if exp_digits_count == 0 {
			return s.error('invalid exponent')
		}
	}
	// NOTE(review): a number with an exponent but without a decimal point
	// is still reported as `.int_` - confirm the consumer of the token
	// expects that.
	kind := if is_fl { TokenKind.float } else { TokenKind.int_ }
	return s.tokenize(digits, kind)
}
 | 
						|
 | 
						|
// invalid_token builds an `.error` token that quotes the byte under the
// cursor: printable ASCII (32..126) is shown as is, any other byte in its
// escaped form.
fn (s Scanner) invalid_token() Token {
	ch := s.text[s.pos]
	is_printable := ch >= 32 && ch <= 126
	x := if is_printable { ch.ascii_str() } else { ch.str_escaped() }
	return s.error('invalid token `$x`')
}
 | 
						|
 | 
						|
// scan returns the token that starts at the scanner's current position
// and advances the cursor past it. Leading whitespace is skipped first.
// An `.eof` token is returned at the end of the input and an `.error`
// token for anything that cannot start a JSON token.
[manualfree]
fn (mut s Scanner) scan() Token {
	// A single move() is not always enough: it can stop on a line break
	// that directly follows a skipped space, so keep moving until the
	// cursor rests on a non-whitespace byte or the input ends.
	for s.pos < s.text.len && (s.text[s.pos] == ` ` || s.text[s.pos] in json2.newlines) {
		s.move()
	}
	if s.pos >= s.text.len {
		return s.tokenize([]byte{}, .eof)
	} else if s.pos + 3 < s.text.len && (s.text[s.pos] == `t` || s.text[s.pos] == `n`) {
		// candidate for the 4 letter keywords `true` / `null`
		ident := s.text[s.pos..s.pos + 4].bytestr()
		if ident == 'true' || ident == 'null' {
			mut kind := TokenKind.null
			if ident == 'true' {
				kind = .bool_
			}
			unsafe { ident.free() }
			val := s.text[s.pos..s.pos + 4]
			tok := s.tokenize(val, kind)
			s.move() // n / t
			s.move() // u / r
			s.move() // l / u
			s.move() // l / e
			return tok
		}
		unsafe { ident.free() }
		return s.invalid_token()
	} else if s.pos + 4 < s.text.len && s.text[s.pos] == `f` {
		// candidate for the 5 letter keyword `false`
		ident := s.text[s.pos..s.pos + 5].bytestr()
		if ident == 'false' {
			unsafe { ident.free() }
			val := s.text[s.pos..s.pos + 5]
			tok := s.tokenize(val, .bool_)
			s.move() // f
			s.move() // a
			s.move() // l
			s.move() // s
			s.move() // e
			return tok
		}
		unsafe { ident.free() }
		return s.invalid_token()
	} else if s.text[s.pos] in json2.char_list {
		// punctuation: the TokenKind value equals the character code
		chr := s.text[s.pos]
		tok := s.tokenize([]byte{}, TokenKind(int(chr)))
		s.move()
		return tok
	} else if s.text[s.pos] == `"` {
		return s.text_scan()
	} else if s.text[s.pos].is_digit() || s.text[s.pos] == `-` {
		return s.num_scan()
	} else {
		return s.invalid_token()
	}
}
 |