scanner: parse multiple comments and long strings without a stack overflow

2020-07-11 20:52:05 +03:00 · 2020-07-11 20:52:05 +03:00 · 022cc72740
parent a2cb01e16a
commit 022cc72740
1 changed file with 459 additions and 450 deletions
--- a/vlib/v/scanner/scanner.v
+++ b/vlib/v/scanner/scanner.v
@ -614,307 +614,314 @@ fn (s Scanner) look_ahead(n int) byte {
 }

 fn (mut s Scanner) text_scan() token.Token {
-	// if s.comments_mode == .parse_comments {
-	// println('\nscan()')
-	// }
-	// if s.line_comment != '' {
-	// s.fgenln('// LC "$s.line_comment"')
-	// s.line_comment = ''
-	// }
-	if s.is_started {
-		s.pos++
-	}
-	s.is_started = true
-	if s.pos >= s.text.len {
-		return s.end_of_file()
-	}
-	if !s.is_inside_string {
-		s.skip_whitespace()
-	}
-	// End of $var, start next string
-	if s.is_inter_end {
-		if s.text[s.pos] == s.quote {
-			s.is_inter_end = false
-			return s.new_token(.string, '', 1)
-		}
-		s.is_inter_end = false
-		ident_string := s.ident_string()
-		return s.new_token(.string, ident_string, ident_string.len + 2) // + two quotes
-	}
-	s.skip_whitespace()
-	// end of file
-	if s.pos >= s.text.len {
-		return s.end_of_file()
-	}
-	// handle each char
-	c := s.text[s.pos]
-	nextc := s.look_ahead(1)
-	// name or keyword
-	if util.is_name_char(c) {
-		name := s.ident_name()
-		// tmp hack to detect . in ${}
-		// Check if not .eof to prevent panic
-		next_char := s.look_ahead(1)
-		kind := token.keywords[name]
-		if kind != .unknown {
-			if kind == .key_fn {
-				s.struct_name = s.ident_struct_name()
-				s.fn_name = s.ident_fn_name()
-			} else if kind == .key_module {
-				s.mod_name = s.ident_mod_name()
-			}
-			return s.new_token(kind, name, name.len)
-		}
-		// 'asdf $b' => "b" is the last name in the string, dont start parsing string
-		// at the next ', skip it
-		if s.is_inside_string {
-			if next_char == s.quote {
-				s.is_inter_end = true
-				s.is_inter_start = false
-				s.is_inside_string = false
-			}
-		}
-		// end of `$expr`
-		// allow `'$a.b'` and `'$a.c()'`
-		if s.is_inter_start && next_char != `.` && next_char != `(` {
-			s.is_inter_end = true
-			s.is_inter_start = false
-		}
-		if s.pos == 0 && next_char == ` ` {
-			// If a single letter name at the start of the file, increment
-			// Otherwise the scanner would be stuck at s.pos = 0
+	// The for loop here is so that instead of doing
+	// `return s.scan()` (which will use a new call stack frame),
+	// text_scan can just do continue, keeping
+	// memory & stack usage low.
+	// That optimization mostly matters for long sections
+	// of comments and string literals.
+	for {
+		// if s.comments_mode == .parse_comments {
+		// println('\nscan()')
+		// }
+		// if s.line_comment != '' {
+		// s.fgenln('// LC "$s.line_comment"')
+		// s.line_comment = ''
+		// }
+		if s.is_started {
 			s.pos++
 		}
-		return s.new_token(.name, name, name.len)
-	} else if c.is_digit() || (c == `.` && nextc.is_digit()) {
-		// `123`, `.123`
+		s.is_started = true
+		if s.pos >= s.text.len {
+			return s.end_of_file()
+		}
 		if !s.is_inside_string {
-			// In C ints with `0` prefix are octal (in V they're decimal), so discarding heading zeros is needed.
-			mut start_pos := s.pos
-			for start_pos < s.text.len && s.text[start_pos] == `0` {
-				start_pos++
-			}
-			mut prefix_zero_num := start_pos - s.pos // how many prefix zeros should be jumped
-			// for 0b, 0o, 0x the heading zero shouldn't be jumped
-			if start_pos == s.text.len || (c == `0` && !s.text[start_pos].is_digit()) {
-				prefix_zero_num--
-			}
-			s.pos += prefix_zero_num // jump these zeros
+			s.skip_whitespace()
 		}
-		num := s.ident_number()
-		return s.new_token(.number, num, num.len)
-	}
-	// Handle `'$fn()'`
-	if c == `)` && s.is_inter_start {
-		next_char := s.look_ahead(1)
-		if next_char != `.` {
-			s.is_inter_end = true
-			s.is_inter_start = false
-			if next_char == s.quote {
-				s.is_inside_string = false
+		// End of $var, start next string
+		if s.is_inter_end {
+			if s.text[s.pos] == s.quote {
+				s.is_inter_end = false
+				return s.new_token(.string, '', 1)
 			}
-			return s.new_token(.rpar, '', 1)
-		}
-	}
-	// all other tokens
-	match c {
-		`+` {
-			if nextc == `+` {
-				s.pos++
-				return s.new_token(.inc, '', 2)
-			} else if nextc == `=` {
-				s.pos++
-				return s.new_token(.plus_assign, '', 2)
-			}
-			return s.new_token(.plus, '', 1)
-		}
-		`-` {
-			if nextc == `-` {
-				s.pos++
-				return s.new_token(.dec, '', 2)
-			} else if nextc == `=` {
-				s.pos++
-				return s.new_token(.minus_assign, '', 2)
-			}
-			return s.new_token(.minus, '', 1)
-		}
-		`*` {
-			if nextc == `=` {
-				s.pos++
-				return s.new_token(.mult_assign, '', 2)
-			}
-			return s.new_token(.mul, '', 1)
-		}
-		`^` {
-			if nextc == `=` {
-				s.pos++
-				return s.new_token(.xor_assign, '', 2)
-			}
-			return s.new_token(.xor, '', 1)
-		}
-		`%` {
-			if nextc == `=` {
-				s.pos++
-				return s.new_token(.mod_assign, '', 2)
-			}
-			return s.new_token(.mod, '', 1)
-		}
-		`?` {
-			return s.new_token(.question, '', 1)
-		}
-		single_quote, double_quote {
+			s.is_inter_end = false
 			ident_string := s.ident_string()
 			return s.new_token(.string, ident_string, ident_string.len + 2) // + two quotes
 		}
-		`\`` {
-			// ` // apostrophe balance comment. do not remove
-			ident_char := s.ident_char()
-			return s.new_token(.chartoken, ident_char, ident_char.len + 2) // + two quotes
+		s.skip_whitespace()
+		// end of file
+		if s.pos >= s.text.len {
+			return s.end_of_file()
 		}
-		`(` {
-			// TODO `$if vet {` for performance
-			if s.pref.is_vet && s.text[s.pos + 1] == ` ` {
-				println('$s.file_path:$s.line_nr: Looks like you are adding a space after `(`')
-			}
-			return s.new_token(.lpar, '', 1)
-		}
-		`)` {
-			// TODO `$if vet {` for performance
-			if s.pref.is_vet && s.text[s.pos - 1] == ` ` {
-				println('$s.file_path:$s.line_nr: Looks like you are adding a space before `)`')
-			}
-			return s.new_token(.rpar, '', 1)
-		}
-		`[` {
-			return s.new_token(.lsbr, '', 1)
-		}
-		`]` {
-			return s.new_token(.rsbr, '', 1)
-		}
-		`{` {
-			// Skip { in `${` in strings
-			if s.is_inside_string {
-				return s.scan()
-			}
-			return s.new_token(.lcbr, '', 1)
-		}
-		`$` {
-			if s.is_inside_string {
-				return s.new_token(.str_dollar, '', 1)
-			} else {
-				return s.new_token(.dollar, '', 1)
-			}
-		}
-		`}` {
-			// s = `hello $name !`
-			// s = `hello ${name} !`
-			if s.is_inside_string {
-				s.pos++
-				if s.text[s.pos] == s.quote {
-					s.is_inside_string = false
-					return s.new_token(.string, '', 1)
-				}
-				ident_string := s.ident_string()
-				return s.new_token(.string, ident_string, ident_string.len + 2) // + two quotes
-			} else {
-				return s.new_token(.rcbr, '', 1)
-			}
-		}
-		`&` {
-			if nextc == `=` {
-				s.pos++
-				return s.new_token(.and_assign, '', 2)
-			}
-			afternextc := s.look_ahead(2)
-			if nextc == `&` && afternextc.is_space() {
-				s.pos++
-				return s.new_token(.and, '', 2)
-			}
-			return s.new_token(.amp, '', 1)
-		}
-		`|` {
-			if nextc == `|` {
-				s.pos++
-				return s.new_token(.logical_or, '', 2)
-			}
-			if nextc == `=` {
-				s.pos++
-				return s.new_token(.or_assign, '', 2)
-			}
-			return s.new_token(.pipe, '', 1)
-		}
-		`,` {
-			return s.new_token(.comma, '', 1)
-		}
-		`@` {
-			s.pos++
+		// handle each char
+		c := s.text[s.pos]
+		nextc := s.look_ahead(1)
+		// name or keyword
+		if util.is_name_char(c) {
 			name := s.ident_name()
-			if s.is_fmt {
-				return s.new_token(.name, '@' + name, name.len + 1)
-			}
-			// @FN => will be substituted with the name of the current V function
-			// @MOD => will be substituted with the name of the current V module
-			// @STRUCT => will be substituted with the name of the current V struct
-			// @VEXE => will be substituted with the path to the V compiler
-			// @FILE => will be substituted with the path of the V source file
-			// @LINE => will be substituted with the V line number where it appears (as a string).
-			// @COLUMN => will be substituted with the column where it appears (as a string).
-			// @VHASH  => will be substituted with the shortened commit hash of the V compiler (as a string).
-			// @VMOD_FILE => will be substituted with the contents of the nearest v.mod file (as a string).
-			// This allows things like this:
-			// println( 'file: ' + @FILE + ' | line: ' + @LINE + ' | fn: ' + @MOD + '.' + @FN)
-			// ... which is useful while debugging/tracing
-			if name == 'FN' {
-				return s.new_token(.string, s.fn_name, 3)
-			}
-			if name == 'MOD' {
-				return s.new_token(.string, s.mod_name, 4)
-			}
-			if name == 'STRUCT' {
-				return s.new_token(.string, s.struct_name, 7)
-			}
-			if name == 'VEXE' {
-				vexe := pref.vexe_path()
-				return s.new_token(.string, util.cescaped_path(vexe), 5)
-			}
-			if name == 'FILE' {
-				fpath := os.real_path(s.file_path)
-				return s.new_token(.string, util.cescaped_path(fpath), 5)
-			}
-			if name == 'LINE' {
-				return s.new_token(.string, (s.line_nr + 1).str(), 5)
-			}
-			if name == 'COLUMN' {
-				return s.new_token(.string, s.current_column().str(), 7)
-			}
-			if name == 'VHASH' {
-				return s.new_token(.string, util.vhash(), 6)
-			}
-			if name == 'VMOD_FILE' {
-				if s.vmod_file_content.len == 0 {
-					mcache := vmod.get_cache()
-					vmod_file_location := mcache.get_by_file(s.file_path)
-					if vmod_file_location.vmod_file.len == 0 {
-						s.error('@VMOD_FILE can be used only in projects, that have v.mod file')
-					}
-					vmod_content := os.read_file(vmod_file_location.vmod_file) or {
-						''
-					}
-					$if windows {
-						s.vmod_file_content = vmod_content.replace('\r\n', '\n')
-					} $else {
-						s.vmod_file_content = vmod_content
-					}
+			// tmp hack to detect . in ${}
+			// Check if not .eof to prevent panic
+			next_char := s.look_ahead(1)
+			kind := token.keywords[name]
+			if kind != .unknown {
+				if kind == .key_fn {
+					s.struct_name = s.ident_struct_name()
+					s.fn_name = s.ident_fn_name()
+				} else if kind == .key_module {
+					s.mod_name = s.ident_mod_name()
 				}
-				return s.new_token(.string, s.vmod_file_content, 10)
+				return s.new_token(kind, name, name.len)
 			}
-			if !token.is_key(name) {
-				s.error('@ must be used before keywords (e.g. `@type string`)')
+			// 'asdf $b' => "b" is the last name in the string, dont start parsing string
+			// at the next ', skip it
+			if s.is_inside_string {
+				if next_char == s.quote {
+					s.is_inter_end = true
+					s.is_inter_start = false
+					s.is_inside_string = false
+				}
+			}
+			// end of `$expr`
+			// allow `'$a.b'` and `'$a.c()'`
+			if s.is_inter_start && next_char != `.` && next_char != `(` {
+				s.is_inter_end = true
+				s.is_inter_start = false
+			}
+			if s.pos == 0 && next_char == ` ` {
+				// If a single letter name at the start of the file, increment
+				// Otherwise the scanner would be stuck at s.pos = 0
+				s.pos++
 			}
 			return s.new_token(.name, name, name.len)
+		} else if c.is_digit() || (c == `.` && nextc.is_digit()) {
+			// `123`, `.123`
+			if !s.is_inside_string {
+				// In C ints with `0` prefix are octal (in V they're decimal), so discarding heading zeros is needed.
+				mut start_pos := s.pos
+				for start_pos < s.text.len && s.text[start_pos] == `0` {
+					start_pos++
+				}
+				mut prefix_zero_num := start_pos - s.pos // how many prefix zeros should be jumped
+				// for 0b, 0o, 0x the heading zero shouldn't be jumped
+				if start_pos == s.text.len || (c == `0` && !s.text[start_pos].is_digit()) {
+					prefix_zero_num--
+				}
+				s.pos += prefix_zero_num // jump these zeros
+			}
+			num := s.ident_number()
+			return s.new_token(.number, num, num.len)
 		}
-		/*
-		case `\r`:
+		// Handle `'$fn()'`
+		if c == `)` && s.is_inter_start {
+			next_char := s.look_ahead(1)
+			if next_char != `.` {
+				s.is_inter_end = true
+				s.is_inter_start = false
+				if next_char == s.quote {
+					s.is_inside_string = false
+				}
+				return s.new_token(.rpar, '', 1)
+			}
+		}
+		// all other tokens
+		match c {
+			`+` {
+				if nextc == `+` {
+					s.pos++
+					return s.new_token(.inc, '', 2)
+				} else if nextc == `=` {
+					s.pos++
+					return s.new_token(.plus_assign, '', 2)
+				}
+				return s.new_token(.plus, '', 1)
+			}
+			`-` {
+				if nextc == `-` {
+					s.pos++
+					return s.new_token(.dec, '', 2)
+				} else if nextc == `=` {
+					s.pos++
+					return s.new_token(.minus_assign, '', 2)
+				}
+				return s.new_token(.minus, '', 1)
+			}
+			`*` {
+				if nextc == `=` {
+					s.pos++
+					return s.new_token(.mult_assign, '', 2)
+				}
+				return s.new_token(.mul, '', 1)
+			}
+			`^` {
+				if nextc == `=` {
+					s.pos++
+					return s.new_token(.xor_assign, '', 2)
+				}
+				return s.new_token(.xor, '', 1)
+			}
+			`%` {
+				if nextc == `=` {
+					s.pos++
+					return s.new_token(.mod_assign, '', 2)
+				}
+				return s.new_token(.mod, '', 1)
+			}
+			`?` {
+				return s.new_token(.question, '', 1)
+			}
+			single_quote, double_quote {
+				ident_string := s.ident_string()
+				return s.new_token(.string, ident_string, ident_string.len + 2) // + two quotes
+			}
+			`\`` {
+				// ` // apostrophe balance comment. do not remove
+				ident_char := s.ident_char()
+				return s.new_token(.chartoken, ident_char, ident_char.len + 2) // + two quotes
+			}
+			`(` {
+				// TODO `$if vet {` for performance
+				if s.pref.is_vet && s.text[s.pos + 1] == ` ` {
+					println('$s.file_path:$s.line_nr: Looks like you are adding a space after `(`')
+				}
+				return s.new_token(.lpar, '', 1)
+			}
+			`)` {
+				// TODO `$if vet {` for performance
+				if s.pref.is_vet && s.text[s.pos - 1] == ` ` {
+					println('$s.file_path:$s.line_nr: Looks like you are adding a space before `)`')
+				}
+				return s.new_token(.rpar, '', 1)
+			}
+			`[` {
+				return s.new_token(.lsbr, '', 1)
+			}
+			`]` {
+				return s.new_token(.rsbr, '', 1)
+			}
+			`{` {
+				// Skip { in `${` in strings
+				if s.is_inside_string {
+					continue
+				}
+				return s.new_token(.lcbr, '', 1)
+			}
+			`$` {
+				if s.is_inside_string {
+					return s.new_token(.str_dollar, '', 1)
+				} else {
+					return s.new_token(.dollar, '', 1)
+				}
+			}
+			`}` {
+				// s = `hello $name !`
+				// s = `hello ${name} !`
+				if s.is_inside_string {
+					s.pos++
+					if s.text[s.pos] == s.quote {
+						s.is_inside_string = false
+						return s.new_token(.string, '', 1)
+					}
+					ident_string := s.ident_string()
+					return s.new_token(.string, ident_string, ident_string.len + 2) // + two quotes
+				} else {
+					return s.new_token(.rcbr, '', 1)
+				}
+			}
+			`&` {
+				if nextc == `=` {
+					s.pos++
+					return s.new_token(.and_assign, '', 2)
+				}
+				afternextc := s.look_ahead(2)
+				if nextc == `&` && afternextc.is_space() {
+					s.pos++
+					return s.new_token(.and, '', 2)
+				}
+				return s.new_token(.amp, '', 1)
+			}
+			`|` {
+				if nextc == `|` {
+					s.pos++
+					return s.new_token(.logical_or, '', 2)
+				}
+				if nextc == `=` {
+					s.pos++
+					return s.new_token(.or_assign, '', 2)
+				}
+				return s.new_token(.pipe, '', 1)
+			}
+			`,` {
+				return s.new_token(.comma, '', 1)
+			}
+			`@` {
+				s.pos++
+				name := s.ident_name()
+				if s.is_fmt {
+					return s.new_token(.name, '@' + name, name.len + 1)
+				}
+				// @FN => will be substituted with the name of the current V function
+				// @MOD => will be substituted with the name of the current V module
+				// @STRUCT => will be substituted with the name of the current V struct
+				// @VEXE => will be substituted with the path to the V compiler
+				// @FILE => will be substituted with the path of the V source file
+				// @LINE => will be substituted with the V line number where it appears (as a string).
+				// @COLUMN => will be substituted with the column where it appears (as a string).
+				// @VHASH  => will be substituted with the shortened commit hash of the V compiler (as a string).
+				// @VMOD_FILE => will be substituted with the contents of the nearest v.mod file (as a string).
+				// This allows things like this:
+				// println( 'file: ' + @FILE + ' | line: ' + @LINE + ' | fn: ' + @MOD + '.' + @FN)
+				// ... which is useful while debugging/tracing
+				if name == 'FN' {
+					return s.new_token(.string, s.fn_name, 3)
+				}
+				if name == 'MOD' {
+					return s.new_token(.string, s.mod_name, 4)
+				}
+				if name == 'STRUCT' {
+					return s.new_token(.string, s.struct_name, 7)
+				}
+				if name == 'VEXE' {
+					vexe := pref.vexe_path()
+					return s.new_token(.string, util.cescaped_path(vexe), 5)
+				}
+				if name == 'FILE' {
+					fpath := os.real_path(s.file_path)
+					return s.new_token(.string, util.cescaped_path(fpath), 5)
+				}
+				if name == 'LINE' {
+					return s.new_token(.string, (s.line_nr + 1).str(), 5)
+				}
+				if name == 'COLUMN' {
+					return s.new_token(.string, s.current_column().str(), 7)
+				}
+				if name == 'VHASH' {
+					return s.new_token(.string, util.vhash(), 6)
+				}
+				if name == 'VMOD_FILE' {
+					if s.vmod_file_content.len == 0 {
+						mcache := vmod.get_cache()
+						vmod_file_location := mcache.get_by_file(s.file_path)
+						if vmod_file_location.vmod_file.len == 0 {
+							s.error('@VMOD_FILE can be used only in projects, that have v.mod file')
+						}
+						vmod_content := os.read_file(vmod_file_location.vmod_file) or {
+							''
+						}
+						$if windows {
+							s.vmod_file_content = vmod_content.replace('\r\n', '\n')
+						} $else {
+							s.vmod_file_content = vmod_content
+						}
+					}
+					return s.new_token(.string, s.vmod_file_content, 10)
+				}
+				if !token.is_key(name) {
+					s.error('@ must be used before keywords (e.g. `@type string`)')
+				}
+				return s.new_token(.name, name, name.len)
+			}
+			/*
+			case `\r`:
 		if nextc == `\n` {
 			s.pos++
 			s.last_nl_pos = s.pos
@ -925,184 +932,186 @@ fn (mut s Scanner) text_scan() token.Token {
 		s.last_nl_pos = s.pos
 		return s.new_token(.nl, '')
 	 }
-		*/
-		`.` {
-			if nextc == `.` {
-				s.pos++
-				if s.text[s.pos + 1] == `.` {
+			*/
+			`.` {
+				if nextc == `.` {
 					s.pos++
-					return s.new_token(.ellipsis, '', 3)
+					if s.text[s.pos + 1] == `.` {
+						s.pos++
+						return s.new_token(.ellipsis, '', 3)
+					}
+					return s.new_token(.dotdot, '', 2)
 				}
-				return s.new_token(.dotdot, '', 2)
+				return s.new_token(.dot, '', 1)
 			}
-			return s.new_token(.dot, '', 1)
-		}
-		`#` {
-			start := s.pos + 1
-			s.ignore_line()
-			if nextc == `!` {
-				// treat shebang line (#!) as a comment
-				s.line_comment = s.text[start + 1..s.pos].trim_space()
-				// s.fgenln('// shebang line "$s.line_comment"')
-				return s.scan()
-			}
-			hash := s.text[start..s.pos].trim_space()
-			return s.new_token(.hash, hash, hash.len)
-		}
-		`>` {
-			if nextc == `=` {
-				s.pos++
-				return s.new_token(.ge, '', 2)
-			} else if nextc == `>` {
-				if s.pos + 2 < s.text.len && s.text[s.pos + 2] == `=` {
-					s.pos += 2
-					return s.new_token(.right_shift_assign, '', 3)
-				}
-				s.pos++
-				return s.new_token(.right_shift, '', 2)
-			} else {
-				return s.new_token(.gt, '', 1)
-			}
-		}
-		0xE2 {
-			if nextc == 0x89 && s.text[s.pos + 2] == 0xA0 {
-				// case `≠`:
-				s.pos += 2
-				return s.new_token(.ne, '', 3)
-			} else if nextc == 0x89 && s.text[s.pos + 2] == 0xBD {
-				s.pos += 2
-				return s.new_token(.le, '', 3)
-			} else if nextc == 0xA9 && s.text[s.pos + 2] == 0xBE {
-				s.pos += 2
-				return s.new_token(.ge, '', 3)
-			}
-		}
-		`<` {
-			if nextc == `=` {
-				s.pos++
-				return s.new_token(.le, '', 2)
-			} else if nextc == `<` {
-				if s.pos + 2 < s.text.len && s.text[s.pos + 2] == `=` {
-					s.pos += 2
-					return s.new_token(.left_shift_assign, '', 3)
-				}
-				s.pos++
-				return s.new_token(.left_shift, '', 2)
-			} else {
-				return s.new_token(.lt, '', 1)
-			}
-		}
-		`=` {
-			if nextc == `=` {
-				s.pos++
-				return s.new_token(.eq, '', 2)
-			} else if nextc == `>` {
-				s.pos++
-				return s.new_token(.arrow, '', 2)
-			} else {
-				return s.new_token(.assign, '', 1)
-			}
-		}
-		`:` {
-			if nextc == `=` {
-				s.pos++
-				return s.new_token(.decl_assign, '', 2)
-			} else {
-				return s.new_token(.colon, '', 1)
-			}
-		}
-		`;` {
-			return s.new_token(.semicolon, '', 1)
-		}
-		`!` {
-			if nextc == `=` {
-				s.pos++
-				return s.new_token(.ne, '', 2)
-			} else if nextc == `i` && s.text[s.pos + 2] == `n` && s.text[s.pos + 3].is_space() {
-				s.pos += 2
-				return s.new_token(.not_in, '', 3)
-			} else if nextc == `i` && s.text[s.pos + 2] == `s` && s.text[s.pos + 3].is_space() {
-				s.pos += 2
-				return s.new_token(.not_is, '', 3)
-			} else {
-				return s.new_token(.not, '', 1)
-			}
-		}
-		`~` {
-			return s.new_token(.bit_not, '', 1)
-		}
-		`/` {
-			if nextc == `=` {
-				s.pos++
-				return s.new_token(.div_assign, '', 2)
-			}
-			if nextc == `/` {
+			`#` {
 				start := s.pos + 1
 				s.ignore_line()
-				s.line_comment = s.text[start + 1..s.pos]
-				mut comment := s.line_comment.trim_space()
-				s.pos--
-				// fix line_nr, \n was read, and the comment is marked
-				// on the next line
-				s.line_nr--
-				if s.should_parse_comment() {
-					// Find out if this comment is on its own line (for vfmt)
-					mut is_separate_line_comment := true
-					for j := start - 2; j >= 0 && s.text[j] != `\n`; j-- {
-						if s.text[j] !in [`\t`, ` `] {
-							is_separate_line_comment = false
+				if nextc == `!` {
+					// treat shebang line (#!) as a comment
+					s.line_comment = s.text[start + 1..s.pos].trim_space()
+					// s.fgenln('// shebang line "$s.line_comment"')
+					continue
+				}
+				hash := s.text[start..s.pos].trim_space()
+				return s.new_token(.hash, hash, hash.len)
+			}
+			`>` {
+				if nextc == `=` {
+					s.pos++
+					return s.new_token(.ge, '', 2)
+				} else if nextc == `>` {
+					if s.pos + 2 < s.text.len && s.text[s.pos + 2] == `=` {
+						s.pos += 2
+						return s.new_token(.right_shift_assign, '', 3)
+					}
+					s.pos++
+					return s.new_token(.right_shift, '', 2)
+				} else {
+					return s.new_token(.gt, '', 1)
+				}
+			}
+			0xE2 {
+				if nextc == 0x89 && s.text[s.pos + 2] == 0xA0 {
+					// case `≠`:
+					s.pos += 2
+					return s.new_token(.ne, '', 3)
+				} else if nextc == 0x89 && s.text[s.pos + 2] == 0xBD {
+					s.pos += 2
+					return s.new_token(.le, '', 3)
+				} else if nextc == 0xA9 && s.text[s.pos + 2] == 0xBE {
+					s.pos += 2
+					return s.new_token(.ge, '', 3)
+				}
+			}
+			`<` {
+				if nextc == `=` {
+					s.pos++
+					return s.new_token(.le, '', 2)
+				} else if nextc == `<` {
+					if s.pos + 2 < s.text.len && s.text[s.pos + 2] == `=` {
+						s.pos += 2
+						return s.new_token(.left_shift_assign, '', 3)
+					}
+					s.pos++
+					return s.new_token(.left_shift, '', 2)
+				} else {
+					return s.new_token(.lt, '', 1)
+				}
+			}
+			`=` {
+				if nextc == `=` {
+					s.pos++
+					return s.new_token(.eq, '', 2)
+				} else if nextc == `>` {
+					s.pos++
+					return s.new_token(.arrow, '', 2)
+				} else {
+					return s.new_token(.assign, '', 1)
+				}
+			}
+			`:` {
+				if nextc == `=` {
+					s.pos++
+					return s.new_token(.decl_assign, '', 2)
+				} else {
+					return s.new_token(.colon, '', 1)
+				}
+			}
+			`;` {
+				return s.new_token(.semicolon, '', 1)
+			}
+			`!` {
+				if nextc == `=` {
+					s.pos++
+					return s.new_token(.ne, '', 2)
+				} else if nextc == `i` && s.text[s.pos + 2] == `n` && s.text[s.pos + 3].is_space() {
+					s.pos += 2
+					return s.new_token(.not_in, '', 3)
+				} else if nextc == `i` && s.text[s.pos + 2] == `s` && s.text[s.pos + 3].is_space() {
+					s.pos += 2
+					return s.new_token(.not_is, '', 3)
+				} else {
+					return s.new_token(.not, '', 1)
+				}
+			}
+			`~` {
+				return s.new_token(.bit_not, '', 1)
+			}
+			`/` {
+				if nextc == `=` {
+					s.pos++
+					return s.new_token(.div_assign, '', 2)
+				}
+				if nextc == `/` {
+					start := s.pos + 1
+					s.ignore_line()
+					s.line_comment = s.text[start + 1..s.pos]
+					mut comment := s.line_comment.trim_space()
+					s.pos--
+					// fix line_nr, \n was read, and the comment is marked
+					// on the next line
+					s.line_nr--
+					if s.should_parse_comment() {
+						// Find out if this comment is on its own line (for vfmt)
+						mut is_separate_line_comment := true
+						for j := start - 2; j >= 0 && s.text[j] != `\n`; j-- {
+							if s.text[j] !in [`\t`, ` `] {
+								is_separate_line_comment = false
+							}
+						}
+						if is_separate_line_comment {
+							comment = '|' + comment
+						}
+						return s.new_token(.comment, comment, comment.len + 2)
+					}
+					// s.fgenln('// ${s.prev_tok.str()} "$s.line_comment"')
+					// Skip the comment (return the next token)
+					continue
+				}
+				// Multiline comments
+				if nextc == `*` {
+					start := s.pos + 2
+					mut nest_count := 1
+					// Skip comment
+					for nest_count > 0 {
+						s.pos++
+						if s.pos >= s.text.len {
+							s.line_nr--
+							s.error('comment not terminated')
+						}
+						if s.text[s.pos] == `\n` {
+							s.inc_line_number()
+							continue
+						}
+						if s.expect('/*', s.pos) {
+							nest_count++
+							continue
+						}
+						if s.expect('*/', s.pos) {
+							nest_count--
 						}
 					}
-					if is_separate_line_comment {
-						comment = '|' + comment
-					}
-					return s.new_token(.comment, comment, comment.len + 2)
-				}
-				// s.fgenln('// ${s.prev_tok.str()} "$s.line_comment"')
-				// Skip the comment (return the next token)
-				return s.scan()
-			}
-			// Multiline comments
-			if nextc == `*` {
-				start := s.pos + 2
-				mut nest_count := 1
-				// Skip comment
-				for nest_count > 0 {
 					s.pos++
-					if s.pos >= s.text.len {
-						s.line_nr--
-						s.error('comment not terminated')
-					}
-					if s.text[s.pos] == `\n` {
-						s.inc_line_number()
-						continue
-					}
-					if s.expect('/*', s.pos) {
-						nest_count++
-						continue
-					}
-					if s.expect('*/', s.pos) {
-						nest_count--
+					if s.should_parse_comment() {
+						comment := s.text[start..(s.pos - 1)].trim_space()
+						return s.new_token(.comment, comment, comment.len + 4)
 					}
+					// Skip if not in fmt mode
+					continue
 				}
-				s.pos++
-				if s.should_parse_comment() {
-					comment := s.text[start..(s.pos - 1)].trim_space()
-					return s.new_token(.comment, comment, comment.len + 4)
-				}
-				// Skip if not in fmt mode
-				return s.scan()
+				return s.new_token(.div, '', 1)
 			}
-			return s.new_token(.div, '', 1)
+			else {}
 		}
-		else {}
-	}
-	$if windows {
-		if c == `\0` {
-			return s.end_of_file()
+		$if windows {
+			if c == `\0` {
+				return s.end_of_file()
+			}
 		}
+		s.error('invalid character `$c.str()`')
+		break
 	}
-	s.error('invalid character `$c.str()`')
 	return s.end_of_file()
 }