v/vlib/v/scanner/scanner.v

1474 lines
39 KiB
V
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

// Copyright (c) 2019-2021 Alexander Medvednikov. All rights reserved.
// Use of this source code is governed by an MIT license
// that can be found in the LICENSE file.
module scanner
import math.mathutil
import os
import strconv
import v.token
import v.pref
import v.util
import v.vet
import v.errors
import v.ast
const (
single_quote = `'`
double_quote = `"`
// char used as number separator
num_sep = `_`
b_lf = 10
b_cr = 13
backslash = `\\`
)
pub struct Scanner {
pub mut:
file_path string // '/path/to/file.v'
file_base string // 'file.v'
text string // the whole text of the file
pos int // current position in the file, first character is s.text[0]
line_nr int // current line number
last_nl_pos int = -1 // for calculating column
is_crlf bool // special check when computing columns
is_inside_string bool // set to true in a string, *at the start* of an $var or ${expr}
is_inter_start bool // for hacky string interpolation TODO simplify
is_inter_end bool
is_enclosed_inter bool
line_comment string
last_lt int = -1 // position of latest <
// prev_tok TokenKind
is_started bool
is_print_line_on_error bool
is_print_colored_error bool
is_print_rel_paths_on_error bool
quote byte // which quote is used to denote current string: ' or "
inter_quote byte
line_ends []int // the positions of source lines ends (i.e. \n signs)
nr_lines int // total number of lines in the source file that were scanned
is_vh bool // Keep newlines
is_fmt bool // Used for v fmt.
comments_mode CommentsMode
is_inside_toplvl_statement bool // *only* used in comments_mode: .toplevel_comments, toggled by parser
all_tokens []token.Token // *only* used in comments_mode: .toplevel_comments, contains all tokens
tidx int
eofs int
pref &pref.Preferences
errors []errors.Error
warnings []errors.Warning
notices []errors.Notice
vet_errors []vet.Error
should_abort bool // when too many errors/warnings/notices are accumulated, should_abort becomes true, and the scanner should stop
}
/*
How the .toplevel_comments mode works:
In this mode, the scanner scans *everything* at once, before parsing starts,
including all the comments, and stores the results in an buffer s.all_tokens.
Then .scan() just returns s.all_tokens[ s.tidx++ ] *ignoring* the
comment tokens. In other words, by default in this mode, the parser
*will not see any comments* inside top level statements, so it has
no reason to complain about them.
When the parser determines, that it is outside of a top level statement,
it tells the scanner to backtrack s.tidx to the current p.tok index,
then it changes .is_inside_toplvl_statement to false , and refills its
lookahead buffer (i.e. p.peek_tok), from the scanner.
In effect, from the parser's point of view, the next tokens, that it will
receive with p.next(), will be the same, as if comments are not ignored
anymore, *between* top level statements.
When the parser determines, that it is going again inside a top level
statement, it does the same, this time setting .is_inside_toplvl_statement
to true, again refilling the lookahead buffer => calling .next() in this
mode, will again ignore all the comment tokens, till the top level statement
is finished.
*/
// The different kinds of scanner modes:
//
// .skip_comments - simplest/fastest, just ignores all comments early.
// This mode is used by the compiler itself.
//
// .parse_comments is used by vfmt. Ideally it should handle inline /* */
// comments too, i.e. it returns every kind of comment as a new token.
//
// .toplevel_comments is used by vdoc, parses *only* top level ones
// that are *outside* structs/enums/fns.
pub enum CommentsMode {
skip_comments
parse_comments
toplevel_comments
}
// new scanner from file.
pub fn new_scanner_file(file_path string, comments_mode CommentsMode, pref &pref.Preferences) &Scanner {
if !os.is_file(file_path) {
verror('$file_path is not a file')
}
raw_text := util.read_file(file_path) or {
verror(err.msg)
return voidptr(0)
}
mut s := &Scanner{
pref: pref
text: raw_text
is_print_line_on_error: true
is_print_colored_error: true
is_print_rel_paths_on_error: true
is_fmt: pref.is_fmt
comments_mode: comments_mode
file_path: file_path
file_base: os.base(file_path)
}
s.init_scanner()
return s
}
// new scanner from string.
pub fn new_scanner(text string, comments_mode CommentsMode, pref &pref.Preferences) &Scanner {
mut s := &Scanner{
pref: pref
text: text
is_print_line_on_error: true
is_print_colored_error: true
is_print_rel_paths_on_error: true
is_fmt: pref.is_fmt
comments_mode: comments_mode
file_path: 'internal_memory'
file_base: 'internal_memory'
}
s.init_scanner()
return s
}
fn (mut s Scanner) init_scanner() {
s.scan_all_tokens_in_buffer(s.comments_mode)
}
[unsafe]
pub fn (mut s Scanner) free() {
unsafe {
s.text.free()
}
}
[inline]
fn (s &Scanner) should_parse_comment() bool {
return (s.comments_mode == .parse_comments)
|| (s.comments_mode == .toplevel_comments && !s.is_inside_toplvl_statement)
}
// NB: this is called by v's parser
pub fn (mut s Scanner) set_is_inside_toplevel_statement(newstate bool) {
s.is_inside_toplvl_statement = newstate
}
pub fn (mut s Scanner) set_current_tidx(cidx int) {
mut tidx := if cidx < 0 { 0 } else { cidx }
tidx = if tidx > s.all_tokens.len { s.all_tokens.len } else { tidx }
s.tidx = tidx
}
[inline]
fn (mut s Scanner) new_token(tok_kind token.Kind, lit string, len int) token.Token {
cidx := s.tidx
s.tidx++
line_offset := if tok_kind == .hash { 0 } else { 1 }
return token.Token{
kind: tok_kind
lit: lit
line_nr: s.line_nr + line_offset
col: mathutil.max(1, s.current_column() - len + 1)
pos: s.pos - len + 1
len: len
tidx: cidx
}
}
[inline]
fn (s &Scanner) new_eof_token() token.Token {
return token.Token{
kind: .eof
lit: ''
line_nr: s.line_nr + 1
col: s.current_column()
pos: s.pos
len: 1
tidx: s.tidx
}
}
[inline]
fn (mut s Scanner) new_multiline_token(tok_kind token.Kind, lit string, len int, start_line int) token.Token {
cidx := s.tidx
s.tidx++
return token.Token{
kind: tok_kind
lit: lit
line_nr: start_line + 1
col: mathutil.max(1, s.current_column() - len + 1)
pos: s.pos - len + 1
len: len
tidx: cidx
}
}
[direct_array_access; inline]
fn (mut s Scanner) ident_name() string {
start := s.pos
s.pos++
for s.pos < s.text.len {
c := s.text[s.pos]
if !(util.is_name_char(c) || c.is_digit()) {
break
}
s.pos++
}
name := s.text[start..s.pos]
s.pos--
return name
}
fn (s Scanner) num_lit(start int, end int) string {
if s.is_fmt {
return s.text[start..end]
}
unsafe {
txt := s.text.str
mut b := malloc_noscan(end - start + 1) // add a byte for the endstring 0
mut i1 := 0
for i := start; i < end; i++ {
if txt[i] != scanner.num_sep {
b[i1] = txt[i]
i1++
}
}
b[i1] = 0 // C string compatibility
return b.vstring_with_len(i1)
}
}
fn (mut s Scanner) ident_bin_number() string {
mut has_wrong_digit := false
mut first_wrong_digit_pos := 0
mut first_wrong_digit := `\0`
start_pos := s.pos
s.pos += 2 // skip '0b'
if s.pos < s.text.len && s.text[s.pos] == scanner.num_sep {
s.error('separator `_` is only valid between digits in a numeric literal')
}
for s.pos < s.text.len {
c := s.text[s.pos]
if c == scanner.num_sep && s.text[s.pos - 1] == scanner.num_sep {
s.error('cannot use `_` consecutively')
}
if !c.is_bin_digit() && c != scanner.num_sep {
if (!c.is_digit() && !c.is_letter()) || s.is_inside_string {
break
} else if !has_wrong_digit {
has_wrong_digit = true
first_wrong_digit_pos = s.pos
first_wrong_digit = c
}
}
s.pos++
}
if s.text[s.pos - 1] == scanner.num_sep {
s.pos--
s.error('cannot use `_` at the end of a numeric literal')
} else if start_pos + 2 == s.pos {
s.pos-- // adjust error position
s.error('number part of this binary is not provided')
} else if has_wrong_digit {
s.pos = first_wrong_digit_pos // adjust error position
s.error('this binary number has unsuitable digit `$first_wrong_digit.str()`')
}
number := s.num_lit(start_pos, s.pos)
s.pos--
return number
}
[direct_array_access]
fn (mut s Scanner) ident_hex_number() string {
mut has_wrong_digit := false
mut first_wrong_digit_pos := 0
mut first_wrong_digit := `\0`
start_pos := s.pos
if s.pos + 2 >= s.text.len {
return '0x'
}
s.pos += 2 // skip '0x'
if s.pos < s.text.len && s.text[s.pos] == scanner.num_sep {
s.error('separator `_` is only valid between digits in a numeric literal')
}
for s.pos < s.text.len {
c := s.text[s.pos]
if c == scanner.num_sep && s.text[s.pos - 1] == scanner.num_sep {
s.error('cannot use `_` consecutively')
}
if !c.is_hex_digit() && c != scanner.num_sep {
if !c.is_letter() || s.is_inside_string {
break
} else if !has_wrong_digit {
has_wrong_digit = true
first_wrong_digit_pos = s.pos
first_wrong_digit = c
}
}
s.pos++
}
if s.text[s.pos - 1] == scanner.num_sep {
s.pos--
s.error('cannot use `_` at the end of a numeric literal')
} else if start_pos + 2 == s.pos {
s.pos-- // adjust error position
s.error('number part of this hexadecimal is not provided')
} else if has_wrong_digit {
s.pos = first_wrong_digit_pos // adjust error position
s.error('this hexadecimal number has unsuitable digit `$first_wrong_digit.str()`')
}
number := s.num_lit(start_pos, s.pos)
s.pos--
return number
}
fn (mut s Scanner) ident_oct_number() string {
mut has_wrong_digit := false
mut first_wrong_digit_pos := 0
mut first_wrong_digit := `\0`
start_pos := s.pos
s.pos += 2 // skip '0o'
if s.pos < s.text.len && s.text[s.pos] == scanner.num_sep {
s.error('separator `_` is only valid between digits in a numeric literal')
}
for s.pos < s.text.len {
c := s.text[s.pos]
if c == scanner.num_sep && s.text[s.pos - 1] == scanner.num_sep {
s.error('cannot use `_` consecutively')
}
if !c.is_oct_digit() && c != scanner.num_sep {
if (!c.is_digit() && !c.is_letter()) || s.is_inside_string {
break
} else if !has_wrong_digit {
has_wrong_digit = true
first_wrong_digit_pos = s.pos
first_wrong_digit = c
}
}
s.pos++
}
if s.text[s.pos - 1] == scanner.num_sep {
s.pos--
s.error('cannot use `_` at the end of a numeric literal')
} else if start_pos + 2 == s.pos {
s.pos-- // adjust error position
s.error('number part of this octal is not provided')
} else if has_wrong_digit {
s.pos = first_wrong_digit_pos // adjust error position
s.error('this octal number has unsuitable digit `$first_wrong_digit.str()`')
}
number := s.num_lit(start_pos, s.pos)
s.pos--
return number
}
[direct_array_access]
fn (mut s Scanner) ident_dec_number() string {
mut has_wrong_digit := false
mut first_wrong_digit_pos := 0
mut first_wrong_digit := `\0`
start_pos := s.pos
// scan integer part
for s.pos < s.text.len {
c := s.text[s.pos]
if c == scanner.num_sep && s.text[s.pos - 1] == scanner.num_sep {
s.error('cannot use `_` consecutively')
}
if !c.is_digit() && c != scanner.num_sep {
if !c.is_letter() || c in [`e`, `E`] || s.is_inside_string {
break
} else if !has_wrong_digit {
has_wrong_digit = true
first_wrong_digit_pos = s.pos
first_wrong_digit = c
}
}
s.pos++
}
if s.text[s.pos - 1] == scanner.num_sep {
s.pos--
s.error('cannot use `_` at the end of a numeric literal')
}
mut call_method := false // true for, e.g., 5.str(), 5.5.str(), 5e5.str()
mut is_range := false // true for, e.g., 5..10
// scan fractional part
if s.pos < s.text.len && s.text[s.pos] == `.` {
s.pos++
if s.pos < s.text.len {
// 5.5, 5.5.str()
if s.text[s.pos].is_digit() {
for s.pos < s.text.len {
c := s.text[s.pos]
if !c.is_digit() {
if !c.is_letter() || c in [`e`, `E`] || s.is_inside_string {
// 5.5.str()
if c == `.` && s.pos + 1 < s.text.len && s.text[s.pos + 1].is_letter() {
call_method = true
}
break
} else if !has_wrong_digit {
has_wrong_digit = true
first_wrong_digit_pos = s.pos
first_wrong_digit = c
}
}
s.pos++
}
} else if s.text[s.pos] == `.` {
// 5.. (a range)
is_range = true
s.pos--
} else if s.text[s.pos] in [`e`, `E`] {
// 5.e5
} else if s.text[s.pos].is_letter() {
// 5.str()
call_method = true
s.pos--
} else {
// 5.
mut symbol_length := 0
for i := s.pos - 2; i > 0 && s.text[i - 1].is_digit(); i-- {
symbol_length++
}
float_symbol := s.text[s.pos - 2 - symbol_length..s.pos - 1]
s.warn('float literals should have a digit after the decimal point, e.g. `${float_symbol}.0`')
}
}
}
// scan exponential part
mut has_exp := false
if s.pos < s.text.len && s.text[s.pos] in [`e`, `E`] {
has_exp = true
s.pos++
if s.pos < s.text.len && s.text[s.pos] in [`-`, `+`] {
s.pos++
}
for s.pos < s.text.len {
c := s.text[s.pos]
if !c.is_digit() {
if !c.is_letter() || s.is_inside_string {
// 5e5.str()
if c == `.` && s.pos + 1 < s.text.len && s.text[s.pos + 1].is_letter() {
call_method = true
}
break
} else if !has_wrong_digit {
has_wrong_digit = true
first_wrong_digit_pos = s.pos
first_wrong_digit = c
}
}
s.pos++
}
}
if has_wrong_digit {
// error check: wrong digit
s.pos = first_wrong_digit_pos // adjust error position
s.error('this number has unsuitable digit `$first_wrong_digit.str()`')
} else if s.text[s.pos - 1] in [`e`, `E`] {
// error check: 5e
s.pos-- // adjust error position
s.error('exponent has no digits')
} else if s.pos < s.text.len && s.text[s.pos] == `.` && !is_range && !call_method {
// error check: 1.23.4, 123.e+3.4
if has_exp {
s.error('exponential part should be integer')
} else {
s.error('too many decimal points in number')
}
}
number := s.num_lit(start_pos, s.pos)
s.pos--
return number
}
fn (mut s Scanner) ident_number() string {
if s.expect('0b', s.pos) {
return s.ident_bin_number()
} else if s.expect('0x', s.pos) {
return s.ident_hex_number()
} else if s.expect('0o', s.pos) {
return s.ident_oct_number()
} else {
return s.ident_dec_number()
}
}
[direct_array_access; inline]
fn (mut s Scanner) skip_whitespace() {
for s.pos < s.text.len {
c := s.text[s.pos]
if !(c == 32 || (c > 8 && c < 14) || (c == 0x85) || (c == 0xa0)) {
return
}
c_is_nl := c == scanner.b_cr || c == scanner.b_lf
if c_is_nl && s.is_vh {
return
}
if s.pos + 1 < s.text.len && c == scanner.b_cr && s.text[s.pos + 1] == scanner.b_lf {
s.is_crlf = true
}
// Count \r\n as one line
if c_is_nl && !(s.pos > 0 && s.text[s.pos - 1] == scanner.b_cr && c == scanner.b_lf) {
s.inc_line_number()
}
s.pos++
}
}
fn (mut s Scanner) end_of_file() token.Token {
s.eofs++
if s.eofs > 50 {
s.line_nr--
panic(
'the end of file `$s.file_path` has been reached 50 times already, the v parser is probably stuck.\n' +
'This should not happen. Please report the bug here, and include the last 2-3 lines of your source code:\n' +
'https://github.com/vlang/v/issues/new?labels=Bug&template=bug_report.md')
}
if s.pos != s.text.len && s.eofs == 1 {
s.inc_line_number()
}
s.pos = s.text.len
return s.new_eof_token()
}
pub fn (mut s Scanner) scan_all_tokens_in_buffer(mode CommentsMode) {
mut timers := util.get_timers()
timers.measure_pause('PARSE')
util.timing_start('SCAN')
defer {
util.timing_measure_cumulative('SCAN')
timers.measure_resume('PARSE')
}
oldmode := s.comments_mode
s.comments_mode = mode
s.scan_remaining_text()
s.comments_mode = oldmode
s.tidx = 0
$if debugscanner ? {
for t in s.all_tokens {
eprintln('> tidx:${t.tidx:-5} | kind: ${t.kind:-10} | lit: $t.lit')
}
}
}
pub fn (mut s Scanner) scan_remaining_text() {
for {
t := s.text_scan()
if s.comments_mode == .skip_comments && t.kind == .comment {
continue
}
s.all_tokens << t
if t.kind == .eof || s.should_abort {
break
}
}
}
pub fn (mut s Scanner) scan() token.Token {
return s.buffer_scan()
}
[direct_array_access]
pub fn (mut s Scanner) buffer_scan() token.Token {
for {
cidx := s.tidx
s.tidx++
if cidx >= s.all_tokens.len || s.should_abort {
return s.end_of_file()
}
if s.all_tokens[cidx].kind == .comment {
if !s.should_parse_comment() {
continue
}
}
return s.all_tokens[cidx]
}
return s.new_eof_token()
}
[direct_array_access; inline]
pub fn (s &Scanner) peek_token(n int) token.Token {
idx := s.tidx + n
if idx >= s.all_tokens.len {
return s.new_eof_token()
}
t := s.all_tokens[idx]
return t
}
[direct_array_access; inline]
fn (s &Scanner) look_ahead(n int) byte {
if s.pos + n < s.text.len {
return s.text[s.pos + n]
} else {
return `\0`
}
}
[direct_array_access]
fn (mut s Scanner) text_scan() token.Token {
// The for loop here is so that instead of doing
// `return s.scan()` (which will use a new call stack frame),
// text_scan can just do continue, keeping
// memory & stack usage low.
// That optimization mostly matters for long sections
// of comments and string literals.
for {
// if s.comments_mode == .parse_comments {
// println('\nscan()')
// }
// if s.line_comment != '' {
// s.fgenln('// LC "$s.line_comment"')
// s.line_comment = ''
// }
if s.is_started {
s.pos++
} else {
s.is_started = true
}
if !s.is_inside_string {
s.skip_whitespace()
}
if s.pos >= s.text.len || s.should_abort {
return s.end_of_file()
}
// End of $var, start next string
if s.is_inter_end {
if s.text[s.pos] == s.quote {
s.is_inter_end = false
return s.new_token(.string, '', 1)
}
s.is_inter_end = false
ident_string := s.ident_string()
return s.new_token(.string, ident_string, ident_string.len + 2) // + two quotes
}
s.skip_whitespace()
// end of file
if s.pos >= s.text.len {
return s.end_of_file()
}
// handle each char
c := s.text[s.pos]
nextc := s.look_ahead(1)
// name or keyword
if util.is_name_char(c) {
name := s.ident_name()
// tmp hack to detect . in ${}
// Check if not .eof to prevent panic
next_char := s.look_ahead(1)
kind := token.keywords[name]
if kind != .unknown {
return s.new_token(kind, name, name.len)
}
// 'asdf $b' => "b" is the last name in the string, dont start parsing string
// at the next ', skip it
if s.is_inside_string {
if next_char == s.quote {
s.is_inter_end = true
s.is_inter_start = false
s.is_inside_string = false
}
}
// end of `$expr`
// allow `'$a.b'` and `'$a.c()'`
if s.is_inter_start && next_char == `\\`
&& s.look_ahead(2) !in [`x`, `n`, `r`, `\\`, `t`, `e`, `"`, `'`] {
s.warn('unknown escape sequence \\${s.look_ahead(2)}')
}
if s.is_inter_start && next_char == `(` {
if s.look_ahead(2) != `)` {
s.warn('use `\${f(expr)}` instead of `\$f(expr)`')
}
} else if s.is_inter_start && next_char != `.` {
s.is_inter_end = true
s.is_inter_start = false
}
return s.new_token(.name, name, name.len)
} else if c.is_digit() || (c == `.` && nextc.is_digit()) {
// `123`, `.123`
if !s.is_inside_string {
// In C ints with `0` prefix are octal (in V they're decimal), so discarding heading zeros is needed.
mut start_pos := s.pos
for start_pos < s.text.len && s.text[start_pos] == `0` {
start_pos++
}
mut prefix_zero_num := start_pos - s.pos // how many prefix zeros should be jumped
// for 0b, 0o, 0x the heading zero shouldn't be jumped
if start_pos == s.text.len || (c == `0` && !s.text[start_pos].is_digit()) {
prefix_zero_num--
}
s.pos += prefix_zero_num // jump these zeros
}
num := s.ident_number()
return s.new_token(.number, num, num.len)
}
// Handle `'$fn()'`
if c == `)` && s.is_inter_start {
next_char := s.look_ahead(1)
if next_char != `.` {
s.is_inter_end = true
s.is_inter_start = false
if next_char == s.quote {
s.is_inside_string = false
}
return s.new_token(.rpar, '', 1)
}
}
// all other tokens
match c {
`+` {
if nextc == `+` {
s.pos++
return s.new_token(.inc, '', 2)
} else if nextc == `=` {
s.pos++
return s.new_token(.plus_assign, '', 2)
}
return s.new_token(.plus, '', 1)
}
`-` {
if nextc == `-` {
s.pos++
return s.new_token(.dec, '', 2)
} else if nextc == `=` {
s.pos++
return s.new_token(.minus_assign, '', 2)
}
return s.new_token(.minus, '', 1)
}
`*` {
if nextc == `=` {
s.pos++
return s.new_token(.mult_assign, '', 2)
}
return s.new_token(.mul, '', 1)
}
`^` {
if nextc == `=` {
s.pos++
return s.new_token(.xor_assign, '', 2)
}
return s.new_token(.xor, '', 1)
}
`%` {
if nextc == `=` {
s.pos++
return s.new_token(.mod_assign, '', 2)
}
return s.new_token(.mod, '', 1)
}
`?` {
return s.new_token(.question, '?', 1)
}
scanner.single_quote, scanner.double_quote {
start_line := s.line_nr
ident_string := s.ident_string()
return s.new_multiline_token(.string, ident_string, ident_string.len + 2,
start_line) // + two quotes
}
`\`` {
// ` // apostrophe balance comment. do not remove
ident_char := s.ident_char()
return s.new_token(.chartoken, ident_char, ident_char.len + 2) // + two quotes
}
`(` {
// TODO `$if vet {` for performance
if s.pref.is_vet && s.text[s.pos + 1] == ` ` {
s.vet_error('Looks like you are adding a space after `(`', .vfmt)
}
return s.new_token(.lpar, '', 1)
}
`)` {
// TODO `$if vet {` for performance
if s.pref.is_vet && s.text[s.pos - 1] == ` ` {
s.vet_error('Looks like you are adding a space before `)`', .vfmt)
}
return s.new_token(.rpar, '', 1)
}
`[` {
return s.new_token(.lsbr, '', 1)
}
`]` {
return s.new_token(.rsbr, '', 1)
}
`{` {
// Skip { in `${` in strings
if s.is_inside_string {
continue
}
return s.new_token(.lcbr, '', 1)
}
`$` {
if s.is_inside_string {
return s.new_token(.str_dollar, '', 1)
} else {
return s.new_token(.dollar, '', 1)
}
}
`}` {
// s = `hello $name !`
// s = `hello ${name} !`
if s.is_enclosed_inter {
if s.pos < s.text.len - 1 {
s.pos++
} else {
s.error('unfinished string literal')
}
if s.text[s.pos] == s.quote {
s.is_inside_string = false
s.is_enclosed_inter = false
return s.new_token(.string, '', 1)
}
s.is_enclosed_inter = false
ident_string := s.ident_string()
return s.new_token(.string, ident_string, ident_string.len + 2) // + two quotes
} else {
return s.new_token(.rcbr, '', 1)
}
}
`&` {
if nextc == `=` {
s.pos++
return s.new_token(.and_assign, '', 2)
}
afternextc := s.look_ahead(2)
if nextc == `&` && afternextc.is_space() {
s.pos++
return s.new_token(.and, '', 2)
}
return s.new_token(.amp, '', 1)
}
`|` {
if nextc == `|` {
s.pos++
return s.new_token(.logical_or, '', 2)
}
if nextc == `=` {
s.pos++
return s.new_token(.or_assign, '', 2)
}
return s.new_token(.pipe, '', 1)
}
`,` {
return s.new_token(.comma, '', 1)
}
`@` {
mut name := ''
if nextc != `\0` {
s.pos++
name = s.ident_name()
}
if s.is_fmt {
return s.new_token(.name, '@' + name, name.len + 1)
}
// @FN, @STRUCT, @MOD etc. See full list in token.valid_at_tokens
if '@' + name in token.valid_at_tokens || name.starts_with('cc') { // `=@cccond` in inline assembly
return s.new_token(.at, '@' + name, name.len + 1)
}
if !token.is_key(name) {
mut at_error_msg := '@ must be used before keywords or compile time variables (e.g. `@type string` or `@FN`)'
// If name is all uppercase, the user is probably looking for a compile time variable ("at-token")
if name.is_upper() {
at_error_msg += '\nAvailable compile time variables:\n$token.valid_at_tokens'
}
s.error(at_error_msg)
}
return s.new_token(.name, name, name.len)
}
`.` {
if nextc == `.` {
s.pos++
if s.pos + 1 < s.text.len && s.text[s.pos + 1] == `.` {
s.pos++
return s.new_token(.ellipsis, '', 3)
}
return s.new_token(.dotdot, '', 2)
}
return s.new_token(.dot, '', 1)
}
`#` {
start := s.pos + 1
s.ignore_line()
if nextc == `!` {
// treat shebang line (#!) as a comment
comment := s.text[start - 1..s.pos].trim_space()
// s.fgenln('// shebang line "$s.line_comment"')
return s.new_token(.comment, comment, comment.len + 2)
}
hash := s.text[start..s.pos].trim_space()
return s.new_token(.hash, hash, hash.len + 2)
}
`>` {
if nextc == `=` {
s.pos++
return s.new_token(.ge, '', 2)
} else if nextc == `>` {
if s.pos + 2 < s.text.len {
// an algorithm to decide it's generic or non-generic
// such as `foo<Baz, Bar<int>>(a)` vs `a, b := Foo{}<Foo{}, bar>>(baz)`
// @SleepyRoy if you have smarter algorithm :-)
// almost correct heuristics: last <T> of generic cannot be extremely long
// here we set the limit 100 which should be nice for real cases
// e.g. ...Bar<int, []Foo<int>, Baz_, [20]f64, map[string][]bool>> =>
// <int, Baz_, [20]f64, map[string][]bool => int, Baz_, f64, bool
is_generic := if s.last_lt >= 0 && s.pos - s.last_lt < 100 {
typs := s.text[s.last_lt + 1..s.pos].split(',').map(it.trim_space().trim_right('>').after(']'))
// if any typ is neither Type nor builtin, then the case is non-generic
typs.all(it.len > 0
&& ((it[0].is_capital() && it[1..].bytes().all(it.is_alnum()
|| it == `_`)) || it in ast.builtin_type_names))
} else {
false
}
if is_generic {
return s.new_token(.gt, '', 1)
} else if s.text[s.pos + 2] == `=` {
s.pos += 2
return s.new_token(.right_shift_assign, '', 3)
} else if s.text[s.pos + 2] == `>` {
if s.pos + 3 < s.text.len && s.text[s.pos + 3] == `=` {
s.pos += 3
return s.new_token(.unsigned_right_shift_assign, '', 4)
}
s.pos += 2
return s.new_token(.unsigned_right_shift, '', 3)
}
}
s.pos++
return s.new_token(.right_shift, '', 2)
}
return s.new_token(.gt, '', 1)
}
`<` {
if nextc == `=` {
s.pos++
return s.new_token(.le, '', 2)
} else if nextc == `<` {
if s.pos + 2 < s.text.len && s.text[s.pos + 2] == `=` {
s.pos += 2
return s.new_token(.left_shift_assign, '', 3)
}
s.pos++
return s.new_token(.left_shift, '', 2)
} else if nextc == `-` {
s.pos++
return s.new_token(.arrow, '', 2)
} else {
s.last_lt = s.pos
return s.new_token(.lt, '', 1)
}
}
`=` {
if nextc == `=` {
s.pos++
return s.new_token(.eq, '', 2)
} else {
return s.new_token(.assign, '', 1)
}
}
`:` {
if nextc == `=` {
s.pos++
return s.new_token(.decl_assign, '', 2)
} else {
return s.new_token(.colon, '', 1)
}
}
`;` {
return s.new_token(.semicolon, '', 1)
}
`!` {
if nextc == `=` {
s.pos++
return s.new_token(.ne, '', 2)
} else if s.text.len > s.pos + 3 && nextc == `i` && s.text[s.pos + 2] == `n`
&& s.text[s.pos + 3].is_space() {
s.pos += 2
return s.new_token(.not_in, '', 3)
} else if s.text.len > s.pos + 3 && nextc == `i` && s.text[s.pos + 2] == `s`
&& s.text[s.pos + 3].is_space() {
s.pos += 2
return s.new_token(.not_is, '', 3)
} else {
return s.new_token(.not, '', 1)
}
}
`~` {
return s.new_token(.bit_not, '', 1)
}
`/` {
if nextc == `=` {
s.pos++
return s.new_token(.div_assign, '', 2)
}
if nextc == `/` { // Single line comments
start := s.pos + 1
s.ignore_line()
mut comment_line_end := s.pos
if s.text[s.pos - 1] == scanner.b_cr {
comment_line_end--
} else {
// fix line_nr, \n was read; the comment is marked on the next line
s.pos--
s.line_nr--
}
if s.should_parse_comment() {
s.line_comment = s.text[start + 1..comment_line_end]
mut comment := s.line_comment
// Find out if this comment is on its own line (for vfmt)
mut is_separate_line_comment := true
for j := start - 2; j >= 0 && s.text[j] != scanner.b_lf; j-- {
if s.text[j] !in [`\t`, ` `] {
is_separate_line_comment = false
}
}
if is_separate_line_comment {
// NB: ´\x01´ is used to preserve the initial whitespace in comments
// that are on a separate line
comment = '\x01' + comment
}
return s.new_token(.comment, comment, comment.len + 2)
}
// s.fgenln('// ${s.prev_tok.str()} "$s.line_comment"')
// Skip the comment (return the next token)
continue
} else if nextc == `*` { // Multiline comments
start := s.pos + 2
start_line := s.line_nr
mut nest_count := 1
s.pos++
// Skip comment
for nest_count > 0 && s.pos < s.text.len - 1 {
s.pos++
if s.pos >= s.text.len {
s.line_nr--
s.error('comment not terminated')
}
if s.text[s.pos] == scanner.b_lf {
s.inc_line_number()
continue
}
if s.expect('/*', s.pos) {
nest_count++
continue
}
if s.expect('*/', s.pos) {
nest_count--
}
}
s.pos++
if s.should_parse_comment() {
mut comment := s.text[start..(s.pos - 1)].trim(' ')
if !comment.contains('\n') {
comment = '\x01' + comment
}
return s.new_multiline_token(.comment, comment, comment.len + 4,
start_line)
}
// Skip if not in fmt mode
continue
}
return s.new_token(.div, '', 1)
}
else {}
}
$if windows {
if c == `\0` {
return s.end_of_file()
}
}
s.invalid_character()
break
}
return s.end_of_file()
}
fn (mut s Scanner) invalid_character() {
len := utf8_char_len(s.text[s.pos])
end := mathutil.min(s.pos + len, s.text.len)
c := s.text[s.pos..end]
s.error('invalid character `$c`')
}
fn (s &Scanner) current_column() int {
return s.pos - s.last_nl_pos
}
fn (s &Scanner) count_symbol_before(p int, sym byte) int {
mut count := 0
for i := p; i >= 0; i-- {
if s.text[i] != sym {
break
}
count++
}
return count
}
[direct_array_access]
fn (mut s Scanner) ident_string() string {
q := s.text[s.pos]
is_quote := q == scanner.single_quote || q == scanner.double_quote
is_raw := is_quote && s.pos > 0 && s.text[s.pos - 1] == `r` && !s.is_inside_string
is_cstr := is_quote && s.pos > 0 && s.text[s.pos - 1] == `c` && !s.is_inside_string
if is_quote {
if s.is_inside_string || s.is_enclosed_inter || s.is_inter_start {
s.inter_quote = q
} else {
s.quote = q
}
}
// if s.file_path.contains('string_test') {
// println('\nident_string() at char=${s.text[s.pos].str()}')
// println('linenr=$s.line_nr quote= $qquote ${qquote.str()}')
// }
mut n_cr_chars := 0
mut start := s.pos
start_char := s.text[start]
if start_char == s.quote
|| (start_char == s.inter_quote && (s.is_inter_start || s.is_enclosed_inter)) {
start++
} else if start_char == scanner.b_lf {
s.inc_line_number()
}
s.is_inside_string = false
mut u_escapes_pos := []int{} // pos list of \uXXXX
mut backslash_count := if start_char == scanner.backslash { 1 } else { 0 }
for {
s.pos++
if s.pos >= s.text.len {
s.error('unfinished string literal')
break
}
c := s.text[s.pos]
prevc := s.text[s.pos - 1]
if c == scanner.backslash {
backslash_count++
}
// end of string
if c == s.quote && (is_raw || backslash_count % 2 == 0) {
// handle '123\\' backslash at the end
break
}
if c == s.inter_quote && (s.is_inter_start || s.is_enclosed_inter) {
break
}
if c == scanner.b_cr {
n_cr_chars++
}
if c == scanner.b_lf {
s.inc_line_number()
}
// Don't allow \0
if c == `0` && s.pos > 2 && prevc == scanner.backslash {
if (s.pos < s.text.len - 1 && s.text[s.pos + 1].is_digit())
|| s.count_symbol_before(s.pos - 1, scanner.backslash) % 2 == 0 {
} else if !is_cstr && !is_raw {
s.error(r'cannot use `\0` (NULL character) in the string literal')
}
}
// Don't allow \x00
if c == `0` && s.pos > 5 && s.expect('\\x0', s.pos - 3) {
if s.count_symbol_before(s.pos - 3, scanner.backslash) % 2 == 0 {
} else if !is_cstr && !is_raw {
s.error(r'cannot use `\x00` (NULL character) in the string literal')
}
}
// Escape `\x` `\u`
if backslash_count % 2 == 1 && !is_raw && !is_cstr {
// Escape `\x`
if c == `x` && (s.text[s.pos + 1] == s.quote || !s.text[s.pos + 1].is_hex_digit()) {
s.error(r'`\x` used with no following hex digits')
}
// Escape `\u`
if c == `u` {
if s.text[s.pos + 1] == s.quote || s.text[s.pos + 2] == s.quote
|| s.text[s.pos + 3] == s.quote || s.text[s.pos + 4] == s.quote
|| !s.text[s.pos + 1].is_hex_digit() || !s.text[s.pos + 2].is_hex_digit()
|| !s.text[s.pos + 3].is_hex_digit() || !s.text[s.pos + 4].is_hex_digit() {
s.error(r'`\u` incomplete unicode character value')
}
u_escapes_pos << s.pos - 1
}
}
// ${var} (ignore in vfmt mode) (skip \$)
if prevc == `$` && c == `{` && !is_raw
&& s.count_symbol_before(s.pos - 2, scanner.backslash) % 2 == 0 {
s.is_inside_string = true
s.is_enclosed_inter = true
// so that s.pos points to $ at the next step
s.pos -= 2
break
}
// $var
if prevc == `$` && util.is_name_char(c) && !is_raw
&& s.count_symbol_before(s.pos - 2, scanner.backslash) % 2 == 0 {
s.is_inside_string = true
s.is_inter_start = true
s.pos -= 2
break
}
if c != scanner.backslash {
backslash_count = 0
}
}
mut lit := ''
mut end := s.pos
if s.is_inside_string {
end++
}
if start <= s.pos {
mut string_so_far := s.text[start..end]
if !s.is_fmt && u_escapes_pos.len > 0 {
string_so_far = decode_u_escapes(string_so_far, start, u_escapes_pos)
}
if n_cr_chars > 0 {
string_so_far = string_so_far.replace('\r', '')
}
if string_so_far.contains('\\\n') {
lit = trim_slash_line_break(string_so_far)
} else {
lit = string_so_far
}
}
return lit
}
fn decode_u_escapes(s string, start int, escapes_pos []int) string {
if escapes_pos.len == 0 {
return s
}
mut ss := []string{cap: escapes_pos.len * 2 + 1}
ss << s[..escapes_pos.first() - start]
for i, pos in escapes_pos {
idx := pos - start
end_idx := idx + 6 // "\uXXXX".len == 6
ss << utf32_to_str(u32(strconv.parse_uint(s[idx + 2..end_idx], 16, 32) or { 0 }))
if i + 1 < escapes_pos.len {
ss << s[end_idx..escapes_pos[i + 1] - start]
} else {
ss << s[end_idx..]
}
}
return ss.join('')
}
fn trim_slash_line_break(s string) string {
mut start := 0
mut ret_str := s
for {
idx := ret_str.index_after('\\\n', start)
if idx != -1 {
ret_str = ret_str[..idx] + ret_str[idx + 2..].trim_left(' \n\t\v\f\r')
start = idx
} else {
break
}
}
return ret_str
}
fn (mut s Scanner) ident_char() string {
start := s.pos
slash := `\\`
mut len := 0
for {
s.pos++
if s.pos >= s.text.len {
break
}
if s.text[s.pos] != slash {
len++
}
double_slash := s.expect('\\\\', s.pos - 2)
if s.text[s.pos] == `\`` && (s.text[s.pos - 1] != slash || double_slash) {
// ` // apostrophe balance comment. do not remove
if double_slash {
len++
}
break
}
}
len--
c := s.text[start + 1..s.pos]
if len != 1 {
u := c.runes()
if u.len != 1 {
s.error('invalid character literal (more than one character)\n' +
'use quotes for strings, backticks for characters')
}
}
// Escapes a `'` character
if c == "'" {
return '\\' + c
}
return c
}
[direct_array_access; inline]
fn (s &Scanner) expect(want string, start_pos int) bool {
end_pos := start_pos + want.len
if start_pos < 0 || end_pos < 0 || start_pos >= s.text.len || end_pos > s.text.len {
return false
}
for pos in start_pos .. end_pos {
if s.text[pos] != want[pos - start_pos] {
return false
}
}
return true
}
[inline]
fn (mut s Scanner) ignore_line() {
s.eat_to_end_of_line()
s.inc_line_number()
}
[direct_array_access; inline]
fn (mut s Scanner) eat_to_end_of_line() {
for s.pos < s.text.len && s.text[s.pos] != scanner.b_lf {
s.pos++
}
}
[inline]
fn (mut s Scanner) inc_line_number() {
s.last_nl_pos = mathutil.min(s.text.len - 1, s.pos)
if s.is_crlf {
s.last_nl_pos++
}
s.line_nr++
s.line_ends << s.pos
if s.line_nr > s.nr_lines {
s.nr_lines = s.line_nr
}
}
pub fn (mut s Scanner) note(msg string) {
pos := token.Position{
line_nr: s.line_nr
pos: s.pos
}
if s.pref.output_mode == .stdout && !s.pref.check_only {
eprintln(util.formatted_error('notice:', msg, s.file_path, pos))
} else {
s.notices << errors.Notice{
file_path: s.file_path
pos: pos
reporter: .scanner
message: msg
}
}
}
pub fn (mut s Scanner) warn(msg string) {
if s.pref.warns_are_errors {
s.error(msg)
return
}
pos := token.Position{
line_nr: s.line_nr
pos: s.pos
col: s.current_column() - 1
}
if s.pref.output_mode == .stdout && !s.pref.check_only {
eprintln(util.formatted_error('warning:', msg, s.file_path, pos))
} else {
if s.pref.message_limit >= 0 && s.warnings.len >= s.pref.message_limit {
s.should_abort = true
return
}
s.warnings << errors.Warning{
file_path: s.file_path
pos: pos
reporter: .scanner
message: msg
}
}
}
pub fn (mut s Scanner) error(msg string) {
pos := token.Position{
line_nr: s.line_nr
pos: s.pos
col: s.current_column() - 1
}
if s.pref.output_mode == .stdout && !s.pref.check_only {
eprintln(util.formatted_error('error:', msg, s.file_path, pos))
exit(1)
} else {
if s.pref.fatal_errors {
exit(1)
}
if s.pref.message_limit >= 0 && s.errors.len >= s.pref.message_limit {
s.should_abort = true
return
}
s.errors << errors.Error{
file_path: s.file_path
pos: pos
reporter: .scanner
message: msg
}
}
}
fn (mut s Scanner) vet_error(msg string, fix vet.FixKind) {
ve := vet.Error{
message: msg
file_path: s.file_path
pos: token.Position{
line_nr: s.line_nr
col: s.current_column() - 1
}
kind: .error
fix: fix
typ: .default
}
s.vet_errors << ve
}
[noreturn]
pub fn verror(s string) {
util.verror('scanner error', s)
}
pub fn (mut s Scanner) codegen(newtext string) {
$if debug_codegen ? {
eprintln('scanner.codegen:\n $newtext')
}
// codegen makes sense only during normal compilation
// feeding code generated V code to vfmt or vdoc will
// cause them to output/document ephemeral stuff.
if s.comments_mode == .skip_comments {
s.all_tokens.delete_last() // remove .eof from end of .all_tokens
s.text += newtext
old_tidx := s.tidx
s.tidx = s.all_tokens.len
s.scan_remaining_text()
s.tidx = old_tidx
}
}
fn (mut s Scanner) trace(fbase string, message string) {
if s.file_base == fbase {
println('> s.trace | ${fbase:-10s} | $message')
}
}