// Copyright (c) 2019-2020 Alexander Medvednikov. All rights reserved.
// Use of this source code is governed by an MIT license
// that can be found in the LICENSE file.
module scanner
import os
import v.token
import v.pref
import v.util
import v.vmod
const (
single_quote = `\'`
double_quote = `"`
// char used as number separator
num_sep = `_`
)
pub struct Scanner {
pub mut:
file_path string
text string
pos int
line_nr int
last_nl_pos int // for calculating column
is_inside_string bool
is_inter_start bool // for hacky string interpolation TODO simplify
is_inter_end bool
is_debug bool
line_comment string
// prev_tok TokenKind
is_started bool
fn_name string // needed for @FN
mod_name string // needed for @MOD
struct_name string // needed for @STRUCT
vmod_file_content string // needed for @VMOD_FILE, contents of the file, *NOT its path*
is_print_line_on_error bool
is_print_colored_error bool
is_print_rel_paths_on_error bool
quote byte // which quote is used to denote current string: ' or "
line_ends []int // the positions of source lines ends (i.e. \n signs)
nr_lines int // total number of lines in the source file that were scanned
is_vh bool // Keep newlines
is_fmt bool // Used only for skipping ${} in strings, since we need literal
// string values when generating formatted code.
comments_mode CommentsMode
is_inside_toplvl_statement bool // *only* used in comments_mode: .toplevel_comments, toggled by parser
all_tokens []token.Token // *only* used in comments_mode: .toplevel_comments, contains all tokens
tidx int
eofs int
pref &pref.Preferences
vet_errors &[]string
}
/*
How the .toplevel_comments mode works:

In this mode, the scanner scans *everything* at once, before parsing starts,
including all the comments, and stores the results in a buffer s.all_tokens.
Then .scan() just returns s.all_tokens[ s.tidx++ ], *ignoring* the
comment tokens. In other words, by default in this mode, the parser
*will not see any comments* inside top level statements, so it has
no reason to complain about them.
When the parser determines that it is outside of a top level statement,
it tells the scanner to backtrack s.tidx to the current p.tok index,
changes .is_inside_toplvl_statement to false, and refills its
lookahead buffer (i.e. p.peek_tok, p.peek_tok2, p.peek_tok3) from the
scanner.
In effect, from the parser's point of view, the next tokens that it
receives with p.next() will be the same as if comments were no longer
ignored *between* top level statements.
When the parser determines that it is going inside a top level statement
again, it does the same, this time setting .is_inside_toplvl_statement
to true and refilling the lookahead buffer => calling .next() in this
mode will again ignore all the comment tokens, until the top level
statement is finished.
*/
// The different kinds of scanner modes:
//
// .skip_comments - simplest/fastest, just ignores all comments early.
// This mode is used by the compiler itself.
//
// .parse_comments is used by vfmt. Ideally it should handle inline /* */
// comments too, i.e. it returns every kind of comment as a new token.
//
// .toplevel_comments is used by vdoc; it parses *only* the top level comments,
// i.e. the ones that are *outside* structs/enums/fns.
pub enum CommentsMode {
skip_comments
parse_comments
toplevel_comments
}
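// A usage sketch for the .toplevel_comments mode (illustrative only, kept inside a
// comment so it is not compiled as part of the module; `pref.new_preferences()` is
// assumed here to return default preferences):
/*
fn example_toplevel_comments(path string) {
	p := pref.new_preferences()
	mut s := new_scanner_file(path, .toplevel_comments, p)
	s.scan_all_tokens_in_buffer()
	for {
		tok := s.scan()
		if tok.kind == .eof {
			break
		}
		if tok.kind == .comment {
			// in this mode, comment tokens are delivered only while
			// is_inside_toplvl_statement is false (toggled by the parser)
			println(tok.lit)
		}
	}
}
*/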
// new scanner from file.
pub fn new_scanner_file(file_path string, comments_mode CommentsMode, pref &pref.Preferences) &Scanner {
return new_vet_scanner_file(file_path, comments_mode, pref, voidptr(0))
}
pub fn new_vet_scanner_file(file_path string, comments_mode CommentsMode, pref &pref.Preferences, vet_errors &[]string) &Scanner {
if !os.exists(file_path) {
verror("$file_path doesn't exist")
}
raw_text := util.read_file(file_path) or {
verror(err)
return voidptr(0)
}
mut s := new_vet_scanner(raw_text, comments_mode, pref, vet_errors)
s.file_path = file_path
return s
}
// new scanner from string.
pub fn new_scanner(text string, comments_mode CommentsMode, pref &pref.Preferences) &Scanner {
return new_vet_scanner(text, comments_mode, pref, voidptr(0))
}
pub fn new_vet_scanner(text string, comments_mode CommentsMode, pref &pref.Preferences, vet_errors &[]string) &Scanner {
is_fmt := pref.is_fmt
mut s := &Scanner{
pref: pref
text: text
is_print_line_on_error: true
is_print_colored_error: true
is_print_rel_paths_on_error: true
is_fmt: is_fmt
comments_mode: comments_mode
vet_errors: vet_errors
}
s.file_path = 'internal_memory'
return s
}
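// A minimal usage sketch for scanning source text from memory (illustrative only,
// kept inside a comment so it is not compiled; `pref.new_preferences()` is assumed
// here to return default preferences):
/*
fn example_scan_string() {
	p := pref.new_preferences()
	mut s := new_scanner('fn main() { println(123) }', .skip_comments, p)
	for {
		tok := s.scan()
		if tok.kind == .eof {
			break
		}
		// print every token kind and its literal (empty for operators)
		println(tok.kind.str() + ' ' + tok.lit)
	}
}
*/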
[inline]
fn (s &Scanner) should_parse_comment() bool {
res := (s.comments_mode == .parse_comments) ||
(s.comments_mode == .toplevel_comments && !s.is_inside_toplvl_statement)
return res
}
// NB: this is called by v's parser
pub fn (mut s Scanner) set_is_inside_toplevel_statement(newstate bool) {
s.is_inside_toplvl_statement = newstate
}
pub fn (mut s Scanner) set_current_tidx(cidx int) {
mut tidx := if cidx < 0 { 0 } else { cidx }
tidx = if tidx > s.all_tokens.len { s.all_tokens.len } else { tidx }
s.tidx = tidx
}
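// new_token constructs a token of the given kind and literal, recording its length,
// the current line and the next token index.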
[inline]
fn (mut s Scanner) new_token(tok_kind token.Kind, lit string, len int) token.Token {
cidx := s.tidx
s.tidx++
return token.Token{
kind: tok_kind
lit: lit
line_nr: s.line_nr + 1
pos: s.pos - len + 1
len: len
tidx: cidx
}
}
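// ident_name scans an identifier (name characters and digits) at the current position and returns it.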
[inline]
fn (mut s Scanner) ident_name() string {
start := s.pos
s.pos++
for s.pos < s.text.len && (util.is_name_char(s.text[s.pos]) || s.text[s.pos].is_digit()) {
s.pos++
}
name := s.text[start..s.pos]
s.pos--
return name
}
// ident_fn_name looks ahead and returns the name of the current function if possible, otherwise an empty string
fn (s &Scanner) ident_fn_name() string {
start := s.pos
mut pos := s.pos
pos++
if s.current_column() - 2 != 0 {
return s.fn_name
}
has_struct_name := s.struct_name != ''
if has_struct_name {
for pos < s.text.len && s.text[pos] != `(` {
pos++
}
if pos >= s.text.len {
return ''
}
pos++
}
for pos < s.text.len && s.text[pos] != `(` {
pos++
}
if pos >= s.text.len {
return ''
}
pos--
// Eat whitespaces
for pos > start && s.text[pos].is_space() {
pos--
}
if pos < start {
return ''
}
end_pos := pos + 1
pos--
// Search for the start position
for pos > start && util.is_func_char(s.text[pos]) {
pos--
}
pos++
start_pos := pos
if pos <= start || pos >= s.text.len {
return ''
}
if s.text[start_pos].is_digit() || end_pos > s.text.len || end_pos <= start_pos ||
end_pos <= start || start_pos < start {
return ''
}
fn_name := s.text[start_pos..end_pos]
return fn_name
}
// ident_mod_name looks ahead and returns the name of the module this file belongs to if possible, otherwise an empty string
fn (s &Scanner) ident_mod_name() string {
start := s.pos
mut pos := s.pos
pos++
// Eat whitespaces
for pos < s.text.len && s.text[pos].is_space() {
pos++
}
if pos >= s.text.len {
return ''
}
start_pos := pos
// Search for next occurrence of a whitespace or newline
for pos < s.text.len && !s.text[pos].is_space() && !util.is_nl(s.text[pos]) {
pos++
}
if pos >= s.text.len {
return ''
}
end_pos := pos
if end_pos > s.text.len || end_pos <= start_pos || end_pos <= start || start_pos <= start {
return ''
}
mod_name := s.text[start_pos..end_pos]
return mod_name
}
// ident_struct_name looks ahead and returns the name of the last encountered struct if possible, otherwise an empty string
fn (s &Scanner) ident_struct_name() string {
start := s.pos
mut pos := s.pos
// Return the last known struct_name encountered, to avoid using higher-order/anonymous function definitions
if s.current_column() - 2 != 0 {
return s.struct_name
}
pos++
// Eat whitespaces
for pos < s.text.len && s.text[pos].is_space() {
pos++
}
if pos >= s.text.len {
return ''
}
// Return if `(` is not the first character after "fn ..."
if s.text[pos] != `(` {
return ''
}
// Search for closing parenthesis
for pos < s.text.len && s.text[pos] != `)` {
pos++
}
if pos >= s.text.len {
return ''
}
pos--
// Search backwards for end position of struct name
// Eat whitespaces
for pos > start && s.text[pos].is_space() {
pos--
}
if pos < start {
return ''
}
end_pos := pos + 1
// Go back while we have a name character or digit
for pos > start && (util.is_name_char(s.text[pos]) || s.text[pos].is_digit()) {
pos--
}
if pos < start {
return ''
}
start_pos := pos + 1
if s.text[start_pos].is_digit() || end_pos > s.text.len || end_pos <= start_pos ||
end_pos <= start || start_pos <= start {
return ''
}
struct_name := s.text[start_pos..end_pos]
return struct_name
}
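// filter_num_sep copies the bytes in txt[start..end] into a new string, dropping any `_` digit separators (e.g. `1_000` becomes `1000`).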
fn filter_num_sep(txt byteptr, start, end int) string {
unsafe {
mut b := malloc(end - start + 1) // add a byte for the endstring 0
mut i1 := 0
for i := start; i < end; i++ {
if txt[i] != num_sep {
b[i1] = txt[i]
i1++
}
}
b[i1] = 0 // C string compatibility
return b.vstring_with_len(i1)
}
}
fn (mut s Scanner) ident_bin_number() string {
mut has_wrong_digit := false
mut first_wrong_digit_pos := 0
mut first_wrong_digit := `\0`
start_pos := s.pos
s.pos += 2 // skip '0b'
if s.text[s.pos] == num_sep {
s.error('separator `_` is only valid between digits in a numeric literal')
}
for s.pos < s.text.len {
c := s.text[s.pos]
if c == num_sep && s.text[s.pos + 1] == num_sep {
s.error('cannot use `_` consecutively')
}
if !c.is_bin_digit() && c != num_sep {
if (!c.is_digit() && !c.is_letter()) || s.is_inside_string {
break
} else if !has_wrong_digit {
has_wrong_digit = true
first_wrong_digit_pos = s.pos
first_wrong_digit = c
}
}
s.pos++
}
if s.text[s.pos - 1] == num_sep {
s.error('cannot use `_` at the end of a numeric literal')
} else if start_pos + 2 == s.pos {
s.pos-- // adjust error position
s.error('number part of this binary is not provided')
} else if has_wrong_digit {
s.pos = first_wrong_digit_pos // adjust error position
s.error('this binary number has unsuitable digit `$first_wrong_digit.str()`')
}
number := filter_num_sep(s.text.str, start_pos, s.pos)
s.pos--
return number
}
fn (mut s Scanner) ident_hex_number() string {
mut has_wrong_digit := false
mut first_wrong_digit_pos := 0
mut first_wrong_digit := `\0`
start_pos := s.pos
s.pos += 2 // skip '0x'
if s.text[s.pos] == num_sep {
s.error('separator `_` is only valid between digits in a numeric literal')
}
for s.pos < s.text.len {
c := s.text[s.pos]
if c == num_sep && s.text[s.pos + 1] == num_sep {
s.error('cannot use `_` consecutively')
}
if !c.is_hex_digit() && c != num_sep {
if !c.is_letter() || s.is_inside_string {
break
} else if !has_wrong_digit {
has_wrong_digit = true
first_wrong_digit_pos = s.pos
first_wrong_digit = c
}
}
s.pos++
}
if s.text[s.pos - 1] == num_sep {
s.error('cannot use `_` at the end of a numeric literal')
} else if start_pos + 2 == s.pos {
s.pos-- // adjust error position
s.error('number part of this hexadecimal is not provided')
} else if has_wrong_digit {
s.pos = first_wrong_digit_pos // adjust error position
s.error('this hexadecimal number has unsuitable digit `$first_wrong_digit.str()`')
}
number := filter_num_sep(s.text.str, start_pos, s.pos)
s.pos--
return number
}
fn (mut s Scanner) ident_oct_number() string {
mut has_wrong_digit := false
mut first_wrong_digit_pos := 0
mut first_wrong_digit := `\0`
start_pos := s.pos
s.pos += 2 // skip '0o'
if s.text[s.pos] == num_sep {
s.error('separator `_` is only valid between digits in a numeric literal')
}
for s.pos < s.text.len {
c := s.text[s.pos]
if c == num_sep && s.text[s.pos + 1] == num_sep {
s.error('cannot use `_` consecutively')
}
if !c.is_oct_digit() && c != num_sep {
if (!c.is_digit() && !c.is_letter()) || s.is_inside_string {
break
} else if !has_wrong_digit {
has_wrong_digit = true
first_wrong_digit_pos = s.pos
first_wrong_digit = c
}
}
s.pos++
}
if s.text[s.pos - 1] == num_sep {
s.error('cannot use `_` at the end of a numeric literal')
} else if start_pos + 2 == s.pos {
s.pos-- // adjust error position
s.error('number part of this octal is not provided')
} else if has_wrong_digit {
s.pos = first_wrong_digit_pos // adjust error position
s.error('this octal number has unsuitable digit `$first_wrong_digit.str()`')
}
number := filter_num_sep(s.text.str, start_pos, s.pos)
s.pos--
return number
}
fn (mut s Scanner) ident_dec_number() string {
mut has_wrong_digit := false
mut first_wrong_digit_pos := 0
mut first_wrong_digit := `\0`
start_pos := s.pos
// scan integer part
for s.pos < s.text.len {
c := s.text[s.pos]
if c == num_sep && s.text[s.pos + 1] == num_sep {
s.error('cannot use `_` consecutively')
}
if !c.is_digit() && c != num_sep {
if !c.is_letter() || c in [`e`, `E`] || s.is_inside_string {
break
} else if !has_wrong_digit {
has_wrong_digit = true
first_wrong_digit_pos = s.pos
first_wrong_digit = c
}
}
s.pos++
}
if s.text[s.pos - 1] == num_sep {
s.error('cannot use `_` at the end of a numeric literal')
}
mut call_method := false // true for, e.g., 5.str(), 5.5.str(), 5e5.str()
mut is_range := false // true for, e.g., 5..10
// scan fractional part
if s.pos < s.text.len && s.text[s.pos] == `.` {
s.pos++
if s.pos < s.text.len {
// 5.5, 5.5.str()
if s.text[s.pos].is_digit() {
for s.pos < s.text.len {
c := s.text[s.pos]
if !c.is_digit() {
if !c.is_letter() || c in [`e`, `E`] || s.is_inside_string {
// 5.5.str()
if c == `.` && s.pos + 1 < s.text.len && s.text[s.pos + 1].is_letter() {
call_method = true
}
break
} else if !has_wrong_digit {
has_wrong_digit = true
first_wrong_digit_pos = s.pos
first_wrong_digit = c
}
}
s.pos++
}
} else if s.text[s.pos] == `.` {
// 5.. (a range)
is_range = true
s.pos--
} else if s.text[s.pos] in [`e`, `E`] {
// 5.e5
} else if s.text[s.pos].is_letter() {
// 5.str()
call_method = true
s.pos--
} else {
// 5.
}
}
}
// scan exponential part
mut has_exp := false
if s.pos < s.text.len && s.text[s.pos] in [`e`, `E`] {
has_exp = true
s.pos++
if s.pos < s.text.len && s.text[s.pos] in [`-`, `+`] {
s.pos++
}
for s.pos < s.text.len {
c := s.text[s.pos]
if !c.is_digit() {
if !c.is_letter() || s.is_inside_string {
// 5e5.str()
if c == `.` && s.pos + 1 < s.text.len && s.text[s.pos + 1].is_letter() {
call_method = true
}
break
} else if !has_wrong_digit {
has_wrong_digit = true
first_wrong_digit_pos = s.pos
first_wrong_digit = c
}
}
s.pos++
}
}
if has_wrong_digit {
// error check: wrong digit
s.pos = first_wrong_digit_pos // adjust error position
s.error('this number has unsuitable digit `$first_wrong_digit.str()`')
} else if s.text[s.pos - 1] in [`e`, `E`] {
// error check: 5e
s.pos-- // adjust error position
s.error('exponent has no digits')
} else if s.pos < s.text.len && s.text[s.pos] == `.` && !is_range && !call_method {
// error check: 1.23.4, 123.e+3.4
if has_exp {
s.error('exponential part should be integer')
} else {
s.error('too many decimal points in number')
}
}
number := filter_num_sep(s.text.str, start_pos, s.pos)
s.pos--
return number
}
fn (mut s Scanner) ident_number() string {
if s.expect('0b', s.pos) {
return s.ident_bin_number()
} else if s.expect('0x', s.pos) {
return s.ident_hex_number()
} else if s.expect('0o', s.pos) {
return s.ident_oct_number()
} else {
return s.ident_dec_number()
}
}
[inline]
fn (mut s Scanner) skip_whitespace() {
// if s.is_vh { println('vh') return }
for s.pos < s.text.len && s.text[s.pos].is_space() {
if util.is_nl(s.text[s.pos]) && s.is_vh {
return
}
// Count \r\n as one line
if util.is_nl(s.text[s.pos]) && !s.expect('\r\n', s.pos - 1) {
s.inc_line_number()
}
s.pos++
}
}
fn (mut s Scanner) end_of_file() token.Token {
s.eofs++
if s.eofs > 50 {
s.line_nr--
s.error('the end of file `$s.file_path` has been reached 50 times already, the v parser is probably stuck.\n' +
'This should not happen. Please report the bug here, and include the last 2-3 lines of your source code:\n' +
'https://github.com/vlang/v/issues/new?labels=Bug&template=bug_report.md')
}
if s.pos != s.text.len && s.eofs == 1 {
s.inc_line_number()
}
s.pos = s.text.len
return s.new_token(.eof, '', 1)
}
pub fn (mut s Scanner) scan_all_tokens_in_buffer() {
// s.scan_all_tokens_in_buffer is used mainly by vdoc,
// in order to implement the .toplevel_comments mode.
cmode := s.comments_mode
s.comments_mode = .parse_comments
for {
t := s.text_scan()
s.all_tokens << t
if t.kind == .eof {
break
}
}
s.comments_mode = cmode
s.tidx = 0
$if debugscanner ? {
for t in s.all_tokens {
eprintln('> tidx:${t.tidx:-5} | kind: ${t.kind:-10} | lit: $t.lit')
}
}
}
pub fn (mut s Scanner) scan() token.Token {
if s.comments_mode == .toplevel_comments {
return s.buffer_scan()
}
return s.text_scan()
}
pub fn (mut s Scanner) buffer_scan() token.Token {
for {
cidx := s.tidx
s.tidx++
if cidx >= s.all_tokens.len {
return s.end_of_file()
}
if s.all_tokens[cidx].kind == .comment {
if !s.should_parse_comment() {
continue
}
}
return s.all_tokens[cidx]
}
}
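// look_ahead returns the byte that is n positions after the current one, or `\0` if that would be past the end of the text.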
[inline]
fn (s Scanner) look_ahead(n int) byte {
if s.pos + n < s.text.len {
return s.text[s.pos + n]
} else {
return `\0`
}
}
fn (mut s Scanner) text_scan() token.Token {
// The for loop here is so that instead of doing
// `return s.scan()` (which will use a new call stack frame),
// text_scan can just do continue, keeping
// memory & stack usage low.
// That optimization mostly matters for long sections
// of comments and string literals.
for {
// if s.comments_mode == .parse_comments {
// println('\nscan()')
// }
// if s.line_comment != '' {
// s.fgenln('// LC "$s.line_comment"')
// s.line_comment = ''
// }
if s.is_started {
s.pos++
}
s.is_started = true
if s.pos >= s.text.len {
return s.end_of_file()
}
if !s.is_inside_string {
s.skip_whitespace()
}
// End of $var, start next string
if s.is_inter_end {
if s.text[s.pos] == s.quote {
s.is_inter_end = false
return s.new_token(.string, '', 1)
}
s.is_inter_end = false
ident_string := s.ident_string()
return s.new_token(.string, ident_string, ident_string.len + 2) // + two quotes
}
s.skip_whitespace()
// end of file
if s.pos >= s.text.len {
return s.end_of_file()
}
// handle each char
c := s.text[s.pos]
nextc := s.look_ahead(1)
// name or keyword
if util.is_name_char(c) {
name := s.ident_name()
// tmp hack to detect . in ${}
// Check if not .eof to prevent panic
next_char := s.look_ahead(1)
kind := token.keywords[name]
if kind != .unknown {
if kind == .key_fn {
s.struct_name = s.ident_struct_name()
s.fn_name = s.ident_fn_name()
} else if kind == .key_module {
s.mod_name = s.ident_mod_name()
}
return s.new_token(kind, name, name.len)
}
// 'asdf $b' => "b" is the last name in the string, don't start parsing a string
// at the next ', skip it
if s.is_inside_string {
if next_char == s.quote {
s.is_inter_end = true
s.is_inter_start = false
s.is_inside_string = false
}
}
// end of `$expr`
// allow `'$a.b'` and `'$a.c()'`
if s.is_inter_start && next_char == `(` {
if s.look_ahead(2) != `)` {
s.warn('use e.g. `\${f(expr)}` or `\$name\\(` instead of `\$f(expr)`')
}
} else if s.is_inter_start && next_char != `.` {
s.is_inter_end = true
s.is_inter_start = false
}
if s.pos == 0 && next_char == ` ` {
// If a single letter name is at the start of the file, increment the position,
// otherwise the scanner would be stuck at s.pos = 0
s.pos++
}
return s.new_token(.name, name, name.len)
} else if c.is_digit() || (c == `.` && nextc.is_digit()) {
// `123`, `.123`
if !s.is_inside_string {
// In C, ints with a `0` prefix are octal (in V they're decimal), so discarding the leading zeros is needed.
mut start_pos := s.pos
for start_pos < s.text.len && s.text[start_pos] == `0` {
start_pos++
}
mut prefix_zero_num := start_pos - s.pos // how many leading zeros should be skipped
// for 0b, 0o, 0x the leading zero shouldn't be skipped
if start_pos == s.text.len || (c == `0` && !s.text[start_pos].is_digit()) {
prefix_zero_num--
}
s.pos += prefix_zero_num // jump these zeros
}
num := s.ident_number()
return s.new_token(.number, num, num.len)
}
// Handle `'$fn()'`
if c == `)` && s.is_inter_start {
next_char := s.look_ahead(1)
if next_char != `.` {
s.is_inter_end = true
s.is_inter_start = false
if next_char == s.quote {
s.is_inside_string = false
}
return s.new_token(.rpar, '', 1)
}
}
// all other tokens
match c {
`+` {
if nextc == `+` {
s.pos++
return s.new_token(.inc, '', 2)
} else if nextc == `=` {
s.pos++
return s.new_token(.plus_assign, '', 2)
}
return s.new_token(.plus, '', 1)
}
`-` {
if nextc == `-` {
s.pos++
return s.new_token(.dec, '', 2)
} else if nextc == `=` {
s.pos++
return s.new_token(.minus_assign, '', 2)
}
return s.new_token(.minus, '', 1)
}
`*` {
if nextc == `=` {
s.pos++
return s.new_token(.mult_assign, '', 2)
}
return s.new_token(.mul, '', 1)
}
`^` {
if nextc == `=` {
s.pos++
return s.new_token(.xor_assign, '', 2)
}
return s.new_token(.xor, '', 1)
}
`%` {
if nextc == `=` {
s.pos++
return s.new_token(.mod_assign, '', 2)
}
return s.new_token(.mod, '', 1)
}
`?` {
return s.new_token(.question, '', 1)
}
single_quote, double_quote {
ident_string := s.ident_string()
return s.new_token(.string, ident_string, ident_string.len + 2) // + two quotes
}
`\`` {
// ` // apostrophe balance comment. do not remove
ident_char := s.ident_char()
return s.new_token(.chartoken, ident_char, ident_char.len + 2) // + two quotes
}
`(` {
// TODO `$if vet {` for performance
if s.pref.is_vet && s.text[s.pos + 1] == ` ` {
s.vet_error('Looks like you are adding a space after `(`')
}
return s.new_token(.lpar, '', 1)
}
`)` {
// TODO `$if vet {` for performance
if s.pref.is_vet && s.text[s.pos - 1] == ` ` {
s.vet_error('Looks like you are adding a space before `)`')
}
return s.new_token(.rpar, '', 1)
}
`[` {
return s.new_token(.lsbr, '', 1)
}
`]` {
return s.new_token(.rsbr, '', 1)
}
`{` {
// Skip { in `${` in strings
if s.is_inside_string {
continue
}
return s.new_token(.lcbr, '', 1)
}
`$` {
if s.is_inside_string {
return s.new_token(.str_dollar, '', 1)
} else {
return s.new_token(.dollar, '', 1)
}
}
`}` {
// s = `hello $name !`
// s = `hello ${name} !`
if s.is_inside_string {
s.pos++
if s.text[s.pos] == s.quote {
s.is_inside_string = false
return s.new_token(.string, '', 1)
}
ident_string := s.ident_string()
return s.new_token(.string, ident_string, ident_string.len + 2) // + two quotes
} else {
return s.new_token(.rcbr, '', 1)
}
}
`&` {
if nextc == `=` {
s.pos++
return s.new_token(.and_assign, '', 2)
}
afternextc := s.look_ahead(2)
if nextc == `&` && afternextc.is_space() {
s.pos++
return s.new_token(.and, '', 2)
}
return s.new_token(.amp, '', 1)
}
`|` {
if nextc == `|` {
s.pos++
return s.new_token(.logical_or, '', 2)
}
if nextc == `=` {
s.pos++
return s.new_token(.or_assign, '', 2)
}
return s.new_token(.pipe, '', 1)
}
`,` {
return s.new_token(.comma, '', 1)
}
`@` {
s.pos++
name := s.ident_name()
if s.is_fmt {
return s.new_token(.name, '@' + name, name.len + 1)
}
// @FN => will be substituted with the name of the current V function
// @MOD => will be substituted with the name of the current V module
// @STRUCT => will be substituted with the name of the current V struct
// @VEXE => will be substituted with the path to the V compiler
// @FILE => will be substituted with the path of the V source file
// @LINE => will be substituted with the V line number where it appears (as a string).
// @COLUMN => will be substituted with the column where it appears (as a string).
// @VHASH => will be substituted with the shortened commit hash of the V compiler (as a string).
// @VMOD_FILE => will be substituted with the contents of the nearest v.mod file (as a string).
// This allows things like this:
// println( 'file: ' + @FILE + ' | line: ' + @LINE + ' | fn: ' + @MOD + '.' + @FN)
// ... which is useful while debugging/tracing
if name == 'FN' {
return s.new_token(.string, s.fn_name, 3)
}
if name == 'MOD' {
return s.new_token(.string, s.mod_name, 4)
}
if name == 'STRUCT' {
return s.new_token(.string, s.struct_name, 7)
}
if name == 'VEXE' {
vexe := pref.vexe_path()
return s.new_token(.string, util.cescaped_path(vexe), 5)
}
if name == 'FILE' {
fpath := os.real_path(s.file_path)
return s.new_token(.string, util.cescaped_path(fpath), 5)
}
if name == 'LINE' {
return s.new_token(.string, (s.line_nr + 1).str(), 5)
}
if name == 'COLUMN' {
return s.new_token(.string, s.current_column().str(), 7)
}
if name == 'VHASH' {
return s.new_token(.string, util.vhash(), 6)
}
if name == 'VMOD_FILE' {
if s.vmod_file_content.len == 0 {
mut mcache := vmod.get_cache()
vmod_file_location := mcache.get_by_file(s.file_path)
if vmod_file_location.vmod_file.len == 0 {
s.error('@VMOD_FILE can be used only in projects that have a v.mod file')
}
vmod_content := os.read_file(vmod_file_location.vmod_file) or {
''
}
$if windows {
s.vmod_file_content = vmod_content.replace('\r\n', '\n')
} $else {
s.vmod_file_content = vmod_content
}
}
return s.new_token(.string, s.vmod_file_content, 10)
}
if !token.is_key(name) {
s.error('@ must be used before keywords (e.g. `@type string`)')
}
return s.new_token(.name, name, name.len)
}
/*
case `\r`:
if nextc == `\n` {
s.pos++
s.last_nl_pos = s.pos
return s.new_token(.nl, '')
}
}
case `\n`:
s.last_nl_pos = s.pos
return s.new_token(.nl, '')
}
*/
`.` {
if nextc == `.` {
s.pos++
if s.text[s.pos + 1] == `.` {
s.pos++
return s.new_token(.ellipsis, '', 3)
}
return s.new_token(.dotdot, '', 2)
}
return s.new_token(.dot, '', 1)
}
`#` {
start := s.pos + 1
s.ignore_line()
if nextc == `!` {
// treat shebang line (#!) as a comment
s.line_comment = s.text[start + 1..s.pos].trim_space()
// s.fgenln('// shebang line "$s.line_comment"')
continue
}
hash := s.text[start..s.pos].trim_space()
return s.new_token(.hash, hash, hash.len)
}
`>` {
if nextc == `=` {
s.pos++
return s.new_token(.ge, '', 2)
} else if nextc == `>` {
if s.pos + 2 < s.text.len && s.text[s.pos + 2] == `=` {
s.pos += 2
return s.new_token(.right_shift_assign, '', 3)
}
s.pos++
return s.new_token(.right_shift, '', 2)
} else {
return s.new_token(.gt, '', 1)
}
}
0xE2 {
if nextc == 0x89 && s.text[s.pos + 2] == 0xA0 {
// case `≠`:
s.pos += 2
return s.new_token(.ne, '', 3)
} else if nextc == 0x89 && s.text[s.pos + 2] == 0xBD {
s.pos += 2
return s.new_token(.le, '', 3)
} else if nextc == 0xA9 && s.text[s.pos + 2] == 0xBE {
s.pos += 2
return s.new_token(.ge, '', 3)
}
}
`<` {
if nextc == `=` {
s.pos++
return s.new_token(.le, '', 2)
} else if nextc == `<` {
if s.pos + 2 < s.text.len && s.text[s.pos + 2] == `=` {
s.pos += 2
return s.new_token(.left_shift_assign, '', 3)
}
s.pos++
return s.new_token(.left_shift, '', 2)
} else if nextc == `-` {
s.pos++
return s.new_token(.arrow, '', 2)
} else {
return s.new_token(.lt, '', 1)
}
}
`=` {
if nextc == `=` {
s.pos++
return s.new_token(.eq, '', 2)
} else {
return s.new_token(.assign, '', 1)
}
}
`:` {
if nextc == `=` {
s.pos++
return s.new_token(.decl_assign, '', 2)
} else {
return s.new_token(.colon, '', 1)
}
}
`;` {
return s.new_token(.semicolon, '', 1)
}
`!` {
if nextc == `=` {
s.pos++
return s.new_token(.ne, '', 2)
} else if nextc == `i` && s.text[s.pos + 2] == `n` && s.text[s.pos + 3].is_space() {
s.pos += 2
return s.new_token(.not_in, '', 3)
} else if nextc == `i` && s.text[s.pos + 2] == `s` && s.text[s.pos + 3].is_space() {
s.pos += 2
return s.new_token(.not_is, '', 3)
} else {
return s.new_token(.not, '', 1)
}
}
`~` {
return s.new_token(.bit_not, '', 1)
}
`/` {
if nextc == `=` {
s.pos++
return s.new_token(.div_assign, '', 2)
}
if nextc == `/` {
start := s.pos + 1
s.ignore_line()
mut comment_line_end := s.pos
if s.text[s.pos - 1] == `\r` {
comment_line_end--
} else {
// fix line_nr, \n was read; the comment is marked on the next line
s.pos--
s.line_nr--
}
if s.should_parse_comment() {
s.line_comment = s.text[start + 1..comment_line_end]
mut comment := s.line_comment.trim_space()
// Find out if this comment is on its own line (for vfmt)
mut is_separate_line_comment := true
for j := start - 2; j >= 0 && s.text[j] != `\n`; j-- {
if s.text[j] !in [`\t`, ` `] {
is_separate_line_comment = false
}
}
if is_separate_line_comment {
comment = '|' + comment
}
return s.new_token(.comment, comment, comment.len + 2)
}
// s.fgenln('// ${s.prev_tok.str()} "$s.line_comment"')
// Skip the comment (return the next token)
continue
}
// Multiline comments
if nextc == `*` {
start := s.pos + 2
mut nest_count := 1
// Skip comment
for nest_count > 0 {
s.pos++
if s.pos >= s.text.len {
s.line_nr--
s.error('comment not terminated')
}
if s.text[s.pos] == `\n` {
s.inc_line_number()
continue
}
if s.expect('/*', s.pos) {
nest_count++
continue
}
if s.expect('*/', s.pos) {
nest_count--
}
}
s.pos++
if s.should_parse_comment() {
comment := s.text[start..(s.pos - 1)].trim_space()
return s.new_token(.comment, comment, comment.len + 4)
}
// Skip if not in fmt mode
continue
}
return s.new_token(.div, '', 1)
}
else {}
}
$if windows {
if c == `\0` {
return s.end_of_file()
}
}
s.error('invalid character `$c.str()`')
break
}
return s.end_of_file()
}
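// current_column returns the column of the current scanner position, counted from the last newline.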
fn (s &Scanner) current_column() int {
return s.pos - s.last_nl_pos
}
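// count_symbol_before counts how many consecutive `sym` bytes end at position p;
// it is used to check whether a backslash is itself escaped.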
fn (s &Scanner) count_symbol_before(p int, sym byte) int {
mut count := 0
for i := p; i >= 0; i-- {
if s.text[i] != sym {
break
}
count++
}
return count
}
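// ident_string scans a string literal, handling escapes and the start of `$`/`${}` interpolation,
// and returns its contents without the quotes.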
fn (mut s Scanner) ident_string() string {
q := s.text[s.pos]
is_quote := q == single_quote || q == double_quote
is_raw := is_quote && s.pos > 0 && s.text[s.pos - 1] == `r`
is_cstr := is_quote && s.pos > 0 && s.text[s.pos - 1] == `c`
if is_quote && !s.is_inside_string {
s.quote = q
}
// if s.file_path.contains('string_test') {
// println('\nident_string() at char=${s.text[s.pos].str()}')
// println('linenr=$s.line_nr quote= $qquote ${qquote.str()}')
// }
mut n_cr_chars := 0
mut start := s.pos
s.is_inside_string = false
slash := `\\`
for {
s.pos++
if s.pos >= s.text.len {
s.error('unfinished string literal')
2019-12-22 02:34:37 +01:00
}
c := s.text[s.pos]
prevc := s.text[s.pos - 1]
// end of string
if c == s.quote && (prevc != slash || (prevc == slash && s.text[s.pos - 2] == slash)) {
// handle '123\\' slash at the end
break
}
if c == `\r` {
n_cr_chars++
}
if c == `\n` {
s.inc_line_number()
}
// Don't allow \0
if c == `0` && s.pos > 2 && s.text[s.pos - 1] == slash {
if (s.pos < s.text.len - 1 && s.text[s.pos + 1].is_digit()) ||
s.count_symbol_before(s.pos - 1, slash) % 2 == 0 {
} else if !is_cstr && !is_raw {
s.error(r'cannot use `\0` (NULL character) in the string literal')
}
}
// Don't allow \x00
if c == `0` && s.pos > 5 && s.expect('\\x0', s.pos - 3) {
if s.count_symbol_before(s.pos - 3, slash) % 2 == 0 {
} else if !is_cstr && !is_raw {
s.error(r'cannot use `\x00` (NULL character) in the string literal')
}
}
// ${var} (ignore in vfmt mode)
if prevc == `$` && c == `{` && !is_raw && s.count_symbol_before(s.pos - 2, slash) % 2 == 0 {
s.is_inside_string = true
// so that s.pos points to $ at the next step
s.pos -= 2
break
}
// $var
if prevc == `$` && util.is_name_char(c) && !is_raw && s.count_symbol_before(s.pos - 2, slash) %
2 == 0 {
s.is_inside_string = true
s.is_inter_start = true
s.pos -= 2
break
}
}
mut lit := ''
if s.text[start] == s.quote {
start++
}
mut end := s.pos
if s.is_inside_string {
end++
}
if start <= s.pos {
mut string_so_far := s.text[start..end]
if n_cr_chars > 0 {
string_so_far = string_so_far.replace('\r', '')
}
if string_so_far.contains('\\\n') {
lit = trim_slash_line_break(string_so_far)
} else {
lit = string_so_far
}
}
return lit
}
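// trim_slash_line_break removes `\` line continuations, i.e. the backslash,
// the newline after it and the leading whitespace on the following line.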
fn trim_slash_line_break(s string) string {
mut start := 0
mut ret_str := s
for {
idx := ret_str.index_after('\\\n', start)
if idx != -1 {
ret_str = ret_str[..idx] + ret_str[idx + 2..].trim_left(' \n\t\v\f\r')
start = idx
} else {
break
}
}
return ret_str
}
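// ident_char scans a backtick-quoted character literal and returns its contents.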
fn (mut s Scanner) ident_char() string {
start := s.pos
slash := `\\`
mut len := 0
for {
s.pos++
if s.pos >= s.text.len {
break
}
if s.text[s.pos] != slash {
len++
}
double_slash := s.expect('\\\\', s.pos - 2)
if s.text[s.pos] == `\`` && (s.text[s.pos - 1] != slash || double_slash) {
// ` // apostrophe balance comment. do not remove
if double_slash {
len++
}
break
}
}
len--
c := s.text[start + 1..s.pos]
if len != 1 {
u := c.ustring()
if u.len != 1 {
s.error('invalid character literal (more than one character)\n' + 'use quotes for strings, backticks for characters')
}
}
// Escapes a `'` character
return if c == "\'" {
'\\' + c
} else {
c
}
}
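// expect reports whether the text at start_pos starts with the string want.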
[inline]
fn (s &Scanner) expect(want string, start_pos int) bool {
end_pos := start_pos + want.len
if start_pos < 0 || start_pos >= s.text.len {
return false
}
if end_pos < 0 || end_pos > s.text.len {
return false
}
for pos in start_pos .. end_pos {
if s.text[pos] != want[pos - start_pos] {
return false
}
}
return true
}
fn (mut s Scanner) debug_tokens() {
s.pos = 0
s.is_started = false
s.is_debug = true
fname := s.file_path.all_after_last(os.path_separator)
println('\n===DEBUG TOKENS $fname===')
for {
tok := s.scan()
tok_kind := tok.kind
lit := tok.lit
print(tok_kind.str())
if lit != '' {
println(' `$lit`')
} else {
println('')
}
if tok_kind == .eof {
println('============ END OF DEBUG TOKENS ==================')
break
}
}
}
[inline]
fn (mut s Scanner) ignore_line() {
s.eat_to_end_of_line()
s.inc_line_number()
}
[inline]
fn (mut s Scanner) eat_to_end_of_line() {
for s.pos < s.text.len && s.text[s.pos] != `\n` {
s.pos++
}
}
[inline]
fn (mut s Scanner) inc_line_number() {
s.last_nl_pos = s.pos
s.line_nr++
s.line_ends << s.pos
if s.line_nr > s.nr_lines {
s.nr_lines = s.line_nr
}
}
pub fn (s &Scanner) warn(msg string) {
pos := token.Position{
line_nr: s.line_nr
pos: s.pos
}
eprintln(util.formatted_error('warning:', msg, s.file_path, pos))
}
pub fn (s &Scanner) error(msg string) {
pos := token.Position{
line_nr: s.line_nr
pos: s.pos
}
eprintln(util.formatted_error('error:', msg, s.file_path, pos))
exit(1)
}
fn (mut s Scanner) vet_error(msg string) {
eline := '$s.file_path:$s.line_nr: $msg'
if s.vet_errors == 0 {
eprintln(eline)
return
}
s.vet_errors << eline
}
pub fn verror(s string) {
util.verror('scanner error', s)
}
pub fn (mut s Scanner) codegen(newtext string) {
// codegen makes sense only during normal compilation;
// feeding generated V code to vfmt or vdoc would
// cause them to output/document ephemeral stuff.
if s.comments_mode == .skip_comments {
s.text += newtext
$if debug_codegen ? {
eprintln('scanner.codegen:\n $newtext')
}
}
}