scanner: speed up text_scan by using a specialised keywords matcher, instead of a generic V map of keywords
parent
b1bb1d361a
commit
8014235e0e
|
@ -499,6 +499,28 @@ fn (s string) == (a string) bool {
|
|||
}
|
||||
}
|
||||
|
||||
// compare does a byte-wise lexicographic comparison of `s` with `a`.
// It returns -1 when `s` sorts before `a`, 1 when `s` sorts after `a`,
// and 0 when both strings have the same bytes and the same length.
[direct_array_access]
pub fn (s string) compare(a string) int {
	// only the prefix that both strings share can be compared byte by byte
	mut common := a.len
	if s.len < a.len {
		common = s.len
	}
	mut idx := 0
	for idx < common {
		left := s[idx]
		right := a[idx]
		if left != right {
			// the first differing byte decides the order
			return if left < right { -1 } else { 1 }
		}
		idx++
	}
	// the common prefix is identical, so the shorter string sorts first
	if s.len == a.len {
		return 0
	}
	return if s.len < a.len { -1 } else { 1 }
}
|
||||
|
||||
[direct_array_access]
|
||||
fn (s string) < (a string) bool {
|
||||
for i in 0 .. s.len {
|
||||
if i >= a.len || s[i] > a[i] {
|
||||
|
@ -513,6 +535,7 @@ fn (s string) < (a string) bool {
|
|||
return false
|
||||
}
|
||||
|
||||
[direct_array_access]
|
||||
fn (s string) + (a string) string {
|
||||
new_len := a.len + s.len
|
||||
mut res := string{
|
||||
|
|
|
@ -675,7 +675,7 @@ fn (mut s Scanner) text_scan() token.Token {
|
|||
// tmp hack to detect . in ${}
|
||||
// Check if not .eof to prevent panic
|
||||
next_char := s.look_ahead(1)
|
||||
kind := token.keywords[name]
|
||||
kind := token.matcher.find(name)
|
||||
if kind != .unknown {
|
||||
return s.new_token(kind, name, name.len)
|
||||
}
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
import v.token
|
||||
import benchmark
|
||||
|
||||
const max_repetitions = 4_000_000
|
||||
|
||||
// main times `max_repetitions` lookups of each sample name, first through the
// token.keywords map and then through a KeywordsMatcher built from that map,
// and prints one measurement line per approach.
fn main() {
	// sample names: keywords of different lengths, mixed with plain identifiers
	samples := ['for', 'val', 'int', 'f32', 'struct', 'return', 'if', 'in', 'as', 'or', 'else',
		'unsafe', 'return', 'assert', 'Abc', 'my_identifier', 'a']
	km := token.new_keywords_matcher(token.keywords)
	for kw in samples {
		mut res := token.Kind{}
		mut bmark := benchmark.start()
		// baseline: the generic map lookup
		for _ in 0 .. max_repetitions {
			res = token.keywords[kw]
		}
		bmark.measure('$max_repetitions repetitions of token.keywords["$kw"] = $res')
		// candidate: the specialised matcher
		for _ in 0 .. max_repetitions {
			res = km.find(kw)
		}
		bmark.measure('$max_repetitions repetitions of km.find("$kw") = $res')
		println('--------------------------------')
	}
}
|
|
@ -0,0 +1,92 @@
|
|||
module token
|
||||
|
||||
// bump token.max_keyword_len, if you add a keyword whose length is >= its current value
|
||||
const max_keyword_len = 11
|
||||
|
||||
// KeywordsMatcher provides a faster way of determining whether a given name
// is a reserved keyword: the keywords are grouped into bins by their length,
// so a lookup only compares `name` with the keywords that have exactly the
// same length as `name`.
// Benchmark notes:
// * With -prod, it is 20-25% slower in the worst case, compared to just using
//   token.keywords[name], but it can be 20x faster when there is a length
//   mismatch, and 2x-3x faster in most cases where there is a match.
// * Without -prod, with tcc, using KeywordsMatcher is always faster
//   (2x to 14x times), compared to using a hash of all the keywords.
pub struct KeywordsMatcher {
mut:
	len_min int = 9999 // smallest len of any added word; stays 9999 while no word is added
	len_max int = -1 // largest len of any added word; stays -1 while no word is added
	words   [max_keyword_len][]WKind // words[n] holds the added words whose len is n
}
|
||||
|
||||
// WKind is a single KeywordsMatcher entry: a word, paired with its token Kind.
// NB: add_word initialises it positionally, so keep the field order.
struct WKind {
mut:
	word string // the word itself
	kind Kind // the Kind that find returns for `word`
}
|
||||
|
||||
// new_keywords_matcher builds a KeywordsMatcher from the given map of
// word => Kind pairs. Every word is put in the bin for its length, and then
// each non empty bin is sorted by word, so that `find` can binary search it.
pub fn new_keywords_matcher(kw_map map[string]Kind) KeywordsMatcher {
	mut km := KeywordsMatcher{}
	// TODO: remove this loop. It is currently needed, because a
	// fixed array of arrays is not initialised properly automatically
	// as of 2021/10/28
	for bin := 0; bin < token.max_keyword_len; bin++ {
		km.words[bin] = []WKind{}
	}
	for word, kind in kw_map {
		km.add_word(word, kind)
	}
	for i := 0; i < token.max_keyword_len; i++ {
		if km.words[i].len == 0 {
			// nothing to sort or trace for an empty bin
			continue
		}
		km.words[i].sort(a.word < b.word)
		// compile with `-d trace_keyword_matcher_initialisation` to dump the bins
		$if trace_keyword_matcher_initialisation ? {
			print('word len: ${i:3} | words: ')
			for w in km.words[i] {
				print('$w.word, ')
			}
			println('')
		}
	}
	return km
}
|
||||
|
||||
// add_word appends `word` with its `kind` to the bin for words of the same
// length, and widens the len_min/len_max range, so that it includes word.len.
// It panics, when word.len does not fit in the fixed number of bins.
// NB: the bin is not resorted here; new_keywords_matcher sorts all the bins,
// after it has added all the words.
fn (mut km KeywordsMatcher) add_word(word string, kind Kind) {
	wlen := word.len
	if wlen >= token.max_keyword_len {
		panic('increase max_keyword_len to > $word.len')
	}
	if wlen < km.len_min {
		km.len_min = wlen
	}
	if wlen > km.len_max {
		km.len_max = wlen
	}
	km.words[wlen] << WKind{
		word: word
		kind: kind
	}
}
|
||||
|
||||
// find returns the Kind that was stored for `word`, or Kind.unknown, when
// `word` was never added. Only the bin for words of the same length as `word`
// is checked, by doing a binary search over its sorted entries.
[direct_array_access]
pub fn (km &KeywordsMatcher) find(word string) Kind {
	wlen := word.len
	// lengths outside the range of the added words can not match anything
	if wlen < km.len_min || wlen > km.len_max {
		return .unknown
	}
	mut first := 0
	// an empty bin gives last == -1, so the loop below is skipped entirely
	mut last := km.words[wlen].len - 1
	for first <= last {
		middle := first + (last - first) / 2
		order := km.words[wlen][middle].word.compare(word)
		if order == 0 {
			return km.words[wlen][middle].kind
		}
		if order == -1 {
			// the stored word sorts before `word` => continue in the upper half
			first = middle + 1
		} else if order == 1 {
			// the stored word sorts after `word` => continue in the lower half
			last = middle - 1
		}
	}
	return .unknown
}
|
|
@ -132,15 +132,11 @@ pub enum Kind {
|
|||
_end_
|
||||
}
|
||||
|
||||
pub const (
|
||||
assign_tokens = [Kind.assign, .plus_assign, .minus_assign, .mult_assign, .div_assign, .xor_assign,
|
||||
.mod_assign, .or_assign, .and_assign, .right_shift_assign, .left_shift_assign,
|
||||
pub const assign_tokens = [Kind.assign, .plus_assign, .minus_assign, .mult_assign, .div_assign,
|
||||
.xor_assign, .mod_assign, .or_assign, .and_assign, .right_shift_assign, .left_shift_assign,
|
||||
.unsigned_right_shift_assign]
|
||||
)
|
||||
|
||||
const (
|
||||
nr_tokens = int(Kind._end_)
|
||||
)
|
||||
const nr_tokens = int(Kind._end_)
|
||||
|
||||
// @FN => will be substituted with the name of the current V function
|
||||
// @METHOD => will be substituted with ReceiverType.MethodName
|
||||
|
@ -182,10 +178,8 @@ pub enum AtKind {
|
|||
vexeroot_path
|
||||
}
|
||||
|
||||
pub const (
|
||||
valid_at_tokens = ['@VROOT', '@VMODROOT', '@VEXEROOT', '@FN', '@METHOD', '@MOD', '@STRUCT',
|
||||
pub const valid_at_tokens = ['@VROOT', '@VMODROOT', '@VEXEROOT', '@FN', '@METHOD', '@MOD', '@STRUCT',
|
||||
'@VEXE', '@FILE', '@LINE', '@COLUMN', '@VHASH', '@VMOD_FILE']
|
||||
)
|
||||
|
||||
// build_keys generates a map with keywords' string values:
|
||||
// Keywords['return'] == .key_return
|
||||
|
@ -315,13 +309,11 @@ fn build_token_str() []string {
|
|||
return s
|
||||
}
|
||||
|
||||
const (
|
||||
token_str = build_token_str()
|
||||
)
|
||||
const token_str = build_token_str()
|
||||
|
||||
pub const (
|
||||
keywords = build_keys()
|
||||
)
|
||||
pub const keywords = build_keys()
|
||||
|
||||
pub const matcher = new_keywords_matcher(keywords)
|
||||
|
||||
[inline]
|
||||
pub fn is_key(key string) bool {
|
||||
|
@ -365,10 +357,8 @@ pub fn (t Token) str() string {
|
|||
|
||||
// Representation of highest and lowest precedence
|
||||
/*
|
||||
pub const (
|
||||
lowest_prec = 0
|
||||
highest_prec = 8
|
||||
)
|
||||
pub const lowest_prec = 0
|
||||
pub const highest_prec = 8
|
||||
*/
|
||||
pub enum Precedence {
|
||||
lowest
|
||||
|
@ -439,9 +429,7 @@ pub fn build_precedences() []Precedence {
|
|||
return p
|
||||
}
|
||||
|
||||
const (
|
||||
precedences = build_precedences()
|
||||
)
|
||||
const precedences = build_precedences()
|
||||
|
||||
// precedence returns a tokens precedence if defined, otherwise lowest_prec
|
||||
[inline]
|
||||
|
|
Loading…
Reference in New Issue