scanner: speed up text_scan by using a specialised keywords matcher, instead of a generic V map of keywords

pull/12298/head
Delyan Angelov 2021-10-28 15:09:41 +03:00
parent b1bb1d361a
commit 8014235e0e
No known key found for this signature in database
GPG Key ID: 66886C0F12D595ED
5 changed files with 151 additions and 26 deletions

View File

@ -499,6 +499,28 @@ fn (s string) == (a string) bool {
}
}
// compare returns -1 if `s` < `a`, 0 if `s` == `a`, and 1 if `s` > `a`
[direct_array_access]
pub fn (s string) compare(a string) int {
	// walk the common prefix; the first differing byte decides the order
	shorter := if a.len < s.len { a.len } else { s.len }
	for idx := 0; idx < shorter; idx++ {
		if s[idx] != a[idx] {
			return if s[idx] < a[idx] { -1 } else { 1 }
		}
	}
	// common prefix is equal => the shorter string sorts first
	if s.len == a.len {
		return 0
	}
	return if s.len < a.len { -1 } else { 1 }
}
[direct_array_access]
fn (s string) < (a string) bool {
for i in 0 .. s.len {
if i >= a.len || s[i] > a[i] {
@ -513,6 +535,7 @@ fn (s string) < (a string) bool {
return false
}
[direct_array_access]
fn (s string) + (a string) string {
new_len := a.len + s.len
mut res := string{

View File

@ -675,7 +675,7 @@ fn (mut s Scanner) text_scan() token.Token {
// tmp hack to detect . in ${}
// Check if not .eof to prevent panic
next_char := s.look_ahead(1)
kind := token.keywords[name]
kind := token.matcher.find(name)
if kind != .unknown {
return s.new_token(kind, name, name.len)
}

View File

@ -0,0 +1,22 @@
import v.token
import benchmark

const max_repetitions = 4_000_000

// Benchmarks the specialised KeywordsMatcher against a plain map lookup,
// over a mix of real keywords, near-misses and ordinary identifiers.
fn main() {
	// build the matcher once, from the exact keyword map it is meant to replace
	km := token.new_keywords_matcher(token.keywords)
	// NOTE: the original list contained 'return' twice; the duplicate is removed,
	// so each probe word is benchmarked exactly once
	for kw in ['for', 'val', 'int', 'f32', 'struct', 'return', 'if', 'in', 'as', 'or', 'else',
		'unsafe', 'assert', 'Abc', 'my_identifier', 'a'] {
		mut res := token.Kind{}
		mut bmark := benchmark.start()
		for _ in 0 .. max_repetitions {
			res = token.keywords[kw]
		}
		bmark.measure('$max_repetitions repetitions of token.keywords["$kw"] = $res')
		for _ in 0 .. max_repetitions {
			res = km.find(kw)
		}
		bmark.measure('$max_repetitions repetitions of km.find("$kw") = $res')
		println('--------------------------------')
	}
}

View File

@ -0,0 +1,92 @@
module token
// bump token.max_keyword_len, if you add a longer keyword
const max_keyword_len = 11
// KeywordsMatcher provides a faster way of determining whether a given name
// is a reserved keyword, by doing a comparison with only the keywords that
// have exactly the same length as `name`.
// Benchmarking shows that with -prod, it is 20-25% slower in the worst case
// compared to just using token.keywords[name], but can be 20x faster
// in the case, where there is a length mismatch, and 2x-3x faster in most
// cases, where there is a match.
// Without -prod, with tcc, using KeywordsMatcher is always faster
// (2x to 14x times), compared to using a hash of all the keywords.
pub struct KeywordsMatcher {
mut:
	len_min int = 9999 // length of the shortest registered keyword; 9999 is a sentinel until the first add_word call
	len_max int = -1 // length of the longest registered keyword; -1 is a sentinel until the first add_word call
	words [max_keyword_len][]WKind // words[n] holds every keyword of length n, sorted by word in new_keywords_matcher
}
// WKind pairs a keyword's spelling with its token Kind, for storage in a matcher length bin.
struct WKind {
mut:
	word string
	kind Kind
}
// new_keywords_matcher builds a KeywordsMatcher from `kw_map`, binning every
// keyword by its length and sorting each bin, so that find() can binary search it.
pub fn new_keywords_matcher(kw_map map[string]Kind) KeywordsMatcher {
	mut matcher := KeywordsMatcher{}
	// TODO: remove this loop. It is currently needed, because a
	// fixed array of arrays is not initialised properly automatically
	// as of 2021/10/28
	for idx in 0 .. token.max_keyword_len {
		matcher.words[idx] = []WKind{}
	}
	for word, kind in kw_map {
		matcher.add_word(word, kind)
	}
	for idx in 0 .. token.max_keyword_len {
		if matcher.words[idx].len == 0 {
			continue
		}
		matcher.words[idx].sort(a.word < b.word)
		$if trace_keyword_matcher_initialisation ? {
			print('word len: ${idx:3} | words: ')
			for w in matcher.words[idx] {
				print('$w.word, ')
			}
			println('')
		}
	}
	return matcher
}
// add_word registers `word` with its token `kind` in the length bin
// words[word.len], keeping len_min/len_max up to date. It panics if the
// word is too long for the fixed bin array — bump max_keyword_len then.
fn (mut km KeywordsMatcher) add_word(word string, kind Kind) {
	wlen := word.len
	if wlen >= token.max_keyword_len {
		panic('increase max_keyword_len to > $word.len')
	}
	km.len_max = if wlen > km.len_max { wlen } else { km.len_max }
	km.len_min = if wlen < km.len_min { wlen } else { km.len_min }
	km.words[wlen] << WKind{word, kind}
}
// find returns the Kind for `word`, or Kind.unknown when it is not a keyword.
// Only the bin of keywords with exactly word.len bytes is binary searched,
// which is what makes the length-mismatch fast path so cheap.
[direct_array_access]
pub fn (km &KeywordsMatcher) find(word string) Kind {
	wlen := word.len
	// fast reject: no keyword is that short or that long
	if wlen < km.len_min || wlen > km.len_max {
		return Kind.unknown
	}
	bin_len := km.words[wlen].len
	if bin_len == 0 {
		return Kind.unknown
	}
	// classic binary search over the sorted bin for this length
	mut low := 0
	mut high := bin_len - 1
	for low <= high {
		middle := low + (high - low) / 2
		order := km.words[wlen][middle].word.compare(word)
		if order == 0 {
			return km.words[wlen][middle].kind
		}
		if order < 0 {
			low = middle + 1
		} else {
			high = middle - 1
		}
	}
	return Kind.unknown
}

View File

@ -132,15 +132,11 @@ pub enum Kind {
_end_
}
pub const (
assign_tokens = [Kind.assign, .plus_assign, .minus_assign, .mult_assign, .div_assign, .xor_assign,
.mod_assign, .or_assign, .and_assign, .right_shift_assign, .left_shift_assign,
pub const assign_tokens = [Kind.assign, .plus_assign, .minus_assign, .mult_assign, .div_assign,
.xor_assign, .mod_assign, .or_assign, .and_assign, .right_shift_assign, .left_shift_assign,
.unsigned_right_shift_assign]
)
const (
nr_tokens = int(Kind._end_)
)
const nr_tokens = int(Kind._end_)
// @FN => will be substituted with the name of the current V function
// @METHOD => will be substituted with ReceiverType.MethodName
@ -182,10 +178,8 @@ pub enum AtKind {
vexeroot_path
}
pub const (
valid_at_tokens = ['@VROOT', '@VMODROOT', '@VEXEROOT', '@FN', '@METHOD', '@MOD', '@STRUCT',
pub const valid_at_tokens = ['@VROOT', '@VMODROOT', '@VEXEROOT', '@FN', '@METHOD', '@MOD', '@STRUCT',
'@VEXE', '@FILE', '@LINE', '@COLUMN', '@VHASH', '@VMOD_FILE']
)
// build_keys generates a map with keywords' string values:
// Keywords['return'] == .key_return
@ -315,13 +309,11 @@ fn build_token_str() []string {
return s
}
const (
token_str = build_token_str()
)
const token_str = build_token_str()
pub const (
keywords = build_keys()
)
pub const keywords = build_keys()
pub const matcher = new_keywords_matcher(keywords)
[inline]
pub fn is_key(key string) bool {
@ -365,10 +357,8 @@ pub fn (t Token) str() string {
// Representation of highest and lowest precedence
/*
pub const (
lowest_prec = 0
highest_prec = 8
)
pub const lowest_prec = 0
pub const highest_prec = 8
*/
pub enum Precedence {
lowest
@ -439,9 +429,7 @@ pub fn build_precedences() []Precedence {
return p
}
const (
precedences = build_precedences()
)
const precedences = build_precedences()
// precedence returns a tokens precedence if defined, otherwise lowest_prec
[inline]