scanner: speed up text_scan by using a specialised keywords matcher, instead of a generic V map of keywords
parent b1bb1d361a
commit 8014235e0e
@@ -499,6 +499,28 @@ fn (s string) == (a string) bool {
 	}
 }
 
+// compare returns -1 if `s` < `a`, 0 if `s` == `a`, and 1 if `s` > `a`
+[direct_array_access]
+pub fn (s string) compare(a string) int {
+	min_len := if s.len < a.len { s.len } else { a.len }
+	for i in 0 .. min_len {
+		if s[i] < a[i] {
+			return -1
+		}
+		if s[i] > a[i] {
+			return 1
+		}
+	}
+	if s.len < a.len {
+		return -1
+	}
+	if s.len > a.len {
+		return 1
+	}
+	return 0
+}
+
+[direct_array_access]
 fn (s string) < (a string) bool {
 	for i in 0 .. s.len {
 		if i >= a.len || s[i] > a[i] {

@@ -513,6 +535,7 @@ fn (s string) < (a string) bool {
 	return false
 }
 
+[direct_array_access]
 fn (s string) + (a string) string {
 	new_len := a.len + s.len
 	mut res := string{
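The new `compare` method gives strings a strcmp-style three-way ordering, and it is what the keyword matcher's binary search (added further down in this commit) relies on. A minimal sketch of its contract, not part of the commit itself:

fn main() {
	// byte-wise three-way comparison, like C's strcmp
	assert 'abc'.compare('abd') == -1 // first differing byte decides
	assert 'abc'.compare('abc') == 0
	assert 'abcd'.compare('abc') == 1 // equal prefix: the longer string is greater
	println('compare contract holds')
}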
@@ -675,7 +675,7 @@ fn (mut s Scanner) text_scan() token.Token {
 			// tmp hack to detect . in ${}
 			// Check if not .eof to prevent panic
 			next_char := s.look_ahead(1)
-			kind := token.keywords[name]
+			kind := token.matcher.find(name)
 			if kind != .unknown {
 				return s.new_token(kind, name, name.len)
 			}
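The scanner swap is behaviour-preserving: indexing a V map with a missing key yields the zero value of the value type, which for `token.Kind` is `.unknown` (the scanner's own `kind != .unknown` check relies on that), and `find` returns exactly `.unknown` on a miss. A small equivalence sketch, assuming the `v.token` module as changed in this commit:

import v.token

fn main() {
	// on a hit, both lookups agree on the keyword's Kind
	assert token.keywords['return'] == token.matcher.find('return')
	// on a miss, both yield the zero value, Kind.unknown
	assert token.keywords['no_such_kw'] == token.matcher.find('no_such_kw')
	assert token.matcher.find('no_such_kw') == token.Kind.unknown
}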
@@ -0,0 +1,22 @@
+import v.token
+import benchmark
+
+const max_repetitions = 4_000_000
+
+fn main() {
+	km := token.new_keywords_matcher(token.keywords)
+	for kw in ['for', 'val', 'int', 'f32', 'struct', 'return', 'if', 'in', 'as', 'or', 'else',
+		'unsafe', 'return', 'assert', 'Abc', 'my_identifier', 'a'] {
+		mut res := token.Kind{}
+		mut bmark := benchmark.start()
+		for _ in 0 .. max_repetitions {
+			res = token.keywords[kw]
+		}
+		bmark.measure('$max_repetitions repetitions of token.keywords["$kw"] = $res')
+		for _ in 0 .. max_repetitions {
+			res = km.find(kw)
+		}
+		bmark.measure('$max_repetitions repetitions of km.find("$kw") = $res')
+		println('--------------------------------')
+	}
+}
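The doc comment in the next file cites different speed ratios for -prod and tcc builds, so this benchmark is meant to be run both ways; for example (the file name here is hypothetical, since the page does not show the committed path):

v run keywords_matcher_bench.v
v -prod run keywords_matcher_bench.v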
@@ -0,0 +1,92 @@
+module token
+
+// bump token.max_keyword_len, if you add a longer keyword
+const max_keyword_len = 11
+
+// KeywordsMatcher provides a faster way of determining whether a given name
+// is a reserved keyword, by doing a comparison with only the keywords that
+// have exactly the same length as `name`.
+// Benchmarking shows that with -prod, it is 20-25% slower in the worst case
+// compared to just using token.keywords[name], but can be 20x faster
+// in the case where there is a length mismatch, and 2x-3x faster in most
+// cases where there is a match.
+// Without -prod, with tcc, using KeywordsMatcher is always faster
+// (2x to 14x), compared to using a hash of all the keywords.
+pub struct KeywordsMatcher {
+mut:
+	len_min int = 9999
+	len_max int = -1
+	words   [max_keyword_len][]WKind
+}
+
+struct WKind {
+mut:
+	word string
+	kind Kind
+}
+
+pub fn new_keywords_matcher(kw_map map[string]Kind) KeywordsMatcher {
+	mut km := KeywordsMatcher{}
+	// TODO: remove this loop. It is currently needed, because a
+	// fixed array of arrays is not initialised properly automatically
+	// as of 2021/10/28
+	for i in 0 .. token.max_keyword_len {
+		km.words[i] = []WKind{}
+	}
+	for k, v in kw_map {
+		km.add_word(k, v)
+	}
+	for i in 0 .. token.max_keyword_len {
+		if km.words[i].len > 0 {
+			km.words[i].sort(a.word < b.word)
+			$if trace_keyword_matcher_initialisation ? {
+				print('word len: ${i:3} | words: ')
+				for w in km.words[i] {
+					print('$w.word, ')
+				}
+				println('')
+			}
+		}
+	}
+	return km
+}
+
+fn (mut km KeywordsMatcher) add_word(word string, kind Kind) {
+	if word.len >= token.max_keyword_len {
+		panic('increase max_keyword_len to > $word.len')
+	}
+	if km.len_max < word.len {
+		km.len_max = word.len
+	}
+	if word.len < km.len_min {
+		km.len_min = word.len
+	}
+	km.words[word.len] << WKind{word, kind}
+}
+
+// find returns the Kind given a word, by doing a binary search
+// on the sorted list of words for each bin
+[direct_array_access]
+pub fn (km &KeywordsMatcher) find(word string) Kind {
+	wlen := word.len
+	if wlen < km.len_min || wlen > km.len_max {
+		return Kind.unknown
+	}
+	list_len := km.words[wlen].len
+	if list_len == 0 {
+		return Kind.unknown
+	}
+	mut lo := 0
+	mut hi := list_len - 1
+	for lo <= hi {
+		mid := lo + (hi - lo) / 2
+		cmp := km.words[wlen][mid].word.compare(word)
+		match cmp {
+			0 { return km.words[wlen][mid].kind }
+			-1 { lo = mid + 1 }
+			1 { hi = mid - 1 }
+			else {}
+		}
+	}
+	return Kind.unknown
+}
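A usage sketch for the matcher, not part of the commit: a lookup is first screened by length against `len_min`/`len_max`, then resolved by binary search inside the bucket of words with the same length. It assumes the `v.token` module as added above, and the usual `key_struct` member of `Kind`:

import v.token

fn main() {
	km := token.new_keywords_matcher(token.keywords)
	// hit: 'struct' lands in the 6-byte bucket, then binary search finds it
	assert km.find('struct') == token.Kind.key_struct
	// miss by length: longer than any keyword, rejected before any comparison
	assert km.find('certainly_not_a_keyword') == token.Kind.unknown
	// miss inside a bucket: right length, but no matching word
	assert km.find('zzzzzz') == token.Kind.unknown
}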
@@ -132,15 +132,11 @@ pub enum Kind {
 	_end_
 }
 
-pub const (
-	assign_tokens = [Kind.assign, .plus_assign, .minus_assign, .mult_assign, .div_assign, .xor_assign,
-		.mod_assign, .or_assign, .and_assign, .right_shift_assign, .left_shift_assign,
-		.unsigned_right_shift_assign]
-)
+pub const assign_tokens = [Kind.assign, .plus_assign, .minus_assign, .mult_assign, .div_assign,
+	.xor_assign, .mod_assign, .or_assign, .and_assign, .right_shift_assign, .left_shift_assign,
+	.unsigned_right_shift_assign]
 
-const (
-	nr_tokens = int(Kind._end_)
-)
+const nr_tokens = int(Kind._end_)
 
 // @FN => will be substituted with the name of the current V function
 // @METHOD => will be substituted with ReceiverType.MethodName
@@ -182,10 +178,8 @@ pub enum AtKind {
 	vexeroot_path
 }
 
-pub const (
-	valid_at_tokens = ['@VROOT', '@VMODROOT', '@VEXEROOT', '@FN', '@METHOD', '@MOD', '@STRUCT',
-		'@VEXE', '@FILE', '@LINE', '@COLUMN', '@VHASH', '@VMOD_FILE']
-)
+pub const valid_at_tokens = ['@VROOT', '@VMODROOT', '@VEXEROOT', '@FN', '@METHOD', '@MOD', '@STRUCT',
+	'@VEXE', '@FILE', '@LINE', '@COLUMN', '@VHASH', '@VMOD_FILE']
 
 // build_keys generates a map with keywords' string values:
 // Keywords['return'] == .key_return
@@ -315,13 +309,11 @@ fn build_token_str() []string {
 	return s
 }
 
-const (
-	token_str = build_token_str()
-)
+const token_str = build_token_str()
 
-pub const (
-	keywords = build_keys()
-)
+pub const keywords = build_keys()
+
+pub const matcher = new_keywords_matcher(keywords)
 
 [inline]
 pub fn is_key(key string) bool {
@@ -365,10 +357,8 @@ pub fn (t Token) str() string {
 
 // Representation of highest and lowest precedence
 /*
-pub const (
-	lowest_prec = 0
-	highest_prec = 8
-)
+pub const lowest_prec = 0
+pub const highest_prec = 8
 */
 pub enum Precedence {
 	lowest
@@ -439,9 +429,7 @@ pub fn build_precedences() []Precedence {
 	return p
 }
 
-const (
-	precedences = build_precedences()
-)
+const precedences = build_precedences()
 
 // precedence returns a token's precedence if defined, otherwise lowest_prec
 [inline]