scanner: speed up text_scan by using a specialised keywords matcher, instead of a generic V map of keywords
parent b1bb1d361a
commit 8014235e0e
@@ -499,6 +499,28 @@ fn (s string) == (a string) bool {
 	}
 }
 
+// compare returns -1 if `s` < `a`, 0 if `s` == `a`, and 1 if `s` > `a`
+[direct_array_access]
+pub fn (s string) compare(a string) int {
+	min_len := if s.len < a.len { s.len } else { a.len }
+	for i in 0 .. min_len {
+		if s[i] < a[i] {
+			return -1
+		}
+		if s[i] > a[i] {
+			return 1
+		}
+	}
+	if s.len < a.len {
+		return -1
+	}
+	if s.len > a.len {
+		return 1
+	}
+	return 0
+}
+
+[direct_array_access]
 fn (s string) < (a string) bool {
 	for i in 0 .. s.len {
 		if i >= a.len || s[i] > a[i] {

@@ -513,6 +535,7 @@ fn (s string) < (a string) bool {
 	return false
 }
 
+[direct_array_access]
 fn (s string) + (a string) string {
 	new_len := a.len + s.len
 	mut res := string{
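The new `compare` method gives strings a strcmp-style three-way ordering, and it is what the keyword matcher's binary search (added further down in this commit) relies on. A minimal sketch of its contract, not part of the commit itself:

fn main() {
	// byte-wise three-way comparison, like C's strcmp
	assert 'abc'.compare('abd') == -1 // first differing byte decides
	assert 'abc'.compare('abc') == 0
	assert 'abcd'.compare('abc') == 1 // equal prefix: the longer string is greater
	println('compare contract holds')
}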
@@ -675,7 +675,7 @@ fn (mut s Scanner) text_scan() token.Token {
 			// tmp hack to detect . in ${}
 			// Check if not .eof to prevent panic
 			next_char := s.look_ahead(1)
-			kind := token.keywords[name]
+			kind := token.matcher.find(name)
 			if kind != .unknown {
 				return s.new_token(kind, name, name.len)
 			}
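The scanner swap is behaviour-preserving: indexing a V map with a missing key yields the zero value of the value type, which for `token.Kind` is `.unknown` (the scanner's own `kind != .unknown` check relies on that), and `find` returns exactly `.unknown` on a miss. A small equivalence sketch, assuming the `v.token` module as changed in this commit:

import v.token

fn main() {
	// on a hit, both lookups agree on the keyword's Kind
	assert token.keywords['return'] == token.matcher.find('return')
	// on a miss, both yield the zero value, Kind.unknown
	assert token.keywords['no_such_kw'] == token.matcher.find('no_such_kw')
	assert token.matcher.find('no_such_kw') == token.Kind.unknown
}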
@@ -0,0 +1,22 @@
+import v.token
+import benchmark
+
+const max_repetitions = 4_000_000
+
+fn main() {
+	km := token.new_keywords_matcher(token.keywords)
+	for kw in ['for', 'val', 'int', 'f32', 'struct', 'return', 'if', 'in', 'as', 'or', 'else',
+		'unsafe', 'return', 'assert', 'Abc', 'my_identifier', 'a'] {
+		mut res := token.Kind{}
+		mut bmark := benchmark.start()
+		for _ in 0 .. max_repetitions {
+			res = token.keywords[kw]
+		}
+		bmark.measure('$max_repetitions repetitions of token.keywords["$kw"] = $res')
+		for _ in 0 .. max_repetitions {
+			res = km.find(kw)
+		}
+		bmark.measure('$max_repetitions repetitions of km.find("$kw") = $res')
+		println('--------------------------------')
+	}
+}
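The doc comment in the next file cites different speed ratios for -prod and tcc builds, so this benchmark is meant to be run both ways; for example (the file name here is hypothetical, since the page does not show the committed path):

v run keywords_matcher_bench.v
v -prod run keywords_matcher_bench.v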
@@ -0,0 +1,92 @@
+module token
+
+// bump token.max_keyword_len, if you add a longer keyword
+const max_keyword_len = 11
+
+// KeywordsMatcher provides a faster way of determining whether a given name
+// is a reserved keyword, by doing a comparison with only the keywords that
+// have exactly the same length as `name`.
+// Benchmarking shows that with -prod, it is 20-25% slower in the worst case
+// compared to just using token.keywords[name], but can be 20x faster
+// in the case where there is a length mismatch, and 2x-3x faster in most
+// cases where there is a match.
+// Without -prod, with tcc, using KeywordsMatcher is always faster
+// (2x to 14x), compared to using a hash of all the keywords.
+pub struct KeywordsMatcher {
+mut:
+	len_min int = 9999
+	len_max int = -1
+	words   [max_keyword_len][]WKind
+}
+
+struct WKind {
+mut:
+	word string
+	kind Kind
+}
+
+pub fn new_keywords_matcher(kw_map map[string]Kind) KeywordsMatcher {
+	mut km := KeywordsMatcher{}
+	// TODO: remove this loop. It is currently needed, because a
+	// fixed array of arrays is not initialised properly automatically
+	// as of 2021/10/28
+	for i in 0 .. token.max_keyword_len {
+		km.words[i] = []WKind{}
+	}
+	for k, v in kw_map {
+		km.add_word(k, v)
+	}
+	for i in 0 .. token.max_keyword_len {
+		if km.words[i].len > 0 {
+			km.words[i].sort(a.word < b.word)
+			$if trace_keyword_matcher_initialisation ? {
+				print('word len: ${i:3} | words: ')
+				for w in km.words[i] {
+					print('$w.word, ')
+				}
+				println('')
+			}
+		}
+	}
+	return km
+}
+
+fn (mut km KeywordsMatcher) add_word(word string, kind Kind) {
+	if word.len >= token.max_keyword_len {
+		panic('increase max_keyword_len to > $word.len')
+	}
+	if km.len_max < word.len {
+		km.len_max = word.len
+	}
+	if word.len < km.len_min {
+		km.len_min = word.len
+	}
+	km.words[word.len] << WKind{word, kind}
+}
+
+// find returns the Kind given a word, by doing a binary search
+// on the sorted list of words for each bin
+[direct_array_access]
+pub fn (km &KeywordsMatcher) find(word string) Kind {
+	wlen := word.len
+	if wlen < km.len_min || wlen > km.len_max {
+		return Kind.unknown
+	}
+	list_len := km.words[wlen].len
+	if list_len == 0 {
+		return Kind.unknown
+	}
+	mut lo := 0
+	mut hi := list_len - 1
+	for lo <= hi {
+		mid := lo + (hi - lo) / 2
+		cmp := km.words[wlen][mid].word.compare(word)
+		match cmp {
+			0 { return km.words[wlen][mid].kind }
+			-1 { lo = mid + 1 }
+			1 { hi = mid - 1 }
+			else {}
+		}
+	}
+	return Kind.unknown
+}
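A usage sketch for the matcher, not part of the commit: a lookup is first screened by length against `len_min`/`len_max`, then resolved by binary search inside the bucket of words with the same length. It assumes the `v.token` module as added above, and the usual `key_struct` member of `Kind`:

import v.token

fn main() {
	km := token.new_keywords_matcher(token.keywords)
	// hit: 'struct' lands in the 6-byte bucket, then binary search finds it
	assert km.find('struct') == token.Kind.key_struct
	// miss by length: longer than any keyword, rejected before any comparison
	assert km.find('certainly_not_a_keyword') == token.Kind.unknown
	// miss inside a bucket: right length, but no matching word
	assert km.find('zzzzzz') == token.Kind.unknown
}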
@@ -132,15 +132,11 @@ pub enum Kind {
 	_end_
 }
 
-pub const (
-	assign_tokens = [Kind.assign, .plus_assign, .minus_assign, .mult_assign, .div_assign, .xor_assign,
-		.mod_assign, .or_assign, .and_assign, .right_shift_assign, .left_shift_assign,
-		.unsigned_right_shift_assign]
-)
+pub const assign_tokens = [Kind.assign, .plus_assign, .minus_assign, .mult_assign, .div_assign,
+	.xor_assign, .mod_assign, .or_assign, .and_assign, .right_shift_assign, .left_shift_assign,
+	.unsigned_right_shift_assign]
 
-const (
-	nr_tokens = int(Kind._end_)
-)
+const nr_tokens = int(Kind._end_)
 
 // @FN => will be substituted with the name of the current V function
 // @METHOD => will be substituted with ReceiverType.MethodName
@@ -182,10 +178,8 @@ pub enum AtKind {
 	vexeroot_path
 }
 
-pub const (
-	valid_at_tokens = ['@VROOT', '@VMODROOT', '@VEXEROOT', '@FN', '@METHOD', '@MOD', '@STRUCT',
-		'@VEXE', '@FILE', '@LINE', '@COLUMN', '@VHASH', '@VMOD_FILE']
-)
+pub const valid_at_tokens = ['@VROOT', '@VMODROOT', '@VEXEROOT', '@FN', '@METHOD', '@MOD', '@STRUCT',
+	'@VEXE', '@FILE', '@LINE', '@COLUMN', '@VHASH', '@VMOD_FILE']
 
 // build_keys generates a map with keywords' string values:
 // Keywords['return'] == .key_return
@@ -315,13 +309,11 @@ fn build_token_str() []string {
 	return s
 }
 
-const (
-	token_str = build_token_str()
-)
+const token_str = build_token_str()
 
-pub const (
-	keywords = build_keys()
-)
+pub const keywords = build_keys()
+
+pub const matcher = new_keywords_matcher(keywords)
 
 [inline]
 pub fn is_key(key string) bool {
@@ -365,10 +357,8 @@ pub fn (t Token) str() string {
 
 // Representation of highest and lowest precedence
 /*
-pub const (
-	lowest_prec = 0
-	highest_prec = 8
-)
+pub const lowest_prec = 0
+pub const highest_prec = 8
 */
 pub enum Precedence {
 	lowest
@@ -439,9 +429,7 @@ pub fn build_precedences() []Precedence {
 	return p
 }
 
-const (
-	precedences = build_precedences()
-)
+const precedences = build_precedences()
 
 // precedence returns a token's precedence if defined, otherwise lowest_prec
 [inline]