scanner: speed up text_scan by using a specialised keywords matcher, instead of a generic V map of keywords

2021-10-28 15:09:41 +03:00 · 2021-10-28 15:09:41 +03:00 · 8014235e0e
parent b1bb1d361a
commit 8014235e0e
5 changed files with 151 additions and 26 deletions
--- a/vlib/builtin/string.v
+++ b/vlib/builtin/string.v
@ -499,6 +499,28 @@ fn (s string) == (a string) bool {
 	}
 }

+// compare returns -1 if `s` < `a`, 0 if `s` == `a`, and 1 if `s` > `a`. The first differing position decides; a proper prefix sorts before the longer string.
+[direct_array_access]
+pub fn (s string) compare(a string) int {
+	min_len := if s.len < a.len { s.len } else { a.len } // only the shared prefix can be compared position by position
+	for i in 0 .. min_len {
+		if s[i] < a[i] { // first difference found: `s` is the smaller one
+			return -1
+		}
+		if s[i] > a[i] { // first difference found: `s` is the bigger one
+			return 1
+		}
+	}
+	if s.len < a.len { // shared prefix is equal, so the shorter string is the smaller one
+		return -1
+	}
+	if s.len > a.len {
+		return 1
+	}
+	return 0
+}
+
+[direct_array_access]
 fn (s string) < (a string) bool {
 	for i in 0 .. s.len {
 		if i >= a.len || s[i] > a[i] {
@ -513,6 +535,7 @@ fn (s string) < (a string) bool {
 	return false
 }

+[direct_array_access]
 fn (s string) + (a string) string {
 	new_len := a.len + s.len
 	mut res := string{
--- a/vlib/v/scanner/scanner.v
+++ b/vlib/v/scanner/scanner.v
@ -675,7 +675,7 @@ fn (mut s Scanner) text_scan() token.Token {
 			// tmp hack to detect . in ${}
 			// Check if not .eof to prevent panic
 			next_char := s.look_ahead(1)
-			kind := token.keywords[name]
+			kind := token.matcher.find(name)
 			if kind != .unknown {
 				return s.new_token(kind, name, name.len)
 			}
--- a/vlib/v/tests/bench/bench_compare_tokens.v
+++ b/vlib/v/tests/bench/bench_compare_tokens.v
@ -0,0 +1,22 @@
+import v.token
+import benchmark
+
+const max_repetitions = 4_000_000
+
+fn main() { // times token.keywords[kw] against KeywordsMatcher.find(kw) for a mix of words
+	km := token.new_keywords_matcher(token.keywords) // matcher built from the same keyword map that is indexed directly below
+	for kw in ['for', 'val', 'int', 'f32', 'struct', 'return', 'if', 'in', 'as', 'or', 'else',
+		'unsafe', 'return', 'assert', 'Abc', 'my_identifier', 'a'] {
+		mut res := token.Kind{}
+		mut bmark := benchmark.start()
+		for _ in 0 .. max_repetitions { // first measured loop: plain map lookup
+			res = token.keywords[kw]
+		}
+		bmark.measure('$max_repetitions repetitions of token.keywords["$kw"] = $res')
+		for _ in 0 .. max_repetitions { // second measured loop: the same word through the matcher
+			res = km.find(kw)
+		}
+		bmark.measure('$max_repetitions repetitions of km.find("$kw") = $res')
+		println('--------------------------------')
+	}
+}
--- a/vlib/v/token/keywords_matcher.v
+++ b/vlib/v/token/keywords_matcher.v
@ -0,0 +1,92 @@
+module token
+
+// bump token.max_keyword_len, if you add a longer keyword (every keyword must be strictly shorter than max_keyword_len)
+const max_keyword_len = 11
+
+// KeywordsMatcher provides a faster way of determining whether a given name
+// is a reserved keyword, by doing a comparison with only the keywords that
+// have exactly the same length as `name`.
+// Benchmarking shows that with -prod, it is 20-25% slower in the worst case
+// compared to just using token.keywords[name], but can be 20x faster
+// in the case where there is a length mismatch, and 2x-3x faster in most
+// cases where there is a match.
+// Without -prod, with tcc, using KeywordsMatcher is always faster
+// (2x to 14x), compared to using a hash of all the keywords.
+pub struct KeywordsMatcher {
+mut:
+	len_min int = 9999 // length of the shortest added word; starts high so that the first add_word lowers it
+	len_max int = -1 // length of the longest added word; starts low so that the first add_word raises it
+	words   [max_keyword_len][]WKind // words[n] is the bin of all added words that have length n
+}
+
+struct WKind { // one entry of a KeywordsMatcher bin
+mut:
+	word string // the keyword text; bins are sorted and searched by this field
+	kind Kind // the value that KeywordsMatcher.find returns for `word`
+}
+
+pub fn new_keywords_matcher(kw_map map[string]Kind) KeywordsMatcher { // builds a matcher that holds every key/value pair of kw_map
+	mut km := KeywordsMatcher{}
+	// TODO: remove this loop. It is currently needed, because a
+	// fixed array of arrays is not initialised properly automatically
+	// as of 2021/10/28
+	for i in 0 .. token.max_keyword_len {
+		km.words[i] = []WKind{}
+	}
+	for k, v in kw_map { // put each keyword into the bin for its length
+		km.add_word(k, v)
+	}
+	for i in 0 .. token.max_keyword_len {
+		if km.words[i].len > 0 {
+			km.words[i].sort(a.word < b.word) // find binary-searches a bin, so every non-empty bin must be sorted by word
+			$if trace_keyword_matcher_initialisation ? { // optional dump of each bin, compiled in only when this flag is defined
+				print('word len: ${i:3} | words: ')
+				for w in km.words[i] {
+					print('$w.word, ')
+				}
+				println('')
+			}
+		}
+	}
+	return km
+}
+
+fn (mut km KeywordsMatcher) add_word(word string, kind Kind) { // appends `word` to the bin for its length, and widens len_min/len_max to include it
+	if word.len >= token.max_keyword_len { // km.words has max_keyword_len bins, so the largest usable index is max_keyword_len - 1
+		panic('increase max_keyword_len to > $word.len')
+	}
+	if km.len_max < word.len {
+		km.len_max = word.len
+	}
+	if word.len < km.len_min {
+		km.len_min = word.len
+	}
+	km.words[word.len] << WKind{word, kind} // the bin is not kept sorted here; new_keywords_matcher sorts it after all words are added
+}
+
+// find returns the Kind stored for `word`, or Kind.unknown when `word` was not added.
+// It does a binary search on the sorted bin of words that have the same length as `word`.
+[direct_array_access]
+pub fn (km &KeywordsMatcher) find(word string) Kind {
+	wlen := word.len
+	if wlen < km.len_min || wlen > km.len_max { // no added word has this length; this also keeps wlen within the bounds of km.words
+		return Kind.unknown
+	}
+	list_len := km.words[wlen].len
+	if list_len == 0 { // lengths between len_min and len_max can still have an empty bin
+		return Kind.unknown
+	}
+	mut lo := 0
+	mut hi := list_len - 1
+	for lo <= hi { // lo and hi are inclusive bounds of the part of the bin that is still searched
+		mid := lo + (hi - lo) / 2
+		cmp := km.words[wlen][mid].word.compare(word)
+		match cmp {
+			0 { return km.words[wlen][mid].kind }
+			-1 { lo = mid + 1 } // the stored word sorts before `word`: continue in the upper part
+			1 { hi = mid - 1 } // the stored word sorts after `word`: continue in the lower part
+			else {} // string.compare only returns -1, 0 or 1; this branch just makes the match exhaustive
+		}
+	}
+	return Kind.unknown
+}
--- a/vlib/v/token/token.v
+++ b/vlib/v/token/token.v
@ -132,15 +132,11 @@ pub enum Kind {
 	_end_
 }

-pub const (
-	assign_tokens = [Kind.assign, .plus_assign, .minus_assign, .mult_assign, .div_assign, .xor_assign,
-		.mod_assign, .or_assign, .and_assign, .right_shift_assign, .left_shift_assign,
+pub const assign_tokens = [Kind.assign, .plus_assign, .minus_assign, .mult_assign, .div_assign,
+	.xor_assign, .mod_assign, .or_assign, .and_assign, .right_shift_assign, .left_shift_assign,
 	.unsigned_right_shift_assign]
-)

-const (
-	nr_tokens = int(Kind._end_)
-)
+const nr_tokens = int(Kind._end_)

 // @FN => will be substituted with the name of the current V function
 // @METHOD => will be substituted with ReceiverType.MethodName
@ -182,10 +178,8 @@ pub enum AtKind {
 	vexeroot_path
 }

-pub const (
-	valid_at_tokens = ['@VROOT', '@VMODROOT', '@VEXEROOT', '@FN', '@METHOD', '@MOD', '@STRUCT',
+pub const valid_at_tokens = ['@VROOT', '@VMODROOT', '@VEXEROOT', '@FN', '@METHOD', '@MOD', '@STRUCT',
 	'@VEXE', '@FILE', '@LINE', '@COLUMN', '@VHASH', '@VMOD_FILE']
-)

 // build_keys genereates a map with keywords' string values:
 // Keywords['return'] == .key_return
@ -315,13 +309,11 @@ fn build_token_str() []string {
 	return s
 }

-const (
-	token_str = build_token_str()
-)
+const token_str = build_token_str()

-pub const (
-	keywords = build_keys()
-)
+pub const keywords = build_keys()
+
+pub const matcher = new_keywords_matcher(keywords)

 [inline]
 pub fn is_key(key string) bool {
@ -365,10 +357,8 @@ pub fn (t Token) str() string {

 // Representation of highest and lowest precedence
 /*
-pub const (
-	lowest_prec = 0
-	highest_prec = 8
-)
+pub const lowest_prec = 0
+pub const highest_prec = 8
 */
 pub enum Precedence {
 	lowest
@ -439,9 +429,7 @@ pub fn build_precedences() []Precedence {
 	return p
 }

-const (
-	precedences = build_precedences()
-)
+const precedences = build_precedences()

 // precedence returns a tokens precedence if defined, otherwise lowest_prec
 [inline]