map: small cleanup

2020-03-21 13:55:07 +01:00 · 2020-03-21 13:55:07 +01:00 · bf97afb9ed
parent f962d92623
commit bf97afb9ed
1 changed files with 81 additions and 96 deletions
--- a/vlib/builtin/map.v
+++ b/vlib/builtin/map.v
@ -1,14 +1,12 @@
 // Copyright (c) 2019-2020 Alexander Medvednikov. All rights reserved.
 // Use of this source code is governed by an MIT license
 // that can be found in the LICENSE file.
-
 module builtin

 import (
 	strings
 	hash.wyhash
 )
-
 /*
 This is a very fast hashmap implementation. It has several properties that in
 combination makes it very fast. Here is a short explanation of each property.
@ -17,7 +15,7 @@ After reading this you should have a basic understanding of how it works:
 1. |Hash-function (Wyhash)|. Wyhash is the fastest hash-function passing SMHash-
 er, so it was an easy choice.

-2. |Open addressing (Robin Hood Hashing)|. With this method a hash collision is
+2. |Open addressing (Robin Hood Hashing)|. With this method, a hash collision is
 resolved by probing. As opposed to linear probing, Robin Hood hashing has a sim-
 ple but clever twist: As new keys are inserted, old keys are shifted around in a
 way such that all keys stay reasonably close to the slot they originally hash to.
@ -26,50 +24,48 @@ way such that all keys stay reasonably close to the slot they originally hash to
 ge of roughly 6.25% unused memory, as opposed to most other dynamic array imple-
 mentations with a growth factor of 1.5 or 2. The key-values keep their index in
 the array - they are not probed. Instead, this implementation uses another array
-"metas" storing "metas" (meta-data). Each Key-value has a corresponding meta. A
+"metas" storing "meta"s (meta-data). Each Key-value has a corresponding meta. A
 meta stores a reference to its key-value, and its index in "metas" is determined
 by the hash of the key and probing. A meta also stores bits from the hash (for
 faster rehashing etc.) and how far away it is from the index it was originally
 hashed to (probe_count). probe_count is 0 if empty, 1 if not probed, 2 if probed
-by 1.
+by 1, etc..

 meta (64 bit) =  kv_index (32 bit) | probe_count (8 bits) | hashbits (24 bits)
 metas = [meta, 0, meta, 0, meta, meta, meta, 0, ...]
 key_values = [kv, kv, kv, kv, kv, ...]

 4. |Power of two size array|. The size of metas is a power of two. This makes it
-possible to find a bucket from a hash code you can use hash & (SIZE -1) instead
-of abs(hash) % SIZE. Modulo is extremely expensive so using '&' is a big perfor-
-mance improvement. The general concern with this is that you only use the lower
-bits of the hash and can cause many collisions. This is solved by using very go-
-od hash-function.
+possible to find a bucket from a hash code by using "hash & (SIZE -1)" instead
+of "abs(hash) % SIZE". Modulo is extremely expensive so using '&' is a big perf-
+ormance improvement. The general concern with this is that you only use the low-
+er bits of the hash and that can cause more collisions. This is solved by using
+good hash-function.

 5. |Extra metas|. The hashmap keeps track of the highest probe_count. The trick
-is to allocate extra metas > max(probe_count), so you never have to do any boun-
+is to allocate extra_metas > max(probe_count), so you never have to do any boun-
 ds-checking because the extra metas ensures that an element will never go beyond
-index the last index.
+the last index.

 6. |Cached rehashing|. When the load_factor of the map exceeds the max_load_fac-
 tor the size of metas is doubled and all the elements need to be "rehashed" to
-find the index in the new array. Instead of rehashing complete, it simply uses
+find the index in the new array. Instead of rehashing completely, it simply uses
 the hashbits stored in the meta.
 */

+
 const (
-	// Number of bits from the hash stored for each entry
+// Number of bits from the hash stored for each entry
 	hashbits = 24
-	// Number of bits from the hash stored for rehasing
+	// Number of bits from the hash stored for rehashing
 	cached_hashbits = 16
 	// Initial log-number of buckets in the hashtable
 	init_log_capicity = 5
 	// Initial number of buckets in the hashtable
-	init_capicity = 1 << init_log_capicity
-	// Initial max load-factor
-	init_max_load_factor = 0.8
-	// Minimum Load-factor.
-	// Number is picked to make delete O(1) amortized
-	min_load_factor = 0.3
-	// Initial range cap
+	init_capicity = 1<<init_log_capicity
+	// Maximum load-factor (size / capacity)
+	max_load_factor = 0.8
+	// Initial highest even index in metas
 	init_cap = init_capicity - 2
 	// Used for incrementing `extra_metas` when max
 	// probe count is too high, to avoid overflow
@ -78,8 +74,6 @@ const (
 	hash_mask = u32(0x00FFFFFF)
 	// Used for incrementing the probe-count
 	probe_inc = u32(0x01000000)
-	// Bitmask for maximum probe count
-	max_probe = u32(0xFF000000)
 )

 struct KeyValue {
@ -91,20 +85,20 @@ mut:
 // Dynamic array with very low growth factor
 struct DenseArray {
 mut:
-	data &KeyValue
-	cap  u32
-	size u32
+	cap     u32
+	size    u32
 	deletes u32
+	data    &KeyValue
 }

 [inline]
 fn new_dense_array() DenseArray {
-	unsafe {
-		return DenseArray {
-			data: &KeyValue(malloc(8 * sizeof(KeyValue)))
+	unsafe{
+		return DenseArray{
 			cap: 8
 			size: 0
 			deletes: 0
+			data: &KeyValue(malloc(8 * sizeof(KeyValue)))
 		}
 	}
 }
@ -114,7 +108,7 @@ fn new_dense_array() DenseArray {
 [inline]
 fn (d mut DenseArray) push(kv KeyValue) u32 {
 	if d.cap == d.size {
-		d.cap += d.cap >> 3
+		d.cap += d.cap>>3
 		d.data = &KeyValue(C.realloc(d.data, sizeof(KeyValue) * d.cap))
 	}
 	push_index := d.size
@ -127,7 +121,7 @@ fn (d mut DenseArray) push(kv KeyValue) u32 {
 // and resize array
 fn (d mut DenseArray) zeros_to_end() {
 	mut count := u32(0)
-	for i in 0..d.size {
+	for i in 0 .. d.size {
 		if d.data[i].key.str != 0 {
 			tmp := d.data[count]
 			d.data[count] = d.data[i]
@ -135,34 +129,34 @@ fn (d mut DenseArray) zeros_to_end() {
 			count++
 		}
 	}
-	count++
+	d.deletes = 0
 	d.size = count
-	d.cap = if count < 8 {8} else {count}
+	d.cap = if count < 8 { 8 } else { count }
 	d.data = &KeyValue(C.realloc(d.data, sizeof(KeyValue) * d.cap))
 }

 pub struct map {
 	// Byte size of value
-	value_bytes     int
+	value_bytes int
 mut:
-	// Index of the highest index in the hashtable
-	cap             u32
+// Index of the highest index in the hashtable
+	cap         u32
 	// Number of cached hashbits left for rehasing
-	window          byte
+	window      byte
 	// Used for right-shifting out used hashbits
-	shift           byte
-	// Pointer to Key-value memory
-	key_values      DenseArray
-	// Pointer to meta-data
-	metas           &u32
-	// Measure that decides when to increase the capacity
-	max_load_factor f32
+	shift       byte
+	// Array storing key-values (ordered)
+	key_values  DenseArray
+	// Pointer to meta-data:
+	// Odd indices stores index in `key_values`.
+	// Even indices stores probe_count and hashbits.
+	metas       &u32
 	// Extra metas that allows for no ranging when incrementing
 	// index in the hashmap
-	extra_metas     u32
+	extra_metas u32
 pub mut:
-	// Number of key-values currently in the hashmap
-	size            int
+// Number of key-values currently in the hashmap
+	size        int
 }

 fn new_map(n, value_bytes int) map {
@ -173,7 +167,6 @@ fn new_map(n, value_bytes int) map {
 		shift: init_log_capicity
 		key_values: new_dense_array()
 		metas: &u32(vcalloc(sizeof(u32) * (init_capicity + extra_metas_inc)))
-		max_load_factor: init_max_load_factor
 		extra_metas: extra_metas_inc
 		size: 0
 	}
@ -188,26 +181,26 @@ fn new_map_init(n, value_bytes int, keys &string, values voidptr) map {
 }

 [inline]
-fn (m map) key_to_index(key string) (u64, u32) {
-	hash := wyhash.wyhash_c(key.str, u64(key.len), 0)
+fn (m map) key_to_index(key string) (u32,u32) {
+	hash := u32(wyhash.wyhash_c(key.str, u64(key.len), 0))
 	index := hash & m.cap
-	meta := u32(((hash>>m.shift) & hash_mask) | probe_inc)
-	return index, meta
+	meta := ((hash>>m.shift) & hash_mask) | probe_inc
+	return index,meta
 }

 [inline]
-fn meta_less(metas &u32, i u64, m u32) (u64, u32){
+fn meta_less(metas &u32, i u32, m u32) (u32,u32) {
 	mut index := i
 	mut meta := m
 	for meta < metas[index] {
 		index += 2
 		meta += probe_inc
 	}
-	return index, meta
+	return index,meta
 }

 [inline]
-fn (m mut map) meta_greater(ms &u32, i u64, me u32, kvi u32) &u32 {
+fn (m mut map) meta_greater(ms &u32, i u32, me u32, kvi u32) &u32 {
 	mut metas := ms
 	mut meta := me
 	mut index := i
@ -226,27 +219,27 @@ fn (m mut map) meta_greater(ms &u32, i u64, me u32, kvi u32) &u32 {
 	}
 	metas[index] = meta
 	metas[index + 1] = kv_index
-	probe_count := (meta >> hashbits) - 1
-	if (probe_count << 1) == m.extra_metas {
+	probe_count := (meta>>hashbits) - 1
+	if (probe_count<<1) == m.extra_metas {
 		m.extra_metas += extra_metas_inc
 		mem_size := (m.cap + 2 + m.extra_metas)
 		metas = &u32(C.realloc(metas, sizeof(u32) * mem_size))
 		C.memset(metas + mem_size - extra_metas_inc, 0, sizeof(u32) * extra_metas_inc)
 		// Should almost never happen
 		if probe_count == 252 {
-			panic("Probe overflow")
+			panic('Probe overflow')
 		}
 	}
 	return metas
 }

 fn (m mut map) set(key string, value voidptr) {
-	load_factor := f32(m.size << 1) / f32(m.cap)
-	if load_factor > m.max_load_factor {
+	load_factor := f32(m.size<<1) / f32(m.cap)
+	if load_factor > max_load_factor {
 		m.expand()
 	}
-	mut index, mut meta := m.key_to_index(key)
-	index, meta = meta_less(m.metas, index, meta)
+	mut index,mut meta := m.key_to_index(key)
+	index,meta = meta_less(m.metas, index, meta)
 	// While we might have a match
 	for meta == m.metas[index] {
 		kv_index := m.metas[index + 1]
@ -275,8 +268,8 @@ fn (m mut map) expand() {
 	// Check if any hashbits are left
 	if m.window == 0 {
 		m.shift += cached_hashbits
-		m.rehash()
 		m.window = cached_hashbits
+		m.rehash()
 	}
 	else {
 		m.cached_rehash(old_cap)
@ -293,8 +286,8 @@ fn (m mut map) rehash() {
 			continue
 		}
 		kv := m.key_values.data[i]
-		mut index, mut meta := m.key_to_index(kv.key)
-		index, meta = meta_less(m.metas, index, meta)
+		mut index,mut meta := m.key_to_index(kv.key)
+		index,meta = meta_less(m.metas, index, meta)
 		m.metas = m.meta_greater(m.metas, index, meta, i)
 	}
 }
@ -302,16 +295,16 @@ fn (m mut map) rehash() {
 fn (m mut map) cached_rehash(old_cap u32) {
 	mut new_meta := &u32(vcalloc(sizeof(u32) * (m.cap + 2 + m.extra_metas)))
 	old_extra_metas := m.extra_metas
-	for i := 0; i <= old_cap + old_extra_metas; i += 2 {
+	for i := u32(0); i <= old_cap + old_extra_metas; i += 2 {
 		if m.metas[i] == 0 {
 			continue
 		}
 		old_meta := m.metas[i]
-		old_probe_count := u64((old_meta>>hashbits) - 1) << 1
-		old_index := (i - old_probe_count) & (m.cap >> 1)
-		mut index := u64(old_index) | (old_meta << m.shift) & m.cap
+		old_probe_count := ((old_meta>>hashbits) - 1)<<1
+		old_index := (i - old_probe_count) & (m.cap>>1)
+		mut index := (old_index | (old_meta<<m.shift)) & m.cap
 		mut meta := (old_meta & hash_mask) | probe_inc
-		index, meta = meta_less(new_meta, index, meta)
+		index,meta = meta_less(new_meta, index, meta)
 		kv_index := m.metas[i + 1]
 		new_meta = m.meta_greater(new_meta, index, meta, kv_index)
 	}
@ -321,10 +314,9 @@ fn (m mut map) cached_rehash(old_cap u32) {
 	m.metas = new_meta
 }

-[inline]
 fn (m map) get(key string, out voidptr) bool {
-	mut index, mut meta := m.key_to_index(key)
-	index, meta = meta_less(m.metas, index, meta)
+	mut index,mut meta := m.key_to_index(key)
+	index,meta = meta_less(m.metas, index, meta)
 	for meta == m.metas[index] {
 		kv_index := m.metas[index + 1]
 		if key == m.key_values.data[kv_index].key {
@ -337,10 +329,9 @@ fn (m map) get(key string, out voidptr) bool {
 	return false
 }

-[inline]
 fn (m map) get2(key string) voidptr {
-	mut index, mut meta := m.key_to_index(key)
-	index, meta = meta_less(m.metas, index, meta)
+	mut index,mut meta := m.key_to_index(key)
+	index,meta = meta_less(m.metas, index, meta)
 	for meta == m.metas[index] {
 		kv_index := m.metas[index + 1]
 		if key == m.key_values.data[kv_index].key {
@ -354,13 +345,12 @@ fn (m map) get2(key string) voidptr {
 	return voidptr(0)
 }

-[inline]
 fn (m map) exists(key string) bool {
 	if m.value_bytes == 0 {
 		return false
 	}
-	mut index, mut meta := m.key_to_index(key)
-	index, meta = meta_less(m.metas, index, meta)
+	mut index,mut meta := m.key_to_index(key)
+	index,meta = meta_less(m.metas, index, meta)
 	for meta == m.metas[index] {
 		kv_index := m.metas[index + 1]
 		if key == m.key_values.data[kv_index].key {
@ -373,29 +363,24 @@ fn (m map) exists(key string) bool {
 }

 pub fn (m mut map) delete(key string) {
-	mut index, mut meta := m.key_to_index(key)
-	index, meta = meta_less(m.metas, index, meta)
+	mut index,mut meta := m.key_to_index(key)
+	index,meta = meta_less(m.metas, index, meta)
 	// Perform backwards shifting
 	for meta == m.metas[index] {
 		kv_index := m.metas[index + 1]
 		if key == m.key_values.data[kv_index].key {
 			C.memset(&m.key_values.data[kv_index], 0, sizeof(KeyValue))
-			mut old_index := index
-			index += 2
-			mut cur_meta := m.metas[index]
-			mut cur_index := m.metas[index + 1]
-			for (cur_meta >> hashbits) > 1 {
-				m.metas[old_index] = cur_meta - probe_inc
-				m.metas[old_index + 1] = cur_index
-				old_index = index
+			for (m.metas[index + 2]>>hashbits) > 1 {
+				m.metas[index] = m.metas[index + 2] - probe_inc
+				m.metas[index + 1] = m.metas[index + 3]
 				index += 2
-				cur_meta = m.metas[index]
-				cur_index = m.metas[index + 1]
 			}
-			m.metas[old_index] = 0
 			m.size--
+			m.metas[index] = 0
 			m.key_values.deletes++
-			if m.key_values.size <= 32 {return}
+			if m.key_values.size <= 32 {
+				return
+			}
 			if (f32(m.key_values.size) / f32(m.key_values.deletes)) < 1 {
 				m.key_values.zeros_to_end()
 				m.rehash()
@ -424,7 +409,7 @@ pub fn (m &map) keys() []string {
 }

 pub fn (m map) free() {
-	unsafe {
+	unsafe{
 		free(m.metas)
 		free(m.key_values.data)
 	}
@ -440,7 +425,7 @@ pub fn (m map_string) str() string {
 	}
 	mut sb := strings.new_builder(50)
 	sb.writeln('{')
-	for key, val  in m {
+	for key, val in m {
 		sb.writeln('  "$key" => "$val"')
 	}
 	sb.writeln('}')