From bf97afb9edc86da5239ecbaba6edd01c2418f17a Mon Sep 17 00:00:00 2001 From: ka-weihe Date: Sat, 21 Mar 2020 13:55:07 +0100 Subject: [PATCH] map: small cleanup --- vlib/builtin/map.v | 177 +++++++++++++++++++++------------------------ 1 file changed, 81 insertions(+), 96 deletions(-) diff --git a/vlib/builtin/map.v b/vlib/builtin/map.v index 7ec50424f4..35a9a71cec 100644 --- a/vlib/builtin/map.v +++ b/vlib/builtin/map.v @@ -1,14 +1,12 @@ // Copyright (c) 2019-2020 Alexander Medvednikov. All rights reserved. // Use of this source code is governed by an MIT license // that can be found in the LICENSE file. - module builtin import ( strings hash.wyhash ) - /* This is a very fast hashmap implementation. It has several properties that in combination makes it very fast. Here is a short explanation of each property. @@ -17,7 +15,7 @@ After reading this you should have a basic understanding of how it works: 1. |Hash-function (Wyhash)|. Wyhash is the fastest hash-function passing SMHash- er, so it was an easy choice. -2. |Open addressing (Robin Hood Hashing)|. With this method a hash collision is +2. |Open addressing (Robin Hood Hashing)|. With this method, a hash collision is resolved by probing. As opposed to linear probing, Robin Hood hashing has a sim- ple but clever twist: As new keys are inserted, old keys are shifted around in a way such that all keys stay reasonably close to the slot they originally hash to. @@ -26,50 +24,48 @@ way such that all keys stay reasonably close to the slot they originally hash to ge of roughly 6.25% unused memory, as opposed to most other dynamic array imple- mentations with a growth factor of 1.5 or 2. The key-values keep their index in the array - they are not probed. Instead, this implementation uses another array -"metas" storing "metas" (meta-data). Each Key-value has a corresponding meta. A +"metas" storing "meta"s (meta-data). Each Key-value has a corresponding meta. A meta stores a reference to its key-value, and its index in "metas" is determined by the hash of the key and probing. A meta also stores bits from the hash (for faster rehashing etc.) and how far away it is from the index it was originally hashed to (probe_count). probe_count is 0 if empty, 1 if not probed, 2 if probed -by 1. +by 1, etc.. meta (64 bit) = kv_index (32 bit) | probe_count (8 bits) | hashbits (24 bits) metas = [meta, 0, meta, 0, meta, meta, meta, 0, ...] key_values = [kv, kv, kv, kv, kv, ...] 4. |Power of two size array|. The size of metas is a power of two. This makes it -possible to find a bucket from a hash code you can use hash & (SIZE -1) instead -of abs(hash) % SIZE. Modulo is extremely expensive so using '&' is a big perfor- -mance improvement. The general concern with this is that you only use the lower -bits of the hash and can cause many collisions. This is solved by using very go- -od hash-function. +possible to find a bucket from a hash code by using "hash & (SIZE -1)" instead +of "abs(hash) % SIZE". Modulo is extremely expensive so using '&' is a big perf- +ormance improvement. The general concern with this is that you only use the low- +er bits of the hash and that can cause more collisions. This is solved by using +good hash-function. 5. |Extra metas|. The hashmap keeps track of the highest probe_count. The trick -is to allocate extra metas > max(probe_count), so you never have to do any boun- +is to allocate extra_metas > max(probe_count), so you never have to do any boun- ds-checking because the extra metas ensures that an element will never go beyond -index the last index. +the last index. 6. |Cached rehashing|. When the load_factor of the map exceeds the max_load_fac- tor the size of metas is doubled and all the elements need to be "rehashed" to -find the index in the new array. Instead of rehashing complete, it simply uses +find the index in the new array. Instead of rehashing completely, it simply uses the hashbits stored in the meta. */ + const ( - // Number of bits from the hash stored for each entry +// Number of bits from the hash stored for each entry hashbits = 24 - // Number of bits from the hash stored for rehasing + // Number of bits from the hash stored for rehashing cached_hashbits = 16 // Initial log-number of buckets in the hashtable init_log_capicity = 5 // Initial number of buckets in the hashtable - init_capicity = 1 << init_log_capicity - // Initial max load-factor - init_max_load_factor = 0.8 - // Minimum Load-factor. - // Number is picked to make delete O(1) amortized - min_load_factor = 0.3 - // Initial range cap + init_capicity = 1<> 3 + d.cap += d.cap>>3 d.data = &KeyValue(C.realloc(d.data, sizeof(KeyValue) * d.cap)) } push_index := d.size @@ -127,7 +121,7 @@ fn (d mut DenseArray) push(kv KeyValue) u32 { // and resize array fn (d mut DenseArray) zeros_to_end() { mut count := u32(0) - for i in 0..d.size { + for i in 0 .. d.size { if d.data[i].key.str != 0 { tmp := d.data[count] d.data[count] = d.data[i] @@ -135,34 +129,34 @@ fn (d mut DenseArray) zeros_to_end() { count++ } } - count++ + d.deletes = 0 d.size = count - d.cap = if count < 8 {8} else {count} + d.cap = if count < 8 { 8 } else { count } d.data = &KeyValue(C.realloc(d.data, sizeof(KeyValue) * d.cap)) } pub struct map { // Byte size of value - value_bytes int + value_bytes int mut: - // Index of the highest index in the hashtable - cap u32 +// Index of the highest index in the hashtable + cap u32 // Number of cached hashbits left for rehasing - window byte + window byte // Used for right-shifting out used hashbits - shift byte - // Pointer to Key-value memory - key_values DenseArray - // Pointer to meta-data - metas &u32 - // Measure that decides when to increase the capacity - max_load_factor f32 + shift byte + // Array storing key-values (ordered) + key_values DenseArray + // Pointer to meta-data: + // Odd indices stores index in `key_values`. + // Even indices stores probe_count and hashbits. + metas &u32 // Extra metas that allows for no ranging when incrementing // index in the hashmap - extra_metas u32 + extra_metas u32 pub mut: - // Number of key-values currently in the hashmap - size int +// Number of key-values currently in the hashmap + size int } fn new_map(n, value_bytes int) map { @@ -173,7 +167,6 @@ fn new_map(n, value_bytes int) map { shift: init_log_capicity key_values: new_dense_array() metas: &u32(vcalloc(sizeof(u32) * (init_capicity + extra_metas_inc))) - max_load_factor: init_max_load_factor extra_metas: extra_metas_inc size: 0 } @@ -188,26 +181,26 @@ fn new_map_init(n, value_bytes int, keys &string, values voidptr) map { } [inline] -fn (m map) key_to_index(key string) (u64, u32) { - hash := wyhash.wyhash_c(key.str, u64(key.len), 0) +fn (m map) key_to_index(key string) (u32,u32) { + hash := u32(wyhash.wyhash_c(key.str, u64(key.len), 0)) index := hash & m.cap - meta := u32(((hash>>m.shift) & hash_mask) | probe_inc) - return index, meta + meta := ((hash>>m.shift) & hash_mask) | probe_inc + return index,meta } [inline] -fn meta_less(metas &u32, i u64, m u32) (u64, u32){ +fn meta_less(metas &u32, i u32, m u32) (u32,u32) { mut index := i mut meta := m for meta < metas[index] { index += 2 meta += probe_inc } - return index, meta + return index,meta } [inline] -fn (m mut map) meta_greater(ms &u32, i u64, me u32, kvi u32) &u32 { +fn (m mut map) meta_greater(ms &u32, i u32, me u32, kvi u32) &u32 { mut metas := ms mut meta := me mut index := i @@ -226,27 +219,27 @@ fn (m mut map) meta_greater(ms &u32, i u64, me u32, kvi u32) &u32 { } metas[index] = meta metas[index + 1] = kv_index - probe_count := (meta >> hashbits) - 1 - if (probe_count << 1) == m.extra_metas { + probe_count := (meta>>hashbits) - 1 + if (probe_count<<1) == m.extra_metas { m.extra_metas += extra_metas_inc mem_size := (m.cap + 2 + m.extra_metas) metas = &u32(C.realloc(metas, sizeof(u32) * mem_size)) C.memset(metas + mem_size - extra_metas_inc, 0, sizeof(u32) * extra_metas_inc) // Should almost never happen if probe_count == 252 { - panic("Probe overflow") + panic('Probe overflow') } } return metas } fn (m mut map) set(key string, value voidptr) { - load_factor := f32(m.size << 1) / f32(m.cap) - if load_factor > m.max_load_factor { + load_factor := f32(m.size<<1) / f32(m.cap) + if load_factor > max_load_factor { m.expand() } - mut index, mut meta := m.key_to_index(key) - index, meta = meta_less(m.metas, index, meta) + mut index,mut meta := m.key_to_index(key) + index,meta = meta_less(m.metas, index, meta) // While we might have a match for meta == m.metas[index] { kv_index := m.metas[index + 1] @@ -275,8 +268,8 @@ fn (m mut map) expand() { // Check if any hashbits are left if m.window == 0 { m.shift += cached_hashbits - m.rehash() m.window = cached_hashbits + m.rehash() } else { m.cached_rehash(old_cap) @@ -293,8 +286,8 @@ fn (m mut map) rehash() { continue } kv := m.key_values.data[i] - mut index, mut meta := m.key_to_index(kv.key) - index, meta = meta_less(m.metas, index, meta) + mut index,mut meta := m.key_to_index(kv.key) + index,meta = meta_less(m.metas, index, meta) m.metas = m.meta_greater(m.metas, index, meta, i) } } @@ -302,16 +295,16 @@ fn (m mut map) rehash() { fn (m mut map) cached_rehash(old_cap u32) { mut new_meta := &u32(vcalloc(sizeof(u32) * (m.cap + 2 + m.extra_metas))) old_extra_metas := m.extra_metas - for i := 0; i <= old_cap + old_extra_metas; i += 2 { + for i := u32(0); i <= old_cap + old_extra_metas; i += 2 { if m.metas[i] == 0 { continue } old_meta := m.metas[i] - old_probe_count := u64((old_meta>>hashbits) - 1) << 1 - old_index := (i - old_probe_count) & (m.cap >> 1) - mut index := u64(old_index) | (old_meta << m.shift) & m.cap + old_probe_count := ((old_meta>>hashbits) - 1)<<1 + old_index := (i - old_probe_count) & (m.cap>>1) + mut index := (old_index | (old_meta<> hashbits) > 1 { - m.metas[old_index] = cur_meta - probe_inc - m.metas[old_index + 1] = cur_index - old_index = index + for (m.metas[index + 2]>>hashbits) > 1 { + m.metas[index] = m.metas[index + 2] - probe_inc + m.metas[index + 1] = m.metas[index + 3] index += 2 - cur_meta = m.metas[index] - cur_index = m.metas[index + 1] } - m.metas[old_index] = 0 m.size-- + m.metas[index] = 0 m.key_values.deletes++ - if m.key_values.size <= 32 {return} + if m.key_values.size <= 32 { + return + } if (f32(m.key_values.size) / f32(m.key_values.deletes)) < 1 { m.key_values.zeros_to_end() m.rehash() @@ -424,7 +409,7 @@ pub fn (m &map) keys() []string { } pub fn (m map) free() { - unsafe { + unsafe{ free(m.metas) free(m.key_values.data) } @@ -440,7 +425,7 @@ pub fn (m map_string) str() string { } mut sb := strings.new_builder(50) sb.writeln('{') - for key, val in m { + for key, val in m { sb.writeln(' "$key" => "$val"') } sb.writeln('}')