From 6fd175d9be10729ce48c4d3c378e0faeac22dff4 Mon Sep 17 00:00:00 2001 From: ka-weihe Date: Fri, 24 Jan 2020 20:13:17 +0100 Subject: [PATCH] hashmap: new and fast hashmap with dynamic size --- vlib/builtin/hashmap/hashmap.v | 332 +++++++++++++++++++--------- vlib/builtin/hashmap/hashmap_test.v | 5 +- 2 files changed, 229 insertions(+), 108 deletions(-) diff --git a/vlib/builtin/hashmap/hashmap.v b/vlib/builtin/hashmap/hashmap.v index d3e779b2a8..518e725e83 100644 --- a/vlib/builtin/hashmap/hashmap.v +++ b/vlib/builtin/hashmap/hashmap.v @@ -2,120 +2,242 @@ // Use of this source code is governed by an MIT license // that can be found in the LICENSE file. module hashmap -/* - This is work in progress. - A very early test version of the Hashmap with a fixed size. - Only works with string keys and int values for now. - - I added this to improve performance of the V compiler, - which uses lots of O(log n) map get's. Turned out with N < 10 000 - the performance gains are basically non-existent. -*/ - - -struct Hashmap { - cap int - keys []string - table []Hashmapentry - elm_size int -pub mut: - nr_collisions int -} - -struct Hashmapentry { -mut: - key string - val int - next &Hashmapentry // linked list for collisions -} const ( - min_cap = 2<<10 - max_cap = 2<<20 -) - -const( - fnv64_prime = 1099511628211 + initial_size = 2<<4 + initial_cap = initial_size - 1 + load_factor = 0.5 + probe_offset = u16(256) + fnv64_prime = 1099511628211 fnv64_offset_basis = 14695981039346656037 + fnv32_offset_basis = u32(2166136261) + fnv32_prime = u32(16777619) ) -const( - fnv32_offset_basis = u32(2166136261) - fnv32_prime = u32(16777619) -) - -pub fn new_hashmap(planned_nr_items int) Hashmap { - mut cap := planned_nr_items * 5 - if cap < min_cap { - cap = min_cap - } - if cap > max_cap { - cap = max_cap - } - return Hashmap{ - cap: cap - elm_size: 4 - table: make(cap, cap, sizeof(Hashmapentry)) - } +pub struct Hashmap { +mut: + info &u16 + key_values &KeyValue + cap int +pub mut: + size int } -pub fn (m mut Hashmap) set(key string, val int) { - // mut hash := int(b_fabs(key.hash())) - // idx := hash % m.cap - idx := int(fnv1a32(key) % m.cap) - if m.table[idx].key.len != 0 { - // println('\nset() idx=$idx key="$key" hash="$hash" val=$val') - m.nr_collisions++ - // println('collision:' + m.table[idx].key) - mut e := &m.table[idx] - for e.next != 0 { - e = e.next - } - e.next = &Hashmapentry{ - key,val,0} - } - else { - m.table[idx] = Hashmapentry{ - key,val,0} - } -} - -pub fn (m &Hashmap) get(key string) int { - // mut hash := int(b_fabs(key.hash())) - // idx := hash % m.cap - idx := int(fnv1a32(key) % m.cap) - mut e := &m.table[idx] - for e.next != 0 { - // todo unsafe { - if e.key == key { - return e.val - } - e = e.next - } - return e.val -} - -[inline] -fn b_fabs(v int) f64 { - return if v < 0 { -v } else { v } -} - -// inline functions here for speed -// rather than full impl in vlib -[inline] -fn fnv1a32(data string) u32 { - mut hash := fnv32_offset_basis - for i := 0; i < data.len; i++ { - hash = (hash ^ u32(data[i])) * fnv32_prime - } - return hash +struct KeyValue { + key string +mut: + value int } [inline] fn fnv1a64(data string) u64 { - mut hash := fnv64_offset_basis - for i := 0; i < data.len; i++ { - hash = (hash ^ u64(data[i])) * fnv64_prime - } - return hash + mut hash := fnv64_offset_basis + for i := 0; i < data.len; i++ { + hash = (hash ^ u64(data[i])) * fnv64_prime + } + return hash +} + +pub fn new_hashmap() Hashmap { + return Hashmap{ + info: &u16(calloc(sizeof(u16) * initial_size)) + key_values: &KeyValue(calloc(sizeof(KeyValue) * initial_size)) + cap: initial_cap + size: 0 + } +} + +pub fn (h mut Hashmap) set(key string, value int) { + // The load factor is 0.5. + // It will be adjustable in the future and with + // a higher default settings to lower memory usage. + if (h.size<<1) == (h.cap - 1) { + h.rehash() + } + // Hash-function will be swapped for wyhash + hash := fnv1a64(key) + mut info := u16((hash>>56) | probe_offset) + mut index := hash & h.cap + // While probe count is less + for info < h.info[index] { + index = (index + 1) & h.cap + info += probe_offset + } + // While we might have a match + for info == h.info[index] { + if key == h.key_values[index].key { + h.key_values[index].value = value + return + } + index = (index + 1) & h.cap + info += probe_offset + } + // Match is not possible anymore. + // Probe until an empty index is found. + // Swap when probe count is higher/richer (Robin Hood). + mut current_key := key + mut current_value := value + for h.info[index] != 0 { + if info > h.info[index] { + tmp_kv := h.key_values[index] + tmp_info := h.info[index] + h.key_values[index] = KeyValue{ + current_key,current_value} + h.info[index] = info + current_key = tmp_kv.key + current_value = tmp_kv.value + info = tmp_info + } + index = (index + 1) & h.cap + info += probe_offset + } + // Should almost never happen + if (info & 0xFF00) == 0xFF00 { + h.rehash() + h.set(current_key, current_value) + return + } + h.info[index] = info + h.key_values[index] = KeyValue{ + current_key,current_value} + h.size++ +} + +fn (h mut Hashmap) rehash() { + old_cap := h.cap + h.cap = ((h.cap + 1)<<1) - 1 + mut new_key_values := &KeyValue(calloc(sizeof(KeyValue) * (h.cap + 1))) + mut new_info := &u16(calloc(sizeof(u16) * (h.cap + 1))) + for i in 0 .. (old_cap + 1) { + if h.info[i] != 0 { + key := h.key_values[i].key + value := h.key_values[i].value + hash := fnv1a64(key) + mut info := u16((hash>>56) | probe_offset) + mut index := hash & h.cap + // While probe count is less + for info < new_info[index] { + index = (index + 1) & h.cap + info += probe_offset + } + // While we might have a match + for info == new_info[index] { + if key == new_key_values[index].key { + new_key_values[index].value = value + return + } + index = (index + 1) & h.cap + info += probe_offset + } + // Match is not possible anymore. + // Probe until an empty index is found. + // Swap when probe count is higher/richer (Robin Hood). + mut current_key := key + mut current_value := value + for new_info[index] != 0 { + if info > new_info[index] { + tmp_kv := new_key_values[index] + tmp_info := new_info[index] + new_key_values[index] = KeyValue{ + current_key,current_value} + new_info[index] = info + current_key = tmp_kv.key + current_value = tmp_kv.value + info = tmp_info + } + index = (index + 1) & h.cap + info += probe_offset + } + // Should almost never happen + if (info & 0xFF00) == 0xFF00 { + h.rehash() + h.set(current_key, current_value) + return + } + new_info[index] = info + new_key_values[index] = KeyValue{ + current_key,current_value} + } + } + h.key_values = new_key_values + h.info = new_info +} + +pub fn (h mut Hashmap) delete(key string) { + hash := fnv1a64(key) + mut index := hash & h.cap + mut info := u16((hash>>56) | probe_offset) + for info < h.info[index] { + index = (index + 1) & h.cap + info += probe_offset + } + // Perform backwards shifting + for info == h.info[index] { + if key == h.key_values[index].key { + mut old_index := index + index = (index + 1) & h.cap + mut current_info := h.info[index] + for (current_info>>8) > 1 { + h.info[old_index] = current_info - probe_offset + h.key_values[old_index] = h.key_values[index] + old_index = index + index = (index + 1) & h.cap + current_info = h.info[index] + } + h.info[old_index] = 0 + h.size-- + return + } + index = (index + 1) & h.cap + info += probe_offset + } +} + +pub fn (h Hashmap) get(key string) int { + hash := fnv1a64(key) + mut index := hash & h.cap + mut info := u16((hash>>56) | probe_offset) + for info < h.info[index] { + index = (index + 1) & h.cap + info += probe_offset + } + for info == h.info[index] { + if key == h.key_values[index].key { + return h.key_values[index].value + } + index = (index + 1) & h.cap + info += probe_offset + } + return 0 +} + +pub fn (h Hashmap) exists(key string) bool { + hash := fnv1a64(key) + mut index := hash & h.cap + mut info := u16((hash>>56) | probe_offset) + for info < h.info[index] { + index = (index + 1) & h.cap + info += probe_offset + } + for info == h.info[index] { + if key == h.key_values[index].key { + return true + } + index = (index + 1) & h.cap + info += probe_offset + } + return false +} + +pub fn (h Hashmap) keys() []string { + size := h.size + mut keys := [''].repeat(size) + mut j := 0 + for i in 0 .. (h.cap + 1) { + if h.info[i] != 0 { + keys[j] = h.key_values[i].key + j++ + } + } + return keys } diff --git a/vlib/builtin/hashmap/hashmap_test.v b/vlib/builtin/hashmap/hashmap_test.v index 739e4226f1..cb0de6de86 100644 --- a/vlib/builtin/hashmap/hashmap_test.v +++ b/vlib/builtin/hashmap/hashmap_test.v @@ -3,7 +3,7 @@ module hashmap import rand fn test_random_strings() { - mut m := new_hashmap(1000) + mut m := new_hashmap() for i in 0..1000 { mut buf := []byte for j in 0..10 { @@ -21,12 +21,11 @@ fn test_random_strings() { fn test_large_hashmap() { N := 300 * 1000 - mut nums := new_hashmap(N) + mut nums := new_hashmap() for i := 0; i < N; i++ { key := i.str() nums.set(key, i) } - println('nr collisions: $nums.nr_collisions') for i := 0; i < N; i++ { key := i.str() assert nums.get(key) == i