map: small cleanup
parent
f962d92623
commit
bf97afb9ed
|
@ -1,14 +1,12 @@
|
||||||
// Copyright (c) 2019-2020 Alexander Medvednikov. All rights reserved.
|
// Copyright (c) 2019-2020 Alexander Medvednikov. All rights reserved.
|
||||||
// Use of this source code is governed by an MIT license
|
// Use of this source code is governed by an MIT license
|
||||||
// that can be found in the LICENSE file.
|
// that can be found in the LICENSE file.
|
||||||
|
|
||||||
module builtin
|
module builtin
|
||||||
|
|
||||||
import (
|
import (
|
||||||
strings
|
strings
|
||||||
hash.wyhash
|
hash.wyhash
|
||||||
)
|
)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
This is a very fast hashmap implementation. It has several properties that in
|
This is a very fast hashmap implementation. It has several properties that in
|
||||||
combination makes it very fast. Here is a short explanation of each property.
|
combination makes it very fast. Here is a short explanation of each property.
|
||||||
|
@ -17,7 +15,7 @@ After reading this you should have a basic understanding of how it works:
|
||||||
1. |Hash-function (Wyhash)|. Wyhash is the fastest hash-function passing SMHash-
|
1. |Hash-function (Wyhash)|. Wyhash is the fastest hash-function passing SMHash-
|
||||||
er, so it was an easy choice.
|
er, so it was an easy choice.
|
||||||
|
|
||||||
2. |Open addressing (Robin Hood Hashing)|. With this method a hash collision is
|
2. |Open addressing (Robin Hood Hashing)|. With this method, a hash collision is
|
||||||
resolved by probing. As opposed to linear probing, Robin Hood hashing has a sim-
|
resolved by probing. As opposed to linear probing, Robin Hood hashing has a sim-
|
||||||
ple but clever twist: As new keys are inserted, old keys are shifted around in a
|
ple but clever twist: As new keys are inserted, old keys are shifted around in a
|
||||||
way such that all keys stay reasonably close to the slot they originally hash to.
|
way such that all keys stay reasonably close to the slot they originally hash to.
|
||||||
|
@ -26,50 +24,48 @@ way such that all keys stay reasonably close to the slot they originally hash to
|
||||||
ge of roughly 6.25% unused memory, as opposed to most other dynamic array imple-
|
ge of roughly 6.25% unused memory, as opposed to most other dynamic array imple-
|
||||||
mentations with a growth factor of 1.5 or 2. The key-values keep their index in
|
mentations with a growth factor of 1.5 or 2. The key-values keep their index in
|
||||||
the array - they are not probed. Instead, this implementation uses another array
|
the array - they are not probed. Instead, this implementation uses another array
|
||||||
"metas" storing "metas" (meta-data). Each Key-value has a corresponding meta. A
|
"metas" storing "meta"s (meta-data). Each Key-value has a corresponding meta. A
|
||||||
meta stores a reference to its key-value, and its index in "metas" is determined
|
meta stores a reference to its key-value, and its index in "metas" is determined
|
||||||
by the hash of the key and probing. A meta also stores bits from the hash (for
|
by the hash of the key and probing. A meta also stores bits from the hash (for
|
||||||
faster rehashing etc.) and how far away it is from the index it was originally
|
faster rehashing etc.) and how far away it is from the index it was originally
|
||||||
hashed to (probe_count). probe_count is 0 if empty, 1 if not probed, 2 if probed
|
hashed to (probe_count). probe_count is 0 if empty, 1 if not probed, 2 if probed
|
||||||
by 1.
|
by 1, etc..
|
||||||
|
|
||||||
meta (64 bit) = kv_index (32 bit) | probe_count (8 bits) | hashbits (24 bits)
|
meta (64 bit) = kv_index (32 bit) | probe_count (8 bits) | hashbits (24 bits)
|
||||||
metas = [meta, 0, meta, 0, meta, meta, meta, 0, ...]
|
metas = [meta, 0, meta, 0, meta, meta, meta, 0, ...]
|
||||||
key_values = [kv, kv, kv, kv, kv, ...]
|
key_values = [kv, kv, kv, kv, kv, ...]
|
||||||
|
|
||||||
4. |Power of two size array|. The size of metas is a power of two. This makes it
|
4. |Power of two size array|. The size of metas is a power of two. This makes it
|
||||||
possible to find a bucket from a hash code you can use hash & (SIZE -1) instead
|
possible to find a bucket from a hash code by using "hash & (SIZE -1)" instead
|
||||||
of abs(hash) % SIZE. Modulo is extremely expensive so using '&' is a big perfor-
|
of "abs(hash) % SIZE". Modulo is extremely expensive so using '&' is a big perf-
|
||||||
mance improvement. The general concern with this is that you only use the lower
|
ormance improvement. The general concern with this is that you only use the low-
|
||||||
bits of the hash and can cause many collisions. This is solved by using very go-
|
er bits of the hash and that can cause more collisions. This is solved by using
|
||||||
od hash-function.
|
good hash-function.
|
||||||
|
|
||||||
5. |Extra metas|. The hashmap keeps track of the highest probe_count. The trick
|
5. |Extra metas|. The hashmap keeps track of the highest probe_count. The trick
|
||||||
is to allocate extra metas > max(probe_count), so you never have to do any boun-
|
is to allocate extra_metas > max(probe_count), so you never have to do any boun-
|
||||||
ds-checking because the extra metas ensures that an element will never go beyond
|
ds-checking because the extra metas ensures that an element will never go beyond
|
||||||
index the last index.
|
the last index.
|
||||||
|
|
||||||
6. |Cached rehashing|. When the load_factor of the map exceeds the max_load_fac-
|
6. |Cached rehashing|. When the load_factor of the map exceeds the max_load_fac-
|
||||||
tor the size of metas is doubled and all the elements need to be "rehashed" to
|
tor the size of metas is doubled and all the elements need to be "rehashed" to
|
||||||
find the index in the new array. Instead of rehashing complete, it simply uses
|
find the index in the new array. Instead of rehashing completely, it simply uses
|
||||||
the hashbits stored in the meta.
|
the hashbits stored in the meta.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
const (
|
const (
|
||||||
// Number of bits from the hash stored for each entry
|
// Number of bits from the hash stored for each entry
|
||||||
hashbits = 24
|
hashbits = 24
|
||||||
// Number of bits from the hash stored for rehasing
|
// Number of bits from the hash stored for rehashing
|
||||||
cached_hashbits = 16
|
cached_hashbits = 16
|
||||||
// Initial log-number of buckets in the hashtable
|
// Initial log-number of buckets in the hashtable
|
||||||
init_log_capicity = 5
|
init_log_capicity = 5
|
||||||
// Initial number of buckets in the hashtable
|
// Initial number of buckets in the hashtable
|
||||||
init_capicity = 1<<init_log_capicity
|
init_capicity = 1<<init_log_capicity
|
||||||
// Initial max load-factor
|
// Maximum load-factor (size / capacity)
|
||||||
init_max_load_factor = 0.8
|
max_load_factor = 0.8
|
||||||
// Minimum Load-factor.
|
// Initial highest even index in metas
|
||||||
// Number is picked to make delete O(1) amortized
|
|
||||||
min_load_factor = 0.3
|
|
||||||
// Initial range cap
|
|
||||||
init_cap = init_capicity - 2
|
init_cap = init_capicity - 2
|
||||||
// Used for incrementing `extra_metas` when max
|
// Used for incrementing `extra_metas` when max
|
||||||
// probe count is too high, to avoid overflow
|
// probe count is too high, to avoid overflow
|
||||||
|
@ -78,8 +74,6 @@ const (
|
||||||
hash_mask = u32(0x00FFFFFF)
|
hash_mask = u32(0x00FFFFFF)
|
||||||
// Used for incrementing the probe-count
|
// Used for incrementing the probe-count
|
||||||
probe_inc = u32(0x01000000)
|
probe_inc = u32(0x01000000)
|
||||||
// Bitmask for maximum probe count
|
|
||||||
max_probe = u32(0xFF000000)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
struct KeyValue {
|
struct KeyValue {
|
||||||
|
@ -91,20 +85,20 @@ mut:
|
||||||
// Dynamic array with very low growth factor
|
// Dynamic array with very low growth factor
|
||||||
struct DenseArray {
|
struct DenseArray {
|
||||||
mut:
|
mut:
|
||||||
data &KeyValue
|
|
||||||
cap u32
|
cap u32
|
||||||
size u32
|
size u32
|
||||||
deletes u32
|
deletes u32
|
||||||
|
data &KeyValue
|
||||||
}
|
}
|
||||||
|
|
||||||
[inline]
|
[inline]
|
||||||
fn new_dense_array() DenseArray {
|
fn new_dense_array() DenseArray {
|
||||||
unsafe{
|
unsafe{
|
||||||
return DenseArray{
|
return DenseArray{
|
||||||
data: &KeyValue(malloc(8 * sizeof(KeyValue)))
|
|
||||||
cap: 8
|
cap: 8
|
||||||
size: 0
|
size: 0
|
||||||
deletes: 0
|
deletes: 0
|
||||||
|
data: &KeyValue(malloc(8 * sizeof(KeyValue)))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -135,7 +129,7 @@ fn (d mut DenseArray) zeros_to_end() {
|
||||||
count++
|
count++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
count++
|
d.deletes = 0
|
||||||
d.size = count
|
d.size = count
|
||||||
d.cap = if count < 8 { 8 } else { count }
|
d.cap = if count < 8 { 8 } else { count }
|
||||||
d.data = &KeyValue(C.realloc(d.data, sizeof(KeyValue) * d.cap))
|
d.data = &KeyValue(C.realloc(d.data, sizeof(KeyValue) * d.cap))
|
||||||
|
@ -151,12 +145,12 @@ mut:
|
||||||
window byte
|
window byte
|
||||||
// Used for right-shifting out used hashbits
|
// Used for right-shifting out used hashbits
|
||||||
shift byte
|
shift byte
|
||||||
// Pointer to Key-value memory
|
// Array storing key-values (ordered)
|
||||||
key_values DenseArray
|
key_values DenseArray
|
||||||
// Pointer to meta-data
|
// Pointer to meta-data:
|
||||||
|
// Odd indices stores index in `key_values`.
|
||||||
|
// Even indices stores probe_count and hashbits.
|
||||||
metas &u32
|
metas &u32
|
||||||
// Measure that decides when to increase the capacity
|
|
||||||
max_load_factor f32
|
|
||||||
// Extra metas that allows for no ranging when incrementing
|
// Extra metas that allows for no ranging when incrementing
|
||||||
// index in the hashmap
|
// index in the hashmap
|
||||||
extra_metas u32
|
extra_metas u32
|
||||||
|
@ -173,7 +167,6 @@ fn new_map(n, value_bytes int) map {
|
||||||
shift: init_log_capicity
|
shift: init_log_capicity
|
||||||
key_values: new_dense_array()
|
key_values: new_dense_array()
|
||||||
metas: &u32(vcalloc(sizeof(u32) * (init_capicity + extra_metas_inc)))
|
metas: &u32(vcalloc(sizeof(u32) * (init_capicity + extra_metas_inc)))
|
||||||
max_load_factor: init_max_load_factor
|
|
||||||
extra_metas: extra_metas_inc
|
extra_metas: extra_metas_inc
|
||||||
size: 0
|
size: 0
|
||||||
}
|
}
|
||||||
|
@ -188,15 +181,15 @@ fn new_map_init(n, value_bytes int, keys &string, values voidptr) map {
|
||||||
}
|
}
|
||||||
|
|
||||||
[inline]
|
[inline]
|
||||||
fn (m map) key_to_index(key string) (u64, u32) {
|
fn (m map) key_to_index(key string) (u32,u32) {
|
||||||
hash := wyhash.wyhash_c(key.str, u64(key.len), 0)
|
hash := u32(wyhash.wyhash_c(key.str, u64(key.len), 0))
|
||||||
index := hash & m.cap
|
index := hash & m.cap
|
||||||
meta := u32(((hash>>m.shift) & hash_mask) | probe_inc)
|
meta := ((hash>>m.shift) & hash_mask) | probe_inc
|
||||||
return index,meta
|
return index,meta
|
||||||
}
|
}
|
||||||
|
|
||||||
[inline]
|
[inline]
|
||||||
fn meta_less(metas &u32, i u64, m u32) (u64, u32){
|
fn meta_less(metas &u32, i u32, m u32) (u32,u32) {
|
||||||
mut index := i
|
mut index := i
|
||||||
mut meta := m
|
mut meta := m
|
||||||
for meta < metas[index] {
|
for meta < metas[index] {
|
||||||
|
@ -207,7 +200,7 @@ fn meta_less(metas &u32, i u64, m u32) (u64, u32){
|
||||||
}
|
}
|
||||||
|
|
||||||
[inline]
|
[inline]
|
||||||
fn (m mut map) meta_greater(ms &u32, i u64, me u32, kvi u32) &u32 {
|
fn (m mut map) meta_greater(ms &u32, i u32, me u32, kvi u32) &u32 {
|
||||||
mut metas := ms
|
mut metas := ms
|
||||||
mut meta := me
|
mut meta := me
|
||||||
mut index := i
|
mut index := i
|
||||||
|
@ -234,7 +227,7 @@ fn (m mut map) meta_greater(ms &u32, i u64, me u32, kvi u32) &u32 {
|
||||||
C.memset(metas + mem_size - extra_metas_inc, 0, sizeof(u32) * extra_metas_inc)
|
C.memset(metas + mem_size - extra_metas_inc, 0, sizeof(u32) * extra_metas_inc)
|
||||||
// Should almost never happen
|
// Should almost never happen
|
||||||
if probe_count == 252 {
|
if probe_count == 252 {
|
||||||
panic("Probe overflow")
|
panic('Probe overflow')
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return metas
|
return metas
|
||||||
|
@ -242,7 +235,7 @@ fn (m mut map) meta_greater(ms &u32, i u64, me u32, kvi u32) &u32 {
|
||||||
|
|
||||||
fn (m mut map) set(key string, value voidptr) {
|
fn (m mut map) set(key string, value voidptr) {
|
||||||
load_factor := f32(m.size<<1) / f32(m.cap)
|
load_factor := f32(m.size<<1) / f32(m.cap)
|
||||||
if load_factor > m.max_load_factor {
|
if load_factor > max_load_factor {
|
||||||
m.expand()
|
m.expand()
|
||||||
}
|
}
|
||||||
mut index,mut meta := m.key_to_index(key)
|
mut index,mut meta := m.key_to_index(key)
|
||||||
|
@ -275,8 +268,8 @@ fn (m mut map) expand() {
|
||||||
// Check if any hashbits are left
|
// Check if any hashbits are left
|
||||||
if m.window == 0 {
|
if m.window == 0 {
|
||||||
m.shift += cached_hashbits
|
m.shift += cached_hashbits
|
||||||
m.rehash()
|
|
||||||
m.window = cached_hashbits
|
m.window = cached_hashbits
|
||||||
|
m.rehash()
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
m.cached_rehash(old_cap)
|
m.cached_rehash(old_cap)
|
||||||
|
@ -302,14 +295,14 @@ fn (m mut map) rehash() {
|
||||||
fn (m mut map) cached_rehash(old_cap u32) {
|
fn (m mut map) cached_rehash(old_cap u32) {
|
||||||
mut new_meta := &u32(vcalloc(sizeof(u32) * (m.cap + 2 + m.extra_metas)))
|
mut new_meta := &u32(vcalloc(sizeof(u32) * (m.cap + 2 + m.extra_metas)))
|
||||||
old_extra_metas := m.extra_metas
|
old_extra_metas := m.extra_metas
|
||||||
for i := 0; i <= old_cap + old_extra_metas; i += 2 {
|
for i := u32(0); i <= old_cap + old_extra_metas; i += 2 {
|
||||||
if m.metas[i] == 0 {
|
if m.metas[i] == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
old_meta := m.metas[i]
|
old_meta := m.metas[i]
|
||||||
old_probe_count := u64((old_meta>>hashbits) - 1) << 1
|
old_probe_count := ((old_meta>>hashbits) - 1)<<1
|
||||||
old_index := (i - old_probe_count) & (m.cap>>1)
|
old_index := (i - old_probe_count) & (m.cap>>1)
|
||||||
mut index := u64(old_index) | (old_meta << m.shift) & m.cap
|
mut index := (old_index | (old_meta<<m.shift)) & m.cap
|
||||||
mut meta := (old_meta & hash_mask) | probe_inc
|
mut meta := (old_meta & hash_mask) | probe_inc
|
||||||
index,meta = meta_less(new_meta, index, meta)
|
index,meta = meta_less(new_meta, index, meta)
|
||||||
kv_index := m.metas[i + 1]
|
kv_index := m.metas[i + 1]
|
||||||
|
@ -321,7 +314,6 @@ fn (m mut map) cached_rehash(old_cap u32) {
|
||||||
m.metas = new_meta
|
m.metas = new_meta
|
||||||
}
|
}
|
||||||
|
|
||||||
[inline]
|
|
||||||
fn (m map) get(key string, out voidptr) bool {
|
fn (m map) get(key string, out voidptr) bool {
|
||||||
mut index,mut meta := m.key_to_index(key)
|
mut index,mut meta := m.key_to_index(key)
|
||||||
index,meta = meta_less(m.metas, index, meta)
|
index,meta = meta_less(m.metas, index, meta)
|
||||||
|
@ -337,7 +329,6 @@ fn (m map) get(key string, out voidptr) bool {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
[inline]
|
|
||||||
fn (m map) get2(key string) voidptr {
|
fn (m map) get2(key string) voidptr {
|
||||||
mut index,mut meta := m.key_to_index(key)
|
mut index,mut meta := m.key_to_index(key)
|
||||||
index,meta = meta_less(m.metas, index, meta)
|
index,meta = meta_less(m.metas, index, meta)
|
||||||
|
@ -354,7 +345,6 @@ fn (m map) get2(key string) voidptr {
|
||||||
return voidptr(0)
|
return voidptr(0)
|
||||||
}
|
}
|
||||||
|
|
||||||
[inline]
|
|
||||||
fn (m map) exists(key string) bool {
|
fn (m map) exists(key string) bool {
|
||||||
if m.value_bytes == 0 {
|
if m.value_bytes == 0 {
|
||||||
return false
|
return false
|
||||||
|
@ -380,22 +370,17 @@ pub fn (m mut map) delete(key string) {
|
||||||
kv_index := m.metas[index + 1]
|
kv_index := m.metas[index + 1]
|
||||||
if key == m.key_values.data[kv_index].key {
|
if key == m.key_values.data[kv_index].key {
|
||||||
C.memset(&m.key_values.data[kv_index], 0, sizeof(KeyValue))
|
C.memset(&m.key_values.data[kv_index], 0, sizeof(KeyValue))
|
||||||
mut old_index := index
|
for (m.metas[index + 2]>>hashbits) > 1 {
|
||||||
|
m.metas[index] = m.metas[index + 2] - probe_inc
|
||||||
|
m.metas[index + 1] = m.metas[index + 3]
|
||||||
index += 2
|
index += 2
|
||||||
mut cur_meta := m.metas[index]
|
|
||||||
mut cur_index := m.metas[index + 1]
|
|
||||||
for (cur_meta >> hashbits) > 1 {
|
|
||||||
m.metas[old_index] = cur_meta - probe_inc
|
|
||||||
m.metas[old_index + 1] = cur_index
|
|
||||||
old_index = index
|
|
||||||
index += 2
|
|
||||||
cur_meta = m.metas[index]
|
|
||||||
cur_index = m.metas[index + 1]
|
|
||||||
}
|
}
|
||||||
m.metas[old_index] = 0
|
|
||||||
m.size--
|
m.size--
|
||||||
|
m.metas[index] = 0
|
||||||
m.key_values.deletes++
|
m.key_values.deletes++
|
||||||
if m.key_values.size <= 32 {return}
|
if m.key_values.size <= 32 {
|
||||||
|
return
|
||||||
|
}
|
||||||
if (f32(m.key_values.size) / f32(m.key_values.deletes)) < 1 {
|
if (f32(m.key_values.size) / f32(m.key_values.deletes)) < 1 {
|
||||||
m.key_values.zeros_to_end()
|
m.key_values.zeros_to_end()
|
||||||
m.rehash()
|
m.rehash()
|
||||||
|
|
Loading…
Reference in New Issue