map: small cleanup

pull/4093/head
ka-weihe 2020-03-21 13:55:07 +01:00 committed by GitHub
parent f962d92623
commit bf97afb9ed
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 81 additions and 96 deletions

View File

@ -1,14 +1,12 @@
// Copyright (c) 2019-2020 Alexander Medvednikov. All rights reserved. // Copyright (c) 2019-2020 Alexander Medvednikov. All rights reserved.
// Use of this source code is governed by an MIT license // Use of this source code is governed by an MIT license
// that can be found in the LICENSE file. // that can be found in the LICENSE file.
module builtin module builtin
import ( import (
strings strings
hash.wyhash hash.wyhash
) )
/* /*
This is a very fast hashmap implementation. It has several properties that in This is a very fast hashmap implementation. It has several properties that in
combination make it very fast. Here is a short explanation of each property. combination make it very fast. Here is a short explanation of each property.
@ -17,7 +15,7 @@ After reading this you should have a basic understanding of how it works:
1. |Hash-function (Wyhash)|. Wyhash is the fastest hash-function passing SMHash- 1. |Hash-function (Wyhash)|. Wyhash is the fastest hash-function passing SMHash-
er, so it was an easy choice. er, so it was an easy choice.
2. |Open addressing (Robin Hood Hashing)|. With this method a hash collision is 2. |Open addressing (Robin Hood Hashing)|. With this method, a hash collision is
resolved by probing. As opposed to linear probing, Robin Hood hashing has a sim- resolved by probing. As opposed to linear probing, Robin Hood hashing has a sim-
ple but clever twist: As new keys are inserted, old keys are shifted around in a ple but clever twist: As new keys are inserted, old keys are shifted around in a
way such that all keys stay reasonably close to the slot they originally hash to. way such that all keys stay reasonably close to the slot they originally hash to.
@ -26,50 +24,48 @@ way such that all keys stay reasonably close to the slot they originally hash to
ge of roughly 6.25% unused memory, as opposed to most other dynamic array imple- ge of roughly 6.25% unused memory, as opposed to most other dynamic array imple-
mentations with a growth factor of 1.5 or 2. The key-values keep their index in mentations with a growth factor of 1.5 or 2. The key-values keep their index in
the array - they are not probed. Instead, this implementation uses another array the array - they are not probed. Instead, this implementation uses another array
"metas" storing "metas" (meta-data). Each Key-value has a corresponding meta. A "metas" storing "meta"s (meta-data). Each Key-value has a corresponding meta. A
meta stores a reference to its key-value, and its index in "metas" is determined meta stores a reference to its key-value, and its index in "metas" is determined
by the hash of the key and probing. A meta also stores bits from the hash (for by the hash of the key and probing. A meta also stores bits from the hash (for
faster rehashing etc.) and how far away it is from the index it was originally faster rehashing etc.) and how far away it is from the index it was originally
hashed to (probe_count). probe_count is 0 if empty, 1 if not probed, 2 if probed hashed to (probe_count). probe_count is 0 if empty, 1 if not probed, 2 if probed
by 1. by 1, etc.
meta (64 bit) = kv_index (32 bit) | probe_count (8 bits) | hashbits (24 bits) meta (64 bit) = kv_index (32 bit) | probe_count (8 bits) | hashbits (24 bits)
metas = [meta, 0, meta, 0, meta, meta, meta, 0, ...] metas = [meta, 0, meta, 0, meta, meta, meta, 0, ...]
key_values = [kv, kv, kv, kv, kv, ...] key_values = [kv, kv, kv, kv, kv, ...]
4. |Power of two size array|. The size of metas is a power of two. This makes it 4. |Power of two size array|. The size of metas is a power of two. This makes it
possible to find a bucket from a hash code you can use hash & (SIZE -1) instead possible to find a bucket from a hash code by using "hash & (SIZE -1)" instead
of abs(hash) % SIZE. Modulo is extremely expensive so using '&' is a big perfor- of "abs(hash) % SIZE". Modulo is extremely expensive so using '&' is a big perf-
mance improvement. The general concern with this is that you only use the lower ormance improvement. The general concern with this is that you only use the low-
bits of the hash and can cause many collisions. This is solved by using very go- er bits of the hash and that can cause more collisions. This is solved by using
od hash-function. a good hash-function.
5. |Extra metas|. The hashmap keeps track of the highest probe_count. The trick 5. |Extra metas|. The hashmap keeps track of the highest probe_count. The trick
is to allocate extra metas > max(probe_count), so you never have to do any boun- is to allocate extra_metas > max(probe_count), so you never have to do any boun-
ds-checking because the extra metas ensure that an element will never go beyond ds-checking because the extra metas ensure that an element will never go beyond
index the last index. the last index.
6. |Cached rehashing|. When the load_factor of the map exceeds the max_load_fac- 6. |Cached rehashing|. When the load_factor of the map exceeds the max_load_fac-
tor the size of metas is doubled and all the elements need to be "rehashed" to tor the size of metas is doubled and all the elements need to be "rehashed" to
find the index in the new array. Instead of rehashing complete, it simply uses find the index in the new array. Instead of rehashing completely, it simply uses
the hashbits stored in the meta. the hashbits stored in the meta.
*/ */
const ( const (
// Number of bits from the hash stored for each entry // Number of bits from the hash stored for each entry
hashbits = 24 hashbits = 24
// Number of bits from the hash stored for rehasing // Number of bits from the hash stored for rehashing
cached_hashbits = 16 cached_hashbits = 16
// Initial log-number of buckets in the hashtable // Initial log-number of buckets in the hashtable
init_log_capicity = 5 init_log_capicity = 5
// Initial number of buckets in the hashtable // Initial number of buckets in the hashtable
init_capicity = 1 << init_log_capicity init_capicity = 1<<init_log_capicity
// Initial max load-factor // Maximum load-factor (size / capacity)
init_max_load_factor = 0.8 max_load_factor = 0.8
// Minimum Load-factor. // Initial highest even index in metas
// Number is picked to make delete O(1) amortized
min_load_factor = 0.3
// Initial range cap
init_cap = init_capicity - 2 init_cap = init_capicity - 2
// Used for incrementing `extra_metas` when max // Used for incrementing `extra_metas` when max
// probe count is too high, to avoid overflow // probe count is too high, to avoid overflow
@ -78,8 +74,6 @@ const (
hash_mask = u32(0x00FFFFFF) hash_mask = u32(0x00FFFFFF)
// Used for incrementing the probe-count // Used for incrementing the probe-count
probe_inc = u32(0x01000000) probe_inc = u32(0x01000000)
// Bitmask for maximum probe count
max_probe = u32(0xFF000000)
) )
struct KeyValue { struct KeyValue {
@ -91,20 +85,20 @@ mut:
// Dynamic array with very low growth factor // Dynamic array with very low growth factor
struct DenseArray { struct DenseArray {
mut: mut:
data &KeyValue
cap u32 cap u32
size u32 size u32
deletes u32 deletes u32
data &KeyValue
} }
[inline] [inline]
fn new_dense_array() DenseArray { fn new_dense_array() DenseArray {
unsafe { unsafe{
return DenseArray { return DenseArray{
data: &KeyValue(malloc(8 * sizeof(KeyValue)))
cap: 8 cap: 8
size: 0 size: 0
deletes: 0 deletes: 0
data: &KeyValue(malloc(8 * sizeof(KeyValue)))
} }
} }
} }
@ -114,7 +108,7 @@ fn new_dense_array() DenseArray {
[inline] [inline]
fn (d mut DenseArray) push(kv KeyValue) u32 { fn (d mut DenseArray) push(kv KeyValue) u32 {
if d.cap == d.size { if d.cap == d.size {
d.cap += d.cap >> 3 d.cap += d.cap>>3
d.data = &KeyValue(C.realloc(d.data, sizeof(KeyValue) * d.cap)) d.data = &KeyValue(C.realloc(d.data, sizeof(KeyValue) * d.cap))
} }
push_index := d.size push_index := d.size
@ -127,7 +121,7 @@ fn (d mut DenseArray) push(kv KeyValue) u32 {
// and resize array // and resize array
fn (d mut DenseArray) zeros_to_end() { fn (d mut DenseArray) zeros_to_end() {
mut count := u32(0) mut count := u32(0)
for i in 0..d.size { for i in 0 .. d.size {
if d.data[i].key.str != 0 { if d.data[i].key.str != 0 {
tmp := d.data[count] tmp := d.data[count]
d.data[count] = d.data[i] d.data[count] = d.data[i]
@ -135,9 +129,9 @@ fn (d mut DenseArray) zeros_to_end() {
count++ count++
} }
} }
count++ d.deletes = 0
d.size = count d.size = count
d.cap = if count < 8 {8} else {count} d.cap = if count < 8 { 8 } else { count }
d.data = &KeyValue(C.realloc(d.data, sizeof(KeyValue) * d.cap)) d.data = &KeyValue(C.realloc(d.data, sizeof(KeyValue) * d.cap))
} }
@ -145,23 +139,23 @@ pub struct map {
// Byte size of value // Byte size of value
value_bytes int value_bytes int
mut: mut:
// Index of the highest index in the hashtable // Index of the highest index in the hashtable
cap u32 cap u32
// Number of cached hashbits left for rehashing // Number of cached hashbits left for rehashing
window byte window byte
// Used for right-shifting out used hashbits // Used for right-shifting out used hashbits
shift byte shift byte
// Pointer to Key-value memory // Array storing key-values (ordered)
key_values DenseArray key_values DenseArray
// Pointer to meta-data // Pointer to meta-data:
// Odd indices store index in `key_values`.
// Even indices store probe_count and hashbits.
metas &u32 metas &u32
// Measure that decides when to increase the capacity
max_load_factor f32
// Extra metas that allow for no ranging when incrementing // Extra metas that allow for no ranging when incrementing
// index in the hashmap // index in the hashmap
extra_metas u32 extra_metas u32
pub mut: pub mut:
// Number of key-values currently in the hashmap // Number of key-values currently in the hashmap
size int size int
} }
@ -173,7 +167,6 @@ fn new_map(n, value_bytes int) map {
shift: init_log_capicity shift: init_log_capicity
key_values: new_dense_array() key_values: new_dense_array()
metas: &u32(vcalloc(sizeof(u32) * (init_capicity + extra_metas_inc))) metas: &u32(vcalloc(sizeof(u32) * (init_capicity + extra_metas_inc)))
max_load_factor: init_max_load_factor
extra_metas: extra_metas_inc extra_metas: extra_metas_inc
size: 0 size: 0
} }
@ -188,26 +181,26 @@ fn new_map_init(n, value_bytes int, keys &string, values voidptr) map {
} }
[inline] [inline]
fn (m map) key_to_index(key string) (u64, u32) { fn (m map) key_to_index(key string) (u32,u32) {
hash := wyhash.wyhash_c(key.str, u64(key.len), 0) hash := u32(wyhash.wyhash_c(key.str, u64(key.len), 0))
index := hash & m.cap index := hash & m.cap
meta := u32(((hash>>m.shift) & hash_mask) | probe_inc) meta := ((hash>>m.shift) & hash_mask) | probe_inc
return index, meta return index,meta
} }
[inline] [inline]
fn meta_less(metas &u32, i u64, m u32) (u64, u32){ fn meta_less(metas &u32, i u32, m u32) (u32,u32) {
mut index := i mut index := i
mut meta := m mut meta := m
for meta < metas[index] { for meta < metas[index] {
index += 2 index += 2
meta += probe_inc meta += probe_inc
} }
return index, meta return index,meta
} }
[inline] [inline]
fn (m mut map) meta_greater(ms &u32, i u64, me u32, kvi u32) &u32 { fn (m mut map) meta_greater(ms &u32, i u32, me u32, kvi u32) &u32 {
mut metas := ms mut metas := ms
mut meta := me mut meta := me
mut index := i mut index := i
@ -226,27 +219,27 @@ fn (m mut map) meta_greater(ms &u32, i u64, me u32, kvi u32) &u32 {
} }
metas[index] = meta metas[index] = meta
metas[index + 1] = kv_index metas[index + 1] = kv_index
probe_count := (meta >> hashbits) - 1 probe_count := (meta>>hashbits) - 1
if (probe_count << 1) == m.extra_metas { if (probe_count<<1) == m.extra_metas {
m.extra_metas += extra_metas_inc m.extra_metas += extra_metas_inc
mem_size := (m.cap + 2 + m.extra_metas) mem_size := (m.cap + 2 + m.extra_metas)
metas = &u32(C.realloc(metas, sizeof(u32) * mem_size)) metas = &u32(C.realloc(metas, sizeof(u32) * mem_size))
C.memset(metas + mem_size - extra_metas_inc, 0, sizeof(u32) * extra_metas_inc) C.memset(metas + mem_size - extra_metas_inc, 0, sizeof(u32) * extra_metas_inc)
// Should almost never happen // Should almost never happen
if probe_count == 252 { if probe_count == 252 {
panic("Probe overflow") panic('Probe overflow')
} }
} }
return metas return metas
} }
fn (m mut map) set(key string, value voidptr) { fn (m mut map) set(key string, value voidptr) {
load_factor := f32(m.size << 1) / f32(m.cap) load_factor := f32(m.size<<1) / f32(m.cap)
if load_factor > m.max_load_factor { if load_factor > max_load_factor {
m.expand() m.expand()
} }
mut index, mut meta := m.key_to_index(key) mut index,mut meta := m.key_to_index(key)
index, meta = meta_less(m.metas, index, meta) index,meta = meta_less(m.metas, index, meta)
// While we might have a match // While we might have a match
for meta == m.metas[index] { for meta == m.metas[index] {
kv_index := m.metas[index + 1] kv_index := m.metas[index + 1]
@ -275,8 +268,8 @@ fn (m mut map) expand() {
// Check if any hashbits are left // Check if any hashbits are left
if m.window == 0 { if m.window == 0 {
m.shift += cached_hashbits m.shift += cached_hashbits
m.rehash()
m.window = cached_hashbits m.window = cached_hashbits
m.rehash()
} }
else { else {
m.cached_rehash(old_cap) m.cached_rehash(old_cap)
@ -293,8 +286,8 @@ fn (m mut map) rehash() {
continue continue
} }
kv := m.key_values.data[i] kv := m.key_values.data[i]
mut index, mut meta := m.key_to_index(kv.key) mut index,mut meta := m.key_to_index(kv.key)
index, meta = meta_less(m.metas, index, meta) index,meta = meta_less(m.metas, index, meta)
m.metas = m.meta_greater(m.metas, index, meta, i) m.metas = m.meta_greater(m.metas, index, meta, i)
} }
} }
@ -302,16 +295,16 @@ fn (m mut map) rehash() {
fn (m mut map) cached_rehash(old_cap u32) { fn (m mut map) cached_rehash(old_cap u32) {
mut new_meta := &u32(vcalloc(sizeof(u32) * (m.cap + 2 + m.extra_metas))) mut new_meta := &u32(vcalloc(sizeof(u32) * (m.cap + 2 + m.extra_metas)))
old_extra_metas := m.extra_metas old_extra_metas := m.extra_metas
for i := 0; i <= old_cap + old_extra_metas; i += 2 { for i := u32(0); i <= old_cap + old_extra_metas; i += 2 {
if m.metas[i] == 0 { if m.metas[i] == 0 {
continue continue
} }
old_meta := m.metas[i] old_meta := m.metas[i]
old_probe_count := u64((old_meta>>hashbits) - 1) << 1 old_probe_count := ((old_meta>>hashbits) - 1)<<1
old_index := (i - old_probe_count) & (m.cap >> 1) old_index := (i - old_probe_count) & (m.cap>>1)
mut index := u64(old_index) | (old_meta << m.shift) & m.cap mut index := (old_index | (old_meta<<m.shift)) & m.cap
mut meta := (old_meta & hash_mask) | probe_inc mut meta := (old_meta & hash_mask) | probe_inc
index, meta = meta_less(new_meta, index, meta) index,meta = meta_less(new_meta, index, meta)
kv_index := m.metas[i + 1] kv_index := m.metas[i + 1]
new_meta = m.meta_greater(new_meta, index, meta, kv_index) new_meta = m.meta_greater(new_meta, index, meta, kv_index)
} }
@ -321,10 +314,9 @@ fn (m mut map) cached_rehash(old_cap u32) {
m.metas = new_meta m.metas = new_meta
} }
[inline]
fn (m map) get(key string, out voidptr) bool { fn (m map) get(key string, out voidptr) bool {
mut index, mut meta := m.key_to_index(key) mut index,mut meta := m.key_to_index(key)
index, meta = meta_less(m.metas, index, meta) index,meta = meta_less(m.metas, index, meta)
for meta == m.metas[index] { for meta == m.metas[index] {
kv_index := m.metas[index + 1] kv_index := m.metas[index + 1]
if key == m.key_values.data[kv_index].key { if key == m.key_values.data[kv_index].key {
@ -337,10 +329,9 @@ fn (m map) get(key string, out voidptr) bool {
return false return false
} }
[inline]
fn (m map) get2(key string) voidptr { fn (m map) get2(key string) voidptr {
mut index, mut meta := m.key_to_index(key) mut index,mut meta := m.key_to_index(key)
index, meta = meta_less(m.metas, index, meta) index,meta = meta_less(m.metas, index, meta)
for meta == m.metas[index] { for meta == m.metas[index] {
kv_index := m.metas[index + 1] kv_index := m.metas[index + 1]
if key == m.key_values.data[kv_index].key { if key == m.key_values.data[kv_index].key {
@ -354,13 +345,12 @@ fn (m map) get2(key string) voidptr {
return voidptr(0) return voidptr(0)
} }
[inline]
fn (m map) exists(key string) bool { fn (m map) exists(key string) bool {
if m.value_bytes == 0 { if m.value_bytes == 0 {
return false return false
} }
mut index, mut meta := m.key_to_index(key) mut index,mut meta := m.key_to_index(key)
index, meta = meta_less(m.metas, index, meta) index,meta = meta_less(m.metas, index, meta)
for meta == m.metas[index] { for meta == m.metas[index] {
kv_index := m.metas[index + 1] kv_index := m.metas[index + 1]
if key == m.key_values.data[kv_index].key { if key == m.key_values.data[kv_index].key {
@ -373,29 +363,24 @@ fn (m map) exists(key string) bool {
} }
pub fn (m mut map) delete(key string) { pub fn (m mut map) delete(key string) {
mut index, mut meta := m.key_to_index(key) mut index,mut meta := m.key_to_index(key)
index, meta = meta_less(m.metas, index, meta) index,meta = meta_less(m.metas, index, meta)
// Perform backwards shifting // Perform backwards shifting
for meta == m.metas[index] { for meta == m.metas[index] {
kv_index := m.metas[index + 1] kv_index := m.metas[index + 1]
if key == m.key_values.data[kv_index].key { if key == m.key_values.data[kv_index].key {
C.memset(&m.key_values.data[kv_index], 0, sizeof(KeyValue)) C.memset(&m.key_values.data[kv_index], 0, sizeof(KeyValue))
mut old_index := index for (m.metas[index + 2]>>hashbits) > 1 {
m.metas[index] = m.metas[index + 2] - probe_inc
m.metas[index + 1] = m.metas[index + 3]
index += 2 index += 2
mut cur_meta := m.metas[index]
mut cur_index := m.metas[index + 1]
for (cur_meta >> hashbits) > 1 {
m.metas[old_index] = cur_meta - probe_inc
m.metas[old_index + 1] = cur_index
old_index = index
index += 2
cur_meta = m.metas[index]
cur_index = m.metas[index + 1]
} }
m.metas[old_index] = 0
m.size-- m.size--
m.metas[index] = 0
m.key_values.deletes++ m.key_values.deletes++
if m.key_values.size <= 32 {return} if m.key_values.size <= 32 {
return
}
if (f32(m.key_values.size) / f32(m.key_values.deletes)) < 1 { if (f32(m.key_values.size) / f32(m.key_values.deletes)) < 1 {
m.key_values.zeros_to_end() m.key_values.zeros_to_end()
m.rehash() m.rehash()
@ -424,7 +409,7 @@ pub fn (m &map) keys() []string {
} }
pub fn (m map) free() { pub fn (m map) free() {
unsafe { unsafe{
free(m.metas) free(m.metas)
free(m.key_values.data) free(m.key_values.data)
} }