v/vlib/builtin/map.v

427 lines
11 KiB
V
Raw Normal View History

2020-02-03 05:00:36 +01:00
// Copyright (c) 2019-2020 Alexander Medvednikov. All rights reserved.
2019-06-23 04:21:30 +02:00
// Use of this source code is governed by an MIT license
// that can be found in the LICENSE file.
2019-06-22 20:20:28 +02:00
module builtin
2020-02-20 20:04:06 +01:00
import (
strings
hash.wyhash
)
2020-04-08 00:02:15 +02:00
fn C.strcmp(byteptr, byteptr) int
2020-03-19 06:52:34 +01:00
/*
2020-03-19 07:05:20 +01:00
This is a very fast hashmap implementation. It has several properties that in
combination make it very fast. Here is a short explanation of each property.
2020-03-19 06:52:34 +01:00
After reading this you should have a basic understanding of how it works:
1. |Hash-function (Wyhash)|. Wyhash is the fastest hash-function passing SMHash-
er, so it was an easy choice.
2020-03-21 13:55:07 +01:00
2. |Open addressing (Robin Hood Hashing)|. With this method, a hash collision is
2020-03-19 06:52:34 +01:00
resolved by probing. As opposed to linear probing, Robin Hood hashing has a sim-
ple but clever twist: As new keys are inserted, old keys are shifted around in a
way such that all keys stay reasonably close to the slot they originally hash to.
3. |Memory layout|. Key-value pairs are stored in a `DenseArray`, with an avera-
ge of roughly 6.25% unused memory, as opposed to most other dynamic array imple-
2020-03-19 07:05:20 +01:00
mentations with a growth factor of 1.5 or 2. The key-values keep their index in
the array - they are not probed. Instead, this implementation uses another array
2020-03-21 13:55:07 +01:00
"metas" storing "meta"s (meta-data). Each Key-value has a corresponding meta. A
2020-03-19 06:52:34 +01:00
meta stores a reference to its key-value, and its index in "metas" is determined
2020-03-19 07:05:20 +01:00
by the hash of the key and probing. A meta also stores bits from the hash (for
faster rehashing etc.) and how far away it is from the index it was originally
2020-03-19 06:52:34 +01:00
hashed to (probe_count). probe_count is 0 if empty, 1 if not probed, 2 if probed
2020-03-21 13:55:07 +01:00
by 1, etc..
2020-03-19 06:52:34 +01:00
meta (64 bit) = kv_index (32 bit) | probe_count (8 bits) | hashbits (24 bits)
metas = [meta, 0, meta, 0, meta, meta, meta, 0, ...]
key_values = [kv, kv, kv, kv, kv, ...]
2020-03-19 07:05:20 +01:00
4. |Power of two size array|. The size of metas is a power of two. This makes it
2020-03-21 13:55:07 +01:00
possible to find a bucket from a hash code by using "hash & (SIZE -1)" instead
of "abs(hash) % SIZE". Modulo is extremely expensive so using '&' is a big perf-
ormance improvement. The general concern with this is that you only use the low-
er bits of the hash and that can cause more collisions. This is solved by using
a good hash-function.
2020-03-19 06:52:34 +01:00
2020-03-19 07:05:20 +01:00
5. |Extra metas|. The hashmap keeps track of the highest probe_count. The trick
2020-03-21 13:55:07 +01:00
is to allocate extra_metas > max(probe_count), so you never have to do any boun-
2020-03-19 07:05:20 +01:00
ds-checking because the extra metas ensure that an element will never go beyond
2020-03-21 13:55:07 +01:00
the last index.
2020-03-19 06:52:34 +01:00
6. |Cached rehashing|. When the load_factor of the map exceeds the max_load_fac-
tor the size of metas is doubled and all the elements need to be "rehashed" to
2020-03-21 13:55:07 +01:00
find the index in the new array. Instead of rehashing completely, it simply uses
2020-03-19 07:05:20 +01:00
the hashbits stored in the meta.
2020-03-19 06:52:34 +01:00
*/
2020-03-21 13:55:07 +01:00
2020-01-24 20:13:59 +01:00
const (
	// How many bits of the hash are kept in every meta entry
	hashbits = 24
	// How many of the stored hash bits may be consumed by cached rehashing
	max_cached_hashbits = 16
	// log2 of the initial number of buckets in the hashtable
	init_log_capicity = 5
	// Initial number of buckets in the hashtable
	init_capicity = 1 << init_log_capicity
	// Maximum load-factor (size / capacity) before the table is expanded
	max_load_factor = 0.8
	// Initial highest even index in metas
	init_cap = init_capicity - 2
	// Step by which `extra_metas` grows when the maximum
	// probe count gets too high, to avoid overflow
	extra_metas_inc = 4
	// Bitmask selecting all the hashbits of a meta
	hash_mask = u32(0x00FFFFFF)
	// Value added to a meta to increment its probe-count by one
	probe_inc = u32(0x01000000)
)
2020-02-20 20:04:06 +01:00
// KeyValue is one entry of the map: a key plus a pointer to its value.
struct KeyValue {
	// The key of the entry; a zero `key.str` marks a deleted entry
	key string
mut:
	// Pointer to the memory holding the value of the entry
	value voidptr
}
// DenseArray is a dynamic array of key-values with a very low growth factor.
struct DenseArray {
mut:
	// Number of key-values `data` has room for
	cap u32
	// Number of slots in use, deleted (zeroed) entries included
	size u32
	// Number of entries deleted since the array was last compacted
	deletes u32
	// Pointer to the first key-value
	data &KeyValue
}
// new_dense_array returns an empty DenseArray with room for 8 key-values.
[inline]
fn new_dense_array() DenseArray {
	unsafe{
		return DenseArray{
			data: &KeyValue(malloc(8 * sizeof(KeyValue)))
			deletes: 0
			size: 0
			cap: 8
		}
	}
}
// push appends `kv` to the array and returns the index it was stored at.
// A full array is first grown by roughly 12.5% `(x + (x >> 3))`.
[inline]
fn (d mut DenseArray) push(kv KeyValue) u32 {
	if d.size == d.cap {
		d.cap += d.cap >> 3
		d.data = &KeyValue(C.realloc(d.data, sizeof(KeyValue) * d.cap))
	}
	stored_at := d.size
	d.size++
	d.data[stored_at] = kv
	return stored_at
}
// zeros_to_end compacts the array: every live entry (non-zero `key.str`)
// is moved to the front in its original order, the zeroed entries end up
// behind them, and the allocation is shrunk to the live count (minimum 8).
fn (d mut DenseArray) zeros_to_end() {
	mut live := u32(0)
	for i in 0 .. d.size {
		if d.data[i].key.str == 0 {
			continue
		}
		// Swap the live entry into the next free front slot
		displaced := d.data[live]
		d.data[live] = d.data[i]
		d.data[i] = displaced
		live++
	}
	d.size = live
	d.deletes = 0
	d.cap = if live < 8 { 8 } else { live }
	d.data = &KeyValue(C.realloc(d.data, sizeof(KeyValue) * d.cap))
}
// map is a hashmap with string keys and values of a fixed byte size.
pub struct map {
	// Size in bytes of a single value
	value_bytes int
mut:
	// Highest even index in the hashtable
	cap u32
	// Number of cached hashbits left for rehashing
	cached_hashbits byte
	// Amount the hash is right-shifted by to drop already used hashbits
	shift byte
	// Array storing the key-values (ordered)
	key_values DenseArray
	// Pointer to the meta-data:
	// Even indices store probe_count and hashbits.
	// Odd indices store the index into `key_values`.
	metas &u32
	// Number of extra metas past `cap`, so that the index can be
	// incremented while probing without any range checks
	extra_metas u32
pub mut:
	// Number of key-values currently in the hashmap
	size int
}
2020-02-20 20:04:06 +01:00
// new_map returns an empty map whose values are `value_bytes` bytes each.
// `n` is accepted but not used by the body.
fn new_map(n, value_bytes int) map {
	meta_slots := init_capicity + extra_metas_inc
	return map{
		value_bytes: value_bytes
		cap: init_cap
		cached_hashbits: max_cached_hashbits
		shift: init_log_capicity
		key_values: new_dense_array()
		// vcalloc zero-fills, so every meta starts out empty
		metas: &u32(vcalloc(sizeof(u32) * meta_slots))
		extra_metas: extra_metas_inc
		size: 0
	}
}
2020-01-24 20:13:59 +01:00
// new_map_init builds a map from `n` keys and `n` values. `values` is read
// as a packed buffer in which the i-th value starts at byte `i * value_bytes`.
fn new_map_init(n, value_bytes int, keys &string, values voidptr) map {
	mut result := new_map(n, value_bytes)
	first_value := byteptr(values)
	for i in 0 .. n {
		result.set(keys[i], first_value + i * value_bytes)
	}
	return result
}
2019-08-03 09:44:08 +02:00
2020-03-19 06:52:34 +01:00
// key_to_index hashes `key` and returns the index in `metas` where probing
// starts, together with the meta (hashbits with a probe_count of 1) of the key.
[inline]
fn (m map) key_to_index(key string) (u32,u32) {
	h := wyhash.wyhash_c(key.str, u64(key.len), 0)
	// Drop the bits selected by `shift`, keep `hashbits` bits, mark as "not probed"
	meta := ((h >> m.shift) & hash_mask) | probe_inc
	index := h & m.cap
	return u32(index),u32(meta)
}
// meta_less probes forward from `_index` for as long as the probing meta is
// smaller than the stored one, and returns the index and meta it stopped at.
[inline]
fn (m map) meta_less(_index u32, _metas u32) (u32,u32) {
	mut pos := _index
	mut probing := _metas
	for {
		if probing >= m.metas[pos] {
			break
		}
		probing += probe_inc
		pos += 2
	}
	return pos,probing
}
// meta_greater inserts the pair (`_metas`, `kvi`) starting at `_index`.
// It walks forward until an empty meta is found; whenever the carried meta is
// greater than the stored one, the two entries trade places and the evicted
// entry is carried on instead. Afterwards `extra_metas` is grown if the final
// probe count has caught up with it.
[inline]
fn (m mut map) meta_greater(_index u32, _metas u32, kvi u32) {
	mut index := _index
	mut meta := _metas
	mut kv_index := kvi
	for m.metas[index] != 0 {
		if meta > m.metas[index] {
			// Swap the carried entry with the resident one
			evicted_meta := m.metas[index]
			evicted_kv := m.metas[index + 1]
			m.metas[index] = meta
			m.metas[index + 1] = kv_index
			meta = evicted_meta
			kv_index = evicted_kv
		}
		index += 2
		meta += probe_inc
	}
	// Empty slot found: store whatever is still being carried
	m.metas[index] = meta
	m.metas[index + 1] = kv_index
	probe_count := (meta >> hashbits) - 1
	if (probe_count << 1) != m.extra_metas {
		return
	}
	// The probe count has reached the reserved extra metas: reserve more
	// and zero the newly added tail
	m.extra_metas += extra_metas_inc
	mem_size := (m.cap + 2 + m.extra_metas)
	m.metas = &u32(C.realloc(m.metas, sizeof(u32) * mem_size))
	C.memset(m.metas + mem_size - extra_metas_inc, 0, sizeof(u32) * extra_metas_inc)
	// Should almost never happen
	if probe_count == 252 {
		panic('Probe overflow')
	}
}
// set stores a copy of the `value_bytes` bytes at `value` under `key`.
// An existing key has its value overwritten in place; a new key gets a
// freshly allocated value buffer and is appended to `key_values`.
fn (m mut map) set(key string, value voidptr) {
	// Grow the table first when it is too full
	if f32(m.size << 1) / f32(m.cap) > max_load_factor {
		m.expand()
	}
	mut index,mut meta := m.key_to_index(key)
	index,meta = m.meta_less(index, meta)
	// Equal metas are candidates for an already stored key
	for meta == m.metas[index] {
		slot := m.metas[index + 1]
		if C.strcmp(key.str, m.key_values.data[slot].key.str) == 0 {
			C.memcpy(m.key_values.data[slot].value, value, m.value_bytes)
			return
		}
		index += 2
		meta += probe_inc
	}
	// No stored key matched: insert a new entry
	entry := KeyValue{
		key: key
		value: malloc(m.value_bytes)
	}
	C.memcpy(entry.value, value, m.value_bytes)
	new_slot := m.key_values.push(entry)
	m.meta_greater(index, meta, new_slot)
	m.size++
}
2020-03-19 06:52:34 +01:00
// expand doubles the size of the hashmap. While cached hashbits are left,
// the metas are rebuilt from the bits stored in them; once they are used
// up, `shift` is advanced and every key is hashed again.
fn (m mut map) expand() {
	prev_cap := m.cap
	m.cap = ((m.cap + 2) << 1) - 2
	if m.cached_hashbits != 0 {
		m.cached_rehash(prev_cap)
	}
	else {
		// No cached hashbits left: start a new window of hashbits
		m.shift += max_cached_hashbits
		m.cached_hashbits = max_cached_hashbits
		m.rehash()
	}
	m.cached_hashbits--
}
2020-03-19 06:52:34 +01:00
// rehash resizes `metas` for the current `cap`, clears it, and re-inserts
// every live key-value by hashing its key again.
fn (m mut map) rehash() {
	meta_bytes := sizeof(u32) * (m.cap + 2 + m.extra_metas)
	m.metas = &u32(C.realloc(m.metas, meta_bytes))
	C.memset(m.metas, 0, meta_bytes)
	for i := u32(0); i < m.key_values.size; i++ {
		entry := m.key_values.data[i]
		// Deleted entries have a zeroed key and are skipped
		if entry.key.str != 0 {
			mut index,mut meta := m.key_to_index(entry.key)
			index,meta = m.meta_less(index, meta)
			m.meta_greater(index, meta, i)
		}
	}
}
2020-03-19 06:52:34 +01:00
// cached_rehash rebuilds `metas` for the already enlarged `cap` without
// hashing any key: the new position of each entry is derived from its old
// position and the hashbits stored in its old meta. `old_cap` is the value
// of `cap` before the expansion. The old metas are freed afterwards.
fn (m mut map) cached_rehash(old_cap u32) {
	prev_metas := m.metas
	m.metas = &u32(vcalloc(sizeof(u32) * (m.cap + 2 + m.extra_metas)))
	prev_extra := m.extra_metas
	for i := u32(0); i <= old_cap + prev_extra; i += 2 {
		prev_meta := prev_metas[i]
		if prev_meta != 0 {
			// Undo the probing to get back to the slot the entry hashed to
			probe_offset := ((prev_meta >> hashbits) - 1) << 1
			home := (i - probe_offset) & (m.cap >> 1)
			// Extend the index with stored hashbits, then reset the probe count
			mut index := (home | (prev_meta << m.shift)) & m.cap
			mut meta := (prev_meta & hash_mask) | probe_inc
			index,meta = m.meta_less(index, meta)
			m.meta_greater(index, meta, prev_metas[i + 1])
		}
	}
	unsafe{
		free(prev_metas)
	}
}
// get3 looks up `key`. On a hit it returns a newly allocated copy of the
// stored value (`value_bytes` bytes); otherwise it returns `zero`.
fn (m map) get3(key string, zero voidptr) voidptr {
	mut index,mut meta := m.key_to_index(key)
	index,meta = m.meta_less(index, meta)
	for meta == m.metas[index] {
		slot := m.metas[index + 1]
		if C.strcmp(key.str, m.key_values.data[slot].key.str) != 0 {
			// Same meta but a different key: keep probing
			index += 2
			meta += probe_inc
			continue
		}
		result := malloc(m.value_bytes)
		C.memcpy(result, m.key_values.data[slot].value, m.value_bytes)
		return result
	}
	return zero
}
2020-02-20 20:04:06 +01:00
// exists reports whether `key` is stored in the map.
// A map whose `value_bytes` is 0 is treated as containing nothing.
fn (m map) exists(key string) bool {
	if m.value_bytes == 0 {
		return false
	}
	mut index,mut meta := m.key_to_index(key)
	index,meta = m.meta_less(index, meta)
	for meta == m.metas[index] {
		slot := m.metas[index + 1]
		if C.strcmp(key.str, m.key_values.data[slot].key.str) == 0 {
			return true
		}
		meta += probe_inc
		index += 2
	}
	return false
}
2020-03-19 06:52:34 +01:00
// delete removes `key` and its value from the map.
// Nothing happens when the key is not present.
pub fn (m mut map) delete(key string) {
	mut index,mut meta := m.key_to_index(key)
	index,meta = m.meta_less(index, meta)
	for meta == m.metas[index] {
		kv_index := m.metas[index + 1]
		if C.strcmp(key.str, m.key_values.data[kv_index].key.str) == 0 {
			// Perform backwards shifting: every following entry that was
			// probed moves one slot back and has its probe count decremented
			for (m.metas[index + 2] >> hashbits) > 1 {
				m.metas[index] = m.metas[index + 2] - probe_inc
				m.metas[index + 1] = m.metas[index + 3]
				index += 2
			}
			m.size--
			m.metas[index] = 0
			m.key_values.deletes++
			// The value buffer was allocated by `set`. Release it before the
			// entry is zeroed, otherwise the only pointer to it is lost.
			unsafe{
				free(m.key_values.data[kv_index].value)
			}
			// A zeroed key marks the entry as deleted
			C.memset(&m.key_values.data[kv_index], 0, sizeof(KeyValue))
			if m.key_values.size <= 32 {
				return
			}
			// Clean up key_values if too many have been deleted.
			// `zeros_to_end` also resets the `deletes` counter, and since it
			// changes the indices of the entries, the metas must be rebuilt.
			if m.key_values.deletes >= (m.key_values.size >> 1) {
				m.key_values.zeros_to_end()
				m.rehash()
			}
			return
		}
		index += 2
		meta += probe_inc
	}
}
2020-01-24 20:13:59 +01:00
// keys returns the keys of all live entries, in the order they are stored
// in `key_values`.
pub fn (m &map) keys() []string {
	mut result := [''].repeat(m.size)
	if m.value_bytes == 0 {
		return result
	}
	mut filled := 0
	for i := u32(0); i < m.key_values.size; i++ {
		entry_key := m.key_values.data[i].key
		// Deleted entries have a zeroed key and are skipped
		if entry_key.str != 0 {
			result[filled] = entry_key
			filled++
		}
	}
	return result
}
2020-04-05 23:31:53 +02:00
// free releases all memory held by the map: the metas, the key and the
// value buffer of every live entry, and the key-value array itself.
// The map must not be used afterwards.
[unsafe_fn]
pub fn (m map) free() {
	free(m.metas)
	for i := u32(0); i < m.key_values.size; i++ {
		// Deleted entries are zeroed and own nothing
		if m.key_values.data[i].key.str == 0 {
			continue
		}
		m.key_values.data[i].key.free()
		// Each value buffer was allocated by `set` and is owned by the map
		free(m.key_values.data[i].value)
	}
	free(m.key_values.data)
}
// print is a placeholder: it currently only prints 'TODO'.
pub fn (m map) print() {
	println('TODO')
}
// str returns a multi-line representation of the map with one
// `"key" => "value"` line per entry, or '{}' for an empty map.
pub fn (m map_string) str() string {
	if m.size == 0 {
		return '{}'
	}
	mut builder := strings.new_builder(50)
	builder.writeln('{')
	for key, val in m {
		builder.writeln(' "$key" => "$val"')
	}
	builder.writeln('}')
	return builder.str()
}