hash: add wyhash + benchmark, add fnv1a, add u64.hex() (#3584)

pull/3590/head
joe-conigliaro 2020-01-29 09:43:09 +11:00 committed by GitHub
parent 5a2534122e
commit 007baa2305
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 374 additions and 0 deletions

25
thirdparty/wyhash/LICENSE vendored 100644
View File

@ -0,0 +1,25 @@
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org/>

76
thirdparty/wyhash/wyhash.h vendored 100644
View File

@ -0,0 +1,76 @@
// Author: Wang Yi <godspeed_china@yeah.net>
#ifndef wyhash_version_4
#define wyhash_version_4
#include <stdint.h>
#include <string.h>
#if defined(_MSC_VER) && defined(_M_X64)
#include <intrin.h>
#pragma intrinsic(_umul128)
#endif
/* Mixing constants used by every wyhash primitive in this header.
 * Declared `static` so that each translation unit including the header gets
 * its own internal-linkage copy; a plain `const` object at file scope has
 * external linkage in C and would produce duplicate-symbol link errors as
 * soon as two .c files include this header. */
static const uint64_t _wyp0=0xa0761d6478bd642full, _wyp1=0xe7037ed1a0b428dbull, _wyp2=0x8ebc6af09c88c6e3ull, _wyp3=0x589965cc75374cc3ull, _wyp4=0x1d8e4e27c47d124full;
/* Rotate the 64-bit value `v` right by `k` bits. */
static inline uint64_t _wyrotr(uint64_t v, unsigned k) {
    const uint64_t shifted_down = v >> k;
    const uint64_t wrapped_around = v << (64 - k);
    return shifted_down | wrapped_around;
}
/* Multiply-and-fold mixer: combines A and B into one 64-bit value.
 * In the default build this is the high half of the 128-bit product A*B
 * xor-ed with its low half; with WYHASH32 defined a cheaper combination of
 * the four 32x32 partial products is used instead. */
static inline uint64_t _wymum(uint64_t A, uint64_t B) {
#ifdef WYHASH32
    /* 32-bit friendly variant built from the four partial products. */
    const uint64_t a_hi = A >> 32, b_hi = B >> 32;
    const unsigned a_lo = (unsigned)A, b_lo = (unsigned)B;
    const uint64_t hh = a_hi * b_hi;
    const uint64_t hl = a_hi * b_lo;
    const uint64_t lh = a_lo * b_hi;
    const uint64_t ll = (uint64_t)a_lo * b_lo;
    return _wyrotr(hl, 32) ^ _wyrotr(lh, 32) ^ hh ^ ll;
#elif defined(__SIZEOF_INT128__)
    /* Native 128-bit integers: fold the product directly. */
    const __uint128_t product = (__uint128_t)A * B;
    return (uint64_t)(product >> 64) ^ (uint64_t)product;
#elif defined(_MSC_VER) && defined(_M_X64)
    /* MSVC x64 intrinsic returns the low half and stores the high half. */
    uint64_t high;
    const uint64_t low = _umul128(A, B, &high);
    return low ^ high;
#else
    /* Portable schoolbook 64x64 -> 128 multiplication with manual carries. */
    const uint64_t a_hi = A >> 32, b_hi = B >> 32;
    const uint64_t a_lo = (uint32_t)A, b_lo = (uint32_t)B;
    const uint64_t top = a_hi * b_hi;
    const uint64_t mid0 = a_hi * b_lo;
    const uint64_t mid1 = b_hi * a_lo;
    const uint64_t bottom = a_lo * b_lo;
    const uint64_t partial = bottom + (mid0 << 32);
    uint64_t carry = partial < bottom;
    const uint64_t low = partial + (mid1 << 32);
    carry += low < partial;
    const uint64_t high = top + (mid0 >> 32) + (mid1 >> 32) + carry;
    return high ^ low;
#endif
}
/* Byte-order detection. Callers may predefine WYHASH_LITTLE_ENDIAN; otherwise
 * it is derived from common compiler/platform macros when they are present. */
#ifndef WYHASH_LITTLE_ENDIAN
#if defined(_WIN32) || defined(__LITTLE_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#define WYHASH_LITTLE_ENDIAN 1
#elif defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define WYHASH_LITTLE_ENDIAN 0
#endif
#endif

/* _wyr8 / _wyr4: unaligned loads of 8 and 4 bytes from `p`.
 * memcpy is used so the access is valid for any alignment; on the
 * non-little-endian path the loaded word is byte-swapped. */
#if(WYHASH_LITTLE_ENDIAN) || defined(__TINYC__)
static inline uint64_t _wyr8(const uint8_t *p) {
    uint64_t word;
    memcpy(&word, p, 8);
    return word;
}
static inline uint64_t _wyr4(const uint8_t *p) {
    unsigned word;
    memcpy(&word, p, 4);
    return word;
}
#else
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
static inline uint64_t _wyr8(const uint8_t *p) {
    uint64_t word;
    memcpy(&word, p, 8);
    return __builtin_bswap64(word);
}
static inline uint64_t _wyr4(const uint8_t *p) {
    unsigned word;
    memcpy(&word, p, 4);
    return __builtin_bswap32(word);
}
#elif defined(_MSC_VER)
static inline uint64_t _wyr8(const uint8_t *p) {
    uint64_t word;
    memcpy(&word, p, 8);
    return _byteswap_uint64(word);
}
static inline uint64_t _wyr4(const uint8_t *p) {
    unsigned word;
    memcpy(&word, p, 4);
    return _byteswap_ulong(word);
}
#endif
#endif
/* Pack a 1..3 byte input into one word: the first byte, the byte at index
 * k/2 and the last byte (index k-1) land in bits 16-23, 8-15 and 0-7. */
static inline uint64_t _wyr3(const uint8_t *p, unsigned k) {
    const uint64_t first = p[0];
    const uint64_t middle = p[k >> 1];
    const uint64_t last = p[k - 1];
    return (first << 16) | (middle << 8) | last;
}
/* Hash `len` bytes starting at `key`, mixed with `seed`, into a 64-bit value.
 * The first (len & 63) bytes are folded into the seed with a length-specific
 * formula; any remaining bytes are then consumed in 64-byte blocks using four
 * parallel accumulators, and the result is finalized together with `len`. */
static inline uint64_t wyhash(const void* key, uint64_t len, uint64_t seed) {
/* i = number of leading bytes handled before the 64-byte block loop. */
const uint8_t *p=(const uint8_t*)key; uint64_t i=len&63;
/* Branch-prediction hints; they expand to the bare condition elsewhere. */
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define _like_(x) __builtin_expect(x,1)
#define _unlike_(x) __builtin_expect(x,0)
#else
#define _like_(x) (x)
#define _unlike_(x) (x)
#endif
/* i == 0: nothing to fold in, the seed is left untouched. */
if(_unlike_(!i)) { }
/* 1..3 bytes: packed by _wyr3. */
else if(_unlike_(i<4)) seed=_wymum(_wyr3(p,i)^seed^_wyp0,seed^_wyp1);
/* 4..8 bytes: two (possibly overlapping) 4-byte reads from both ends. */
else if(_like_(i<=8)) seed=_wymum(_wyr4(p)^seed^_wyp0,_wyr4(p+i-4)^seed^_wyp1);
/* 9..16 bytes: two (possibly overlapping) 8-byte reads from both ends. */
else if(_like_(i<=16)) seed=_wymum(_wyr8(p)^seed^_wyp0,_wyr8(p+i-8)^seed^_wyp1);
/* 17..24 bytes: first 16 bytes plus the trailing 8 bytes. */
else if(_like_(i<=24)) seed=_wymum(_wyr8(p)^seed^_wyp0,_wyr8(p+8)^seed^_wyp1)^_wymum(_wyr8(p+i-8)^seed^_wyp2,seed^_wyp3);
/* 25..32 bytes: first 24 bytes plus the trailing 8 bytes. */
else if(_like_(i<=32)) seed=_wymum(_wyr8(p)^seed^_wyp0,_wyr8(p+8)^seed^_wyp1)^_wymum(_wyr8(p+16)^seed^_wyp2,_wyr8(p+i-8)^seed^_wyp3);
/* 33..63 bytes: first 32 bytes plus the trailing 32 bytes (may overlap). */
else{ seed=_wymum(_wyr8(p)^seed^_wyp0,_wyr8(p+8)^seed^_wyp1)^_wymum(_wyr8(p+16)^seed^_wyp2,_wyr8(p+24)^seed^_wyp3)^_wymum(_wyr8(p+i-32)^seed^_wyp1,_wyr8(p+i-24)^seed^_wyp2)^_wymum(_wyr8(p+i-16)^seed^_wyp3,_wyr8(p+i-8)^seed^_wyp0); }
/* Inputs shorter than 64 bytes are finished here. */
if(_like_(i==len)) return _wymum(seed,len^_wyp4);
/* Longer inputs: four accumulators, each fed 16 bytes per 64-byte block. */
uint64_t see1=seed, see2=seed, see3=seed;
for(p+=i,i=len-i; _like_(i>=64); i-=64,p+=64) {
seed=_wymum(_wyr8(p)^seed^_wyp0,_wyr8(p+8)^seed^_wyp1); see1=_wymum(_wyr8(p+16)^see1^_wyp2,_wyr8(p+24)^see1^_wyp3);
see2=_wymum(_wyr8(p+32)^see2^_wyp1,_wyr8(p+40)^see2^_wyp2); see3=_wymum(_wyr8(p+48)^see3^_wyp3,_wyr8(p+56)^see3^_wyp0);
}
/* Combine the accumulators and the total length into the final hash. */
return _wymum(seed^see1^see2,see3^len^_wyp4);
}
/* Hash two 64-bit integers into one 64-bit value using two _wymum rounds. */
static inline uint64_t wyhash64(uint64_t A, uint64_t B) {
    const uint64_t first_round = _wymum(A ^ _wyp0, B ^ _wyp1);
    return _wymum(first_round, _wyp2);
}
/* Pseudo-random generator step: advances `*seed` by _wyp0 and returns a
 * 64-bit value mixed from the new state. */
static inline uint64_t wyrand(uint64_t *seed) {
    const uint64_t state = *seed + _wyp0;
    *seed = state;
    return _wymum(state ^ _wyp1, state);
}
/* Map a 64-bit random word to a double in the half-open range [0, 1).
 * The top 52 bits of `r` are kept and scaled by 2^-52; shifting by only 11
 * would keep 53 bits and let the result reach values just below 2.0. */
static inline double wy2u01(uint64_t r) {
    const double _wynorm = 1.0 / (1ull << 52);
    return (r >> 12) * _wynorm;
}
/* Map a 64-bit random word to a double in [-3, 3): three 21-bit fields of
 * `r` are summed, scaled by 2^-20 and shifted down by 3. */
static inline double wy2gau(uint64_t r) {
    const uint64_t field_mask = 0x1fffff;
    const double scale = 1.0 / (1ull << 20);
    const uint64_t total = (r & field_mask) + ((r >> 21) & field_mask) + ((r >> 42) & field_mask);
    return total * scale - 3.0;
}
#endif

View File

@ -0,0 +1,55 @@
module main
import (
hash.fnv1a
hash.wyhash
rand
time
)
// main benchmarks three hash implementations over the same random input:
// the vendored C wyhash, the pure V wyhash and FNV-1a (64 bit).
// It fills one large byte buffer, slices it into strings of random length and
// prints the elapsed milliseconds for hashing every string with each function.
fn main() {
	sample_size := 10000000
	min_str_len := 20
	max_str_len := 40
	println('Generating $sample_size strings between $min_str_len - $max_str_len chars long...')
	// One contiguous pool of random bytes; the strings below are views into it.
	mut bytepile := []byte
	for _ in 0 .. sample_size * max_str_len {
		bytepile << byte(40 + rand.next(125 - 40))
	}
	// Pre-compute the length of every sample so all three runs hash identical data.
	mut str_lens := []int
	for _ in 0 .. sample_size {
		str_lens << min_str_len + rand.next(max_str_len - min_str_len)
	}
	println('Hashing each of the generated strings...')
	t0 := time.ticks()
	mut start_pos := 0
	for len in str_lens {
		end_pos := start_pos + len
		str := string(bytepile[start_pos..end_pos],len)
		// Pass the character data itself (str.str), exactly like sum64_string does;
		// taking the address of the field would hash the string header instead.
		_ = wyhash.wyhash_c(str.str, u64(str.len), 1)
		start_pos = end_pos
	}
	t1 := time.ticks()
	d1 := t1 - t0
	println(' * wyhash4 C: ${d1}ms')
	start_pos = 0
	for len in str_lens {
		end_pos := start_pos + len
		str := string(bytepile[start_pos..end_pos],len)
		_ = wyhash.sum64_string(str, 1)
		start_pos = end_pos
	}
	t2 := time.ticks()
	d2 := t2 - t1
	println(' * wyhash4: ${d2}ms')
	start_pos = 0
	for len in str_lens {
		end_pos := start_pos + len
		str := string(bytepile[start_pos..end_pos],len)
		_ = fnv1a.sum64_string(str)
		start_pos = end_pos
	}
	t3 := time.ticks()
	d3 := t3 - t2
	println(' * fnv1a64: ${d3}ms')
}

View File

@ -159,6 +159,13 @@ pub fn (n i64) hex() string {
return tos(hex, count) return tos(hex, count)
} }
// hex returns `n` formatted as lowercase hexadecimal with a `0x` prefix.
pub fn (n u64) hex() string {
	// Worst case is "0x" + 16 hex digits + the terminating NUL written by
	// sprintf, i.e. 19 bytes. The former `n >= u64(0)` test was always true
	// for an unsigned value and built a throwaway decimal string just to size
	// the buffer.
	hex := malloc(19)
	count := int(C.sprintf(charptr(hex), '0x%'C.PRIx64, n))
	return tos(hex, count)
}
pub fn (a []byte) contains(val byte) bool { pub fn (a []byte) contains(val byte) bool {
for aa in a { for aa in a {
if aa == val { if aa == val {

View File

@ -0,0 +1,44 @@
module fnv1a
// FNV-1a parameters. Every constant carries an explicit unsigned type so the
// hash arithmetic wraps at the intended width; the 64-bit offset basis does
// not fit in a signed 64-bit integer and must not rely on literal inference.
const (
	fnv64_prime = u64(1099511628211)
	fnv64_offset_basis = u64(14695981039346656037)
	fnv32_offset_basis = u32(2166136261)
	fnv32_prime = u32(16777619)
)
// sum32_string returns the 32-bit FNV-1a hash of the bytes of `data`.
[inline]
pub fn sum32_string(data string) u32 {
	mut h := fnv32_offset_basis
	for idx in 0 .. data.len {
		// xor the next byte in, then multiply by the prime
		h = (h ^ u32(data[idx])) * fnv32_prime
	}
	return h
}
// sum32 returns the 32-bit FNV-1a hash of the byte array `data`.
[inline]
pub fn sum32(data []byte) u32 {
	mut h := fnv32_offset_basis
	for b in data {
		// xor the next byte in, then multiply by the prime
		h = (h ^ u32(b)) * fnv32_prime
	}
	return h
}
// sum64_string returns the 64-bit FNV-1a hash of the bytes of `data`.
[inline]
pub fn sum64_string(data string) u64 {
	mut h := fnv64_offset_basis
	for idx in 0 .. data.len {
		// xor the next byte in, then multiply by the prime
		h = (h ^ u64(data[idx])) * fnv64_prime
	}
	return h
}
// sum64 returns the 64-bit FNV-1a hash of the byte array `data`.
[inline]
pub fn sum64(data []byte) u64 {
	mut h := fnv64_offset_basis
	for b in data {
		// xor the next byte in, then multiply by the prime
		h = (h ^ u64(b)) * fnv64_prime
	}
	return h
}

View File

@ -0,0 +1,9 @@
import hash.fnv1a
// test_fnv1a checks that the string and byte-array entry points of the 64-bit
// FNV-1a hash agree with each other and with the expected digest for 'apple'.
fn test_fnv1a() {
	word := 'apple'
	expected := '0xf74a62a458befdbf'
	from_string := fnv1a.sum64_string(word)
	from_bytes := fnv1a.sum64(word.bytes())
	assert from_string.hex() == expected
	assert from_bytes.hex() == expected
}

View File

@ -0,0 +1,129 @@
// Copyright (c) 2019 Alexander Medvednikov. All rights reserved.
// Use of this source code is governed by an MIT license
// that can be found in the LICENSE file.
//
// this is an implementation of wyhash v4
// from https://github.com/wangyi-fudan/wyhash
//
// TODO: use u128 once implemented
// currently the C version performs slightly better
// because it uses 128 bit int when available and
// branch prediction hints. the C version will be
// removed once the performance is matched.
// you can test performance by running:
// v run tools/wyhash_benchmark.v
// try running with and without the -prod flag
module wyhash
#flag -I @VROOT/thirdparty/wyhash
#include "wyhash.h"
fn C.wyhash(byteptr, u64, u64) u64
// Mixing constants for the pure V wyhash implementation below.
// wyp0..wyp3 are xor-ed into the data words before each wymum call;
// wyp4 is only used in the final mixing step together with the length.
const (
wyp0 = 0xa0761d6478bd642f
wyp1 = 0xe7037ed1a0b428db
wyp2 = 0x8ebc6af09c88c6e3
wyp3 = 0x589965cc75374cc3
wyp4 = 0x1d8e4e27c47d124f
)
// wyhash_c hashes `len` bytes starting at `key` with the given `seed` by
// calling the C function `wyhash` from the included wyhash.h.
[inline]
pub fn wyhash_c(key byteptr, len, seed u64) u64 {
	digest := C.wyhash(key, len, seed)
	return digest
}
// sum64_string returns the 64-bit wyhash of the bytes of `key`, mixed with `seed`,
// computed by the pure V implementation.
[inline]
pub fn sum64_string(key string, seed u64) u64 {
	byte_count := u64(key.len)
	return wyhash64(key.str, byte_count, seed)
}
// sum64 returns the 64-bit wyhash of the byte array `key`, mixed with `seed`,
// computed by the pure V implementation.
[inline]
pub fn sum64(key []byte, seed u64) u64 {
	byte_count := u64(key.len)
	return wyhash64(key.data, byte_count, seed)
}
// wyhash64 is the pure V port of wyhash v4: it hashes `len` bytes starting at
// `key`, mixed with `seed_`, into a 64-bit value.
// The first (len & 63) bytes are folded into the seed with a length-specific
// formula; the remaining bytes are consumed in 64-byte blocks by four
// accumulators, and the result is finalized together with `len`.
// NOTE: an empty input returns 0 regardless of the seed; this early return is
// kept so existing results do not change.
[inline]
fn wyhash64(key byteptr, len, seed_ u64) u64 {
	if len == 0 {
		return 0
	}
	mut p := &key[0]
	mut seed := seed_
	mut i := len & 63
	// When len is a multiple of 64 there are no leading bytes: the seed must be
	// left untouched. Without this guard the `i < 4` branch ran with i == 0,
	// making wyr3 read p[k - 1] with k == 0 (out of bounds) and producing a
	// hash that differs from the C implementation.
	if i != 0 {
		if i < 4 {
			// 1..3 bytes
			seed = wymum(wyr3(p, i) ^ seed ^ wyp0, seed ^ wyp1)
		} else if i <= 8 {
			// 4..8 bytes: two (possibly overlapping) 4-byte reads
			seed = wymum(wyr4(p) ^ seed ^ wyp0, wyr4(p + i - 4) ^ seed ^ wyp1)
		} else if i <= 16 {
			// 9..16 bytes: two (possibly overlapping) 8-byte reads
			seed = wymum(wyr8(p) ^ seed ^ wyp0, wyr8(p + i - 8) ^ seed ^ wyp1)
		} else if i <= 24 {
			// 17..24 bytes: first 16 bytes plus the trailing 8
			seed = wymum(wyr8(p) ^ seed ^ wyp0, wyr8(p + 8) ^ seed ^ wyp1) ^ wymum(wyr8(p + i - 8) ^ seed ^ wyp2, seed ^ wyp3)
		} else if i <= 32 {
			// 25..32 bytes: first 24 bytes plus the trailing 8
			seed = wymum(wyr8(p) ^ seed ^ wyp0, wyr8(p + 8) ^ seed ^ wyp1) ^ wymum(wyr8(p + 16) ^ seed ^ wyp2, wyr8(p + i - 8) ^ seed ^ wyp3)
		} else {
			// 33..63 bytes: first 32 bytes plus the trailing 32 (may overlap)
			seed = wymum(wyr8(p) ^ seed ^ wyp0, wyr8(p + 8) ^ seed ^ wyp1) ^ wymum(wyr8(p + 16) ^ seed ^ wyp2, wyr8(p + 24) ^ seed ^ wyp3) ^ wymum(wyr8(p + i - 32) ^ seed ^ wyp1, wyr8(p + i - 24) ^ seed ^ wyp2) ^ wymum(wyr8(p + i - 16) ^ seed ^ wyp3, wyr8(p + i - 8) ^ seed ^ wyp0)
		}
	}
	// Inputs shorter than 64 bytes are finished here.
	if i == len {
		return wymum(seed, len ^ wyp4)
	}
	// Longer inputs: four accumulators, each fed 16 bytes per 64-byte block.
	mut see1 := seed
	mut see2 := seed
	mut see3 := seed
	p = p + i
	for i = len - i; i >= 64; i -= 64 {
		seed = wymum(wyr8(p) ^ seed ^ wyp0, wyr8(p + 8) ^ seed ^ wyp1)
		see1 = wymum(wyr8(p + 16) ^ see1 ^ wyp2, wyr8(p + 24) ^ see1 ^ wyp3)
		see2 = wymum(wyr8(p + 32) ^ see2 ^ wyp1, wyr8(p + 40) ^ see2 ^ wyp2)
		see3 = wymum(wyr8(p + 48) ^ see3 ^ wyp3, wyr8(p + 56) ^ see3 ^ wyp0)
		p = p + 64
	}
	return wymum(seed ^ see1 ^ see2, see3 ^ len ^ wyp4)
}
// wyrotr rotates the 64-bit value `v` right by `k` bits.
[inline]
fn wyrotr(v u64, k u32) u64 {
	shifted_down := v>>k
	wrapped_around := v<<(64 - k)
	return shifted_down | wrapped_around
}
// wymum multiplies `a` by `b` as a full 128-bit product and returns the high
// 64 bits xor-ed with the low 64 bits.
// The high half is assembled from 32-bit partial products because no 128-bit
// integer type is available yet; with a u128 type this would simply be
// `r := u128(a) * b` followed by `(r>>64) ^ r`.
[inline]
fn wymum(a, b u64) u64 {
	low32 := u32(0xffffffff)
	a_lo := a & low32
	a_hi := a>>32
	b_lo := b & low32
	b_hi := b>>32
	// low x low partial product; only its upper half carries into the middle
	lo_lo := a_lo * b_lo
	cross := a_hi * b_lo + (lo_lo>>32)
	middle := (cross & low32) + a_lo * b_hi
	hi := a_hi * b_hi + (cross>>32) + (middle>>32)
	// the low 64 bits are just the wrapping product
	lo := a * b
	return hi ^ lo
}
// wyr3 packs a 1..3 byte input into one word: the first byte, the byte at
// index k/2 and the last byte (index k-1) land in bits 16-23, 8-15 and 0-7.
[inline]
fn wyr3(p byteptr, k u64) u64 {
	first := u64(p[0])
	middle := u64(p[k>>1])
	last := u64(p[k - 1])
	return (first<<16) | (middle<<8) | last
}
// wyr4 reads 4 bytes from `p` as a little-endian 32-bit value, widened to u64.
[inline]
fn wyr4(p byteptr) u64 {
	b0 := u32(p[0])
	b1 := u32(p[1])<<u32(8)
	b2 := u32(p[2])<<u32(16)
	b3 := u32(p[3])<<u32(24)
	return b0 | b1 | b2 | b3
}
// wyr8 reads 8 bytes from `p` as a little-endian 64-bit value.
[inline]
fn wyr8(p byteptr) u64 {
	// assemble the two 32-bit halves separately, then join them
	lower := u64(p[0]) | (u64(p[1])<<8) | (u64(p[2])<<16) | (u64(p[3])<<24)
	upper := u64(p[4]) | (u64(p[5])<<8) | (u64(p[6])<<16) | (u64(p[7])<<24)
	return lower | (upper<<32)
}

View File

@ -0,0 +1,29 @@
import hash.wyhash
// WyHashTest is one table entry for test_wyhash: an input, the seed to hash
// it with and the digest that wyhash.sum64 must return.
// The fields are initialised positionally, so their order matters.
struct WyHashTest {
// input text; hashed via its bytes
s string
// seed passed to the hash function
seed u64
// expected 64-bit digest
expected u64
}
// test_wyhash runs the pure V wyhash over a table of known inputs and seeds
// and compares each digest with its recorded value.
fn test_wyhash() {
	cases := [WyHashTest{'', 0, 0x0},
	WyHashTest{'v', 1, 0xc72a8f8bdfdd82},
	WyHashTest{'is', 2, 0xa1099c1c58fc13e},
	WyHashTest{'the best', 3, 0x1b1215ef0b0b94c},
	WyHashTest{'abcdefghijklmnopqrstuvwxyz', 4, 0x6db0e773d1503fac},
	WyHashTest{'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789', 5, 0xe062dfda99413626},
	]
	for c in cases {
		actual := wyhash.sum64(c.s.bytes(), c.seed)
		assert actual == c.expected
	}
}