diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-16 19:19:13 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-16 19:19:13 +0000 |
commit | ccd992355df7192993c666236047820244914598 (patch) | |
tree | f00fea65147227b7743083c6148396f74cd66935 /src/hash/maphash | |
parent | Initial commit. (diff) | |
download | golang-1.21-ccd992355df7192993c666236047820244914598.tar.xz golang-1.21-ccd992355df7192993c666236047820244914598.zip |
Adding upstream version 1.21.8.upstream/1.21.8
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/hash/maphash')
-rw-r--r-- | src/hash/maphash/example_test.go | 37 | ||||
-rw-r--r-- | src/hash/maphash/maphash.go | 277 | ||||
-rw-r--r-- | src/hash/maphash/maphash_purego.go | 104 | ||||
-rw-r--r-- | src/hash/maphash/maphash_runtime.go | 43 | ||||
-rw-r--r-- | src/hash/maphash/maphash_test.go | 255 | ||||
-rw-r--r-- | src/hash/maphash/smhasher_test.go | 474 |
6 files changed, 1190 insertions, 0 deletions
diff --git a/src/hash/maphash/example_test.go b/src/hash/maphash/example_test.go new file mode 100644 index 0000000..78690fd --- /dev/null +++ b/src/hash/maphash/example_test.go @@ -0,0 +1,37 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package maphash_test + +import ( + "fmt" + "hash/maphash" +) + +func Example() { + // The zero Hash value is valid and ready to use; setting an + // initial seed is not necessary. + var h maphash.Hash + + // Add a string to the hash, and print the current hash value. + h.WriteString("hello, ") + fmt.Printf("%#x\n", h.Sum64()) + + // Append additional data (in the form of a byte array). + h.Write([]byte{'w', 'o', 'r', 'l', 'd'}) + fmt.Printf("%#x\n", h.Sum64()) + + // Reset discards all data previously added to the Hash, without + // changing its seed. + h.Reset() + + // Use SetSeed to create a new Hash h2 which will behave + // identically to h. + var h2 maphash.Hash + h2.SetSeed(h.Seed()) + + h.WriteString("same") + h2.WriteString("same") + fmt.Printf("%#x == %#x\n", h.Sum64(), h2.Sum64()) +} diff --git a/src/hash/maphash/maphash.go b/src/hash/maphash/maphash.go new file mode 100644 index 0000000..c2e9e40 --- /dev/null +++ b/src/hash/maphash/maphash.go @@ -0,0 +1,277 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package maphash provides hash functions on byte sequences. +// These hash functions are intended to be used to implement hash tables or +// other data structures that need to map arbitrary strings or byte +// sequences to a uniform distribution on unsigned 64-bit integers. +// Each different instance of a hash table or data structure should use its own Seed. +// +// The hash functions are not cryptographically secure. +// (See crypto/sha256 and crypto/sha512 for cryptographic use.) +package maphash + +// A Seed is a random value that selects the specific hash function +// computed by a Hash. If two Hashes use the same Seeds, they +// will compute the same hash values for any given input. +// If two Hashes use different Seeds, they are very likely to compute +// distinct hash values for any given input. +// +// A Seed must be initialized by calling MakeSeed. +// The zero seed is uninitialized and not valid for use with Hash's SetSeed method. +// +// Each Seed value is local to a single process and cannot be serialized +// or otherwise recreated in a different process. +type Seed struct { + s uint64 +} + +// Bytes returns the hash of b with the given seed. +// +// Bytes is equivalent to, but more convenient and efficient than: +// +// var h Hash +// h.SetSeed(seed) +// h.Write(b) +// return h.Sum64() +func Bytes(seed Seed, b []byte) uint64 { + state := seed.s + if state == 0 { + panic("maphash: use of uninitialized Seed") + } + + if len(b) > bufSize { + b = b[:len(b):len(b)] // merge len and cap calculations when reslicing + for len(b) > bufSize { + state = rthash(b[:bufSize], state) + b = b[bufSize:] + } + } + return rthash(b, state) +} + +// String returns the hash of s with the given seed. +// +// String is equivalent to, but more convenient and efficient than: +// +// var h Hash +// h.SetSeed(seed) +// h.WriteString(s) +// return h.Sum64() +func String(seed Seed, s string) uint64 { + state := seed.s + if state == 0 { + panic("maphash: use of uninitialized Seed") + } + for len(s) > bufSize { + state = rthashString(s[:bufSize], state) + s = s[bufSize:] + } + return rthashString(s, state) +} + +// A Hash computes a seeded hash of a byte sequence. +// +// The zero Hash is a valid Hash ready to use. +// A zero Hash chooses a random seed for itself during +// the first call to a Reset, Write, Seed, or Sum64 method. +// For control over the seed, use SetSeed. +// +// The computed hash values depend only on the initial seed and +// the sequence of bytes provided to the Hash object, not on the way +// in which the bytes are provided. For example, the three sequences +// +// h.Write([]byte{'f','o','o'}) +// h.WriteByte('f'); h.WriteByte('o'); h.WriteByte('o') +// h.WriteString("foo") +// +// all have the same effect. +// +// Hashes are intended to be collision-resistant, even for situations +// where an adversary controls the byte sequences being hashed. +// +// A Hash is not safe for concurrent use by multiple goroutines, but a Seed is. +// If multiple goroutines must compute the same seeded hash, +// each can declare its own Hash and call SetSeed with a common Seed. +type Hash struct { + _ [0]func() // not comparable + seed Seed // initial seed used for this hash + state Seed // current hash of all flushed bytes + buf [bufSize]byte // unflushed byte buffer + n int // number of unflushed bytes +} + +// bufSize is the size of the Hash write buffer. +// The buffer ensures that writes depend only on the sequence of bytes, +// not the sequence of WriteByte/Write/WriteString calls, +// by always calling rthash with a full buffer (except for the tail). +const bufSize = 128 + +// initSeed seeds the hash if necessary. +// initSeed is called lazily before any operation that actually uses h.seed/h.state. +// Note that this does not include Write/WriteByte/WriteString in the case +// where they only add to h.buf. (If they write too much, they call h.flush, +// which does call h.initSeed.) +func (h *Hash) initSeed() { + if h.seed.s == 0 { + seed := MakeSeed() + h.seed = seed + h.state = seed + } +} + +// WriteByte adds b to the sequence of bytes hashed by h. +// It never fails; the error result is for implementing io.ByteWriter. +func (h *Hash) WriteByte(b byte) error { + if h.n == len(h.buf) { + h.flush() + } + h.buf[h.n] = b + h.n++ + return nil +} + +// Write adds b to the sequence of bytes hashed by h. +// It always writes all of b and never fails; the count and error result are for implementing io.Writer. +func (h *Hash) Write(b []byte) (int, error) { + size := len(b) + // Deal with bytes left over in h.buf. + // h.n <= bufSize is always true. + // Checking it is ~free and it lets the compiler eliminate a bounds check. + if h.n > 0 && h.n <= bufSize { + k := copy(h.buf[h.n:], b) + h.n += k + if h.n < bufSize { + // Copied the entirety of b to h.buf. + return size, nil + } + b = b[k:] + h.flush() + // No need to set h.n = 0 here; it happens just before exit. + } + // Process as many full buffers as possible, without copying, and calling initSeed only once. + if len(b) > bufSize { + h.initSeed() + for len(b) > bufSize { + h.state.s = rthash(b[:bufSize], h.state.s) + b = b[bufSize:] + } + } + // Copy the tail. + copy(h.buf[:], b) + h.n = len(b) + return size, nil +} + +// WriteString adds the bytes of s to the sequence of bytes hashed by h. +// It always writes all of s and never fails; the count and error result are for implementing io.StringWriter. +func (h *Hash) WriteString(s string) (int, error) { + // WriteString mirrors Write. See Write for comments. + size := len(s) + if h.n > 0 && h.n <= bufSize { + k := copy(h.buf[h.n:], s) + h.n += k + if h.n < bufSize { + return size, nil + } + s = s[k:] + h.flush() + } + if len(s) > bufSize { + h.initSeed() + for len(s) > bufSize { + h.state.s = rthashString(s[:bufSize], h.state.s) + s = s[bufSize:] + } + } + copy(h.buf[:], s) + h.n = len(s) + return size, nil +} + +// Seed returns h's seed value. +func (h *Hash) Seed() Seed { + h.initSeed() + return h.seed +} + +// SetSeed sets h to use seed, which must have been returned by MakeSeed +// or by another Hash's Seed method. +// Two Hash objects with the same seed behave identically. +// Two Hash objects with different seeds will very likely behave differently. +// Any bytes added to h before this call will be discarded. +func (h *Hash) SetSeed(seed Seed) { + if seed.s == 0 { + panic("maphash: use of uninitialized Seed") + } + h.seed = seed + h.state = seed + h.n = 0 +} + +// Reset discards all bytes added to h. +// (The seed remains the same.) +func (h *Hash) Reset() { + h.initSeed() + h.state = h.seed + h.n = 0 +} + +// precondition: buffer is full. +func (h *Hash) flush() { + if h.n != len(h.buf) { + panic("maphash: flush of partially full buffer") + } + h.initSeed() + h.state.s = rthash(h.buf[:h.n], h.state.s) + h.n = 0 +} + +// Sum64 returns h's current 64-bit value, which depends on +// h's seed and the sequence of bytes added to h since the +// last call to Reset or SetSeed. +// +// All bits of the Sum64 result are close to uniformly and +// independently distributed, so it can be safely reduced +// by using bit masking, shifting, or modular arithmetic. +func (h *Hash) Sum64() uint64 { + h.initSeed() + return rthash(h.buf[:h.n], h.state.s) +} + +// MakeSeed returns a new random seed. +func MakeSeed() Seed { + var s uint64 + for { + s = randUint64() + // We use seed 0 to indicate an uninitialized seed/hash, + // so keep trying until we get a non-zero seed. + if s != 0 { + break + } + } + return Seed{s: s} +} + +// Sum appends the hash's current 64-bit value to b. +// It exists for implementing hash.Hash. +// For direct calls, it is more efficient to use Sum64. +func (h *Hash) Sum(b []byte) []byte { + x := h.Sum64() + return append(b, + byte(x>>0), + byte(x>>8), + byte(x>>16), + byte(x>>24), + byte(x>>32), + byte(x>>40), + byte(x>>48), + byte(x>>56)) +} + +// Size returns h's hash value size, 8 bytes. +func (h *Hash) Size() int { return 8 } + +// BlockSize returns h's block size. +func (h *Hash) BlockSize() int { return len(h.buf) } diff --git a/src/hash/maphash/maphash_purego.go b/src/hash/maphash/maphash_purego.go new file mode 100644 index 0000000..d49a44a --- /dev/null +++ b/src/hash/maphash/maphash_purego.go @@ -0,0 +1,104 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build purego + +package maphash + +import ( + "crypto/rand" + "math/bits" +) + +func rthash(buf []byte, seed uint64) uint64 { + if len(buf) == 0 { + return seed + } + return wyhash(buf, seed, uint64(len(buf))) +} + +func rthashString(s string, state uint64) uint64 { + return rthash([]byte(s), state) +} + +func randUint64() uint64 { + buf := make([]byte, 8) + _, _ = rand.Read(buf) + return leUint64(buf) +} + +// This is a port of wyhash implementation in runtime/hash64.go, +// without using unsafe for purego. + +const ( + m1 = 0xa0761d6478bd642f + m2 = 0xe7037ed1a0b428db + m3 = 0x8ebc6af09c88c6e3 + m4 = 0x589965cc75374cc3 + m5 = 0x1d8e4e27c47d124f +) + +func wyhash(key []byte, seed, len uint64) uint64 { + p := key + i := len + var a, b uint64 + seed ^= m1 + + if i > 16 { + if i > 48 { + seed1 := seed + seed2 := seed + for ; i > 48; i -= 48 { + seed = mix(r8(p)^m2, r8(p[8:])^seed) + seed1 = mix(r8(p[16:])^m3, r8(p[24:])^seed1) + seed2 = mix(r8(p[32:])^m4, r8(p[40:])^seed2) + p = p[48:] + } + seed ^= seed1 ^ seed2 + } + for ; i > 16; i -= 16 { + seed = mix(r8(p)^m2, r8(p[8:])^seed) + p = p[16:] + } + } + switch { + case i == 0: + return seed + case i < 4: + a = r3(p, i) + default: + n := (i >> 3) << 2 + a = r4(p)<<32 | r4(p[n:]) + b = r4(p[i-4:])<<32 | r4(p[i-4-n:]) + } + return mix(m5^len, mix(a^m2, b^seed)) +} + +func r3(p []byte, k uint64) uint64 { + return (uint64(p[0]) << 16) | (uint64(p[k>>1]) << 8) | uint64(p[k-1]) +} + +func r4(p []byte) uint64 { + return uint64(leUint32(p)) +} + +func r8(p []byte) uint64 { + return leUint64(p) +} + +func mix(a, b uint64) uint64 { + hi, lo := bits.Mul64(a, b) + return hi ^ lo +} + +func leUint32(b []byte) uint32 { + _ = b[3] // bounds check hint to compiler; see golang.org/issue/14808 + return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +} + +func leUint64(b []byte) uint64 { + _ = b[7] // bounds check hint to compiler; see golang.org/issue/14808 + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +} diff --git a/src/hash/maphash/maphash_runtime.go b/src/hash/maphash/maphash_runtime.go new file mode 100644 index 0000000..98097ff --- /dev/null +++ b/src/hash/maphash/maphash_runtime.go @@ -0,0 +1,43 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !purego + +package maphash + +import ( + "unsafe" +) + +//go:linkname runtime_fastrand64 runtime.fastrand64 +func runtime_fastrand64() uint64 + +//go:linkname runtime_memhash runtime.memhash +//go:noescape +func runtime_memhash(p unsafe.Pointer, seed, s uintptr) uintptr + +func rthash(buf []byte, seed uint64) uint64 { + if len(buf) == 0 { + return seed + } + len := len(buf) + // The runtime hasher only works on uintptr. For 64-bit + // architectures, we use the hasher directly. Otherwise, + // we use two parallel hashers on the lower and upper 32 bits. + if unsafe.Sizeof(uintptr(0)) == 8 { + return uint64(runtime_memhash(unsafe.Pointer(&buf[0]), uintptr(seed), uintptr(len))) + } + lo := runtime_memhash(unsafe.Pointer(&buf[0]), uintptr(seed), uintptr(len)) + hi := runtime_memhash(unsafe.Pointer(&buf[0]), uintptr(seed>>32), uintptr(len)) + return uint64(hi)<<32 | uint64(lo) +} + +func rthashString(s string, state uint64) uint64 { + buf := unsafe.Slice(unsafe.StringData(s), len(s)) + return rthash(buf, state) +} + +func randUint64() uint64 { + return runtime_fastrand64() +} diff --git a/src/hash/maphash/maphash_test.go b/src/hash/maphash/maphash_test.go new file mode 100644 index 0000000..ed70f0c --- /dev/null +++ b/src/hash/maphash/maphash_test.go @@ -0,0 +1,255 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package maphash + +import ( + "bytes" + "fmt" + "hash" + "testing" +) + +func TestUnseededHash(t *testing.T) { + m := map[uint64]struct{}{} + for i := 0; i < 1000; i++ { + h := new(Hash) + m[h.Sum64()] = struct{}{} + } + if len(m) < 900 { + t.Errorf("empty hash not sufficiently random: got %d, want 1000", len(m)) + } +} + +func TestSeededHash(t *testing.T) { + s := MakeSeed() + m := map[uint64]struct{}{} + for i := 0; i < 1000; i++ { + h := new(Hash) + h.SetSeed(s) + m[h.Sum64()] = struct{}{} + } + if len(m) != 1 { + t.Errorf("seeded hash is random: got %d, want 1", len(m)) + } +} + +func TestHashGrouping(t *testing.T) { + b := bytes.Repeat([]byte("foo"), 100) + hh := make([]*Hash, 7) + for i := range hh { + hh[i] = new(Hash) + } + for _, h := range hh[1:] { + h.SetSeed(hh[0].Seed()) + } + hh[0].Write(b) + hh[1].WriteString(string(b)) + + writeByte := func(h *Hash, b byte) { + err := h.WriteByte(b) + if err != nil { + t.Fatalf("WriteByte: %v", err) + } + } + writeSingleByte := func(h *Hash, b byte) { + _, err := h.Write([]byte{b}) + if err != nil { + t.Fatalf("Write single byte: %v", err) + } + } + writeStringSingleByte := func(h *Hash, b byte) { + _, err := h.WriteString(string([]byte{b})) + if err != nil { + t.Fatalf("WriteString single byte: %v", err) + } + } + + for i, x := range b { + writeByte(hh[2], x) + writeSingleByte(hh[3], x) + if i == 0 { + writeByte(hh[4], x) + } else { + writeSingleByte(hh[4], x) + } + writeStringSingleByte(hh[5], x) + if i == 0 { + writeByte(hh[6], x) + } else { + writeStringSingleByte(hh[6], x) + } + } + + sum := hh[0].Sum64() + for i, h := range hh { + if sum != h.Sum64() { + t.Errorf("hash %d not identical to a single Write", i) + } + } + + if sum1 := Bytes(hh[0].Seed(), b); sum1 != hh[0].Sum64() { + t.Errorf("hash using Bytes not identical to a single Write") + } + + if sum1 := String(hh[0].Seed(), string(b)); sum1 != hh[0].Sum64() { + t.Errorf("hash using String not identical to a single Write") + } +} + +func TestHashBytesVsString(t *testing.T) { + s := "foo" + b := []byte(s) + h1 := new(Hash) + h2 := new(Hash) + h2.SetSeed(h1.Seed()) + n1, err1 := h1.WriteString(s) + if n1 != len(s) || err1 != nil { + t.Fatalf("WriteString(s) = %d, %v, want %d, nil", n1, err1, len(s)) + } + n2, err2 := h2.Write(b) + if n2 != len(b) || err2 != nil { + t.Fatalf("Write(b) = %d, %v, want %d, nil", n2, err2, len(b)) + } + if h1.Sum64() != h2.Sum64() { + t.Errorf("hash of string and bytes not identical") + } +} + +func TestHashHighBytes(t *testing.T) { + // See issue 34925. + const N = 10 + m := map[uint64]struct{}{} + for i := 0; i < N; i++ { + h := new(Hash) + h.WriteString("foo") + m[h.Sum64()>>32] = struct{}{} + } + if len(m) < N/2 { + t.Errorf("from %d seeds, wanted at least %d different hashes; got %d", N, N/2, len(m)) + } +} + +func TestRepeat(t *testing.T) { + h1 := new(Hash) + h1.WriteString("testing") + sum1 := h1.Sum64() + + h1.Reset() + h1.WriteString("testing") + sum2 := h1.Sum64() + + if sum1 != sum2 { + t.Errorf("different sum after resetting: %#x != %#x", sum1, sum2) + } + + h2 := new(Hash) + h2.SetSeed(h1.Seed()) + h2.WriteString("testing") + sum3 := h2.Sum64() + + if sum1 != sum3 { + t.Errorf("different sum on the same seed: %#x != %#x", sum1, sum3) + } +} + +func TestSeedFromSum64(t *testing.T) { + h1 := new(Hash) + h1.WriteString("foo") + x := h1.Sum64() // seed generated here + h2 := new(Hash) + h2.SetSeed(h1.Seed()) + h2.WriteString("foo") + y := h2.Sum64() + if x != y { + t.Errorf("hashes don't match: want %x, got %x", x, y) + } +} + +func TestSeedFromSeed(t *testing.T) { + h1 := new(Hash) + h1.WriteString("foo") + _ = h1.Seed() // seed generated here + x := h1.Sum64() + h2 := new(Hash) + h2.SetSeed(h1.Seed()) + h2.WriteString("foo") + y := h2.Sum64() + if x != y { + t.Errorf("hashes don't match: want %x, got %x", x, y) + } +} + +func TestSeedFromFlush(t *testing.T) { + b := make([]byte, 65) + h1 := new(Hash) + h1.Write(b) // seed generated here + x := h1.Sum64() + h2 := new(Hash) + h2.SetSeed(h1.Seed()) + h2.Write(b) + y := h2.Sum64() + if x != y { + t.Errorf("hashes don't match: want %x, got %x", x, y) + } +} + +func TestSeedFromReset(t *testing.T) { + h1 := new(Hash) + h1.WriteString("foo") + h1.Reset() // seed generated here + h1.WriteString("foo") + x := h1.Sum64() + h2 := new(Hash) + h2.SetSeed(h1.Seed()) + h2.WriteString("foo") + y := h2.Sum64() + if x != y { + t.Errorf("hashes don't match: want %x, got %x", x, y) + } +} + +// Make sure a Hash implements the hash.Hash and hash.Hash64 interfaces. +var _ hash.Hash = &Hash{} +var _ hash.Hash64 = &Hash{} + +func benchmarkSize(b *testing.B, size int) { + h := &Hash{} + buf := make([]byte, size) + s := string(buf) + + b.Run("Write", func(b *testing.B) { + b.SetBytes(int64(size)) + for i := 0; i < b.N; i++ { + h.Reset() + h.Write(buf) + h.Sum64() + } + }) + + b.Run("Bytes", func(b *testing.B) { + b.SetBytes(int64(size)) + seed := h.Seed() + for i := 0; i < b.N; i++ { + Bytes(seed, buf) + } + }) + + b.Run("String", func(b *testing.B) { + b.SetBytes(int64(size)) + seed := h.Seed() + for i := 0; i < b.N; i++ { + String(seed, s) + } + }) +} + +func BenchmarkHash(b *testing.B) { + sizes := []int{4, 8, 16, 32, 64, 256, 320, 1024, 4096, 16384} + for _, size := range sizes { + b.Run(fmt.Sprint("n=", size), func(b *testing.B) { + benchmarkSize(b, size) + }) + } +} diff --git a/src/hash/maphash/smhasher_test.go b/src/hash/maphash/smhasher_test.go new file mode 100644 index 0000000..a6e8a21 --- /dev/null +++ b/src/hash/maphash/smhasher_test.go @@ -0,0 +1,474 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !race + +package maphash + +import ( + "fmt" + "math" + "math/rand" + "runtime" + "strings" + "testing" + "unsafe" +) + +// Smhasher is a torture test for hash functions. +// https://code.google.com/p/smhasher/ +// This code is a port of some of the Smhasher tests to Go. + +// Note: due to the long running time of these tests, they are +// currently disabled in -race mode. + +var fixedSeed = MakeSeed() + +// Sanity checks. +// hash should not depend on values outside key. +// hash should not depend on alignment. +func TestSmhasherSanity(t *testing.T) { + r := rand.New(rand.NewSource(1234)) + const REP = 10 + const KEYMAX = 128 + const PAD = 16 + const OFFMAX = 16 + for k := 0; k < REP; k++ { + for n := 0; n < KEYMAX; n++ { + for i := 0; i < OFFMAX; i++ { + var b [KEYMAX + OFFMAX + 2*PAD]byte + var c [KEYMAX + OFFMAX + 2*PAD]byte + randBytes(r, b[:]) + randBytes(r, c[:]) + copy(c[PAD+i:PAD+i+n], b[PAD:PAD+n]) + if bytesHash(b[PAD:PAD+n]) != bytesHash(c[PAD+i:PAD+i+n]) { + t.Errorf("hash depends on bytes outside key") + } + } + } + } +} + +func bytesHash(b []byte) uint64 { + var h Hash + h.SetSeed(fixedSeed) + h.Write(b) + return h.Sum64() +} +func stringHash(s string) uint64 { + var h Hash + h.SetSeed(fixedSeed) + h.WriteString(s) + return h.Sum64() +} + +const hashSize = 64 + +func randBytes(r *rand.Rand, b []byte) { + r.Read(b) // can't fail +} + +// A hashSet measures the frequency of hash collisions. +type hashSet struct { + m map[uint64]struct{} // set of hashes added + n int // number of hashes added +} + +func newHashSet() *hashSet { + return &hashSet{make(map[uint64]struct{}), 0} +} +func (s *hashSet) add(h uint64) { + s.m[h] = struct{}{} + s.n++ +} +func (s *hashSet) addS(x string) { + s.add(stringHash(x)) +} +func (s *hashSet) addB(x []byte) { + s.add(bytesHash(x)) +} +func (s *hashSet) addS_seed(x string, seed Seed) { + var h Hash + h.SetSeed(seed) + h.WriteString(x) + s.add(h.Sum64()) +} +func (s *hashSet) check(t *testing.T) { + const SLOP = 10.0 + collisions := s.n - len(s.m) + pairs := int64(s.n) * int64(s.n-1) / 2 + expected := float64(pairs) / math.Pow(2.0, float64(hashSize)) + stddev := math.Sqrt(expected) + if float64(collisions) > expected+SLOP*(3*stddev+1) { + t.Errorf("unexpected number of collisions: got=%d mean=%f stddev=%f", collisions, expected, stddev) + } +} + +// a string plus adding zeros must make distinct hashes +func TestSmhasherAppendedZeros(t *testing.T) { + s := "hello" + strings.Repeat("\x00", 256) + h := newHashSet() + for i := 0; i <= len(s); i++ { + h.addS(s[:i]) + } + h.check(t) +} + +// All 0-3 byte strings have distinct hashes. +func TestSmhasherSmallKeys(t *testing.T) { + h := newHashSet() + var b [3]byte + for i := 0; i < 256; i++ { + b[0] = byte(i) + h.addB(b[:1]) + for j := 0; j < 256; j++ { + b[1] = byte(j) + h.addB(b[:2]) + if !testing.Short() { + for k := 0; k < 256; k++ { + b[2] = byte(k) + h.addB(b[:3]) + } + } + } + } + h.check(t) +} + +// Different length strings of all zeros have distinct hashes. +func TestSmhasherZeros(t *testing.T) { + N := 256 * 1024 + if testing.Short() { + N = 1024 + } + h := newHashSet() + b := make([]byte, N) + for i := 0; i <= N; i++ { + h.addB(b[:i]) + } + h.check(t) +} + +// Strings with up to two nonzero bytes all have distinct hashes. +func TestSmhasherTwoNonzero(t *testing.T) { + if runtime.GOARCH == "wasm" { + t.Skip("Too slow on wasm") + } + if testing.Short() { + t.Skip("Skipping in short mode") + } + h := newHashSet() + for n := 2; n <= 16; n++ { + twoNonZero(h, n) + } + h.check(t) +} +func twoNonZero(h *hashSet, n int) { + b := make([]byte, n) + + // all zero + h.addB(b) + + // one non-zero byte + for i := 0; i < n; i++ { + for x := 1; x < 256; x++ { + b[i] = byte(x) + h.addB(b) + b[i] = 0 + } + } + + // two non-zero bytes + for i := 0; i < n; i++ { + for x := 1; x < 256; x++ { + b[i] = byte(x) + for j := i + 1; j < n; j++ { + for y := 1; y < 256; y++ { + b[j] = byte(y) + h.addB(b) + b[j] = 0 + } + } + b[i] = 0 + } + } +} + +// Test strings with repeats, like "abcdabcdabcdabcd..." +func TestSmhasherCyclic(t *testing.T) { + if testing.Short() { + t.Skip("Skipping in short mode") + } + r := rand.New(rand.NewSource(1234)) + const REPEAT = 8 + const N = 1000000 + for n := 4; n <= 12; n++ { + h := newHashSet() + b := make([]byte, REPEAT*n) + for i := 0; i < N; i++ { + b[0] = byte(i * 79 % 97) + b[1] = byte(i * 43 % 137) + b[2] = byte(i * 151 % 197) + b[3] = byte(i * 199 % 251) + randBytes(r, b[4:n]) + for j := n; j < n*REPEAT; j++ { + b[j] = b[j-n] + } + h.addB(b) + } + h.check(t) + } +} + +// Test strings with only a few bits set +func TestSmhasherSparse(t *testing.T) { + if runtime.GOARCH == "wasm" { + t.Skip("Too slow on wasm") + } + if testing.Short() { + t.Skip("Skipping in short mode") + } + sparse(t, 32, 6) + sparse(t, 40, 6) + sparse(t, 48, 5) + sparse(t, 56, 5) + sparse(t, 64, 5) + sparse(t, 96, 4) + sparse(t, 256, 3) + sparse(t, 2048, 2) +} +func sparse(t *testing.T, n int, k int) { + b := make([]byte, n/8) + h := newHashSet() + setbits(h, b, 0, k) + h.check(t) +} + +// set up to k bits at index i and greater +func setbits(h *hashSet, b []byte, i int, k int) { + h.addB(b) + if k == 0 { + return + } + for j := i; j < len(b)*8; j++ { + b[j/8] |= byte(1 << uint(j&7)) + setbits(h, b, j+1, k-1) + b[j/8] &= byte(^(1 << uint(j&7))) + } +} + +// Test all possible combinations of n blocks from the set s. +// "permutation" is a bad name here, but it is what Smhasher uses. +func TestSmhasherPermutation(t *testing.T) { + if runtime.GOARCH == "wasm" { + t.Skip("Too slow on wasm") + } + if testing.Short() { + t.Skip("Skipping in short mode") + } + permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7}, 8) + permutation(t, []uint32{0, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 8) + permutation(t, []uint32{0, 1}, 20) + permutation(t, []uint32{0, 1 << 31}, 20) + permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 6) +} +func permutation(t *testing.T, s []uint32, n int) { + b := make([]byte, n*4) + h := newHashSet() + genPerm(h, b, s, 0) + h.check(t) +} +func genPerm(h *hashSet, b []byte, s []uint32, n int) { + h.addB(b[:n]) + if n == len(b) { + return + } + for _, v := range s { + b[n] = byte(v) + b[n+1] = byte(v >> 8) + b[n+2] = byte(v >> 16) + b[n+3] = byte(v >> 24) + genPerm(h, b, s, n+4) + } +} + +type key interface { + clear() // set bits all to 0 + random(r *rand.Rand) // set key to something random + bits() int // how many bits key has + flipBit(i int) // flip bit i of the key + hash() uint64 // hash the key + name() string // for error reporting +} + +type bytesKey struct { + b []byte +} + +func (k *bytesKey) clear() { + for i := range k.b { + k.b[i] = 0 + } +} +func (k *bytesKey) random(r *rand.Rand) { + randBytes(r, k.b) +} +func (k *bytesKey) bits() int { + return len(k.b) * 8 +} +func (k *bytesKey) flipBit(i int) { + k.b[i>>3] ^= byte(1 << uint(i&7)) +} +func (k *bytesKey) hash() uint64 { + return bytesHash(k.b) +} +func (k *bytesKey) name() string { + return fmt.Sprintf("bytes%d", len(k.b)) +} + +// Flipping a single bit of a key should flip each output bit with 50% probability. +func TestSmhasherAvalanche(t *testing.T) { + if runtime.GOARCH == "wasm" { + t.Skip("Too slow on wasm") + } + if testing.Short() { + t.Skip("Skipping in short mode") + } + avalancheTest1(t, &bytesKey{make([]byte, 2)}) + avalancheTest1(t, &bytesKey{make([]byte, 4)}) + avalancheTest1(t, &bytesKey{make([]byte, 8)}) + avalancheTest1(t, &bytesKey{make([]byte, 16)}) + avalancheTest1(t, &bytesKey{make([]byte, 32)}) + avalancheTest1(t, &bytesKey{make([]byte, 200)}) +} +func avalancheTest1(t *testing.T, k key) { + const REP = 100000 + r := rand.New(rand.NewSource(1234)) + n := k.bits() + + // grid[i][j] is a count of whether flipping + // input bit i affects output bit j. + grid := make([][hashSize]int, n) + + for z := 0; z < REP; z++ { + // pick a random key, hash it + k.random(r) + h := k.hash() + + // flip each bit, hash & compare the results + for i := 0; i < n; i++ { + k.flipBit(i) + d := h ^ k.hash() + k.flipBit(i) + + // record the effects of that bit flip + g := &grid[i] + for j := 0; j < hashSize; j++ { + g[j] += int(d & 1) + d >>= 1 + } + } + } + + // Each entry in the grid should be about REP/2. + // More precisely, we did N = k.bits() * hashSize experiments where + // each is the sum of REP coin flips. We want to find bounds on the + // sum of coin flips such that a truly random experiment would have + // all sums inside those bounds with 99% probability. + N := n * hashSize + var c float64 + // find c such that Prob(mean-c*stddev < x < mean+c*stddev)^N > .9999 + for c = 0.0; math.Pow(math.Erf(c/math.Sqrt(2)), float64(N)) < .9999; c += .1 { + } + c *= 8.0 // allowed slack - we don't need to be perfectly random + mean := .5 * REP + stddev := .5 * math.Sqrt(REP) + low := int(mean - c*stddev) + high := int(mean + c*stddev) + for i := 0; i < n; i++ { + for j := 0; j < hashSize; j++ { + x := grid[i][j] + if x < low || x > high { + t.Errorf("bad bias for %s bit %d -> bit %d: %d/%d\n", k.name(), i, j, x, REP) + } + } + } +} + +// All bit rotations of a set of distinct keys +func TestSmhasherWindowed(t *testing.T) { + windowed(t, &bytesKey{make([]byte, 128)}) +} +func windowed(t *testing.T, k key) { + if runtime.GOARCH == "wasm" { + t.Skip("Too slow on wasm") + } + if testing.Short() { + t.Skip("Skipping in short mode") + } + const BITS = 16 + + for r := 0; r < k.bits(); r++ { + h := newHashSet() + for i := 0; i < 1<<BITS; i++ { + k.clear() + for j := 0; j < BITS; j++ { + if i>>uint(j)&1 != 0 { + k.flipBit((j + r) % k.bits()) + } + } + h.add(k.hash()) + } + h.check(t) + } +} + +// All keys of the form prefix + [A-Za-z0-9]*N + suffix. +func TestSmhasherText(t *testing.T) { + if testing.Short() { + t.Skip("Skipping in short mode") + } + text(t, "Foo", "Bar") + text(t, "FooBar", "") + text(t, "", "FooBar") +} +func text(t *testing.T, prefix, suffix string) { + const N = 4 + const S = "ABCDEFGHIJKLMNOPQRSTabcdefghijklmnopqrst0123456789" + const L = len(S) + b := make([]byte, len(prefix)+N+len(suffix)) + copy(b, prefix) + copy(b[len(prefix)+N:], suffix) + h := newHashSet() + c := b[len(prefix):] + for i := 0; i < L; i++ { + c[0] = S[i] + for j := 0; j < L; j++ { + c[1] = S[j] + for k := 0; k < L; k++ { + c[2] = S[k] + for x := 0; x < L; x++ { + c[3] = S[x] + h.addB(b) + } + } + } + } + h.check(t) +} + +// Make sure different seed values generate different hashes. +func TestSmhasherSeed(t *testing.T) { + if unsafe.Sizeof(uintptr(0)) == 4 { + t.Skip("32-bit platforms don't have ideal seed-input distributions (see issue 33988)") + } + h := newHashSet() + const N = 100000 + s := "hello" + for i := 0; i < N; i++ { + h.addS_seed(s, Seed{s: uint64(i + 1)}) + h.addS_seed(s, Seed{s: uint64(i+1) << 32}) // make sure high bits are used + } + h.check(t) +} |