summaryrefslogtreecommitdiffstats
path: root/src/hash/maphash
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-28 13:15:26 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-28 13:15:26 +0000
commit82539ad8d59729fb45b0bb0edda8f2bddb719eb1 (patch)
tree58f0b58e6f44f0e04d4a6373132cf426fa835fa7 /src/hash/maphash
parentInitial commit. (diff)
downloadgolang-1.17-upstream.tar.xz
golang-1.17-upstream.zip
Adding upstream version 1.17.13.upstream/1.17.13upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/hash/maphash')
-rw-r--r--src/hash/maphash/example_test.go37
-rw-r--r--src/hash/maphash/maphash.go263
-rw-r--r--src/hash/maphash/maphash_test.go235
-rw-r--r--src/hash/maphash/smhasher_test.go469
4 files changed, 1004 insertions, 0 deletions
diff --git a/src/hash/maphash/example_test.go b/src/hash/maphash/example_test.go
new file mode 100644
index 0000000..78690fd
--- /dev/null
+++ b/src/hash/maphash/example_test.go
@@ -0,0 +1,37 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package maphash_test
+
+import (
+ "fmt"
+ "hash/maphash"
+)
+
+func Example() {
+ // The zero Hash value is valid and ready to use; setting an
+ // initial seed is not necessary.
+ var h maphash.Hash
+
+ // Add a string to the hash, and print the current hash value.
+ h.WriteString("hello, ")
+ fmt.Printf("%#x\n", h.Sum64())
+
+ // Append additional data (in the form of a byte array).
+ h.Write([]byte{'w', 'o', 'r', 'l', 'd'})
+ fmt.Printf("%#x\n", h.Sum64())
+
+ // Reset discards all data previously added to the Hash, without
+ // changing its seed.
+ h.Reset()
+
+ // Use SetSeed to create a new Hash h2 which will behave
+ // identically to h.
+ var h2 maphash.Hash
+ h2.SetSeed(h.Seed())
+
+ h.WriteString("same")
+ h2.WriteString("same")
+ fmt.Printf("%#x == %#x\n", h.Sum64(), h2.Sum64())
+}
diff --git a/src/hash/maphash/maphash.go b/src/hash/maphash/maphash.go
new file mode 100644
index 0000000..d022d74
--- /dev/null
+++ b/src/hash/maphash/maphash.go
@@ -0,0 +1,263 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package maphash provides hash functions on byte sequences.
+// These hash functions are intended to be used to implement hash tables or
+// other data structures that need to map arbitrary strings or byte
+// sequences to a uniform distribution on unsigned 64-bit integers.
+// Each different instance of a hash table or data structure should use its own Seed.
+//
+// The hash functions are not cryptographically secure.
+// (See crypto/sha256 and crypto/sha512 for cryptographic use.)
+//
+package maphash
+
+import (
+ "internal/unsafeheader"
+ "unsafe"
+)
+
+// A Seed is a random value that selects the specific hash function
+// computed by a Hash. If two Hashes use the same Seeds, they
+// will compute the same hash values for any given input.
+// If two Hashes use different Seeds, they are very likely to compute
+// distinct hash values for any given input.
+//
+// A Seed must be initialized by calling MakeSeed.
+// The zero seed is uninitialized and not valid for use with Hash's SetSeed method.
+//
+// Each Seed value is local to a single process and cannot be serialized
+// or otherwise recreated in a different process.
+type Seed struct {
+ s uint64
+}
+
+// A Hash computes a seeded hash of a byte sequence.
+//
+// The zero Hash is a valid Hash ready to use.
+// A zero Hash chooses a random seed for itself during
+// the first call to a Reset, Write, Seed, or Sum64 method.
+// For control over the seed, use SetSeed.
+//
+// The computed hash values depend only on the initial seed and
+// the sequence of bytes provided to the Hash object, not on the way
+// in which the bytes are provided. For example, the three sequences
+//
+// h.Write([]byte{'f','o','o'})
+// h.WriteByte('f'); h.WriteByte('o'); h.WriteByte('o')
+// h.WriteString("foo")
+//
+// all have the same effect.
+//
+// Hashes are intended to be collision-resistant, even for situations
+// where an adversary controls the byte sequences being hashed.
+//
+// A Hash is not safe for concurrent use by multiple goroutines, but a Seed is.
+// If multiple goroutines must compute the same seeded hash,
+// each can declare its own Hash and call SetSeed with a common Seed.
+type Hash struct {
+ _ [0]func() // not comparable
+ seed Seed // initial seed used for this hash
+ state Seed // current hash of all flushed bytes
+ buf [bufSize]byte // unflushed byte buffer
+ n int // number of unflushed bytes
+}
+
+// bufSize is the size of the Hash write buffer.
+// The buffer ensures that writes depend only on the sequence of bytes,
+// not the sequence of WriteByte/Write/WriteString calls,
+// by always calling rthash with a full buffer (except for the tail).
+const bufSize = 128
+
+// initSeed seeds the hash if necessary.
+// initSeed is called lazily before any operation that actually uses h.seed/h.state.
+// Note that this does not include Write/WriteByte/WriteString in the case
+// where they only add to h.buf. (If they write too much, they call h.flush,
+// which does call h.initSeed.)
+func (h *Hash) initSeed() {
+ if h.seed.s == 0 {
+ seed := MakeSeed()
+ h.seed = seed
+ h.state = seed
+ }
+}
+
+// WriteByte adds b to the sequence of bytes hashed by h.
+// It never fails; the error result is for implementing io.ByteWriter.
+func (h *Hash) WriteByte(b byte) error {
+ if h.n == len(h.buf) {
+ h.flush()
+ }
+ h.buf[h.n] = b
+ h.n++
+ return nil
+}
+
+// Write adds b to the sequence of bytes hashed by h.
+// It always writes all of b and never fails; the count and error result are for implementing io.Writer.
+func (h *Hash) Write(b []byte) (int, error) {
+ size := len(b)
+ // Deal with bytes left over in h.buf.
+ // h.n <= bufSize is always true.
+ // Checking it is ~free and it lets the compiler eliminate a bounds check.
+ if h.n > 0 && h.n <= bufSize {
+ k := copy(h.buf[h.n:], b)
+ h.n += k
+ if h.n < bufSize {
+ // Copied the entirety of b to h.buf.
+ return size, nil
+ }
+ b = b[k:]
+ h.flush()
+ // No need to set h.n = 0 here; it happens just before exit.
+ }
+ // Process as many full buffers as possible, without copying, and calling initSeed only once.
+ if len(b) > bufSize {
+ h.initSeed()
+ for len(b) > bufSize {
+ h.state.s = rthash(&b[0], bufSize, h.state.s)
+ b = b[bufSize:]
+ }
+ }
+ // Copy the tail.
+ copy(h.buf[:], b)
+ h.n = len(b)
+ return size, nil
+}
+
+// WriteString adds the bytes of s to the sequence of bytes hashed by h.
+// It always writes all of s and never fails; the count and error result are for implementing io.StringWriter.
+func (h *Hash) WriteString(s string) (int, error) {
+ // WriteString mirrors Write. See Write for comments.
+ size := len(s)
+ if h.n > 0 && h.n <= bufSize {
+ k := copy(h.buf[h.n:], s)
+ h.n += k
+ if h.n < bufSize {
+ return size, nil
+ }
+ s = s[k:]
+ h.flush()
+ }
+ if len(s) > bufSize {
+ h.initSeed()
+ for len(s) > bufSize {
+ ptr := (*byte)((*unsafeheader.String)(unsafe.Pointer(&s)).Data)
+ h.state.s = rthash(ptr, bufSize, h.state.s)
+ s = s[bufSize:]
+ }
+ }
+ copy(h.buf[:], s)
+ h.n = len(s)
+ return size, nil
+}
+
+// Seed returns h's seed value.
+func (h *Hash) Seed() Seed {
+ h.initSeed()
+ return h.seed
+}
+
+// SetSeed sets h to use seed, which must have been returned by MakeSeed
+// or by another Hash's Seed method.
+// Two Hash objects with the same seed behave identically.
+// Two Hash objects with different seeds will very likely behave differently.
+// Any bytes added to h before this call will be discarded.
+func (h *Hash) SetSeed(seed Seed) {
+ if seed.s == 0 {
+ panic("maphash: use of uninitialized Seed")
+ }
+ h.seed = seed
+ h.state = seed
+ h.n = 0
+}
+
+// Reset discards all bytes added to h.
+// (The seed remains the same.)
+func (h *Hash) Reset() {
+ h.initSeed()
+ h.state = h.seed
+ h.n = 0
+}
+
+// precondition: buffer is full.
+func (h *Hash) flush() {
+ if h.n != len(h.buf) {
+ panic("maphash: flush of partially full buffer")
+ }
+ h.initSeed()
+ h.state.s = rthash(&h.buf[0], h.n, h.state.s)
+ h.n = 0
+}
+
+// Sum64 returns h's current 64-bit value, which depends on
+// h's seed and the sequence of bytes added to h since the
+// last call to Reset or SetSeed.
+//
+// All bits of the Sum64 result are close to uniformly and
+// independently distributed, so it can be safely reduced
+// by using bit masking, shifting, or modular arithmetic.
+func (h *Hash) Sum64() uint64 {
+ h.initSeed()
+ return rthash(&h.buf[0], h.n, h.state.s)
+}
+
+// MakeSeed returns a new random seed.
+func MakeSeed() Seed {
+ var s1, s2 uint64
+ for {
+ s1 = uint64(runtime_fastrand())
+ s2 = uint64(runtime_fastrand())
+ // We use seed 0 to indicate an uninitialized seed/hash,
+ // so keep trying until we get a non-zero seed.
+ if s1|s2 != 0 {
+ break
+ }
+ }
+ return Seed{s: s1<<32 + s2}
+}
+
+//go:linkname runtime_fastrand runtime.fastrand
+func runtime_fastrand() uint32
+
+func rthash(ptr *byte, len int, seed uint64) uint64 {
+ if len == 0 {
+ return seed
+ }
+ // The runtime hasher only works on uintptr. For 64-bit
+ // architectures, we use the hasher directly. Otherwise,
+ // we use two parallel hashers on the lower and upper 32 bits.
+ if unsafe.Sizeof(uintptr(0)) == 8 {
+ return uint64(runtime_memhash(unsafe.Pointer(ptr), uintptr(seed), uintptr(len)))
+ }
+ lo := runtime_memhash(unsafe.Pointer(ptr), uintptr(seed), uintptr(len))
+ hi := runtime_memhash(unsafe.Pointer(ptr), uintptr(seed>>32), uintptr(len))
+ return uint64(hi)<<32 | uint64(lo)
+}
+
+//go:linkname runtime_memhash runtime.memhash
+//go:noescape
+func runtime_memhash(p unsafe.Pointer, seed, s uintptr) uintptr
+
+// Sum appends the hash's current 64-bit value to b.
+// It exists for implementing hash.Hash.
+// For direct calls, it is more efficient to use Sum64.
+func (h *Hash) Sum(b []byte) []byte {
+ x := h.Sum64()
+ return append(b,
+ byte(x>>0),
+ byte(x>>8),
+ byte(x>>16),
+ byte(x>>24),
+ byte(x>>32),
+ byte(x>>40),
+ byte(x>>48),
+ byte(x>>56))
+}
+
+// Size returns h's hash value size, 8 bytes.
+func (h *Hash) Size() int { return 8 }
+
+// BlockSize returns h's block size.
+func (h *Hash) BlockSize() int { return len(h.buf) }
diff --git a/src/hash/maphash/maphash_test.go b/src/hash/maphash/maphash_test.go
new file mode 100644
index 0000000..78cdfc0
--- /dev/null
+++ b/src/hash/maphash/maphash_test.go
@@ -0,0 +1,235 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package maphash
+
+import (
+ "bytes"
+ "hash"
+ "testing"
+)
+
+func TestUnseededHash(t *testing.T) {
+ m := map[uint64]struct{}{}
+ for i := 0; i < 1000; i++ {
+ h := new(Hash)
+ m[h.Sum64()] = struct{}{}
+ }
+ if len(m) < 900 {
+ t.Errorf("empty hash not sufficiently random: got %d, want 1000", len(m))
+ }
+}
+
+func TestSeededHash(t *testing.T) {
+ s := MakeSeed()
+ m := map[uint64]struct{}{}
+ for i := 0; i < 1000; i++ {
+ h := new(Hash)
+ h.SetSeed(s)
+ m[h.Sum64()] = struct{}{}
+ }
+ if len(m) != 1 {
+ t.Errorf("seeded hash is random: got %d, want 1", len(m))
+ }
+}
+
+func TestHashGrouping(t *testing.T) {
+ b := bytes.Repeat([]byte("foo"), 100)
+ hh := make([]*Hash, 7)
+ for i := range hh {
+ hh[i] = new(Hash)
+ }
+ for _, h := range hh[1:] {
+ h.SetSeed(hh[0].Seed())
+ }
+ hh[0].Write(b)
+ hh[1].WriteString(string(b))
+
+ writeByte := func(h *Hash, b byte) {
+ err := h.WriteByte(b)
+ if err != nil {
+ t.Fatalf("WriteByte: %v", err)
+ }
+ }
+ writeSingleByte := func(h *Hash, b byte) {
+ _, err := h.Write([]byte{b})
+ if err != nil {
+ t.Fatalf("Write single byte: %v", err)
+ }
+ }
+ writeStringSingleByte := func(h *Hash, b byte) {
+ _, err := h.WriteString(string([]byte{b}))
+ if err != nil {
+ t.Fatalf("WriteString single byte: %v", err)
+ }
+ }
+
+ for i, x := range b {
+ writeByte(hh[2], x)
+ writeSingleByte(hh[3], x)
+ if i == 0 {
+ writeByte(hh[4], x)
+ } else {
+ writeSingleByte(hh[4], x)
+ }
+ writeStringSingleByte(hh[5], x)
+ if i == 0 {
+ writeByte(hh[6], x)
+ } else {
+ writeStringSingleByte(hh[6], x)
+ }
+ }
+
+ sum := hh[0].Sum64()
+ for i, h := range hh {
+ if sum != h.Sum64() {
+ t.Errorf("hash %d not identical to a single Write", i)
+ }
+ }
+}
+
+func TestHashBytesVsString(t *testing.T) {
+ s := "foo"
+ b := []byte(s)
+ h1 := new(Hash)
+ h2 := new(Hash)
+ h2.SetSeed(h1.Seed())
+ n1, err1 := h1.WriteString(s)
+ if n1 != len(s) || err1 != nil {
+ t.Fatalf("WriteString(s) = %d, %v, want %d, nil", n1, err1, len(s))
+ }
+ n2, err2 := h2.Write(b)
+ if n2 != len(b) || err2 != nil {
+ t.Fatalf("Write(b) = %d, %v, want %d, nil", n2, err2, len(b))
+ }
+ if h1.Sum64() != h2.Sum64() {
+ t.Errorf("hash of string and bytes not identical")
+ }
+}
+
+func TestHashHighBytes(t *testing.T) {
+ // See issue 34925.
+ const N = 10
+ m := map[uint64]struct{}{}
+ for i := 0; i < N; i++ {
+ h := new(Hash)
+ h.WriteString("foo")
+ m[h.Sum64()>>32] = struct{}{}
+ }
+ if len(m) < N/2 {
+ t.Errorf("from %d seeds, wanted at least %d different hashes; got %d", N, N/2, len(m))
+ }
+}
+
+func TestRepeat(t *testing.T) {
+ h1 := new(Hash)
+ h1.WriteString("testing")
+ sum1 := h1.Sum64()
+
+ h1.Reset()
+ h1.WriteString("testing")
+ sum2 := h1.Sum64()
+
+ if sum1 != sum2 {
+ t.Errorf("different sum after reseting: %#x != %#x", sum1, sum2)
+ }
+
+ h2 := new(Hash)
+ h2.SetSeed(h1.Seed())
+ h2.WriteString("testing")
+ sum3 := h2.Sum64()
+
+ if sum1 != sum3 {
+ t.Errorf("different sum on the same seed: %#x != %#x", sum1, sum3)
+ }
+}
+
+func TestSeedFromSum64(t *testing.T) {
+ h1 := new(Hash)
+ h1.WriteString("foo")
+ x := h1.Sum64() // seed generated here
+ h2 := new(Hash)
+ h2.SetSeed(h1.Seed())
+ h2.WriteString("foo")
+ y := h2.Sum64()
+ if x != y {
+ t.Errorf("hashes don't match: want %x, got %x", x, y)
+ }
+}
+
+func TestSeedFromSeed(t *testing.T) {
+ h1 := new(Hash)
+ h1.WriteString("foo")
+ _ = h1.Seed() // seed generated here
+ x := h1.Sum64()
+ h2 := new(Hash)
+ h2.SetSeed(h1.Seed())
+ h2.WriteString("foo")
+ y := h2.Sum64()
+ if x != y {
+ t.Errorf("hashes don't match: want %x, got %x", x, y)
+ }
+}
+
+func TestSeedFromFlush(t *testing.T) {
+ b := make([]byte, 65)
+ h1 := new(Hash)
+ h1.Write(b) // seed generated here
+ x := h1.Sum64()
+ h2 := new(Hash)
+ h2.SetSeed(h1.Seed())
+ h2.Write(b)
+ y := h2.Sum64()
+ if x != y {
+ t.Errorf("hashes don't match: want %x, got %x", x, y)
+ }
+}
+
+func TestSeedFromReset(t *testing.T) {
+ h1 := new(Hash)
+ h1.WriteString("foo")
+ h1.Reset() // seed generated here
+ h1.WriteString("foo")
+ x := h1.Sum64()
+ h2 := new(Hash)
+ h2.SetSeed(h1.Seed())
+ h2.WriteString("foo")
+ y := h2.Sum64()
+ if x != y {
+ t.Errorf("hashes don't match: want %x, got %x", x, y)
+ }
+}
+
+// Make sure a Hash implements the hash.Hash and hash.Hash64 interfaces.
+var _ hash.Hash = &Hash{}
+var _ hash.Hash64 = &Hash{}
+
+func benchmarkSize(b *testing.B, size int) {
+ h := &Hash{}
+ buf := make([]byte, size)
+ b.SetBytes(int64(size))
+ b.ResetTimer()
+
+ for i := 0; i < b.N; i++ {
+ h.Reset()
+ h.Write(buf)
+ h.Sum64()
+ }
+}
+
+func BenchmarkHash8Bytes(b *testing.B) {
+ benchmarkSize(b, 8)
+}
+
+func BenchmarkHash320Bytes(b *testing.B) {
+ benchmarkSize(b, 320)
+}
+
+func BenchmarkHash1K(b *testing.B) {
+ benchmarkSize(b, 1024)
+}
+
+func BenchmarkHash8K(b *testing.B) {
+ benchmarkSize(b, 8192)
+}
diff --git a/src/hash/maphash/smhasher_test.go b/src/hash/maphash/smhasher_test.go
new file mode 100644
index 0000000..6e6f298
--- /dev/null
+++ b/src/hash/maphash/smhasher_test.go
@@ -0,0 +1,469 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package maphash
+
+import (
+ "fmt"
+ "math"
+ "math/rand"
+ "runtime"
+ "strings"
+ "testing"
+ "unsafe"
+)
+
+// Smhasher is a torture test for hash functions.
+// https://code.google.com/p/smhasher/
+// This code is a port of some of the Smhasher tests to Go.
+
+var fixedSeed = MakeSeed()
+
+// Sanity checks.
+// hash should not depend on values outside key.
+// hash should not depend on alignment.
+func TestSmhasherSanity(t *testing.T) {
+ r := rand.New(rand.NewSource(1234))
+ const REP = 10
+ const KEYMAX = 128
+ const PAD = 16
+ const OFFMAX = 16
+ for k := 0; k < REP; k++ {
+ for n := 0; n < KEYMAX; n++ {
+ for i := 0; i < OFFMAX; i++ {
+ var b [KEYMAX + OFFMAX + 2*PAD]byte
+ var c [KEYMAX + OFFMAX + 2*PAD]byte
+ randBytes(r, b[:])
+ randBytes(r, c[:])
+ copy(c[PAD+i:PAD+i+n], b[PAD:PAD+n])
+ if bytesHash(b[PAD:PAD+n]) != bytesHash(c[PAD+i:PAD+i+n]) {
+ t.Errorf("hash depends on bytes outside key")
+ }
+ }
+ }
+ }
+}
+
+func bytesHash(b []byte) uint64 {
+ var h Hash
+ h.SetSeed(fixedSeed)
+ h.Write(b)
+ return h.Sum64()
+}
+func stringHash(s string) uint64 {
+ var h Hash
+ h.SetSeed(fixedSeed)
+ h.WriteString(s)
+ return h.Sum64()
+}
+
+const hashSize = 64
+
+func randBytes(r *rand.Rand, b []byte) {
+ r.Read(b) // can't fail
+}
+
+// A hashSet measures the frequency of hash collisions.
+type hashSet struct {
+ m map[uint64]struct{} // set of hashes added
+ n int // number of hashes added
+}
+
+func newHashSet() *hashSet {
+ return &hashSet{make(map[uint64]struct{}), 0}
+}
+func (s *hashSet) add(h uint64) {
+ s.m[h] = struct{}{}
+ s.n++
+}
+func (s *hashSet) addS(x string) {
+ s.add(stringHash(x))
+}
+func (s *hashSet) addB(x []byte) {
+ s.add(bytesHash(x))
+}
+func (s *hashSet) addS_seed(x string, seed Seed) {
+ var h Hash
+ h.SetSeed(seed)
+ h.WriteString(x)
+ s.add(h.Sum64())
+}
+func (s *hashSet) check(t *testing.T) {
+ const SLOP = 10.0
+ collisions := s.n - len(s.m)
+ pairs := int64(s.n) * int64(s.n-1) / 2
+ expected := float64(pairs) / math.Pow(2.0, float64(hashSize))
+ stddev := math.Sqrt(expected)
+ if float64(collisions) > expected+SLOP*(3*stddev+1) {
+ t.Errorf("unexpected number of collisions: got=%d mean=%f stddev=%f", collisions, expected, stddev)
+ }
+}
+
+// a string plus adding zeros must make distinct hashes
+func TestSmhasherAppendedZeros(t *testing.T) {
+ s := "hello" + strings.Repeat("\x00", 256)
+ h := newHashSet()
+ for i := 0; i <= len(s); i++ {
+ h.addS(s[:i])
+ }
+ h.check(t)
+}
+
+// All 0-3 byte strings have distinct hashes.
+func TestSmhasherSmallKeys(t *testing.T) {
+ h := newHashSet()
+ var b [3]byte
+ for i := 0; i < 256; i++ {
+ b[0] = byte(i)
+ h.addB(b[:1])
+ for j := 0; j < 256; j++ {
+ b[1] = byte(j)
+ h.addB(b[:2])
+ if !testing.Short() {
+ for k := 0; k < 256; k++ {
+ b[2] = byte(k)
+ h.addB(b[:3])
+ }
+ }
+ }
+ }
+ h.check(t)
+}
+
+// Different length strings of all zeros have distinct hashes.
+func TestSmhasherZeros(t *testing.T) {
+ N := 256 * 1024
+ if testing.Short() {
+ N = 1024
+ }
+ h := newHashSet()
+ b := make([]byte, N)
+ for i := 0; i <= N; i++ {
+ h.addB(b[:i])
+ }
+ h.check(t)
+}
+
+// Strings with up to two nonzero bytes all have distinct hashes.
+func TestSmhasherTwoNonzero(t *testing.T) {
+ if runtime.GOARCH == "wasm" {
+ t.Skip("Too slow on wasm")
+ }
+ if testing.Short() {
+ t.Skip("Skipping in short mode")
+ }
+ h := newHashSet()
+ for n := 2; n <= 16; n++ {
+ twoNonZero(h, n)
+ }
+ h.check(t)
+}
+func twoNonZero(h *hashSet, n int) {
+ b := make([]byte, n)
+
+ // all zero
+ h.addB(b)
+
+ // one non-zero byte
+ for i := 0; i < n; i++ {
+ for x := 1; x < 256; x++ {
+ b[i] = byte(x)
+ h.addB(b)
+ b[i] = 0
+ }
+ }
+
+ // two non-zero bytes
+ for i := 0; i < n; i++ {
+ for x := 1; x < 256; x++ {
+ b[i] = byte(x)
+ for j := i + 1; j < n; j++ {
+ for y := 1; y < 256; y++ {
+ b[j] = byte(y)
+ h.addB(b)
+ b[j] = 0
+ }
+ }
+ b[i] = 0
+ }
+ }
+}
+
+// Test strings with repeats, like "abcdabcdabcdabcd..."
+func TestSmhasherCyclic(t *testing.T) {
+ if testing.Short() {
+ t.Skip("Skipping in short mode")
+ }
+ r := rand.New(rand.NewSource(1234))
+ const REPEAT = 8
+ const N = 1000000
+ for n := 4; n <= 12; n++ {
+ h := newHashSet()
+ b := make([]byte, REPEAT*n)
+ for i := 0; i < N; i++ {
+ b[0] = byte(i * 79 % 97)
+ b[1] = byte(i * 43 % 137)
+ b[2] = byte(i * 151 % 197)
+ b[3] = byte(i * 199 % 251)
+ randBytes(r, b[4:n])
+ for j := n; j < n*REPEAT; j++ {
+ b[j] = b[j-n]
+ }
+ h.addB(b)
+ }
+ h.check(t)
+ }
+}
+
+// Test strings with only a few bits set
+func TestSmhasherSparse(t *testing.T) {
+ if runtime.GOARCH == "wasm" {
+ t.Skip("Too slow on wasm")
+ }
+ if testing.Short() {
+ t.Skip("Skipping in short mode")
+ }
+ sparse(t, 32, 6)
+ sparse(t, 40, 6)
+ sparse(t, 48, 5)
+ sparse(t, 56, 5)
+ sparse(t, 64, 5)
+ sparse(t, 96, 4)
+ sparse(t, 256, 3)
+ sparse(t, 2048, 2)
+}
+func sparse(t *testing.T, n int, k int) {
+ b := make([]byte, n/8)
+ h := newHashSet()
+ setbits(h, b, 0, k)
+ h.check(t)
+}
+
+// set up to k bits at index i and greater
+func setbits(h *hashSet, b []byte, i int, k int) {
+ h.addB(b)
+ if k == 0 {
+ return
+ }
+ for j := i; j < len(b)*8; j++ {
+ b[j/8] |= byte(1 << uint(j&7))
+ setbits(h, b, j+1, k-1)
+ b[j/8] &= byte(^(1 << uint(j&7)))
+ }
+}
+
+// Test all possible combinations of n blocks from the set s.
+// "permutation" is a bad name here, but it is what Smhasher uses.
+func TestSmhasherPermutation(t *testing.T) {
+ if runtime.GOARCH == "wasm" {
+ t.Skip("Too slow on wasm")
+ }
+ if testing.Short() {
+ t.Skip("Skipping in short mode")
+ }
+ permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7}, 8)
+ permutation(t, []uint32{0, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 8)
+ permutation(t, []uint32{0, 1}, 20)
+ permutation(t, []uint32{0, 1 << 31}, 20)
+ permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 6)
+}
+func permutation(t *testing.T, s []uint32, n int) {
+ b := make([]byte, n*4)
+ h := newHashSet()
+ genPerm(h, b, s, 0)
+ h.check(t)
+}
+func genPerm(h *hashSet, b []byte, s []uint32, n int) {
+ h.addB(b[:n])
+ if n == len(b) {
+ return
+ }
+ for _, v := range s {
+ b[n] = byte(v)
+ b[n+1] = byte(v >> 8)
+ b[n+2] = byte(v >> 16)
+ b[n+3] = byte(v >> 24)
+ genPerm(h, b, s, n+4)
+ }
+}
+
+type key interface {
+ clear() // set bits all to 0
+ random(r *rand.Rand) // set key to something random
+ bits() int // how many bits key has
+ flipBit(i int) // flip bit i of the key
+ hash() uint64 // hash the key
+ name() string // for error reporting
+}
+
+type bytesKey struct {
+ b []byte
+}
+
+func (k *bytesKey) clear() {
+ for i := range k.b {
+ k.b[i] = 0
+ }
+}
+func (k *bytesKey) random(r *rand.Rand) {
+ randBytes(r, k.b)
+}
+func (k *bytesKey) bits() int {
+ return len(k.b) * 8
+}
+func (k *bytesKey) flipBit(i int) {
+ k.b[i>>3] ^= byte(1 << uint(i&7))
+}
+func (k *bytesKey) hash() uint64 {
+ return bytesHash(k.b)
+}
+func (k *bytesKey) name() string {
+ return fmt.Sprintf("bytes%d", len(k.b))
+}
+
+// Flipping a single bit of a key should flip each output bit with 50% probability.
+func TestSmhasherAvalanche(t *testing.T) {
+ if runtime.GOARCH == "wasm" {
+ t.Skip("Too slow on wasm")
+ }
+ if testing.Short() {
+ t.Skip("Skipping in short mode")
+ }
+ avalancheTest1(t, &bytesKey{make([]byte, 2)})
+ avalancheTest1(t, &bytesKey{make([]byte, 4)})
+ avalancheTest1(t, &bytesKey{make([]byte, 8)})
+ avalancheTest1(t, &bytesKey{make([]byte, 16)})
+ avalancheTest1(t, &bytesKey{make([]byte, 32)})
+ avalancheTest1(t, &bytesKey{make([]byte, 200)})
+}
+func avalancheTest1(t *testing.T, k key) {
+ const REP = 100000
+ r := rand.New(rand.NewSource(1234))
+ n := k.bits()
+
+ // grid[i][j] is a count of whether flipping
+ // input bit i affects output bit j.
+ grid := make([][hashSize]int, n)
+
+ for z := 0; z < REP; z++ {
+ // pick a random key, hash it
+ k.random(r)
+ h := k.hash()
+
+ // flip each bit, hash & compare the results
+ for i := 0; i < n; i++ {
+ k.flipBit(i)
+ d := h ^ k.hash()
+ k.flipBit(i)
+
+ // record the effects of that bit flip
+ g := &grid[i]
+ for j := 0; j < hashSize; j++ {
+ g[j] += int(d & 1)
+ d >>= 1
+ }
+ }
+ }
+
+ // Each entry in the grid should be about REP/2.
+ // More precisely, we did N = k.bits() * hashSize experiments where
+ // each is the sum of REP coin flips. We want to find bounds on the
+ // sum of coin flips such that a truly random experiment would have
+ // all sums inside those bounds with 99% probability.
+ N := n * hashSize
+ var c float64
+ // find c such that Prob(mean-c*stddev < x < mean+c*stddev)^N > .9999
+ for c = 0.0; math.Pow(math.Erf(c/math.Sqrt(2)), float64(N)) < .9999; c += .1 {
+ }
+ c *= 4.0 // allowed slack - we don't need to be perfectly random
+ mean := .5 * REP
+ stddev := .5 * math.Sqrt(REP)
+ low := int(mean - c*stddev)
+ high := int(mean + c*stddev)
+ for i := 0; i < n; i++ {
+ for j := 0; j < hashSize; j++ {
+ x := grid[i][j]
+ if x < low || x > high {
+ t.Errorf("bad bias for %s bit %d -> bit %d: %d/%d\n", k.name(), i, j, x, REP)
+ }
+ }
+ }
+}
+
+// All bit rotations of a set of distinct keys
+func TestSmhasherWindowed(t *testing.T) {
+ windowed(t, &bytesKey{make([]byte, 128)})
+}
+func windowed(t *testing.T, k key) {
+ if runtime.GOARCH == "wasm" {
+ t.Skip("Too slow on wasm")
+ }
+ if testing.Short() {
+ t.Skip("Skipping in short mode")
+ }
+ const BITS = 16
+
+ for r := 0; r < k.bits(); r++ {
+ h := newHashSet()
+ for i := 0; i < 1<<BITS; i++ {
+ k.clear()
+ for j := 0; j < BITS; j++ {
+ if i>>uint(j)&1 != 0 {
+ k.flipBit((j + r) % k.bits())
+ }
+ }
+ h.add(k.hash())
+ }
+ h.check(t)
+ }
+}
+
+// All keys of the form prefix + [A-Za-z0-9]*N + suffix.
+func TestSmhasherText(t *testing.T) {
+ if testing.Short() {
+ t.Skip("Skipping in short mode")
+ }
+ text(t, "Foo", "Bar")
+ text(t, "FooBar", "")
+ text(t, "", "FooBar")
+}
+func text(t *testing.T, prefix, suffix string) {
+ const N = 4
+ const S = "ABCDEFGHIJKLMNOPQRSTabcdefghijklmnopqrst0123456789"
+ const L = len(S)
+ b := make([]byte, len(prefix)+N+len(suffix))
+ copy(b, prefix)
+ copy(b[len(prefix)+N:], suffix)
+ h := newHashSet()
+ c := b[len(prefix):]
+ for i := 0; i < L; i++ {
+ c[0] = S[i]
+ for j := 0; j < L; j++ {
+ c[1] = S[j]
+ for k := 0; k < L; k++ {
+ c[2] = S[k]
+ for x := 0; x < L; x++ {
+ c[3] = S[x]
+ h.addB(b)
+ }
+ }
+ }
+ }
+ h.check(t)
+}
+
+// Make sure different seed values generate different hashes.
+func TestSmhasherSeed(t *testing.T) {
+ if unsafe.Sizeof(uintptr(0)) == 4 {
+ t.Skip("32-bit platforms don't have ideal seed-input distributions (see issue 33988)")
+ }
+ h := newHashSet()
+ const N = 100000
+ s := "hello"
+ for i := 0; i < N; i++ {
+ h.addS_seed(s, Seed{s: uint64(i + 1)})
+ h.addS_seed(s, Seed{s: uint64(i+1) << 32}) // make sure high bits are used
+ }
+ h.check(t)
+}