summaryrefslogtreecommitdiffstats
path: root/src/arrow/go/parquet/internal/hashing
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
commite6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/go/parquet/internal/hashing
parentInitial commit. (diff)
downloadceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/go/parquet/internal/hashing')
-rw-r--r--src/arrow/go/parquet/internal/hashing/hashing_test.go114
-rw-r--r--src/arrow/go/parquet/internal/hashing/types.tmpldata18
-rw-r--r--src/arrow/go/parquet/internal/hashing/xxh3_memo_table.gen.go1103
-rw-r--r--src/arrow/go/parquet/internal/hashing/xxh3_memo_table.gen.go.tmpl327
-rw-r--r--src/arrow/go/parquet/internal/hashing/xxh3_memo_table.go400
5 files changed, 1962 insertions, 0 deletions
diff --git a/src/arrow/go/parquet/internal/hashing/hashing_test.go b/src/arrow/go/parquet/internal/hashing/hashing_test.go
new file mode 100644
index 000000000..875424a9d
--- /dev/null
+++ b/src/arrow/go/parquet/internal/hashing/hashing_test.go
@@ -0,0 +1,114 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hashing
+
+import (
+ "math/rand"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+)
+
+func MakeDistinctIntegers(nvals int) map[int]bool {
+ r := rand.New(rand.NewSource(42))
+ values := make(map[int]bool)
+ for len(values) < nvals {
+ values[r.Int()] = true
+ }
+ return values
+}
+
+func MakeSequentialIntegers(nvals int) map[int]bool {
+ values := make(map[int]bool)
+ for i := 0; i < nvals; i++ {
+ values[i] = true
+ }
+ return values
+}
+
+func MakeDistinctStrings(nvals int) map[string]bool {
+ values := make(map[string]bool)
+
+ r := rand.New(rand.NewSource(42))
+
+ max := 'z'
+ min := '0'
+ for len(values) < nvals {
+ data := make([]byte, r.Intn(24))
+ for idx := range data {
+ data[idx] = byte(r.Intn(int(max-min+1)) + int(min))
+ }
+ values[string(data)] = true
+ }
+ return values
+}
+
+func TestHashingQualityInt(t *testing.T) {
+ const nvalues = 10000
+
+ tests := []struct {
+ name string
+ values map[int]bool
+ quality float64
+ }{
+ {"distinct", MakeDistinctIntegers(nvalues), 0.96},
+ {"sequential", MakeSequentialIntegers(nvalues), 0.96},
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ hashes := make(map[uint64]bool)
+ for k := range tt.values {
+ hashes[hashInt(uint64(k), 0)] = true
+ hashes[hashInt(uint64(k), 1)] = true
+ }
+ assert.GreaterOrEqual(t, float64(len(hashes)), tt.quality*float64(2*len(tt.values)))
+ })
+ }
+}
+
+func TestHashingBoundsStrings(t *testing.T) {
+ sizes := []int{1, 2, 3, 4, 5, 7, 8, 9, 15, 16, 17, 18, 19, 20, 21}
+ for _, s := range sizes {
+ str := make([]byte, s)
+ for idx := range str {
+ str[idx] = uint8(idx)
+ }
+
+ h := hash(str, 1)
+ diff := 0
+ for i := 0; i < 120; i++ {
+ str[len(str)-1] = uint8(i)
+ if hash(str, 1) != h {
+ diff++
+ }
+ }
+ assert.GreaterOrEqual(t, diff, 118)
+ }
+}
+
+func TestHashingQualityString(t *testing.T) {
+ const nvalues = 10000
+ values := MakeDistinctStrings(nvalues)
+
+ hashes := make(map[uint64]bool)
+ for k := range values {
+ hashes[hashString(k, 0)] = true
+ hashes[hashString(k, 1)] = true
+ }
+ assert.GreaterOrEqual(t, float64(len(hashes)), 0.96*float64(2*len(values)))
+}
diff --git a/src/arrow/go/parquet/internal/hashing/types.tmpldata b/src/arrow/go/parquet/internal/hashing/types.tmpldata
new file mode 100644
index 000000000..2e97e9814
--- /dev/null
+++ b/src/arrow/go/parquet/internal/hashing/types.tmpldata
@@ -0,0 +1,18 @@
+[
+ {
+ "Name": "Int32",
+ "name": "int32"
+ },
+ {
+ "Name": "Int64",
+ "name": "int64"
+ },
+ {
+ "Name": "Float32",
+ "name": "float32"
+ },
+ {
+ "Name": "Float64",
+ "name": "float64"
+ }
+]
diff --git a/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.gen.go b/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.gen.go
new file mode 100644
index 000000000..ca891dc33
--- /dev/null
+++ b/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.gen.go
@@ -0,0 +1,1103 @@
+// Code generated by xxh3_memo_table.gen.go.tmpl. DO NOT EDIT.
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hashing
+
+import (
+ "math"
+
+ "github.com/apache/arrow/go/v6/arrow"
+ "github.com/apache/arrow/go/v6/arrow/bitutil"
+ "github.com/apache/arrow/go/v6/parquet/internal/utils"
+)
+
+type payloadInt32 struct {
+ val int32
+ memoIdx int32
+}
+
+type entryInt32 struct {
+ h uint64
+ payload payloadInt32
+}
+
+func (e entryInt32) Valid() bool { return e.h != sentinel }
+
+// Int32HashTable is a hashtable specifically for int32 that
+// is utilized with the MemoTable to generalize interactions for easier
+// implementation of dictionaries without losing performance.
+type Int32HashTable struct {
+ cap uint64
+ capMask uint64
+ size uint64
+
+ entries []entryInt32
+}
+
+// NewInt32HashTable returns a new hash table for int32 values
+// initialized with the passed in capacity or 32 whichever is larger.
+func NewInt32HashTable(cap uint64) *Int32HashTable {
+ initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+ ret := &Int32HashTable{cap: initCap, capMask: initCap - 1, size: 0}
+ ret.entries = make([]entryInt32, initCap)
+ return ret
+}
+
+// Reset drops all of the values in this hash table and re-initializes it
+// with the specified initial capacity as if by calling New, but without having
+// to reallocate the object.
+func (h *Int32HashTable) Reset(cap uint64) {
+ h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+ h.capMask = h.cap - 1
+ h.size = 0
+ h.entries = make([]entryInt32, h.cap)
+}
+
+// CopyValues is used for copying the values out of the hash table into the
+// passed in slice, in the order that they were first inserted
+func (h *Int32HashTable) CopyValues(out []int32) {
+ h.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset copies a subset of the values in the hashtable out, starting
+// with the value at start, in the order that they were inserted.
+func (h *Int32HashTable) CopyValuesSubset(start int, out []int32) {
+ h.VisitEntries(func(e *entryInt32) {
+ idx := e.payload.memoIdx - int32(start)
+ if idx >= 0 {
+ out[idx] = e.payload.val
+ }
+ })
+}
+
+func (h *Int32HashTable) WriteOut(out []byte) {
+ h.WriteOutSubset(0, out)
+}
+
+func (h *Int32HashTable) WriteOutSubset(start int, out []byte) {
+ data := arrow.Int32Traits.CastFromBytes(out)
+ h.VisitEntries(func(e *entryInt32) {
+ idx := e.payload.memoIdx - int32(start)
+ if idx >= 0 {
+ data[idx] = utils.ToLEInt32(e.payload.val)
+ }
+ })
+}
+
+func (h *Int32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
+
+func (Int32HashTable) fixHash(v uint64) uint64 {
+ if v == sentinel {
+ return 42
+ }
+ return v
+}
+
+// Lookup retrieves the entry for a given hash value assuming its payload value returns
+// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
+// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
+func (h *Int32HashTable) Lookup(v uint64, cmp func(int32) bool) (*entryInt32, bool) {
+ idx, ok := h.lookup(v, h.capMask, cmp)
+ return &h.entries[idx], ok
+}
+
+func (h *Int32HashTable) lookup(v uint64, szMask uint64, cmp func(int32) bool) (uint64, bool) {
+ const perturbShift uint8 = 5
+
+ var (
+ idx uint64
+ perturb uint64
+ e *entryInt32
+ )
+
+ v = h.fixHash(v)
+ idx = v & szMask
+ perturb = (v >> uint64(perturbShift)) + 1
+
+ for {
+ e = &h.entries[idx]
+ if e.h == v && cmp(e.payload.val) {
+ return idx, true
+ }
+
+ if e.h == sentinel {
+ return idx, false
+ }
+
+ // perturbation logic inspired from CPython's set/dict object
+ // the goal is that all 64 bits of unmasked hash value eventually
+ // participate in the probing sequence, to minimize clustering
+ idx = (idx + perturb) & szMask
+ perturb = (perturb >> uint64(perturbShift)) + 1
+ }
+}
+
+func (h *Int32HashTable) upsize(newcap uint64) error {
+ newMask := newcap - 1
+
+ oldEntries := h.entries
+ h.entries = make([]entryInt32, newcap)
+ for _, e := range oldEntries {
+ if e.Valid() {
+ idx, _ := h.lookup(e.h, newMask, func(int32) bool { return false })
+ h.entries[idx] = e
+ }
+ }
+ h.cap = newcap
+ h.capMask = newMask
+ return nil
+}
+
+// Insert updates the given entry with the provided hash value, payload value and memo index.
+// The entry pointer must have been retrieved via lookup in order to actually insert properly.
+func (h *Int32HashTable) Insert(e *entryInt32, v uint64, val int32, memoIdx int32) error {
+ e.h = h.fixHash(v)
+ e.payload.val = val
+ e.payload.memoIdx = memoIdx
+ h.size++
+
+ if h.needUpsize() {
+ h.upsize(h.cap * uint64(loadFactor) * 2)
+ }
+ return nil
+}
+
+// VisitEntries will call the passed in function on each *valid* entry in the hash table,
+// a valid entry being one which has had a value inserted into it.
+func (h *Int32HashTable) VisitEntries(visit func(*entryInt32)) {
+ for _, e := range h.entries {
+ if e.Valid() {
+ visit(&e)
+ }
+ }
+}
+
+// Int32MemoTable is a wrapper over the appropriate hashtable to provide an interface
+// conforming to the MemoTable interface defined in the encoding package for general interactions
+// regarding dictionaries.
+type Int32MemoTable struct {
+ tbl *Int32HashTable
+ nullIdx int32
+}
+
+// NewInt32MemoTable returns a new memotable with num entries pre-allocated to reduce further
+// allocations when inserting.
+func NewInt32MemoTable(num int64) *Int32MemoTable {
+ return &Int32MemoTable{tbl: NewInt32HashTable(uint64(num)), nullIdx: KeyNotFound}
+}
+
+// Reset allows this table to be re-used by dumping all the data currently in the table.
+func (s *Int32MemoTable) Reset() {
+ s.tbl.Reset(32)
+ s.nullIdx = KeyNotFound
+}
+
+// Size returns the current number of inserted elements into the table including if a null
+// has been inserted.
+func (s *Int32MemoTable) Size() int {
+ sz := int(s.tbl.size)
+ if _, ok := s.GetNull(); ok {
+ sz++
+ }
+ return sz
+}
+
+// GetNull returns the index of an inserted null or KeyNotFound along with a bool
+// that will be true if found and false if not.
+func (s *Int32MemoTable) GetNull() (int, bool) {
+ return int(s.nullIdx), s.nullIdx != KeyNotFound
+}
+
+// GetOrInsertNull will return the index of the null entry or insert a null entry
+// if one currently doesn't exist. The found value will be true if there was already
+// a null in the table, and false if it inserted one.
+func (s *Int32MemoTable) GetOrInsertNull() (idx int, found bool) {
+ idx, found = s.GetNull()
+ if !found {
+ idx = s.Size()
+ s.nullIdx = int32(idx)
+ }
+ return
+}
+
+// CopyValues will copy the values from the memo table out into the passed in slice
+// which must be of the appropriate type.
+func (s *Int32MemoTable) CopyValues(out interface{}) {
+ s.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset is like CopyValues but only copies a subset of values starting
+// at the provided start index
+func (s *Int32MemoTable) CopyValuesSubset(start int, out interface{}) {
+ s.tbl.CopyValuesSubset(start, out.([]int32))
+}
+
+func (s *Int32MemoTable) WriteOut(out []byte) {
+ s.tbl.WriteOut(out)
+}
+
+func (s *Int32MemoTable) WriteOutSubset(start int, out []byte) {
+ s.tbl.WriteOutSubset(start, out)
+}
+
+// Get returns the index of the requested value in the hash table or KeyNotFound
+// along with a boolean indicating if it was found or not.
+func (s *Int32MemoTable) Get(val interface{}) (int, bool) {
+
+ h := hashInt(uint64(val.(int32)), 0)
+ if e, ok := s.tbl.Lookup(h, func(v int32) bool { return val.(int32) == v }); ok {
+ return int(e.payload.memoIdx), ok
+ }
+ return KeyNotFound, false
+}
+
+// GetOrInsert will return the index of the specified value in the table, or insert the
+// value into the table and return the new index. found indicates whether or not it already
+// existed in the table (true) or was inserted by this call (false).
+func (s *Int32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
+
+ h := hashInt(uint64(val.(int32)), 0)
+ e, ok := s.tbl.Lookup(h, func(v int32) bool {
+ return val.(int32) == v
+ })
+
+ if ok {
+ idx = int(e.payload.memoIdx)
+ found = true
+ } else {
+ idx = s.Size()
+ s.tbl.Insert(e, h, val.(int32), int32(idx))
+ }
+ return
+}
+
+type payloadInt64 struct {
+ val int64
+ memoIdx int32
+}
+
+type entryInt64 struct {
+ h uint64
+ payload payloadInt64
+}
+
+func (e entryInt64) Valid() bool { return e.h != sentinel }
+
+// Int64HashTable is a hashtable specifically for int64 that
+// is utilized with the MemoTable to generalize interactions for easier
+// implementation of dictionaries without losing performance.
+type Int64HashTable struct {
+ cap uint64
+ capMask uint64
+ size uint64
+
+ entries []entryInt64
+}
+
+// NewInt64HashTable returns a new hash table for int64 values
+// initialized with the passed in capacity or 32 whichever is larger.
+func NewInt64HashTable(cap uint64) *Int64HashTable {
+ initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+ ret := &Int64HashTable{cap: initCap, capMask: initCap - 1, size: 0}
+ ret.entries = make([]entryInt64, initCap)
+ return ret
+}
+
+// Reset drops all of the values in this hash table and re-initializes it
+// with the specified initial capacity as if by calling New, but without having
+// to reallocate the object.
+func (h *Int64HashTable) Reset(cap uint64) {
+ h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+ h.capMask = h.cap - 1
+ h.size = 0
+ h.entries = make([]entryInt64, h.cap)
+}
+
+// CopyValues is used for copying the values out of the hash table into the
+// passed in slice, in the order that they were first inserted
+func (h *Int64HashTable) CopyValues(out []int64) {
+ h.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset copies a subset of the values in the hashtable out, starting
+// with the value at start, in the order that they were inserted.
+func (h *Int64HashTable) CopyValuesSubset(start int, out []int64) {
+ h.VisitEntries(func(e *entryInt64) {
+ idx := e.payload.memoIdx - int32(start)
+ if idx >= 0 {
+ out[idx] = e.payload.val
+ }
+ })
+}
+
+func (h *Int64HashTable) WriteOut(out []byte) {
+ h.WriteOutSubset(0, out)
+}
+
+func (h *Int64HashTable) WriteOutSubset(start int, out []byte) {
+ data := arrow.Int64Traits.CastFromBytes(out)
+ h.VisitEntries(func(e *entryInt64) {
+ idx := e.payload.memoIdx - int32(start)
+ if idx >= 0 {
+ data[idx] = utils.ToLEInt64(e.payload.val)
+ }
+ })
+}
+
+func (h *Int64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
+
+func (Int64HashTable) fixHash(v uint64) uint64 {
+ if v == sentinel {
+ return 42
+ }
+ return v
+}
+
+// Lookup retrieves the entry for a given hash value assuming its payload value returns
+// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
+// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
+func (h *Int64HashTable) Lookup(v uint64, cmp func(int64) bool) (*entryInt64, bool) {
+ idx, ok := h.lookup(v, h.capMask, cmp)
+ return &h.entries[idx], ok
+}
+
+func (h *Int64HashTable) lookup(v uint64, szMask uint64, cmp func(int64) bool) (uint64, bool) {
+ const perturbShift uint8 = 5
+
+ var (
+ idx uint64
+ perturb uint64
+ e *entryInt64
+ )
+
+ v = h.fixHash(v)
+ idx = v & szMask
+ perturb = (v >> uint64(perturbShift)) + 1
+
+ for {
+ e = &h.entries[idx]
+ if e.h == v && cmp(e.payload.val) {
+ return idx, true
+ }
+
+ if e.h == sentinel {
+ return idx, false
+ }
+
+ // perturbation logic inspired from CPython's set/dict object
+ // the goal is that all 64 bits of unmasked hash value eventually
+ // participate in the probing sequence, to minimize clustering
+ idx = (idx + perturb) & szMask
+ perturb = (perturb >> uint64(perturbShift)) + 1
+ }
+}
+
+func (h *Int64HashTable) upsize(newcap uint64) error {
+ newMask := newcap - 1
+
+ oldEntries := h.entries
+ h.entries = make([]entryInt64, newcap)
+ for _, e := range oldEntries {
+ if e.Valid() {
+ idx, _ := h.lookup(e.h, newMask, func(int64) bool { return false })
+ h.entries[idx] = e
+ }
+ }
+ h.cap = newcap
+ h.capMask = newMask
+ return nil
+}
+
+// Insert updates the given entry with the provided hash value, payload value and memo index.
+// The entry pointer must have been retrieved via lookup in order to actually insert properly.
+func (h *Int64HashTable) Insert(e *entryInt64, v uint64, val int64, memoIdx int32) error {
+ e.h = h.fixHash(v)
+ e.payload.val = val
+ e.payload.memoIdx = memoIdx
+ h.size++
+
+ if h.needUpsize() {
+ h.upsize(h.cap * uint64(loadFactor) * 2)
+ }
+ return nil
+}
+
+// VisitEntries will call the passed in function on each *valid* entry in the hash table,
+// a valid entry being one which has had a value inserted into it.
+func (h *Int64HashTable) VisitEntries(visit func(*entryInt64)) {
+ for _, e := range h.entries {
+ if e.Valid() {
+ visit(&e)
+ }
+ }
+}
+
+// Int64MemoTable is a wrapper over the appropriate hashtable to provide an interface
+// conforming to the MemoTable interface defined in the encoding package for general interactions
+// regarding dictionaries.
+type Int64MemoTable struct {
+ tbl *Int64HashTable
+ nullIdx int32
+}
+
+// NewInt64MemoTable returns a new memotable with num entries pre-allocated to reduce further
+// allocations when inserting.
+func NewInt64MemoTable(num int64) *Int64MemoTable {
+ return &Int64MemoTable{tbl: NewInt64HashTable(uint64(num)), nullIdx: KeyNotFound}
+}
+
+// Reset allows this table to be re-used by dumping all the data currently in the table.
+func (s *Int64MemoTable) Reset() {
+ s.tbl.Reset(32)
+ s.nullIdx = KeyNotFound
+}
+
+// Size returns the current number of inserted elements into the table including if a null
+// has been inserted.
+func (s *Int64MemoTable) Size() int {
+ sz := int(s.tbl.size)
+ if _, ok := s.GetNull(); ok {
+ sz++
+ }
+ return sz
+}
+
+// GetNull returns the index of an inserted null or KeyNotFound along with a bool
+// that will be true if found and false if not.
+func (s *Int64MemoTable) GetNull() (int, bool) {
+ return int(s.nullIdx), s.nullIdx != KeyNotFound
+}
+
+// GetOrInsertNull will return the index of the null entry or insert a null entry
+// if one currently doesn't exist. The found value will be true if there was already
+// a null in the table, and false if it inserted one.
+func (s *Int64MemoTable) GetOrInsertNull() (idx int, found bool) {
+ idx, found = s.GetNull()
+ if !found {
+ idx = s.Size()
+ s.nullIdx = int32(idx)
+ }
+ return
+}
+
+// CopyValues will copy the values from the memo table out into the passed in slice
+// which must be of the appropriate type.
+func (s *Int64MemoTable) CopyValues(out interface{}) {
+ s.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset is like CopyValues but only copies a subset of values starting
+// at the provided start index
+func (s *Int64MemoTable) CopyValuesSubset(start int, out interface{}) {
+ s.tbl.CopyValuesSubset(start, out.([]int64))
+}
+
+func (s *Int64MemoTable) WriteOut(out []byte) {
+ s.tbl.WriteOut(out)
+}
+
+func (s *Int64MemoTable) WriteOutSubset(start int, out []byte) {
+ s.tbl.WriteOutSubset(start, out)
+}
+
+// Get returns the index of the requested value in the hash table or KeyNotFound
+// along with a boolean indicating if it was found or not.
+func (s *Int64MemoTable) Get(val interface{}) (int, bool) {
+
+ h := hashInt(uint64(val.(int64)), 0)
+ if e, ok := s.tbl.Lookup(h, func(v int64) bool { return val.(int64) == v }); ok {
+ return int(e.payload.memoIdx), ok
+ }
+ return KeyNotFound, false
+}
+
+// GetOrInsert will return the index of the specified value in the table, or insert the
+// value into the table and return the new index. found indicates whether or not it already
+// existed in the table (true) or was inserted by this call (false).
+func (s *Int64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
+
+ h := hashInt(uint64(val.(int64)), 0)
+ e, ok := s.tbl.Lookup(h, func(v int64) bool {
+ return val.(int64) == v
+ })
+
+ if ok {
+ idx = int(e.payload.memoIdx)
+ found = true
+ } else {
+ idx = s.Size()
+ s.tbl.Insert(e, h, val.(int64), int32(idx))
+ }
+ return
+}
+
+type payloadFloat32 struct {
+ val float32
+ memoIdx int32
+}
+
+type entryFloat32 struct {
+ h uint64
+ payload payloadFloat32
+}
+
+func (e entryFloat32) Valid() bool { return e.h != sentinel }
+
+// Float32HashTable is a hashtable specifically for float32 that
+// is utilized with the MemoTable to generalize interactions for easier
+// implementation of dictionaries without losing performance.
+type Float32HashTable struct {
+ cap uint64
+ capMask uint64
+ size uint64
+
+ entries []entryFloat32
+}
+
+// NewFloat32HashTable returns a new hash table for float32 values
+// initialized with the passed in capacity or 32 whichever is larger.
+func NewFloat32HashTable(cap uint64) *Float32HashTable {
+ initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+ ret := &Float32HashTable{cap: initCap, capMask: initCap - 1, size: 0}
+ ret.entries = make([]entryFloat32, initCap)
+ return ret
+}
+
+// Reset drops all of the values in this hash table and re-initializes it
+// with the specified initial capacity as if by calling New, but without having
+// to reallocate the object.
+func (h *Float32HashTable) Reset(cap uint64) {
+ h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+ h.capMask = h.cap - 1
+ h.size = 0
+ h.entries = make([]entryFloat32, h.cap)
+}
+
+// CopyValues is used for copying the values out of the hash table into the
+// passed in slice, in the order that they were first inserted
+func (h *Float32HashTable) CopyValues(out []float32) {
+ h.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset copies a subset of the values in the hashtable out, starting
+// with the value at start, in the order that they were inserted.
+func (h *Float32HashTable) CopyValuesSubset(start int, out []float32) {
+ h.VisitEntries(func(e *entryFloat32) {
+ idx := e.payload.memoIdx - int32(start)
+ if idx >= 0 {
+ out[idx] = e.payload.val
+ }
+ })
+}
+
+func (h *Float32HashTable) WriteOut(out []byte) {
+ h.WriteOutSubset(0, out)
+}
+
+func (h *Float32HashTable) WriteOutSubset(start int, out []byte) {
+ data := arrow.Float32Traits.CastFromBytes(out)
+ h.VisitEntries(func(e *entryFloat32) {
+ idx := e.payload.memoIdx - int32(start)
+ if idx >= 0 {
+ data[idx] = utils.ToLEFloat32(e.payload.val)
+ }
+ })
+}
+
+func (h *Float32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
+
+func (Float32HashTable) fixHash(v uint64) uint64 {
+ if v == sentinel {
+ return 42
+ }
+ return v
+}
+
+// Lookup retrieves the entry for a given hash value assuming its payload value returns
+// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
+// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
+func (h *Float32HashTable) Lookup(v uint64, cmp func(float32) bool) (*entryFloat32, bool) {
+ idx, ok := h.lookup(v, h.capMask, cmp)
+ return &h.entries[idx], ok
+}
+
+func (h *Float32HashTable) lookup(v uint64, szMask uint64, cmp func(float32) bool) (uint64, bool) {
+ const perturbShift uint8 = 5
+
+ var (
+ idx uint64
+ perturb uint64
+ e *entryFloat32
+ )
+
+ v = h.fixHash(v)
+ idx = v & szMask
+ perturb = (v >> uint64(perturbShift)) + 1
+
+ for {
+ e = &h.entries[idx]
+ if e.h == v && cmp(e.payload.val) {
+ return idx, true
+ }
+
+ if e.h == sentinel {
+ return idx, false
+ }
+
+ // perturbation logic inspired from CPython's set/dict object
+ // the goal is that all 64 bits of unmasked hash value eventually
+ // participate in the probing sequence, to minimize clustering
+ idx = (idx + perturb) & szMask
+ perturb = (perturb >> uint64(perturbShift)) + 1
+ }
+}
+
+func (h *Float32HashTable) upsize(newcap uint64) error {
+ newMask := newcap - 1
+
+ oldEntries := h.entries
+ h.entries = make([]entryFloat32, newcap)
+ for _, e := range oldEntries {
+ if e.Valid() {
+ idx, _ := h.lookup(e.h, newMask, func(float32) bool { return false })
+ h.entries[idx] = e
+ }
+ }
+ h.cap = newcap
+ h.capMask = newMask
+ return nil
+}
+
+// Insert updates the given entry with the provided hash value, payload value and memo index.
+// The entry pointer must have been retrieved via lookup in order to actually insert properly.
+func (h *Float32HashTable) Insert(e *entryFloat32, v uint64, val float32, memoIdx int32) error {
+ e.h = h.fixHash(v)
+ e.payload.val = val
+ e.payload.memoIdx = memoIdx
+ h.size++
+
+ if h.needUpsize() {
+ h.upsize(h.cap * uint64(loadFactor) * 2)
+ }
+ return nil
+}
+
+// VisitEntries will call the passed in function on each *valid* entry in the hash table,
+// a valid entry being one which has had a value inserted into it.
+func (h *Float32HashTable) VisitEntries(visit func(*entryFloat32)) {
+ for _, e := range h.entries {
+ if e.Valid() {
+ visit(&e)
+ }
+ }
+}
+
+// Float32MemoTable is a wrapper over the appropriate hashtable to provide an interface
+// conforming to the MemoTable interface defined in the encoding package for general interactions
+// regarding dictionaries.
+type Float32MemoTable struct {
+ tbl *Float32HashTable
+ nullIdx int32
+}
+
+// NewFloat32MemoTable returns a new memotable with num entries pre-allocated to reduce further
+// allocations when inserting.
+func NewFloat32MemoTable(num int64) *Float32MemoTable {
+ return &Float32MemoTable{tbl: NewFloat32HashTable(uint64(num)), nullIdx: KeyNotFound}
+}
+
+// Reset allows this table to be re-used by dumping all the data currently in the table.
+func (s *Float32MemoTable) Reset() {
+ s.tbl.Reset(32)
+ s.nullIdx = KeyNotFound
+}
+
+// Size returns the current number of inserted elements into the table including if a null
+// has been inserted.
+func (s *Float32MemoTable) Size() int {
+ sz := int(s.tbl.size)
+ if _, ok := s.GetNull(); ok {
+ sz++
+ }
+ return sz
+}
+
+// GetNull returns the index of an inserted null or KeyNotFound along with a bool
+// that will be true if found and false if not.
+func (s *Float32MemoTable) GetNull() (int, bool) {
+ return int(s.nullIdx), s.nullIdx != KeyNotFound
+}
+
+// GetOrInsertNull will return the index of the null entry or insert a null entry
+// if one currently doesn't exist. The found value will be true if there was already
+// a null in the table, and false if it inserted one.
+func (s *Float32MemoTable) GetOrInsertNull() (idx int, found bool) {
+ idx, found = s.GetNull()
+ if !found {
+ idx = s.Size()
+ s.nullIdx = int32(idx)
+ }
+ return
+}
+
+// CopyValues will copy the values from the memo table out into the passed in slice
+// which must be of the appropriate type.
+func (s *Float32MemoTable) CopyValues(out interface{}) {
+ s.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset is like CopyValues but only copies a subset of values starting
+// at the provided start index
+func (s *Float32MemoTable) CopyValuesSubset(start int, out interface{}) {
+ s.tbl.CopyValuesSubset(start, out.([]float32))
+}
+
+func (s *Float32MemoTable) WriteOut(out []byte) {
+ s.tbl.WriteOut(out)
+}
+
+func (s *Float32MemoTable) WriteOutSubset(start int, out []byte) {
+ s.tbl.WriteOutSubset(start, out)
+}
+
+// Get returns the index of the requested value in the hash table or KeyNotFound
+// along with a boolean indicating if it was found or not.
+func (s *Float32MemoTable) Get(val interface{}) (int, bool) {
+ var cmp func(float32) bool
+
+ if math.IsNaN(float64(val.(float32))) {
+ cmp = isNan32Cmp
+ // use consistent internal bit pattern for NaN regardless of the pattern
+ // that is passed to us. NaN is NaN is NaN
+ val = float32(math.NaN())
+ } else {
+ cmp = func(v float32) bool { return val.(float32) == v }
+ }
+
+ h := hashFloat32(val.(float32), 0)
+ if e, ok := s.tbl.Lookup(h, cmp); ok {
+ return int(e.payload.memoIdx), ok
+ }
+ return KeyNotFound, false
+}
+
+// GetOrInsert will return the index of the specified value in the table, or insert the
+// value into the table and return the new index. found indicates whether or not it already
+// existed in the table (true) or was inserted by this call (false).
+func (s *Float32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
+
+ var cmp func(float32) bool
+
+ if math.IsNaN(float64(val.(float32))) {
+ cmp = isNan32Cmp
+ // use consistent internal bit pattern for NaN regardless of the pattern
+ // that is passed to us. NaN is NaN is NaN
+ val = float32(math.NaN())
+ } else {
+ cmp = func(v float32) bool { return val.(float32) == v }
+ }
+
+ h := hashFloat32(val.(float32), 0)
+ e, ok := s.tbl.Lookup(h, cmp)
+
+ if ok {
+ idx = int(e.payload.memoIdx)
+ found = true
+ } else {
+ idx = s.Size()
+ s.tbl.Insert(e, h, val.(float32), int32(idx))
+ }
+ return
+}
+
+type payloadFloat64 struct {
+ val float64
+ memoIdx int32
+}
+
+type entryFloat64 struct {
+ h uint64
+ payload payloadFloat64
+}
+
+func (e entryFloat64) Valid() bool { return e.h != sentinel }
+
+// Float64HashTable is a hashtable specifically for float64 that
+// is utilized with the MemoTable to generalize interactions for easier
+// implementation of dictionaries without losing performance.
+type Float64HashTable struct {
+ cap uint64
+ capMask uint64
+ size uint64
+
+ entries []entryFloat64
+}
+
+// NewFloat64HashTable returns a new hash table for float64 values
+// initialized with the passed in capacity or 32 whichever is larger.
+func NewFloat64HashTable(cap uint64) *Float64HashTable {
+ initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+ ret := &Float64HashTable{cap: initCap, capMask: initCap - 1, size: 0}
+ ret.entries = make([]entryFloat64, initCap)
+ return ret
+}
+
+// Reset drops all of the values in this hash table and re-initializes it
+// with the specified initial capacity as if by calling New, but without having
+// to reallocate the object.
+func (h *Float64HashTable) Reset(cap uint64) {
+ h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+ h.capMask = h.cap - 1
+ h.size = 0
+ h.entries = make([]entryFloat64, h.cap)
+}
+
+// CopyValues is used for copying the values out of the hash table into the
+// passed in slice, in the order that they were first inserted
+func (h *Float64HashTable) CopyValues(out []float64) {
+ h.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset copies a subset of the values in the hashtable out, starting
+// with the value at start, in the order that they were inserted.
+func (h *Float64HashTable) CopyValuesSubset(start int, out []float64) {
+ h.VisitEntries(func(e *entryFloat64) {
+ idx := e.payload.memoIdx - int32(start)
+ if idx >= 0 {
+ out[idx] = e.payload.val
+ }
+ })
+}
+
+func (h *Float64HashTable) WriteOut(out []byte) {
+ h.WriteOutSubset(0, out)
+}
+
+func (h *Float64HashTable) WriteOutSubset(start int, out []byte) {
+ data := arrow.Float64Traits.CastFromBytes(out)
+ h.VisitEntries(func(e *entryFloat64) {
+ idx := e.payload.memoIdx - int32(start)
+ if idx >= 0 {
+ data[idx] = utils.ToLEFloat64(e.payload.val)
+ }
+ })
+}
+
+func (h *Float64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
+
+func (Float64HashTable) fixHash(v uint64) uint64 {
+ if v == sentinel {
+ return 42
+ }
+ return v
+}
+
// Lookup retrieves the entry for a given hash value assuming its payload value returns
// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
// and a boolean as to whether it was found. If the bool is false the payload is not valid,
// but the pointer identifies the empty slot where the value belongs (see Insert).
func (h *Float64HashTable) Lookup(v uint64, cmp func(float64) bool) (*entryFloat64, bool) {
	idx, ok := h.lookup(v, h.capMask, cmp)
	return &h.entries[idx], ok
}
+
// lookup is the core open-addressing probe: it returns the index of the slot
// whose (fixed-up) hash equals v and whose payload satisfies cmp, or the index
// of the first empty slot encountered — the insertion point — with false.
// The loop always terminates because the table is never full (see needUpsize).
func (h *Float64HashTable) lookup(v uint64, szMask uint64, cmp func(float64) bool) (uint64, bool) {
	const perturbShift uint8 = 5

	var (
		idx     uint64
		perturb uint64
		e       *entryFloat64
	)

	v = h.fixHash(v)
	idx = v & szMask
	perturb = (v >> uint64(perturbShift)) + 1

	for {
		e = &h.entries[idx]
		if e.h == v && cmp(e.payload.val) {
			// matching hash and matching value: found
			return idx, true
		}

		if e.h == sentinel {
			// empty slot reached: value is absent; this is where it would go
			return idx, false
		}

		// perturbation logic inspired from CPython's set/dict object:
		// the goal is that all 64 bits of the unmasked hash value eventually
		// participate in the probing sequence, to minimize clustering
		idx = (idx + perturb) & szMask
		perturb = (perturb >> uint64(perturbShift)) + 1
	}
}
+
// upsize grows the table to newcap slots (callers pass a power of two) and
// re-slots every occupied entry under the new capacity mask.
func (h *Float64HashTable) upsize(newcap uint64) error {
	newMask := newcap - 1

	oldEntries := h.entries
	h.entries = make([]entryFloat64, newcap)
	for _, e := range oldEntries {
		if e.Valid() {
			// cmp always returns false: we only need the destination slot for
			// this hash; the old table cannot contain duplicate values.
			idx, _ := h.lookup(e.h, newMask, func(float64) bool { return false })
			h.entries[idx] = e
		}
	}
	h.cap = newcap
	h.capMask = newMask
	// always succeeds; the error return mirrors the shared Insert signature
	return nil
}
+
// Insert updates the given entry with the provided hash value, payload value and memo index.
// The entry pointer must have been retrieved via lookup in order to actually insert properly.
func (h *Float64HashTable) Insert(e *entryFloat64, v uint64, val float64, memoIdx int32) error {
	// fixHash ensures an occupied slot never stores the empty-slot sentinel
	e.h = h.fixHash(v)
	e.payload.val = val
	e.payload.memoIdx = memoIdx
	h.size++

	// grow once the load factor is exceeded so probe chains stay short
	if h.needUpsize() {
		h.upsize(h.cap * uint64(loadFactor) * 2)
	}
	return nil
}
+
+// VisitEntries will call the passed in function on each *valid* entry in the hash table,
+// a valid entry being one which has had a value inserted into it.
+func (h *Float64HashTable) VisitEntries(visit func(*entryFloat64)) {
+ for _, e := range h.entries {
+ if e.Valid() {
+ visit(&e)
+ }
+ }
+}
+
// Float64MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type Float64MemoTable struct {
	tbl *Float64HashTable
	// nullIdx is the memo index assigned to a null, or KeyNotFound when no
	// null has been inserted (nulls live outside the hash table).
	nullIdx int32
}
+
+// NewFloat64MemoTable returns a new memotable with num entries pre-allocated to reduce further
+// allocations when inserting.
+func NewFloat64MemoTable(num int64) *Float64MemoTable {
+ return &Float64MemoTable{tbl: NewFloat64HashTable(uint64(num)), nullIdx: KeyNotFound}
+}
+
+// Reset allows this table to be re-used by dumping all the data currently in the table.
+func (s *Float64MemoTable) Reset() {
+ s.tbl.Reset(32)
+ s.nullIdx = KeyNotFound
+}
+
+// Size returns the current number of inserted elements into the table including if a null
+// has been inserted.
+func (s *Float64MemoTable) Size() int {
+ sz := int(s.tbl.size)
+ if _, ok := s.GetNull(); ok {
+ sz++
+ }
+ return sz
+}
+
+// GetNull returns the index of an inserted null or KeyNotFound along with a bool
+// that will be true if found and false if not.
+func (s *Float64MemoTable) GetNull() (int, bool) {
+ return int(s.nullIdx), s.nullIdx != KeyNotFound
+}
+
+// GetOrInsertNull will return the index of the null entry or insert a null entry
+// if one currently doesn't exist. The found value will be true if there was already
+// a null in the table, and false if it inserted one.
+func (s *Float64MemoTable) GetOrInsertNull() (idx int, found bool) {
+ idx, found = s.GetNull()
+ if !found {
+ idx = s.Size()
+ s.nullIdx = int32(idx)
+ }
+ return
+}
+
// CopyValues will copy the values from the memo table out into the passed in slice
// which must be of the appropriate type ([]float64 here).
func (s *Float64MemoTable) CopyValues(out interface{}) {
	s.CopyValuesSubset(0, out)
}

// CopyValuesSubset is like CopyValues but only copies a subset of values starting
// at the provided start index
func (s *Float64MemoTable) CopyValuesSubset(start int, out interface{}) {
	s.tbl.CopyValuesSubset(start, out.([]float64))
}

// WriteOut serializes the table's values, in insertion order, into out as
// little-endian float64 bytes.
func (s *Float64MemoTable) WriteOut(out []byte) {
	s.tbl.WriteOut(out)
}

// WriteOutSubset is like WriteOut but starts with the value at index start.
func (s *Float64MemoTable) WriteOutSubset(start int, out []byte) {
	s.tbl.WriteOutSubset(start, out)
}
+
+// Get returns the index of the requested value in the hash table or KeyNotFound
+// along with a boolean indicating if it was found or not.
+func (s *Float64MemoTable) Get(val interface{}) (int, bool) {
+ var cmp func(float64) bool
+ if math.IsNaN(val.(float64)) {
+ cmp = math.IsNaN
+ // use consistent internal bit pattern for NaN regardless of the pattern
+ // that is passed to us. NaN is NaN is NaN
+ val = math.NaN()
+ } else {
+ cmp = func(v float64) bool { return val.(float64) == v }
+ }
+
+ h := hashFloat64(val.(float64), 0)
+ if e, ok := s.tbl.Lookup(h, cmp); ok {
+ return int(e.payload.memoIdx), ok
+ }
+ return KeyNotFound, false
+}
+
+// GetOrInsert will return the index of the specified value in the table, or insert the
+// value into the table and return the new index. found indicates whether or not it already
+// existed in the table (true) or was inserted by this call (false).
+func (s *Float64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
+
+ var cmp func(float64) bool
+ if math.IsNaN(val.(float64)) {
+ cmp = math.IsNaN
+ // use consistent internal bit pattern for NaN regardless of the pattern
+ // that is passed to us. NaN is NaN is NaN
+ val = math.NaN()
+ } else {
+ cmp = func(v float64) bool { return val.(float64) == v }
+ }
+
+ h := hashFloat64(val.(float64), 0)
+ e, ok := s.tbl.Lookup(h, cmp)
+
+ if ok {
+ idx = int(e.payload.memoIdx)
+ found = true
+ } else {
+ idx = s.Size()
+ s.tbl.Insert(e, h, val.(float64), int32(idx))
+ }
+ return
+}
diff --git a/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.gen.go.tmpl b/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.gen.go.tmpl
new file mode 100644
index 000000000..518ba5af0
--- /dev/null
+++ b/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.gen.go.tmpl
@@ -0,0 +1,327 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hashing
+
+import (
+ "github.com/apache/arrow/go/v6/arrow/bitutil"
+ "github.com/apache/arrow/go/v6/parquet/internal/utils"
+)
+
+{{range .In}}
+type payload{{.Name}} struct {
+ val {{.name}}
+ memoIdx int32
+}
+
+type entry{{.Name}} struct {
+ h uint64
+ payload payload{{.Name}}
+}
+
+func (e entry{{.Name}}) Valid() bool { return e.h != sentinel }
+
+// {{.Name}}HashTable is a hashtable specifically for {{.name}} that
+// is utilized with the MemoTable to generalize interactions for easier
+// implementation of dictionaries without losing performance.
+type {{.Name}}HashTable struct {
+ cap uint64
+ capMask uint64
+ size uint64
+
+ entries []entry{{.Name}}
+}
+
+// New{{.Name}}HashTable returns a new hash table for {{.name}} values
+// initialized with the passed in capacity or 32 whichever is larger.
+func New{{.Name}}HashTable(cap uint64) *{{.Name}}HashTable {
+ initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+ ret := &{{.Name}}HashTable{cap: initCap, capMask: initCap - 1, size: 0}
+ ret.entries = make([]entry{{.Name}}, initCap)
+ return ret
+}
+
+// Reset drops all of the values in this hash table and re-initializes it
+// with the specified initial capacity as if by calling New, but without having
+// to reallocate the object.
+func (h *{{.Name}}HashTable) Reset(cap uint64) {
+ h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
+ h.capMask = h.cap - 1
+ h.size = 0
+ h.entries = make([]entry{{.Name}}, h.cap)
+}
+
+// CopyValues is used for copying the values out of the hash table into the
+// passed in slice, in the order that they were first inserted
+func (h *{{.Name}}HashTable) CopyValues(out []{{.name}}) {
+ h.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset copies a subset of the values in the hashtable out, starting
+// with the value at start, in the order that they were inserted.
+func (h *{{.Name}}HashTable) CopyValuesSubset(start int, out []{{.name}}) {
+ h.VisitEntries(func(e *entry{{.Name}}) {
+ idx := e.payload.memoIdx - int32(start)
+ if idx >= 0 {
+ out[idx] = e.payload.val
+ }
+ })
+}
+
+func (h *{{.Name}}HashTable) WriteOut(out []byte) {
+ h.WriteOutSubset(0, out)
+}
+
+func (h *{{.Name}}HashTable) WriteOutSubset(start int, out []byte) {
+ data := arrow.{{.Name}}Traits.CastFromBytes(out)
+ h.VisitEntries(func(e *entry{{.Name}}) {
+ idx := e.payload.memoIdx - int32(start)
+ if idx >= 0 {
+ data[idx] = utils.ToLE{{.Name}}(e.payload.val)
+ }
+ })
+}
+
+func (h *{{.Name}}HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
+
+func ({{.Name}}HashTable) fixHash(v uint64) uint64 {
+ if v == sentinel {
+ return 42
+ }
+ return v
+}
+
+// Lookup retrieves the entry for a given hash value assuming it's payload value returns
+// true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
+// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
+func (h *{{.Name}}HashTable) Lookup(v uint64, cmp func({{.name}}) bool) (*entry{{.Name}}, bool) {
+ idx, ok := h.lookup(v, h.capMask, cmp)
+ return &h.entries[idx], ok
+}
+
+func (h *{{.Name}}HashTable) lookup(v uint64, szMask uint64, cmp func({{.name}}) bool) (uint64, bool) {
+ const perturbShift uint8 = 5
+
+ var (
+ idx uint64
+ perturb uint64
+ e *entry{{.Name}}
+ )
+
+ v = h.fixHash(v)
+ idx = v & szMask
+ perturb = (v >> uint64(perturbShift)) + 1
+
+ for {
+ e = &h.entries[idx]
+ if e.h == v && cmp(e.payload.val) {
+ return idx, true
+ }
+
+ if e.h == sentinel {
+ return idx, false
+ }
+
+ // perturbation logic inspired from CPython's set/dict object
+ // the goal is that all 64 bits of unmasked hash value eventually
+ // participate int he probing sequence, to minimize clustering
+ idx = (idx + perturb) & szMask
+ perturb = (perturb >> uint64(perturbShift)) + 1
+ }
+}
+
+func (h *{{.Name}}HashTable) upsize(newcap uint64) error {
+ newMask := newcap - 1
+
+ oldEntries := h.entries
+ h.entries = make([]entry{{.Name}}, newcap)
+ for _, e := range oldEntries {
+ if e.Valid() {
+ idx, _ := h.lookup(e.h, newMask, func({{.name}}) bool { return false })
+ h.entries[idx] = e
+ }
+ }
+ h.cap = newcap
+ h.capMask = newMask
+ return nil
+}
+
+// Insert updates the given entry with the provided hash value, payload value and memo index.
+// The entry pointer must have been retrieved via lookup in order to actually insert properly.
+func (h *{{.Name}}HashTable) Insert(e *entry{{.Name}}, v uint64, val {{.name}}, memoIdx int32) error {
+ e.h = h.fixHash(v)
+ e.payload.val = val
+ e.payload.memoIdx = memoIdx
+ h.size++
+
+ if h.needUpsize() {
+ h.upsize(h.cap * uint64(loadFactor) * 2)
+ }
+ return nil
+}
+
+// VisitEntries will call the passed in function on each *valid* entry in the hash table,
+// a valid entry being one which has had a value inserted into it.
+func (h *{{.Name}}HashTable) VisitEntries(visit func(*entry{{.Name}})) {
+ for _, e := range h.entries {
+ if e.Valid() {
+ visit(&e)
+ }
+ }
+}
+
+// {{.Name}}MemoTable is a wrapper over the appropriate hashtable to provide an interface
+// conforming to the MemoTable interface defined in the encoding package for general interactions
+// regarding dictionaries.
+type {{.Name}}MemoTable struct {
+ tbl *{{.Name}}HashTable
+ nullIdx int32
+}
+
+// New{{.Name}}MemoTable returns a new memotable with num entries pre-allocated to reduce further
+// allocations when inserting.
+func New{{.Name}}MemoTable(num int64) *{{.Name}}MemoTable {
+ return &{{.Name}}MemoTable{tbl: New{{.Name}}HashTable(uint64(num)), nullIdx: KeyNotFound}
+}
+
+// Reset allows this table to be re-used by dumping all the data currently in the table.
+func (s *{{.Name}}MemoTable) Reset() {
+ s.tbl.Reset(32)
+ s.nullIdx = KeyNotFound
+}
+
+// Size returns the current number of inserted elements into the table including if a null
+// has been inserted.
+func (s *{{.Name}}MemoTable) Size() int {
+ sz := int(s.tbl.size)
+ if _, ok := s.GetNull(); ok {
+ sz++
+ }
+ return sz
+}
+
+// GetNull returns the index of an inserted null or KeyNotFound along with a bool
+// that will be true if found and false if not.
+func (s *{{.Name}}MemoTable) GetNull() (int, bool) {
+ return int(s.nullIdx), s.nullIdx != KeyNotFound
+}
+
+// GetOrInsertNull will return the index of the null entry or insert a null entry
+// if one currently doesn't exist. The found value will be true if there was already
+// a null in the table, and false if it inserted one.
+func (s *{{.Name}}MemoTable) GetOrInsertNull() (idx int, found bool) {
+ idx, found = s.GetNull()
+ if !found {
+ idx = s.Size()
+ s.nullIdx = int32(idx)
+ }
+ return
+}
+
+// CopyValues will copy the values from the memo table out into the passed in slice
+// which must be of the appropriate type.
+func (s *{{.Name}}MemoTable) CopyValues(out interface{}) {
+ s.CopyValuesSubset(0, out)
+}
+
+// CopyValuesSubset is like CopyValues but only copies a subset of values starting
+// at the provided start index
+func (s *{{.Name}}MemoTable) CopyValuesSubset(start int, out interface{}) {
+ s.tbl.CopyValuesSubset(start, out.([]{{.name}}))
+}
+
+func (s *{{.Name}}MemoTable) WriteOut(out []byte) {
+ s.tbl.WriteOut(out)
+}
+
+func (s *{{.Name}}MemoTable) WriteOutSubset(start int, out []byte) {
+ s.tbl.WriteOutSubset(start, out)
+}
+
+// Get returns the index of the requested value in the hash table or KeyNotFound
+// along with a boolean indicating if it was found or not.
+func (s *{{.Name}}MemoTable) Get(val interface{}) (int, bool) {
+{{if or (eq .Name "Int32") (eq .Name "Int64") }}
+ h := hashInt(uint64(val.({{.name}})), 0)
+ if e, ok := s.tbl.Lookup(h, func(v {{.name}}) bool { return val.({{.name}}) == v }); ok {
+{{ else -}}
+ var cmp func({{.name}}) bool
+ {{if eq .Name "Float32"}}
+ if math.IsNaN(float64(val.(float32))) {
+ cmp = isNan32Cmp
+ // use consistent internal bit pattern for NaN regardless of the pattern
+ // that is passed to us. NaN is NaN is NaN
+ val = float32(math.NaN())
+ {{ else -}}
+ if math.IsNaN(val.(float64)) {
+ cmp = math.IsNaN
+ // use consistent internal bit pattern for NaN regardless of the pattern
+ // that is passed to us. NaN is NaN is NaN
+ val = math.NaN()
+ {{end -}}
+ } else {
+ cmp = func(v {{.name}}) bool { return val.({{.name}}) == v }
+ }
+
+ h := hash{{.Name}}(val.({{.name}}), 0)
+ if e, ok := s.tbl.Lookup(h, cmp); ok {
+{{ end -}}
+ return int(e.payload.memoIdx), ok
+ }
+ return KeyNotFound, false
+}
+
+// GetOrInsert will return the index of the specified value in the table, or insert the
+// value into the table and return the new index. found indicates whether or not it already
+// existed in the table (true) or was inserted by this call (false).
+func (s *{{.Name}}MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
+ {{if or (eq .Name "Int32") (eq .Name "Int64") }}
+ h := hashInt(uint64(val.({{.name}})), 0)
+ e, ok := s.tbl.Lookup(h, func(v {{.name}}) bool {
+ return val.({{.name}}) == v
+ })
+{{ else }}
+ var cmp func({{.name}}) bool
+ {{if eq .Name "Float32"}}
+ if math.IsNaN(float64(val.(float32))) {
+ cmp = isNan32Cmp
+ // use consistent internal bit pattern for NaN regardless of the pattern
+ // that is passed to us. NaN is NaN is NaN
+ val = float32(math.NaN())
+ {{ else -}}
+ if math.IsNaN(val.(float64)) {
+ cmp = math.IsNaN
+ // use consistent internal bit pattern for NaN regardless of the pattern
+ // that is passed to us. NaN is NaN is NaN
+ val = math.NaN()
+ {{end -}}
+ } else {
+ cmp = func(v {{.name}}) bool { return val.({{.name}}) == v }
+ }
+
+ h := hash{{.Name}}(val.({{.name}}), 0)
+ e, ok := s.tbl.Lookup(h, cmp)
+{{ end }}
+ if ok {
+ idx = int(e.payload.memoIdx)
+ found = true
+ } else {
+ idx = s.Size()
+ s.tbl.Insert(e, h, val.({{.name}}), int32(idx))
+ }
+ return
+}
+{{end}}
diff --git a/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.go b/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.go
new file mode 100644
index 000000000..54d245187
--- /dev/null
+++ b/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.go
@@ -0,0 +1,400 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package hashing provides utilities for and an implementation of a hash
+// table which is more performant than the default go map implementation
+// by leveraging xxh3 and some custom hash functions.
+package hashing
+
+import (
+ "bytes"
+ "math"
+ "math/bits"
+ "reflect"
+ "unsafe"
+
+ "github.com/apache/arrow/go/v6/arrow"
+ "github.com/apache/arrow/go/v6/arrow/array"
+ "github.com/apache/arrow/go/v6/arrow/memory"
+ "github.com/apache/arrow/go/v6/parquet"
+
+ "github.com/zeebo/xxh3"
+)
+
+//go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=types.tmpldata xxh3_memo_table.gen.go.tmpl
+
// hashInt hashes a 64-bit value with one of two multiplicative hash
// functions selected by alg (0 or 1).
func hashInt(val uint64, alg uint64) uint64 {
	// two of xxhash's prime multipliers, chosen for their bit-dispersion
	// properties
	primes := [2]uint64{11400714785074694791, 14029467366897019727}
	// the multiply mixes the low bits up into the high bits; the byte swap
	// (a single CPU instruction) brings the combined high and low bits back
	// down so they participate in the initial hash table index
	return bits.ReverseBytes64(primes[alg] * val)
}
+
+func hashFloat32(val float32, alg uint64) uint64 {
+ // grab the raw byte pattern of the
+ bt := *(*[4]byte)(unsafe.Pointer(&val))
+ x := uint64(*(*uint32)(unsafe.Pointer(&bt[0])))
+ hx := hashInt(x, alg)
+ hy := hashInt(x, alg^1)
+ return 4 ^ hx ^ hy
+}
+
+func hashFloat64(val float64, alg uint64) uint64 {
+ bt := *(*[8]byte)(unsafe.Pointer(&val))
+ hx := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[4]))), alg)
+ hy := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[0]))), alg^1)
+ return 8 ^ hx ^ hy
+}
+
// hashString hashes the bytes of a string without copying them: the string's
// backing storage is aliased as a []byte, and the slice header's Cap is
// patched to the string length so the header is fully valid for the
// read-only hash call. The aliased slice must never be written to.
func hashString(val string, alg uint64) uint64 {
	buf := *(*[]byte)(unsafe.Pointer(&val))
	(*reflect.SliceHeader)(unsafe.Pointer(&buf)).Cap = len(val)
	return hash(buf, alg)
}
+
// prime constants used for slightly increasing the hash quality further
var exprimes = [2]uint64{1609587929392839161, 9650029242287828579}

// for smaller amounts of bytes this is faster than even calling into
// xxh3 to do the hash, so we specialize in order to get the benefits
// of that performance.
func hash(b []byte, alg uint64) uint64 {
	n := uint32(len(b))
	if n <= 16 {
		switch {
		case n > 8:
			// 8 < length <= 16
			// read the first and last 8 bytes as two (possibly overlapping)
			// 64-bit ints, hash each with a different function, xor the results
			x := *(*uint64)(unsafe.Pointer(&b[n-8]))
			y := *(*uint64)(unsafe.Pointer(&b[0]))
			hx := hashInt(x, alg)
			hy := hashInt(y, alg^1)
			return uint64(n) ^ hx ^ hy
		case n >= 4:
			// 4 <= length <= 8
			// we can read the bytes as two overlapping 32-bit ints, apply different
			// hash functions to each in parallel
			// then xor the results
			x := *(*uint32)(unsafe.Pointer(&b[n-4]))
			y := *(*uint32)(unsafe.Pointer(&b[0]))
			hx := hashInt(uint64(x), alg)
			hy := hashInt(uint64(y), alg^1)
			return uint64(n) ^ hx ^ hy
		case n > 0:
			// 1 <= length <= 3: mix the length with the first, middle and
			// last bytes into one 32-bit value and hash that
			x := uint32((n << 24) ^ (uint32(b[0]) << 16) ^ (uint32(b[n/2]) << 8) ^ uint32(b[n-1]))
			return hashInt(uint64(x), alg)
		case n == 0:
			// empty input hashes to a fixed non-sentinel value
			return 1
		}
	}

	// increase differentiation enough to improve hash quality
	return xxh3.Hash(b) + exprimes[alg]
}
+
const (
	// sentinel is the hash value reserved to mark an empty hash-table slot;
	// fixHash remaps any real hash that collides with it.
	sentinel uint64 = 0
	// loadFactor of 2 means a table grows once size*2 reaches capacity,
	// i.e. tables are kept at most half full.
	loadFactor int64 = 2
)
+
// max returns the larger of the two unsigned values.
func max(a, b uint64) uint64 {
	if b > a {
		return b
	}
	return a
}
+
// isNan32Cmp adapts math.IsNaN (which takes a float64) for use as a
// float32 comparison callback.
var isNan32Cmp = func(v float32) bool { return math.IsNaN(float64(v)) }

// KeyNotFound is the constant returned by memo table functions when a key isn't found in the table
const KeyNotFound = -1
+
// BinaryMemoTable is our hashtable for binary data using the BinaryBuilder
// to construct the actual data in an easy to pass around way with minimal copies
// while using a hash table to keep track of the indexes into the dictionary that
// is created as we go.
type BinaryMemoTable struct {
	// tbl maps hashes of the binary values to their index in the builder
	tbl *Int32HashTable
	// builder owns the raw byte data, in insertion order
	builder *array.BinaryBuilder
	// nullIdx is the index of an inserted null, or KeyNotFound
	nullIdx int
}
+
+// NewBinaryMemoTable returns a hash table for Binary data, the passed in allocator will
+// be utilized for the BinaryBuilder, if nil then memory.DefaultAllocator will be used.
+// initial and valuesize can be used to pre-allocate the table to reduce allocations. With
+// initial being the initial number of entries to allocate for and valuesize being the starting
+// amount of space allocated for writing the actual binary data.
+func NewBinaryMemoTable(mem memory.Allocator, initial, valuesize int) *BinaryMemoTable {
+ if mem == nil {
+ mem = memory.DefaultAllocator
+ }
+ bldr := array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary)
+ bldr.Reserve(int(initial))
+ datasize := valuesize
+ if datasize <= 0 {
+ datasize = initial * 4
+ }
+ bldr.ReserveData(datasize)
+ return &BinaryMemoTable{tbl: NewInt32HashTable(uint64(initial)), builder: bldr, nullIdx: KeyNotFound}
+}
+
+// Reset dumps all of the data in the table allowing it to be reutilized.
+func (s *BinaryMemoTable) Reset() {
+ s.tbl.Reset(32)
+ s.builder.NewArray().Release()
+ s.builder.Reserve(int(32))
+ s.builder.ReserveData(int(32) * 4)
+ s.nullIdx = KeyNotFound
+}
+
+// GetNull returns the index of a null that has been inserted into the table or
+// KeyNotFound. The bool returned will be true if there was a null inserted into
+// the table, and false otherwise.
+func (s *BinaryMemoTable) GetNull() (int, bool) {
+ return int(s.nullIdx), s.nullIdx != KeyNotFound
+}
+
+// Size returns the current size of the memo table including the null value
+// if one has been inserted.
+func (s *BinaryMemoTable) Size() int {
+ sz := int(s.tbl.size)
+ if _, ok := s.GetNull(); ok {
+ sz++
+ }
+ return sz
+}
+
// helper function to easily return a byte slice for any given value
// regardless of the type if it's a []byte, parquet.ByteArray,
// parquet.FixedLenByteArray or string.
func (BinaryMemoTable) valAsByteSlice(val interface{}) []byte {
	switch v := val.(type) {
	case []byte:
		return v
	case parquet.ByteArray:
		// presumably a named []byte type: the pointer cast reinterprets the
		// identical slice header without copying — TODO confirm definition
		return *(*[]byte)(unsafe.Pointer(&v))
	case parquet.FixedLenByteArray:
		return *(*[]byte)(unsafe.Pointer(&v))
	case string:
		// alias the string's backing bytes without copying; the result must
		// be treated as read-only since it shares the string's storage
		var out []byte
		h := (*reflect.StringHeader)(unsafe.Pointer(&v))
		s := (*reflect.SliceHeader)(unsafe.Pointer(&out))
		s.Data = h.Data
		s.Len = h.Len
		s.Cap = h.Len
		return out
	default:
		panic("invalid type for binarymemotable")
	}
}
+
// helper function to get the hash value regardless of the underlying binary type
func (BinaryMemoTable) getHash(val interface{}) uint64 {
	switch v := val.(type) {
	case string:
		// zero-copy string hashing (see hashString)
		return hashString(v, 0)
	case []byte:
		return hash(v, 0)
	case parquet.ByteArray:
		// reinterpret the slice header without copying; read-only use
		return hash(*(*[]byte)(unsafe.Pointer(&v)), 0)
	case parquet.FixedLenByteArray:
		return hash(*(*[]byte)(unsafe.Pointer(&v)), 0)
	default:
		panic("invalid type for binarymemotable")
	}
}
+
+// helper function to append the given value to the builder regardless
+// of the underlying binary type.
+func (b *BinaryMemoTable) appendVal(val interface{}) {
+ switch v := val.(type) {
+ case string:
+ b.builder.AppendString(v)
+ case []byte:
+ b.builder.Append(v)
+ case parquet.ByteArray:
+ b.builder.Append(*(*[]byte)(unsafe.Pointer(&v)))
+ case parquet.FixedLenByteArray:
+ b.builder.Append(*(*[]byte)(unsafe.Pointer(&v)))
+ }
+}
+
// lookup probes the hash table for an entry whose stored builder index
// refers to bytes equal to val. The returned bool mirrors Int32HashTable.Lookup:
// false means val is absent and the entry points at its insertion slot.
func (b *BinaryMemoTable) lookup(h uint64, val []byte) (*entryInt32, bool) {
	return b.tbl.Lookup(h, func(i int32) bool {
		return bytes.Equal(val, b.builder.Value(int(i)))
	})
}
+
+// Get returns the index of the specified value in the table or KeyNotFound,
+// and a boolean indicating whether it was found in the table.
+func (b *BinaryMemoTable) Get(val interface{}) (int, bool) {
+ if p, ok := b.lookup(b.getHash(val), b.valAsByteSlice(val)); ok {
+ return int(p.payload.val), ok
+ }
+ return KeyNotFound, false
+}
+
+// GetOrInsert returns the index of the given value in the table, if not found
+// it is inserted into the table. The return value 'found' indicates whether the value
+// was found in the table (true) or inserted (false) along with any possible error.
+func (b *BinaryMemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
+ h := b.getHash(val)
+ p, found := b.lookup(h, b.valAsByteSlice(val))
+ if found {
+ idx = int(p.payload.val)
+ } else {
+ idx = b.Size()
+ b.appendVal(val)
+ b.tbl.Insert(p, h, int32(idx), -1)
+ }
+ return
+}
+
+// GetOrInsertNull retrieves the index of a null in the table or inserts
+// null into the table, returning the index and a boolean indicating if it was
+// found in the table (true) or was inserted (false).
+func (b *BinaryMemoTable) GetOrInsertNull() (idx int, found bool) {
+ idx, found = b.GetNull()
+ if !found {
+ idx = b.Size()
+ b.nullIdx = idx
+ b.builder.AppendNull()
+ }
+ return
+}
+
// helper function to get the offset into the builder data for a given
// index value.
//
// NOTE(review): despite the name, this returns the *address* (uintptr) of the
// value's first byte inside the builder's data buffer, not an integer offset;
// callers subtract two results to obtain relative offsets. Zero-length values
// have no backing byte, so the loop skips forward to the next non-empty value;
// if none remains, the result is DataLen() past the address of value 0.
// NOTE(review): if every value in the builder is zero-length, the final
// fallback recurses into findOffset(0) unboundedly — verify callers cannot
// reach that state.
func (b *BinaryMemoTable) findOffset(idx int) uintptr {
	val := b.builder.Value(idx)
	for len(val) == 0 {
		idx++
		if idx >= b.builder.Len() {
			break
		}
		val = b.builder.Value(idx)
	}
	if len(val) != 0 {
		return uintptr(unsafe.Pointer(&val[0]))
	}
	return uintptr(b.builder.DataLen()) + b.findOffset(0)
}
+
+// CopyOffsets copies the list of offsets into the passed in slice, the offsets
+// being the start and end values of the underlying allocated bytes in the builder
+// for the individual values of the table. out should be at least sized to Size()+1
+func (b *BinaryMemoTable) CopyOffsets(out []int8) {
+ b.CopyOffsetsSubset(0, out)
+}
+
+// CopyOffsetsSubset is like CopyOffsets but instead of copying all of the offsets,
+// it gets a subset of the offsets in the table starting at the index provided by "start".
+func (b *BinaryMemoTable) CopyOffsetsSubset(start int, out []int8) {
+ if b.builder.Len() <= start {
+ return
+ }
+
+ first := b.findOffset(0)
+ delta := b.findOffset(start)
+ for i := start; i < b.Size(); i++ {
+ offset := int8(b.findOffset(i) - delta)
+ out[i-start] = offset
+ }
+
+ out[b.Size()-start] = int8(b.builder.DataLen() - int(delta) - int(first))
+}
+
// CopyValues copies the raw binary data bytes out, out should be a []byte
// with at least ValuesSize bytes allocated to copy into.
func (b *BinaryMemoTable) CopyValues(out interface{}) {
	b.CopyValuesSubset(0, out)
}

// CopyValuesSubset copies the raw binary data bytes out starting with the value
// at the index start, out should be a []byte with at least ValuesSize bytes allocated
func (b *BinaryMemoTable) CopyValuesSubset(start int, out interface{}) {
	var (
		// addresses of the first byte of value 0 and of value `start`;
		// their difference is how many bytes precede the subset
		first  = b.findOffset(0)
		offset = b.findOffset(int(start))
		length = b.builder.DataLen() - int(offset-first)
	)

	outval := out.([]byte)
	copy(outval, b.builder.Value(start)[0:length])
}
+
+func (b *BinaryMemoTable) WriteOut(out []byte) {
+ b.CopyValues(out)
+}
+
+func (b *BinaryMemoTable) WriteOutSubset(start int, out []byte) {
+ b.CopyValuesSubset(start, out)
+}
+
// CopyFixedWidthValues exists to cope with the fact that the table doesn't keep
// track of the fixed width when inserting the null value the databuffer holds a
// zero length byte slice for the null value (if found)
func (b *BinaryMemoTable) CopyFixedWidthValues(start, width int, out []byte) {
	if start >= b.Size() {
		return
	}

	null, exists := b.GetNull()
	if !exists || null < start {
		// nothing to skip, proceed as usual
		b.CopyValuesSubset(start, out)
		return
	}

	var (
		leftOffset = b.findOffset(start)
		nullOffset = b.findOffset(null)
		leftSize   = nullOffset - leftOffset
	)

	// copy everything that precedes the null value
	if leftSize > 0 {
		copy(out, b.builder.Value(start)[0:leftSize])
	}

	// NOTE(review): nullOffset is an absolute address returned by findOffset,
	// yet below it is used both as a length delta against ValuesSize() and as
	// a value *index* into builder.Value(); both uses look suspect for the
	// right-hand copy and should be verified against upstream fixes.
	rightSize := b.ValuesSize() - int(nullOffset)
	if rightSize > 0 {
		// skip the null fixed size value
		copy(out[int(leftSize)+width:], b.builder.Value(int(nullOffset))[0:rightSize])
	}
}
+
+// VisitValues exists to run the visitFn on each value currently in the hash table.
+func (b *BinaryMemoTable) VisitValues(start int, visitFn func([]byte)) {
+ for i := int(start); i < b.Size(); i++ {
+ visitFn(b.builder.Value(i))
+ }
+}
+
// Release is used to tell the underlying builder that it can release the memory allocated
// when the reference count reaches 0, this is safe to be called from multiple goroutines
// simultaneously
func (b *BinaryMemoTable) Release() { b.builder.Release() }

// Retain increases the ref count, it is safe to call it from multiple goroutines
// simultaneously.
func (b *BinaryMemoTable) Retain() { b.builder.Retain() }

// ValuesSize returns the current total size of all the raw bytes that have been inserted
// into the memotable so far (the builder's data length, excluding offsets).
func (b *BinaryMemoTable) ValuesSize() int { return b.builder.DataLen() }