author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
commit     e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree       64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/go/parquet/internal/hashing
parent     Initial commit. (diff)
download   ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
           ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2. (upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/go/parquet/internal/hashing')
5 files changed, 1962 insertions, 0 deletions
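The patch below vendors Apache Arrow's Go `hashing` package: xxh3-based hash functions plus generated per-type hash tables (`Int32HashTable`, `Int64HashTable`, `Float32HashTable`, `Float64HashTable`) and memo tables built on top of them. A memo table maps each distinct value to the index at which it was first inserted, which is the core primitive for building Parquet dictionaries. A minimal usage sketch of the generated memo-table API (illustrative only: the package is `internal`, so this import only compiles from within the vendored Arrow module):

```go
package main

import (
	"fmt"

	"github.com/apache/arrow/go/v6/parquet/internal/hashing"
)

func main() {
	// Capacity is rounded up to a power of two internally, with a
	// minimum of 32 entries, so 0 is a valid starting size.
	memo := hashing.NewInt32MemoTable(0)

	for _, v := range []int32{7, 3, 7, 9, 3} {
		idx, found, _ := memo.GetOrInsert(v)
		// First occurrences are assigned indices 0, 1, 2, ...;
		// repeats return the index from the first insertion.
		fmt.Printf("value=%d idx=%d found=%v\n", v, idx, found)
	}

	// Distinct values come back out in insertion order: [7 3 9].
	out := make([]int32, memo.Size())
	memo.CopyValues(out)
	fmt.Println(out)
}
```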
diff --git a/src/arrow/go/parquet/internal/hashing/hashing_test.go b/src/arrow/go/parquet/internal/hashing/hashing_test.go new file mode 100644 index 000000000..875424a9d --- /dev/null +++ b/src/arrow/go/parquet/internal/hashing/hashing_test.go @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hashing + +import ( + "math/rand" + "testing" + + "github.com/stretchr/testify/assert" +) + +func MakeDistinctIntegers(nvals int) map[int]bool { + r := rand.New(rand.NewSource(42)) + values := make(map[int]bool) + for len(values) < nvals { + values[r.Int()] = true + } + return values +} + +func MakeSequentialIntegers(nvals int) map[int]bool { + values := make(map[int]bool) + for i := 0; i < nvals; i++ { + values[i] = true + } + return values +} + +func MakeDistinctStrings(nvals int) map[string]bool { + values := make(map[string]bool) + + r := rand.New(rand.NewSource(42)) + + max := 'z' + min := '0' + for len(values) < nvals { + data := make([]byte, r.Intn(24)) + for idx := range data { + data[idx] = byte(r.Intn(int(max-min+1)) + int(min)) + } + values[string(data)] = true + } + return values +} + +func TestHashingQualityInt(t *testing.T) { + const nvalues = 10000 + + tests := []struct { + name string + values map[int]bool + quality float64 + }{ + {"distinct", MakeDistinctIntegers(nvalues), 0.96}, + {"sequential", MakeSequentialIntegers(nvalues), 0.96}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + hashes := make(map[uint64]bool) + for k := range tt.values { + hashes[hashInt(uint64(k), 0)] = true + hashes[hashInt(uint64(k), 1)] = true + } + assert.GreaterOrEqual(t, float64(len(hashes)), tt.quality*float64(2*len(tt.values))) + }) + } +} + +func TestHashingBoundsStrings(t *testing.T) { + sizes := []int{1, 2, 3, 4, 5, 7, 8, 9, 15, 16, 17, 18, 19, 20, 21} + for _, s := range sizes { + str := make([]byte, s) + for idx := range str { + str[idx] = uint8(idx) + } + + h := hash(str, 1) + diff := 0 + for i := 0; i < 120; i++ { + str[len(str)-1] = uint8(i) + if hash(str, 1) != h { + diff++ + } + } + assert.GreaterOrEqual(t, diff, 118) + } +} + +func TestHashingQualityString(t *testing.T) { + const nvalues = 10000 + values := MakeDistinctStrings(nvalues) + + hashes := make(map[uint64]bool) + for k := range values { + hashes[hashString(k, 0)] = true + hashes[hashString(k, 1)] = true + } + assert.GreaterOrEqual(t, float64(len(hashes)), 0.96*float64(2*len(values))) +} diff --git a/src/arrow/go/parquet/internal/hashing/types.tmpldata b/src/arrow/go/parquet/internal/hashing/types.tmpldata new file mode 100644 index 000000000..2e97e9814 --- /dev/null +++ b/src/arrow/go/parquet/internal/hashing/types.tmpldata @@ -0,0 +1,18 @@ +[ + { + "Name": "Int32", + "name": "int32" + }, + { + "Name": "Int64", + "name": "int64" + }, + 
{ + "Name": "Float32", + "name": "float32" + }, + { + "Name": "Float64", + "name": "float64" + } +] diff --git a/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.gen.go b/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.gen.go new file mode 100644 index 000000000..ca891dc33 --- /dev/null +++ b/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.gen.go @@ -0,0 +1,1103 @@ +// Code generated by xxh3_memo_table.gen.go.tmpl. DO NOT EDIT. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hashing + +import ( + "math" + + "github.com/apache/arrow/go/v6/arrow" + "github.com/apache/arrow/go/v6/arrow/bitutil" + "github.com/apache/arrow/go/v6/parquet/internal/utils" +) + +type payloadInt32 struct { + val int32 + memoIdx int32 +} + +type entryInt32 struct { + h uint64 + payload payloadInt32 +} + +func (e entryInt32) Valid() bool { return e.h != sentinel } + +// Int32HashTable is a hashtable specifically for int32 that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type Int32HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entryInt32 +} + +// NewInt32HashTable returns a new hash table for int32 values +// initialized with the passed in capacity or 32 whichever is larger. +func NewInt32HashTable(cap uint64) *Int32HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &Int32HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entryInt32, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. +func (h *Int32HashTable) Reset(cap uint64) { + h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + h.capMask = h.cap - 1 + h.size = 0 + h.entries = make([]entryInt32, h.cap) +} + +// CopyValues is used for copying the values out of the hash table into the +// passed in slice, in the order that they were first inserted +func (h *Int32HashTable) CopyValues(out []int32) { + h.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies a subset of the values in the hashtable out, starting +// with the value at start, in the order that they were inserted. 
+func (h *Int32HashTable) CopyValuesSubset(start int, out []int32) { + h.VisitEntries(func(e *entryInt32) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *Int32HashTable) WriteOut(out []byte) { + h.WriteOutSubset(0, out) +} + +func (h *Int32HashTable) WriteOutSubset(start int, out []byte) { + data := arrow.Int32Traits.CastFromBytes(out) + h.VisitEntries(func(e *entryInt32) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + data[idx] = utils.ToLEInt32(e.payload.val) + } + }) +} + +func (h *Int32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func (Int32HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. +func (h *Int32HashTable) Lookup(v uint64, cmp func(int32) bool) (*entryInt32, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *Int32HashTable) lookup(v uint64, szMask uint64, cmp func(int32) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entryInt32 + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate int he probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *Int32HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entryInt32, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func(int32) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. +func (h *Int32HashTable) Insert(e *entryInt32, v uint64, val int32, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. +func (h *Int32HashTable) VisitEntries(visit func(*entryInt32)) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// Int32MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type Int32MemoTable struct { + tbl *Int32HashTable + nullIdx int32 +} + +// NewInt32MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. 
+func NewInt32MemoTable(num int64) *Int32MemoTable { + return &Int32MemoTable{tbl: NewInt32HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *Int32MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *Int32MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *Int32MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. The found value will be true if there was already +// a null in the table, and false if it inserted one. +func (s *Int32MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. +func (s *Int32MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *Int32MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]int32)) +} + +func (s *Int32MemoTable) WriteOut(out []byte) { + s.tbl.WriteOut(out) +} + +func (s *Int32MemoTable) WriteOutSubset(start int, out []byte) { + s.tbl.WriteOutSubset(start, out) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *Int32MemoTable) Get(val interface{}) (int, bool) { + + h := hashInt(uint64(val.(int32)), 0) + if e, ok := s.tbl.Lookup(h, func(v int32) bool { return val.(int32) == v }); ok { + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). +func (s *Int32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + + h := hashInt(uint64(val.(int32)), 0) + e, ok := s.tbl.Lookup(h, func(v int32) bool { + return val.(int32) == v + }) + + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.(int32), int32(idx)) + } + return +} + +type payloadInt64 struct { + val int64 + memoIdx int32 +} + +type entryInt64 struct { + h uint64 + payload payloadInt64 +} + +func (e entryInt64) Valid() bool { return e.h != sentinel } + +// Int64HashTable is a hashtable specifically for int64 that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type Int64HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entryInt64 +} + +// NewInt64HashTable returns a new hash table for int64 values +// initialized with the passed in capacity or 32 whichever is larger. 
+func NewInt64HashTable(cap uint64) *Int64HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &Int64HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entryInt64, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. +func (h *Int64HashTable) Reset(cap uint64) { + h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + h.capMask = h.cap - 1 + h.size = 0 + h.entries = make([]entryInt64, h.cap) +} + +// CopyValues is used for copying the values out of the hash table into the +// passed in slice, in the order that they were first inserted +func (h *Int64HashTable) CopyValues(out []int64) { + h.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies a subset of the values in the hashtable out, starting +// with the value at start, in the order that they were inserted. +func (h *Int64HashTable) CopyValuesSubset(start int, out []int64) { + h.VisitEntries(func(e *entryInt64) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *Int64HashTable) WriteOut(out []byte) { + h.WriteOutSubset(0, out) +} + +func (h *Int64HashTable) WriteOutSubset(start int, out []byte) { + data := arrow.Int64Traits.CastFromBytes(out) + h.VisitEntries(func(e *entryInt64) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + data[idx] = utils.ToLEInt64(e.payload.val) + } + }) +} + +func (h *Int64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func (Int64HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. +func (h *Int64HashTable) Lookup(v uint64, cmp func(int64) bool) (*entryInt64, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *Int64HashTable) lookup(v uint64, szMask uint64, cmp func(int64) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entryInt64 + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate int he probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *Int64HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entryInt64, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func(int64) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. 
+func (h *Int64HashTable) Insert(e *entryInt64, v uint64, val int64, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. +func (h *Int64HashTable) VisitEntries(visit func(*entryInt64)) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// Int64MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type Int64MemoTable struct { + tbl *Int64HashTable + nullIdx int32 +} + +// NewInt64MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. +func NewInt64MemoTable(num int64) *Int64MemoTable { + return &Int64MemoTable{tbl: NewInt64HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *Int64MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *Int64MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *Int64MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. The found value will be true if there was already +// a null in the table, and false if it inserted one. +func (s *Int64MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. +func (s *Int64MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *Int64MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]int64)) +} + +func (s *Int64MemoTable) WriteOut(out []byte) { + s.tbl.WriteOut(out) +} + +func (s *Int64MemoTable) WriteOutSubset(start int, out []byte) { + s.tbl.WriteOutSubset(start, out) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *Int64MemoTable) Get(val interface{}) (int, bool) { + + h := hashInt(uint64(val.(int64)), 0) + if e, ok := s.tbl.Lookup(h, func(v int64) bool { return val.(int64) == v }); ok { + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). 
+func (s *Int64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + + h := hashInt(uint64(val.(int64)), 0) + e, ok := s.tbl.Lookup(h, func(v int64) bool { + return val.(int64) == v + }) + + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.(int64), int32(idx)) + } + return +} + +type payloadFloat32 struct { + val float32 + memoIdx int32 +} + +type entryFloat32 struct { + h uint64 + payload payloadFloat32 +} + +func (e entryFloat32) Valid() bool { return e.h != sentinel } + +// Float32HashTable is a hashtable specifically for float32 that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type Float32HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entryFloat32 +} + +// NewFloat32HashTable returns a new hash table for float32 values +// initialized with the passed in capacity or 32 whichever is larger. +func NewFloat32HashTable(cap uint64) *Float32HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &Float32HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entryFloat32, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. +func (h *Float32HashTable) Reset(cap uint64) { + h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + h.capMask = h.cap - 1 + h.size = 0 + h.entries = make([]entryFloat32, h.cap) +} + +// CopyValues is used for copying the values out of the hash table into the +// passed in slice, in the order that they were first inserted +func (h *Float32HashTable) CopyValues(out []float32) { + h.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies a subset of the values in the hashtable out, starting +// with the value at start, in the order that they were inserted. +func (h *Float32HashTable) CopyValuesSubset(start int, out []float32) { + h.VisitEntries(func(e *entryFloat32) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *Float32HashTable) WriteOut(out []byte) { + h.WriteOutSubset(0, out) +} + +func (h *Float32HashTable) WriteOutSubset(start int, out []byte) { + data := arrow.Float32Traits.CastFromBytes(out) + h.VisitEntries(func(e *entryFloat32) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + data[idx] = utils.ToLEFloat32(e.payload.val) + } + }) +} + +func (h *Float32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func (Float32HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 
+func (h *Float32HashTable) Lookup(v uint64, cmp func(float32) bool) (*entryFloat32, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *Float32HashTable) lookup(v uint64, szMask uint64, cmp func(float32) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entryFloat32 + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate int he probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *Float32HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entryFloat32, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func(float32) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. +func (h *Float32HashTable) Insert(e *entryFloat32, v uint64, val float32, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. +func (h *Float32HashTable) VisitEntries(visit func(*entryFloat32)) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// Float32MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type Float32MemoTable struct { + tbl *Float32HashTable + nullIdx int32 +} + +// NewFloat32MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. +func NewFloat32MemoTable(num int64) *Float32MemoTable { + return &Float32MemoTable{tbl: NewFloat32HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *Float32MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *Float32MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *Float32MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. The found value will be true if there was already +// a null in the table, and false if it inserted one. 
+func (s *Float32MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. +func (s *Float32MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *Float32MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]float32)) +} + +func (s *Float32MemoTable) WriteOut(out []byte) { + s.tbl.WriteOut(out) +} + +func (s *Float32MemoTable) WriteOutSubset(start int, out []byte) { + s.tbl.WriteOutSubset(start, out) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *Float32MemoTable) Get(val interface{}) (int, bool) { + var cmp func(float32) bool + + if math.IsNaN(float64(val.(float32))) { + cmp = isNan32Cmp + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. NaN is NaN is NaN + val = float32(math.NaN()) + } else { + cmp = func(v float32) bool { return val.(float32) == v } + } + + h := hashFloat32(val.(float32), 0) + if e, ok := s.tbl.Lookup(h, cmp); ok { + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). +func (s *Float32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + + var cmp func(float32) bool + + if math.IsNaN(float64(val.(float32))) { + cmp = isNan32Cmp + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. NaN is NaN is NaN + val = float32(math.NaN()) + } else { + cmp = func(v float32) bool { return val.(float32) == v } + } + + h := hashFloat32(val.(float32), 0) + e, ok := s.tbl.Lookup(h, cmp) + + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.(float32), int32(idx)) + } + return +} + +type payloadFloat64 struct { + val float64 + memoIdx int32 +} + +type entryFloat64 struct { + h uint64 + payload payloadFloat64 +} + +func (e entryFloat64) Valid() bool { return e.h != sentinel } + +// Float64HashTable is a hashtable specifically for float64 that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type Float64HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entryFloat64 +} + +// NewFloat64HashTable returns a new hash table for float64 values +// initialized with the passed in capacity or 32 whichever is larger. +func NewFloat64HashTable(cap uint64) *Float64HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &Float64HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entryFloat64, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. 
+func (h *Float64HashTable) Reset(cap uint64) { + h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + h.capMask = h.cap - 1 + h.size = 0 + h.entries = make([]entryFloat64, h.cap) +} + +// CopyValues is used for copying the values out of the hash table into the +// passed in slice, in the order that they were first inserted +func (h *Float64HashTable) CopyValues(out []float64) { + h.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies a subset of the values in the hashtable out, starting +// with the value at start, in the order that they were inserted. +func (h *Float64HashTable) CopyValuesSubset(start int, out []float64) { + h.VisitEntries(func(e *entryFloat64) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *Float64HashTable) WriteOut(out []byte) { + h.WriteOutSubset(0, out) +} + +func (h *Float64HashTable) WriteOutSubset(start int, out []byte) { + data := arrow.Float64Traits.CastFromBytes(out) + h.VisitEntries(func(e *entryFloat64) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + data[idx] = utils.ToLEFloat64(e.payload.val) + } + }) +} + +func (h *Float64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func (Float64HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. +func (h *Float64HashTable) Lookup(v uint64, cmp func(float64) bool) (*entryFloat64, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *Float64HashTable) lookup(v uint64, szMask uint64, cmp func(float64) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entryFloat64 + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate int he probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *Float64HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entryFloat64, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func(float64) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. +func (h *Float64HashTable) Insert(e *entryFloat64, v uint64, val float64, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. 
+func (h *Float64HashTable) VisitEntries(visit func(*entryFloat64)) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// Float64MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type Float64MemoTable struct { + tbl *Float64HashTable + nullIdx int32 +} + +// NewFloat64MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. +func NewFloat64MemoTable(num int64) *Float64MemoTable { + return &Float64MemoTable{tbl: NewFloat64HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *Float64MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *Float64MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *Float64MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. The found value will be true if there was already +// a null in the table, and false if it inserted one. +func (s *Float64MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. +func (s *Float64MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *Float64MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]float64)) +} + +func (s *Float64MemoTable) WriteOut(out []byte) { + s.tbl.WriteOut(out) +} + +func (s *Float64MemoTable) WriteOutSubset(start int, out []byte) { + s.tbl.WriteOutSubset(start, out) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *Float64MemoTable) Get(val interface{}) (int, bool) { + var cmp func(float64) bool + if math.IsNaN(val.(float64)) { + cmp = math.IsNaN + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. NaN is NaN is NaN + val = math.NaN() + } else { + cmp = func(v float64) bool { return val.(float64) == v } + } + + h := hashFloat64(val.(float64), 0) + if e, ok := s.tbl.Lookup(h, cmp); ok { + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). 
+func (s *Float64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + + var cmp func(float64) bool + if math.IsNaN(val.(float64)) { + cmp = math.IsNaN + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. NaN is NaN is NaN + val = math.NaN() + } else { + cmp = func(v float64) bool { return val.(float64) == v } + } + + h := hashFloat64(val.(float64), 0) + e, ok := s.tbl.Lookup(h, cmp) + + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.(float64), int32(idx)) + } + return +} diff --git a/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.gen.go.tmpl b/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.gen.go.tmpl new file mode 100644 index 000000000..518ba5af0 --- /dev/null +++ b/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.gen.go.tmpl @@ -0,0 +1,327 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hashing + +import ( + "github.com/apache/arrow/go/v6/arrow/bitutil" + "github.com/apache/arrow/go/v6/parquet/internal/utils" +) + +{{range .In}} +type payload{{.Name}} struct { + val {{.name}} + memoIdx int32 +} + +type entry{{.Name}} struct { + h uint64 + payload payload{{.Name}} +} + +func (e entry{{.Name}}) Valid() bool { return e.h != sentinel } + +// {{.Name}}HashTable is a hashtable specifically for {{.name}} that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type {{.Name}}HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entry{{.Name}} +} + +// New{{.Name}}HashTable returns a new hash table for {{.name}} values +// initialized with the passed in capacity or 32 whichever is larger. +func New{{.Name}}HashTable(cap uint64) *{{.Name}}HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &{{.Name}}HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entry{{.Name}}, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. 
+func (h *{{.Name}}HashTable) Reset(cap uint64) { + h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + h.capMask = h.cap - 1 + h.size = 0 + h.entries = make([]entry{{.Name}}, h.cap) +} + +// CopyValues is used for copying the values out of the hash table into the +// passed in slice, in the order that they were first inserted +func (h *{{.Name}}HashTable) CopyValues(out []{{.name}}) { + h.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies a subset of the values in the hashtable out, starting +// with the value at start, in the order that they were inserted. +func (h *{{.Name}}HashTable) CopyValuesSubset(start int, out []{{.name}}) { + h.VisitEntries(func(e *entry{{.Name}}) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *{{.Name}}HashTable) WriteOut(out []byte) { + h.WriteOutSubset(0, out) +} + +func (h *{{.Name}}HashTable) WriteOutSubset(start int, out []byte) { + data := arrow.{{.Name}}Traits.CastFromBytes(out) + h.VisitEntries(func(e *entry{{.Name}}) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + data[idx] = utils.ToLE{{.Name}}(e.payload.val) + } + }) +} + +func (h *{{.Name}}HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func ({{.Name}}HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. +func (h *{{.Name}}HashTable) Lookup(v uint64, cmp func({{.name}}) bool) (*entry{{.Name}}, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *{{.Name}}HashTable) lookup(v uint64, szMask uint64, cmp func({{.name}}) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entry{{.Name}} + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate int he probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *{{.Name}}HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entry{{.Name}}, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func({{.name}}) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. +func (h *{{.Name}}HashTable) Insert(e *entry{{.Name}}, v uint64, val {{.name}}, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. 
+func (h *{{.Name}}HashTable) VisitEntries(visit func(*entry{{.Name}})) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// {{.Name}}MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type {{.Name}}MemoTable struct { + tbl *{{.Name}}HashTable + nullIdx int32 +} + +// New{{.Name}}MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. +func New{{.Name}}MemoTable(num int64) *{{.Name}}MemoTable { + return &{{.Name}}MemoTable{tbl: New{{.Name}}HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *{{.Name}}MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *{{.Name}}MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *{{.Name}}MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. The found value will be true if there was already +// a null in the table, and false if it inserted one. +func (s *{{.Name}}MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. +func (s *{{.Name}}MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *{{.Name}}MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]{{.name}})) +} + +func (s *{{.Name}}MemoTable) WriteOut(out []byte) { + s.tbl.WriteOut(out) +} + +func (s *{{.Name}}MemoTable) WriteOutSubset(start int, out []byte) { + s.tbl.WriteOutSubset(start, out) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *{{.Name}}MemoTable) Get(val interface{}) (int, bool) { +{{if or (eq .Name "Int32") (eq .Name "Int64") }} + h := hashInt(uint64(val.({{.name}})), 0) + if e, ok := s.tbl.Lookup(h, func(v {{.name}}) bool { return val.({{.name}}) == v }); ok { +{{ else -}} + var cmp func({{.name}}) bool + {{if eq .Name "Float32"}} + if math.IsNaN(float64(val.(float32))) { + cmp = isNan32Cmp + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. NaN is NaN is NaN + val = float32(math.NaN()) + {{ else -}} + if math.IsNaN(val.(float64)) { + cmp = math.IsNaN + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. 
NaN is NaN is NaN + val = math.NaN() + {{end -}} + } else { + cmp = func(v {{.name}}) bool { return val.({{.name}}) == v } + } + + h := hash{{.Name}}(val.({{.name}}), 0) + if e, ok := s.tbl.Lookup(h, cmp); ok { +{{ end -}} + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). +func (s *{{.Name}}MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + {{if or (eq .Name "Int32") (eq .Name "Int64") }} + h := hashInt(uint64(val.({{.name}})), 0) + e, ok := s.tbl.Lookup(h, func(v {{.name}}) bool { + return val.({{.name}}) == v + }) +{{ else }} + var cmp func({{.name}}) bool + {{if eq .Name "Float32"}} + if math.IsNaN(float64(val.(float32))) { + cmp = isNan32Cmp + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. NaN is NaN is NaN + val = float32(math.NaN()) + {{ else -}} + if math.IsNaN(val.(float64)) { + cmp = math.IsNaN + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. NaN is NaN is NaN + val = math.NaN() + {{end -}} + } else { + cmp = func(v {{.name}}) bool { return val.({{.name}}) == v } + } + + h := hash{{.Name}}(val.({{.name}}), 0) + e, ok := s.tbl.Lookup(h, cmp) +{{ end }} + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.({{.name}}), int32(idx)) + } + return +} +{{end}} diff --git a/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.go b/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.go new file mode 100644 index 000000000..54d245187 --- /dev/null +++ b/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.go @@ -0,0 +1,400 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package hashing provides utilities for and an implementation of a hash +// table which is more performant than the default go map implementation +// by leveraging xxh3 and some custom hash functions. 
+package hashing + +import ( + "bytes" + "math" + "math/bits" + "reflect" + "unsafe" + + "github.com/apache/arrow/go/v6/arrow" + "github.com/apache/arrow/go/v6/arrow/array" + "github.com/apache/arrow/go/v6/arrow/memory" + "github.com/apache/arrow/go/v6/parquet" + + "github.com/zeebo/xxh3" +) + +//go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=types.tmpldata xxh3_memo_table.gen.go.tmpl + +func hashInt(val uint64, alg uint64) uint64 { + // Two of xxhash's prime multipliers (which are chosen for their + // bit dispersion properties) + var multipliers = [2]uint64{11400714785074694791, 14029467366897019727} + // Multiplying by the prime number mixes the low bits into the high bits, + // then byte-swapping (which is a single CPU instruction) allows the + // combined high and low bits to participate in the initial hash table index. + return bits.ReverseBytes64(multipliers[alg] * val) +} + +func hashFloat32(val float32, alg uint64) uint64 { + // grab the raw byte pattern of the + bt := *(*[4]byte)(unsafe.Pointer(&val)) + x := uint64(*(*uint32)(unsafe.Pointer(&bt[0]))) + hx := hashInt(x, alg) + hy := hashInt(x, alg^1) + return 4 ^ hx ^ hy +} + +func hashFloat64(val float64, alg uint64) uint64 { + bt := *(*[8]byte)(unsafe.Pointer(&val)) + hx := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[4]))), alg) + hy := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[0]))), alg^1) + return 8 ^ hx ^ hy +} + +func hashString(val string, alg uint64) uint64 { + buf := *(*[]byte)(unsafe.Pointer(&val)) + (*reflect.SliceHeader)(unsafe.Pointer(&buf)).Cap = len(val) + return hash(buf, alg) +} + +// prime constants used for slightly increasing the hash quality further +var exprimes = [2]uint64{1609587929392839161, 9650029242287828579} + +// for smaller amounts of bytes this is faster than even calling into +// xxh3 to do the hash, so we specialize in order to get the benefits +// of that performance. +func hash(b []byte, alg uint64) uint64 { + n := uint32(len(b)) + if n <= 16 { + switch { + case n > 8: + // 8 < length <= 16 + // apply same principle as above, but as two 64-bit ints + x := *(*uint64)(unsafe.Pointer(&b[n-8])) + y := *(*uint64)(unsafe.Pointer(&b[0])) + hx := hashInt(x, alg) + hy := hashInt(y, alg^1) + return uint64(n) ^ hx ^ hy + case n >= 4: + // 4 < length <= 8 + // we can read the bytes as two overlapping 32-bit ints, apply different + // hash functions to each in parallel + // then xor the results + x := *(*uint32)(unsafe.Pointer(&b[n-4])) + y := *(*uint32)(unsafe.Pointer(&b[0])) + hx := hashInt(uint64(x), alg) + hy := hashInt(uint64(y), alg^1) + return uint64(n) ^ hx ^ hy + case n > 0: + x := uint32((n << 24) ^ (uint32(b[0]) << 16) ^ (uint32(b[n/2]) << 8) ^ uint32(b[n-1])) + return hashInt(uint64(x), alg) + case n == 0: + return 1 + } + } + + // increase differentiation enough to improve hash quality + return xxh3.Hash(b) + exprimes[alg] +} + +const ( + sentinel uint64 = 0 + loadFactor int64 = 2 +) + +func max(a, b uint64) uint64 { + if a > b { + return a + } + return b +} + +var isNan32Cmp = func(v float32) bool { return math.IsNaN(float64(v)) } + +// KeyNotFound is the constant returned by memo table functions when a key isn't found in the table +const KeyNotFound = -1 + +// BinaryMemoTable is our hashtable for binary data using the BinaryBuilder +// to construct the actual data in an easy to pass around way with minimal copies +// while using a hash table to keep track of the indexes into the dictionary that +// is created as we go. 
+type BinaryMemoTable struct { + tbl *Int32HashTable + builder *array.BinaryBuilder + nullIdx int +} + +// NewBinaryMemoTable returns a hash table for Binary data, the passed in allocator will +// be utilized for the BinaryBuilder, if nil then memory.DefaultAllocator will be used. +// initial and valuesize can be used to pre-allocate the table to reduce allocations. With +// initial being the initial number of entries to allocate for and valuesize being the starting +// amount of space allocated for writing the actual binary data. +func NewBinaryMemoTable(mem memory.Allocator, initial, valuesize int) *BinaryMemoTable { + if mem == nil { + mem = memory.DefaultAllocator + } + bldr := array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + bldr.Reserve(int(initial)) + datasize := valuesize + if datasize <= 0 { + datasize = initial * 4 + } + bldr.ReserveData(datasize) + return &BinaryMemoTable{tbl: NewInt32HashTable(uint64(initial)), builder: bldr, nullIdx: KeyNotFound} +} + +// Reset dumps all of the data in the table allowing it to be reutilized. +func (s *BinaryMemoTable) Reset() { + s.tbl.Reset(32) + s.builder.NewArray().Release() + s.builder.Reserve(int(32)) + s.builder.ReserveData(int(32) * 4) + s.nullIdx = KeyNotFound +} + +// GetNull returns the index of a null that has been inserted into the table or +// KeyNotFound. The bool returned will be true if there was a null inserted into +// the table, and false otherwise. +func (s *BinaryMemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// Size returns the current size of the memo table including the null value +// if one has been inserted. +func (s *BinaryMemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// helper function to easily return a byte slice for any given value +// regardless of the type if it's a []byte, parquet.ByteArray, +// parquet.FixedLenByteArray or string. +func (BinaryMemoTable) valAsByteSlice(val interface{}) []byte { + switch v := val.(type) { + case []byte: + return v + case parquet.ByteArray: + return *(*[]byte)(unsafe.Pointer(&v)) + case parquet.FixedLenByteArray: + return *(*[]byte)(unsafe.Pointer(&v)) + case string: + var out []byte + h := (*reflect.StringHeader)(unsafe.Pointer(&v)) + s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) + s.Data = h.Data + s.Len = h.Len + s.Cap = h.Len + return out + default: + panic("invalid type for binarymemotable") + } +} + +// helper function to get the hash value regardless of the underlying binary type +func (BinaryMemoTable) getHash(val interface{}) uint64 { + switch v := val.(type) { + case string: + return hashString(v, 0) + case []byte: + return hash(v, 0) + case parquet.ByteArray: + return hash(*(*[]byte)(unsafe.Pointer(&v)), 0) + case parquet.FixedLenByteArray: + return hash(*(*[]byte)(unsafe.Pointer(&v)), 0) + default: + panic("invalid type for binarymemotable") + } +} + +// helper function to append the given value to the builder regardless +// of the underlying binary type. 
+func (b *BinaryMemoTable) appendVal(val interface{}) { + switch v := val.(type) { + case string: + b.builder.AppendString(v) + case []byte: + b.builder.Append(v) + case parquet.ByteArray: + b.builder.Append(*(*[]byte)(unsafe.Pointer(&v))) + case parquet.FixedLenByteArray: + b.builder.Append(*(*[]byte)(unsafe.Pointer(&v))) + } +} + +func (b *BinaryMemoTable) lookup(h uint64, val []byte) (*entryInt32, bool) { + return b.tbl.Lookup(h, func(i int32) bool { + return bytes.Equal(val, b.builder.Value(int(i))) + }) +} + +// Get returns the index of the specified value in the table or KeyNotFound, +// and a boolean indicating whether it was found in the table. +func (b *BinaryMemoTable) Get(val interface{}) (int, bool) { + if p, ok := b.lookup(b.getHash(val), b.valAsByteSlice(val)); ok { + return int(p.payload.val), ok + } + return KeyNotFound, false +} + +// GetOrInsert returns the index of the given value in the table, if not found +// it is inserted into the table. The return value 'found' indicates whether the value +// was found in the table (true) or inserted (false) along with any possible error. +func (b *BinaryMemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + h := b.getHash(val) + p, found := b.lookup(h, b.valAsByteSlice(val)) + if found { + idx = int(p.payload.val) + } else { + idx = b.Size() + b.appendVal(val) + b.tbl.Insert(p, h, int32(idx), -1) + } + return +} + +// GetOrInsertNull retrieves the index of a null in the table or inserts +// null into the table, returning the index and a boolean indicating if it was +// found in the table (true) or was inserted (false). +func (b *BinaryMemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = b.GetNull() + if !found { + idx = b.Size() + b.nullIdx = idx + b.builder.AppendNull() + } + return +} + +// helper function to get the offset into the builder data for a given +// index value. +func (b *BinaryMemoTable) findOffset(idx int) uintptr { + val := b.builder.Value(idx) + for len(val) == 0 { + idx++ + if idx >= b.builder.Len() { + break + } + val = b.builder.Value(idx) + } + if len(val) != 0 { + return uintptr(unsafe.Pointer(&val[0])) + } + return uintptr(b.builder.DataLen()) + b.findOffset(0) +} + +// CopyOffsets copies the list of offsets into the passed in slice, the offsets +// being the start and end values of the underlying allocated bytes in the builder +// for the individual values of the table. out should be at least sized to Size()+1 +func (b *BinaryMemoTable) CopyOffsets(out []int8) { + b.CopyOffsetsSubset(0, out) +} + +// CopyOffsetsSubset is like CopyOffsets but instead of copying all of the offsets, +// it gets a subset of the offsets in the table starting at the index provided by "start". +func (b *BinaryMemoTable) CopyOffsetsSubset(start int, out []int8) { + if b.builder.Len() <= start { + return + } + + first := b.findOffset(0) + delta := b.findOffset(start) + for i := start; i < b.Size(); i++ { + offset := int8(b.findOffset(i) - delta) + out[i-start] = offset + } + + out[b.Size()-start] = int8(b.builder.DataLen() - int(delta) - int(first)) +} + +// CopyValues copies the raw binary data bytes out, out should be a []byte +// with at least ValuesSize bytes allocated to copy into. 
+func (b *BinaryMemoTable) CopyValues(out interface{}) { + b.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies the raw binary data bytes out starting with the value +// at the index start, out should be a []byte with at least ValuesSize bytes allocated +func (b *BinaryMemoTable) CopyValuesSubset(start int, out interface{}) { + var ( + first = b.findOffset(0) + offset = b.findOffset(int(start)) + length = b.builder.DataLen() - int(offset-first) + ) + + outval := out.([]byte) + copy(outval, b.builder.Value(start)[0:length]) +} + +func (b *BinaryMemoTable) WriteOut(out []byte) { + b.CopyValues(out) +} + +func (b *BinaryMemoTable) WriteOutSubset(start int, out []byte) { + b.CopyValuesSubset(start, out) +} + +// CopyFixedWidthValues exists to cope with the fact that the table doesn't keep +// track of the fixed width when inserting the null value the databuffer holds a +// zero length byte slice for the null value (if found) +func (b *BinaryMemoTable) CopyFixedWidthValues(start, width int, out []byte) { + if start >= b.Size() { + return + } + + null, exists := b.GetNull() + if !exists || null < start { + // nothing to skip, proceed as usual + b.CopyValuesSubset(start, out) + return + } + + var ( + leftOffset = b.findOffset(start) + nullOffset = b.findOffset(null) + leftSize = nullOffset - leftOffset + ) + + if leftSize > 0 { + copy(out, b.builder.Value(start)[0:leftSize]) + } + + rightSize := b.ValuesSize() - int(nullOffset) + if rightSize > 0 { + // skip the null fixed size value + copy(out[int(leftSize)+width:], b.builder.Value(int(nullOffset))[0:rightSize]) + } +} + +// VisitValues exists to run the visitFn on each value currently in the hash table. +func (b *BinaryMemoTable) VisitValues(start int, visitFn func([]byte)) { + for i := int(start); i < b.Size(); i++ { + visitFn(b.builder.Value(i)) + } +} + +// Release is used to tell the underlying builder that it can release the memory allocated +// when the reference count reaches 0, this is safe to be called from multiple goroutines +// simultaneously +func (b *BinaryMemoTable) Release() { b.builder.Release() } + +// Retain increases the ref count, it is safe to call it from multiple goroutines +// simultaneously. +func (b *BinaryMemoTable) Retain() { b.builder.Retain() } + +// ValuesSize returns the current total size of all the raw bytes that have been inserted +// into the memotable so far. +func (b *BinaryMemoTable) ValuesSize() int { return b.builder.DataLen() } |