author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
commit     e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree       64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/go/parquet/internal/hashing
parent     Initial commit. (diff)
download   ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
           ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2. (upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/go/parquet/internal/hashing')
5 files changed, 1962 insertions, 0 deletions
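The patch below vendors Apache Arrow's Go `hashing` package: xxh3-based hash functions plus generated per-type hash tables (`Int32HashTable`, `Int64HashTable`, `Float32HashTable`, `Float64HashTable`) and memo tables built on top of them. A memo table maps each distinct value to the index at which it was first inserted, which is the core primitive for building Parquet dictionaries. A minimal usage sketch of the generated memo-table API (illustrative only: the package is `internal`, so this import only compiles from within the vendored Arrow module):

```go
package main

import (
	"fmt"

	"github.com/apache/arrow/go/v6/parquet/internal/hashing"
)

func main() {
	// Capacity is rounded up to a power of two internally, with a
	// minimum of 32 entries, so 0 is a valid starting size.
	memo := hashing.NewInt32MemoTable(0)

	for _, v := range []int32{7, 3, 7, 9, 3} {
		idx, found, _ := memo.GetOrInsert(v)
		// First occurrences are assigned indices 0, 1, 2, ...;
		// repeats return the index from the first insertion.
		fmt.Printf("value=%d idx=%d found=%v\n", v, idx, found)
	}

	// Distinct values come back out in insertion order: [7 3 9].
	out := make([]int32, memo.Size())
	memo.CopyValues(out)
	fmt.Println(out)
}
```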
diff --git a/src/arrow/go/parquet/internal/hashing/hashing_test.go b/src/arrow/go/parquet/internal/hashing/hashing_test.go new file mode 100644 index 000000000..875424a9d --- /dev/null +++ b/src/arrow/go/parquet/internal/hashing/hashing_test.go @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hashing + +import ( + "math/rand" + "testing" + + "github.com/stretchr/testify/assert" +) + +func MakeDistinctIntegers(nvals int) map[int]bool { + r := rand.New(rand.NewSource(42)) + values := make(map[int]bool) + for len(values) < nvals { + values[r.Int()] = true + } + return values +} + +func MakeSequentialIntegers(nvals int) map[int]bool { + values := make(map[int]bool) + for i := 0; i < nvals; i++ { + values[i] = true + } + return values +} + +func MakeDistinctStrings(nvals int) map[string]bool { + values := make(map[string]bool) + + r := rand.New(rand.NewSource(42)) + + max := 'z' + min := '0' + for len(values) < nvals { + data := make([]byte, r.Intn(24)) + for idx := range data { + data[idx] = byte(r.Intn(int(max-min+1)) + int(min)) + } + values[string(data)] = true + } + return values +} + +func TestHashingQualityInt(t *testing.T) { + const nvalues = 10000 + + tests := []struct { + name string + values map[int]bool + quality float64 + }{ + {"distinct", MakeDistinctIntegers(nvalues), 0.96}, + {"sequential", MakeSequentialIntegers(nvalues), 0.96}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + hashes := make(map[uint64]bool) + for k := range tt.values { + hashes[hashInt(uint64(k), 0)] = true + hashes[hashInt(uint64(k), 1)] = true + } + assert.GreaterOrEqual(t, float64(len(hashes)), tt.quality*float64(2*len(tt.values))) + }) + } +} + +func TestHashingBoundsStrings(t *testing.T) { + sizes := []int{1, 2, 3, 4, 5, 7, 8, 9, 15, 16, 17, 18, 19, 20, 21} + for _, s := range sizes { + str := make([]byte, s) + for idx := range str { + str[idx] = uint8(idx) + } + + h := hash(str, 1) + diff := 0 + for i := 0; i < 120; i++ { + str[len(str)-1] = uint8(i) + if hash(str, 1) != h { + diff++ + } + } + assert.GreaterOrEqual(t, diff, 118) + } +} + +func TestHashingQualityString(t *testing.T) { + const nvalues = 10000 + values := MakeDistinctStrings(nvalues) + + hashes := make(map[uint64]bool) + for k := range values { + hashes[hashString(k, 0)] = true + hashes[hashString(k, 1)] = true + } + assert.GreaterOrEqual(t, float64(len(hashes)), 0.96*float64(2*len(values))) +} diff --git a/src/arrow/go/parquet/internal/hashing/types.tmpldata b/src/arrow/go/parquet/internal/hashing/types.tmpldata new file mode 100644 index 000000000..2e97e9814 --- /dev/null +++ b/src/arrow/go/parquet/internal/hashing/types.tmpldata @@ -0,0 +1,18 @@ +[ + { + "Name": "Int32", + "name": "int32" + }, + { + "Name": "Int64", + "name": "int64" + }, + 
{ + "Name": "Float32", + "name": "float32" + }, + { + "Name": "Float64", + "name": "float64" + } +] diff --git a/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.gen.go b/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.gen.go new file mode 100644 index 000000000..ca891dc33 --- /dev/null +++ b/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.gen.go @@ -0,0 +1,1103 @@ +// Code generated by xxh3_memo_table.gen.go.tmpl. DO NOT EDIT. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hashing + +import ( + "math" + + "github.com/apache/arrow/go/v6/arrow" + "github.com/apache/arrow/go/v6/arrow/bitutil" + "github.com/apache/arrow/go/v6/parquet/internal/utils" +) + +type payloadInt32 struct { + val int32 + memoIdx int32 +} + +type entryInt32 struct { + h uint64 + payload payloadInt32 +} + +func (e entryInt32) Valid() bool { return e.h != sentinel } + +// Int32HashTable is a hashtable specifically for int32 that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type Int32HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entryInt32 +} + +// NewInt32HashTable returns a new hash table for int32 values +// initialized with the passed in capacity or 32 whichever is larger. +func NewInt32HashTable(cap uint64) *Int32HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &Int32HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entryInt32, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. +func (h *Int32HashTable) Reset(cap uint64) { + h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + h.capMask = h.cap - 1 + h.size = 0 + h.entries = make([]entryInt32, h.cap) +} + +// CopyValues is used for copying the values out of the hash table into the +// passed in slice, in the order that they were first inserted +func (h *Int32HashTable) CopyValues(out []int32) { + h.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies a subset of the values in the hashtable out, starting +// with the value at start, in the order that they were inserted. 
+func (h *Int32HashTable) CopyValuesSubset(start int, out []int32) { + h.VisitEntries(func(e *entryInt32) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *Int32HashTable) WriteOut(out []byte) { + h.WriteOutSubset(0, out) +} + +func (h *Int32HashTable) WriteOutSubset(start int, out []byte) { + data := arrow.Int32Traits.CastFromBytes(out) + h.VisitEntries(func(e *entryInt32) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + data[idx] = utils.ToLEInt32(e.payload.val) + } + }) +} + +func (h *Int32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func (Int32HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. +func (h *Int32HashTable) Lookup(v uint64, cmp func(int32) bool) (*entryInt32, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *Int32HashTable) lookup(v uint64, szMask uint64, cmp func(int32) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entryInt32 + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate int he probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *Int32HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entryInt32, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func(int32) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. +func (h *Int32HashTable) Insert(e *entryInt32, v uint64, val int32, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. +func (h *Int32HashTable) VisitEntries(visit func(*entryInt32)) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// Int32MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type Int32MemoTable struct { + tbl *Int32HashTable + nullIdx int32 +} + +// NewInt32MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. 
+func NewInt32MemoTable(num int64) *Int32MemoTable { + return &Int32MemoTable{tbl: NewInt32HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *Int32MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *Int32MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *Int32MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. The found value will be true if there was already +// a null in the table, and false if it inserted one. +func (s *Int32MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. +func (s *Int32MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *Int32MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]int32)) +} + +func (s *Int32MemoTable) WriteOut(out []byte) { + s.tbl.WriteOut(out) +} + +func (s *Int32MemoTable) WriteOutSubset(start int, out []byte) { + s.tbl.WriteOutSubset(start, out) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *Int32MemoTable) Get(val interface{}) (int, bool) { + + h := hashInt(uint64(val.(int32)), 0) + if e, ok := s.tbl.Lookup(h, func(v int32) bool { return val.(int32) == v }); ok { + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). +func (s *Int32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + + h := hashInt(uint64(val.(int32)), 0) + e, ok := s.tbl.Lookup(h, func(v int32) bool { + return val.(int32) == v + }) + + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.(int32), int32(idx)) + } + return +} + +type payloadInt64 struct { + val int64 + memoIdx int32 +} + +type entryInt64 struct { + h uint64 + payload payloadInt64 +} + +func (e entryInt64) Valid() bool { return e.h != sentinel } + +// Int64HashTable is a hashtable specifically for int64 that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type Int64HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entryInt64 +} + +// NewInt64HashTable returns a new hash table for int64 values +// initialized with the passed in capacity or 32 whichever is larger. 
+func NewInt64HashTable(cap uint64) *Int64HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &Int64HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entryInt64, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. +func (h *Int64HashTable) Reset(cap uint64) { + h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + h.capMask = h.cap - 1 + h.size = 0 + h.entries = make([]entryInt64, h.cap) +} + +// CopyValues is used for copying the values out of the hash table into the +// passed in slice, in the order that they were first inserted +func (h *Int64HashTable) CopyValues(out []int64) { + h.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies a subset of the values in the hashtable out, starting +// with the value at start, in the order that they were inserted. +func (h *Int64HashTable) CopyValuesSubset(start int, out []int64) { + h.VisitEntries(func(e *entryInt64) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *Int64HashTable) WriteOut(out []byte) { + h.WriteOutSubset(0, out) +} + +func (h *Int64HashTable) WriteOutSubset(start int, out []byte) { + data := arrow.Int64Traits.CastFromBytes(out) + h.VisitEntries(func(e *entryInt64) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + data[idx] = utils.ToLEInt64(e.payload.val) + } + }) +} + +func (h *Int64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func (Int64HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. +func (h *Int64HashTable) Lookup(v uint64, cmp func(int64) bool) (*entryInt64, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *Int64HashTable) lookup(v uint64, szMask uint64, cmp func(int64) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entryInt64 + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate int he probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *Int64HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entryInt64, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func(int64) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. 
+func (h *Int64HashTable) Insert(e *entryInt64, v uint64, val int64, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. +func (h *Int64HashTable) VisitEntries(visit func(*entryInt64)) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// Int64MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type Int64MemoTable struct { + tbl *Int64HashTable + nullIdx int32 +} + +// NewInt64MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. +func NewInt64MemoTable(num int64) *Int64MemoTable { + return &Int64MemoTable{tbl: NewInt64HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *Int64MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *Int64MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *Int64MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. The found value will be true if there was already +// a null in the table, and false if it inserted one. +func (s *Int64MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. +func (s *Int64MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *Int64MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]int64)) +} + +func (s *Int64MemoTable) WriteOut(out []byte) { + s.tbl.WriteOut(out) +} + +func (s *Int64MemoTable) WriteOutSubset(start int, out []byte) { + s.tbl.WriteOutSubset(start, out) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *Int64MemoTable) Get(val interface{}) (int, bool) { + + h := hashInt(uint64(val.(int64)), 0) + if e, ok := s.tbl.Lookup(h, func(v int64) bool { return val.(int64) == v }); ok { + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). 
+func (s *Int64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + + h := hashInt(uint64(val.(int64)), 0) + e, ok := s.tbl.Lookup(h, func(v int64) bool { + return val.(int64) == v + }) + + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.(int64), int32(idx)) + } + return +} + +type payloadFloat32 struct { + val float32 + memoIdx int32 +} + +type entryFloat32 struct { + h uint64 + payload payloadFloat32 +} + +func (e entryFloat32) Valid() bool { return e.h != sentinel } + +// Float32HashTable is a hashtable specifically for float32 that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type Float32HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entryFloat32 +} + +// NewFloat32HashTable returns a new hash table for float32 values +// initialized with the passed in capacity or 32 whichever is larger. +func NewFloat32HashTable(cap uint64) *Float32HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &Float32HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entryFloat32, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. +func (h *Float32HashTable) Reset(cap uint64) { + h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + h.capMask = h.cap - 1 + h.size = 0 + h.entries = make([]entryFloat32, h.cap) +} + +// CopyValues is used for copying the values out of the hash table into the +// passed in slice, in the order that they were first inserted +func (h *Float32HashTable) CopyValues(out []float32) { + h.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies a subset of the values in the hashtable out, starting +// with the value at start, in the order that they were inserted. +func (h *Float32HashTable) CopyValuesSubset(start int, out []float32) { + h.VisitEntries(func(e *entryFloat32) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *Float32HashTable) WriteOut(out []byte) { + h.WriteOutSubset(0, out) +} + +func (h *Float32HashTable) WriteOutSubset(start int, out []byte) { + data := arrow.Float32Traits.CastFromBytes(out) + h.VisitEntries(func(e *entryFloat32) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + data[idx] = utils.ToLEFloat32(e.payload.val) + } + }) +} + +func (h *Float32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func (Float32HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. 
+func (h *Float32HashTable) Lookup(v uint64, cmp func(float32) bool) (*entryFloat32, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *Float32HashTable) lookup(v uint64, szMask uint64, cmp func(float32) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entryFloat32 + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate int he probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *Float32HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entryFloat32, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func(float32) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. +func (h *Float32HashTable) Insert(e *entryFloat32, v uint64, val float32, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. +func (h *Float32HashTable) VisitEntries(visit func(*entryFloat32)) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// Float32MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type Float32MemoTable struct { + tbl *Float32HashTable + nullIdx int32 +} + +// NewFloat32MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. +func NewFloat32MemoTable(num int64) *Float32MemoTable { + return &Float32MemoTable{tbl: NewFloat32HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *Float32MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *Float32MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *Float32MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. The found value will be true if there was already +// a null in the table, and false if it inserted one. 
+func (s *Float32MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. +func (s *Float32MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *Float32MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]float32)) +} + +func (s *Float32MemoTable) WriteOut(out []byte) { + s.tbl.WriteOut(out) +} + +func (s *Float32MemoTable) WriteOutSubset(start int, out []byte) { + s.tbl.WriteOutSubset(start, out) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *Float32MemoTable) Get(val interface{}) (int, bool) { + var cmp func(float32) bool + + if math.IsNaN(float64(val.(float32))) { + cmp = isNan32Cmp + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. NaN is NaN is NaN + val = float32(math.NaN()) + } else { + cmp = func(v float32) bool { return val.(float32) == v } + } + + h := hashFloat32(val.(float32), 0) + if e, ok := s.tbl.Lookup(h, cmp); ok { + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). +func (s *Float32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + + var cmp func(float32) bool + + if math.IsNaN(float64(val.(float32))) { + cmp = isNan32Cmp + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. NaN is NaN is NaN + val = float32(math.NaN()) + } else { + cmp = func(v float32) bool { return val.(float32) == v } + } + + h := hashFloat32(val.(float32), 0) + e, ok := s.tbl.Lookup(h, cmp) + + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.(float32), int32(idx)) + } + return +} + +type payloadFloat64 struct { + val float64 + memoIdx int32 +} + +type entryFloat64 struct { + h uint64 + payload payloadFloat64 +} + +func (e entryFloat64) Valid() bool { return e.h != sentinel } + +// Float64HashTable is a hashtable specifically for float64 that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type Float64HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entryFloat64 +} + +// NewFloat64HashTable returns a new hash table for float64 values +// initialized with the passed in capacity or 32 whichever is larger. +func NewFloat64HashTable(cap uint64) *Float64HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &Float64HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entryFloat64, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. 
+func (h *Float64HashTable) Reset(cap uint64) { + h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + h.capMask = h.cap - 1 + h.size = 0 + h.entries = make([]entryFloat64, h.cap) +} + +// CopyValues is used for copying the values out of the hash table into the +// passed in slice, in the order that they were first inserted +func (h *Float64HashTable) CopyValues(out []float64) { + h.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies a subset of the values in the hashtable out, starting +// with the value at start, in the order that they were inserted. +func (h *Float64HashTable) CopyValuesSubset(start int, out []float64) { + h.VisitEntries(func(e *entryFloat64) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *Float64HashTable) WriteOut(out []byte) { + h.WriteOutSubset(0, out) +} + +func (h *Float64HashTable) WriteOutSubset(start int, out []byte) { + data := arrow.Float64Traits.CastFromBytes(out) + h.VisitEntries(func(e *entryFloat64) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + data[idx] = utils.ToLEFloat64(e.payload.val) + } + }) +} + +func (h *Float64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func (Float64HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. +func (h *Float64HashTable) Lookup(v uint64, cmp func(float64) bool) (*entryFloat64, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *Float64HashTable) lookup(v uint64, szMask uint64, cmp func(float64) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entryFloat64 + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate int he probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *Float64HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entryFloat64, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func(float64) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. +func (h *Float64HashTable) Insert(e *entryFloat64, v uint64, val float64, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. 
+func (h *Float64HashTable) VisitEntries(visit func(*entryFloat64)) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// Float64MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type Float64MemoTable struct { + tbl *Float64HashTable + nullIdx int32 +} + +// NewFloat64MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. +func NewFloat64MemoTable(num int64) *Float64MemoTable { + return &Float64MemoTable{tbl: NewFloat64HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *Float64MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *Float64MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *Float64MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. The found value will be true if there was already +// a null in the table, and false if it inserted one. +func (s *Float64MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. +func (s *Float64MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *Float64MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]float64)) +} + +func (s *Float64MemoTable) WriteOut(out []byte) { + s.tbl.WriteOut(out) +} + +func (s *Float64MemoTable) WriteOutSubset(start int, out []byte) { + s.tbl.WriteOutSubset(start, out) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *Float64MemoTable) Get(val interface{}) (int, bool) { + var cmp func(float64) bool + if math.IsNaN(val.(float64)) { + cmp = math.IsNaN + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. NaN is NaN is NaN + val = math.NaN() + } else { + cmp = func(v float64) bool { return val.(float64) == v } + } + + h := hashFloat64(val.(float64), 0) + if e, ok := s.tbl.Lookup(h, cmp); ok { + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). 
+func (s *Float64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + + var cmp func(float64) bool + if math.IsNaN(val.(float64)) { + cmp = math.IsNaN + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. NaN is NaN is NaN + val = math.NaN() + } else { + cmp = func(v float64) bool { return val.(float64) == v } + } + + h := hashFloat64(val.(float64), 0) + e, ok := s.tbl.Lookup(h, cmp) + + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.(float64), int32(idx)) + } + return +} diff --git a/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.gen.go.tmpl b/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.gen.go.tmpl new file mode 100644 index 000000000..518ba5af0 --- /dev/null +++ b/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.gen.go.tmpl @@ -0,0 +1,327 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hashing + +import ( + "github.com/apache/arrow/go/v6/arrow/bitutil" + "github.com/apache/arrow/go/v6/parquet/internal/utils" +) + +{{range .In}} +type payload{{.Name}} struct { + val {{.name}} + memoIdx int32 +} + +type entry{{.Name}} struct { + h uint64 + payload payload{{.Name}} +} + +func (e entry{{.Name}}) Valid() bool { return e.h != sentinel } + +// {{.Name}}HashTable is a hashtable specifically for {{.name}} that +// is utilized with the MemoTable to generalize interactions for easier +// implementation of dictionaries without losing performance. +type {{.Name}}HashTable struct { + cap uint64 + capMask uint64 + size uint64 + + entries []entry{{.Name}} +} + +// New{{.Name}}HashTable returns a new hash table for {{.name}} values +// initialized with the passed in capacity or 32 whichever is larger. +func New{{.Name}}HashTable(cap uint64) *{{.Name}}HashTable { + initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + ret := &{{.Name}}HashTable{cap: initCap, capMask: initCap - 1, size: 0} + ret.entries = make([]entry{{.Name}}, initCap) + return ret +} + +// Reset drops all of the values in this hash table and re-initializes it +// with the specified initial capacity as if by calling New, but without having +// to reallocate the object. 
+func (h *{{.Name}}HashTable) Reset(cap uint64) { + h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32)))) + h.capMask = h.cap - 1 + h.size = 0 + h.entries = make([]entry{{.Name}}, h.cap) +} + +// CopyValues is used for copying the values out of the hash table into the +// passed in slice, in the order that they were first inserted +func (h *{{.Name}}HashTable) CopyValues(out []{{.name}}) { + h.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies a subset of the values in the hashtable out, starting +// with the value at start, in the order that they were inserted. +func (h *{{.Name}}HashTable) CopyValuesSubset(start int, out []{{.name}}) { + h.VisitEntries(func(e *entry{{.Name}}) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + out[idx] = e.payload.val + } + }) +} + +func (h *{{.Name}}HashTable) WriteOut(out []byte) { + h.WriteOutSubset(0, out) +} + +func (h *{{.Name}}HashTable) WriteOutSubset(start int, out []byte) { + data := arrow.{{.Name}}Traits.CastFromBytes(out) + h.VisitEntries(func(e *entry{{.Name}}) { + idx := e.payload.memoIdx - int32(start) + if idx >= 0 { + data[idx] = utils.ToLE{{.Name}}(e.payload.val) + } + }) +} + +func (h *{{.Name}}HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap } + +func ({{.Name}}HashTable) fixHash(v uint64) uint64 { + if v == sentinel { + return 42 + } + return v +} + +// Lookup retrieves the entry for a given hash value assuming it's payload value returns +// true when passed to the cmp func. Returns a pointer to the entry for the given hash value, +// and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false. +func (h *{{.Name}}HashTable) Lookup(v uint64, cmp func({{.name}}) bool) (*entry{{.Name}}, bool) { + idx, ok := h.lookup(v, h.capMask, cmp) + return &h.entries[idx], ok +} + +func (h *{{.Name}}HashTable) lookup(v uint64, szMask uint64, cmp func({{.name}}) bool) (uint64, bool) { + const perturbShift uint8 = 5 + + var ( + idx uint64 + perturb uint64 + e *entry{{.Name}} + ) + + v = h.fixHash(v) + idx = v & szMask + perturb = (v >> uint64(perturbShift)) + 1 + + for { + e = &h.entries[idx] + if e.h == v && cmp(e.payload.val) { + return idx, true + } + + if e.h == sentinel { + return idx, false + } + + // perturbation logic inspired from CPython's set/dict object + // the goal is that all 64 bits of unmasked hash value eventually + // participate int he probing sequence, to minimize clustering + idx = (idx + perturb) & szMask + perturb = (perturb >> uint64(perturbShift)) + 1 + } +} + +func (h *{{.Name}}HashTable) upsize(newcap uint64) error { + newMask := newcap - 1 + + oldEntries := h.entries + h.entries = make([]entry{{.Name}}, newcap) + for _, e := range oldEntries { + if e.Valid() { + idx, _ := h.lookup(e.h, newMask, func({{.name}}) bool { return false }) + h.entries[idx] = e + } + } + h.cap = newcap + h.capMask = newMask + return nil +} + +// Insert updates the given entry with the provided hash value, payload value and memo index. +// The entry pointer must have been retrieved via lookup in order to actually insert properly. +func (h *{{.Name}}HashTable) Insert(e *entry{{.Name}}, v uint64, val {{.name}}, memoIdx int32) error { + e.h = h.fixHash(v) + e.payload.val = val + e.payload.memoIdx = memoIdx + h.size++ + + if h.needUpsize() { + h.upsize(h.cap * uint64(loadFactor) * 2) + } + return nil +} + +// VisitEntries will call the passed in function on each *valid* entry in the hash table, +// a valid entry being one which has had a value inserted into it. 
+func (h *{{.Name}}HashTable) VisitEntries(visit func(*entry{{.Name}})) { + for _, e := range h.entries { + if e.Valid() { + visit(&e) + } + } +} + +// {{.Name}}MemoTable is a wrapper over the appropriate hashtable to provide an interface +// conforming to the MemoTable interface defined in the encoding package for general interactions +// regarding dictionaries. +type {{.Name}}MemoTable struct { + tbl *{{.Name}}HashTable + nullIdx int32 +} + +// New{{.Name}}MemoTable returns a new memotable with num entries pre-allocated to reduce further +// allocations when inserting. +func New{{.Name}}MemoTable(num int64) *{{.Name}}MemoTable { + return &{{.Name}}MemoTable{tbl: New{{.Name}}HashTable(uint64(num)), nullIdx: KeyNotFound} +} + +// Reset allows this table to be re-used by dumping all the data currently in the table. +func (s *{{.Name}}MemoTable) Reset() { + s.tbl.Reset(32) + s.nullIdx = KeyNotFound +} + +// Size returns the current number of inserted elements into the table including if a null +// has been inserted. +func (s *{{.Name}}MemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// GetNull returns the index of an inserted null or KeyNotFound along with a bool +// that will be true if found and false if not. +func (s *{{.Name}}MemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// GetOrInsertNull will return the index of the null entry or insert a null entry +// if one currently doesn't exist. The found value will be true if there was already +// a null in the table, and false if it inserted one. +func (s *{{.Name}}MemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = s.GetNull() + if !found { + idx = s.Size() + s.nullIdx = int32(idx) + } + return +} + +// CopyValues will copy the values from the memo table out into the passed in slice +// which must be of the appropriate type. +func (s *{{.Name}}MemoTable) CopyValues(out interface{}) { + s.CopyValuesSubset(0, out) +} + +// CopyValuesSubset is like CopyValues but only copies a subset of values starting +// at the provided start index +func (s *{{.Name}}MemoTable) CopyValuesSubset(start int, out interface{}) { + s.tbl.CopyValuesSubset(start, out.([]{{.name}})) +} + +func (s *{{.Name}}MemoTable) WriteOut(out []byte) { + s.tbl.WriteOut(out) +} + +func (s *{{.Name}}MemoTable) WriteOutSubset(start int, out []byte) { + s.tbl.WriteOutSubset(start, out) +} + +// Get returns the index of the requested value in the hash table or KeyNotFound +// along with a boolean indicating if it was found or not. +func (s *{{.Name}}MemoTable) Get(val interface{}) (int, bool) { +{{if or (eq .Name "Int32") (eq .Name "Int64") }} + h := hashInt(uint64(val.({{.name}})), 0) + if e, ok := s.tbl.Lookup(h, func(v {{.name}}) bool { return val.({{.name}}) == v }); ok { +{{ else -}} + var cmp func({{.name}}) bool + {{if eq .Name "Float32"}} + if math.IsNaN(float64(val.(float32))) { + cmp = isNan32Cmp + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. NaN is NaN is NaN + val = float32(math.NaN()) + {{ else -}} + if math.IsNaN(val.(float64)) { + cmp = math.IsNaN + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. 
NaN is NaN is NaN + val = math.NaN() + {{end -}} + } else { + cmp = func(v {{.name}}) bool { return val.({{.name}}) == v } + } + + h := hash{{.Name}}(val.({{.name}}), 0) + if e, ok := s.tbl.Lookup(h, cmp); ok { +{{ end -}} + return int(e.payload.memoIdx), ok + } + return KeyNotFound, false +} + +// GetOrInsert will return the index of the specified value in the table, or insert the +// value into the table and return the new index. found indicates whether or not it already +// existed in the table (true) or was inserted by this call (false). +func (s *{{.Name}}MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + {{if or (eq .Name "Int32") (eq .Name "Int64") }} + h := hashInt(uint64(val.({{.name}})), 0) + e, ok := s.tbl.Lookup(h, func(v {{.name}}) bool { + return val.({{.name}}) == v + }) +{{ else }} + var cmp func({{.name}}) bool + {{if eq .Name "Float32"}} + if math.IsNaN(float64(val.(float32))) { + cmp = isNan32Cmp + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. NaN is NaN is NaN + val = float32(math.NaN()) + {{ else -}} + if math.IsNaN(val.(float64)) { + cmp = math.IsNaN + // use consistent internal bit pattern for NaN regardless of the pattern + // that is passed to us. NaN is NaN is NaN + val = math.NaN() + {{end -}} + } else { + cmp = func(v {{.name}}) bool { return val.({{.name}}) == v } + } + + h := hash{{.Name}}(val.({{.name}}), 0) + e, ok := s.tbl.Lookup(h, cmp) +{{ end }} + if ok { + idx = int(e.payload.memoIdx) + found = true + } else { + idx = s.Size() + s.tbl.Insert(e, h, val.({{.name}}), int32(idx)) + } + return +} +{{end}} diff --git a/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.go b/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.go new file mode 100644 index 000000000..54d245187 --- /dev/null +++ b/src/arrow/go/parquet/internal/hashing/xxh3_memo_table.go @@ -0,0 +1,400 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package hashing provides utilities for and an implementation of a hash +// table which is more performant than the default go map implementation +// by leveraging xxh3 and some custom hash functions. 
+package hashing + +import ( + "bytes" + "math" + "math/bits" + "reflect" + "unsafe" + + "github.com/apache/arrow/go/v6/arrow" + "github.com/apache/arrow/go/v6/arrow/array" + "github.com/apache/arrow/go/v6/arrow/memory" + "github.com/apache/arrow/go/v6/parquet" + + "github.com/zeebo/xxh3" +) + +//go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=types.tmpldata xxh3_memo_table.gen.go.tmpl + +func hashInt(val uint64, alg uint64) uint64 { + // Two of xxhash's prime multipliers (which are chosen for their + // bit dispersion properties) + var multipliers = [2]uint64{11400714785074694791, 14029467366897019727} + // Multiplying by the prime number mixes the low bits into the high bits, + // then byte-swapping (which is a single CPU instruction) allows the + // combined high and low bits to participate in the initial hash table index. + return bits.ReverseBytes64(multipliers[alg] * val) +} + +func hashFloat32(val float32, alg uint64) uint64 { + // grab the raw byte pattern of the + bt := *(*[4]byte)(unsafe.Pointer(&val)) + x := uint64(*(*uint32)(unsafe.Pointer(&bt[0]))) + hx := hashInt(x, alg) + hy := hashInt(x, alg^1) + return 4 ^ hx ^ hy +} + +func hashFloat64(val float64, alg uint64) uint64 { + bt := *(*[8]byte)(unsafe.Pointer(&val)) + hx := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[4]))), alg) + hy := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[0]))), alg^1) + return 8 ^ hx ^ hy +} + +func hashString(val string, alg uint64) uint64 { + buf := *(*[]byte)(unsafe.Pointer(&val)) + (*reflect.SliceHeader)(unsafe.Pointer(&buf)).Cap = len(val) + return hash(buf, alg) +} + +// prime constants used for slightly increasing the hash quality further +var exprimes = [2]uint64{1609587929392839161, 9650029242287828579} + +// for smaller amounts of bytes this is faster than even calling into +// xxh3 to do the hash, so we specialize in order to get the benefits +// of that performance. +func hash(b []byte, alg uint64) uint64 { + n := uint32(len(b)) + if n <= 16 { + switch { + case n > 8: + // 8 < length <= 16 + // apply same principle as above, but as two 64-bit ints + x := *(*uint64)(unsafe.Pointer(&b[n-8])) + y := *(*uint64)(unsafe.Pointer(&b[0])) + hx := hashInt(x, alg) + hy := hashInt(y, alg^1) + return uint64(n) ^ hx ^ hy + case n >= 4: + // 4 < length <= 8 + // we can read the bytes as two overlapping 32-bit ints, apply different + // hash functions to each in parallel + // then xor the results + x := *(*uint32)(unsafe.Pointer(&b[n-4])) + y := *(*uint32)(unsafe.Pointer(&b[0])) + hx := hashInt(uint64(x), alg) + hy := hashInt(uint64(y), alg^1) + return uint64(n) ^ hx ^ hy + case n > 0: + x := uint32((n << 24) ^ (uint32(b[0]) << 16) ^ (uint32(b[n/2]) << 8) ^ uint32(b[n-1])) + return hashInt(uint64(x), alg) + case n == 0: + return 1 + } + } + + // increase differentiation enough to improve hash quality + return xxh3.Hash(b) + exprimes[alg] +} + +const ( + sentinel uint64 = 0 + loadFactor int64 = 2 +) + +func max(a, b uint64) uint64 { + if a > b { + return a + } + return b +} + +var isNan32Cmp = func(v float32) bool { return math.IsNaN(float64(v)) } + +// KeyNotFound is the constant returned by memo table functions when a key isn't found in the table +const KeyNotFound = -1 + +// BinaryMemoTable is our hashtable for binary data using the BinaryBuilder +// to construct the actual data in an easy to pass around way with minimal copies +// while using a hash table to keep track of the indexes into the dictionary that +// is created as we go. 
+type BinaryMemoTable struct { + tbl *Int32HashTable + builder *array.BinaryBuilder + nullIdx int +} + +// NewBinaryMemoTable returns a hash table for Binary data, the passed in allocator will +// be utilized for the BinaryBuilder, if nil then memory.DefaultAllocator will be used. +// initial and valuesize can be used to pre-allocate the table to reduce allocations. With +// initial being the initial number of entries to allocate for and valuesize being the starting +// amount of space allocated for writing the actual binary data. +func NewBinaryMemoTable(mem memory.Allocator, initial, valuesize int) *BinaryMemoTable { + if mem == nil { + mem = memory.DefaultAllocator + } + bldr := array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + bldr.Reserve(int(initial)) + datasize := valuesize + if datasize <= 0 { + datasize = initial * 4 + } + bldr.ReserveData(datasize) + return &BinaryMemoTable{tbl: NewInt32HashTable(uint64(initial)), builder: bldr, nullIdx: KeyNotFound} +} + +// Reset dumps all of the data in the table allowing it to be reutilized. +func (s *BinaryMemoTable) Reset() { + s.tbl.Reset(32) + s.builder.NewArray().Release() + s.builder.Reserve(int(32)) + s.builder.ReserveData(int(32) * 4) + s.nullIdx = KeyNotFound +} + +// GetNull returns the index of a null that has been inserted into the table or +// KeyNotFound. The bool returned will be true if there was a null inserted into +// the table, and false otherwise. +func (s *BinaryMemoTable) GetNull() (int, bool) { + return int(s.nullIdx), s.nullIdx != KeyNotFound +} + +// Size returns the current size of the memo table including the null value +// if one has been inserted. +func (s *BinaryMemoTable) Size() int { + sz := int(s.tbl.size) + if _, ok := s.GetNull(); ok { + sz++ + } + return sz +} + +// helper function to easily return a byte slice for any given value +// regardless of the type if it's a []byte, parquet.ByteArray, +// parquet.FixedLenByteArray or string. +func (BinaryMemoTable) valAsByteSlice(val interface{}) []byte { + switch v := val.(type) { + case []byte: + return v + case parquet.ByteArray: + return *(*[]byte)(unsafe.Pointer(&v)) + case parquet.FixedLenByteArray: + return *(*[]byte)(unsafe.Pointer(&v)) + case string: + var out []byte + h := (*reflect.StringHeader)(unsafe.Pointer(&v)) + s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) + s.Data = h.Data + s.Len = h.Len + s.Cap = h.Len + return out + default: + panic("invalid type for binarymemotable") + } +} + +// helper function to get the hash value regardless of the underlying binary type +func (BinaryMemoTable) getHash(val interface{}) uint64 { + switch v := val.(type) { + case string: + return hashString(v, 0) + case []byte: + return hash(v, 0) + case parquet.ByteArray: + return hash(*(*[]byte)(unsafe.Pointer(&v)), 0) + case parquet.FixedLenByteArray: + return hash(*(*[]byte)(unsafe.Pointer(&v)), 0) + default: + panic("invalid type for binarymemotable") + } +} + +// helper function to append the given value to the builder regardless +// of the underlying binary type. 
+func (b *BinaryMemoTable) appendVal(val interface{}) { + switch v := val.(type) { + case string: + b.builder.AppendString(v) + case []byte: + b.builder.Append(v) + case parquet.ByteArray: + b.builder.Append(*(*[]byte)(unsafe.Pointer(&v))) + case parquet.FixedLenByteArray: + b.builder.Append(*(*[]byte)(unsafe.Pointer(&v))) + } +} + +func (b *BinaryMemoTable) lookup(h uint64, val []byte) (*entryInt32, bool) { + return b.tbl.Lookup(h, func(i int32) bool { + return bytes.Equal(val, b.builder.Value(int(i))) + }) +} + +// Get returns the index of the specified value in the table or KeyNotFound, +// and a boolean indicating whether it was found in the table. +func (b *BinaryMemoTable) Get(val interface{}) (int, bool) { + if p, ok := b.lookup(b.getHash(val), b.valAsByteSlice(val)); ok { + return int(p.payload.val), ok + } + return KeyNotFound, false +} + +// GetOrInsert returns the index of the given value in the table, if not found +// it is inserted into the table. The return value 'found' indicates whether the value +// was found in the table (true) or inserted (false) along with any possible error. +func (b *BinaryMemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { + h := b.getHash(val) + p, found := b.lookup(h, b.valAsByteSlice(val)) + if found { + idx = int(p.payload.val) + } else { + idx = b.Size() + b.appendVal(val) + b.tbl.Insert(p, h, int32(idx), -1) + } + return +} + +// GetOrInsertNull retrieves the index of a null in the table or inserts +// null into the table, returning the index and a boolean indicating if it was +// found in the table (true) or was inserted (false). +func (b *BinaryMemoTable) GetOrInsertNull() (idx int, found bool) { + idx, found = b.GetNull() + if !found { + idx = b.Size() + b.nullIdx = idx + b.builder.AppendNull() + } + return +} + +// helper function to get the offset into the builder data for a given +// index value. +func (b *BinaryMemoTable) findOffset(idx int) uintptr { + val := b.builder.Value(idx) + for len(val) == 0 { + idx++ + if idx >= b.builder.Len() { + break + } + val = b.builder.Value(idx) + } + if len(val) != 0 { + return uintptr(unsafe.Pointer(&val[0])) + } + return uintptr(b.builder.DataLen()) + b.findOffset(0) +} + +// CopyOffsets copies the list of offsets into the passed in slice, the offsets +// being the start and end values of the underlying allocated bytes in the builder +// for the individual values of the table. out should be at least sized to Size()+1 +func (b *BinaryMemoTable) CopyOffsets(out []int8) { + b.CopyOffsetsSubset(0, out) +} + +// CopyOffsetsSubset is like CopyOffsets but instead of copying all of the offsets, +// it gets a subset of the offsets in the table starting at the index provided by "start". +func (b *BinaryMemoTable) CopyOffsetsSubset(start int, out []int8) { + if b.builder.Len() <= start { + return + } + + first := b.findOffset(0) + delta := b.findOffset(start) + for i := start; i < b.Size(); i++ { + offset := int8(b.findOffset(i) - delta) + out[i-start] = offset + } + + out[b.Size()-start] = int8(b.builder.DataLen() - int(delta) - int(first)) +} + +// CopyValues copies the raw binary data bytes out, out should be a []byte +// with at least ValuesSize bytes allocated to copy into. 
+func (b *BinaryMemoTable) CopyValues(out interface{}) { + b.CopyValuesSubset(0, out) +} + +// CopyValuesSubset copies the raw binary data bytes out starting with the value +// at the index start, out should be a []byte with at least ValuesSize bytes allocated +func (b *BinaryMemoTable) CopyValuesSubset(start int, out interface{}) { + var ( + first = b.findOffset(0) + offset = b.findOffset(int(start)) + length = b.builder.DataLen() - int(offset-first) + ) + + outval := out.([]byte) + copy(outval, b.builder.Value(start)[0:length]) +} + +func (b *BinaryMemoTable) WriteOut(out []byte) { + b.CopyValues(out) +} + +func (b *BinaryMemoTable) WriteOutSubset(start int, out []byte) { + b.CopyValuesSubset(start, out) +} + +// CopyFixedWidthValues exists to cope with the fact that the table doesn't keep +// track of the fixed width when inserting the null value the databuffer holds a +// zero length byte slice for the null value (if found) +func (b *BinaryMemoTable) CopyFixedWidthValues(start, width int, out []byte) { + if start >= b.Size() { + return + } + + null, exists := b.GetNull() + if !exists || null < start { + // nothing to skip, proceed as usual + b.CopyValuesSubset(start, out) + return + } + + var ( + leftOffset = b.findOffset(start) + nullOffset = b.findOffset(null) + leftSize = nullOffset - leftOffset + ) + + if leftSize > 0 { + copy(out, b.builder.Value(start)[0:leftSize]) + } + + rightSize := b.ValuesSize() - int(nullOffset) + if rightSize > 0 { + // skip the null fixed size value + copy(out[int(leftSize)+width:], b.builder.Value(int(nullOffset))[0:rightSize]) + } +} + +// VisitValues exists to run the visitFn on each value currently in the hash table. +func (b *BinaryMemoTable) VisitValues(start int, visitFn func([]byte)) { + for i := int(start); i < b.Size(); i++ { + visitFn(b.builder.Value(i)) + } +} + +// Release is used to tell the underlying builder that it can release the memory allocated +// when the reference count reaches 0, this is safe to be called from multiple goroutines +// simultaneously +func (b *BinaryMemoTable) Release() { b.builder.Release() } + +// Retain increases the ref count, it is safe to call it from multiple goroutines +// simultaneously. +func (b *BinaryMemoTable) Retain() { b.builder.Retain() } + +// ValuesSize returns the current total size of all the raw bytes that have been inserted +// into the memotable so far. +func (b *BinaryMemoTable) ValuesSize() int { return b.builder.DataLen() } |