diff options
Diffstat (limited to 'src/arrow/cpp/src/arrow/compute/kernels/vector_hash_test.cc')
-rw-r--r-- | src/arrow/cpp/src/arrow/compute/kernels/vector_hash_test.cc | 756 |
1 files changed, 756 insertions, 0 deletions
diff --git a/src/arrow/cpp/src/arrow/compute/kernels/vector_hash_test.cc b/src/arrow/cpp/src/arrow/compute/kernels/vector_hash_test.cc new file mode 100644 index 000000000..a10667e49 --- /dev/null +++ b/src/arrow/cpp/src/arrow/compute/kernels/vector_hash_test.cc @@ -0,0 +1,756 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <algorithm> +#include <cstdint> +#include <cstdio> +#include <functional> +#include <locale> +#include <memory> +#include <stdexcept> +#include <string> +#include <utility> +#include <vector> + +#include <gtest/gtest.h> + +#include "arrow/array.h" +#include "arrow/array/builder_decimal.h" +#include "arrow/buffer.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/testing/gtest_common.h" +#include "arrow/testing/util.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/decimal.h" + +#include "arrow/compute/api.h" +#include "arrow/compute/kernels/test_util.h" + +#include "arrow/ipc/json_simple.h" + +namespace arrow { + +using internal::checked_cast; + +namespace compute { + +// ---------------------------------------------------------------------- +// Dictionary tests + +template <typename T> +void CheckUnique(const std::shared_ptr<T>& input, + const std::shared_ptr<Array>& expected) { + ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> result, Unique(input)); + ValidateOutput(*result); + // TODO: We probably shouldn't rely on array ordering. + ASSERT_ARRAYS_EQUAL(*expected, *result); +} + +template <typename Type, typename T> +void CheckUnique(const std::shared_ptr<DataType>& type, const std::vector<T>& in_values, + const std::vector<bool>& in_is_valid, const std::vector<T>& out_values, + const std::vector<bool>& out_is_valid) { + std::shared_ptr<Array> input = _MakeArray<Type, T>(type, in_values, in_is_valid); + std::shared_ptr<Array> expected = _MakeArray<Type, T>(type, out_values, out_is_valid); + CheckUnique(input, expected); +} + +// Check that ValueCounts() accepts a 0-length array with null buffers +void CheckValueCountsNull(const std::shared_ptr<DataType>& type) { + std::vector<std::shared_ptr<Buffer>> data_buffers(2); + Datum input; + input.value = + ArrayData::Make(type, 0 /* length */, std::move(data_buffers), 0 /* null_count */); + + std::shared_ptr<Array> ex_values = ArrayFromJSON(type, "[]"); + std::shared_ptr<Array> ex_counts = ArrayFromJSON(int64(), "[]"); + + ASSERT_OK_AND_ASSIGN(auto result_struct, ValueCounts(input)); + ValidateOutput(*result_struct); + ASSERT_NE(result_struct->GetFieldByName(kValuesFieldName), nullptr); + // TODO: We probably shouldn't rely on value ordering. + ASSERT_ARRAYS_EQUAL(*ex_values, *result_struct->GetFieldByName(kValuesFieldName)); + ASSERT_ARRAYS_EQUAL(*ex_counts, *result_struct->GetFieldByName(kCountsFieldName)); +} + +template <typename T> +void CheckValueCounts(const std::shared_ptr<T>& input, + const std::shared_ptr<Array>& expected_values, + const std::shared_ptr<Array>& expected_counts) { + ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> result, ValueCounts(input)); + ValidateOutput(*result); + auto result_struct = std::dynamic_pointer_cast<StructArray>(result); + ASSERT_EQ(result_struct->num_fields(), 2); + // TODO: We probably shouldn't rely on value ordering. + ASSERT_ARRAYS_EQUAL(*expected_values, *result_struct->field(kValuesFieldIndex)); + ASSERT_ARRAYS_EQUAL(*expected_counts, *result_struct->field(kCountsFieldIndex)); +} + +template <typename Type, typename T> +void CheckValueCounts(const std::shared_ptr<DataType>& type, + const std::vector<T>& in_values, + const std::vector<bool>& in_is_valid, + const std::vector<T>& out_values, + const std::vector<bool>& out_is_valid, + const std::vector<int64_t>& out_counts) { + std::vector<bool> all_valids(out_is_valid.size(), true); + std::shared_ptr<Array> input = _MakeArray<Type, T>(type, in_values, in_is_valid); + std::shared_ptr<Array> ex_values = _MakeArray<Type, T>(type, out_values, out_is_valid); + std::shared_ptr<Array> ex_counts = + _MakeArray<Int64Type, int64_t>(int64(), out_counts, all_valids); + + CheckValueCounts(input, ex_values, ex_counts); +} + +void CheckDictEncode(const std::shared_ptr<Array>& input, + const std::shared_ptr<Array>& expected_values, + const std::shared_ptr<Array>& expected_indices) { + auto type = dictionary(expected_indices->type(), expected_values->type()); + DictionaryArray expected(type, expected_indices, expected_values); + + ASSERT_OK_AND_ASSIGN(Datum datum_out, DictionaryEncode(input)); + std::shared_ptr<Array> result = MakeArray(datum_out.array()); + ValidateOutput(*result); + + ASSERT_ARRAYS_EQUAL(expected, *result); +} + +template <typename Type, typename T> +void CheckDictEncode(const std::shared_ptr<DataType>& type, + const std::vector<T>& in_values, + const std::vector<bool>& in_is_valid, + const std::vector<T>& out_values, + const std::vector<bool>& out_is_valid, + const std::vector<int32_t>& out_indices) { + std::shared_ptr<Array> input = _MakeArray<Type, T>(type, in_values, in_is_valid); + std::shared_ptr<Array> ex_dict = _MakeArray<Type, T>(type, out_values, out_is_valid); + std::shared_ptr<Array> ex_indices = + _MakeArray<Int32Type, int32_t>(int32(), out_indices, in_is_valid); + return CheckDictEncode(input, ex_dict, ex_indices); +} + +class TestHashKernel : public ::testing::Test {}; + +template <typename Type> +class TestHashKernelPrimitive : public ::testing::Test {}; + +typedef ::testing::Types<Int8Type, UInt8Type, Int16Type, UInt16Type, Int32Type, + UInt32Type, Int64Type, UInt64Type, FloatType, DoubleType, + Date32Type, Date64Type> + PrimitiveDictionaries; + +TYPED_TEST_SUITE(TestHashKernelPrimitive, PrimitiveDictionaries); + +TYPED_TEST(TestHashKernelPrimitive, Unique) { + using T = typename TypeParam::c_type; + auto type = TypeTraits<TypeParam>::type_singleton(); + CheckUnique<TypeParam, T>(type, {2, 1, 2, 1}, {true, false, true, true}, {2, 0, 1}, + {1, 0, 1}); + CheckUnique<TypeParam, T>(type, {2, 1, 3, 1}, {false, false, true, true}, {0, 3, 1}, + {0, 1, 1}); + + // Sliced + CheckUnique(ArrayFromJSON(type, "[1, 2, null, 3, 2, null]")->Slice(1, 4), + ArrayFromJSON(type, "[2, null, 3]")); +} + +TYPED_TEST(TestHashKernelPrimitive, ValueCounts) { + using T = typename TypeParam::c_type; + auto type = TypeTraits<TypeParam>::type_singleton(); + CheckValueCounts<TypeParam, T>(type, {2, 1, 2, 1, 2, 3, 4}, + {true, false, true, true, true, true, false}, + {2, 0, 1, 3}, {1, 0, 1, 1}, {3, 2, 1, 1}); + CheckValueCounts<TypeParam, T>(type, {}, {}, {}, {}, {}); + CheckValueCountsNull(type); + + // Sliced + CheckValueCounts(ArrayFromJSON(type, "[1, 2, null, 3, 2, null]")->Slice(1, 4), + ArrayFromJSON(type, "[2, null, 3]"), + ArrayFromJSON(int64(), "[2, 1, 1]")); +} + +TYPED_TEST(TestHashKernelPrimitive, DictEncode) { + using T = typename TypeParam::c_type; + auto type = TypeTraits<TypeParam>::type_singleton(); + CheckDictEncode<TypeParam, T>(type, {2, 1, 2, 1, 2, 3}, + {true, false, true, true, true, true}, {2, 1, 3}, + {1, 1, 1}, {0, 0, 0, 1, 0, 2}); + + // Sliced + CheckDictEncode(ArrayFromJSON(type, "[2, 1, null, 4, 3, 1, 42]")->Slice(1, 5), + ArrayFromJSON(type, "[1, 4, 3]"), + ArrayFromJSON(int32(), "[0, null, 1, 2, 0]")); +} + +TYPED_TEST(TestHashKernelPrimitive, ZeroChunks) { + auto type = TypeTraits<TypeParam>::type_singleton(); + + auto zero_chunks = std::make_shared<ChunkedArray>(ArrayVector{}, type); + ASSERT_OK_AND_ASSIGN(Datum result, DictionaryEncode(zero_chunks)); + + ASSERT_EQ(result.kind(), Datum::CHUNKED_ARRAY); + AssertChunkedEqual(*result.chunked_array(), + ChunkedArray({}, dictionary(int32(), type))); +} + +TYPED_TEST(TestHashKernelPrimitive, PrimitiveResizeTable) { + using T = typename TypeParam::c_type; + + const int64_t kTotalValues = std::min<int64_t>(INT16_MAX, 1UL << sizeof(T) / 2); + const int64_t kRepeats = 5; + + std::vector<T> values; + std::vector<T> uniques; + std::vector<int32_t> indices; + std::vector<int64_t> counts; + for (int64_t i = 0; i < kTotalValues * kRepeats; i++) { + const auto val = static_cast<T>(i % kTotalValues); + values.push_back(val); + + if (i < kTotalValues) { + uniques.push_back(val); + counts.push_back(kRepeats); + } + indices.push_back(static_cast<int32_t>(i % kTotalValues)); + } + + auto type = TypeTraits<TypeParam>::type_singleton(); + CheckUnique<TypeParam, T>(type, values, {}, uniques, {}); + CheckValueCounts<TypeParam, T>(type, values, {}, uniques, {}, counts); + CheckDictEncode<TypeParam, T>(type, values, {}, uniques, {}, indices); +} + +TEST_F(TestHashKernel, UniqueTimeTimestamp) { + CheckUnique<Time32Type, int32_t>(time32(TimeUnit::SECOND), {2, 1, 2, 1}, + {true, false, true, true}, {2, 0, 1}, {1, 0, 1}); + + CheckUnique<Time64Type, int64_t>(time64(TimeUnit::NANO), {2, 1, 2, 1}, + {true, false, true, true}, {2, 0, 1}, {1, 0, 1}); + + CheckUnique<TimestampType, int64_t>(timestamp(TimeUnit::NANO), {2, 1, 2, 1}, + {true, false, true, true}, {2, 0, 1}, {1, 0, 1}); +} + +TEST_F(TestHashKernel, ValueCountsTimeTimestamp) { + CheckValueCounts<Time32Type, int32_t>(time32(TimeUnit::SECOND), {2, 1, 2, 1}, + {true, false, true, true}, {2, 0, 1}, {1, 0, 1}, + {2, 1, 1}); + + CheckValueCounts<Time64Type, int64_t>(time64(TimeUnit::NANO), {2, 1, 2, 1}, + {true, false, true, true}, {2, 0, 1}, {1, 0, 1}, + {2, 1, 1}); + + CheckValueCounts<TimestampType, int64_t>(timestamp(TimeUnit::NANO), {2, 1, 2, 1}, + {true, false, true, true}, {2, 0, 1}, + {1, 0, 1}, {2, 1, 1}); +} + +TEST_F(TestHashKernel, UniqueBoolean) { + CheckUnique<BooleanType, bool>(boolean(), {true, true, false, true}, + {true, false, true, true}, {true, false, false}, + {1, 0, 1}); + + CheckUnique<BooleanType, bool>(boolean(), {false, true, false, true}, + {true, false, true, true}, {false, false, true}, + {1, 0, 1}); + + // No nulls + CheckUnique<BooleanType, bool>(boolean(), {true, true, false, true}, {}, {true, false}, + {}); + + CheckUnique<BooleanType, bool>(boolean(), {false, true, false, true}, {}, {false, true}, + {}); + + // Sliced + CheckUnique(ArrayFromJSON(boolean(), "[null, true, true, false]")->Slice(1, 2), + ArrayFromJSON(boolean(), "[true]")); +} + +TEST_F(TestHashKernel, ValueCountsBoolean) { + CheckValueCounts<BooleanType, bool>(boolean(), {true, true, false, true}, + {true, false, true, true}, {true, false, false}, + {1, 0, 1}, {2, 1, 1}); + + CheckValueCounts<BooleanType, bool>(boolean(), {false, true, false, true}, + {true, false, true, true}, {false, false, true}, + {1, 0, 1}, {2, 1, 1}); + + // No nulls + CheckValueCounts<BooleanType, bool>(boolean(), {true, true, false, true}, {}, + {true, false}, {}, {3, 1}); + + CheckValueCounts<BooleanType, bool>(boolean(), {false, true, false, true}, {}, + {false, true}, {}, {2, 2}); + + // Sliced + CheckValueCounts(ArrayFromJSON(boolean(), "[true, false, false, null]")->Slice(1, 2), + ArrayFromJSON(boolean(), "[false]"), ArrayFromJSON(int64(), "[2]")); +} + +TEST_F(TestHashKernel, ValueCountsNull) { + CheckValueCounts(ArrayFromJSON(null(), "[null, null, null]"), + ArrayFromJSON(null(), "[null]"), ArrayFromJSON(int64(), "[3]")); +} + +TEST_F(TestHashKernel, DictEncodeBoolean) { + CheckDictEncode<BooleanType, bool>(boolean(), {true, true, false, true, false}, + {true, false, true, true, true}, {true, false}, {}, + {0, 0, 1, 0, 1}); + + CheckDictEncode<BooleanType, bool>(boolean(), {false, true, false, true, false}, + {true, false, true, true, true}, {false, true}, {}, + {0, 0, 0, 1, 0}); + + // No nulls + CheckDictEncode<BooleanType, bool>(boolean(), {true, true, false, true, false}, {}, + {true, false}, {}, {0, 0, 1, 0, 1}); + + CheckDictEncode<BooleanType, bool>(boolean(), {false, true, false, true, false}, {}, + {false, true}, {}, {0, 1, 0, 1, 0}); + + // Sliced + CheckDictEncode( + ArrayFromJSON(boolean(), "[false, true, null, true, false]")->Slice(1, 3), + ArrayFromJSON(boolean(), "[true]"), ArrayFromJSON(int32(), "[0, null, 0]")); +} + +template <typename ArrowType> +class TestHashKernelBinaryTypes : public TestHashKernel { + protected: + std::shared_ptr<DataType> type() { return TypeTraits<ArrowType>::type_singleton(); } + + void CheckDictEncodeP(const std::vector<std::string>& in_values, + const std::vector<bool>& in_is_valid, + const std::vector<std::string>& out_values, + const std::vector<bool>& out_is_valid, + const std::vector<int32_t>& out_indices) { + CheckDictEncode<ArrowType, std::string>(type(), in_values, in_is_valid, out_values, + out_is_valid, out_indices); + } + + void CheckValueCountsP(const std::vector<std::string>& in_values, + const std::vector<bool>& in_is_valid, + const std::vector<std::string>& out_values, + const std::vector<bool>& out_is_valid, + const std::vector<int64_t>& out_counts) { + CheckValueCounts<ArrowType, std::string>(type(), in_values, in_is_valid, out_values, + out_is_valid, out_counts); + } + + void CheckUniqueP(const std::vector<std::string>& in_values, + const std::vector<bool>& in_is_valid, + const std::vector<std::string>& out_values, + const std::vector<bool>& out_is_valid) { + CheckUnique<ArrowType, std::string>(type(), in_values, in_is_valid, out_values, + out_is_valid); + } +}; + +TYPED_TEST_SUITE(TestHashKernelBinaryTypes, BinaryArrowTypes); + +TYPED_TEST(TestHashKernelBinaryTypes, ZeroChunks) { + auto type = this->type(); + + auto zero_chunks = std::make_shared<ChunkedArray>(ArrayVector{}, type); + ASSERT_OK_AND_ASSIGN(Datum result, DictionaryEncode(zero_chunks)); + + ASSERT_EQ(result.kind(), Datum::CHUNKED_ARRAY); + AssertChunkedEqual(*result.chunked_array(), + ChunkedArray({}, dictionary(int32(), type))); +} + +TYPED_TEST(TestHashKernelBinaryTypes, TwoChunks) { + auto type = this->type(); + + auto two_chunks = std::make_shared<ChunkedArray>( + ArrayVector{ + ArrayFromJSON(type, "[\"a\"]"), + ArrayFromJSON(type, "[\"b\"]"), + }, + type); + ASSERT_OK_AND_ASSIGN(Datum result, DictionaryEncode(two_chunks)); + + auto dict_type = dictionary(int32(), type); + auto dictionary = ArrayFromJSON(type, R"(["a", "b"])"); + + auto chunk_0 = std::make_shared<DictionaryArray>( + dict_type, ArrayFromJSON(int32(), "[0]"), dictionary); + auto chunk_1 = std::make_shared<DictionaryArray>( + dict_type, ArrayFromJSON(int32(), "[1]"), dictionary); + + ASSERT_EQ(result.kind(), Datum::CHUNKED_ARRAY); + AssertChunkedEqual(*result.chunked_array(), + ChunkedArray({chunk_0, chunk_1}, dict_type)); +} + +TYPED_TEST(TestHashKernelBinaryTypes, Unique) { + this->CheckUniqueP({"test", "", "test2", "test"}, {true, false, true, true}, + {"test", "", "test2"}, {1, 0, 1}); + + // Sliced + CheckUnique( + ArrayFromJSON(this->type(), R"(["ab", null, "cd", "ef", "cd", "gh"])")->Slice(1, 4), + ArrayFromJSON(this->type(), R"([null, "cd", "ef"])")); +} + +TYPED_TEST(TestHashKernelBinaryTypes, ValueCounts) { + this->CheckValueCountsP({"test", "", "test2", "test"}, {true, false, true, true}, + {"test", "", "test2"}, {1, 0, 1}, {2, 1, 1}); + + // Sliced + CheckValueCounts( + ArrayFromJSON(this->type(), R"(["ab", null, "cd", "ab", "cd", "ef"])")->Slice(1, 4), + ArrayFromJSON(this->type(), R"([null, "cd", "ab"])"), + ArrayFromJSON(int64(), "[1, 2, 1]")); +} + +TYPED_TEST(TestHashKernelBinaryTypes, DictEncode) { + this->CheckDictEncodeP({"test", "", "test2", "test", "baz"}, + {true, false, true, true, true}, {"test", "test2", "baz"}, {}, + {0, 0, 1, 0, 2}); + + // Sliced + CheckDictEncode( + ArrayFromJSON(this->type(), R"(["ab", null, "cd", "ab", "cd", "ef"])")->Slice(1, 4), + ArrayFromJSON(this->type(), R"(["cd", "ab"])"), + ArrayFromJSON(int32(), "[null, 0, 1, 0]")); +} + +TYPED_TEST(TestHashKernelBinaryTypes, BinaryResizeTable) { + const int32_t kTotalValues = 10000; +#if !defined(ARROW_VALGRIND) + const int32_t kRepeats = 10; +#else + // Mitigate Valgrind's slowness + const int32_t kRepeats = 3; +#endif + + std::vector<std::string> values; + std::vector<std::string> uniques; + std::vector<int32_t> indices; + std::vector<int64_t> counts; + char buf[20] = "test"; + + for (int32_t i = 0; i < kTotalValues * kRepeats; i++) { + int32_t index = i % kTotalValues; + + ASSERT_GE(snprintf(buf + 4, sizeof(buf) - 4, "%d", index), 0); + values.emplace_back(buf); + + if (i < kTotalValues) { + uniques.push_back(values.back()); + counts.push_back(kRepeats); + } + indices.push_back(index); + } + + this->CheckUniqueP(values, {}, uniques, {}); + this->CheckValueCountsP(values, {}, uniques, {}, counts); + this->CheckDictEncodeP(values, {}, uniques, {}, indices); +} + +TEST_F(TestHashKernel, UniqueFixedSizeBinary) { + auto type = fixed_size_binary(3); + + CheckUnique<FixedSizeBinaryType, std::string>(type, {"aaa", "", "bbb", "aaa"}, + {true, false, true, true}, + {"aaa", "", "bbb"}, {1, 0, 1}); + + // Sliced + CheckUnique( + ArrayFromJSON(type, R"(["aaa", null, "bbb", "bbb", "ccc", "ddd"])")->Slice(1, 4), + ArrayFromJSON(type, R"([null, "bbb", "ccc"])")); +} + +TEST_F(TestHashKernel, ValueCountsFixedSizeBinary) { + auto type = fixed_size_binary(3); + auto input = ArrayFromJSON(type, R"(["aaa", null, "bbb", "bbb", "ccc", null])"); + + CheckValueCounts(input, ArrayFromJSON(type, R"(["aaa", null, "bbb", "ccc"])"), + ArrayFromJSON(int64(), "[1, 2, 2, 1]")); + + // Sliced + CheckValueCounts(input->Slice(1, 4), ArrayFromJSON(type, R"([null, "bbb", "ccc"])"), + ArrayFromJSON(int64(), "[1, 2, 1]")); +} + +TEST_F(TestHashKernel, DictEncodeFixedSizeBinary) { + auto type = fixed_size_binary(3); + + CheckDictEncode<FixedSizeBinaryType, std::string>( + type, {"bbb", "", "bbb", "aaa", "ccc"}, {true, false, true, true, true}, + {"bbb", "aaa", "ccc"}, {}, {0, 0, 0, 1, 2}); + + // Sliced + CheckDictEncode( + ArrayFromJSON(type, R"(["aaa", null, "bbb", "bbb", "ccc", "ddd"])")->Slice(1, 4), + ArrayFromJSON(type, R"(["bbb", "ccc"])"), + ArrayFromJSON(int32(), "[null, 0, 0, 1]")); +} + +TEST_F(TestHashKernel, FixedSizeBinaryResizeTable) { + const int32_t kTotalValues = 10000; +#if !defined(ARROW_VALGRIND) + const int32_t kRepeats = 10; +#else + // Mitigate Valgrind's slowness + const int32_t kRepeats = 3; +#endif + + std::vector<std::string> values; + std::vector<std::string> uniques; + std::vector<int32_t> indices; + char buf[7] = "test.."; + + for (int32_t i = 0; i < kTotalValues * kRepeats; i++) { + int32_t index = i % kTotalValues; + + buf[4] = static_cast<char>(index / 128); + buf[5] = static_cast<char>(index % 128); + values.emplace_back(buf, 6); + + if (i < kTotalValues) { + uniques.push_back(values.back()); + } + indices.push_back(index); + } + + auto type = fixed_size_binary(6); + CheckUnique<FixedSizeBinaryType, std::string>(type, values, {}, uniques, {}); + CheckDictEncode<FixedSizeBinaryType, std::string>(type, values, {}, uniques, {}, + indices); +} + +TEST_F(TestHashKernel, UniqueDecimal) { + std::vector<Decimal128> values{12, 12, 11, 12}; + std::vector<Decimal128> expected{12, 0, 11}; + + CheckUnique<Decimal128Type, Decimal128>(decimal(2, 0), values, + {true, false, true, true}, expected, {1, 0, 1}); +} + +TEST_F(TestHashKernel, UniqueNull) { + CheckUnique<NullType, std::nullptr_t>(null(), {nullptr, nullptr}, {false, true}, + {nullptr}, {false}); + CheckUnique<NullType, std::nullptr_t>(null(), {}, {}, {}, {}); +} + +TEST_F(TestHashKernel, ValueCountsDecimal) { + std::vector<Decimal128> values{12, 12, 11, 12}; + std::vector<Decimal128> expected{12, 0, 11}; + + CheckValueCounts<Decimal128Type, Decimal128>( + decimal(2, 0), values, {true, false, true, true}, expected, {1, 0, 1}, {2, 1, 1}); +} + +TEST_F(TestHashKernel, DictEncodeDecimal) { + std::vector<Decimal128> values{12, 12, 11, 12, 13}; + std::vector<Decimal128> expected{12, 11, 13}; + + CheckDictEncode<Decimal128Type, Decimal128>(decimal(2, 0), values, + {true, false, true, true, true}, expected, + {}, {0, 0, 1, 0, 2}); +} + +TEST_F(TestHashKernel, DictionaryUniqueAndValueCounts) { + auto dict_json = "[10, 20, 30, 40]"; + auto dict = ArrayFromJSON(int64(), dict_json); + for (auto index_ty : IntTypes()) { + auto indices = ArrayFromJSON(index_ty, "[3, 0, 0, 0, 1, 1, 3, 0, 1, 3, 0, 1]"); + + auto dict_ty = dictionary(index_ty, int64()); + + auto ex_indices = ArrayFromJSON(index_ty, "[3, 0, 1]"); + + auto input = std::make_shared<DictionaryArray>(dict_ty, indices, dict); + auto ex_uniques = std::make_shared<DictionaryArray>(dict_ty, ex_indices, dict); + CheckUnique(input, ex_uniques); + + auto ex_counts = ArrayFromJSON(int64(), "[3, 5, 4]"); + CheckValueCounts(input, ex_uniques, ex_counts); + + // Empty array - executor never gives the kernel any batches, + // so result dictionary is empty + CheckUnique(DictArrayFromJSON(dict_ty, "[]", dict_json), + DictArrayFromJSON(dict_ty, "[]", "[]")); + CheckValueCounts(DictArrayFromJSON(dict_ty, "[]", dict_json), + DictArrayFromJSON(dict_ty, "[]", "[]"), + ArrayFromJSON(int64(), "[]")); + + // Check chunked array + auto chunked = *ChunkedArray::Make({input->Slice(0, 2), input->Slice(2)}); + CheckUnique(chunked, ex_uniques); + CheckValueCounts(chunked, ex_uniques, ex_counts); + + // Different chunk dictionaries + auto input_2 = DictArrayFromJSON(dict_ty, "[1, null, 2, 3]", "[30, 40, 50, 60]"); + auto ex_uniques_2 = + DictArrayFromJSON(dict_ty, "[3, 0, 1, null, 4, 5]", "[10, 20, 30, 40, 50, 60]"); + auto ex_counts_2 = ArrayFromJSON(int64(), "[4, 5, 4, 1, 1, 1]"); + auto different_dictionaries = *ChunkedArray::Make({input, input_2}, dict_ty); + + CheckUnique(different_dictionaries, ex_uniques_2); + CheckValueCounts(different_dictionaries, ex_uniques_2, ex_counts_2); + + // Dictionary with encoded nulls + auto dict_with_null = ArrayFromJSON(int64(), "[10, null, 30, 40]"); + input = std::make_shared<DictionaryArray>(dict_ty, indices, dict_with_null); + ex_uniques = std::make_shared<DictionaryArray>(dict_ty, ex_indices, dict_with_null); + CheckUnique(input, ex_uniques); + + CheckValueCounts(input, ex_uniques, ex_counts); + + // Dictionary with masked nulls + auto indices_with_null = + ArrayFromJSON(index_ty, "[3, 0, 0, 0, null, null, 3, 0, null, 3, 0, null]"); + auto ex_indices_with_null = ArrayFromJSON(index_ty, "[3, 0, null]"); + ex_uniques = std::make_shared<DictionaryArray>(dict_ty, ex_indices_with_null, dict); + input = std::make_shared<DictionaryArray>(dict_ty, indices_with_null, dict); + CheckUnique(input, ex_uniques); + + CheckValueCounts(input, ex_uniques, ex_counts); + + // Dictionary with encoded AND masked nulls + auto some_indices_with_null = + ArrayFromJSON(index_ty, "[3, 0, 0, 0, 1, 1, 3, 0, null, 3, 0, null]"); + ex_uniques = + std::make_shared<DictionaryArray>(dict_ty, ex_indices_with_null, dict_with_null); + input = std::make_shared<DictionaryArray>(dict_ty, indices_with_null, dict_with_null); + CheckUnique(input, ex_uniques); + CheckValueCounts(input, ex_uniques, ex_counts); + } +} + +/* TODO(ARROW-4124): Determine if we want to do something that is reproducible with + * floats. +TEST_F(TestHashKernel, ValueCountsFloat) { + + // No nulls + CheckValueCounts<FloatType, float>(float32(), {1.0f, 0.0f, -0.0f, +std::nan("1"), std::nan("2") }, + {}, {0.0f, 1.0f, std::nan("1")}, {}, {}); + + CheckValueCounts<DoubleType, double>(float64(), {1.0f, 0.0f, -0.0f, +std::nan("1"), std::nan("2") }, + {}, {0.0f, 1.0f, std::nan("1")}, {}, {}); +} +*/ + +TEST_F(TestHashKernel, ChunkedArrayInvoke) { + std::vector<std::string> values1 = {"foo", "bar", "foo"}; + std::vector<std::string> values2 = {"bar", "baz", "quuux", "foo"}; + + auto type = utf8(); + auto a1 = _MakeArray<StringType, std::string>(type, values1, {}); + auto a2 = _MakeArray<StringType, std::string>(type, values2, {}); + + std::vector<std::string> dict_values = {"foo", "bar", "baz", "quuux"}; + auto ex_dict = _MakeArray<StringType, std::string>(type, dict_values, {}); + + auto ex_counts = _MakeArray<Int64Type, int64_t>(int64(), {3, 2, 1, 1}, {}); + + ArrayVector arrays = {a1, a2}; + auto carr = std::make_shared<ChunkedArray>(arrays); + + // Unique + ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> result, Unique(carr)); + ASSERT_ARRAYS_EQUAL(*ex_dict, *result); + + // Dictionary encode + auto dict_type = dictionary(int32(), type); + + auto i1 = _MakeArray<Int32Type, int32_t>(int32(), {0, 1, 0}, {}); + auto i2 = _MakeArray<Int32Type, int32_t>(int32(), {1, 2, 3, 0}, {}); + + ArrayVector dict_arrays = {std::make_shared<DictionaryArray>(dict_type, i1, ex_dict), + std::make_shared<DictionaryArray>(dict_type, i2, ex_dict)}; + auto dict_carr = std::make_shared<ChunkedArray>(dict_arrays); + + // Unique counts + ASSERT_OK_AND_ASSIGN(auto counts, ValueCounts(carr)); + ASSERT_ARRAYS_EQUAL(*ex_dict, *counts->field(0)); + ASSERT_ARRAYS_EQUAL(*ex_counts, *counts->field(1)); + + // Dictionary encode + ASSERT_OK_AND_ASSIGN(Datum encoded_out, DictionaryEncode(carr)); + ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded_out.kind()); + + AssertChunkedEqual(*dict_carr, *encoded_out.chunked_array()); +} + +TEST_F(TestHashKernel, ZeroLengthDictionaryEncode) { + // ARROW-7008 + auto values = ArrayFromJSON(utf8(), "[]"); + ASSERT_OK_AND_ASSIGN(Datum datum_result, DictionaryEncode(values)); + ValidateOutput(datum_result); +} + +TEST_F(TestHashKernel, NullEncodingSchemes) { + auto values = ArrayFromJSON(uint8(), "[1, 1, null, 2, null]"); + + // Masking should put null in the indices array + auto expected_mask_indices = ArrayFromJSON(int32(), "[0, 0, null, 1, null]"); + auto expected_mask_dictionary = ArrayFromJSON(uint8(), "[1, 2]"); + auto dictionary_type = dictionary(int32(), uint8()); + std::shared_ptr<Array> expected = std::make_shared<DictionaryArray>( + dictionary_type, expected_mask_indices, expected_mask_dictionary); + + ASSERT_OK_AND_ASSIGN(Datum datum_result, DictionaryEncode(values)); + std::shared_ptr<Array> result = datum_result.make_array(); + AssertArraysEqual(*expected, *result); + + // Encoding should put null in the dictionary + auto expected_encoded_indices = ArrayFromJSON(int32(), "[0, 0, 1, 2, 1]"); + auto expected_encoded_dict = ArrayFromJSON(uint8(), "[1, null, 2]"); + expected = std::make_shared<DictionaryArray>(dictionary_type, expected_encoded_indices, + expected_encoded_dict); + + auto options = DictionaryEncodeOptions::Defaults(); + options.null_encoding_behavior = DictionaryEncodeOptions::ENCODE; + ASSERT_OK_AND_ASSIGN(datum_result, DictionaryEncode(values, options)); + result = datum_result.make_array(); + AssertArraysEqual(*expected, *result); +} + +TEST_F(TestHashKernel, ChunkedArrayZeroChunk) { + // ARROW-6857 + auto chunked_array = std::make_shared<ChunkedArray>(ArrayVector{}, utf8()); + + ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> result_array, Unique(chunked_array)); + auto expected = ArrayFromJSON(chunked_array->type(), "[]"); + AssertArraysEqual(*expected, *result_array); + + ASSERT_OK_AND_ASSIGN(result_array, ValueCounts(chunked_array)); + expected = ArrayFromJSON(struct_({field(kValuesFieldName, chunked_array->type()), + field(kCountsFieldName, int64())}), + "[]"); + AssertArraysEqual(*expected, *result_array); + + ASSERT_OK_AND_ASSIGN(Datum result_datum, DictionaryEncode(chunked_array)); + auto dict_type = dictionary(int32(), chunked_array->type()); + ASSERT_EQ(result_datum.kind(), Datum::CHUNKED_ARRAY); + + AssertChunkedEqual(*std::make_shared<ChunkedArray>(ArrayVector{}, dict_type), + *result_datum.chunked_array()); +} + +} // namespace compute +} // namespace arrow |