diff options
Diffstat (limited to 'src/arrow/cpp/src/parquet/encoding_test.cc')
-rw-r--r-- | src/arrow/cpp/src/parquet/encoding_test.cc | 1247 |
1 files changed, 1247 insertions, 0 deletions
diff --git a/src/arrow/cpp/src/parquet/encoding_test.cc b/src/arrow/cpp/src/parquet/encoding_test.cc new file mode 100644 index 000000000..d271d59ef --- /dev/null +++ b/src/arrow/cpp/src/parquet/encoding_test.cc @@ -0,0 +1,1247 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest.h> +#include <algorithm> +#include <cstdint> +#include <cstdlib> +#include <cstring> +#include <limits> +#include <utility> +#include <vector> + +#include "arrow/array.h" +#include "arrow/array/builder_dict.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/testing/util.h" +#include "arrow/type.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_writer.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/endian.h" + +#include "parquet/encoding.h" +#include "parquet/platform.h" +#include "parquet/schema.h" +#include "parquet/test_util.h" +#include "parquet/types.h" + +using arrow::default_memory_pool; +using arrow::MemoryPool; +using arrow::internal::checked_cast; + +namespace BitUtil = arrow::BitUtil; + +namespace parquet { + +namespace test { + +TEST(VectorBooleanTest, TestEncodeDecode) { + // PARQUET-454 + int nvalues = 10000; + int nbytes = static_cast<int>(BitUtil::BytesForBits(nvalues)); + + std::vector<bool> draws; + ::arrow::random_is_valid(nvalues, 0.5 /* null prob */, &draws, 0 /* seed */); + + std::unique_ptr<BooleanEncoder> encoder = + MakeTypedEncoder<BooleanType>(Encoding::PLAIN); + encoder->Put(draws, nvalues); + + std::unique_ptr<BooleanDecoder> decoder = + MakeTypedDecoder<BooleanType>(Encoding::PLAIN); + + std::shared_ptr<Buffer> encode_buffer = encoder->FlushValues(); + ASSERT_EQ(nbytes, encode_buffer->size()); + + std::vector<uint8_t> decode_buffer(nbytes); + const uint8_t* decode_data = &decode_buffer[0]; + + decoder->SetData(nvalues, encode_buffer->data(), + static_cast<int>(encode_buffer->size())); + int values_decoded = decoder->Decode(&decode_buffer[0], nvalues); + ASSERT_EQ(nvalues, values_decoded); + + for (int i = 0; i < nvalues; ++i) { + ASSERT_EQ(draws[i], ::arrow::BitUtil::GetBit(decode_data, i)) << i; + } +} + +// ---------------------------------------------------------------------- +// test data generation + +template <typename T> +void GenerateData(int num_values, T* out, std::vector<uint8_t>* heap) { + // seed the prng so failure is deterministic + random_numbers(num_values, 0, std::numeric_limits<T>::min(), + std::numeric_limits<T>::max(), out); +} + +template <> +void GenerateData<bool>(int num_values, bool* out, std::vector<uint8_t>* heap) { + // seed the prng so failure is deterministic + random_bools(num_values, 0.5, 0, out); +} + +template <> +void GenerateData<Int96>(int num_values, Int96* out, std::vector<uint8_t>* heap) { + // seed the prng so failure is deterministic + random_Int96_numbers(num_values, 0, std::numeric_limits<int32_t>::min(), + std::numeric_limits<int32_t>::max(), out); +} + +template <> +void GenerateData<ByteArray>(int num_values, ByteArray* out, std::vector<uint8_t>* heap) { + // seed the prng so failure is deterministic + int max_byte_array_len = 12; + heap->resize(num_values * max_byte_array_len); + random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len); +} + +static int flba_length = 8; + +template <> +void GenerateData<FLBA>(int num_values, FLBA* out, std::vector<uint8_t>* heap) { + // seed the prng so failure is deterministic + heap->resize(num_values * flba_length); + random_fixed_byte_array(num_values, 0, heap->data(), flba_length, out); +} + +template <typename T> +void VerifyResults(T* result, T* expected, int num_values) { + for (int i = 0; i < num_values; ++i) { + ASSERT_EQ(expected[i], result[i]) << i; + } +} + +template <typename T> +void VerifyResultsSpaced(T* result, T* expected, int num_values, + const uint8_t* valid_bits, int64_t valid_bits_offset) { + for (auto i = 0; i < num_values; ++i) { + if (BitUtil::GetBit(valid_bits, valid_bits_offset + i)) { + ASSERT_EQ(expected[i], result[i]) << i; + } + } +} + +template <> +void VerifyResults<FLBA>(FLBA* result, FLBA* expected, int num_values) { + for (int i = 0; i < num_values; ++i) { + ASSERT_EQ(0, memcmp(expected[i].ptr, result[i].ptr, flba_length)) << i; + } +} + +template <> +void VerifyResultsSpaced<FLBA>(FLBA* result, FLBA* expected, int num_values, + const uint8_t* valid_bits, int64_t valid_bits_offset) { + for (auto i = 0; i < num_values; ++i) { + if (BitUtil::GetBit(valid_bits, valid_bits_offset + i)) { + ASSERT_EQ(0, memcmp(expected[i].ptr, result[i].ptr, flba_length)) << i; + } + } +} + +// ---------------------------------------------------------------------- +// Create some column descriptors + +template <typename DType> +std::shared_ptr<ColumnDescriptor> ExampleDescr() { + auto node = schema::PrimitiveNode::Make("name", Repetition::OPTIONAL, DType::type_num); + return std::make_shared<ColumnDescriptor>(node, 0, 0); +} + +template <> +std::shared_ptr<ColumnDescriptor> ExampleDescr<FLBAType>() { + auto node = schema::PrimitiveNode::Make("name", Repetition::OPTIONAL, + Type::FIXED_LEN_BYTE_ARRAY, + ConvertedType::DECIMAL, flba_length, 10, 2); + return std::make_shared<ColumnDescriptor>(node, 0, 0); +} + +// ---------------------------------------------------------------------- +// Plain encoding tests + +template <typename Type> +class TestEncodingBase : public ::testing::Test { + public: + using c_type = typename Type::c_type; + static constexpr int TYPE = Type::type_num; + + void SetUp() { + descr_ = ExampleDescr<Type>(); + type_length_ = descr_->type_length(); + allocator_ = default_memory_pool(); + } + + void TearDown() {} + + void InitData(int nvalues, int repeats) { + num_values_ = nvalues * repeats; + input_bytes_.resize(num_values_ * sizeof(c_type)); + output_bytes_.resize(num_values_ * sizeof(c_type)); + draws_ = reinterpret_cast<c_type*>(input_bytes_.data()); + decode_buf_ = reinterpret_cast<c_type*>(output_bytes_.data()); + GenerateData<c_type>(nvalues, draws_, &data_buffer_); + + // add some repeated values + for (int j = 1; j < repeats; ++j) { + for (int i = 0; i < nvalues; ++i) { + draws_[nvalues * j + i] = draws_[i]; + } + } + } + + virtual void CheckRoundtrip() = 0; + + virtual void CheckRoundtripSpaced(const uint8_t* valid_bits, + int64_t valid_bits_offset) {} + + void Execute(int nvalues, int repeats) { + InitData(nvalues, repeats); + CheckRoundtrip(); + } + + void ExecuteSpaced(int nvalues, int repeats, int64_t valid_bits_offset, + double null_probability) { + InitData(nvalues, repeats); + + int64_t size = num_values_ + valid_bits_offset; + auto rand = ::arrow::random::RandomArrayGenerator(1923); + const auto array = rand.UInt8(size, 0, 100, null_probability); + const auto valid_bits = array->null_bitmap_data(); + if (valid_bits) { + CheckRoundtripSpaced(valid_bits, valid_bits_offset); + } + } + + protected: + MemoryPool* allocator_; + + int num_values_; + int type_length_; + c_type* draws_; + c_type* decode_buf_; + std::vector<uint8_t> input_bytes_; + std::vector<uint8_t> output_bytes_; + std::vector<uint8_t> data_buffer_; + + std::shared_ptr<Buffer> encode_buffer_; + std::shared_ptr<ColumnDescriptor> descr_; +}; + +// Member variables are not visible to templated subclasses. Possibly figure +// out an alternative to this class layering at some point +#define USING_BASE_MEMBERS() \ + using TestEncodingBase<Type>::allocator_; \ + using TestEncodingBase<Type>::descr_; \ + using TestEncodingBase<Type>::num_values_; \ + using TestEncodingBase<Type>::draws_; \ + using TestEncodingBase<Type>::data_buffer_; \ + using TestEncodingBase<Type>::type_length_; \ + using TestEncodingBase<Type>::encode_buffer_; \ + using TestEncodingBase<Type>::decode_buf_; + +template <typename Type> +class TestPlainEncoding : public TestEncodingBase<Type> { + public: + using c_type = typename Type::c_type; + static constexpr int TYPE = Type::type_num; + + virtual void CheckRoundtrip() { + auto encoder = MakeTypedEncoder<Type>(Encoding::PLAIN, false, descr_.get()); + auto decoder = MakeTypedDecoder<Type>(Encoding::PLAIN, descr_.get()); + encoder->Put(draws_, num_values_); + encode_buffer_ = encoder->FlushValues(); + + decoder->SetData(num_values_, encode_buffer_->data(), + static_cast<int>(encode_buffer_->size())); + int values_decoded = decoder->Decode(decode_buf_, num_values_); + ASSERT_EQ(num_values_, values_decoded); + ASSERT_NO_FATAL_FAILURE(VerifyResults<c_type>(decode_buf_, draws_, num_values_)); + } + + void CheckRoundtripSpaced(const uint8_t* valid_bits, int64_t valid_bits_offset) { + auto encoder = MakeTypedEncoder<Type>(Encoding::PLAIN, false, descr_.get()); + auto decoder = MakeTypedDecoder<Type>(Encoding::PLAIN, descr_.get()); + int null_count = 0; + for (auto i = 0; i < num_values_; i++) { + if (!BitUtil::GetBit(valid_bits, valid_bits_offset + i)) { + null_count++; + } + } + + encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset); + encode_buffer_ = encoder->FlushValues(); + decoder->SetData(num_values_ - null_count, encode_buffer_->data(), + static_cast<int>(encode_buffer_->size())); + auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, null_count, + valid_bits, valid_bits_offset); + ASSERT_EQ(num_values_, values_decoded); + ASSERT_NO_FATAL_FAILURE(VerifyResultsSpaced<c_type>(decode_buf_, draws_, num_values_, + valid_bits, valid_bits_offset)); + } + + protected: + USING_BASE_MEMBERS(); +}; + +TYPED_TEST_SUITE(TestPlainEncoding, ParquetTypes); + +TYPED_TEST(TestPlainEncoding, BasicRoundTrip) { + ASSERT_NO_FATAL_FAILURE(this->Execute(10000, 1)); + + // Spaced test with different sizes and offest to guarantee SIMD implementation + constexpr int kAvx512Size = 64; // sizeof(__m512i) for Avx512 + constexpr int kSimdSize = kAvx512Size; // Current the max is Avx512 + constexpr int kMultiSimdSize = kSimdSize * 33; + + for (auto null_prob : {0.001, 0.1, 0.5, 0.9, 0.999}) { + // Test with both size and offset up to 3 Simd block + for (auto i = 1; i < kSimdSize * 3; i++) { + ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(i, 1, 0, null_prob)); + ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(i, 1, i + 1, null_prob)); + } + // Large block and offset + ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(kMultiSimdSize, 1, 0, null_prob)); + ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(kMultiSimdSize + 33, 1, 0, null_prob)); + ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(kMultiSimdSize, 1, 33, null_prob)); + ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(kMultiSimdSize + 33, 1, 33, null_prob)); + } +} + +// ---------------------------------------------------------------------- +// Dictionary encoding tests + +typedef ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType, + ByteArrayType, FLBAType> + DictEncodedTypes; + +template <typename Type> +class TestDictionaryEncoding : public TestEncodingBase<Type> { + public: + using c_type = typename Type::c_type; + static constexpr int TYPE = Type::type_num; + + void CheckRoundtrip() { + std::vector<uint8_t> valid_bits(::arrow::BitUtil::BytesForBits(num_values_) + 1, 255); + + auto base_encoder = MakeEncoder(Type::type_num, Encoding::PLAIN, true, descr_.get()); + auto encoder = + dynamic_cast<typename EncodingTraits<Type>::Encoder*>(base_encoder.get()); + auto dict_traits = dynamic_cast<DictEncoder<Type>*>(base_encoder.get()); + + ASSERT_NO_THROW(encoder->Put(draws_, num_values_)); + dict_buffer_ = + AllocateBuffer(default_memory_pool(), dict_traits->dict_encoded_size()); + dict_traits->WriteDict(dict_buffer_->mutable_data()); + std::shared_ptr<Buffer> indices = encoder->FlushValues(); + + auto base_spaced_encoder = + MakeEncoder(Type::type_num, Encoding::PLAIN, true, descr_.get()); + auto spaced_encoder = + dynamic_cast<typename EncodingTraits<Type>::Encoder*>(base_spaced_encoder.get()); + + // PutSpaced should lead to the same results + // This also checks the PutSpaced implementation for valid_bits=nullptr + ASSERT_NO_THROW(spaced_encoder->PutSpaced(draws_, num_values_, nullptr, 0)); + std::shared_ptr<Buffer> indices_from_spaced = spaced_encoder->FlushValues(); + ASSERT_TRUE(indices_from_spaced->Equals(*indices)); + + auto dict_decoder = MakeTypedDecoder<Type>(Encoding::PLAIN, descr_.get()); + dict_decoder->SetData(dict_traits->num_entries(), dict_buffer_->data(), + static_cast<int>(dict_buffer_->size())); + + auto decoder = MakeDictDecoder<Type>(descr_.get()); + decoder->SetDict(dict_decoder.get()); + + decoder->SetData(num_values_, indices->data(), static_cast<int>(indices->size())); + int values_decoded = decoder->Decode(decode_buf_, num_values_); + ASSERT_EQ(num_values_, values_decoded); + + // TODO(wesm): The DictionaryDecoder must stay alive because the decoded + // values' data is owned by a buffer inside the DictionaryEncoder. We + // should revisit when data lifetime is reviewed more generally. + ASSERT_NO_FATAL_FAILURE(VerifyResults<c_type>(decode_buf_, draws_, num_values_)); + + // Also test spaced decoding + decoder->SetData(num_values_, indices->data(), static_cast<int>(indices->size())); + // Also tests DecodeSpaced handling for valid_bits=nullptr + values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, 0, nullptr, 0); + ASSERT_EQ(num_values_, values_decoded); + ASSERT_NO_FATAL_FAILURE(VerifyResults<c_type>(decode_buf_, draws_, num_values_)); + } + + protected: + USING_BASE_MEMBERS(); + std::shared_ptr<ResizableBuffer> dict_buffer_; +}; + +TYPED_TEST_SUITE(TestDictionaryEncoding, DictEncodedTypes); + +TYPED_TEST(TestDictionaryEncoding, BasicRoundTrip) { + ASSERT_NO_FATAL_FAILURE(this->Execute(2500, 2)); +} + +TEST(TestDictionaryEncoding, CannotDictDecodeBoolean) { + ASSERT_THROW(MakeDictDecoder<BooleanType>(nullptr), ParquetException); +} + +// ---------------------------------------------------------------------- +// Shared arrow builder decode tests + +class TestArrowBuilderDecoding : public ::testing::Test { + public: + using DenseBuilder = ::arrow::internal::ChunkedBinaryBuilder; + using DictBuilder = ::arrow::BinaryDictionary32Builder; + + void SetUp() override { null_probabilities_ = {0.0, 0.5, 1.0}; } + void TearDown() override {} + + void InitTestCase(double null_probability) { + GenerateInputData(null_probability); + SetupEncoderDecoder(); + } + + void GenerateInputData(double null_probability) { + constexpr int num_unique = 100; + constexpr int repeat = 100; + constexpr int64_t min_length = 2; + constexpr int64_t max_length = 10; + ::arrow::random::RandomArrayGenerator rag(0); + expected_dense_ = rag.BinaryWithRepeats(repeat * num_unique, num_unique, min_length, + max_length, null_probability); + + num_values_ = static_cast<int>(expected_dense_->length()); + null_count_ = static_cast<int>(expected_dense_->null_count()); + valid_bits_ = expected_dense_->null_bitmap_data(); + + auto builder = CreateDictBuilder(); + ASSERT_OK(builder->AppendArray(*expected_dense_)); + ASSERT_OK(builder->Finish(&expected_dict_)); + + // Initialize input_data_ for the encoder from the expected_array_ values + const auto& binary_array = static_cast<const ::arrow::BinaryArray&>(*expected_dense_); + input_data_.resize(binary_array.length()); + + for (int64_t i = 0; i < binary_array.length(); ++i) { + auto view = binary_array.GetView(i); + input_data_[i] = {static_cast<uint32_t>(view.length()), + reinterpret_cast<const uint8_t*>(view.data())}; + } + } + + std::unique_ptr<DictBuilder> CreateDictBuilder() { + return std::unique_ptr<DictBuilder>(new DictBuilder(default_memory_pool())); + } + + // Setup encoder/decoder pair for testing with + virtual void SetupEncoderDecoder() = 0; + + void CheckDense(int actual_num_values, const ::arrow::Array& chunk) { + ASSERT_EQ(actual_num_values, num_values_ - null_count_); + ASSERT_ARRAYS_EQUAL(chunk, *expected_dense_); + } + + template <typename Builder> + void CheckDict(int actual_num_values, Builder& builder) { + ASSERT_EQ(actual_num_values, num_values_ - null_count_); + std::shared_ptr<::arrow::Array> actual; + ASSERT_OK(builder.Finish(&actual)); + ASSERT_ARRAYS_EQUAL(*actual, *expected_dict_); + } + + void CheckDecodeArrowUsingDenseBuilder() { + for (auto np : null_probabilities_) { + InitTestCase(np); + + typename EncodingTraits<ByteArrayType>::Accumulator acc; + acc.builder.reset(new ::arrow::BinaryBuilder); + auto actual_num_values = + decoder_->DecodeArrow(num_values_, null_count_, valid_bits_, 0, &acc); + + std::shared_ptr<::arrow::Array> chunk; + ASSERT_OK(acc.builder->Finish(&chunk)); + CheckDense(actual_num_values, *chunk); + } + } + + void CheckDecodeArrowUsingDictBuilder() { + for (auto np : null_probabilities_) { + InitTestCase(np); + auto builder = CreateDictBuilder(); + auto actual_num_values = + decoder_->DecodeArrow(num_values_, null_count_, valid_bits_, 0, builder.get()); + CheckDict(actual_num_values, *builder); + } + } + + void CheckDecodeArrowNonNullUsingDenseBuilder() { + for (auto np : null_probabilities_) { + InitTestCase(np); + if (null_count_ > 0) { + continue; + } + typename EncodingTraits<ByteArrayType>::Accumulator acc; + acc.builder.reset(new ::arrow::BinaryBuilder); + auto actual_num_values = decoder_->DecodeArrowNonNull(num_values_, &acc); + std::shared_ptr<::arrow::Array> chunk; + ASSERT_OK(acc.builder->Finish(&chunk)); + CheckDense(actual_num_values, *chunk); + } + } + + void CheckDecodeArrowNonNullUsingDictBuilder() { + for (auto np : null_probabilities_) { + InitTestCase(np); + if (null_count_ > 0) { + continue; + } + auto builder = CreateDictBuilder(); + auto actual_num_values = decoder_->DecodeArrowNonNull(num_values_, builder.get()); + CheckDict(actual_num_values, *builder); + } + } + + protected: + std::vector<double> null_probabilities_; + std::shared_ptr<::arrow::Array> expected_dict_; + std::shared_ptr<::arrow::Array> expected_dense_; + int num_values_; + int null_count_; + std::vector<ByteArray> input_data_; + const uint8_t* valid_bits_; + std::unique_ptr<ByteArrayEncoder> encoder_; + ByteArrayDecoder* decoder_; + std::unique_ptr<ByteArrayDecoder> plain_decoder_; + std::unique_ptr<DictDecoder<ByteArrayType>> dict_decoder_; + std::shared_ptr<Buffer> buffer_; +}; + +class PlainEncoding : public TestArrowBuilderDecoding { + public: + void SetupEncoderDecoder() override { + encoder_ = MakeTypedEncoder<ByteArrayType>(Encoding::PLAIN); + plain_decoder_ = MakeTypedDecoder<ByteArrayType>(Encoding::PLAIN); + decoder_ = plain_decoder_.get(); + if (valid_bits_ != nullptr) { + ASSERT_NO_THROW( + encoder_->PutSpaced(input_data_.data(), num_values_, valid_bits_, 0)); + } else { + ASSERT_NO_THROW(encoder_->Put(input_data_.data(), num_values_)); + } + buffer_ = encoder_->FlushValues(); + decoder_->SetData(num_values_, buffer_->data(), static_cast<int>(buffer_->size())); + } +}; + +TEST_F(PlainEncoding, CheckDecodeArrowUsingDenseBuilder) { + this->CheckDecodeArrowUsingDenseBuilder(); +} + +TEST_F(PlainEncoding, CheckDecodeArrowUsingDictBuilder) { + this->CheckDecodeArrowUsingDictBuilder(); +} + +TEST_F(PlainEncoding, CheckDecodeArrowNonNullDenseBuilder) { + this->CheckDecodeArrowNonNullUsingDenseBuilder(); +} + +TEST_F(PlainEncoding, CheckDecodeArrowNonNullDictBuilder) { + this->CheckDecodeArrowNonNullUsingDictBuilder(); +} + +TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) { + // Implemented as part of ARROW-3246 + + const int64_t size = 50; + const int32_t min_length = 0; + const int32_t max_length = 10; + const double null_probability = 0.25; + + auto CheckSeed = [&](int seed) { + ::arrow::random::RandomArrayGenerator rag(seed); + auto values = rag.String(size, min_length, max_length, null_probability); + + auto encoder = MakeTypedEncoder<ByteArrayType>(Encoding::PLAIN); + auto decoder = MakeTypedDecoder<ByteArrayType>(Encoding::PLAIN); + + ASSERT_NO_THROW(encoder->Put(*values)); + auto buf = encoder->FlushValues(); + + int num_values = static_cast<int>(values->length() - values->null_count()); + decoder->SetData(num_values, buf->data(), static_cast<int>(buf->size())); + + typename EncodingTraits<ByteArrayType>::Accumulator acc; + acc.builder.reset(new ::arrow::StringBuilder); + ASSERT_EQ(num_values, + decoder->DecodeArrow(static_cast<int>(values->length()), + static_cast<int>(values->null_count()), + values->null_bitmap_data(), values->offset(), &acc)); + + std::shared_ptr<::arrow::Array> result; + ASSERT_OK(acc.builder->Finish(&result)); + ASSERT_EQ(50, result->length()); + ::arrow::AssertArraysEqual(*values, *result); + }; + + for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { + CheckSeed(seed); + } +} + +template <typename T> +void GetDictDecoder(DictEncoder<T>* encoder, int64_t num_values, + std::shared_ptr<Buffer>* out_values, + std::shared_ptr<Buffer>* out_dict, const ColumnDescriptor* descr, + std::unique_ptr<TypedDecoder<T>>* out_decoder) { + auto decoder = MakeDictDecoder<T>(descr); + auto buf = encoder->FlushValues(); + auto dict_buf = AllocateBuffer(default_memory_pool(), encoder->dict_encoded_size()); + encoder->WriteDict(dict_buf->mutable_data()); + + auto dict_decoder = MakeTypedDecoder<T>(Encoding::PLAIN, descr); + dict_decoder->SetData(encoder->num_entries(), dict_buf->data(), + static_cast<int>(dict_buf->size())); + + decoder->SetData(static_cast<int>(num_values), buf->data(), + static_cast<int>(buf->size())); + decoder->SetDict(dict_decoder.get()); + + *out_values = buf; + *out_dict = dict_buf; + ASSERT_NE(decoder, nullptr); + auto released = dynamic_cast<TypedDecoder<T>*>(decoder.release()); + ASSERT_NE(released, nullptr); + *out_decoder = std::unique_ptr<TypedDecoder<T>>(released); +} + +template <typename ParquetType> +class EncodingAdHocTyped : public ::testing::Test { + public: + using ArrowType = typename EncodingTraits<ParquetType>::ArrowType; + using EncoderType = typename EncodingTraits<ParquetType>::Encoder; + using DecoderType = typename EncodingTraits<ParquetType>::Decoder; + using BuilderType = typename EncodingTraits<ParquetType>::Accumulator; + using DictBuilderType = typename EncodingTraits<ParquetType>::DictAccumulator; + + static const ColumnDescriptor* column_descr() { + static auto column_descr = ExampleDescr<ParquetType>(); + return column_descr.get(); + } + + std::shared_ptr<::arrow::Array> GetValues(int seed); + + static std::shared_ptr<::arrow::DataType> arrow_type(); + + void Plain(int seed) { + auto values = GetValues(seed); + auto encoder = MakeTypedEncoder<ParquetType>( + Encoding::PLAIN, /*use_dictionary=*/false, column_descr()); + auto decoder = MakeTypedDecoder<ParquetType>(Encoding::PLAIN, column_descr()); + + ASSERT_NO_THROW(encoder->Put(*values)); + auto buf = encoder->FlushValues(); + + int num_values = static_cast<int>(values->length() - values->null_count()); + decoder->SetData(num_values, buf->data(), static_cast<int>(buf->size())); + + BuilderType acc(arrow_type(), ::arrow::default_memory_pool()); + ASSERT_EQ(num_values, + decoder->DecodeArrow(static_cast<int>(values->length()), + static_cast<int>(values->null_count()), + values->null_bitmap_data(), values->offset(), &acc)); + + std::shared_ptr<::arrow::Array> result; + ASSERT_OK(acc.Finish(&result)); + ASSERT_EQ(50, result->length()); + ::arrow::AssertArraysEqual(*values, *result, /*verbose=*/true); + } + + void ByteStreamSplit(int seed) { + if (!std::is_same<ParquetType, FloatType>::value && + !std::is_same<ParquetType, DoubleType>::value) { + return; + } + auto values = GetValues(seed); + auto encoder = MakeTypedEncoder<ParquetType>( + Encoding::BYTE_STREAM_SPLIT, /*use_dictionary=*/false, column_descr()); + auto decoder = + MakeTypedDecoder<ParquetType>(Encoding::BYTE_STREAM_SPLIT, column_descr()); + + ASSERT_NO_THROW(encoder->Put(*values)); + auto buf = encoder->FlushValues(); + + int num_values = static_cast<int>(values->length() - values->null_count()); + decoder->SetData(num_values, buf->data(), static_cast<int>(buf->size())); + + BuilderType acc(arrow_type(), ::arrow::default_memory_pool()); + ASSERT_EQ(num_values, + decoder->DecodeArrow(static_cast<int>(values->length()), + static_cast<int>(values->null_count()), + values->null_bitmap_data(), values->offset(), &acc)); + + std::shared_ptr<::arrow::Array> result; + ASSERT_OK(acc.Finish(&result)); + ASSERT_EQ(50, result->length()); + ::arrow::AssertArraysEqual(*values, *result); + } + + void Dict(int seed) { + if (std::is_same<ParquetType, BooleanType>::value) { + return; + } + + auto values = GetValues(seed); + + auto owned_encoder = + MakeTypedEncoder<ParquetType>(Encoding::PLAIN, + /*use_dictionary=*/true, column_descr()); + auto encoder = dynamic_cast<DictEncoder<ParquetType>*>(owned_encoder.get()); + + ASSERT_NO_THROW(encoder->Put(*values)); + + std::shared_ptr<Buffer> buf, dict_buf; + int num_values = static_cast<int>(values->length() - values->null_count()); + + std::unique_ptr<TypedDecoder<ParquetType>> decoder; + GetDictDecoder(encoder, num_values, &buf, &dict_buf, column_descr(), &decoder); + + BuilderType acc(arrow_type(), ::arrow::default_memory_pool()); + ASSERT_EQ(num_values, + decoder->DecodeArrow(static_cast<int>(values->length()), + static_cast<int>(values->null_count()), + values->null_bitmap_data(), values->offset(), &acc)); + + std::shared_ptr<::arrow::Array> result; + ASSERT_OK(acc.Finish(&result)); + ::arrow::AssertArraysEqual(*values, *result); + } + + void DictPutIndices() { + if (std::is_same<ParquetType, BooleanType>::value) { + return; + } + + auto dict_values = ::arrow::ArrayFromJSON( + arrow_type(), std::is_same<ParquetType, FLBAType>::value + ? R"(["abcdefgh", "ijklmnop", "qrstuvwx"])" + : "[120, -37, 47]"); + auto indices = ::arrow::ArrayFromJSON(::arrow::int32(), "[0, 1, 2]"); + auto indices_nulls = + ::arrow::ArrayFromJSON(::arrow::int32(), "[null, 0, 1, null, 2]"); + + auto expected = ::arrow::ArrayFromJSON( + arrow_type(), std::is_same<ParquetType, FLBAType>::value + ? R"(["abcdefgh", "ijklmnop", "qrstuvwx", null, + "abcdefgh", "ijklmnop", null, "qrstuvwx"])" + : "[120, -37, 47, null, " + "120, -37, null, 47]"); + + auto owned_encoder = + MakeTypedEncoder<ParquetType>(Encoding::PLAIN, + /*use_dictionary=*/true, column_descr()); + auto owned_decoder = MakeDictDecoder<ParquetType>(); + + auto encoder = dynamic_cast<DictEncoder<ParquetType>*>(owned_encoder.get()); + + ASSERT_NO_THROW(encoder->PutDictionary(*dict_values)); + + // Trying to call PutDictionary again throws + ASSERT_THROW(encoder->PutDictionary(*dict_values), ParquetException); + + ASSERT_NO_THROW(encoder->PutIndices(*indices)); + ASSERT_NO_THROW(encoder->PutIndices(*indices_nulls)); + + std::shared_ptr<Buffer> buf, dict_buf; + int num_values = static_cast<int>(expected->length() - expected->null_count()); + + std::unique_ptr<TypedDecoder<ParquetType>> decoder; + GetDictDecoder(encoder, num_values, &buf, &dict_buf, column_descr(), &decoder); + + BuilderType acc(arrow_type(), ::arrow::default_memory_pool()); + ASSERT_EQ(num_values, decoder->DecodeArrow(static_cast<int>(expected->length()), + static_cast<int>(expected->null_count()), + expected->null_bitmap_data(), + expected->offset(), &acc)); + + std::shared_ptr<::arrow::Array> result; + ASSERT_OK(acc.Finish(&result)); + ::arrow::AssertArraysEqual(*expected, *result); + } + + protected: + const int64_t size_ = 50; + const double null_probability_ = 0.25; +}; + +template <typename ParquetType> +std::shared_ptr<::arrow::DataType> EncodingAdHocTyped<ParquetType>::arrow_type() { + return ::arrow::TypeTraits<ArrowType>::type_singleton(); +} + +template <> +std::shared_ptr<::arrow::DataType> EncodingAdHocTyped<FLBAType>::arrow_type() { + return ::arrow::fixed_size_binary(sizeof(uint64_t)); +} + +template <typename ParquetType> +std::shared_ptr<::arrow::Array> EncodingAdHocTyped<ParquetType>::GetValues(int seed) { + ::arrow::random::RandomArrayGenerator rag(seed); + return rag.Numeric<ArrowType>(size_, 0, 10, null_probability_); +} + +template <> +std::shared_ptr<::arrow::Array> EncodingAdHocTyped<BooleanType>::GetValues(int seed) { + ::arrow::random::RandomArrayGenerator rag(seed); + return rag.Boolean(size_, 0.1, null_probability_); +} + +template <> +std::shared_ptr<::arrow::Array> EncodingAdHocTyped<FLBAType>::GetValues(int seed) { + ::arrow::random::RandomArrayGenerator rag(seed); + std::shared_ptr<::arrow::Array> values; + ARROW_EXPECT_OK( + rag.UInt64(size_, 0, std::numeric_limits<uint64_t>::max(), null_probability_) + ->View(arrow_type()) + .Value(&values)); + return values; +} + +using EncodingAdHocTypedCases = + ::testing::Types<BooleanType, Int32Type, Int64Type, FloatType, DoubleType, FLBAType>; + +TYPED_TEST_SUITE(EncodingAdHocTyped, EncodingAdHocTypedCases); + +TYPED_TEST(EncodingAdHocTyped, PlainArrowDirectPut) { + for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { + this->Plain(seed); + } +} + +TYPED_TEST(EncodingAdHocTyped, ByteStreamSplitArrowDirectPut) { + for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { + this->ByteStreamSplit(seed); + } +} + +TEST(DictEncodingAdHoc, ArrowBinaryDirectPut) { + // Implemented as part of ARROW-3246 + const int64_t size = 50; + const int64_t min_length = 0; + const int64_t max_length = 10; + const double null_probability = 0.1; + ::arrow::random::RandomArrayGenerator rag(0); + auto values = rag.String(size, min_length, max_length, null_probability); + + auto owned_encoder = MakeTypedEncoder<ByteArrayType>(Encoding::PLAIN, + /*use_dictionary=*/true); + + auto encoder = dynamic_cast<DictEncoder<ByteArrayType>*>(owned_encoder.get()); + + ASSERT_NO_THROW(encoder->Put(*values)); + + std::unique_ptr<ByteArrayDecoder> decoder; + std::shared_ptr<Buffer> buf, dict_buf; + int num_values = static_cast<int>(values->length() - values->null_count()); + GetDictDecoder(encoder, num_values, &buf, &dict_buf, nullptr, &decoder); + + typename EncodingTraits<ByteArrayType>::Accumulator acc; + acc.builder.reset(new ::arrow::StringBuilder); + ASSERT_EQ(num_values, + decoder->DecodeArrow(static_cast<int>(values->length()), + static_cast<int>(values->null_count()), + values->null_bitmap_data(), values->offset(), &acc)); + + std::shared_ptr<::arrow::Array> result; + ASSERT_OK(acc.builder->Finish(&result)); + ::arrow::AssertArraysEqual(*values, *result); +} + +TYPED_TEST(EncodingAdHocTyped, DictArrowDirectPut) { this->Dict(0); } + +TEST(DictEncodingAdHoc, PutDictionaryPutIndices) { + // Part of ARROW-3246 + auto dict_values = + ::arrow::ArrayFromJSON(::arrow::binary(), "[\"foo\", \"bar\", \"baz\"]"); + + auto CheckIndexType = [&](const std::shared_ptr<::arrow::DataType>& index_ty) { + auto indices = ::arrow::ArrayFromJSON(index_ty, "[0, 1, 2]"); + auto indices_nulls = ::arrow::ArrayFromJSON(index_ty, "[null, 0, 1, null, 2]"); + + auto expected = ::arrow::ArrayFromJSON(::arrow::binary(), + "[\"foo\", \"bar\", \"baz\", null, " + "\"foo\", \"bar\", null, \"baz\"]"); + + auto owned_encoder = MakeTypedEncoder<ByteArrayType>(Encoding::PLAIN, + /*use_dictionary=*/true); + auto owned_decoder = MakeDictDecoder<ByteArrayType>(); + + auto encoder = dynamic_cast<DictEncoder<ByteArrayType>*>(owned_encoder.get()); + + ASSERT_NO_THROW(encoder->PutDictionary(*dict_values)); + + // Trying to call PutDictionary again throws + ASSERT_THROW(encoder->PutDictionary(*dict_values), ParquetException); + + ASSERT_NO_THROW(encoder->PutIndices(*indices)); + ASSERT_NO_THROW(encoder->PutIndices(*indices_nulls)); + + std::unique_ptr<ByteArrayDecoder> decoder; + std::shared_ptr<Buffer> buf, dict_buf; + int num_values = static_cast<int>(expected->length() - expected->null_count()); + GetDictDecoder(encoder, num_values, &buf, &dict_buf, nullptr, &decoder); + + typename EncodingTraits<ByteArrayType>::Accumulator acc; + acc.builder.reset(new ::arrow::BinaryBuilder); + ASSERT_EQ(num_values, decoder->DecodeArrow(static_cast<int>(expected->length()), + static_cast<int>(expected->null_count()), + expected->null_bitmap_data(), + expected->offset(), &acc)); + + std::shared_ptr<::arrow::Array> result; + ASSERT_OK(acc.builder->Finish(&result)); + ::arrow::AssertArraysEqual(*expected, *result); + }; + + for (auto ty : ::arrow::all_dictionary_index_types()) { + CheckIndexType(ty); + } +} + +TYPED_TEST(EncodingAdHocTyped, DictArrowDirectPutIndices) { this->DictPutIndices(); } + +class DictEncoding : public TestArrowBuilderDecoding { + public: + void SetupEncoderDecoder() override { + auto node = schema::ByteArray("name"); + descr_ = std::unique_ptr<ColumnDescriptor>(new ColumnDescriptor(node, 0, 0)); + encoder_ = MakeTypedEncoder<ByteArrayType>(Encoding::PLAIN, /*use_dictionary=*/true, + descr_.get()); + if (null_count_ == 0) { + ASSERT_NO_THROW(encoder_->Put(input_data_.data(), num_values_)); + } else { + ASSERT_NO_THROW( + encoder_->PutSpaced(input_data_.data(), num_values_, valid_bits_, 0)); + } + buffer_ = encoder_->FlushValues(); + + auto dict_encoder = dynamic_cast<DictEncoder<ByteArrayType>*>(encoder_.get()); + ASSERT_NE(dict_encoder, nullptr); + dict_buffer_ = + AllocateBuffer(default_memory_pool(), dict_encoder->dict_encoded_size()); + dict_encoder->WriteDict(dict_buffer_->mutable_data()); + + // Simulate reading the dictionary page followed by a data page + plain_decoder_ = MakeTypedDecoder<ByteArrayType>(Encoding::PLAIN, descr_.get()); + plain_decoder_->SetData(dict_encoder->num_entries(), dict_buffer_->data(), + static_cast<int>(dict_buffer_->size())); + + dict_decoder_ = MakeDictDecoder<ByteArrayType>(descr_.get()); + dict_decoder_->SetDict(plain_decoder_.get()); + dict_decoder_->SetData(num_values_, buffer_->data(), + static_cast<int>(buffer_->size())); + decoder_ = dynamic_cast<ByteArrayDecoder*>(dict_decoder_.get()); + } + + protected: + std::unique_ptr<ColumnDescriptor> descr_; + std::shared_ptr<Buffer> dict_buffer_; +}; + +TEST_F(DictEncoding, CheckDecodeArrowUsingDenseBuilder) { + this->CheckDecodeArrowUsingDenseBuilder(); +} + +TEST_F(DictEncoding, CheckDecodeArrowUsingDictBuilder) { + this->CheckDecodeArrowUsingDictBuilder(); +} + +TEST_F(DictEncoding, CheckDecodeArrowNonNullDenseBuilder) { + this->CheckDecodeArrowNonNullUsingDenseBuilder(); +} + +TEST_F(DictEncoding, CheckDecodeArrowNonNullDictBuilder) { + this->CheckDecodeArrowNonNullUsingDictBuilder(); +} + +TEST_F(DictEncoding, CheckDecodeIndicesSpaced) { + for (auto np : null_probabilities_) { + InitTestCase(np); + auto builder = CreateDictBuilder(); + dict_decoder_->InsertDictionary(builder.get()); + int actual_num_values; + if (null_count_ == 0) { + actual_num_values = dict_decoder_->DecodeIndices(num_values_, builder.get()); + } else { + actual_num_values = dict_decoder_->DecodeIndicesSpaced( + num_values_, null_count_, valid_bits_, 0, builder.get()); + } + ASSERT_EQ(actual_num_values, num_values_ - null_count_); + std::shared_ptr<::arrow::Array> actual; + ASSERT_OK(builder->Finish(&actual)); + ASSERT_ARRAYS_EQUAL(*actual, *expected_dict_); + + // Check that null indices are zero-initialized + const auto& dict_actual = checked_cast<const ::arrow::DictionaryArray&>(*actual); + const auto& indices = + checked_cast<const ::arrow::Int32Array&>(*dict_actual.indices()); + + auto raw_values = indices.raw_values(); + for (int64_t i = 0; i < indices.length(); ++i) { + if (indices.IsNull(i) && raw_values[i] != 0) { + FAIL() << "Null slot not zero-initialized"; + } + } + } +} + +TEST_F(DictEncoding, CheckDecodeIndicesNoNulls) { + InitTestCase(/*null_probability=*/0.0); + auto builder = CreateDictBuilder(); + dict_decoder_->InsertDictionary(builder.get()); + auto actual_num_values = dict_decoder_->DecodeIndices(num_values_, builder.get()); + CheckDict(actual_num_values, *builder); +} + +// ---------------------------------------------------------------------- +// BYTE_STREAM_SPLIT encode/decode tests. + +template <typename Type> +class TestByteStreamSplitEncoding : public TestEncodingBase<Type> { + public: + using c_type = typename Type::c_type; + static constexpr int TYPE = Type::type_num; + + void CheckRoundtrip() override { + auto encoder = + MakeTypedEncoder<Type>(Encoding::BYTE_STREAM_SPLIT, false, descr_.get()); + auto decoder = MakeTypedDecoder<Type>(Encoding::BYTE_STREAM_SPLIT, descr_.get()); + encoder->Put(draws_, num_values_); + encode_buffer_ = encoder->FlushValues(); + + { + decoder->SetData(num_values_, encode_buffer_->data(), + static_cast<int>(encode_buffer_->size())); + int values_decoded = decoder->Decode(decode_buf_, num_values_); + ASSERT_EQ(num_values_, values_decoded); + ASSERT_NO_FATAL_FAILURE(VerifyResults<c_type>(decode_buf_, draws_, num_values_)); + } + + { + // Try again but with a small step. + decoder->SetData(num_values_, encode_buffer_->data(), + static_cast<int>(encode_buffer_->size())); + int step = 131; + int remaining = num_values_; + for (int i = 0; i < num_values_; i += step) { + int num_decoded = decoder->Decode(decode_buf_, step); + ASSERT_EQ(num_decoded, std::min(step, remaining)); + ASSERT_NO_FATAL_FAILURE( + VerifyResults<c_type>(decode_buf_, &draws_[i], num_decoded)); + remaining -= num_decoded; + } + } + + { + std::vector<uint8_t> valid_bits(::arrow::BitUtil::BytesForBits(num_values_), 0); + std::vector<c_type> expected_filtered_output; + const int every_nth = 5; + expected_filtered_output.reserve((num_values_ + every_nth - 1) / every_nth); + ::arrow::internal::BitmapWriter writer{valid_bits.data(), 0, num_values_}; + // Set every fifth bit. + for (int i = 0; i < num_values_; ++i) { + if (i % every_nth == 0) { + writer.Set(); + expected_filtered_output.push_back(draws_[i]); + } + writer.Next(); + } + writer.Finish(); + const int expected_size = static_cast<int>(expected_filtered_output.size()); + ASSERT_NO_THROW(encoder->PutSpaced(draws_, num_values_, valid_bits.data(), 0)); + encode_buffer_ = encoder->FlushValues(); + + decoder->SetData(expected_size, encode_buffer_->data(), + static_cast<int>(encode_buffer_->size())); + int values_decoded = decoder->Decode(decode_buf_, num_values_); + ASSERT_EQ(expected_size, values_decoded); + ASSERT_NO_FATAL_FAILURE(VerifyResults<c_type>( + decode_buf_, expected_filtered_output.data(), expected_size)); + } + } + + void CheckDecode(); + void CheckEncode(); + + protected: + USING_BASE_MEMBERS(); + + void CheckDecode(const uint8_t* encoded_data, const int64_t encoded_data_size, + const c_type* expected_decoded_data, const int num_elements) { + std::unique_ptr<TypedDecoder<Type>> decoder = + MakeTypedDecoder<Type>(Encoding::BYTE_STREAM_SPLIT); + decoder->SetData(num_elements, encoded_data, static_cast<int>(encoded_data_size)); + std::vector<c_type> decoded_data(num_elements); + int num_decoded_elements = decoder->Decode(decoded_data.data(), num_elements); + ASSERT_EQ(num_elements, num_decoded_elements); + for (size_t i = 0U; i < decoded_data.size(); ++i) { + ASSERT_EQ(expected_decoded_data[i], decoded_data[i]); + } + ASSERT_EQ(0, decoder->values_left()); + } + + void CheckEncode(const c_type* data, const int num_elements, + const uint8_t* expected_encoded_data, + const int64_t encoded_data_size) { + std::unique_ptr<TypedEncoder<Type>> encoder = + MakeTypedEncoder<Type>(Encoding::BYTE_STREAM_SPLIT); + encoder->Put(data, num_elements); + auto encoded_data = encoder->FlushValues(); + ASSERT_EQ(encoded_data_size, encoded_data->size()); + const uint8_t* encoded_data_raw = encoded_data->data(); + for (int64_t i = 0; i < encoded_data->size(); ++i) { + ASSERT_EQ(expected_encoded_data[i], encoded_data_raw[i]); + } + } +}; + +template <typename c_type> +static std::vector<c_type> ToLittleEndian(const std::vector<c_type>& input) { + std::vector<c_type> data(input.size()); + std::transform(input.begin(), input.end(), data.begin(), [](const c_type& value) { + return ::arrow::BitUtil::ToLittleEndian(value); + }); + return data; +} + +static_assert(sizeof(float) == sizeof(uint32_t), + "BYTE_STREAM_SPLIT encoding tests assume float / uint32_t type sizes"); +static_assert(sizeof(double) == sizeof(uint64_t), + "BYTE_STREAM_SPLIT encoding tests assume double / uint64_t type sizes"); + +template <> +void TestByteStreamSplitEncoding<FloatType>::CheckDecode() { + const uint8_t data[] = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66, + 0x77, 0x88, 0x99, 0xAA, 0xBB, 0xCC}; + const auto expected_output = + ToLittleEndian<uint32_t>({0xAA774411U, 0xBB885522U, 0xCC996633U}); + CheckDecode(data, static_cast<int64_t>(sizeof(data)), + reinterpret_cast<const float*>(expected_output.data()), + static_cast<int>(sizeof(data) / sizeof(float))); +} + +template <> +void TestByteStreamSplitEncoding<DoubleType>::CheckDecode() { + const uint8_t data[] = {0xDE, 0xC0, 0x37, 0x13, 0x11, 0x22, 0x33, 0x44, + 0xAA, 0xBB, 0xCC, 0xDD, 0x55, 0x66, 0x77, 0x88}; + const auto expected_output = + ToLittleEndian<uint64_t>({0x7755CCAA331137DEULL, 0x8866DDBB442213C0ULL}); + CheckDecode(data, static_cast<int64_t>(sizeof(data)), + reinterpret_cast<const double*>(expected_output.data()), + static_cast<int>(sizeof(data) / sizeof(double))); +} + +template <> +void TestByteStreamSplitEncoding<DoubleType>::CheckEncode() { + const auto data = ToLittleEndian<uint64_t>( + {0x4142434445464748ULL, 0x0102030405060708ULL, 0xb1b2b3b4b5b6b7b8ULL}); + const uint8_t expected_output[24] = { + 0x48, 0x08, 0xb8, 0x47, 0x07, 0xb7, 0x46, 0x06, 0xb6, 0x45, 0x05, 0xb5, + 0x44, 0x04, 0xb4, 0x43, 0x03, 0xb3, 0x42, 0x02, 0xb2, 0x41, 0x01, 0xb1, + }; + CheckEncode(reinterpret_cast<const double*>(data.data()), static_cast<int>(data.size()), + expected_output, sizeof(expected_output)); +} + +template <> +void TestByteStreamSplitEncoding<FloatType>::CheckEncode() { + const auto data = ToLittleEndian<uint32_t>({0xaabbccdd, 0x11223344}); + const uint8_t expected_output[8] = {0xdd, 0x44, 0xcc, 0x33, 0xbb, 0x22, 0xaa, 0x11}; + CheckEncode(reinterpret_cast<const float*>(data.data()), static_cast<int>(data.size()), + expected_output, sizeof(expected_output)); +} + +typedef ::testing::Types<FloatType, DoubleType> ByteStreamSplitTypes; +TYPED_TEST_SUITE(TestByteStreamSplitEncoding, ByteStreamSplitTypes); + +TYPED_TEST(TestByteStreamSplitEncoding, BasicRoundTrip) { + for (int values = 0; values < 32; ++values) { + ASSERT_NO_FATAL_FAILURE(this->Execute(values, 1)); + } + + // We need to test with different sizes to guarantee that the SIMD implementation + // can handle both inputs with size divisible by 4/8 and sizes which would + // require a scalar loop for the suffix. + constexpr size_t kSuffixSize = 7; + constexpr size_t kAvx2Size = 32; // sizeof(__m256i) for AVX2 + constexpr size_t kAvx512Size = 64; // sizeof(__m512i) for AVX512 + constexpr size_t kMultiSimdSize = kAvx512Size * 7; + + // Exercise only one SIMD loop. SSE and AVX2 covered in above loop. + ASSERT_NO_FATAL_FAILURE(this->Execute(kAvx512Size, 1)); + // Exercise one SIMD loop with suffix. SSE covered in above loop. + ASSERT_NO_FATAL_FAILURE(this->Execute(kAvx2Size + kSuffixSize, 1)); + ASSERT_NO_FATAL_FAILURE(this->Execute(kAvx512Size + kSuffixSize, 1)); + // Exercise multi SIMD loop. + ASSERT_NO_FATAL_FAILURE(this->Execute(kMultiSimdSize, 1)); + // Exercise multi SIMD loop with suffix. + ASSERT_NO_FATAL_FAILURE(this->Execute(kMultiSimdSize + kSuffixSize, 1)); +} + +TYPED_TEST(TestByteStreamSplitEncoding, RoundTripSingleElement) { + ASSERT_NO_FATAL_FAILURE(this->Execute(1, 1)); +} + +TYPED_TEST(TestByteStreamSplitEncoding, CheckOnlyDecode) { + ASSERT_NO_FATAL_FAILURE(this->CheckDecode()); +} + +TYPED_TEST(TestByteStreamSplitEncoding, CheckOnlyEncode) { + ASSERT_NO_FATAL_FAILURE(this->CheckEncode()); +} + +TEST(ByteStreamSplitEncodeDecode, InvalidDataTypes) { + // First check encoders. + ASSERT_THROW(MakeTypedEncoder<Int32Type>(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedEncoder<Int64Type>(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedEncoder<Int96Type>(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedEncoder<BooleanType>(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedEncoder<ByteArrayType>(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedEncoder<FLBAType>(Encoding::BYTE_STREAM_SPLIT), ParquetException); + + // Then check decoders. + ASSERT_THROW(MakeTypedDecoder<Int32Type>(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedDecoder<Int64Type>(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedDecoder<Int96Type>(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedDecoder<BooleanType>(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedDecoder<ByteArrayType>(Encoding::BYTE_STREAM_SPLIT), + ParquetException); + ASSERT_THROW(MakeTypedDecoder<FLBAType>(Encoding::BYTE_STREAM_SPLIT), ParquetException); +} + +} // namespace test +} // namespace parquet |