diff options
Diffstat (limited to 'src/arrow/cpp/src/parquet/column_page.h')
-rw-r--r-- | src/arrow/cpp/src/parquet/column_page.h | 160 |
1 files changed, 160 insertions, 0 deletions
diff --git a/src/arrow/cpp/src/parquet/column_page.h b/src/arrow/cpp/src/parquet/column_page.h new file mode 100644 index 000000000..2fab77ed0 --- /dev/null +++ b/src/arrow/cpp/src/parquet/column_page.h @@ -0,0 +1,160 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This module defines an abstract interface for iterating through pages in a +// Parquet column chunk within a row group. It could be extended in the future +// to iterate through all data pages in all chunks in a file. + +#pragma once + +#include <cstdint> +#include <memory> +#include <string> + +#include "parquet/statistics.h" +#include "parquet/types.h" + +namespace parquet { + +// TODO: Parallel processing is not yet safe because of memory-ownership +// semantics (the PageReader may or may not own the memory referenced by a +// page) +// +// TODO(wesm): In the future Parquet implementations may store the crc code +// in format::PageHeader. parquet-mr currently does not, so we also skip it +// here, both on the read and write path +class Page { + public: + Page(const std::shared_ptr<Buffer>& buffer, PageType::type type) + : buffer_(buffer), type_(type) {} + + PageType::type type() const { return type_; } + + std::shared_ptr<Buffer> buffer() const { return buffer_; } + + // @returns: a pointer to the page's data + const uint8_t* data() const { return buffer_->data(); } + + // @returns: the total size in bytes of the page's data buffer + int32_t size() const { return static_cast<int32_t>(buffer_->size()); } + + private: + std::shared_ptr<Buffer> buffer_; + PageType::type type_; +}; + +/// \brief Base type for DataPageV1 and DataPageV2 including common attributes +class DataPage : public Page { + public: + int32_t num_values() const { return num_values_; } + Encoding::type encoding() const { return encoding_; } + int64_t uncompressed_size() const { return uncompressed_size_; } + const EncodedStatistics& statistics() const { return statistics_; } + + virtual ~DataPage() = default; + + protected: + DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t num_values, + Encoding::type encoding, int64_t uncompressed_size, + const EncodedStatistics& statistics = EncodedStatistics()) + : Page(buffer, type), + num_values_(num_values), + encoding_(encoding), + uncompressed_size_(uncompressed_size), + statistics_(statistics) {} + + int32_t num_values_; + Encoding::type encoding_; + int64_t uncompressed_size_; + EncodedStatistics statistics_; +}; + +class DataPageV1 : public DataPage { + public: + DataPageV1(const std::shared_ptr<Buffer>& buffer, int32_t num_values, + Encoding::type encoding, Encoding::type definition_level_encoding, + Encoding::type repetition_level_encoding, int64_t uncompressed_size, + const EncodedStatistics& statistics = EncodedStatistics()) + : DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size, + statistics), + definition_level_encoding_(definition_level_encoding), + repetition_level_encoding_(repetition_level_encoding) {} + + Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; } + + Encoding::type definition_level_encoding() const { return definition_level_encoding_; } + + private: + Encoding::type definition_level_encoding_; + Encoding::type repetition_level_encoding_; +}; + +class DataPageV2 : public DataPage { + public: + DataPageV2(const std::shared_ptr<Buffer>& buffer, int32_t num_values, int32_t num_nulls, + int32_t num_rows, Encoding::type encoding, + int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length, + int64_t uncompressed_size, bool is_compressed = false, + const EncodedStatistics& statistics = EncodedStatistics()) + : DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size, + statistics), + num_nulls_(num_nulls), + num_rows_(num_rows), + definition_levels_byte_length_(definition_levels_byte_length), + repetition_levels_byte_length_(repetition_levels_byte_length), + is_compressed_(is_compressed) {} + + int32_t num_nulls() const { return num_nulls_; } + + int32_t num_rows() const { return num_rows_; } + + int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; } + + int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; } + + bool is_compressed() const { return is_compressed_; } + + private: + int32_t num_nulls_; + int32_t num_rows_; + int32_t definition_levels_byte_length_; + int32_t repetition_levels_byte_length_; + bool is_compressed_; +}; + +class DictionaryPage : public Page { + public: + DictionaryPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values, + Encoding::type encoding, bool is_sorted = false) + : Page(buffer, PageType::DICTIONARY_PAGE), + num_values_(num_values), + encoding_(encoding), + is_sorted_(is_sorted) {} + + int32_t num_values() const { return num_values_; } + + Encoding::type encoding() const { return encoding_; } + + bool is_sorted() const { return is_sorted_; } + + private: + int32_t num_values_; + Encoding::type encoding_; + bool is_sorted_; +}; + +} // namespace parquet |