summaryrefslogtreecommitdiffstats
path: root/src/arrow/cpp/examples/parquet
diff options
context:
space:
mode:
Diffstat (limited to 'src/arrow/cpp/examples/parquet')
-rw-r--r--src/arrow/cpp/examples/parquet/CMakeLists.txt78
-rw-r--r--src/arrow/cpp/examples/parquet/low_level_api/encryption_reader_writer.cc451
-rw-r--r--src/arrow/cpp/examples/parquet/low_level_api/encryption_reader_writer_all_crypto_options.cc656
-rw-r--r--src/arrow/cpp/examples/parquet/low_level_api/reader_writer.cc413
-rw-r--r--src/arrow/cpp/examples/parquet/low_level_api/reader_writer.h70
-rw-r--r--src/arrow/cpp/examples/parquet/low_level_api/reader_writer2.cc434
-rw-r--r--src/arrow/cpp/examples/parquet/parquet_arrow/CMakeLists.txt42
-rw-r--r--src/arrow/cpp/examples/parquet/parquet_arrow/README.md20
-rw-r--r--src/arrow/cpp/examples/parquet/parquet_arrow/reader_writer.cc140
-rw-r--r--src/arrow/cpp/examples/parquet/parquet_stream_api/stream_reader_writer.cc324
10 files changed, 2628 insertions, 0 deletions
diff --git a/src/arrow/cpp/examples/parquet/CMakeLists.txt b/src/arrow/cpp/examples/parquet/CMakeLists.txt
new file mode 100644
index 000000000..2d16948ae
--- /dev/null
+++ b/src/arrow/cpp/examples/parquet/CMakeLists.txt
@@ -0,0 +1,78 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+add_executable(parquet_low_level_example low_level_api/reader_writer.cc)
+add_executable(parquet_low_level_example2 low_level_api/reader_writer2.cc)
+add_executable(parquet_arrow_example parquet_arrow/reader_writer.cc)
+add_executable(parquet_stream_api_example parquet_stream_api/stream_reader_writer.cc)
+target_include_directories(parquet_low_level_example PRIVATE low_level_api/)
+target_include_directories(parquet_low_level_example2 PRIVATE low_level_api/)
+
+# The variables in these files are for illustration purposes
+set(PARQUET_EXAMPLES_WARNING_SUPPRESSIONS
+ low_level_api/reader_writer.cc
+ low_level_api/reader_writer2.cc)
+
+if (PARQUET_REQUIRE_ENCRYPTION)
+ add_executable(parquet_encryption_example low_level_api/encryption_reader_writer.cc)
+ add_executable(parquet_encryption_example_all_crypto_options low_level_api/encryption_reader_writer_all_crypto_options.cc)
+ target_include_directories(parquet_encryption_example PRIVATE low_level_api/)
+ target_include_directories(parquet_encryption_example_all_crypto_options PRIVATE low_level_api/)
+
+ set(PARQUET_EXAMPLES_WARNING_SUPPRESSIONS
+ ${PARQUET_EXAMPLES_WARNING_SUPPRESSIONS}
+ low_level_api/encryption_reader_writer.cc
+ low_level_api/encryption_reader_writer_all_crypto_options.cc)
+
+endif()
+
+if(UNIX)
+ foreach(FILE ${PARQUET_EXAMPLES_WARNING_SUPPRESSIONS})
+ set_property(SOURCE ${FILE}
+ APPEND_STRING
+ PROPERTY COMPILE_FLAGS "-Wno-unused-variable")
+ endforeach()
+endif()
+
+# Prefer shared linkage but use static if shared build is deactivated
+if (ARROW_BUILD_SHARED)
+ set(PARQUET_EXAMPLE_LINK_LIBS parquet_shared)
+else()
+ set(PARQUET_EXAMPLE_LINK_LIBS parquet_static)
+endif()
+
+target_link_libraries(parquet_arrow_example ${PARQUET_EXAMPLE_LINK_LIBS})
+target_link_libraries(parquet_low_level_example ${PARQUET_EXAMPLE_LINK_LIBS})
+target_link_libraries(parquet_low_level_example2 ${PARQUET_EXAMPLE_LINK_LIBS})
+target_link_libraries(parquet_stream_api_example ${PARQUET_EXAMPLE_LINK_LIBS})
+
+if(PARQUET_REQUIRE_ENCRYPTION)
+ target_link_libraries(parquet_encryption_example ${PARQUET_EXAMPLE_LINK_LIBS})
+ target_link_libraries(parquet_encryption_example_all_crypto_options ${PARQUET_EXAMPLE_LINK_LIBS})
+endif()
+
+add_dependencies(parquet
+ parquet_low_level_example
+ parquet_low_level_example2
+ parquet_arrow_example
+ parquet_stream_api_example)
+
+if (PARQUET_REQUIRE_ENCRYPTION)
+ add_dependencies(parquet
+ parquet_encryption_example
+ parquet_encryption_example_all_crypto_options)
+endif()
diff --git a/src/arrow/cpp/examples/parquet/low_level_api/encryption_reader_writer.cc b/src/arrow/cpp/examples/parquet/low_level_api/encryption_reader_writer.cc
new file mode 100644
index 000000000..75788b283
--- /dev/null
+++ b/src/arrow/cpp/examples/parquet/low_level_api/encryption_reader_writer.cc
@@ -0,0 +1,451 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <reader_writer.h>
+
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <memory>
+
+/*
+ * This file contains a sample for writing and reading an encrypted Parquet file with
+ * basic encryption configuration.
+ *
+ * A detailed description of the Parquet Modular Encryption specification can be found
+ * here:
+ * https://github.com/apache/parquet-format/blob/encryption/Encryption.md
+ *
+ * The write sample creates a file with eight columns where two of the columns and the
+ * footer are encrypted.
+ *
+ * The read sample decrypts using key retriever that holds the keys of two encrypted
+ * columns and the footer key.
+ */
+
+constexpr int NUM_ROWS_PER_ROW_GROUP = 500;
+const char* PARQUET_FILENAME = "parquet_cpp_example.parquet.encrypted";
+const char* kFooterEncryptionKey = "0123456789012345"; // 128bit/16
+const char* kColumnEncryptionKey1 = "1234567890123450";
+const char* kColumnEncryptionKey2 = "1234567890123451";
+
+int main(int argc, char** argv) {
+ /**********************************************************************************
+ PARQUET ENCRYPTION WRITER EXAMPLE
+ **********************************************************************************/
+
+ try {
+ // Create a local file output stream instance.
+ using FileClass = ::arrow::io::FileOutputStream;
+ std::shared_ptr<FileClass> out_file;
+ PARQUET_ASSIGN_OR_THROW(out_file, FileClass::Open(PARQUET_FILENAME));
+
+ // Setup the parquet schema
+ std::shared_ptr<GroupNode> schema = SetupSchema();
+
+ // Add encryption properties
+ // Encryption configuration: Encrypt two columns and the footer.
+ std::map<std::string, std::shared_ptr<parquet::ColumnEncryptionProperties>>
+ encryption_cols;
+
+ parquet::SchemaDescriptor schema_desc;
+ schema_desc.Init(schema);
+ auto column_path1 = schema_desc.Column(5)->path()->ToDotString();
+ auto column_path2 = schema_desc.Column(4)->path()->ToDotString();
+
+ parquet::ColumnEncryptionProperties::Builder encryption_col_builder0(column_path1);
+ parquet::ColumnEncryptionProperties::Builder encryption_col_builder1(column_path2);
+ encryption_col_builder0.key(kColumnEncryptionKey1)->key_id("kc1");
+ encryption_col_builder1.key(kColumnEncryptionKey2)->key_id("kc2");
+
+ encryption_cols[column_path1] = encryption_col_builder0.build();
+ encryption_cols[column_path2] = encryption_col_builder1.build();
+
+ parquet::FileEncryptionProperties::Builder file_encryption_builder(
+ kFooterEncryptionKey);
+
+ parquet::WriterProperties::Builder builder;
+ // Add the current encryption configuration to WriterProperties.
+ builder.encryption(file_encryption_builder.footer_key_metadata("kf")
+ ->encrypted_columns(encryption_cols)
+ ->build());
+
+ // Add other writer properties
+ builder.compression(parquet::Compression::SNAPPY);
+
+ std::shared_ptr<parquet::WriterProperties> props = builder.build();
+
+ // Create a ParquetFileWriter instance
+ std::shared_ptr<parquet::ParquetFileWriter> file_writer =
+ parquet::ParquetFileWriter::Open(out_file, schema, props);
+
+ // Append a RowGroup with a specific number of rows.
+ parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup();
+
+ // Write the Bool column
+ parquet::BoolWriter* bool_writer =
+ static_cast<parquet::BoolWriter*>(rg_writer->NextColumn());
+ for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
+ bool value = ((i % 2) == 0) ? true : false;
+ bool_writer->WriteBatch(1, nullptr, nullptr, &value);
+ }
+
+ // Write the Int32 column
+ parquet::Int32Writer* int32_writer =
+ static_cast<parquet::Int32Writer*>(rg_writer->NextColumn());
+ for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
+ int32_t value = i;
+ int32_writer->WriteBatch(1, nullptr, nullptr, &value);
+ }
+
+    // Write the Int64 column. Each row repeats twice.
+ parquet::Int64Writer* int64_writer =
+ static_cast<parquet::Int64Writer*>(rg_writer->NextColumn());
+ for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) {
+ int64_t value = i * 1000 * 1000;
+ value *= 1000 * 1000;
+ int16_t definition_level = 1;
+ int16_t repetition_level = 0;
+ if ((i % 2) == 0) {
+ repetition_level = 1; // start of a new record
+ }
+ int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value);
+ }
+
+ // Write the INT96 column.
+ parquet::Int96Writer* int96_writer =
+ static_cast<parquet::Int96Writer*>(rg_writer->NextColumn());
+ for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
+ parquet::Int96 value;
+ value.value[0] = i;
+ value.value[1] = i + 1;
+ value.value[2] = i + 2;
+ int96_writer->WriteBatch(1, nullptr, nullptr, &value);
+ }
+
+ // Write the Float column
+ parquet::FloatWriter* float_writer =
+ static_cast<parquet::FloatWriter*>(rg_writer->NextColumn());
+ for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
+ float value = static_cast<float>(i) * 1.1f;
+ float_writer->WriteBatch(1, nullptr, nullptr, &value);
+ }
+
+ // Write the Double column
+ parquet::DoubleWriter* double_writer =
+ static_cast<parquet::DoubleWriter*>(rg_writer->NextColumn());
+ for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
+ double value = i * 1.1111111;
+ double_writer->WriteBatch(1, nullptr, nullptr, &value);
+ }
+
+    // Write the ByteArray column. Make every alternate value NULL
+ parquet::ByteArrayWriter* ba_writer =
+ static_cast<parquet::ByteArrayWriter*>(rg_writer->NextColumn());
+ for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
+ parquet::ByteArray value;
+ char hello[FIXED_LENGTH] = "parquet";
+ hello[7] = static_cast<char>(static_cast<int>('0') + i / 100);
+ hello[8] = static_cast<char>(static_cast<int>('0') + (i / 10) % 10);
+ hello[9] = static_cast<char>(static_cast<int>('0') + i % 10);
+ if (i % 2 == 0) {
+ int16_t definition_level = 1;
+ value.ptr = reinterpret_cast<const uint8_t*>(&hello[0]);
+ value.len = FIXED_LENGTH;
+ ba_writer->WriteBatch(1, &definition_level, nullptr, &value);
+ } else {
+ int16_t definition_level = 0;
+ ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr);
+ }
+ }
+
+ // Write the FixedLengthByteArray column
+ parquet::FixedLenByteArrayWriter* flba_writer =
+ static_cast<parquet::FixedLenByteArrayWriter*>(rg_writer->NextColumn());
+ for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
+ parquet::FixedLenByteArray value;
+ char v = static_cast<char>(i);
+ char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v};
+ value.ptr = reinterpret_cast<const uint8_t*>(&flba[0]);
+
+ flba_writer->WriteBatch(1, nullptr, nullptr, &value);
+ }
+
+ // Close the ParquetFileWriter
+ file_writer->Close();
+
+ // Write the bytes to file
+ DCHECK(out_file->Close().ok());
+ } catch (const std::exception& e) {
+ std::cerr << "Parquet write error: " << e.what() << std::endl;
+ return -1;
+ }
+
+ /**********************************************************************************
+ PARQUET ENCRYPTION READER EXAMPLE
+ **********************************************************************************/
+
+ // Decryption configuration: Decrypt using key retriever callback that holds the keys
+ // of two encrypted columns and the footer key.
+ std::shared_ptr<parquet::StringKeyIdRetriever> string_kr1 =
+ std::make_shared<parquet::StringKeyIdRetriever>();
+ string_kr1->PutKey("kf", kFooterEncryptionKey);
+ string_kr1->PutKey("kc1", kColumnEncryptionKey1);
+ string_kr1->PutKey("kc2", kColumnEncryptionKey2);
+ std::shared_ptr<parquet::DecryptionKeyRetriever> kr1 =
+ std::static_pointer_cast<parquet::StringKeyIdRetriever>(string_kr1);
+
+ parquet::FileDecryptionProperties::Builder file_decryption_builder;
+
+ try {
+ parquet::ReaderProperties reader_properties = parquet::default_reader_properties();
+
+ // Add the current decryption configuration to ReaderProperties.
+ reader_properties.file_decryption_properties(
+ file_decryption_builder.key_retriever(kr1)->build());
+
+ // Create a ParquetReader instance
+ std::unique_ptr<parquet::ParquetFileReader> parquet_reader =
+ parquet::ParquetFileReader::OpenFile(PARQUET_FILENAME, false, reader_properties);
+
+ // Get the File MetaData
+ std::shared_ptr<parquet::FileMetaData> file_metadata = parquet_reader->metadata();
+
+ // Get the number of RowGroups
+ int num_row_groups = file_metadata->num_row_groups();
+ assert(num_row_groups == 1);
+
+ // Get the number of Columns
+ int num_columns = file_metadata->num_columns();
+ assert(num_columns == 8);
+
+ // Iterate over all the RowGroups in the file
+ for (int r = 0; r < num_row_groups; ++r) {
+ // Get the RowGroup Reader
+ std::shared_ptr<parquet::RowGroupReader> row_group_reader =
+ parquet_reader->RowGroup(r);
+
+ int64_t values_read = 0;
+ int64_t rows_read = 0;
+ int16_t definition_level;
+ int16_t repetition_level;
+ int i;
+ std::shared_ptr<parquet::ColumnReader> column_reader;
+
+ // Get the Column Reader for the boolean column
+ column_reader = row_group_reader->Column(0);
+ parquet::BoolReader* bool_reader =
+ static_cast<parquet::BoolReader*>(column_reader.get());
+
+ // Read all the rows in the column
+ i = 0;
+ while (bool_reader->HasNext()) {
+ bool value;
+ // Read one value at a time. The number of rows read is returned. values_read
+ // contains the number of non-null rows
+ rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
+ // Ensure only one value is read
+ assert(rows_read == 1);
+ // There are no NULL values in the rows written
+ assert(values_read == 1);
+ // Verify the value written
+ bool expected_value = ((i % 2) == 0) ? true : false;
+ assert(value == expected_value);
+ i++;
+ }
+
+ // Get the Column Reader for the Int32 column
+ column_reader = row_group_reader->Column(1);
+ parquet::Int32Reader* int32_reader =
+ static_cast<parquet::Int32Reader*>(column_reader.get());
+ // Read all the rows in the column
+ i = 0;
+ while (int32_reader->HasNext()) {
+ int32_t value;
+ // Read one value at a time. The number of rows read is returned. values_read
+ // contains the number of non-null rows
+ rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
+ // Ensure only one value is read
+ assert(rows_read == 1);
+ // There are no NULL values in the rows written
+ assert(values_read == 1);
+ // Verify the value written
+ assert(value == i);
+ i++;
+ }
+
+ // Get the Column Reader for the Int64 column
+ column_reader = row_group_reader->Column(2);
+ parquet::Int64Reader* int64_reader =
+ static_cast<parquet::Int64Reader*>(column_reader.get());
+ // Read all the rows in the column
+ i = 0;
+ while (int64_reader->HasNext()) {
+ int64_t value;
+ // Read one value at a time. The number of rows read is returned. values_read
+ // contains the number of non-null rows
+ rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level,
+ &value, &values_read);
+ // Ensure only one value is read
+ assert(rows_read == 1);
+ // There are no NULL values in the rows written
+ assert(values_read == 1);
+ // Verify the value written
+ int64_t expected_value = i * 1000 * 1000;
+ expected_value *= 1000 * 1000;
+ assert(value == expected_value);
+ if ((i % 2) == 0) {
+ assert(repetition_level == 1);
+ } else {
+ assert(repetition_level == 0);
+ }
+ i++;
+ }
+
+ // Get the Column Reader for the Int96 column
+ column_reader = row_group_reader->Column(3);
+ parquet::Int96Reader* int96_reader =
+ static_cast<parquet::Int96Reader*>(column_reader.get());
+ // Read all the rows in the column
+ i = 0;
+ while (int96_reader->HasNext()) {
+ parquet::Int96 value;
+ // Read one value at a time. The number of rows read is returned. values_read
+ // contains the number of non-null rows
+ rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
+ // Ensure only one value is read
+ assert(rows_read == 1);
+ // There are no NULL values in the rows written
+ assert(values_read == 1);
+ // Verify the value written
+ parquet::Int96 expected_value;
+ expected_value.value[0] = i;
+ expected_value.value[1] = i + 1;
+ expected_value.value[2] = i + 2;
+ for (int j = 0; j < 3; j++) {
+ assert(value.value[j] == expected_value.value[j]);
+ }
+ ARROW_UNUSED(expected_value); // suppress compiler warning in release builds
+ i++;
+ }
+
+ // Get the Column Reader for the Float column
+ column_reader = row_group_reader->Column(4);
+ parquet::FloatReader* float_reader =
+ static_cast<parquet::FloatReader*>(column_reader.get());
+ // Read all the rows in the column
+ i = 0;
+ while (float_reader->HasNext()) {
+ float value;
+ // Read one value at a time. The number of rows read is returned. values_read
+ // contains the number of non-null rows
+ rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
+ // Ensure only one value is read
+ assert(rows_read == 1);
+ // There are no NULL values in the rows written
+ assert(values_read == 1);
+ // Verify the value written
+ float expected_value = static_cast<float>(i) * 1.1f;
+ assert(value == expected_value);
+ i++;
+ }
+
+ // Get the Column Reader for the Double column
+ column_reader = row_group_reader->Column(5);
+ parquet::DoubleReader* double_reader =
+ static_cast<parquet::DoubleReader*>(column_reader.get());
+ // Read all the rows in the column
+ i = 0;
+ while (double_reader->HasNext()) {
+ double value;
+ // Read one value at a time. The number of rows read is returned. values_read
+ // contains the number of non-null rows
+ rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
+ // Ensure only one value is read
+ assert(rows_read == 1);
+ // There are no NULL values in the rows written
+ assert(values_read == 1);
+ // Verify the value written
+ double expected_value = i * 1.1111111;
+ assert(value == expected_value);
+ i++;
+ }
+
+ // Get the Column Reader for the ByteArray column
+ column_reader = row_group_reader->Column(6);
+ parquet::ByteArrayReader* ba_reader =
+ static_cast<parquet::ByteArrayReader*>(column_reader.get());
+ // Read all the rows in the column
+ i = 0;
+ while (ba_reader->HasNext()) {
+ parquet::ByteArray value;
+ // Read one value at a time. The number of rows read is returned. values_read
+ // contains the number of non-null rows
+ rows_read =
+ ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read);
+ // Ensure only one value is read
+ assert(rows_read == 1);
+ ARROW_UNUSED(rows_read); // suppress compiler warning in release builds
+ // Verify the value written
+ char expected_value[FIXED_LENGTH] = "parquet";
+ expected_value[7] = static_cast<char>('0' + i / 100);
+ expected_value[8] = static_cast<char>('0' + (i / 10) % 10);
+ expected_value[9] = static_cast<char>('0' + i % 10);
+ if (i % 2 == 0) { // only alternate values exist
+ // There are no NULL values in the rows written
+ assert(values_read == 1);
+ assert(value.len == FIXED_LENGTH);
+ assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0);
+ assert(definition_level == 1);
+ } else {
+ // There are NULL values in the rows written
+ assert(values_read == 0);
+ assert(definition_level == 0);
+ }
+ ARROW_UNUSED(expected_value); // suppress compiler warning in release builds
+ i++;
+ }
+
+ // Get the Column Reader for the FixedLengthByteArray column
+ column_reader = row_group_reader->Column(7);
+ parquet::FixedLenByteArrayReader* flba_reader =
+ static_cast<parquet::FixedLenByteArrayReader*>(column_reader.get());
+ // Read all the rows in the column
+ i = 0;
+ while (flba_reader->HasNext()) {
+ parquet::FixedLenByteArray value;
+ // Read one value at a time. The number of rows read is returned. values_read
+ // contains the number of non-null rows
+ rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
+ // Ensure only one value is read
+ assert(rows_read == 1);
+ // There are no NULL values in the rows written
+ assert(values_read == 1);
+ // Verify the value written
+ char v = static_cast<char>(i);
+ char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v};
+ assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0);
+ i++;
+ }
+ }
+ } catch (const std::exception& e) {
+ std::cerr << "Parquet read error: " << e.what() << std::endl;
+ }
+
+ std::cout << "Parquet Writing and Reading Complete" << std::endl;
+ return 0;
+}
diff --git a/src/arrow/cpp/examples/parquet/low_level_api/encryption_reader_writer_all_crypto_options.cc b/src/arrow/cpp/examples/parquet/low_level_api/encryption_reader_writer_all_crypto_options.cc
new file mode 100644
index 000000000..5b01e0284
--- /dev/null
+++ b/src/arrow/cpp/examples/parquet/low_level_api/encryption_reader_writer_all_crypto_options.cc
@@ -0,0 +1,656 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/io/file.h>
+#include <arrow/util/logging.h>
+#include <dirent.h>
+#include <parquet/api/reader.h>
+#include <parquet/api/writer.h>
+
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <regex>
+#include <sstream>
+
+/*
+ * This file contains samples for writing and reading encrypted Parquet files in different
+ * encryption and decryption configurations.
+ * Each sample section is dedicated to an independent configuration and shows its creation
+ * from beginning to end.
+ * The samples have the following goals:
+ * 1) Demonstrate usage of different options for data encryption and decryption.
+ * 2) Produce encrypted files for interoperability tests with other (e.g. parquet-mr)
+ * readers that support encryption.
+ * 3) Produce encrypted files with plaintext footer, for testing the ability of legacy
+ * readers to parse the footer and read unencrypted columns.
+ * 4) Perform interoperability tests with other (e.g. parquet-mr) writers, by reading
+ * encrypted files produced by these writers.
+ *
+ * Each write sample produces a new independent parquet file, encrypted with a different
+ * encryption configuration as described below.
+ * The name of each file is in the form of:
+ * tester<encryption config number>.parquet.encrypted.
+ *
+ * The read sample creates a set of decryption configurations and then uses each of them
+ * to read all encrypted files in the input directory.
+ *
+ * The different encryption and decryption configurations are listed below.
+ *
+ * Usage: ./encryption-interop-tests <write/read> <path-to-directory-of-parquet-files>
+ *
+ * A detailed description of the Parquet Modular Encryption specification can be found
+ * here:
+ * https://github.com/apache/parquet-format/blob/encryption/Encryption.md
+ *
+ * The write sample creates files with four columns in the following
+ * encryption configurations:
+ *
+ * - Encryption configuration 1: Encrypt all columns and the footer with the same key.
+ * (uniform encryption)
+ * - Encryption configuration 2: Encrypt two columns and the footer, with different
+ * keys.
+ * - Encryption configuration 3: Encrypt two columns, with different keys.
+ * Don’t encrypt footer (to enable legacy readers)
+ * - plaintext footer mode.
+ * - Encryption configuration 4: Encrypt two columns and the footer, with different
+ * keys. Supply aad_prefix for file identity
+ * verification.
+ * - Encryption configuration 5: Encrypt two columns and the footer, with different
+ * keys. Supply aad_prefix, and call
+ * disable_aad_prefix_storage to prevent file
+ * identity storage in file metadata.
+ * - Encryption configuration 6: Encrypt two columns and the footer, with different
+ * keys. Use the alternative (AES_GCM_CTR_V1) algorithm.
+ *
+ * The read sample uses each of the following decryption configurations to read every
+ * encrypted file in the input directory:
+ *
+ * - Decryption configuration 1: Decrypt using key retriever that holds the keys of
+ * two encrypted columns and the footer key.
+ * - Decryption configuration 2: Decrypt using key retriever that holds the keys of
+ * two encrypted columns and the footer key. Supplies
+ * aad_prefix to verify file identity.
+ * - Decryption configuration 3: Decrypt using explicit column and footer keys
+ * (instead of key retrieval callback).
+ */
+
+constexpr int NUM_ROWS_PER_ROW_GROUP = 500;
+
+const char* kFooterEncryptionKey = "0123456789012345"; // 128bit/16
+const char* kColumnEncryptionKey1 = "1234567890123450";
+const char* kColumnEncryptionKey2 = "1234567890123451";
+const char* fileName = "tester";
+
+using FileClass = ::arrow::io::FileOutputStream;
+using parquet::ConvertedType;
+using parquet::Repetition;
+using parquet::Type;
+using parquet::schema::GroupNode;
+using parquet::schema::PrimitiveNode;
+
+void PrintDecryptionConfiguration(int configuration);
+// Check that the decryption result is as expected.
+void CheckResult(std::string file, int example_id, std::string exception_msg);
+// Returns true if FileName ends with suffix. Otherwise returns false.
+// Used to skip unencrypted parquet files.
+bool FileNameEndsWith(std::string file_name, std::string suffix);
+
+std::vector<std::string> GetDirectoryFiles(const std::string& path) {
+ std::vector<std::string> files;
+ struct dirent* entry;
+ DIR* dir = opendir(path.c_str());
+
+ if (dir == NULL) {
+ exit(-1);
+ }
+ while ((entry = readdir(dir)) != NULL) {
+ files.push_back(std::string(entry->d_name));
+ }
+ closedir(dir);
+ return files;
+}
+
+static std::shared_ptr<GroupNode> SetupSchema() {
+ parquet::schema::NodeVector fields;
+ // Create a primitive node named 'boolean_field' with type:BOOLEAN,
+ // repetition:REQUIRED
+ fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED,
+ Type::BOOLEAN, ConvertedType::NONE));
+
+ // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED,
+ // logical type:TIME_MILLIS
+ fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32,
+ ConvertedType::TIME_MILLIS));
+
+ fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT,
+ ConvertedType::NONE));
+
+ fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, Type::DOUBLE,
+ ConvertedType::NONE));
+
+ // Create a GroupNode named 'schema' using the primitive nodes defined above
+ // This GroupNode is the root node of the schema tree
+ return std::static_pointer_cast<GroupNode>(
+ GroupNode::Make("schema", Repetition::REQUIRED, fields));
+}
+
+void InteropTestWriteEncryptedParquetFiles(std::string root_path) {
+ /**********************************************************************************
+ Creating a number of Encryption configurations
+ **********************************************************************************/
+
+ // This vector will hold various encryption configurations.
+ std::vector<std::shared_ptr<parquet::FileEncryptionProperties>>
+ vector_of_encryption_configurations;
+
+ // Encryption configuration 1: Encrypt all columns and the footer with the same key.
+ // (uniform encryption)
+ parquet::FileEncryptionProperties::Builder file_encryption_builder_1(
+ kFooterEncryptionKey);
+ // Add to list of encryption configurations.
+ vector_of_encryption_configurations.push_back(
+ file_encryption_builder_1.footer_key_metadata("kf")->build());
+
+ // Encryption configuration 2: Encrypt two columns and the footer, with different keys.
+ std::map<std::string, std::shared_ptr<parquet::ColumnEncryptionProperties>>
+ encryption_cols2;
+ std::string path1 = "double_field";
+ std::string path2 = "float_field";
+ parquet::ColumnEncryptionProperties::Builder encryption_col_builder_20(path1);
+ parquet::ColumnEncryptionProperties::Builder encryption_col_builder_21(path2);
+ encryption_col_builder_20.key(kColumnEncryptionKey1)->key_id("kc1");
+ encryption_col_builder_21.key(kColumnEncryptionKey2)->key_id("kc2");
+
+ encryption_cols2[path1] = encryption_col_builder_20.build();
+ encryption_cols2[path2] = encryption_col_builder_21.build();
+
+ parquet::FileEncryptionProperties::Builder file_encryption_builder_2(
+ kFooterEncryptionKey);
+
+ vector_of_encryption_configurations.push_back(
+ file_encryption_builder_2.footer_key_metadata("kf")
+ ->encrypted_columns(encryption_cols2)
+ ->build());
+
+ // Encryption configuration 3: Encrypt two columns, with different keys.
+ // Don’t encrypt footer.
+ // (plaintext footer mode, readable by legacy readers)
+ std::map<std::string, std::shared_ptr<parquet::ColumnEncryptionProperties>>
+ encryption_cols3;
+ parquet::ColumnEncryptionProperties::Builder encryption_col_builder_30(path1);
+ parquet::ColumnEncryptionProperties::Builder encryption_col_builder_31(path2);
+ encryption_col_builder_30.key(kColumnEncryptionKey1)->key_id("kc1");
+ encryption_col_builder_31.key(kColumnEncryptionKey2)->key_id("kc2");
+
+ encryption_cols3[path1] = encryption_col_builder_30.build();
+ encryption_cols3[path2] = encryption_col_builder_31.build();
+ parquet::FileEncryptionProperties::Builder file_encryption_builder_3(
+ kFooterEncryptionKey);
+
+ vector_of_encryption_configurations.push_back(
+ file_encryption_builder_3.footer_key_metadata("kf")
+ ->encrypted_columns(encryption_cols3)
+ ->set_plaintext_footer()
+ ->build());
+
+ // Encryption configuration 4: Encrypt two columns and the footer, with different keys.
+ // Use aad_prefix.
+ std::map<std::string, std::shared_ptr<parquet::ColumnEncryptionProperties>>
+ encryption_cols4;
+ parquet::ColumnEncryptionProperties::Builder encryption_col_builder_40(path1);
+ parquet::ColumnEncryptionProperties::Builder encryption_col_builder_41(path2);
+ encryption_col_builder_40.key(kColumnEncryptionKey1)->key_id("kc1");
+ encryption_col_builder_41.key(kColumnEncryptionKey2)->key_id("kc2");
+
+ encryption_cols4[path1] = encryption_col_builder_40.build();
+ encryption_cols4[path2] = encryption_col_builder_41.build();
+ parquet::FileEncryptionProperties::Builder file_encryption_builder_4(
+ kFooterEncryptionKey);
+
+ vector_of_encryption_configurations.push_back(
+ file_encryption_builder_4.footer_key_metadata("kf")
+ ->encrypted_columns(encryption_cols4)
+ ->aad_prefix(fileName)
+ ->build());
+
+ // Encryption configuration 5: Encrypt two columns and the footer, with different keys.
+ // Use aad_prefix and disable_aad_prefix_storage.
+ std::map<std::string, std::shared_ptr<parquet::ColumnEncryptionProperties>>
+ encryption_cols5;
+ parquet::ColumnEncryptionProperties::Builder encryption_col_builder_50(path1);
+ parquet::ColumnEncryptionProperties::Builder encryption_col_builder_51(path2);
+ encryption_col_builder_50.key(kColumnEncryptionKey1)->key_id("kc1");
+ encryption_col_builder_51.key(kColumnEncryptionKey2)->key_id("kc2");
+
+ encryption_cols5[path1] = encryption_col_builder_50.build();
+ encryption_cols5[path2] = encryption_col_builder_51.build();
+ parquet::FileEncryptionProperties::Builder file_encryption_builder_5(
+ kFooterEncryptionKey);
+
+ vector_of_encryption_configurations.push_back(
+ file_encryption_builder_5.encrypted_columns(encryption_cols5)
+ ->footer_key_metadata("kf")
+ ->aad_prefix(fileName)
+ ->disable_aad_prefix_storage()
+ ->build());
+
+ // Encryption configuration 6: Encrypt two columns and the footer, with different keys.
+ // Use AES_GCM_CTR_V1 algorithm.
+ std::map<std::string, std::shared_ptr<parquet::ColumnEncryptionProperties>>
+ encryption_cols6;
+ parquet::ColumnEncryptionProperties::Builder encryption_col_builder_60(path1);
+ parquet::ColumnEncryptionProperties::Builder encryption_col_builder_61(path2);
+ encryption_col_builder_60.key(kColumnEncryptionKey1)->key_id("kc1");
+ encryption_col_builder_61.key(kColumnEncryptionKey2)->key_id("kc2");
+
+ encryption_cols6[path1] = encryption_col_builder_60.build();
+ encryption_cols6[path2] = encryption_col_builder_61.build();
+ parquet::FileEncryptionProperties::Builder file_encryption_builder_6(
+ kFooterEncryptionKey);
+
+ vector_of_encryption_configurations.push_back(
+ file_encryption_builder_6.footer_key_metadata("kf")
+ ->encrypted_columns(encryption_cols6)
+ ->algorithm(parquet::ParquetCipher::AES_GCM_CTR_V1)
+ ->build());
+
+ /**********************************************************************************
+ PARQUET WRITER EXAMPLE
+ **********************************************************************************/
+
+ // Iterate over the encryption configurations and for each one write a parquet file.
+ for (unsigned example_id = 0; example_id < vector_of_encryption_configurations.size();
+ ++example_id) {
+ std::stringstream ss;
+ ss << example_id + 1;
+ std::string test_number_string = ss.str();
+ try {
+ // Create a local file output stream instance.
+ std::shared_ptr<FileClass> out_file;
+ std::string file =
+ root_path + fileName + std::string(test_number_string) + ".parquet.encrypted";
+ std::cout << "Write " << file << std::endl;
+ PARQUET_ASSIGN_OR_THROW(out_file, FileClass::Open(file));
+
+ // Setup the parquet schema
+ std::shared_ptr<GroupNode> schema = SetupSchema();
+
+ // Add writer properties
+ parquet::WriterProperties::Builder builder;
+ builder.compression(parquet::Compression::SNAPPY);
+
+ // Add the current encryption configuration to WriterProperties.
+ builder.encryption(vector_of_encryption_configurations[example_id]);
+
+ std::shared_ptr<parquet::WriterProperties> props = builder.build();
+
+ // Create a ParquetFileWriter instance
+ std::shared_ptr<parquet::ParquetFileWriter> file_writer =
+ parquet::ParquetFileWriter::Open(out_file, schema, props);
+
+ // Append a RowGroup with a specific number of rows.
+ parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup();
+
+ // Write the Bool column
+ parquet::BoolWriter* bool_writer =
+ static_cast<parquet::BoolWriter*>(rg_writer->NextColumn());
+ for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
+ bool value = ((i % 2) == 0) ? true : false;
+ bool_writer->WriteBatch(1, nullptr, nullptr, &value);
+ }
+
+ // Write the Int32 column
+ parquet::Int32Writer* int32_writer =
+ static_cast<parquet::Int32Writer*>(rg_writer->NextColumn());
+ for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
+ int32_t value = i;
+ int32_writer->WriteBatch(1, nullptr, nullptr, &value);
+ }
+
+ // Write the Float column
+ parquet::FloatWriter* float_writer =
+ static_cast<parquet::FloatWriter*>(rg_writer->NextColumn());
+ for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
+ float value = static_cast<float>(i) * 1.1f;
+ float_writer->WriteBatch(1, nullptr, nullptr, &value);
+ }
+
+ // Write the Double column
+ parquet::DoubleWriter* double_writer =
+ static_cast<parquet::DoubleWriter*>(rg_writer->NextColumn());
+ for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
+ double value = i * 1.1111111;
+ double_writer->WriteBatch(1, nullptr, nullptr, &value);
+ }
+ // Close the ParquetFileWriter
+ file_writer->Close();
+
+ // Write the bytes to file
+ DCHECK(out_file->Close().ok());
+ } catch (const std::exception& e) {
+ std::cerr << "Parquet write error: " << e.what() << std::endl;
+ return;
+ }
+ }
+}
+
// Read back every "*.parquet.encrypted" file found under root_path, trying each of
// several decryption configurations against every file, and verify the decrypted
// column values. Combinations that are expected to fail (AAD-prefix mismatches)
// are validated by CheckResult(), which inspects the captured exception message.
void InteropTestReadEncryptedParquetFiles(std::string root_path) {
  std::vector<std::string> files_in_directory = GetDirectoryFiles(root_path);

  /**********************************************************************************
                     Creating a number of Decryption configurations
  **********************************************************************************/

  // This vector will hold various decryption configurations.
  std::vector<std::shared_ptr<parquet::FileDecryptionProperties>>
      vector_of_decryption_configurations;

  // Decryption configuration 1: Decrypt using key retriever callback that holds the keys
  // of two encrypted columns and the footer key.
  std::shared_ptr<parquet::StringKeyIdRetriever> string_kr1 =
      std::make_shared<parquet::StringKeyIdRetriever>();
  string_kr1->PutKey("kf", kFooterEncryptionKey);
  string_kr1->PutKey("kc1", kColumnEncryptionKey1);
  string_kr1->PutKey("kc2", kColumnEncryptionKey2);
  // Upcast to the DecryptionKeyRetriever interface expected by the builder.
  std::shared_ptr<parquet::DecryptionKeyRetriever> kr1 =
      std::static_pointer_cast<parquet::StringKeyIdRetriever>(string_kr1);

  parquet::FileDecryptionProperties::Builder file_decryption_builder_1;
  vector_of_decryption_configurations.push_back(
      file_decryption_builder_1.key_retriever(kr1)->build());

  // Decryption configuration 2: Decrypt using key retriever callback that holds the keys
  // of two encrypted columns and the footer key. Supply aad_prefix.
  std::shared_ptr<parquet::StringKeyIdRetriever> string_kr2 =
      std::make_shared<parquet::StringKeyIdRetriever>();
  string_kr2->PutKey("kf", kFooterEncryptionKey);
  string_kr2->PutKey("kc1", kColumnEncryptionKey1);
  string_kr2->PutKey("kc2", kColumnEncryptionKey2);
  std::shared_ptr<parquet::DecryptionKeyRetriever> kr2 =
      std::static_pointer_cast<parquet::StringKeyIdRetriever>(string_kr2);

  parquet::FileDecryptionProperties::Builder file_decryption_builder_2;
  vector_of_decryption_configurations.push_back(
      file_decryption_builder_2.key_retriever(kr2)->aad_prefix(fileName)->build());

  // Decryption configuration 3: Decrypt using explicit column and footer keys.
  std::string path_double = "double_field";
  std::string path_float = "float_field";
  std::map<std::string, std::shared_ptr<parquet::ColumnDecryptionProperties>>
      decryption_cols;
  parquet::ColumnDecryptionProperties::Builder decryption_col_builder31(path_double);
  parquet::ColumnDecryptionProperties::Builder decryption_col_builder32(path_float);

  decryption_cols[path_double] =
      decryption_col_builder31.key(kColumnEncryptionKey1)->build();
  decryption_cols[path_float] =
      decryption_col_builder32.key(kColumnEncryptionKey2)->build();

  parquet::FileDecryptionProperties::Builder file_decryption_builder_3;
  vector_of_decryption_configurations.push_back(
      file_decryption_builder_3.footer_key(kFooterEncryptionKey)
          ->column_keys(decryption_cols)
          ->build());

  /**********************************************************************************
                                PARQUET READER EXAMPLE
  **********************************************************************************/

  // Iterate over the decryption configurations and use each one to read all the files
  // in the input directory.
  for (unsigned example_id = 0; example_id < vector_of_decryption_configurations.size();
       ++example_id) {
    PrintDecryptionConfiguration(example_id + 1);
    for (auto const& file : files_in_directory) {
      // Collected exception text; CheckResult() decides whether a failure
      // was expected for this encryption/decryption combination.
      std::string exception_msg = "";
      if (!FileNameEndsWith(file, "parquet.encrypted"))  // Skip non encrypted files
        continue;
      try {
        std::cout << "--> Read file " << file << std::endl;

        parquet::ReaderProperties reader_properties =
            parquet::default_reader_properties();

        // Add the current decryption configuration to ReaderProperties.
        // DeepClone is used so each file read gets its own copy of the
        // decryption properties.
        reader_properties.file_decryption_properties(
            vector_of_decryption_configurations[example_id]->DeepClone());

        // Create a ParquetReader instance
        std::unique_ptr<parquet::ParquetFileReader> parquet_reader =
            parquet::ParquetFileReader::OpenFile(root_path + file, false,
                                                 reader_properties);

        // Get the File MetaData
        std::shared_ptr<parquet::FileMetaData> file_metadata = parquet_reader->metadata();

        // Get the number of RowGroups
        int num_row_groups = file_metadata->num_row_groups();
        assert(num_row_groups == 1);

        // Get the number of Columns
        int num_columns = file_metadata->num_columns();
        assert(num_columns == 4);

        // Iterate over all the RowGroups in the file
        for (int r = 0; r < num_row_groups; ++r) {
          // Get the RowGroup Reader
          std::shared_ptr<parquet::RowGroupReader> row_group_reader =
              parquet_reader->RowGroup(r);

          int64_t values_read = 0;
          int64_t rows_read = 0;
          int i;
          std::shared_ptr<parquet::ColumnReader> column_reader;

          // Get the Column Reader for the boolean column
          column_reader = row_group_reader->Column(0);
          parquet::BoolReader* bool_reader =
              static_cast<parquet::BoolReader*>(column_reader.get());

          // Read all the rows in the column
          i = 0;
          while (bool_reader->HasNext()) {
            bool value;
            // Read one value at a time. The number of rows read is returned. values_read
            // contains the number of non-null rows
            rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
            // Ensure only one value is read
            assert(rows_read == 1);
            // There are no NULL values in the rows written
            assert(values_read == 1);
            // Verify the value written
            bool expected_value = ((i % 2) == 0) ? true : false;
            assert(value == expected_value);
            i++;
          }
          ARROW_UNUSED(rows_read);  // suppress compiler warning in release builds

          // Get the Column Reader for the Int32 column
          column_reader = row_group_reader->Column(1);
          parquet::Int32Reader* int32_reader =
              static_cast<parquet::Int32Reader*>(column_reader.get());
          // Read all the rows in the column
          i = 0;
          while (int32_reader->HasNext()) {
            int32_t value;
            // Read one value at a time. The number of rows read is returned. values_read
            // contains the number of non-null rows
            rows_read =
                int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
            // Ensure only one value is read
            assert(rows_read == 1);
            // There are no NULL values in the rows written
            assert(values_read == 1);
            // Verify the value written
            assert(value == i);
            i++;
          }

          // Get the Column Reader for the Float column
          column_reader = row_group_reader->Column(2);
          parquet::FloatReader* float_reader =
              static_cast<parquet::FloatReader*>(column_reader.get());
          // Read all the rows in the column
          i = 0;
          while (float_reader->HasNext()) {
            float value;
            // Read one value at a time. The number of rows read is returned. values_read
            // contains the number of non-null rows
            rows_read =
                float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
            // Ensure only one value is read
            assert(rows_read == 1);
            // There are no NULL values in the rows written
            assert(values_read == 1);
            // Verify the value written
            float expected_value = static_cast<float>(i) * 1.1f;
            assert(value == expected_value);
            i++;
          }

          // Get the Column Reader for the Double column
          column_reader = row_group_reader->Column(3);
          parquet::DoubleReader* double_reader =
              static_cast<parquet::DoubleReader*>(column_reader.get());
          // Read all the rows in the column
          i = 0;
          while (double_reader->HasNext()) {
            double value;
            // Read one value at a time. The number of rows read is returned. values_read
            // contains the number of non-null rows
            rows_read =
                double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
            // Ensure only one value is read
            assert(rows_read == 1);
            // There are no NULL values in the rows written
            assert(values_read == 1);
            // Verify the value written
            double expected_value = i * 1.1111111;
            assert(value == expected_value);
            i++;
          }
        }
      } catch (const std::exception& e) {
        // Decryption failures are expected for some combinations; record the
        // message and let CheckResult() classify it.
        exception_msg = e.what();
      }
      CheckResult(file, example_id, exception_msg);
      std::cout << "file [" << file << "] Parquet Reading Complete" << std::endl;
    }
  }
}
+
// Print a human-readable description of the decryption configuration about to
// be exercised (1-based). Terminates the process for any number outside [1, 3].
void PrintDecryptionConfiguration(int configuration) {
  std::cout << "\n\nDecryption configuration ";
  switch (configuration) {
    case 1:
      std::cout << "1: \n\nDecrypt using key retriever that holds"
                   " the keys of two encrypted columns and the footer key."
                << std::endl;
      break;
    case 2:
      std::cout << "2: \n\nDecrypt using key retriever that holds"
                   " the keys of two encrypted columns and the footer key. Pass aad_prefix."
                << std::endl;
      break;
    case 3:
      std::cout << "3: \n\nDecrypt using explicit column and footer keys." << std::endl;
      break;
    default:
      std::cout << "Unknown configuration" << std::endl;
      exit(-1);
  }
  std::cout << std::endl;
}
+
// Check that the decryption result is as expected.
//
// `file` is the name of the file read (e.g. "tester4.parquet.encrypted"); the
// digit embedded in the name identifies the encryption configuration used to
// write it. `example_id` is the zero-based index of the decryption
// configuration used to read it, and `exception_msg` is the text of the
// exception thrown during reading (empty if the read succeeded).
//
// Some combinations are expected to fail with an AAD-related exception:
//  - encryption configuration 5 disables AAD-prefix storage, so decryption
//    configurations that do not supply the prefix (1 and 3) must fail;
//  - decryption configuration 2 supplies an AAD prefix, so files written
//    without one (everything but configurations 4 and 5) must fail.
// Any other exception is reported as an error.
void CheckResult(std::string file, int example_id, std::string exception_msg) {
  // Extract the encryption configuration number from the file name. Both dots
  // are escaped so '.' only matches a literal dot (the original pattern left
  // the second dot unescaped, letting it match any character).
  std::regex r("tester([0-9]+)\\.parquet\\.encrypted");
  std::smatch m;
  std::regex_search(file, m, r);
  if (m.size() == 0) {
    std::cerr
        << "Error: Error parsing filename to extract encryption configuration number. "
        << std::endl;
    // Without a configuration number none of the expectations below can be
    // checked, so stop instead of parsing an empty match.
    return;
  }
  std::string encryption_configuration_number_str = m.str(1);
  int encryption_configuration_number =
      atoi(encryption_configuration_number_str.c_str());
  if (encryption_configuration_number < 1 || encryption_configuration_number > 6) {
    std::cerr << "Error: Unknown encryption configuration number. " << std::endl;
    return;
  }

  int decryption_configuration_number = example_id + 1;

  // Encryption_configuration number five contains aad_prefix and
  // disable_aad_prefix_storage.
  // An exception is expected to be thrown if the file is not decrypted with aad_prefix.
  if (encryption_configuration_number == 5) {
    if (decryption_configuration_number == 1 || decryption_configuration_number == 3) {
      std::size_t found = exception_msg.find("AAD");
      if (found == std::string::npos)
        std::cout << "Error: Expecting AAD related exception." << std::endl;
      return;
    }
  }
  // Decryption configuration number two contains aad_prefix. An exception is expected to
  // be thrown if the file was not encrypted with the same aad_prefix.
  if (decryption_configuration_number == 2) {
    if (encryption_configuration_number != 5 && encryption_configuration_number != 4) {
      std::size_t found = exception_msg.find("AAD");
      if (found == std::string::npos) {
        std::cout << "Error: Expecting AAD related exception." << std::endl;
      }
      return;
    }
  }
  if (!exception_msg.empty())
    std::cout << "Error: Unexpected exception was thrown." << exception_msg << std::endl;
}
+
// Return true if file_name ends with ".<suffix>" (the separating dot is
// required but is not part of `suffix`).
//
// The original implementation compared everything after the *first* dot with
// `suffix`, so a name containing an extra dot (e.g. "my.file.parquet.encrypted")
// was wrongly rejected. This version performs a true suffix comparison, which
// matches the function's name and still accepts all previously accepted names.
bool FileNameEndsWith(std::string file_name, std::string suffix) {
  // Need room for at least the '.' separator plus the suffix itself.
  if (file_name.size() <= suffix.size()) return false;
  if (file_name.compare(file_name.size() - suffix.size(), suffix.size(), suffix) != 0)
    return false;
  // Require the dot separator so "fooparquet.encrypted" is still rejected,
  // preserving the original extension-based semantics.
  return file_name[file_name.size() - suffix.size() - 1] == '.';
}
+
+int main(int argc, char** argv) {
+ enum Operation { write, read };
+ std::string root_path;
+ Operation operation = write;
+ if (argc < 3) {
+ std::cout << "Usage: encryption-reader-writer-all-crypto-options <read/write> "
+ "<Path-to-parquet-files>"
+ << std::endl;
+ exit(1);
+ }
+ root_path = argv[1];
+ if (root_path.compare("read") == 0) {
+ operation = read;
+ }
+
+ root_path = argv[2];
+ std::cout << "Root path is: " << root_path << std::endl;
+
+ if (operation == write) {
+ InteropTestWriteEncryptedParquetFiles(root_path);
+ } else {
+ InteropTestReadEncryptedParquetFiles(root_path);
+ }
+ return 0;
+}
diff --git a/src/arrow/cpp/examples/parquet/low_level_api/reader_writer.cc b/src/arrow/cpp/examples/parquet/low_level_api/reader_writer.cc
new file mode 100644
index 000000000..09af32289
--- /dev/null
+++ b/src/arrow/cpp/examples/parquet/low_level_api/reader_writer.cc
@@ -0,0 +1,413 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <reader_writer.h>
+
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <memory>
+
+/*
+ * This example describes writing and reading Parquet Files in C++ and serves as a
+ * reference to the API.
+ * The file contains all the physical data types supported by Parquet.
+ * This example uses the RowGroupWriter API that supports writing RowGroups optimized for
+ * memory consumption.
+ **/
+
+/* Parquet is a structured columnar file format
+ * Parquet File = "Parquet data" + "Parquet Metadata"
+ * "Parquet data" is simply a vector of RowGroups. Each RowGroup is a batch of rows in a
+ * columnar layout
+ * "Parquet Metadata" contains the "file schema" and attributes of the RowGroups and their
+ * Columns
+ * "file schema" is a tree where each node is either a primitive type (leaf nodes) or a
+ * complex (nested) type (internal nodes)
+ * For specific details, please refer the format here:
+ * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
+ **/
+
// Number of rows written into (and read back from) the single RowGroup.
constexpr int NUM_ROWS_PER_ROW_GROUP = 500;
// Output/input file, created in the current working directory.
const char PARQUET_FILENAME[] = "parquet_cpp_example.parquet";

// Writes a parquet file containing one RowGroup covering all physical types,
// then reads it back and asserts that every value round-trips exactly.
// Returns 0 on success, -1 on a write or read error.
int main(int argc, char** argv) {
  /**********************************************************************************
                             PARQUET WRITER EXAMPLE
  **********************************************************************************/
  // parquet::REQUIRED fields do not need definition and repetition level values
  // parquet::OPTIONAL fields require only definition level values
  // parquet::REPEATED fields require both definition and repetition level values
  try {
    // Create a local file output stream instance.
    using FileClass = ::arrow::io::FileOutputStream;
    std::shared_ptr<FileClass> out_file;
    PARQUET_ASSIGN_OR_THROW(out_file, FileClass::Open(PARQUET_FILENAME));

    // Setup the parquet schema
    std::shared_ptr<GroupNode> schema = SetupSchema();

    // Add writer properties
    parquet::WriterProperties::Builder builder;
    builder.compression(parquet::Compression::SNAPPY);
    std::shared_ptr<parquet::WriterProperties> props = builder.build();

    // Create a ParquetFileWriter instance
    std::shared_ptr<parquet::ParquetFileWriter> file_writer =
        parquet::ParquetFileWriter::Open(out_file, schema, props);

    // Append a RowGroup with a specific number of rows.
    parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup();

    // Write the Bool column: alternating true/false.
    parquet::BoolWriter* bool_writer =
        static_cast<parquet::BoolWriter*>(rg_writer->NextColumn());
    for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
      bool value = ((i % 2) == 0) ? true : false;
      bool_writer->WriteBatch(1, nullptr, nullptr, &value);
    }

    // Write the Int32 column
    parquet::Int32Writer* int32_writer =
        static_cast<parquet::Int32Writer*>(rg_writer->NextColumn());
    for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
      int32_t value = i;
      int32_writer->WriteBatch(1, nullptr, nullptr, &value);
    }

    // Write the Int64 column. Each row has repeats twice.
    // This is the only REPEATED field, so both definition and repetition
    // levels are supplied; the reader below checks level 1 on even i and
    // level 0 on odd i.
    parquet::Int64Writer* int64_writer =
        static_cast<parquet::Int64Writer*>(rg_writer->NextColumn());
    for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) {
      // value = i * 10^12, computed in two steps to stay within int64 range.
      int64_t value = i * 1000 * 1000;
      value *= 1000 * 1000;
      int16_t definition_level = 1;
      int16_t repetition_level = 0;
      if ((i % 2) == 0) {
        repetition_level = 1;  // start of a new record
      }
      int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value);
    }

    // Write the INT96 column: three consecutive 32-bit words per row (i, i+1, i+2).
    parquet::Int96Writer* int96_writer =
        static_cast<parquet::Int96Writer*>(rg_writer->NextColumn());
    for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
      parquet::Int96 value;
      value.value[0] = i;
      value.value[1] = i + 1;
      value.value[2] = i + 2;
      int96_writer->WriteBatch(1, nullptr, nullptr, &value);
    }

    // Write the Float column
    parquet::FloatWriter* float_writer =
        static_cast<parquet::FloatWriter*>(rg_writer->NextColumn());
    for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
      float value = static_cast<float>(i) * 1.1f;
      float_writer->WriteBatch(1, nullptr, nullptr, &value);
    }

    // Write the Double column
    parquet::DoubleWriter* double_writer =
        static_cast<parquet::DoubleWriter*>(rg_writer->NextColumn());
    for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
      double value = i * 1.1111111;
      double_writer->WriteBatch(1, nullptr, nullptr, &value);
    }

    // Write the ByteArray column. Make every alternate values NULL
    // (definition level 0 with no value marks a NULL for this OPTIONAL field).
    parquet::ByteArrayWriter* ba_writer =
        static_cast<parquet::ByteArrayWriter*>(rg_writer->NextColumn());
    for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
      parquet::ByteArray value;
      // "parquet" plus a three-digit row number fills the 10-byte buffer.
      char hello[FIXED_LENGTH] = "parquet";
      hello[7] = static_cast<char>(static_cast<int>('0') + i / 100);
      hello[8] = static_cast<char>(static_cast<int>('0') + (i / 10) % 10);
      hello[9] = static_cast<char>(static_cast<int>('0') + i % 10);
      if (i % 2 == 0) {
        int16_t definition_level = 1;
        value.ptr = reinterpret_cast<const uint8_t*>(&hello[0]);
        value.len = FIXED_LENGTH;
        ba_writer->WriteBatch(1, &definition_level, nullptr, &value);
      } else {
        int16_t definition_level = 0;
        ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr);
      }
    }

    // Write the FixedLengthByteArray column: ten copies of the low byte of i.
    parquet::FixedLenByteArrayWriter* flba_writer =
        static_cast<parquet::FixedLenByteArrayWriter*>(rg_writer->NextColumn());
    for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
      parquet::FixedLenByteArray value;
      char v = static_cast<char>(i);
      char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v};
      value.ptr = reinterpret_cast<const uint8_t*>(&flba[0]);

      flba_writer->WriteBatch(1, nullptr, nullptr, &value);
    }

    // Close the ParquetFileWriter
    file_writer->Close();

    // Write the bytes to file
    DCHECK(out_file->Close().ok());
  } catch (const std::exception& e) {
    std::cerr << "Parquet write error: " << e.what() << std::endl;
    return -1;
  }

  /**********************************************************************************
                             PARQUET READER EXAMPLE
  **********************************************************************************/

  try {
    // Create a ParquetReader instance
    std::unique_ptr<parquet::ParquetFileReader> parquet_reader =
        parquet::ParquetFileReader::OpenFile(PARQUET_FILENAME, false);

    // Get the File MetaData
    std::shared_ptr<parquet::FileMetaData> file_metadata = parquet_reader->metadata();

    // Get the number of RowGroups
    int num_row_groups = file_metadata->num_row_groups();
    assert(num_row_groups == 1);

    // Get the number of Columns
    int num_columns = file_metadata->num_columns();
    assert(num_columns == 8);

    // Iterate over all the RowGroups in the file
    for (int r = 0; r < num_row_groups; ++r) {
      // Get the RowGroup Reader
      std::shared_ptr<parquet::RowGroupReader> row_group_reader =
          parquet_reader->RowGroup(r);

      int64_t values_read = 0;
      int64_t rows_read = 0;
      int16_t definition_level;
      int16_t repetition_level;
      int i;
      std::shared_ptr<parquet::ColumnReader> column_reader;

      ARROW_UNUSED(rows_read);  // prevent warning in release build

      // Get the Column Reader for the boolean column
      column_reader = row_group_reader->Column(0);
      parquet::BoolReader* bool_reader =
          static_cast<parquet::BoolReader*>(column_reader.get());

      // Read all the rows in the column
      i = 0;
      while (bool_reader->HasNext()) {
        bool value;
        // Read one value at a time. The number of rows read is returned. values_read
        // contains the number of non-null rows
        rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
        // Ensure only one value is read
        assert(rows_read == 1);
        // There are no NULL values in the rows written
        assert(values_read == 1);
        // Verify the value written
        bool expected_value = ((i % 2) == 0) ? true : false;
        assert(value == expected_value);
        i++;
      }

      // Get the Column Reader for the Int32 column
      column_reader = row_group_reader->Column(1);
      parquet::Int32Reader* int32_reader =
          static_cast<parquet::Int32Reader*>(column_reader.get());
      // Read all the rows in the column
      i = 0;
      while (int32_reader->HasNext()) {
        int32_t value;
        // Read one value at a time. The number of rows read is returned. values_read
        // contains the number of non-null rows
        rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
        // Ensure only one value is read
        assert(rows_read == 1);
        // There are no NULL values in the rows written
        assert(values_read == 1);
        // Verify the value written
        assert(value == i);
        i++;
      }

      // Get the Column Reader for the Int64 column
      column_reader = row_group_reader->Column(2);
      parquet::Int64Reader* int64_reader =
          static_cast<parquet::Int64Reader*>(column_reader.get());
      // Read all the rows in the column; this REPEATED column also returns
      // definition and repetition levels for each value.
      i = 0;
      while (int64_reader->HasNext()) {
        int64_t value;
        // Read one value at a time. The number of rows read is returned. values_read
        // contains the number of non-null rows
        rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level,
                                            &value, &values_read);
        // Ensure only one value is read
        assert(rows_read == 1);
        // There are no NULL values in the rows written
        assert(values_read == 1);
        // Verify the value written (i * 10^12, computed as the writer did)
        int64_t expected_value = i * 1000 * 1000;
        expected_value *= 1000 * 1000;
        assert(value == expected_value);
        // Mirror of the writer's level pattern: 1 on even i, 0 on odd i.
        if ((i % 2) == 0) {
          assert(repetition_level == 1);
        } else {
          assert(repetition_level == 0);
        }
        i++;
      }

      // Get the Column Reader for the Int96 column
      column_reader = row_group_reader->Column(3);
      parquet::Int96Reader* int96_reader =
          static_cast<parquet::Int96Reader*>(column_reader.get());
      // Read all the rows in the column
      i = 0;
      while (int96_reader->HasNext()) {
        parquet::Int96 value;
        // Read one value at a time. The number of rows read is returned. values_read
        // contains the number of non-null rows
        rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
        // Ensure only one value is read
        assert(rows_read == 1);
        // There are no NULL values in the rows written
        assert(values_read == 1);
        // Verify the value written: the three 32-bit words are i, i+1, i+2.
        parquet::Int96 expected_value;
        ARROW_UNUSED(expected_value);  // prevent warning in release build
        expected_value.value[0] = i;
        expected_value.value[1] = i + 1;
        expected_value.value[2] = i + 2;
        for (int j = 0; j < 3; j++) {
          assert(value.value[j] == expected_value.value[j]);
        }
        i++;
      }

      // Get the Column Reader for the Float column
      column_reader = row_group_reader->Column(4);
      parquet::FloatReader* float_reader =
          static_cast<parquet::FloatReader*>(column_reader.get());
      // Read all the rows in the column
      i = 0;
      while (float_reader->HasNext()) {
        float value;
        // Read one value at a time. The number of rows read is returned. values_read
        // contains the number of non-null rows
        rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
        // Ensure only one value is read
        assert(rows_read == 1);
        // There are no NULL values in the rows written
        assert(values_read == 1);
        // Verify the value written (exact float equality is safe here: the
        // value is reproduced by the same expression the writer used)
        float expected_value = static_cast<float>(i) * 1.1f;
        assert(value == expected_value);
        i++;
      }

      // Get the Column Reader for the Double column
      column_reader = row_group_reader->Column(5);
      parquet::DoubleReader* double_reader =
          static_cast<parquet::DoubleReader*>(column_reader.get());
      // Read all the rows in the column
      i = 0;
      while (double_reader->HasNext()) {
        double value;
        // Read one value at a time. The number of rows read is returned. values_read
        // contains the number of non-null rows
        rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
        // Ensure only one value is read
        assert(rows_read == 1);
        // There are no NULL values in the rows written
        assert(values_read == 1);
        // Verify the value written
        double expected_value = i * 1.1111111;
        assert(value == expected_value);
        i++;
      }

      // Get the Column Reader for the ByteArray column
      column_reader = row_group_reader->Column(6);
      parquet::ByteArrayReader* ba_reader =
          static_cast<parquet::ByteArrayReader*>(column_reader.get());
      // Read all the rows in the column
      i = 0;
      while (ba_reader->HasNext()) {
        parquet::ByteArray value;
        // Read one value at a time. The number of rows read is returned. values_read
        // contains the number of non-null rows
        rows_read =
            ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read);
        // Ensure only one value is read
        assert(rows_read == 1);
        // Verify the value written
        char expected_value[FIXED_LENGTH] = "parquet";
        ARROW_UNUSED(expected_value);  // prevent warning in release build
        expected_value[7] = static_cast<char>('0' + i / 100);
        expected_value[8] = static_cast<char>('0' + (i / 10) % 10);
        expected_value[9] = static_cast<char>('0' + i % 10);
        if (i % 2 == 0) {  // only alternate values exist
          // There are no NULL values in the rows written
          assert(values_read == 1);
          assert(value.len == FIXED_LENGTH);
          assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0);
          assert(definition_level == 1);
        } else {
          // There are NULL values in the rows written
          assert(values_read == 0);
          assert(definition_level == 0);
        }
        i++;
      }

      // Get the Column Reader for the FixedLengthByteArray column
      column_reader = row_group_reader->Column(7);
      parquet::FixedLenByteArrayReader* flba_reader =
          static_cast<parquet::FixedLenByteArrayReader*>(column_reader.get());
      // Read all the rows in the column
      i = 0;
      while (flba_reader->HasNext()) {
        parquet::FixedLenByteArray value;
        // Read one value at a time. The number of rows read is returned. values_read
        // contains the number of non-null rows
        rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
        // Ensure only one value is read
        assert(rows_read == 1);
        // There are no NULL values in the rows written
        assert(values_read == 1);
        // Verify the value written: ten copies of the low byte of i.
        char v = static_cast<char>(i);
        char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v};
        assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0);
        i++;
      }
    }
  } catch (const std::exception& e) {
    std::cerr << "Parquet read error: " << e.what() << std::endl;
    return -1;
  }

  std::cout << "Parquet Writing and Reading Complete" << std::endl;

  return 0;
}
diff --git a/src/arrow/cpp/examples/parquet/low_level_api/reader_writer.h b/src/arrow/cpp/examples/parquet/low_level_api/reader_writer.h
new file mode 100644
index 000000000..ed8e74653
--- /dev/null
+++ b/src/arrow/cpp/examples/parquet/low_level_api/reader_writer.h
@@ -0,0 +1,70 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/io/file.h>
+#include <arrow/util/logging.h>
+#include <parquet/api/reader.h>
+#include <parquet/api/writer.h>
+
+using parquet::ConvertedType;
+using parquet::Repetition;
+using parquet::Type;
+using parquet::schema::GroupNode;
+using parquet::schema::PrimitiveNode;
+
+constexpr int FIXED_LENGTH = 10;
+
+// Builds the schema shared by the low-level reader/writer examples: one column
+// per Parquet physical type (BOOLEAN, INT32, INT64, INT96, FLOAT, DOUBLE,
+// BYTE_ARRAY, FIXED_LEN_BYTE_ARRAY), exercising REQUIRED, OPTIONAL and
+// REPEATED repetition modes.
+static std::shared_ptr<GroupNode> SetupSchema() {
+  parquet::schema::NodeVector fields;
+  // Create a primitive node named 'boolean_field' with type:BOOLEAN,
+  // repetition:REQUIRED
+  fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED,
+                                       Type::BOOLEAN, ConvertedType::NONE));
+
+  // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED,
+  // logical type:TIME_MILLIS
+  fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32,
+                                       ConvertedType::TIME_MILLIS));
+
+  // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED
+  fields.push_back(PrimitiveNode::Make("int64_field", Repetition::REPEATED, Type::INT64,
+                                       ConvertedType::NONE));
+
+  // Create a primitive node named 'int96_field' with type:INT96, repetition:REQUIRED
+  fields.push_back(PrimitiveNode::Make("int96_field", Repetition::REQUIRED, Type::INT96,
+                                       ConvertedType::NONE));
+
+  // Create a primitive node named 'float_field' with type:FLOAT, repetition:REQUIRED
+  fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT,
+                                       ConvertedType::NONE));
+
+  // Create a primitive node named 'double_field' with type:DOUBLE, repetition:REQUIRED
+  fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, Type::DOUBLE,
+                                       ConvertedType::NONE));
+
+  // Create a primitive node named 'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL
+  fields.push_back(PrimitiveNode::Make("ba_field", Repetition::OPTIONAL, Type::BYTE_ARRAY,
+                                       ConvertedType::NONE));
+
+  // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY,
+  // repetition:REQUIRED, field_length = FIXED_LENGTH
+  fields.push_back(PrimitiveNode::Make("flba_field", Repetition::REQUIRED,
+                                       Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE,
+                                       FIXED_LENGTH));
+
+  // Create a GroupNode named 'schema' using the primitive nodes defined above
+  // This GroupNode is the root node of the schema tree
+  return std::static_pointer_cast<GroupNode>(
+      GroupNode::Make("schema", Repetition::REQUIRED, fields));
+}
diff --git a/src/arrow/cpp/examples/parquet/low_level_api/reader_writer2.cc b/src/arrow/cpp/examples/parquet/low_level_api/reader_writer2.cc
new file mode 100644
index 000000000..65dd5799e
--- /dev/null
+++ b/src/arrow/cpp/examples/parquet/low_level_api/reader_writer2.cc
@@ -0,0 +1,434 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <reader_writer.h>
+
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <memory>
+
+/*
+ * This example describes writing and reading Parquet Files in C++ and serves as a
+ * reference to the API.
+ * The file contains all the physical data types supported by Parquet.
+ * This example uses the RowGroupWriter API that supports writing RowGroups based on a
+ * certain size.
+ **/
+
+/* Parquet is a structured columnar file format
+ * Parquet File = "Parquet data" + "Parquet Metadata"
+ * "Parquet data" is simply a vector of RowGroups. Each RowGroup is a batch of rows in a
+ * columnar layout
+ * "Parquet Metadata" contains the "file schema" and attributes of the RowGroups and their
+ * Columns
+ * "file schema" is a tree where each node is either a primitive type (leaf nodes) or a
+ * complex (nested) type (internal nodes)
+ * For specific details, please refer to the format here:
+ * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
+ **/
+
+constexpr int NUM_ROWS = 2500000;
+constexpr int64_t ROW_GROUP_SIZE = 16 * 1024 * 1024; // 16 MB
+const char PARQUET_FILENAME[] = "parquet_cpp_example2.parquet";
+
+// End-to-end example: writes NUM_ROWS rows of all eight columns to
+// PARQUET_FILENAME using buffered row groups capped near ROW_GROUP_SIZE bytes,
+// then reads the file back and asserts every value round-trips.
+// Returns 0 on success, -1 if either phase throws.
+int main(int argc, char** argv) {
+  /**********************************************************************************
+                             PARQUET WRITER EXAMPLE
+  **********************************************************************************/
+  // parquet::REQUIRED fields do not need definition and repetition level values
+  // parquet::OPTIONAL fields require only definition level values
+  // parquet::REPEATED fields require both definition and repetition level values
+  try {
+    // Create a local file output stream instance.
+    using FileClass = ::arrow::io::FileOutputStream;
+    std::shared_ptr<FileClass> out_file;
+    PARQUET_ASSIGN_OR_THROW(out_file, FileClass::Open(PARQUET_FILENAME));
+
+    // Setup the parquet schema
+    std::shared_ptr<GroupNode> schema = SetupSchema();
+
+    // Add writer properties
+    parquet::WriterProperties::Builder builder;
+    builder.compression(parquet::Compression::SNAPPY);
+    std::shared_ptr<parquet::WriterProperties> props = builder.build();
+
+    // Create a ParquetFileWriter instance
+    std::shared_ptr<parquet::ParquetFileWriter> file_writer =
+        parquet::ParquetFileWriter::Open(out_file, schema, props);
+
+    // Append a BufferedRowGroup to keep the RowGroup open until a certain size
+    parquet::RowGroupWriter* rg_writer = file_writer->AppendBufferedRowGroup();
+
+    int num_columns = file_writer->num_columns();
+    // Per-column estimate of bytes buffered but not yet flushed to a page.
+    std::vector<int64_t> buffered_values_estimate(num_columns, 0);
+    for (int i = 0; i < NUM_ROWS; i++) {
+      int64_t estimated_bytes = 0;
+      // Get the estimated size of the values that are not written to a page yet
+      for (int n = 0; n < num_columns; n++) {
+        estimated_bytes += buffered_values_estimate[n];
+      }
+
+      // We need to consider the compressed pages
+      // as well as the values that are not compressed yet
+      if ((rg_writer->total_bytes_written() + rg_writer->total_compressed_bytes() +
+           estimated_bytes) > ROW_GROUP_SIZE) {
+        rg_writer->Close();
+        std::fill(buffered_values_estimate.begin(), buffered_values_estimate.end(), 0);
+        rg_writer = file_writer->AppendBufferedRowGroup();
+      }
+
+      int col_id = 0;
+      // Write the Bool column
+      parquet::BoolWriter* bool_writer =
+          static_cast<parquet::BoolWriter*>(rg_writer->column(col_id));
+      bool bool_value = ((i % 2) == 0) ? true : false;
+      bool_writer->WriteBatch(1, nullptr, nullptr, &bool_value);
+      buffered_values_estimate[col_id] = bool_writer->EstimatedBufferedValueBytes();
+
+      // Write the Int32 column
+      col_id++;
+      parquet::Int32Writer* int32_writer =
+          static_cast<parquet::Int32Writer*>(rg_writer->column(col_id));
+      int32_t int32_value = i;
+      int32_writer->WriteBatch(1, nullptr, nullptr, &int32_value);
+      buffered_values_estimate[col_id] = int32_writer->EstimatedBufferedValueBytes();
+
+      // Write the Int64 column. Each record holds two values of this REPEATED
+      // field: 2*i and 2*i+1.
+      col_id++;
+      parquet::Int64Writer* int64_writer =
+          static_cast<parquet::Int64Writer*>(rg_writer->column(col_id));
+      int64_t int64_value1 = 2 * i;
+      int16_t definition_level = 1;
+      int16_t repetition_level = 0; // level 0 marks the start of a new record
+      int64_writer->WriteBatch(1, &definition_level, &repetition_level, &int64_value1);
+      int64_t int64_value2 = (2 * i + 1);
+      repetition_level = 1; // level 1: second value belongs to the same record
+      int64_writer->WriteBatch(1, &definition_level, &repetition_level, &int64_value2);
+      buffered_values_estimate[col_id] = int64_writer->EstimatedBufferedValueBytes();
+
+      // Write the INT96 column.
+      col_id++;
+      parquet::Int96Writer* int96_writer =
+          static_cast<parquet::Int96Writer*>(rg_writer->column(col_id));
+      parquet::Int96 int96_value;
+      int96_value.value[0] = i;
+      int96_value.value[1] = i + 1;
+      int96_value.value[2] = i + 2;
+      int96_writer->WriteBatch(1, nullptr, nullptr, &int96_value);
+      buffered_values_estimate[col_id] = int96_writer->EstimatedBufferedValueBytes();
+
+      // Write the Float column
+      col_id++;
+      parquet::FloatWriter* float_writer =
+          static_cast<parquet::FloatWriter*>(rg_writer->column(col_id));
+      float float_value = static_cast<float>(i) * 1.1f;
+      float_writer->WriteBatch(1, nullptr, nullptr, &float_value);
+      buffered_values_estimate[col_id] = float_writer->EstimatedBufferedValueBytes();
+
+      // Write the Double column
+      col_id++;
+      parquet::DoubleWriter* double_writer =
+          static_cast<parquet::DoubleWriter*>(rg_writer->column(col_id));
+      double double_value = i * 1.1111111;
+      double_writer->WriteBatch(1, nullptr, nullptr, &double_value);
+      buffered_values_estimate[col_id] = double_writer->EstimatedBufferedValueBytes();
+
+      // Write the ByteArray column. Make every alternate value NULL
+      col_id++;
+      parquet::ByteArrayWriter* ba_writer =
+          static_cast<parquet::ByteArrayWriter*>(rg_writer->column(col_id));
+      parquet::ByteArray ba_value;
+      // "parquet" followed by the last three decimal digits of the row number.
+      char hello[FIXED_LENGTH] = "parquet";
+      hello[7] = static_cast<char>(static_cast<int>('0') + i / 100);
+      hello[8] = static_cast<char>(static_cast<int>('0') + (i / 10) % 10);
+      hello[9] = static_cast<char>(static_cast<int>('0') + i % 10);
+      if (i % 2 == 0) {
+        int16_t definition_level = 1;
+        ba_value.ptr = reinterpret_cast<const uint8_t*>(&hello[0]);
+        ba_value.len = FIXED_LENGTH;
+        ba_writer->WriteBatch(1, &definition_level, nullptr, &ba_value);
+      } else {
+        int16_t definition_level = 0;
+        ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr);
+      }
+      buffered_values_estimate[col_id] = ba_writer->EstimatedBufferedValueBytes();
+
+      // Write the FixedLengthByteArray column
+      col_id++;
+      parquet::FixedLenByteArrayWriter* flba_writer =
+          static_cast<parquet::FixedLenByteArrayWriter*>(rg_writer->column(col_id));
+      parquet::FixedLenByteArray flba_value;
+      char v = static_cast<char>(i);
+      char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v};
+      flba_value.ptr = reinterpret_cast<const uint8_t*>(&flba[0]);
+
+      flba_writer->WriteBatch(1, nullptr, nullptr, &flba_value);
+      buffered_values_estimate[col_id] = flba_writer->EstimatedBufferedValueBytes();
+    }
+
+    // Close the RowGroupWriter
+    rg_writer->Close();
+    // Close the ParquetFileWriter
+    file_writer->Close();
+
+    // Write the bytes to file
+    DCHECK(out_file->Close().ok());
+  } catch (const std::exception& e) {
+    std::cerr << "Parquet write error: " << e.what() << std::endl;
+    return -1;
+  }
+
+  /**********************************************************************************
+                             PARQUET READER EXAMPLE
+  **********************************************************************************/
+
+  try {
+    // Create a ParquetReader instance
+    std::unique_ptr<parquet::ParquetFileReader> parquet_reader =
+        parquet::ParquetFileReader::OpenFile(PARQUET_FILENAME, false);
+
+    // Get the File MetaData
+    std::shared_ptr<parquet::FileMetaData> file_metadata = parquet_reader->metadata();
+
+    int num_row_groups = file_metadata->num_row_groups();
+
+    // Get the number of Columns
+    int num_columns = file_metadata->num_columns();
+    assert(num_columns == 8);
+
+    // Per-column count of rows read so far across all row groups; used to
+    // reconstruct the expected value for each row during verification.
+    std::vector<int> col_row_counts(num_columns, 0);
+
+    // Iterate over all the RowGroups in the file
+    for (int r = 0; r < num_row_groups; ++r) {
+      // Get the RowGroup Reader
+      std::shared_ptr<parquet::RowGroupReader> row_group_reader =
+          parquet_reader->RowGroup(r);
+
+      assert(row_group_reader->metadata()->total_byte_size() < ROW_GROUP_SIZE);
+
+      int64_t values_read = 0;
+      int64_t rows_read = 0;
+      int16_t definition_level;
+      int16_t repetition_level;
+      std::shared_ptr<parquet::ColumnReader> column_reader;
+      int col_id = 0;
+
+      ARROW_UNUSED(rows_read); // prevent warning in release build
+
+      // Get the Column Reader for the boolean column
+      column_reader = row_group_reader->Column(col_id);
+      parquet::BoolReader* bool_reader =
+          static_cast<parquet::BoolReader*>(column_reader.get());
+
+      // Read all the rows in the column
+      while (bool_reader->HasNext()) {
+        bool value;
+        // Read one value at a time. The number of rows read is returned. values_read
+        // contains the number of non-null rows
+        rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
+        // Ensure only one value is read
+        assert(rows_read == 1);
+        // There are no NULL values in the rows written
+        assert(values_read == 1);
+        // Verify the value written
+        bool expected_value = ((col_row_counts[col_id] % 2) == 0) ? true : false;
+        assert(value == expected_value);
+        col_row_counts[col_id]++;
+      }
+
+      // Get the Column Reader for the Int32 column
+      col_id++;
+      column_reader = row_group_reader->Column(col_id);
+      parquet::Int32Reader* int32_reader =
+          static_cast<parquet::Int32Reader*>(column_reader.get());
+      // Read all the rows in the column
+      while (int32_reader->HasNext()) {
+        int32_t value;
+        // Read one value at a time. The number of rows read is returned. values_read
+        // contains the number of non-null rows
+        rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
+        // Ensure only one value is read
+        assert(rows_read == 1);
+        // There are no NULL values in the rows written
+        assert(values_read == 1);
+        // Verify the value written
+        assert(value == col_row_counts[col_id]);
+        col_row_counts[col_id]++;
+      }
+
+      // Get the Column Reader for the Int64 column
+      col_id++;
+      column_reader = row_group_reader->Column(col_id);
+      parquet::Int64Reader* int64_reader =
+          static_cast<parquet::Int64Reader*>(column_reader.get());
+      // Read all the rows in the column
+      while (int64_reader->HasNext()) {
+        int64_t value;
+        // Read one value at a time. The number of rows read is returned. values_read
+        // contains the number of non-null rows
+        rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level,
+                                            &value, &values_read);
+        // Ensure only one value is read
+        assert(rows_read == 1);
+        // There are no NULL values in the rows written
+        assert(values_read == 1);
+        // Verify the value written
+        int64_t expected_value = col_row_counts[col_id];
+        assert(value == expected_value);
+        // Even-indexed values open a new record (repetition level 0);
+        // odd-indexed values are the second entry of the same record (level 1).
+        if ((col_row_counts[col_id] % 2) == 0) {
+          assert(repetition_level == 0);
+        } else {
+          assert(repetition_level == 1);
+        }
+        col_row_counts[col_id]++;
+      }
+
+      // Get the Column Reader for the Int96 column
+      col_id++;
+      column_reader = row_group_reader->Column(col_id);
+      parquet::Int96Reader* int96_reader =
+          static_cast<parquet::Int96Reader*>(column_reader.get());
+      // Read all the rows in the column
+      while (int96_reader->HasNext()) {
+        parquet::Int96 value;
+        // Read one value at a time. The number of rows read is returned. values_read
+        // contains the number of non-null rows
+        rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
+        // Ensure only one value is read
+        assert(rows_read == 1);
+        // There are no NULL values in the rows written
+        assert(values_read == 1);
+        // Verify the value written
+        parquet::Int96 expected_value;
+        ARROW_UNUSED(expected_value); // prevent warning in release build
+        expected_value.value[0] = col_row_counts[col_id];
+        expected_value.value[1] = col_row_counts[col_id] + 1;
+        expected_value.value[2] = col_row_counts[col_id] + 2;
+        for (int j = 0; j < 3; j++) {
+          assert(value.value[j] == expected_value.value[j]);
+        }
+        col_row_counts[col_id]++;
+      }
+
+      // Get the Column Reader for the Float column
+      col_id++;
+      column_reader = row_group_reader->Column(col_id);
+      parquet::FloatReader* float_reader =
+          static_cast<parquet::FloatReader*>(column_reader.get());
+      // Read all the rows in the column
+      while (float_reader->HasNext()) {
+        float value;
+        // Read one value at a time. The number of rows read is returned. values_read
+        // contains the number of non-null rows
+        rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
+        // Ensure only one value is read
+        assert(rows_read == 1);
+        // There are no NULL values in the rows written
+        assert(values_read == 1);
+        // Verify the value written
+        float expected_value = static_cast<float>(col_row_counts[col_id]) * 1.1f;
+        assert(value == expected_value);
+        col_row_counts[col_id]++;
+      }
+
+      // Get the Column Reader for the Double column
+      col_id++;
+      column_reader = row_group_reader->Column(col_id);
+      parquet::DoubleReader* double_reader =
+          static_cast<parquet::DoubleReader*>(column_reader.get());
+      // Read all the rows in the column
+      while (double_reader->HasNext()) {
+        double value;
+        // Read one value at a time. The number of rows read is returned. values_read
+        // contains the number of non-null rows
+        rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
+        // Ensure only one value is read
+        assert(rows_read == 1);
+        // There are no NULL values in the rows written
+        assert(values_read == 1);
+        // Verify the value written
+        double expected_value = col_row_counts[col_id] * 1.1111111;
+        assert(value == expected_value);
+        col_row_counts[col_id]++;
+      }
+
+      // Get the Column Reader for the ByteArray column
+      col_id++;
+      column_reader = row_group_reader->Column(col_id);
+      parquet::ByteArrayReader* ba_reader =
+          static_cast<parquet::ByteArrayReader*>(column_reader.get());
+      // Read all the rows in the column
+      while (ba_reader->HasNext()) {
+        parquet::ByteArray value;
+        // Read one value at a time. The number of rows read is returned. values_read
+        // contains the number of non-null rows
+        rows_read =
+            ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read);
+        // Ensure only one value is read
+        assert(rows_read == 1);
+        // Verify the value written
+        char expected_value[FIXED_LENGTH] = "parquet";
+        ARROW_UNUSED(expected_value); // prevent warning in release build
+        expected_value[7] = static_cast<char>('0' + col_row_counts[col_id] / 100);
+        expected_value[8] = static_cast<char>('0' + (col_row_counts[col_id] / 10) % 10);
+        expected_value[9] = static_cast<char>('0' + col_row_counts[col_id] % 10);
+        if (col_row_counts[col_id] % 2 == 0) { // only alternate values exist
+          // There are no NULL values in the rows written
+          assert(values_read == 1);
+          assert(value.len == FIXED_LENGTH);
+          assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0);
+          assert(definition_level == 1);
+        } else {
+          // There are NULL values in the rows written
+          assert(values_read == 0);
+          assert(definition_level == 0);
+        }
+        col_row_counts[col_id]++;
+      }
+
+      // Get the Column Reader for the FixedLengthByteArray column
+      col_id++;
+      column_reader = row_group_reader->Column(col_id);
+      parquet::FixedLenByteArrayReader* flba_reader =
+          static_cast<parquet::FixedLenByteArrayReader*>(column_reader.get());
+      // Read all the rows in the column
+      while (flba_reader->HasNext()) {
+        parquet::FixedLenByteArray value;
+        // Read one value at a time. The number of rows read is returned. values_read
+        // contains the number of non-null rows
+        rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
+        // Ensure only one value is read
+        assert(rows_read == 1);
+        // There are no NULL values in the rows written
+        assert(values_read == 1);
+        // Verify the value written
+        char v = static_cast<char>(col_row_counts[col_id]);
+        char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v};
+        assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0);
+        col_row_counts[col_id]++;
+      }
+    }
+  } catch (const std::exception& e) {
+    std::cerr << "Parquet read error: " << e.what() << std::endl;
+    return -1;
+  }
+
+  std::cout << "Parquet Writing and Reading Complete" << std::endl;
+
+  return 0;
+}
diff --git a/src/arrow/cpp/examples/parquet/parquet_arrow/CMakeLists.txt b/src/arrow/cpp/examples/parquet/parquet_arrow/CMakeLists.txt
new file mode 100644
index 000000000..43eb21957
--- /dev/null
+++ b/src/arrow/cpp/examples/parquet/parquet_arrow/CMakeLists.txt
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Require cmake that supports BYPRODUCTS in add_custom_command, ExternalProject_Add [1].
+cmake_minimum_required(VERSION 3.2.0)
+
+project(parquet_arrow_example)
+
+include(ExternalProject)
+include(FindPkgConfig)
+include(GNUInstallDirs)
+
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake_modules")
+
+# This ensures that things like gnu++11 get passed correctly
+set(CMAKE_CXX_STANDARD 11)
+
+# We require a C++11 compliant compiler
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Look for installed packages in the system
+find_package(Arrow REQUIRED)
+find_package(Parquet REQUIRED)
+
+include_directories(SYSTEM ${ARROW_INCLUDE_DIR} ${PARQUET_INCLUDE_DIR})
+
+add_executable(parquet_arrow_example reader_writer.cc)
+target_link_libraries(parquet_arrow_example ${PARQUET_SHARED_LIB} ${ARROW_SHARED_LIB})
diff --git a/src/arrow/cpp/examples/parquet/parquet_arrow/README.md b/src/arrow/cpp/examples/parquet/parquet_arrow/README.md
new file mode 100644
index 000000000..e99819fd2
--- /dev/null
+++ b/src/arrow/cpp/examples/parquet/parquet_arrow/README.md
@@ -0,0 +1,20 @@
+<!---
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. See accompanying LICENSE file.
+-->
+
+Using parquet-cpp with the arrow interface
+==========================================
+
+This folder contains an example project that shows how to set up a CMake project
+that consumes `parquet-cpp` as a library as well as how you can use the
+`parquet/arrow` interface to read and write Apache Parquet files.
diff --git a/src/arrow/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/src/arrow/cpp/examples/parquet/parquet_arrow/reader_writer.cc
new file mode 100644
index 000000000..f5d96ec16
--- /dev/null
+++ b/src/arrow/cpp/examples/parquet/parquet_arrow/reader_writer.cc
@@ -0,0 +1,140 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/api.h>
+#include <arrow/io/api.h>
+#include <parquet/arrow/reader.h>
+#include <parquet/arrow/writer.h>
+#include <parquet/exception.h>
+
+#include <iostream>
+
+// #0 Build dummy data to pass around
+// To have some input data, we first create an Arrow Table that holds
+// some data: five rows with an int64 column and a utf8 string column.
+std::shared_ptr<arrow::Table> generate_table() {
+  arrow::Int64Builder i64builder;
+  // PARQUET_THROW_NOT_OK converts a failed arrow::Status into a
+  // parquet::ParquetException.
+  PARQUET_THROW_NOT_OK(i64builder.AppendValues({1, 2, 3, 4, 5}));
+  std::shared_ptr<arrow::Array> i64array;
+  PARQUET_THROW_NOT_OK(i64builder.Finish(&i64array));
+
+  arrow::StringBuilder strbuilder;
+  // One string cell per row, matching the five rows of the int64 column.
+  PARQUET_THROW_NOT_OK(strbuilder.Append("some"));
+  PARQUET_THROW_NOT_OK(strbuilder.Append("string"));
+  PARQUET_THROW_NOT_OK(strbuilder.Append("content"));
+  PARQUET_THROW_NOT_OK(strbuilder.Append("in"));
+  PARQUET_THROW_NOT_OK(strbuilder.Append("rows"));
+  std::shared_ptr<arrow::Array> strarray;
+  PARQUET_THROW_NOT_OK(strbuilder.Finish(&strarray));
+
+  // The schema names the two columns "int" and "str".
+  std::shared_ptr<arrow::Schema> schema = arrow::schema(
+      {arrow::field("int", arrow::int64()), arrow::field("str", arrow::utf8())});
+
+  return arrow::Table::Make(schema, {i64array, strarray});
+}
+
+// #1 Write out the data as a Parquet file
+void write_parquet_file(const arrow::Table& table) {
+  std::shared_ptr<arrow::io::FileOutputStream> outfile;
+  // PARQUET_ASSIGN_OR_THROW raises on I/O errors (e.g. an unwritable path).
+  PARQUET_ASSIGN_OR_THROW(
+      outfile, arrow::io::FileOutputStream::Open("parquet-arrow-example.parquet"));
+  // The last argument to the function call is the size of the RowGroup in
+  // the parquet file. Normally you would choose this to be rather large but
+  // for the example, we use a small value to have multiple RowGroups.
+  PARQUET_THROW_NOT_OK(
+      parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3));
+}
+
+// #2: Fully read in the file
+void read_whole_file() {
+  std::cout << "Reading parquet-arrow-example.parquet at once" << std::endl;
+  std::shared_ptr<arrow::io::ReadableFile> infile;
+  PARQUET_ASSIGN_OR_THROW(infile,
+                          arrow::io::ReadableFile::Open("parquet-arrow-example.parquet",
+                                                        arrow::default_memory_pool()));
+
+  std::unique_ptr<parquet::arrow::FileReader> reader;
+  PARQUET_THROW_NOT_OK(
+      parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+  std::shared_ptr<arrow::Table> table;
+  // ReadTable materializes every row group and column into a single Table.
+  PARQUET_THROW_NOT_OK(reader->ReadTable(&table));
+  std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns()
+            << " columns." << std::endl;
+}
+
+// #3: Read only a single RowGroup of the parquet file
+void read_single_rowgroup() {
+  std::cout << "Reading first RowGroup of parquet-arrow-example.parquet" << std::endl;
+  std::shared_ptr<arrow::io::ReadableFile> infile;
+  PARQUET_ASSIGN_OR_THROW(infile,
+                          arrow::io::ReadableFile::Open("parquet-arrow-example.parquet",
+                                                        arrow::default_memory_pool()));
+
+  std::unique_ptr<parquet::arrow::FileReader> reader;
+  PARQUET_THROW_NOT_OK(
+      parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+  std::shared_ptr<arrow::Table> table;
+  // RowGroup(0) restricts the read to the first row group only.
+  PARQUET_THROW_NOT_OK(reader->RowGroup(0)->ReadTable(&table));
+  std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns()
+            << " columns." << std::endl;
+}
+
+// #4: Read only a single column of the whole parquet file
+void read_single_column() {
+  std::cout << "Reading first column of parquet-arrow-example.parquet" << std::endl;
+  std::shared_ptr<arrow::io::ReadableFile> infile;
+  PARQUET_ASSIGN_OR_THROW(infile,
+                          arrow::io::ReadableFile::Open("parquet-arrow-example.parquet",
+                                                        arrow::default_memory_pool()));
+
+  std::unique_ptr<parquet::arrow::FileReader> reader;
+  PARQUET_THROW_NOT_OK(
+      parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+  std::shared_ptr<arrow::ChunkedArray> array;
+  // ReadColumn(0) reads the first column across all row groups as a
+  // ChunkedArray (one chunk per row group).
+  PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array));
+  PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout));
+  std::cout << std::endl;
+}
+
+// #5: Read only a single column of a RowGroup (this is known as ColumnChunk)
+// from the Parquet file.
+void read_single_column_chunk() {
+  std::cout << "Reading first ColumnChunk of the first RowGroup of "
+               "parquet-arrow-example.parquet"
+            << std::endl;
+  std::shared_ptr<arrow::io::ReadableFile> infile;
+  PARQUET_ASSIGN_OR_THROW(infile,
+                          arrow::io::ReadableFile::Open("parquet-arrow-example.parquet",
+                                                        arrow::default_memory_pool()));
+
+  std::unique_ptr<parquet::arrow::FileReader> reader;
+  PARQUET_THROW_NOT_OK(
+      parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+  std::shared_ptr<arrow::ChunkedArray> array;
+  // RowGroup(0)->Column(0) reads exactly one ColumnChunk.
+  PARQUET_THROW_NOT_OK(reader->RowGroup(0)->Column(0)->Read(&array));
+  PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout));
+  std::cout << std::endl;
+}
+
+// Driver: build a table, write it to Parquet, then demonstrate the four
+// read patterns (#2-#5). Command-line arguments are unused.
+int main(int argc, char** argv) {
+  std::shared_ptr<arrow::Table> table = generate_table();
+  write_parquet_file(*table);
+  read_whole_file();
+  read_single_rowgroup();
+  read_single_column();
+  read_single_column_chunk();
+}
diff --git a/src/arrow/cpp/examples/parquet/parquet_stream_api/stream_reader_writer.cc b/src/arrow/cpp/examples/parquet/parquet_stream_api/stream_reader_writer.cc
new file mode 100644
index 000000000..64ab7af49
--- /dev/null
+++ b/src/arrow/cpp/examples/parquet/parquet_stream_api/stream_reader_writer.cc
@@ -0,0 +1,324 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cassert>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <ctime>
+#include <exception>
+#include <iomanip>
+#include <iostream>
+#include <utility>
+
+#include "arrow/io/file.h"
+#include "parquet/exception.h"
+#include "parquet/stream_reader.h"
+#include "parquet/stream_writer.h"
+
+// This file gives an example of how to use the parquet::StreamWriter
+// and parquet::StreamReader classes.
+// It shows writing/reading of the supported types as well as how a
+// user-defined type can be handled.
+
+// Shorthand for the StreamReader's nullable-value wrapper; used for columns
+// declared with OPTIONAL repetition in the schema below.
+template <typename T>
+using optional = parquet::StreamReader::optional<T>;
+
+// Example of a user-defined type to be written to/read from Parquet
+// using C++ input/output operators.
+class UserTimestamp {
+ public:
+  UserTimestamp() = default;
+
+  explicit UserTimestamp(const std::chrono::microseconds v) : ts_{v} {}
+
+  bool operator==(const UserTimestamp& other) const { return ts_ == other.ts_; }
+
+  // Render as a human-readable UTC timestamp (e.g. "20240101-120000").
+  // NOTE: std::gmtime returns a pointer to shared static storage and is not
+  // thread-safe; acceptable for this single-threaded example.
+  void dump(std::ostream& os) const {
+    const auto seconds = static_cast<std::time_t>(
+        std::chrono::duration_cast<std::chrono::seconds>(ts_).count());
+    os << std::put_time(std::gmtime(&seconds), "%Y%m%d-%H%M%S");
+  }
+
+  // Write the underlying microsecond count straight to the Parquet column.
+  void dump(parquet::StreamWriter& os) const { os << ts_; }
+
+ private:
+  std::chrono::microseconds ts_;
+};
+
+std::ostream& operator<<(std::ostream& os, const UserTimestamp& v) {
+  // Delegate to the member formatter (human-readable form).
+  v.dump(os);
+  return os;
+}
+
+parquet::StreamWriter& operator<<(parquet::StreamWriter& os, const UserTimestamp& v) {
+  // Delegate to the member writer (raw microsecond form).
+  v.dump(os);
+  return os;
+}
+
+parquet::StreamReader& operator>>(parquet::StreamReader& is, UserTimestamp& v) {
+  // Read back the microsecond count and reconstruct the user-defined type.
+  std::chrono::microseconds raw;
+  is >> raw;
+  v = UserTimestamp{raw};
+  return is;
+}
+
+// Builds the flat schema shared by the writer and reader below.
+std::shared_ptr<parquet::schema::GroupNode> GetSchema() {
+  using parquet::schema::PrimitiveNode;
+
+  parquet::schema::NodeVector fields;
+
+  // Helper to cut down repetition: every field is a primitive node appended
+  // in order.  `length` only matters for FIXED_LEN_BYTE_ARRAY; -1 is the
+  // same default PrimitiveNode::Make uses.
+  const auto add_field = [&fields](const std::string& name,
+                                   parquet::Repetition::type repetition,
+                                   parquet::Type::type physical_type,
+                                   parquet::ConvertedType::type converted_type,
+                                   int length = -1) {
+    fields.push_back(
+        PrimitiveNode::Make(name, repetition, physical_type, converted_type, length));
+  };
+
+  add_field("string_field", parquet::Repetition::OPTIONAL, parquet::Type::BYTE_ARRAY,
+            parquet::ConvertedType::UTF8);
+  add_field("char_field", parquet::Repetition::REQUIRED,
+            parquet::Type::FIXED_LEN_BYTE_ARRAY, parquet::ConvertedType::NONE, 1);
+  add_field("char[4]_field", parquet::Repetition::REQUIRED,
+            parquet::Type::FIXED_LEN_BYTE_ARRAY, parquet::ConvertedType::NONE, 4);
+  add_field("int8_field", parquet::Repetition::REQUIRED, parquet::Type::INT32,
+            parquet::ConvertedType::INT_8);
+  add_field("uint16_field", parquet::Repetition::REQUIRED, parquet::Type::INT32,
+            parquet::ConvertedType::UINT_16);
+  add_field("int32_field", parquet::Repetition::REQUIRED, parquet::Type::INT32,
+            parquet::ConvertedType::INT_32);
+  add_field("uint64_field", parquet::Repetition::OPTIONAL, parquet::Type::INT64,
+            parquet::ConvertedType::UINT_64);
+  add_field("double_field", parquet::Repetition::REQUIRED, parquet::Type::DOUBLE,
+            parquet::ConvertedType::NONE);
+  // User defined timestamp type.
+  add_field("timestamp_field", parquet::Repetition::REQUIRED, parquet::Type::INT64,
+            parquet::ConvertedType::TIMESTAMP_MICROS);
+  add_field("chrono_milliseconds_field", parquet::Repetition::REQUIRED,
+            parquet::Type::INT64, parquet::ConvertedType::TIMESTAMP_MILLIS);
+
+  return std::static_pointer_cast<parquet::schema::GroupNode>(
+      parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields));
+}
+
+// Deterministic per-row value generators.  WriteParquetFile() uses them to
+// produce each column and ReadParquetFile() re-invokes the same generator
+// to compute the expected value for its assertions, so every function here
+// must be a pure function of the row index i (plus ts_offset_).
+struct TestData {
+  // Rows written by the writer and expected back by the reader.
+  static const int num_rows = 2000;
+
+  // Seeds ts_offset_ with the current wall-clock time.  NOTE(review): no
+  // visible caller in this file; if never called, ts_offset_ stays
+  // zero-initialized and timestamps are epoch-relative — confirm intent.
+  static void init() { std::time(&ts_offset_); }
+
+  // Optional string: absent on even rows.
+  static optional<std::string> GetOptString(const int i) {
+    if (i % 2 == 0) return {};
+    return "Str #" + std::to_string(i);
+  }
+  // Returns a view into a function-local static buffer: valid only until
+  // the next call, and not thread-safe (fine for this example).
+  static arrow::util::string_view GetStringView(const int i) {
+    static std::string string;
+    string = "StringView #" + std::to_string(i);
+    return arrow::util::string_view(string);
+  }
+  // Same static-buffer lifetime caveat as GetStringView().
+  static const char* GetCharPtr(const int i) {
+    static std::string string;
+    string = "CharPtr #" + std::to_string(i);
+    return string.c_str();
+  }
+  static char GetChar(const int i) { return i & 1 ? 'M' : 'F'; }
+  // Maps i onto [-128, 127], exercising the full int8 range.
+  static int8_t GetInt8(const int i) { return static_cast<int8_t>((i % 256) - 128); }
+  static uint16_t GetUInt16(const int i) { return static_cast<uint16_t>(i); }
+  static int32_t GetInt32(const int i) { return 3 * i - 17; }
+  // Optional uint64: absent on every 11th row; values exceed 2^40 so the
+  // full 64-bit column width is exercised.
+  static optional<uint64_t> GetOptUInt64(const int i) {
+    if (i % 11 == 0) return {};
+    return (1ull << 40) + i * i + 101;
+  }
+  static double GetDouble(const int i) { return 6.62607004e-34 * 3e8 * i; }
+  static UserTimestamp GetUserTimestamp(const int i) {
+    return UserTimestamp{std::chrono::microseconds{(ts_offset_ + 3 * i) * 1000000 + i}};
+  }
+  static std::chrono::milliseconds GetChronoMilliseconds(const int i) {
+    return std::chrono::milliseconds{(ts_offset_ + 3 * i) * 1000ull + i};
+  }
+
+  // 4-byte payload for the FIXED_LEN_BYTE_ARRAY[4] column (defined below).
+  static char char4_array[4];
+
+ private:
+  static std::time_t ts_offset_;
+};
+
+// Out-of-class definitions for TestData's statics.  "XYZ" fills all four
+// bytes of char4_array (3 characters + the NUL terminator), matching the
+// schema's 4-byte FIXED_LEN_BYTE_ARRAY field.  ts_offset_ is
+// zero-initialized until TestData::init() is called.
+char TestData::char4_array[] = "XYZ";
+std::time_t TestData::ts_offset_;
+
+// Writes num_rows rows of generated test data with the StreamWriter API.
+void WriteParquetFile() {
+  // Create the output file and pick a compression codec when this Arrow
+  // build provides one.
+  std::shared_ptr<arrow::io::FileOutputStream> outfile;
+  PARQUET_ASSIGN_OR_THROW(
+      outfile, arrow::io::FileOutputStream::Open("parquet-stream-api-example.parquet"));
+
+  parquet::WriterProperties::Builder builder;
+
+#if defined ARROW_WITH_BROTLI
+  builder.compression(parquet::Compression::BROTLI);
+#elif defined ARROW_WITH_ZSTD
+  builder.compression(parquet::Compression::ZSTD);
+#endif
+
+  parquet::StreamWriter writer{
+      parquet::ParquetFileWriter::Open(outfile, GetSchema(), builder.build())};
+
+  writer.SetMaxRowGroupSize(1000);
+
+  for (auto row = 0; row < TestData::num_rows; ++row) {
+    // Exercise the three string-ish input types in rotation:
+    // std::string (optional), arrow::util::string_view and const char*.
+    const int string_kind = row % 3;
+    if (string_kind == 0) {
+      writer << TestData::GetOptString(row);
+    } else if (string_kind == 1) {
+      writer << TestData::GetStringView(row);
+    } else {
+      writer << TestData::GetCharPtr(row);
+    }
+
+    writer << TestData::GetChar(row);
+
+    // Alternate between a fixed-size char array and FixedStringView for the
+    // 4-byte FIXED_LEN_BYTE_ARRAY column.
+    if (row % 2 == 0) {
+      writer << TestData::char4_array;
+    } else {
+      writer << parquet::StreamWriter::FixedStringView{TestData::GetCharPtr(row), 4};
+    }
+
+    writer << TestData::GetInt8(row) << TestData::GetUInt16(row)
+           << TestData::GetInt32(row) << TestData::GetOptUInt64(row)
+           << TestData::GetDouble(row) << TestData::GetUserTimestamp(row)
+           << TestData::GetChronoMilliseconds(row) << parquet::EndRow;
+
+    // Force an early row-group boundary halfway through the data.
+    if (row == TestData::num_rows / 2) {
+      writer << parquet::EndRowGroup;
+    }
+  }
+  std::cout << "Parquet Stream Writing complete." << std::endl;
+}
+
+// Reads parquet-stream-api-example.parquet row by row with the StreamReader
+// API and validates every column against the deterministic TestData
+// generators used by the writer.
+void ReadParquetFile() {
+  std::shared_ptr<arrow::io::ReadableFile> infile;
+
+  PARQUET_ASSIGN_OR_THROW(
+      infile, arrow::io::ReadableFile::Open("parquet-stream-api-example.parquet"));
+
+  parquet::StreamReader os{parquet::ParquetFileReader::Open(infile)};
+
+  // Flip to true to dump each row while validating.
+  constexpr bool kPrintRows = false;
+
+  optional<std::string> opt_string;
+  char ch;
+  char char_array[4];
+  int8_t int8;
+  uint16_t uint16;
+  int32_t int32;
+  optional<uint64_t> opt_uint64;
+  double d;
+  UserTimestamp ts_user;
+  std::chrono::milliseconds ts_ms;
+  int i;
+
+  for (i = 0; !os.eof(); ++i) {
+    // Columns must be extracted in schema order.
+    os >> opt_string;
+    os >> ch;
+    os >> char_array;
+    os >> int8;
+    os >> uint16;
+    os >> int32;
+    os >> opt_uint64;
+    os >> d;
+    os >> ts_user;
+    os >> ts_ms;
+    os >> parquet::EndRow;
+
+    if (kPrintRows) {
+      // For debugging.
+      std::cout << "Row #" << i << std::endl;
+
+      std::cout << "string[";
+      if (opt_string) {
+        std::cout << *opt_string;
+      } else {
+        std::cout << "N/A";
+      }
+      std::cout << "] char[" << ch << "] charArray[" << char_array << "] int8["
+                << int(int8) << "] uint16[" << uint16 << "] int32[" << int32;
+      std::cout << "] uint64[";
+      if (opt_uint64) {
+        std::cout << *opt_uint64;
+      } else {
+        std::cout << "N/A";
+      }
+      std::cout << "] double[" << d << "] tsUser[" << ts_user << "] tsMs["
+                << ts_ms.count() << "]" << std::endl;
+    }
+    // Check data: regenerate each expected value from the row index.
+    switch (i % 3) {
+      case 0:
+        assert(opt_string == TestData::GetOptString(i));
+        break;
+      case 1:
+        assert(*opt_string == TestData::GetStringView(i));
+        break;
+      case 2:
+        assert(*opt_string == TestData::GetCharPtr(i));
+        break;
+    }
+    assert(ch == TestData::GetChar(i));
+    switch (i % 2) {
+      case 0:
+        assert(0 == std::memcmp(char_array, TestData::char4_array, sizeof(char_array)));
+        break;
+      case 1:
+        assert(0 == std::memcmp(char_array, TestData::GetCharPtr(i), sizeof(char_array)));
+        break;
+    }
+    assert(int8 == TestData::GetInt8(i));
+    assert(uint16 == TestData::GetUInt16(i));
+    assert(int32 == TestData::GetInt32(i));
+    assert(opt_uint64 == TestData::GetOptUInt64(i));
+    // Use std::fabs (from <cmath>): plain std::abs without <cmath> can
+    // resolve to the integer overload, truncating the difference to 0 and
+    // making this check vacuously true.
+    assert(std::fabs(d - TestData::GetDouble(i)) < 1e-6);
+    assert(ts_user == TestData::GetUserTimestamp(i));
+    assert(ts_ms == TestData::GetChronoMilliseconds(i));
+  }
+  // NOTE: asserts compile away under NDEBUG; this example is meant to be
+  // built with assertions enabled so the row-count check below is active.
+  assert(TestData::num_rows == i);
+
+  std::cout << "Parquet Stream Reading complete." << std::endl;
+}
+
+// Write the example file, then read it back and validate the contents.
+// The PARQUET_* macros throw parquet::ParquetException (derived from
+// std::exception) on failure; catch and report it rather than letting the
+// process terminate without a diagnostic.
+int main() {
+  try {
+    WriteParquetFile();
+    ReadParquetFile();
+  } catch (const std::exception& e) {
+    std::cerr << "Parquet error: " << e.what() << std::endl;
+    return 1;
+  }
+
+  return 0;
+}