summaryrefslogtreecommitdiffstats
path: root/src/arrow/cpp/examples/parquet/parquet_arrow/reader_writer.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/arrow/cpp/examples/parquet/parquet_arrow/reader_writer.cc')
-rw-r--r--src/arrow/cpp/examples/parquet/parquet_arrow/reader_writer.cc140
1 files changed, 140 insertions, 0 deletions
diff --git a/src/arrow/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/src/arrow/cpp/examples/parquet/parquet_arrow/reader_writer.cc
new file mode 100644
index 000000000..f5d96ec16
--- /dev/null
+++ b/src/arrow/cpp/examples/parquet/parquet_arrow/reader_writer.cc
@@ -0,0 +1,140 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/api.h>
+#include <arrow/io/api.h>
+#include <parquet/arrow/reader.h>
+#include <parquet/arrow/writer.h>
+#include <parquet/exception.h>
+
+#include <exception>
+#include <iostream>
+
+// #0 Build dummy data to pass around
+// The read/write examples below need some input, so this assembles a small
+// in-memory Arrow Table with two five-row columns: an int64 column named
+// "int" and a utf8 column named "str". Every builder call is checked with
+// PARQUET_THROW_NOT_OK.
+std::shared_ptr<arrow::Table> generate_table() {
+  // Integer column holding 1..5.
+  std::shared_ptr<arrow::Array> int_column;
+  {
+    arrow::Int64Builder int_builder;
+    PARQUET_THROW_NOT_OK(int_builder.AppendValues({1, 2, 3, 4, 5}));
+    PARQUET_THROW_NOT_OK(int_builder.Finish(&int_column));
+  }
+
+  // String column, one word per row, appended in order.
+  std::shared_ptr<arrow::Array> str_column;
+  {
+    const char* const words[] = {"some", "string", "content", "in", "rows"};
+    arrow::StringBuilder str_builder;
+    for (const char* word : words) {
+      PARQUET_THROW_NOT_OK(str_builder.Append(word));
+    }
+    PARQUET_THROW_NOT_OK(str_builder.Finish(&str_column));
+  }
+
+  std::shared_ptr<arrow::Schema> table_schema = arrow::schema(
+      {arrow::field("int", arrow::int64()), arrow::field("str", arrow::utf8())});
+  return arrow::Table::Make(table_schema, {int_column, str_column});
+}
+
+// #1 Write out the data as a Parquet file
+// Serializes `table` to "parquet-arrow-example.parquet" (a path relative to
+// the current working directory). Failures to open, write or close the file
+// are reported through PARQUET_ASSIGN_OR_THROW / PARQUET_THROW_NOT_OK.
+void write_parquet_file(const arrow::Table& table) {
+  std::shared_ptr<arrow::io::FileOutputStream> outfile;
+  PARQUET_ASSIGN_OR_THROW(
+      outfile, arrow::io::FileOutputStream::Open("parquet-arrow-example.parquet"));
+  // The last argument to the function call is the size of the RowGroup in
+  // the parquet file. Normally you would choose this to be rather large but
+  // for the example, we use a small value to have multiple RowGroups.
+  PARQUET_THROW_NOT_OK(
+      parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3));
+  // Close explicitly and check the result: if the stream were only closed
+  // implicitly when `outfile` is destroyed, an error while closing the file
+  // could not be reported to the caller.
+  PARQUET_THROW_NOT_OK(outfile->Close());
+}
+
+// #2: Fully read in the file
+// Opens the example file, reads it into a single Arrow Table and prints the
+// number of rows and columns that were loaded.
+void read_whole_file() {
+  std::cout << "Reading parquet-arrow-example.parquet at once" << std::endl;
+
+  // The same pool is used for opening the file and for the Parquet reader.
+  arrow::MemoryPool* pool = arrow::default_memory_pool();
+  std::shared_ptr<arrow::io::ReadableFile> input;
+  PARQUET_ASSIGN_OR_THROW(
+      input, arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", pool));
+
+  std::unique_ptr<parquet::arrow::FileReader> file_reader;
+  PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(input, pool, &file_reader));
+
+  std::shared_ptr<arrow::Table> contents;
+  PARQUET_THROW_NOT_OK(file_reader->ReadTable(&contents));
+
+  const auto row_count = contents->num_rows();
+  const auto column_count = contents->num_columns();
+  std::cout << "Loaded " << row_count << " rows in " << column_count << " columns."
+            << std::endl;
+}
+
+// #3: Read only a single RowGroup of the parquet file
+// Opens the example file, reads just the RowGroup at index 0 into an Arrow
+// Table and prints the number of rows and columns that were loaded.
+void read_single_rowgroup() {
+  std::cout << "Reading first RowGroup of parquet-arrow-example.parquet" << std::endl;
+
+  arrow::MemoryPool* pool = arrow::default_memory_pool();
+  std::shared_ptr<arrow::io::ReadableFile> input;
+  PARQUET_ASSIGN_OR_THROW(
+      input, arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", pool));
+
+  std::unique_ptr<parquet::arrow::FileReader> file_reader;
+  PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(input, pool, &file_reader));
+
+  // Restrict the read to the first RowGroup instead of the whole file.
+  std::shared_ptr<arrow::Table> contents;
+  auto first_row_group = file_reader->RowGroup(0);
+  PARQUET_THROW_NOT_OK(first_row_group->ReadTable(&contents));
+
+  const auto row_count = contents->num_rows();
+  const auto column_count = contents->num_columns();
+  std::cout << "Loaded " << row_count << " rows in " << column_count << " columns."
+            << std::endl;
+}
+
+// #4: Read only a single column of the whole parquet file
+// Opens the example file, reads the column at index 0 as a ChunkedArray and
+// pretty-prints it to stdout.
+void read_single_column() {
+  std::cout << "Reading first column of parquet-arrow-example.parquet" << std::endl;
+
+  arrow::MemoryPool* pool = arrow::default_memory_pool();
+  std::shared_ptr<arrow::io::ReadableFile> input;
+  PARQUET_ASSIGN_OR_THROW(
+      input, arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", pool));
+
+  std::unique_ptr<parquet::arrow::FileReader> file_reader;
+  PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(input, pool, &file_reader));
+
+  std::shared_ptr<arrow::ChunkedArray> column;
+  PARQUET_THROW_NOT_OK(file_reader->ReadColumn(0, &column));
+
+  PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*column, 4, &std::cout));
+  std::cout << std::endl;
+}
+
+// #5: Read only a single column of a RowGroup (this is known as ColumnChunk)
+// from the Parquet file.
+// Opens the example file, reads column 0 of RowGroup 0 as a ChunkedArray and
+// pretty-prints it to stdout.
+void read_single_column_chunk() {
+  std::cout << "Reading first ColumnChunk of the first RowGroup of "
+               "parquet-arrow-example.parquet"
+            << std::endl;
+
+  arrow::MemoryPool* pool = arrow::default_memory_pool();
+  std::shared_ptr<arrow::io::ReadableFile> input;
+  PARQUET_ASSIGN_OR_THROW(
+      input, arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", pool));
+
+  std::unique_ptr<parquet::arrow::FileReader> file_reader;
+  PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(input, pool, &file_reader));
+
+  // Narrow the read step by step: RowGroup 0, then column 0 within it.
+  auto first_row_group = file_reader->RowGroup(0);
+  auto first_column_chunk = first_row_group->Column(0);
+  std::shared_ptr<arrow::ChunkedArray> chunk;
+  PARQUET_THROW_NOT_OK(first_column_chunk->Read(&chunk));
+
+  PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*chunk, 4, &std::cout));
+  std::cout << std::endl;
+}
+
+// Runs the example end to end: build a table, write it to a Parquet file,
+// then read it back in four different ways (whole file, one RowGroup, one
+// column, one ColumnChunk).
+//
+// The helpers signal failure by throwing, so exceptions are caught here,
+// reported on stderr and turned into a non-zero exit status rather than
+// ending the program through an uncaught exception. The command-line
+// arguments are not used.
+int main(int /*argc*/, char** /*argv*/) {
+  try {
+    std::shared_ptr<arrow::Table> table = generate_table();
+    write_parquet_file(*table);
+    read_whole_file();
+    read_single_rowgroup();
+    read_single_column();
+    read_single_column_chunk();
+  } catch (const std::exception& e) {
+    std::cerr << "Example failed: " << e.what() << std::endl;
+    return 1;
+  }
+  return 0;
+}