diff options
Diffstat (limited to 'src/arrow/cpp/examples/parquet/parquet_arrow/reader_writer.cc')
-rw-r--r-- | src/arrow/cpp/examples/parquet/parquet_arrow/reader_writer.cc | 140 |
1 files changed, 140 insertions, 0 deletions
diff --git a/src/arrow/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/src/arrow/cpp/examples/parquet/parquet_arrow/reader_writer.cc new file mode 100644 index 000000000..f5d96ec16 --- /dev/null +++ b/src/arrow/cpp/examples/parquet/parquet_arrow/reader_writer.cc @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <arrow/api.h> +#include <arrow/io/api.h> +#include <parquet/arrow/reader.h> +#include <parquet/arrow/writer.h> +#include <parquet/exception.h> + +#include <iostream> + +// #0 Build dummy data to pass around +// To have some input data, we first create an Arrow Table that holds +// some data. +std::shared_ptr<arrow::Table> generate_table() { + arrow::Int64Builder i64builder; + PARQUET_THROW_NOT_OK(i64builder.AppendValues({1, 2, 3, 4, 5})); + std::shared_ptr<arrow::Array> i64array; + PARQUET_THROW_NOT_OK(i64builder.Finish(&i64array)); + + arrow::StringBuilder strbuilder; + PARQUET_THROW_NOT_OK(strbuilder.Append("some")); + PARQUET_THROW_NOT_OK(strbuilder.Append("string")); + PARQUET_THROW_NOT_OK(strbuilder.Append("content")); + PARQUET_THROW_NOT_OK(strbuilder.Append("in")); + PARQUET_THROW_NOT_OK(strbuilder.Append("rows")); + std::shared_ptr<arrow::Array> strarray; + PARQUET_THROW_NOT_OK(strbuilder.Finish(&strarray)); + + std::shared_ptr<arrow::Schema> schema = arrow::schema( + {arrow::field("int", arrow::int64()), arrow::field("str", arrow::utf8())}); + + return arrow::Table::Make(schema, {i64array, strarray}); +} + +// #1 Write out the data as a Parquet file +void write_parquet_file(const arrow::Table& table) { + std::shared_ptr<arrow::io::FileOutputStream> outfile; + PARQUET_ASSIGN_OR_THROW( + outfile, arrow::io::FileOutputStream::Open("parquet-arrow-example.parquet")); + // The last argument to the function call is the size of the RowGroup in + // the parquet file. Normally you would choose this to be rather large but + // for the example, we use a small value to have multiple RowGroups. + PARQUET_THROW_NOT_OK( + parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3)); +} + +// #2: Fully read in the file +void read_whole_file() { + std::cout << "Reading parquet-arrow-example.parquet at once" << std::endl; + std::shared_ptr<arrow::io::ReadableFile> infile; + PARQUET_ASSIGN_OR_THROW(infile, + arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", + arrow::default_memory_pool())); + + std::unique_ptr<parquet::arrow::FileReader> reader; + PARQUET_THROW_NOT_OK( + parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + std::shared_ptr<arrow::Table> table; + PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); + std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns() + << " columns." << std::endl; +} + +// #3: Read only a single RowGroup of the parquet file +void read_single_rowgroup() { + std::cout << "Reading first RowGroup of parquet-arrow-example.parquet" << std::endl; + std::shared_ptr<arrow::io::ReadableFile> infile; + PARQUET_ASSIGN_OR_THROW(infile, + arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", + arrow::default_memory_pool())); + + std::unique_ptr<parquet::arrow::FileReader> reader; + PARQUET_THROW_NOT_OK( + parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + std::shared_ptr<arrow::Table> table; + PARQUET_THROW_NOT_OK(reader->RowGroup(0)->ReadTable(&table)); + std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns() + << " columns." << std::endl; +} + +// #4: Read only a single column of the whole parquet file +void read_single_column() { + std::cout << "Reading first column of parquet-arrow-example.parquet" << std::endl; + std::shared_ptr<arrow::io::ReadableFile> infile; + PARQUET_ASSIGN_OR_THROW(infile, + arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", + arrow::default_memory_pool())); + + std::unique_ptr<parquet::arrow::FileReader> reader; + PARQUET_THROW_NOT_OK( + parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + std::shared_ptr<arrow::ChunkedArray> array; + PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array)); + PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); + std::cout << std::endl; +} + +// #5: Read only a single column of a RowGroup (this is known as ColumnChunk) +// from the Parquet file. +void read_single_column_chunk() { + std::cout << "Reading first ColumnChunk of the first RowGroup of " + "parquet-arrow-example.parquet" + << std::endl; + std::shared_ptr<arrow::io::ReadableFile> infile; + PARQUET_ASSIGN_OR_THROW(infile, + arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", + arrow::default_memory_pool())); + + std::unique_ptr<parquet::arrow::FileReader> reader; + PARQUET_THROW_NOT_OK( + parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + std::shared_ptr<arrow::ChunkedArray> array; + PARQUET_THROW_NOT_OK(reader->RowGroup(0)->Column(0)->Read(&array)); + PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); + std::cout << std::endl; +} + +int main(int argc, char** argv) { + std::shared_ptr<arrow::Table> table = generate_table(); + write_parquet_file(*table); + read_whole_file(); + read_single_rowgroup(); + read_single_column(); + read_single_column_chunk(); +} |