summaryrefslogtreecommitdiffstats
path: root/src/arrow/cpp/examples/parquet/parquet_arrow
diff options
context:
space:
mode:
Diffstat (limited to 'src/arrow/cpp/examples/parquet/parquet_arrow')
-rw-r--r--src/arrow/cpp/examples/parquet/parquet_arrow/CMakeLists.txt42
-rw-r--r--src/arrow/cpp/examples/parquet/parquet_arrow/README.md20
-rw-r--r--src/arrow/cpp/examples/parquet/parquet_arrow/reader_writer.cc140
3 files changed, 202 insertions, 0 deletions
diff --git a/src/arrow/cpp/examples/parquet/parquet_arrow/CMakeLists.txt b/src/arrow/cpp/examples/parquet/parquet_arrow/CMakeLists.txt
new file mode 100644
index 000000000..43eb21957
--- /dev/null
+++ b/src/arrow/cpp/examples/parquet/parquet_arrow/CMakeLists.txt
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Require cmake that supports BYPRODUCTS in add_custom_command, ExternalProject_Add [1].
+cmake_minimum_required(VERSION 3.2.0)
+
+project(parquet_arrow_example)
+
+include(ExternalProject)
+include(FindPkgConfig)
+include(GNUInstallDirs)
+
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake_modules")
+
+# This ensures that things like gnu++11 get passed correctly
+set(CMAKE_CXX_STANDARD 11)
+
+# We require a C++11 compliant compiler
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Look for installed packages the system
+find_package(Arrow REQUIRED)
+find_package(Parquet REQUIRED)
+
+include_directories(SYSTEM ${ARROW_INCLUDE_DIR} ${PARQUET_INCLUDE_DIR})
+
+add_executable(parquet_arrow_example reader_writer.cc)
+target_link_libraries(parquet_arrow_example ${PARQUET_SHARED_LIB} ${ARROW_SHARED_LIB})
diff --git a/src/arrow/cpp/examples/parquet/parquet_arrow/README.md b/src/arrow/cpp/examples/parquet/parquet_arrow/README.md
new file mode 100644
index 000000000..e99819fd2
--- /dev/null
+++ b/src/arrow/cpp/examples/parquet/parquet_arrow/README.md
@@ -0,0 +1,20 @@
+<!---
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. See accompanying LICENSE file.
+-->
+
+Using parquet-cpp with the arrow interface
+==========================================
+
+This folder contains an example project that shows how to setup a CMake project
+that consumes `parquet-cpp` as a library as well as how you can use the
+`parquet/arrow` interface to reading and write Apache Parquet files.
diff --git a/src/arrow/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/src/arrow/cpp/examples/parquet/parquet_arrow/reader_writer.cc
new file mode 100644
index 000000000..f5d96ec16
--- /dev/null
+++ b/src/arrow/cpp/examples/parquet/parquet_arrow/reader_writer.cc
@@ -0,0 +1,140 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/api.h>
+#include <arrow/io/api.h>
+#include <parquet/arrow/reader.h>
+#include <parquet/arrow/writer.h>
+#include <parquet/exception.h>
+
+#include <iostream>
+
+// #0 Build dummy data to pass around
+// To have some input data, we first create an Arrow Table that holds
+// some data.
+std::shared_ptr<arrow::Table> generate_table() {
+ arrow::Int64Builder i64builder;
+ PARQUET_THROW_NOT_OK(i64builder.AppendValues({1, 2, 3, 4, 5}));
+ std::shared_ptr<arrow::Array> i64array;
+ PARQUET_THROW_NOT_OK(i64builder.Finish(&i64array));
+
+ arrow::StringBuilder strbuilder;
+ PARQUET_THROW_NOT_OK(strbuilder.Append("some"));
+ PARQUET_THROW_NOT_OK(strbuilder.Append("string"));
+ PARQUET_THROW_NOT_OK(strbuilder.Append("content"));
+ PARQUET_THROW_NOT_OK(strbuilder.Append("in"));
+ PARQUET_THROW_NOT_OK(strbuilder.Append("rows"));
+ std::shared_ptr<arrow::Array> strarray;
+ PARQUET_THROW_NOT_OK(strbuilder.Finish(&strarray));
+
+ std::shared_ptr<arrow::Schema> schema = arrow::schema(
+ {arrow::field("int", arrow::int64()), arrow::field("str", arrow::utf8())});
+
+ return arrow::Table::Make(schema, {i64array, strarray});
+}
+
+// #1 Write out the data as a Parquet file
+void write_parquet_file(const arrow::Table& table) {
+ std::shared_ptr<arrow::io::FileOutputStream> outfile;
+ PARQUET_ASSIGN_OR_THROW(
+ outfile, arrow::io::FileOutputStream::Open("parquet-arrow-example.parquet"));
+ // The last argument to the function call is the size of the RowGroup in
+ // the parquet file. Normally you would choose this to be rather large but
+ // for the example, we use a small value to have multiple RowGroups.
+ PARQUET_THROW_NOT_OK(
+ parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3));
+}
+
+// #2: Fully read in the file
+void read_whole_file() {
+ std::cout << "Reading parquet-arrow-example.parquet at once" << std::endl;
+ std::shared_ptr<arrow::io::ReadableFile> infile;
+ PARQUET_ASSIGN_OR_THROW(infile,
+ arrow::io::ReadableFile::Open("parquet-arrow-example.parquet",
+ arrow::default_memory_pool()));
+
+ std::unique_ptr<parquet::arrow::FileReader> reader;
+ PARQUET_THROW_NOT_OK(
+ parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+ std::shared_ptr<arrow::Table> table;
+ PARQUET_THROW_NOT_OK(reader->ReadTable(&table));
+ std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns()
+ << " columns." << std::endl;
+}
+
+// #3: Read only a single RowGroup of the parquet file
+void read_single_rowgroup() {
+ std::cout << "Reading first RowGroup of parquet-arrow-example.parquet" << std::endl;
+ std::shared_ptr<arrow::io::ReadableFile> infile;
+ PARQUET_ASSIGN_OR_THROW(infile,
+ arrow::io::ReadableFile::Open("parquet-arrow-example.parquet",
+ arrow::default_memory_pool()));
+
+ std::unique_ptr<parquet::arrow::FileReader> reader;
+ PARQUET_THROW_NOT_OK(
+ parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+ std::shared_ptr<arrow::Table> table;
+ PARQUET_THROW_NOT_OK(reader->RowGroup(0)->ReadTable(&table));
+ std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns()
+ << " columns." << std::endl;
+}
+
+// #4: Read only a single column of the whole parquet file
+void read_single_column() {
+ std::cout << "Reading first column of parquet-arrow-example.parquet" << std::endl;
+ std::shared_ptr<arrow::io::ReadableFile> infile;
+ PARQUET_ASSIGN_OR_THROW(infile,
+ arrow::io::ReadableFile::Open("parquet-arrow-example.parquet",
+ arrow::default_memory_pool()));
+
+ std::unique_ptr<parquet::arrow::FileReader> reader;
+ PARQUET_THROW_NOT_OK(
+ parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+ std::shared_ptr<arrow::ChunkedArray> array;
+ PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array));
+ PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout));
+ std::cout << std::endl;
+}
+
+// #5: Read only a single column of a RowGroup (this is known as ColumnChunk)
+// from the Parquet file.
+void read_single_column_chunk() {
+ std::cout << "Reading first ColumnChunk of the first RowGroup of "
+ "parquet-arrow-example.parquet"
+ << std::endl;
+ std::shared_ptr<arrow::io::ReadableFile> infile;
+ PARQUET_ASSIGN_OR_THROW(infile,
+ arrow::io::ReadableFile::Open("parquet-arrow-example.parquet",
+ arrow::default_memory_pool()));
+
+ std::unique_ptr<parquet::arrow::FileReader> reader;
+ PARQUET_THROW_NOT_OK(
+ parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+ std::shared_ptr<arrow::ChunkedArray> array;
+ PARQUET_THROW_NOT_OK(reader->RowGroup(0)->Column(0)->Read(&array));
+ PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout));
+ std::cout << std::endl;
+}
+
+int main(int argc, char** argv) {
+ std::shared_ptr<arrow::Table> table = generate_table();
+ write_parquet_file(*table);
+ read_whole_file();
+ read_single_rowgroup();
+ read_single_column();
+ read_single_column_chunk();
+}