// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #include #include #include #include #include #include // #0 Build dummy data to pass around // To have some input data, we first create an Arrow Table that holds // some data. std::shared_ptr generate_table() { arrow::Int64Builder i64builder; PARQUET_THROW_NOT_OK(i64builder.AppendValues({1, 2, 3, 4, 5})); std::shared_ptr i64array; PARQUET_THROW_NOT_OK(i64builder.Finish(&i64array)); arrow::StringBuilder strbuilder; PARQUET_THROW_NOT_OK(strbuilder.Append("some")); PARQUET_THROW_NOT_OK(strbuilder.Append("string")); PARQUET_THROW_NOT_OK(strbuilder.Append("content")); PARQUET_THROW_NOT_OK(strbuilder.Append("in")); PARQUET_THROW_NOT_OK(strbuilder.Append("rows")); std::shared_ptr strarray; PARQUET_THROW_NOT_OK(strbuilder.Finish(&strarray)); std::shared_ptr schema = arrow::schema( {arrow::field("int", arrow::int64()), arrow::field("str", arrow::utf8())}); return arrow::Table::Make(schema, {i64array, strarray}); } // #1 Write out the data as a Parquet file void write_parquet_file(const arrow::Table& table) { std::shared_ptr outfile; PARQUET_ASSIGN_OR_THROW( outfile, arrow::io::FileOutputStream::Open("parquet-arrow-example.parquet")); // The last argument to the function call is the size of the RowGroup in // the parquet file. Normally you would choose this to be rather large but // for the example, we use a small value to have multiple RowGroups. PARQUET_THROW_NOT_OK( parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3)); } // #2: Fully read in the file void read_whole_file() { std::cout << "Reading parquet-arrow-example.parquet at once" << std::endl; std::shared_ptr infile; PARQUET_ASSIGN_OR_THROW(infile, arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", arrow::default_memory_pool())); std::unique_ptr reader; PARQUET_THROW_NOT_OK( parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); std::shared_ptr table; PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns() << " columns." << std::endl; } // #3: Read only a single RowGroup of the parquet file void read_single_rowgroup() { std::cout << "Reading first RowGroup of parquet-arrow-example.parquet" << std::endl; std::shared_ptr infile; PARQUET_ASSIGN_OR_THROW(infile, arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", arrow::default_memory_pool())); std::unique_ptr reader; PARQUET_THROW_NOT_OK( parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); std::shared_ptr table; PARQUET_THROW_NOT_OK(reader->RowGroup(0)->ReadTable(&table)); std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns() << " columns." << std::endl; } // #4: Read only a single column of the whole parquet file void read_single_column() { std::cout << "Reading first column of parquet-arrow-example.parquet" << std::endl; std::shared_ptr infile; PARQUET_ASSIGN_OR_THROW(infile, arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", arrow::default_memory_pool())); std::unique_ptr reader; PARQUET_THROW_NOT_OK( parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); std::shared_ptr array; PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array)); PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); std::cout << std::endl; } // #5: Read only a single column of a RowGroup (this is known as ColumnChunk) // from the Parquet file. void read_single_column_chunk() { std::cout << "Reading first ColumnChunk of the first RowGroup of " "parquet-arrow-example.parquet" << std::endl; std::shared_ptr infile; PARQUET_ASSIGN_OR_THROW(infile, arrow::io::ReadableFile::Open("parquet-arrow-example.parquet", arrow::default_memory_pool())); std::unique_ptr reader; PARQUET_THROW_NOT_OK( parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); std::shared_ptr array; PARQUET_THROW_NOT_OK(reader->RowGroup(0)->Column(0)->Read(&array)); PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); std::cout << std::endl; } int main(int argc, char** argv) { std::shared_ptr table = generate_table(); write_parquet_file(*table); read_whole_file(); read_single_rowgroup(); read_single_column(); read_single_column_chunk(); }