From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/arrow/cpp/tools/parquet/CMakeLists.txt | 36 ++++++++++ src/arrow/cpp/tools/parquet/parquet_dump_schema.cc | 52 ++++++++++++++ src/arrow/cpp/tools/parquet/parquet_reader.cc | 82 ++++++++++++++++++++++ src/arrow/cpp/tools/parquet/parquet_scan.cc | 78 ++++++++++++++++++++ 4 files changed, 248 insertions(+) create mode 100644 src/arrow/cpp/tools/parquet/CMakeLists.txt create mode 100644 src/arrow/cpp/tools/parquet/parquet_dump_schema.cc create mode 100644 src/arrow/cpp/tools/parquet/parquet_reader.cc create mode 100644 src/arrow/cpp/tools/parquet/parquet_scan.cc (limited to 'src/arrow/cpp/tools/parquet') diff --git a/src/arrow/cpp/tools/parquet/CMakeLists.txt b/src/arrow/cpp/tools/parquet/CMakeLists.txt new file mode 100644 index 000000000..81ab49421 --- /dev/null +++ b/src/arrow/cpp/tools/parquet/CMakeLists.txt @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+# Build and install the Parquet command-line tools. Everything below is
+# skipped unless PARQUET_BUILD_EXECUTABLES is enabled.
+if(PARQUET_BUILD_EXECUTABLES)
+  set(PARQUET_TOOLS parquet-dump-schema parquet-reader parquet-scan)
+
+  foreach(TOOL ${PARQUET_TOOLS})
+    # Tool names use dashes while their source files use underscores,
+    # e.g. parquet-dump-schema is built from parquet_dump_schema.cc.
+    string(REGEX REPLACE "-" "_" TOOL_SOURCE ${TOOL})
+    add_executable(${TOOL} "${TOOL_SOURCE}.cc")
+    # Link against the shared or the static Parquet library, following
+    # ARROW_BUILD_SHARED.
+    if(ARROW_BUILD_SHARED)
+      target_link_libraries(${TOOL} parquet_shared)
+    else()
+      target_link_libraries(${TOOL} parquet_static)
+    endif()
+    # Avoid unsetting RPATH when installing
+    set_target_properties(${TOOL} PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
+    # NOTE(review): INSTALL_IS_OPTIONAL is not set in this file; presumably it
+    # expands to OPTIONAL or to nothing -- confirm against the parent project.
+    install(TARGETS ${TOOL} ${INSTALL_IS_OPTIONAL}
+            RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+  endforeach(TOOL)
+
+  # Make the `parquet` target depend on every tool so that building it also
+  # builds the executables.
+  add_dependencies(parquet ${PARQUET_TOOLS})
+endif()
diff --git a/src/arrow/cpp/tools/parquet/parquet_dump_schema.cc b/src/arrow/cpp/tools/parquet/parquet_dump_schema.cc
new file mode 100644
index 000000000..0d7c2428f
--- /dev/null
+++ b/src/arrow/cpp/tools/parquet/parquet_dump_schema.cc
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstring>
+#include <exception>
+#include <iostream>
+#include <memory>
+#include <string>
+
+#include "parquet/api/reader.h"
+#include "parquet/api/schema.h"
+
+// Prints the schema of a Parquet file to standard output.
+//
+// Usage: parquet-dump-schema [-h] [--help] <filename>
+//
+// Returns 0 on success. Returns -1 when the argument count is wrong, when a
+// help flag (-?, -h, --help) is given, or when opening the file or printing
+// the schema throws; the exception message is written to standard error.
+int main(int argc, char** argv) {
+  bool help_flag = false;
+  std::string filename;
+
+  // Any argument that is not a help flag is taken as the file name.
+  for (int i = 1; i < argc; i++) {
+    if (!std::strcmp(argv[i], "-?") || !std::strcmp(argv[i], "-h") ||
+        !std::strcmp(argv[i], "--help")) {
+      help_flag = true;
+    } else {
+      filename = argv[i];
+    }
+  }
+
+  // Exactly one argument is accepted, and it must not be a help flag.
+  if (argc != 2 || help_flag) {
+    std::cerr << "Usage: parquet-dump-schema [-h] [--help]"
+              << " <filename>" << std::endl;
+    return -1;
+  }
+
+  try {
+    std::unique_ptr<parquet::ParquetFileReader> reader =
+        parquet::ParquetFileReader::OpenFile(filename);
+    PrintSchema(reader->metadata()->schema()->schema_root().get(), std::cout);
+  } catch (const std::exception& e) {
+    std::cerr << "Parquet error: " << e.what() << std::endl;
+    return -1;
+  }
+
+  return 0;
+}
diff --git a/src/arrow/cpp/tools/parquet/parquet_reader.cc b/src/arrow/cpp/tools/parquet/parquet_reader.cc
new file mode 100644
index 000000000..c7db8e11a
--- /dev/null
+++ b/src/arrow/cpp/tools/parquet/parquet_reader.cc
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <iostream>
+#include <list>
+#include <memory>
+#include <string>
+
+#include "parquet/api/reader.h"
+
+// Prints the metadata and, unless disabled, the values of a Parquet file.
+//
+// Options (each must match exactly, except --columns= which is a prefix):
+//   --only-metadata             do not print column values
+//   --no-memory-map             open the file without memory mapping
+//   --json                      print with JSONPrint instead of DebugPrint
+//   --dump                      passed to DebugPrint as the dump flag
+//   --print-key-value-metadata  also print key/value metadata
+//   --columns=a,b,...           comma-separated column indices to print
+// Any other argument is taken as the file name; the last one wins.
+//
+// Returns 0 on success and -1 on a bad argument count or when reading the
+// file throws; the exception message is written to standard error.
+int main(int argc, char** argv) {
+  if (argc > 5 || argc < 2) {
+    std::cerr << "Usage: parquet-reader [--only-metadata] [--no-memory-map] [--json] "
+              << "[--dump] [--print-key-value-metadata] [--columns=...] <file>"
+              << std::endl;
+    return -1;
+  }
+
+  std::string filename;
+  bool print_values = true;
+  bool print_key_value_metadata = false;
+  bool memory_map = true;
+  bool format_json = false;
+  bool format_dump = false;
+
+  // Read command-line options
+  const std::string COLUMNS_PREFIX = "--columns=";
+  std::list<int> columns;
+
+  for (int i = 1; i < argc; i++) {
+    // Compare whole arguments rather than searching for substrings, so a file
+    // name that merely contains an option name is not mistaken for a flag.
+    const std::string arg = argv[i];
+    if (arg == "--only-metadata") {
+      print_values = false;
+    } else if (arg == "--print-key-value-metadata") {
+      print_key_value_metadata = true;
+    } else if (arg == "--no-memory-map") {
+      memory_map = false;
+    } else if (arg == "--json") {
+      format_json = true;
+    } else if (arg == "--dump") {
+      format_dump = true;
+    } else if (arg.compare(0, COLUMNS_PREFIX.length(), COLUMNS_PREFIX) == 0) {
+      // strtok splits the comma-separated list in place inside argv[i].
+      char* value = std::strtok(argv[i] + COLUMNS_PREFIX.length(), ",");
+      while (value) {
+        columns.push_back(std::atoi(value));
+        value = std::strtok(nullptr, ",");
+      }
+    } else {
+      filename = argv[i];
+    }
+  }
+
+  try {
+    std::unique_ptr<parquet::ParquetFileReader> reader =
+        parquet::ParquetFileReader::OpenFile(filename, memory_map);
+    parquet::ParquetFilePrinter printer(reader.get());
+    if (format_json) {
+      printer.JSONPrint(std::cout, columns, filename.c_str());
+    } else {
+      printer.DebugPrint(std::cout, columns, print_values, format_dump,
+                         print_key_value_metadata, filename.c_str());
+    }
+  } catch (const std::exception& e) {
+    std::cerr << "Parquet error: " << e.what() << std::endl;
+    return -1;
+  }
+
+  return 0;
+}
diff --git a/src/arrow/cpp/tools/parquet/parquet_scan.cc b/src/arrow/cpp/tools/parquet/parquet_scan.cc
new file mode 100644
index
000000000..2a7721e58
--- /dev/null
+++ b/src/arrow/cpp/tools/parquet/parquet_scan.cc
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <exception>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "parquet/api/reader.h"
+
+// Scans a Parquet file and reports the number of rows and the time taken.
+//
+// Options:
+//   --batch-size=<bs>   batch size passed to ScanFileContents (default 256)
+//   --columns=a,b,...   comma-separated column indices passed to
+//                       ScanFileContents
+// Any other argument is taken as the file name; the last one wins.
+//
+// Returns 0 on success and -1 on a bad argument count or when scanning
+// throws; the exception message is written to standard error.
+int main(int argc, char** argv) {
+  // argv[0] plus at least the file name, and at most two options on top.
+  if (argc > 4 || argc < 2) {
+    std::cerr << "Usage: parquet-scan [--batch-size=<bs>] [--columns=...] <file>"
+              << std::endl;
+    return -1;
+  }
+
+  std::string filename;
+
+  // Read command-line options
+  int batch_size = 256;
+  const std::string COLUMNS_PREFIX = "--columns=";
+  const std::string BATCH_SIZE_PREFIX = "--batch-size=";
+  std::vector<int> columns;
+
+  for (int i = 1; i < argc; i++) {
+    // Options are recognised only as a prefix of the argument, so a file name
+    // that merely contains an option name is not mistaken for an option.
+    const std::string arg = argv[i];
+    if (arg.compare(0, COLUMNS_PREFIX.length(), COLUMNS_PREFIX) == 0) {
+      // strtok splits the comma-separated list in place inside argv[i].
+      char* value = std::strtok(argv[i] + COLUMNS_PREFIX.length(), ",");
+      while (value) {
+        columns.push_back(std::atoi(value));
+        value = std::strtok(nullptr, ",");
+      }
+    } else if (arg.compare(0, BATCH_SIZE_PREFIX.length(), BATCH_SIZE_PREFIX) == 0) {
+      // An empty value leaves the default batch size untouched.
+      char* value = std::strtok(argv[i] + BATCH_SIZE_PREFIX.length(), " ");
+      if (value) {
+        batch_size = std::atoi(value);
+      }
+    } else {
+      filename = argv[i];
+    }
+  }
+
+  try {
+    // std::clock reports processor time used by the program, not wall time.
+    const std::clock_t start_time = std::clock();
+    std::unique_ptr<parquet::ParquetFileReader> reader =
+        parquet::ParquetFileReader::OpenFile(filename);
+
+    int64_t total_rows = parquet::ScanFileContents(columns, batch_size, reader.get());
+
+    const double total_time = static_cast<double>(std::clock() - start_time) /
+                              static_cast<double>(CLOCKS_PER_SEC);
+    std::cout << total_rows << " rows scanned in " << total_time << " seconds."
+              << std::endl;
+  } catch (const std::exception& e) {
+    std::cerr << "Parquet error: " << e.what() << std::endl;
+    return -1;
+  }
+
+  return 0;
+}
-- cgit v1.2.3