diff options
Diffstat (limited to 'src/arrow/c_glib/parquet-glib')
-rw-r--r-- | src/arrow/c_glib/parquet-glib/arrow-file-reader.cpp | 401 | ||||
-rw-r--r-- | src/arrow/c_glib/parquet-glib/arrow-file-reader.h | 76 | ||||
-rw-r--r-- | src/arrow/c_glib/parquet-glib/arrow-file-reader.hpp | 29 | ||||
-rw-r--r-- | src/arrow/c_glib/parquet-glib/arrow-file-writer.cpp | 579 | ||||
-rw-r--r-- | src/arrow/c_glib/parquet-glib/arrow-file-writer.h | 123 | ||||
-rw-r--r-- | src/arrow/c_glib/parquet-glib/arrow-file-writer.hpp | 33 | ||||
-rw-r--r-- | src/arrow/c_glib/parquet-glib/meson.build | 83 | ||||
-rw-r--r-- | src/arrow/c_glib/parquet-glib/parquet-glib.h | 23 | ||||
-rw-r--r-- | src/arrow/c_glib/parquet-glib/parquet-glib.hpp | 25 |
9 files changed, 1372 insertions, 0 deletions
diff --git a/src/arrow/c_glib/parquet-glib/arrow-file-reader.cpp b/src/arrow/c_glib/parquet-glib/arrow-file-reader.cpp new file mode 100644 index 000000000..2532db202 --- /dev/null +++ b/src/arrow/c_glib/parquet-glib/arrow-file-reader.cpp @@ -0,0 +1,401 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <arrow-glib/arrow-glib.hpp> +#include <arrow-glib/internal-index.hpp> + +#include <parquet-glib/arrow-file-reader.hpp> + +#include <parquet/file_reader.h> + +G_BEGIN_DECLS + +/** + * SECTION: arrow-file-reader + * @short_description: Arrow file reader class + * @include: parquet-glib/parquet-glib.h + * + * #GParquetArrowFileReader is a class for reading Apache Parquet data + * from a file and returning it as Apache Arrow data. 
+ */ + +/* Instance-private data: the raw reader wrapped by this object. */ +typedef struct GParquetArrowFileReaderPrivate_ { + parquet::arrow::FileReader *arrow_file_reader; +} GParquetArrowFileReaderPrivate; + +enum { + PROP_0, + PROP_ARROW_FILE_READER +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GParquetArrowFileReader, + gparquet_arrow_file_reader, + G_TYPE_OBJECT) + +#define GPARQUET_ARROW_FILE_READER_GET_PRIVATE(obj) \ + static_cast<GParquetArrowFileReaderPrivate *>( \ + gparquet_arrow_file_reader_get_instance_private( \ + GPARQUET_ARROW_FILE_READER(obj))) + +/* GObject finalizer: deletes the wrapped raw reader, then chains up to the parent class. */ +static void +gparquet_arrow_file_reader_finalize(GObject *object) +{ + auto priv = GPARQUET_ARROW_FILE_READER_GET_PRIVATE(object); + + delete priv->arrow_file_reader; + + G_OBJECT_CLASS(gparquet_arrow_file_reader_parent_class)->finalize(object); +} + +/* Stores the raw reader pointer passed through the "arrow-file-reader" property. */ +static void +gparquet_arrow_file_reader_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GPARQUET_ARROW_FILE_READER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_ARROW_FILE_READER: + priv->arrow_file_reader = + static_cast<parquet::arrow::FileReader *>(g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gparquet_arrow_file_reader_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + /* No property is readable, so every ID reaches the invalid-ID warning. */ + switch (prop_id) { + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gparquet_arrow_file_reader_init(GParquetArrowFileReader *object) +{ +} + +/* Installs the finalizer, the property accessors and the write-only, construct-only "arrow-file-reader" pointer property. */ +static void +gparquet_arrow_file_reader_class_init(GParquetArrowFileReaderClass *klass) +{ + GParamSpec *spec; + + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gparquet_arrow_file_reader_finalize; + gobject_class->set_property = gparquet_arrow_file_reader_set_property; + gobject_class->get_property = gparquet_arrow_file_reader_get_property; + + spec = g_param_spec_pointer("arrow-file-reader", + "ArrowFileReader", + "The raw 
parquet::arrow::FileReader *", + static_cast<GParamFlags>(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_ARROW_FILE_READER, spec); +} + +/** + * gparquet_arrow_file_reader_new_arrow: + * @source: Arrow source to be read. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GParquetArrowFileReader. + * + * Since: 0.11.0 + */ +GParquetArrowFileReader * +gparquet_arrow_file_reader_new_arrow(GArrowSeekableInputStream *source, + GError **error) +{ + auto arrow_random_access_file = + garrow_seekable_input_stream_get_raw(source); + auto arrow_memory_pool = arrow::default_memory_pool(); + std::unique_ptr<parquet::arrow::FileReader> parquet_arrow_file_reader; + auto status = parquet::arrow::OpenFile(arrow_random_access_file, + arrow_memory_pool, + &parquet_arrow_file_reader); + if (garrow_error_check(error, + status, + "[parquet][arrow][file-reader][new-arrow]")) { + /* release() hands the raw reader to the wrapper, whose finalizer deletes it. */ + return gparquet_arrow_file_reader_new_raw(parquet_arrow_file_reader.release()); + } else { + return NULL; + } +} + +/** + * gparquet_arrow_file_reader_new_path: + * @path: Path to be read. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GParquetArrowFileReader. 
+ * + * Since: 0.11.0 + */ +GParquetArrowFileReader * +gparquet_arrow_file_reader_new_path(const gchar *path, + GError **error) +{ + /* The file is opened as a memory-mapped file in READ mode. */ + auto arrow_memory_mapped_file = + arrow::io::MemoryMappedFile::Open(path, arrow::io::FileMode::READ); + if (!garrow::check(error, + arrow_memory_mapped_file, + "[parquet][arrow][file-reader][new-path]")) { + return NULL; + } + + std::shared_ptr<arrow::io::RandomAccessFile> arrow_random_access_file = + arrow_memory_mapped_file.ValueOrDie(); + auto arrow_memory_pool = arrow::default_memory_pool(); + std::unique_ptr<parquet::arrow::FileReader> parquet_arrow_file_reader; + auto status = parquet::arrow::OpenFile(arrow_random_access_file, + arrow_memory_pool, + &parquet_arrow_file_reader); + if (garrow::check(error, + status, + "[parquet][arrow][file-reader][new-path]")) { + return gparquet_arrow_file_reader_new_raw(parquet_arrow_file_reader.release()); + } else { + return NULL; + } +} + +/** + * gparquet_arrow_file_reader_read_table: + * @reader: A #GParquetArrowFileReader. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): A read #GArrowTable. + * + * Since: 0.11.0 + */ +GArrowTable * +gparquet_arrow_file_reader_read_table(GParquetArrowFileReader *reader, + GError **error) +{ + auto parquet_arrow_file_reader = gparquet_arrow_file_reader_get_raw(reader); + std::shared_ptr<arrow::Table> arrow_table; + auto status = parquet_arrow_file_reader->ReadTable(&arrow_table); + if (garrow_error_check(error, + status, + "[parquet][arrow][file-reader][read-table]")) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } +} + +/** + * gparquet_arrow_file_reader_read_row_group: + * @reader: A #GParquetArrowFileReader. + * @row_group_index: A row group index to be read. + * @column_indices: (array length=n_column_indices) (nullable): + * Column indices to be read. %NULL means that all columns are read. 
+ * If an index is negative, the index is counted backward from the + * end of the columns. `-1` means the last column. + * @n_column_indices: The number of elements of @column_indices. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): A read #GArrowTable. + * + * Since: 1.0.0 + */ +GArrowTable * +gparquet_arrow_file_reader_read_row_group(GParquetArrowFileReader *reader, + gint row_group_index, + gint *column_indices, + gsize n_column_indices, + GError **error) +{ + const gchar *tag = "[parquet][arrow][file-reader][read-row-group]"; + auto parquet_arrow_file_reader = gparquet_arrow_file_reader_get_raw(reader); + std::shared_ptr<arrow::Table> arrow_table; + arrow::Status status; + if (column_indices) { + const auto n_columns = + parquet_arrow_file_reader->parquet_reader()->metadata()->num_columns(); + std::vector<int> parquet_column_indices; + for (gsize i = 0; i < n_column_indices; ++i) { + auto column_index = column_indices[i]; + /* NOTE(review): garrow_internal_index_adjust() appears to rewrite column_index in place (negative -> counted from the end) and return false when it is out of range; confirm in internal-index.hpp. */ + if (!garrow_internal_index_adjust(column_index, n_columns)) { + garrow_error_check(error, + arrow::Status::IndexError("Out of index: " + "<0..", n_columns, ">: " + "<", column_index, ">"), + tag); + return NULL; + } + parquet_column_indices.push_back(column_index); + } + status = + parquet_arrow_file_reader->ReadRowGroup(row_group_index, + parquet_column_indices, + &arrow_table); + } else { + /* No column list given: read the row group without a column selection. */ + status = + parquet_arrow_file_reader->ReadRowGroup(row_group_index, &arrow_table); + } + if (garrow_error_check(error, + status, + tag)) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } +} + +/** + * gparquet_arrow_file_reader_get_schema: + * @reader: A #GParquetArrowFileReader. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): The #GArrowSchema that was read. 
+ * + * Since: 0.12.0 + */ +GArrowSchema * +gparquet_arrow_file_reader_get_schema(GParquetArrowFileReader *reader, + GError **error) +{ + auto parquet_arrow_file_reader = gparquet_arrow_file_reader_get_raw(reader); + + std::shared_ptr<arrow::Schema> arrow_schema; + auto status = parquet_arrow_file_reader->GetSchema(&arrow_schema); + if (garrow_error_check(error, + status, + "[parquet][arrow][file-reader][get-schema]")) { + return garrow_schema_new_raw(&arrow_schema); + } else { + return NULL; + } +} + +/** + * gparquet_arrow_file_reader_read_column_data: + * @reader: A #GParquetArrowFileReader. + * @i: The index of the column to be read. + * If an index is negative, the index is counted backward from the + * end of the columns. `-1` means the last column. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): A read #GArrowChunkedArray. + * + * Since: 0.15.0 + */ +GArrowChunkedArray * +gparquet_arrow_file_reader_read_column_data(GParquetArrowFileReader *reader, + gint i, + GError **error) +{ + const auto tag = "[parquet][arrow][file-reader][read-column-data]"; + auto parquet_arrow_file_reader = gparquet_arrow_file_reader_get_raw(reader); + + const auto n_columns = + parquet_arrow_file_reader->parquet_reader()->metadata()->num_columns(); + /* Validate the (possibly negative) index before asking the reader for the column. */ + if (!garrow_internal_index_adjust(i, n_columns)) { + garrow_error_check(error, + arrow::Status::IndexError("Out of index: " + "<0..", n_columns, ">: " + "<", i, ">"), + tag); + return NULL; + } + + std::shared_ptr<arrow::ChunkedArray> arrow_chunked_array; + auto status = + parquet_arrow_file_reader->ReadColumn(i, &arrow_chunked_array); + if (!garrow_error_check(error, status, tag)) { + return NULL; + } + + return garrow_chunked_array_new_raw(&arrow_chunked_array); +} + +/** + * gparquet_arrow_file_reader_get_n_row_groups: + * @reader: A #GParquetArrowFileReader. + * + * Returns: The number of row groups. 
+ * + * Since: 0.11.0 + */ +gint +gparquet_arrow_file_reader_get_n_row_groups(GParquetArrowFileReader *reader) +{ + auto parquet_arrow_file_reader = gparquet_arrow_file_reader_get_raw(reader); + return parquet_arrow_file_reader->num_row_groups(); +} + +/** + * gparquet_arrow_file_reader_get_n_rows: + * @reader: A #GParquetArrowFileReader. + * + * Returns: The number of rows. + * + * Since: 6.0.0 + */ +gint64 +gparquet_arrow_file_reader_get_n_rows(GParquetArrowFileReader *reader) +{ + auto parquet_arrow_file_reader = gparquet_arrow_file_reader_get_raw(reader); + return parquet_arrow_file_reader->parquet_reader()->metadata()->num_rows(); +} + +/** + * gparquet_arrow_file_reader_set_use_threads: + * @reader: A #GParquetArrowFileReader. + * @use_threads: Whether to use threads or not. + * + * Since: 0.11.0 + */ +void +gparquet_arrow_file_reader_set_use_threads(GParquetArrowFileReader *reader, + gboolean use_threads) +{ + auto parquet_arrow_file_reader = gparquet_arrow_file_reader_get_raw(reader); + parquet_arrow_file_reader->set_use_threads(use_threads); +} + +G_END_DECLS + +/* Wraps a raw reader in a new GParquetArrowFileReader; the wrapper's finalizer deletes the pointer. */ +GParquetArrowFileReader * +gparquet_arrow_file_reader_new_raw(parquet::arrow::FileReader *parquet_arrow_file_reader) +{ + auto arrow_file_reader = + GPARQUET_ARROW_FILE_READER(g_object_new(GPARQUET_TYPE_ARROW_FILE_READER, + "arrow-file-reader", parquet_arrow_file_reader, + NULL)); + return arrow_file_reader; +} + +/* Returns the raw reader stored in the private data, without transferring ownership. */ +parquet::arrow::FileReader * +gparquet_arrow_file_reader_get_raw(GParquetArrowFileReader *arrow_file_reader) +{ + auto priv = GPARQUET_ARROW_FILE_READER_GET_PRIVATE(arrow_file_reader); + return priv->arrow_file_reader; +} diff --git a/src/arrow/c_glib/parquet-glib/arrow-file-reader.h b/src/arrow/c_glib/parquet-glib/arrow-file-reader.h new file mode 100644 index 000000000..abea06c57 --- /dev/null +++ b/src/arrow/c_glib/parquet-glib/arrow-file-reader.h @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <arrow-glib/arrow-glib.h> + +G_BEGIN_DECLS + +#define GPARQUET_TYPE_ARROW_FILE_READER (gparquet_arrow_file_reader_get_type()) +G_DECLARE_DERIVABLE_TYPE(GParquetArrowFileReader, + gparquet_arrow_file_reader, + GPARQUET, + ARROW_FILE_READER, + GObject) +struct _GParquetArrowFileReaderClass +{ + GObjectClass parent_class; +}; + +GParquetArrowFileReader * +gparquet_arrow_file_reader_new_arrow(GArrowSeekableInputStream *source, + GError **error); +GParquetArrowFileReader * +gparquet_arrow_file_reader_new_path(const gchar *path, + GError **error); + +GArrowTable * +gparquet_arrow_file_reader_read_table(GParquetArrowFileReader *reader, + GError **error); + +GARROW_AVAILABLE_IN_1_0 +GArrowTable * +gparquet_arrow_file_reader_read_row_group(GParquetArrowFileReader *reader, + gint row_group_index, + gint *column_indices, + gsize n_column_indices, + GError **error); + +GArrowSchema * +gparquet_arrow_file_reader_get_schema(GParquetArrowFileReader *reader, + GError **error); + +GArrowChunkedArray * +gparquet_arrow_file_reader_read_column_data(GParquetArrowFileReader *reader, + gint i, + GError **error); + +gint +gparquet_arrow_file_reader_get_n_row_groups(GParquetArrowFileReader *reader); + +GARROW_AVAILABLE_IN_6_0 +gint64 
+gparquet_arrow_file_reader_get_n_rows(GParquetArrowFileReader *reader); + +void +gparquet_arrow_file_reader_set_use_threads(GParquetArrowFileReader *reader, + gboolean use_threads); + +G_END_DECLS diff --git a/src/arrow/c_glib/parquet-glib/arrow-file-reader.hpp b/src/arrow/c_glib/parquet-glib/arrow-file-reader.hpp new file mode 100644 index 000000000..172dcccb0 --- /dev/null +++ b/src/arrow/c_glib/parquet-glib/arrow-file-reader.hpp @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <parquet/arrow/reader.h> + +#include <parquet-glib/arrow-file-reader.h> + +GParquetArrowFileReader * +gparquet_arrow_file_reader_new_raw(parquet::arrow::FileReader *parquet_arrow_file_reader); +parquet::arrow::FileReader * +gparquet_arrow_file_reader_get_raw(GParquetArrowFileReader *arrow_file_reader); diff --git a/src/arrow/c_glib/parquet-glib/arrow-file-writer.cpp b/src/arrow/c_glib/parquet-glib/arrow-file-writer.cpp new file mode 100644 index 000000000..c53bb94ce --- /dev/null +++ b/src/arrow/c_glib/parquet-glib/arrow-file-writer.cpp @@ -0,0 +1,579 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <arrow-glib/arrow-glib.hpp> + +#include <parquet-glib/arrow-file-writer.hpp> + +G_BEGIN_DECLS + +/** + * SECTION: arrow-file-writer + * @short_description: Arrow file writer class + * @include: parquet-glib/parquet-glib.h + * + * #GParquetWriterProperties is a class for the writer properties. + * + * #GParquetArrowFileWriter is a class for writing Apache Arrow data to + * a file in Apache Parquet format. 
+ */ + +/* Instance-private data: the last built properties, the builder the setters modify, and a flag telling whether the builder changed since the last build. */ +typedef struct GParquetWriterPropertiesPrivate_ { + std::shared_ptr<parquet::WriterProperties> properties; + parquet::WriterProperties::Builder *builder; + gboolean changed; +} GParquetWriterPropertiesPrivate; + +G_DEFINE_TYPE_WITH_PRIVATE(GParquetWriterProperties, + gparquet_writer_properties, + G_TYPE_OBJECT) + +#define GPARQUET_WRITER_PROPERTIES_GET_PRIVATE(object) \ + static_cast<GParquetWriterPropertiesPrivate *>( \ + gparquet_writer_properties_get_instance_private( \ + GPARQUET_WRITER_PROPERTIES(object))) + +static void +gparquet_writer_properties_finalize(GObject *object) +{ + auto priv = GPARQUET_WRITER_PROPERTIES_GET_PRIVATE(object); + + /* properties was constructed with placement new in init(), so it is destroyed explicitly here. */ + priv->properties.~shared_ptr(); + delete priv->builder; + + G_OBJECT_CLASS(gparquet_writer_properties_parent_class)->finalize(object); +} + +static void +gparquet_writer_properties_init(GParquetWriterProperties *object) +{ + auto priv = GPARQUET_WRITER_PROPERTIES_GET_PRIVATE(object); + /* Construct the shared_ptr in place inside the private struct. */ + new(&priv->properties) std::shared_ptr<parquet::WriterProperties>; + priv->builder = new parquet::WriterProperties::Builder(); + /* Start as changed so the first get_raw() call builds the properties. */ + priv->changed = TRUE; +} + +static void +gparquet_writer_properties_class_init(GParquetWriterPropertiesClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gparquet_writer_properties_finalize; +} + +/** + * gparquet_writer_properties_new: + * + * Returns: A newly created #GParquetWriterProperties. + * + * Since: 0.17.0 + */ +GParquetWriterProperties * +gparquet_writer_properties_new(void) +{ + auto writer_properties = g_object_new(GPARQUET_TYPE_WRITER_PROPERTIES, + NULL); + return GPARQUET_WRITER_PROPERTIES(writer_properties); +} + +/** + * gparquet_writer_properties_set_compression: + * @properties: A #GParquetWriterProperties. + * @compression_type: A #GArrowCompressionType. + * @path: (nullable): The column path as dot string. 
+ * + * Since: 0.17.0 + */ +void +gparquet_writer_properties_set_compression(GParquetWriterProperties *properties, + GArrowCompressionType compression_type, + const gchar *path) +{ + auto arrow_compression_type = garrow_compression_type_to_raw(compression_type); + auto priv = GPARQUET_WRITER_PROPERTIES_GET_PRIVATE(properties); + if (path) { + priv->builder->compression(path, arrow_compression_type); + } else { + /* No path given: use the builder overload that takes no column path. */ + priv->builder->compression(arrow_compression_type); + } + /* Flag the change so gparquet_writer_properties_get_raw() rebuilds the properties. */ + priv->changed = TRUE; +} + +/** + * gparquet_writer_properties_get_compression_path: + * @properties: A #GParquetWriterProperties. + * @path: The column path as dot string. + * + * Returns: The compression type configured for the column at @path. + * + * Since: 0.17.0 + */ +GArrowCompressionType +gparquet_writer_properties_get_compression_path(GParquetWriterProperties *properties, + const gchar *path) +{ + auto parquet_properties = gparquet_writer_properties_get_raw(properties); + auto parquet_column_path = parquet::schema::ColumnPath::FromDotString(path); + auto arrow_compression = parquet_properties->compression(parquet_column_path); + return garrow_compression_type_from_raw(arrow_compression); +} + +/** + * gparquet_writer_properties_enable_dictionary: + * @properties: A #GParquetWriterProperties. + * @path: (nullable): The column path as dot string. + * + * Since: 0.17.0 + */ +void +gparquet_writer_properties_enable_dictionary(GParquetWriterProperties *properties, + const gchar *path) +{ + auto priv = GPARQUET_WRITER_PROPERTIES_GET_PRIVATE(properties); + if (path) { + priv->builder->enable_dictionary(path); + } else { + priv->builder->enable_dictionary(); + } + priv->changed = TRUE; +} + +/** + * gparquet_writer_properties_disable_dictionary: + * @properties: A #GParquetWriterProperties. + * @path: (nullable): The column path as dot string. 
+ * + * Since: 0.17.0 + */ +void +gparquet_writer_properties_disable_dictionary(GParquetWriterProperties *properties, + const gchar *path) +{ + auto priv = GPARQUET_WRITER_PROPERTIES_GET_PRIVATE(properties); + if (path) { + priv->builder->disable_dictionary(path); + } else { + priv->builder->disable_dictionary(); + } + priv->changed = TRUE; +} + +/** + * gparquet_writer_properties_is_dictionary_enabled: + * @properties: A #GParquetWriterProperties. + * @path: The column path as dot string. + * + * Returns: %TRUE if the dictionary is enabled for the column at @path, + * %FALSE otherwise. + * + * Since: 0.17.0 + */ +gboolean +gparquet_writer_properties_is_dictionary_enabled(GParquetWriterProperties *properties, + const gchar *path) +{ + auto parquet_properties = gparquet_writer_properties_get_raw(properties); + auto parquet_column_path = parquet::schema::ColumnPath::FromDotString(path); + return parquet_properties->dictionary_enabled(parquet_column_path); +} + +/** + * gparquet_writer_properties_set_dictionary_page_size_limit: + * @properties: A #GParquetWriterProperties. + * @limit: The dictionary page size limit. + * + * Since: 0.17.0 + */ +void +gparquet_writer_properties_set_dictionary_page_size_limit(GParquetWriterProperties *properties, + gint64 limit) +{ + auto priv = GPARQUET_WRITER_PROPERTIES_GET_PRIVATE(properties); + priv->builder->dictionary_pagesize_limit(limit); + priv->changed = TRUE; +} + +/** + * gparquet_writer_properties_get_dictionary_page_size_limit: + * @properties: A #GParquetWriterProperties. + * + * Returns: The dictionary page size limit. + * + * Since: 0.17.0 + */ +gint64 +gparquet_writer_properties_get_dictionary_page_size_limit(GParquetWriterProperties *properties) +{ + auto parquet_properties = gparquet_writer_properties_get_raw(properties); + return parquet_properties->dictionary_pagesize_limit(); +} + +/** + * gparquet_writer_properties_set_batch_size: + * @properties: A #GParquetWriterProperties. + * @batch_size: The batch size. 
+ * + * Since: 0.17.0 + */ +void +gparquet_writer_properties_set_batch_size(GParquetWriterProperties *properties, + gint64 batch_size) +{ + auto priv = GPARQUET_WRITER_PROPERTIES_GET_PRIVATE(properties); + priv->builder->write_batch_size(batch_size); + priv->changed = TRUE; +} + +/** + * gparquet_writer_properties_get_batch_size: + * @properties: A #GParquetWriterProperties. + * + * Returns: The write batch size. + * + * Since: 0.17.0 + */ +gint64 +gparquet_writer_properties_get_batch_size(GParquetWriterProperties *properties) +{ + auto parquet_properties = gparquet_writer_properties_get_raw(properties); + return parquet_properties->write_batch_size(); +} + +/** + * gparquet_writer_properties_set_max_row_group_length: + * @properties: A #GParquetWriterProperties. + * @length: The maximum row group length. + * + * Since: 0.17.0 + */ +void +gparquet_writer_properties_set_max_row_group_length(GParquetWriterProperties *properties, + gint64 length) +{ + auto priv = GPARQUET_WRITER_PROPERTIES_GET_PRIVATE(properties); + priv->builder->max_row_group_length(length); + priv->changed = TRUE; +} + +/** + * gparquet_writer_properties_get_max_row_group_length: + * @properties: A #GParquetWriterProperties. + * + * Returns: The maximum row group length. + * + * Since: 0.17.0 + */ +gint64 +gparquet_writer_properties_get_max_row_group_length(GParquetWriterProperties *properties) +{ + auto parquet_properties = gparquet_writer_properties_get_raw(properties); + return parquet_properties->max_row_group_length(); +} + +/** + * gparquet_writer_properties_set_data_page_size: + * @properties: A #GParquetWriterProperties. + * @data_page_size: The data page size. 
+ * + * Since: 0.17.0 + */ +void +gparquet_writer_properties_set_data_page_size(GParquetWriterProperties *properties, + gint64 data_page_size) +{ + auto priv = GPARQUET_WRITER_PROPERTIES_GET_PRIVATE(properties); + priv->builder->data_pagesize(data_page_size); + priv->changed = TRUE; +} + +/** + * gparquet_writer_properties_get_data_page_size: + * @properties: A #GParquetWriterProperties. + * + * Returns: The data page size. + * + * Since: 0.17.0 + */ +gint64 +gparquet_writer_properties_get_data_page_size(GParquetWriterProperties *properties) +{ + auto parquet_properties = gparquet_writer_properties_get_raw(properties); + return parquet_properties->data_pagesize(); +} + + +/* Instance-private data: the raw writer wrapped by this object. */ +typedef struct GParquetArrowFileWriterPrivate_ { + parquet::arrow::FileWriter *arrow_file_writer; +} GParquetArrowFileWriterPrivate; + +enum { + PROP_0, + PROP_ARROW_FILE_WRITER +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GParquetArrowFileWriter, + gparquet_arrow_file_writer, + G_TYPE_OBJECT) + +#define GPARQUET_ARROW_FILE_WRITER_GET_PRIVATE(obj) \ + static_cast<GParquetArrowFileWriterPrivate *>( \ + gparquet_arrow_file_writer_get_instance_private( \ + GPARQUET_ARROW_FILE_WRITER(obj))) + +/* GObject finalizer: deletes the wrapped raw writer, then chains up to the parent class. */ +static void +gparquet_arrow_file_writer_finalize(GObject *object) +{ + auto priv = GPARQUET_ARROW_FILE_WRITER_GET_PRIVATE(object); + + delete priv->arrow_file_writer; + + G_OBJECT_CLASS(gparquet_arrow_file_writer_parent_class)->finalize(object); +} + +/* Stores the raw writer pointer passed through the "arrow-file-writer" property. */ +static void +gparquet_arrow_file_writer_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GPARQUET_ARROW_FILE_WRITER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_ARROW_FILE_WRITER: + priv->arrow_file_writer = + static_cast<parquet::arrow::FileWriter *>(g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +/* No property is readable, so every ID reaches the invalid-ID warning. */ +static void +gparquet_arrow_file_writer_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + 
switch (prop_id) { + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gparquet_arrow_file_writer_init(GParquetArrowFileWriter *object) +{ +} + +static void +gparquet_arrow_file_writer_class_init(GParquetArrowFileWriterClass *klass) +{ + GParamSpec *spec; + + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gparquet_arrow_file_writer_finalize; + gobject_class->set_property = gparquet_arrow_file_writer_set_property; + gobject_class->get_property = gparquet_arrow_file_writer_get_property; + + /* NOTE(review): the blurb below says std::shared<...>, but set_property stores a plain parquet::arrow::FileWriter *. */ + spec = g_param_spec_pointer("arrow-file-writer", + "ArrowFileWriter", + "The raw std::shared<parquet::arrow::FileWriter> *", + static_cast<GParamFlags>(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_ARROW_FILE_WRITER, spec); +} + +/** + * gparquet_arrow_file_writer_new_arrow: + * @schema: Arrow schema for written data. + * @sink: Arrow output stream to be written. + * @writer_properties: (nullable): A #GParquetWriterProperties. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GParquetArrowFileWriter. 
+ * + * Since: 0.11.0 + */ +GParquetArrowFileWriter * +gparquet_arrow_file_writer_new_arrow(GArrowSchema *schema, + GArrowOutputStream *sink, + GParquetWriterProperties *writer_properties, + GError **error) +{ + auto arrow_schema = garrow_schema_get_raw(schema).get(); + auto arrow_output_stream = garrow_output_stream_get_raw(sink); + auto arrow_memory_pool = arrow::default_memory_pool(); + std::unique_ptr<parquet::arrow::FileWriter> parquet_arrow_file_writer; + arrow::Status status; + if (writer_properties) { + auto parquet_writer_properties = gparquet_writer_properties_get_raw(writer_properties); + status = parquet::arrow::FileWriter::Open(*arrow_schema, + arrow_memory_pool, + arrow_output_stream, + parquet_writer_properties, + &parquet_arrow_file_writer); + } else { + /* No properties given: fall back to parquet::default_writer_properties(). */ + auto parquet_writer_properties = parquet::default_writer_properties(); + status = parquet::arrow::FileWriter::Open(*arrow_schema, + arrow_memory_pool, + arrow_output_stream, + parquet_writer_properties, + &parquet_arrow_file_writer); + } + if (garrow_error_check(error, + status, + "[parquet][arrow][file-writer][new-arrow]")) { + /* release() hands the raw writer to the wrapper, whose finalizer deletes it. */ + return gparquet_arrow_file_writer_new_raw(parquet_arrow_file_writer.release()); + } else { + return NULL; + } +} + +/** + * gparquet_arrow_file_writer_new_path: + * @schema: Arrow schema for written data. + * @path: Path to be written. + * @writer_properties: (nullable): A #GParquetWriterProperties. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GParquetArrowFileWriter. 
+ * + * Since: 0.11.0 + */ +GParquetArrowFileWriter * +gparquet_arrow_file_writer_new_path(GArrowSchema *schema, + const gchar *path, + GParquetWriterProperties *writer_properties, + GError **error) +{ + /* NOTE(review): the `false` argument is presumably FileOutputStream::Open()'s append flag; confirm against the Arrow version in use. */ + auto arrow_file_output_stream = + arrow::io::FileOutputStream::Open(path, false); + if (!garrow::check(error, + arrow_file_output_stream, + "[parquet][arrow][file-writer][new-path]")) { + return NULL; + } + + auto arrow_schema = garrow_schema_get_raw(schema).get(); + std::shared_ptr<arrow::io::OutputStream> arrow_output_stream = + arrow_file_output_stream.ValueOrDie(); + auto arrow_memory_pool = arrow::default_memory_pool(); + std::unique_ptr<parquet::arrow::FileWriter> parquet_arrow_file_writer; + arrow::Status status; + if (writer_properties) { + auto parquet_writer_properties = gparquet_writer_properties_get_raw(writer_properties); + status = parquet::arrow::FileWriter::Open(*arrow_schema, + arrow_memory_pool, + arrow_output_stream, + parquet_writer_properties, + &parquet_arrow_file_writer); + } else { + /* No properties given: fall back to parquet::default_writer_properties(). */ + auto parquet_writer_properties = parquet::default_writer_properties(); + status = parquet::arrow::FileWriter::Open(*arrow_schema, + arrow_memory_pool, + arrow_output_stream, + parquet_writer_properties, + &parquet_arrow_file_writer); + } + if (garrow::check(error, + status, + "[parquet][arrow][file-writer][new-path]")) { + return gparquet_arrow_file_writer_new_raw(parquet_arrow_file_writer.release()); + } else { + return NULL; + } +} + +/** + * gparquet_arrow_file_writer_write_table: + * @writer: A #GParquetArrowFileWriter. + * @table: A table to be written. + * @chunk_size: The max number of rows in a row group. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
+ * + * Since: 0.11.0 + */ +gboolean +gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer, + GArrowTable *table, + guint64 chunk_size, + GError **error) +{ + auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); + auto arrow_table = garrow_table_get_raw(table).get(); + auto status = parquet_arrow_file_writer->WriteTable(*arrow_table, chunk_size); + return garrow_error_check(error, + status, + "[parquet][arrow][file-writer][write-table]"); +} + +/** + * gparquet_arrow_file_writer_close: + * @writer: A #GParquetArrowFileWriter. + * @error: (nullable): Return locatipcn for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 0.11.0 + */ +gboolean +gparquet_arrow_file_writer_close(GParquetArrowFileWriter *writer, + GError **error) +{ + auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); + auto status = parquet_arrow_file_writer->Close(); + return garrow_error_check(error, + status, + "[parquet][arrow][file-writer][close]"); +} + + +G_END_DECLS + +GParquetArrowFileWriter * +gparquet_arrow_file_writer_new_raw(parquet::arrow::FileWriter *parquet_arrow_file_writer) +{ + auto arrow_file_writer = + GPARQUET_ARROW_FILE_WRITER(g_object_new(GPARQUET_TYPE_ARROW_FILE_WRITER, + "arrow-file-writer", parquet_arrow_file_writer, + NULL)); + return arrow_file_writer; +} + +parquet::arrow::FileWriter * +gparquet_arrow_file_writer_get_raw(GParquetArrowFileWriter *arrow_file_writer) +{ + auto priv = GPARQUET_ARROW_FILE_WRITER_GET_PRIVATE(arrow_file_writer); + return priv->arrow_file_writer; +} + +std::shared_ptr<parquet::WriterProperties> +gparquet_writer_properties_get_raw(GParquetWriterProperties *properties) +{ + auto priv = GPARQUET_WRITER_PROPERTIES_GET_PRIVATE(properties); + if (priv->changed) { + priv->properties = priv->builder->build(); + priv->changed = FALSE; + } + return priv->properties; +} diff --git a/src/arrow/c_glib/parquet-glib/arrow-file-writer.h 
b/src/arrow/c_glib/parquet-glib/arrow-file-writer.h new file mode 100644 index 000000000..67083a074 --- /dev/null +++ b/src/arrow/c_glib/parquet-glib/arrow-file-writer.h @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include <arrow-glib/arrow-glib.h> + +G_BEGIN_DECLS + +#define GPARQUET_TYPE_WRITER_PROPERTIES \ + (gparquet_writer_properties_get_type()) +G_DECLARE_DERIVABLE_TYPE(GParquetWriterProperties, + gparquet_writer_properties, + GPARQUET, + WRITER_PROPERTIES, + GObject) +struct _GParquetWriterPropertiesClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_0_17 +GParquetWriterProperties *gparquet_writer_properties_new(void); +GARROW_AVAILABLE_IN_0_17 +void +gparquet_writer_properties_set_compression(GParquetWriterProperties *properties, + GArrowCompressionType compression_type, + const gchar *path); +GARROW_AVAILABLE_IN_0_17 +GArrowCompressionType +gparquet_writer_properties_get_compression_path(GParquetWriterProperties *properties, + const gchar *path); +GARROW_AVAILABLE_IN_0_17 +void +gparquet_writer_properties_enable_dictionary(GParquetWriterProperties *properties, + const gchar *path); +GARROW_AVAILABLE_IN_0_17 +void +gparquet_writer_properties_disable_dictionary(GParquetWriterProperties *properties, + const gchar *path); +GARROW_AVAILABLE_IN_0_17 +gboolean +gparquet_writer_properties_is_dictionary_enabled(GParquetWriterProperties *properties, + const gchar *path); +GARROW_AVAILABLE_IN_0_17 +void +gparquet_writer_properties_set_dictionary_page_size_limit(GParquetWriterProperties *properties, + gint64 limit); +GARROW_AVAILABLE_IN_0_17 +gint64 +gparquet_writer_properties_get_dictionary_page_size_limit(GParquetWriterProperties *properties); +GARROW_AVAILABLE_IN_0_17 +void +gparquet_writer_properties_set_batch_size(GParquetWriterProperties *properties, + gint64 batch_size); +GARROW_AVAILABLE_IN_0_17 +gint64 +gparquet_writer_properties_get_batch_size(GParquetWriterProperties *properties); +GARROW_AVAILABLE_IN_0_17 +void +gparquet_writer_properties_set_max_row_group_length(GParquetWriterProperties *properties, + gint64 length); +GARROW_AVAILABLE_IN_0_17 +gint64 +gparquet_writer_properties_get_max_row_group_length(GParquetWriterProperties 
*properties); +GARROW_AVAILABLE_IN_0_17 +void +gparquet_writer_properties_set_data_page_size(GParquetWriterProperties *properties, + gint64 data_page_size); +GARROW_AVAILABLE_IN_0_17 +gint64 +gparquet_writer_properties_get_data_page_size(GParquetWriterProperties *properties); + + +#define GPARQUET_TYPE_ARROW_FILE_WRITER (gparquet_arrow_file_writer_get_type()) +G_DECLARE_DERIVABLE_TYPE(GParquetArrowFileWriter, + gparquet_arrow_file_writer, + GPARQUET, + ARROW_FILE_WRITER, + GObject) +struct _GParquetArrowFileWriterClass +{ + GObjectClass parent_class; +}; + +GParquetArrowFileWriter * +gparquet_arrow_file_writer_new_arrow(GArrowSchema *schema, + GArrowOutputStream *sink, + GParquetWriterProperties *writer_properties, + GError **error); +GParquetArrowFileWriter * +gparquet_arrow_file_writer_new_path(GArrowSchema *schema, + const gchar *path, + GParquetWriterProperties *writer_properties, + GError **error); + +gboolean +gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer, + GArrowTable *table, + guint64 chunk_size, + GError **error); + +gboolean +gparquet_arrow_file_writer_close(GParquetArrowFileWriter *writer, + GError **error); + +G_END_DECLS diff --git a/src/arrow/c_glib/parquet-glib/arrow-file-writer.hpp b/src/arrow/c_glib/parquet-glib/arrow-file-writer.hpp new file mode 100644 index 000000000..69fecf1be --- /dev/null +++ b/src/arrow/c_glib/parquet-glib/arrow-file-writer.hpp @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <memory> + +#include <parquet/arrow/writer.h> + +#include <parquet-glib/arrow-file-writer.h> + +GParquetArrowFileWriter * +gparquet_arrow_file_writer_new_raw(parquet::arrow::FileWriter *parquet_arrow_file_writer); +parquet::arrow::FileWriter * +gparquet_arrow_file_writer_get_raw(GParquetArrowFileWriter *arrow_file_writer); +std::shared_ptr<parquet::WriterProperties> +gparquet_writer_properties_get_raw(GParquetWriterProperties *properties); diff --git a/src/arrow/c_glib/parquet-glib/meson.build b/src/arrow/c_glib/parquet-glib/meson.build new file mode 100644 index 000000000..73cd9e45c --- /dev/null +++ b/src/arrow/c_glib/parquet-glib/meson.build @@ -0,0 +1,83 @@ +# -*- indent-tabs-mode: nil -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +project_name = 'parquet-glib' + +sources = files( + 'arrow-file-reader.cpp', + 'arrow-file-writer.cpp', +) + +c_headers = files( + 'arrow-file-reader.h', + 'arrow-file-writer.h', + 'parquet-glib.h', +) + +cpp_headers = files( + 'arrow-file-reader.hpp', + 'arrow-file-writer.hpp', + 'parquet-glib.hpp', +) + +headers = c_headers + cpp_headers +install_headers(headers, subdir: project_name) + + +dependencies = [ + arrow, + parquet, + arrow_glib, +] +libparquet_glib = library('parquet-glib', + sources: sources, + install: true, + dependencies: dependencies, + include_directories: base_include_directories, + soversion: so_version, + version: library_version) +parquet_glib = declare_dependency(link_with: libparquet_glib, + include_directories: base_include_directories, + dependencies: dependencies) + +pkgconfig.generate(libparquet_glib, + filebase: project_name, + name: 'Apache Parquet GLib', + description: 'C API for Apache Parquet based on GLib', + version: version, + requires: ['parquet', 'arrow-glib']) + +if have_gi + gnome.generate_gir(libparquet_glib, + dependencies: declare_dependency(sources: arrow_glib_gir), + sources: sources + c_headers, + namespace: 'Parquet', + nsversion: api_version, + identifier_prefix: 'GParquet', + symbol_prefix: 'gparquet', + export_packages: 'parquet-glib', + includes: [ + 'Arrow-1.0', + ], + install: true, + extra_args: [ + '--warn-all', + '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', + ]) +endif diff --git a/src/arrow/c_glib/parquet-glib/parquet-glib.h b/src/arrow/c_glib/parquet-glib/parquet-glib.h new file mode 100644 index 000000000..6ae0f7e8f --- /dev/null +++ b/src/arrow/c_glib/parquet-glib/parquet-glib.h @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <parquet-glib/arrow-file-reader.h> +#include <parquet-glib/arrow-file-writer.h> diff --git a/src/arrow/c_glib/parquet-glib/parquet-glib.hpp b/src/arrow/c_glib/parquet-glib/parquet-glib.hpp new file mode 100644 index 000000000..988e715a2 --- /dev/null +++ b/src/arrow/c_glib/parquet-glib/parquet-glib.hpp @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <parquet-glib/parquet-glib.h> + +#include <parquet-glib/arrow-file-reader.hpp> +#include <parquet-glib/arrow-file-writer.hpp> |