diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/c_glib/arrow-dataset-glib/file-format.cpp | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/c_glib/arrow-dataset-glib/file-format.cpp')
-rw-r--r-- | src/arrow/c_glib/arrow-dataset-glib/file-format.cpp | 574 |
1 files changed, 574 insertions, 0 deletions
diff --git a/src/arrow/c_glib/arrow-dataset-glib/file-format.cpp b/src/arrow/c_glib/arrow-dataset-glib/file-format.cpp new file mode 100644 index 000000000..c0c92d966 --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/file-format.cpp @@ -0,0 +1,574 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <arrow-glib/error.hpp> +#include <arrow-glib/file-system.hpp> +#include <arrow-glib/output-stream.hpp> +#include <arrow-glib/record-batch.hpp> +#include <arrow-glib/reader.hpp> +#include <arrow-glib/schema.hpp> + +#include <arrow-dataset-glib/file-format.hpp> + +G_BEGIN_DECLS + +/** + * SECTION: file-format + * @section_id: file-format + * @title: File format classes + * @include: arrow-dataset-glib/arrow-dataset-glib.h + * + * #GADatasetFileWriteOptions is a class for options to write a file + * of this format. + * + * #GADatasetFileWriter is a class for writing a file of this format. + * + * #GADatasetFileFormat is a base class for file format classes. + * + * #GADatasetCSVFileFormat is a class for CSV file format. + * + * #GADatasetIPCFileFormat is a class for IPC file format. + * + * #GADatasetParquetFileFormat is a class for Parquet file format. + * + * Since: 3.0.0 + */ + +typedef struct GADatasetFileWriteOptionsPrivate_ { + std::shared_ptr<arrow::dataset::FileWriteOptions> options; +} GADatasetFileWriteOptionsPrivate; + +enum { + PROP_OPTIONS = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileWriteOptions, + gadataset_file_write_options, + G_TYPE_OBJECT) + +#define GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(obj) \ + static_cast<GADatasetFileWriteOptionsPrivate *>( \ + gadataset_file_write_options_get_instance_private( \ + GADATASET_FILE_WRITE_OPTIONS(obj))) + +static void +gadataset_file_write_options_finalize(GObject *object) +{ + auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object); + priv->options.~shared_ptr(); + G_OBJECT_CLASS(gadataset_file_write_options_parent_class)->finalize(object); +} + +static void +gadataset_file_write_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_OPTIONS: + priv->options = + *static_cast<std::shared_ptr<arrow::dataset::FileWriteOptions> *>( + g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_write_options_init(GADatasetFileWriteOptions *object) +{ + auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object); + new(&priv->options) std::shared_ptr<arrow::dataset::FileWriteOptions>; +} + +static void +gadataset_file_write_options_class_init(GADatasetFileWriteOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gadataset_file_write_options_finalize; + gobject_class->set_property = gadataset_file_write_options_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("options", + "Options", + "The raw " + "std::shared<arrow::dataset::FileWriteOptions> *", + static_cast<GParamFlags>(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_OPTIONS, spec); +} + + +typedef struct GADatasetFileWriterPrivate_ { + std::shared_ptr<arrow::dataset::FileWriter> writer; +} GADatasetFileWriterPrivate; + +enum { + PROP_WRITER = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileWriter, + gadataset_file_writer, + G_TYPE_OBJECT) + +#define GADATASET_FILE_WRITER_GET_PRIVATE(obj) \ + static_cast<GADatasetFileWriterPrivate *>( \ + gadataset_file_writer_get_instance_private( \ + GADATASET_FILE_WRITER(obj))) + +static void +gadataset_file_writer_finalize(GObject *object) +{ + auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(object); + priv->writer.~shared_ptr(); + G_OBJECT_CLASS(gadataset_file_writer_parent_class)->finalize(object); +} + +static void +gadataset_file_writer_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_WRITER: + priv->writer = + *static_cast<std::shared_ptr<arrow::dataset::FileWriter> *>( + g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_writer_init(GADatasetFileWriter *object) +{ + auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(object); + new(&(priv->writer)) std::shared_ptr<arrow::dataset::FileWriter>; +} + +static void +gadataset_file_writer_class_init(GADatasetFileWriterClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gadataset_file_writer_finalize; + gobject_class->set_property = gadataset_file_writer_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("writer", + "Writer", + "The raw " + "std::shared<arrow::dataset::FileWriter> *", + static_cast<GParamFlags>(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_WRITER, spec); +} + +/** + * gadataset_file_writer_write_record_batch: + * @writer: A #GADatasetFileWriter. + * @record_batch: A #GArrowRecordBatch to be written. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 6.0.0 + */ +gboolean +gadataset_file_writer_write_record_batch(GADatasetFileWriter *writer, + GArrowRecordBatch *record_batch, + GError **error) +{ + const auto arrow_writer = gadataset_file_writer_get_raw(writer); + const auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); + auto status = arrow_writer->Write(arrow_record_batch); + return garrow::check(error, status, "[file-writer][write-record-batch]"); +} + +/** + * gadataset_file_writer_write_record_batch_reader: + * @writer: A #GADatasetFileWriter. + * @reader: A #GArrowRecordBatchReader to be written. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 6.0.0 + */ +gboolean +gadataset_file_writer_write_record_batch_reader(GADatasetFileWriter *writer, + GArrowRecordBatchReader *reader, + GError **error) +{ + const auto arrow_writer = gadataset_file_writer_get_raw(writer); + auto arrow_reader = garrow_record_batch_reader_get_raw(reader); + auto status = arrow_writer->Write(arrow_reader.get()); + return garrow::check(error, + status, + "[file-writer][write-record-batch-reader]"); +} + +/** + * gadataset_file_writer_finish: + * @writer: A #GADatasetFileWriter. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 6.0.0 + */ +gboolean +gadataset_file_writer_finish(GADatasetFileWriter *writer, + GError **error) +{ + const auto arrow_writer = gadataset_file_writer_get_raw(writer); + auto status = arrow_writer->Finish(); + return garrow::check(error, + status, + "[file-writer][finish]"); +} + + +typedef struct GADatasetFileFormatPrivate_ { + std::shared_ptr<arrow::dataset::FileFormat> format; +} GADatasetFileFormatPrivate; + +enum { + PROP_FORMAT = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileFormat, + gadataset_file_format, + G_TYPE_OBJECT) + +#define GADATASET_FILE_FORMAT_GET_PRIVATE(obj) \ + static_cast<GADatasetFileFormatPrivate *>( \ + gadataset_file_format_get_instance_private( \ + GADATASET_FILE_FORMAT(obj))) + +static void +gadataset_file_format_finalize(GObject *object) +{ + auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object); + priv->format.~shared_ptr(); + G_OBJECT_CLASS(gadataset_file_format_parent_class)->finalize(object); +} + +static void +gadataset_file_format_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FORMAT: + priv->format = + *static_cast<std::shared_ptr<arrow::dataset::FileFormat> *>( + g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_format_init(GADatasetFileFormat *object) +{ + auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object); + new(&priv->format) std::shared_ptr<arrow::dataset::FileFormat>; +} + +static void +gadataset_file_format_class_init(GADatasetFileFormatClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gadataset_file_format_finalize; + gobject_class->set_property = gadataset_file_format_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("format", + "Format", + "The raw std::shared<arrow::dataset::FileFormat> *", + static_cast<GParamFlags>(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_FORMAT, spec); +} + +/** + * gadataset_file_format_get_type_name: + * @format: A #GADatasetFileFormat. + * + * Returns: The type name of @format. + * + * It should be freed with g_free() when no longer needed. + * + * Since: 3.0.0 + */ +gchar * +gadataset_file_format_get_type_name(GADatasetFileFormat *format) +{ + const auto arrow_format = gadataset_file_format_get_raw(format); + const auto &type_name = arrow_format->type_name(); + return g_strndup(type_name.data(), type_name.size()); +} + +/** + * gadataset_file_format_get_default_write_options: + * @format: A #GADatasetFileFormat. + * + * Returns: (transfer full): The default #GADatasetFileWriteOptions of @format. + * + * Since: 6.0.0 + */ +GADatasetFileWriteOptions * +gadataset_file_format_get_default_write_options(GADatasetFileFormat *format) +{ + const auto arrow_format = gadataset_file_format_get_raw(format); + auto arrow_options = arrow_format->DefaultWriteOptions(); + return gadataset_file_write_options_new_raw(&arrow_options); +} + +/** + * gadataset_file_format_open_writer: + * @format: A #GADatasetFileFormat. + * @destination: A #GArrowOutputStream. + * @file_system: The #GArrowFileSystem of @destination. + * @path: The path of @destination. + * @schema: A #GArrowSchema that is used by written record batches. + * @options: A #GADatasetFileWriteOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): The newly created #GADatasetFileWriter of @format + * on success, %NULL on error. + * + * Since: 6.0.0 + */ +GADatasetFileWriter * +gadataset_file_format_open_writer(GADatasetFileFormat *format, + GArrowOutputStream *destination, + GArrowFileSystem *file_system, + const gchar *path, + GArrowSchema *schema, + GADatasetFileWriteOptions *options, + GError **error) +{ + const auto arrow_format = gadataset_file_format_get_raw(format); + auto arrow_destination = garrow_output_stream_get_raw(destination); + auto arrow_file_system = garrow_file_system_get_raw(file_system); + auto arrow_schema = garrow_schema_get_raw(schema); + auto arrow_options = gadataset_file_write_options_get_raw(options); + auto arrow_writer_result = + arrow_format->MakeWriter(arrow_destination, + arrow_schema, + arrow_options, + {arrow_file_system, path}); + if (garrow::check(error, arrow_writer_result, "[file-format][open-writer]")) { + auto arrow_writer = *arrow_writer_result; + return gadataset_file_writer_new_raw(&arrow_writer); + } else { + return NULL; + } +} + +/** + * gadataset_file_format_equal: + * @format: A #GADatasetFileFormat. + * @other_format: A #GADatasetFileFormat to be compared. + * + * Returns: %TRUE if they are the same content file format, %FALSE otherwise. + * + * Since: 3.0.0 + */ +gboolean +gadataset_file_format_equal(GADatasetFileFormat *format, + GADatasetFileFormat *other_format) +{ + const auto arrow_format = gadataset_file_format_get_raw(format); + const auto arrow_other_format = gadataset_file_format_get_raw(other_format); + return arrow_format->Equals(*arrow_other_format); +} + + +G_DEFINE_TYPE(GADatasetCSVFileFormat, + gadataset_csv_file_format, + GADATASET_TYPE_FILE_FORMAT) + +static void +gadataset_csv_file_format_init(GADatasetCSVFileFormat *object) +{ +} + +static void +gadataset_csv_file_format_class_init(GADatasetCSVFileFormatClass *klass) +{ +} + +/** + * gadataset_csv_file_format_new: + * + * Returns: The newly created CSV file format. + * + * Since: 3.0.0 + */ +GADatasetCSVFileFormat * +gadataset_csv_file_format_new(void) +{ + std::shared_ptr<arrow::dataset::FileFormat> arrow_format = + std::make_shared<arrow::dataset::CsvFileFormat>(); + return GADATASET_CSV_FILE_FORMAT(gadataset_file_format_new_raw(&arrow_format)); +} + + +G_DEFINE_TYPE(GADatasetIPCFileFormat, + gadataset_ipc_file_format, + GADATASET_TYPE_FILE_FORMAT) + +static void +gadataset_ipc_file_format_init(GADatasetIPCFileFormat *object) +{ +} + +static void +gadataset_ipc_file_format_class_init(GADatasetIPCFileFormatClass *klass) +{ +} + +/** + * gadataset_ipc_file_format_new: + * + * Returns: The newly created IPC file format. + * + * Since: 3.0.0 + */ +GADatasetIPCFileFormat * +gadataset_ipc_file_format_new(void) +{ + std::shared_ptr<arrow::dataset::FileFormat> arrow_format = + std::make_shared<arrow::dataset::IpcFileFormat>(); + return GADATASET_IPC_FILE_FORMAT(gadataset_file_format_new_raw(&arrow_format)); +} + + +G_DEFINE_TYPE(GADatasetParquetFileFormat, + gadataset_parquet_file_format, + GADATASET_TYPE_FILE_FORMAT) + +static void +gadataset_parquet_file_format_init(GADatasetParquetFileFormat *object) +{ +} + +static void +gadataset_parquet_file_format_class_init(GADatasetParquetFileFormatClass *klass) +{ +} + +/** + * gadataset_parquet_file_format_new: + * + * Returns: The newly created Parquet file format. + * + * Since: 3.0.0 + */ +GADatasetParquetFileFormat * +gadataset_parquet_file_format_new(void) +{ + std::shared_ptr<arrow::dataset::FileFormat> arrow_format = + std::make_shared<arrow::dataset::ParquetFileFormat>(); + return GADATASET_PARQUET_FILE_FORMAT( + gadataset_file_format_new_raw(&arrow_format)); +} + + +G_END_DECLS + +GADatasetFileWriteOptions * +gadataset_file_write_options_new_raw( + std::shared_ptr<arrow::dataset::FileWriteOptions> *arrow_options) +{ + return GADATASET_FILE_WRITE_OPTIONS( + g_object_new(GADATASET_TYPE_FILE_WRITE_OPTIONS, + "options", arrow_options, + NULL)); +} + +std::shared_ptr<arrow::dataset::FileWriteOptions> +gadataset_file_write_options_get_raw(GADatasetFileWriteOptions *options) +{ + auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(options); + return priv->options; +} + + +GADatasetFileWriter * +gadataset_file_writer_new_raw( + std::shared_ptr<arrow::dataset::FileWriter> *arrow_writer) +{ + return GADATASET_FILE_WRITER(g_object_new(GADATASET_TYPE_FILE_WRITER, + "writer", arrow_writer, + NULL)); +} + +std::shared_ptr<arrow::dataset::FileWriter> +gadataset_file_writer_get_raw(GADatasetFileWriter *writer) +{ + auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(writer); + return priv->writer; +} + + +GADatasetFileFormat * +gadataset_file_format_new_raw( + std::shared_ptr<arrow::dataset::FileFormat> *arrow_format) +{ + GType type = GADATASET_TYPE_FILE_FORMAT; + const auto &type_name = (*arrow_format)->type_name(); + if (type_name == "csv") { + type = GADATASET_TYPE_CSV_FILE_FORMAT; + } else if (type_name == "ipc") { + type = GADATASET_TYPE_IPC_FILE_FORMAT; + } else if (type_name == "parquet") { + type = GADATASET_TYPE_PARQUET_FILE_FORMAT; + } + return GADATASET_FILE_FORMAT(g_object_new(type, + "format", arrow_format, + NULL)); +} + +std::shared_ptr<arrow::dataset::FileFormat> +gadataset_file_format_get_raw(GADatasetFileFormat *format) +{ + auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(format); + return priv->format; +} |