diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/c_glib/arrow-dataset-glib | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2. (tag: upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/c_glib/arrow-dataset-glib')
23 files changed, 3893 insertions, 0 deletions
diff --git a/src/arrow/c_glib/arrow-dataset-glib/arrow-dataset-glib.h b/src/arrow/c_glib/arrow-dataset-glib/arrow-dataset-glib.h new file mode 100644 index 000000000..58f4e216c --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/arrow-dataset-glib.h @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <arrow-glib/arrow-glib.h> + +#include <arrow-dataset-glib/dataset-factory.h> +#include <arrow-dataset-glib/dataset.h> +#include <arrow-dataset-glib/enums.h> +#include <arrow-dataset-glib/file-format.h> +#include <arrow-dataset-glib/fragment.h> +#include <arrow-dataset-glib/partitioning.h> +#include <arrow-dataset-glib/scanner.h> diff --git a/src/arrow/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp b/src/arrow/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp new file mode 100644 index 000000000..8e9965068 --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <arrow-glib/arrow-glib.hpp> + +#include <arrow-dataset-glib/dataset-factory.hpp> +#include <arrow-dataset-glib/dataset.hpp> +#include <arrow-dataset-glib/file-format.hpp> +#include <arrow-dataset-glib/fragment.hpp> +#include <arrow-dataset-glib/partitioning.hpp> +#include <arrow-dataset-glib/scanner.hpp> diff --git a/src/arrow/c_glib/arrow-dataset-glib/dataset-factory.cpp b/src/arrow/c_glib/arrow-dataset-glib/dataset-factory.cpp new file mode 100644 index 000000000..1e532760a --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/dataset-factory.cpp @@ -0,0 +1,552 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <arrow-glib/error.hpp> +#include <arrow-glib/file-system.hpp> + +#include <arrow-dataset-glib/dataset-factory.hpp> +#include <arrow-dataset-glib/dataset.hpp> +#include <arrow-dataset-glib/file-format.hpp> +#include <arrow-dataset-glib/partitioning.hpp> + +G_BEGIN_DECLS + +/** + * SECTION: dataset-factory + * @section_id: dataset-factory + * @title: Dataset factory related classes + * @include: arrow-dataset-glib/arrow-dataset-glib.h + * + * #GADatasetDatasetFactory is a base class for dataset factories. + * + * #GADatasetFileSystemDatasetFactory is a class for + * #GADatasetFileSystemDataset factory. + * + * Since: 5.0.0 + */ + +typedef struct GADatasetDatasetFactoryPrivate_ { + std::shared_ptr<arrow::dataset::DatasetFactory> factory; +} GADatasetDatasetFactoryPrivate; + +enum { + PROP_DATASET_FACTORY = 1, +}; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GADatasetDatasetFactory, + gadataset_dataset_factory, + G_TYPE_OBJECT) + +#define GADATASET_DATASET_FACTORY_GET_PRIVATE(obj) \ + static_cast<GADatasetDatasetFactoryPrivate *>( \ + gadataset_dataset_factory_get_instance_private( \ + GADATASET_DATASET_FACTORY(obj))) + +static void +gadataset_dataset_factory_finalize(GObject *object) +{ + auto priv = GADATASET_DATASET_FACTORY_GET_PRIVATE(object); + priv->factory.~shared_ptr(); + G_OBJECT_CLASS(gadataset_dataset_factory_parent_class)->finalize(object); +} + +static void +gadataset_dataset_factory_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_DATASET_FACTORY_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DATASET_FACTORY: + { + auto arrow_factory_pointer = + static_cast<std::shared_ptr<arrow::dataset::DatasetFactory> *>( + g_value_get_pointer(value)); + if (arrow_factory_pointer) { + priv->factory = *arrow_factory_pointer; + } + } + break; + default: + 
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_dataset_factory_init(GADatasetDatasetFactory *object) +{ + auto priv = GADATASET_DATASET_FACTORY_GET_PRIVATE(object); + new(&priv->factory) std::shared_ptr<arrow::dataset::DatasetFactory>; +} + +static void +gadataset_dataset_factory_class_init(GADatasetDatasetFactoryClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = gadataset_dataset_factory_finalize; + gobject_class->set_property = gadataset_dataset_factory_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("dataset-factory", + "Dataset factory", + "The raw " + "std::shared<arrow::dataset::DatasetFactory> *", + static_cast<GParamFlags>(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DATASET_FACTORY, spec); +} + +/** + * gadataset_dataset_factory_finish: + * @factory: A #GADatasetDatasetFactory. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A newly created #GADatasetDataset on success, %NULL on error. 
+ * + * Since: 5.0.0 + */ +GADatasetDataset * +gadataset_dataset_factory_finish(GADatasetDatasetFactory *factory, + GError **error) +{ + auto arrow_factory = gadataset_dataset_factory_get_raw(factory); + auto arrow_dataset_result = arrow_factory->Finish(); + if (garrow::check(error, arrow_dataset_result, "[dataset-factory][finish]")) { + auto arrow_dataset = *arrow_dataset_result; + return gadataset_dataset_new_raw(&arrow_dataset); + } else { + return NULL; + } +} + + +typedef struct GADatasetFileSystemDatasetFactoryPrivate_ { + GADatasetFileFormat *format; + GArrowFileSystem *file_system; + GADatasetPartitioning *partitioning; + GList *files; + arrow::dataset::FileSystemFactoryOptions options; +} GADatasetFileSystemDatasetFactoryPrivate; + +enum { + PROP_FORMAT = 1, + PROP_FILE_SYSTEM, + PROP_PARTITIONING, + PROP_PARTITION_BASE_DIR, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileSystemDatasetFactory, + gadataset_file_system_dataset_factory, + GADATASET_TYPE_DATASET_FACTORY) + +#define GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(obj) \ + static_cast<GADatasetFileSystemDatasetFactoryPrivate *>( \ + gadataset_file_system_dataset_factory_get_instance_private( \ + GADATASET_FILE_SYSTEM_DATASET_FACTORY(obj))) + +static void +gadataset_file_system_dataset_factory_dispose(GObject *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + + if (priv->format) { + g_object_unref(priv->format); + priv->format = NULL; + } + + if (priv->file_system) { + g_object_unref(priv->file_system); + priv->file_system = NULL; + } + + if (priv->partitioning) { + g_object_unref(priv->partitioning); + priv->partitioning = NULL; + } + + if (priv->files) { + g_list_free_full(priv->files, g_object_unref); + priv->files = NULL; + } + + G_OBJECT_CLASS( + gadataset_file_system_dataset_factory_parent_class)->dispose(object); +} + +static void +gadataset_file_system_dataset_factory_finalize(GObject *object) +{ + auto priv = 
GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + priv->options.~FileSystemFactoryOptions(); + G_OBJECT_CLASS( + gadataset_file_system_dataset_factory_parent_class)->finalize(object); +} + +/* + * GObject set_property handler for GADatasetFileSystemDatasetFactory. + * Keeps the GLib wrappers stored in priv in sync with the raw values + * stored in priv->options. + */ +static void +gadataset_file_system_dataset_factory_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FORMAT: + priv->format = GADATASET_FILE_FORMAT(g_value_dup_object(value)); + break; + case PROP_PARTITIONING: + { + auto partitioning = g_value_get_object(value); + if (partitioning == priv->partitioning) { + break; + } + auto old_partitioning = priv->partitioning; + if (partitioning) { + g_object_ref(partitioning); + priv->partitioning = GADATASET_PARTITIONING(partitioning); + priv->options.partitioning = + gadataset_partitioning_get_raw(priv->partitioning); + } else { + /* Forget the old wrapper: it is unreffed just below, so keeping + * the pointer would leave it dangling for get_property(), + * finish() and dispose(). */ + priv->partitioning = NULL; + priv->options.partitioning = arrow::dataset::Partitioning::Default(); + } + if (old_partitioning) { + g_object_unref(old_partitioning); + } + } + break; + case PROP_PARTITION_BASE_DIR: + { + /* The property default is NULL and a NULL const char * must not + * be assigned to std::string; treat NULL as an empty string. */ + auto partition_base_dir = g_value_get_string(value); + priv->options.partition_base_dir = + partition_base_dir ? partition_base_dir : ""; + } + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_factory_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FORMAT: + g_value_set_object(value, priv->format); + break; + case PROP_FILE_SYSTEM: + g_value_set_object(value, priv->file_system); + break; + case PROP_PARTITIONING: + g_value_set_object(value, priv->partitioning); + break; + case PROP_PARTITION_BASE_DIR: + g_value_set_string(value, priv->options.partition_base_dir.c_str()); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void 
+gadataset_file_system_dataset_factory_init( + GADatasetFileSystemDatasetFactory *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + new(&priv->options) arrow::dataset::FileSystemFactoryOptions; +} + +static void +gadataset_file_system_dataset_factory_class_init( + GADatasetFileSystemDatasetFactoryClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = gadataset_file_system_dataset_factory_dispose; + gobject_class->finalize = gadataset_file_system_dataset_factory_finalize; + gobject_class->set_property = gadataset_file_system_dataset_factory_set_property; + gobject_class->get_property = gadataset_file_system_dataset_factory_get_property; + + GParamSpec *spec; + /** + * GADatasetFileSystemDatasetFactory:format: + * + * Format passed to #GADatasetFileSystemDataset. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("format", + "Format", + "Format passed to GADatasetFileSystemDataset", + GADATASET_TYPE_FILE_FORMAT, + static_cast<GParamFlags>(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_FORMAT, spec); + + /** + * GADatasetFileSystemDatasetFactory:file-system: + * + * File system passed to #GADatasetFileSystemDataset. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("file-system", + "File system", + "File system passed to GADatasetFileSystemDataset", + GARROW_TYPE_FILE_SYSTEM, + static_cast<GParamFlags>(G_PARAM_READABLE)); + g_object_class_install_property(gobject_class, PROP_FILE_SYSTEM, spec); + + /** + * GADatasetFileSystemDatasetFactory:partitioning: + * + * Partitioning used by #GADatasetFileSystemDataset. 
+ * + * Since: 6.0.0 + */ + spec = g_param_spec_object("partitioning", + "Partitioning", + "Partitioning used by GADatasetFileSystemDataset", + GADATASET_TYPE_PARTITIONING, + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_PARTITIONING, spec); + + /** + * GADatasetFileSystemDatasetFactory:partition-base-dir: + * + * Partition base directory used by #GADatasetFileSystemDataset. + * + * Since: 6.0.0 + */ + spec = g_param_spec_string("partition-base-dir", + "Partition base directory", + "Partition base directory " + "used by GADatasetFileSystemDataset", + NULL, + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_PARTITION_BASE_DIR, spec); +} + +/** + * gadataset_file_system_dataset_factory_new: + * @format: A #GADatasetFileFormat. + * + * Returns: A newly created #GADatasetFileSystemDatasetFactory on success, + * %NULL on error. + * + * Since: 5.0.0 + */ +GADatasetFileSystemDatasetFactory * +gadataset_file_system_dataset_factory_new(GADatasetFileFormat *format) +{ + return GADATASET_FILE_SYSTEM_DATASET_FACTORY( + g_object_new(GADATASET_TYPE_FILE_SYSTEM_DATASET_FACTORY, + "format", format, + NULL)); +} + +/** + * gadataset_file_system_dataset_factory_set_file_system: + * @factory: A #GADatasetFileSystemDatasetFactory. + * @file_system: A #GArrowFileSystem. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE otherwise. 
+ * + * Since: 5.0.0 + */ +gboolean +gadataset_file_system_dataset_factory_set_file_system( + GADatasetFileSystemDatasetFactory *factory, + GArrowFileSystem *file_system, + GError **error) +{ + const gchar *context = "[file-system-dataset-factory][set-file-system]"; + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(factory); + if (priv->file_system) { + garrow::check(error, + arrow::Status::Invalid("file system is already set"), + context); + return FALSE; + } + priv->file_system = file_system; + g_object_ref(priv->file_system); + return TRUE; +} + +/** + * gadataset_file_system_dataset_factory_set_file_system_uri: + * @factory: A #GADatasetFileSystemDatasetFactory. + * @uri: An URI for file system. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE otherwise. + * + * Since: 5.0.0 + */ +gboolean +gadataset_file_system_dataset_factory_set_file_system_uri( + GADatasetFileSystemDatasetFactory *factory, + const gchar *uri, + GError **error) +{ + const gchar *context = "[file-system-dataset-factory][set-file-system-uri]"; + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(factory); + if (priv->file_system) { + garrow::check(error, + arrow::Status::Invalid("file system is already set"), + context); + return FALSE; + } + std::string internal_path; + auto arrow_file_system_result = + arrow::fs::FileSystemFromUri(uri, &internal_path); + if (!garrow::check(error, arrow_file_system_result, context)) { + return FALSE; + } + auto arrow_file_system = *arrow_file_system_result; + auto arrow_file_info_result = arrow_file_system->GetFileInfo(internal_path); + if (!garrow::check(error, arrow_file_info_result, context)) { + return FALSE; + } + priv->file_system = garrow_file_system_new_raw(&arrow_file_system); + auto file_info = garrow_file_info_new_raw(*arrow_file_info_result); + priv->files = g_list_prepend(priv->files, file_info); + return TRUE; +} + +/** + * 
gadataset_file_system_dataset_factory_add_path: + * @factory: A #GADatasetFileSystemDatasetFactory. + * @path: A path to be added. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE otherwise. + * + * Since: 5.0.0 + */ +gboolean +gadataset_file_system_dataset_factory_add_path( + GADatasetFileSystemDatasetFactory *factory, + const gchar *path, + GError **error) +{ + const gchar *context = "[file-system-dataset-factory][add-path]"; + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(factory); + if (!priv->file_system) { + garrow::check(error, + arrow::Status::Invalid("file system isn't set"), + context); + return FALSE; + } + auto arrow_file_system = garrow_file_system_get_raw(priv->file_system); + auto arrow_file_info_result = arrow_file_system->GetFileInfo(path); + if (!garrow::check(error, arrow_file_info_result, context)) { + return FALSE; + } + auto file_info = garrow_file_info_new_raw(*arrow_file_info_result); + priv->files = g_list_prepend(priv->files, file_info); + return TRUE; +} + +/** + * gadataset_file_system_dataset_factory_finish: + * @factory: A #GADatasetFileSystemDatasetFactory. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A newly created #GADatasetFileSystemDataset on success, %NULL on error. 
+ * + * Since: 5.0.0 + */ +GADatasetFileSystemDataset * +gadataset_file_system_dataset_factory_finish( + GADatasetFileSystemDatasetFactory *factory, + GError **error) +{ + const gchar *context = "[file-system-dataset-factory][finish]"; + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(factory); + if (!priv->file_system) { + garrow::check(error, + arrow::Status::Invalid("file system isn't set"), + context); + return NULL; + } + auto arrow_file_system = garrow_file_system_get_raw(priv->file_system); + auto arrow_format = gadataset_file_format_get_raw(priv->format); + arrow::Result<std::shared_ptr<arrow::dataset::DatasetFactory>> + arrow_factory_result; + if (priv->files && + !priv->files->next && + garrow_file_info_is_dir(GARROW_FILE_INFO(priv->files->data))) { + auto file = GARROW_FILE_INFO(priv->files->data); + arrow::fs::FileSelector arrow_selector; + arrow_selector.base_dir = garrow_file_info_get_raw(file)->path(); + arrow_selector.recursive = true; + arrow_factory_result = + arrow::dataset::FileSystemDatasetFactory::Make(arrow_file_system, + arrow_selector, + arrow_format, + priv->options); + } else { + std::vector<arrow::fs::FileInfo> arrow_files; + priv->files = g_list_reverse(priv->files); + for (auto node = priv->files; node; node = node->next) { + auto file = GARROW_FILE_INFO(node->data); + arrow_files.push_back(*garrow_file_info_get_raw(file)); + } + priv->files = g_list_reverse(priv->files); + arrow_factory_result = + arrow::dataset::FileSystemDatasetFactory::Make(arrow_file_system, + arrow_files, + arrow_format, + priv->options); + } + if (!garrow::check(error, arrow_factory_result, context)) { + return NULL; + } + auto arrow_dataset_result = (*arrow_factory_result)->Finish(); + if (!garrow::check(error, arrow_dataset_result, context)) { + return NULL; + } + auto arrow_dataset = *arrow_dataset_result; + return GADATASET_FILE_SYSTEM_DATASET( + gadataset_dataset_new_raw(&arrow_dataset, + "dataset", &arrow_dataset, + "file-system", 
priv->file_system, + "format", priv->format, + "partitioning", priv->partitioning, + NULL)); +} + + +G_END_DECLS + +std::shared_ptr<arrow::dataset::DatasetFactory> +gadataset_dataset_factory_get_raw(GADatasetDatasetFactory *factory) +{ + auto priv = GADATASET_DATASET_FACTORY_GET_PRIVATE(factory); + return priv->factory; +} diff --git a/src/arrow/c_glib/arrow-dataset-glib/dataset-factory.h b/src/arrow/c_glib/arrow-dataset-glib/dataset-factory.h new file mode 100644 index 000000000..e2ee3ed98 --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/dataset-factory.h @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include <arrow-dataset-glib/dataset.h> + +G_BEGIN_DECLS + +#define GADATASET_TYPE_DATASET_FACTORY (gadataset_dataset_factory_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetDatasetFactory, + gadataset_dataset_factory, + GADATASET, + DATASET_FACTORY, + GObject) +struct _GADatasetDatasetFactoryClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GADatasetDataset * +gadataset_dataset_factory_finish(GADatasetDatasetFactory *factory, + GError **error); + + +#define GADATASET_TYPE_FILE_SYSTEM_DATASET_FACTORY \ + (gadataset_file_system_dataset_factory_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFileSystemDatasetFactory, + gadataset_file_system_dataset_factory, + GADATASET, + FILE_SYSTEM_DATASET_FACTORY, + GADatasetDatasetFactory) +struct _GADatasetFileSystemDatasetFactoryClass +{ + GADatasetDatasetFactoryClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GADatasetFileSystemDatasetFactory * +gadataset_file_system_dataset_factory_new(GADatasetFileFormat *file_format); +GARROW_AVAILABLE_IN_5_0 +gboolean +gadataset_file_system_dataset_factory_set_file_system( + GADatasetFileSystemDatasetFactory *factory, + GArrowFileSystem *file_system, + GError **error); +gboolean +gadataset_file_system_dataset_factory_set_file_system_uri( + GADatasetFileSystemDatasetFactory *factory, + const gchar *uri, + GError **error); + +GARROW_AVAILABLE_IN_5_0 +gboolean +gadataset_file_system_dataset_factory_add_path( + GADatasetFileSystemDatasetFactory *factory, + const gchar *path, + GError **error); +/* +GARROW_AVAILABLE_IN_5_0 +gboolean +gadataset_file_system_dataset_factory_add_file( + GADatasetFileSystemDatasetFactory *factory, + GArrowFileInfo *file, + GError **error); +GARROW_AVAILABLE_IN_5_0 +gboolean +gadataset_file_system_dataset_factory_add_selector( + GADatasetFileSystemDatasetFactory *factory, + GArrorFileSelector *selector, + GError **error); +*/ + +GARROW_AVAILABLE_IN_5_0 +GADatasetFileSystemDataset * 
+gadataset_file_system_dataset_factory_finish( + GADatasetFileSystemDatasetFactory *factory, + GError **error); + + +G_END_DECLS diff --git a/src/arrow/c_glib/arrow-dataset-glib/dataset-factory.hpp b/src/arrow/c_glib/arrow-dataset-glib/dataset-factory.hpp new file mode 100644 index 000000000..114db35bc --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/dataset-factory.hpp @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <arrow/dataset/api.h> + +#include <arrow-dataset-glib/dataset-factory.h> + +std::shared_ptr<arrow::dataset::DatasetFactory> +gadataset_dataset_factory_get_raw(GADatasetDatasetFactory *factory); diff --git a/src/arrow/c_glib/arrow-dataset-glib/dataset.cpp b/src/arrow/c_glib/arrow-dataset-glib/dataset.cpp new file mode 100644 index 000000000..8613bedad --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/dataset.cpp @@ -0,0 +1,736 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <arrow-glib/error.hpp> +#include <arrow-glib/file-system.hpp> +#include <arrow-glib/table.hpp> + +#include <arrow-dataset-glib/dataset-factory.hpp> +#include <arrow-dataset-glib/dataset.hpp> +#include <arrow-dataset-glib/file-format.hpp> +#include <arrow-dataset-glib/partitioning.hpp> +#include <arrow-dataset-glib/scanner.hpp> + +G_BEGIN_DECLS + +/** + * SECTION: dataset + * @section_id: dataset + * @title: Dataset related classes + * @include: arrow-dataset-glib/arrow-dataset-glib.h + * + * #GADatasetDataset is a base class for datasets. + * + * #GADatasetFileSystemDataset is a class for file system dataset. + * + * #GADatasetFileSystemDatasetWriteOptions is a class for options to + * write a dataset to file system dataset. 
+ * + * Since: 5.0.0 + */ + +typedef struct GADatasetDatasetPrivate_ { + std::shared_ptr<arrow::dataset::Dataset> dataset; +} GADatasetDatasetPrivate; + +enum { + PROP_DATASET = 1, +}; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GADatasetDataset, + gadataset_dataset, + G_TYPE_OBJECT) + +#define GADATASET_DATASET_GET_PRIVATE(obj) \ + static_cast<GADatasetDatasetPrivate *>( \ + gadataset_dataset_get_instance_private( \ + GADATASET_DATASET(obj))) + +static void +gadataset_dataset_finalize(GObject *object) +{ + auto priv = GADATASET_DATASET_GET_PRIVATE(object); + priv->dataset.~shared_ptr(); + G_OBJECT_CLASS(gadataset_dataset_parent_class)->finalize(object); +} + +static void +gadataset_dataset_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_DATASET_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DATASET: + priv->dataset = + *static_cast<std::shared_ptr<arrow::dataset::Dataset> *>( + g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_dataset_init(GADatasetDataset *object) +{ + auto priv = GADATASET_DATASET_GET_PRIVATE(object); + new(&priv->dataset) std::shared_ptr<arrow::dataset::Dataset>; +} + +static void +gadataset_dataset_class_init(GADatasetDatasetClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = gadataset_dataset_finalize; + gobject_class->set_property = gadataset_dataset_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("dataset", + "Dataset", + "The raw " + "std::shared<arrow::dataset::Dataset> *", + static_cast<GParamFlags>(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DATASET, spec); +} + +/** + * gadataset_dataset_begin_scan: + * @dataset: A #GADatasetDataset. + * @error: (nullable): Return location for a #GError or %NULL. 
+ * + * Returns: (transfer full) (nullable): + * A newly created #GADatasetScannerBuilder on success, %NULL on error. + * + * Since: 5.0.0 + */ +GADatasetScannerBuilder * +gadataset_dataset_begin_scan(GADatasetDataset *dataset, + GError **error) +{ + return gadataset_scanner_builder_new(dataset, error); +} + +/** + * gadataset_dataset_to_table: + * @dataset: A #GADatasetDataset. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A loaded #GArrowTable on success, %NULL on error. + * + * Since: 5.0.0 + */ +GArrowTable * +gadataset_dataset_to_table(GADatasetDataset *dataset, + GError **error) +{ + auto arrow_dataset = gadataset_dataset_get_raw(dataset); + auto arrow_scanner_builder_result = arrow_dataset->NewScan(); + if (!garrow::check(error, + arrow_scanner_builder_result, + "[dataset][to-table]")) { + return NULL; + } + auto arrow_scanner_builder = *arrow_scanner_builder_result; + auto arrow_scanner_result = arrow_scanner_builder->Finish(); + if (!garrow::check(error, + arrow_scanner_result, + "[dataset][to-table]")) { + return NULL; + } + auto arrow_scanner = *arrow_scanner_result; + auto arrow_table_result = arrow_scanner->ToTable(); + if (!garrow::check(error, + arrow_scanner_result, + "[dataset][to-table]")) { + return NULL; + } + return garrow_table_new_raw(&(*arrow_table_result)); +} + +/** + * gadataset_dataset_get_type_name: + * @dataset: A #GADatasetDataset. + * + * Returns: The type name of @dataset. + * + * It should be freed with g_free() when no longer needed. 
+ * + * Since: 5.0.0 + */ +gchar * +gadataset_dataset_get_type_name(GADatasetDataset *dataset) +{ + const auto arrow_dataset = gadataset_dataset_get_raw(dataset); + const auto &type_name = arrow_dataset->type_name(); + return g_strndup(type_name.data(), type_name.size()); +} + + +typedef struct GADatasetFileSystemDatasetWriteOptionsPrivate_ { + arrow::dataset::FileSystemDatasetWriteOptions options; + GADatasetFileWriteOptions *file_write_options; + GArrowFileSystem *file_system; + GADatasetPartitioning *partitioning; +} GADatasetFileSystemDatasetWriteOptionsPrivate; + +enum { + PROP_FILE_WRITE_OPTIONS = 1, + PROP_FILE_SYSTEM, + PROP_BASE_DIR, + PROP_PARTITIONING, + PROP_MAX_PARTITIONS, + PROP_BASE_NAME_TEMPLATE, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileSystemDatasetWriteOptions, + gadataset_file_system_dataset_write_options, + G_TYPE_OBJECT) + +#define GADATASET_FILE_SYSTEM_DATASET_WRITE_OPTIONS_GET_PRIVATE(obj) \ + static_cast<GADatasetFileSystemDatasetWriteOptionsPrivate *>( \ + gadataset_file_system_dataset_write_options_get_instance_private( \ + GADATASET_FILE_SYSTEM_DATASET_WRITE_OPTIONS(obj))) + +static void +gadataset_file_system_dataset_write_options_finalize(GObject *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_WRITE_OPTIONS_GET_PRIVATE(object); + priv->options.~FileSystemDatasetWriteOptions(); + G_OBJECT_CLASS(gadataset_file_system_dataset_write_options_parent_class)-> + finalize(object); +} + +static void +gadataset_file_system_dataset_write_options_dispose(GObject *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_WRITE_OPTIONS_GET_PRIVATE(object); + + if (priv->file_write_options) { + g_object_unref(priv->file_write_options); + priv->file_write_options = NULL; + } + + if (priv->file_system) { + g_object_unref(priv->file_system); + priv->file_system = NULL; + } + + if (priv->partitioning) { + g_object_unref(priv->partitioning); + priv->partitioning = NULL; + } + + 
G_OBJECT_CLASS(gadataset_file_system_dataset_write_options_parent_class)-> + dispose(object); +} + +static void +gadataset_file_system_dataset_write_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_WRITE_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FILE_WRITE_OPTIONS: + { + auto file_write_options = g_value_get_object(value); + if (file_write_options == priv->file_write_options) { + break; + } + auto old_file_write_options = priv->file_write_options; + if (file_write_options) { + g_object_ref(file_write_options); + priv->file_write_options = + GADATASET_FILE_WRITE_OPTIONS(file_write_options); + priv->options.file_write_options = + gadataset_file_write_options_get_raw(priv->file_write_options); + } else { + priv->options.file_write_options = nullptr; + } + if (old_file_write_options) { + g_object_unref(old_file_write_options); + } + } + break; + case PROP_FILE_SYSTEM: + { + auto file_system = g_value_get_object(value); + if (file_system == priv->file_system) { + break; + } + auto old_file_system = priv->file_system; + if (file_system) { + g_object_ref(file_system); + priv->file_system = GARROW_FILE_SYSTEM(file_system); + priv->options.filesystem = garrow_file_system_get_raw(priv->file_system); + } else { + priv->options.filesystem = nullptr; + } + if (old_file_system) { + g_object_unref(old_file_system); + } + } + break; + case PROP_BASE_DIR: + priv->options.base_dir = g_value_get_string(value); + break; + case PROP_PARTITIONING: + { + auto partitioning = g_value_get_object(value); + if (partitioning == priv->partitioning) { + break; + } + auto old_partitioning = priv->partitioning; + if (partitioning) { + g_object_ref(partitioning); + priv->partitioning = GADATASET_PARTITIONING(partitioning); + priv->options.partitioning = + gadataset_partitioning_get_raw(priv->partitioning); + } else { + priv->options.partitioning = 
arrow::dataset::Partitioning::Default(); + } + if (old_partitioning) { + g_object_unref(old_partitioning); + } + } + break; + case PROP_MAX_PARTITIONS: + priv->options.max_partitions = g_value_get_uint(value); + break; + case PROP_BASE_NAME_TEMPLATE: + priv->options.basename_template = g_value_get_string(value); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_write_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_WRITE_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FILE_WRITE_OPTIONS: + g_value_set_object(value, priv->file_write_options); + break; + case PROP_FILE_SYSTEM: + g_value_set_object(value, priv->file_system); + break; + case PROP_BASE_DIR: + g_value_set_string(value, priv->options.base_dir.c_str()); + break; + case PROP_PARTITIONING: + g_value_set_object(value, priv->partitioning); + break; + case PROP_MAX_PARTITIONS: + g_value_set_uint(value, priv->options.max_partitions); + break; + case PROP_BASE_NAME_TEMPLATE: + g_value_set_string(value, priv->options.basename_template.c_str()); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_write_options_init( + GADatasetFileSystemDatasetWriteOptions *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_WRITE_OPTIONS_GET_PRIVATE(object); + new(&(priv->options)) arrow::dataset::FileSystemDatasetWriteOptions; + priv->options.partitioning = arrow::dataset::Partitioning::Default(); +} + +static void +gadataset_file_system_dataset_write_options_class_init( + GADatasetFileSystemDatasetWriteOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = + gadataset_file_system_dataset_write_options_finalize; + gobject_class->dispose = + gadataset_file_system_dataset_write_options_dispose; + 
gobject_class->set_property = + gadataset_file_system_dataset_write_options_set_property; + gobject_class->get_property = + gadataset_file_system_dataset_write_options_get_property; + + arrow::dataset::FileSystemDatasetWriteOptions default_options; + GParamSpec *spec; + /** + * GADatasetFileSystemDatasetWriteOptions:file_write_options: + * + * Options for individual fragment writing. + * + * Since: 6.0.0 + */ + spec = g_param_spec_object("file-write-options", + "File write options", + "Options for individual fragment writing", + GADATASET_TYPE_FILE_WRITE_OPTIONS, + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_FILE_WRITE_OPTIONS, spec); + + /** + * GADatasetFileSystemDatasetWriteOptions:file_system: + * + * #GArrowFileSystem into which a dataset will be written. + * + * Since: 6.0.0 + */ + spec = g_param_spec_object("file-system", + "File system", + "GArrowFileSystem into which " + "a dataset will be written", + GARROW_TYPE_FILE_SYSTEM, + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_FILE_SYSTEM, spec); + + /** + * GADatasetFileSystemDatasetWriteOptions:base_dir: + * + * Root directory into which the dataset will be written. + * + * Since: 6.0.0 + */ + spec = g_param_spec_string("base-dir", + "Base directory", + "Root directory into which " + "the dataset will be written", + NULL, + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_BASE_DIR, spec); + + /** + * GADatasetFileSystemDatasetWriteOptions:partitioning: + * + * #GADatasetPartitioning used to generate fragment paths. 
+ * + * Since: 6.0.0 + */ + spec = g_param_spec_object("partitioning", + "Partitioning", + "GADatasetPartitioning used to " + "generate fragment paths", + GADATASET_TYPE_PARTITIONING, + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_PARTITIONING, spec); + + /** + * GADatasetFileSystemDatasetWriteOptions:max-partitions: + * + * Maximum number of partitions any batch may be written into. + * + * Since: 6.0.0 + */ + spec = g_param_spec_uint("max-partitions", + "Max partitions", + "Maximum number of partitions " + "any batch may be written into", + 0, + G_MAXINT, + default_options.max_partitions, + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_MAX_PARTITIONS, spec); + + /** + * GADatasetFileSystemDatasetWriteOptions:base-name-template: + * + * Template string used to generate fragment base names. {i} will be + * replaced by an auto incremented integer. + * + * Since: 6.0.0 + */ + spec = g_param_spec_string("base-name-template", + "Base name template", + "Template string used to generate fragment " + "base names. {i} will be replaced by " + "an auto incremented integer", + NULL, + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_BASE_NAME_TEMPLATE, spec); +} + +/** + * gadataset_file_system_dataset_write_options_new: + * + * Returns: The newly created #GADatasetFileSystemDatasetWriteOptions. 
+ * + * Since: 6.0.0 + */ +GADatasetFileSystemDatasetWriteOptions * +gadataset_file_system_dataset_write_options_new(void) +{ + return GADATASET_FILE_SYSTEM_DATASET_WRITE_OPTIONS( + g_object_new(GADATASET_TYPE_FILE_SYSTEM_DATASET_WRITE_OPTIONS, + NULL)); +} + + +typedef struct GADatasetFileSystemDatasetPrivate_ { + GADatasetFileFormat *format; + GArrowFileSystem *file_system; + GADatasetPartitioning *partitioning; +} GADatasetFileSystemDatasetPrivate; + +enum { + PROP_FILE_SYSTEM_DATASET_FORMAT = 1, + PROP_FILE_SYSTEM_DATASET_FILE_SYSTEM, + PROP_FILE_SYSTEM_DATASET_PARTITIONING, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileSystemDataset, + gadataset_file_system_dataset, + GADATASET_TYPE_DATASET) + +#define GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(obj) \ + static_cast<GADatasetFileSystemDatasetPrivate *>( \ + gadataset_file_system_dataset_get_instance_private( \ + GADATASET_FILE_SYSTEM_DATASET(obj))) + +static void +gadataset_file_system_dataset_dispose(GObject *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(object); + + if (priv->format) { + g_object_unref(priv->format); + priv->format = NULL; + } + + if (priv->file_system) { + g_object_unref(priv->file_system); + priv->file_system = NULL; + } + + G_OBJECT_CLASS(gadataset_file_system_dataset_parent_class)->dispose(object); +} + +static void +gadataset_file_system_dataset_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FILE_SYSTEM_DATASET_FORMAT: + priv->format = GADATASET_FILE_FORMAT(g_value_dup_object(value)); + break; + case PROP_FILE_SYSTEM_DATASET_FILE_SYSTEM: + priv->file_system = GARROW_FILE_SYSTEM(g_value_dup_object(value)); + break; + case PROP_FILE_SYSTEM_DATASET_PARTITIONING: + priv->partitioning = GADATASET_PARTITIONING(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + 
} +} + +static void +gadataset_file_system_dataset_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FILE_SYSTEM_DATASET_FORMAT: + g_value_set_object(value, priv->format); + break; + case PROP_FILE_SYSTEM_DATASET_FILE_SYSTEM: + g_value_set_object(value, priv->file_system); + break; + case PROP_FILE_SYSTEM_DATASET_PARTITIONING: + g_value_set_object(value, priv->partitioning); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_init(GADatasetFileSystemDataset *object) +{ +} + +static void +gadataset_file_system_dataset_class_init(GADatasetFileSystemDatasetClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = gadataset_file_system_dataset_dispose; + gobject_class->set_property = gadataset_file_system_dataset_set_property; + gobject_class->get_property = gadataset_file_system_dataset_get_property; + + GParamSpec *spec; + /** + * GADatasetFileSystemDataset:format: + * + * Format of the dataset. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("format", + "Format", + "Format of the dataset", + GADATASET_TYPE_FILE_FORMAT, + static_cast<GParamFlags>(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, + PROP_FILE_SYSTEM_DATASET_FORMAT, + spec); + + /** + * GADatasetFileSystemDataset:file-system: + * + * File system of the dataset. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("file-system", + "File system", + "File system of the dataset", + GARROW_TYPE_FILE_SYSTEM, + static_cast<GParamFlags>(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, + PROP_FILE_SYSTEM_DATASET_FILE_SYSTEM, + spec); + + /** + * GADatasetFileSystemDataset:partitioning: + * + * Partitioning of the dataset. 
+ * + * Since: 6.0.0 + */ + spec = g_param_spec_object("partitioning", + "Partitioning", + "Partitioning of the dataset", + GADATASET_TYPE_PARTITIONING, + static_cast<GParamFlags>(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, + PROP_FILE_SYSTEM_DATASET_PARTITIONING, + spec); +} + +/** + * gadataset_file_system_dataset_write_scanner: + * @scanner: A #GADatasetScanner that produces data to be written. + * @options: A #GADatasetFileSystemDatasetWriteOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 6.0.0 + */ +gboolean +gadataset_file_system_dataset_write_scanner( + GADatasetScanner *scanner, + GADatasetFileSystemDatasetWriteOptions *options, + GError **error) +{ + auto arrow_scanner = gadataset_scanner_get_raw(scanner); + auto arrow_options = + gadataset_file_system_dataset_write_options_get_raw(options); + auto status = + arrow::dataset::FileSystemDataset::Write(*arrow_options, arrow_scanner); + return garrow::check(error, + status, + "[file-system-dataset][write-scanner]"); +} + + +G_END_DECLS + +GADatasetDataset * +gadataset_dataset_new_raw( + std::shared_ptr<arrow::dataset::Dataset> *arrow_dataset) +{ + return gadataset_dataset_new_raw(arrow_dataset, + "dataset", arrow_dataset, + NULL); +} + +GADatasetDataset * +gadataset_dataset_new_raw( + std::shared_ptr<arrow::dataset::Dataset> *arrow_dataset, + const gchar *first_property_name, + ...) 
+{ + va_list args; + va_start(args, first_property_name); + auto array = gadataset_dataset_new_raw_valist(arrow_dataset, + first_property_name, + args); + va_end(args); + return array; +} + +GADatasetDataset * +gadataset_dataset_new_raw_valist( + std::shared_ptr<arrow::dataset::Dataset> *arrow_dataset, + const gchar *first_property_name, + va_list args) +{ + GType type = GADATASET_TYPE_DATASET; + const auto type_name = (*arrow_dataset)->type_name(); + if (type_name == "filesystem") { + type = GADATASET_TYPE_FILE_SYSTEM_DATASET; + } + return GADATASET_DATASET(g_object_new_valist(type, + first_property_name, + args)); +} + +std::shared_ptr<arrow::dataset::Dataset> +gadataset_dataset_get_raw(GADatasetDataset *dataset) +{ + auto priv = GADATASET_DATASET_GET_PRIVATE(dataset); + return priv->dataset; +} + +arrow::dataset::FileSystemDatasetWriteOptions * +gadataset_file_system_dataset_write_options_get_raw( + GADatasetFileSystemDatasetWriteOptions *options) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_WRITE_OPTIONS_GET_PRIVATE(options); + return &(priv->options); +} diff --git a/src/arrow/c_glib/arrow-dataset-glib/dataset.h b/src/arrow/c_glib/arrow-dataset-glib/dataset.h new file mode 100644 index 000000000..86d077caa --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/dataset.h @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <arrow-dataset-glib/file-format.h> + +G_BEGIN_DECLS + +typedef struct _GADatasetScannerBuilder GADatasetScannerBuilder; +typedef struct _GADatasetScanner GADatasetScanner; + +#define GADATASET_TYPE_DATASET (gadataset_dataset_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetDataset, + gadataset_dataset, + GADATASET, + DATASET, + GObject) +struct _GADatasetDatasetClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GADatasetScannerBuilder * +gadataset_dataset_begin_scan(GADatasetDataset *dataset, + GError **error); +GARROW_AVAILABLE_IN_5_0 +GArrowTable * +gadataset_dataset_to_table(GADatasetDataset *dataset, + GError **error); +GARROW_AVAILABLE_IN_5_0 +gchar * +gadataset_dataset_get_type_name(GADatasetDataset *dataset); + + +#define GADATASET_TYPE_FILE_SYSTEM_DATASET_WRITE_OPTIONS \ + (gadataset_file_system_dataset_write_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFileSystemDatasetWriteOptions, + gadataset_file_system_dataset_write_options, + GADATASET, + FILE_SYSTEM_DATASET_WRITE_OPTIONS, + GObject) +struct _GADatasetFileSystemDatasetWriteOptionsClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_6_0 +GADatasetFileSystemDatasetWriteOptions * +gadataset_file_system_dataset_write_options_new(void); + + +#define GADATASET_TYPE_FILE_SYSTEM_DATASET \ + (gadataset_file_system_dataset_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFileSystemDataset, + gadataset_file_system_dataset, + GADATASET, + FILE_SYSTEM_DATASET, + GADatasetDataset) +struct _GADatasetFileSystemDatasetClass +{ + 
GADatasetDatasetClass parent_class; +}; + +GARROW_AVAILABLE_IN_6_0 +gboolean +gadataset_file_system_dataset_write_scanner( + GADatasetScanner *scanner, + GADatasetFileSystemDatasetWriteOptions *options, + GError **error); + + +G_END_DECLS diff --git a/src/arrow/c_glib/arrow-dataset-glib/dataset.hpp b/src/arrow/c_glib/arrow-dataset-glib/dataset.hpp new file mode 100644 index 000000000..1dab391e8 --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/dataset.hpp @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include <arrow/dataset/api.h> + +#include <arrow-dataset-glib/dataset.h> + + +GADatasetDataset * +gadataset_dataset_new_raw( + std::shared_ptr<arrow::dataset::Dataset> *arrow_dataset); +GADatasetDataset * +gadataset_dataset_new_raw( + std::shared_ptr<arrow::dataset::Dataset> *arrow_dataset, + const gchar *first_property_name, + ...); +GADatasetDataset * +gadataset_dataset_new_raw_valist( + std::shared_ptr<arrow::dataset::Dataset> *arrow_dataset, + const gchar *first_property_name, + va_list arg); +std::shared_ptr<arrow::dataset::Dataset> +gadataset_dataset_get_raw(GADatasetDataset *dataset); + + +arrow::dataset::FileSystemDatasetWriteOptions * +gadataset_file_system_dataset_write_options_get_raw( + GADatasetFileSystemDatasetWriteOptions *options); diff --git a/src/arrow/c_glib/arrow-dataset-glib/enums.c.template b/src/arrow/c_glib/arrow-dataset-glib/enums.c.template new file mode 100644 index 000000000..8921ab062 --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/enums.c.template @@ -0,0 +1,52 @@ +/*** BEGIN file-header ***/ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include <arrow-dataset-glib/arrow-dataset-glib.h> +/*** END file-header ***/ + +/*** BEGIN file-production ***/ + +/* enumerations from "@filename@" */ +/*** END file-production ***/ + +/*** BEGIN value-header ***/ +GType +@enum_name@_get_type(void) +{ + static GType etype = 0; + if (G_UNLIKELY(etype == 0)) { + static const G@Type@Value values[] = { +/*** END value-header ***/ + +/*** BEGIN value-production ***/ + {@VALUENAME@, "@VALUENAME@", "@valuenick@"}, +/*** END value-production ***/ + +/*** BEGIN value-tail ***/ + {0, NULL, NULL} + }; + etype = g_@type@_register_static(g_intern_static_string("@EnumName@"), values); + } + return etype; +} +/*** END value-tail ***/ + +/*** BEGIN file-tail ***/ +/*** END file-tail ***/ diff --git a/src/arrow/c_glib/arrow-dataset-glib/enums.h.template b/src/arrow/c_glib/arrow-dataset-glib/enums.h.template new file mode 100644 index 000000000..d6a0a455f --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/enums.h.template @@ -0,0 +1,41 @@ +/*** BEGIN file-header ***/ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include <arrow-dataset-glib/partitioning.h> + +G_BEGIN_DECLS +/*** END file-header ***/ + +/*** BEGIN file-production ***/ + +/* enumerations from "@filename@" */ +/*** END file-production ***/ + +/*** BEGIN value-header ***/ +GType @enum_name@_get_type(void) G_GNUC_CONST; +#define @ENUMPREFIX@_TYPE_@ENUMSHORT@ (@enum_name@_get_type()) +/*** END value-header ***/ + +/*** BEGIN file-tail ***/ + +G_END_DECLS +/*** END file-tail ***/ diff --git a/src/arrow/c_glib/arrow-dataset-glib/file-format.cpp b/src/arrow/c_glib/arrow-dataset-glib/file-format.cpp new file mode 100644 index 000000000..c0c92d966 --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/file-format.cpp @@ -0,0 +1,574 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include <arrow-glib/error.hpp> +#include <arrow-glib/file-system.hpp> +#include <arrow-glib/output-stream.hpp> +#include <arrow-glib/record-batch.hpp> +#include <arrow-glib/reader.hpp> +#include <arrow-glib/schema.hpp> + +#include <arrow-dataset-glib/file-format.hpp> + +G_BEGIN_DECLS + +/** + * SECTION: file-format + * @section_id: file-format + * @title: File format classes + * @include: arrow-dataset-glib/arrow-dataset-glib.h + * + * #GADatasetFileWriteOptions is a class for options to write a file + * of this format. + * + * #GADatasetFileWriter is a class for writing a file of this format. + * + * #GADatasetFileFormat is a base class for file format classes. + * + * #GADatasetCSVFileFormat is a class for CSV file format. + * + * #GADatasetIPCFileFormat is a class for IPC file format. + * + * #GADatasetParquetFileFormat is a class for Parquet file format. + * + * Since: 3.0.0 + */ + +typedef struct GADatasetFileWriteOptionsPrivate_ { + std::shared_ptr<arrow::dataset::FileWriteOptions> options; +} GADatasetFileWriteOptionsPrivate; + +enum { + PROP_OPTIONS = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileWriteOptions, + gadataset_file_write_options, + G_TYPE_OBJECT) + +#define GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(obj) \ + static_cast<GADatasetFileWriteOptionsPrivate *>( \ + gadataset_file_write_options_get_instance_private( \ + GADATASET_FILE_WRITE_OPTIONS(obj))) + +static void +gadataset_file_write_options_finalize(GObject *object) +{ + auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object); + priv->options.~shared_ptr(); + G_OBJECT_CLASS(gadataset_file_write_options_parent_class)->finalize(object); +} + +static void +gadataset_file_write_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_OPTIONS: + priv->options = + *static_cast<std::shared_ptr<arrow::dataset::FileWriteOptions> *>( + 
g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_write_options_init(GADatasetFileWriteOptions *object) +{ + auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object); + new(&priv->options) std::shared_ptr<arrow::dataset::FileWriteOptions>; +} + +static void +gadataset_file_write_options_class_init(GADatasetFileWriteOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gadataset_file_write_options_finalize; + gobject_class->set_property = gadataset_file_write_options_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("options", + "Options", + "The raw " + "std::shared<arrow::dataset::FileWriteOptions> *", + static_cast<GParamFlags>(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_OPTIONS, spec); +} + + +typedef struct GADatasetFileWriterPrivate_ { + std::shared_ptr<arrow::dataset::FileWriter> writer; +} GADatasetFileWriterPrivate; + +enum { + PROP_WRITER = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileWriter, + gadataset_file_writer, + G_TYPE_OBJECT) + +#define GADATASET_FILE_WRITER_GET_PRIVATE(obj) \ + static_cast<GADatasetFileWriterPrivate *>( \ + gadataset_file_writer_get_instance_private( \ + GADATASET_FILE_WRITER(obj))) + +static void +gadataset_file_writer_finalize(GObject *object) +{ + auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(object); + priv->writer.~shared_ptr(); + G_OBJECT_CLASS(gadataset_file_writer_parent_class)->finalize(object); +} + +static void +gadataset_file_writer_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_WRITER: + priv->writer = + *static_cast<std::shared_ptr<arrow::dataset::FileWriter> *>( + g_value_get_pointer(value)); + break; + default: + 
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_writer_init(GADatasetFileWriter *object) +{ + auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(object); + new(&(priv->writer)) std::shared_ptr<arrow::dataset::FileWriter>; +} + +static void +gadataset_file_writer_class_init(GADatasetFileWriterClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gadataset_file_writer_finalize; + gobject_class->set_property = gadataset_file_writer_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("writer", + "Writer", + "The raw " + "std::shared<arrow::dataset::FileWriter> *", + static_cast<GParamFlags>(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_WRITER, spec); +} + +/** + * gadataset_file_writer_write_record_batch: + * @writer: A #GADatasetFileWriter. + * @record_batch: A #GArrowRecordBatch to be written. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 6.0.0 + */ +gboolean +gadataset_file_writer_write_record_batch(GADatasetFileWriter *writer, + GArrowRecordBatch *record_batch, + GError **error) +{ + const auto arrow_writer = gadataset_file_writer_get_raw(writer); + const auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); + auto status = arrow_writer->Write(arrow_record_batch); + return garrow::check(error, status, "[file-writer][write-record-batch]"); +} + +/** + * gadataset_file_writer_write_record_batch_reader: + * @writer: A #GADatasetFileWriter. + * @reader: A #GArrowRecordBatchReader to be written. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. 
+ * + * Since: 6.0.0 + */ +gboolean +gadataset_file_writer_write_record_batch_reader(GADatasetFileWriter *writer, + GArrowRecordBatchReader *reader, + GError **error) +{ + const auto arrow_writer = gadataset_file_writer_get_raw(writer); + auto arrow_reader = garrow_record_batch_reader_get_raw(reader); + auto status = arrow_writer->Write(arrow_reader.get()); + return garrow::check(error, + status, + "[file-writer][write-record-batch-reader]"); +} + +/** + * gadataset_file_writer_finish: + * @writer: A #GADatasetFileWriter. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 6.0.0 + */ +gboolean +gadataset_file_writer_finish(GADatasetFileWriter *writer, + GError **error) +{ + const auto arrow_writer = gadataset_file_writer_get_raw(writer); + auto status = arrow_writer->Finish(); + return garrow::check(error, + status, + "[file-writer][finish]"); +} + + +typedef struct GADatasetFileFormatPrivate_ { + std::shared_ptr<arrow::dataset::FileFormat> format; +} GADatasetFileFormatPrivate; + +enum { + PROP_FORMAT = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileFormat, + gadataset_file_format, + G_TYPE_OBJECT) + +#define GADATASET_FILE_FORMAT_GET_PRIVATE(obj) \ + static_cast<GADatasetFileFormatPrivate *>( \ + gadataset_file_format_get_instance_private( \ + GADATASET_FILE_FORMAT(obj))) + +static void +gadataset_file_format_finalize(GObject *object) +{ + auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object); + priv->format.~shared_ptr(); + G_OBJECT_CLASS(gadataset_file_format_parent_class)->finalize(object); +} + +static void +gadataset_file_format_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FORMAT: + priv->format = + *static_cast<std::shared_ptr<arrow::dataset::FileFormat> *>( + g_value_get_pointer(value)); + break; + default: + 
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_format_init(GADatasetFileFormat *object) +{ + auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object); + new(&priv->format) std::shared_ptr<arrow::dataset::FileFormat>; +} + +static void +gadataset_file_format_class_init(GADatasetFileFormatClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gadataset_file_format_finalize; + gobject_class->set_property = gadataset_file_format_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("format", + "Format", + "The raw std::shared<arrow::dataset::FileFormat> *", + static_cast<GParamFlags>(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_FORMAT, spec); +} + +/** + * gadataset_file_format_get_type_name: + * @format: A #GADatasetFileFormat. + * + * Returns: The type name of @format. + * + * It should be freed with g_free() when no longer needed. + * + * Since: 3.0.0 + */ +gchar * +gadataset_file_format_get_type_name(GADatasetFileFormat *format) +{ + const auto arrow_format = gadataset_file_format_get_raw(format); + const auto &type_name = arrow_format->type_name(); + return g_strndup(type_name.data(), type_name.size()); +} + +/** + * gadataset_file_format_get_default_write_options: + * @format: A #GADatasetFileFormat. + * + * Returns: (transfer full): The default #GADatasetFileWriteOptions of @format. + * + * Since: 6.0.0 + */ +GADatasetFileWriteOptions * +gadataset_file_format_get_default_write_options(GADatasetFileFormat *format) +{ + const auto arrow_format = gadataset_file_format_get_raw(format); + auto arrow_options = arrow_format->DefaultWriteOptions(); + return gadataset_file_write_options_new_raw(&arrow_options); +} + +/** + * gadataset_file_format_open_writer: + * @format: A #GADatasetFileFormat. + * @destination: A #GArrowOutputStream. + * @file_system: The #GArrowFileSystem of @destination. 
+ * @path: The path of @destination. + * @schema: A #GArrowSchema that is used by written record batches. + * @options: A #GADatasetFileWriteOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): The newly created #GADatasetFileWriter of @format + * on success, %NULL on error. + * + * Since: 6.0.0 + */ +GADatasetFileWriter * +gadataset_file_format_open_writer(GADatasetFileFormat *format, + GArrowOutputStream *destination, + GArrowFileSystem *file_system, + const gchar *path, + GArrowSchema *schema, + GADatasetFileWriteOptions *options, + GError **error) +{ + const auto arrow_format = gadataset_file_format_get_raw(format); + auto arrow_destination = garrow_output_stream_get_raw(destination); + auto arrow_file_system = garrow_file_system_get_raw(file_system); + auto arrow_schema = garrow_schema_get_raw(schema); + auto arrow_options = gadataset_file_write_options_get_raw(options); + auto arrow_writer_result = + arrow_format->MakeWriter(arrow_destination, + arrow_schema, + arrow_options, + {arrow_file_system, path}); + if (garrow::check(error, arrow_writer_result, "[file-format][open-writer]")) { + auto arrow_writer = *arrow_writer_result; + return gadataset_file_writer_new_raw(&arrow_writer); + } else { + return NULL; + } +} + +/** + * gadataset_file_format_equal: + * @format: A #GADatasetFileFormat. + * @other_format: A #GADatasetFileFormat to be compared. + * + * Returns: %TRUE if they are the same content file format, %FALSE otherwise. 
+ * + * Since: 3.0.0 + */ +gboolean +gadataset_file_format_equal(GADatasetFileFormat *format, + GADatasetFileFormat *other_format) +{ + const auto arrow_format = gadataset_file_format_get_raw(format); + const auto arrow_other_format = gadataset_file_format_get_raw(other_format); + return arrow_format->Equals(*arrow_other_format); +} + + +G_DEFINE_TYPE(GADatasetCSVFileFormat, + gadataset_csv_file_format, + GADATASET_TYPE_FILE_FORMAT) + +static void +gadataset_csv_file_format_init(GADatasetCSVFileFormat *object) +{ +} + +static void +gadataset_csv_file_format_class_init(GADatasetCSVFileFormatClass *klass) +{ +} + +/** + * gadataset_csv_file_format_new: + * + * Returns: The newly created CSV file format. + * + * Since: 3.0.0 + */ +GADatasetCSVFileFormat * +gadataset_csv_file_format_new(void) +{ + std::shared_ptr<arrow::dataset::FileFormat> arrow_format = + std::make_shared<arrow::dataset::CsvFileFormat>(); + return GADATASET_CSV_FILE_FORMAT(gadataset_file_format_new_raw(&arrow_format)); +} + + +G_DEFINE_TYPE(GADatasetIPCFileFormat, + gadataset_ipc_file_format, + GADATASET_TYPE_FILE_FORMAT) + +static void +gadataset_ipc_file_format_init(GADatasetIPCFileFormat *object) +{ +} + +static void +gadataset_ipc_file_format_class_init(GADatasetIPCFileFormatClass *klass) +{ +} + +/** + * gadataset_ipc_file_format_new: + * + * Returns: The newly created IPC file format. 
+ * + * Since: 3.0.0 + */ +GADatasetIPCFileFormat * +gadataset_ipc_file_format_new(void) +{ + std::shared_ptr<arrow::dataset::FileFormat> arrow_format = + std::make_shared<arrow::dataset::IpcFileFormat>(); + return GADATASET_IPC_FILE_FORMAT(gadataset_file_format_new_raw(&arrow_format)); +} + + +G_DEFINE_TYPE(GADatasetParquetFileFormat, + gadataset_parquet_file_format, + GADATASET_TYPE_FILE_FORMAT) + +static void +gadataset_parquet_file_format_init(GADatasetParquetFileFormat *object) +{ +} + +static void +gadataset_parquet_file_format_class_init(GADatasetParquetFileFormatClass *klass) +{ +} + +/** + * gadataset_parquet_file_format_new: + * + * Returns: The newly created Parquet file format. + * + * Since: 3.0.0 + */ +GADatasetParquetFileFormat * +gadataset_parquet_file_format_new(void) +{ + std::shared_ptr<arrow::dataset::FileFormat> arrow_format = + std::make_shared<arrow::dataset::ParquetFileFormat>(); + return GADATASET_PARQUET_FILE_FORMAT( + gadataset_file_format_new_raw(&arrow_format)); +} + + +G_END_DECLS + +GADatasetFileWriteOptions * +gadataset_file_write_options_new_raw( + std::shared_ptr<arrow::dataset::FileWriteOptions> *arrow_options) +{ + return GADATASET_FILE_WRITE_OPTIONS( + g_object_new(GADATASET_TYPE_FILE_WRITE_OPTIONS, + "options", arrow_options, + NULL)); +} + +std::shared_ptr<arrow::dataset::FileWriteOptions> +gadataset_file_write_options_get_raw(GADatasetFileWriteOptions *options) +{ + auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(options); + return priv->options; +} + + +GADatasetFileWriter * +gadataset_file_writer_new_raw( + std::shared_ptr<arrow::dataset::FileWriter> *arrow_writer) +{ + return GADATASET_FILE_WRITER(g_object_new(GADATASET_TYPE_FILE_WRITER, + "writer", arrow_writer, + NULL)); +} + +std::shared_ptr<arrow::dataset::FileWriter> +gadataset_file_writer_get_raw(GADatasetFileWriter *writer) +{ + auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(writer); + return priv->writer; +} + + +GADatasetFileFormat * 
+gadataset_file_format_new_raw( + std::shared_ptr<arrow::dataset::FileFormat> *arrow_format) +{ + GType type = GADATASET_TYPE_FILE_FORMAT; + const auto &type_name = (*arrow_format)->type_name(); + if (type_name == "csv") { + type = GADATASET_TYPE_CSV_FILE_FORMAT; + } else if (type_name == "ipc") { + type = GADATASET_TYPE_IPC_FILE_FORMAT; + } else if (type_name == "parquet") { + type = GADATASET_TYPE_PARQUET_FILE_FORMAT; + } + return GADATASET_FILE_FORMAT(g_object_new(type, + "format", arrow_format, + NULL)); +} + +std::shared_ptr<arrow::dataset::FileFormat> +gadataset_file_format_get_raw(GADatasetFileFormat *format) +{ + auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(format); + return priv->format; +} diff --git a/src/arrow/c_glib/arrow-dataset-glib/file-format.h b/src/arrow/c_glib/arrow-dataset-glib/file-format.h new file mode 100644 index 000000000..16a834074 --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/file-format.h @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include <arrow-glib/arrow-glib.h> + +G_BEGIN_DECLS + +#define GADATASET_TYPE_FILE_WRITE_OPTIONS \ + (gadataset_file_write_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFileWriteOptions, + gadataset_file_write_options, + GADATASET, + FILE_WRITE_OPTIONS, + GObject) +struct _GADatasetFileWriteOptionsClass +{ + GObjectClass parent_class; +}; + + +#define GADATASET_TYPE_FILE_WRITER \ + (gadataset_file_writer_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFileWriter, + gadataset_file_writer, + GADATASET, + FILE_WRITER, + GObject) +struct _GADatasetFileWriterClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_6_0 +gboolean +gadataset_file_writer_write_record_batch(GADatasetFileWriter *writer, + GArrowRecordBatch *record_batch, + GError **error); +GARROW_AVAILABLE_IN_6_0 +gboolean +gadataset_file_writer_write_record_batch_reader(GADatasetFileWriter *writer, + GArrowRecordBatchReader *reader, + GError **error); +GARROW_AVAILABLE_IN_6_0 +gboolean +gadataset_file_writer_finish(GADatasetFileWriter *writer, + GError **error); + + +#define GADATASET_TYPE_FILE_FORMAT (gadataset_file_format_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFileFormat, + gadataset_file_format, + GADATASET, + FILE_FORMAT, + GObject) +struct _GADatasetFileFormatClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_3_0 +gchar * +gadataset_file_format_get_type_name(GADatasetFileFormat *format); +GARROW_AVAILABLE_IN_6_0 +GADatasetFileWriteOptions * +gadataset_file_format_get_default_write_options(GADatasetFileFormat *format); +GARROW_AVAILABLE_IN_6_0 +GADatasetFileWriter * +gadataset_file_format_open_writer(GADatasetFileFormat *format, + GArrowOutputStream *destination, + GArrowFileSystem *file_system, + const gchar *path, + GArrowSchema *schema, + GADatasetFileWriteOptions *options, + GError **error); + +GARROW_AVAILABLE_IN_3_0 +gboolean +gadataset_file_format_equal(GADatasetFileFormat *format, + GADatasetFileFormat *other_format); + + +#define 
GADATASET_TYPE_CSV_FILE_FORMAT (gadataset_csv_file_format_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetCSVFileFormat, + gadataset_csv_file_format, + GADATASET, + CSV_FILE_FORMAT, + GADatasetFileFormat) +struct _GADatasetCSVFileFormatClass +{ + GADatasetFileFormatClass parent_class; +}; + +GARROW_AVAILABLE_IN_3_0 +GADatasetCSVFileFormat *gadataset_csv_file_format_new(void); + + +#define GADATASET_TYPE_IPC_FILE_FORMAT (gadataset_ipc_file_format_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetIPCFileFormat, + gadataset_ipc_file_format, + GADATASET, + IPC_FILE_FORMAT, + GADatasetFileFormat) +struct _GADatasetIPCFileFormatClass +{ + GADatasetFileFormatClass parent_class; +}; + +GARROW_AVAILABLE_IN_3_0 +GADatasetIPCFileFormat *gadataset_ipc_file_format_new(void); + + +#define GADATASET_TYPE_PARQUET_FILE_FORMAT \ + (gadataset_parquet_file_format_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetParquetFileFormat, + gadataset_parquet_file_format, + GADATASET, + PARQUET_FILE_FORMAT, + GADatasetFileFormat) +struct _GADatasetParquetFileFormatClass +{ + GADatasetFileFormatClass parent_class; +}; + +GARROW_AVAILABLE_IN_3_0 +GADatasetParquetFileFormat *gadataset_parquet_file_format_new(void); + + +G_END_DECLS diff --git a/src/arrow/c_glib/arrow-dataset-glib/file-format.hpp b/src/arrow/c_glib/arrow-dataset-glib/file-format.hpp new file mode 100644 index 000000000..636dc5c01 --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/file-format.hpp @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <arrow/dataset/api.h> + +#include <arrow-dataset-glib/file-format.h> + +GADatasetFileWriteOptions * +gadataset_file_write_options_new_raw( + std::shared_ptr<arrow::dataset::FileWriteOptions> *arrow_options); +std::shared_ptr<arrow::dataset::FileWriteOptions> +gadataset_file_write_options_get_raw(GADatasetFileWriteOptions *options); + + +GADatasetFileWriter * +gadataset_file_writer_new_raw( + std::shared_ptr<arrow::dataset::FileWriter> *arrow_writer); +std::shared_ptr<arrow::dataset::FileWriter> +gadataset_file_writer_get_raw(GADatasetFileWriter *writer); + + +GADatasetFileFormat * +gadataset_file_format_new_raw( + std::shared_ptr<arrow::dataset::FileFormat> *arrow_format); +std::shared_ptr<arrow::dataset::FileFormat> +gadataset_file_format_get_raw(GADatasetFileFormat *format); diff --git a/src/arrow/c_glib/arrow-dataset-glib/fragment.cpp b/src/arrow/c_glib/arrow-dataset-glib/fragment.cpp new file mode 100644 index 000000000..f2f0cd1c3 --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/fragment.cpp @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <arrow-glib/record-batch.hpp> +#include <arrow-glib/schema.hpp> + +#include <arrow-dataset-glib/fragment.hpp> + +G_BEGIN_DECLS + +/** + * SECTION: fragment + * @section_id: fragment + * @title: Fragment classes + * @include: arrow-dataset-glib/arrow-dataset-glib.h + * + * #GADatasetFragment is a base class for all fragment classes. + * + * #GADatasetInMemoryFragment is a class for in-memory fragment. + * + * Since: 4.0.0 + */ + +/* arrow::dataset::Fragment */ + +typedef struct GADatasetFragmentPrivate_ { + std::shared_ptr<arrow::dataset::Fragment> fragment; +} GADatasetFragmentPrivate; + +enum { + PROP_FRAGMENT = 1, +}; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GADatasetFragment, + gadataset_fragment, + G_TYPE_OBJECT) + +#define GADATASET_FRAGMENT_GET_PRIVATE(obj) \ + static_cast<GADatasetFragmentPrivate *>( \ + gadataset_fragment_get_instance_private( \ + GADATASET_FRAGMENT(obj))) + +static void +gadataset_fragment_finalize(GObject *object) +{ + auto priv = GADATASET_FRAGMENT_GET_PRIVATE(object); + + priv->fragment.~shared_ptr(); + + G_OBJECT_CLASS(gadataset_fragment_parent_class)->finalize(object); +} + +static void +gadataset_fragment_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FRAGMENT_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FRAGMENT: + priv->fragment = + *static_cast<std::shared_ptr<arrow::dataset::Fragment> *>( + g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + 
+static void +gadataset_fragment_init(GADatasetFragment *object) +{ + auto priv = GADATASET_FRAGMENT_GET_PRIVATE(object); + new(&priv->fragment) std::shared_ptr<arrow::dataset::Fragment>; +} + +static void +gadataset_fragment_class_init(GADatasetFragmentClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gadataset_fragment_finalize; + gobject_class->set_property = gadataset_fragment_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("fragment", + "Fragment", + "The raw std::shared<arrow::dataset::Fragment> *", + static_cast<GParamFlags>(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_FRAGMENT, spec); +} + +/* arrow::dataset::InMemoryFragment */ + +G_DEFINE_TYPE(GADatasetInMemoryFragment, + gadataset_in_memory_fragment, + GADATASET_TYPE_FRAGMENT) + +static void +gadataset_in_memory_fragment_init(GADatasetInMemoryFragment *object) +{ +} + +static void +gadataset_in_memory_fragment_class_init(GADatasetInMemoryFragmentClass *klass) +{ +} + +/** + * gadataset_in_memory_fragment_new: + * @schema: A #GArrowSchema. + * @record_batches: (array length=n_record_batches): + * (element-type GArrowRecordBatch): The record batches of the table. + * @n_record_batches: The number of record batches. + * + * Returns: A newly created #GADatasetInMemoryFragment. 
+ * + * Since: 4.0.0 + */ +GADatasetInMemoryFragment * +gadataset_in_memory_fragment_new(GArrowSchema *schema, + GArrowRecordBatch **record_batches, + gsize n_record_batches) +{ + auto arrow_schema = garrow_schema_get_raw(schema); + std::vector<std::shared_ptr<arrow::RecordBatch>> arrow_record_batches; + arrow_record_batches.reserve(n_record_batches); + for (gsize i = 0; i < n_record_batches; ++i) { + auto arrow_record_batch = garrow_record_batch_get_raw(record_batches[i]); + arrow_record_batches.push_back(arrow_record_batch); + } + auto arrow_in_memory_fragment = + std::make_shared<arrow::dataset::InMemoryFragment>(arrow_schema, + arrow_record_batches); + return gadataset_in_memory_fragment_new_raw(&arrow_in_memory_fragment); +} + +G_END_DECLS + +GADatasetFragment * +gadataset_fragment_new_raw( + std::shared_ptr<arrow::dataset::Fragment> *arrow_fragment) +{ + auto fragment = + GADATASET_FRAGMENT(g_object_new(GADATASET_TYPE_FRAGMENT, + "fragment", arrow_fragment, + NULL)); + return fragment; +} + +std::shared_ptr<arrow::dataset::Fragment> +gadataset_fragment_get_raw(GADatasetFragment *fragment) +{ + auto priv = GADATASET_FRAGMENT_GET_PRIVATE(fragment); + return priv->fragment; +} + +GADatasetInMemoryFragment * +gadataset_in_memory_fragment_new_raw( + std::shared_ptr<arrow::dataset::InMemoryFragment> *arrow_fragment) +{ + auto fragment = + GADATASET_IN_MEMORY_FRAGMENT(g_object_new(GADATASET_TYPE_IN_MEMORY_FRAGMENT, + "fragment", arrow_fragment, + NULL)); + return fragment; +} diff --git a/src/arrow/c_glib/arrow-dataset-glib/fragment.h b/src/arrow/c_glib/arrow-dataset-glib/fragment.h new file mode 100644 index 000000000..9376b6cf3 --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/fragment.h @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <arrow-glib/arrow-glib.h> + +G_BEGIN_DECLS + +/* arrow::dataset::Fragment */ + +#define GADATASET_TYPE_FRAGMENT (gadataset_fragment_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFragment, + gadataset_fragment, + GADATASET, + FRAGMENT, + GObject) +struct _GADatasetFragmentClass +{ + GObjectClass parent_class; +}; + +/* arrow::dataset::InMemoryFragment */ + +#define GADATASET_TYPE_IN_MEMORY_FRAGMENT \ + (gadataset_in_memory_fragment_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetInMemoryFragment, + gadataset_in_memory_fragment, + GADATASET, + IN_MEMORY_FRAGMENT, + GADatasetFragment) +struct _GADatasetInMemoryFragmentClass +{ + GADatasetFragmentClass parent_class; +}; + +GARROW_AVAILABLE_IN_4_0 +GADatasetInMemoryFragment * +gadataset_in_memory_fragment_new(GArrowSchema *schema, + GArrowRecordBatch **record_batches, + gsize n_record_batches); + +G_END_DECLS diff --git a/src/arrow/c_glib/arrow-dataset-glib/fragment.hpp b/src/arrow/c_glib/arrow-dataset-glib/fragment.hpp new file mode 100644 index 000000000..904f83653 --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/fragment.hpp @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <arrow/dataset/api.h> + +#include <arrow-dataset-glib/fragment.h> + +std::shared_ptr<arrow::dataset::Fragment> +gadataset_fragment_get_raw(GADatasetFragment *fragment); + +GADatasetFragment* +gadataset_fragment_new_raw( + std::shared_ptr<arrow::dataset::Fragment> *arrow_fragment); + +GADatasetInMemoryFragment* +gadataset_in_memory_fragment_new_raw( + std::shared_ptr<arrow::dataset::InMemoryFragment> *arrow_fragment); diff --git a/src/arrow/c_glib/arrow-dataset-glib/meson.build b/src/arrow/c_glib/arrow-dataset-glib/meson.build new file mode 100644 index 000000000..0d9b8564e --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/meson.build @@ -0,0 +1,104 @@ +# -*- indent-tabs-mode: nil -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
# Implementation files compiled into the library.
sources = files(
  'dataset-factory.cpp',
  'dataset.cpp',
  'file-format.cpp',
  'fragment.cpp',
  'partitioning.cpp',
  'scanner.cpp',
)

# Public C headers; also scanned by mkenums and g-ir-scanner below.
c_headers = files(
  'arrow-dataset-glib.h',
  'dataset-factory.h',
  'dataset.h',
  'file-format.h',
  'fragment.h',
  'partitioning.h',
  'scanner.h',
)

# Public C++ headers.
cpp_headers = files(
  'arrow-dataset-glib.hpp',
  'dataset-factory.hpp',
  'dataset.hpp',
  'file-format.hpp',
  'fragment.hpp',
  'partitioning.hpp',
  'scanner.hpp',
)

# Generate enums.c / enums.h from the enum declarations in the C headers.
enums = gnome.mkenums(
  'enums',
  sources: c_headers,
  identifier_prefix: 'GADataset',
  symbol_prefix: 'gadataset',
  c_template: 'enums.c.template',
  h_template: 'enums.h.template',
  install_dir: join_paths(include_dir, meson.project_name()),
  install_header: true,
)
enums_source = enums[0]
enums_header = enums[1]


headers = c_headers + cpp_headers
install_headers(headers, subdir: 'arrow-dataset-glib')

dependencies = [
  arrow_dataset,
  arrow_glib,
]
libarrow_dataset_glib = library(
  'arrow-dataset-glib',
  sources: sources + enums,
  install: true,
  dependencies: dependencies,
  include_directories: base_include_directories,
  soversion: so_version,
  version: library_version,
)
arrow_dataset_glib = declare_dependency(
  link_with: libarrow_dataset_glib,
  include_directories: base_include_directories,
  dependencies: dependencies,
  sources: enums_header,
)

pkgconfig.generate(
  libarrow_dataset_glib,
  filebase: 'arrow-dataset-glib',
  name: 'Apache Arrow Dataset GLib',
  description: 'C API for Apache Arrow Dataset based on GLib',
  version: version,
  requires: ['arrow-glib', 'arrow-dataset'],
)

# GObject Introspection data, only when introspection is available.
if have_gi
  gnome.generate_gir(
    libarrow_dataset_glib,
    dependencies: declare_dependency(sources: arrow_glib_gir),
    sources: sources + c_headers + enums,
    namespace: 'ArrowDataset',
    nsversion: api_version,
    identifier_prefix: 'GADataset',
    symbol_prefix: 'gadataset',
    export_packages: 'arrow-dataset-glib',
    includes: [
      'Arrow-1.0',
    ],
    install: true,
    extra_args: [
      '--warn-all',
      '--include-uninstalled=./arrow-glib/Arrow-1.0.gir',
    ],
  )
endif
+ */ + +#include <arrow-glib/array.hpp> +#include <arrow-glib/error.hpp> +#include <arrow-glib/schema.hpp> + +#include <arrow-dataset-glib/enums.h> +#include <arrow-dataset-glib/partitioning.hpp> + +G_BEGIN_DECLS + +/** + * SECTION: partitioning + * @section_id: partitioning + * @title: Partitioning classes + * @include: arrow-dataset-glib/arrow-dataset-glib.h + * + * #GADatasetPartitioningOptions is a class for partitioning options. + * + * #GADatasetPartitioning is a base class for partitioning classes + * such as #GADatasetDirectoryPartitioning. + * + * #GADatasetKeyValuePartitioning is a base class for key-value style + * partitioning classes such as #GADatasetDirectoryPartitioning. + * + * #GADatasetDirectoryPartitioning is a class for partitioning that + * uses directory structure. + * + * Since: 6.0.0 + */ + +typedef struct GADatasetPartitioningOptionsPrivate_ { + gboolean infer_dictionary; + GArrowSchema *schema; + GADatasetSegmentEncoding segment_encoding; +} GADatasetPartitioningOptionsPrivate; + +enum { + PROP_INFER_DICTIONARY = 1, + PROP_SCHEMA, + PROP_SEGMENT_ENCODING, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetPartitioningOptions, + gadataset_partitioning_options, + G_TYPE_OBJECT) + +#define GADATASET_PARTITIONING_OPTIONS_GET_PRIVATE(obj) \ + static_cast<GADatasetPartitioningOptionsPrivate *>( \ + gadataset_partitioning_options_get_instance_private( \ + GADATASET_PARTITIONING_OPTIONS(obj))) + +static void +gadataset_partitioning_options_dispose(GObject *object) +{ + auto priv = GADATASET_PARTITIONING_OPTIONS_GET_PRIVATE(object); + + if (priv->schema) { + g_object_unref(priv->schema); + priv->schema = nullptr; + } + + G_OBJECT_CLASS(gadataset_partitioning_options_parent_class)->dispose(object); +} + +static void +gadataset_partitioning_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_PARTITIONING_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_INFER_DICTIONARY: 
+ priv->infer_dictionary = g_value_get_boolean(value); + break; + case PROP_SCHEMA: + { + auto schema = g_value_get_object(value); + if (priv->schema == schema) { + break; + } + auto old_schema = priv->schema; + if (schema) { + g_object_ref(schema); + priv->schema = GARROW_SCHEMA(schema); + } else { + priv->schema = NULL; + } + if (old_schema) { + g_object_unref(old_schema); + } + } + break; + case PROP_SEGMENT_ENCODING: + priv->segment_encoding = + static_cast<GADatasetSegmentEncoding>(g_value_get_enum(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_partitioning_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_PARTITIONING_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_INFER_DICTIONARY: + g_value_set_boolean(value, priv->infer_dictionary); + break; + case PROP_SCHEMA: + g_value_set_object(value, priv->schema); + break; + case PROP_SEGMENT_ENCODING: + g_value_set_enum(value, priv->segment_encoding); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_partitioning_options_init(GADatasetPartitioningOptions *object) +{ +} + +static void +gadataset_partitioning_options_class_init( + GADatasetPartitioningOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = gadataset_partitioning_options_dispose; + gobject_class->set_property = gadataset_partitioning_options_set_property; + gobject_class->get_property = gadataset_partitioning_options_get_property; + + arrow::dataset::PartitioningFactoryOptions default_options; + GParamSpec *spec; + /** + * GADatasetPartitioningOptions:infer-dictionary: + * + * When inferring a schema for partition fields, yield dictionary + * encoded types instead of plain. 
This can be more efficient when + * materializing virtual columns, and Expressions parsed by the + * finished Partitioning will include dictionaries of all unique + * inspected values for each field. + * + * Since: 6.0.0 + */ + spec = g_param_spec_boolean("infer-dictionary", + "Infer dictionary", + "Whether encode partitioned field values as " + "dictionary", + default_options.infer_dictionary, + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_INFER_DICTIONARY, spec); + + /** + * GADatasetPartitioningOptions:schema: + * + * Optionally, an expected schema can be provided, in which case + * inference will only check discovered fields against the schema + * and update internal state (such as dictionaries). + * + * Since: 6.0.0 + */ + spec = g_param_spec_object("schema", + "Schema", + "Inference will only check discovered fields " + "against the schema and update internal state", + GARROW_TYPE_SCHEMA, + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_SCHEMA, spec); + + /** + * GADatasetPartitioningOptions:segment-encoding: + * + * After splitting a path into components, decode the path + * components before parsing according to this scheme. + * + * Since: 6.0.0 + */ + spec = g_param_spec_enum("segment-encoding", + "Segment encoding", + "After splitting a path into components, " + "decode the path components before " + "parsing according to this scheme", + GADATASET_TYPE_SEGMENT_ENCODING, + static_cast<GADatasetSegmentEncoding>( + default_options.segment_encoding), + static_cast<GParamFlags>(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_SEGMENT_ENCODING, spec); +} + +/** + * gadataset_partitioning_options_new: + * + * Returns: The newly created #GADatasetPartitioningOptions. 
+ * + * Since: 6.0.0 + */ +GADatasetPartitioningOptions * +gadataset_partitioning_options_new(void) +{ + return GADATASET_PARTITIONING_OPTIONS( + g_object_new(GADATASET_TYPE_PARTITIONING_OPTIONS, + NULL)); +} + + +typedef struct GADatasetPartitioningPrivate_ { + std::shared_ptr<arrow::dataset::Partitioning> partitioning; +} GADatasetPartitioningPrivate; + +enum { + PROP_PARTITIONING = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetPartitioning, + gadataset_partitioning, + G_TYPE_OBJECT) + +#define GADATASET_PARTITIONING_GET_PRIVATE(obj) \ + static_cast<GADatasetPartitioningPrivate *>( \ + gadataset_partitioning_get_instance_private( \ + GADATASET_PARTITIONING(obj))) + +static void +gadataset_partitioning_finalize(GObject *object) +{ + auto priv = GADATASET_PARTITIONING_GET_PRIVATE(object); + priv->partitioning.~shared_ptr(); + G_OBJECT_CLASS(gadataset_partitioning_parent_class)->finalize(object); +} + +static void +gadataset_partitioning_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_PARTITIONING_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_PARTITIONING: + priv->partitioning = + *static_cast<std::shared_ptr<arrow::dataset::Partitioning> *>( + g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_partitioning_init(GADatasetPartitioning *object) +{ + auto priv = GADATASET_PARTITIONING_GET_PRIVATE(object); + new(&priv->partitioning) std::shared_ptr<arrow::dataset::Partitioning>; +} + +static void +gadataset_partitioning_class_init(GADatasetPartitioningClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gadataset_partitioning_finalize; + gobject_class->set_property = gadataset_partitioning_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("partitioning", + "Partitioning", + "The raw " + "std::shared<arrow::dataset::Partitioning> *", + 
static_cast<GParamFlags>(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_PARTITIONING, spec); +} + +/** + * gadataset_partitioning_new: + * + * Returns: The newly created #GADatasetPartitioning that doesn't + * partition. + * + * It wraps the result of `arrow::dataset::Partitioning::Default()`. + * + * Since: 6.0.0 + */ +GADatasetPartitioning * +gadataset_partitioning_new(void) +{ /* The address of the local shared_ptr is handed over as the "partitioning" property. */ + auto arrow_partitioning = arrow::dataset::Partitioning::Default(); + return GADATASET_PARTITIONING( + g_object_new(GADATASET_TYPE_PARTITIONING, + "partitioning", &arrow_partitioning, + NULL)); +} + +/** + * gadataset_partitioning_get_type_name: + * @partitioning: A #GADatasetPartitioning. + * + * Returns: The type name of @partitioning. + * + * It should be freed with g_free() when no longer needed. + * + * Since: 6.0.0 + */ +gchar * +gadataset_partitioning_get_type_name(GADatasetPartitioning *partitioning) +{ /* The C++ type name is duplicated with g_strndup() using its exact size. */ + auto arrow_partitioning = gadataset_partitioning_get_raw(partitioning); + auto arrow_type_name = arrow_partitioning->type_name(); + return g_strndup(arrow_type_name.c_str(), + arrow_type_name.size()); +} + + /* GADatasetKeyValuePartitioning is registered with GADATASET_TYPE_PARTITIONING as its parent type. */ +G_DEFINE_TYPE(GADatasetKeyValuePartitioning, + gadataset_key_value_partitioning, + GADATASET_TYPE_PARTITIONING) + /* Instance init: empty body. */ +static void +gadataset_key_value_partitioning_init(GADatasetKeyValuePartitioning *object) +{ +} + /* Class init: empty body. */ +static void +gadataset_key_value_partitioning_class_init( + GADatasetKeyValuePartitioningClass *klass) +{ +} + + /* GADatasetDirectoryPartitioning is registered with GADATASET_TYPE_KEY_VALUE_PARTITIONING as its parent type. */ +G_DEFINE_TYPE(GADatasetDirectoryPartitioning, + gadataset_directory_partitioning, + GADATASET_TYPE_KEY_VALUE_PARTITIONING) + /* Instance init: empty body. */ +static void +gadataset_directory_partitioning_init(GADatasetDirectoryPartitioning *object) +{ +} + /* Class init: empty body. */ +static void +gadataset_directory_partitioning_class_init( + GADatasetDirectoryPartitioningClass *klass) +{ +} + +/** + * gadataset_directory_partitioning_new: + * @schema: A #GArrowSchema that describes all partitioned segments. + * @dictionaries: (nullable) (element-type GArrowArray): A list of #GArrowArray + * for dictionary data types in @schema.
+ * @options: (nullable): A #GADatasetPartitioningOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: The newly created #GADatasetDirectoryPartitioning on success, + * %NULL on error. + * + * Since: 6.0.0 + */ +GADatasetDirectoryPartitioning * +gadataset_directory_partitioning_new(GArrowSchema *schema, + GList *dictionaries, + GADatasetPartitioningOptions *options, + GError **error) +{ /* NOTE(review): @error is never written in this body and there is no explicit NULL return. */ + auto arrow_schema = garrow_schema_get_raw(schema); + std::vector<std::shared_ptr<arrow::Array>> arrow_dictionaries; + for (auto node = dictionaries; node; node = node->next) { /* Every list node appends exactly one element; a NULL cast result is appended as a null shared_ptr. */ + auto dictionary = GARROW_ARRAY(node->data); + if (dictionary) { + arrow_dictionaries.push_back(garrow_array_get_raw(dictionary)); + } else { + arrow_dictionaries.push_back(nullptr); + } + } /* Default-constructed options are used when @options is NULL. */ + arrow::dataset::KeyValuePartitioningOptions arrow_options; + if (options) { + arrow_options = + gadataset_partitioning_options_get_raw_key_value_partitioning_options( + options); + } + auto arrow_partitioning = + std::make_shared<arrow::dataset::DirectoryPartitioning>( + arrow_schema, + arrow_dictionaries, + arrow_options); + return GADATASET_DIRECTORY_PARTITIONING( + g_object_new(GADATASET_TYPE_DIRECTORY_PARTITIONING, + "partitioning", &arrow_partitioning, + NULL)); +} + + +G_END_DECLS + /* Builds an arrow::dataset::KeyValuePartitioningOptions from @options; only segment_encoding is copied, every other field keeps its default-constructed value. */ +arrow::dataset::KeyValuePartitioningOptions +gadataset_partitioning_options_get_raw_key_value_partitioning_options( + GADatasetPartitioningOptions *options) +{ + auto priv = GADATASET_PARTITIONING_OPTIONS_GET_PRIVATE(options); + arrow::dataset::KeyValuePartitioningOptions arrow_options; + arrow_options.segment_encoding = + static_cast<arrow::dataset::SegmentEncoding>(priv->segment_encoding); + return arrow_options; +} + /* Returns a copy of the std::shared_ptr held in the private data of @partitioning. */ +std::shared_ptr<arrow::dataset::Partitioning> +gadataset_partitioning_get_raw(GADatasetPartitioning *partitioning) +{ + auto priv = GADATASET_PARTITIONING_GET_PRIVATE(partitioning); + return priv->partitioning; +} diff --git a/src/arrow/c_glib/arrow-dataset-glib/partitioning.h
b/src/arrow/c_glib/arrow-dataset-glib/partitioning.h new file mode 100644 index 000000000..d408d9bd5 --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/partitioning.h @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <arrow-glib/arrow-glib.h> + +G_BEGIN_DECLS + +/** + * GADatasetSegmentEncoding: + * @GADATASET_SEGMENT_ENCODING_NONE: No encoding. + * @GADATASET_SEGMENT_ENCODING_URI: Segment values are URL-encoded. + * + * They correspond to `arrow::dataset::SegmentEncoding` values.
+ * + * Since: 6.0.0 + */ +typedef enum { + GADATASET_SEGMENT_ENCODING_NONE, + GADATASET_SEGMENT_ENCODING_URI, +} GADatasetSegmentEncoding; + + /* GADatasetPartitioningOptions: derivable type declared with GObject as its parent; the class struct holds only parent_class. */ +#define GADATASET_TYPE_PARTITIONING_OPTIONS \ + (gadataset_partitioning_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetPartitioningOptions, + gadataset_partitioning_options, + GADATASET, + PARTITIONING_OPTIONS, + GObject) +struct _GADatasetPartitioningOptionsClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_6_0 +GADatasetPartitioningOptions * +gadataset_partitioning_options_new(void); + + /* GADatasetPartitioning: derivable type declared with GObject as its parent; the class struct holds only parent_class. */ +#define GADATASET_TYPE_PARTITIONING (gadataset_partitioning_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetPartitioning, + gadataset_partitioning, + GADATASET, + PARTITIONING, + GObject) +struct _GADatasetPartitioningClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_6_0 +GADatasetPartitioning * +gadataset_partitioning_new(void); +GARROW_AVAILABLE_IN_6_0 +gchar * +gadataset_partitioning_get_type_name(GADatasetPartitioning *partitioning); + + /* GADatasetKeyValuePartitioning: derivable type declared with GADatasetPartitioning as its parent. */ +#define GADATASET_TYPE_KEY_VALUE_PARTITIONING \ + (gadataset_key_value_partitioning_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetKeyValuePartitioning, + gadataset_key_value_partitioning, + GADATASET, + KEY_VALUE_PARTITIONING, + GADatasetPartitioning) +struct _GADatasetKeyValuePartitioningClass +{ + GADatasetPartitioningClass parent_class; +}; + + /* GADatasetDirectoryPartitioning: derivable type declared with GADatasetKeyValuePartitioning as its parent. */ +#define GADATASET_TYPE_DIRECTORY_PARTITIONING \ + (gadataset_directory_partitioning_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetDirectoryPartitioning, + gadataset_directory_partitioning, + GADATASET, + DIRECTORY_PARTITIONING, + GADatasetKeyValuePartitioning) +struct _GADatasetDirectoryPartitioningClass +{ + GADatasetKeyValuePartitioningClass parent_class; +}; + +GARROW_AVAILABLE_IN_6_0 +GADatasetDirectoryPartitioning * +gadataset_directory_partitioning_new(GArrowSchema *schema, + GList *dictionaries, + GADatasetPartitioningOptions *options, + GError **error); + + +G_END_DECLS diff --git
a/src/arrow/c_glib/arrow-dataset-glib/partitioning.hpp b/src/arrow/c_glib/arrow-dataset-glib/partitioning.hpp new file mode 100644 index 000000000..2481ecb33 --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/partitioning.hpp @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <arrow/dataset/api.h> + +#include <arrow-dataset-glib/partitioning.h> + /* Returns, by value, the Arrow C++ key-value partitioning options built from @options. */ +arrow::dataset::KeyValuePartitioningOptions +gadataset_partitioning_options_get_raw_key_value_partitioning_options( + GADatasetPartitioningOptions *options); + /* Returns the std::shared_ptr to the Arrow C++ partitioning wrapped by @partitioning. */ +std::shared_ptr<arrow::dataset::Partitioning> +gadataset_partitioning_get_raw(GADatasetPartitioning *partitioning); diff --git a/src/arrow/c_glib/arrow-dataset-glib/scanner.cpp b/src/arrow/c_glib/arrow-dataset-glib/scanner.cpp new file mode 100644 index 000000000..51542bb0a --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/scanner.cpp @@ -0,0 +1,351 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <arrow-glib/error.hpp> +#include <arrow-glib/expression.hpp> +#include <arrow-glib/reader.hpp> +#include <arrow-glib/table.hpp> + +#include <arrow-dataset-glib/dataset.hpp> +#include <arrow-dataset-glib/scanner.hpp> + +G_BEGIN_DECLS + +/** + * SECTION: scanner + * @section_id: scanner + * @title: Scanner related classes + * @include: arrow-dataset-glib/arrow-dataset-glib.h + * + * #GADatasetScanner is a class for scanning a dataset. + * + * #GADatasetScannerBuilder is a class for building a scanner.
+ * + * Since: 5.0.0 + */ + /* Private instance data: the wrapped arrow::dataset::Scanner. */ +typedef struct GADatasetScannerPrivate_ { + std::shared_ptr<arrow::dataset::Scanner> scanner; +} GADatasetScannerPrivate; + +enum { + PROP_SCANNER = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetScanner, + gadataset_scanner, + G_TYPE_OBJECT) + +#define GADATASET_SCANNER_GET_PRIVATE(obj) \ + static_cast<GADatasetScannerPrivate *>( \ + gadataset_scanner_get_instance_private( \ + GADATASET_SCANNER(obj))) + /* finalize handler: runs the shared_ptr destructor explicitly, then chains up to the parent class finalize. */ +static void +gadataset_scanner_finalize(GObject *object) +{ + auto priv = GADATASET_SCANNER_GET_PRIVATE(object); + priv->scanner.~shared_ptr(); + G_OBJECT_CLASS(gadataset_scanner_parent_class)->finalize(object); +} + /* set_property handler: PROP_SCANNER copies the std::shared_ptr that the pointer-typed GValue refers to; any other id is reported with G_OBJECT_WARN_INVALID_PROPERTY_ID. */ +static void +gadataset_scanner_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_SCANNER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_SCANNER: + priv->scanner = + *static_cast<std::shared_ptr<arrow::dataset::Scanner> *>( + g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + /* Instance init: constructs the shared_ptr member in place (placement new) inside the private data. */ +static void +gadataset_scanner_init(GADatasetScanner *object) +{ + auto priv = GADATASET_SCANNER_GET_PRIVATE(object); + new(&priv->scanner) std::shared_ptr<arrow::dataset::Scanner>; +} + /* Class init: wires the finalize and set_property handlers and installs the write-only, construct-only "scanner" pointer property. */ +static void +gadataset_scanner_class_init(GADatasetScannerClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = gadataset_scanner_finalize; + gobject_class->set_property = gadataset_scanner_set_property; + /* NOTE(review): the blurb below spells the type as std::shared, while the stored value is a std::shared_ptr. */ + GParamSpec *spec; + spec = g_param_spec_pointer("scanner", + "Scanner", + "The raw std::shared<arrow::dataset::Scanner> *", + static_cast<GParamFlags>(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_SCANNER, spec); +} + +/** + * gadataset_scanner_to_table: + * @scanner: A #GADatasetScanner. + * @error: (nullable): Return location for a #GError or %NULL.
+ * + * Returns: (transfer full) (nullable): + * A newly created #GArrowTable on success, %NULL on error. + * + * Since: 5.0.0 + */ +GArrowTable * +gadataset_scanner_to_table(GADatasetScanner *scanner, + GError **error) +{ /* The ToTable() result is dereferenced only when garrow::check() returns true; otherwise NULL is returned. */ + auto arrow_scanner = gadataset_scanner_get_raw(scanner); + auto arrow_table_result = arrow_scanner->ToTable(); + if (garrow::check(error, arrow_table_result, "[scanner][to-table]")) { + auto arrow_table = *arrow_table_result; + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } +} + + /* Private instance data: the wrapped arrow::dataset::ScannerBuilder. */ +typedef struct GADatasetScannerBuilderPrivate_ { + std::shared_ptr<arrow::dataset::ScannerBuilder> scanner_builder; +} GADatasetScannerBuilderPrivate; + +enum { + PROP_SCANNER_BUILDER = 1, + PROP_USE_ASYNC, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetScannerBuilder, + gadataset_scanner_builder, + G_TYPE_OBJECT) + +#define GADATASET_SCANNER_BUILDER_GET_PRIVATE(obj) \ + static_cast<GADatasetScannerBuilderPrivate *>( \ + gadataset_scanner_builder_get_instance_private( \ + GADATASET_SCANNER_BUILDER(obj))) + /* finalize handler: runs the shared_ptr destructor explicitly, then chains up to the parent class finalize. */ +static void +gadataset_scanner_builder_finalize(GObject *object) +{ + auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(object); + priv->scanner_builder.~shared_ptr(); + G_OBJECT_CLASS(gadataset_scanner_builder_parent_class)->finalize(object); +} + /* set_property handler: PROP_SCANNER_BUILDER copies the std::shared_ptr behind the pointer-typed GValue; PROP_USE_ASYNC forwards the boolean to ScannerBuilder::UseAsync(). */ +static void +gadataset_scanner_builder_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_SCANNER_BUILDER: + priv->scanner_builder = + *static_cast<std::shared_ptr<arrow::dataset::ScannerBuilder> *>( + g_value_get_pointer(value)); + break; + case PROP_USE_ASYNC: /* nullptr is passed as the error location and the check result is discarded, so a failing UseAsync() is not reported to the caller. NOTE(review): dereferences scanner_builder, so it presumably must already be set - confirm. */ + garrow::check(nullptr, + priv->scanner_builder->UseAsync(g_value_get_boolean(value)), + "[scanner-builder][use-async][set]"); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + /* Instance init for GADatasetScannerBuilder. */ +static void +gadataset_scanner_builder_init(GADatasetScannerBuilder *object)
+{ /* Constructs the shared_ptr member in place (placement new) inside the private data. */ + auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(object); + new(&priv->scanner_builder) std::shared_ptr<arrow::dataset::ScannerBuilder>; +} + /* Class init: wires the finalize and set_property handlers and installs the "scanner-builder" and "use-async" properties. */ +static void +gadataset_scanner_builder_class_init(GADatasetScannerBuilderClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = gadataset_scanner_builder_finalize; + gobject_class->set_property = gadataset_scanner_builder_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("scanner-builder", + "Scanner builder", + "The raw " + "std::shared<arrow::dataset::ScannerBuilder> *", + static_cast<GParamFlags>(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_SCANNER_BUILDER, spec); + /* Default-constructed only so that the property default can be read from its use_async field. */ + arrow::dataset::ScanOptions default_options; + /** + * GADatasetScannerBuilder:use-async: + * + * Whether or not async mode is used. + * + * The property is installed with %G_PARAM_WRITABLE only, so it is write-only. + * + * Since: 6.0.0 + */ + spec = g_param_spec_boolean("use-async", + "Use async", + "Whether or not async mode is used", + default_options.use_async, + static_cast<GParamFlags>(G_PARAM_WRITABLE)); + g_object_class_install_property(gobject_class, PROP_USE_ASYNC, spec); +} + +/** + * gadataset_scanner_builder_new: + * @dataset: A #GADatasetDataset to be scanned. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GADatasetScannerBuilder on success, + * %NULL on error.
+ * + * Since: 5.0.0 + */ +GADatasetScannerBuilder * +gadataset_scanner_builder_new(GADatasetDataset *dataset, GError **error) +{ /* The NewScan() result is dereferenced only when garrow::check() returns true; otherwise NULL is returned. */ + auto arrow_dataset = gadataset_dataset_get_raw(dataset); + auto arrow_scanner_builder_result = arrow_dataset->NewScan(); + if (garrow::check(error, + arrow_scanner_builder_result, + "[scanner-builder][new]")) { + auto arrow_scanner_builder = *arrow_scanner_builder_result; + return gadataset_scanner_builder_new_raw(&arrow_scanner_builder); + } else { + return NULL; + } +} + +/** + * gadataset_scanner_builder_new_record_batch_reader: + * @reader: A #GArrowRecordBatchReader that produces record batches. + * + * Returns: (nullable): A newly created #GADatasetScannerBuilder. + * + * Since: 6.0.0 + */ +GADatasetScannerBuilder * +gadataset_scanner_builder_new_record_batch_reader( + GArrowRecordBatchReader *reader) +{ /* NOTE(review): the return is documented as nullable, but this body has no explicit NULL return. */ + auto arrow_reader = garrow_record_batch_reader_get_raw(reader); + auto arrow_scanner_builder = + arrow::dataset::ScannerBuilder::FromRecordBatchReader(arrow_reader); + return gadataset_scanner_builder_new_raw(&arrow_scanner_builder); +} + +/** + * gadataset_scanner_builder_set_filter: + * @builder: A #GADatasetScannerBuilder. + * @expression: A #GArrowExpression to filter rows with. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 6.0.0 + */ +gboolean +gadataset_scanner_builder_set_filter(GADatasetScannerBuilder *builder, + GArrowExpression *expression, + GError **error) +{ /* The value returned by garrow::check() on the Filter() call is the return value of this function. */ + auto arrow_builder = gadataset_scanner_builder_get_raw(builder); + auto arrow_expression = garrow_expression_get_raw(expression); + return garrow::check(error, + arrow_builder->Filter(*arrow_expression), + "[scanner-builder][filter][set]"); +} + +/** + * gadataset_scanner_builder_finish: + * @builder: A #GADatasetScannerBuilder. + * @error: (nullable): Return location for a #GError or %NULL.
+ * + * Returns: (transfer full) (nullable): + * A newly created #GADatasetScanner on success, %NULL on error. + * + * Since: 5.0.0 + */ +GADatasetScanner * +gadataset_scanner_builder_finish(GADatasetScannerBuilder *builder, + GError **error) +{ /* The Finish() result is dereferenced only when garrow::check() returns true; otherwise NULL is returned. */ + auto arrow_builder = gadataset_scanner_builder_get_raw(builder); + auto arrow_scanner_result = arrow_builder->Finish(); + if (garrow::check(error, arrow_scanner_result, "[scanner-builder][finish]")) { + auto arrow_scanner = *arrow_scanner_result; + return gadataset_scanner_new_raw(&arrow_scanner); + } else { + return NULL; + } +} + + +G_END_DECLS + /* Creates a GADatasetScanner, passing @arrow_scanner as the "scanner" property. */ +GADatasetScanner * +gadataset_scanner_new_raw( + std::shared_ptr<arrow::dataset::Scanner> *arrow_scanner) +{ + auto scanner = + GADATASET_SCANNER(g_object_new(GADATASET_TYPE_SCANNER, + "scanner", arrow_scanner, + NULL)); + return scanner; +} + /* Returns a copy of the std::shared_ptr held in the private data of @scanner. */ +std::shared_ptr<arrow::dataset::Scanner> +gadataset_scanner_get_raw(GADatasetScanner *scanner) +{ + auto priv = GADATASET_SCANNER_GET_PRIVATE(scanner); + return priv->scanner; +} + /* Creates a GADatasetScannerBuilder, passing @arrow_scanner_builder as the "scanner-builder" property. */ +GADatasetScannerBuilder * +gadataset_scanner_builder_new_raw( + std::shared_ptr<arrow::dataset::ScannerBuilder> *arrow_scanner_builder) +{ + return GADATASET_SCANNER_BUILDER( + g_object_new(GADATASET_TYPE_SCANNER_BUILDER, + "scanner-builder", arrow_scanner_builder, + NULL)); +} + /* Returns a copy of the std::shared_ptr held in the private data of @scanner_builder. */ +std::shared_ptr<arrow::dataset::ScannerBuilder> +gadataset_scanner_builder_get_raw(GADatasetScannerBuilder *scanner_builder) +{ + auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(scanner_builder); + return priv->scanner_builder; +} diff --git a/src/arrow/c_glib/arrow-dataset-glib/scanner.h b/src/arrow/c_glib/arrow-dataset-glib/scanner.h new file mode 100644 index 000000000..59da2577d --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/scanner.h @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <arrow-dataset-glib/dataset.h> +#include <arrow-dataset-glib/fragment.h> + +G_BEGIN_DECLS + /* GADatasetScanner: derivable type declared with GObject as its parent; the class struct holds only parent_class. */ +#define GADATASET_TYPE_SCANNER (gadataset_scanner_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetScanner, + gadataset_scanner, + GADATASET, + SCANNER, + GObject) +struct _GADatasetScannerClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowTable * +gadataset_scanner_to_table(GADatasetScanner *scanner, + GError **error); + /* GADatasetScannerBuilder: derivable type declared with GObject as its parent; the class struct holds only parent_class. */ +#define GADATASET_TYPE_SCANNER_BUILDER (gadataset_scanner_builder_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetScannerBuilder, + gadataset_scanner_builder, + GADATASET, + SCANNER_BUILDER, + GObject) +struct _GADatasetScannerBuilderClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GADatasetScannerBuilder * +gadataset_scanner_builder_new(GADatasetDataset *dataset, + GError **error); +GARROW_AVAILABLE_IN_6_0 +GADatasetScannerBuilder * +gadataset_scanner_builder_new_record_batch_reader( + GArrowRecordBatchReader *reader); + +GARROW_AVAILABLE_IN_6_0 +gboolean +gadataset_scanner_builder_set_filter(GADatasetScannerBuilder *builder, + GArrowExpression *expression, + GError **error); + +GARROW_AVAILABLE_IN_5_0 +GADatasetScanner * +gadataset_scanner_builder_finish(GADatasetScannerBuilder *builder,
+ GError **error); + +G_END_DECLS diff --git a/src/arrow/c_glib/arrow-dataset-glib/scanner.hpp b/src/arrow/c_glib/arrow-dataset-glib/scanner.hpp new file mode 100644 index 000000000..663ab6fc4 --- /dev/null +++ b/src/arrow/c_glib/arrow-dataset-glib/scanner.hpp @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <arrow/dataset/api.h> + +#include <arrow-dataset-glib/fragment.h> +#include <arrow-dataset-glib/scanner.h> + /* Conversions between GADatasetScanner and the Arrow C++ std::shared_ptr it wraps. */ +GADatasetScanner * +gadataset_scanner_new_raw( + std::shared_ptr<arrow::dataset::Scanner> *arrow_scanner); +std::shared_ptr<arrow::dataset::Scanner> +gadataset_scanner_get_raw(GADatasetScanner *scanner); + /* Conversions between GADatasetScannerBuilder and the Arrow C++ std::shared_ptr it wraps. */ +GADatasetScannerBuilder * +gadataset_scanner_builder_new_raw( + std::shared_ptr<arrow::dataset::ScannerBuilder> *arrow_scanner_builder); +std::shared_ptr<arrow::dataset::ScannerBuilder> +gadataset_scanner_builder_get_raw(GADatasetScannerBuilder *scanner_builder); |