/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ #include #include #include #include #include G_BEGIN_DECLS /** * SECTION: partitioning * @section_id: partitioning * @title: Partitioning classes * @include: arrow-dataset-glib/arrow-dataset-glib.h * * #GADatasetPartitioningOptions is a class for partitioning options. * * #GADatasetPartitioning is a base class for partitioning classes * such as #GADatasetDirectoryPartitioning. * * #GADatasetKeyValuePartitioning is a base class for key-value style * partitioning classes such as #GADatasetDirectoryPartitioning. * * #GADatasetDirectoryPartitioning is a class for partitioning that * uses directory structure. * * Since: 6.0.0 */ typedef struct GADatasetPartitioningOptionsPrivate_ { gboolean infer_dictionary; GArrowSchema *schema; GADatasetSegmentEncoding segment_encoding; } GADatasetPartitioningOptionsPrivate; enum { PROP_INFER_DICTIONARY = 1, PROP_SCHEMA, PROP_SEGMENT_ENCODING, }; G_DEFINE_TYPE_WITH_PRIVATE(GADatasetPartitioningOptions, gadataset_partitioning_options, G_TYPE_OBJECT) #define GADATASET_PARTITIONING_OPTIONS_GET_PRIVATE(obj) \ static_cast( \ gadataset_partitioning_options_get_instance_private( \ GADATASET_PARTITIONING_OPTIONS(obj))) static void gadataset_partitioning_options_dispose(GObject *object) { auto priv = GADATASET_PARTITIONING_OPTIONS_GET_PRIVATE(object); if (priv->schema) { g_object_unref(priv->schema); priv->schema = nullptr; } G_OBJECT_CLASS(gadataset_partitioning_options_parent_class)->dispose(object); } static void gadataset_partitioning_options_set_property(GObject *object, guint prop_id, const GValue *value, GParamSpec *pspec) { auto priv = GADATASET_PARTITIONING_OPTIONS_GET_PRIVATE(object); switch (prop_id) { case PROP_INFER_DICTIONARY: priv->infer_dictionary = g_value_get_boolean(value); break; case PROP_SCHEMA: { auto schema = g_value_get_object(value); if (priv->schema == schema) { break; } auto old_schema = priv->schema; if (schema) { g_object_ref(schema); priv->schema = GARROW_SCHEMA(schema); } else { priv->schema = NULL; } if (old_schema) { g_object_unref(old_schema); } } break; case PROP_SEGMENT_ENCODING: priv->segment_encoding = static_cast(g_value_get_enum(value)); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; } } static void gadataset_partitioning_options_get_property(GObject *object, guint prop_id, GValue *value, GParamSpec *pspec) { auto priv = GADATASET_PARTITIONING_OPTIONS_GET_PRIVATE(object); switch (prop_id) { case PROP_INFER_DICTIONARY: g_value_set_boolean(value, priv->infer_dictionary); break; case PROP_SCHEMA: g_value_set_object(value, priv->schema); break; case PROP_SEGMENT_ENCODING: g_value_set_enum(value, priv->segment_encoding); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; } } static void gadataset_partitioning_options_init(GADatasetPartitioningOptions *object) { } static void gadataset_partitioning_options_class_init( GADatasetPartitioningOptionsClass *klass) { auto gobject_class = G_OBJECT_CLASS(klass); gobject_class->dispose = gadataset_partitioning_options_dispose; gobject_class->set_property = gadataset_partitioning_options_set_property; gobject_class->get_property = gadataset_partitioning_options_get_property; arrow::dataset::PartitioningFactoryOptions default_options; GParamSpec *spec; /** * GADatasetPartitioningOptions:infer-dictionary: * * When inferring a schema for partition fields, yield dictionary * encoded types instead of plain. This can be more efficient when * materializing virtual columns, and Expressions parsed by the * finished Partitioning will include dictionaries of all unique * inspected values for each field. * * Since: 6.0.0 */ spec = g_param_spec_boolean("infer-dictionary", "Infer dictionary", "Whether encode partitioned field values as " "dictionary", default_options.infer_dictionary, static_cast(G_PARAM_READWRITE)); g_object_class_install_property(gobject_class, PROP_INFER_DICTIONARY, spec); /** * GADatasetPartitioningOptions:schema: * * Optionally, an expected schema can be provided, in which case * inference will only check discovered fields against the schema * and update internal state (such as dictionaries). * * Since: 6.0.0 */ spec = g_param_spec_object("schema", "Schema", "Inference will only check discovered fields " "against the schema and update internal state", GARROW_TYPE_SCHEMA, static_cast(G_PARAM_READWRITE)); g_object_class_install_property(gobject_class, PROP_SCHEMA, spec); /** * GADatasetPartitioningOptions:segment-encoding: * * After splitting a path into components, decode the path * components before parsing according to this scheme. * * Since: 6.0.0 */ spec = g_param_spec_enum("segment-encoding", "Segment encoding", "After splitting a path into components, " "decode the path components before " "parsing according to this scheme", GADATASET_TYPE_SEGMENT_ENCODING, static_cast( default_options.segment_encoding), static_cast(G_PARAM_READWRITE)); g_object_class_install_property(gobject_class, PROP_SEGMENT_ENCODING, spec); } /** * gadataset_partitioning_options_new: * * Returns: The newly created #GADatasetPartitioningOptions. * * Since: 6.0.0 */ GADatasetPartitioningOptions * gadataset_partitioning_options_new(void) { return GADATASET_PARTITIONING_OPTIONS( g_object_new(GADATASET_TYPE_PARTITIONING_OPTIONS, NULL)); } typedef struct GADatasetPartitioningPrivate_ { std::shared_ptr partitioning; } GADatasetPartitioningPrivate; enum { PROP_PARTITIONING = 1, }; G_DEFINE_TYPE_WITH_PRIVATE(GADatasetPartitioning, gadataset_partitioning, G_TYPE_OBJECT) #define GADATASET_PARTITIONING_GET_PRIVATE(obj) \ static_cast( \ gadataset_partitioning_get_instance_private( \ GADATASET_PARTITIONING(obj))) static void gadataset_partitioning_finalize(GObject *object) { auto priv = GADATASET_PARTITIONING_GET_PRIVATE(object); priv->partitioning.~shared_ptr(); G_OBJECT_CLASS(gadataset_partitioning_parent_class)->finalize(object); } static void gadataset_partitioning_set_property(GObject *object, guint prop_id, const GValue *value, GParamSpec *pspec) { auto priv = GADATASET_PARTITIONING_GET_PRIVATE(object); switch (prop_id) { case PROP_PARTITIONING: priv->partitioning = *static_cast *>( g_value_get_pointer(value)); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; } } static void gadataset_partitioning_init(GADatasetPartitioning *object) { auto priv = GADATASET_PARTITIONING_GET_PRIVATE(object); new(&priv->partitioning) std::shared_ptr; } static void gadataset_partitioning_class_init(GADatasetPartitioningClass *klass) { auto gobject_class = G_OBJECT_CLASS(klass); gobject_class->finalize = gadataset_partitioning_finalize; gobject_class->set_property = gadataset_partitioning_set_property; GParamSpec *spec; spec = g_param_spec_pointer("partitioning", "Partitioning", "The raw " "std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_PARTITIONING, spec); } /** * gadataset_partitioning_new: * * Returns: The newly created #GADatasetPartitioning that doesn't * partition. * * Since: 6.0.0 */ GADatasetPartitioning * gadataset_partitioning_new(void) { auto arrow_partitioning = arrow::dataset::Partitioning::Default(); return GADATASET_PARTITIONING( g_object_new(GADATASET_TYPE_PARTITIONING, "partitioning", &arrow_partitioning, NULL)); } /** * gadataset_partitioning_get_type_name: * @partitioning: A #GADatasetPartitioning. * * Returns: The type name of @partitioning. * * It should be freed with g_free() when no longer needed. * * Since: 6.0.0 */ gchar * gadataset_partitioning_get_type_name(GADatasetPartitioning *partitioning) { auto arrow_partitioning = gadataset_partitioning_get_raw(partitioning); auto arrow_type_name = arrow_partitioning->type_name(); return g_strndup(arrow_type_name.c_str(), arrow_type_name.size()); } G_DEFINE_TYPE(GADatasetKeyValuePartitioning, gadataset_key_value_partitioning, GADATASET_TYPE_PARTITIONING) static void gadataset_key_value_partitioning_init(GADatasetKeyValuePartitioning *object) { } static void gadataset_key_value_partitioning_class_init( GADatasetKeyValuePartitioningClass *klass) { } G_DEFINE_TYPE(GADatasetDirectoryPartitioning, gadataset_directory_partitioning, GADATASET_TYPE_KEY_VALUE_PARTITIONING) static void gadataset_directory_partitioning_init(GADatasetDirectoryPartitioning *object) { } static void gadataset_directory_partitioning_class_init( GADatasetDirectoryPartitioningClass *klass) { } /** * gadataset_directory_partitioning_new: * @schema: A #GArrowSchema that describes all partitioned segments. * @dictionaries: (nullable) (element-type GArrowArray): A list of #GArrowArray * for dictionary data types in @schema. * @options: (nullable): A #GADatasetPartitioningOptions. * @error: (nullable): Return location for a #GError or %NULL. * * Returns: The newly created #GADatasetDirectoryPartitioning on success, * %NULL on error. * * Since: 6.0.0 */ GADatasetDirectoryPartitioning * gadataset_directory_partitioning_new(GArrowSchema *schema, GList *dictionaries, GADatasetPartitioningOptions *options, GError **error) { auto arrow_schema = garrow_schema_get_raw(schema); std::vector> arrow_dictionaries; for (auto node = dictionaries; node; node = node->next) { auto dictionary = GARROW_ARRAY(node->data); if (dictionary) { arrow_dictionaries.push_back(garrow_array_get_raw(dictionary)); } else { arrow_dictionaries.push_back(nullptr); } } arrow::dataset::KeyValuePartitioningOptions arrow_options; if (options) { arrow_options = gadataset_partitioning_options_get_raw_key_value_partitioning_options( options); } auto arrow_partitioning = std::make_shared( arrow_schema, arrow_dictionaries, arrow_options); return GADATASET_DIRECTORY_PARTITIONING( g_object_new(GADATASET_TYPE_DIRECTORY_PARTITIONING, "partitioning", &arrow_partitioning, NULL)); } G_END_DECLS arrow::dataset::KeyValuePartitioningOptions gadataset_partitioning_options_get_raw_key_value_partitioning_options( GADatasetPartitioningOptions *options) { auto priv = GADATASET_PARTITIONING_OPTIONS_GET_PRIVATE(options); arrow::dataset::KeyValuePartitioningOptions arrow_options; arrow_options.segment_encoding = static_cast(priv->segment_encoding); return arrow_options; } std::shared_ptr gadataset_partitioning_get_raw(GADatasetPartitioning *partitioning) { auto priv = GADATASET_PARTITIONING_GET_PRIVATE(partitioning); return priv->partitioning; }