# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

#' Define Partitioning for a Dataset
#'
#' @description
#' Pass a `Partitioning` object to a [FileSystemDatasetFactory]'s `$create()`
#' method to indicate how the file's paths should be interpreted to define
#' partitioning.
#'
#' `DirectoryPartitioning` describes how to interpret raw path segments, in
#' order. For example, `schema(year = int16(), month = int8())` would define
#' partitions for file paths like "2019/01/file.parquet",
#' "2019/02/file.parquet", etc. In this scheme `NULL` values will be skipped. In
#' the previous example: when writing a dataset if the month was `NA` (or
#' `NULL`), the files would be placed in "2019/file.parquet". When reading, the
#' rows in "2019/file.parquet" would return an `NA` for the month column. An
#' error will be raised if an outer directory is `NULL` and an inner directory
#' is not.
#'
#' `HivePartitioning` is for Hive-style partitioning, which embeds field
#' names and values in path segments, such as
#' "/year=2019/month=2/data.parquet". Because fields are named in the path
#' segments, order does not matter. This partitioning scheme allows `NULL`
#' values. They will be replaced by a configurable `null_fallback` which
#' defaults to the string `"__HIVE_DEFAULT_PARTITION__"` when writing. When
#' reading, the `null_fallback` string will be replaced with `NA`s as
#' appropriate.
#'
#' `PartitioningFactory` subclasses instruct the `DatasetFactory` to detect
#' partition features from the file paths.
#' @section Factory:
#' Both `DirectoryPartitioning$create()` and `HivePartitioning$create()`
#' methods take a [Schema] as their first argument; both also accept
#' `segment_encoding`, and `HivePartitioning$create()` additionally accepts
#' `null_fallback`. The helper
#' function [`hive_partition(...)`][hive_partition] is shorthand for
#' `HivePartitioning$create(schema(...))`.
#'
#' With `DirectoryPartitioningFactory$create()`, you can provide just the
#' names of the path segments (in our example, `c("year", "month")`), and
#' the `DatasetFactory` will infer the data types for those partition variables.
#' `HivePartitioningFactory$create()` takes no arguments: both variable names
#' and their types can be inferred from the file paths. `hive_partition()` with
#' no arguments returns a `HivePartitioningFactory`.
#' @name Partitioning
#' @rdname Partitioning
#' @export
Partitioning <- R6Class(
  classname = "Partitioning",
  inherit = ArrowObject
)

#' @usage NULL
#' @format NULL
#' @rdname Partitioning
#' @export
DirectoryPartitioning <- R6Class(
  classname = "DirectoryPartitioning",
  inherit = Partitioning
)

# Build a DirectoryPartitioning from a schema; the actual construction is
# delegated to the `dataset___DirectoryPartitioning()` binding.
DirectoryPartitioning$create <- function(schm, segment_encoding = "uri") {
  dataset___DirectoryPartitioning(
    schm,
    segment_encoding = segment_encoding
  )
}

#' @usage NULL
#' @format NULL
#' @rdname Partitioning
#' @export
HivePartitioning <- R6Class(
  classname = "HivePartitioning",
  inherit = Partitioning
)

# Build a HivePartitioning from a schema. A `NULL` `null_fallback` is swapped
# for the package default by `null_fallback_or_default()` before the
# `dataset___HivePartitioning()` binding is called.
HivePartitioning$create <- function(schm,
                                    null_fallback = NULL,
                                    segment_encoding = "uri") {
  dataset___HivePartitioning(
    schm,
    null_fallback = null_fallback_or_default(null_fallback),
    segment_encoding = segment_encoding
  )
}

#' Construct Hive partitioning
#'
#' Hive partitioning embeds field names and values in path segments, such as
#' "/year=2019/month=2/data.parquet".
#'
#' Because fields are named in the path segments, order of fields passed to
#' `hive_partition()` does not matter.
#' @param ... named list of [data types][data-type], passed to [schema()]
#' @param null_fallback character to be used in place of missing values (`NA` or `NULL`)
#' in partition columns. Default is `"__HIVE_DEFAULT_PARTITION__"`,
#' which is what Hive uses.
#' @param segment_encoding Decode partition segments after splitting paths.
#' Default is `"uri"` (URI-decode segments). May also be `"none"` (leave as-is).
#' @return A [HivePartitioning][Partitioning], or a `HivePartitioningFactory` if
#' calling `hive_partition()` with no arguments.
#' @examplesIf arrow_with_dataset()
#' hive_partition(year = int16(), month = int8())
#' @export
hive_partition <- function(..., null_fallback = NULL, segment_encoding = "uri") {
  schm <- schema(...)
  if (length(schm) == 0) {
    # No fields were supplied: return a factory so that field names and types
    # are discovered from the file paths instead.
    HivePartitioningFactory$create(null_fallback, segment_encoding)
  } else {
    HivePartitioning$create(schm, null_fallback, segment_encoding)
  }
}

# Common parent of the partitioning factory classes defined below.
PartitioningFactory <- R6Class("PartitioningFactory", inherit = ArrowObject)

#' @usage NULL
#' @format NULL
#' @rdname Partitioning
#' @export
# The class name must match the generator name exactly. It previously carried
# a trailing space ("DirectoryPartitioningFactory "), which ended up in the S3
# class of instances and made
# `inherits(x, "DirectoryPartitioningFactory")` return FALSE.
DirectoryPartitioningFactory <- R6Class(
  "DirectoryPartitioningFactory",
  inherit = PartitioningFactory
)

# Create a factory from the names of the path segments; delegates to the
# `dataset___DirectoryPartitioning__MakeFactory()` binding.
DirectoryPartitioningFactory$create <- function(field_names, segment_encoding = "uri") {
  dataset___DirectoryPartitioning__MakeFactory(field_names, segment_encoding)
}

#' @usage NULL
#' @format NULL
#' @rdname Partitioning
#' @export
HivePartitioningFactory <- R6Class("HivePartitioningFactory", inherit = PartitioningFactory)

# Create a Hive partitioning factory. A `NULL` `null_fallback` is replaced by
# the default string before the
# `dataset___HivePartitioning__MakeFactory()` binding is called.
HivePartitioningFactory$create <- function(null_fallback = NULL, segment_encoding = "uri") {
  dataset___HivePartitioning__MakeFactory(null_fallback_or_default(null_fallback), segment_encoding)
}

# Return `null_fallback` unless it is `NULL`, in which case return the default
# placeholder string "__HIVE_DEFAULT_PARTITION__".
null_fallback_or_default <- function(null_fallback) {
  null_fallback %||% "__HIVE_DEFAULT_PARTITION__"
}