diff options
Diffstat (limited to 'src/arrow/r/R/dataset-partition.R')
-rw-r--r-- | src/arrow/r/R/dataset-partition.R | 132 |
1 files changed, 132 insertions, 0 deletions
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

#' Define Partitioning for a Dataset
#'
#' @description
#' Pass a `Partitioning` object to a [FileSystemDatasetFactory]'s `$create()`
#' method to indicate how the file's paths should be interpreted to define
#' partitioning.
#'
#' `DirectoryPartitioning` describes how to interpret raw path segments, in
#' order. For example, `schema(year = int16(), month = int8())` would define
#' partitions for file paths like "2019/01/file.parquet",
#' "2019/02/file.parquet", etc. In this scheme `NULL` values will be skipped. In
#' the previous example: when writing a dataset if the month was `NA` (or
#' `NULL`), the files would be placed in "2019/file.parquet". When reading, the
#' rows in "2019/file.parquet" would return an `NA` for the month column. An
#' error will be raised if an outer directory is `NULL` and an inner directory
#' is not.
#'
#' `HivePartitioning` is for Hive-style partitioning, which embeds field
#' names and values in path segments, such as
#' "/year=2019/month=2/data.parquet". Because fields are named in the path
#' segments, order does not matter. This partitioning scheme allows `NULL`
#' values. They will be replaced by a configurable `null_fallback` which
#' defaults to the string `"__HIVE_DEFAULT_PARTITION__"` when writing. When
#' reading, the `null_fallback` string will be replaced with `NA`s as
#' appropriate.
#'
#' `PartitioningFactory` subclasses instruct the `DatasetFactory` to detect
#' partition features from the file paths.
#' @section Factory:
#' Both `DirectoryPartitioning$create()` and `HivePartitioning$create()`
#' methods take a [Schema] as their first argument. Both also accept an
#' optional `segment_encoding` argument (default `"uri"`), and
#' `HivePartitioning$create()` additionally accepts an optional
#' `null_fallback` (default `"__HIVE_DEFAULT_PARTITION__"`). The helper
#' function [`hive_partition(...)`][hive_partition] is shorthand for
#' `HivePartitioning$create(schema(...))`.
#'
#' With `DirectoryPartitioningFactory$create()`, you can provide just the
#' names of the path segments (in our example, `c("year", "month")`), and
#' the `DatasetFactory` will infer the data types for those partition variables.
#' `HivePartitioningFactory$create()` requires no arguments: both variable names
#' and their types can be inferred from the file paths. It optionally accepts
#' `null_fallback` and `segment_encoding`. `hive_partition()` with
#' no arguments returns a `HivePartitioningFactory`.
#' @name Partitioning
#' @rdname Partitioning
#' @export
Partitioning <- R6Class("Partitioning", inherit = ArrowObject)

#' @usage NULL
#' @format NULL
#' @rdname Partitioning
#' @export
DirectoryPartitioning <- R6Class("DirectoryPartitioning", inherit = Partitioning)

# Build a DirectoryPartitioning from a schema. The schema and the
# `segment_encoding` setting are handed unchanged to
# `dataset___DirectoryPartitioning()`, which is defined outside this file.
DirectoryPartitioning$create <- function(schm, segment_encoding = "uri") {
  dataset___DirectoryPartitioning(schm, segment_encoding = segment_encoding)
}

#' @usage NULL
#' @format NULL
#' @rdname Partitioning
#' @export
HivePartitioning <- R6Class("HivePartitioning", inherit = Partitioning)

# Build a HivePartitioning from a schema. A `NULL` `null_fallback` is replaced
# by the default placeholder string before `dataset___HivePartitioning()`
# (defined outside this file) is called.
HivePartitioning$create <- function(schm, null_fallback = NULL, segment_encoding = "uri") {
  dataset___HivePartitioning(schm,
    null_fallback = null_fallback_or_default(null_fallback),
    segment_encoding = segment_encoding
  )
}

#' Construct Hive partitioning
#'
#' Hive partitioning embeds field names and values in path segments, such as
#' "/year=2019/month=2/data.parquet".
#'
#' Because fields are named in the path segments, order of fields passed to
#' `hive_partition()` does not matter.
#' @param ... named list of [data types][data-type], passed to [schema()]
#' @param null_fallback character to be used in place of missing values (`NA` or `NULL`)
#' in partition columns. Default is `"__HIVE_DEFAULT_PARTITION__"`,
#' which is what Hive uses.
#' @param segment_encoding Decode partition segments after splitting paths.
#' Default is `"uri"` (URI-decode segments). May also be `"none"` (leave as-is).
#' @return A [HivePartitioning][Partitioning], or a `HivePartitioningFactory` if
#' calling `hive_partition()` with no arguments.
#' @examplesIf arrow_with_dataset()
#' hive_partition(year = int16(), month = int8())
#' @export
hive_partition <- function(..., null_fallback = NULL, segment_encoding = "uri") {
  schm <- schema(...)
  if (length(schm) == 0) {
    # No fields were supplied: return a factory instead of a fixed partitioning.
    HivePartitioningFactory$create(
      null_fallback = null_fallback,
      segment_encoding = segment_encoding
    )
  } else {
    HivePartitioning$create(
      schm,
      null_fallback = null_fallback,
      segment_encoding = segment_encoding
    )
  }
}

PartitioningFactory <- R6Class("PartitioningFactory", inherit = ArrowObject)

#' @usage NULL
#' @format NULL
#' @rdname Partitioning
#' @export
# The classname must match the generator name exactly (no stray whitespace);
# otherwise `inherits(x, "DirectoryPartitioningFactory")` is FALSE.
DirectoryPartitioningFactory <- R6Class("DirectoryPartitioningFactory", inherit = PartitioningFactory)

# Build a factory from the partition field names only; `field_names` and
# `segment_encoding` are passed positionally to
# `dataset___DirectoryPartitioning__MakeFactory()` (defined outside this file).
DirectoryPartitioningFactory$create <- function(field_names, segment_encoding = "uri") {
  dataset___DirectoryPartitioning__MakeFactory(field_names, segment_encoding)
}

#' @usage NULL
#' @format NULL
#' @rdname Partitioning
#' @export
HivePartitioningFactory <- R6Class("HivePartitioningFactory", inherit = PartitioningFactory)

# Build a Hive partitioning factory. A `NULL` `null_fallback` is replaced by
# the default placeholder string before the call to
# `dataset___HivePartitioning__MakeFactory()` (defined outside this file).
HivePartitioningFactory$create <- function(null_fallback = NULL, segment_encoding = "uri") {
  dataset___HivePartitioning__MakeFactory(null_fallback_or_default(null_fallback), segment_encoding)
}

# Return `null_fallback` unless it is `NULL`, in which case return the
# default placeholder string "__HIVE_DEFAULT_PARTITION__".
null_fallback_or_default <- function(null_fallback) {
  null_fallback %||% "__HIVE_DEFAULT_PARTITION__"
}