summaryrefslogtreecommitdiffstats
path: root/src/arrow/r/R/dataset-partition.R
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/arrow/r/R/dataset-partition.R132
1 files changed, 132 insertions, 0 deletions
diff --git a/src/arrow/r/R/dataset-partition.R b/src/arrow/r/R/dataset-partition.R
new file mode 100644
index 000000000..35d5bc00c
--- /dev/null
+++ b/src/arrow/r/R/dataset-partition.R
@@ -0,0 +1,132 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Define Partitioning for a Dataset
+#'
+#' @description
+#' Pass a `Partitioning` object to a [FileSystemDatasetFactory]'s `$create()`
+#' method to indicate how file paths should be interpreted to define
+#' partitioning.
+#'
+#' `DirectoryPartitioning` describes how to interpret raw path segments, in
+#' order. For example, `schema(year = int16(), month = int8())` would define
+#' partitions for file paths like "2019/01/file.parquet",
+#' "2019/02/file.parquet", etc. In this scheme, `NULL` values are skipped. In
+#' the previous example, if the month was `NA` (or `NULL`) when writing a
+#' dataset, the files would be placed in "2019/file.parquet". When reading, the
+#' rows in "2019/file.parquet" would return an `NA` for the month column. An
+#' error will be raised if an outer directory is `NULL` and an inner directory
+#' is not.
+#'
+#' `HivePartitioning` is for Hive-style partitioning, which embeds field
+#' names and values in path segments, such as
+#' "/year=2019/month=2/data.parquet". Because fields are named in the path
+#' segments, order does not matter. This partitioning scheme allows `NULL`
+#' values. They will be replaced by a configurable `null_fallback` which
+#' defaults to the string `"__HIVE_DEFAULT_PARTITION__"` when writing. When
+#' reading, the `null_fallback` string will be replaced with `NA`s as
+#' appropriate.
+#'
+#' `PartitioningFactory` subclasses instruct the `DatasetFactory` to detect
+#' partition features from the file paths.
+#' @section Factory:
+#' Both `DirectoryPartitioning$create()` and `HivePartitioning$create()`
+#' methods take a [Schema] as their first argument; `segment_encoding` (and,
+#' for `HivePartitioning$create()`, `null_fallback`) are optional. The helper
+#' function [`hive_partition(...)`][hive_partition] is shorthand for
+#' `HivePartitioning$create(schema(...))`.
+#'
+#' With `DirectoryPartitioningFactory$create()`, you can provide just the
+#' names of the path segments (in our example, `c("year", "month")`), and
+#' the `DatasetFactory` will infer the data types for those partition variables.
+#' `HivePartitioningFactory$create()` requires no arguments: both variable
+#' names and their types can be inferred from the file paths (`null_fallback`
+#' and `segment_encoding` may still be supplied). `hive_partition()` with no
+#' field arguments returns a `HivePartitioningFactory`.
+#' @name Partitioning
+#' @rdname Partitioning
+#' @export
+Partitioning <- R6Class(
+  classname = "Partitioning",
+  # Common parent of DirectoryPartitioning and HivePartitioning in this file
+  inherit = ArrowObject
+)
+#' @usage NULL
+#' @format NULL
+#' @rdname Partitioning
+#' @export
+DirectoryPartitioning <- R6Class(
+  classname = "DirectoryPartitioning",
+  inherit = Partitioning
+)
+# `$create()` is attached to the generator object itself, so it is invoked as
+# DirectoryPartitioning$create(schm, ...).
+#   schm:             passed as the first argument of
+#                     dataset___DirectoryPartitioning().
+#   segment_encoding: passed on by name; defaults to "uri".
+# Returns the result of dataset___DirectoryPartitioning(), which is defined
+# outside this file.
+DirectoryPartitioning$create <- function(schm, segment_encoding = "uri") {
+  dataset___DirectoryPartitioning(
+    schm,
+    segment_encoding = segment_encoding
+  )
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname Partitioning
+#' @export
+HivePartitioning <- R6Class(
+  classname = "HivePartitioning",
+  inherit = Partitioning
+)
+# `$create()` is attached to the generator object itself, so it is invoked as
+# HivePartitioning$create(schm, ...).
+#   schm:             passed as the first argument of
+#                     dataset___HivePartitioning().
+#   null_fallback:    NULL (the default) is resolved through
+#                     null_fallback_or_default() before being passed on.
+#   segment_encoding: passed on by name; defaults to "uri".
+# Returns the result of dataset___HivePartitioning(), which is defined
+# outside this file.
+HivePartitioning$create <- function(schm, null_fallback = NULL, segment_encoding = "uri") {
+  fallback <- null_fallback_or_default(null_fallback)
+  dataset___HivePartitioning(
+    schm,
+    null_fallback = fallback,
+    segment_encoding = segment_encoding
+  )
+}
+
+#' Construct Hive partitioning
+#'
+#' Hive partitioning embeds field names and values in path segments, such as
+#' "/year=2019/month=2/data.parquet".
+#'
+#' Because fields are named in the path segments, order of fields passed to
+#' `hive_partition()` does not matter.
+#' @param ... named list of [data types][data-type], passed to [schema()]
+#' @param null_fallback character to be used in place of missing values (`NA` or `NULL`)
+#' in partition columns. Default is `"__HIVE_DEFAULT_PARTITION__"`,
+#' which is what Hive uses.
+#' @param segment_encoding Decode partition segments after splitting paths.
+#' Default is `"uri"` (URI-decode segments). May also be `"none"` (leave as-is).
+#' @return A [HivePartitioning][Partitioning], or a `HivePartitioningFactory` if
+#' calling `hive_partition()` with no arguments.
+#' @examplesIf arrow_with_dataset()
+#' hive_partition(year = int16(), month = int8())
+#' @export
+hive_partition <- function(..., null_fallback = NULL, segment_encoding = "uri") {
+  # All field definitions in `...` are collected into one schema first.
+  partition_schema <- schema(...)
+
+  # No fields supplied: return a factory instead of a fixed partitioning.
+  if (length(partition_schema) == 0) {
+    return(HivePartitioningFactory$create(null_fallback, segment_encoding))
+  }
+
+  # Otherwise build the partitioning from the explicit schema.
+  HivePartitioning$create(partition_schema, null_fallback, segment_encoding)
+}
+
+# Parent class of DirectoryPartitioningFactory and HivePartitioningFactory.
+# It declares no members of its own and carries no roxygen block or @export
+# tag in this file.
+PartitioningFactory <- R6Class(
+  classname = "PartitioningFactory",
+  inherit = ArrowObject
+)
+
+#' @usage NULL
+#' @format NULL
+#' @rdname Partitioning
+#' @export
+DirectoryPartitioningFactory <- R6Class(
+  # The classname previously ended in a stray space
+  # ("DirectoryPartitioningFactory "), so the class attribute did not match
+  # the generator name and inherits(x, "DirectoryPartitioningFactory") failed.
+  classname = "DirectoryPartitioningFactory",
+  inherit = PartitioningFactory
+)
+# `$create()` is attached to the generator object itself, so it is invoked as
+# DirectoryPartitioningFactory$create(field_names, ...).
+#   field_names:      passed as the first argument of
+#                     dataset___DirectoryPartitioning__MakeFactory().
+#   segment_encoding: passed on positionally as the second argument;
+#                     defaults to "uri".
+# Returns the result of dataset___DirectoryPartitioning__MakeFactory(), which
+# is defined outside this file.
+DirectoryPartitioningFactory$create <- function(field_names, segment_encoding = "uri") {
+  dataset___DirectoryPartitioning__MakeFactory(field_names, segment_encoding)
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname Partitioning
+#' @export
+HivePartitioningFactory <- R6Class(
+  classname = "HivePartitioningFactory",
+  inherit = PartitioningFactory
+)
+# `$create()` is attached to the generator object itself, so it is invoked as
+# HivePartitioningFactory$create(...). Both arguments have defaults.
+#   null_fallback:    NULL (the default) is resolved through
+#                     null_fallback_or_default() before being passed on.
+#   segment_encoding: passed on positionally as the second argument;
+#                     defaults to "uri".
+# Returns the result of dataset___HivePartitioning__MakeFactory(), which is
+# defined outside this file.
+HivePartitioningFactory$create <- function(null_fallback = NULL, segment_encoding = "uri") {
+  fallback <- null_fallback_or_default(null_fallback)
+  dataset___HivePartitioning__MakeFactory(fallback, segment_encoding)
+}
+
+# Resolve the placeholder used for missing Hive partition values.
+#   null_fallback: caller-supplied placeholder, or NULL to request the default.
+# Returns `null_fallback` combined with the default through `%||%`, an
+# operator that is not defined in this file.
+null_fallback_or_default <- function(null_fallback) {
+  hive_default <- "__HIVE_DEFAULT_PARTITION__"
+  null_fallback %||% hive_default
+}