From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/arrow/r/R/dataset-factory.R | 170 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100644 src/arrow/r/R/dataset-factory.R (limited to 'src/arrow/r/R/dataset-factory.R') diff --git a/src/arrow/r/R/dataset-factory.R b/src/arrow/r/R/dataset-factory.R new file mode 100644 index 000000000..c56a6b181 --- /dev/null +++ b/src/arrow/r/R/dataset-factory.R @@ -0,0 +1,170 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
#' @include dataset.R

#' @usage NULL
#' @format NULL
#' @rdname Dataset
#' @export
DatasetFactory <- R6Class("DatasetFactory",
  inherit = ArrowObject,
  public = list(
    # When a `schema` is supplied it must be a "Schema" and is handed to the
    # Finish2 binding; `unify_schemas` is only forwarded when no schema is
    # given (Finish1 binding).
    Finish = function(schema = NULL, unify_schemas = FALSE) {
      if (!is.null(schema)) {
        assert_is(schema, "Schema")
        return(dataset___DatasetFactory__Finish2(self, schema))
      }
      dataset___DatasetFactory__Finish1(self, unify_schemas)
    },
    Inspect = function(unify_schemas = FALSE) {
      dataset___DatasetFactory__Inspect(self, unify_schemas)
    }
  )
)
DatasetFactory$create <- function(x,
                                  filesystem = NULL,
                                  format = c("parquet", "arrow", "ipc", "feather", "csv", "tsv", "text"),
                                  partitioning = NULL,
                                  ...) {
  # A list of factories is combined directly; no other argument is consulted
  if (is_list_of(x, "DatasetFactory")) {
    return(dataset___UnionDatasetFactory__Make(x))
  }

  # `format` is either a ready-made FileFormat or a string identifier that is
  # resolved (together with `...`) through FileFormat$create()
  if (!is.character(format)) {
    assert_is(format, "FileFormat")
  } else {
    format <- FileFormat$create(match.arg(format), ...)
  }

  resolved <- get_paths_and_filesystem(x, filesystem)
  fs <- resolved$fs
  paths <- resolved$path
  file_info <- fs$GetFileInfo(paths)

  # More than one entry, or a single entry of type File, means x is a vector
  # of one or more file paths (not a directory path). `partitioning` is not
  # forwarded on this branch.
  if (length(file_info) > 1 || file_info[[1]]$type == FileType$File) {
    return(FileSystemDatasetFactory$create(fs, NULL, paths, format))
  }

  # NULL satisfies neither test below, so it passes through unchanged, as does
  # any other object that is neither a Schema nor a character vector
  if (inherits(partitioning, "Schema")) {
    partitioning <- DirectoryPartitioning$create(partitioning)
  } else if (is.character(partitioning)) {
    # These are the column/field names, and we should autodetect their types
    partitioning <- DirectoryPartitioningFactory$create(partitioning)
  }

  selector <- FileSelector$create(paths, allow_not_found = FALSE, recursive = TRUE)

  FileSystemDatasetFactory$create(fs, selector, NULL, format, partitioning)
}

#' Create a DatasetFactory
#'
#' A [Dataset] can be constructed using one or more [DatasetFactory]s.
#' This function helps you construct a `DatasetFactory` that you can pass to
#' [open_dataset()].
#'
#' If you would only have a single `DatasetFactory` (for example, you have a
#' single directory containing Parquet files), you can call `open_dataset()`
#' directly. Use `dataset_factory()` when you
#' want to combine different directories, file systems, or file formats.
#'
#' @param x A string path to a directory containing data files, a vector of
#' one or more string paths to data files, or a list of `DatasetFactory` objects
#' whose datasets should be combined. If a list of `DatasetFactory` objects is
#' supplied, it will be used to construct a `UnionDatasetFactory` and the other
#' arguments will be ignored.
#' @param filesystem A [FileSystem] object; if omitted, the `FileSystem` will
#' be detected from `x`
#' @param format A [FileFormat] object, or a string identifier of the format of
#' the files in `x`. Currently supported values:
#' * "parquet"
#' * "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
#' only version 2 files are supported
#' * "csv"/"text", aliases for the same thing (because comma is the default
#' delimiter for text files)
#' * "tsv", equivalent to passing `format = "text", delimiter = "\t"`
#'
#' Default is "parquet", unless a `delimiter` is also specified, in which case
#' it is assumed to be "text".
#' @param partitioning One of
#' * A `Schema`, in which case the file paths relative to `x` will be
#' parsed, and path segments will be matched with the schema fields. For
#' example, `schema(year = int16(), month = int8())` would create partitions
#' for file paths like "2019/01/file.parquet", "2019/02/file.parquet", etc.
#' * A character vector that defines the field names corresponding to those
#' path segments (that is, you're providing the names that would correspond
#' to a `Schema` but the types will be autodetected)
#' * A `HivePartitioning` or `HivePartitioningFactory`, as returned
#' by [hive_partition()] which parses explicit or autodetected fields from
#' Hive-style path segments
#' * `NULL` for no partitioning
#' @param ... Additional format-specific options, passed to
#' `FileFormat$create()`. For CSV options, note that you can specify them either
#' with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.).
#' Not all `readr` options are currently supported; please file an issue if you
#' encounter one that `arrow` should support.
#' @return A `DatasetFactory` object. Pass this to [open_dataset()],
#' in a list potentially with other `DatasetFactory` objects, to create
#' a `Dataset`.
#' @export
dataset_factory <- DatasetFactory$create

#' @usage NULL
#' @format NULL
#' @rdname Dataset
#' @export
FileSystemDatasetFactory <- R6Class("FileSystemDatasetFactory",
  inherit = DatasetFactory
)
FileSystemDatasetFactory$create <- function(filesystem,
                                            selector = NULL,
                                            paths = NULL,
                                            format,
                                            partitioning = NULL) {
  # Argument validation: `selector` and `paths` are each optional, but exactly
  # one of the two has to be supplied.
  assert_is(filesystem, "FileSystem")
  if (!is.null(selector)) {
    assert_is(selector, "FileSelector")
  }
  if (!is.null(paths)) {
    assert_is(paths, "character")
  }
  assert_that(
    xor(is.null(selector), is.null(paths)),
    msg = "Either selector or paths must be specified"
  )
  assert_is(format, "FileFormat")

  # Explicit file paths: partitioning must be absent on this branch
  if (!is.null(paths)) {
    assert_that(is.null(partitioning), msg = "Partitioning not supported with paths")
    return(dataset___FileSystemDatasetFactory__Make0(filesystem, paths, format))
  }

  # From here on a selector is in use; pick the binding that matches the kind
  # of `partitioning` received. PartitioningFactory is tested before
  # Partitioning, so an object inheriting from both goes to Make3.
  if (is.null(partitioning)) {
    return(dataset___FileSystemDatasetFactory__Make1(filesystem, selector, format))
  }
  if (inherits(partitioning, "PartitioningFactory")) {
    return(dataset___FileSystemDatasetFactory__Make3(filesystem, selector, format, partitioning))
  }
  if (inherits(partitioning, "Partitioning")) {
    return(dataset___FileSystemDatasetFactory__Make2(filesystem, selector, format, partitioning))
  }

  stop(
    "Expected 'partitioning' to be NULL, PartitioningFactory or Partitioning",
    call. = FALSE
  )
}