# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

#' @include dataset.R
#' @usage NULL
#' @format NULL
#' @rdname Dataset
#' @export
DatasetFactory <- R6Class("DatasetFactory",
  inherit = ArrowObject,
  public = list(
    # Finish the factory. When `schema` is supplied it must be a "Schema" and
    # is forwarded to the Finish2 binding; otherwise the Finish1 binding is
    # called with the `unify_schemas` flag.
    Finish = function(schema = NULL, unify_schemas = FALSE) {
      if (!is.null(schema)) {
        assert_is(schema, "Schema")
        return(dataset___DatasetFactory__Finish2(self, schema))
      }
      dataset___DatasetFactory__Finish1(self, unify_schemas)
    },
    # Forward `unify_schemas` to the Inspect binding and return its result.
    Inspect = function(unify_schemas = FALSE) {
      dataset___DatasetFactory__Inspect(self, unify_schemas)
    }
  )
)

DatasetFactory$create <- function(x,
                                  filesystem = NULL,
                                  format = c(
                                    "parquet", "arrow", "ipc", "feather",
                                    "csv", "tsv", "text"
                                  ),
                                  partitioning = NULL,
                                  ...) {
  # A list of factories is turned into a union factory straight away; none of
  # the remaining arguments are consulted in that case.
  if (is_list_of(x, "DatasetFactory")) {
    return(dataset___UnionDatasetFactory__Make(x))
  }

  # `format` is either a FileFormat already, or a string identifier that is
  # validated against the default choices and handed to FileFormat$create()
  # together with the extra options in `...`.
  if (!is.character(format)) {
    assert_is(format, "FileFormat")
  } else {
    format <- FileFormat$create(match.arg(format), ...)
  }

  resolved <- get_paths_and_filesystem(x, filesystem)
  fs <- resolved$fs
  path <- resolved$path
  info <- fs$GetFileInfo(path)

  # x looks like a vector of one or more file paths (not a directory path)
  points_at_files <- length(info) > 1 || info[[1]]$type == FileType$File
  if (points_at_files) {
    return(FileSystemDatasetFactory$create(fs, NULL, path, format))
  }

  # Normalize the shorthand forms of `partitioning`; anything else (including
  # NULL) is passed through untouched.
  if (inherits(partitioning, "Schema")) {
    partitioning <- DirectoryPartitioning$create(partitioning)
  } else if (is.character(partitioning)) {
    # These are the column/field names, and we should autodetect their types
    partitioning <- DirectoryPartitioningFactory$create(partitioning)
  }

  selector <- FileSelector$create(
    path,
    allow_not_found = FALSE,
    recursive = TRUE
  )
  FileSystemDatasetFactory$create(fs, selector, NULL, format, partitioning)
}

#' Create a DatasetFactory
#'
#' A [Dataset] can be constructed using one or more [DatasetFactory]s.
#' This function helps you construct a `DatasetFactory` that you can pass to
#' [open_dataset()].
#'
#' If you would only have a single `DatasetFactory` (for example, you have a
#' single directory containing Parquet files), you can call `open_dataset()`
#' directly. Use `dataset_factory()` when you
#' want to combine different directories, file systems, or file formats.
#'
#' @param x A string path to a directory containing data files, a vector of
#' one or more string paths to data files, or a list of `DatasetFactory`
#' objects whose datasets should be combined. If a list of `DatasetFactory`
#' objects is given, it will be used to construct a `UnionDatasetFactory` and
#' other arguments will be ignored.
#' @param filesystem A [FileSystem] object; if omitted, the `FileSystem` will
#' be detected from `x`
#' @param format A [FileFormat] object, or a string identifier of the format of
#' the files in `x`.
#' Currently supported values:
#' * "parquet"
#' * "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
#'   only version 2 files are supported
#' * "csv"/"text", aliases for the same thing (because comma is the default
#'   delimiter for text files)
#' * "tsv", equivalent to passing `format = "text", delimiter = "\t"`
#'
#' Default is "parquet", unless a `delimiter` is also specified, in which case
#' it is assumed to be "text".
#' @param partitioning One of
#' * A `Schema`, in which case the file paths relative to `x` will be
#'   parsed, and path segments will be matched with the schema fields. For
#'   example, `schema(year = int16(), month = int8())` would create partitions
#'   for file paths like "2019/01/file.parquet", "2019/02/file.parquet", etc.
#' * A character vector that defines the field names corresponding to those
#'   path segments (that is, you're providing the names that would correspond
#'   to a `Schema` but the types will be autodetected)
#' * A `HivePartitioning` or `HivePartitioningFactory`, as returned
#'   by [hive_partition()] which parses explicit or autodetected fields from
#'   Hive-style path segments
#' * `NULL` for no partitioning
#' @param ... Additional format-specific options, passed to
#' `FileFormat$create()`. For CSV options, note that you can specify them either
#' with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.).
#' Not all `readr` options are currently supported; please file an issue if you
#' encounter one that `arrow` should support.
#' @return A `DatasetFactory` object. Pass this to [open_dataset()],
#' in a list potentially with other `DatasetFactory` objects, to create
#' a `Dataset`.
#' @export
dataset_factory <- DatasetFactory$create

#' @usage NULL
#' @format NULL
#' @rdname Dataset
#' @export
FileSystemDatasetFactory <- R6Class(
  "FileSystemDatasetFactory",
  inherit = DatasetFactory
)

# Build a FileSystemDatasetFactory from either a FileSelector or a character
# vector of paths (exactly one of the two must be given).
#
# Arguments:
#   filesystem   - a "FileSystem" object.
#   selector     - a "FileSelector", or NULL when `paths` is used.
#   paths        - a character vector, or NULL when `selector` is used.
#   format       - a "FileFormat" object.
#   partitioning - NULL, a "PartitioningFactory" or a "Partitioning"; must be
#                  NULL when `paths` is used.
#
# Returns the value produced by the matching
# dataset___FileSystemDatasetFactory__Make* binding; stops with an error when
# `partitioning` is of any other class.
FileSystemDatasetFactory$create <- function(filesystem,
                                            selector = NULL,
                                            paths = NULL,
                                            format,
                                            partitioning = NULL) {
  assert_is(filesystem, "FileSystem")
  if (!is.null(selector)) {
    assert_is(selector, "FileSelector")
  }
  if (!is.null(paths)) {
    assert_is(paths, "character")
  }
  assert_that(
    xor(is.null(selector), is.null(paths)),
    msg = "Either selector or paths must be specified"
  )
  assert_is(format, "FileFormat")

  # Explicit paths: partitioning is rejected and the Make0 binding is used.
  if (!is.null(paths)) {
    assert_that(
      is.null(partitioning),
      msg = "Partitioning not supported with paths"
    )
    return(dataset___FileSystemDatasetFactory__Make0(filesystem, paths, format))
  }

  # Selector-based discovery: pick the binding from the kind of partitioning.
  if (is.null(partitioning)) {
    return(
      dataset___FileSystemDatasetFactory__Make1(filesystem, selector, format)
    )
  }
  if (inherits(partitioning, "PartitioningFactory")) {
    return(
      dataset___FileSystemDatasetFactory__Make3(
        filesystem, selector, format, partitioning
      )
    )
  }
  if (inherits(partitioning, "Partitioning")) {
    return(
      dataset___FileSystemDatasetFactory__Make2(
        filesystem, selector, format, partitioning
      )
    )
  }

  stop(
    "Expected 'partitioning' to be NULL, PartitioningFactory or Partitioning",
    call. = FALSE
  )
}