diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/r/R/dataset-format.R | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/r/R/dataset-format.R')
-rw-r--r-- | src/arrow/r/R/dataset-format.R | 353 |
1 files changed, 353 insertions, 0 deletions
diff --git a/src/arrow/r/R/dataset-format.R b/src/arrow/r/R/dataset-format.R new file mode 100644 index 000000000..b0b93219e --- /dev/null +++ b/src/arrow/r/R/dataset-format.R @@ -0,0 +1,353 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Dataset file formats +#' +#' @description +#' A `FileFormat` holds information about how to read and parse the files +#' included in a `Dataset`. There are subclasses corresponding to the supported +#' file formats (`ParquetFileFormat` and `IpcFileFormat`). +#' +#' @section Factory: +#' `FileFormat$create()` takes the following arguments: +#' * `format`: A string identifier of the file format. Currently supported values: +#' * "parquet" +#' * "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that +#' only version 2 files are supported +#' * "csv"/"text", aliases for the same thing (because comma is the default +#' delimiter for text files +#' * "tsv", equivalent to passing `format = "text", delimiter = "\t"` +#' * `...`: Additional format-specific options +#' +#' `format = "parquet"``: +#' * `dict_columns`: Names of columns which should be read as dictionaries. +#' * Any Parquet options from [FragmentScanOptions]. +#' +#' `format = "text"`: see [CsvParseOptions]. Note that you can specify them either +#' with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the +#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.). +#' Not all `readr` options are currently supported; please file an issue if +#' you encounter one that `arrow` should support. Also, the following options are +#' supported. From [CsvReadOptions]: +#' * `skip_rows` +#' * `column_names` +#' * `autogenerate_column_names` +#' From [CsvFragmentScanOptions] (these values can be overridden at scan time): +#' * `convert_options`: a [CsvConvertOptions] +#' * `block_size` +#' +#' It returns the appropriate subclass of `FileFormat` (e.g. `ParquetFileFormat`) +#' @rdname FileFormat +#' @name FileFormat +#' @examplesIf arrow_with_dataset() && tolower(Sys.info()[["sysname"]]) != "windows" +#' ## Semi-colon delimited files +#' # Set up directory for examples +#' tf <- tempfile() +#' dir.create(tf) +#' on.exit(unlink(tf)) +#' write.table(mtcars, file.path(tf, "file1.txt"), sep = ";", row.names = FALSE) +#' +#' # Create FileFormat object +#' format <- FileFormat$create(format = "text", delimiter = ";") +#' +#' open_dataset(tf, format = format) +#' @export +FileFormat <- R6Class("FileFormat", + inherit = ArrowObject, + active = list( + # @description + # Return the `FileFormat`'s type + type = function() dataset___FileFormat__type_name(self) + ) +) +FileFormat$create <- function(format, schema = NULL, ...) { + opt_names <- names(list(...)) + if (format %in% c("csv", "text") || any(opt_names %in% c("delim", "delimiter"))) { + CsvFileFormat$create(schema = schema, ...) + } else if (format == c("tsv")) { + CsvFileFormat$create(delimiter = "\t", schema = schema, ...) + } else if (format == "parquet") { + ParquetFileFormat$create(...) + } else if (format %in% c("ipc", "arrow", "feather")) { # These are aliases for the same thing + dataset___IpcFileFormat__Make() + } else { + stop("Unsupported file format: ", format, call. = FALSE) + } +} + +#' @export +as.character.FileFormat <- function(x, ...) { + out <- x$type + # Slight hack: special case IPC -> feather, otherwise is just the type_name + ifelse(out == "ipc", "feather", out) +} + +#' @usage NULL +#' @format NULL +#' @rdname FileFormat +#' @export +ParquetFileFormat <- R6Class("ParquetFileFormat", inherit = FileFormat) +ParquetFileFormat$create <- function(..., + dict_columns = character(0)) { + options <- ParquetFragmentScanOptions$create(...) + dataset___ParquetFileFormat__Make(options, dict_columns) +} + +#' @usage NULL +#' @format NULL +#' @rdname FileFormat +#' @export +IpcFileFormat <- R6Class("IpcFileFormat", inherit = FileFormat) + +#' @usage NULL +#' @format NULL +#' @rdname FileFormat +#' @export +CsvFileFormat <- R6Class("CsvFileFormat", inherit = FileFormat) +CsvFileFormat$create <- function(..., + opts = csv_file_format_parse_options(...), + convert_options = csv_file_format_convert_opts(...), + read_options = csv_file_format_read_opts(...)) { + dataset___CsvFileFormat__Make(opts, convert_options, read_options) +} + +# Support both readr-style option names and Arrow C++ option names +csv_file_format_parse_options <- function(...) { + opts <- list(...) + # Filter out arguments meant for CsvConvertOptions/CsvReadOptions + convert_opts <- names(formals(CsvConvertOptions$create)) + read_opts <- names(formals(CsvReadOptions$create)) + opts[convert_opts] <- NULL + opts[read_opts] <- NULL + opts[["schema"]] <- NULL + opt_names <- names(opts) + # Catch any readr-style options specified with full option names that are + # supported by read_delim_arrow() (and its wrappers) but are not yet + # supported here + unsup_readr_opts <- setdiff( + names(formals(read_delim_arrow)), + names(formals(readr_to_csv_parse_options)) + ) + is_unsup_opt <- opt_names %in% unsup_readr_opts + unsup_opts <- opt_names[is_unsup_opt] + if (length(unsup_opts)) { + stop( + "The following ", + ngettext(length(unsup_opts), "option is ", "options are "), + "supported in \"read_delim_arrow\" functions ", + "but not yet supported here: ", + oxford_paste(unsup_opts), + call. = FALSE + ) + } + # Catch any options with full or partial names that do not match any of the + # recognized Arrow C++ option names or readr-style option names + arrow_opts <- names(formals(CsvParseOptions$create)) + readr_opts <- names(formals(readr_to_csv_parse_options)) + is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts)) + is_readr_opt <- !is.na(pmatch(opt_names, readr_opts)) + unrec_opts <- opt_names[!is_arrow_opt & !is_readr_opt] + if (length(unrec_opts)) { + stop( + "Unrecognized ", + ngettext(length(unrec_opts), "option", "options"), + ": ", + oxford_paste(unrec_opts), + call. = FALSE + ) + } + # Catch options with ambiguous partial names (such as "del") that make it + # unclear whether the user is specifying Arrow C++ options ("delimiter") or + # readr-style options ("delim") + is_ambig_opt <- is.na(pmatch(opt_names, c(arrow_opts, readr_opts))) + ambig_opts <- opt_names[is_ambig_opt] + if (length(ambig_opts)) { + stop("Ambiguous ", + ngettext(length(ambig_opts), "option", "options"), + ": ", + oxford_paste(ambig_opts), + ". Use full argument names", + call. = FALSE + ) + } + if (any(is_readr_opt)) { + # Catch cases when the user specifies a mix of Arrow C++ options and + # readr-style options + if (!all(is_readr_opt)) { + stop("Use either Arrow parse options or readr parse options, not both", + call. = FALSE + ) + } + do.call(readr_to_csv_parse_options, opts) # all options have readr-style names + } else { + do.call(CsvParseOptions$create, opts) # all options have Arrow C++ names + } +} + +csv_file_format_convert_opts <- function(...) { + opts <- list(...) + # Filter out arguments meant for CsvParseOptions/CsvReadOptions + arrow_opts <- names(formals(CsvParseOptions$create)) + readr_opts <- names(formals(readr_to_csv_parse_options)) + read_opts <- names(formals(CsvReadOptions$create)) + opts[arrow_opts] <- NULL + opts[readr_opts] <- NULL + opts[read_opts] <- NULL + opts[["schema"]] <- NULL + do.call(CsvConvertOptions$create, opts) +} + +csv_file_format_read_opts <- function(schema = NULL, ...) { + opts <- list(...) + # Filter out arguments meant for CsvParseOptions/CsvConvertOptions + arrow_opts <- names(formals(CsvParseOptions$create)) + readr_opts <- names(formals(readr_to_csv_parse_options)) + convert_opts <- names(formals(CsvConvertOptions$create)) + opts[arrow_opts] <- NULL + opts[readr_opts] <- NULL + opts[convert_opts] <- NULL + if (!is.null(schema)) { + opts[["column_names"]] <- names(schema) + } + do.call(CsvReadOptions$create, opts) +} + +#' Format-specific scan options +#' +#' @description +#' A `FragmentScanOptions` holds options specific to a `FileFormat` and a scan +#' operation. +#' +#' @section Factory: +#' `FragmentScanOptions$create()` takes the following arguments: +#' * `format`: A string identifier of the file format. Currently supported values: +#' * "parquet" +#' * "csv"/"text", aliases for the same format. +#' * `...`: Additional format-specific options +#' +#' `format = "parquet"``: +#' * `use_buffered_stream`: Read files through buffered input streams rather than +#' loading entire row groups at once. This may be enabled +#' to reduce memory overhead. Disabled by default. +#' * `buffer_size`: Size of buffered stream, if enabled. Default is 8KB. +#' * `pre_buffer`: Pre-buffer the raw Parquet data. This can improve performance +#' on high-latency filesystems. Disabled by default. +# +#' `format = "text"`: see [CsvConvertOptions]. Note that options can only be +#' specified with the Arrow C++ library naming. Also, "block_size" from +#' [CsvReadOptions] may be given. +#' +#' It returns the appropriate subclass of `FragmentScanOptions` +#' (e.g. `CsvFragmentScanOptions`). +#' @rdname FragmentScanOptions +#' @name FragmentScanOptions +#' @export +FragmentScanOptions <- R6Class("FragmentScanOptions", + inherit = ArrowObject, + active = list( + # @description + # Return the `FragmentScanOptions`'s type + type = function() dataset___FragmentScanOptions__type_name(self) + ) +) +FragmentScanOptions$create <- function(format, ...) { + if (format %in% c("csv", "text", "tsv")) { + CsvFragmentScanOptions$create(...) + } else if (format == "parquet") { + ParquetFragmentScanOptions$create(...) + } else { + stop("Unsupported file format: ", format, call. = FALSE) + } +} + +#' @export +as.character.FragmentScanOptions <- function(x, ...) { + x$type +} + +#' @usage NULL +#' @format NULL +#' @rdname FragmentScanOptions +#' @export +CsvFragmentScanOptions <- R6Class("CsvFragmentScanOptions", inherit = FragmentScanOptions) +CsvFragmentScanOptions$create <- function(..., + convert_opts = csv_file_format_convert_opts(...), + read_opts = csv_file_format_read_opts(...)) { + dataset___CsvFragmentScanOptions__Make(convert_opts, read_opts) +} + +#' @usage NULL +#' @format NULL +#' @rdname FragmentScanOptions +#' @export +ParquetFragmentScanOptions <- R6Class("ParquetFragmentScanOptions", inherit = FragmentScanOptions) +ParquetFragmentScanOptions$create <- function(use_buffered_stream = FALSE, + buffer_size = 8196, + pre_buffer = TRUE) { + dataset___ParquetFragmentScanOptions__Make(use_buffered_stream, buffer_size, pre_buffer) +} + +#' Format-specific write options +#' +#' @description +#' A `FileWriteOptions` holds write options specific to a `FileFormat`. +FileWriteOptions <- R6Class("FileWriteOptions", + inherit = ArrowObject, + public = list( + update = function(table, ...) { + if (self$type == "parquet") { + dataset___ParquetFileWriteOptions__update( + self, + ParquetWriterProperties$create(table, ...), + ParquetArrowWriterProperties$create(...) + ) + } else if (self$type == "ipc") { + args <- list(...) + if (is.null(args$codec)) { + dataset___IpcFileWriteOptions__update1( + self, + get_ipc_use_legacy_format(args$use_legacy_format), + get_ipc_metadata_version(args$metadata_version) + ) + } else { + dataset___IpcFileWriteOptions__update2( + self, + get_ipc_use_legacy_format(args$use_legacy_format), + args$codec, + get_ipc_metadata_version(args$metadata_version) + ) + } + } else if (self$type == "csv") { + dataset___CsvFileWriteOptions__update( + self, + CsvWriteOptions$create(...) + ) + } + invisible(self) + } + ), + active = list( + type = function() dataset___FileWriteOptions__type_name(self) + ) +) +FileWriteOptions$create <- function(format, ...) { + if (!inherits(format, "FileFormat")) { + format <- FileFormat$create(format) + } + options <- dataset___FileFormat__DefaultWriteOptions(format) + options$update(...) +} |