summaryrefslogtreecommitdiffstats
path: root/src/arrow/r/R/dataset-format.R
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
commite6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/r/R/dataset-format.R
parentInitial commit. (diff)
downloadceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/r/R/dataset-format.R')
-rw-r--r--src/arrow/r/R/dataset-format.R353
1 files changed, 353 insertions, 0 deletions
diff --git a/src/arrow/r/R/dataset-format.R b/src/arrow/r/R/dataset-format.R
new file mode 100644
index 000000000..b0b93219e
--- /dev/null
+++ b/src/arrow/r/R/dataset-format.R
@@ -0,0 +1,353 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Dataset file formats
+#'
+#' @description
+#' A `FileFormat` holds information about how to read and parse the files
+#' included in a `Dataset`. There are subclasses corresponding to the supported
+#' file formats (`ParquetFileFormat` and `IpcFileFormat`).
+#'
+#' @section Factory:
+#' `FileFormat$create()` takes the following arguments:
+#' * `format`: A string identifier of the file format. Currently supported values:
+#' * "parquet"
+#' * "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
+#' only version 2 files are supported
+#' * "csv"/"text", aliases for the same thing (because comma is the default
+#' delimiter for text files
+#' * "tsv", equivalent to passing `format = "text", delimiter = "\t"`
+#' * `...`: Additional format-specific options
+#'
+#' `format = "parquet"``:
+#' * `dict_columns`: Names of columns which should be read as dictionaries.
+#' * Any Parquet options from [FragmentScanOptions].
+#'
+#' `format = "text"`: see [CsvParseOptions]. Note that you can specify them either
+#' with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
+#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.).
+#' Not all `readr` options are currently supported; please file an issue if
+#' you encounter one that `arrow` should support. Also, the following options are
+#' supported. From [CsvReadOptions]:
+#' * `skip_rows`
+#' * `column_names`
+#' * `autogenerate_column_names`
+#' From [CsvFragmentScanOptions] (these values can be overridden at scan time):
+#' * `convert_options`: a [CsvConvertOptions]
+#' * `block_size`
+#'
+#' It returns the appropriate subclass of `FileFormat` (e.g. `ParquetFileFormat`)
+#' @rdname FileFormat
+#' @name FileFormat
+#' @examplesIf arrow_with_dataset() && tolower(Sys.info()[["sysname"]]) != "windows"
+#' ## Semi-colon delimited files
+#' # Set up directory for examples
+#' tf <- tempfile()
+#' dir.create(tf)
+#' on.exit(unlink(tf))
+#' write.table(mtcars, file.path(tf, "file1.txt"), sep = ";", row.names = FALSE)
+#'
+#' # Create FileFormat object
+#' format <- FileFormat$create(format = "text", delimiter = ";")
+#'
+#' open_dataset(tf, format = format)
+#' @export
+FileFormat <- R6Class("FileFormat",
+ inherit = ArrowObject,
+ active = list(
+ # @description
+ # Return the `FileFormat`'s type
+ type = function() dataset___FileFormat__type_name(self)
+ )
+)
+FileFormat$create <- function(format, schema = NULL, ...) {
+ opt_names <- names(list(...))
+ if (format %in% c("csv", "text") || any(opt_names %in% c("delim", "delimiter"))) {
+ CsvFileFormat$create(schema = schema, ...)
+ } else if (format == c("tsv")) {
+ CsvFileFormat$create(delimiter = "\t", schema = schema, ...)
+ } else if (format == "parquet") {
+ ParquetFileFormat$create(...)
+ } else if (format %in% c("ipc", "arrow", "feather")) { # These are aliases for the same thing
+ dataset___IpcFileFormat__Make()
+ } else {
+ stop("Unsupported file format: ", format, call. = FALSE)
+ }
+}
+
+#' @export
+as.character.FileFormat <- function(x, ...) {
+ out <- x$type
+ # Slight hack: special case IPC -> feather, otherwise is just the type_name
+ ifelse(out == "ipc", "feather", out)
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FileFormat
+#' @export
+ParquetFileFormat <- R6Class("ParquetFileFormat", inherit = FileFormat)
+ParquetFileFormat$create <- function(...,
+ dict_columns = character(0)) {
+ options <- ParquetFragmentScanOptions$create(...)
+ dataset___ParquetFileFormat__Make(options, dict_columns)
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FileFormat
+#' @export
+IpcFileFormat <- R6Class("IpcFileFormat", inherit = FileFormat)
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FileFormat
+#' @export
+CsvFileFormat <- R6Class("CsvFileFormat", inherit = FileFormat)
+CsvFileFormat$create <- function(...,
+ opts = csv_file_format_parse_options(...),
+ convert_options = csv_file_format_convert_opts(...),
+ read_options = csv_file_format_read_opts(...)) {
+ dataset___CsvFileFormat__Make(opts, convert_options, read_options)
+}
+
+# Support both readr-style option names and Arrow C++ option names
+csv_file_format_parse_options <- function(...) {
+ opts <- list(...)
+ # Filter out arguments meant for CsvConvertOptions/CsvReadOptions
+ convert_opts <- names(formals(CsvConvertOptions$create))
+ read_opts <- names(formals(CsvReadOptions$create))
+ opts[convert_opts] <- NULL
+ opts[read_opts] <- NULL
+ opts[["schema"]] <- NULL
+ opt_names <- names(opts)
+ # Catch any readr-style options specified with full option names that are
+ # supported by read_delim_arrow() (and its wrappers) but are not yet
+ # supported here
+ unsup_readr_opts <- setdiff(
+ names(formals(read_delim_arrow)),
+ names(formals(readr_to_csv_parse_options))
+ )
+ is_unsup_opt <- opt_names %in% unsup_readr_opts
+ unsup_opts <- opt_names[is_unsup_opt]
+ if (length(unsup_opts)) {
+ stop(
+ "The following ",
+ ngettext(length(unsup_opts), "option is ", "options are "),
+ "supported in \"read_delim_arrow\" functions ",
+ "but not yet supported here: ",
+ oxford_paste(unsup_opts),
+ call. = FALSE
+ )
+ }
+ # Catch any options with full or partial names that do not match any of the
+ # recognized Arrow C++ option names or readr-style option names
+ arrow_opts <- names(formals(CsvParseOptions$create))
+ readr_opts <- names(formals(readr_to_csv_parse_options))
+ is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts))
+ is_readr_opt <- !is.na(pmatch(opt_names, readr_opts))
+ unrec_opts <- opt_names[!is_arrow_opt & !is_readr_opt]
+ if (length(unrec_opts)) {
+ stop(
+ "Unrecognized ",
+ ngettext(length(unrec_opts), "option", "options"),
+ ": ",
+ oxford_paste(unrec_opts),
+ call. = FALSE
+ )
+ }
+ # Catch options with ambiguous partial names (such as "del") that make it
+ # unclear whether the user is specifying Arrow C++ options ("delimiter") or
+ # readr-style options ("delim")
+ is_ambig_opt <- is.na(pmatch(opt_names, c(arrow_opts, readr_opts)))
+ ambig_opts <- opt_names[is_ambig_opt]
+ if (length(ambig_opts)) {
+ stop("Ambiguous ",
+ ngettext(length(ambig_opts), "option", "options"),
+ ": ",
+ oxford_paste(ambig_opts),
+ ". Use full argument names",
+ call. = FALSE
+ )
+ }
+ if (any(is_readr_opt)) {
+ # Catch cases when the user specifies a mix of Arrow C++ options and
+ # readr-style options
+ if (!all(is_readr_opt)) {
+ stop("Use either Arrow parse options or readr parse options, not both",
+ call. = FALSE
+ )
+ }
+ do.call(readr_to_csv_parse_options, opts) # all options have readr-style names
+ } else {
+ do.call(CsvParseOptions$create, opts) # all options have Arrow C++ names
+ }
+}
+
+csv_file_format_convert_opts <- function(...) {
+ opts <- list(...)
+ # Filter out arguments meant for CsvParseOptions/CsvReadOptions
+ arrow_opts <- names(formals(CsvParseOptions$create))
+ readr_opts <- names(formals(readr_to_csv_parse_options))
+ read_opts <- names(formals(CsvReadOptions$create))
+ opts[arrow_opts] <- NULL
+ opts[readr_opts] <- NULL
+ opts[read_opts] <- NULL
+ opts[["schema"]] <- NULL
+ do.call(CsvConvertOptions$create, opts)
+}
+
+csv_file_format_read_opts <- function(schema = NULL, ...) {
+ opts <- list(...)
+ # Filter out arguments meant for CsvParseOptions/CsvConvertOptions
+ arrow_opts <- names(formals(CsvParseOptions$create))
+ readr_opts <- names(formals(readr_to_csv_parse_options))
+ convert_opts <- names(formals(CsvConvertOptions$create))
+ opts[arrow_opts] <- NULL
+ opts[readr_opts] <- NULL
+ opts[convert_opts] <- NULL
+ if (!is.null(schema)) {
+ opts[["column_names"]] <- names(schema)
+ }
+ do.call(CsvReadOptions$create, opts)
+}
+
+#' Format-specific scan options
+#'
+#' @description
+#' A `FragmentScanOptions` holds options specific to a `FileFormat` and a scan
+#' operation.
+#'
+#' @section Factory:
+#' `FragmentScanOptions$create()` takes the following arguments:
+#' * `format`: A string identifier of the file format. Currently supported values:
+#' * "parquet"
+#' * "csv"/"text", aliases for the same format.
+#' * `...`: Additional format-specific options
+#'
+#' `format = "parquet"``:
+#' * `use_buffered_stream`: Read files through buffered input streams rather than
+#' loading entire row groups at once. This may be enabled
+#' to reduce memory overhead. Disabled by default.
+#' * `buffer_size`: Size of buffered stream, if enabled. Default is 8KB.
+#' * `pre_buffer`: Pre-buffer the raw Parquet data. This can improve performance
+#' on high-latency filesystems. Disabled by default.
+#
+#' `format = "text"`: see [CsvConvertOptions]. Note that options can only be
+#' specified with the Arrow C++ library naming. Also, "block_size" from
+#' [CsvReadOptions] may be given.
+#'
+#' It returns the appropriate subclass of `FragmentScanOptions`
+#' (e.g. `CsvFragmentScanOptions`).
+#' @rdname FragmentScanOptions
+#' @name FragmentScanOptions
+#' @export
+FragmentScanOptions <- R6Class("FragmentScanOptions",
+ inherit = ArrowObject,
+ active = list(
+ # @description
+ # Return the `FragmentScanOptions`'s type
+ type = function() dataset___FragmentScanOptions__type_name(self)
+ )
+)
+FragmentScanOptions$create <- function(format, ...) {
+ if (format %in% c("csv", "text", "tsv")) {
+ CsvFragmentScanOptions$create(...)
+ } else if (format == "parquet") {
+ ParquetFragmentScanOptions$create(...)
+ } else {
+ stop("Unsupported file format: ", format, call. = FALSE)
+ }
+}
+
+#' @export
+as.character.FragmentScanOptions <- function(x, ...) {
+ x$type
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FragmentScanOptions
+#' @export
+CsvFragmentScanOptions <- R6Class("CsvFragmentScanOptions", inherit = FragmentScanOptions)
+CsvFragmentScanOptions$create <- function(...,
+ convert_opts = csv_file_format_convert_opts(...),
+ read_opts = csv_file_format_read_opts(...)) {
+ dataset___CsvFragmentScanOptions__Make(convert_opts, read_opts)
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FragmentScanOptions
+#' @export
+ParquetFragmentScanOptions <- R6Class("ParquetFragmentScanOptions", inherit = FragmentScanOptions)
+ParquetFragmentScanOptions$create <- function(use_buffered_stream = FALSE,
+ buffer_size = 8196,
+ pre_buffer = TRUE) {
+ dataset___ParquetFragmentScanOptions__Make(use_buffered_stream, buffer_size, pre_buffer)
+}
+
+#' Format-specific write options
+#'
+#' @description
+#' A `FileWriteOptions` holds write options specific to a `FileFormat`.
+FileWriteOptions <- R6Class("FileWriteOptions",
+ inherit = ArrowObject,
+ public = list(
+ update = function(table, ...) {
+ if (self$type == "parquet") {
+ dataset___ParquetFileWriteOptions__update(
+ self,
+ ParquetWriterProperties$create(table, ...),
+ ParquetArrowWriterProperties$create(...)
+ )
+ } else if (self$type == "ipc") {
+ args <- list(...)
+ if (is.null(args$codec)) {
+ dataset___IpcFileWriteOptions__update1(
+ self,
+ get_ipc_use_legacy_format(args$use_legacy_format),
+ get_ipc_metadata_version(args$metadata_version)
+ )
+ } else {
+ dataset___IpcFileWriteOptions__update2(
+ self,
+ get_ipc_use_legacy_format(args$use_legacy_format),
+ args$codec,
+ get_ipc_metadata_version(args$metadata_version)
+ )
+ }
+ } else if (self$type == "csv") {
+ dataset___CsvFileWriteOptions__update(
+ self,
+ CsvWriteOptions$create(...)
+ )
+ }
+ invisible(self)
+ }
+ ),
+ active = list(
+ type = function() dataset___FileWriteOptions__type_name(self)
+ )
+)
+FileWriteOptions$create <- function(format, ...) {
+ if (!inherits(format, "FileFormat")) {
+ format <- FileFormat$create(format)
+ }
+ options <- dataset___FileFormat__DefaultWriteOptions(format)
+ options$update(...)
+}