summaryrefslogtreecommitdiffstats
path: root/src/arrow/r/R/feather.R
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
commite6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/r/R/feather.R
parentInitial commit. (diff)
downloadceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/r/R/feather.R')
-rw-r--r--src/arrow/r/R/feather.R219
1 files changed, 219 insertions, 0 deletions
diff --git a/src/arrow/r/R/feather.R b/src/arrow/r/R/feather.R
new file mode 100644
index 000000000..70a270bbe
--- /dev/null
+++ b/src/arrow/r/R/feather.R
@@ -0,0 +1,219 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Write data in the Feather format
+#'
+#' Feather provides binary columnar serialization for data frames.
+#' It is designed to make reading and writing data frames efficient,
+#' and to make sharing data across data analysis languages easy.
+#' This function writes both the original, limited specification of the format
+#' and the version 2 specification, which is the Apache Arrow IPC file format.
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param version integer Feather file version. Version 2 is the current.
+#' Version 1 is the more limited legacy format.
+#' @param chunk_size For V2 files, the number of rows that each chunk of data
+#' should have in the file. Use a smaller `chunk_size` when you need faster
+#' random row access. Default is 64K. This option is not supported for V1.
+#' @param compression Name of compression codec to use, if any. Default is
+#' "lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise
+#' "uncompressed". "zstd" is the other available codec and generally has better
+#' compression ratios in exchange for slower read and write performance
+#' See [codec_is_available()]. This option is not supported for V1.
+#' @param compression_level If `compression` is "zstd", you may
+#' specify an integer compression level. If omitted, the compression codec's
+#' default compression level is used.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @seealso [RecordBatchWriter] for lower-level access to writing Arrow IPC data.
+#' @seealso [Schema] for information about schemas and metadata handling.
+#' @examplesIf arrow_available()
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_feather(mtcars, tf)
+#' @include arrow-package.R
+write_feather <- function(x,
+ sink,
+ version = 2,
+ chunk_size = 65536L,
+ compression = c("default", "lz4", "uncompressed", "zstd"),
+ compression_level = NULL) {
+ # Handle and validate options before touching data
+ version <- as.integer(version)
+ assert_that(version %in% 1:2)
+ compression <- match.arg(compression)
+ chunk_size <- as.integer(chunk_size)
+ assert_that(chunk_size > 0)
+ if (compression == "default") {
+ if (version == 2 && codec_is_available("lz4")) {
+ compression <- "lz4"
+ } else {
+ compression <- "uncompressed"
+ }
+ }
+ if (is.null(compression_level)) {
+ # Use -1 as sentinal for "default"
+ compression_level <- -1L
+ }
+ compression_level <- as.integer(compression_level)
+ # Now make sure that options make sense together
+ if (version == 1) {
+ if (chunk_size != 65536L) {
+ stop("Feather version 1 does not support the 'chunk_size' option", call. = FALSE)
+ }
+ if (compression != "uncompressed") {
+ stop("Feather version 1 does not support the 'compression' option", call. = FALSE)
+ }
+ if (compression_level != -1L) {
+ stop("Feather version 1 does not support the 'compression_level' option", call. = FALSE)
+ }
+ }
+ if (compression != "zstd" && compression_level != -1L) {
+ stop("Can only specify a 'compression_level' when 'compression' is 'zstd'", call. = FALSE)
+ }
+ # Finally, add 1 to version because 2 means V1 and 3 means V2 :shrug:
+ version <- version + 1L
+
+ # "lz4" is the convenience
+ if (compression == "lz4") {
+ compression <- "lz4_frame"
+ }
+
+ compression <- compression_from_name(compression)
+
+ x_out <- x
+ if (is.data.frame(x) || inherits(x, "RecordBatch")) {
+ x <- Table$create(x)
+ }
+
+ assert_that(is_writable_table(x))
+
+ if (!inherits(sink, "OutputStream")) {
+ sink <- make_output_stream(sink)
+ on.exit(sink$close())
+ }
+ ipc___WriteFeather__Table(sink, x, version, chunk_size, compression, compression_level)
+ invisible(x_out)
+}
+
+#' Read a Feather file
+#'
+#' Feather provides binary columnar serialization for data frames.
+#' It is designed to make reading and writing data frames efficient,
+#' and to make sharing data across data analysis languages easy.
+#' This function reads both the original, limited specification of the format
+#' and the version 2 specification, which is the Apache Arrow IPC file format.
+#'
+#' @inheritParams read_ipc_stream
+#' @inheritParams read_delim_arrow
+#' @param ... additional parameters, passed to [make_readable_file()].
+#'
+#' @return A `data.frame` if `as_data_frame` is `TRUE` (the default), or an
+#' Arrow [Table] otherwise
+#'
+#' @export
+#' @seealso [FeatherReader] and [RecordBatchReader] for lower-level access to reading Arrow IPC data.
+#' @examplesIf arrow_available()
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_feather(mtcars, tf)
+#' df <- read_feather(tf)
+#' dim(df)
+#' # Can select columns
+#' df <- read_feather(tf, col_select = starts_with("d"))
+read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) {
+ if (!inherits(file, "RandomAccessFile")) {
+ file <- make_readable_file(file, ...)
+ on.exit(file$close())
+ }
+ reader <- FeatherReader$create(file)
+
+ col_select <- enquo(col_select)
+ columns <- if (!quo_is_null(col_select)) {
+ vars_select(names(reader), !!col_select)
+ }
+
+ out <- tryCatch(
+ reader$Read(columns),
+ error = read_compressed_error
+ )
+
+ if (isTRUE(as_data_frame)) {
+ out <- as.data.frame(out)
+ }
+ out
+}
+
+#' @title FeatherReader class
+#' @rdname FeatherReader
+#' @name FeatherReader
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description This class enables you to interact with Feather files. Create
+#' one to connect to a file or other InputStream, and call `Read()` on it to
+#' make an `arrow::Table`. See its usage in [`read_feather()`].
+#'
+#' @section Factory:
+#'
+#' The `FeatherReader$create()` factory method instantiates the object and
+#' takes the following argument:
+#'
+#' - `file` an Arrow file connection object inheriting from `RandomAccessFile`.
+#'
+#' @section Methods:
+#'
+#' - `$Read(columns)`: Returns a `Table` of the selected columns, a vector of
+#' integer indices
+#' - `$column_names`: Active binding, returns the column names in the Feather file
+#' - `$schema`: Active binding, returns the schema of the Feather file
+#' - `$version`: Active binding, returns `1` or `2`, according to the Feather
+#' file version
+#'
+#' @export
+#' @include arrow-package.R
+FeatherReader <- R6Class("FeatherReader",
+ inherit = ArrowObject,
+ public = list(
+ Read = function(columns) {
+ ipc___feather___Reader__Read(self, columns)
+ },
+ print = function(...) {
+ cat("FeatherReader:\n")
+ print(self$schema)
+ invisible(self)
+ }
+ ),
+ active = list(
+ # versions are officially 2 for V1 and 3 for V2 :shrug:
+ version = function() ipc___feather___Reader__version(self) - 1L,
+ column_names = function() names(self$schema),
+ schema = function() ipc___feather___Reader__schema(self)
+ )
+)
+
+#' @export
+names.FeatherReader <- function(x) x$column_names
+
+FeatherReader$create <- function(file) {
+ assert_is(file, "RandomAccessFile")
+ ipc___feather___Reader__Open(file)
+}