# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

#' Write data in the Feather format
#'
#' Feather provides binary columnar serialization for data frames.
#' It is designed to make reading and writing data frames efficient,
#' and to make sharing data across data analysis languages easy.
#' This function writes both the original, limited specification of the format
#' and the version 2 specification, which is the Apache Arrow IPC file format.
#'
#' @param x `data.frame`, [RecordBatch], or [Table]
#' @param sink A string file path, URI, or [OutputStream], or path in a file
#' system (`SubTreeFileSystem`)
#' @param version integer Feather file version. Version 2 is the current
#' version. Version 1 is the more limited legacy format.
#' @param chunk_size For V2 files, the number of rows that each chunk of data
#' should have in the file. Use a smaller `chunk_size` when you need faster
#' random row access. Default is 64K. This option is not supported for V1.
#' @param compression Name of compression codec to use, if any. Default is
#' "lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise
#' "uncompressed". "zstd" is the other available codec and generally has better
#' compression ratios in exchange for slower read and write performance.
#' See [codec_is_available()]. This option is not supported for V1.
#' @param compression_level If `compression` is "zstd", you may
#' specify an integer compression level. If omitted, the compression codec's
#' default compression level is used.
#'
#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
#' the stream will be left open.
#' @export
#' @seealso [RecordBatchWriter] for lower-level access to writing Arrow IPC data.
#' @seealso [Schema] for information about schemas and metadata handling.
#' @examplesIf arrow_available()
#' tf <- tempfile()
#' on.exit(unlink(tf))
#' write_feather(mtcars, tf)
#' @include arrow-package.R
write_feather <- function(x, sink, version = 2, chunk_size = 65536L,
                          compression = c("default", "lz4", "uncompressed", "zstd"),
                          compression_level = NULL) {
  # Handle and validate options before touching data
  version <- as.integer(version)
  assert_that(version %in% 1:2)
  compression <- match.arg(compression)
  chunk_size <- as.integer(chunk_size)
  assert_that(chunk_size > 0)
  if (compression == "default") {
    if (version == 2 && codec_is_available("lz4")) {
      compression <- "lz4"
    } else {
      compression <- "uncompressed"
    }
  }
  if (is.null(compression_level)) {
    # Use -1 as sentinel for "default"
    compression_level <- -1L
  }
  compression_level <- as.integer(compression_level)

  # Now make sure that options make sense together
  if (version == 1) {
    if (chunk_size != 65536L) {
      stop("Feather version 1 does not support the 'chunk_size' option", call. = FALSE)
    }
    if (compression != "uncompressed") {
      stop("Feather version 1 does not support the 'compression' option", call. = FALSE)
    }
    if (compression_level != -1L) {
      stop("Feather version 1 does not support the 'compression_level' option", call. = FALSE)
    }
  }
  if (compression != "zstd" && compression_level != -1L) {
    stop("Can only specify a 'compression_level' when 'compression' is 'zstd'", call. = FALSE)
  }

  # Finally, add 1 to version because 2 means V1 and 3 means V2 :shrug:
  version <- version + 1L

  # "lz4" is the convenience name exposed to users; the codec itself is "lz4_frame"
  if (compression == "lz4") {
    compression <- "lz4_frame"
  }
  compression <- compression_from_name(compression)

  x_out <- x
  if (is.data.frame(x) || inherits(x, "RecordBatch")) {
    x <- Table$create(x)
  }
  assert_that(is_writable_table(x))

  if (!inherits(sink, "OutputStream")) {
    sink <- make_output_stream(sink)
    on.exit(sink$close())
  }
  ipc___WriteFeather__Table(sink, x, version, chunk_size, compression, compression_level)
  invisible(x_out)
}
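
# Usage sketch (illustrative, not part of the package code): how the options
# validated above combine in practice. Shown commented out so nothing runs at
# package load; it assumes a working arrow installation and, for the zstd call,
# that zstd is compiled into the Arrow C++ library (codec_is_available("zstd")).
#
#   tf <- tempfile(fileext = ".feather")
#   write_feather(mtcars, tf)                           # V2, lz4 if available
#   write_feather(mtcars, tf, compression = "zstd",
#                 compression_level = 5)                # zstd with an explicit level
#   write_feather(mtcars, tf, version = 1)              # legacy V1, no compression options
#   unlink(tf)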

#' Read a Feather file
#'
#' Feather provides binary columnar serialization for data frames.
#' It is designed to make reading and writing data frames efficient,
#' and to make sharing data across data analysis languages easy.
#' This function reads both the original, limited specification of the format
#' and the version 2 specification, which is the Apache Arrow IPC file format.
#'
#' @inheritParams read_ipc_stream
#' @inheritParams read_delim_arrow
#' @param ... additional parameters, passed to [make_readable_file()].
#'
#' @return A `data.frame` if `as_data_frame` is `TRUE` (the default), or an
#' Arrow [Table] otherwise
#'
#' @export
#' @seealso [FeatherReader] and [RecordBatchReader] for lower-level access to reading Arrow IPC data.
#' @examplesIf arrow_available()
#' tf <- tempfile()
#' on.exit(unlink(tf))
#' write_feather(mtcars, tf)
#' df <- read_feather(tf)
#' dim(df)
#' # Can select columns
#' df <- read_feather(tf, col_select = starts_with("d"))
read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) {
  if (!inherits(file, "RandomAccessFile")) {
    file <- make_readable_file(file, ...)
    on.exit(file$close())
  }
  reader <- FeatherReader$create(file)

  col_select <- enquo(col_select)
  columns <- if (!quo_is_null(col_select)) {
    vars_select(names(reader), !!col_select)
  }

  out <- tryCatch(
    reader$Read(columns),
    error = read_compressed_error
  )

  if (isTRUE(as_data_frame)) {
    out <- as.data.frame(out)
  }
  out
}
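
# Usage sketch (illustrative, not part of the package code): reading a file
# back either as a data.frame or as an Arrow Table. Shown commented out so
# nothing runs at package load; file and object names are placeholders.
#
#   tf <- tempfile(fileext = ".feather")
#   write_feather(mtcars, tf)
#   df  <- read_feather(tf, col_select = c("mpg", "cyl"))   # subset of columns
#   tab <- read_feather(tf, as_data_frame = FALSE)          # Arrow Table instead of data.frame
#   unlink(tf)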

#' @title FeatherReader class
#' @rdname FeatherReader
#' @name FeatherReader
#' @docType class
#' @usage NULL
#' @format NULL
#' @description This class enables you to interact with Feather files. Create
#' one to connect to a file or other InputStream, and call `Read()` on it to
#' make an `arrow::Table`. See its usage in [`read_feather()`].
#'
#' @section Factory:
#'
#' The `FeatherReader$create()` factory method instantiates the object and
#' takes the following argument:
#'
#' - `file` an Arrow file connection object inheriting from `RandomAccessFile`.
#'
#' @section Methods:
#'
#' - `$Read(columns)`: Returns a `Table` of the selected columns, given as a
#'   vector of integer indices
#' - `$column_names`: Active binding, returns the column names in the Feather file
#' - `$schema`: Active binding, returns the schema of the Feather file
#' - `$version`: Active binding, returns `1` or `2`, according to the Feather
#'   file version
#'
#' @export
#' @include arrow-package.R
FeatherReader <- R6Class("FeatherReader",
  inherit = ArrowObject,
  public = list(
    Read = function(columns) {
      ipc___feather___Reader__Read(self, columns)
    },
    print = function(...) {
      cat("FeatherReader:\n")
      print(self$schema)
      invisible(self)
    }
  ),
  active = list(
    # versions are officially 2 for V1 and 3 for V2 :shrug:
    version = function() ipc___feather___Reader__version(self) - 1L,
    column_names = function() names(self$schema),
    schema = function() ipc___feather___Reader__schema(self)
  )
)

#' @export
names.FeatherReader <- function(x) x$column_names

FeatherReader$create <- function(file) {
  assert_is(file, "RandomAccessFile")
  ipc___feather___Reader__Open(file)
}
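
# Usage sketch (illustrative, not part of the package code): using FeatherReader
# directly for lower-level access than read_feather() offers. Shown commented out
# so nothing runs at package load; it reuses make_readable_file(), the internal
# helper read_feather() itself uses to open a RandomAccessFile from a path.
#
#   tf <- tempfile(fileext = ".feather")
#   write_feather(mtcars, tf)
#   f      <- make_readable_file(tf)    # RandomAccessFile, as in read_feather()
#   reader <- FeatherReader$create(f)
#   reader$schema                       # inspect columns and types before reading
#   reader$version                      # 1 or 2
#   tab    <- reader$Read(NULL)         # NULL reads all columns, as read_feather() does
#   f$close()
#   unlink(tf)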