summaryrefslogtreecommitdiffstats
path: root/src/arrow/r/R
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
commite6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/r/R
parentInitial commit. (diff)
downloadceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/r/R')
-rw-r--r--src/arrow/r/R/array-data.R53
-rw-r--r--src/arrow/r/R/array.R329
-rw-r--r--src/arrow/r/R/arrow-datum.R266
-rw-r--r--src/arrow/r/R/arrow-package.R351
-rw-r--r--src/arrow/r/R/arrow-tabular.R272
-rw-r--r--src/arrow/r/R/arrowExports.R1801
-rw-r--r--src/arrow/r/R/buffer.R78
-rw-r--r--src/arrow/r/R/chunked-array.R153
-rw-r--r--src/arrow/r/R/compression.R124
-rw-r--r--src/arrow/r/R/compute.R309
-rw-r--r--src/arrow/r/R/config.R44
-rw-r--r--src/arrow/r/R/csv.R644
-rw-r--r--src/arrow/r/R/dataset-factory.R170
-rw-r--r--src/arrow/r/R/dataset-format.R353
-rw-r--r--src/arrow/r/R/dataset-partition.R132
-rw-r--r--src/arrow/r/R/dataset-scan.R262
-rw-r--r--src/arrow/r/R/dataset-write.R144
-rw-r--r--src/arrow/r/R/dataset.R367
-rw-r--r--src/arrow/r/R/deprecated.R40
-rw-r--r--src/arrow/r/R/dictionary.R69
-rw-r--r--src/arrow/r/R/dplyr-arrange.R98
-rw-r--r--src/arrow/r/R/dplyr-collect.R121
-rw-r--r--src/arrow/r/R/dplyr-count.R60
-rw-r--r--src/arrow/r/R/dplyr-distinct.R46
-rw-r--r--src/arrow/r/R/dplyr-eval.R123
-rw-r--r--src/arrow/r/R/dplyr-filter.R91
-rw-r--r--src/arrow/r/R/dplyr-functions.R1087
-rw-r--r--src/arrow/r/R/dplyr-group-by.R86
-rw-r--r--src/arrow/r/R/dplyr-join.R126
-rw-r--r--src/arrow/r/R/dplyr-mutate.R140
-rw-r--r--src/arrow/r/R/dplyr-select.R125
-rw-r--r--src/arrow/r/R/dplyr-summarize.R289
-rw-r--r--src/arrow/r/R/dplyr.R259
-rw-r--r--src/arrow/r/R/duckdb.R165
-rw-r--r--src/arrow/r/R/enums.R178
-rw-r--r--src/arrow/r/R/expression.R240
-rw-r--r--src/arrow/r/R/feather.R219
-rw-r--r--src/arrow/r/R/field.R84
-rw-r--r--src/arrow/r/R/filesystem.R505
-rw-r--r--src/arrow/r/R/flight.R124
-rw-r--r--src/arrow/r/R/install-arrow.R239
-rw-r--r--src/arrow/r/R/io.R295
-rw-r--r--src/arrow/r/R/ipc_stream.R123
-rw-r--r--src/arrow/r/R/json.R102
-rw-r--r--src/arrow/r/R/memory-pool.R61
-rw-r--r--src/arrow/r/R/message.R97
-rw-r--r--src/arrow/r/R/metadata.R210
-rw-r--r--src/arrow/r/R/parquet.R585
-rw-r--r--src/arrow/r/R/python.R225
-rw-r--r--src/arrow/r/R/query-engine.R298
-rw-r--r--src/arrow/r/R/record-batch-reader.R164
-rw-r--r--src/arrow/r/R/record-batch-writer.R194
-rw-r--r--src/arrow/r/R/record-batch.R193
-rw-r--r--src/arrow/r/R/reexports-bit64.R22
-rw-r--r--src/arrow/r/R/reexports-tidyselect.R46
-rw-r--r--src/arrow/r/R/scalar.R101
-rw-r--r--src/arrow/r/R/schema.R330
-rw-r--r--src/arrow/r/R/table.R170
-rw-r--r--src/arrow/r/R/type.R541
-rw-r--r--src/arrow/r/R/util.R195
60 files changed, 14318 insertions, 0 deletions
diff --git a/src/arrow/r/R/array-data.R b/src/arrow/r/R/array-data.R
new file mode 100644
index 000000000..99c24fdcf
--- /dev/null
+++ b/src/arrow/r/R/array-data.R
@@ -0,0 +1,53 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @title ArrayData class
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @description The `ArrayData` class allows you to get and inspect the data
+#' inside an `arrow::Array`.
+#'
+#' @section Usage:
+#'
+#' ```
+#' data <- Array$create(x)$data()
+#'
+#' data$type
+#' data$length
+#' data$null_count
+#' data$offset
+#' data$buffers
+#' ```
+#'
+#' @section Methods:
+#'
+#' ...
+#'
+#' @rdname ArrayData
+#' @name ArrayData
+#' @include type.R
+ArrayData <- R6Class("ArrayData",
+ inherit = ArrowObject,
+ active = list(
+ type = function() ArrayData__get_type(self),
+ length = function() ArrayData__get_length(self),
+ null_count = function() ArrayData__get_null_count(self),
+ offset = function() ArrayData__get_offset(self),
+ buffers = function() ArrayData__buffers(self)
+ )
+)
diff --git a/src/arrow/r/R/array.R b/src/arrow/r/R/array.R
new file mode 100644
index 000000000..46acc14ff
--- /dev/null
+++ b/src/arrow/r/R/array.R
@@ -0,0 +1,329 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-datum.R
+
+#' @title Arrow Arrays
+#' @description An `Array` is an immutable data array with some logical type
+#' and some length. Most logical types are contained in the base
+#' `Array` class; there are also subclasses for `DictionaryArray`, `ListArray`,
+#' and `StructArray`.
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Factory:
+#' The `Array$create()` factory method instantiates an `Array` and
+#' takes the following arguments:
+#' * `x`: an R vector, list, or `data.frame`
+#' * `type`: an optional [data type][data-type] for `x`. If omitted, the type
+#' will be inferred from the data.
+#'
+#' `Array$create()` will return the appropriate subclass of `Array`, such as
+#' `DictionaryArray` when given an R factor.
+#'
+#' To compose a `DictionaryArray` directly, call `DictionaryArray$create()`,
+#' which takes two arguments:
+#' * `x`: an R vector or `Array` of integers for the dictionary indices
+#' * `dict`: an R vector or `Array` of dictionary values (like R factor levels
+#' but not limited to strings only)
+#' @section Usage:
+#'
+#' ```
+#' a <- Array$create(x)
+#' length(a)
+#'
+#' print(a)
+#' a == a
+#' ```
+#'
+#' @section Methods:
+#'
+#' - `$IsNull(i)`: Return true if value at index is null. Does not boundscheck
+#' - `$IsValid(i)`: Return true if value at index is valid. Does not boundscheck
+#' - `$length()`: Size in the number of elements this array contains
+#' - `$offset`: A relative position into another array's data, to enable zero-copy slicing
+#' - `$null_count`: The number of null entries in the array
+#' - `$type`: logical type of data
+#' - `$type_id()`: type id
+#' - `$Equals(other)` : is this array equal to `other`
+#' - `$ApproxEquals(other)` :
+#' - `$Diff(other)` : return a string expressing the difference between two arrays
+#' - `$data()`: return the underlying [ArrayData][ArrayData]
+#' - `$as_vector()`: convert to an R vector
+#' - `$ToString()`: string representation of the array
+#' - `$Slice(offset, length = NULL)`: Construct a zero-copy slice of the array
+#' with the indicated offset and length. If length is `NULL`, the slice goes
+#' until the end of the array.
+#' - `$Take(i)`: return an `Array` with values at positions given by integers
+#' (R vector or Array Array) `i`.
+#' - `$Filter(i, keep_na = TRUE)`: return an `Array` with values at positions where logical
+#' vector (or Arrow boolean Array) `i` is `TRUE`.
+#' - `$SortIndices(descending = FALSE)`: return an `Array` of integer positions that can be
+#' used to rearrange the `Array` in ascending or descending order
+#' - `$RangeEquals(other, start_idx, end_idx, other_start_idx)` :
+#' - `$cast(target_type, safe = TRUE, options = cast_options(safe))`: Alter the
+#' data in the array to change its type.
+#' - `$View(type)`: Construct a zero-copy view of this array with the given type.
+#' - `$Validate()` : Perform any validation checks to determine obvious inconsistencies
+#' within the array's internal data. This can be an expensive check, potentially `O(length)`
+#'
+#' @rdname array
+#' @name array
+#' @examplesIf arrow_available()
+#' my_array <- Array$create(1:10)
+#' my_array$type
+#' my_array$cast(int8())
+#'
+#' # Check if value is null; zero-indexed
+#' na_array <- Array$create(c(1:5, NA))
+#' na_array$IsNull(0)
+#' na_array$IsNull(5)
+#' na_array$IsValid(5)
+#' na_array$null_count
+#'
+#' # zero-copy slicing; the offset of the new Array will be the same as the index passed to $Slice
+#' new_array <- na_array$Slice(5)
+#' new_array$offset
+#'
+#' # Compare 2 arrays
+#' na_array2 <- na_array
+#' na_array2 == na_array # element-wise comparison
+#' na_array2$Equals(na_array) # overall comparison
+#' @export
+Array <- R6Class("Array",
+ inherit = ArrowDatum,
+ public = list(
+ IsNull = function(i) Array__IsNull(self, i),
+ IsValid = function(i) Array__IsValid(self, i),
+ length = function() Array__length(self),
+ type_id = function() Array__type_id(self),
+ Equals = function(other, ...) {
+ inherits(other, "Array") && Array__Equals(self, other)
+ },
+ ApproxEquals = function(other) {
+ inherits(other, "Array") && Array__ApproxEquals(self, other)
+ },
+ Diff = function(other) {
+ if (!inherits(other, "Array")) {
+ other <- Array$create(other)
+ }
+ Array__Diff(self, other)
+ },
+ data = function() Array__data(self),
+ as_vector = function() Array__as_vector(self),
+ ToString = function() {
+ typ <- paste0("<", self$type$ToString(), ">")
+ paste(typ, Array__ToString(self), sep = "\n")
+ },
+ Slice = function(offset, length = NULL) {
+ if (is.null(length)) {
+ Array__Slice1(self, offset)
+ } else {
+ Array__Slice2(self, offset, length)
+ }
+ },
+ Take = function(i) {
+ if (is.numeric(i)) {
+ i <- as.integer(i)
+ }
+ if (is.integer(i)) {
+ i <- Array$create(i)
+ }
+ call_function("take", self, i)
+ },
+ Filter = function(i, keep_na = TRUE) {
+ if (is.logical(i)) {
+ i <- Array$create(i)
+ }
+ assert_is(i, "Array")
+ call_function("filter", self, i, options = list(keep_na = keep_na))
+ },
+ SortIndices = function(descending = FALSE) {
+ assert_that(is.logical(descending))
+ assert_that(length(descending) == 1L)
+ assert_that(!is.na(descending))
+ call_function("array_sort_indices", self, options = list(order = descending))
+ },
+ RangeEquals = function(other, start_idx, end_idx, other_start_idx = 0L) {
+ assert_is(other, "Array")
+ Array__RangeEquals(self, other, start_idx, end_idx, other_start_idx)
+ },
+ View = function(type) {
+ Array$create(Array__View(self, as_type(type)))
+ },
+ Validate = function() Array__Validate(self),
+ export_to_c = function(array_ptr, schema_ptr) ExportArray(self, array_ptr, schema_ptr)
+ ),
+ active = list(
+ null_count = function() Array__null_count(self),
+ offset = function() Array__offset(self),
+ type = function() Array__type(self)
+ )
+)
+Array$create <- function(x, type = NULL) {
+ if (!is.null(type)) {
+ type <- as_type(type)
+ }
+ if (inherits(x, "Scalar")) {
+ out <- x$as_array()
+ if (!is.null(type)) {
+ out <- out$cast(type)
+ }
+ return(out)
+ }
+ vec_to_arrow(x, type)
+}
+#' @include arrowExports.R
+Array$import_from_c <- ImportArray
+
+#' @rdname array
+#' @usage NULL
+#' @format NULL
+#' @export
+DictionaryArray <- R6Class("DictionaryArray",
+ inherit = Array,
+ public = list(
+ indices = function() DictionaryArray__indices(self),
+ dictionary = function() DictionaryArray__dictionary(self)
+ ),
+ active = list(
+ ordered = function() self$type$ordered
+ )
+)
+DictionaryArray$create <- function(x, dict = NULL) {
+ if (is.factor(x)) {
+ # The simple case: converting a factor.
+ # Ignoring `dict`; should probably error if dict is not NULL
+ return(Array$create(x))
+ }
+
+ assert_that(!is.null(dict))
+ if (!is.Array(x)) {
+ x <- Array$create(x)
+ }
+ if (!is.Array(dict)) {
+ dict <- Array$create(dict)
+ }
+ type <- DictionaryType$create(x$type, dict$type)
+ DictionaryArray__FromArrays(type, x, dict)
+}
+
+#' @rdname array
+#' @usage NULL
+#' @format NULL
+#' @export
+StructArray <- R6Class("StructArray",
+ inherit = Array,
+ public = list(
+ field = function(i) StructArray__field(self, i),
+ GetFieldByName = function(name) StructArray__GetFieldByName(self, name),
+ Flatten = function() StructArray__Flatten(self)
+ )
+)
+
+
+#' @export
+`[[.StructArray` <- function(x, i, ...) {
+ if (is.character(i)) {
+ x$GetFieldByName(i)
+ } else if (is.numeric(i)) {
+ x$field(i - 1)
+ } else {
+ stop("'i' must be character or numeric, not ", class(i), call. = FALSE)
+ }
+}
+
+#' @export
+`$.StructArray` <- function(x, name, ...) {
+ assert_that(is.string(name))
+ if (name %in% ls(x)) {
+ get(name, x)
+ } else {
+ x$GetFieldByName(name)
+ }
+}
+
+#' @export
+names.StructArray <- function(x, ...) StructType__field_names(x$type)
+
+#' @export
+dim.StructArray <- function(x, ...) c(length(x), x$type$num_fields)
+
+#' @export
+as.data.frame.StructArray <- function(x, row.names = NULL, optional = FALSE, ...) {
+ as.vector(x)
+}
+
+#' @rdname array
+#' @usage NULL
+#' @format NULL
+#' @export
+ListArray <- R6Class("ListArray",
+ inherit = Array,
+ public = list(
+ values = function() ListArray__values(self),
+ value_length = function(i) ListArray__value_length(self, i),
+ value_offset = function(i) ListArray__value_offset(self, i),
+ raw_value_offsets = function() ListArray__raw_value_offsets(self)
+ ),
+ active = list(
+ value_type = function() ListArray__value_type(self)
+ )
+)
+
+#' @rdname array
+#' @usage NULL
+#' @format NULL
+#' @export
+LargeListArray <- R6Class("LargeListArray",
+ inherit = Array,
+ public = list(
+ values = function() LargeListArray__values(self),
+ value_length = function(i) LargeListArray__value_length(self, i),
+ value_offset = function(i) LargeListArray__value_offset(self, i),
+ raw_value_offsets = function() LargeListArray__raw_value_offsets(self)
+ ),
+ active = list(
+ value_type = function() LargeListArray__value_type(self)
+ )
+)
+
+#' @rdname array
+#' @usage NULL
+#' @format NULL
+#' @export
+FixedSizeListArray <- R6Class("FixedSizeListArray",
+ inherit = Array,
+ public = list(
+ values = function() FixedSizeListArray__values(self),
+ value_length = function(i) FixedSizeListArray__value_length(self, i),
+ value_offset = function(i) FixedSizeListArray__value_offset(self, i)
+ ),
+ active = list(
+ value_type = function() FixedSizeListArray__value_type(self),
+ list_size = function() self$type$list_size
+ )
+)
+
+is.Array <- function(x, type = NULL) { # nolint
+ is_it <- inherits(x, c("Array", "ChunkedArray"))
+ if (is_it && !is.null(type)) {
+ is_it <- x$type$ToString() %in% type
+ }
+ is_it
+}
diff --git a/src/arrow/r/R/arrow-datum.R b/src/arrow/r/R/arrow-datum.R
new file mode 100644
index 000000000..557321f68
--- /dev/null
+++ b/src/arrow/r/R/arrow-datum.R
@@ -0,0 +1,266 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-package.R
+
+# Base class for Array, ChunkedArray, and Scalar, for S3 method dispatch only.
+# Does not exist in C++ class hierarchy
+ArrowDatum <- R6Class("ArrowDatum",
+ inherit = ArrowObject,
+ public = list(
+ cast = function(target_type, safe = TRUE, ...) {
+ opts <- cast_options(safe, ...)
+ opts$to_type <- as_type(target_type)
+ call_function("cast", self, options = opts)
+ }
+ )
+)
+
+#' @export
+length.ArrowDatum <- function(x) x$length()
+
+#' @export
+is.finite.ArrowDatum <- function(x) {
+ is_fin <- call_function("is_finite", x)
+ # for compatibility with base::is.finite(), return FALSE for NA_real_
+ is_fin & !is.na(is_fin)
+}
+
+#' @export
+is.infinite.ArrowDatum <- function(x) {
+ is_inf <- call_function("is_inf", x)
+ # for compatibility with base::is.infinite(), return FALSE for NA_real_
+ is_inf & !is.na(is_inf)
+}
+
+#' @export
+is.na.ArrowDatum <- function(x) {
+ call_function("is_null", x, options = list(nan_is_null = TRUE))
+}
+
+#' @export
+is.nan.ArrowDatum <- function(x) {
+ if (x$type_id() %in% TYPES_WITH_NAN) {
+ # TODO: if an option is added to the is_nan kernel to treat NA as NaN,
+ # use that to simplify the code here (ARROW-13366)
+ call_function("is_nan", x) & call_function("is_valid", x)
+ } else {
+ Scalar$create(FALSE)$as_array(length(x))
+ }
+}
+
+#' @export
+as.vector.ArrowDatum <- function(x, mode) {
+ x$as_vector()
+}
+
+#' @export
+Ops.ArrowDatum <- function(e1, e2) {
+ if (.Generic == "!") {
+ eval_array_expression(.Generic, e1)
+ } else if (.Generic %in% names(.array_function_map)) {
+ eval_array_expression(.Generic, e1, e2)
+ } else {
+ stop(paste0("Unsupported operation on `", class(e1)[1L], "` : "), .Generic, call. = FALSE)
+ }
+}
+
+# Wrapper around call_function that:
+# (1) maps R function names to Arrow C++ compute ("/" --> "divide_checked")
+# (2) wraps R input args as Array or Scalar
+eval_array_expression <- function(FUN,
+ ...,
+ args = list(...),
+ options = empty_named_list()) {
+ if (FUN == "-" && length(args) == 1L) {
+ if (inherits(args[[1]], "ArrowObject")) {
+ return(eval_array_expression("negate_checked", args[[1]]))
+ } else {
+ return(-args[[1]])
+ }
+ }
+ args <- lapply(args, .wrap_arrow, FUN)
+
+ # In Arrow, "divide" is one function, which does integer division on
+ # integer inputs and floating-point division on floats
+ if (FUN == "/") {
+ # TODO: omg so many ways it's wrong to assume these types
+ args <- map(args, ~ .$cast(float64()))
+ } else if (FUN == "%/%") {
+ # In R, integer division works like floor(float division)
+ out <- eval_array_expression("/", args = args, options = options)
+ return(out$cast(int32(), allow_float_truncate = TRUE))
+ } else if (FUN == "%%") {
+ # We can't simply do {e1 - e2 * ( e1 %/% e2 )} since Ops.Array evaluates
+ # eagerly, but we can build that up
+ quotient <- eval_array_expression("%/%", args = args)
+ base <- eval_array_expression("*", quotient, args[[2]])
+ # this cast is to ensure that the result of this and e1 are the same
+ # (autocasting only applies to scalars)
+ base <- base$cast(args[[1]]$type)
+ return(eval_array_expression("-", args[[1]], base))
+ }
+
+ call_function(
+ .array_function_map[[FUN]] %||% FUN,
+ args = args,
+ options = options
+ )
+}
+
+.wrap_arrow <- function(arg, fun) {
+ if (!inherits(arg, "ArrowObject")) {
+ # TODO: Array$create if lengths are equal?
+ if (fun == "%in%") {
+ arg <- Array$create(arg)
+ } else {
+ arg <- Scalar$create(arg)
+ }
+ }
+ arg
+}
+
+#' @export
+na.omit.ArrowDatum <- function(object, ...) {
+ object$Filter(!is.na(object))
+}
+
+#' @export
+na.exclude.ArrowDatum <- na.omit.ArrowDatum
+
+#' @export
+na.fail.ArrowDatum <- function(object, ...) {
+ if (object$null_count > 0) {
+ stop("missing values in object", call. = FALSE)
+ }
+ object
+}
+
+filter_rows <- function(x, i, keep_na = TRUE, ...) {
+ # General purpose function for [ row subsetting with R semantics
+ # Based on the input for `i`, calls x$Filter, x$Slice, or x$Take
+ nrows <- x$num_rows %||% x$length() # Depends on whether Array or Table-like
+ if (is.logical(i)) {
+ if (isTRUE(i)) {
+ # Shortcut without doing any work
+ x
+ } else {
+ i <- rep_len(i, nrows) # For R recycling behavior; consider vctrs::vec_recycle()
+ x$Filter(i, keep_na)
+ }
+ } else if (is.numeric(i)) {
+ if (all(i < 0)) {
+ # in R, negative i means "everything but i"
+ i <- setdiff(seq_len(nrows), -1 * i)
+ }
+ if (is.sliceable(i)) {
+ x$Slice(i[1] - 1, length(i))
+ } else if (all(i > 0)) {
+ x$Take(i - 1)
+ } else {
+ stop("Cannot mix positive and negative indices", call. = FALSE)
+ }
+ } else if (is.Array(i, INTEGER_TYPES)) {
+ # NOTE: this doesn't do the - 1 offset
+ x$Take(i)
+ } else if (is.Array(i, "bool")) {
+ x$Filter(i, keep_na)
+ } else {
+ # Unsupported cases
+ if (is.Array(i)) {
+ stop("Cannot extract rows with an Array of type ", i$type$ToString(), call. = FALSE)
+ }
+ stop("Cannot extract rows with an object of class ", class(i), call. = FALSE)
+ }
+}
+
+#' @export
+`[.ArrowDatum` <- filter_rows
+
+#' @importFrom utils head
+#' @export
+head.ArrowDatum <- function(x, n = 6L, ...) {
+ assert_is(n, c("numeric", "integer"))
+ assert_that(length(n) == 1)
+ len <- NROW(x)
+ if (n < 0) {
+ # head(x, negative) means all but the last n rows
+ n <- max(len + n, 0)
+ } else {
+ n <- min(len, n)
+ }
+ if (n == len) {
+ return(x)
+ }
+ x$Slice(0, n)
+}
+
+#' @importFrom utils tail
+#' @export
+tail.ArrowDatum <- function(x, n = 6L, ...) {
+ assert_is(n, c("numeric", "integer"))
+ assert_that(length(n) == 1)
+ len <- NROW(x)
+ if (n < 0) {
+ # tail(x, negative) means all but the first n rows
+ n <- min(-n, len)
+ } else {
+ n <- max(len - n, 0)
+ }
+ if (n == 0) {
+ return(x)
+ }
+ x$Slice(n)
+}
+
+is.sliceable <- function(i) {
+ # Determine whether `i` can be expressed as a $Slice() command
+ is.numeric(i) &&
+ length(i) > 0 &&
+ all(i > 0) &&
+ i[1] <= i[length(i)] &&
+ identical(as.integer(i), i[1]:i[length(i)])
+}
+
+#' @export
+as.double.ArrowDatum <- function(x, ...) as.double(as.vector(x), ...)
+
+#' @export
+as.integer.ArrowDatum <- function(x, ...) as.integer(as.vector(x), ...)
+
+#' @export
+as.character.ArrowDatum <- function(x, ...) as.character(as.vector(x), ...)
+
+#' @export
+sort.ArrowDatum <- function(x, decreasing = FALSE, na.last = NA, ...) {
+ # Arrow always sorts nulls at the end of the array. This corresponds to
+ # sort(na.last = TRUE). For the other two cases (na.last = NA and
+ # na.last = FALSE) we need to use workarounds.
+ # TODO: Implement this more cleanly after ARROW-12063
+ if (is.na(na.last)) {
+ # Filter out NAs before sorting
+ x <- x$Filter(!is.na(x))
+ x$Take(x$SortIndices(descending = decreasing))
+ } else if (na.last) {
+ x$Take(x$SortIndices(descending = decreasing))
+ } else {
+ # Create a new array that encodes missing values as 1 and non-missing values
+ # as 0. Sort descending by that array first to get the NAs at the beginning
+ tbl <- Table$create(x = x, `is_na` = as.integer(is.na(x)))
+ tbl$x$Take(tbl$SortIndices(names = c("is_na", "x"), descending = c(TRUE, decreasing)))
+ }
+}
diff --git a/src/arrow/r/R/arrow-package.R b/src/arrow/r/R/arrow-package.R
new file mode 100644
index 000000000..edc2652b6
--- /dev/null
+++ b/src/arrow/r/R/arrow-package.R
@@ -0,0 +1,351 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @importFrom stats quantile median na.omit na.exclude na.pass na.fail
+#' @importFrom R6 R6Class
+#' @importFrom purrr as_mapper map map2 map_chr map2_chr map_dfr map_int map_lgl keep imap imap_chr
+#' @importFrom assertthat assert_that is.string
+#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos
+#' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec
+#' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive
+#' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure
+#' @importFrom tidyselect vars_pull vars_rename vars_select eval_select
+#' @useDynLib arrow, .registration = TRUE
+#' @keywords internal
+"_PACKAGE"
+
+#' @importFrom vctrs s3_register vec_size vec_cast vec_unique
+.onLoad <- function(...) {
+ dplyr_methods <- paste0(
+ "dplyr::",
+ c(
+ "select", "filter", "collect", "summarise", "group_by", "groups",
+ "group_vars", "group_by_drop_default", "ungroup", "mutate", "transmute",
+ "arrange", "rename", "pull", "relocate", "compute", "collapse",
+ "distinct", "left_join", "right_join", "inner_join", "full_join",
+ "semi_join", "anti_join", "count", "tally"
+ )
+ )
+ for (cl in c("Dataset", "ArrowTabular", "arrow_dplyr_query")) {
+ for (m in dplyr_methods) {
+ s3_register(m, cl)
+ }
+ }
+ s3_register("dplyr::tbl_vars", "arrow_dplyr_query")
+
+ for (cl in c(
+ "Array", "RecordBatch", "ChunkedArray", "Table", "Schema",
+ "Field", "DataType", "RecordBatchReader"
+ )) {
+ s3_register("reticulate::py_to_r", paste0("pyarrow.lib.", cl))
+ s3_register("reticulate::r_to_py", cl)
+ }
+
+ # Create these once, at package build time
+ if (arrow_available()) {
+ # Also include all available Arrow Compute functions,
+ # namespaced as arrow_fun.
+ # We can't do this at install time because list_compute_functions() may error
+ all_arrow_funs <- list_compute_functions()
+ arrow_funcs <- set_names(
+ lapply(all_arrow_funs, function(fun) {
+ force(fun)
+ function(...) build_expr(fun, ...)
+ }),
+ paste0("arrow_", all_arrow_funs)
+ )
+ .cache$functions <- c(nse_funcs, arrow_funcs)
+ }
+
+ if (tolower(Sys.info()[["sysname"]]) == "windows") {
+ # Disable multithreading on Windows
+ # See https://issues.apache.org/jira/browse/ARROW-8379
+ options(arrow.use_threads = FALSE)
+ }
+
+ invisible()
+}
+
+.onAttach <- function(libname, pkgname) {
+ if (!arrow_available()) {
+ msg <- paste(
+ "The Arrow C++ library is not available. To retry installation with debug output, run:",
+ " install_arrow(verbose = TRUE)",
+ "See https://arrow.apache.org/docs/r/articles/install.html for more guidance and troubleshooting.",
+ sep = "\n"
+ )
+ packageStartupMessage(msg)
+ } else {
+ # Just to be extra safe, let's wrap this in a try();
+ # we don't a failed startup message to prevent the package from loading
+ try({
+ features <- arrow_info()$capabilities
+ # That has all of the #ifdef features, plus the compression libs and the
+ # string libraries (but not the memory allocators, they're added elsewhere)
+ #
+ # Let's print a message if some are off
+ if (some_features_are_off(features)) {
+ packageStartupMessage("See arrow_info() for available features")
+ }
+ })
+ }
+}
+
+#' Is the C++ Arrow library available?
+#'
+#' You won't generally need to call these function, but they're made available
+#' for diagnostic purposes.
+#' @return `TRUE` or `FALSE` depending on whether the package was installed
+#' with:
+#' * The Arrow C++ library (check with `arrow_available()`)
+#' * Arrow Dataset support enabled (check with `arrow_with_dataset()`)
+#' * Parquet support enabled (check with `arrow_with_parquet()`)
+#' * JSON support enabled (check with `arrow_with_json()`)
+#' * Amazon S3 support enabled (check with `arrow_with_s3()`)
+#' @export
+#' @examples
+#' arrow_available()
+#' arrow_with_dataset()
+#' arrow_with_parquet()
+#' arrow_with_json()
+#' arrow_with_s3()
+#' @seealso If any of these are `FALSE`, see
+#' `vignette("install", package = "arrow")` for guidance on reinstalling the
+#' package.
+arrow_available <- function() {
+ tryCatch(.Call(`_arrow_available`), error = function(e) {
+ return(FALSE)
+ })
+}
+
+#' @rdname arrow_available
+#' @export
+arrow_with_dataset <- function() {
+ is_32bit <- .Machine$sizeof.pointer < 8
+ is_old_r <- getRversion() < "4.0.0"
+ is_windows <- tolower(Sys.info()[["sysname"]]) == "windows"
+ if (is_32bit && is_old_r && is_windows) {
+ # 32-bit rtools 3.5 does not properly implement the std::thread expectations
+ # but we can't just disable ARROW_DATASET in that build,
+ # so report it as "off" here.
+ return(FALSE)
+ }
+ tryCatch(.Call(`_dataset_available`), error = function(e) {
+ return(FALSE)
+ })
+}
+
+#' @rdname arrow_available
+#' @export
+arrow_with_parquet <- function() {
+ tryCatch(.Call(`_parquet_available`), error = function(e) {
+ return(FALSE)
+ })
+}
+
+#' @rdname arrow_available
+#' @export
+arrow_with_s3 <- function() {
+ tryCatch(.Call(`_s3_available`), error = function(e) {
+ return(FALSE)
+ })
+}
+
+#' @rdname arrow_available
+#' @export
+arrow_with_json <- function() {
+ tryCatch(.Call(`_json_available`), error = function(e) {
+ return(FALSE)
+ })
+}
+
+option_use_threads <- function() {
+ !is_false(getOption("arrow.use_threads"))
+}
+
+#' Report information on the package's capabilities
+#'
+#' This function summarizes a number of build-time configurations and run-time
+#' settings for the Arrow package. It may be useful for diagnostics.
+#' @return A list including version information, boolean "capabilities", and
+#' statistics from Arrow's memory allocator, and also Arrow's run-time
+#' information.
+#' @export
+#' @importFrom utils packageVersion
+arrow_info <- function() {
+ opts <- options()
+ out <- list(
+ version = packageVersion("arrow"),
+ libarrow = arrow_available(),
+ options = opts[grep("^arrow\\.", names(opts))]
+ )
+ if (out$libarrow) {
+ pool <- default_memory_pool()
+ runtimeinfo <- runtime_info()
+ buildinfo <- build_info()
+ compute_funcs <- list_compute_functions()
+ out <- c(out, list(
+ capabilities = c(
+ dataset = arrow_with_dataset(),
+ parquet = arrow_with_parquet(),
+ json = arrow_with_json(),
+ s3 = arrow_with_s3(),
+ utf8proc = "utf8_upper" %in% compute_funcs,
+ re2 = "replace_substring_regex" %in% compute_funcs,
+ vapply(tolower(names(CompressionType)[-1]), codec_is_available, logical(1))
+ ),
+ memory_pool = list(
+ backend_name = pool$backend_name,
+ bytes_allocated = pool$bytes_allocated,
+ max_memory = pool$max_memory,
+ available_backends = supported_memory_backends()
+ ),
+ runtime_info = list(
+ simd_level = runtimeinfo[1],
+ detected_simd_level = runtimeinfo[2]
+ ),
+ build_info = list(
+ cpp_version = buildinfo[1],
+ cpp_compiler = buildinfo[2],
+ cpp_compiler_version = buildinfo[3],
+ cpp_compiler_flags = buildinfo[4],
+ # git_id is "" if not built from a git checkout
+ # convert that to NULL
+ git_id = if (nzchar(buildinfo[5])) buildinfo[5]
+ )
+ ))
+ }
+ structure(out, class = "arrow_info")
+}
+
+some_features_are_off <- function(features) {
+ # `features` is a named logical vector (as in arrow_info()$capabilities)
+ # Let's exclude some less relevant ones
+ blocklist <- c("lzo", "bz2", "brotli")
+ # Return TRUE if any of the other features are FALSE
+ !all(features[setdiff(names(features), blocklist)])
+}
+
+#' @export
+print.arrow_info <- function(x, ...) {
+ print_key_values <- function(title, vals, ...) {
+ # Make a key-value table for printing, no column names
+ df <- data.frame(vals, stringsAsFactors = FALSE, ...)
+ names(df) <- ""
+
+ cat(title, ":\n", sep = "")
+ print(df)
+ cat("\n")
+ }
+ cat("Arrow package version: ", format(x$version), "\n\n", sep = "")
+ if (x$libarrow) {
+ print_key_values("Capabilities", c(
+ x$capabilities,
+ jemalloc = "jemalloc" %in% x$memory_pool$available_backends,
+ mimalloc = "mimalloc" %in% x$memory_pool$available_backends
+ ))
+ if (some_features_are_off(x$capabilities) && identical(tolower(Sys.info()[["sysname"]]), "linux")) {
+ # Only on linux because (e.g.) we disable certain features on purpose on rtools35 and solaris
+ cat(
+ "To reinstall with more optional capabilities enabled, see\n",
+ " https://arrow.apache.org/docs/r/articles/install.html\n\n"
+ )
+ }
+
+ if (length(x$options)) {
+ print_key_values("Arrow options()", map_chr(x$options, format))
+ }
+
+ format_bytes <- function(b, units = "auto", digits = 2L, ...) {
+ format(structure(b, class = "object_size"), units = units, digits = digits, ...)
+ }
+ print_key_values("Memory", c(
+ Allocator = x$memory_pool$backend_name,
+ # utils:::format.object_size is not properly vectorized
+ Current = format_bytes(x$memory_pool$bytes_allocated, ...),
+ Max = format_bytes(x$memory_pool$max_memory, ...)
+ ))
+ print_key_values("Runtime", c(
+ `SIMD Level` = x$runtime_info$simd_level,
+ `Detected SIMD Level` = x$runtime_info$detected_simd_level
+ ))
+ print_key_values("Build", c(
+ `C++ Library Version` = x$build_info$cpp_version,
+ `C++ Compiler` = x$build_info$cpp_compiler,
+ `C++ Compiler Version` = x$build_info$cpp_compiler_version,
+ `Git ID` = x$build_info$git_id
+ ))
+ } else {
+ cat(
+ "Arrow C++ library not available. See https://arrow.apache.org/docs/r/articles/install.html ",
+ "for troubleshooting.\n"
+ )
+ }
+ invisible(x)
+}
+
+option_compress_metadata <- function() {
+ !is_false(getOption("arrow.compress_metadata"))
+}
+
+#' @include enums.R
+ArrowObject <- R6Class("ArrowObject",
+ public = list(
+ initialize = function(xp) self$set_pointer(xp),
+ pointer = function() get(".:xp:.", envir = self),
+ `.:xp:.` = NULL,
+ set_pointer = function(xp) {
+ if (!inherits(xp, "externalptr")) {
+ stop(
+ class(self)[1], "$new() requires a pointer as input: ",
+ "did you mean $create() instead?",
+ call. = FALSE
+ )
+ }
+ assign(".:xp:.", xp, envir = self)
+ },
+ print = function(...) {
+ if (!is.null(self$.class_title)) {
+ # Allow subclasses to override just printing the class name first
+ class_title <- self$.class_title()
+ } else {
+ class_title <- class(self)[[1]]
+ }
+ cat(class_title, "\n", sep = "")
+ if (!is.null(self$ToString)) {
+ cat(self$ToString(), "\n", sep = "")
+ }
+ invisible(self)
+ },
+ invalidate = function() {
+ assign(".:xp:.", NULL, envir = self)
+ }
+ )
+)
+
+#' @export
+`!=.ArrowObject` <- function(lhs, rhs) !(lhs == rhs) # nolint
+
+#' @export
+`==.ArrowObject` <- function(x, y) { # nolint
+ x$Equals(y)
+}
+
+#' @export
+all.equal.ArrowObject <- function(target, current, ..., check.attributes = TRUE) {
+ target$Equals(current, check_metadata = check.attributes)
+}
diff --git a/src/arrow/r/R/arrow-tabular.R b/src/arrow/r/R/arrow-tabular.R
new file mode 100644
index 000000000..43110ccf2
--- /dev/null
+++ b/src/arrow/r/R/arrow-tabular.R
@@ -0,0 +1,272 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-datum.R
+
+# Base class for RecordBatch and Table for S3 method dispatch only.
+# Does not exist in C++ class hierarchy
+ArrowTabular <- R6Class("ArrowTabular",
+ inherit = ArrowObject,
+ public = list(
+ ToString = function() {
+ sch <- unlist(strsplit(self$schema$ToString(), "\n"))
+ sch <- sub("(.*): (.*)", "$\\1 <\\2>", sch)
+ dims <- sprintf("%s rows x %s columns", self$num_rows, self$num_columns)
+ paste(c(dims, sch), collapse = "\n")
+ },
+ Take = function(i) {
+ if (is.numeric(i)) {
+ i <- as.integer(i)
+ }
+ if (is.integer(i)) {
+ i <- Array$create(i)
+ }
+ assert_that(is.Array(i))
+ call_function("take", self, i)
+ },
+ Filter = function(i, keep_na = TRUE) {
+ if (is.logical(i)) {
+ i <- Array$create(i)
+ }
+ assert_that(is.Array(i, "bool"))
+ call_function("filter", self, i, options = list(keep_na = keep_na))
+ },
+ SortIndices = function(names, descending = FALSE) {
+ assert_that(is.character(names))
+ assert_that(length(names) > 0)
+ assert_that(!any(is.na(names)))
+ if (length(descending) == 1L) {
+ descending <- rep_len(descending, length(names))
+ }
+ assert_that(is.logical(descending))
+ assert_that(identical(length(names), length(descending)))
+ assert_that(!any(is.na(descending)))
+ call_function(
+ "sort_indices",
+ self,
+ # cpp11 does not support logical vectors so convert to integer
+ options = list(names = names, orders = as.integer(descending))
+ )
+ }
+ ),
+ active = list(
+ metadata = function(new) {
+ if (missing(new)) {
+ # Get the metadata (from the schema)
+ self$schema$metadata
+ } else {
+ # Set the metadata
+ new <- prepare_key_value_metadata(new)
+ out <- self$ReplaceSchemaMetadata(new)
+ # ReplaceSchemaMetadata returns a new object but we're modifying in place,
+ # so swap in that new C++ object pointer into our R6 object
+ self$set_pointer(out$pointer())
+ self
+ }
+ },
+ r_metadata = function(new) {
+ # Helper for the R metadata that handles the serialization
+ # See also method on Schema
+ if (missing(new)) {
+ out <- self$metadata$r
+ if (!is.null(out)) {
+ # Can't unserialize NULL
+ out <- .unserialize_arrow_r_metadata(out)
+ }
+ # Returns either NULL or a named list
+ out
+ } else {
+ # Set the R metadata
+ self$metadata$r <- .serialize_arrow_r_metadata(new)
+ self
+ }
+ }
+ )
+)
+
+#' @export
+as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) {
+ df <- x$to_data_frame()
+
+ if (!is.null(r_metadata <- x$metadata$r)) {
+ df <- apply_arrow_r_metadata(df, .unserialize_arrow_r_metadata(r_metadata))
+ }
+ df
+}
+
+#' @export
+`names<-.ArrowTabular` <- function(x, value) x$RenameColumns(value)
+
+#' @importFrom methods as
+#' @export
+`[.ArrowTabular` <- function(x, i, j, ..., drop = FALSE) {
+ if (nargs() == 2L) {
+ # List-like column extraction (x[i])
+ return(x[, i])
+ }
+ if (!missing(j)) {
+ # Selecting columns is cheaper than filtering rows, so do it first.
+ # That way, if we're filtering too, we have fewer arrays to filter/slice/take
+ if (is.character(j)) {
+ j_new <- match(j, names(x))
+ if (any(is.na(j_new))) {
+ stop("Column not found: ", oxford_paste(j[is.na(j_new)]), call. = FALSE)
+ }
+ j <- j_new
+ }
+ if (is_integerish(j)) {
+ if (any(is.na(j))) {
+ stop("Column indices cannot be NA", call. = FALSE)
+ }
+ if (length(j) && all(j < 0)) {
+ # in R, negative j means "everything but j"
+ j <- setdiff(seq_len(x$num_columns), -1 * j)
+ }
+ x <- x$SelectColumns(as.integer(j) - 1L)
+ }
+
+ if (drop && ncol(x) == 1L) {
+ x <- x$column(0)
+ }
+ }
+ if (!missing(i)) {
+ x <- filter_rows(x, i, ...)
+ }
+ x
+}
+
+#' @export
+`[[.ArrowTabular` <- function(x, i, ...) {
+ if (is.character(i)) {
+ x$GetColumnByName(i)
+ } else if (is.numeric(i)) {
+ x$column(i - 1)
+ } else {
+ stop("'i' must be character or numeric, not ", class(i), call. = FALSE)
+ }
+}
+
+#' @export
+`$.ArrowTabular` <- function(x, name, ...) {
+ assert_that(is.string(name))
+ if (name %in% ls(x)) {
+ get(name, x)
+ } else {
+ x$GetColumnByName(name)
+ }
+}
+
+#' @export
+`[[<-.ArrowTabular` <- function(x, i, value) {
+ if (!is.character(i) & !is.numeric(i)) {
+ stop("'i' must be character or numeric, not ", class(i), call. = FALSE)
+ }
+ assert_that(length(i) == 1, !is.na(i))
+
+ if (is.null(value)) {
+ if (is.character(i)) {
+ i <- match(i, names(x))
+ }
+ x <- x$RemoveColumn(i - 1L)
+ } else {
+ if (!is.character(i)) {
+ # get or create a/the column name
+ if (i <= x$num_columns) {
+ i <- names(x)[i]
+ } else {
+ i <- as.character(i)
+ }
+ }
+
+ # auto-magic recycling on non-ArrowObjects
+ if (!inherits(value, "ArrowObject")) {
+ value <- vctrs::vec_recycle(value, x$num_rows)
+ }
+
+ # construct the field
+ if (inherits(x, "RecordBatch") && !inherits(value, "Array")) {
+ value <- Array$create(value)
+ } else if (inherits(x, "Table") && !inherits(value, "ChunkedArray")) {
+ value <- ChunkedArray$create(value)
+ }
+ new_field <- field(i, value$type)
+
+ if (i %in% names(x)) {
+ i <- match(i, names(x)) - 1L
+ x <- x$SetColumn(i, new_field, value)
+ } else {
+ i <- x$num_columns
+ x <- x$AddColumn(i, new_field, value)
+ }
+ }
+ x
+}
+
+#' @export
+`$<-.ArrowTabular` <- function(x, i, value) {
+ assert_that(is.string(i))
+ # We need to check if `i` is in names in case it is an active binding (e.g.
+ # `metadata`, in which case we use assign to change the active binding instead
+ # of the column in the table)
+ if (i %in% ls(x)) {
+ assign(i, value, x)
+ } else {
+ x[[i]] <- value
+ }
+ x
+}
+
+#' @export
+dim.ArrowTabular <- function(x) c(x$num_rows, x$num_columns)
+
+#' @export
+length.ArrowTabular <- function(x) x$num_columns
+
+#' @export
+as.list.ArrowTabular <- function(x, ...) as.list(as.data.frame(x, ...))
+
+#' @export
+row.names.ArrowTabular <- function(x) as.character(seq_len(nrow(x)))
+
+#' @export
+dimnames.ArrowTabular <- function(x) list(row.names(x), names(x))
+
+#' @export
+head.ArrowTabular <- head.ArrowDatum
+
+#' @export
+tail.ArrowTabular <- tail.ArrowDatum
+
+#' @export
+na.fail.ArrowTabular <- function(object, ...) {
+ for (col in seq_len(object$num_columns)) {
+ if (object$column(col - 1L)$null_count > 0) {
+ stop("missing values in object", call. = FALSE)
+ }
+ }
+ object
+}
+
+#' @export
+na.omit.ArrowTabular <- function(object, ...) {
+ not_na <- map(object$columns, ~ call_function("is_valid", .x))
+ not_na_agg <- Reduce("&", not_na)
+ object$Filter(not_na_agg)
+}
+
+#' @export
+na.exclude.ArrowTabular <- na.omit.ArrowTabular
diff --git a/src/arrow/r/R/arrowExports.R b/src/arrow/r/R/arrowExports.R
new file mode 100644
index 000000000..014b1641f
--- /dev/null
+++ b/src/arrow/r/R/arrowExports.R
@@ -0,0 +1,1801 @@
+# Generated by using data-raw/codegen.R -> do not edit by hand
+
+test_SET_STRING_ELT <- function(s) {
+ invisible(.Call(`_arrow_test_SET_STRING_ELT`, s))
+}
+
+test_same_Array <- function(x, y) {
+ .Call(`_arrow_test_same_Array`, x, y)
+}
+
+is_arrow_altrep <- function(x) {
+ .Call(`_arrow_is_arrow_altrep`, x)
+}
+
+Array__Slice1 <- function(array, offset) {
+ .Call(`_arrow_Array__Slice1`, array, offset)
+}
+
+Array__Slice2 <- function(array, offset, length) {
+ .Call(`_arrow_Array__Slice2`, array, offset, length)
+}
+
+Array__IsNull <- function(x, i) {
+ .Call(`_arrow_Array__IsNull`, x, i)
+}
+
+Array__IsValid <- function(x, i) {
+ .Call(`_arrow_Array__IsValid`, x, i)
+}
+
+Array__length <- function(x) {
+ .Call(`_arrow_Array__length`, x)
+}
+
+Array__offset <- function(x) {
+ .Call(`_arrow_Array__offset`, x)
+}
+
+Array__null_count <- function(x) {
+ .Call(`_arrow_Array__null_count`, x)
+}
+
+Array__type <- function(x) {
+ .Call(`_arrow_Array__type`, x)
+}
+
+Array__ToString <- function(x) {
+ .Call(`_arrow_Array__ToString`, x)
+}
+
+Array__type_id <- function(x) {
+ .Call(`_arrow_Array__type_id`, x)
+}
+
+Array__Equals <- function(lhs, rhs) {
+ .Call(`_arrow_Array__Equals`, lhs, rhs)
+}
+
+Array__ApproxEquals <- function(lhs, rhs) {
+ .Call(`_arrow_Array__ApproxEquals`, lhs, rhs)
+}
+
+Array__Diff <- function(lhs, rhs) {
+ .Call(`_arrow_Array__Diff`, lhs, rhs)
+}
+
+Array__data <- function(array) {
+ .Call(`_arrow_Array__data`, array)
+}
+
+Array__RangeEquals <- function(self, other, start_idx, end_idx, other_start_idx) {
+ .Call(`_arrow_Array__RangeEquals`, self, other, start_idx, end_idx, other_start_idx)
+}
+
+Array__View <- function(array, type) {
+ .Call(`_arrow_Array__View`, array, type)
+}
+
+Array__Validate <- function(array) {
+ invisible(.Call(`_arrow_Array__Validate`, array))
+}
+
+DictionaryArray__indices <- function(array) {
+ .Call(`_arrow_DictionaryArray__indices`, array)
+}
+
+DictionaryArray__dictionary <- function(array) {
+ .Call(`_arrow_DictionaryArray__dictionary`, array)
+}
+
+StructArray__field <- function(array, i) {
+ .Call(`_arrow_StructArray__field`, array, i)
+}
+
+StructArray__GetFieldByName <- function(array, name) {
+ .Call(`_arrow_StructArray__GetFieldByName`, array, name)
+}
+
+StructArray__Flatten <- function(array) {
+ .Call(`_arrow_StructArray__Flatten`, array)
+}
+
+ListArray__value_type <- function(array) {
+ .Call(`_arrow_ListArray__value_type`, array)
+}
+
+LargeListArray__value_type <- function(array) {
+ .Call(`_arrow_LargeListArray__value_type`, array)
+}
+
+ListArray__values <- function(array) {
+ .Call(`_arrow_ListArray__values`, array)
+}
+
+LargeListArray__values <- function(array) {
+ .Call(`_arrow_LargeListArray__values`, array)
+}
+
+ListArray__value_length <- function(array, i) {
+ .Call(`_arrow_ListArray__value_length`, array, i)
+}
+
+LargeListArray__value_length <- function(array, i) {
+ .Call(`_arrow_LargeListArray__value_length`, array, i)
+}
+
+FixedSizeListArray__value_length <- function(array, i) {
+ .Call(`_arrow_FixedSizeListArray__value_length`, array, i)
+}
+
+ListArray__value_offset <- function(array, i) {
+ .Call(`_arrow_ListArray__value_offset`, array, i)
+}
+
+LargeListArray__value_offset <- function(array, i) {
+ .Call(`_arrow_LargeListArray__value_offset`, array, i)
+}
+
+FixedSizeListArray__value_offset <- function(array, i) {
+ .Call(`_arrow_FixedSizeListArray__value_offset`, array, i)
+}
+
+ListArray__raw_value_offsets <- function(array) {
+ .Call(`_arrow_ListArray__raw_value_offsets`, array)
+}
+
+LargeListArray__raw_value_offsets <- function(array) {
+ .Call(`_arrow_LargeListArray__raw_value_offsets`, array)
+}
+
+Array__as_vector <- function(array) {
+ .Call(`_arrow_Array__as_vector`, array)
+}
+
+ChunkedArray__as_vector <- function(chunked_array, use_threads) {
+ .Call(`_arrow_ChunkedArray__as_vector`, chunked_array, use_threads)
+}
+
+RecordBatch__to_dataframe <- function(batch, use_threads) {
+ .Call(`_arrow_RecordBatch__to_dataframe`, batch, use_threads)
+}
+
+Table__to_dataframe <- function(table, use_threads) {
+ .Call(`_arrow_Table__to_dataframe`, table, use_threads)
+}
+
+ArrayData__get_type <- function(x) {
+ .Call(`_arrow_ArrayData__get_type`, x)
+}
+
+ArrayData__get_length <- function(x) {
+ .Call(`_arrow_ArrayData__get_length`, x)
+}
+
+ArrayData__get_null_count <- function(x) {
+ .Call(`_arrow_ArrayData__get_null_count`, x)
+}
+
+ArrayData__get_offset <- function(x) {
+ .Call(`_arrow_ArrayData__get_offset`, x)
+}
+
+ArrayData__buffers <- function(x) {
+ .Call(`_arrow_ArrayData__buffers`, x)
+}
+
+Buffer__is_mutable <- function(buffer) {
+ .Call(`_arrow_Buffer__is_mutable`, buffer)
+}
+
+Buffer__ZeroPadding <- function(buffer) {
+ invisible(.Call(`_arrow_Buffer__ZeroPadding`, buffer))
+}
+
+Buffer__capacity <- function(buffer) {
+ .Call(`_arrow_Buffer__capacity`, buffer)
+}
+
+Buffer__size <- function(buffer) {
+ .Call(`_arrow_Buffer__size`, buffer)
+}
+
+r___RBuffer__initialize <- function(x) {
+ .Call(`_arrow_r___RBuffer__initialize`, x)
+}
+
+Buffer__data <- function(buffer) {
+ .Call(`_arrow_Buffer__data`, buffer)
+}
+
+Buffer__Equals <- function(x, y) {
+ .Call(`_arrow_Buffer__Equals`, x, y)
+}
+
+ChunkedArray__length <- function(chunked_array) {
+ .Call(`_arrow_ChunkedArray__length`, chunked_array)
+}
+
+ChunkedArray__null_count <- function(chunked_array) {
+ .Call(`_arrow_ChunkedArray__null_count`, chunked_array)
+}
+
+ChunkedArray__num_chunks <- function(chunked_array) {
+ .Call(`_arrow_ChunkedArray__num_chunks`, chunked_array)
+}
+
+ChunkedArray__chunk <- function(chunked_array, i) {
+ .Call(`_arrow_ChunkedArray__chunk`, chunked_array, i)
+}
+
+ChunkedArray__chunks <- function(chunked_array) {
+ .Call(`_arrow_ChunkedArray__chunks`, chunked_array)
+}
+
+ChunkedArray__type <- function(chunked_array) {
+ .Call(`_arrow_ChunkedArray__type`, chunked_array)
+}
+
+ChunkedArray__Slice1 <- function(chunked_array, offset) {
+ .Call(`_arrow_ChunkedArray__Slice1`, chunked_array, offset)
+}
+
+ChunkedArray__Slice2 <- function(chunked_array, offset, length) {
+ .Call(`_arrow_ChunkedArray__Slice2`, chunked_array, offset, length)
+}
+
+ChunkedArray__View <- function(array, type) {
+ .Call(`_arrow_ChunkedArray__View`, array, type)
+}
+
+ChunkedArray__Validate <- function(chunked_array) {
+ invisible(.Call(`_arrow_ChunkedArray__Validate`, chunked_array))
+}
+
+ChunkedArray__Equals <- function(x, y) {
+ .Call(`_arrow_ChunkedArray__Equals`, x, y)
+}
+
+ChunkedArray__ToString <- function(x) {
+ .Call(`_arrow_ChunkedArray__ToString`, x)
+}
+
+ChunkedArray__from_list <- function(chunks, s_type) {
+ .Call(`_arrow_ChunkedArray__from_list`, chunks, s_type)
+}
+
+util___Codec__Create <- function(codec, compression_level) {
+ .Call(`_arrow_util___Codec__Create`, codec, compression_level)
+}
+
+util___Codec__name <- function(codec) {
+ .Call(`_arrow_util___Codec__name`, codec)
+}
+
+util___Codec__IsAvailable <- function(codec) {
+ .Call(`_arrow_util___Codec__IsAvailable`, codec)
+}
+
+io___CompressedOutputStream__Make <- function(codec, raw) {
+ .Call(`_arrow_io___CompressedOutputStream__Make`, codec, raw)
+}
+
+io___CompressedInputStream__Make <- function(codec, raw) {
+ .Call(`_arrow_io___CompressedInputStream__Make`, codec, raw)
+}
+
+ExecPlan_create <- function(use_threads) {
+ .Call(`_arrow_ExecPlan_create`, use_threads)
+}
+
+ExecPlan_run <- function(plan, final_node, sort_options, head) {
+ .Call(`_arrow_ExecPlan_run`, plan, final_node, sort_options, head)
+}
+
+ExecPlan_StopProducing <- function(plan) {
+ invisible(.Call(`_arrow_ExecPlan_StopProducing`, plan))
+}
+
+ExecNode_output_schema <- function(node) {
+ .Call(`_arrow_ExecNode_output_schema`, node)
+}
+
+ExecNode_Scan <- function(plan, dataset, filter, materialized_field_names) {
+ .Call(`_arrow_ExecNode_Scan`, plan, dataset, filter, materialized_field_names)
+}
+
+ExecNode_Filter <- function(input, filter) {
+ .Call(`_arrow_ExecNode_Filter`, input, filter)
+}
+
+ExecNode_Project <- function(input, exprs, names) {
+ .Call(`_arrow_ExecNode_Project`, input, exprs, names)
+}
+
+ExecNode_Aggregate <- function(input, options, target_names, out_field_names, key_names) {
+ .Call(`_arrow_ExecNode_Aggregate`, input, options, target_names, out_field_names, key_names)
+}
+
+ExecNode_Join <- function(input, type, right_data, left_keys, right_keys, left_output, right_output) {
+ .Call(`_arrow_ExecNode_Join`, input, type, right_data, left_keys, right_keys, left_output, right_output)
+}
+
+ExecNode_ReadFromRecordBatchReader <- function(plan, reader) {
+ .Call(`_arrow_ExecNode_ReadFromRecordBatchReader`, plan, reader)
+}
+
+RecordBatch__cast <- function(batch, schema, options) {
+ .Call(`_arrow_RecordBatch__cast`, batch, schema, options)
+}
+
+Table__cast <- function(table, schema, options) {
+ .Call(`_arrow_Table__cast`, table, schema, options)
+}
+
+compute__CallFunction <- function(func_name, args, options) {
+ .Call(`_arrow_compute__CallFunction`, func_name, args, options)
+}
+
+compute__GetFunctionNames <- function() {
+ .Call(`_arrow_compute__GetFunctionNames`)
+}
+
+build_info <- function() {
+ .Call(`_arrow_build_info`)
+}
+
+runtime_info <- function() {
+ .Call(`_arrow_runtime_info`)
+}
+
+csv___WriteOptions__initialize <- function(options) {
+ .Call(`_arrow_csv___WriteOptions__initialize`, options)
+}
+
+csv___ReadOptions__initialize <- function(options) {
+ .Call(`_arrow_csv___ReadOptions__initialize`, options)
+}
+
+csv___ParseOptions__initialize <- function(options) {
+ .Call(`_arrow_csv___ParseOptions__initialize`, options)
+}
+
+csv___ReadOptions__column_names <- function(options) {
+ .Call(`_arrow_csv___ReadOptions__column_names`, options)
+}
+
+csv___ConvertOptions__initialize <- function(options) {
+ .Call(`_arrow_csv___ConvertOptions__initialize`, options)
+}
+
+csv___TableReader__Make <- function(input, read_options, parse_options, convert_options) {
+ .Call(`_arrow_csv___TableReader__Make`, input, read_options, parse_options, convert_options)
+}
+
+csv___TableReader__Read <- function(table_reader) {
+ .Call(`_arrow_csv___TableReader__Read`, table_reader)
+}
+
+TimestampParser__kind <- function(parser) {
+ .Call(`_arrow_TimestampParser__kind`, parser)
+}
+
+TimestampParser__format <- function(parser) {
+ .Call(`_arrow_TimestampParser__format`, parser)
+}
+
+TimestampParser__MakeStrptime <- function(format) {
+ .Call(`_arrow_TimestampParser__MakeStrptime`, format)
+}
+
+TimestampParser__MakeISO8601 <- function() {
+ .Call(`_arrow_TimestampParser__MakeISO8601`)
+}
+
+csv___WriteCSV__Table <- function(table, write_options, stream) {
+ invisible(.Call(`_arrow_csv___WriteCSV__Table`, table, write_options, stream))
+}
+
+csv___WriteCSV__RecordBatch <- function(record_batch, write_options, stream) {
+ invisible(.Call(`_arrow_csv___WriteCSV__RecordBatch`, record_batch, write_options, stream))
+}
+
+dataset___Dataset__NewScan <- function(ds) {
+ .Call(`_arrow_dataset___Dataset__NewScan`, ds)
+}
+
+dataset___Dataset__schema <- function(dataset) {
+ .Call(`_arrow_dataset___Dataset__schema`, dataset)
+}
+
+dataset___Dataset__type_name <- function(dataset) {
+ .Call(`_arrow_dataset___Dataset__type_name`, dataset)
+}
+
+dataset___Dataset__ReplaceSchema <- function(dataset, schm) {
+ .Call(`_arrow_dataset___Dataset__ReplaceSchema`, dataset, schm)
+}
+
+dataset___UnionDataset__create <- function(datasets, schm) {
+ .Call(`_arrow_dataset___UnionDataset__create`, datasets, schm)
+}
+
+dataset___InMemoryDataset__create <- function(table) {
+ .Call(`_arrow_dataset___InMemoryDataset__create`, table)
+}
+
+dataset___UnionDataset__children <- function(ds) {
+ .Call(`_arrow_dataset___UnionDataset__children`, ds)
+}
+
+dataset___FileSystemDataset__format <- function(dataset) {
+ .Call(`_arrow_dataset___FileSystemDataset__format`, dataset)
+}
+
+dataset___FileSystemDataset__filesystem <- function(dataset) {
+ .Call(`_arrow_dataset___FileSystemDataset__filesystem`, dataset)
+}
+
+dataset___FileSystemDataset__files <- function(dataset) {
+ .Call(`_arrow_dataset___FileSystemDataset__files`, dataset)
+}
+
+dataset___DatasetFactory__Finish1 <- function(factory, unify_schemas) {
+ .Call(`_arrow_dataset___DatasetFactory__Finish1`, factory, unify_schemas)
+}
+
+dataset___DatasetFactory__Finish2 <- function(factory, schema) {
+ .Call(`_arrow_dataset___DatasetFactory__Finish2`, factory, schema)
+}
+
+dataset___DatasetFactory__Inspect <- function(factory, unify_schemas) {
+ .Call(`_arrow_dataset___DatasetFactory__Inspect`, factory, unify_schemas)
+}
+
+dataset___UnionDatasetFactory__Make <- function(children) {
+ .Call(`_arrow_dataset___UnionDatasetFactory__Make`, children)
+}
+
+dataset___FileSystemDatasetFactory__Make0 <- function(fs, paths, format) {
+ .Call(`_arrow_dataset___FileSystemDatasetFactory__Make0`, fs, paths, format)
+}
+
+dataset___FileSystemDatasetFactory__Make2 <- function(fs, selector, format, partitioning) {
+ .Call(`_arrow_dataset___FileSystemDatasetFactory__Make2`, fs, selector, format, partitioning)
+}
+
+dataset___FileSystemDatasetFactory__Make1 <- function(fs, selector, format) {
+ .Call(`_arrow_dataset___FileSystemDatasetFactory__Make1`, fs, selector, format)
+}
+
+dataset___FileSystemDatasetFactory__Make3 <- function(fs, selector, format, factory) {
+ .Call(`_arrow_dataset___FileSystemDatasetFactory__Make3`, fs, selector, format, factory)
+}
+
+dataset___FileFormat__type_name <- function(format) {
+ .Call(`_arrow_dataset___FileFormat__type_name`, format)
+}
+
+dataset___FileFormat__DefaultWriteOptions <- function(fmt) {
+ .Call(`_arrow_dataset___FileFormat__DefaultWriteOptions`, fmt)
+}
+
+dataset___ParquetFileFormat__Make <- function(options, dict_columns) {
+ .Call(`_arrow_dataset___ParquetFileFormat__Make`, options, dict_columns)
+}
+
+dataset___FileWriteOptions__type_name <- function(options) {
+ .Call(`_arrow_dataset___FileWriteOptions__type_name`, options)
+}
+
+dataset___ParquetFileWriteOptions__update <- function(options, writer_props, arrow_writer_props) {
+ invisible(.Call(`_arrow_dataset___ParquetFileWriteOptions__update`, options, writer_props, arrow_writer_props))
+}
+
+dataset___IpcFileWriteOptions__update2 <- function(ipc_options, use_legacy_format, codec, metadata_version) {
+ invisible(.Call(`_arrow_dataset___IpcFileWriteOptions__update2`, ipc_options, use_legacy_format, codec, metadata_version))
+}
+
+dataset___IpcFileWriteOptions__update1 <- function(ipc_options, use_legacy_format, metadata_version) {
+ invisible(.Call(`_arrow_dataset___IpcFileWriteOptions__update1`, ipc_options, use_legacy_format, metadata_version))
+}
+
+dataset___CsvFileWriteOptions__update <- function(csv_options, write_options) {
+ invisible(.Call(`_arrow_dataset___CsvFileWriteOptions__update`, csv_options, write_options))
+}
+
+dataset___IpcFileFormat__Make <- function() {
+ .Call(`_arrow_dataset___IpcFileFormat__Make`)
+}
+
+dataset___CsvFileFormat__Make <- function(parse_options, convert_options, read_options) {
+ .Call(`_arrow_dataset___CsvFileFormat__Make`, parse_options, convert_options, read_options)
+}
+
+dataset___FragmentScanOptions__type_name <- function(fragment_scan_options) {
+ .Call(`_arrow_dataset___FragmentScanOptions__type_name`, fragment_scan_options)
+}
+
+dataset___CsvFragmentScanOptions__Make <- function(convert_options, read_options) {
+ .Call(`_arrow_dataset___CsvFragmentScanOptions__Make`, convert_options, read_options)
+}
+
+dataset___ParquetFragmentScanOptions__Make <- function(use_buffered_stream, buffer_size, pre_buffer) {
+ .Call(`_arrow_dataset___ParquetFragmentScanOptions__Make`, use_buffered_stream, buffer_size, pre_buffer)
+}
+
+dataset___DirectoryPartitioning <- function(schm, segment_encoding) {
+ .Call(`_arrow_dataset___DirectoryPartitioning`, schm, segment_encoding)
+}
+
+dataset___DirectoryPartitioning__MakeFactory <- function(field_names, segment_encoding) {
+ .Call(`_arrow_dataset___DirectoryPartitioning__MakeFactory`, field_names, segment_encoding)
+}
+
+dataset___HivePartitioning <- function(schm, null_fallback, segment_encoding) {
+ .Call(`_arrow_dataset___HivePartitioning`, schm, null_fallback, segment_encoding)
+}
+
+dataset___HivePartitioning__MakeFactory <- function(null_fallback, segment_encoding) {
+ .Call(`_arrow_dataset___HivePartitioning__MakeFactory`, null_fallback, segment_encoding)
+}
+
+dataset___ScannerBuilder__ProjectNames <- function(sb, cols) {
+ invisible(.Call(`_arrow_dataset___ScannerBuilder__ProjectNames`, sb, cols))
+}
+
+dataset___ScannerBuilder__ProjectExprs <- function(sb, exprs, names) {
+ invisible(.Call(`_arrow_dataset___ScannerBuilder__ProjectExprs`, sb, exprs, names))
+}
+
+dataset___ScannerBuilder__Filter <- function(sb, expr) {
+ invisible(.Call(`_arrow_dataset___ScannerBuilder__Filter`, sb, expr))
+}
+
+dataset___ScannerBuilder__UseThreads <- function(sb, threads) {
+ invisible(.Call(`_arrow_dataset___ScannerBuilder__UseThreads`, sb, threads))
+}
+
+dataset___ScannerBuilder__UseAsync <- function(sb, use_async) {
+ invisible(.Call(`_arrow_dataset___ScannerBuilder__UseAsync`, sb, use_async))
+}
+
+dataset___ScannerBuilder__BatchSize <- function(sb, batch_size) {
+ invisible(.Call(`_arrow_dataset___ScannerBuilder__BatchSize`, sb, batch_size))
+}
+
+dataset___ScannerBuilder__FragmentScanOptions <- function(sb, options) {
+ invisible(.Call(`_arrow_dataset___ScannerBuilder__FragmentScanOptions`, sb, options))
+}
+
+dataset___ScannerBuilder__schema <- function(sb) {
+ .Call(`_arrow_dataset___ScannerBuilder__schema`, sb)
+}
+
+dataset___ScannerBuilder__Finish <- function(sb) {
+ .Call(`_arrow_dataset___ScannerBuilder__Finish`, sb)
+}
+
+dataset___ScannerBuilder__FromRecordBatchReader <- function(reader) {
+ .Call(`_arrow_dataset___ScannerBuilder__FromRecordBatchReader`, reader)
+}
+
+dataset___Scanner__ToTable <- function(scanner) {
+ .Call(`_arrow_dataset___Scanner__ToTable`, scanner)
+}
+
+dataset___Scanner__ScanBatches <- function(scanner) {
+ .Call(`_arrow_dataset___Scanner__ScanBatches`, scanner)
+}
+
+dataset___Scanner__ToRecordBatchReader <- function(scanner) {
+ .Call(`_arrow_dataset___Scanner__ToRecordBatchReader`, scanner)
+}
+
+dataset___Scanner__head <- function(scanner, n) {
+ .Call(`_arrow_dataset___Scanner__head`, scanner, n)
+}
+
+dataset___Scanner__schema <- function(sc) {
+ .Call(`_arrow_dataset___Scanner__schema`, sc)
+}
+
+dataset___ScanTask__get_batches <- function(scan_task) {
+ .Call(`_arrow_dataset___ScanTask__get_batches`, scan_task)
+}
+
+dataset___Dataset__Write <- function(file_write_options, filesystem, base_dir, partitioning, basename_template, scanner, existing_data_behavior) {
+ invisible(.Call(`_arrow_dataset___Dataset__Write`, file_write_options, filesystem, base_dir, partitioning, basename_template, scanner, existing_data_behavior))
+}
+
+dataset___Scanner__TakeRows <- function(scanner, indices) {
+ .Call(`_arrow_dataset___Scanner__TakeRows`, scanner, indices)
+}
+
+dataset___Scanner__CountRows <- function(scanner) {
+ .Call(`_arrow_dataset___Scanner__CountRows`, scanner)
+}
+
+Int8__initialize <- function() {
+ .Call(`_arrow_Int8__initialize`)
+}
+
+Int16__initialize <- function() {
+ .Call(`_arrow_Int16__initialize`)
+}
+
+Int32__initialize <- function() {
+ .Call(`_arrow_Int32__initialize`)
+}
+
+Int64__initialize <- function() {
+ .Call(`_arrow_Int64__initialize`)
+}
+
+UInt8__initialize <- function() {
+ .Call(`_arrow_UInt8__initialize`)
+}
+
+UInt16__initialize <- function() {
+ .Call(`_arrow_UInt16__initialize`)
+}
+
+UInt32__initialize <- function() {
+ .Call(`_arrow_UInt32__initialize`)
+}
+
+UInt64__initialize <- function() {
+ .Call(`_arrow_UInt64__initialize`)
+}
+
+Float16__initialize <- function() {
+ .Call(`_arrow_Float16__initialize`)
+}
+
+Float32__initialize <- function() {
+ .Call(`_arrow_Float32__initialize`)
+}
+
+Float64__initialize <- function() {
+ .Call(`_arrow_Float64__initialize`)
+}
+
+Boolean__initialize <- function() {
+ .Call(`_arrow_Boolean__initialize`)
+}
+
+Utf8__initialize <- function() {
+ .Call(`_arrow_Utf8__initialize`)
+}
+
+LargeUtf8__initialize <- function() {
+ .Call(`_arrow_LargeUtf8__initialize`)
+}
+
+Binary__initialize <- function() {
+ .Call(`_arrow_Binary__initialize`)
+}
+
+LargeBinary__initialize <- function() {
+ .Call(`_arrow_LargeBinary__initialize`)
+}
+
+Date32__initialize <- function() {
+ .Call(`_arrow_Date32__initialize`)
+}
+
+Date64__initialize <- function() {
+ .Call(`_arrow_Date64__initialize`)
+}
+
+Null__initialize <- function() {
+ .Call(`_arrow_Null__initialize`)
+}
+
+Decimal128Type__initialize <- function(precision, scale) {
+ .Call(`_arrow_Decimal128Type__initialize`, precision, scale)
+}
+
+FixedSizeBinary__initialize <- function(byte_width) {
+ .Call(`_arrow_FixedSizeBinary__initialize`, byte_width)
+}
+
+Timestamp__initialize <- function(unit, timezone) {
+ .Call(`_arrow_Timestamp__initialize`, unit, timezone)
+}
+
+Time32__initialize <- function(unit) {
+ .Call(`_arrow_Time32__initialize`, unit)
+}
+
+Time64__initialize <- function(unit) {
+ .Call(`_arrow_Time64__initialize`, unit)
+}
+
+list__ <- function(x) {
+ .Call(`_arrow_list__`, x)
+}
+
+large_list__ <- function(x) {
+ .Call(`_arrow_large_list__`, x)
+}
+
+fixed_size_list__ <- function(x, list_size) {
+ .Call(`_arrow_fixed_size_list__`, x, list_size)
+}
+
+struct__ <- function(fields) {
+ .Call(`_arrow_struct__`, fields)
+}
+
+DataType__ToString <- function(type) {
+ .Call(`_arrow_DataType__ToString`, type)
+}
+
+DataType__name <- function(type) {
+ .Call(`_arrow_DataType__name`, type)
+}
+
+DataType__Equals <- function(lhs, rhs) {
+ .Call(`_arrow_DataType__Equals`, lhs, rhs)
+}
+
+DataType__num_fields <- function(type) {
+ .Call(`_arrow_DataType__num_fields`, type)
+}
+
+DataType__fields <- function(type) {
+ .Call(`_arrow_DataType__fields`, type)
+}
+
+DataType__id <- function(type) {
+ .Call(`_arrow_DataType__id`, type)
+}
+
+ListType__ToString <- function(type) {
+ .Call(`_arrow_ListType__ToString`, type)
+}
+
+FixedWidthType__bit_width <- function(type) {
+ .Call(`_arrow_FixedWidthType__bit_width`, type)
+}
+
+DateType__unit <- function(type) {
+ .Call(`_arrow_DateType__unit`, type)
+}
+
+TimeType__unit <- function(type) {
+ .Call(`_arrow_TimeType__unit`, type)
+}
+
+DecimalType__precision <- function(type) {
+ .Call(`_arrow_DecimalType__precision`, type)
+}
+
+DecimalType__scale <- function(type) {
+ .Call(`_arrow_DecimalType__scale`, type)
+}
+
+TimestampType__timezone <- function(type) {
+ .Call(`_arrow_TimestampType__timezone`, type)
+}
+
+TimestampType__unit <- function(type) {
+ .Call(`_arrow_TimestampType__unit`, type)
+}
+
+DictionaryType__initialize <- function(index_type, value_type, ordered) {
+ .Call(`_arrow_DictionaryType__initialize`, index_type, value_type, ordered)
+}
+
+DictionaryType__index_type <- function(type) {
+ .Call(`_arrow_DictionaryType__index_type`, type)
+}
+
+DictionaryType__value_type <- function(type) {
+ .Call(`_arrow_DictionaryType__value_type`, type)
+}
+
+DictionaryType__name <- function(type) {
+ .Call(`_arrow_DictionaryType__name`, type)
+}
+
+DictionaryType__ordered <- function(type) {
+ .Call(`_arrow_DictionaryType__ordered`, type)
+}
+
+StructType__GetFieldByName <- function(type, name) {
+ .Call(`_arrow_StructType__GetFieldByName`, type, name)
+}
+
+StructType__GetFieldIndex <- function(type, name) {
+ .Call(`_arrow_StructType__GetFieldIndex`, type, name)
+}
+
+StructType__field_names <- function(type) {
+ .Call(`_arrow_StructType__field_names`, type)
+}
+
+ListType__value_field <- function(type) {
+ .Call(`_arrow_ListType__value_field`, type)
+}
+
+ListType__value_type <- function(type) {
+ .Call(`_arrow_ListType__value_type`, type)
+}
+
+LargeListType__value_field <- function(type) {
+ .Call(`_arrow_LargeListType__value_field`, type)
+}
+
+LargeListType__value_type <- function(type) {
+ .Call(`_arrow_LargeListType__value_type`, type)
+}
+
+FixedSizeListType__value_field <- function(type) {
+ .Call(`_arrow_FixedSizeListType__value_field`, type)
+}
+
+FixedSizeListType__value_type <- function(type) {
+ .Call(`_arrow_FixedSizeListType__value_type`, type)
+}
+
+FixedSizeListType__list_size <- function(type) {
+ .Call(`_arrow_FixedSizeListType__list_size`, type)
+}
+
+compute___expr__equals <- function(lhs, rhs) {
+ .Call(`_arrow_compute___expr__equals`, lhs, rhs)
+}
+
+compute___expr__call <- function(func_name, argument_list, options) {
+ .Call(`_arrow_compute___expr__call`, func_name, argument_list, options)
+}
+
+field_names_in_expression <- function(x) {
+ .Call(`_arrow_field_names_in_expression`, x)
+}
+
+compute___expr__get_field_ref_name <- function(x) {
+ .Call(`_arrow_compute___expr__get_field_ref_name`, x)
+}
+
+compute___expr__field_ref <- function(name) {
+ .Call(`_arrow_compute___expr__field_ref`, name)
+}
+
+compute___expr__scalar <- function(x) {
+ .Call(`_arrow_compute___expr__scalar`, x)
+}
+
+compute___expr__ToString <- function(x) {
+ .Call(`_arrow_compute___expr__ToString`, x)
+}
+
+compute___expr__type <- function(x, schema) {
+ .Call(`_arrow_compute___expr__type`, x, schema)
+}
+
+compute___expr__type_id <- function(x, schema) {
+ .Call(`_arrow_compute___expr__type_id`, x, schema)
+}
+
+ipc___WriteFeather__Table <- function(stream, table, version, chunk_size, compression, compression_level) {
+ invisible(.Call(`_arrow_ipc___WriteFeather__Table`, stream, table, version, chunk_size, compression, compression_level))
+}
+
+ipc___feather___Reader__version <- function(reader) {
+ .Call(`_arrow_ipc___feather___Reader__version`, reader)
+}
+
+ipc___feather___Reader__Read <- function(reader, columns) {
+ .Call(`_arrow_ipc___feather___Reader__Read`, reader, columns)
+}
+
+ipc___feather___Reader__Open <- function(stream) {
+ .Call(`_arrow_ipc___feather___Reader__Open`, stream)
+}
+
+ipc___feather___Reader__schema <- function(reader) {
+ .Call(`_arrow_ipc___feather___Reader__schema`, reader)
+}
+
+Field__initialize <- function(name, field, nullable) {
+ .Call(`_arrow_Field__initialize`, name, field, nullable)
+}
+
+Field__ToString <- function(field) {
+ .Call(`_arrow_Field__ToString`, field)
+}
+
+Field__name <- function(field) {
+ .Call(`_arrow_Field__name`, field)
+}
+
+Field__Equals <- function(field, other) {
+ .Call(`_arrow_Field__Equals`, field, other)
+}
+
+Field__nullable <- function(field) {
+ .Call(`_arrow_Field__nullable`, field)
+}
+
+Field__type <- function(field) {
+ .Call(`_arrow_Field__type`, field)
+}
+
+fs___FileInfo__type <- function(x) {
+ .Call(`_arrow_fs___FileInfo__type`, x)
+}
+
+fs___FileInfo__set_type <- function(x, type) {
+ invisible(.Call(`_arrow_fs___FileInfo__set_type`, x, type))
+}
+
+fs___FileInfo__path <- function(x) {
+ .Call(`_arrow_fs___FileInfo__path`, x)
+}
+
+fs___FileInfo__set_path <- function(x, path) {
+ invisible(.Call(`_arrow_fs___FileInfo__set_path`, x, path))
+}
+
+fs___FileInfo__size <- function(x) {
+ .Call(`_arrow_fs___FileInfo__size`, x)
+}
+
+fs___FileInfo__set_size <- function(x, size) {
+ invisible(.Call(`_arrow_fs___FileInfo__set_size`, x, size))
+}
+
+fs___FileInfo__base_name <- function(x) {
+ .Call(`_arrow_fs___FileInfo__base_name`, x)
+}
+
+fs___FileInfo__extension <- function(x) {
+ .Call(`_arrow_fs___FileInfo__extension`, x)
+}
+
+fs___FileInfo__mtime <- function(x) {
+ .Call(`_arrow_fs___FileInfo__mtime`, x)
+}
+
+fs___FileInfo__set_mtime <- function(x, time) {
+ invisible(.Call(`_arrow_fs___FileInfo__set_mtime`, x, time))
+}
+
+fs___FileSelector__base_dir <- function(selector) {
+ .Call(`_arrow_fs___FileSelector__base_dir`, selector)
+}
+
+fs___FileSelector__allow_not_found <- function(selector) {
+ .Call(`_arrow_fs___FileSelector__allow_not_found`, selector)
+}
+
+fs___FileSelector__recursive <- function(selector) {
+ .Call(`_arrow_fs___FileSelector__recursive`, selector)
+}
+
+fs___FileSelector__create <- function(base_dir, allow_not_found, recursive) {
+ .Call(`_arrow_fs___FileSelector__create`, base_dir, allow_not_found, recursive)
+}
+
+fs___FileSystem__GetTargetInfos_Paths <- function(file_system, paths) {
+ .Call(`_arrow_fs___FileSystem__GetTargetInfos_Paths`, file_system, paths)
+}
+
+fs___FileSystem__GetTargetInfos_FileSelector <- function(file_system, selector) {
+ .Call(`_arrow_fs___FileSystem__GetTargetInfos_FileSelector`, file_system, selector)
+}
+
+fs___FileSystem__CreateDir <- function(file_system, path, recursive) {
+ invisible(.Call(`_arrow_fs___FileSystem__CreateDir`, file_system, path, recursive))
+}
+
+fs___FileSystem__DeleteDir <- function(file_system, path) {
+ invisible(.Call(`_arrow_fs___FileSystem__DeleteDir`, file_system, path))
+}
+
+fs___FileSystem__DeleteDirContents <- function(file_system, path) {
+ invisible(.Call(`_arrow_fs___FileSystem__DeleteDirContents`, file_system, path))
+}
+
+fs___FileSystem__DeleteFile <- function(file_system, path) {
+ invisible(.Call(`_arrow_fs___FileSystem__DeleteFile`, file_system, path))
+}
+
+fs___FileSystem__DeleteFiles <- function(file_system, paths) {
+ invisible(.Call(`_arrow_fs___FileSystem__DeleteFiles`, file_system, paths))
+}
+
+fs___FileSystem__Move <- function(file_system, src, dest) {
+ invisible(.Call(`_arrow_fs___FileSystem__Move`, file_system, src, dest))
+}
+
+fs___FileSystem__CopyFile <- function(file_system, src, dest) {
+ invisible(.Call(`_arrow_fs___FileSystem__CopyFile`, file_system, src, dest))
+}
+
+fs___FileSystem__OpenInputStream <- function(file_system, path) {
+ .Call(`_arrow_fs___FileSystem__OpenInputStream`, file_system, path)
+}
+
+fs___FileSystem__OpenInputFile <- function(file_system, path) {
+ .Call(`_arrow_fs___FileSystem__OpenInputFile`, file_system, path)
+}
+
+fs___FileSystem__OpenOutputStream <- function(file_system, path) {
+ .Call(`_arrow_fs___FileSystem__OpenOutputStream`, file_system, path)
+}
+
+fs___FileSystem__OpenAppendStream <- function(file_system, path) {
+ .Call(`_arrow_fs___FileSystem__OpenAppendStream`, file_system, path)
+}
+
+fs___FileSystem__type_name <- function(file_system) {
+ .Call(`_arrow_fs___FileSystem__type_name`, file_system)
+}
+
+fs___LocalFileSystem__create <- function() {
+ .Call(`_arrow_fs___LocalFileSystem__create`)
+}
+
+fs___SubTreeFileSystem__create <- function(base_path, base_fs) {
+ .Call(`_arrow_fs___SubTreeFileSystem__create`, base_path, base_fs)
+}
+
+fs___SubTreeFileSystem__base_fs <- function(file_system) {
+ .Call(`_arrow_fs___SubTreeFileSystem__base_fs`, file_system)
+}
+
+fs___SubTreeFileSystem__base_path <- function(file_system) {
+ .Call(`_arrow_fs___SubTreeFileSystem__base_path`, file_system)
+}
+
+fs___FileSystemFromUri <- function(path) {
+ .Call(`_arrow_fs___FileSystemFromUri`, path)
+}
+
+fs___CopyFiles <- function(source_fs, source_sel, destination_fs, destination_base_dir, chunk_size, use_threads) {
+ invisible(.Call(`_arrow_fs___CopyFiles`, source_fs, source_sel, destination_fs, destination_base_dir, chunk_size, use_threads))
+}
+
+fs___S3FileSystem__create <- function(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, background_writes) {
+ .Call(`_arrow_fs___S3FileSystem__create`, anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, background_writes)
+}
+
+fs___S3FileSystem__region <- function(fs) {
+ .Call(`_arrow_fs___S3FileSystem__region`, fs)
+}
+
+io___Readable__Read <- function(x, nbytes) {
+ .Call(`_arrow_io___Readable__Read`, x, nbytes)
+}
+
+io___InputStream__Close <- function(x) {
+ invisible(.Call(`_arrow_io___InputStream__Close`, x))
+}
+
+io___OutputStream__Close <- function(x) {
+ invisible(.Call(`_arrow_io___OutputStream__Close`, x))
+}
+
+io___RandomAccessFile__GetSize <- function(x) {
+ .Call(`_arrow_io___RandomAccessFile__GetSize`, x)
+}
+
+io___RandomAccessFile__supports_zero_copy <- function(x) {
+ .Call(`_arrow_io___RandomAccessFile__supports_zero_copy`, x)
+}
+
+io___RandomAccessFile__Seek <- function(x, position) {
+ invisible(.Call(`_arrow_io___RandomAccessFile__Seek`, x, position))
+}
+
+io___RandomAccessFile__Tell <- function(x) {
+ .Call(`_arrow_io___RandomAccessFile__Tell`, x)
+}
+
+io___RandomAccessFile__Read0 <- function(x) {
+ .Call(`_arrow_io___RandomAccessFile__Read0`, x)
+}
+
+io___RandomAccessFile__ReadAt <- function(x, position, nbytes) {
+ .Call(`_arrow_io___RandomAccessFile__ReadAt`, x, position, nbytes)
+}
+
+io___MemoryMappedFile__Create <- function(path, size) {
+ .Call(`_arrow_io___MemoryMappedFile__Create`, path, size)
+}
+
+io___MemoryMappedFile__Open <- function(path, mode) {
+ .Call(`_arrow_io___MemoryMappedFile__Open`, path, mode)
+}
+
+io___MemoryMappedFile__Resize <- function(x, size) {
+ invisible(.Call(`_arrow_io___MemoryMappedFile__Resize`, x, size))
+}
+
+io___ReadableFile__Open <- function(path) {
+ .Call(`_arrow_io___ReadableFile__Open`, path)
+}
+
+io___BufferReader__initialize <- function(buffer) {
+ .Call(`_arrow_io___BufferReader__initialize`, buffer)
+}
+
+io___Writable__write <- function(stream, buf) {
+ invisible(.Call(`_arrow_io___Writable__write`, stream, buf))
+}
+
+io___OutputStream__Tell <- function(stream) {
+ .Call(`_arrow_io___OutputStream__Tell`, stream)
+}
+
+io___FileOutputStream__Open <- function(path) {
+ .Call(`_arrow_io___FileOutputStream__Open`, path)
+}
+
+io___BufferOutputStream__Create <- function(initial_capacity) {
+ .Call(`_arrow_io___BufferOutputStream__Create`, initial_capacity)
+}
+
+io___BufferOutputStream__capacity <- function(stream) {
+ .Call(`_arrow_io___BufferOutputStream__capacity`, stream)
+}
+
+io___BufferOutputStream__Finish <- function(stream) {
+ .Call(`_arrow_io___BufferOutputStream__Finish`, stream)
+}
+
+io___BufferOutputStream__Tell <- function(stream) {
+ .Call(`_arrow_io___BufferOutputStream__Tell`, stream)
+}
+
+io___BufferOutputStream__Write <- function(stream, bytes) {
+ invisible(.Call(`_arrow_io___BufferOutputStream__Write`, stream, bytes))
+}
+
+json___ReadOptions__initialize <- function(use_threads, block_size) {
+ .Call(`_arrow_json___ReadOptions__initialize`, use_threads, block_size)
+}
+
+json___ParseOptions__initialize1 <- function(newlines_in_values) {
+ .Call(`_arrow_json___ParseOptions__initialize1`, newlines_in_values)
+}
+
+json___ParseOptions__initialize2 <- function(newlines_in_values, explicit_schema) {
+ .Call(`_arrow_json___ParseOptions__initialize2`, newlines_in_values, explicit_schema)
+}
+
+json___TableReader__Make <- function(input, read_options, parse_options) {
+ .Call(`_arrow_json___TableReader__Make`, input, read_options, parse_options)
+}
+
+json___TableReader__Read <- function(table_reader) {
+ .Call(`_arrow_json___TableReader__Read`, table_reader)
+}
+
+MemoryPool__default <- function() {
+ .Call(`_arrow_MemoryPool__default`)
+}
+
+MemoryPool__bytes_allocated <- function(pool) {
+ .Call(`_arrow_MemoryPool__bytes_allocated`, pool)
+}
+
+MemoryPool__max_memory <- function(pool) {
+ .Call(`_arrow_MemoryPool__max_memory`, pool)
+}
+
+MemoryPool__backend_name <- function(pool) {
+ .Call(`_arrow_MemoryPool__backend_name`, pool)
+}
+
+supported_memory_backends <- function() {
+ .Call(`_arrow_supported_memory_backends`)
+}
+
+ipc___Message__body_length <- function(message) {
+ .Call(`_arrow_ipc___Message__body_length`, message)
+}
+
+ipc___Message__metadata <- function(message) {
+ .Call(`_arrow_ipc___Message__metadata`, message)
+}
+
+ipc___Message__body <- function(message) {
+ .Call(`_arrow_ipc___Message__body`, message)
+}
+
+ipc___Message__Verify <- function(message) {
+ .Call(`_arrow_ipc___Message__Verify`, message)
+}
+
+ipc___Message__type <- function(message) {
+ .Call(`_arrow_ipc___Message__type`, message)
+}
+
+ipc___Message__Equals <- function(x, y) {
+ .Call(`_arrow_ipc___Message__Equals`, x, y)
+}
+
+ipc___ReadRecordBatch__Message__Schema <- function(message, schema) {
+ .Call(`_arrow_ipc___ReadRecordBatch__Message__Schema`, message, schema)
+}
+
+ipc___ReadSchema_InputStream <- function(stream) {
+ .Call(`_arrow_ipc___ReadSchema_InputStream`, stream)
+}
+
+ipc___ReadSchema_Message <- function(message) {
+ .Call(`_arrow_ipc___ReadSchema_Message`, message)
+}
+
+ipc___MessageReader__Open <- function(stream) {
+ .Call(`_arrow_ipc___MessageReader__Open`, stream)
+}
+
+ipc___MessageReader__ReadNextMessage <- function(reader) {
+ .Call(`_arrow_ipc___MessageReader__ReadNextMessage`, reader)
+}
+
+ipc___ReadMessage <- function(stream) {
+ .Call(`_arrow_ipc___ReadMessage`, stream)
+}
+
+parquet___arrow___ArrowReaderProperties__Make <- function(use_threads) {
+ .Call(`_arrow_parquet___arrow___ArrowReaderProperties__Make`, use_threads)
+}
+
+parquet___arrow___ArrowReaderProperties__set_use_threads <- function(properties, use_threads) {
+ invisible(.Call(`_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads`, properties, use_threads))
+}
+
+parquet___arrow___ArrowReaderProperties__get_use_threads <- function(properties, use_threads) {
+ .Call(`_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads`, properties, use_threads)
+}
+
+parquet___arrow___ArrowReaderProperties__get_read_dictionary <- function(properties, column_index) {
+ .Call(`_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary`, properties, column_index)
+}
+
+parquet___arrow___ArrowReaderProperties__set_read_dictionary <- function(properties, column_index, read_dict) {
+ invisible(.Call(`_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary`, properties, column_index, read_dict))
+}
+
+parquet___arrow___FileReader__OpenFile <- function(file, props) {
+ .Call(`_arrow_parquet___arrow___FileReader__OpenFile`, file, props)
+}
+
+parquet___arrow___FileReader__ReadTable1 <- function(reader) {
+ .Call(`_arrow_parquet___arrow___FileReader__ReadTable1`, reader)
+}
+
+parquet___arrow___FileReader__ReadTable2 <- function(reader, column_indices) {
+ .Call(`_arrow_parquet___arrow___FileReader__ReadTable2`, reader, column_indices)
+}
+
+parquet___arrow___FileReader__ReadRowGroup1 <- function(reader, i) {
+ .Call(`_arrow_parquet___arrow___FileReader__ReadRowGroup1`, reader, i)
+}
+
+parquet___arrow___FileReader__ReadRowGroup2 <- function(reader, i, column_indices) {
+ .Call(`_arrow_parquet___arrow___FileReader__ReadRowGroup2`, reader, i, column_indices)
+}
+
+parquet___arrow___FileReader__ReadRowGroups1 <- function(reader, row_groups) {
+ .Call(`_arrow_parquet___arrow___FileReader__ReadRowGroups1`, reader, row_groups)
+}
+
+parquet___arrow___FileReader__ReadRowGroups2 <- function(reader, row_groups, column_indices) {
+ .Call(`_arrow_parquet___arrow___FileReader__ReadRowGroups2`, reader, row_groups, column_indices)
+}
+
+parquet___arrow___FileReader__num_rows <- function(reader) {
+ .Call(`_arrow_parquet___arrow___FileReader__num_rows`, reader)
+}
+
+parquet___arrow___FileReader__num_columns <- function(reader) {
+ .Call(`_arrow_parquet___arrow___FileReader__num_columns`, reader)
+}
+
+parquet___arrow___FileReader__num_row_groups <- function(reader) {
+ .Call(`_arrow_parquet___arrow___FileReader__num_row_groups`, reader)
+}
+
+parquet___arrow___FileReader__ReadColumn <- function(reader, i) {
+ .Call(`_arrow_parquet___arrow___FileReader__ReadColumn`, reader, i)
+}
+
+parquet___ArrowWriterProperties___create <- function(allow_truncated_timestamps, use_deprecated_int96_timestamps, timestamp_unit) {
+ .Call(`_arrow_parquet___ArrowWriterProperties___create`, allow_truncated_timestamps, use_deprecated_int96_timestamps, timestamp_unit)
+}
+
+parquet___WriterProperties___Builder__create <- function() {
+ .Call(`_arrow_parquet___WriterProperties___Builder__create`)
+}
+
+parquet___WriterProperties___Builder__version <- function(builder, version) {
+ invisible(.Call(`_arrow_parquet___WriterProperties___Builder__version`, builder, version))
+}
+
+parquet___ArrowWriterProperties___Builder__set_compressions <- function(builder, paths, types) {
+ invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_compressions`, builder, paths, types))
+}
+
+parquet___ArrowWriterProperties___Builder__set_compression_levels <- function(builder, paths, levels) {
+ invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels`, builder, paths, levels))
+}
+
+parquet___ArrowWriterProperties___Builder__set_use_dictionary <- function(builder, paths, use_dictionary) {
+ invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary`, builder, paths, use_dictionary))
+}
+
+parquet___ArrowWriterProperties___Builder__set_write_statistics <- function(builder, paths, write_statistics) {
+ invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics`, builder, paths, write_statistics))
+}
+
+parquet___ArrowWriterProperties___Builder__data_page_size <- function(builder, data_page_size) {
+ invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__data_page_size`, builder, data_page_size))
+}
+
+parquet___WriterProperties___Builder__build <- function(builder) {
+ .Call(`_arrow_parquet___WriterProperties___Builder__build`, builder)
+}
+
+parquet___arrow___ParquetFileWriter__Open <- function(schema, sink, properties, arrow_properties) {
+ .Call(`_arrow_parquet___arrow___ParquetFileWriter__Open`, schema, sink, properties, arrow_properties)
+}
+
+parquet___arrow___FileWriter__WriteTable <- function(writer, table, chunk_size) {
+ invisible(.Call(`_arrow_parquet___arrow___FileWriter__WriteTable`, writer, table, chunk_size))
+}
+
+parquet___arrow___FileWriter__Close <- function(writer) {
+ invisible(.Call(`_arrow_parquet___arrow___FileWriter__Close`, writer))
+}
+
+parquet___arrow___WriteTable <- function(table, sink, properties, arrow_properties) {
+ invisible(.Call(`_arrow_parquet___arrow___WriteTable`, table, sink, properties, arrow_properties))
+}
+
+parquet___arrow___FileReader__GetSchema <- function(reader) {
+ .Call(`_arrow_parquet___arrow___FileReader__GetSchema`, reader)
+}
+
+allocate_arrow_schema <- function() {
+ .Call(`_arrow_allocate_arrow_schema`)
+}
+
+delete_arrow_schema <- function(ptr) {
+ invisible(.Call(`_arrow_delete_arrow_schema`, ptr))
+}
+
+allocate_arrow_array <- function() {
+ .Call(`_arrow_allocate_arrow_array`)
+}
+
+delete_arrow_array <- function(ptr) {
+ invisible(.Call(`_arrow_delete_arrow_array`, ptr))
+}
+
+allocate_arrow_array_stream <- function() {
+ .Call(`_arrow_allocate_arrow_array_stream`)
+}
+
+delete_arrow_array_stream <- function(ptr) {
+ invisible(.Call(`_arrow_delete_arrow_array_stream`, ptr))
+}
+
+ImportArray <- function(array, schema) {
+ .Call(`_arrow_ImportArray`, array, schema)
+}
+
+ImportRecordBatch <- function(array, schema) {
+ .Call(`_arrow_ImportRecordBatch`, array, schema)
+}
+
+ImportSchema <- function(schema) {
+ .Call(`_arrow_ImportSchema`, schema)
+}
+
+ImportField <- function(field) {
+ .Call(`_arrow_ImportField`, field)
+}
+
+ImportType <- function(type) {
+ .Call(`_arrow_ImportType`, type)
+}
+
+ImportRecordBatchReader <- function(stream) {
+ .Call(`_arrow_ImportRecordBatchReader`, stream)
+}
+
+ExportType <- function(type, ptr) {
+ invisible(.Call(`_arrow_ExportType`, type, ptr))
+}
+
+ExportField <- function(field, ptr) {
+ invisible(.Call(`_arrow_ExportField`, field, ptr))
+}
+
+ExportSchema <- function(schema, ptr) {
+ invisible(.Call(`_arrow_ExportSchema`, schema, ptr))
+}
+
+ExportArray <- function(array, array_ptr, schema_ptr) {
+ invisible(.Call(`_arrow_ExportArray`, array, array_ptr, schema_ptr))
+}
+
+ExportRecordBatch <- function(batch, array_ptr, schema_ptr) {
+ invisible(.Call(`_arrow_ExportRecordBatch`, batch, array_ptr, schema_ptr))
+}
+
+ExportRecordBatchReader <- function(reader, stream_ptr) {
+ invisible(.Call(`_arrow_ExportRecordBatchReader`, reader, stream_ptr))
+}
+
+Table__from_dots <- function(lst, schema_sxp, use_threads) {
+ .Call(`_arrow_Table__from_dots`, lst, schema_sxp, use_threads)
+}
+
+vec_to_arrow <- function(x, s_type) {
+ .Call(`_arrow_vec_to_arrow`, x, s_type)
+}
+
+DictionaryArray__FromArrays <- function(type, indices, dict) {
+ .Call(`_arrow_DictionaryArray__FromArrays`, type, indices, dict)
+}
+
+RecordBatch__num_columns <- function(x) {
+ .Call(`_arrow_RecordBatch__num_columns`, x)
+}
+
+RecordBatch__num_rows <- function(x) {
+ .Call(`_arrow_RecordBatch__num_rows`, x)
+}
+
+RecordBatch__schema <- function(x) {
+ .Call(`_arrow_RecordBatch__schema`, x)
+}
+
+RecordBatch__RenameColumns <- function(batch, names) {
+ .Call(`_arrow_RecordBatch__RenameColumns`, batch, names)
+}
+
+RecordBatch__ReplaceSchemaMetadata <- function(x, metadata) {
+ .Call(`_arrow_RecordBatch__ReplaceSchemaMetadata`, x, metadata)
+}
+
+RecordBatch__columns <- function(batch) {
+ .Call(`_arrow_RecordBatch__columns`, batch)
+}
+
+RecordBatch__column <- function(batch, i) {
+ .Call(`_arrow_RecordBatch__column`, batch, i)
+}
+
+RecordBatch__GetColumnByName <- function(batch, name) {
+ .Call(`_arrow_RecordBatch__GetColumnByName`, batch, name)
+}
+
+RecordBatch__SelectColumns <- function(batch, indices) {
+ .Call(`_arrow_RecordBatch__SelectColumns`, batch, indices)
+}
+
+RecordBatch__Equals <- function(self, other, check_metadata) {
+ .Call(`_arrow_RecordBatch__Equals`, self, other, check_metadata)
+}
+
+RecordBatch__AddColumn <- function(batch, i, field, column) {
+ .Call(`_arrow_RecordBatch__AddColumn`, batch, i, field, column)
+}
+
+RecordBatch__SetColumn <- function(batch, i, field, column) {
+ .Call(`_arrow_RecordBatch__SetColumn`, batch, i, field, column)
+}
+
+RecordBatch__RemoveColumn <- function(batch, i) {
+ .Call(`_arrow_RecordBatch__RemoveColumn`, batch, i)
+}
+
+RecordBatch__column_name <- function(batch, i) {
+ .Call(`_arrow_RecordBatch__column_name`, batch, i)
+}
+
+RecordBatch__names <- function(batch) {
+ .Call(`_arrow_RecordBatch__names`, batch)
+}
+
+RecordBatch__Slice1 <- function(self, offset) {
+ .Call(`_arrow_RecordBatch__Slice1`, self, offset)
+}
+
+RecordBatch__Slice2 <- function(self, offset, length) {
+ .Call(`_arrow_RecordBatch__Slice2`, self, offset, length)
+}
+
+ipc___SerializeRecordBatch__Raw <- function(batch) {
+ .Call(`_arrow_ipc___SerializeRecordBatch__Raw`, batch)
+}
+
+ipc___ReadRecordBatch__InputStream__Schema <- function(stream, schema) {
+ .Call(`_arrow_ipc___ReadRecordBatch__InputStream__Schema`, stream, schema)
+}
+
+RecordBatch__from_arrays <- function(schema_sxp, lst) {
+ .Call(`_arrow_RecordBatch__from_arrays`, schema_sxp, lst)
+}
+
+RecordBatchReader__schema <- function(reader) {
+ .Call(`_arrow_RecordBatchReader__schema`, reader)
+}
+
+RecordBatchReader__ReadNext <- function(reader) {
+ .Call(`_arrow_RecordBatchReader__ReadNext`, reader)
+}
+
+RecordBatchReader__batches <- function(reader) {
+ .Call(`_arrow_RecordBatchReader__batches`, reader)
+}
+
+Table__from_RecordBatchReader <- function(reader) {
+ .Call(`_arrow_Table__from_RecordBatchReader`, reader)
+}
+
+ipc___RecordBatchStreamReader__Open <- function(stream) {
+ .Call(`_arrow_ipc___RecordBatchStreamReader__Open`, stream)
+}
+
+ipc___RecordBatchFileReader__schema <- function(reader) {
+ .Call(`_arrow_ipc___RecordBatchFileReader__schema`, reader)
+}
+
+ipc___RecordBatchFileReader__num_record_batches <- function(reader) {
+ .Call(`_arrow_ipc___RecordBatchFileReader__num_record_batches`, reader)
+}
+
+ipc___RecordBatchFileReader__ReadRecordBatch <- function(reader, i) {
+ .Call(`_arrow_ipc___RecordBatchFileReader__ReadRecordBatch`, reader, i)
+}
+
+ipc___RecordBatchFileReader__Open <- function(file) {
+ .Call(`_arrow_ipc___RecordBatchFileReader__Open`, file)
+}
+
+Table__from_RecordBatchFileReader <- function(reader) {
+ .Call(`_arrow_Table__from_RecordBatchFileReader`, reader)
+}
+
+ipc___RecordBatchFileReader__batches <- function(reader) {
+ .Call(`_arrow_ipc___RecordBatchFileReader__batches`, reader)
+}
+
+ipc___RecordBatchWriter__WriteRecordBatch <- function(batch_writer, batch) {
+ invisible(.Call(`_arrow_ipc___RecordBatchWriter__WriteRecordBatch`, batch_writer, batch))
+}
+
+ipc___RecordBatchWriter__WriteTable <- function(batch_writer, table) {
+ invisible(.Call(`_arrow_ipc___RecordBatchWriter__WriteTable`, batch_writer, table))
+}
+
+ipc___RecordBatchWriter__Close <- function(batch_writer) {
+ invisible(.Call(`_arrow_ipc___RecordBatchWriter__Close`, batch_writer))
+}
+
+ipc___RecordBatchFileWriter__Open <- function(stream, schema, use_legacy_format, metadata_version) {
+ .Call(`_arrow_ipc___RecordBatchFileWriter__Open`, stream, schema, use_legacy_format, metadata_version)
+}
+
+ipc___RecordBatchStreamWriter__Open <- function(stream, schema, use_legacy_format, metadata_version) {
+ .Call(`_arrow_ipc___RecordBatchStreamWriter__Open`, stream, schema, use_legacy_format, metadata_version)
+}
+
+Array__GetScalar <- function(x, i) {
+ .Call(`_arrow_Array__GetScalar`, x, i)
+}
+
+Scalar__ToString <- function(s) {
+ .Call(`_arrow_Scalar__ToString`, s)
+}
+
+StructScalar__field <- function(s, i) {
+ .Call(`_arrow_StructScalar__field`, s, i)
+}
+
+StructScalar__GetFieldByName <- function(s, name) {
+ .Call(`_arrow_StructScalar__GetFieldByName`, s, name)
+}
+
+Scalar__as_vector <- function(scalar) {
+ .Call(`_arrow_Scalar__as_vector`, scalar)
+}
+
+MakeArrayFromScalar <- function(scalar, n) {
+ .Call(`_arrow_MakeArrayFromScalar`, scalar, n)
+}
+
+Scalar__is_valid <- function(s) {
+ .Call(`_arrow_Scalar__is_valid`, s)
+}
+
+Scalar__type <- function(s) {
+ .Call(`_arrow_Scalar__type`, s)
+}
+
+Scalar__Equals <- function(lhs, rhs) {
+ .Call(`_arrow_Scalar__Equals`, lhs, rhs)
+}
+
+Scalar__ApproxEquals <- function(lhs, rhs) {
+ .Call(`_arrow_Scalar__ApproxEquals`, lhs, rhs)
+}
+
+schema_ <- function(fields) {
+ .Call(`_arrow_schema_`, fields)
+}
+
+Schema__ToString <- function(s) {
+ .Call(`_arrow_Schema__ToString`, s)
+}
+
+Schema__num_fields <- function(s) {
+ .Call(`_arrow_Schema__num_fields`, s)
+}
+
+Schema__field <- function(s, i) {
+ .Call(`_arrow_Schema__field`, s, i)
+}
+
+Schema__AddField <- function(s, i, field) {
+ .Call(`_arrow_Schema__AddField`, s, i, field)
+}
+
+Schema__SetField <- function(s, i, field) {
+ .Call(`_arrow_Schema__SetField`, s, i, field)
+}
+
+Schema__RemoveField <- function(s, i) {
+ .Call(`_arrow_Schema__RemoveField`, s, i)
+}
+
+Schema__GetFieldByName <- function(s, x) {
+ .Call(`_arrow_Schema__GetFieldByName`, s, x)
+}
+
+Schema__fields <- function(schema) {
+ .Call(`_arrow_Schema__fields`, schema)
+}
+
+Schema__field_names <- function(schema) {
+ .Call(`_arrow_Schema__field_names`, schema)
+}
+
+Schema__HasMetadata <- function(schema) {
+ .Call(`_arrow_Schema__HasMetadata`, schema)
+}
+
+Schema__metadata <- function(schema) {
+ .Call(`_arrow_Schema__metadata`, schema)
+}
+
+Schema__WithMetadata <- function(schema, metadata) {
+ .Call(`_arrow_Schema__WithMetadata`, schema, metadata)
+}
+
+Schema__serialize <- function(schema) {
+ .Call(`_arrow_Schema__serialize`, schema)
+}
+
+Schema__Equals <- function(schema, other, check_metadata) {
+ .Call(`_arrow_Schema__Equals`, schema, other, check_metadata)
+}
+
+arrow__UnifySchemas <- function(schemas) {
+ .Call(`_arrow_arrow__UnifySchemas`, schemas)
+}
+
+Table__num_columns <- function(x) {
+ .Call(`_arrow_Table__num_columns`, x)
+}
+
+Table__num_rows <- function(x) {
+ .Call(`_arrow_Table__num_rows`, x)
+}
+
+Table__schema <- function(x) {
+ .Call(`_arrow_Table__schema`, x)
+}
+
+Table__ReplaceSchemaMetadata <- function(x, metadata) {
+ .Call(`_arrow_Table__ReplaceSchemaMetadata`, x, metadata)
+}
+
+Table__column <- function(table, i) {
+ .Call(`_arrow_Table__column`, table, i)
+}
+
+Table__field <- function(table, i) {
+ .Call(`_arrow_Table__field`, table, i)
+}
+
+Table__columns <- function(table) {
+ .Call(`_arrow_Table__columns`, table)
+}
+
+Table__ColumnNames <- function(table) {
+ .Call(`_arrow_Table__ColumnNames`, table)
+}
+
+Table__RenameColumns <- function(table, names) {
+ .Call(`_arrow_Table__RenameColumns`, table, names)
+}
+
+Table__Slice1 <- function(table, offset) {
+ .Call(`_arrow_Table__Slice1`, table, offset)
+}
+
+Table__Slice2 <- function(table, offset, length) {
+ .Call(`_arrow_Table__Slice2`, table, offset, length)
+}
+
+Table__Equals <- function(lhs, rhs, check_metadata) {
+ .Call(`_arrow_Table__Equals`, lhs, rhs, check_metadata)
+}
+
+Table__Validate <- function(table) {
+ .Call(`_arrow_Table__Validate`, table)
+}
+
+Table__ValidateFull <- function(table) {
+ .Call(`_arrow_Table__ValidateFull`, table)
+}
+
+Table__GetColumnByName <- function(table, name) {
+ .Call(`_arrow_Table__GetColumnByName`, table, name)
+}
+
+Table__RemoveColumn <- function(table, i) {
+ .Call(`_arrow_Table__RemoveColumn`, table, i)
+}
+
+Table__AddColumn <- function(table, i, field, column) {
+ .Call(`_arrow_Table__AddColumn`, table, i, field, column)
+}
+
+Table__SetColumn <- function(table, i, field, column) {
+ .Call(`_arrow_Table__SetColumn`, table, i, field, column)
+}
+
+Table__SelectColumns <- function(table, indices) {
+ .Call(`_arrow_Table__SelectColumns`, table, indices)
+}
+
+all_record_batches <- function(lst) {
+ .Call(`_arrow_all_record_batches`, lst)
+}
+
+Table__from_record_batches <- function(batches, schema_sxp) {
+ .Call(`_arrow_Table__from_record_batches`, batches, schema_sxp)
+}
+
+GetCpuThreadPoolCapacity <- function() {
+ .Call(`_arrow_GetCpuThreadPoolCapacity`)
+}
+
+SetCpuThreadPoolCapacity <- function(threads) {
+ invisible(.Call(`_arrow_SetCpuThreadPoolCapacity`, threads))
+}
+
+GetIOThreadPoolCapacity <- function() {
+ .Call(`_arrow_GetIOThreadPoolCapacity`)
+}
+
+SetIOThreadPoolCapacity <- function(threads) {
+ invisible(.Call(`_arrow_SetIOThreadPoolCapacity`, threads))
+}
+
+Array__infer_type <- function(x) {
+ .Call(`_arrow_Array__infer_type`, x)
+}
diff --git a/src/arrow/r/R/buffer.R b/src/arrow/r/R/buffer.R
new file mode 100644
index 000000000..a9424fd0d
--- /dev/null
+++ b/src/arrow/r/R/buffer.R
@@ -0,0 +1,78 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @title Buffer class
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @description A Buffer is an object containing a pointer to a piece of
+#' contiguous memory with a particular size.
+#' @section Factory:
+#' `buffer()` lets you create an `arrow::Buffer` from an R object
+#' @section Methods:
+#'
+#' - `$is_mutable` : is this buffer mutable?
+#' - `$ZeroPadding()` : zero bytes in padding, i.e. bytes between size and capacity
+#' - `$size` : size in memory, in bytes
+#' - `$capacity`: possible capacity, in bytes
+#'
+#' @rdname buffer
+#' @name buffer
+#' @examplesIf arrow_available()
+#' my_buffer <- buffer(c(1, 2, 3, 4))
+#' my_buffer$is_mutable
+#' my_buffer$ZeroPadding()
+#' my_buffer$size
+#' my_buffer$capacity
+#' @export
+#' @include arrow-package.R
+#' @include enums.R
+Buffer <- R6Class("Buffer",
+ inherit = ArrowObject,
+ public = list(
+ ZeroPadding = function() Buffer__ZeroPadding(self),
+ data = function() Buffer__data(self),
+ Equals = function(other, ...) {
+ inherits(other, "Buffer") && Buffer__Equals(self, other)
+ }
+ ),
+ active = list(
+ is_mutable = function() Buffer__is_mutable(self),
+ size = function() Buffer__size(self),
+ capacity = function() Buffer__capacity(self)
+ )
+)
+
+Buffer$create <- function(x) {
+ if (inherits(x, "Buffer")) {
+ x
+ } else if (inherits(x, c("raw", "numeric", "integer", "complex"))) {
+ r___RBuffer__initialize(x)
+ } else if (inherits(x, "BufferOutputStream")) {
+ x$finish()
+ } else {
+ stop("Cannot convert object of class ", class(x), " to arrow::Buffer")
+ }
+}
+
+#' @param x R object. Only raw, numeric and integer vectors are currently supported
+#' @return an instance of `Buffer` that borrows memory from `x`
+#' @export
+buffer <- Buffer$create
+
+#' @export
+as.raw.Buffer <- function(x) x$data()
diff --git a/src/arrow/r/R/chunked-array.R b/src/arrow/r/R/chunked-array.R
new file mode 100644
index 000000000..597180ea7
--- /dev/null
+++ b/src/arrow/r/R/chunked-array.R
@@ -0,0 +1,153 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-datum.R
+
+#' @title ChunkedArray class
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @description A `ChunkedArray` is a data structure managing a list of
+#' primitive Arrow [Arrays][Array] logically as one large array. Chunked arrays
+#' may be grouped together in a [Table].
+#' @section Factory:
+#' The `ChunkedArray$create()` factory method instantiates the object from
+#' various Arrays or R vectors. `chunked_array()` is an alias for it.
+#'
+#' @section Methods:
+#'
+#' - `$length()`: Size in the number of elements this array contains
+#' - `$chunk(i)`: Extract an `Array` chunk by integer position
+#' - `$as_vector()`: convert to an R vector
+#' - `$Slice(offset, length = NULL)`: Construct a zero-copy slice of the array
+#' with the indicated offset and length. If length is `NULL`, the slice goes
+#' until the end of the array.
+#' - `$Take(i)`: return a `ChunkedArray` with values at positions given by
+#' integers `i`. If `i` is an Arrow `Array` or `ChunkedArray`, it will be
+#' coerced to an R vector before taking.
+#' - `$Filter(i, keep_na = TRUE)`: return a `ChunkedArray` with values at positions where
+#' logical vector or Arrow boolean-type `(Chunked)Array` `i` is `TRUE`.
+#' - `$SortIndices(descending = FALSE)`: return an `Array` of integer positions that can be
+#' used to rearrange the `ChunkedArray` in ascending or descending order
+#' - `$cast(target_type, safe = TRUE, options = cast_options(safe))`: Alter the
+#' data in the array to change its type.
+#' - `$null_count`: The number of null entries in the array
+#' - `$chunks`: return a list of `Array`s
+#' - `$num_chunks`: integer number of chunks in the `ChunkedArray`
+#' - `$type`: logical type of data
+#' - `$View(type)`: Construct a zero-copy view of this `ChunkedArray` with the
+#' given type.
+#' - `$Validate()`: Perform any validation checks to determine obvious inconsistencies
+#' within the array's internal data. This can be an expensive check, potentially `O(length)`
+#'
+#' @rdname ChunkedArray
+#' @name ChunkedArray
+#' @seealso [Array]
+#' @examplesIf arrow_available()
+#' # Pass items into chunked_array as separate objects to create chunks
+#' class_scores <- chunked_array(c(87, 88, 89), c(94, 93, 92), c(71, 72, 73))
+#' class_scores$num_chunks
+#'
+#' # When taking a Slice from a chunked_array, chunks are preserved
+#' class_scores$Slice(2, length = 5)
+#'
+#' # You can combine Take and SortIndices to return a ChunkedArray with 1 chunk
+#' # containing all values, ordered.
+#' class_scores$Take(class_scores$SortIndices(descending = TRUE))
+#'
+#' # If you pass a list into chunked_array, you get a list of length 1
+#' list_scores <- chunked_array(list(c(9.9, 9.6, 9.5), c(8.2, 8.3, 8.4), c(10.0, 9.9, 9.8)))
+#' list_scores$num_chunks
+#'
+#' # When constructing a ChunkedArray, the first chunk is used to infer type.
+#' doubles <- chunked_array(c(1, 2, 3), c(5L, 6L, 7L))
+#' doubles$type
+#' @export
+ChunkedArray <- R6Class("ChunkedArray",
+ inherit = ArrowDatum,
+ public = list(
+ length = function() ChunkedArray__length(self),
+ type_id = function() ChunkedArray__type(self)$id,
+ chunk = function(i) Array$create(ChunkedArray__chunk(self, i)),
+ as_vector = function() ChunkedArray__as_vector(self, option_use_threads()),
+ Slice = function(offset, length = NULL) {
+ if (is.null(length)) {
+ ChunkedArray__Slice1(self, offset)
+ } else {
+ ChunkedArray__Slice2(self, offset, length)
+ }
+ },
+ Take = function(i) {
+ if (is.numeric(i)) {
+ i <- as.integer(i)
+ }
+ if (is.integer(i)) {
+ i <- Array$create(i)
+ }
+ call_function("take", self, i)
+ },
+ Filter = function(i, keep_na = TRUE) {
+ if (is.logical(i)) {
+ i <- Array$create(i)
+ }
+ call_function("filter", self, i, options = list(keep_na = keep_na))
+ },
+ SortIndices = function(descending = FALSE) {
+ assert_that(is.logical(descending))
+ assert_that(length(descending) == 1L)
+ assert_that(!is.na(descending))
+ # TODO: after ARROW-12042 is closed, review whether this and the
+ # Array$SortIndices definition can be consolidated
+ call_function(
+ "sort_indices",
+ self,
+ options = list(names = "", orders = as.integer(descending))
+ )
+ },
+ View = function(type) {
+ ChunkedArray__View(self, as_type(type))
+ },
+ Validate = function() {
+ ChunkedArray__Validate(self)
+ },
+ ToString = function() {
+ ChunkedArray__ToString(self)
+ },
+ Equals = function(other, ...) {
+ inherits(other, "ChunkedArray") && ChunkedArray__Equals(self, other)
+ }
+ ),
+ active = list(
+ null_count = function() ChunkedArray__null_count(self),
+ num_chunks = function() ChunkedArray__num_chunks(self),
+ chunks = function() map(ChunkedArray__chunks(self), Array$create),
+ type = function() ChunkedArray__type(self)
+ )
+)
+
+ChunkedArray$create <- function(..., type = NULL) {
+ if (!is.null(type)) {
+ type <- as_type(type)
+ }
+ ChunkedArray__from_list(list2(...), type)
+}
+
+#' @param \dots Vectors to coerce
+#' @param type currently ignored
+#' @rdname ChunkedArray
+#' @export
+chunked_array <- ChunkedArray$create
diff --git a/src/arrow/r/R/compression.R b/src/arrow/r/R/compression.R
new file mode 100644
index 000000000..7107012d0
--- /dev/null
+++ b/src/arrow/r/R/compression.R
@@ -0,0 +1,124 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include enums.R
+#' @include arrow-package.R
+#' @include io.R
+
+#' @title Compression Codec class
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @description Codecs allow you to create [compressed input and output
+#' streams][compression].
+#' @section Factory:
+#' The `Codec$create()` factory method takes the following arguments:
+#' * `type`: string name of the compression method. Possible values are
+#' "uncompressed", "snappy", "gzip", "brotli", "zstd", "lz4", "lzo", or
+#' "bz2". `type` may be upper- or lower-cased. Not all methods may be
+#' available; support depends on build-time flags for the C++ library.
+#' See [codec_is_available()]. Most builds support at least "snappy" and
+#' "gzip". All support "uncompressed".
+#' * `compression_level`: compression level, the default value (`NA`) uses the
+#' default compression level for the selected compression `type`.
+#' @rdname Codec
+#' @name Codec
+#' @export
+Codec <- R6Class("Codec",
+ inherit = ArrowObject,
+ active = list(
+ name = function() util___Codec__name(self),
+ level = function() abort("Codec$level() not yet implemented")
+ )
+)
+Codec$create <- function(type = "gzip", compression_level = NA) {
+ if (is.string(type)) {
+ type <- util___Codec__Create(
+ compression_from_name(type), compression_level
+ )
+ }
+ assert_is(type, "Codec")
+ type
+}
+
+#' Check whether a compression codec is available
+#'
+#' Support for compression libraries depends on the build-time settings of
+#' the Arrow C++ library. This function lets you know which are available for
+#' use.
+#' @param type A string, one of "uncompressed", "snappy", "gzip", "brotli",
+#' "zstd", "lz4", "lzo", or "bz2", case insensitive.
+#' @return Logical: is `type` available?
+#' @export
+#' @examplesIf arrow_available()
+#' codec_is_available("gzip")
+codec_is_available <- function(type) {
+ util___Codec__IsAvailable(compression_from_name(type))
+}
+
+compression_from_name <- function(name) {
+ map_int(name, ~ CompressionType[[match.arg(toupper(.x), names(CompressionType))]])
+}
+
+#' @title Compressed stream classes
+#' @rdname compression
+#' @name compression
+#' @aliases CompressedInputStream CompressedOutputStream
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description `CompressedInputStream` and `CompressedOutputStream`
+#' allow you to apply a compression [Codec] to an
+#' input or output stream.
+#'
+#' @section Factory:
+#'
+#' The `CompressedInputStream$create()` and `CompressedOutputStream$create()`
+#' factory methods instantiate the object and take the following arguments:
+#'
+#' - `stream` An [InputStream] or [OutputStream], respectively
+#' - `codec` A `Codec`, either a [Codec][Codec] instance or a string
+#' - `compression_level` compression level for when the `codec` argument is given as a string
+#'
+#' @section Methods:
+#'
+#' Methods are inherited from [InputStream] and [OutputStream], respectively
+#' @export
+#' @include arrow-package.R
+CompressedOutputStream <- R6Class("CompressedOutputStream", inherit = OutputStream)
+CompressedOutputStream$create <- function(stream, codec = "gzip", compression_level = NA) {
+ codec <- Codec$create(codec, compression_level = compression_level)
+ if (is.string(stream)) {
+ stream <- FileOutputStream$create(stream)
+ }
+ assert_is(stream, "OutputStream")
+ io___CompressedOutputStream__Make(codec, stream)
+}
+
+#' @rdname compression
+#' @usage NULL
+#' @format NULL
+#' @export
+CompressedInputStream <- R6Class("CompressedInputStream", inherit = InputStream)
+CompressedInputStream$create <- function(stream, codec = "gzip", compression_level = NA) {
+ codec <- Codec$create(codec, compression_level = compression_level)
+ if (is.string(stream)) {
+ stream <- ReadableFile$create(stream)
+ }
+ assert_is(stream, "InputStream")
+ io___CompressedInputStream__Make(codec, stream)
+}
diff --git a/src/arrow/r/R/compute.R b/src/arrow/r/R/compute.R
new file mode 100644
index 000000000..0a7d77a09
--- /dev/null
+++ b/src/arrow/r/R/compute.R
@@ -0,0 +1,309 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Call an Arrow compute function
+#'
+#' This function provides a lower-level API for calling Arrow functions by their
+#' string function name. You won't use it directly for most applications.
+#' Many Arrow compute functions are mapped to R methods,
+#' and in a `dplyr` evaluation context, [all Arrow functions][list_compute_functions()]
+#' are callable with an `arrow_` prefix.
+#' @param function_name string Arrow compute function name
+#' @param ... Function arguments, which may include `Array`, `ChunkedArray`, `Scalar`,
+#' `RecordBatch`, or `Table`.
+#' @param args list arguments as an alternative to specifying in `...`
+#' @param options named list of C++ function options.
+#' @details When passing indices in `...`, `args`, or `options`, express them as
+#' 0-based integers (consistent with C++).
+#' @return An `Array`, `ChunkedArray`, `Scalar`, `RecordBatch`, or `Table`, whatever the compute function results in.
+#' @seealso [Arrow C++ documentation](https://arrow.apache.org/docs/cpp/compute.html) for
+#' the functions and their respective options.
+#' @examplesIf arrow_available()
+#' a <- Array$create(c(1L, 2L, 3L, NA, 5L))
+#' s <- Scalar$create(4L)
+#' call_function("coalesce", a, s)
+#'
+#' a <- Array$create(rnorm(10000))
+#' call_function("quantile", a, options = list(q = seq(0, 1, 0.25)))
+#' @export
+#' @include array.R
+#' @include chunked-array.R
+#' @include scalar.R
+call_function <- function(function_name, ..., args = list(...), options = empty_named_list()) {
+ assert_that(is.string(function_name))
+ assert_that(is.list(options), !is.null(names(options)))
+
+ datum_classes <- c("Array", "ChunkedArray", "RecordBatch", "Table", "Scalar")
+ valid_args <- map_lgl(args, ~ inherits(., datum_classes))
+ if (!all(valid_args)) {
+ # Lame, just pick one to report
+ first_bad <- min(which(!valid_args))
+ stop(
+ "Argument ", first_bad, " is of class ", head(class(args[[first_bad]]), 1),
+ " but it must be one of ", oxford_paste(datum_classes, "or"),
+ call. = FALSE
+ )
+ }
+
+ compute__CallFunction(function_name, args, options)
+}
+
+#' List available Arrow C++ compute functions
+#'
+#' This function lists the names of all available Arrow C++ library compute functions.
+#' These can be called by passing to [call_function()], or they can be
+#' called by name with an `arrow_` prefix inside a `dplyr` verb.
+#'
+#' The resulting list describes the capabilities of your `arrow` build.
+#' Some functions, such as string and regular expression functions,
+#' require optional build-time C++ dependencies. If your `arrow` package
+#' was not compiled with those features enabled, those functions will
+#' not appear in this list.
+#'
+#' Some functions take options that need to be passed when calling them
+#' (in a list called `options`). These options require custom handling
+#' in C++; many functions already have that handling set up but not all do.
+#' If you encounter one that needs special handling for options, please
+#' report an issue.
+#'
+#' Note that this list does *not* enumerate all of the R bindings for these functions.
+#' The package includes Arrow methods for many base R functions that can
+#' be called directly on Arrow objects, as well as some tidyverse-flavored versions
+#' available inside `dplyr` verbs.
+#'
+#' @param pattern Optional regular expression to filter the function list
+#' @param ... Additional parameters passed to `grep()`
+#' @return A character vector of available Arrow C++ function names
+#' @examplesIf arrow_available()
+#' available_funcs <- list_compute_functions()
+#' utf8_funcs <- list_compute_functions(pattern = "^UTF8", ignore.case = TRUE)
+#' @export
+list_compute_functions <- function(pattern = NULL, ...) {
+ funcs <- compute__GetFunctionNames()
+ if (!is.null(pattern)) {
+ funcs <- grep(pattern, funcs, value = TRUE, ...)
+ }
+ # TODO: Filtering of hash funcs will already happen in C++ with ARROW-13943
+ funcs <- grep(
+ "^hash_",
+ funcs,
+ value = TRUE,
+ invert = TRUE
+ )
+ funcs
+}
+
+#' @export
+sum.ArrowDatum <- function(..., na.rm = FALSE) {
+ scalar_aggregate("sum", ..., na.rm = na.rm)
+}
+
+#' @export
+mean.ArrowDatum <- function(..., na.rm = FALSE) {
+ scalar_aggregate("mean", ..., na.rm = na.rm)
+}
+
+#' @export
+min.ArrowDatum <- function(..., na.rm = FALSE) {
+ scalar_aggregate("min_max", ..., na.rm = na.rm)$GetFieldByName("min")
+}
+
+#' @export
+max.ArrowDatum <- function(..., na.rm = FALSE) {
+ scalar_aggregate("min_max", ..., na.rm = na.rm)$GetFieldByName("max")
+}
+
+scalar_aggregate <- function(FUN, ..., na.rm = FALSE, min_count = 0L) {
+ a <- collect_arrays_from_dots(list(...))
+ if (FUN == "min_max" && na.rm && a$null_count == length(a)) {
+ Array$create(data.frame(min = Inf, max = -Inf))
+ # If na.rm == TRUE and all values in array are NA, R returns
+ # Inf/-Inf, which are type double. Since Arrow is type-stable
+ # and does not do that, we handle this special case here.
+ } else {
+ call_function(FUN, a, options = list(skip_nulls = na.rm, min_count = min_count))
+ }
+}
+
+collect_arrays_from_dots <- function(dots) {
+ # Given a list that may contain both Arrays and ChunkedArrays,
+ # return a single ChunkedArray containing all of those chunks
+ # (may return a regular Array if there is only one element in dots)
+ # If there is only one element and it is a scalar, it returns the scalar
+ if (length(dots) == 1) {
+ return(dots[[1]])
+ }
+
+ assert_that(all(map_lgl(dots, is.Array)))
+ arrays <- unlist(lapply(dots, function(x) {
+ if (inherits(x, "ChunkedArray")) {
+ x$chunks
+ } else {
+ x
+ }
+ }))
+ ChunkedArray$create(!!!arrays)
+}
+
+#' @export
+quantile.ArrowDatum <- function(x,
+ probs = seq(0, 1, 0.25),
+ na.rm = FALSE,
+ type = 7,
+ interpolation = c("linear", "lower", "higher", "nearest", "midpoint"),
+ ...) {
+ if (inherits(x, "Scalar")) x <- Array$create(x)
+ assert_is(probs, c("numeric", "integer"))
+ assert_that(length(probs) > 0)
+ assert_that(all(probs >= 0 & probs <= 1))
+ if (!na.rm && x$null_count > 0) {
+ stop("Missing values not allowed if 'na.rm' is FALSE", call. = FALSE)
+ }
+ if (type != 7) {
+ stop(
+ "Argument `type` not supported in Arrow. To control the quantile ",
+ "interpolation algorithm, set argument `interpolation` to one of: ",
+ "\"linear\" (the default), \"lower\", \"higher\", \"nearest\", or ",
+ "\"midpoint\".",
+ call. = FALSE
+ )
+ }
+ interpolation <- QuantileInterpolation[[toupper(match.arg(interpolation))]]
+ out <- call_function("quantile", x, options = list(q = probs, interpolation = interpolation))
+ if (length(out) == 0) {
+ # When there are no non-missing values in the data, the Arrow quantile
+ # function returns an empty Array, but for consistency with the R quantile
+ # function, we want an Array of NA_real_ with the same length as probs
+ out <- Array$create(rep(NA_real_, length(probs)))
+ }
+ out
+}
+
+#' @export
+median.ArrowDatum <- function(x, na.rm = FALSE, ...) {
+ if (!na.rm && x$null_count > 0) {
+ Scalar$create(NA_real_)
+ } else {
+ Scalar$create(quantile(x, probs = 0.5, na.rm = TRUE, ...))
+ }
+}
+
+#' @export
+unique.ArrowDatum <- function(x, incomparables = FALSE, ...) {
+ call_function("unique", x)
+}
+
+#' @export
+any.ArrowDatum <- function(..., na.rm = FALSE) {
+ scalar_aggregate("any", ..., na.rm = na.rm)
+}
+
+#' @export
+all.ArrowDatum <- function(..., na.rm = FALSE) {
+ scalar_aggregate("all", ..., na.rm = na.rm)
+}
+
+#' `match` and `%in%` for Arrow objects
+#'
+#' `base::match()` is not a generic, so we can't just define Arrow methods for
+#' it. This function exposes the analogous functions in the Arrow C++ library.
+#'
+#' @param x `Scalar`, `Array` or `ChunkedArray`
+#' @param table `Scalar`, Array`, `ChunkedArray`, or R vector lookup table.
+#' @param ... additional arguments, ignored
+#' @return `match_arrow()` returns an `int32`-type Arrow object of the same length
+#' and type as `x` with the (0-based) indexes into `table`. `is_in()` returns a
+#' `boolean`-type Arrow object of the same length and type as `x` with values indicating
+#' per element of `x` it it is present in `table`.
+#' @examplesIf arrow_available()
+#' # note that the returned value is 0-indexed
+#' cars_tbl <- arrow_table(name = rownames(mtcars), mtcars)
+#' match_arrow(Scalar$create("Mazda RX4 Wag"), cars_tbl$name)
+#'
+#' is_in(Array$create("Mazda RX4 Wag"), cars_tbl$name)
+#'
+#' # Although there are multiple matches, you are returned the index of the first
+#' # match, as with the base R equivalent
+#' match(4, mtcars$cyl) # 1-indexed
+#' match_arrow(Scalar$create(4), cars_tbl$cyl) # 0-indexed
+#'
+#' # If `x` contains multiple values, you are returned the indices of the first
+#' # match for each value.
+#' match(c(4, 6, 8), mtcars$cyl)
+#' match_arrow(Array$create(c(4, 6, 8)), cars_tbl$cyl)
+#'
+#' # Return type matches type of `x`
+#' is_in(c(4, 6, 8), mtcars$cyl) # returns vector
+#' is_in(Scalar$create(4), mtcars$cyl) # returns Scalar
+#' is_in(Array$create(c(4, 6, 8)), cars_tbl$cyl) # returns Array
+#' is_in(ChunkedArray$create(c(4, 6), 8), cars_tbl$cyl) # returns ChunkedArray
+#' @export
+match_arrow <- function(x, table, ...) {
+ if (!inherits(x, "ArrowDatum")) {
+ x <- Array$create(x)
+ }
+
+ if (!inherits(table, c("Array", "ChunkedArray"))) {
+ table <- Array$create(table)
+ }
+ call_function("index_in_meta_binary", x, table)
+}
+
+#' @rdname match_arrow
+#' @export
+is_in <- function(x, table, ...) {
+ if (!inherits(x, "ArrowDatum")) {
+ x <- Array$create(x)
+ }
+
+ if (!inherits(table, c("Array", "DictionaryArray", "ChunkedArray"))) {
+ table <- Array$create(table)
+ }
+ call_function("is_in_meta_binary", x, table)
+}
+
+#' `table` for Arrow objects
+#'
+#' This function tabulates the values in the array and returns a table of counts.
+#' @param x `Array` or `ChunkedArray`
+#' @return A `StructArray` containing "values" (same type as `x`) and "counts"
+#' `Int64`.
+#' @examplesIf arrow_available()
+#' cyl_vals <- Array$create(mtcars$cyl)
+#' counts <- value_counts(cyl_vals)
+#' @export
+value_counts <- function(x) {
+ call_function("value_counts", x)
+}
+
+#' Cast options
+#'
+#' @param safe logical: enforce safe conversion? Default `TRUE`
+#' @param ... additional cast options, such as `allow_int_overflow`,
+#' `allow_time_truncate`, and `allow_float_truncate`, which are set to `!safe`
+#' by default
+#' @return A list
+#' @export
+#' @keywords internal
+cast_options <- function(safe = TRUE, ...) {
+ opts <- list(
+ allow_int_overflow = !safe,
+ allow_time_truncate = !safe,
+ allow_float_truncate = !safe
+ )
+ modifyList(opts, list(...))
+}
diff --git a/src/arrow/r/R/config.R b/src/arrow/r/R/config.R
new file mode 100644
index 000000000..af07ad9a9
--- /dev/null
+++ b/src/arrow/r/R/config.R
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Manage the global CPU thread pool in libarrow
+#'
+#' @export
+cpu_count <- function() {
+ GetCpuThreadPoolCapacity()
+}
+
+#' @rdname cpu_count
+#' @param num_threads integer: New number of threads for thread pool
+#' @export
+set_cpu_count <- function(num_threads) {
+ SetCpuThreadPoolCapacity(as.integer(num_threads))
+}
+
+#' Manage the global I/O thread pool in libarrow
+#'
+#' @export
+io_thread_count <- function() {
+ GetIOThreadPoolCapacity()
+}
+
+#' @rdname io_thread_count
+#' @param num_threads integer: New number of threads for thread pool
+#' @export
+set_io_thread_count <- function(num_threads) {
+ SetIOThreadPoolCapacity(as.integer(num_threads))
+}
diff --git a/src/arrow/r/R/csv.R b/src/arrow/r/R/csv.R
new file mode 100644
index 000000000..ee890578f
--- /dev/null
+++ b/src/arrow/r/R/csv.R
@@ -0,0 +1,644 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Read a CSV or other delimited file with Arrow
+#'
+#' These functions uses the Arrow C++ CSV reader to read into a `data.frame`.
+#' Arrow C++ options have been mapped to argument names that follow those of
+#' `readr::read_delim()`, and `col_select` was inspired by `vroom::vroom()`.
+#'
+#' `read_csv_arrow()` and `read_tsv_arrow()` are wrappers around
+#' `read_delim_arrow()` that specify a delimiter.
+#'
+#' Note that not all `readr` options are currently implemented here. Please file
+#' an issue if you encounter one that `arrow` should support.
+#'
+#' If you need to control Arrow-specific reader parameters that don't have an
+#' equivalent in `readr::read_csv()`, you can either provide them in the
+#' `parse_options`, `convert_options`, or `read_options` arguments, or you can
+#' use [CsvTableReader] directly for lower-level access.
+#'
+#' @section Specifying column types and names:
+#'
+#' By default, the CSV reader will infer the column names and data types from the file, but there
+#' are a few ways you can specify them directly.
+#'
+#' One way is to provide an Arrow [Schema] in the `schema` argument,
+#' which is an ordered map of column name to type.
+#' When provided, it satisfies both the `col_names` and `col_types` arguments.
+#' This is good if you know all of this information up front.
+#'
+#' You can also pass a `Schema` to the `col_types` argument. If you do this,
+#' column names will still be inferred from the file unless you also specify
+#' `col_names`. In either case, the column names in the `Schema` must match the
+#' data's column names, whether they are explicitly provided or inferred. That
+#' said, this `Schema` does not have to reference all columns: those omitted
+#' will have their types inferred.
+#'
+#' Alternatively, you can declare column types by providing the compact string representation
+#' that `readr` uses to the `col_types` argument. This means you provide a
+#' single string, one character per column, where the characters map to Arrow
+#' types analogously to the `readr` type mapping:
+#'
+#' * "c": `utf8()`
+#' * "i": `int32()`
+#' * "n": `float64()`
+#' * "d": `float64()`
+#' * "l": `bool()`
+#' * "f": `dictionary()`
+#' * "D": `date32()`
+#' * "T": `timestamp()`
+#' * "t": `time32()`
+#' * "_": `null()`
+#' * "-": `null()`
+#' * "?": infer the type from the data
+#'
+#' If you use the compact string representation for `col_types`, you must also
+#' specify `col_names`.
+#'
+#' Regardless of how types are specified, all columns with a `null()` type will
+#' be dropped.
+#'
+#' Note that if you are specifying column names, whether by `schema` or
+#' `col_names`, and the CSV file has a header row that would otherwise be used
+#' to idenfity column names, you'll need to add `skip = 1` to skip that row.
+#'
+#' @param file A character file name or URI, `raw` vector, an Arrow input stream,
+#' or a `FileSystem` with path (`SubTreeFileSystem`).
+#' If a file name, a memory-mapped Arrow [InputStream] will be opened and
+#' closed when finished; compression will be detected from the file extension
+#' and handled automatically. If an input stream is provided, it will be left
+#' open.
+#' @param delim Single character used to separate fields within a record.
+#' @param quote Single character used to quote strings.
+#' @param escape_double Does the file escape quotes by doubling them?
+#' i.e. If this option is `TRUE`, the value `""""` represents
+#' a single quote, `\"`.
+#' @param escape_backslash Does the file use backslashes to escape special
+#' characters? This is more general than `escape_double` as backslashes
+#' can be used to escape the delimiter character, the quote character, or
+#' to add special characters like `\\n`.
+#' @param schema [Schema] that describes the table. If provided, it will be
+#' used to satisfy both `col_names` and `col_types`.
+#' @param col_names If `TRUE`, the first row of the input will be used as the
+#' column names and will not be included in the data frame. If `FALSE`, column
+#' names will be generated by Arrow, starting with "f0", "f1", ..., "fN".
+#' Alternatively, you can specify a character vector of column names.
+#' @param col_types A compact string representation of the column types, or
+#' `NULL` (the default) to infer types from the data.
+#' @param col_select A character vector of column names to keep, as in the
+#' "select" argument to `data.table::fread()`, or a
+#' [tidy selection specification][tidyselect::vars_select()]
+#' of columns, as used in `dplyr::select()`.
+#' @param na A character vector of strings to interpret as missing values.
+#' @param quoted_na Should missing values inside quotes be treated as missing
+#' values (the default) or strings. (Note that this is different from the
+#' the Arrow C++ default for the corresponding convert option,
+#' `strings_can_be_null`.)
+#' @param skip_empty_rows Should blank rows be ignored altogether? If
+#' `TRUE`, blank rows will not be represented at all. If `FALSE`, they will be
+#' filled with missings.
+#' @param skip Number of lines to skip before reading data.
+#' @param timestamp_parsers User-defined timestamp parsers. If more than one
+#' parser is specified, the CSV conversion logic will try parsing values
+#' starting from the beginning of this vector. Possible values are:
+#' - `NULL`: the default, which uses the ISO-8601 parser
+#' - a character vector of [strptime][base::strptime()] parse strings
+#' - a list of [TimestampParser] objects
+#' @param parse_options see [file reader options][CsvReadOptions].
+#' If given, this overrides any
+#' parsing options provided in other arguments (e.g. `delim`, `quote`, etc.).
+#' @param convert_options see [file reader options][CsvReadOptions]
+#' @param read_options see [file reader options][CsvReadOptions]
+#' @param as_data_frame Should the function return a `data.frame` (default) or
+#' an Arrow [Table]?
+#'
+#' @return A `data.frame`, or a Table if `as_data_frame = FALSE`.
+#' @export
+#' @examplesIf arrow_available()
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write.csv(mtcars, file = tf)
+#' df <- read_csv_arrow(tf)
+#' dim(df)
+#' # Can select columns
+#' df <- read_csv_arrow(tf, col_select = starts_with("d"))
+read_delim_arrow <- function(file,
+ delim = ",",
+ quote = '"',
+ escape_double = TRUE,
+ escape_backslash = FALSE,
+ schema = NULL,
+ col_names = TRUE,
+ col_types = NULL,
+ col_select = NULL,
+ na = c("", "NA"),
+ quoted_na = TRUE,
+ skip_empty_rows = TRUE,
+ skip = 0L,
+ parse_options = NULL,
+ convert_options = NULL,
+ read_options = NULL,
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL) {
+ if (inherits(schema, "Schema")) {
+ col_names <- names(schema)
+ col_types <- schema
+ }
+ if (is.null(parse_options)) {
+ parse_options <- readr_to_csv_parse_options(
+ delim,
+ quote,
+ escape_double,
+ escape_backslash,
+ skip_empty_rows
+ )
+ }
+ if (is.null(read_options)) {
+ read_options <- readr_to_csv_read_options(skip, col_names)
+ }
+ if (is.null(convert_options)) {
+ convert_options <- readr_to_csv_convert_options(
+ na,
+ quoted_na,
+ col_types = col_types,
+ col_names = read_options$column_names,
+ timestamp_parsers = timestamp_parsers
+ )
+ }
+
+ if (!inherits(file, "InputStream")) {
+ file <- make_readable_file(file)
+ on.exit(file$close())
+ }
+ reader <- CsvTableReader$create(
+ file,
+ read_options = read_options,
+ parse_options = parse_options,
+ convert_options = convert_options
+ )
+
+ tab <- reader$Read()
+
+ # TODO: move this into convert_options using include_columns
+ col_select <- enquo(col_select)
+ if (!quo_is_null(col_select)) {
+ tab <- tab[vars_select(names(tab), !!col_select)]
+ }
+
+ if (isTRUE(as_data_frame)) {
+ tab <- as.data.frame(tab)
+ }
+
+ tab
+}
+
+#' @rdname read_delim_arrow
+#' @export
+read_csv_arrow <- function(file,
+ quote = '"',
+ escape_double = TRUE,
+ escape_backslash = FALSE,
+ schema = NULL,
+ col_names = TRUE,
+ col_types = NULL,
+ col_select = NULL,
+ na = c("", "NA"),
+ quoted_na = TRUE,
+ skip_empty_rows = TRUE,
+ skip = 0L,
+ parse_options = NULL,
+ convert_options = NULL,
+ read_options = NULL,
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL) {
+ mc <- match.call()
+ mc$delim <- ","
+ mc[[1]] <- get("read_delim_arrow", envir = asNamespace("arrow"))
+ eval.parent(mc)
+}
+
+#' @rdname read_delim_arrow
+#' @export
+read_tsv_arrow <- function(file,
+ quote = '"',
+ escape_double = TRUE,
+ escape_backslash = FALSE,
+ schema = NULL,
+ col_names = TRUE,
+ col_types = NULL,
+ col_select = NULL,
+ na = c("", "NA"),
+ quoted_na = TRUE,
+ skip_empty_rows = TRUE,
+ skip = 0L,
+ parse_options = NULL,
+ convert_options = NULL,
+ read_options = NULL,
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL) {
+ mc <- match.call()
+ mc$delim <- "\t"
+ mc[[1]] <- get("read_delim_arrow", envir = asNamespace("arrow"))
+ eval.parent(mc)
+}
+
+#' @title Arrow CSV and JSON table reader classes
+#' @rdname CsvTableReader
+#' @name CsvTableReader
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description `CsvTableReader` and `JsonTableReader` wrap the Arrow C++ CSV
+#' and JSON table readers. See their usage in [read_csv_arrow()] and
+#' [read_json_arrow()], respectively.
+#'
+#' @section Factory:
+#'
+#' The `CsvTableReader$create()` and `JsonTableReader$create()` factory methods
+#' take the following arguments:
+#'
+#' - `file` An Arrow [InputStream]
+#' - `convert_options` (CSV only), `parse_options`, `read_options`: see
+#' [CsvReadOptions]
+#' - `...` additional parameters.
+#'
+#' @section Methods:
+#'
+#' - `$Read()`: returns an Arrow Table.
+#'
+#' @include arrow-package.R
+#' @export
+CsvTableReader <- R6Class("CsvTableReader",
+ inherit = ArrowObject,
+ public = list(
+ Read = function() csv___TableReader__Read(self)
+ )
+)
+CsvTableReader$create <- function(file,
+ read_options = CsvReadOptions$create(),
+ parse_options = CsvParseOptions$create(),
+ convert_options = CsvConvertOptions$create(),
+ ...) {
+ assert_is(file, "InputStream")
+ csv___TableReader__Make(file, read_options, parse_options, convert_options)
+}
+
+#' @title File reader options
+#' @rdname CsvReadOptions
+#' @name CsvReadOptions
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description `CsvReadOptions`, `CsvParseOptions`, `CsvConvertOptions`,
+#' `JsonReadOptions`, `JsonParseOptions`, and `TimestampParser` are containers for various
+#' file reading options. See their usage in [read_csv_arrow()] and
+#' [read_json_arrow()], respectively.
+#'
+#' @section Factory:
+#'
+#' The `CsvReadOptions$create()` and `JsonReadOptions$create()` factory methods
+#' take the following arguments:
+#'
+#' - `use_threads` Whether to use the global CPU thread pool
+#' - `block_size` Block size we request from the IO layer; also determines
+#' the size of chunks when use_threads is `TRUE`. NB: if `FALSE`, JSON input
+#' must end with an empty line.
+#'
+#' `CsvReadOptions$create()` further accepts these additional arguments:
+#'
+#' - `skip_rows` Number of lines to skip before reading data (default 0)
+#' - `column_names` Character vector to supply column names. If length-0
+#' (the default), the first non-skipped row will be parsed to generate column
+#' names, unless `autogenerate_column_names` is `TRUE`.
+#' - `autogenerate_column_names` Logical: generate column names instead of
+#' using the first non-skipped row (the default)? If `TRUE`, column names will
+#' be "f0", "f1", ..., "fN".
+#'
+#' `CsvParseOptions$create()` takes the following arguments:
+#'
+#' - `delimiter` Field delimiting character (default `","`)
+#' - `quoting` Logical: are strings quoted? (default `TRUE`)
+#' - `quote_char` Quoting character, if `quoting` is `TRUE`
+#' - `double_quote` Logical: are quotes inside values double-quoted? (default `TRUE`)
+#' - `escaping` Logical: whether escaping is used (default `FALSE`)
+#' - `escape_char` Escaping character, if `escaping` is `TRUE`
+#' - `newlines_in_values` Logical: are values allowed to contain CR (`0x0d`)
+#' and LF (`0x0a`) characters? (default `FALSE`)
+#' - `ignore_empty_lines` Logical: should empty lines be ignored (default) or
+#' generate a row of missing values (if `FALSE`)?
+#'
+#' `JsonParseOptions$create()` accepts only the `newlines_in_values` argument.
+#'
+#' `CsvConvertOptions$create()` takes the following arguments:
+#'
+#' - `check_utf8` Logical: check UTF8 validity of string columns? (default `TRUE`)
+#' - `null_values` character vector of recognized spellings for null values.
+#' Analogous to the `na.strings` argument to
+#' [`read.csv()`][utils::read.csv()] or `na` in `readr::read_csv()`.
+#' - `strings_can_be_null` Logical: can string / binary columns have
+#' null values? Similar to the `quoted_na` argument to `readr::read_csv()`.
+#' (default `FALSE`)
+#' - `true_values` character vector of recognized spellings for `TRUE` values
+#' - `false_values` character vector of recognized spellings for `FALSE` values
+#' - `col_types` A `Schema` or `NULL` to infer types
+#' - `auto_dict_encode` Logical: Whether to try to automatically
+#' dictionary-encode string / binary data (think `stringsAsFactors`). Default `FALSE`.
+#' This setting is ignored for non-inferred columns (those in `col_types`).
+#' - `auto_dict_max_cardinality` If `auto_dict_encode`, string/binary columns
+#' are dictionary-encoded up to this number of unique values (default 50),
+#' after which it switches to regular encoding.
+#' - `include_columns` If non-empty, indicates the names of columns from the
+#' CSV file that should be actually read and converted (in the vector's order).
+#' - `include_missing_columns` Logical: if `include_columns` is provided, should
+#' columns named in it but not found in the data be included as a column of
+#' type `null()`? The default (`FALSE`) means that the reader will instead
+#' raise an error.
+#' - `timestamp_parsers` User-defined timestamp parsers. If more than one
+#' parser is specified, the CSV conversion logic will try parsing values
+#' starting from the beginning of this vector. Possible values are
+#' (a) `NULL`, the default, which uses the ISO-8601 parser;
+#' (b) a character vector of [strptime][base::strptime()] parse strings; or
+#' (c) a list of [TimestampParser] objects.
+#'
+#' `TimestampParser$create()` takes an optional `format` string argument.
+#' See [`strptime()`][base::strptime()] for example syntax.
+#' The default is to use an ISO-8601 format parser.
+#'
+#' The `CsvWriteOptions$create()` factory method takes the following arguments:
+#' - `include_header` Whether to write an initial header line with column names
+#' - `batch_size` Maximum number of rows processed at a time. Default is 1024.
+#'
+#' @section Active bindings:
+#'
+#' - `column_names`: from `CsvReadOptions`
+#'
+#' @export
+CsvReadOptions <- R6Class("CsvReadOptions",
+ inherit = ArrowObject,
+ active = list(
+ column_names = function() csv___ReadOptions__column_names(self)
+ )
+)
+CsvReadOptions$create <- function(use_threads = option_use_threads(),
+ block_size = 1048576L,
+ skip_rows = 0L,
+ column_names = character(0),
+ autogenerate_column_names = FALSE) {
+ csv___ReadOptions__initialize(
+ list(
+ use_threads = use_threads,
+ block_size = block_size,
+ skip_rows = skip_rows,
+ column_names = column_names,
+ autogenerate_column_names = autogenerate_column_names
+ )
+ )
+}
+
+#' @rdname CsvReadOptions
+#' @export
+CsvWriteOptions <- R6Class("CsvWriteOptions", inherit = ArrowObject)
+CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L) {
+ assert_that(is_integerish(batch_size, n = 1, finite = TRUE), batch_size > 0)
+ csv___WriteOptions__initialize(
+ list(
+ include_header = include_header,
+ batch_size = as.integer(batch_size)
+ )
+ )
+}
+
+readr_to_csv_read_options <- function(skip, col_names, col_types) {
+ if (isTRUE(col_names)) {
+ # C++ default to parse is 0-length string array
+ col_names <- character(0)
+ }
+ if (identical(col_names, FALSE)) {
+ CsvReadOptions$create(skip_rows = skip, autogenerate_column_names = TRUE)
+ } else {
+ CsvReadOptions$create(skip_rows = skip, column_names = col_names)
+ }
+}
+
+#' @rdname CsvReadOptions
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @export
+CsvParseOptions <- R6Class("CsvParseOptions", inherit = ArrowObject)
+CsvParseOptions$create <- function(delimiter = ",",
+ quoting = TRUE,
+ quote_char = '"',
+ double_quote = TRUE,
+ escaping = FALSE,
+ escape_char = "\\",
+ newlines_in_values = FALSE,
+ ignore_empty_lines = TRUE) {
+ csv___ParseOptions__initialize(
+ list(
+ delimiter = delimiter,
+ quoting = quoting,
+ quote_char = quote_char,
+ double_quote = double_quote,
+ escaping = escaping,
+ escape_char = escape_char,
+ newlines_in_values = newlines_in_values,
+ ignore_empty_lines = ignore_empty_lines
+ )
+ )
+}
+
+readr_to_csv_parse_options <- function(delim = ",",
+ quote = '"',
+ escape_double = TRUE,
+ escape_backslash = FALSE,
+ skip_empty_rows = TRUE) {
+ # This function translates from the readr argument list to the arrow arg names
+ # TODO: validate inputs
+ CsvParseOptions$create(
+ delimiter = delim,
+ quoting = nzchar(quote),
+ quote_char = quote,
+ double_quote = escape_double,
+ escaping = escape_backslash,
+ escape_char = "\\",
+ newlines_in_values = escape_backslash,
+ ignore_empty_lines = skip_empty_rows
+ )
+}
+
+#' @rdname CsvReadOptions
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @export
+TimestampParser <- R6Class("TimestampParser",
+ inherit = ArrowObject,
+ public = list(
+ kind = function() TimestampParser__kind(self),
+ format = function() TimestampParser__format(self)
+ )
+)
+TimestampParser$create <- function(format = NULL) {
+ if (is.null(format)) {
+ TimestampParser__MakeISO8601()
+ } else {
+ TimestampParser__MakeStrptime(format)
+ }
+}
+
+#' @rdname CsvReadOptions
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @export
+CsvConvertOptions <- R6Class("CsvConvertOptions", inherit = ArrowObject)
+CsvConvertOptions$create <- function(check_utf8 = TRUE,
+ null_values = c("", "NA"),
+ true_values = c("T", "true", "TRUE"),
+ false_values = c("F", "false", "FALSE"),
+ strings_can_be_null = FALSE,
+ col_types = NULL,
+ auto_dict_encode = FALSE,
+ auto_dict_max_cardinality = 50L,
+ include_columns = character(),
+ include_missing_columns = FALSE,
+ timestamp_parsers = NULL) {
+ if (!is.null(col_types) && !inherits(col_types, "Schema")) {
+ abort(c(
+ "Unsupported `col_types` specification.",
+ i = "`col_types` must be NULL, or a <Schema>."
+ ))
+ }
+
+ csv___ConvertOptions__initialize(
+ list(
+ check_utf8 = check_utf8,
+ null_values = null_values,
+ strings_can_be_null = strings_can_be_null,
+ col_types = col_types,
+ true_values = true_values,
+ false_values = false_values,
+ auto_dict_encode = auto_dict_encode,
+ auto_dict_max_cardinality = auto_dict_max_cardinality,
+ include_columns = include_columns,
+ include_missing_columns = include_missing_columns,
+ timestamp_parsers = timestamp_parsers
+ )
+ )
+}
+
+readr_to_csv_convert_options <- function(na,
+ quoted_na,
+ col_types = NULL,
+ col_names = NULL,
+ timestamp_parsers = NULL) {
+ include_columns <- character()
+
+ if (is.character(col_types)) {
+ if (length(col_types) != 1L) {
+ abort("`col_types` is a character vector that is not of size 1")
+ }
+ n <- nchar(col_types)
+ specs <- substring(col_types, seq_len(n), seq_len(n))
+ if (!is_bare_character(col_names, n)) {
+ abort("Compact specification for `col_types` requires `col_names`")
+ }
+
+ col_types <- set_names(nm = col_names, map2(specs, col_names, ~ {
+ switch(.x,
+ "c" = utf8(),
+ "i" = int32(),
+ "n" = float64(),
+ "d" = float64(),
+ "l" = bool(),
+ "f" = dictionary(),
+ "D" = date32(),
+ "T" = timestamp(),
+ "t" = time32(),
+ "_" = null(),
+ "-" = null(),
+ "?" = NULL,
+ abort("Unsupported compact specification: '", .x, "' for column '", .y, "'")
+ )
+ }))
+ # To "guess" types, omit them from col_types
+ col_types <- keep(col_types, ~ !is.null(.x))
+ col_types <- schema(!!!col_types)
+ }
+
+ if (!is.null(col_types)) {
+ assert_is(col_types, "Schema")
+ # If any columns are null(), drop them
+ # (by specifying the other columns in include_columns)
+ nulls <- map_lgl(col_types$fields, ~ .$type$Equals(null()))
+ if (any(nulls)) {
+ include_columns <- setdiff(col_names, names(col_types)[nulls])
+ }
+ }
+ CsvConvertOptions$create(
+ null_values = na,
+ strings_can_be_null = quoted_na,
+ col_types = col_types,
+ timestamp_parsers = timestamp_parsers,
+ include_columns = include_columns
+ )
+}
+
+#' Write CSV file to disk
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param include_header Whether to write an initial header line with column names
+#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @examplesIf arrow_available()
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_csv_arrow(mtcars, tf)
+#' @include arrow-package.R
+write_csv_arrow <- function(x,
+ sink,
+ include_header = TRUE,
+ batch_size = 1024L) {
+ write_options <- CsvWriteOptions$create(include_header, batch_size)
+
+ x_out <- x
+ if (is.data.frame(x)) {
+ x <- Table$create(x)
+ }
+
+ assert_that(is_writable_table(x))
+
+ if (!inherits(sink, "OutputStream")) {
+ sink <- make_output_stream(sink)
+ on.exit(sink$close())
+ }
+
+ if (inherits(x, "RecordBatch")) {
+ csv___WriteCSV__RecordBatch(x, write_options, sink)
+ } else if (inherits(x, "Table")) {
+ csv___WriteCSV__Table(x, write_options, sink)
+ }
+
+ invisible(x_out)
+}
diff --git a/src/arrow/r/R/dataset-factory.R b/src/arrow/r/R/dataset-factory.R
new file mode 100644
index 000000000..c56a6b181
--- /dev/null
+++ b/src/arrow/r/R/dataset-factory.R
@@ -0,0 +1,170 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include dataset.R
+
+#' @usage NULL
+#' @format NULL
+#' @rdname Dataset
+#' @export
+DatasetFactory <- R6Class("DatasetFactory",
+ inherit = ArrowObject,
+ public = list(
+ Finish = function(schema = NULL, unify_schemas = FALSE) {
+ if (is.null(schema)) {
+ dataset___DatasetFactory__Finish1(self, unify_schemas)
+ } else {
+ assert_is(schema, "Schema")
+ dataset___DatasetFactory__Finish2(self, schema)
+ }
+ },
+ Inspect = function(unify_schemas = FALSE) {
+ dataset___DatasetFactory__Inspect(self, unify_schemas)
+ }
+ )
+)
+DatasetFactory$create <- function(x,
+ filesystem = NULL,
+ format = c("parquet", "arrow", "ipc", "feather", "csv", "tsv", "text"),
+ partitioning = NULL,
+ ...) {
+ if (is_list_of(x, "DatasetFactory")) {
+ return(dataset___UnionDatasetFactory__Make(x))
+ }
+
+ if (is.character(format)) {
+ format <- FileFormat$create(match.arg(format), ...)
+ } else {
+ assert_is(format, "FileFormat")
+ }
+
+ path_and_fs <- get_paths_and_filesystem(x, filesystem)
+ info <- path_and_fs$fs$GetFileInfo(path_and_fs$path)
+
+ if (length(info) > 1 || info[[1]]$type == FileType$File) {
+ # x looks like a vector of one or more file paths (not a directory path)
+ return(FileSystemDatasetFactory$create(path_and_fs$fs, NULL, path_and_fs$path, format))
+ }
+
+ if (!is.null(partitioning)) {
+ if (inherits(partitioning, "Schema")) {
+ partitioning <- DirectoryPartitioning$create(partitioning)
+ } else if (is.character(partitioning)) {
+ # These are the column/field names, and we should autodetect their types
+ partitioning <- DirectoryPartitioningFactory$create(partitioning)
+ }
+ }
+
+ selector <- FileSelector$create(path_and_fs$path, allow_not_found = FALSE, recursive = TRUE)
+
+ FileSystemDatasetFactory$create(path_and_fs$fs, selector, NULL, format, partitioning)
+}
+
+#' Create a DatasetFactory
+#'
+#' A [Dataset] can constructed using one or more [DatasetFactory]s.
+#' This function helps you construct a `DatasetFactory` that you can pass to
+#' [open_dataset()].
+#'
+#' If you would only have a single `DatasetFactory` (for example, you have a
+#' single directory containing Parquet files), you can call `open_dataset()`
+#' directly. Use `dataset_factory()` when you
+#' want to combine different directories, file systems, or file formats.
+#'
+#' @param x A string path to a directory containing data files, a vector of one
+#' one or more string paths to data files, or a list of `DatasetFactory` objects
+#' whose datasets should be combined. If this argument is specified it will be
+#' used to construct a `UnionDatasetFactory` and other arguments will be
+#' ignored.
+#' @param filesystem A [FileSystem] object; if omitted, the `FileSystem` will
+#' be detected from `x`
+#' @param format A [FileFormat] object, or a string identifier of the format of
+#' the files in `x`. Currently supported values:
+#' * "parquet"
+#' * "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
+#' only version 2 files are supported
+#' * "csv"/"text", aliases for the same thing (because comma is the default
+#' delimiter for text files
+#' * "tsv", equivalent to passing `format = "text", delimiter = "\t"`
+#'
+#' Default is "parquet", unless a `delimiter` is also specified, in which case
+#' it is assumed to be "text".
+#' @param partitioning One of
+#' * A `Schema`, in which case the file paths relative to `sources` will be
+#' parsed, and path segments will be matched with the schema fields. For
+#' example, `schema(year = int16(), month = int8())` would create partitions
+#' for file paths like "2019/01/file.parquet", "2019/02/file.parquet", etc.
+#' * A character vector that defines the field names corresponding to those
+#' path segments (that is, you're providing the names that would correspond
+#' to a `Schema` but the types will be autodetected)
+#' * A `HivePartitioning` or `HivePartitioningFactory`, as returned
+#' by [hive_partition()] which parses explicit or autodetected fields from
+#' Hive-style path segments
+#' * `NULL` for no partitioning
+#' @param ... Additional format-specific options, passed to
+#' `FileFormat$create()`. For CSV options, note that you can specify them either
+#' with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
+#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.).
+#' Not all `readr` options are currently supported; please file an issue if you
+#' encounter one that `arrow` should support.
+#' @return A `DatasetFactory` object. Pass this to [open_dataset()],
+#' in a list potentially with other `DatasetFactory` objects, to create
+#' a `Dataset`.
+#' @export
+dataset_factory <- DatasetFactory$create
+
+#' @usage NULL
+#' @format NULL
+#' @rdname Dataset
+#' @export
+FileSystemDatasetFactory <- R6Class("FileSystemDatasetFactory",
+ inherit = DatasetFactory
+)
+FileSystemDatasetFactory$create <- function(filesystem,
+ selector = NULL,
+ paths = NULL,
+ format,
+ partitioning = NULL) {
+ assert_is(filesystem, "FileSystem")
+ is.null(selector) || assert_is(selector, "FileSelector")
+ is.null(paths) || assert_is(paths, "character")
+ assert_that(
+ xor(is.null(selector), is.null(paths)),
+ msg = "Either selector or paths must be specified"
+ )
+ assert_is(format, "FileFormat")
+ if (!is.null(paths)) {
+ assert_that(is.null(partitioning), msg = "Partitioning not supported with paths")
+ }
+
+ if (!is.null(paths)) {
+ ptr <- dataset___FileSystemDatasetFactory__Make0(filesystem, paths, format)
+ } else if (is.null(partitioning)) {
+ ptr <- dataset___FileSystemDatasetFactory__Make1(filesystem, selector, format)
+ } else if (inherits(partitioning, "PartitioningFactory")) {
+ ptr <- dataset___FileSystemDatasetFactory__Make3(filesystem, selector, format, partitioning)
+ } else if (inherits(partitioning, "Partitioning")) {
+ ptr <- dataset___FileSystemDatasetFactory__Make2(filesystem, selector, format, partitioning)
+ } else {
+ stop(
+ "Expected 'partitioning' to be NULL, PartitioningFactory or Partitioning",
+ call. = FALSE
+ )
+ }
+
+ ptr
+}
diff --git a/src/arrow/r/R/dataset-format.R b/src/arrow/r/R/dataset-format.R
new file mode 100644
index 000000000..b0b93219e
--- /dev/null
+++ b/src/arrow/r/R/dataset-format.R
@@ -0,0 +1,353 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Dataset file formats
+#'
+#' @description
+#' A `FileFormat` holds information about how to read and parse the files
+#' included in a `Dataset`. There are subclasses corresponding to the supported
+#' file formats (`ParquetFileFormat` and `IpcFileFormat`).
+#'
+#' @section Factory:
+#' `FileFormat$create()` takes the following arguments:
+#' * `format`: A string identifier of the file format. Currently supported values:
+#' * "parquet"
+#' * "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
+#' only version 2 files are supported
+#' * "csv"/"text", aliases for the same thing (because comma is the default
+#' delimiter for text files
+#' * "tsv", equivalent to passing `format = "text", delimiter = "\t"`
+#' * `...`: Additional format-specific options
+#'
+#' `format = "parquet"``:
+#' * `dict_columns`: Names of columns which should be read as dictionaries.
+#' * Any Parquet options from [FragmentScanOptions].
+#'
+#' `format = "text"`: see [CsvParseOptions]. Note that you can specify them either
+#' with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
+#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.).
+#' Not all `readr` options are currently supported; please file an issue if
+#' you encounter one that `arrow` should support. Also, the following options are
+#' supported. From [CsvReadOptions]:
+#' * `skip_rows`
+#' * `column_names`
+#' * `autogenerate_column_names`
+#' From [CsvFragmentScanOptions] (these values can be overridden at scan time):
+#' * `convert_options`: a [CsvConvertOptions]
+#' * `block_size`
+#'
+#' It returns the appropriate subclass of `FileFormat` (e.g. `ParquetFileFormat`)
+#' @rdname FileFormat
+#' @name FileFormat
+#' @examplesIf arrow_with_dataset() && tolower(Sys.info()[["sysname"]]) != "windows"
+#' ## Semi-colon delimited files
+#' # Set up directory for examples
+#' tf <- tempfile()
+#' dir.create(tf)
+#' on.exit(unlink(tf))
+#' write.table(mtcars, file.path(tf, "file1.txt"), sep = ";", row.names = FALSE)
+#'
+#' # Create FileFormat object
+#' format <- FileFormat$create(format = "text", delimiter = ";")
+#'
+#' open_dataset(tf, format = format)
+#' @export
+FileFormat <- R6Class("FileFormat",
+ inherit = ArrowObject,
+ active = list(
+ # @description
+ # Return the `FileFormat`'s type
+ type = function() dataset___FileFormat__type_name(self)
+ )
+)
+FileFormat$create <- function(format, schema = NULL, ...) {
+ opt_names <- names(list(...))
+ if (format %in% c("csv", "text") || any(opt_names %in% c("delim", "delimiter"))) {
+ CsvFileFormat$create(schema = schema, ...)
+ } else if (format == c("tsv")) {
+ CsvFileFormat$create(delimiter = "\t", schema = schema, ...)
+ } else if (format == "parquet") {
+ ParquetFileFormat$create(...)
+ } else if (format %in% c("ipc", "arrow", "feather")) { # These are aliases for the same thing
+ dataset___IpcFileFormat__Make()
+ } else {
+ stop("Unsupported file format: ", format, call. = FALSE)
+ }
+}
+
+#' @export
+as.character.FileFormat <- function(x, ...) {
+ out <- x$type
+ # Slight hack: special case IPC -> feather, otherwise is just the type_name
+ ifelse(out == "ipc", "feather", out)
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FileFormat
+#' @export
+ParquetFileFormat <- R6Class("ParquetFileFormat", inherit = FileFormat)
+ParquetFileFormat$create <- function(...,
+ dict_columns = character(0)) {
+ options <- ParquetFragmentScanOptions$create(...)
+ dataset___ParquetFileFormat__Make(options, dict_columns)
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FileFormat
+#' @export
+IpcFileFormat <- R6Class("IpcFileFormat", inherit = FileFormat)
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FileFormat
+#' @export
+CsvFileFormat <- R6Class("CsvFileFormat", inherit = FileFormat)
+CsvFileFormat$create <- function(...,
+ opts = csv_file_format_parse_options(...),
+ convert_options = csv_file_format_convert_opts(...),
+ read_options = csv_file_format_read_opts(...)) {
+ dataset___CsvFileFormat__Make(opts, convert_options, read_options)
+}
+
+# Support both readr-style option names and Arrow C++ option names
+csv_file_format_parse_options <- function(...) {
+ opts <- list(...)
+ # Filter out arguments meant for CsvConvertOptions/CsvReadOptions
+ convert_opts <- names(formals(CsvConvertOptions$create))
+ read_opts <- names(formals(CsvReadOptions$create))
+ opts[convert_opts] <- NULL
+ opts[read_opts] <- NULL
+ opts[["schema"]] <- NULL
+ opt_names <- names(opts)
+ # Catch any readr-style options specified with full option names that are
+ # supported by read_delim_arrow() (and its wrappers) but are not yet
+ # supported here
+ unsup_readr_opts <- setdiff(
+ names(formals(read_delim_arrow)),
+ names(formals(readr_to_csv_parse_options))
+ )
+ is_unsup_opt <- opt_names %in% unsup_readr_opts
+ unsup_opts <- opt_names[is_unsup_opt]
+ if (length(unsup_opts)) {
+ stop(
+ "The following ",
+ ngettext(length(unsup_opts), "option is ", "options are "),
+ "supported in \"read_delim_arrow\" functions ",
+ "but not yet supported here: ",
+ oxford_paste(unsup_opts),
+ call. = FALSE
+ )
+ }
+ # Catch any options with full or partial names that do not match any of the
+ # recognized Arrow C++ option names or readr-style option names
+ arrow_opts <- names(formals(CsvParseOptions$create))
+ readr_opts <- names(formals(readr_to_csv_parse_options))
+ is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts))
+ is_readr_opt <- !is.na(pmatch(opt_names, readr_opts))
+ unrec_opts <- opt_names[!is_arrow_opt & !is_readr_opt]
+ if (length(unrec_opts)) {
+ stop(
+ "Unrecognized ",
+ ngettext(length(unrec_opts), "option", "options"),
+ ": ",
+ oxford_paste(unrec_opts),
+ call. = FALSE
+ )
+ }
+ # Catch options with ambiguous partial names (such as "del") that make it
+ # unclear whether the user is specifying Arrow C++ options ("delimiter") or
+ # readr-style options ("delim")
+ is_ambig_opt <- is.na(pmatch(opt_names, c(arrow_opts, readr_opts)))
+ ambig_opts <- opt_names[is_ambig_opt]
+ if (length(ambig_opts)) {
+ stop("Ambiguous ",
+ ngettext(length(ambig_opts), "option", "options"),
+ ": ",
+ oxford_paste(ambig_opts),
+ ". Use full argument names",
+ call. = FALSE
+ )
+ }
+ if (any(is_readr_opt)) {
+ # Catch cases when the user specifies a mix of Arrow C++ options and
+ # readr-style options
+ if (!all(is_readr_opt)) {
+ stop("Use either Arrow parse options or readr parse options, not both",
+ call. = FALSE
+ )
+ }
+ do.call(readr_to_csv_parse_options, opts) # all options have readr-style names
+ } else {
+ do.call(CsvParseOptions$create, opts) # all options have Arrow C++ names
+ }
+}
+
+csv_file_format_convert_opts <- function(...) {
+ opts <- list(...)
+ # Filter out arguments meant for CsvParseOptions/CsvReadOptions
+ arrow_opts <- names(formals(CsvParseOptions$create))
+ readr_opts <- names(formals(readr_to_csv_parse_options))
+ read_opts <- names(formals(CsvReadOptions$create))
+ opts[arrow_opts] <- NULL
+ opts[readr_opts] <- NULL
+ opts[read_opts] <- NULL
+ opts[["schema"]] <- NULL
+ do.call(CsvConvertOptions$create, opts)
+}
+
+csv_file_format_read_opts <- function(schema = NULL, ...) {
+ opts <- list(...)
+ # Filter out arguments meant for CsvParseOptions/CsvConvertOptions
+ arrow_opts <- names(formals(CsvParseOptions$create))
+ readr_opts <- names(formals(readr_to_csv_parse_options))
+ convert_opts <- names(formals(CsvConvertOptions$create))
+ opts[arrow_opts] <- NULL
+ opts[readr_opts] <- NULL
+ opts[convert_opts] <- NULL
+ if (!is.null(schema)) {
+ opts[["column_names"]] <- names(schema)
+ }
+ do.call(CsvReadOptions$create, opts)
+}
+
+#' Format-specific scan options
+#'
+#' @description
+#' A `FragmentScanOptions` holds options specific to a `FileFormat` and a scan
+#' operation.
+#'
+#' @section Factory:
+#' `FragmentScanOptions$create()` takes the following arguments:
+#' * `format`: A string identifier of the file format. Currently supported values:
+#' * "parquet"
+#' * "csv"/"text", aliases for the same format.
+#' * `...`: Additional format-specific options
+#'
+#' `format = "parquet"``:
+#' * `use_buffered_stream`: Read files through buffered input streams rather than
+#' loading entire row groups at once. This may be enabled
+#' to reduce memory overhead. Disabled by default.
+#' * `buffer_size`: Size of buffered stream, if enabled. Default is 8KB.
+#' * `pre_buffer`: Pre-buffer the raw Parquet data. This can improve performance
+#' on high-latency filesystems. Disabled by default.
+#
+#' `format = "text"`: see [CsvConvertOptions]. Note that options can only be
+#' specified with the Arrow C++ library naming. Also, "block_size" from
+#' [CsvReadOptions] may be given.
+#'
+#' It returns the appropriate subclass of `FragmentScanOptions`
+#' (e.g. `CsvFragmentScanOptions`).
+#' @rdname FragmentScanOptions
+#' @name FragmentScanOptions
+#' @export
+FragmentScanOptions <- R6Class("FragmentScanOptions",
+ inherit = ArrowObject,
+ active = list(
+ # @description
+ # Return the `FragmentScanOptions`'s type
+ type = function() dataset___FragmentScanOptions__type_name(self)
+ )
+)
+FragmentScanOptions$create <- function(format, ...) {
+ if (format %in% c("csv", "text", "tsv")) {
+ CsvFragmentScanOptions$create(...)
+ } else if (format == "parquet") {
+ ParquetFragmentScanOptions$create(...)
+ } else {
+ stop("Unsupported file format: ", format, call. = FALSE)
+ }
+}
+
+#' @export
+as.character.FragmentScanOptions <- function(x, ...) {
+ x$type
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FragmentScanOptions
+#' @export
+CsvFragmentScanOptions <- R6Class("CsvFragmentScanOptions", inherit = FragmentScanOptions)
+CsvFragmentScanOptions$create <- function(...,
+ convert_opts = csv_file_format_convert_opts(...),
+ read_opts = csv_file_format_read_opts(...)) {
+ dataset___CsvFragmentScanOptions__Make(convert_opts, read_opts)
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FragmentScanOptions
+#' @export
+ParquetFragmentScanOptions <- R6Class("ParquetFragmentScanOptions", inherit = FragmentScanOptions)
+ParquetFragmentScanOptions$create <- function(use_buffered_stream = FALSE,
+ buffer_size = 8196,
+ pre_buffer = TRUE) {
+ dataset___ParquetFragmentScanOptions__Make(use_buffered_stream, buffer_size, pre_buffer)
+}
+
+#' Format-specific write options
+#'
+#' @description
+#' A `FileWriteOptions` holds write options specific to a `FileFormat`.
+FileWriteOptions <- R6Class("FileWriteOptions",
+ inherit = ArrowObject,
+ public = list(
+ update = function(table, ...) {
+ if (self$type == "parquet") {
+ dataset___ParquetFileWriteOptions__update(
+ self,
+ ParquetWriterProperties$create(table, ...),
+ ParquetArrowWriterProperties$create(...)
+ )
+ } else if (self$type == "ipc") {
+ args <- list(...)
+ if (is.null(args$codec)) {
+ dataset___IpcFileWriteOptions__update1(
+ self,
+ get_ipc_use_legacy_format(args$use_legacy_format),
+ get_ipc_metadata_version(args$metadata_version)
+ )
+ } else {
+ dataset___IpcFileWriteOptions__update2(
+ self,
+ get_ipc_use_legacy_format(args$use_legacy_format),
+ args$codec,
+ get_ipc_metadata_version(args$metadata_version)
+ )
+ }
+ } else if (self$type == "csv") {
+ dataset___CsvFileWriteOptions__update(
+ self,
+ CsvWriteOptions$create(...)
+ )
+ }
+ invisible(self)
+ }
+ ),
+ active = list(
+ type = function() dataset___FileWriteOptions__type_name(self)
+ )
+)
+FileWriteOptions$create <- function(format, ...) {
+ if (!inherits(format, "FileFormat")) {
+ format <- FileFormat$create(format)
+ }
+ options <- dataset___FileFormat__DefaultWriteOptions(format)
+ options$update(...)
+}
diff --git a/src/arrow/r/R/dataset-partition.R b/src/arrow/r/R/dataset-partition.R
new file mode 100644
index 000000000..35d5bc00c
--- /dev/null
+++ b/src/arrow/r/R/dataset-partition.R
@@ -0,0 +1,132 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Define Partitioning for a Dataset
+#'
+#' @description
+#' Pass a `Partitioning` object to a [FileSystemDatasetFactory]'s `$create()`
+#' method to indicate how the file's paths should be interpreted to define
+#' partitioning.
+#'
+#' `DirectoryPartitioning` describes how to interpret raw path segments, in
+#' order. For example, `schema(year = int16(), month = int8())` would define
+#' partitions for file paths like "2019/01/file.parquet",
+#' "2019/02/file.parquet", etc. In this scheme `NULL` values will be skipped. In
+#' the previous example: when writing a dataset if the month was `NA` (or
+#' `NULL`), the files would be placed in "2019/file.parquet". When reading, the
+#' rows in "2019/file.parquet" would return an `NA` for the month column. An
+#' error will be raised if an outer directory is `NULL` and an inner directory
+#' is not.
+#'
+#' `HivePartitioning` is for Hive-style partitioning, which embeds field
+#' names and values in path segments, such as
+#' "/year=2019/month=2/data.parquet". Because fields are named in the path
+#' segments, order does not matter. This partitioning scheme allows `NULL`
+#' values. They will be replaced by a configurable `null_fallback` which
+#' defaults to the string `"__HIVE_DEFAULT_PARTITION__"` when writing. When
+#' reading, the `null_fallback` string will be replaced with `NA`s as
+#' appropriate.
+#'
+#' `PartitioningFactory` subclasses instruct the `DatasetFactory` to detect
+#' partition features from the file paths.
+#' @section Factory:
+#' Both `DirectoryPartitioning$create()` and `HivePartitioning$create()`
+#' methods take a [Schema] as a single input argument. The helper
+#' function [`hive_partition(...)`][hive_partition] is shorthand for
+#' `HivePartitioning$create(schema(...))`.
+#'
+#' With `DirectoryPartitioningFactory$create()`, you can provide just the
+#' names of the path segments (in our example, `c("year", "month")`), and
+#' the `DatasetFactory` will infer the data types for those partition variables.
+#' `HivePartitioningFactory$create()` takes no arguments: both variable names
+#' and their types can be inferred from the file paths. `hive_partition()` with
+#' no arguments returns a `HivePartitioningFactory`.
+#' @name Partitioning
+#' @rdname Partitioning
+#' @export
+Partitioning <- R6Class("Partitioning", inherit = ArrowObject)
+#' @usage NULL
+#' @format NULL
+#' @rdname Partitioning
+#' @export
+DirectoryPartitioning <- R6Class("DirectoryPartitioning", inherit = Partitioning)
+DirectoryPartitioning$create <- function(schm, segment_encoding = "uri") {
+ dataset___DirectoryPartitioning(schm, segment_encoding = segment_encoding)
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname Partitioning
+#' @export
+HivePartitioning <- R6Class("HivePartitioning", inherit = Partitioning)
+HivePartitioning$create <- function(schm, null_fallback = NULL, segment_encoding = "uri") {
+ dataset___HivePartitioning(schm,
+ null_fallback = null_fallback_or_default(null_fallback),
+ segment_encoding = segment_encoding
+ )
+}
+
+#' Construct Hive partitioning
+#'
+#' Hive partitioning embeds field names and values in path segments, such as
+#' "/year=2019/month=2/data.parquet".
+#'
+#' Because fields are named in the path segments, order of fields passed to
+#' `hive_partition()` does not matter.
+#' @param ... named list of [data types][data-type], passed to [schema()]
+#' @param null_fallback character to be used in place of missing values (`NA` or `NULL`)
+#' in partition columns. Default is `"__HIVE_DEFAULT_PARTITION__"`,
+#' which is what Hive uses.
+#' @param segment_encoding Decode partition segments after splitting paths.
+#' Default is `"uri"` (URI-decode segments). May also be `"none"` (leave as-is).
+#' @return A [HivePartitioning][Partitioning], or a `HivePartitioningFactory` if
+#' calling `hive_partition()` with no arguments.
+#' @examplesIf arrow_with_dataset()
+#' hive_partition(year = int16(), month = int8())
+#' @export
+hive_partition <- function(..., null_fallback = NULL, segment_encoding = "uri") {
+ schm <- schema(...)
+ if (length(schm) == 0) {
+ HivePartitioningFactory$create(null_fallback, segment_encoding)
+ } else {
+ HivePartitioning$create(schm, null_fallback, segment_encoding)
+ }
+}
+
+PartitioningFactory <- R6Class("PartitioningFactory", inherit = ArrowObject)
+
+#' @usage NULL
+#' @format NULL
+#' @rdname Partitioning
+#' @export
+DirectoryPartitioningFactory <- R6Class("DirectoryPartitioningFactory ", inherit = PartitioningFactory)
+DirectoryPartitioningFactory$create <- function(field_names, segment_encoding = "uri") {
+ dataset___DirectoryPartitioning__MakeFactory(field_names, segment_encoding)
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname Partitioning
+#' @export
+HivePartitioningFactory <- R6Class("HivePartitioningFactory", inherit = PartitioningFactory)
+HivePartitioningFactory$create <- function(null_fallback = NULL, segment_encoding = "uri") {
+ dataset___HivePartitioning__MakeFactory(null_fallback_or_default(null_fallback), segment_encoding)
+}
+
+null_fallback_or_default <- function(null_fallback) {
+ null_fallback %||% "__HIVE_DEFAULT_PARTITION__"
+}
diff --git a/src/arrow/r/R/dataset-scan.R b/src/arrow/r/R/dataset-scan.R
new file mode 100644
index 000000000..03c926fb4
--- /dev/null
+++ b/src/arrow/r/R/dataset-scan.R
@@ -0,0 +1,262 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Scan the contents of a dataset
+#'
+#' @description
+#' A `Scanner` iterates over a [Dataset]'s fragments and returns data
+#' according to given row filtering and column projection. A `ScannerBuilder`
+#' can help create one.
+#'
+#' @section Factory:
+#' `Scanner$create()` wraps the `ScannerBuilder` interface to make a `Scanner`.
+#' It takes the following arguments:
+#'
+#' * `dataset`: A `Dataset` or `arrow_dplyr_query` object, as returned by the
+#' `dplyr` methods on `Dataset`.
+#' * `projection`: A character vector of column names to select columns or a
+#' named list of expressions
+#' * `filter`: A `Expression` to filter the scanned rows by, or `TRUE` (default)
+#' to keep all rows.
+#' * `use_threads`: logical: should scanning use multithreading? Default `TRUE`
+#' * `use_async`: logical: should the async scanner (performs better on
+#' high-latency/highly parallel filesystems like S3) be used? Default `FALSE`
+#' * `...`: Additional arguments, currently ignored
+#' @section Methods:
+#' `ScannerBuilder` has the following methods:
+#'
+#' - `$Project(cols)`: Indicate that the scan should only return columns given
+#' by `cols`, a character vector of column names
+#' - `$Filter(expr)`: Filter rows by an [Expression].
+#' - `$UseThreads(threads)`: logical: should the scan use multithreading?
+#' The method's default input is `TRUE`, but you must call the method to enable
+#' multithreading because the scanner default is `FALSE`.
+#' - `$UseAsync(use_async)`: logical: should the async scanner be used?
+#' - `$BatchSize(batch_size)`: integer: Maximum row count of scanned record
+#' batches, default is 32K. If scanned record batches are overflowing memory
+#' then this method can be called to reduce their size.
+#' - `$schema`: Active binding, returns the [Schema] of the Dataset
+#' - `$Finish()`: Returns a `Scanner`
+#'
+#' `Scanner` currently has a single method, `$ToTable()`, which evaluates the
+#' query and returns an Arrow [Table].
+#' @rdname Scanner
+#' @name Scanner
+#' @export
+Scanner <- R6Class("Scanner",
+ inherit = ArrowObject,
+ public = list(
+ ToTable = function() dataset___Scanner__ToTable(self),
+ ScanBatches = function() dataset___Scanner__ScanBatches(self),
+ ToRecordBatchReader = function() dataset___Scanner__ToRecordBatchReader(self),
+ CountRows = function() dataset___Scanner__CountRows(self)
+ ),
+ active = list(
+ schema = function() dataset___Scanner__schema(self)
+ )
+)
+Scanner$create <- function(dataset,
+ projection = NULL,
+ filter = TRUE,
+ use_threads = option_use_threads(),
+ use_async = getOption("arrow.use_async", FALSE),
+ batch_size = NULL,
+ fragment_scan_options = NULL,
+ ...) {
+ if (inherits(dataset, "arrow_dplyr_query")) {
+ if (is_collapsed(dataset)) {
+ # TODO: Is there a way to get a RecordBatchReader rather than evaluating?
+ dataset$.data <- as_adq(dplyr::compute(dataset$.data))$.data
+ }
+
+ proj <- c(dataset$selected_columns, dataset$temp_columns)
+
+ if (!is.null(projection)) {
+ if (is.character(projection)) {
+ stopifnot("attempting to project with unknown columns" = all(projection %in% names(proj)))
+ proj <- proj[projection]
+ } else {
+ # TODO: ARROW-13802 accepting lists of Expressions as a projection
+ warning(
+ "Scanner$create(projection = ...) must be a character vector, ",
+ "ignoring the projection argument."
+ )
+ }
+ }
+
+ if (!isTRUE(filter)) {
+ dataset <- set_filters(dataset, filter)
+ }
+
+ return(Scanner$create(
+ dataset$.data,
+ proj,
+ dataset$filtered_rows,
+ use_threads,
+ use_async,
+ batch_size,
+ fragment_scan_options,
+ ...
+ ))
+ }
+
+ scanner_builder <- ScannerBuilder$create(dataset)
+ if (use_threads) {
+ scanner_builder$UseThreads()
+ }
+ if (use_async) {
+ scanner_builder$UseAsync()
+ }
+ if (!is.null(projection)) {
+ scanner_builder$Project(projection)
+ }
+ if (!isTRUE(filter)) {
+ scanner_builder$Filter(filter)
+ }
+ if (is_integerish(batch_size)) {
+ scanner_builder$BatchSize(batch_size)
+ }
+ if (!is.null(fragment_scan_options)) {
+ scanner_builder$FragmentScanOptions(fragment_scan_options)
+ }
+ scanner_builder$Finish()
+}
+
+#' @export
+names.Scanner <- function(x) names(x$schema)
+
+#' @export
+head.Scanner <- function(x, n = 6L, ...) {
+ assert_that(n > 0) # For now
+ dataset___Scanner__head(x, n)
+}
+
+#' @export
+tail.Scanner <- function(x, n = 6L, ...) {
+ assert_that(n > 0) # For now
+ result <- list()
+ batch_num <- 0
+ for (batch in rev(dataset___Scanner__ScanBatches(x))) {
+ batch_num <- batch_num + 1
+ result[[batch_num]] <- tail(batch, n)
+ n <- n - nrow(batch)
+ if (n <= 0) break
+ }
+ Table$create(!!!rev(result))
+}
+
+ScanTask <- R6Class("ScanTask",
+ inherit = ArrowObject,
+ public = list(
+ Execute = function() dataset___ScanTask__get_batches(self)
+ )
+)
+
+#' Apply a function to a stream of RecordBatches
+#'
+#' As an alternative to calling `collect()` on a `Dataset` query, you can
+#' use this function to access the stream of `RecordBatch`es in the `Dataset`.
+#' This lets you aggregate on each chunk and pull the intermediate results into
+#' a `data.frame` for further aggregation, even if you couldn't fit the whole
+#' `Dataset` result in memory.
+#'
+#' This is experimental and not recommended for production use.
+#'
+#' @param X A `Dataset` or `arrow_dplyr_query` object, as returned by the
+#' `dplyr` methods on `Dataset`.
+#' @param FUN A function or `purrr`-style lambda expression to apply to each
+#' batch
+#' @param ... Additional arguments passed to `FUN`
+#' @param .data.frame logical: collect the resulting chunks into a single
+#' `data.frame`? Default `TRUE`
+#' @export
+map_batches <- function(X, FUN, ..., .data.frame = TRUE) {
+ if (.data.frame) {
+ lapply <- map_dfr
+ }
+ scanner <- Scanner$create(ensure_group_vars(X))
+ FUN <- as_mapper(FUN)
+ lapply(scanner$ScanBatches(), function(batch) {
+ # TODO: wrap batch in arrow_dplyr_query with X$selected_columns,
+ # X$temp_columns, and X$group_by_vars
+ # if X is arrow_dplyr_query, if some other arg (.dplyr?) == TRUE
+ FUN(batch, ...)
+ })
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname Scanner
+#' @export
+ScannerBuilder <- R6Class("ScannerBuilder",
+ inherit = ArrowObject,
+ public = list(
+ Project = function(cols) {
+ # cols is either a character vector or a named list of Expressions
+ if (is.character(cols)) {
+ dataset___ScannerBuilder__ProjectNames(self, cols)
+ } else if (length(cols) == 0) {
+ # Empty projection
+ dataset___ScannerBuilder__ProjectNames(self, character(0))
+ } else {
+ # List of Expressions
+ dataset___ScannerBuilder__ProjectExprs(self, cols, names(cols))
+ }
+ self
+ },
+ Filter = function(expr) {
+ assert_is(expr, "Expression")
+ dataset___ScannerBuilder__Filter(self, expr)
+ self
+ },
+ UseThreads = function(threads = option_use_threads()) {
+ dataset___ScannerBuilder__UseThreads(self, threads)
+ self
+ },
+ UseAsync = function(use_async = TRUE) {
+ dataset___ScannerBuilder__UseAsync(self, use_async)
+ self
+ },
+ BatchSize = function(batch_size) {
+ dataset___ScannerBuilder__BatchSize(self, batch_size)
+ self
+ },
+ FragmentScanOptions = function(options) {
+ dataset___ScannerBuilder__FragmentScanOptions(self, options)
+ self
+ },
+ Finish = function() dataset___ScannerBuilder__Finish(self)
+ ),
+ active = list(
+ schema = function() dataset___ScannerBuilder__schema(self)
+ )
+)
+ScannerBuilder$create <- function(dataset) {
+ if (inherits(dataset, "RecordBatchReader")) {
+ return(dataset___ScannerBuilder__FromRecordBatchReader(dataset))
+ }
+
+ if (inherits(dataset, c("data.frame", "ArrowTabular"))) {
+ dataset <- InMemoryDataset$create(dataset)
+ }
+ assert_is(dataset, "Dataset")
+
+ dataset$NewScan()
+}
+
+#' @export
+names.ScannerBuilder <- function(x) names(x$schema)
diff --git a/src/arrow/r/R/dataset-write.R b/src/arrow/r/R/dataset-write.R
new file mode 100644
index 000000000..3a98357b0
--- /dev/null
+++ b/src/arrow/r/R/dataset-write.R
@@ -0,0 +1,144 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Write a dataset
+#'
+#' This function allows you to write a dataset. By writing to more efficient
+#' binary storage formats, and by specifying relevant partitioning, you can
+#' make it much faster to read and query.
+#'
+#' @param dataset [Dataset], [RecordBatch], [Table], `arrow_dplyr_query`, or
+#' `data.frame`. If an `arrow_dplyr_query`, the query will be evaluated and
+#' the result will be written. This means that you can `select()`, `filter()`, `mutate()`,
+#' etc. to transform the data before it is written if you need to.
+#' @param path string path, URI, or `SubTreeFileSystem` referencing a directory
+#' to write to (directory will be created if it does not exist)
+#' @param format a string identifier of the file format. Default is to use
+#' "parquet" (see [FileFormat])
+#' @param partitioning `Partitioning` or a character vector of columns to
+#' use as partition keys (to be written as path segments). Default is to
+#' use the current `group_by()` columns.
+#' @param basename_template string template for the names of files to be written.
+#' Must contain `"{i}"`, which will be replaced with an autoincremented
+#' integer to generate basenames of datafiles. For example, `"part-{i}.feather"`
+#' will yield `"part-0.feather", ...`.
+#' @param hive_style logical: write partition segments as Hive-style
+#' (`key1=value1/key2=value2/file.ext`) or as just bare values. Default is `TRUE`.
+#' @param existing_data_behavior The behavior to use when there is already data
+#' in the destination directory. Must be one of "overwrite", "error", or
+#' "delete_matching".
+#' - "overwrite" (the default) then any new files created will overwrite
+#' existing files
+#' - "error" then the operation will fail if the destination directory is not
+#' empty
+#' - "delete_matching" then the writer will delete any existing partitions
+#' if data is going to be written to those partitions and will leave alone
+#' partitions which data is not written to.
+#' @param ... additional format-specific arguments. For available Parquet
+#' options, see [write_parquet()]. The available Feather options are
+#' - `use_legacy_format` logical: write data formatted so that Arrow libraries
+#' versions 0.14 and lower can read it. Default is `FALSE`. You can also
+#' enable this by setting the environment variable `ARROW_PRE_0_15_IPC_FORMAT=1`.
+#' - `metadata_version`: A string like "V5" or the equivalent integer indicating
+#' the Arrow IPC MetadataVersion. Default (NULL) will use the latest version,
+#' unless the environment variable `ARROW_PRE_1_0_METADATA_VERSION=1`, in
+#' which case it will be V4.
+#' - `codec`: A [Codec] which will be used to compress body buffers of written
+#' files. Default (NULL) will not compress body buffers.
+#' - `null_fallback`: character to be used in place of missing values (`NA` or
+#' `NULL`) when using Hive-style partitioning. See [hive_partition()].
+#' @return The input `dataset`, invisibly
+#' @examplesIf arrow_with_dataset() & arrow_with_parquet() & requireNamespace("dplyr", quietly = TRUE)
+#' # You can write datasets partitioned by the values in a column (here: "cyl").
+#' # This creates a structure of the form cyl=X/part-Z.parquet.
+#' one_level_tree <- tempfile()
+#' write_dataset(mtcars, one_level_tree, partitioning = "cyl")
+#' list.files(one_level_tree, recursive = TRUE)
+#'
+#' # You can also partition by the values in multiple columns
+#' # (here: "cyl" and "gear").
+#' # This creates a structure of the form cyl=X/gear=Y/part-Z.parquet.
+#' two_levels_tree <- tempfile()
+#' write_dataset(mtcars, two_levels_tree, partitioning = c("cyl", "gear"))
+#' list.files(two_levels_tree, recursive = TRUE)
+#'
+#' # In the two previous examples we would have:
+#' # X = {4,6,8}, the number of cylinders.
+#' # Y = {3,4,5}, the number of forward gears.
+#' # Z = {0,1,2}, the number of saved parts, starting from 0.
+#'
+#' # You can obtain the same result as as the previous examples using arrow with
+#' # a dplyr pipeline. This will be the same as two_levels_tree above, but the
+#' # output directory will be different.
+#' library(dplyr)
+#' two_levels_tree_2 <- tempfile()
+#' mtcars %>%
+#' group_by(cyl, gear) %>%
+#' write_dataset(two_levels_tree_2)
+#' list.files(two_levels_tree_2, recursive = TRUE)
+#'
+#' # And you can also turn off the Hive-style directory naming where the column
+#' # name is included with the values by using `hive_style = FALSE`.
+#'
+#' # Write a structure X/Y/part-Z.parquet.
+#' two_levels_tree_no_hive <- tempfile()
+#' mtcars %>%
+#' group_by(cyl, gear) %>%
+#' write_dataset(two_levels_tree_no_hive, hive_style = FALSE)
+#' list.files(two_levels_tree_no_hive, recursive = TRUE)
+#' @export
+write_dataset <- function(dataset,
+ path,
+ format = c("parquet", "feather", "arrow", "ipc", "csv"),
+ partitioning = dplyr::group_vars(dataset),
+ basename_template = paste0("part-{i}.", as.character(format)),
+ hive_style = TRUE,
+ existing_data_behavior = c("overwrite", "error", "delete_matching"),
+ ...) {
+ format <- match.arg(format)
+ if (inherits(dataset, "arrow_dplyr_query")) {
+ # partitioning vars need to be in the `select` schema
+ dataset <- ensure_group_vars(dataset)
+ } else if (inherits(dataset, "grouped_df")) {
+ force(partitioning)
+ # Drop the grouping metadata before writing; we've already consumed it
+ # now to construct `partitioning` and don't want it in the metadata$r
+ dataset <- dplyr::ungroup(dataset)
+ }
+
+ scanner <- Scanner$create(dataset, use_async = TRUE)
+ if (!inherits(partitioning, "Partitioning")) {
+ partition_schema <- scanner$schema[partitioning]
+ if (isTRUE(hive_style)) {
+ partitioning <- HivePartitioning$create(partition_schema, null_fallback = list(...)$null_fallback)
+ } else {
+ partitioning <- DirectoryPartitioning$create(partition_schema)
+ }
+ }
+
+ path_and_fs <- get_path_and_filesystem(path)
+ options <- FileWriteOptions$create(format, table = scanner, ...)
+
+ existing_data_behavior_opts <- c("delete_matching", "overwrite", "error")
+ existing_data_behavior <- match(match.arg(existing_data_behavior), existing_data_behavior_opts) - 1L
+
+ dataset___Dataset__Write(
+ options, path_and_fs$fs, path_and_fs$path,
+ partitioning, basename_template, scanner,
+ existing_data_behavior
+ )
+}
diff --git a/src/arrow/r/R/dataset.R b/src/arrow/r/R/dataset.R
new file mode 100644
index 000000000..7207a5543
--- /dev/null
+++ b/src/arrow/r/R/dataset.R
@@ -0,0 +1,367 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Open a multi-file dataset
+#'
+#' Arrow Datasets allow you to query against data that has been split across
+#' multiple files. This sharding of data may indicate partitioning, which
+#' can accelerate queries that only touch some partitions (files). Call
+#' `open_dataset()` to point to a directory of data files and return a
+#' `Dataset`, then use `dplyr` methods to query it.
+#'
+#' @param sources One of:
+#' * a string path or URI to a directory containing data files
+#' * a string path or URI to a single file
+#' * a character vector of paths or URIs to individual data files
+#' * a list of `Dataset` objects as created by this function
+#' * a list of `DatasetFactory` objects as created by [dataset_factory()].
+#'
+#' When `sources` is a vector of file URIs, they must all use the same protocol
+#' and point to files located in the same file system and having the same
+#' format.
+#' @param schema [Schema] for the `Dataset`. If `NULL` (the default), the schema
+#' will be inferred from the data sources.
+#' @param partitioning When `sources` is a directory path/URI, one of:
+#' * a `Schema`, in which case the file paths relative to `sources` will be
+#' parsed, and path segments will be matched with the schema fields. For
+#' example, `schema(year = int16(), month = int8())` would create partitions
+#' for file paths like `"2019/01/file.parquet"`, `"2019/02/file.parquet"`,
+#' etc.
+#' * a character vector that defines the field names corresponding to those
+#' path segments (that is, you're providing the names that would correspond
+#' to a `Schema` but the types will be autodetected)
+#' * a `HivePartitioning` or `HivePartitioningFactory`, as returned
+#' by [hive_partition()] which parses explicit or autodetected fields from
+#' Hive-style path segments
+#' * `NULL` for no partitioning
+#'
+#' The default is to autodetect Hive-style partitions. When `sources` is not a
+#' directory path/URI, `partitioning` is ignored.
+#' @param unify_schemas logical: should all data fragments (files, `Dataset`s)
+#' be scanned in order to create a unified schema from them? If `FALSE`, only
+#' the first fragment will be inspected for its schema. Use this fast path
+#' when you know and trust that all fragments have an identical schema.
+#' The default is `FALSE` when creating a dataset from a directory path/URI or
+#' vector of file paths/URIs (because there may be many files and scanning may
+#' be slow) but `TRUE` when `sources` is a list of `Dataset`s (because there
+#' should be few `Dataset`s in the list and their `Schema`s are already in
+#' memory).
+#' @param format A [FileFormat] object, or a string identifier of the format of
+#' the files in `x`. This argument is ignored when `sources` is a list of `Dataset` objects.
+#' Currently supported values:
+#' * "parquet"
+#' * "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
+#' only version 2 files are supported
+#' * "csv"/"text", aliases for the same thing (because comma is the default
+#' delimiter for text files
+#' * "tsv", equivalent to passing `format = "text", delimiter = "\t"`
+#'
+#' Default is "parquet", unless a `delimiter` is also specified, in which case
+#' it is assumed to be "text".
+#' @param ... additional arguments passed to `dataset_factory()` when `sources`
+#' is a directory path/URI or vector of file paths/URIs, otherwise ignored.
+#' These may include `format` to indicate the file format, or other
+#' format-specific options.
+#' @return A [Dataset] R6 object. Use `dplyr` methods on it to query the data,
+#' or call [`$NewScan()`][Scanner] to construct a query directly.
+#' @export
+#' @seealso `vignette("dataset", package = "arrow")`
+#' @include arrow-package.R
+#' @examplesIf arrow_with_dataset() & arrow_with_parquet()
+#' # Set up directory for examples
+#' tf <- tempfile()
+#' dir.create(tf)
+#' on.exit(unlink(tf))
+#'
+#' data <- dplyr::group_by(mtcars, cyl)
+#' write_dataset(data, tf)
+#'
+#' # You can specify a directory containing the files for your dataset and
+#' # open_dataset will scan all files in your directory.
+#' open_dataset(tf)
+#'
+#' # You can also supply a vector of paths
+#' open_dataset(c(file.path(tf, "cyl=4/part-0.parquet"), file.path(tf, "cyl=8/part-0.parquet")))
+#'
+#' ## You must specify the file format if using a format other than parquet.
+#' tf2 <- tempfile()
+#' dir.create(tf2)
+#' on.exit(unlink(tf2))
+#' write_dataset(data, tf2, format = "ipc")
+#' # This line will results in errors when you try to work with the data
+#' \dontrun{
+#' open_dataset(tf2)
+#' }
+#' # This line will work
+#' open_dataset(tf2, format = "ipc")
+#'
+#' ## You can specify file partitioning to include it as a field in your dataset
+#' # Create a temporary directory and write example dataset
+#' tf3 <- tempfile()
+#' dir.create(tf3)
+#' on.exit(unlink(tf3))
+#' write_dataset(airquality, tf3, partitioning = c("Month", "Day"), hive_style = FALSE)
+#'
+#' # View files - you can see the partitioning means that files have been written
+#' # to folders based on Month/Day values
+#' tf3_files <- list.files(tf3, recursive = TRUE)
+#'
+#' # With no partitioning specified, dataset contains all files but doesn't include
+#' # directory names as field names
+#' open_dataset(tf3)
+#'
+#' # Now that partitioning has been specified, your dataset contains columns for Month and Day
+#' open_dataset(tf3, partitioning = c("Month", "Day"))
+#'
+#' # If you want to specify the data types for your fields, you can pass in a Schema
+#' open_dataset(tf3, partitioning = schema(Month = int8(), Day = int8()))
+open_dataset <- function(sources,
+ schema = NULL,
+ partitioning = hive_partition(),
+ unify_schemas = NULL,
+ format = c("parquet", "arrow", "ipc", "feather", "csv", "tsv", "text"),
+ ...) {
+ if (!arrow_with_dataset()) {
+ stop("This build of the arrow package does not support Datasets", call. = FALSE)
+ }
+ if (is_list_of(sources, "Dataset")) {
+ if (is.null(schema)) {
+ if (is.null(unify_schemas) || isTRUE(unify_schemas)) {
+ # Default is to unify schemas here
+ schema <- unify_schemas(schemas = map(sources, ~ .$schema))
+ } else {
+ # Take the first one.
+ schema <- sources[[1]]$schema
+ }
+ }
+ # Enforce that all datasets have the same schema
+ assert_is(schema, "Schema")
+ sources <- lapply(sources, function(x) {
+ x$schema <- schema
+ x
+ })
+ return(dataset___UnionDataset__create(sources, schema))
+ }
+
+ factory <- DatasetFactory$create(sources, partitioning = partitioning, format = format, schema = schema, ...)
+ tryCatch(
+ # Default is _not_ to inspect/unify schemas
+ factory$Finish(schema, isTRUE(unify_schemas)),
+ error = function(e) {
+ handle_parquet_io_error(e, format)
+ }
+ )
+}
+
+#' Multi-file datasets
+#'
+#' @description
+#' Arrow Datasets allow you to query against data that has been split across
+#' multiple files. This sharding of data may indicate partitioning, which
+#' can accelerate queries that only touch some partitions (files).
+#'
+#' A `Dataset` contains one or more `Fragments`, such as files, of potentially
+#' differing type and partitioning.
+#'
+#' For `Dataset$create()`, see [open_dataset()], which is an alias for it.
+#'
+#' `DatasetFactory` is used to provide finer control over the creation of `Dataset`s.
+#'
+#' @section Factory:
+#' `DatasetFactory` is used to create a `Dataset`, inspect the [Schema] of the
+#' fragments contained in it, and declare a partitioning.
+#' `FileSystemDatasetFactory` is a subclass of `DatasetFactory` for
+#' discovering files in the local file system, the only currently supported
+#' file system.
+#'
+#' For the `DatasetFactory$create()` factory method, see [dataset_factory()], an
+#' alias for it. A `DatasetFactory` has:
+#'
+#' - `$Inspect(unify_schemas)`: If `unify_schemas` is `TRUE`, all fragments
+#' will be scanned and a unified [Schema] will be created from them; if `FALSE`
+#' (default), only the first fragment will be inspected for its schema. Use this
+#' fast path when you know and trust that all fragments have an identical schema.
+#' - `$Finish(schema, unify_schemas)`: Returns a `Dataset`. If `schema` is provided,
+#' it will be used for the `Dataset`; if omitted, a `Schema` will be created from
+#' inspecting the fragments (files) in the dataset, following `unify_schemas`
+#' as described above.
+#'
+#' `FileSystemDatasetFactory$create()` is a lower-level factory method and
+#' takes the following arguments:
+#' * `filesystem`: A [FileSystem]
+#' * `selector`: Either a [FileSelector] or `NULL`
+#' * `paths`: Either a character vector of file paths or `NULL`
+#' * `format`: A [FileFormat]
+#' * `partitioning`: Either `Partitioning`, `PartitioningFactory`, or `NULL`
+#' @section Methods:
+#'
+#' A `Dataset` has the following methods:
+#' - `$NewScan()`: Returns a [ScannerBuilder] for building a query
+#' - `$schema`: Active binding that returns the [Schema] of the Dataset; you
+#' may also replace the dataset's schema by using `ds$schema <- new_schema`.
+#' This method currently supports only adding, removing, or reordering
+#' fields in the schema: you cannot alter or cast the field types.
+#'
+#' `FileSystemDataset` has the following methods:
+#' - `$files`: Active binding, returns the files of the `FileSystemDataset`
+#' - `$format`: Active binding, returns the [FileFormat] of the `FileSystemDataset`
+#'
+#' `UnionDataset` has the following methods:
+#' - `$children`: Active binding, returns all child `Dataset`s.
+#'
+#' @export
+#' @seealso [open_dataset()] for a simple interface to creating a `Dataset`
+Dataset <- R6Class("Dataset",
+ inherit = ArrowObject,
+ public = list(
+ # @description
+ # Start a new scan of the data
+ # @return A [ScannerBuilder]
+ NewScan = function() dataset___Dataset__NewScan(self),
+ ToString = function() self$schema$ToString()
+ ),
+ active = list(
+ schema = function(schema) {
+ if (missing(schema)) {
+ dataset___Dataset__schema(self)
+ } else {
+ assert_is(schema, "Schema")
+ invisible(dataset___Dataset__ReplaceSchema(self, schema))
+ }
+ },
+ metadata = function() self$schema$metadata,
+ num_rows = function() self$NewScan()$Finish()$CountRows(),
+ num_cols = function() length(self$schema),
+ # @description
+ # Return the Dataset's type.
+ type = function() dataset___Dataset__type_name(self)
+ )
+)
+Dataset$create <- open_dataset
+
+#' @name FileSystemDataset
+#' @rdname Dataset
+#' @export
+FileSystemDataset <- R6Class("FileSystemDataset",
+ inherit = Dataset,
+ public = list(
+ .class_title = function() {
+ nfiles <- length(self$files)
+ file_type <- self$format$type
+ pretty_file_type <- list(
+ parquet = "Parquet",
+ ipc = "Feather"
+ )[[file_type]]
+
+ paste(
+ class(self)[[1]],
+ "with",
+ nfiles,
+ pretty_file_type %||% file_type,
+ ifelse(nfiles == 1, "file", "files")
+ )
+ }
+ ),
+ active = list(
+ # @description
+ # Return the files contained in this `FileSystemDataset`
+ files = function() dataset___FileSystemDataset__files(self),
+ # @description
+ # Return the format of files in this `Dataset`
+ format = function() {
+ dataset___FileSystemDataset__format(self)
+ },
+ # @description
+ # Return the filesystem of files in this `Dataset`
+ filesystem = function() {
+ dataset___FileSystemDataset__filesystem(self)
+ }
+ )
+)
+
+#' @name UnionDataset
+#' @rdname Dataset
+#' @export
+UnionDataset <- R6Class("UnionDataset",
+ inherit = Dataset,
+ active = list(
+ # @description
+ # Return the UnionDataset's child `Dataset`s
+ children = function() {
+ dataset___UnionDataset__children(self)
+ }
+ )
+)
+
+#' @name InMemoryDataset
+#' @rdname Dataset
+#' @export
+InMemoryDataset <- R6Class("InMemoryDataset", inherit = Dataset)
+InMemoryDataset$create <- function(x) {
+ if (!arrow_with_dataset()) {
+ stop("This build of the arrow package does not support Datasets", call. = FALSE)
+ }
+ if (!inherits(x, "Table")) {
+ x <- Table$create(x)
+ }
+ dataset___InMemoryDataset__create(x)
+}
+
+
+#' @export
+names.Dataset <- function(x) names(x$schema)
+
+#' @export
+dim.Dataset <- function(x) c(x$num_rows, x$num_cols)
+
+#' @export
+c.Dataset <- function(...) Dataset$create(list(...))
+
+#' @export
+head.Dataset <- function(x, n = 6L, ...) {
+ head(Scanner$create(x), n)
+}
+
+#' @export
+tail.Dataset <- function(x, n = 6L, ...) {
+ tail(Scanner$create(x), n)
+}
+
+#' @export
+`[.Dataset` <- function(x, i, j, ..., drop = FALSE) {
+ if (nargs() == 2L) {
+ # List-like column extraction (x[i])
+ return(x[, i])
+ }
+ if (!missing(j)) {
+ x <- select.Dataset(x, all_of(j))
+ }
+
+ if (!missing(i)) {
+ x <- take_dataset_rows(x, i)
+ }
+ x
+}
+
+take_dataset_rows <- function(x, i) {
+ if (!is.numeric(i) || any(i < 0)) {
+ stop("Only slicing with positive indices is supported", call. = FALSE)
+ }
+ scanner <- Scanner$create(x)
+ i <- Array$create(i - 1)
+ dataset___Scanner__TakeRows(scanner, i)
+}
diff --git a/src/arrow/r/R/deprecated.R b/src/arrow/r/R/deprecated.R
new file mode 100644
index 000000000..e8848c4aa
--- /dev/null
+++ b/src/arrow/r/R/deprecated.R
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @rdname read_ipc_stream
+#' @export
+read_arrow <- function(file, ...) {
+ .Deprecated(msg = "Use 'read_ipc_stream' or 'read_feather' instead.")
+ if (inherits(file, "raw")) {
+ read_ipc_stream(file, ...)
+ } else {
+ read_feather(file, ...)
+ }
+}
+
+#' @rdname write_ipc_stream
+#' @export
+write_arrow <- function(x, sink, ...) {
+ .Deprecated(msg = "Use 'write_ipc_stream' or 'write_feather' instead.")
+ if (inherits(sink, "raw")) {
+ # HACK for sparklyr
+ # Note that this returns a new R raw vector, not the one passed as `sink`
+ write_to_raw(x)
+ } else {
+ write_feather(x, sink, ...)
+ }
+}
diff --git a/src/arrow/r/R/dictionary.R b/src/arrow/r/R/dictionary.R
new file mode 100644
index 000000000..b701768d6
--- /dev/null
+++ b/src/arrow/r/R/dictionary.R
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include type.R
+
+#' @title class DictionaryType
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname DictionaryType
+#' @name DictionaryType
+DictionaryType <- R6Class("DictionaryType",
+ inherit = FixedWidthType,
+ public = list(
+ ToString = function() {
+ prettier_dictionary_type(DataType__ToString(self))
+ }
+ ),
+ active = list(
+ index_type = function() DictionaryType__index_type(self),
+ value_type = function() DictionaryType__value_type(self),
+ name = function() DictionaryType__name(self),
+ ordered = function() DictionaryType__ordered(self)
+ )
+)
+DictionaryType$create <- function(index_type = int32(),
+ value_type = utf8(),
+ ordered = FALSE) {
+ assert_is(index_type, "DataType")
+ assert_is(value_type, "DataType")
+ DictionaryType__initialize(index_type, value_type, ordered)
+}
+
+#' Create a dictionary type
+#'
+#' @param index_type A DataType for the indices (default [int32()])
+#' @param value_type A DataType for the values (default [utf8()])
+#' @param ordered Is this an ordered dictionary (default `FALSE`)?
+#'
+#' @return A [DictionaryType]
+#' @seealso [Other Arrow data types][data-type]
+#' @export
+dictionary <- DictionaryType$create
+
+prettier_dictionary_type <- function(x) {
+ # Prettier format the "ordered" attribute
+ x <- sub(", ordered=0", "", x)
+ sub("ordered=1", "ordered", x)
+}
diff --git a/src/arrow/r/R/dplyr-arrange.R b/src/arrow/r/R/dplyr-arrange.R
new file mode 100644
index 000000000..4c8c687a3
--- /dev/null
+++ b/src/arrow/r/R/dplyr-arrange.R
@@ -0,0 +1,98 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# The following S3 methods are registered on load if dplyr is present
+
+arrange.arrow_dplyr_query <- function(.data, ..., .by_group = FALSE) {
+ call <- match.call()
+ exprs <- quos(...)
+ if (.by_group) {
+ # when the data is is grouped and .by_group is TRUE, order the result by
+ # the grouping columns first
+ exprs <- c(quos(!!!dplyr::groups(.data)), exprs)
+ }
+ if (length(exprs) == 0) {
+ # Nothing to do
+ return(.data)
+ }
+ .data <- as_adq(.data)
+ # find and remove any dplyr::desc() and tidy-eval
+ # the arrange expressions inside an Arrow data_mask
+ sorts <- vector("list", length(exprs))
+ descs <- logical(0)
+ mask <- arrow_mask(.data)
+ for (i in seq_along(exprs)) {
+ x <- find_and_remove_desc(exprs[[i]])
+ exprs[[i]] <- x[["quos"]]
+ sorts[[i]] <- arrow_eval(exprs[[i]], mask)
+ names(sorts)[i] <- format_expr(exprs[[i]])
+ if (inherits(sorts[[i]], "try-error")) {
+ msg <- paste("Expression", names(sorts)[i], "not supported in Arrow")
+ return(abandon_ship(call, .data, msg))
+ }
+ descs[i] <- x[["desc"]]
+ }
+ .data$arrange_vars <- c(sorts, .data$arrange_vars)
+ .data$arrange_desc <- c(descs, .data$arrange_desc)
+ .data
+}
+arrange.Dataset <- arrange.ArrowTabular <- arrange.arrow_dplyr_query
+
+# Helper to handle desc() in arrange()
+# * Takes a quosure as input
+# * Returns a list with two elements:
+# 1. The quosure with any wrapping parentheses and desc() removed
+# 2. A logical value indicating whether desc() was found
+# * Performs some other validation
+find_and_remove_desc <- function(quosure) {
+ expr <- quo_get_expr(quosure)
+ descending <- FALSE
+ if (length(all.vars(expr)) < 1L) {
+ stop(
+ "Expression in arrange() does not contain any field names: ",
+ deparse(expr),
+ call. = FALSE
+ )
+ }
+ # Use a while loop to remove any number of nested pairs of enclosing
+ # parentheses and any number of nested desc() calls. In the case of multiple
+ # nested desc() calls, each one toggles the sort order.
+ while (identical(typeof(expr), "language") && is.call(expr)) {
+ if (identical(expr[[1]], quote(`(`))) {
+ # remove enclosing parentheses
+ expr <- expr[[2]]
+ } else if (identical(expr[[1]], quote(desc))) {
+ # ensure desc() has only one argument (when an R expression is a function
+ # call, length == 2 means it has exactly one argument)
+ if (length(expr) > 2) {
+ stop("desc() expects only one argument", call. = FALSE)
+ }
+ # remove desc() and toggle descending
+ expr <- expr[[2]]
+ descending <- !descending
+ } else {
+ break
+ }
+ }
+ return(
+ list(
+ quos = quo_set_expr(quosure, expr),
+ desc = descending
+ )
+ )
+}
diff --git a/src/arrow/r/R/dplyr-collect.R b/src/arrow/r/R/dplyr-collect.R
new file mode 100644
index 000000000..13e68f3f4
--- /dev/null
+++ b/src/arrow/r/R/dplyr-collect.R
@@ -0,0 +1,121 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# The following S3 methods are registered on load if dplyr is present
+
+collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) {
+ # head and tail are not ExecNodes, at best we can handle them via sink node
+ # so if there are any steps done after head/tail, we need to
+ # evaluate the query up to then and then do a new query for the rest
+ if (is_collapsed(x) && has_head_tail(x$.data)) {
+ x$.data <- as_adq(dplyr::compute(x$.data))$.data
+ }
+
+ # See query-engine.R for ExecPlan/Nodes
+ tab <- do_exec_plan(x)
+ if (as_data_frame) {
+ df <- as.data.frame(tab)
+ tab$invalidate()
+ restore_dplyr_features(df, x)
+ } else {
+ restore_dplyr_features(tab, x)
+ }
+}
+collect.ArrowTabular <- function(x, as_data_frame = TRUE, ...) {
+ if (as_data_frame) {
+ as.data.frame(x, ...)
+ } else {
+ x
+ }
+}
+collect.Dataset <- function(x, ...) dplyr::collect(as_adq(x), ...)
+
+compute.arrow_dplyr_query <- function(x, ...) dplyr::collect(x, as_data_frame = FALSE)
+compute.ArrowTabular <- function(x, ...) x
+compute.Dataset <- compute.arrow_dplyr_query
+
+pull.arrow_dplyr_query <- function(.data, var = -1) {
+ .data <- as_adq(.data)
+ var <- vars_pull(names(.data), !!enquo(var))
+ .data$selected_columns <- set_names(.data$selected_columns[var], var)
+ dplyr::collect(.data)[[1]]
+}
+pull.Dataset <- pull.ArrowTabular <- pull.arrow_dplyr_query
+
+restore_dplyr_features <- function(df, query) {
+ # An arrow_dplyr_query holds some attributes that Arrow doesn't know about
+ # After calling collect(), make sure these features are carried over
+
+ if (length(query$group_by_vars) > 0) {
+ # Preserve groupings, if present
+ if (is.data.frame(df)) {
+ df <- dplyr::grouped_df(
+ df,
+ dplyr::group_vars(query),
+ drop = dplyr::group_by_drop_default(query)
+ )
+ } else {
+ # This is a Table, via compute() or collect(as_data_frame = FALSE)
+ df <- as_adq(df)
+ df$group_by_vars <- query$group_by_vars
+ df$drop_empty_groups <- query$drop_empty_groups
+ }
+ }
+ df
+}
+
+collapse.arrow_dplyr_query <- function(x, ...) {
+ # Figure out what schema will result from the query
+ x$schema <- implicit_schema(x)
+ # Nest inside a new arrow_dplyr_query (and keep groups)
+ restore_dplyr_features(arrow_dplyr_query(x), x)
+}
+collapse.Dataset <- collapse.ArrowTabular <- function(x, ...) {
+ arrow_dplyr_query(x)
+}
+
+implicit_schema <- function(.data) {
+ .data <- ensure_group_vars(.data)
+ old_schm <- .data$.data$schema
+
+ if (is.null(.data$aggregations)) {
+ new_fields <- map(.data$selected_columns, ~ .$type(old_schm))
+ if (!is.null(.data$join) && !(.data$join$type %in% JoinType[1:4])) {
+ # Add cols from right side, except for semi/anti joins
+ right_cols <- .data$join$right_data$selected_columns
+ new_fields <- c(new_fields, map(
+ right_cols[setdiff(names(right_cols), .data$join$by)],
+ ~ .$type(.data$join$right_data$.data$schema)
+ ))
+ }
+ } else {
+ new_fields <- map(summarize_projection(.data), ~ .$type(old_schm))
+ # * Put group_by_vars first (this can't be done by summarize,
+ # they have to be last per the aggregate node signature,
+ # and they get projected to this order after aggregation)
+ # * Infer the output types from the aggregations
+ group_fields <- new_fields[.data$group_by_vars]
+ hash <- length(.data$group_by_vars) > 0
+ agg_fields <- imap(
+ new_fields[setdiff(names(new_fields), .data$group_by_vars)],
+ ~ output_type(.data$aggregations[[.y]][["fun"]], .x, hash)
+ )
+ new_fields <- c(group_fields, agg_fields)
+ }
+ schema(!!!new_fields)
+}
diff --git a/src/arrow/r/R/dplyr-count.R b/src/arrow/r/R/dplyr-count.R
new file mode 100644
index 000000000..c567c285f
--- /dev/null
+++ b/src/arrow/r/R/dplyr-count.R
@@ -0,0 +1,60 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# The following S3 methods are registered on load if dplyr is present
+
+count.arrow_dplyr_query <- function(x, ..., wt = NULL, sort = FALSE, name = NULL) {
+ if (!missing(...)) {
+ out <- dplyr::group_by(x, ..., .add = TRUE)
+ } else {
+ out <- x
+ }
+ out <- dplyr::tally(out, wt = {{ wt }}, sort = sort, name = name)
+
+ # Restore original group vars
+ gv <- dplyr::group_vars(x)
+ if (length(gv)) {
+ out$group_by_vars <- gv
+ }
+
+ out
+}
+
+count.Dataset <- count.ArrowTabular <- count.arrow_dplyr_query
+
+#' @importFrom rlang sym :=
+tally.arrow_dplyr_query <- function(x, wt = NULL, sort = FALSE, name = NULL) {
+ check_name <- utils::getFromNamespace("check_name", "dplyr")
+ name <- check_name(name, dplyr::group_vars(x))
+
+ if (quo_is_null(enquo(wt))) {
+ out <- dplyr::summarize(x, !!name := n())
+ } else {
+ out <- dplyr::summarize(x, !!name := sum({{ wt }}, na.rm = TRUE))
+ }
+
+ if (sort) {
+ dplyr::arrange(out, desc(!!sym(name)))
+ } else {
+ out
+ }
+}
+
+tally.Dataset <- tally.ArrowTabular <- tally.arrow_dplyr_query
+
+# we don't want to depend on dplyr, but we refrence these above
+utils::globalVariables(c("n", "desc"))
diff --git a/src/arrow/r/R/dplyr-distinct.R b/src/arrow/r/R/dplyr-distinct.R
new file mode 100644
index 000000000..5dfcb641f
--- /dev/null
+++ b/src/arrow/r/R/dplyr-distinct.R
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# The following S3 methods are registered on load if dplyr is present
+
+distinct.arrow_dplyr_query <- function(.data, ..., .keep_all = FALSE) {
+ if (.keep_all == TRUE) {
+ # After ARROW-13993 is merged, we can implement this (ARROW-14045)
+ arrow_not_supported("`distinct()` with `.keep_all = TRUE`")
+ }
+
+ original_gv <- dplyr::group_vars(.data)
+ if (length(quos(...))) {
+ # group_by() calls mutate() if there are any expressions in ...
+ .data <- dplyr::group_by(.data, ..., .add = TRUE)
+ # `data %>% group_by() %>% summarise()` returns cols in order supplied
+ # but distinct() returns cols in dataset order, so sort group vars
+ .data$group_by_vars <- names(.data)[names(.data) %in% .data$group_by_vars]
+ } else {
+ # distinct() with no vars specified means distinct across all cols
+ .data <- dplyr::group_by(.data, !!!syms(names(.data)))
+ }
+
+ out <- dplyr::summarize(.data, .groups = "drop")
+ # distinct() doesn't modify group by vars, so restore the original ones
+ if (length(original_gv)) {
+ out$group_by_vars <- original_gv
+ }
+ out
+}
+
+distinct.Dataset <- distinct.ArrowTabular <- distinct.arrow_dplyr_query
diff --git a/src/arrow/r/R/dplyr-eval.R b/src/arrow/r/R/dplyr-eval.R
new file mode 100644
index 000000000..9d944ab80
--- /dev/null
+++ b/src/arrow/r/R/dplyr-eval.R
@@ -0,0 +1,123 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+arrow_eval <- function(expr, mask) {
+ # filter(), mutate(), etc. work by evaluating the quoted `exprs` to generate Expressions
+ # with references to Arrays (if .data is Table/RecordBatch) or Fields (if
+ # .data is a Dataset).
+
+ # This yields an Expression as long as the `exprs` are implemented in Arrow.
+ # Otherwise, it returns a try-error
+ tryCatch(eval_tidy(expr, mask), error = function(e) {
+ # Look for the cases where bad input was given, i.e. this would fail
+ # in regular dplyr anyway, and let those raise those as errors;
+ # else, for things not supported by Arrow return a "try-error",
+ # which we'll handle differently
+ msg <- conditionMessage(e)
+ if (getOption("arrow.debug", FALSE)) print(msg)
+ patterns <- .cache$i18ized_error_pattern
+ if (is.null(patterns)) {
+ patterns <- i18ize_error_messages()
+ # Memoize it
+ .cache$i18ized_error_pattern <- patterns
+ }
+ if (grepl(patterns, msg)) {
+ stop(e)
+ }
+
+ out <- structure(msg, class = "try-error", condition = e)
+ if (grepl("not supported.*Arrow", msg) || getOption("arrow.debug", FALSE)) {
+ # One of ours. Mark it so that consumers can handle it differently
+ class(out) <- c("arrow-try-error", class(out))
+ }
+ invisible(out)
+ })
+}
+
+handle_arrow_not_supported <- function(err, lab) {
+ # Look for informative message from the Arrow function version (see above)
+ if (inherits(err, "arrow-try-error")) {
+ # Include it if found
+ paste0("In ", lab, ", ", as.character(err))
+ } else {
+ # Otherwise be opaque (the original error is probably not useful)
+ paste("Expression", lab, "not supported in Arrow")
+ }
+}
+
+i18ize_error_messages <- function() {
+ # Figure out what the error messages will be with this LANGUAGE
+ # so that we can look for them
+ out <- list(
+ obj = tryCatch(eval(parse(text = "X_____X")), error = function(e) conditionMessage(e)),
+ fun = tryCatch(eval(parse(text = "X_____X()")), error = function(e) conditionMessage(e))
+ )
+ paste(map(out, ~ sub("X_____X", ".*", .)), collapse = "|")
+}
+
+# Helper to raise a common error
+arrow_not_supported <- function(msg) {
+ # TODO: raise a classed error?
+ stop(paste(msg, "not supported by Arrow"), call. = FALSE)
+}
+
+# Create a data mask for evaluating a dplyr expression
+arrow_mask <- function(.data, aggregation = FALSE) {
+ f_env <- new_environment(.cache$functions)
+
+ # Add functions that need to error hard and clear.
+ # Some R functions will still try to evaluate on an Expression
+ # and return NA with a warning
+ fail <- function(...) stop("Not implemented")
+ for (f in c("mean", "sd")) {
+ f_env[[f]] <- fail
+ }
+
+ if (aggregation) {
+ # This should probably be done with an environment inside an environment
+ # but a first attempt at that had scoping problems (ARROW-13499)
+ for (f in names(agg_funcs)) {
+ f_env[[f]] <- agg_funcs[[f]]
+ }
+ }
+
+ # Assign the schema to the expressions
+ map(.data$selected_columns, ~ (.$schema <- .data$.data$schema))
+
+ # Add the column references and make the mask
+ out <- new_data_mask(
+ new_environment(.data$selected_columns, parent = f_env),
+ f_env
+ )
+ # Then insert the data pronoun
+ # TODO: figure out what rlang::as_data_pronoun does/why we should use it
+ # (because if we do we get `Error: Can't modify the data pronoun` in mutate())
+ out$.data <- .data$selected_columns
+ out
+}
+
+format_expr <- function(x) {
+ if (is_quosure(x)) {
+ x <- quo_get_expr(x)
+ }
+ out <- deparse(x)
+ if (length(out) > 1) {
+ # Add ellipses because we are going to truncate
+ out[1] <- paste0(out[1], "...")
+ }
+ head(out, 1)
+}
diff --git a/src/arrow/r/R/dplyr-filter.R b/src/arrow/r/R/dplyr-filter.R
new file mode 100644
index 000000000..3c8c08ea5
--- /dev/null
+++ b/src/arrow/r/R/dplyr-filter.R
@@ -0,0 +1,91 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# The following S3 methods are registered on load if dplyr is present
+
+filter.arrow_dplyr_query <- function(.data, ..., .preserve = FALSE) {
+ # TODO something with the .preserve argument
+ filts <- quos(...)
+ if (length(filts) == 0) {
+ # Nothing to do
+ return(.data)
+ }
+
+ .data <- as_adq(.data)
+ # tidy-eval the filter expressions inside an Arrow data_mask
+ filters <- lapply(filts, arrow_eval, arrow_mask(.data))
+ bad_filters <- map_lgl(filters, ~ inherits(., "try-error"))
+ if (any(bad_filters)) {
+ # This is similar to abandon_ship() except that the filter eval is
+ # vectorized, and we apply filters that _did_ work before abandoning ship
+ # with the rest
+ expr_labs <- map_chr(filts[bad_filters], format_expr)
+ if (query_on_dataset(.data)) {
+ # Abort. We don't want to auto-collect if this is a Dataset because that
+ # could blow up, too big.
+ stop(
+ "Filter expression not supported for Arrow Datasets: ",
+ oxford_paste(expr_labs, quote = FALSE),
+ "\nCall collect() first to pull data into R.",
+ call. = FALSE
+ )
+ } else {
+ arrow_errors <- map2_chr(
+ filters[bad_filters], expr_labs,
+ handle_arrow_not_supported
+ )
+ if (length(arrow_errors) == 1) {
+ msg <- paste0(arrow_errors, "; ")
+ } else {
+ msg <- paste0("* ", arrow_errors, "\n", collapse = "")
+ }
+ warning(
+ msg, "pulling data into R",
+ immediate. = TRUE,
+ call. = FALSE
+ )
+ # Set any valid filters first, then collect and then apply the invalid ones in R
+ .data <- set_filters(.data, filters[!bad_filters])
+ return(dplyr::filter(dplyr::collect(.data), !!!filts[bad_filters]))
+ }
+ }
+
+ set_filters(.data, filters)
+}
+filter.Dataset <- filter.ArrowTabular <- filter.arrow_dplyr_query
+
+set_filters <- function(.data, expressions) {
+ if (length(expressions)) {
+ if (is_list_of(expressions, "Expression")) {
+ # expressions is a list of Expressions. AND them together and set them on .data
+ new_filter <- Reduce("&", expressions)
+ } else if (inherits(expressions, "Expression")) {
+ new_filter <- expressions
+ } else {
+ stop("filter expressions must be either an expression or a list of expressions", call. = FALSE)
+ }
+
+ if (isTRUE(.data$filtered_rows)) {
+ # TRUE is default (i.e. no filter yet), so we don't need to & with it
+ .data$filtered_rows <- new_filter
+ } else {
+ .data$filtered_rows <- .data$filtered_rows & new_filter
+ }
+ }
+ .data
+}
diff --git a/src/arrow/r/R/dplyr-functions.R b/src/arrow/r/R/dplyr-functions.R
new file mode 100644
index 000000000..717cdae96
--- /dev/null
+++ b/src/arrow/r/R/dplyr-functions.R
@@ -0,0 +1,1087 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+#' @include expression.R
+NULL
+
+# This environment is an internal cache for things including data mask functions
+# We'll populate it at package load time.
+.cache <- NULL
+init_env <- function() {
+ .cache <<- new.env(hash = TRUE)
+}
+init_env()
+
+# nse_funcs is a list of functions that operated on (and return) Expressions
+# These will be the basis for a data_mask inside dplyr methods
+# and will be added to .cache at package load time
+
+# Start with mappings from R function name spellings
+nse_funcs <- lapply(set_names(names(.array_function_map)), function(operator) {
+ force(operator)
+ function(...) build_expr(operator, ...)
+})
+
+# Now add functions to that list where the mapping from R to Arrow isn't 1:1
+# Each of these functions should have the same signature as the R function
+# they're replacing.
+#
+# When to use `build_expr()` vs. `Expression$create()`?
+#
+# Use `build_expr()` if you need to
+# (1) map R function names to Arrow C++ functions
+# (2) wrap R inputs (vectors) as Array/Scalar
+#
+# `Expression$create()` is lower level. Most of the functions below use it
+# because they manage the preparation of the user-provided inputs
+# and don't need to wrap scalars
+
+nse_funcs$cast <- function(x, target_type, safe = TRUE, ...) {
+ opts <- cast_options(safe, ...)
+ opts$to_type <- as_type(target_type)
+ Expression$create("cast", x, options = opts)
+}
+
+nse_funcs$coalesce <- function(...) {
+ args <- list2(...)
+ if (length(args) < 1) {
+ abort("At least one argument must be supplied to coalesce()")
+ }
+
+ # Treat NaN like NA for consistency with dplyr::coalesce(), but if *all*
+ # the values are NaN, we should return NaN, not NA, so don't replace
+ # NaN with NA in the final (or only) argument
+ # TODO: if an option is added to the coalesce kernel to treat NaN as NA,
+ # use that to simplify the code here (ARROW-13389)
+ attr(args[[length(args)]], "last") <- TRUE
+ args <- lapply(args, function(arg) {
+ last_arg <- is.null(attr(arg, "last"))
+ attr(arg, "last") <- NULL
+
+ if (!inherits(arg, "Expression")) {
+ arg <- Expression$scalar(arg)
+ }
+
+ # coalesce doesn't yet support factors/dictionaries
+ # TODO: remove this after ARROW-14167 is merged
+ if (nse_funcs$is.factor(arg)) {
+ warning("Dictionaries (in R: factors) are currently converted to strings (characters) in coalesce", call. = FALSE)
+ }
+
+ if (last_arg && arg$type_id() %in% TYPES_WITH_NAN) {
+ # store the NA_real_ in the same type as arg to avoid avoid casting
+ # smaller float types to larger float types
+ NA_expr <- Expression$scalar(Scalar$create(NA_real_, type = arg$type()))
+ Expression$create("if_else", Expression$create("is_nan", arg), NA_expr, arg)
+ } else {
+ arg
+ }
+ })
+ Expression$create("coalesce", args = args)
+}
+
+nse_funcs$is.na <- function(x) {
+ build_expr("is_null", x, options = list(nan_is_null = TRUE))
+}
+
+nse_funcs$is.nan <- function(x) {
+ if (is.double(x) || (inherits(x, "Expression") &&
+ x$type_id() %in% TYPES_WITH_NAN)) {
+ # TODO: if an option is added to the is_nan kernel to treat NA as NaN,
+ # use that to simplify the code here (ARROW-13366)
+ build_expr("is_nan", x) & build_expr("is_valid", x)
+ } else {
+ Expression$scalar(FALSE)
+ }
+}
+
+nse_funcs$is <- function(object, class2) {
+ if (is.string(class2)) {
+ switch(class2,
+ # for R data types, pass off to is.*() functions
+ character = nse_funcs$is.character(object),
+ numeric = nse_funcs$is.numeric(object),
+ integer = nse_funcs$is.integer(object),
+ integer64 = nse_funcs$is.integer64(object),
+ logical = nse_funcs$is.logical(object),
+ factor = nse_funcs$is.factor(object),
+ list = nse_funcs$is.list(object),
+ # for Arrow data types, compare class2 with object$type()$ToString(),
+ # but first strip off any parameters to only compare the top-level data
+ # type, and canonicalize class2
+ sub("^([^([<]+).*$", "\\1", object$type()$ToString()) ==
+ canonical_type_str(class2)
+ )
+ } else if (inherits(class2, "DataType")) {
+ object$type() == as_type(class2)
+ } else {
+ stop("Second argument to is() is not a string or DataType", call. = FALSE)
+ }
+}
+
+nse_funcs$dictionary_encode <- function(x,
+ null_encoding_behavior = c("mask", "encode")) {
+ behavior <- toupper(match.arg(null_encoding_behavior))
+ null_encoding_behavior <- NullEncodingBehavior[[behavior]]
+ Expression$create(
+ "dictionary_encode",
+ x,
+ options = list(null_encoding_behavior = null_encoding_behavior)
+ )
+}
+
+nse_funcs$between <- function(x, left, right) {
+ x >= left & x <= right
+}
+
+nse_funcs$is.finite <- function(x) {
+ is_fin <- Expression$create("is_finite", x)
+ # for compatibility with base::is.finite(), return FALSE for NA_real_
+ is_fin & !nse_funcs$is.na(is_fin)
+}
+
+nse_funcs$is.infinite <- function(x) {
+ is_inf <- Expression$create("is_inf", x)
+ # for compatibility with base::is.infinite(), return FALSE for NA_real_
+ is_inf & !nse_funcs$is.na(is_inf)
+}
+
+# as.* type casting functions
+# as.factor() is mapped in expression.R
+nse_funcs$as.character <- function(x) {
+ Expression$create("cast", x, options = cast_options(to_type = string()))
+}
+nse_funcs$as.double <- function(x) {
+ Expression$create("cast", x, options = cast_options(to_type = float64()))
+}
+nse_funcs$as.integer <- function(x) {
+ Expression$create(
+ "cast",
+ x,
+ options = cast_options(
+ to_type = int32(),
+ allow_float_truncate = TRUE,
+ allow_decimal_truncate = TRUE
+ )
+ )
+}
+nse_funcs$as.integer64 <- function(x) {
+ Expression$create(
+ "cast",
+ x,
+ options = cast_options(
+ to_type = int64(),
+ allow_float_truncate = TRUE,
+ allow_decimal_truncate = TRUE
+ )
+ )
+}
+nse_funcs$as.logical <- function(x) {
+ Expression$create("cast", x, options = cast_options(to_type = boolean()))
+}
+nse_funcs$as.numeric <- function(x) {
+ Expression$create("cast", x, options = cast_options(to_type = float64()))
+}
+
+# is.* type functions
+nse_funcs$is.character <- function(x) {
+ is.character(x) || (inherits(x, "Expression") &&
+ x$type_id() %in% Type[c("STRING", "LARGE_STRING")])
+}
+nse_funcs$is.numeric <- function(x) {
+ is.numeric(x) || (inherits(x, "Expression") && x$type_id() %in% Type[c(
+ "UINT8", "INT8", "UINT16", "INT16", "UINT32", "INT32",
+ "UINT64", "INT64", "HALF_FLOAT", "FLOAT", "DOUBLE",
+ "DECIMAL", "DECIMAL256"
+ )])
+}
+nse_funcs$is.double <- function(x) {
+ is.double(x) || (inherits(x, "Expression") && x$type_id() == Type["DOUBLE"])
+}
+nse_funcs$is.integer <- function(x) {
+ is.integer(x) || (inherits(x, "Expression") && x$type_id() %in% Type[c(
+ "UINT8", "INT8", "UINT16", "INT16", "UINT32", "INT32",
+ "UINT64", "INT64"
+ )])
+}
+nse_funcs$is.integer64 <- function(x) {
+ is.integer64(x) || (inherits(x, "Expression") && x$type_id() == Type["INT64"])
+}
+nse_funcs$is.logical <- function(x) {
+ is.logical(x) || (inherits(x, "Expression") && x$type_id() == Type["BOOL"])
+}
+nse_funcs$is.factor <- function(x) {
+ is.factor(x) || (inherits(x, "Expression") && x$type_id() == Type["DICTIONARY"])
+}
+nse_funcs$is.list <- function(x) {
+ is.list(x) || (inherits(x, "Expression") && x$type_id() %in% Type[c(
+ "LIST", "FIXED_SIZE_LIST", "LARGE_LIST"
+ )])
+}
+
+# rlang::is_* type functions
+nse_funcs$is_character <- function(x, n = NULL) {
+ assert_that(is.null(n))
+ nse_funcs$is.character(x)
+}
+nse_funcs$is_double <- function(x, n = NULL, finite = NULL) {
+ assert_that(is.null(n) && is.null(finite))
+ nse_funcs$is.double(x)
+}
+nse_funcs$is_integer <- function(x, n = NULL) {
+ assert_that(is.null(n))
+ nse_funcs$is.integer(x)
+}
+nse_funcs$is_list <- function(x, n = NULL) {
+ assert_that(is.null(n))
+ nse_funcs$is.list(x)
+}
+nse_funcs$is_logical <- function(x, n = NULL) {
+ assert_that(is.null(n))
+ nse_funcs$is.logical(x)
+}
+nse_funcs$is_timestamp <- function(x, n = NULL) {
+ assert_that(is.null(n))
+ inherits(x, "POSIXt") || (inherits(x, "Expression") && x$type_id() %in% Type[c("TIMESTAMP")])
+}
+
+# String functions
+nse_funcs$nchar <- function(x, type = "chars", allowNA = FALSE, keepNA = NA) {
+ if (allowNA) {
+ arrow_not_supported("allowNA = TRUE")
+ }
+ if (is.na(keepNA)) {
+ keepNA <- !identical(type, "width")
+ }
+ if (!keepNA) {
+ # TODO: I think there is a fill_null kernel we could use, set null to 2
+ arrow_not_supported("keepNA = TRUE")
+ }
+ if (identical(type, "bytes")) {
+ Expression$create("binary_length", x)
+ } else {
+ Expression$create("utf8_length", x)
+ }
+}
+
+nse_funcs$paste <- function(..., sep = " ", collapse = NULL, recycle0 = FALSE) {
+ assert_that(
+ is.null(collapse),
+ msg = "paste() with the collapse argument is not yet supported in Arrow"
+ )
+ if (!inherits(sep, "Expression")) {
+ assert_that(!is.na(sep), msg = "Invalid separator")
+ }
+ arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")(..., sep)
+}
+
+nse_funcs$paste0 <- function(..., collapse = NULL, recycle0 = FALSE) {
+ assert_that(
+ is.null(collapse),
+ msg = "paste0() with the collapse argument is not yet supported in Arrow"
+ )
+ arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")(..., "")
+}
+
+nse_funcs$str_c <- function(..., sep = "", collapse = NULL) {
+ assert_that(
+ is.null(collapse),
+ msg = "str_c() with the collapse argument is not yet supported in Arrow"
+ )
+ arrow_string_join_function(NullHandlingBehavior$EMIT_NULL)(..., sep)
+}
+
+arrow_string_join_function <- function(null_handling, null_replacement = NULL) {
+ # the `binary_join_element_wise` Arrow C++ compute kernel takes the separator
+ # as the last argument, so pass `sep` as the last dots arg to this function
+ function(...) {
+ args <- lapply(list(...), function(arg) {
+ # handle scalar literal args, and cast all args to string for
+ # consistency with base::paste(), base::paste0(), and stringr::str_c()
+ if (!inherits(arg, "Expression")) {
+ assert_that(
+ length(arg) == 1,
+ msg = "Literal vectors of length != 1 not supported in string concatenation"
+ )
+ Expression$scalar(as.character(arg))
+ } else {
+ nse_funcs$as.character(arg)
+ }
+ })
+ Expression$create(
+ "binary_join_element_wise",
+ args = args,
+ options = list(
+ null_handling = null_handling,
+ null_replacement = null_replacement
+ )
+ )
+ }
+}
+
+# Currently, Arrow does not supports a locale option for string case conversion
+# functions, contrast to stringr's API, so the 'locale' argument is only valid
+# for stringr's default value ("en"). The following are string functions that
+# take a 'locale' option as its second argument:
+# str_to_lower
+# str_to_upper
+# str_to_title
+#
+# Arrow locale will be supported with ARROW-14126
+stop_if_locale_provided <- function(locale) {
+ if (!identical(locale, "en")) {
+ stop("Providing a value for 'locale' other than the default ('en') is not supported by Arrow. ",
+ "To change locale, use 'Sys.setlocale()'",
+ call. = FALSE
+ )
+ }
+}
+
+nse_funcs$str_to_lower <- function(string, locale = "en") {
+ stop_if_locale_provided(locale)
+ Expression$create("utf8_lower", string)
+}
+
+nse_funcs$str_to_upper <- function(string, locale = "en") {
+ stop_if_locale_provided(locale)
+ Expression$create("utf8_upper", string)
+}
+
+nse_funcs$str_to_title <- function(string, locale = "en") {
+ stop_if_locale_provided(locale)
+ Expression$create("utf8_title", string)
+}
+
+nse_funcs$str_trim <- function(string, side = c("both", "left", "right")) {
+ side <- match.arg(side)
+ trim_fun <- switch(side,
+ left = "utf8_ltrim_whitespace",
+ right = "utf8_rtrim_whitespace",
+ both = "utf8_trim_whitespace"
+ )
+ Expression$create(trim_fun, string)
+}
+
+nse_funcs$substr <- function(x, start, stop) {
+ assert_that(
+ length(start) == 1,
+ msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+ assert_that(
+ length(stop) == 1,
+ msg = "`stop` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ # substr treats values as if they're on a continous number line, so values
+ # 0 are effectively blank characters - set `start` to 1 here so Arrow mimics
+ # this behavior
+ if (start <= 0) {
+ start <- 1
+ }
+
+ # if `stop` is lower than `start`, this is invalid, so set `stop` to
+ # 0 so that an empty string will be returned (consistent with base::substr())
+ if (stop < start) {
+ stop <- 0
+ }
+
+ Expression$create(
+ "utf8_slice_codeunits",
+ x,
+ # we don't need to subtract 1 from `stop` as C++ counts exclusively
+ # which effectively cancels out the difference in indexing between R & C++
+ options = list(start = start - 1L, stop = stop)
+ )
+}
+
+nse_funcs$substring <- function(text, first, last) {
+ nse_funcs$substr(x = text, start = first, stop = last)
+}
+
+nse_funcs$str_sub <- function(string, start = 1L, end = -1L) {
+ assert_that(
+ length(start) == 1,
+ msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+ assert_that(
+ length(end) == 1,
+ msg = "`end` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ # In stringr::str_sub, an `end` value of -1 means the end of the string, so
+ # set it to the maximum integer to match this behavior
+ if (end == -1) {
+ end <- .Machine$integer.max
+ }
+
+ # An end value lower than a start value returns an empty string in
+ # stringr::str_sub so set end to 0 here to match this behavior
+ if (end < start) {
+ end <- 0
+ }
+
+ # subtract 1 from `start` because C++ is 0-based and R is 1-based
+ # str_sub treats a `start` value of 0 or 1 as the same thing so don't subtract 1 when `start` == 0
+ # when `start` < 0, both str_sub and utf8_slice_codeunits count backwards from the end
+ if (start > 0) {
+ start <- start - 1L
+ }
+
+ Expression$create(
+ "utf8_slice_codeunits",
+ string,
+ options = list(start = start, stop = end)
+ )
+}
+
+nse_funcs$grepl <- function(pattern, x, ignore.case = FALSE, fixed = FALSE) {
+ arrow_fun <- ifelse(fixed, "match_substring", "match_substring_regex")
+ Expression$create(
+ arrow_fun,
+ x,
+ options = list(pattern = pattern, ignore_case = ignore.case)
+ )
+}
+
+nse_funcs$str_detect <- function(string, pattern, negate = FALSE) {
+ opts <- get_stringr_pattern_options(enexpr(pattern))
+ out <- nse_funcs$grepl(
+ pattern = opts$pattern,
+ x = string,
+ ignore.case = opts$ignore_case,
+ fixed = opts$fixed
+ )
+ if (negate) {
+ out <- !out
+ }
+ out
+}
+
+nse_funcs$str_like <- function(string, pattern, ignore_case = TRUE) {
+ Expression$create(
+ "match_like",
+ string,
+ options = list(pattern = pattern, ignore_case = ignore_case)
+ )
+}
+
+# Encapsulate some common logic for sub/gsub/str_replace/str_replace_all
+arrow_r_string_replace_function <- function(max_replacements) {
+ function(pattern, replacement, x, ignore.case = FALSE, fixed = FALSE) {
+ Expression$create(
+ ifelse(fixed && !ignore.case, "replace_substring", "replace_substring_regex"),
+ x,
+ options = list(
+ pattern = format_string_pattern(pattern, ignore.case, fixed),
+ replacement = format_string_replacement(replacement, ignore.case, fixed),
+ max_replacements = max_replacements
+ )
+ )
+ }
+}
+
+arrow_stringr_string_replace_function <- function(max_replacements) {
+ function(string, pattern, replacement) {
+ opts <- get_stringr_pattern_options(enexpr(pattern))
+ arrow_r_string_replace_function(max_replacements)(
+ pattern = opts$pattern,
+ replacement = replacement,
+ x = string,
+ ignore.case = opts$ignore_case,
+ fixed = opts$fixed
+ )
+ }
+}
+
+nse_funcs$sub <- arrow_r_string_replace_function(1L)
+nse_funcs$gsub <- arrow_r_string_replace_function(-1L)
+nse_funcs$str_replace <- arrow_stringr_string_replace_function(1L)
+nse_funcs$str_replace_all <- arrow_stringr_string_replace_function(-1L)
+
+nse_funcs$strsplit <- function(x,
+ split,
+ fixed = FALSE,
+ perl = FALSE,
+ useBytes = FALSE) {
+ assert_that(is.string(split))
+
+ arrow_fun <- ifelse(fixed, "split_pattern", "split_pattern_regex")
+ # warn when the user specifies both fixed = TRUE and perl = TRUE, for
+ # consistency with the behavior of base::strsplit()
+ if (fixed && perl) {
+ warning("Argument 'perl = TRUE' will be ignored", call. = FALSE)
+ }
+ # since split is not a regex, proceed without any warnings or errors regardless
+ # of the value of perl, for consistency with the behavior of base::strsplit()
+ Expression$create(
+ arrow_fun,
+ x,
+ options = list(pattern = split, reverse = FALSE, max_splits = -1L)
+ )
+}
+
+nse_funcs$str_split <- function(string, pattern, n = Inf, simplify = FALSE) {
+ opts <- get_stringr_pattern_options(enexpr(pattern))
+ arrow_fun <- ifelse(opts$fixed, "split_pattern", "split_pattern_regex")
+ if (opts$ignore_case) {
+ arrow_not_supported("Case-insensitive string splitting")
+ }
+ if (n == 0) {
+ arrow_not_supported("Splitting strings into zero parts")
+ }
+ if (identical(n, Inf)) {
+ n <- 0L
+ }
+ if (simplify) {
+ warning("Argument 'simplify = TRUE' will be ignored", call. = FALSE)
+ }
+ # The max_splits option in the Arrow C++ library controls the maximum number
+ # of places at which the string is split, whereas the argument n to
+ # str_split() controls the maximum number of pieces to return. So we must
+ # subtract 1 from n to get max_splits.
+ Expression$create(
+ arrow_fun,
+ string,
+ options = list(
+ pattern = opts$pattern,
+ reverse = FALSE,
+ max_splits = n - 1L
+ )
+ )
+}
+
+nse_funcs$pmin <- function(..., na.rm = FALSE) {
+ build_expr(
+ "min_element_wise",
+ ...,
+ options = list(skip_nulls = na.rm)
+ )
+}
+
+nse_funcs$pmax <- function(..., na.rm = FALSE) {
+ build_expr(
+ "max_element_wise",
+ ...,
+ options = list(skip_nulls = na.rm)
+ )
+}
+
+nse_funcs$str_pad <- function(string, width, side = c("left", "right", "both"), pad = " ") {
+ assert_that(is_integerish(width))
+ side <- match.arg(side)
+ assert_that(is.string(pad))
+
+ if (side == "left") {
+ pad_func <- "utf8_lpad"
+ } else if (side == "right") {
+ pad_func <- "utf8_rpad"
+ } else if (side == "both") {
+ pad_func <- "utf8_center"
+ }
+
+ Expression$create(
+ pad_func,
+ string,
+ options = list(width = width, padding = pad)
+ )
+}
+
+nse_funcs$startsWith <- function(x, prefix) {
+ Expression$create(
+ "starts_with",
+ x,
+ options = list(pattern = prefix)
+ )
+}
+
+nse_funcs$endsWith <- function(x, suffix) {
+ Expression$create(
+ "ends_with",
+ x,
+ options = list(pattern = suffix)
+ )
+}
+
+nse_funcs$str_starts <- function(string, pattern, negate = FALSE) {
+ opts <- get_stringr_pattern_options(enexpr(pattern))
+ if (opts$fixed) {
+ out <- nse_funcs$startsWith(x = string, prefix = opts$pattern)
+ } else {
+ out <- nse_funcs$grepl(pattern = paste0("^", opts$pattern), x = string, fixed = FALSE)
+ }
+
+ if (negate) {
+ out <- !out
+ }
+ out
+}
+
+nse_funcs$str_ends <- function(string, pattern, negate = FALSE) {
+ opts <- get_stringr_pattern_options(enexpr(pattern))
+ if (opts$fixed) {
+ out <- nse_funcs$endsWith(x = string, suffix = opts$pattern)
+ } else {
+ out <- nse_funcs$grepl(pattern = paste0(opts$pattern, "$"), x = string, fixed = FALSE)
+ }
+
+ if (negate) {
+ out <- !out
+ }
+ out
+}
+
+nse_funcs$str_count <- function(string, pattern) {
+ opts <- get_stringr_pattern_options(enexpr(pattern))
+ if (!is.string(pattern)) {
+ arrow_not_supported("`pattern` must be a length 1 character vector; other values")
+ }
+ arrow_fun <- ifelse(opts$fixed, "count_substring", "count_substring_regex")
+ Expression$create(
+ arrow_fun,
+ string,
+ options = list(pattern = opts$pattern, ignore_case = opts$ignore_case)
+ )
+}
+
+# String function helpers
+
+# format `pattern` as needed for case insensitivity and literal matching by RE2
+format_string_pattern <- function(pattern, ignore.case, fixed) {
+ # Arrow lacks native support for case-insensitive literal string matching and
+ # replacement, so we use the regular expression engine (RE2) to do this.
+ # https://github.com/google/re2/wiki/Syntax
+ if (ignore.case) {
+ if (fixed) {
+ # Everything between "\Q" and "\E" is treated as literal text.
+ # If the search text contains any literal "\E" strings, make them
+ # lowercase so they won't signal the end of the literal text:
+ pattern <- gsub("\\E", "\\e", pattern, fixed = TRUE)
+ pattern <- paste0("\\Q", pattern, "\\E")
+ }
+ # Prepend "(?i)" for case-insensitive matching
+ pattern <- paste0("(?i)", pattern)
+ }
+ pattern
+}
+
+# format `replacement` as needed for literal replacement by RE2
+format_string_replacement <- function(replacement, ignore.case, fixed) {
+ # Arrow lacks native support for case-insensitive literal string
+ # replacement, so we use the regular expression engine (RE2) to do this.
+ # https://github.com/google/re2/wiki/Syntax
+ if (ignore.case && fixed) {
+ # Escape single backslashes in the regex replacement text so they are
+ # interpreted as literal backslashes:
+ replacement <- gsub("\\", "\\\\", replacement, fixed = TRUE)
+ }
+ replacement
+}
+
+#' Get `stringr` pattern options
+#'
+#' This function assigns definitions for the `stringr` pattern modifier
+#' functions (`fixed()`, `regex()`, etc.) inside itself, and uses them to
+#' evaluate the quoted expression `pattern`, returning a list that is used
+#' to control pattern matching behavior in internal `arrow` functions.
+#'
+#' @param pattern Unevaluated expression containing a call to a `stringr`
+#' pattern modifier function
+#'
+#' @return List containing elements `pattern`, `fixed`, and `ignore_case`
+#' @keywords internal
+get_stringr_pattern_options <- function(pattern) {
+ fixed <- function(pattern, ignore_case = FALSE, ...) {
+ check_dots(...)
+ list(pattern = pattern, fixed = TRUE, ignore_case = ignore_case)
+ }
+ regex <- function(pattern, ignore_case = FALSE, ...) {
+ check_dots(...)
+ list(pattern = pattern, fixed = FALSE, ignore_case = ignore_case)
+ }
+ coll <- function(...) {
+ arrow_not_supported("Pattern modifier `coll()`")
+ }
+ boundary <- function(...) {
+ arrow_not_supported("Pattern modifier `boundary()`")
+ }
+ check_dots <- function(...) {
+ dots <- list(...)
+ if (length(dots)) {
+ warning(
+ "Ignoring pattern modifier ",
+ ngettext(length(dots), "argument ", "arguments "),
+ "not supported in Arrow: ",
+ oxford_paste(names(dots)),
+ call. = FALSE
+ )
+ }
+ }
+ ensure_opts <- function(opts) {
+ if (is.character(opts)) {
+ opts <- list(pattern = opts, fixed = FALSE, ignore_case = FALSE)
+ }
+ opts
+ }
+ ensure_opts(eval(pattern))
+}
+
+#' Does this string contain regex metacharacters?
+#'
+#' @param string String to be tested
+#' @keywords internal
+#' @return Logical: does `string` contain regex metacharacters?
+contains_regex <- function(string) {
+ grepl("[.\\|()[{^$*+?]", string)
+}
+
+nse_funcs$strptime <- function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, unit = "ms") {
+ # Arrow uses unit for time parsing, strptime() does not.
+ # Arrow has no default option for strptime (format, unit),
+ # we suggest following format = "%Y-%m-%d %H:%M:%S", unit = MILLI/1L/"ms",
+ # (ARROW-12809)
+
+ # ParseTimestampStrptime currently ignores the timezone information (ARROW-12820).
+ # Stop if tz is provided.
+ if (is.character(tz)) {
+ arrow_not_supported("Time zone argument")
+ }
+
+ unit <- make_valid_time_unit(unit, c(valid_time64_units, valid_time32_units))
+
+ Expression$create("strptime", x, options = list(format = format, unit = unit))
+}
+
+nse_funcs$strftime <- function(x, format = "", tz = "", usetz = FALSE) {
+ if (usetz) {
+ format <- paste(format, "%Z")
+ }
+ if (tz == "") {
+ tz <- Sys.timezone()
+ }
+ # Arrow's strftime prints in timezone of the timestamp. To match R's strftime behavior we first
+ # cast the timestamp to desired timezone. This is a metadata only change.
+ if (nse_funcs$is_timestamp(x)) {
+ ts <- Expression$create("cast", x, options = list(to_type = timestamp(x$type()$unit(), tz)))
+ } else {
+ ts <- x
+ }
+ Expression$create("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME")))
+}
+
+nse_funcs$format_ISO8601 <- function(x, usetz = FALSE, precision = NULL, ...) {
+ ISO8601_precision_map <-
+ list(
+ y = "%Y",
+ ym = "%Y-%m",
+ ymd = "%Y-%m-%d",
+ ymdh = "%Y-%m-%dT%H",
+ ymdhm = "%Y-%m-%dT%H:%M",
+ ymdhms = "%Y-%m-%dT%H:%M:%S"
+ )
+
+ if (is.null(precision)) {
+ precision <- "ymdhms"
+ }
+ if (!precision %in% names(ISO8601_precision_map)) {
+ abort(
+ paste(
+ "`precision` must be one of the following values:",
+ paste(names(ISO8601_precision_map), collapse = ", "),
+ "\nValue supplied was: ",
+ precision
+ )
+ )
+ }
+ format <- ISO8601_precision_map[[precision]]
+ if (usetz) {
+ format <- paste0(format, "%z")
+ }
+ Expression$create("strftime", x, options = list(format = format, locale = "C"))
+}
+
+nse_funcs$second <- function(x) {
+ Expression$create("add", Expression$create("second", x), Expression$create("subsecond", x))
+}
+
+nse_funcs$trunc <- function(x, ...) {
+ # accepts and ignores ... for consistency with base::trunc()
+ build_expr("trunc", x)
+}
+
+nse_funcs$round <- function(x, digits = 0) {
+ build_expr(
+ "round",
+ x,
+ options = list(ndigits = digits, round_mode = RoundMode$HALF_TO_EVEN)
+ )
+}
+
+nse_funcs$wday <- function(x,
+ label = FALSE,
+ abbr = TRUE,
+ week_start = getOption("lubridate.week.start", 7),
+ locale = Sys.getlocale("LC_TIME")) {
+ if (label) {
+ if (abbr) {
+ format <- "%a"
+ } else {
+ format <- "%A"
+ }
+ return(Expression$create("strftime", x, options = list(format = format, locale = locale)))
+ }
+
+ Expression$create("day_of_week", x, options = list(count_from_zero = FALSE, week_start = week_start))
+}
+
+nse_funcs$log <- nse_funcs$logb <- function(x, base = exp(1)) {
+ # like other binary functions, either `x` or `base` can be Expression or double(1)
+ if (is.numeric(x) && length(x) == 1) {
+ x <- Expression$scalar(x)
+ } else if (!inherits(x, "Expression")) {
+ arrow_not_supported("x must be a column or a length-1 numeric; other values")
+ }
+
+ # handle `base` differently because we use the simpler ln, log2, and log10
+ # functions for specific scalar base values
+ if (inherits(base, "Expression")) {
+ return(Expression$create("logb_checked", x, base))
+ }
+
+ if (!is.numeric(base) || length(base) != 1) {
+ arrow_not_supported("base must be a column or a length-1 numeric; other values")
+ }
+
+ if (base == exp(1)) {
+ return(Expression$create("ln_checked", x))
+ }
+
+ if (base == 2) {
+ return(Expression$create("log2_checked", x))
+ }
+
+ if (base == 10) {
+ return(Expression$create("log10_checked", x))
+ }
+
+ Expression$create("logb_checked", x, Expression$scalar(base))
+}
+
+nse_funcs$if_else <- function(condition, true, false, missing = NULL) {
+ if (!is.null(missing)) {
+ return(nse_funcs$if_else(
+ nse_funcs$is.na(condition),
+ missing,
+ nse_funcs$if_else(condition, true, false)
+ ))
+ }
+
+ # if_else doesn't yet support factors/dictionaries
+ # TODO: remove this after ARROW-13358 is merged
+ warn_types <- nse_funcs$is.factor(true) | nse_funcs$is.factor(false)
+ if (warn_types) {
+ warning(
+ "Dictionaries (in R: factors) are currently converted to strings (characters) ",
+ "in if_else and ifelse",
+ call. = FALSE
+ )
+ }
+
+ build_expr("if_else", condition, true, false)
+}
+
+# Although base R ifelse allows `yes` and `no` to be different classes
+nse_funcs$ifelse <- function(test, yes, no) {
+ nse_funcs$if_else(condition = test, true = yes, false = no)
+}
+
+nse_funcs$case_when <- function(...) {
+ formulas <- list2(...)
+ n <- length(formulas)
+ if (n == 0) {
+ abort("No cases provided in case_when()")
+ }
+ query <- vector("list", n)
+ value <- vector("list", n)
+ mask <- caller_env()
+ for (i in seq_len(n)) {
+ f <- formulas[[i]]
+ if (!inherits(f, "formula")) {
+ abort("Each argument to case_when() must be a two-sided formula")
+ }
+ query[[i]] <- arrow_eval(f[[2]], mask)
+ value[[i]] <- arrow_eval(f[[3]], mask)
+ if (!nse_funcs$is.logical(query[[i]])) {
+ abort("Left side of each formula in case_when() must be a logical expression")
+ }
+ if (inherits(value[[i]], "try-error")) {
+ abort(handle_arrow_not_supported(value[[i]], format_expr(f[[3]])))
+ }
+ }
+ build_expr(
+ "case_when",
+ args = c(
+ build_expr(
+ "make_struct",
+ args = query,
+ options = list(field_names = as.character(seq_along(query)))
+ ),
+ value
+ )
+ )
+}
+
+# Aggregation functions
+# These all return a list of:
+# @param fun string function name
+# @param data Expression (these are all currently a single field)
+# @param options list of function options, as passed to call_function
+# For group-by aggregation, `hash_` gets prepended to the function name.
+# So to see a list of available hash aggregation functions,
+# you can use list_compute_functions("^hash_")
+agg_funcs <- list()
+agg_funcs$sum <- function(..., na.rm = FALSE) {
+ list(
+ fun = "sum",
+ data = ensure_one_arg(list2(...), "sum"),
+ options = list(skip_nulls = na.rm, min_count = 0L)
+ )
+}
+agg_funcs$any <- function(..., na.rm = FALSE) {
+ list(
+ fun = "any",
+ data = ensure_one_arg(list2(...), "any"),
+ options = list(skip_nulls = na.rm, min_count = 0L)
+ )
+}
+agg_funcs$all <- function(..., na.rm = FALSE) {
+ list(
+ fun = "all",
+ data = ensure_one_arg(list2(...), "all"),
+ options = list(skip_nulls = na.rm, min_count = 0L)
+ )
+}
+agg_funcs$mean <- function(x, na.rm = FALSE) {
+ list(
+ fun = "mean",
+ data = x,
+ options = list(skip_nulls = na.rm, min_count = 0L)
+ )
+}
+agg_funcs$sd <- function(x, na.rm = FALSE, ddof = 1) {
+ list(
+ fun = "stddev",
+ data = x,
+ options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof)
+ )
+}
+agg_funcs$var <- function(x, na.rm = FALSE, ddof = 1) {
+ list(
+ fun = "variance",
+ data = x,
+ options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof)
+ )
+}
+agg_funcs$quantile <- function(x, probs, na.rm = FALSE) {
+ if (length(probs) != 1) {
+ arrow_not_supported("quantile() with length(probs) != 1")
+ }
+ # TODO: Bind to the Arrow function that returns an exact quantile and remove
+ # this warning (ARROW-14021)
+ warn(
+ "quantile() currently returns an approximate quantile in Arrow",
+ .frequency = ifelse(is_interactive(), "once", "always"),
+ .frequency_id = "arrow.quantile.approximate"
+ )
+ list(
+ fun = "tdigest",
+ data = x,
+ options = list(skip_nulls = na.rm, q = probs)
+ )
+}
+agg_funcs$median <- function(x, na.rm = FALSE) {
+ # TODO: Bind to the Arrow function that returns an exact median and remove
+ # this warning (ARROW-14021)
+ warn(
+ "median() currently returns an approximate median in Arrow",
+ .frequency = ifelse(is_interactive(), "once", "always"),
+ .frequency_id = "arrow.median.approximate"
+ )
+ list(
+ fun = "approximate_median",
+ data = x,
+ options = list(skip_nulls = na.rm)
+ )
+}
+agg_funcs$n_distinct <- function(..., na.rm = FALSE) {
+ list(
+ fun = "count_distinct",
+ data = ensure_one_arg(list2(...), "n_distinct"),
+ options = list(na.rm = na.rm)
+ )
+}
+agg_funcs$n <- function() {
+ list(
+ fun = "sum",
+ data = Expression$scalar(1L),
+ options = list()
+ )
+}
+agg_funcs$min <- function(..., na.rm = FALSE) {
+ list(
+ fun = "min",
+ data = ensure_one_arg(list2(...), "min"),
+ options = list(skip_nulls = na.rm, min_count = 0L)
+ )
+}
+agg_funcs$max <- function(..., na.rm = FALSE) {
+ list(
+ fun = "max",
+ data = ensure_one_arg(list2(...), "max"),
+ options = list(skip_nulls = na.rm, min_count = 0L)
+ )
+}
+
+ensure_one_arg <- function(args, fun) {
+ if (length(args) == 0) {
+ arrow_not_supported(paste0(fun, "() with 0 arguments"))
+ } else if (length(args) > 1) {
+ arrow_not_supported(paste0("Multiple arguments to ", fun, "()"))
+ }
+ args[[1]]
+}
+
+output_type <- function(fun, input_type, hash) {
+ # These are quick and dirty heuristics.
+ if (fun %in% c("any", "all")) {
+ bool()
+ } else if (fun %in% "sum") {
+ # It may upcast to a bigger type but this is close enough
+ input_type
+ } else if (fun %in% c("mean", "stddev", "variance", "approximate_median")) {
+ float64()
+ } else if (fun %in% "tdigest") {
+ if (hash) {
+ fixed_size_list_of(float64(), 1L)
+ } else {
+ float64()
+ }
+ } else {
+ # Just so things don't error, assume the resulting type is the same
+ input_type
+ }
+}
diff --git a/src/arrow/r/R/dplyr-group-by.R b/src/arrow/r/R/dplyr-group-by.R
new file mode 100644
index 000000000..66b867210
--- /dev/null
+++ b/src/arrow/r/R/dplyr-group-by.R
@@ -0,0 +1,86 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# The following S3 methods are registered on load if dplyr is present
+
+group_by.arrow_dplyr_query <- function(.data,
+ ...,
+ .add = FALSE,
+ add = .add,
+ .drop = dplyr::group_by_drop_default(.data)) {
+ .data <- as_adq(.data)
+ new_groups <- enquos(...)
+ # ... can contain expressions (i.e. can add (or rename?) columns) and so we
+ # need to identify those and add them on to the query with mutate. Specifically,
+ # we want to mark as new:
+ # * expressions (named or otherwise)
+ # * variables that have new names
+ # All others (i.e. simple references to variables) should not be (re)-added
+
+ # Identify any groups with names which aren't in names of .data
+ new_group_ind <- map_lgl(new_groups, ~ !(quo_name(.x) %in% names(.data)))
+ # Identify any groups which don't have names
+ named_group_ind <- map_lgl(names(new_groups), nzchar)
+ # Retain any new groups identified above
+ new_groups <- new_groups[new_group_ind | named_group_ind]
+ if (length(new_groups)) {
+ # now either use the name that was given in ... or if that is "" then use the expr
+ names(new_groups) <- imap_chr(new_groups, ~ ifelse(.y == "", quo_name(.x), .y))
+
+ # Add them to the data
+ .data <- dplyr::mutate(.data, !!!new_groups)
+ }
+ if (".add" %in% names(formals(dplyr::group_by))) {
+ # For compatibility with dplyr >= 1.0
+ gv <- dplyr::group_by_prepare(.data, ..., .add = .add)$group_names
+ } else {
+ gv <- dplyr::group_by_prepare(.data, ..., add = add)$group_names
+ }
+ .data$group_by_vars <- gv
+ .data$drop_empty_groups <- ifelse(length(gv), .drop, dplyr::group_by_drop_default(.data))
+ .data
+}
+group_by.Dataset <- group_by.ArrowTabular <- group_by.arrow_dplyr_query
+
+groups.arrow_dplyr_query <- function(x) syms(dplyr::group_vars(x))
+groups.Dataset <- groups.ArrowTabular <- function(x) NULL
+
+group_vars.arrow_dplyr_query <- function(x) x$group_by_vars
+group_vars.Dataset <- function(x) NULL
+group_vars.RecordBatchReader <- function(x) NULL
+group_vars.ArrowTabular <- function(x) {
+ x$r_metadata$attributes$.group_vars
+}
+
+# the logical literal in the two functions below controls the default value of
+# the .drop argument to group_by()
+group_by_drop_default.arrow_dplyr_query <-
+ function(.tbl) .tbl$drop_empty_groups %||% TRUE
+group_by_drop_default.Dataset <- group_by_drop_default.ArrowTabular <-
+ function(.tbl) TRUE
+
+ungroup.arrow_dplyr_query <- function(x, ...) {
+ x$group_by_vars <- character()
+ x$drop_empty_groups <- NULL
+ x
+}
+ungroup.Dataset <- force
+ungroup.ArrowTabular <- function(x) {
+ x$r_metadata$attributes$.group_vars <- NULL
+ x
+}
diff --git a/src/arrow/r/R/dplyr-join.R b/src/arrow/r/R/dplyr-join.R
new file mode 100644
index 000000000..c14b1a8f3
--- /dev/null
+++ b/src/arrow/r/R/dplyr-join.R
@@ -0,0 +1,126 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# The following S3 methods are registered on load if dplyr is present
+
+do_join <- function(x,
+ y,
+ by = NULL,
+ copy = FALSE,
+ suffix = c(".x", ".y"),
+ ...,
+ keep = FALSE,
+ na_matches,
+ join_type) {
+ # TODO: handle `copy` arg: ignore?
+ # TODO: handle `suffix` arg: Arrow does prefix
+ # TODO: handle `keep` arg: "Should the join keys from both ‘x’ and ‘y’ be preserved in the output?"
+ # TODO: handle `na_matches` arg
+ x <- as_adq(x)
+ y <- as_adq(y)
+ by <- handle_join_by(by, x, y)
+
+ x$join <- list(
+ type = JoinType[[join_type]],
+ right_data = y,
+ by = by
+ )
+ collapse.arrow_dplyr_query(x)
+}
+
+left_join.arrow_dplyr_query <- function(x,
+ y,
+ by = NULL,
+ copy = FALSE,
+ suffix = c(".x", ".y"),
+ ...,
+ keep = FALSE) {
+ do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "LEFT_OUTER")
+}
+left_join.Dataset <- left_join.ArrowTabular <- left_join.arrow_dplyr_query
+
+right_join.arrow_dplyr_query <- function(x,
+ y,
+ by = NULL,
+ copy = FALSE,
+ suffix = c(".x", ".y"),
+ ...,
+ keep = FALSE) {
+ do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "RIGHT_OUTER")
+}
+right_join.Dataset <- right_join.ArrowTabular <- right_join.arrow_dplyr_query
+
+inner_join.arrow_dplyr_query <- function(x,
+ y,
+ by = NULL,
+ copy = FALSE,
+ suffix = c(".x", ".y"),
+ ...,
+ keep = FALSE) {
+ do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "INNER")
+}
+inner_join.Dataset <- inner_join.ArrowTabular <- inner_join.arrow_dplyr_query
+
+full_join.arrow_dplyr_query <- function(x,
+ y,
+ by = NULL,
+ copy = FALSE,
+ suffix = c(".x", ".y"),
+ ...,
+ keep = FALSE) {
+ do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "FULL_OUTER")
+}
+full_join.Dataset <- full_join.ArrowTabular <- full_join.arrow_dplyr_query
+
+semi_join.arrow_dplyr_query <- function(x,
+ y,
+ by = NULL,
+ copy = FALSE,
+ suffix = c(".x", ".y"),
+ ...,
+ keep = FALSE) {
+ do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "LEFT_SEMI")
+}
+semi_join.Dataset <- semi_join.ArrowTabular <- semi_join.arrow_dplyr_query
+
+anti_join.arrow_dplyr_query <- function(x,
+ y,
+ by = NULL,
+ copy = FALSE,
+ suffix = c(".x", ".y"),
+ ...,
+ keep = FALSE) {
+ do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "LEFT_ANTI")
+}
+anti_join.Dataset <- anti_join.ArrowTabular <- anti_join.arrow_dplyr_query
+
+handle_join_by <- function(by, x, y) {
+ if (is.null(by)) {
+ return(set_names(intersect(names(x), names(y))))
+ }
+ stopifnot(is.character(by))
+ if (is.null(names(by))) {
+ by <- set_names(by)
+ }
+ # TODO: nicer messages?
+ stopifnot(
+ all(names(by) %in% names(x)),
+ all(by %in% names(y))
+ )
+ by
+}
diff --git a/src/arrow/r/R/dplyr-mutate.R b/src/arrow/r/R/dplyr-mutate.R
new file mode 100644
index 000000000..2e5239484
--- /dev/null
+++ b/src/arrow/r/R/dplyr-mutate.R
@@ -0,0 +1,140 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# The following S3 methods are registered on load if dplyr is present
+
+mutate.arrow_dplyr_query <- function(.data,
+ ...,
+ .keep = c("all", "used", "unused", "none"),
+ .before = NULL,
+ .after = NULL) {
+ call <- match.call()
+ exprs <- ensure_named_exprs(quos(...))
+
+ .keep <- match.arg(.keep)
+ .before <- enquo(.before)
+ .after <- enquo(.after)
+
+ if (.keep %in% c("all", "unused") && length(exprs) == 0) {
+ # Nothing to do
+ return(.data)
+ }
+
+ .data <- as_adq(.data)
+
+ # Restrict the cases we support for now
+ has_aggregations <- any(unlist(lapply(exprs, all_funs)) %in% names(agg_funcs))
+ if (has_aggregations) {
+ # ARROW-13926
+ # mutate() on a grouped dataset does calculations within groups
+ # This doesn't matter on scalar ops (arithmetic etc.) but it does
+ # for things with aggregations (e.g. subtracting the mean)
+ return(abandon_ship(call, .data, "window functions not currently supported in Arrow"))
+ }
+
+ mask <- arrow_mask(.data)
+ results <- list()
+ for (i in seq_along(exprs)) {
+ # Iterate over the indices and not the names because names may be repeated
+ # (which overwrites the previous name)
+ new_var <- names(exprs)[i]
+ results[[new_var]] <- arrow_eval(exprs[[i]], mask)
+ if (inherits(results[[new_var]], "try-error")) {
+ msg <- handle_arrow_not_supported(
+ results[[new_var]],
+ format_expr(exprs[[i]])
+ )
+ return(abandon_ship(call, .data, msg))
+ } else if (!inherits(results[[new_var]], "Expression") &&
+ !is.null(results[[new_var]])) {
+ # We need some wrapping to handle literal values
+ if (length(results[[new_var]]) != 1) {
+ msg <- paste0("In ", new_var, " = ", format_expr(exprs[[i]]), ", only values of size one are recycled")
+ return(abandon_ship(call, .data, msg))
+ }
+ results[[new_var]] <- Expression$scalar(results[[new_var]])
+ }
+ # Put it in the data mask too
+ mask[[new_var]] <- mask$.data[[new_var]] <- results[[new_var]]
+ }
+
+ old_vars <- names(.data$selected_columns)
+ # Note that this is names(exprs) not names(results):
+ # if results$new_var is NULL, that means we are supposed to remove it
+ new_vars <- names(exprs)
+
+ # Assign the new columns into the .data$selected_columns
+ for (new_var in new_vars) {
+ .data$selected_columns[[new_var]] <- results[[new_var]]
+ }
+
+ # Deduplicate new_vars and remove NULL columns from new_vars
+ new_vars <- intersect(new_vars, names(.data$selected_columns))
+
+ # Respect .before and .after
+ if (!quo_is_null(.before) || !quo_is_null(.after)) {
+ new <- setdiff(new_vars, old_vars)
+ .data <- dplyr::relocate(.data, all_of(new), .before = !!.before, .after = !!.after)
+ }
+
+ # Respect .keep
+ if (.keep == "none") {
+ .data$selected_columns <- .data$selected_columns[new_vars]
+ } else if (.keep != "all") {
+ # "used" or "unused"
+ used_vars <- unlist(lapply(exprs, all.vars), use.names = FALSE)
+ if (.keep == "used") {
+ .data$selected_columns[setdiff(old_vars, used_vars)] <- NULL
+ } else {
+ # "unused"
+ .data$selected_columns[intersect(old_vars, used_vars)] <- NULL
+ }
+ }
+ # Even if "none", we still keep group vars
+ ensure_group_vars(.data)
+}
+mutate.Dataset <- mutate.ArrowTabular <- mutate.arrow_dplyr_query
+
+transmute.arrow_dplyr_query <- function(.data, ...) {
+ dots <- check_transmute_args(...)
+ dplyr::mutate(.data, !!!dots, .keep = "none")
+}
+transmute.Dataset <- transmute.ArrowTabular <- transmute.arrow_dplyr_query
+
+# This function is a copy of dplyr:::check_transmute_args at
+# https://github.com/tidyverse/dplyr/blob/master/R/mutate.R
+check_transmute_args <- function(..., .keep, .before, .after) {
+ if (!missing(.keep)) {
+ abort("`transmute()` does not support the `.keep` argument")
+ }
+ if (!missing(.before)) {
+ abort("`transmute()` does not support the `.before` argument")
+ }
+ if (!missing(.after)) {
+ abort("`transmute()` does not support the `.after` argument")
+ }
+ enquos(...)
+}
+
+ensure_named_exprs <- function(exprs) {
+ # Check for unnamed expressions and fix if any
+ unnamed <- !nzchar(names(exprs))
+ # Deparse and take the first element in case they're long expressions
+ names(exprs)[unnamed] <- map_chr(exprs[unnamed], format_expr)
+ exprs
+}
diff --git a/src/arrow/r/R/dplyr-select.R b/src/arrow/r/R/dplyr-select.R
new file mode 100644
index 000000000..9a867ced9
--- /dev/null
+++ b/src/arrow/r/R/dplyr-select.R
@@ -0,0 +1,125 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# The following S3 methods are registered on load if dplyr is present
+
+tbl_vars.arrow_dplyr_query <- function(x) names(x$selected_columns)
+
+select.arrow_dplyr_query <- function(.data, ...) {
+ check_select_helpers(enexprs(...))
+ column_select(as_adq(.data), !!!enquos(...))
+}
+select.Dataset <- select.ArrowTabular <- select.arrow_dplyr_query
+
+rename.arrow_dplyr_query <- function(.data, ...) {
+ check_select_helpers(enexprs(...))
+ column_select(as_adq(.data), !!!enquos(...), .FUN = vars_rename)
+}
+rename.Dataset <- rename.ArrowTabular <- rename.arrow_dplyr_query
+
+column_select <- function(.data, ..., .FUN = vars_select) {
+ # .FUN is either tidyselect::vars_select or tidyselect::vars_rename
+ # It operates on the names() of selected_columns, i.e. the column names
+ # factoring in any renaming that may already have happened
+ out <- .FUN(names(.data), !!!enquos(...))
+ # Make sure that the resulting selected columns map back to the original data,
+ # as in when there are multiple renaming steps
+ .data$selected_columns <- set_names(.data$selected_columns[out], names(out))
+
+ # If we've renamed columns, we need to project that renaming into other
+ # query parameters we've collected
+ renamed <- out[names(out) != out]
+ if (length(renamed)) {
+ # Massage group_by
+ gbv <- .data$group_by_vars
+ renamed_groups <- gbv %in% renamed
+ gbv[renamed_groups] <- names(renamed)[match(gbv[renamed_groups], renamed)]
+ .data$group_by_vars <- gbv
+ # No need to massage filters because those contain references to Arrow objects
+ }
+ .data
+}
+
+relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL) {
+ # The code in this function is adapted from the code in dplyr::relocate.data.frame
+ # at https://github.com/tidyverse/dplyr/blob/master/R/relocate.R
+ # TODO: revisit this after https://github.com/tidyverse/dplyr/issues/5829
+
+ .data <- as_adq(.data)
+
+ # Assign the schema to the expressions
+ map(.data$selected_columns, ~ (.$schema <- .data$.data$schema))
+
+ # Create a mask for evaluating expressions in tidyselect helpers
+ mask <- new_environment(.cache$functions, parent = caller_env())
+
+ to_move <- eval_select(substitute(c(...)), .data$selected_columns, mask)
+
+ .before <- enquo(.before)
+ .after <- enquo(.after)
+ has_before <- !quo_is_null(.before)
+ has_after <- !quo_is_null(.after)
+
+ if (has_before && has_after) {
+ abort("Must supply only one of `.before` and `.after`.")
+ } else if (has_before) {
+ where <- min(unname(eval_select(quo_get_expr(.before), .data$selected_columns, mask)))
+ if (!where %in% to_move) {
+ to_move <- c(to_move, where)
+ }
+ } else if (has_after) {
+ where <- max(unname(eval_select(quo_get_expr(.after), .data$selected_columns, mask)))
+ if (!where %in% to_move) {
+ to_move <- c(where, to_move)
+ }
+ } else {
+ where <- 1L
+ if (!where %in% to_move) {
+ to_move <- c(to_move, where)
+ }
+ }
+
+ lhs <- setdiff(seq2(1, where - 1), to_move)
+ rhs <- setdiff(seq2(where + 1, length(.data$selected_columns)), to_move)
+
+ pos <- vec_unique(c(lhs, to_move, rhs))
+ new_names <- names(pos)
+ .data$selected_columns <- .data$selected_columns[pos]
+
+ if (!is.null(new_names)) {
+ names(.data$selected_columns)[new_names != ""] <- new_names[new_names != ""]
+ }
+ .data
+}
+relocate.Dataset <- relocate.ArrowTabular <- relocate.arrow_dplyr_query
+
+check_select_helpers <- function(exprs) {
+ # Throw an error if unsupported tidyselect selection helpers in `exprs`
+ exprs <- lapply(exprs, function(x) if (is_quosure(x)) quo_get_expr(x) else x)
+ unsup_select_helpers <- "where"
+ funs_in_exprs <- unlist(lapply(exprs, all_funs))
+ unsup_funs <- funs_in_exprs[funs_in_exprs %in% unsup_select_helpers]
+ if (length(unsup_funs)) {
+ stop(
+ "Unsupported selection ",
+ ngettext(length(unsup_funs), "helper: ", "helpers: "),
+ oxford_paste(paste0(unsup_funs, "()"), quote = FALSE),
+ call. = FALSE
+ )
+ }
+}
diff --git a/src/arrow/r/R/dplyr-summarize.R b/src/arrow/r/R/dplyr-summarize.R
new file mode 100644
index 000000000..a6b7a3592
--- /dev/null
+++ b/src/arrow/r/R/dplyr-summarize.R
@@ -0,0 +1,289 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# The following S3 methods are registered on load if dplyr is present
+
+summarise.arrow_dplyr_query <- function(.data, ...) {
+ call <- match.call()
+ .data <- as_adq(.data)
+ exprs <- quos(...)
+ # Only retain the columns we need to do our aggregations
+ vars_to_keep <- unique(c(
+ unlist(lapply(exprs, all.vars)), # vars referenced in summarise
+ dplyr::group_vars(.data) # vars needed for grouping
+ ))
+ # If exprs rely on the results of previous exprs
+ # (total = sum(x), mean = total / n())
+ # then not all vars will correspond to columns in the data,
+ # so don't try to select() them (use intersect() to exclude them)
+ # Note that this select() isn't useful for the Arrow summarize implementation
+ # because it will effectively project to keep what it needs anyway,
+ # but the data.frame fallback version does benefit from select here
+ .data <- dplyr::select(.data, intersect(vars_to_keep, names(.data)))
+
+ # Try stuff, if successful return()
+ out <- try(do_arrow_summarize(.data, ...), silent = TRUE)
+ if (inherits(out, "try-error")) {
+ return(abandon_ship(call, .data, format(out)))
+ } else {
+ return(out)
+ }
+}
+summarise.Dataset <- summarise.ArrowTabular <- summarise.arrow_dplyr_query
+
+# This is the Arrow summarize implementation
+do_arrow_summarize <- function(.data, ..., .groups = NULL) {
+ exprs <- ensure_named_exprs(quos(...))
+
+ # Create a stateful environment for recording our evaluated expressions
+ # It's more complex than other places because a single summarize() expr
+ # may result in multiple query nodes (Aggregate, Project),
+ # and we have to walk through the expressions to disentangle them.
+ ctx <- env(
+ mask = arrow_mask(.data, aggregation = TRUE),
+ aggregations = empty_named_list(),
+ post_mutate = empty_named_list()
+ )
+ for (i in seq_along(exprs)) {
+ # Iterate over the indices and not the names because names may be repeated
+ # (which overwrites the previous name)
+ summarize_eval(
+ names(exprs)[i],
+ exprs[[i]],
+ ctx,
+ length(.data$group_by_vars) > 0
+ )
+ }
+
+ # Apply the results to the .data object.
+ # First, the aggregations
+ .data$aggregations <- ctx$aggregations
+ # Then collapse the query so that the resulting query object can have
+ # additional operations applied to it
+ out <- collapse.arrow_dplyr_query(.data)
+ # The expressions may have been translated into
+ # "first, aggregate, then transform the result further"
+ # nolint start
+ # For example,
+ # summarize(mean = sum(x) / n())
+ # is effectively implemented as
+ # summarize(..temp0 = sum(x), ..temp1 = n()) %>%
+ # mutate(mean = ..temp0 / ..temp1) %>%
+ # select(-starts_with("..temp"))
+ # If this is the case, there will be expressions in post_mutate
+ # nolint end
+ if (length(ctx$post_mutate)) {
+ # Append post_mutate, and make sure order is correct
+ # according to input exprs (also dropping ..temp columns)
+ out$selected_columns <- c(
+ out$selected_columns,
+ ctx$post_mutate
+ )[c(.data$group_by_vars, names(exprs))]
+ }
+
+ # If the object has .drop = FALSE and any group vars are dictionaries,
+ # we can't (currently) preserve the empty rows that dplyr does,
+ # so give a warning about that.
+ if (!dplyr::group_by_drop_default(.data)) {
+ group_by_exprs <- .data$selected_columns[.data$group_by_vars]
+ if (any(map_lgl(group_by_exprs, ~ inherits(.$type(), "DictionaryType")))) {
+ warning(
+ ".drop = FALSE currently not supported in Arrow aggregation",
+ call. = FALSE
+ )
+ }
+ }
+
+ # Handle .groups argument
+ if (length(.data$group_by_vars)) {
+ if (is.null(.groups)) {
+ # dplyr docs say:
+ # When ‘.groups’ is not specified, it is chosen based on the
+ # number of rows of the results:
+ # • If all the results have 1 row, you get "drop_last".
+ # • If the number of rows varies, you get "keep".
+ #
+ # But we don't support anything that returns multiple rows now
+ .groups <- "drop_last"
+ } else {
+ assert_that(is.string(.groups))
+ }
+ if (.groups == "drop_last") {
+ out$group_by_vars <- head(.data$group_by_vars, -1)
+ } else if (.groups == "keep") {
+ out$group_by_vars <- .data$group_by_vars
+ } else if (.groups == "rowwise") {
+ stop(arrow_not_supported('.groups = "rowwise"'))
+ } else if (.groups == "drop") {
+ # collapse() preserves groups so remove them
+ out <- dplyr::ungroup(out)
+ } else {
+ stop(paste("Invalid .groups argument:", .groups))
+ }
+ # TODO: shouldn't we be doing something with `drop_empty_groups` in summarize? (ARROW-14044)
+ out$drop_empty_groups <- .data$drop_empty_groups
+ }
+ out
+}
+
+arrow_eval_or_stop <- function(expr, mask) {
+ # TODO: change arrow_eval error handling behavior?
+ out <- arrow_eval(expr, mask)
+ if (inherits(out, "try-error")) {
+ msg <- handle_arrow_not_supported(out, format_expr(expr))
+ stop(msg, call. = FALSE)
+ }
+ out
+}
+
+summarize_projection <- function(.data) {
+ c(
+ map(.data$aggregations, ~ .$data),
+ .data$selected_columns[.data$group_by_vars]
+ )
+}
+
+format_aggregation <- function(x) {
+ paste0(x$fun, "(", x$data$ToString(), ")")
+}
+
+# This function handles each summarize expression and turns it into the
+# appropriate combination of (1) aggregations (possibly temporary) and
+# (2) post-aggregation transformations (mutate)
+# The function returns nothing: it assigns into the `ctx` environment
+summarize_eval <- function(name, quosure, ctx, hash, recurse = FALSE) {
+ expr <- quo_get_expr(quosure)
+ ctx$quo_env <- quo_get_env(quosure)
+
+ funs_in_expr <- all_funs(expr)
+ if (length(funs_in_expr) == 0) {
+ # If it is a scalar or field ref, no special handling required
+ ctx$aggregations[[name]] <- arrow_eval_or_stop(quosure, ctx$mask)
+ return()
+ }
+
+ # For the quantile() binding in the hash aggregation case, we need to mutate
+ # the list output from the Arrow hash_tdigest kernel to flatten it into a
+ # column of type float64. We do that by modifying the unevaluated expression
+ # to replace quantile(...) with arrow_list_element(quantile(...), 0L)
+ if (hash && "quantile" %in% funs_in_expr) {
+ expr <- wrap_hash_quantile(expr)
+ funs_in_expr <- all_funs(expr)
+ }
+
+ # Start inspecting the expr to see what aggregations it involves
+ agg_funs <- names(agg_funcs)
+ outer_agg <- funs_in_expr[1] %in% agg_funs
+ inner_agg <- funs_in_expr[-1] %in% agg_funs
+
+ # First, pull out any aggregations wrapped in other function calls
+ if (any(inner_agg)) {
+ expr <- extract_aggregations(expr, ctx)
+ }
+
+ # By this point, there are no more aggregation functions in expr
+ # except for possibly the outer function call:
+ # they've all been pulled out to ctx$aggregations, and in their place in expr
+ # there are variable names, which will correspond to field refs in the
+ # query object after aggregation and collapse().
+ # So if we want to know if there are any aggregations inside expr,
+ # we have to look for them by their new var names
+ inner_agg_exprs <- all_vars(expr) %in% names(ctx$aggregations)
+
+ if (outer_agg) {
+ # This is something like agg(fun(x, y)
+ # It just works by normal arrow_eval, unless there's a mix of aggs and
+ # columns in the original data like agg(fun(x, agg(x)))
+ # (but that will have been caught in extract_aggregations())
+ ctx$aggregations[[name]] <- arrow_eval_or_stop(
+ as_quosure(expr, ctx$quo_env),
+ ctx$mask
+ )
+ return()
+ } else if (all(inner_agg_exprs)) {
+ # Something like: fun(agg(x), agg(y))
+ # So based on the aggregations that have been extracted, mutate after
+ mutate_mask <- arrow_mask(
+ list(selected_columns = make_field_refs(names(ctx$aggregations)))
+ )
+ ctx$post_mutate[[name]] <- arrow_eval_or_stop(
+ as_quosure(expr, ctx$quo_env),
+ mutate_mask
+ )
+ return()
+ }
+
+ # Backstop for any other odd cases, like fun(x, y) (i.e. no aggregation),
+ # or aggregation functions that aren't supported in Arrow (not in agg_funcs)
+ stop(
+ handle_arrow_not_supported(quo_get_expr(quosure), format_expr(quosure)),
+ call. = FALSE
+ )
+}
+
+# This function recurses through expr, pulls out any aggregation expressions,
+# and inserts a variable name (field ref) in place of the aggregation
+extract_aggregations <- function(expr, ctx) {
+ # Keep the input in case we need to raise an error message with it
+ original_expr <- expr
+ funs <- all_funs(expr)
+ if (length(funs) == 0) {
+ return(expr)
+ } else if (length(funs) > 1) {
+ # Recurse more
+ expr[-1] <- lapply(expr[-1], extract_aggregations, ctx)
+ }
+ if (funs[1] %in% names(agg_funcs)) {
+ inner_agg_exprs <- all_vars(expr) %in% names(ctx$aggregations)
+ if (any(inner_agg_exprs) & !all(inner_agg_exprs)) {
+ # We can't aggregate over a combination of dataset columns and other
+ # aggregations (e.g. sum(x - mean(x)))
+ # TODO: support in ARROW-13926
+ # TODO: Add "because" arg to explain _why_ it's not supported?
+ # TODO: this message could also say "not supported in summarize()"
+ # since some of these expressions may be legal elsewhere
+ stop(
+ handle_arrow_not_supported(original_expr, format_expr(original_expr)),
+ call. = FALSE
+ )
+ }
+
+ # We have an aggregation expression with no other aggregations inside it,
+ # so arrow_eval the expression on the data and give it a ..temp name prefix,
+ # then insert that name (symbol) back into the expression so that we can
+ # mutate() on the result of the aggregation and reference this field.
+ tmpname <- paste0("..temp", length(ctx$aggregations))
+ ctx$aggregations[[tmpname]] <- arrow_eval_or_stop(as_quosure(expr, ctx$quo_env), ctx$mask)
+ expr <- as.symbol(tmpname)
+ }
+ expr
+}
+
+# This function recurses through expr and wraps each call to quantile() with a
+# call to arrow_list_element()
+wrap_hash_quantile <- function(expr) {
+ if (length(expr) == 1) {
+ return(expr)
+ } else {
+ if (is.call(expr) && expr[[1]] == quote(quantile)) {
+ return(str2lang(paste0("arrow_list_element(", deparse1(expr), ", 0L)")))
+ } else {
+ return(as.call(lapply(expr, wrap_hash_quantile)))
+ }
+ }
+}
diff --git a/src/arrow/r/R/dplyr.R b/src/arrow/r/R/dplyr.R
new file mode 100644
index 000000000..e6f67c066
--- /dev/null
+++ b/src/arrow/r/R/dplyr.R
@@ -0,0 +1,259 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include expression.R
+#' @include record-batch.R
+#' @include table.R
+
+arrow_dplyr_query <- function(.data) {
+ # An arrow_dplyr_query is a container for an Arrow data object (Table,
+ # RecordBatch, or Dataset) and the state of the user's dplyr query--things
+ # like selected columns, filters, and group vars.
+ # An arrow_dplyr_query can contain another arrow_dplyr_query in .data
+ gv <- dplyr::group_vars(.data) %||% character()
+
+ if (!inherits(.data, c("Dataset", "arrow_dplyr_query", "RecordBatchReader"))) {
+ .data <- InMemoryDataset$create(.data)
+ }
+ # Evaluating expressions on a dataset with duplicated fieldnames will error
+ dupes <- duplicated(names(.data))
+ if (any(dupes)) {
+ abort(c(
+ "Duplicated field names",
+ x = paste0(
+ "The following field names were found more than once in the data: ",
+ oxford_paste(names(.data)[dupes])
+ )
+ ))
+ }
+ structure(
+ list(
+ .data = .data,
+ # selected_columns is a named list:
+ # * contents are references/expressions pointing to the data
+ # * names are the names they should be in the end (i.e. this
+ # records any renaming)
+ selected_columns = make_field_refs(names(.data$schema)),
+ # filtered_rows will be an Expression
+ filtered_rows = TRUE,
+ # group_by_vars is a character vector of columns (as renamed)
+ # in the data. They will be kept when data is pulled into R.
+ group_by_vars = gv,
+ # drop_empty_groups is a logical value indicating whether to drop
+ # groups formed by factor levels that don't appear in the data. It
+ # should be non-null only when the data is grouped.
+ drop_empty_groups = NULL,
+ # arrange_vars will be a list of expressions named by their associated
+ # column names
+ arrange_vars = list(),
+ # arrange_desc will be a logical vector indicating the sort order for each
+ # expression in arrange_vars (FALSE for ascending, TRUE for descending)
+ arrange_desc = logical()
+ ),
+ class = "arrow_dplyr_query"
+ )
+}
+
+# The only difference between `arrow_dplyr_query()` and `as_adq()` is that if
+# `.data` is already an `arrow_dplyr_query`, `as_adq()`, will return it as is, but
+# `arrow_dplyr_query()` will nest it inside a new `arrow_dplyr_query`. The only
+# place where `arrow_dplyr_query()` should be called directly is inside
+# `collapse()` methods; everywhere else, call `as_adq()`.
+as_adq <- function(.data) {
+ # For most dplyr methods,
+ # method.Table == method.RecordBatch == method.Dataset == method.arrow_dplyr_query
+ # This works because the functions all pass .data through as_adq()
+ if (inherits(.data, "arrow_dplyr_query")) {
+ return(.data)
+ }
+ arrow_dplyr_query(.data)
+}
+
+make_field_refs <- function(field_names) {
+ set_names(lapply(field_names, Expression$field_ref), field_names)
+}
+
+#' @export
+print.arrow_dplyr_query <- function(x, ...) {
+ schm <- x$.data$schema
+ types <- map_chr(x$selected_columns, function(expr) {
+ name <- expr$field_name
+ if (nzchar(name)) {
+ # Just a field_ref, so look up in the schema
+ schm$GetFieldByName(name)$type$ToString()
+ } else {
+ # Expression, so get its type and append the expression
+ paste0(
+ expr$type(schm)$ToString(),
+ " (", expr$ToString(), ")"
+ )
+ }
+ })
+ fields <- paste(names(types), types, sep = ": ", collapse = "\n")
+ cat(class(source_data(x))[1], " (query)\n", sep = "")
+ cat(fields, "\n", sep = "")
+ cat("\n")
+ if (length(x$aggregations)) {
+ cat("* Aggregations:\n")
+ aggs <- paste0(names(x$aggregations), ": ", map_chr(x$aggregations, format_aggregation), collapse = "\n")
+ cat(aggs, "\n", sep = "")
+ }
+ if (!isTRUE(x$filtered_rows)) {
+ filter_string <- x$filtered_rows$ToString()
+ cat("* Filter: ", filter_string, "\n", sep = "")
+ }
+ if (length(x$group_by_vars)) {
+ cat("* Grouped by ", paste(x$group_by_vars, collapse = ", "), "\n", sep = "")
+ }
+ if (length(x$arrange_vars)) {
+ arrange_strings <- map_chr(x$arrange_vars, function(x) x$ToString())
+ cat(
+ "* Sorted by ",
+ paste(
+ paste0(
+ arrange_strings,
+ " [", ifelse(x$arrange_desc, "desc", "asc"), "]"
+ ),
+ collapse = ", "
+ ),
+ "\n",
+ sep = ""
+ )
+ }
+ cat("See $.data for the source Arrow object\n")
+ invisible(x)
+}
+
+# These are the names reflecting all select/rename, not what is in Arrow
+#' @export
+names.arrow_dplyr_query <- function(x) names(x$selected_columns)
+
+#' @export
+dim.arrow_dplyr_query <- function(x) {
+ cols <- length(names(x))
+
+ if (is_collapsed(x)) {
+ # Don't evaluate just for nrow
+ rows <- NA_integer_
+ } else if (isTRUE(x$filtered_rows)) {
+ rows <- x$.data$num_rows
+ } else {
+ rows <- Scanner$create(x)$CountRows()
+ }
+ c(rows, cols)
+}
+
+#' @export
+as.data.frame.arrow_dplyr_query <- function(x, row.names = NULL, optional = FALSE, ...) {
+ collect.arrow_dplyr_query(x, as_data_frame = TRUE, ...)
+}
+
+#' @export
+head.arrow_dplyr_query <- function(x, n = 6L, ...) {
+ x$head <- n
+ collapse.arrow_dplyr_query(x)
+}
+
+#' @export
+tail.arrow_dplyr_query <- function(x, n = 6L, ...) {
+ x$tail <- n
+ collapse.arrow_dplyr_query(x)
+}
+
+#' @export
+`[.arrow_dplyr_query` <- function(x, i, j, ..., drop = FALSE) {
+ x <- ensure_group_vars(x)
+ if (nargs() == 2L) {
+ # List-like column extraction (x[i])
+ return(x[, i])
+ }
+ if (!missing(j)) {
+ x <- select.arrow_dplyr_query(x, all_of(j))
+ }
+
+ if (!missing(i)) {
+ out <- take_dataset_rows(x, i)
+ x <- restore_dplyr_features(out, x)
+ }
+ x
+}
+
+ensure_group_vars <- function(x) {
+ if (inherits(x, "arrow_dplyr_query")) {
+ # Before pulling data from Arrow, make sure all group vars are in the projection
+ gv <- set_names(setdiff(dplyr::group_vars(x), names(x)))
+ if (length(gv)) {
+ # Add them back
+ x$selected_columns <- c(
+ x$selected_columns,
+ make_field_refs(gv)
+ )
+ }
+ }
+ x
+}
+
+ensure_arrange_vars <- function(x) {
+ # The arrange() operation is not performed until later, because:
+ # - It must be performed after mutate(), to enable sorting by new columns.
+ # - It should be performed after filter() and select(), for efficiency.
+ # However, we need users to be able to arrange() by columns and expressions
+ # that are *not* returned in the query result. To enable this, we must
+ # *temporarily* include these columns and expressions in the projection. We
+ # use x$temp_columns to store these. Later, after the arrange() operation has
+ # been performed, these are omitted from the result. This differs from the
+ # columns in x$group_by_vars which *are* returned in the result.
+ x$temp_columns <- x$arrange_vars[!names(x$arrange_vars) %in% names(x$selected_columns)]
+ x
+}
+
+# Helper to handle unsupported dplyr features
+# * For Table/RecordBatch, we collect() and then call the dplyr method in R
+# * For Dataset, we just error
+abandon_ship <- function(call, .data, msg) {
+ msg <- trimws(msg)
+ dplyr_fun_name <- sub("^(.*?)\\..*", "\\1", as.character(call[[1]]))
+ if (query_on_dataset(.data)) {
+ stop(msg, "\nCall collect() first to pull data into R.", call. = FALSE)
+ }
+ # else, collect and call dplyr method
+ warning(msg, "; pulling data into R", immediate. = TRUE, call. = FALSE)
+ call$.data <- dplyr::collect(.data)
+ call[[1]] <- get(dplyr_fun_name, envir = asNamespace("dplyr"))
+ eval.parent(call, 2)
+}
+
+query_on_dataset <- function(x) !inherits(source_data(x), "InMemoryDataset")
+
+source_data <- function(x) {
+ if (is_collapsed(x)) {
+ source_data(x$.data)
+ } else {
+ x$.data
+ }
+}
+
+is_collapsed <- function(x) inherits(x$.data, "arrow_dplyr_query")
+
+has_aggregation <- function(x) {
+ # TODO: update with joins (check right side data too)
+ !is.null(x$aggregations) || (is_collapsed(x) && has_aggregation(x$.data))
+}
+
+has_head_tail <- function(x) {
+ !is.null(x$head) || !is.null(x$tail) || (is_collapsed(x) && has_head_tail(x$.data))
+}
diff --git a/src/arrow/r/R/duckdb.R b/src/arrow/r/R/duckdb.R
new file mode 100644
index 000000000..c772d4fbd
--- /dev/null
+++ b/src/arrow/r/R/duckdb.R
@@ -0,0 +1,165 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Create a (virtual) DuckDB table from an Arrow object
+#'
+#' This will do the necessary configuration to create a (virtual) table in DuckDB
+#' that is backed by the Arrow object given. No data is copied or modified until
+#' `collect()` or `compute()` are called or a query is run against the table.
+#'
+#' The result is a dbplyr-compatible object that can be used in d(b)plyr pipelines.
+#'
+#' If `auto_disconnect = TRUE`, the DuckDB table that is created will be configured
+#' to be unregistered when the `tbl` object is garbage collected. This is helpful
+#' if you don't want to have extra table objects in DuckDB after you've finished
+#' using them. Currently, this cleanup can, however, sometimes lead to hangs if
+#' tables are created and deleted in quick succession, hence the default value
+#' of `FALSE`
+#'
+#' @param .data the Arrow object (e.g. Dataset, Table) to use for the DuckDB table
+#' @param con a DuckDB connection to use (default will create one and store it
+#' in `options("arrow_duck_con")`)
+#' @param table_name a name to use in DuckDB for this object. The default is a
+#' unique string `"arrow_"` followed by numbers.
+#' @param auto_disconnect should the table be automatically cleaned up when the
+#' resulting object is removed (and garbage collected)? Default: `FALSE`
+#'
+#' @return A `tbl` of the new table in DuckDB
+#'
+#' @name to_duckdb
+#' @export
+#' @examplesIf getFromNamespace("run_duckdb_examples", "arrow")()
+#' library(dplyr)
+#'
+#' ds <- InMemoryDataset$create(mtcars)
+#'
+#' ds %>%
+#' filter(mpg < 30) %>%
+#' to_duckdb() %>%
+#' group_by(cyl) %>%
+#' summarize(mean_mpg = mean(mpg, na.rm = TRUE))
+to_duckdb <- function(.data,
+ con = arrow_duck_connection(),
+ table_name = unique_arrow_tablename(),
+ auto_disconnect = FALSE) {
+ .data <- as_adq(.data)
+ duckdb::duckdb_register_arrow(con, table_name, .data)
+
+ tbl <- tbl(con, table_name)
+ groups <- dplyr::groups(.data)
+ if (length(groups)) {
+ tbl <- dplyr::group_by(tbl, groups)
+ }
+
+ if (auto_disconnect) {
+ # this will add the correct connection disconnection when the tbl is gced.
+ # we should probably confirm that this use of src$disco is kosher.
+ tbl$src$disco <- duckdb_disconnector(con, table_name)
+ }
+
+ tbl
+}
+
+arrow_duck_connection <- function() {
+ con <- getOption("arrow_duck_con")
+ if (is.null(con) || !DBI::dbIsValid(con)) {
+ con <- DBI::dbConnect(duckdb::duckdb())
+ # Use the same CPU count that the arrow library is set to
+ DBI::dbExecute(con, paste0("PRAGMA threads=", cpu_count()))
+ options(arrow_duck_con = con)
+ }
+ con
+}
+
+# helper function to determine if duckdb examples should run
+# see: https://github.com/r-lib/roxygen2/issues/1242
+run_duckdb_examples <- function() {
+ arrow_with_dataset() &&
+ requireNamespace("duckdb", quietly = TRUE) &&
+ packageVersion("duckdb") > "0.2.7" &&
+ requireNamespace("dplyr", quietly = TRUE) &&
+ requireNamespace("dbplyr", quietly = TRUE)
+}
+
+# Adapted from dbplyr
+unique_arrow_tablename <- function() {
+ i <- getOption("arrow_table_name", 0) + 1
+ options(arrow_table_name = i)
+ sprintf("arrow_%03i", i)
+}
+
+# Creates an environment that disconnects the database when it's GC'd
+duckdb_disconnector <- function(con, tbl_name) {
+ reg.finalizer(environment(), function(...) {
+ # remote the table we ephemerally created (though only if the connection is
+ # still valid)
+ if (DBI::dbIsValid(con)) {
+ duckdb::duckdb_unregister_arrow(con, tbl_name)
+ }
+
+ # and there are no more tables, so we can safely shutdown
+ if (length(DBI::dbListTables(con)) == 0) {
+ DBI::dbDisconnect(con, shutdown = TRUE)
+ }
+ })
+ environment()
+}
+
+#' Create an Arrow object from others
+#'
+#' This can be used in pipelines that pass data back and forth between Arrow and
+#' other processes (like DuckDB).
+#'
+#' @param .data the object to be converted
+#'
+#' @return an `arrow_dplyr_query` object, to be used in dplyr pipelines.
+#' @export
+#'
+#' @examplesIf getFromNamespace("run_duckdb_examples", "arrow")()
+#' library(dplyr)
+#'
+#' ds <- InMemoryDataset$create(mtcars)
+#'
+#' ds %>%
+#' filter(mpg < 30) %>%
+#' to_duckdb() %>%
+#' group_by(cyl) %>%
+#' summarize(mean_mpg = mean(mpg, na.rm = TRUE)) %>%
+#' to_arrow() %>%
+#' collect()
+to_arrow <- function(.data) {
+ # If this is an Arrow object already, return quickly since we're already Arrow
+ if (inherits(.data, c("arrow_dplyr_query", "ArrowObject"))) {
+ return(.data)
+ }
+
+ # For now, we only handle .data from duckdb, so check that it is that if we've
+ # gotten this far
+ if (!inherits(dbplyr::remote_con(.data), "duckdb_connection")) {
+ stop(
+ "to_arrow() currently only supports Arrow tables, Arrow datasets, ",
+ "Arrow queries, or dbplyr tbls from duckdb connections",
+ call. = FALSE
+ )
+ }
+
+ # Run the query
+ res <- DBI::dbSendQuery(dbplyr::remote_con(.data), dbplyr::remote_query(.data), arrow = TRUE)
+
+ # TODO: we shouldn't need $read_table(), but we get segfaults when we do.
+ arrow_dplyr_query(duckdb::duckdb_fetch_record_batch(res)$read_table())
+}
diff --git a/src/arrow/r/R/enums.R b/src/arrow/r/R/enums.R
new file mode 100644
index 000000000..4e69b7a19
--- /dev/null
+++ b/src/arrow/r/R/enums.R
@@ -0,0 +1,178 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @export
+`print.arrow-enum` <- function(x, ...) {
+ NextMethod()
+}
+
+enum <- function(class, ..., .list = list(...)) {
+ structure(
+ .list,
+ class = c(class, "arrow-enum")
+ )
+}
+
+#' Arrow enums
+#' @name enums
+#' @export
+#' @keywords internal
+TimeUnit <- enum("TimeUnit::type",
+ SECOND = 0L, MILLI = 1L, MICRO = 2L, NANO = 3L
+)
+
+#' @rdname enums
+#' @export
+DateUnit <- enum("DateUnit", DAY = 0L, MILLI = 1L)
+
+#' @rdname enums
+#' @export
+Type <- enum("Type::type",
+ "NA" = 0L,
+ BOOL = 1L,
+ UINT8 = 2L,
+ INT8 = 3L,
+ UINT16 = 4L,
+ INT16 = 5L,
+ UINT32 = 6L,
+ INT32 = 7L,
+ UINT64 = 8L,
+ INT64 = 9L,
+ HALF_FLOAT = 10L,
+ FLOAT = 11L,
+ DOUBLE = 12L,
+ STRING = 13L,
+ BINARY = 14L,
+ FIXED_SIZE_BINARY = 15L,
+ DATE32 = 16L,
+ DATE64 = 17L,
+ TIMESTAMP = 18L,
+ TIME32 = 19L,
+ TIME64 = 20L,
+ INTERVAL_MONTHS = 21L,
+ INTERVAL_DAY_TIME = 22L,
+ DECIMAL = 23L,
+ DECIMAL256 = 24L,
+ LIST = 25L,
+ STRUCT = 26L,
+ SPARSE_UNION = 27L,
+ DENSE_UNION = 28L,
+ DICTIONARY = 29L,
+ MAP = 30L,
+ EXTENSION = 31L,
+ FIXED_SIZE_LIST = 32L,
+ DURATION = 33L,
+ LARGE_STRING = 34L,
+ LARGE_BINARY = 35L,
+ LARGE_LIST = 36L
+)
+
+TYPES_WITH_NAN <- Type[c("HALF_FLOAT", "FLOAT", "DOUBLE")]
+
+#' @rdname enums
+#' @export
+StatusCode <- enum("StatusCode",
+ OK = 0L, OutOfMemory = 1L, KeyError = 2L, TypeError = 3L,
+ Invalid = 4L, IOError = 5L, CapacityError = 6L, IndexError = 7L,
+ UnknownError = 9L, NotImplemented = 10L, SerializationError = 11L,
+ PythonError = 12L, RError = 13L,
+ PlasmaObjectExists = 20L, PlasmaObjectNotFound = 21L,
+ PlasmaStoreFull = 22L, PlasmaObjectAlreadySealed = 23L
+)
+
+#' @rdname enums
+#' @export
+FileMode <- enum("FileMode",
+ READ = 0L, WRITE = 1L, READWRITE = 2L
+)
+
+#' @rdname enums
+#' @export
+MessageType <- enum("MessageType",
+ NONE = 0L, SCHEMA = 1L, DICTIONARY_BATCH = 2L, RECORD_BATCH = 3L, TENSOR = 4L
+)
+
+#' @rdname enums
+#' @export
+CompressionType <- enum("Compression::type",
+ UNCOMPRESSED = 0L, SNAPPY = 1L, GZIP = 2L, BROTLI = 3L, ZSTD = 4L, LZ4 = 5L,
+ LZ4_FRAME = 6L, LZO = 7L, BZ2 = 8L
+)
+
+#' @export
+#' @rdname enums
+FileType <- enum("FileType",
+ NotFound = 0L, Unknown = 1L, File = 2L, Directory = 3L
+)
+
+#' @export
+#' @rdname enums
+ParquetVersionType <- enum("ParquetVersionType",
+ PARQUET_1_0 = 0L, PARQUET_2_0 = 1L
+)
+
+#' @export
+#' @rdname enums
+MetadataVersion <- enum("MetadataVersion",
+ V1 = 0L, V2 = 1L, V3 = 2L, V4 = 3L, V5 = 4L
+)
+
+#' @export
+#' @rdname enums
+QuantileInterpolation <- enum("QuantileInterpolation",
+ LINEAR = 0L, LOWER = 1L, HIGHER = 2L, NEAREST = 3L, MIDPOINT = 4L
+)
+
+#' @export
+#' @rdname enums
+NullEncodingBehavior <- enum("NullEncodingBehavior",
+ ENCODE = 0L, MASK = 1L
+)
+
+#' @export
+#' @rdname enums
+NullHandlingBehavior <- enum("NullHandlingBehavior",
+ EMIT_NULL = 0L, SKIP = 1L, REPLACE = 2L
+)
+
+#' @export
+#' @rdname enums
+RoundMode <- enum("RoundMode",
+ DOWN = 0L,
+ UP = 1L,
+ TOWARDS_ZERO = 2L,
+ TOWARDS_INFINITY = 3L,
+ HALF_DOWN = 4L,
+ HALF_UP = 5L,
+ HALF_TOWARDS_ZERO = 6L,
+ HALF_TOWARDS_INFINITY = 7L,
+ HALF_TO_EVEN = 8L,
+ HALF_TO_ODD = 9L
+)
+
+#' @export
+#' @rdname enums
+JoinType <- enum("JoinType",
+ LEFT_SEMI = 0L,
+ RIGHT_SEMI = 1L,
+ LEFT_ANTI = 2L,
+ RIGHT_ANTI = 3L,
+ INNER = 4L,
+ LEFT_OUTER = 5L,
+ RIGHT_OUTER = 6L,
+ FULL_OUTER = 7L
+)
diff --git a/src/arrow/r/R/expression.R b/src/arrow/r/R/expression.R
new file mode 100644
index 000000000..b1b6635f5
--- /dev/null
+++ b/src/arrow/r/R/expression.R
@@ -0,0 +1,240 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrowExports.R
+
+.unary_function_map <- list(
+ # NOTE: Each of the R functions mapped here takes exactly *one* argument, maps
+ # *directly* to an Arrow C++ compute kernel, and does not require any
+ # non-default options to be specified. More complex R function mappings are
+ # defined in dplyr-functions.R.
+
+ # functions are arranged alphabetically by name within categories
+
+ # arithmetic functions
+ "abs" = "abs_checked",
+ "ceiling" = "ceil",
+ "floor" = "floor",
+ "log10" = "log10_checked",
+ "log1p" = "log1p_checked",
+ "log2" = "log2_checked",
+ "sign" = "sign",
+ # trunc is defined in dplyr-functions.R
+
+ # trigonometric functions
+ "acos" = "acos_checked",
+ "asin" = "asin_checked",
+ "cos" = "cos_checked",
+ "sin" = "sin_checked",
+ "tan" = "tan_checked",
+
+ # logical functions
+ "!" = "invert",
+
+ # string functions
+ # nchar is defined in dplyr-functions.R
+ "str_length" = "utf8_length",
+ # str_pad is defined in dplyr-functions.R
+ # str_sub is defined in dplyr-functions.R
+ # str_to_lower is defined in dplyr-functions.R
+ # str_to_title is defined in dplyr-functions.R
+ # str_to_upper is defined in dplyr-functions.R
+ # str_trim is defined in dplyr-functions.R
+ "stri_reverse" = "utf8_reverse",
+ # substr is defined in dplyr-functions.R
+ # substring is defined in dplyr-functions.R
+ "tolower" = "utf8_lower",
+ "toupper" = "utf8_upper",
+
+ # date and time functions
+ "day" = "day",
+ "hour" = "hour",
+ "isoweek" = "iso_week",
+ "epiweek" = "us_week",
+ "isoyear" = "iso_year",
+ "minute" = "minute",
+ "month" = "month",
+ "quarter" = "quarter",
+ # second is defined in dplyr-functions.R
+ # wday is defined in dplyr-functions.R
+ "yday" = "day_of_year",
+ "year" = "year",
+
+ # type conversion functions
+ "as.factor" = "dictionary_encode"
+)
+
+.binary_function_map <- list(
+ # NOTE: Each of the R functions/operators mapped here takes exactly *two*
+ # arguments. Most map *directly* to an Arrow C++ compute kernel and require no
+ # non-default options, but some are modified by build_expr(). More complex R
+ # function/operator mappings are defined in dplyr-functions.R.
+ "==" = "equal",
+ "!=" = "not_equal",
+ ">" = "greater",
+ ">=" = "greater_equal",
+ "<" = "less",
+ "<=" = "less_equal",
+ "&" = "and_kleene",
+ "|" = "or_kleene",
+ "+" = "add_checked",
+ "-" = "subtract_checked",
+ "*" = "multiply_checked",
+ "/" = "divide",
+ "%/%" = "divide_checked",
+ # we don't actually use divide_checked with `%%`, rather it is rewritten to
+ # use `%/%` above.
+ "%%" = "divide_checked",
+ "^" = "power_checked",
+ "%in%" = "is_in_meta_binary"
+)
+
+.array_function_map <- c(.unary_function_map, .binary_function_map)
+
+#' Arrow expressions
+#'
+#' @description
+#' `Expression`s are used to define filter logic for passing to a [Dataset]
+#' [Scanner].
+#'
+#' `Expression$scalar(x)` constructs an `Expression` which always evaluates to
+#' the provided scalar (length-1) R value.
+#'
+#' `Expression$field_ref(name)` is used to construct an `Expression` which
+#' evaluates to the named column in the `Dataset` against which it is evaluated.
+#'
+#' `Expression$create(function_name, ..., options)` builds a function-call
+#' `Expression` containing one or more `Expression`s.
+#' @name Expression
+#' @rdname Expression
+#' @export
+Expression <- R6Class("Expression",
+ inherit = ArrowObject,
+ public = list(
+ ToString = function() compute___expr__ToString(self),
+ Equals = function(other, ...) {
+ inherits(other, "Expression") && compute___expr__equals(self, other)
+ },
+ # TODO: Implement type determination without storing
+ # schemas in Expression objects (ARROW-13186)
+ schema = NULL,
+ type = function(schema = self$schema) {
+ assert_that(!is.null(schema))
+ compute___expr__type(self, schema)
+ },
+ type_id = function(schema = self$schema) {
+ assert_that(!is.null(schema))
+ compute___expr__type_id(self, schema)
+ },
+ cast = function(to_type, safe = TRUE, ...) {
+ opts <- list(
+ to_type = to_type,
+ allow_int_overflow = !safe,
+ allow_time_truncate = !safe,
+ allow_float_truncate = !safe
+ )
+ Expression$create("cast", self, options = modifyList(opts, list(...)))
+ }
+ ),
+ active = list(
+ field_name = function() compute___expr__get_field_ref_name(self)
+ )
+)
+Expression$create <- function(function_name,
+ ...,
+ args = list(...),
+ options = empty_named_list()) {
+ assert_that(is.string(function_name))
+ assert_that(is_list_of(args, "Expression"), msg = "Expression arguments must be Expression objects")
+ expr <- compute___expr__call(function_name, args, options)
+ expr$schema <- unify_schemas(schemas = lapply(args, function(x) x$schema))
+ expr
+}
+
+Expression$field_ref <- function(name) {
+ assert_that(is.string(name))
+ compute___expr__field_ref(name)
+}
+Expression$scalar <- function(x) {
+ expr <- compute___expr__scalar(Scalar$create(x))
+ expr$schema <- schema()
+ expr
+}
+
+# Wrapper around Expression$create that:
+# (1) maps R function names to Arrow C++ compute ("/" --> "divide_checked")
+# (2) wraps R input args as Array or Scalar
+build_expr <- function(FUN,
+ ...,
+ args = list(...),
+ options = empty_named_list()) {
+ if (FUN == "-" && length(args) == 1L) {
+ if (inherits(args[[1]], c("ArrowObject", "Expression"))) {
+ return(build_expr("negate_checked", args[[1]]))
+ } else {
+ return(-args[[1]])
+ }
+ }
+ if (FUN == "%in%") {
+ # Special-case %in%, which is different from the Array function name
+ expr <- Expression$create("is_in", args[[1]],
+ options = list(
+ # If args[[2]] is already an Arrow object (like a scalar),
+ # this wouldn't work
+ value_set = Array$create(args[[2]]),
+ skip_nulls = TRUE
+ )
+ )
+ } else {
+ args <- lapply(args, function(x) {
+ if (!inherits(x, "Expression")) {
+ x <- Expression$scalar(x)
+ }
+ x
+ })
+
+ # In Arrow, "divide" is one function, which does integer division on
+ # integer inputs and floating-point division on floats
+ if (FUN == "/") {
+ # TODO: omg so many ways it's wrong to assume these types
+ args <- lapply(args, function(x) x$cast(float64()))
+ } else if (FUN == "%/%") {
+ # In R, integer division works like floor(float division)
+ out <- build_expr("/", args = args)
+ return(out$cast(int32(), allow_float_truncate = TRUE))
+ } else if (FUN == "%%") {
+ return(args[[1]] - args[[2]] * (args[[1]] %/% args[[2]]))
+ }
+
+ expr <- Expression$create(.array_function_map[[FUN]] %||% FUN, args = args, options = options)
+ }
+ expr
+}
+
+#' @export
+Ops.Expression <- function(e1, e2) {
+ if (.Generic == "!") {
+ build_expr(.Generic, e1)
+ } else {
+ build_expr(.Generic, e1, e2)
+ }
+}
+
+#' @export
+is.na.Expression <- function(x) {
+ Expression$create("is_null", x, options = list(nan_is_null = TRUE))
+}
diff --git a/src/arrow/r/R/feather.R b/src/arrow/r/R/feather.R
new file mode 100644
index 000000000..70a270bbe
--- /dev/null
+++ b/src/arrow/r/R/feather.R
@@ -0,0 +1,219 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Write data in the Feather format
+#'
+#' Feather provides binary columnar serialization for data frames.
+#' It is designed to make reading and writing data frames efficient,
+#' and to make sharing data across data analysis languages easy.
+#' This function writes both the original, limited specification of the format
+#' and the version 2 specification, which is the Apache Arrow IPC file format.
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param version integer Feather file version. Version 2 is the current.
+#' Version 1 is the more limited legacy format.
+#' @param chunk_size For V2 files, the number of rows that each chunk of data
+#' should have in the file. Use a smaller `chunk_size` when you need faster
+#' random row access. Default is 64K. This option is not supported for V1.
+#' @param compression Name of compression codec to use, if any. Default is
+#' "lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise
+#' "uncompressed". "zstd" is the other available codec and generally has better
+#' compression ratios in exchange for slower read and write performance
+#' See [codec_is_available()]. This option is not supported for V1.
+#' @param compression_level If `compression` is "zstd", you may
+#' specify an integer compression level. If omitted, the compression codec's
+#' default compression level is used.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @seealso [RecordBatchWriter] for lower-level access to writing Arrow IPC data.
+#' @seealso [Schema] for information about schemas and metadata handling.
+#' @examplesIf arrow_available()
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_feather(mtcars, tf)
+#' @include arrow-package.R
+write_feather <- function(x,
+ sink,
+ version = 2,
+ chunk_size = 65536L,
+ compression = c("default", "lz4", "uncompressed", "zstd"),
+ compression_level = NULL) {
+ # Handle and validate options before touching data
+ version <- as.integer(version)
+ assert_that(version %in% 1:2)
+ compression <- match.arg(compression)
+ chunk_size <- as.integer(chunk_size)
+ assert_that(chunk_size > 0)
+ if (compression == "default") {
+ if (version == 2 && codec_is_available("lz4")) {
+ compression <- "lz4"
+ } else {
+ compression <- "uncompressed"
+ }
+ }
+ if (is.null(compression_level)) {
+ # Use -1 as sentinal for "default"
+ compression_level <- -1L
+ }
+ compression_level <- as.integer(compression_level)
+ # Now make sure that options make sense together
+ if (version == 1) {
+ if (chunk_size != 65536L) {
+ stop("Feather version 1 does not support the 'chunk_size' option", call. = FALSE)
+ }
+ if (compression != "uncompressed") {
+ stop("Feather version 1 does not support the 'compression' option", call. = FALSE)
+ }
+ if (compression_level != -1L) {
+ stop("Feather version 1 does not support the 'compression_level' option", call. = FALSE)
+ }
+ }
+ if (compression != "zstd" && compression_level != -1L) {
+ stop("Can only specify a 'compression_level' when 'compression' is 'zstd'", call. = FALSE)
+ }
+ # Finally, add 1 to version because 2 means V1 and 3 means V2 :shrug:
+ version <- version + 1L
+
+ # "lz4" is the convenience
+ if (compression == "lz4") {
+ compression <- "lz4_frame"
+ }
+
+ compression <- compression_from_name(compression)
+
+ x_out <- x
+ if (is.data.frame(x) || inherits(x, "RecordBatch")) {
+ x <- Table$create(x)
+ }
+
+ assert_that(is_writable_table(x))
+
+ if (!inherits(sink, "OutputStream")) {
+ sink <- make_output_stream(sink)
+ on.exit(sink$close())
+ }
+ ipc___WriteFeather__Table(sink, x, version, chunk_size, compression, compression_level)
+ invisible(x_out)
+}
+
+#' Read a Feather file
+#'
+#' Feather provides binary columnar serialization for data frames.
+#' It is designed to make reading and writing data frames efficient,
+#' and to make sharing data across data analysis languages easy.
+#' This function reads both the original, limited specification of the format
+#' and the version 2 specification, which is the Apache Arrow IPC file format.
+#'
+#' @inheritParams read_ipc_stream
+#' @inheritParams read_delim_arrow
+#' @param ... additional parameters, passed to [make_readable_file()].
+#'
+#' @return A `data.frame` if `as_data_frame` is `TRUE` (the default), or an
+#' Arrow [Table] otherwise
+#'
+#' @export
+#' @seealso [FeatherReader] and [RecordBatchReader] for lower-level access to reading Arrow IPC data.
+#' @examplesIf arrow_available()
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_feather(mtcars, tf)
+#' df <- read_feather(tf)
+#' dim(df)
+#' # Can select columns
+#' df <- read_feather(tf, col_select = starts_with("d"))
+read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) {
+ if (!inherits(file, "RandomAccessFile")) {
+ file <- make_readable_file(file, ...)
+ on.exit(file$close())
+ }
+ reader <- FeatherReader$create(file)
+
+ col_select <- enquo(col_select)
+ columns <- if (!quo_is_null(col_select)) {
+ vars_select(names(reader), !!col_select)
+ }
+
+ out <- tryCatch(
+ reader$Read(columns),
+ error = read_compressed_error
+ )
+
+ if (isTRUE(as_data_frame)) {
+ out <- as.data.frame(out)
+ }
+ out
+}
+
+#' @title FeatherReader class
+#' @rdname FeatherReader
+#' @name FeatherReader
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description This class enables you to interact with Feather files. Create
+#' one to connect to a file or other InputStream, and call `Read()` on it to
+#' make an `arrow::Table`. See its usage in [`read_feather()`].
+#'
+#' @section Factory:
+#'
+#' The `FeatherReader$create()` factory method instantiates the object and
+#' takes the following argument:
+#'
+#' - `file` an Arrow file connection object inheriting from `RandomAccessFile`.
+#'
+#' @section Methods:
+#'
+#' - `$Read(columns)`: Returns a `Table` of the selected columns, a vector of
+#' integer indices
+#' - `$column_names`: Active binding, returns the column names in the Feather file
+#' - `$schema`: Active binding, returns the schema of the Feather file
+#' - `$version`: Active binding, returns `1` or `2`, according to the Feather
+#' file version
+#'
+#' @export
+#' @include arrow-package.R
+FeatherReader <- R6Class("FeatherReader",
+ inherit = ArrowObject,
+ public = list(
+ Read = function(columns) {
+ ipc___feather___Reader__Read(self, columns)
+ },
+ print = function(...) {
+ cat("FeatherReader:\n")
+ print(self$schema)
+ invisible(self)
+ }
+ ),
+ active = list(
+ # versions are officially 2 for V1 and 3 for V2 :shrug:
+ version = function() ipc___feather___Reader__version(self) - 1L,
+ column_names = function() names(self$schema),
+ schema = function() ipc___feather___Reader__schema(self)
+ )
+)
+
+#' @export
+names.FeatherReader <- function(x) x$column_names
+
+FeatherReader$create <- function(file) {
+ assert_is(file, "RandomAccessFile")
+ ipc___feather___Reader__Open(file)
+}
diff --git a/src/arrow/r/R/field.R b/src/arrow/r/R/field.R
new file mode 100644
index 000000000..d10ee7818
--- /dev/null
+++ b/src/arrow/r/R/field.R
@@ -0,0 +1,84 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-package.R
+#' @title Field class
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @description `field()` lets you create an `arrow::Field` that maps a
+#' [DataType][data-type] to a column name. Fields are contained in
+#' [Schemas][Schema].
+#' @section Methods:
+#'
+#' - `f$ToString()`: convert to a string
+#' - `f$Equals(other)`: test for equality. More naturally called as `f == other`
+#'
+#' @rdname Field
+#' @name Field
+#' @export
+Field <- R6Class("Field",
+ inherit = ArrowObject,
+ public = list(
+ ToString = function() {
+ prettier_dictionary_type(Field__ToString(self))
+ },
+ Equals = function(other, ...) {
+ inherits(other, "Field") && Field__Equals(self, other)
+ },
+ export_to_c = function(ptr) ExportField(self, ptr)
+ ),
+ active = list(
+ name = function() {
+ Field__name(self)
+ },
+ nullable = function() {
+ Field__nullable(self)
+ },
+ type = function() {
+ Field__type(self)
+ }
+ )
+)
+Field$create <- function(name, type, metadata, nullable = TRUE) {
+ assert_that(inherits(name, "character"), length(name) == 1L)
+ type <- as_type(type, name)
+ assert_that(missing(metadata), msg = "metadata= is currently ignored")
+ Field__initialize(enc2utf8(name), type, nullable)
+}
+#' @include arrowExports.R
+Field$import_from_c <- ImportField
+
+#' @param name field name
+#' @param type logical type, instance of [DataType]
+#' @param metadata currently ignored
+#' @param nullable TRUE if field is nullable
+#'
+#' @examplesIf arrow_available()
+#' field("x", int32())
+#' @rdname Field
+#' @export
+field <- Field$create
+
+.fields <- function(.list, nullable = TRUE) {
+ if (length(.list)) {
+ assert_that(!is.null(nms <- names(.list)))
+ map2(nms, .list, field)
+ } else {
+ list()
+ }
+}
diff --git a/src/arrow/r/R/filesystem.R b/src/arrow/r/R/filesystem.R
new file mode 100644
index 000000000..a09d0a51d
--- /dev/null
+++ b/src/arrow/r/R/filesystem.R
@@ -0,0 +1,505 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-package.R
+#' @title FileSystem entry info
+#' @usage NULL
+#' @format NULL
+#'
+#' @section Methods:
+#'
+#' - `base_name()` : The file base name (component after the last directory
+#' separator).
+#' - `extension()` : The file extension
+#'
+#' @section Active bindings:
+#'
+#' - `$type`: The file type
+#' - `$path`: The full file path in the filesystem
+#' - `$size`: The size in bytes, if available. Only regular files are
+#' guaranteed to have a size.
+#' - `$mtime`: The time of last modification, if available.
+#'
+#' @rdname FileInfo
+#' @export
+FileInfo <- R6Class("FileInfo",
+ inherit = ArrowObject,
+ public = list(
+ base_name = function() fs___FileInfo__base_name(self),
+ extension = function() fs___FileInfo__extension(self)
+ ),
+ active = list(
+ type = function(type) {
+ if (missing(type)) {
+ fs___FileInfo__type(self)
+ } else {
+ fs___FileInfo__set_type(self, type)
+ }
+ },
+ path = function(path) {
+ if (missing(path)) {
+ fs___FileInfo__path(self)
+ } else {
+ invisible(fs___FileInfo__set_path(self))
+ }
+ },
+ size = function(size) {
+ if (missing(size)) {
+ fs___FileInfo__size(self)
+ } else {
+ invisible(fs___FileInfo__set_size(self, size))
+ }
+ },
+ mtime = function(time) {
+ if (missing(time)) {
+ fs___FileInfo__mtime(self)
+ } else {
+ if (!inherits(time, "POSIXct") && length(time) == 1L) {
+ abort("invalid time")
+ }
+ invisible(fs___FileInfo__set_mtime(self, time))
+ }
+ }
+ )
+)
+
+#' @title file selector
+#' @format NULL
+#'
+#' @section Factory:
+#'
+#' The `$create()` factory method instantiates a `FileSelector` given the 3 fields
+#' described below.
+#'
+#' @section Fields:
+#'
+#' - `base_dir`: The directory in which to select files. If the path exists but
+#' doesn't point to a directory, this should be an error.
+#' - `allow_not_found`: The behavior if `base_dir` doesn't exist in the
+#' filesystem. If `FALSE`, an error is returned. If `TRUE`, an empty
+#' selection is returned
+#' - `recursive`: Whether to recurse into subdirectories.
+#'
+#' @rdname FileSelector
+#' @export
+FileSelector <- R6Class("FileSelector",
+ inherit = ArrowObject,
+ active = list(
+ base_dir = function() fs___FileSelector__base_dir(self),
+ allow_not_found = function() fs___FileSelector__allow_not_found(self),
+ recursive = function() fs___FileSelector__recursive(self)
+ )
+)
+
+FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = FALSE) {
+ fs___FileSelector__create(clean_path_rel(base_dir), allow_not_found, recursive)
+}
+
+#' @title FileSystem classes
+#' @description `FileSystem` is an abstract file system API,
+#' `LocalFileSystem` is an implementation accessing files
+#' on the local machine. `SubTreeFileSystem` is an implementation that delegates
+#' to another implementation after prepending a fixed base path
+#'
+#' @section Factory:
+#'
+#' `LocalFileSystem$create()` returns the object and takes no arguments.
+#'
+#' `SubTreeFileSystem$create()` takes the following arguments:
+#'
+#' - `base_path`, a string path
+#' - `base_fs`, a `FileSystem` object
+#'
+#' `S3FileSystem$create()` optionally takes arguments:
+#'
+#' - `anonymous`: logical, default `FALSE`. If true, will not attempt to look up
+#' credentials using standard AWS configuration methods.
+#' - `access_key`, `secret_key`: authentication credentials. If one is provided,
+#' the other must be as well. If both are provided, they will override any
+#' AWS configuration set at the environment level.
+#' - `session_token`: optional string for authentication along with
+#' `access_key` and `secret_key`
+#' - `role_arn`: string AWS ARN of an AccessRole. If provided instead of `access_key` and
+#' `secret_key`, temporary credentials will be fetched by assuming this role.
+#' - `session_name`: optional string identifier for the assumed role session.
+#' - `external_id`: optional unique string identifier that might be required
+#' when you assume a role in another account.
+#' - `load_frequency`: integer, frequency (in seconds) with which temporary
+#' credentials from an assumed role session will be refreshed. Default is
+#' 900 (i.e. 15 minutes)
+#' - `region`: AWS region to connect to. If omitted, the AWS library will
+#' provide a sensible default based on client configuration, falling back
+#' to "us-east-1" if no other alternatives are found.
+#' - `endpoint_override`: If non-empty, override region with a connect string
+#' such as "localhost:9000". This is useful for connecting to file systems
+#' that emulate S3.
+#' - `scheme`: S3 connection transport (default "https")
+#' - `background_writes`: logical, whether `OutputStream` writes will be issued
+#' in the background, without blocking (default `TRUE`)
+#'
+#' @section Methods:
+#'
+#' - `$GetFileInfo(x)`: `x` may be a [FileSelector][FileSelector] or a character
+#' vector of paths. Returns a list of [FileInfo][FileInfo]
+#' - `$CreateDir(path, recursive = TRUE)`: Create a directory and subdirectories.
+#' - `$DeleteDir(path)`: Delete a directory and its contents, recursively.
+#' - `$DeleteDirContents(path)`: Delete a directory's contents, recursively.
+#' Like `$DeleteDir()`,
+#' but doesn't delete the directory itself. Passing an empty path (`""`) will
+#' wipe the entire filesystem tree.
+#' - `$DeleteFile(path)` : Delete a file.
+#' - `$DeleteFiles(paths)` : Delete many files. The default implementation
+#' issues individual delete operations in sequence.
+#' - `$Move(src, dest)`: Move / rename a file or directory. If the destination
+#' exists:
+#' if it is a non-empty directory, an error is returned
+#' otherwise, if it has the same type as the source, it is replaced
+#' otherwise, behavior is unspecified (implementation-dependent).
+#' - `$CopyFile(src, dest)`: Copy a file. If the destination exists and is a
+#' directory, an error is returned. Otherwise, it is replaced.
+#' - `$OpenInputStream(path)`: Open an [input stream][InputStream] for
+#' sequential reading.
+#' - `$OpenInputFile(path)`: Open an [input file][RandomAccessFile] for random
+#' access reading.
+#' - `$OpenOutputStream(path)`: Open an [output stream][OutputStream] for
+#' sequential writing.
+#' - `$OpenAppendStream(path)`: Open an [output stream][OutputStream] for
+#' appending.
+#'
+#' @section Active bindings:
+#'
+#' - `$type_name`: string filesystem type name, such as "local", "s3", etc.
+#' - `$region`: string AWS region, for `S3FileSystem` and `SubTreeFileSystem`
+#' containing a `S3FileSystem`
+#' - `$base_fs`: for `SubTreeFileSystem`, the `FileSystem` it contains
+#' - `$base_path`: for `SubTreeFileSystem`, the path in `$base_fs` which is considered
+#' root in this `SubTreeFileSystem`.
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @rdname FileSystem
+#' @name FileSystem
+#' @export
+FileSystem <- R6Class("FileSystem",
+ inherit = ArrowObject,
+ public = list(
+ GetFileInfo = function(x) {
+ if (inherits(x, "FileSelector")) {
+ fs___FileSystem__GetTargetInfos_FileSelector(self, x)
+ } else if (is.character(x)) {
+ fs___FileSystem__GetTargetInfos_Paths(self, clean_path_rel(x))
+ } else {
+ abort("incompatible type for FileSystem$GetFileInfo()")
+ }
+ },
+ CreateDir = function(path, recursive = TRUE) {
+ fs___FileSystem__CreateDir(self, clean_path_rel(path), isTRUE(recursive))
+ },
+ DeleteDir = function(path) {
+ fs___FileSystem__DeleteDir(self, clean_path_rel(path))
+ },
+ DeleteDirContents = function(path) {
+ fs___FileSystem__DeleteDirContents(self, clean_path_rel(path))
+ },
+ DeleteFile = function(path) {
+ fs___FileSystem__DeleteFile(self, clean_path_rel(path))
+ },
+ DeleteFiles = function(paths) {
+ fs___FileSystem__DeleteFiles(self, clean_path_rel(paths))
+ },
+ Move = function(src, dest) {
+ fs___FileSystem__Move(self, clean_path_rel(src), clean_path_rel(dest))
+ },
+ CopyFile = function(src, dest) {
+ fs___FileSystem__CopyFile(self, clean_path_rel(src), clean_path_rel(dest))
+ },
+ OpenInputStream = function(path) {
+ fs___FileSystem__OpenInputStream(self, clean_path_rel(path))
+ },
+ OpenInputFile = function(path) {
+ fs___FileSystem__OpenInputFile(self, clean_path_rel(path))
+ },
+ OpenOutputStream = function(path) {
+ fs___FileSystem__OpenOutputStream(self, clean_path_rel(path))
+ },
+ OpenAppendStream = function(path) {
+ fs___FileSystem__OpenAppendStream(self, clean_path_rel(path))
+ },
+
+ # Friendlier R user interface
+ path = function(x) SubTreeFileSystem$create(x, self),
+ cd = function(x) SubTreeFileSystem$create(x, self),
+ ls = function(path = "", ...) {
+ selector <- FileSelector$create(path, ...) # ... for recursive = TRUE
+ infos <- self$GetFileInfo(selector)
+ map_chr(infos, ~ .$path)
+ # TODO: add full.names argument like base::dir() (default right now is TRUE)
+ # TODO: see fs package for glob/regexp filtering
+ # TODO: verbose method that shows other attributes as df
+ # TODO: print methods for FileInfo, SubTreeFileSystem, S3FileSystem
+ }
+ ),
+ active = list(
+ type_name = function() fs___FileSystem__type_name(self)
+ )
+)
+FileSystem$from_uri <- function(uri) {
+ assert_that(is.string(uri))
+ fs___FileSystemFromUri(uri)
+}
+
+get_paths_and_filesystem <- function(x, filesystem = NULL) {
+ # Wrapper around FileSystem$from_uri that handles local paths
+ # and an optional explicit filesystem
+ if (inherits(x, "SubTreeFileSystem")) {
+ return(list(fs = x$base_fs, path = x$base_path))
+ }
+ assert_that(is.character(x))
+ are_urls <- are_urls(x)
+ if (any(are_urls)) {
+ if (!all(are_urls)) {
+ stop("Vectors of mixed paths and URIs are not supported", call. = FALSE)
+ }
+ if (!is.null(filesystem)) {
+ # Stop? Can't have URL (which yields a fs) and another fs
+ }
+ x <- lapply(x, FileSystem$from_uri)
+ if (length(unique(map(x, ~ class(.$fs)))) > 1) {
+ stop(
+ "Vectors of URIs for different file systems are not supported",
+ call. = FALSE
+ )
+ }
+ fs <- x[[1]]$fs
+ path <- map_chr(x, ~ .$path) # singular name "path" used for compatibility
+ } else {
+ fs <- filesystem %||% LocalFileSystem$create()
+ if (inherits(fs, "LocalFileSystem")) {
+ path <- clean_path_abs(x)
+ } else {
+ path <- clean_path_rel(x)
+ }
+ }
+ list(
+ fs = fs,
+ path = path
+ )
+}
+
+# variant of the above function that asserts that x is either a scalar string
+# or a SubTreeFileSystem
+get_path_and_filesystem <- function(x, filesystem = NULL) {
+ assert_that(is.string(x) || inherits(x, "SubTreeFileSystem"))
+ get_paths_and_filesystem(x, filesystem)
+}
+
+is_url <- function(x) is.string(x) && grepl("://", x)
+are_urls <- function(x) if (!is.character(x)) FALSE else grepl("://", x)
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FileSystem
+#' @export
+LocalFileSystem <- R6Class("LocalFileSystem", inherit = FileSystem)
+LocalFileSystem$create <- function() {
+ fs___LocalFileSystem__create()
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FileSystem
+#' @importFrom utils modifyList
+#' @export
+S3FileSystem <- R6Class("S3FileSystem",
+ inherit = FileSystem,
+ active = list(
+ region = function() fs___S3FileSystem__region(self)
+ )
+)
+S3FileSystem$create <- function(anonymous = FALSE, ...) {
+ args <- list2(...)
+ if (anonymous) {
+ invalid_args <- intersect(
+ c(
+ "access_key", "secret_key", "session_token", "role_arn", "session_name",
+ "external_id", "load_frequency"
+ ),
+ names(args)
+ )
+ if (length(invalid_args)) {
+ stop("Cannot specify ", oxford_paste(invalid_args), " when anonymous = TRUE", call. = FALSE)
+ }
+ } else {
+ keys_present <- length(intersect(c("access_key", "secret_key"), names(args)))
+ if (keys_present == 1) {
+ stop("Key authentication requires both access_key and secret_key", call. = FALSE)
+ }
+ if ("session_token" %in% names(args) && keys_present != 2) {
+ stop(
+ "In order to initialize a session with temporary credentials, ",
+ "both secret_key and access_key must be provided ",
+ "in addition to session_token.",
+ call. = FALSE
+ )
+ }
+ arn <- "role_arn" %in% names(args)
+ if (keys_present == 2 && arn) {
+ stop("Cannot provide both key authentication and role_arn", call. = FALSE)
+ }
+ arn_extras <- intersect(c("session_name", "external_id", "load_frequency"), names(args))
+ if (length(arn_extras) > 0 && !arn) {
+ stop("Cannot specify ", oxford_paste(arn_extras), " without providing a role_arn string", call. = FALSE)
+ }
+ }
+ args <- c(modifyList(default_s3_options, args), anonymous = anonymous)
+ exec(fs___S3FileSystem__create, !!!args)
+}
+
+default_s3_options <- list(
+ access_key = "",
+ secret_key = "",
+ session_token = "",
+ role_arn = "",
+ session_name = "",
+ external_id = "",
+ load_frequency = 900L,
+ region = "",
+ endpoint_override = "",
+ scheme = "",
+ background_writes = TRUE
+)
+
+#' Connect to an AWS S3 bucket
+#'
+#' `s3_bucket()` is a convenience function to create an `S3FileSystem` object
+#' that automatically detects the bucket's AWS region and holding onto the its
+#' relative path.
+#'
+#' @param bucket string S3 bucket name or path
+#' @param ... Additional connection options, passed to `S3FileSystem$create()`
+#' @return A `SubTreeFileSystem` containing an `S3FileSystem` and the bucket's
+#' relative path. Note that this function's success does not guarantee that you
+#' are authorized to access the bucket's contents.
+#' @examplesIf arrow_with_s3()
+#' bucket <- s3_bucket("ursa-labs-taxi-data")
+#' @export
+s3_bucket <- function(bucket, ...) {
+ assert_that(is.string(bucket))
+ args <- list2(...)
+
+ # Use FileSystemFromUri to detect the bucket's region
+ if (!is_url(bucket)) {
+ bucket <- paste0("s3://", bucket)
+ }
+ fs_and_path <- FileSystem$from_uri(bucket)
+ fs <- fs_and_path$fs
+ # If there are no additional S3Options, we can use that filesystem
+ # Otherwise, take the region that was detected and make a new fs with the args
+ if (length(args)) {
+ args$region <- fs$region
+ fs <- exec(S3FileSystem$create, !!!args)
+ }
+ # Return a subtree pointing at that bucket path
+ SubTreeFileSystem$create(fs_and_path$path, fs)
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FileSystem
+#' @export
+SubTreeFileSystem <- R6Class("SubTreeFileSystem",
+ inherit = FileSystem,
+ public = list(
+ print = function(...) {
+ if (inherits(self$base_fs, "LocalFileSystem")) {
+ cat("SubTreeFileSystem: ", "file://", self$base_path, "\n", sep = "")
+ } else if (inherits(self$base_fs, "S3FileSystem")) {
+ cat("SubTreeFileSystem: ", "s3://", self$base_path, "\n", sep = "")
+ } else {
+ cat("SubTreeFileSystem", "\n", sep = "")
+ }
+ invisible(self)
+ }
+ ),
+ active = list(
+ base_fs = function() {
+ fs___SubTreeFileSystem__base_fs(self)
+ },
+ base_path = function() fs___SubTreeFileSystem__base_path(self)
+ )
+)
+SubTreeFileSystem$create <- function(base_path, base_fs = NULL) {
+ fs_and_path <- get_path_and_filesystem(base_path, base_fs)
+ fs___SubTreeFileSystem__create(fs_and_path$path, fs_and_path$fs)
+}
+
+#' @export
+`$.SubTreeFileSystem` <- function(x, name, ...) {
+ # This is to allow delegating methods/properties to the base_fs
+ assert_that(is.string(name))
+ if (name %in% ls(envir = x)) {
+ get(name, x)
+ } else if (name %in% ls(envir = x$base_fs)) {
+ get(name, x$base_fs)
+ } else {
+ NULL
+ }
+}
+
+#' Copy files between FileSystems
+#'
+#' @param from A string path to a local directory or file, a URI, or a
+#' `SubTreeFileSystem`. Files will be copied recursively from this path.
+#' @param to A string path to a local directory or file, a URI, or a
+#' `SubTreeFileSystem`. Directories will be created as necessary
+#' @param chunk_size The maximum size of block to read before flushing
+#' to the destination file. A larger chunk_size will use more memory while
+#' copying but may help accommodate high latency FileSystems.
+#' @return Nothing: called for side effects in the file system
+#' @export
+#' @examplesIf FALSE
+#' # Copy an S3 bucket's files to a local directory:
+#' copy_files("s3://your-bucket-name", "local-directory")
+#' # Using a FileSystem object
+#' copy_files(s3_bucket("your-bucket-name"), "local-directory")
+#' # Or go the other way, from local to S3
+#' copy_files("local-directory", s3_bucket("your-bucket-name"))
+copy_files <- function(from, to, chunk_size = 1024L * 1024L) {
+ from <- get_path_and_filesystem(from)
+ to <- get_path_and_filesystem(to)
+ invisible(fs___CopyFiles(
+ from$fs,
+ FileSelector$create(from$path, recursive = TRUE),
+ to$fs,
+ to$path,
+ chunk_size,
+ option_use_threads()
+ ))
+}
+
+clean_path_abs <- function(path) {
+ # Make sure we have a valid, absolute, forward-slashed path for passing to Arrow
+ normalizePath(path, winslash = "/", mustWork = FALSE)
+}
+
+clean_path_rel <- function(path) {
+ # Make sure all path separators are "/", not "\" as on Windows
+ path_sep <- ifelse(tolower(Sys.info()[["sysname"]]) == "windows", "\\\\", "/")
+ gsub(path_sep, "/", path)
+}
diff --git a/src/arrow/r/R/flight.R b/src/arrow/r/R/flight.R
new file mode 100644
index 000000000..cde297853
--- /dev/null
+++ b/src/arrow/r/R/flight.R
@@ -0,0 +1,124 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Load a Python Flight server
+#'
+#' @param name string Python module name
+#' @param path file system path where the Python module is found. Default is
+#' to look in the `inst/` directory for included modules.
+#' @export
+#' @examplesIf FALSE
+#' load_flight_server("demo_flight_server")
+load_flight_server <- function(name, path = system.file(package = "arrow")) {
+ reticulate::import_from_path(name, path)
+}
+
+#' Connect to a Flight server
+#'
+#' @param host string hostname to connect to
+#' @param port integer port to connect on
+#' @param scheme URL scheme, default is "grpc+tcp"
+#' @return A `pyarrow.flight.FlightClient`.
+#' @export
+flight_connect <- function(host = "localhost", port, scheme = "grpc+tcp") {
+ pa <- reticulate::import("pyarrow")
+ location <- paste0(scheme, "://", host, ":", port)
+ pa$flight$FlightClient(location)
+}
+
+#' Send data to a Flight server
+#'
+#' @param client `pyarrow.flight.FlightClient`, as returned by [flight_connect()]
+#' @param data `data.frame`, [RecordBatch], or [Table] to upload
+#' @param path string identifier to store the data under
+#' @param overwrite logical: if `path` exists on `client` already, should we
+#' replace it with the contents of `data`? Default is `TRUE`; if `FALSE` and
+#' `path` exists, the function will error.
+#' @return `client`, invisibly.
+#' @export
+flight_put <- function(client, data, path, overwrite = TRUE) {
+ if (!overwrite && flight_path_exists(client, path)) {
+ stop(path, " exists.", call. = FALSE)
+ }
+ if (is.data.frame(data)) {
+ data <- Table$create(data)
+ }
+ py_data <- reticulate::r_to_py(data)
+ writer <- client$do_put(descriptor_for_path(path), py_data$schema)[[1]]
+ if (inherits(data, "RecordBatch")) {
+ writer$write_batch(py_data)
+ } else {
+ writer$write_table(py_data)
+ }
+ writer$close()
+ invisible(client)
+}
+
+#' Get data from a Flight server
+#'
+#' @param client `pyarrow.flight.FlightClient`, as returned by [flight_connect()]
+#' @param path string identifier under which data is stored
+#' @return A [Table]
+#' @export
+flight_get <- function(client, path) {
+ reader <- flight_reader(client, path)
+ reader$read_all()
+}
+
+# TODO: could use this as a RecordBatch iterator, call $read_chunk() on this
+flight_reader <- function(client, path) {
+ info <- client$get_flight_info(descriptor_for_path(path))
+ # Hack: assume a single ticket, on the same server as client is already connected
+ ticket <- info$endpoints[[1]]$ticket
+ client$do_get(ticket)
+}
+
+descriptor_for_path <- function(path) {
+ pa <- reticulate::import("pyarrow")
+ pa$flight$FlightDescriptor$for_path(path)
+}
+
+#' See available resources on a Flight server
+#'
+#' @inheritParams flight_get
+#' @return `list_flights()` returns a character vector of paths.
+#' `flight_path_exists()` returns a logical value, the equivalent of `path %in% list_flights()`
+#' @export
+list_flights <- function(client) {
+ generator <- client$list_flights()
+ out <- reticulate::iterate(generator, function(x) as.character(x$descriptor$path[[1]]))
+ out
+}
+
+#' @rdname list_flights
+#' @export
+flight_path_exists <- function(client, path) {
+ it_exists <- tryCatch(
+ expr = {
+ client$get_flight_info(descriptor_for_path(path))
+ TRUE
+ },
+ error = function(e) {
+ msg <- conditionMessage(e)
+ if (!any(grepl("ArrowKeyError", msg))) {
+ # Raise an error if this fails for any reason other than not found
+ stop(e)
+ }
+ FALSE
+ }
+ )
+}
diff --git a/src/arrow/r/R/install-arrow.R b/src/arrow/r/R/install-arrow.R
new file mode 100644
index 000000000..3e295c543
--- /dev/null
+++ b/src/arrow/r/R/install-arrow.R
@@ -0,0 +1,239 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Install or upgrade the Arrow library
+#'
+#' Use this function to install the latest release of `arrow`, to switch to or
+#' from a nightly development version, or on Linux to try reinstalling with
+#' all necessary C++ dependencies.
+#'
+#' Note that, unlike packages like `tensorflow`, `blogdown`, and others that
+#' require external dependencies, you do not need to run `install_arrow()`
+#' after a successful `arrow` installation.
+#'
+#' @param nightly logical: Should we install a development version of the
+#' package, or should we install from CRAN (the default).
+#' @param binary On Linux, value to set for the environment variable
+#' `LIBARROW_BINARY`, which governs how C++ binaries are used, if at all.
+#' The default value, `TRUE`, tells the installation script to detect the
+#' Linux distribution and version and find an appropriate C++ library. `FALSE`
+#' would tell the script not to retrieve a binary and instead build Arrow C++
+#' from source. Other valid values are strings corresponding to a Linux
+#' distribution-version, to override the value that would be detected.
+#' See `vignette("install", package = "arrow")` for further details.
+#' @param use_system logical: Should we use `pkg-config` to look for Arrow
+#' system packages? Default is `FALSE`. If `TRUE`, source installation may be
+#' faster, but there is a risk of version mismatch. This sets the
+#' `ARROW_USE_PKG_CONFIG` environment variable.
+#' @param minimal logical: If building from source, should we build without
+#' optional dependencies (compression libraries, for example)? Default is
+#' `FALSE`. This sets the `LIBARROW_MINIMAL` environment variable.
+#' @param verbose logical: Print more debugging output when installing? Default
+#' is `FALSE`. This sets the `ARROW_R_DEV` environment variable.
+#' @param repos character vector of base URLs of the repositories to install
+#' from (passed to `install.packages()`)
+#' @param ... Additional arguments passed to `install.packages()`
+#' @export
+#' @importFrom utils install.packages
+#' @seealso [arrow_available()] to see if the package was configured with
+#' necessary C++ dependencies. `vignette("install", package = "arrow")` for
+#' more ways to tune installation on Linux.
+install_arrow <- function(nightly = FALSE,
+ binary = Sys.getenv("LIBARROW_BINARY", TRUE),
+ use_system = Sys.getenv("ARROW_USE_PKG_CONFIG", FALSE),
+ minimal = Sys.getenv("LIBARROW_MINIMAL", FALSE),
+ verbose = Sys.getenv("ARROW_R_DEV", FALSE),
+ repos = getOption("repos"),
+ ...) {
+ sysname <- tolower(Sys.info()[["sysname"]])
+ conda <- isTRUE(grepl("conda", R.Version()$platform))
+
+ if (conda) {
+ if (nightly) {
+ system("conda install -y -c arrow-nightlies -c conda-forge --strict-channel-priority r-arrow")
+ } else {
+ system("conda install -y -c conda-forge --strict-channel-priority r-arrow")
+ }
+ } else {
+ Sys.setenv(
+ LIBARROW_BINARY = binary,
+ LIBARROW_MINIMAL = minimal,
+ ARROW_R_DEV = verbose,
+ ARROW_USE_PKG_CONFIG = use_system
+ )
+ # On the M1, we can't use the usual autobrew, which pulls Intel dependencies
+ apple_m1 <- grepl("arm-apple|aarch64.*darwin", R.Version()$platform)
+ # On Rosetta, we have to build without JEMALLOC, so we also can't autobrew
+ rosetta <- identical(sysname, "darwin") && identical(system("sysctl -n sysctl.proc_translated", intern = TRUE), "1")
+ if (rosetta) {
+ Sys.setenv(ARROW_JEMALLOC = "OFF")
+ }
+ if (apple_m1 || rosetta) {
+ Sys.setenv(FORCE_BUNDLED_BUILD = "true")
+ }
+
+ opts <- list()
+ if (apple_m1 || rosetta) {
+ # Skip binaries (esp. for rosetta)
+ opts$pkgType <- "source"
+ } else if (isTRUE(binary)) {
+ # Unless otherwise directed, don't consider newer source packages when
+ # options(pkgType) == "both" (default on win/mac)
+ opts$install.packages.check.source <- "no"
+ opts$install.packages.compile.from.source <- "never"
+ }
+ if (length(opts)) {
+ old <- options(opts)
+ on.exit(options(old))
+ }
+ install.packages("arrow", repos = arrow_repos(repos, nightly), ...)
+ }
+ if ("arrow" %in% loadedNamespaces()) {
+ # If you've just sourced this file, "arrow" won't be (re)loaded
+ reload_arrow()
+ }
+}
+
+arrow_repos <- function(repos = getOption("repos"), nightly = FALSE) {
+ if (length(repos) == 0 || identical(repos, c(CRAN = "@CRAN@"))) {
+ # Set the default/CDN
+ repos <- "https://cloud.r-project.org/"
+ }
+ dev_repo <- getOption("arrow.dev_repo", "https://arrow-r-nightly.s3.amazonaws.com")
+ # Remove it if it's there (so nightly=FALSE won't accidentally pull from it)
+ repos <- setdiff(repos, dev_repo)
+ if (nightly) {
+ # Add it first
+ repos <- c(dev_repo, repos)
+ }
+ repos
+}
+
+reload_arrow <- function() {
+ if (requireNamespace("pkgload", quietly = TRUE)) {
+ is_attached <- "package:arrow" %in% search()
+ pkgload::unload("arrow")
+ if (is_attached) {
+ require("arrow", character.only = TRUE, quietly = TRUE)
+ } else {
+ requireNamespace("arrow", quietly = TRUE)
+ }
+ } else {
+ message("Please restart R to use the 'arrow' package.")
+ }
+}
+
+
+#' Create a source bundle that includes all thirdparty dependencies
+#'
+#' @param dest_file File path for the new tar.gz package. Defaults to
+#' `arrow_V.V.V_with_deps.tar.gz` in the current directory (`V.V.V` is the version)
+#' @param source_file File path for the input tar.gz package. Defaults to
+#' downloading the package from CRAN (or whatever you have set as the first in
+#' `getOption("repos")`)
+#' @return The full path to `dest_file`, invisibly
+#'
+#' This function is used for setting up an offline build. If it's possible to
+#' download at build time, don't use this function. Instead, let `cmake`
+#' download the required dependencies for you.
+#' These downloaded dependencies are only used in the build if
+#' `ARROW_DEPENDENCY_SOURCE` is unset, `BUNDLED`, or `AUTO`.
+#' https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds
+#'
+#' If you're using binary packages you shouldn't need to use this function. You
+#' should download the appropriate binary from your package repository, transfer
+#' that to the offline computer, and install that. Any OS can create the source
+#' bundle, but it cannot be installed on Windows. (Instead, use a standard
+#' Windows binary package.)
+#'
+#' Note if you're using RStudio Package Manager on Linux: If you still want to
+#' make a source bundle with this function, make sure to set the first repo in
+#' `options("repos")` to be a mirror that contains source packages (that is:
+#' something other than the RSPM binary mirror URLs).
+#'
+#' ## Steps for an offline install with optional dependencies:
+#'
+#' ### Using a computer with internet access, pre-download the dependencies:
+#' * Install the `arrow` package _or_ run
+#' `source("https://raw.githubusercontent.com/apache/arrow/master/r/R/install-arrow.R")`
+#' * Run `create_package_with_all_dependencies("my_arrow_pkg.tar.gz")`
+#' * Copy the newly created `my_arrow_pkg.tar.gz` to the computer without internet access
+#'
+#' ### On the computer without internet access, install the prepared package:
+#' * Install the `arrow` package from the copied file
+#' * `install.packages("my_arrow_pkg.tar.gz", dependencies = c("Depends", "Imports", "LinkingTo"))`
+#' * This installation will build from source, so `cmake` must be available
+#' * Run [arrow_info()] to check installed capabilities
+#'
+#'
+#' @examples
+#' \dontrun{
+#' new_pkg <- create_package_with_all_dependencies()
+#' # Note: this works when run in the same R session, but it's meant to be
+#' # copied to a different computer.
+#' install.packages(new_pkg, dependencies = c("Depends", "Imports", "LinkingTo"))
+#' }
+#' @export
+create_package_with_all_dependencies <- function(dest_file = NULL, source_file = NULL) {
+ if (is.null(source_file)) {
+ pkg_download_dir <- tempfile()
+ dir.create(pkg_download_dir)
+ on.exit(unlink(pkg_download_dir, recursive = TRUE), add = TRUE)
+ message("Downloading Arrow source file")
+ downloaded <- utils::download.packages("arrow", destdir = pkg_download_dir, type = "source")
+ source_file <- downloaded[1, 2, drop = TRUE]
+ }
+ if (!file.exists(source_file) || !endsWith(source_file, "tar.gz")) {
+ stop("Arrow package .tar.gz file not found")
+ }
+ if (is.null(dest_file)) {
+ # e.g. convert /path/to/arrow_5.0.0.tar.gz to ./arrow_5.0.0_with_deps.tar.gz
+ # (add 'with_deps' for clarity if the file was downloaded locally)
+ dest_file <- paste0(gsub(".tar.gz$", "", basename(source_file)), "_with_deps.tar.gz")
+ }
+ untar_dir <- tempfile()
+ on.exit(unlink(untar_dir, recursive = TRUE), add = TRUE)
+ utils::untar(source_file, exdir = untar_dir)
+ tools_dir <- file.path(untar_dir, "arrow/tools")
+ download_dependencies_sh <- file.path(tools_dir, "cpp/thirdparty/download_dependencies.sh")
+ # If you change this path, also need to edit nixlibs.R
+ download_dir <- file.path(tools_dir, "thirdparty_dependencies")
+ dir.create(download_dir)
+
+ message("Downloading files to ", download_dir)
+ download_successful <- system2(download_dependencies_sh, download_dir, stdout = FALSE) == 0
+ if (!download_successful) {
+ stop("Failed to download thirdparty dependencies")
+ }
+ # Need to change directory to untar_dir so tar() will use relative paths. That
+ # means we'll need a full, non-relative path for dest_file. (extra_flags="-C"
+ # doesn't work with R's internal tar)
+ orig_wd <- getwd()
+ on.exit(setwd(orig_wd), add = TRUE)
+ # normalizePath() may return the input unchanged if dest_file doesn't exist,
+ # so create it first.
+ file.create(dest_file)
+ dest_file <- normalizePath(dest_file, mustWork = TRUE)
+ setwd(untar_dir)
+
+ message("Repacking tar.gz file to ", dest_file)
+ tar_successful <- utils::tar(dest_file, compression = "gz") == 0
+ if (!tar_successful) {
+ stop("Failed to create new tar.gz file")
+ }
+ invisible(dest_file)
+}
diff --git a/src/arrow/r/R/io.R b/src/arrow/r/R/io.R
new file mode 100644
index 000000000..898b306a3
--- /dev/null
+++ b/src/arrow/r/R/io.R
@@ -0,0 +1,295 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-package.R
+#' @include enums.R
+#' @include buffer.R
+
+# OutputStream ------------------------------------------------------------
+
+Writable <- R6Class("Writable",
+ inherit = ArrowObject,
+ public = list(
+ write = function(x) io___Writable__write(self, buffer(x))
+ )
+)
+
+#' @title OutputStream classes
+#' @description `FileOutputStream` is for writing to a file;
+#' `BufferOutputStream` writes to a buffer;
+#' You can create one and pass it to any of the table writers, for example.
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @section Factory:
+#'
+#' The `$create()` factory methods instantiate the `OutputStream` object and
+#' take the following arguments, depending on the subclass:
+#'
+#' - `path` For `FileOutputStream`, a character file name
+#' - `initial_capacity` For `BufferOutputStream`, the size in bytes of the
+#' buffer.
+#'
+#' @section Methods:
+#'
+#' - `$tell()`: return the position in the stream
+#' - `$close()`: close the stream
+#' - `$write(x)`: send `x` to the stream
+#' - `$capacity()`: for `BufferOutputStream`
+#' - `$finish()`: for `BufferOutputStream`
+#' - `$GetExtentBytesWritten()`: for `MockOutputStream`, report how many bytes
+#' were sent.
+#'
+#' @rdname OutputStream
+#' @name OutputStream
+OutputStream <- R6Class("OutputStream",
+ inherit = Writable,
+ public = list(
+ close = function() io___OutputStream__Close(self),
+ tell = function() io___OutputStream__Tell(self)
+ )
+)
+
+#' @usage NULL
+#' @format NULL
+#' @rdname OutputStream
+#' @export
+FileOutputStream <- R6Class("FileOutputStream", inherit = OutputStream)
+FileOutputStream$create <- function(path) {
+ io___FileOutputStream__Open(clean_path_abs(path))
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname OutputStream
+#' @export
+BufferOutputStream <- R6Class("BufferOutputStream",
+ inherit = OutputStream,
+ public = list(
+ capacity = function() io___BufferOutputStream__capacity(self),
+ finish = function() io___BufferOutputStream__Finish(self),
+ write = function(bytes) io___BufferOutputStream__Write(self, bytes),
+ tell = function() io___BufferOutputStream__Tell(self)
+ )
+)
+BufferOutputStream$create <- function(initial_capacity = 0L) {
+ io___BufferOutputStream__Create(initial_capacity)
+}
+
+# InputStream -------------------------------------------------------------
+
+
+Readable <- R6Class("Readable",
+ inherit = ArrowObject,
+ public = list(
+ Read = function(nbytes) io___Readable__Read(self, nbytes)
+ )
+)
+
+#' @title InputStream classes
+#' @description `RandomAccessFile` inherits from `InputStream` and is a base
+#' class for: `ReadableFile` for reading from a file; `MemoryMappedFile` for
+#' the same but with memory mapping; and `BufferReader` for reading from a
+#' buffer. Use these with the various table readers.
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @section Factory:
+#'
+#' The `$create()` factory methods instantiate the `InputStream` object and
+#' take the following arguments, depending on the subclass:
+#'
+#' - `path` For `ReadableFile`, a character file name
+#' - `x` For `BufferReader`, a [Buffer] or an object that can be
+#' made into a buffer via `buffer()`.
+#'
+#' To instantiate a `MemoryMappedFile`, call [mmap_open()].
+#'
+#' @section Methods:
+#'
+#' - `$GetSize()`:
+#' - `$supports_zero_copy()`: Logical
+#' - `$seek(position)`: go to that position in the stream
+#' - `$tell()`: return the position in the stream
+#' - `$close()`: close the stream
+#' - `$Read(nbytes)`: read data from the stream, either a specified `nbytes` or
+#' all, if `nbytes` is not provided
+#' - `$ReadAt(position, nbytes)`: similar to `$seek(position)$Read(nbytes)`
+#' - `$Resize(size)`: for a `MemoryMappedFile` that is writeable
+#'
+#' @rdname InputStream
+#' @name InputStream
+InputStream <- R6Class("InputStream",
+ inherit = Readable,
+ public = list(
+ close = function() io___InputStream__Close(self)
+ )
+)
+
+#' @usage NULL
+#' @format NULL
+#' @rdname InputStream
+#' @export
+RandomAccessFile <- R6Class("RandomAccessFile",
+ inherit = InputStream,
+ public = list(
+ GetSize = function() io___RandomAccessFile__GetSize(self),
+ supports_zero_copy = function() io___RandomAccessFile__supports_zero_copy(self),
+ seek = function(position) io___RandomAccessFile__Seek(self, position),
+ tell = function() io___RandomAccessFile__Tell(self),
+ Read = function(nbytes = NULL) {
+ if (is.null(nbytes)) {
+ io___RandomAccessFile__Read0(self)
+ } else {
+ io___Readable__Read(self, nbytes)
+ }
+ },
+ ReadAt = function(position, nbytes = NULL) {
+ if (is.null(nbytes)) {
+ nbytes <- self$GetSize() - position
+ }
+ io___RandomAccessFile__ReadAt(self, position, nbytes)
+ }
+ )
+)
+
+#' @usage NULL
+#' @format NULL
+#' @rdname InputStream
+#' @export
+MemoryMappedFile <- R6Class("MemoryMappedFile",
+ inherit = RandomAccessFile,
+ public = list(
+ Resize = function(size) io___MemoryMappedFile__Resize(self, size)
+ )
+)
+
+#' @usage NULL
+#' @format NULL
+#' @rdname InputStream
+#' @export
+ReadableFile <- R6Class("ReadableFile", inherit = RandomAccessFile)
+ReadableFile$create <- function(path) {
+ io___ReadableFile__Open(clean_path_abs(path))
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname InputStream
+#' @export
+BufferReader <- R6Class("BufferReader", inherit = RandomAccessFile)
+BufferReader$create <- function(x) {
+ x <- buffer(x)
+ io___BufferReader__initialize(x)
+}
+
+#' Create a new read/write memory mapped file of a given size
+#'
+#' @param path file path
+#' @param size size in bytes
+#'
+#' @return a [arrow::io::MemoryMappedFile][MemoryMappedFile]
+#'
+#' @export
+mmap_create <- function(path, size) {
+ path <- clean_path_abs(path)
+ io___MemoryMappedFile__Create(path, size)
+}
+
+#' Open a memory mapped file
+#'
+#' @param path file path
+#' @param mode file mode (read/write/readwrite)
+#'
+#' @export
+mmap_open <- function(path, mode = c("read", "write", "readwrite")) {
+ mode <- match(match.arg(mode), c("read", "write", "readwrite")) - 1L
+ path <- clean_path_abs(path)
+ io___MemoryMappedFile__Open(path, mode)
+}
+
+#' Handle a range of possible input sources
+#' @param file A character file name, `raw` vector, or an Arrow input stream
+#' @param mmap Logical: whether to memory-map the file (default `TRUE`)
+#' @param compression If the file is compressed, created a [CompressedInputStream]
+#' with this compression codec, either a [Codec] or the string name of one.
+#' If `NULL` (default) and `file` is a string file name, the function will try
+#' to infer compression from the file extension.
+#' @param filesystem If not `NULL`, `file` will be opened via the
+#' `filesystem$OpenInputFile()` filesystem method, rather than the `io` module's
+#' `MemoryMappedFile` or `ReadableFile` constructors.
+#' @return An `InputStream` or a subclass of one.
+#' @keywords internal
+make_readable_file <- function(file, mmap = TRUE, compression = NULL, filesystem = NULL) {
+ if (inherits(file, "SubTreeFileSystem")) {
+ filesystem <- file$base_fs
+ file <- file$base_path
+ }
+ if (is.string(file)) {
+ if (is_url(file)) {
+ fs_and_path <- FileSystem$from_uri(file)
+ filesystem <- fs_and_path$fs
+ file <- fs_and_path$path
+ }
+ if (is.null(compression)) {
+ # Infer compression from the file path
+ compression <- detect_compression(file)
+ }
+ if (!is.null(filesystem)) {
+ file <- filesystem$OpenInputFile(file)
+ } else if (isTRUE(mmap)) {
+ file <- mmap_open(file)
+ } else {
+ file <- ReadableFile$create(file)
+ }
+ if (!identical(compression, "uncompressed")) {
+ file <- CompressedInputStream$create(file, compression)
+ }
+ } else if (inherits(file, c("raw", "Buffer"))) {
+ file <- BufferReader$create(file)
+ }
+ assert_is(file, "InputStream")
+ file
+}
+
+make_output_stream <- function(x, filesystem = NULL) {
+ if (inherits(x, "SubTreeFileSystem")) {
+ filesystem <- x$base_fs
+ x <- x$base_path
+ } else if (is_url(x)) {
+ fs_and_path <- FileSystem$from_uri(x)
+ filesystem <- fs_and_path$fs
+ x <- fs_and_path$path
+ }
+ assert_that(is.string(x))
+ if (is.null(filesystem)) {
+ FileOutputStream$create(x)
+ } else {
+ filesystem$OpenOutputStream(x)
+ }
+}
+
+detect_compression <- function(path) {
+ assert_that(is.string(path))
+ switch(tools::file_ext(path),
+ bz2 = "bz2",
+ gz = "gzip",
+ lz4 = "lz4",
+ zst = "zstd",
+ "uncompressed"
+ )
+}
diff --git a/src/arrow/r/R/ipc_stream.R b/src/arrow/r/R/ipc_stream.R
new file mode 100644
index 000000000..c45d1de6e
--- /dev/null
+++ b/src/arrow/r/R/ipc_stream.R
@@ -0,0 +1,123 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Write Arrow IPC stream format
+#'
+#' Apache Arrow defines two formats for [serializing data for interprocess
+#' communication
+#' (IPC)](https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc):
+#' a "stream" format and a "file" format, known as Feather. `write_ipc_stream()`
+#' and [write_feather()] write those formats, respectively.
+#'
+#' `write_arrow()`, a wrapper around `write_ipc_stream()` and `write_feather()`
+#' with some nonstandard behavior, is deprecated. You should explicitly choose
+#' the function that will write the desired IPC format (stream or file) since
+#' either can be written to a file or `OutputStream`.
+#'
+#' @inheritParams write_feather
+#' @param ... extra parameters passed to `write_feather()`.
+#'
+#' @return `x`, invisibly.
+#' @seealso [write_feather()] for writing IPC files. [write_to_raw()] to
+#' serialize data to a buffer.
+#' [RecordBatchWriter] for a lower-level interface.
+#' @export
+#' @examplesIf arrow_available()
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_ipc_stream(mtcars, tf)
+write_ipc_stream <- function(x, sink, ...) {
+ x_out <- x # So we can return the data we got
+ if (is.data.frame(x)) {
+ x <- Table$create(x)
+ }
+ assert_that(is_writable_table(x))
+ if (!inherits(sink, "OutputStream")) {
+ sink <- make_output_stream(sink)
+ on.exit(sink$close())
+ }
+
+ writer <- RecordBatchStreamWriter$create(sink, x$schema)
+ writer$write(x)
+ writer$close()
+ invisible(x_out)
+}
+
+#' Write Arrow data to a raw vector
+#'
+#' [write_ipc_stream()] and [write_feather()] write data to a sink and return
+#' the data (`data.frame`, `RecordBatch`, or `Table`) they were given.
+#' This function wraps those so that you can serialize data to a buffer and
+#' access that buffer as a `raw` vector in R.
+#' @inheritParams write_feather
+#' @param format one of `c("stream", "file")`, indicating the IPC format to use
+#' @return A `raw` vector containing the bytes of the IPC serialized data.
+#' @examplesIf arrow_available()
+#' # The default format is "stream"
+#' mtcars_raw <- write_to_raw(mtcars)
+#' @export
+write_to_raw <- function(x, format = c("stream", "file")) {
+ sink <- BufferOutputStream$create()
+ if (match.arg(format) == "stream") {
+ write_ipc_stream(x, sink)
+ } else {
+ write_feather(x, sink)
+ }
+ as.raw(buffer(sink))
+}
+
+#' Read Arrow IPC stream format
+#'
+#' Apache Arrow defines two formats for [serializing data for interprocess
+#' communication
+#' (IPC)](https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc):
+#' a "stream" format and a "file" format, known as Feather. `read_ipc_stream()`
+#' and [read_feather()] read those formats, respectively.
+#'
+#' `read_arrow()`, a wrapper around `read_ipc_stream()` and `read_feather()`,
+#' is deprecated. You should explicitly choose
+#' the function that will read the desired IPC format (stream or file) since
+#' a file or `InputStream` may contain either.
+#'
+#' @param file A character file name or URI, `raw` vector, an Arrow input stream,
+#' or a `FileSystem` with path (`SubTreeFileSystem`).
+#' If a file name or URI, an Arrow [InputStream] will be opened and
+#' closed when finished. If an input stream is provided, it will be left
+#' open.
+#' @param as_data_frame Should the function return a `data.frame` (default) or
+#' an Arrow [Table]?
+#' @param ... extra parameters passed to `read_feather()`.
+#'
+#' @return A `data.frame` if `as_data_frame` is `TRUE` (the default), or an
+#' Arrow [Table] otherwise
+#' @seealso [read_feather()] for writing IPC files. [RecordBatchReader] for a
+#' lower-level interface.
+#' @export
+read_ipc_stream <- function(file, as_data_frame = TRUE, ...) {
+ if (!inherits(file, "InputStream")) {
+ file <- make_readable_file(file)
+ on.exit(file$close())
+ }
+
+ # TODO: this could take col_select, like the other readers
+ # https://issues.apache.org/jira/browse/ARROW-6830
+ out <- RecordBatchStreamReader$create(file)$read_table()
+ if (as_data_frame) {
+ out <- as.data.frame(out)
+ }
+ out
+}
diff --git a/src/arrow/r/R/json.R b/src/arrow/r/R/json.R
new file mode 100644
index 000000000..0d54c8a8a
--- /dev/null
+++ b/src/arrow/r/R/json.R
@@ -0,0 +1,102 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Read a JSON file
+#'
+#' Using [JsonTableReader]
+#'
+#' @inheritParams read_delim_arrow
+#' @param schema [Schema] that describes the table.
+#' @param ... Additional options passed to `JsonTableReader$create()`
+#'
+#' @return A `data.frame`, or a Table if `as_data_frame = FALSE`.
+#' @export
+#' @examplesIf arrow_with_json()
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' writeLines('
+#' { "hello": 3.5, "world": false, "yo": "thing" }
+#' { "hello": 3.25, "world": null }
+#' { "hello": 0.0, "world": true, "yo": null }
+#' ', tf, useBytes = TRUE)
+#' df <- read_json_arrow(tf)
+read_json_arrow <- function(file,
+ col_select = NULL,
+ as_data_frame = TRUE,
+ schema = NULL,
+ ...) {
+ if (!inherits(file, "InputStream")) {
+ file <- make_readable_file(file)
+ on.exit(file$close())
+ }
+ tab <- JsonTableReader$create(file, schema = schema, ...)$Read()
+
+ col_select <- enquo(col_select)
+ if (!quo_is_null(col_select)) {
+ tab <- tab[vars_select(names(tab), !!col_select)]
+ }
+
+ if (isTRUE(as_data_frame)) {
+ tab <- as.data.frame(tab)
+ }
+ tab
+}
+
+#' @include arrow-package.R
+#' @rdname CsvTableReader
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @export
+JsonTableReader <- R6Class("JsonTableReader",
+ inherit = ArrowObject,
+ public = list(
+ Read = function() json___TableReader__Read(self)
+ )
+)
+JsonTableReader$create <- function(file,
+ read_options = JsonReadOptions$create(),
+ parse_options = JsonParseOptions$create(schema = schema),
+ schema = NULL,
+ ...) {
+ assert_is(file, "InputStream")
+ json___TableReader__Make(file, read_options, parse_options)
+}
+
+#' @rdname CsvReadOptions
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @export
+JsonReadOptions <- R6Class("JsonReadOptions", inherit = ArrowObject)
+JsonReadOptions$create <- function(use_threads = option_use_threads(), block_size = 1048576L) {
+ json___ReadOptions__initialize(use_threads, block_size)
+}
+
+#' @rdname CsvReadOptions
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @export
+JsonParseOptions <- R6Class("JsonParseOptions", inherit = ArrowObject)
+JsonParseOptions$create <- function(newlines_in_values = FALSE, schema = NULL) {
+ if (is.null(schema)) {
+ json___ParseOptions__initialize1(newlines_in_values)
+ } else {
+ json___ParseOptions__initialize2(newlines_in_values, schema)
+ }
+}
diff --git a/src/arrow/r/R/memory-pool.R b/src/arrow/r/R/memory-pool.R
new file mode 100644
index 000000000..2207ed6be
--- /dev/null
+++ b/src/arrow/r/R/memory-pool.R
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-package.R
+#'
+#' @title class arrow::MemoryPool
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' - `backend_name`: one of "jemalloc", "mimalloc", or "system". Alternative
+#' memory allocators are optionally enabled at build time. Windows builds
+#' generally have `mimalloc`, and most others have both `jemalloc` (used by
+#' default) and `mimalloc`. To change memory allocators at runtime, set the
+#' environment variable `ARROW_DEFAULT_MEMORY_POOL` to one of those strings
+#' prior to loading the `arrow` library.
+#' - `bytes_allocated`
+#' - `max_memory`
+#'
+#' @rdname MemoryPool
+#' @name MemoryPool
+#' @keywords internal
+MemoryPool <- R6Class("MemoryPool",
+ inherit = ArrowObject,
+ public = list(
+ # TODO: Allocate
+ # TODO: Reallocate
+ # TODO: Free
+ ),
+ active = list(
+ backend_name = function() MemoryPool__backend_name(self),
+ bytes_allocated = function() MemoryPool__bytes_allocated(self),
+ max_memory = function() MemoryPool__max_memory(self)
+ )
+)
+
+#' Arrow's default [MemoryPool]
+#'
+#' @return the default [MemoryPool]
+#' @export
+#' @keywords internal
+default_memory_pool <- function() {
+ MemoryPool__default()
+}
diff --git a/src/arrow/r/R/message.R b/src/arrow/r/R/message.R
new file mode 100644
index 000000000..ef33f1623
--- /dev/null
+++ b/src/arrow/r/R/message.R
@@ -0,0 +1,97 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-package.R
+
+#' @title class arrow::Message
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname Message
+#' @name Message
+Message <- R6Class("Message",
+ inherit = ArrowObject,
+ public = list(
+ Equals = function(other, ...) {
+ inherits(other, "Message") && ipc___Message__Equals(self, other)
+ },
+ body_length = function() ipc___Message__body_length(self),
+ Verify = function() ipc___Message__Verify(self)
+ ),
+ active = list(
+ type = function() ipc___Message__type(self),
+ metadata = function() ipc___Message__metadata(self),
+ body = function() ipc___Message__body(self)
+ )
+)
+
+#' @title class arrow::MessageReader
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname MessageReader
+#' @name MessageReader
+#' @export
+MessageReader <- R6Class("MessageReader",
+ inherit = ArrowObject,
+ public = list(
+ ReadNextMessage = function() ipc___MessageReader__ReadNextMessage(self)
+ )
+)
+
+MessageReader$create <- function(stream) {
+ if (!inherits(stream, "InputStream")) {
+ stream <- BufferReader$create(stream)
+ }
+ ipc___MessageReader__Open(stream)
+}
+
+#' Read a Message from a stream
+#'
+#' @param stream an InputStream
+#'
+#' @export
+read_message <- function(stream) {
+ UseMethod("read_message")
+}
+
+#' @export
+read_message.default <- function(stream) {
+ read_message(BufferReader$create(stream))
+}
+
+#' @export
+read_message.InputStream <- function(stream) {
+ ipc___ReadMessage(stream)
+}
+
+#' @export
+read_message.MessageReader <- function(stream) {
+ stream$ReadNextMessage()
+}
diff --git a/src/arrow/r/R/metadata.R b/src/arrow/r/R/metadata.R
new file mode 100644
index 000000000..768abeda7
--- /dev/null
+++ b/src/arrow/r/R/metadata.R
@@ -0,0 +1,210 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @importFrom utils object.size
+.serialize_arrow_r_metadata <- function(x) {
+ assert_is(x, "list")
+
+ # drop problems attributes (most likely from readr)
+ x[["attributes"]][["problems"]] <- NULL
+
+ out <- serialize(x, NULL, ascii = TRUE)
+
+ # if the metadata is over 100 kB, compress
+ if (option_compress_metadata() && object.size(out) > 100000) {
+ out_comp <- serialize(memCompress(out, type = "gzip"), NULL, ascii = TRUE)
+
+ # but ensure that the compression+serialization is effective.
+ if (object.size(out) > object.size(out_comp)) out <- out_comp
+ }
+
+ rawToChar(out)
+}
+
+.unserialize_arrow_r_metadata <- function(x) {
+ tryCatch(
+ expr = {
+ out <- unserialize(charToRaw(x))
+
+ # if this is still raw, try decompressing
+ if (is.raw(out)) {
+ out <- unserialize(memDecompress(out, type = "gzip"))
+ }
+ out
+ },
+ error = function(e) {
+ warning("Invalid metadata$r", call. = FALSE)
+ NULL
+ }
+ )
+}
+
+#' @importFrom rlang trace_back
+apply_arrow_r_metadata <- function(x, r_metadata) {
+ tryCatch(
+ expr = {
+ columns_metadata <- r_metadata$columns
+ if (is.data.frame(x)) {
+ if (length(names(x)) && !is.null(columns_metadata)) {
+ for (name in intersect(names(columns_metadata), names(x))) {
+ x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]])
+ }
+ }
+ } else if (is.list(x) && !inherits(x, "POSIXlt") && !is.null(columns_metadata)) {
+ # If we have a list and "columns_metadata" this applies row-level metadata
+ # inside of a column in a dataframe.
+
+ # However, if we are inside of a dplyr collection (including all datasets),
+ # we cannot apply this row-level metadata, since the order of the rows is
+ # not guaranteed to be the same, so don't even try, but warn what's going on
+ trace <- trace_back()
+ # TODO: remove `trace$calls %||% trace$call` once rlang > 0.4.11 is released
+ in_dplyr_collect <- any(map_lgl(trace$calls %||% trace$call, function(x) {
+ grepl("collect.arrow_dplyr_query", x, fixed = TRUE)[[1]]
+ }))
+ if (in_dplyr_collect) {
+ warning(
+ "Row-level metadata is not compatible with this operation and has ",
+ "been ignored",
+ call. = FALSE
+ )
+ } else {
+ x <- map2(x, columns_metadata, function(.x, .y) {
+ apply_arrow_r_metadata(.x, .y)
+ })
+ }
+ x
+ }
+
+ if (!is.null(r_metadata$attributes)) {
+ attributes(x)[names(r_metadata$attributes)] <- r_metadata$attributes
+ if (inherits(x, "POSIXlt")) {
+ # We store POSIXlt as a StructArray, which is translated back to R
+ # as a data.frame, but while data frames have a row.names = c(NA, nrow(x))
+ # attribute, POSIXlt does not, so since this is now no longer an object
+ # of class data.frame, remove the extraneous attribute
+ attr(x, "row.names") <- NULL
+ }
+ if (!is.null(attr(x, ".group_vars")) && requireNamespace("dplyr", quietly = TRUE)) {
+ x <- dplyr::group_by(x, !!!syms(attr(x, ".group_vars")))
+ attr(x, ".group_vars") <- NULL
+ }
+ }
+ },
+ error = function(e) {
+ warning("Invalid metadata$r", call. = FALSE)
+ }
+ )
+ x
+}
+
+remove_attributes <- function(x) {
+ removed_attributes <- character()
+ if (identical(class(x), c("tbl_df", "tbl", "data.frame"))) {
+ removed_attributes <- c("class", "row.names", "names")
+ } else if (inherits(x, "data.frame")) {
+ removed_attributes <- c("row.names", "names")
+ } else if (inherits(x, "factor")) {
+ removed_attributes <- c("class", "levels")
+ } else if (inherits(x, c("integer64", "Date", "arrow_binary", "arrow_large_binary"))) {
+ removed_attributes <- c("class")
+ } else if (inherits(x, "arrow_fixed_size_binary")) {
+ removed_attributes <- c("class", "byte_width")
+ } else if (inherits(x, "POSIXct")) {
+ removed_attributes <- c("class", "tzone")
+ } else if (inherits(x, "hms") || inherits(x, "difftime")) {
+ removed_attributes <- c("class", "units")
+ }
+ removed_attributes
+}
+
+arrow_attributes <- function(x, only_top_level = FALSE) {
+ if (inherits(x, "grouped_df")) {
+ # Keep only the group var names, not the rest of the cached data that dplyr
+ # uses, which may be large
+ if (requireNamespace("dplyr", quietly = TRUE)) {
+ gv <- dplyr::group_vars(x)
+ x <- dplyr::ungroup(x)
+ # ungroup() first, then set attribute, bc ungroup() would erase it
+ attr(x, ".group_vars") <- gv
+ } else {
+ # Regardless, we shouldn't keep groups around
+ attr(x, "groups") <- NULL
+ }
+ }
+ att <- attributes(x)
+
+ removed_attributes <- remove_attributes(x)
+
+ att <- att[setdiff(names(att), removed_attributes)]
+ if (isTRUE(only_top_level)) {
+ return(att)
+ }
+
+ if (is.data.frame(x)) {
+ columns <- map(x, arrow_attributes)
+ out <- if (length(att) || !all(map_lgl(columns, is.null))) {
+ list(attributes = att, columns = columns)
+ }
+ return(out)
+ }
+
+ columns <- NULL
+ attempt_to_save_row_level <- getOption("arrow.preserve_row_level_metadata", FALSE) &&
+ is.list(x) && !inherits(x, "POSIXlt")
+ if (attempt_to_save_row_level) {
+ # However, if we are inside of a dplyr collection (including all datasets),
+ # we cannot apply this row-level metadata, since the order of the rows is
+ # not guaranteed to be the same, so don't even try, but warn what's going on
+ trace <- trace_back()
+ # TODO: remove `trace$calls %||% trace$call` once rlang > 0.4.11 is released
+ in_dataset_write <- any(map_lgl(trace$calls %||% trace$call, function(x) {
+ grepl("write_dataset", x, fixed = TRUE)[[1]]
+ }))
+ if (in_dataset_write) {
+ warning(
+ "Row-level metadata is not compatible with datasets and will be discarded",
+ call. = FALSE
+ )
+ } else {
+ # for list columns, we also keep attributes of each
+ # element in columns
+ columns <- map(x, arrow_attributes)
+ }
+ if (all(map_lgl(columns, is.null))) {
+ columns <- NULL
+ }
+ } else if (inherits(x, c("sfc", "sf"))) {
+ # Check if there are any columns that look like sf columns, warn that we will
+ # not be saving this data for now (but only if arrow.preserve_row_level_metadata
+ # is set to FALSE)
+ warning(
+ "One of the columns given appears to be an `sfc` SF column. Due to their unique ",
+ "nature, these columns do not convert to Arrow well. We are working on ",
+ "better ways to do this, but in the interim we recommend converting any `sfc` ",
+ "columns to WKB (well-known binary) columns before using them with Arrow ",
+ "(for example, with `sf::st_as_binary(col)`).",
+ call. = FALSE
+ )
+ }
+
+ if (length(att) || !is.null(columns)) {
+ list(attributes = att, columns = columns)
+ } else {
+ NULL
+ }
+}
diff --git a/src/arrow/r/R/parquet.R b/src/arrow/r/R/parquet.R
new file mode 100644
index 000000000..ee2ed57de
--- /dev/null
+++ b/src/arrow/r/R/parquet.R
@@ -0,0 +1,585 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Read a Parquet file
+#'
+#' '[Parquet](https://parquet.apache.org/)' is a columnar storage file format.
+#' This function enables you to read Parquet files into R.
+#'
+#' @inheritParams read_feather
+#' @param props [ParquetArrowReaderProperties]
+#' @param ... Additional arguments passed to `ParquetFileReader$create()`
+#'
+#' @return A [arrow::Table][Table], or a `data.frame` if `as_data_frame` is
+#' `TRUE` (the default).
+#' @examplesIf arrow_with_parquet()
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_parquet(mtcars, tf)
+#' df <- read_parquet(tf, col_select = starts_with("d"))
+#' head(df)
+#' @export
+read_parquet <- function(file,
+ col_select = NULL,
+ as_data_frame = TRUE,
+ props = ParquetArrowReaderProperties$create(),
+ ...) {
+ if (is.string(file)) {
+ file <- make_readable_file(file)
+ on.exit(file$close())
+ }
+ reader <- ParquetFileReader$create(file, props = props, ...)
+
+ col_select <- enquo(col_select)
+ if (!quo_is_null(col_select)) {
+ # infer which columns to keep from schema
+ schema <- reader$GetSchema()
+ names <- names(schema)
+ indices <- match(vars_select(names, !!col_select), names) - 1L
+ tab <- tryCatch(
+ reader$ReadTable(indices),
+ error = read_compressed_error
+ )
+ } else {
+ # read all columns
+ tab <- tryCatch(
+ reader$ReadTable(),
+ error = read_compressed_error
+ )
+ }
+
+ if (as_data_frame) {
+ tab <- as.data.frame(tab)
+ }
+ tab
+}
+
+#' Write Parquet file to disk
+#'
+#' [Parquet](https://parquet.apache.org/) is a columnar storage file format.
+#' This function enables you to write Parquet files from R.
+#'
+#' Due to features of the format, Parquet files cannot be appended to.
+#' If you want to use the Parquet format but also want the ability to extend
+#' your dataset, you can write to additional Parquet files and then treat
+#' the whole directory of files as a [Dataset] you can query.
+#' See `vignette("dataset", package = "arrow")` for examples of this.
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param chunk_size chunk size in number of rows. If NULL, the total number of rows is used.
+#' @param version parquet version, "1.0" or "2.0". Default "1.0". Numeric values
+#' are coerced to character.
+#' @param compression compression algorithm. Default "snappy". See details.
+#' @param compression_level compression level. Meaning depends on compression algorithm
+#' @param use_dictionary Specify if we should use dictionary encoding. Default `TRUE`
+#' @param write_statistics Specify if we should write statistics. Default `TRUE`
+#' @param data_page_size Set a target threshold for the approximate encoded
+#' size of data pages within a column chunk (in bytes). Default 1 MiB.
+#' @param use_deprecated_int96_timestamps Write timestamps to INT96 Parquet format. Default `FALSE`.
+#' @param coerce_timestamps Cast timestamps a particular resolution. Can be
+#' `NULL`, "ms" or "us". Default `NULL` (no casting)
+#' @param allow_truncated_timestamps Allow loss of data when coercing timestamps to a
+#' particular resolution. E.g. if microsecond or nanosecond data is lost when coercing
+#' to "ms", do not raise an exception
+#' @param properties A `ParquetWriterProperties` object, used instead of the options
+#' enumerated in this function's signature. Providing `properties` as an argument
+#' is deprecated; if you need to assemble `ParquetWriterProperties` outside
+#' of `write_parquet()`, use `ParquetFileWriter` instead.
+#' @param arrow_properties A `ParquetArrowWriterProperties` object. Like
+#' `properties`, this argument is deprecated.
+#'
+#' @details The parameters `compression`, `compression_level`, `use_dictionary` and
+#' `write_statistics` support various patterns:
+#'
+#' - The default `NULL` leaves the parameter unspecified, and the C++ library
+#' uses an appropriate default for each column (defaults listed above)
+#' - A single, unnamed, value (e.g. a single string for `compression`) applies to all columns
+#' - An unnamed vector, of the same size as the number of columns, to specify a
+#' value for each column, in positional order
+#' - A named vector, to specify the value for the named columns, the default
+#' value for the setting is used when not supplied
+#'
+#' The `compression` argument can be any of the following (case insensitive):
+#' "uncompressed", "snappy", "gzip", "brotli", "zstd", "lz4", "lzo" or "bz2".
+#' Only "uncompressed" is guaranteed to be available, but "snappy" and "gzip"
+#' are almost always included. See [codec_is_available()].
+#' The default "snappy" is used if available, otherwise "uncompressed". To
+#' disable compression, set `compression = "uncompressed"`.
+#' Note that "uncompressed" columns may still have dictionary encoding.
+#'
+#' @return the input `x` invisibly.
+#'
+#' @examplesIf arrow_with_parquet()
+#' tf1 <- tempfile(fileext = ".parquet")
+#' write_parquet(data.frame(x = 1:5), tf1)
+#'
+#' # using compression
+#' if (codec_is_available("gzip")) {
+#' tf2 <- tempfile(fileext = ".gz.parquet")
+#' write_parquet(data.frame(x = 1:5), tf2, compression = "gzip", compression_level = 5)
+#' }
+#' @export
+write_parquet <- function(x,
+ sink,
+ chunk_size = NULL,
+ # writer properties
+ version = NULL,
+ compression = default_parquet_compression(),
+ compression_level = NULL,
+ use_dictionary = NULL,
+ write_statistics = NULL,
+ data_page_size = NULL,
+ # arrow writer properties
+ use_deprecated_int96_timestamps = FALSE,
+ coerce_timestamps = NULL,
+ allow_truncated_timestamps = FALSE,
+ properties = NULL,
+ arrow_properties = NULL) {
+ x_out <- x
+
+ if (is.data.frame(x) || inherits(x, "RecordBatch")) {
+ x <- Table$create(x)
+ }
+
+ assert_that(is_writable_table(x))
+
+ if (!inherits(sink, "OutputStream")) {
+ sink <- make_output_stream(sink)
+ on.exit(sink$close())
+ }
+
+ # Deprecation warnings
+ if (!is.null(properties)) {
+ warning(
+ "Providing 'properties' is deprecated. If you need to assemble properties outside ",
+ "this function, use ParquetFileWriter instead."
+ )
+ }
+ if (!is.null(arrow_properties)) {
+ warning(
+ "Providing 'arrow_properties' is deprecated. If you need to assemble arrow_properties ",
+ "outside this function, use ParquetFileWriter instead."
+ )
+ }
+
+ writer <- ParquetFileWriter$create(
+ x$schema,
+ sink,
+ properties = properties %||% ParquetWriterProperties$create(
+ x,
+ version = version,
+ compression = compression,
+ compression_level = compression_level,
+ use_dictionary = use_dictionary,
+ write_statistics = write_statistics,
+ data_page_size = data_page_size
+ ),
+ arrow_properties = arrow_properties %||% ParquetArrowWriterProperties$create(
+ use_deprecated_int96_timestamps = use_deprecated_int96_timestamps,
+ coerce_timestamps = coerce_timestamps,
+ allow_truncated_timestamps = allow_truncated_timestamps
+ )
+ )
+ writer$WriteTable(x, chunk_size = chunk_size %||% x$num_rows)
+ writer$Close()
+
+ invisible(x_out)
+}
+
+default_parquet_compression <- function() {
+ # Match the pyarrow default (overriding the C++ default)
+ if (codec_is_available("snappy")) {
+ "snappy"
+ } else {
+ NULL
+ }
+}
+
+ParquetArrowWriterProperties <- R6Class("ParquetArrowWriterProperties", inherit = ArrowObject)
+ParquetArrowWriterProperties$create <- function(use_deprecated_int96_timestamps = FALSE,
+ coerce_timestamps = NULL,
+ allow_truncated_timestamps = FALSE,
+ ...) {
+ if (is.null(coerce_timestamps)) {
+ timestamp_unit <- -1L # null sentinel value
+ } else {
+ timestamp_unit <- make_valid_time_unit(
+ coerce_timestamps,
+ c("ms" = TimeUnit$MILLI, "us" = TimeUnit$MICRO)
+ )
+ }
+ parquet___ArrowWriterProperties___create(
+ use_deprecated_int96_timestamps = isTRUE(use_deprecated_int96_timestamps),
+ timestamp_unit = timestamp_unit,
+ allow_truncated_timestamps = isTRUE(allow_truncated_timestamps)
+ )
+}
+
+valid_parquet_version <- c(
+ "1.0" = ParquetVersionType$PARQUET_1_0,
+ "2.0" = ParquetVersionType$PARQUET_2_0
+)
+
+make_valid_version <- function(version, valid_versions = valid_parquet_version) {
+ if (is_integerish(version)) {
+ version <- as.character(version)
+ }
+ tryCatch(
+ valid_versions[[match.arg(version, choices = names(valid_versions))]],
+ error = function(cond) {
+ stop('"version" should be one of ', oxford_paste(names(valid_versions), "or"), call. = FALSE)
+ }
+ )
+}
+
+#' @title ParquetWriterProperties class
+#' @rdname ParquetWriterProperties
+#' @name ParquetWriterProperties
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description This class holds settings to control how a Parquet file is read
+#' by [ParquetFileWriter].
+#'
+#' @section Factory:
+#'
+#' The `ParquetWriterProperties$create()` factory method instantiates the object
+#' and takes the following arguments:
+#'
+#' - `table`: table to write (required)
+#' - `version`: Parquet version, "1.0" or "2.0". Default "1.0"
+#' - `compression`: Compression type, algorithm `"uncompressed"`
+#' - `compression_level`: Compression level; meaning depends on compression algorithm
+#' - `use_dictionary`: Specify if we should use dictionary encoding. Default `TRUE`
+#' - `write_statistics`: Specify if we should write statistics. Default `TRUE`
+#' - `data_page_size`: Set a target threshold for the approximate encoded
+#' size of data pages within a column chunk (in bytes). Default 1 MiB.
+#'
+#' @details The parameters `compression`, `compression_level`, `use_dictionary`
+#' and write_statistics` support various patterns:
+#'
+#' - The default `NULL` leaves the parameter unspecified, and the C++ library
+#' uses an appropriate default for each column (defaults listed above)
+#' - A single, unnamed, value (e.g. a single string for `compression`) applies to all columns
+#' - An unnamed vector, of the same size as the number of columns, to specify a
+#' value for each column, in positional order
+#' - A named vector, to specify the value for the named columns, the default
+#' value for the setting is used when not supplied
+#'
+#' Unlike the high-level [write_parquet], `ParquetWriterProperties` arguments
+#' use the C++ defaults. Currently this means "uncompressed" rather than
+#' "snappy" for the `compression` argument.
+#'
+#' @seealso [write_parquet]
+#' @seealso [Schema] for information about schemas and metadata handling.
+#'
+#' @export
+ParquetWriterProperties <- R6Class("ParquetWriterProperties", inherit = ArrowObject)
+ParquetWriterPropertiesBuilder <- R6Class("ParquetWriterPropertiesBuilder",
+ inherit = ArrowObject,
+ public = list(
+ set_version = function(version) {
+ parquet___WriterProperties___Builder__version(self, make_valid_version(version))
+ },
+ set_compression = function(table, compression) {
+ compression <- compression_from_name(compression)
+ assert_that(is.integer(compression))
+ private$.set(
+ table, compression,
+ parquet___ArrowWriterProperties___Builder__set_compressions
+ )
+ },
+ set_compression_level = function(table, compression_level) {
+ # cast to integer but keep names
+ compression_level <- set_names(as.integer(compression_level), names(compression_level))
+ private$.set(
+ table, compression_level,
+ parquet___ArrowWriterProperties___Builder__set_compression_levels
+ )
+ },
+ set_dictionary = function(table, use_dictionary) {
+ assert_that(is.logical(use_dictionary))
+ private$.set(
+ table, use_dictionary,
+ parquet___ArrowWriterProperties___Builder__set_use_dictionary
+ )
+ },
+ set_write_statistics = function(table, write_statistics) {
+ assert_that(is.logical(write_statistics))
+ private$.set(
+ table, write_statistics,
+ parquet___ArrowWriterProperties___Builder__set_write_statistics
+ )
+ },
+ set_data_page_size = function(data_page_size) {
+ parquet___ArrowWriterProperties___Builder__data_page_size(self, data_page_size)
+ }
+ ),
+ private = list(
+ .set = function(table, value, FUN) {
+ msg <- paste0("unsupported ", substitute(value), "= specification")
+ column_names <- names(table)
+ given_names <- names(value)
+ if (is.null(given_names)) {
+ if (length(value) %in% c(1L, length(column_names))) {
+ # If there's a single, unnamed value, FUN will set it globally
+ # If there are values for all columns, send them along with the names
+ FUN(self, column_names, value)
+ } else {
+ abort(msg)
+ }
+ } else if (all(given_names %in% column_names)) {
+ # Use the given names
+ FUN(self, given_names, value)
+ } else {
+ abort(msg)
+ }
+ }
+ )
+)
+
+ParquetWriterProperties$create <- function(table,
+ version = NULL,
+ compression = default_parquet_compression(),
+ compression_level = NULL,
+ use_dictionary = NULL,
+ write_statistics = NULL,
+ data_page_size = NULL,
+ ...) {
+ builder <- parquet___WriterProperties___Builder__create()
+ if (!is.null(version)) {
+ builder$set_version(version)
+ }
+ if (!is.null(compression)) {
+ builder$set_compression(table, compression = compression)
+ }
+ if (!is.null(compression_level)) {
+ builder$set_compression_level(table, compression_level = compression_level)
+ }
+ if (!is.null(use_dictionary)) {
+ builder$set_dictionary(table, use_dictionary)
+ }
+ if (!is.null(write_statistics)) {
+ builder$set_write_statistics(table, write_statistics)
+ }
+ if (!is.null(data_page_size)) {
+ builder$set_data_page_size(data_page_size)
+ }
+ parquet___WriterProperties___Builder__build(builder)
+}
+
+#' @title ParquetFileWriter class
+#' @rdname ParquetFileWriter
+#' @name ParquetFileWriter
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description This class enables you to interact with Parquet files.
+#'
+#' @section Factory:
+#'
+#' The `ParquetFileWriter$create()` factory method instantiates the object and
+#' takes the following arguments:
+#'
+#' - `schema` A [Schema]
+#' - `sink` An [arrow::io::OutputStream][OutputStream]
+#' - `properties` An instance of [ParquetWriterProperties]
+#' - `arrow_properties` An instance of `ParquetArrowWriterProperties`
+#'
+#' @section Methods:
+#'
+#' - `WriteTable` Write a [Table] to `sink`
+#' - `Close` Close the writer. Note: does not close the `sink`.
+#' [arrow::io::OutputStream][OutputStream] has its own `close()` method.
+#'
+#' @export
+#' @include arrow-package.R
+ParquetFileWriter <- R6Class("ParquetFileWriter",
+ inherit = ArrowObject,
+ public = list(
+ WriteTable = function(table, chunk_size) {
+ parquet___arrow___FileWriter__WriteTable(self, table, chunk_size)
+ },
+ Close = function() parquet___arrow___FileWriter__Close(self)
+ )
+)
+ParquetFileWriter$create <- function(schema,
+ sink,
+ properties = ParquetWriterProperties$create(),
+ arrow_properties = ParquetArrowWriterProperties$create()) {
+ assert_is(sink, "OutputStream")
+ parquet___arrow___ParquetFileWriter__Open(schema, sink, properties, arrow_properties)
+}
+
+
+#' @title ParquetFileReader class
+#' @rdname ParquetFileReader
+#' @name ParquetFileReader
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description This class enables you to interact with Parquet files.
+#'
+#' @section Factory:
+#'
+#' The `ParquetFileReader$create()` factory method instantiates the object and
+#' takes the following arguments:
+#'
+#' - `file` A character file name, raw vector, or Arrow file connection object
+#' (e.g. `RandomAccessFile`).
+#' - `props` Optional [ParquetArrowReaderProperties]
+#' - `mmap` Logical: whether to memory-map the file (default `TRUE`)
+#' - `...` Additional arguments, currently ignored
+#'
+#' @section Methods:
+#'
+#' - `$ReadTable(column_indices)`: get an `arrow::Table` from the file. The optional
+#' `column_indices=` argument is a 0-based integer vector indicating which columns to retain.
+#' - `$ReadRowGroup(i, column_indices)`: get an `arrow::Table` by reading the `i`th row group (0-based).
+#' The optional `column_indices=` argument is a 0-based integer vector indicating which columns to retain.
+#' - `$ReadRowGroups(row_groups, column_indices)`: get an `arrow::Table` by reading several row
+#' groups (0-based integers).
+#' The optional `column_indices=` argument is a 0-based integer vector indicating which columns to retain.
+#' - `$GetSchema()`: get the `arrow::Schema` of the data in the file
+#' - `$ReadColumn(i)`: read the `i`th column (0-based) as a [ChunkedArray].
+#'
+#' @section Active bindings:
+#'
+#' - `$num_rows`: number of rows.
+#' - `$num_columns`: number of columns.
+#' - `$num_row_groups`: number of row groups.
+#'
+#' @export
+#' @examplesIf arrow_with_parquet()
+#' f <- system.file("v0.7.1.parquet", package = "arrow")
+#' pq <- ParquetFileReader$create(f)
+#' pq$GetSchema()
+#' if (codec_is_available("snappy")) {
+#' # This file has compressed data columns
+#' tab <- pq$ReadTable()
+#' tab$schema
+#' }
+#' @include arrow-package.R
+ParquetFileReader <- R6Class("ParquetFileReader",
+ inherit = ArrowObject,
+ active = list(
+ num_rows = function() {
+ as.integer(parquet___arrow___FileReader__num_rows(self))
+ },
+ num_columns = function() {
+ parquet___arrow___FileReader__num_columns(self)
+ },
+ num_row_groups = function() {
+ parquet___arrow___FileReader__num_row_groups(self)
+ }
+ ),
+ public = list(
+ ReadTable = function(column_indices = NULL) {
+ if (is.null(column_indices)) {
+ parquet___arrow___FileReader__ReadTable1(self)
+ } else {
+ column_indices <- vec_cast(column_indices, integer())
+ parquet___arrow___FileReader__ReadTable2(self, column_indices)
+ }
+ },
+ ReadRowGroup = function(i, column_indices = NULL) {
+ i <- vec_cast(i, integer())
+ if (is.null(column_indices)) {
+ parquet___arrow___FileReader__ReadRowGroup1(self, i)
+ } else {
+ column_indices <- vec_cast(column_indices, integer())
+ parquet___arrow___FileReader__ReadRowGroup2(self, i, column_indices)
+ }
+ },
+ ReadRowGroups = function(row_groups, column_indices = NULL) {
+ row_groups <- vec_cast(row_groups, integer())
+ if (is.null(column_indices)) {
+ parquet___arrow___FileReader__ReadRowGroups1(self, row_groups)
+ } else {
+ column_indices <- vec_cast(column_indices, integer())
+ parquet___arrow___FileReader__ReadRowGroups2(self, row_groups, column_indices)
+ }
+ },
+ ReadColumn = function(i) {
+ i <- vec_cast(i, integer())
+ parquet___arrow___FileReader__ReadColumn(self, i)
+ },
+ GetSchema = function() {
+ parquet___arrow___FileReader__GetSchema(self)
+ }
+ )
+)
+
+ParquetFileReader$create <- function(file,
+ props = ParquetArrowReaderProperties$create(),
+ mmap = TRUE,
+ ...) {
+ file <- make_readable_file(file, mmap)
+ assert_is(props, "ParquetArrowReaderProperties")
+
+ parquet___arrow___FileReader__OpenFile(file, props)
+}
+
+#' @title ParquetArrowReaderProperties class
+#' @rdname ParquetArrowReaderProperties
+#' @name ParquetArrowReaderProperties
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description This class holds settings to control how a Parquet file is read
+#' by [ParquetFileReader].
+#'
+#' @section Factory:
+#'
+#' The `ParquetArrowReaderProperties$create()` factory method instantiates the object
+#' and takes the following arguments:
+#'
+#' - `use_threads` Logical: whether to use multithreading (default `TRUE`)
+#'
+#' @section Methods:
+#'
+#' - `$read_dictionary(column_index)`
+#' - `$set_read_dictionary(column_index, read_dict)`
+#' - `$use_threads(use_threads)`
+#'
+#' @export
+ParquetArrowReaderProperties <- R6Class("ParquetArrowReaderProperties",
+ inherit = ArrowObject,
+ public = list(
+ read_dictionary = function(column_index) {
+ parquet___arrow___ArrowReaderProperties__get_read_dictionary(self, column_index)
+ },
+ set_read_dictionary = function(column_index, read_dict) {
+ parquet___arrow___ArrowReaderProperties__set_read_dictionary(self, column_index, read_dict)
+ }
+ ),
+ active = list(
+ use_threads = function(use_threads) {
+ if (missing(use_threads)) {
+ parquet___arrow___ArrowReaderProperties__get_use_threads(self)
+ } else {
+ parquet___arrow___ArrowReaderProperties__set_use_threads(self, use_threads)
+ }
+ }
+ )
+)
+
+ParquetArrowReaderProperties$create <- function(use_threads = option_use_threads()) {
+ parquet___arrow___ArrowReaderProperties__Make(isTRUE(use_threads))
+}
diff --git a/src/arrow/r/R/python.R b/src/arrow/r/R/python.R
new file mode 100644
index 000000000..07cd4456b
--- /dev/null
+++ b/src/arrow/r/R/python.R
@@ -0,0 +1,225 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+py_to_r.pyarrow.lib.Array <- function(x, ...) {
+ schema_ptr <- allocate_arrow_schema()
+ array_ptr <- allocate_arrow_array()
+ on.exit({
+ delete_arrow_schema(schema_ptr)
+ delete_arrow_array(array_ptr)
+ })
+
+ x$`_export_to_c`(array_ptr, schema_ptr)
+ Array$import_from_c(array_ptr, schema_ptr)
+}
+
+r_to_py.Array <- function(x, convert = FALSE) {
+ schema_ptr <- allocate_arrow_schema()
+ array_ptr <- allocate_arrow_array()
+ on.exit({
+ delete_arrow_schema(schema_ptr)
+ delete_arrow_array(array_ptr)
+ })
+
+ # Import with convert = FALSE so that `_import_from_c` returns a Python object
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ x$export_to_c(array_ptr, schema_ptr)
+ out <- pa$Array$`_import_from_c`(array_ptr, schema_ptr)
+ # But set the convert attribute on the return object to the requested value
+ assign("convert", convert, out)
+ out
+}
+
+py_to_r.pyarrow.lib.RecordBatch <- function(x, ...) {
+ schema_ptr <- allocate_arrow_schema()
+ array_ptr <- allocate_arrow_array()
+ on.exit({
+ delete_arrow_schema(schema_ptr)
+ delete_arrow_array(array_ptr)
+ })
+
+ x$`_export_to_c`(array_ptr, schema_ptr)
+
+ RecordBatch$import_from_c(array_ptr, schema_ptr)
+}
+
+r_to_py.RecordBatch <- function(x, convert = FALSE) {
+ schema_ptr <- allocate_arrow_schema()
+ array_ptr <- allocate_arrow_array()
+ on.exit({
+ delete_arrow_schema(schema_ptr)
+ delete_arrow_array(array_ptr)
+ })
+
+ # Import with convert = FALSE so that `_import_from_c` returns a Python object
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ x$export_to_c(array_ptr, schema_ptr)
+ out <- pa$RecordBatch$`_import_from_c`(array_ptr, schema_ptr)
+ # But set the convert attribute on the return object to the requested value
+ assign("convert", convert, out)
+ out
+}
+
+r_to_py.ChunkedArray <- function(x, convert = FALSE) {
+ # Import with convert = FALSE so that `_import_from_c` returns a Python object
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ out <- pa$chunked_array(x$chunks)
+ # But set the convert attribute on the return object to the requested value
+ assign("convert", convert, out)
+ out
+}
+
+py_to_r.pyarrow.lib.ChunkedArray <- function(x, ...) {
+ ChunkedArray$create(!!!maybe_py_to_r(x$chunks))
+}
+
+r_to_py.Table <- function(x, convert = FALSE) {
+ # Import with convert = FALSE so that `_import_from_c` returns a Python object
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ out <- pa$Table$from_arrays(x$columns, schema = x$schema)
+ # But set the convert attribute on the return object to the requested value
+ assign("convert", convert, out)
+ out
+}
+
+py_to_r.pyarrow.lib.Table <- function(x, ...) {
+ colnames <- maybe_py_to_r(x$column_names)
+ r_cols <- maybe_py_to_r(x$columns)
+ names(r_cols) <- colnames
+ Table$create(!!!r_cols, schema = maybe_py_to_r(x$schema))
+}
+
+py_to_r.pyarrow.lib.Schema <- function(x, ...) {
+ schema_ptr <- allocate_arrow_schema()
+ on.exit(delete_arrow_schema(schema_ptr))
+
+ x$`_export_to_c`(schema_ptr)
+ Schema$import_from_c(schema_ptr)
+}
+
+r_to_py.Schema <- function(x, convert = FALSE) {
+ schema_ptr <- allocate_arrow_schema()
+ on.exit(delete_arrow_schema(schema_ptr))
+
+ # Import with convert = FALSE so that `_import_from_c` returns a Python object
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ x$export_to_c(schema_ptr)
+ out <- pa$Schema$`_import_from_c`(schema_ptr)
+ # But set the convert attribute on the return object to the requested value
+ assign("convert", convert, out)
+ out
+}
+
+py_to_r.pyarrow.lib.Field <- function(x, ...) {
+ schema_ptr <- allocate_arrow_schema()
+ on.exit(delete_arrow_schema(schema_ptr))
+
+ x$`_export_to_c`(schema_ptr)
+ Field$import_from_c(schema_ptr)
+}
+
+r_to_py.Field <- function(x, convert = FALSE) {
+ schema_ptr <- allocate_arrow_schema()
+ on.exit(delete_arrow_schema(schema_ptr))
+
+ # Import with convert = FALSE so that `_import_from_c` returns a Python object
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ x$export_to_c(schema_ptr)
+ out <- pa$Field$`_import_from_c`(schema_ptr)
+ # But set the convert attribute on the return object to the requested value
+ assign("convert", convert, out)
+ out
+}
+
+py_to_r.pyarrow.lib.DataType <- function(x, ...) {
+ schema_ptr <- allocate_arrow_schema()
+ on.exit(delete_arrow_schema(schema_ptr))
+
+ x$`_export_to_c`(schema_ptr)
+ DataType$import_from_c(schema_ptr)
+}
+
+r_to_py.DataType <- function(x, convert = FALSE) {
+ schema_ptr <- allocate_arrow_schema()
+ on.exit(delete_arrow_schema(schema_ptr))
+
+ # Import with convert = FALSE so that `_import_from_c` returns a Python object
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ x$export_to_c(schema_ptr)
+ out <- pa$DataType$`_import_from_c`(schema_ptr)
+ # But set the convert attribute on the return object to the requested value
+ assign("convert", convert, out)
+ out
+}
+
+py_to_r.pyarrow.lib.RecordBatchReader <- function(x, ...) {
+ stream_ptr <- allocate_arrow_array_stream()
+ on.exit(delete_arrow_array_stream(stream_ptr))
+
+ x$`_export_to_c`(stream_ptr)
+ RecordBatchReader$import_from_c(stream_ptr)
+}
+
+r_to_py.RecordBatchReader <- function(x, convert = FALSE) {
+ stream_ptr <- allocate_arrow_array_stream()
+ on.exit(delete_arrow_array_stream(stream_ptr))
+
+ # Import with convert = FALSE so that `_import_from_c` returns a Python object
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ x$export_to_c(stream_ptr)
+ # TODO: handle subclasses of RecordBatchReader?
+ out <- pa$lib$RecordBatchReader$`_import_from_c`(stream_ptr)
+ # But set the convert attribute on the return object to the requested value
+ assign("convert", convert, out)
+ out
+}
+
+
+maybe_py_to_r <- function(x) {
+ if (inherits(x, "python.builtin.object")) {
+ # Depending on some auto-convert behavior, x may already be converted
+ # or it may still be a Python object
+ x <- reticulate::py_to_r(x)
+ }
+ x
+}
+
+#' Install pyarrow for use with reticulate
+#'
+#' `pyarrow` is the Python package for Apache Arrow. This function helps with
+#' installing it for use with `reticulate`.
+#'
+#' @param envname The name or full path of the Python environment to install
+#' into. This can be a virtualenv or conda environment created by `reticulate`.
+#' See `reticulate::py_install()`.
+#' @param nightly logical: Should we install a development version of the
+#' package? Default is to use the official release version.
+#' @param ... additional arguments passed to `reticulate::py_install()`.
+#' @export
+install_pyarrow <- function(envname = NULL, nightly = FALSE, ...) {
+ if (nightly) {
+ reticulate::py_install("pyarrow",
+ envname = envname, ...,
+ # Nightly for pip
+ pip_options = "--extra-index-url https://repo.fury.io/arrow-nightlies/ --pre --upgrade",
+ # Nightly for conda
+ channel = "arrow-nightlies"
+ )
+ } else {
+ reticulate::py_install("pyarrow", envname = envname, ...)
+ }
+}
diff --git a/src/arrow/r/R/query-engine.R b/src/arrow/r/R/query-engine.R
new file mode 100644
index 000000000..234aaf569
--- /dev/null
+++ b/src/arrow/r/R/query-engine.R
@@ -0,0 +1,298 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+do_exec_plan <- function(.data) {
+ plan <- ExecPlan$create()
+ final_node <- plan$Build(.data)
+ tab <- plan$Run(final_node)
+
+ # TODO (ARROW-14289): make the head/tail methods return RBR not Table
+ if (inherits(tab, "RecordBatchReader")) {
+ tab <- tab$read_table()
+ }
+
+ # If arrange() created $temp_columns, make sure to omit them from the result
+ # We can't currently handle this in the ExecPlan itself because sorting
+ # happens in the end (SinkNode) so nothing comes after it.
+ if (length(final_node$sort$temp_columns) > 0) {
+ tab <- tab[, setdiff(names(tab), final_node$sort$temp_columns), drop = FALSE]
+ }
+
+ if (ncol(tab)) {
+ # Apply any column metadata from the original schema, where appropriate
+ original_schema <- source_data(.data)$schema
+ # TODO: do we care about other (non-R) metadata preservation?
+ # How would we know if it were meaningful?
+ r_meta <- original_schema$r_metadata
+ if (!is.null(r_meta)) {
+ # Filter r_metadata$columns on columns with name _and_ type match
+ new_schema <- tab$schema
+ common_names <- intersect(names(r_meta$columns), names(tab))
+ keep <- common_names[
+ map_lgl(common_names, ~ original_schema[[.]] == new_schema[[.]])
+ ]
+ r_meta$columns <- r_meta$columns[keep]
+ if (has_aggregation(.data)) {
+ # dplyr drops top-level attributes if you do summarize
+ r_meta$attributes <- NULL
+ }
+ tab$r_metadata <- r_meta
+ }
+ }
+
+ tab
+}
+
+ExecPlan <- R6Class("ExecPlan",
+ inherit = ArrowObject,
+ public = list(
+ Scan = function(dataset) {
+ # Handle arrow_dplyr_query
+ if (inherits(dataset, "arrow_dplyr_query")) {
+ if (inherits(dataset$.data, "RecordBatchReader")) {
+ return(ExecNode_ReadFromRecordBatchReader(self, dataset$.data))
+ }
+
+ filter <- dataset$filtered_rows
+ if (isTRUE(filter)) {
+ filter <- Expression$scalar(TRUE)
+ }
+ # Use FieldsInExpression to find all from dataset$selected_columns
+ colnames <- unique(unlist(map(
+ dataset$selected_columns,
+ field_names_in_expression
+ )))
+ dataset <- dataset$.data
+ assert_is(dataset, "Dataset")
+ } else {
+ if (inherits(dataset, "ArrowTabular")) {
+ dataset <- InMemoryDataset$create(dataset)
+ }
+ assert_is(dataset, "Dataset")
+ # Set some defaults
+ filter <- Expression$scalar(TRUE)
+ colnames <- names(dataset)
+ }
+ # ScanNode needs the filter to do predicate pushdown and skip partitions,
+ # and it needs to know which fields to materialize (and which are unnecessary)
+ ExecNode_Scan(self, dataset, filter, colnames %||% character(0))
+ },
+ Build = function(.data) {
+ # This method takes an arrow_dplyr_query and chains together the
+ # ExecNodes that they produce. It does not evaluate them--that is Run().
+ group_vars <- dplyr::group_vars(.data)
+ grouped <- length(group_vars) > 0
+
+ # Collect the target names first because we have to add back the group vars
+ target_names <- names(.data)
+ .data <- ensure_group_vars(.data)
+ .data <- ensure_arrange_vars(.data) # this sets .data$temp_columns
+
+ if (inherits(.data$.data, "arrow_dplyr_query")) {
+ # We have a nested query. Recurse.
+ node <- self$Build(.data$.data)
+ } else {
+ node <- self$Scan(.data)
+ }
+
+ # ARROW-13498: Even though Scan takes the filter, apparently we have to do it again
+ if (inherits(.data$filtered_rows, "Expression")) {
+ node <- node$Filter(.data$filtered_rows)
+ }
+
+ if (!is.null(.data$aggregations)) {
+ # Project to include just the data required for each aggregation,
+ # plus group_by_vars (last)
+ # TODO: validate that none of names(aggregations) are the same as names(group_by_vars)
+ # dplyr does not error on this but the result it gives isn't great
+ node <- node$Project(summarize_projection(.data))
+
+ if (grouped) {
+ # We need to prefix all of the aggregation function names with "hash_"
+ .data$aggregations <- lapply(.data$aggregations, function(x) {
+ x[["fun"]] <- paste0("hash_", x[["fun"]])
+ x
+ })
+ }
+
+ node <- node$Aggregate(
+ options = map(.data$aggregations, ~ .[c("fun", "options")]),
+ target_names = names(.data$aggregations),
+ out_field_names = names(.data$aggregations),
+ key_names = group_vars
+ )
+
+ if (grouped) {
+ # The result will have result columns first then the grouping cols.
+ # dplyr orders group cols first, so adapt the result to meet that expectation.
+ node <- node$Project(
+ make_field_refs(c(group_vars, names(.data$aggregations)))
+ )
+ if (getOption("arrow.summarise.sort", FALSE)) {
+ # Add sorting instructions for the rows too to match dplyr
+ # (see below about why sorting isn't itself a Node)
+ node$sort <- list(
+ names = group_vars,
+ orders = rep(0L, length(group_vars))
+ )
+ }
+ }
+ } else {
+ # If any columns are derived, reordered, or renamed we need to Project
+ # If there are aggregations, the projection was already handled above
+ # We have to project at least once to eliminate some junk columns
+ # that the ExecPlan adds:
+ # __fragment_index, __batch_index, __last_in_fragment
+ # Presumably extraneous repeated projection of the same thing
+ # (as when we've done collapse() and not projected after) is cheap/no-op
+ projection <- c(.data$selected_columns, .data$temp_columns)
+ node <- node$Project(projection)
+
+ if (!is.null(.data$join)) {
+ node <- node$Join(
+ type = .data$join$type,
+ right_node = self$Build(.data$join$right_data),
+ by = .data$join$by,
+ left_output = names(.data),
+ right_output = setdiff(names(.data$join$right_data), .data$join$by)
+ )
+ }
+ }
+
+ # Apply sorting: this is currently not an ExecNode itself, it is a
+ # sink node option.
+ # TODO: handle some cases:
+ # (1) arrange > summarize > arrange
+ # (2) ARROW-13779: arrange then operation where order matters (e.g. cumsum)
+ if (length(.data$arrange_vars)) {
+ node$sort <- list(
+ names = names(.data$arrange_vars),
+ orders = .data$arrange_desc,
+ temp_columns = names(.data$temp_columns)
+ )
+ }
+
+ # This is only safe because we are going to evaluate queries that end
+ # with head/tail first, then evaluate any subsequent query as a new query
+ if (!is.null(.data$head)) {
+ node$head <- .data$head
+ }
+ if (!is.null(.data$tail)) {
+ node$tail <- .data$tail
+ }
+
+ node
+ },
+ Run = function(node) {
+ assert_is(node, "ExecNode")
+
+ # Sorting and head/tail (if sorted) are handled in the SinkNode,
+ # created in ExecPlan_run
+ sorting <- node$sort %||% list()
+ select_k <- node$head %||% -1L
+ has_sorting <- length(sorting) > 0
+ if (has_sorting) {
+ if (!is.null(node$tail)) {
+ # Reverse the sort order and take the top K, then after we'll reverse
+ # the resulting rows so that it is ordered as expected
+ sorting$orders <- !sorting$orders
+ select_k <- node$tail
+ }
+ sorting$orders <- as.integer(sorting$orders)
+ }
+
+ out <- ExecPlan_run(self, node, sorting, select_k)
+
+ if (!has_sorting) {
+ # Since ExecPlans don't scan in deterministic order, head/tail are both
+ # essentially taking a random slice from somewhere in the dataset.
+ # And since the head() implementation is way more efficient than tail(),
+ # just use it to take the random slice
+ slice_size <- node$head %||% node$tail
+ if (!is.null(slice_size)) {
+ # TODO (ARROW-14289): make the head methods return RBR not Table
+ out <- head(out, slice_size)
+ }
+ # Can we now tell `self$Stop()` to StopProducing? We already have
+ # everything we need for the head (but it seems to segfault: ARROW-14329)
+ } else if (!is.null(node$tail)) {
+ # Reverse the row order to get back what we expect
+ # TODO: don't return Table, return RecordBatchReader
+ out <- out$read_table()
+ out <- out[rev(seq_len(nrow(out))), , drop = FALSE]
+ }
+
+ out
+ },
+ Stop = function() ExecPlan_StopProducing(self)
+ )
+)
+ExecPlan$create <- function(use_threads = option_use_threads()) {
+ ExecPlan_create(use_threads)
+}
+
+ExecNode <- R6Class("ExecNode",
+ inherit = ArrowObject,
+ public = list(
+ # `sort` is a slight hack to be able to keep around arrange() params,
+ # which don't currently yield their own ExecNode but rather are consumed
+ # in the SinkNode (in ExecPlan$run())
+ sort = NULL,
+ # Similar hacks for head and tail
+ head = NULL,
+ tail = NULL,
+ preserve_sort = function(new_node) {
+ new_node$sort <- self$sort
+ new_node$head <- self$head
+ new_node$tail <- self$tail
+ new_node
+ },
+ Project = function(cols) {
+ if (length(cols)) {
+ assert_is_list_of(cols, "Expression")
+ self$preserve_sort(ExecNode_Project(self, cols, names(cols)))
+ } else {
+ self$preserve_sort(ExecNode_Project(self, character(0), character(0)))
+ }
+ },
+ Filter = function(expr) {
+ assert_is(expr, "Expression")
+ self$preserve_sort(ExecNode_Filter(self, expr))
+ },
+ Aggregate = function(options, target_names, out_field_names, key_names) {
+ self$preserve_sort(
+ ExecNode_Aggregate(self, options, target_names, out_field_names, key_names)
+ )
+ },
+ Join = function(type, right_node, by, left_output, right_output) {
+ self$preserve_sort(
+ ExecNode_Join(
+ self,
+ type,
+ right_node,
+ left_keys = names(by),
+ right_keys = by,
+ left_output = left_output,
+ right_output = right_output
+ )
+ )
+ }
+ ),
+ active = list(
+ schema = function() ExecNode_output_schema(self)
+ )
+)
diff --git a/src/arrow/r/R/record-batch-reader.R b/src/arrow/r/R/record-batch-reader.R
new file mode 100644
index 000000000..1542e3649
--- /dev/null
+++ b/src/arrow/r/R/record-batch-reader.R
@@ -0,0 +1,164 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+#' @title RecordBatchReader classes
+#' @description Apache Arrow defines two formats for [serializing data for interprocess
+#' communication
+#' (IPC)](https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc):
+#' a "stream" format and a "file" format, known as Feather.
+#' `RecordBatchStreamReader` and `RecordBatchFileReader` are
+#' interfaces for accessing record batches from input sources in those formats,
+#' respectively.
+#'
+#' For guidance on how to use these classes, see the examples section.
+#'
+#' @seealso [read_ipc_stream()] and [read_feather()] provide a much simpler interface
+#' for reading data from these formats and are sufficient for many use cases.
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @section Factory:
+#'
+#' The `RecordBatchFileReader$create()` and `RecordBatchStreamReader$create()`
+#' factory methods instantiate the object and
+#' take a single argument, named according to the class:
+#'
+#' - `file` A character file name, raw vector, or Arrow file connection object
+#' (e.g. [RandomAccessFile]).
+#' - `stream` A raw vector, [Buffer], or [InputStream].
+#'
+#' @section Methods:
+#'
+#' - `$read_next_batch()`: Returns a `RecordBatch`, iterating through the
+#' Reader. If there are no further batches in the Reader, it returns `NULL`.
+#' - `$schema`: Returns a [Schema] (active binding)
+#' - `$batches()`: Returns a list of `RecordBatch`es
+#' - `$read_table()`: Collects the reader's `RecordBatch`es into a [Table]
+#' - `$get_batch(i)`: For `RecordBatchFileReader`, return a particular batch
+#' by an integer index.
+#' - `$num_record_batches()`: For `RecordBatchFileReader`, see how many batches
+#' are in the file.
+#'
+#' @rdname RecordBatchReader
+#' @name RecordBatchReader
+#' @include arrow-package.R
+#' @examplesIf arrow_available()
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#'
+#' batch <- record_batch(chickwts)
+#'
+#' # This opens a connection to the file in Arrow
+#' file_obj <- FileOutputStream$create(tf)
+#' # Pass that to a RecordBatchWriter to write data conforming to a schema
+#' writer <- RecordBatchFileWriter$create(file_obj, batch$schema)
+#' writer$write(batch)
+#' # You may write additional batches to the stream, provided that they have
+#' # the same schema.
+#' # Call "close" on the writer to indicate end-of-file/stream
+#' writer$close()
+#' # Then, close the connection--closing the IPC message does not close the file
+#' file_obj$close()
+#'
+#' # Now, we have a file we can read from. Same pattern: open file connection,
+#' # then pass it to a RecordBatchReader
+#' read_file_obj <- ReadableFile$create(tf)
+#' reader <- RecordBatchFileReader$create(read_file_obj)
+#' # RecordBatchFileReader knows how many batches it has (StreamReader does not)
+#' reader$num_record_batches
+#' # We could consume the Reader by calling $read_next_batch() until all are,
+#' # consumed, or we can call $read_table() to pull them all into a Table
+#' tab <- reader$read_table()
+#' # Call as.data.frame to turn that Table into an R data.frame
+#' df <- as.data.frame(tab)
+#' # This should be the same data we sent
+#' all.equal(df, chickwts, check.attributes = FALSE)
+#' # Unlike the Writers, we don't have to close RecordBatchReaders,
+#' # but we do still need to close the file connection
+#' read_file_obj$close()
+RecordBatchReader <- R6Class("RecordBatchReader",
+ inherit = ArrowObject,
+ public = list(
+ read_next_batch = function() RecordBatchReader__ReadNext(self),
+ batches = function() RecordBatchReader__batches(self),
+ read_table = function() Table__from_RecordBatchReader(self),
+ export_to_c = function(stream_ptr) ExportRecordBatchReader(self, stream_ptr)
+ ),
+ active = list(
+ schema = function() RecordBatchReader__schema(self)
+ )
+)
+
+#' @export
+head.RecordBatchReader <- function(x, n = 6L, ...) {
+ head(Scanner$create(x), n)
+}
+
+#' @export
+tail.RecordBatchReader <- function(x, n = 6L, ...) {
+ tail(Scanner$create(x), n)
+}
+
+#' @rdname RecordBatchReader
+#' @usage NULL
+#' @format NULL
+#' @export
+RecordBatchStreamReader <- R6Class("RecordBatchStreamReader", inherit = RecordBatchReader)
+RecordBatchStreamReader$create <- function(stream) {
+ if (inherits(stream, c("raw", "Buffer"))) {
+ # TODO: deprecate this because it doesn't close the connection to the Buffer
+ # (that's a problem, right?)
+ stream <- BufferReader$create(stream)
+ }
+ assert_is(stream, "InputStream")
+ ipc___RecordBatchStreamReader__Open(stream)
+}
+#' @include arrowExports.R
+RecordBatchReader$import_from_c <- RecordBatchStreamReader$import_from_c <- ImportRecordBatchReader
+
+#' @rdname RecordBatchReader
+#' @usage NULL
+#' @format NULL
+#' @export
+RecordBatchFileReader <- R6Class("RecordBatchFileReader",
+ inherit = ArrowObject,
+ # Why doesn't this inherit from RecordBatchReader in C++?
+ # Origin: https://github.com/apache/arrow/pull/679
+ public = list(
+ get_batch = function(i) {
+ ipc___RecordBatchFileReader__ReadRecordBatch(self, i)
+ },
+ batches = function() {
+ ipc___RecordBatchFileReader__batches(self)
+ },
+ read_table = function() Table__from_RecordBatchFileReader(self)
+ ),
+ active = list(
+ num_record_batches = function() ipc___RecordBatchFileReader__num_record_batches(self),
+ schema = function() ipc___RecordBatchFileReader__schema(self)
+ )
+)
+RecordBatchFileReader$create <- function(file) {
+ if (inherits(file, c("raw", "Buffer"))) {
+ # TODO: deprecate this because it doesn't close the connection to the Buffer
+ # (that's a problem, right?)
+ file <- BufferReader$create(file)
+ }
+ assert_is(file, "InputStream")
+ ipc___RecordBatchFileReader__Open(file)
+}
diff --git a/src/arrow/r/R/record-batch-writer.R b/src/arrow/r/R/record-batch-writer.R
new file mode 100644
index 000000000..8675e785a
--- /dev/null
+++ b/src/arrow/r/R/record-batch-writer.R
@@ -0,0 +1,194 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+#' @title RecordBatchWriter classes
+#' @description Apache Arrow defines two formats for [serializing data for interprocess
+#' communication
+#' (IPC)](https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc):
+#' a "stream" format and a "file" format, known as Feather.
+#' `RecordBatchStreamWriter` and `RecordBatchFileWriter` are
+#' interfaces for writing record batches to those formats, respectively.
+#'
+#' For guidance on how to use these classes, see the examples section.
+#'
+#' @seealso [write_ipc_stream()] and [write_feather()] provide a much simpler
+#' interface for writing data to these formats and are sufficient for many use
+#' cases. [write_to_raw()] is a version that serializes data to a buffer.
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @section Factory:
+#'
+#' The `RecordBatchFileWriter$create()` and `RecordBatchStreamWriter$create()`
+#' factory methods instantiate the object and take the following arguments:
+#'
+#' - `sink` An `OutputStream`
+#' - `schema` A [Schema] for the data to be written
+#' - `use_legacy_format` logical: write data formatted so that Arrow libraries
+#' versions 0.14 and lower can read it. Default is `FALSE`. You can also
+#' enable this by setting the environment variable `ARROW_PRE_0_15_IPC_FORMAT=1`.
+#' - `metadata_version`: A string like "V5" or the equivalent integer indicating
+#' the Arrow IPC MetadataVersion. Default (NULL) will use the latest version,
+#' unless the environment variable `ARROW_PRE_1_0_METADATA_VERSION=1`, in
+#' which case it will be V4.
+#'
+#' @section Methods:
+#'
+#' - `$write(x)`: Write a [RecordBatch], [Table], or `data.frame`, dispatching
+#' to the methods below appropriately
+#' - `$write_batch(batch)`: Write a `RecordBatch` to stream
+#' - `$write_table(table)`: Write a `Table` to stream
+#' - `$close()`: close stream. Note that this indicates end-of-file or
+#' end-of-stream--it does not close the connection to the `sink`. That needs
+#' to be closed separately.
+#'
+#' @rdname RecordBatchWriter
+#' @name RecordBatchWriter
+#' @include arrow-package.R
+#' @examplesIf arrow_available()
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#'
+#' batch <- record_batch(chickwts)
+#'
+#' # This opens a connection to the file in Arrow
+#' file_obj <- FileOutputStream$create(tf)
+#' # Pass that to a RecordBatchWriter to write data conforming to a schema
+#' writer <- RecordBatchFileWriter$create(file_obj, batch$schema)
+#' writer$write(batch)
+#' # You may write additional batches to the stream, provided that they have
+#' # the same schema.
+#' # Call "close" on the writer to indicate end-of-file/stream
+#' writer$close()
+#' # Then, close the connection--closing the IPC message does not close the file
+#' file_obj$close()
+#'
+#' # Now, we have a file we can read from. Same pattern: open file connection,
+#' # then pass it to a RecordBatchReader
+#' read_file_obj <- ReadableFile$create(tf)
+#' reader <- RecordBatchFileReader$create(read_file_obj)
+#' # RecordBatchFileReader knows how many batches it has (StreamReader does not)
+#' reader$num_record_batches
+#' # We could consume the Reader by calling $read_next_batch() until all are,
+#' # consumed, or we can call $read_table() to pull them all into a Table
+#' tab <- reader$read_table()
+#' # Call as.data.frame to turn that Table into an R data.frame
+#' df <- as.data.frame(tab)
+#' # This should be the same data we sent
+#' all.equal(df, chickwts, check.attributes = FALSE)
+#' # Unlike the Writers, we don't have to close RecordBatchReaders,
+#' # but we do still need to close the file connection
+#' read_file_obj$close()
+RecordBatchWriter <- R6Class("RecordBatchWriter",
+ inherit = ArrowObject,
+ public = list(
+ write_batch = function(batch) ipc___RecordBatchWriter__WriteRecordBatch(self, batch),
+ write_table = function(table) ipc___RecordBatchWriter__WriteTable(self, table),
+ write = function(x) {
+ if (inherits(x, "RecordBatch")) {
+ self$write_batch(x)
+ } else if (inherits(x, "Table")) {
+ self$write_table(x)
+ } else {
+ self$write_table(Table$create(x))
+ }
+ },
+ close = function() ipc___RecordBatchWriter__Close(self)
+ )
+)
+
+#' @usage NULL
+#' @format NULL
+#' @rdname RecordBatchWriter
+#' @export
+RecordBatchStreamWriter <- R6Class("RecordBatchStreamWriter", inherit = RecordBatchWriter)
+RecordBatchStreamWriter$create <- function(sink,
+ schema,
+ use_legacy_format = NULL,
+ metadata_version = NULL) {
+ if (is.string(sink)) {
+ stop(
+ "RecordBatchStreamWriter$create() requires an Arrow InputStream. ",
+ "Try providing FileOutputStream$create(", substitute(sink), ")",
+ call. = FALSE
+ )
+ }
+ assert_is(sink, "OutputStream")
+ assert_is(schema, "Schema")
+
+ ipc___RecordBatchStreamWriter__Open(
+ sink,
+ schema,
+ get_ipc_use_legacy_format(use_legacy_format),
+ get_ipc_metadata_version(metadata_version)
+ )
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname RecordBatchWriter
+#' @export
+RecordBatchFileWriter <- R6Class("RecordBatchFileWriter", inherit = RecordBatchStreamWriter)
+RecordBatchFileWriter$create <- function(sink,
+ schema,
+ use_legacy_format = NULL,
+ metadata_version = NULL) {
+ if (is.string(sink)) {
+ stop(
+ "RecordBatchFileWriter$create() requires an Arrow InputStream. ",
+ "Try providing FileOutputStream$create(", substitute(sink), ")",
+ call. = FALSE
+ )
+ }
+ assert_is(sink, "OutputStream")
+ assert_is(schema, "Schema")
+
+ ipc___RecordBatchFileWriter__Open(
+ sink,
+ schema,
+ get_ipc_use_legacy_format(use_legacy_format),
+ get_ipc_metadata_version(metadata_version)
+ )
+}
+
+get_ipc_metadata_version <- function(x) {
+ input <- x
+ if (is_integerish(x)) {
+ # 4 means "V4", which actually happens to be 3L
+ x <- paste0("V", x)
+ } else if (is.null(x)) {
+ if (identical(Sys.getenv("ARROW_PRE_1_0_METADATA_VERSION"), "1") ||
+ identical(Sys.getenv("ARROW_PRE_0_15_IPC_FORMAT"), "1")) {
+ # PRE_1_0 is specific for this;
+ # if you already set PRE_0_15, PRE_1_0 should be implied
+ x <- "V4"
+ } else {
+ # Take the latest
+ x <- length(MetadataVersion)
+ }
+ }
+ out <- MetadataVersion[[x]]
+ if (is.null(out)) {
+ stop(deparse(input), " is not a valid IPC MetadataVersion", call. = FALSE)
+ }
+ out
+}
+
+get_ipc_use_legacy_format <- function(x) {
+ isTRUE(x %||% identical(Sys.getenv("ARROW_PRE_0_15_IPC_FORMAT"), "1"))
+}
diff --git a/src/arrow/r/R/record-batch.R b/src/arrow/r/R/record-batch.R
new file mode 100644
index 000000000..c66ff7fb0
--- /dev/null
+++ b/src/arrow/r/R/record-batch.R
@@ -0,0 +1,193 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-package.R
+#' @include array.R
+#' @title RecordBatch class
+#' @description A record batch is a collection of equal-length arrays matching
+#' a particular [Schema]. It is a table-like data structure that is semantically
+#' a sequence of [fields][Field], each a contiguous Arrow [Array].
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section S3 Methods and Usage:
+#' Record batches are data-frame-like, and many methods you expect to work on
+#' a `data.frame` are implemented for `RecordBatch`. This includes `[`, `[[`,
+#' `$`, `names`, `dim`, `nrow`, `ncol`, `head`, and `tail`. You can also pull
+#' the data from an Arrow record batch into R with `as.data.frame()`. See the
+#' examples.
+#'
+#' A caveat about the `$` method: because `RecordBatch` is an `R6` object,
+#' `$` is also used to access the object's methods (see below). Methods take
+#' precedence over the table's columns. So, `batch$Slice` would return the
+#' "Slice" method function even if there were a column in the table called
+#' "Slice".
+#'
+#' @section R6 Methods:
+#' In addition to the more R-friendly S3 methods, a `RecordBatch` object has
+#' the following R6 methods that map onto the underlying C++ methods:
+#'
+#' - `$Equals(other)`: Returns `TRUE` if the `other` record batch is equal
+#' - `$column(i)`: Extract an `Array` by integer position from the batch
+#' - `$column_name(i)`: Get a column's name by integer position
+#' - `$names()`: Get all column names (called by `names(batch)`)
+#' - `$RenameColumns(value)`: Set all column names (called by `names(batch) <- value`)
+#' - `$GetColumnByName(name)`: Extract an `Array` by string name
+#' - `$RemoveColumn(i)`: Drops a column from the batch by integer position
+#' - `$SelectColumns(indices)`: Return a new record batch with a selection of columns, expressed as 0-based integers.
+#' - `$Slice(offset, length = NULL)`: Create a zero-copy view starting at the
+#' indicated integer offset and going for the given length, or to the end
+#' of the table if `NULL`, the default.
+#' - `$Take(i)`: return an `RecordBatch` with rows at positions given by
+#' integers (R vector or Array Array) `i`.
+#' - `$Filter(i, keep_na = TRUE)`: return an `RecordBatch` with rows at positions where logical
+#' vector (or Arrow boolean Array) `i` is `TRUE`.
+#' - `$SortIndices(names, descending = FALSE)`: return an `Array` of integer row
+#' positions that can be used to rearrange the `RecordBatch` in ascending or
+#' descending order by the first named column, breaking ties with further named
+#' columns. `descending` can be a logical vector of length one or of the same
+#' length as `names`.
+#' - `$serialize()`: Returns a raw vector suitable for interprocess communication
+#' - `$cast(target_schema, safe = TRUE, options = cast_options(safe))`: Alter
+#' the schema of the record batch.
+#'
+#' There are also some active bindings
+#' - `$num_columns`
+#' - `$num_rows`
+#' - `$schema`
+#' - `$metadata`: Returns the key-value metadata of the `Schema` as a named list.
+#' Modify or replace by assigning in (`batch$metadata <- new_metadata`).
+#' All list elements are coerced to string. See `schema()` for more information.
+#' - `$columns`: Returns a list of `Array`s
+#' @rdname RecordBatch
+#' @name RecordBatch
+#' @export
+RecordBatch <- R6Class("RecordBatch",
+ inherit = ArrowTabular,
+ public = list(
+ column = function(i) RecordBatch__column(self, i),
+ column_name = function(i) RecordBatch__column_name(self, i),
+ names = function() RecordBatch__names(self),
+ RenameColumns = function(value) RecordBatch__RenameColumns(self, value),
+ Equals = function(other, check_metadata = FALSE, ...) {
+ inherits(other, "RecordBatch") && RecordBatch__Equals(self, other, isTRUE(check_metadata))
+ },
+ GetColumnByName = function(name) {
+ assert_that(is.string(name))
+ RecordBatch__GetColumnByName(self, name)
+ },
+ SelectColumns = function(indices) RecordBatch__SelectColumns(self, indices),
+ AddColumn = function(i, new_field, value) {
+ RecordBatch__AddColumn(self, i, new_field, value)
+ },
+ SetColumn = function(i, new_field, value) {
+ RecordBatch__SetColumn(self, i, new_field, value)
+ },
+ RemoveColumn = function(i) RecordBatch__RemoveColumn(self, i),
+ ReplaceSchemaMetadata = function(new) {
+ RecordBatch__ReplaceSchemaMetadata(self, new)
+ },
+ Slice = function(offset, length = NULL) {
+ if (is.null(length)) {
+ RecordBatch__Slice1(self, offset)
+ } else {
+ RecordBatch__Slice2(self, offset, length)
+ }
+ },
+ # Take, Filter, and SortIndices are methods on ArrowTabular
+ serialize = function() ipc___SerializeRecordBatch__Raw(self),
+ to_data_frame = function() {
+ RecordBatch__to_dataframe(self, use_threads = option_use_threads())
+ },
+ cast = function(target_schema, safe = TRUE, ..., options = cast_options(safe, ...)) {
+ assert_is(target_schema, "Schema")
+ assert_that(identical(self$schema$names, target_schema$names), msg = "incompatible schemas")
+ RecordBatch__cast(self, target_schema, options)
+ },
+ invalidate = function() {
+ .Call(`_arrow_RecordBatch__Reset`, self)
+ super$invalidate()
+ },
+ export_to_c = function(array_ptr, schema_ptr) {
+ ExportRecordBatch(self, array_ptr, schema_ptr)
+ }
+ ),
+ active = list(
+ num_columns = function() RecordBatch__num_columns(self),
+ num_rows = function() RecordBatch__num_rows(self),
+ schema = function() RecordBatch__schema(self),
+ columns = function() RecordBatch__columns(self)
+ )
+)
+
+RecordBatch$create <- function(..., schema = NULL) {
+ arrays <- list2(...)
+ if (length(arrays) == 1 && inherits(arrays[[1]], c("raw", "Buffer", "InputStream", "Message"))) {
+ return(RecordBatch$from_message(arrays[[1]], schema))
+ }
+
+ # Else, a list of arrays or data.frames
+ # making sure there are always names
+ if (is.null(names(arrays))) {
+ names(arrays) <- rep_len("", length(arrays))
+ }
+ stopifnot(length(arrays) > 0)
+
+ # If any arrays are length 1, recycle them
+ arrays <- recycle_scalars(arrays)
+
+ # TODO: should this also assert that they're all Arrays?
+ RecordBatch__from_arrays(schema, arrays)
+}
+
+RecordBatch$from_message <- function(obj, schema) {
+ # Message/Buffer readers, previously in read_record_batch()
+ assert_is(schema, "Schema")
+ if (inherits(obj, c("raw", "Buffer"))) {
+ obj <- BufferReader$create(obj)
+ on.exit(obj$close())
+ }
+ if (inherits(obj, "InputStream")) {
+ ipc___ReadRecordBatch__InputStream__Schema(obj, schema)
+ } else {
+ ipc___ReadRecordBatch__Message__Schema(obj, schema)
+ }
+}
+#' @include arrowExports.R
+RecordBatch$import_from_c <- ImportRecordBatch
+
+#' @param ... A `data.frame` or a named set of Arrays or vectors. If given a
+#' mixture of data.frames and vectors, the inputs will be autospliced together
+#' (see examples). Alternatively, you can provide a single Arrow IPC
+#' `InputStream`, `Message`, `Buffer`, or R `raw` object containing a `Buffer`.
+#' @param schema a [Schema], or `NULL` (the default) to infer the schema from
+#' the data in `...`. When providing an Arrow IPC buffer, `schema` is required.
+#' @rdname RecordBatch
+#' @examplesIf arrow_available()
+#' batch <- record_batch(name = rownames(mtcars), mtcars)
+#' dim(batch)
+#' dim(head(batch))
+#' names(batch)
+#' batch$mpg
+#' batch[["cyl"]]
+#' as.data.frame(batch[4:8, c("gear", "hp", "wt")])
+#' @export
+record_batch <- RecordBatch$create
+
+#' @export
+names.RecordBatch <- function(x) x$names()
diff --git a/src/arrow/r/R/reexports-bit64.R b/src/arrow/r/R/reexports-bit64.R
new file mode 100644
index 000000000..c89d2b150
--- /dev/null
+++ b/src/arrow/r/R/reexports-bit64.R
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @importFrom bit64 print.integer64
+bit64::print.integer64
+
+#' @importFrom bit64 str.integer64
+bit64::str.integer64
diff --git a/src/arrow/r/R/reexports-tidyselect.R b/src/arrow/r/R/reexports-tidyselect.R
new file mode 100644
index 000000000..cd0de2849
--- /dev/null
+++ b/src/arrow/r/R/reexports-tidyselect.R
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Alias required for help links in downstream packages
+#' @aliases select_helpers
+#' @importFrom tidyselect contains
+#' @export
+tidyselect::contains
+#' @importFrom tidyselect ends_with
+#' @export
+tidyselect::ends_with
+#' @importFrom tidyselect everything
+#' @export
+tidyselect::everything
+#' @importFrom tidyselect matches
+#' @export
+tidyselect::matches
+#' @importFrom tidyselect num_range
+#' @export
+tidyselect::num_range
+#' @importFrom tidyselect one_of
+#' @export
+tidyselect::one_of
+#' @importFrom tidyselect starts_with
+#' @export
+tidyselect::starts_with
+#' @importFrom tidyselect last_col
+#' @export
+tidyselect::last_col
+#' @importFrom tidyselect all_of
+#' @export
+tidyselect::all_of
diff --git a/src/arrow/r/R/scalar.R b/src/arrow/r/R/scalar.R
new file mode 100644
index 000000000..4dedc6c12
--- /dev/null
+++ b/src/arrow/r/R/scalar.R
@@ -0,0 +1,101 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-datum.R
+
+#' @title Arrow scalars
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @description A `Scalar` holds a single value of an Arrow type.
+#'
+#' @section Methods:
+#' `$ToString()`: convert to a string
+#' `$as_vector()`: convert to an R vector
+#' `$as_array()`: convert to an Arrow `Array`
+#' `$Equals(other)`: is this Scalar equal to `other`
+#' `$ApproxEquals(other)`: is this Scalar approximately equal to `other`
+#' `$is_valid`: is this Scalar valid
+#' `$null_count`: number of invalid values - 1 or 0
+#' `$type`: Scalar type
+#'
+#' @name Scalar
+#' @rdname Scalar
+#' @examplesIf arrow_available()
+#' Scalar$create(pi)
+#' Scalar$create(404)
+#' # If you pass a vector into Scalar$create, you get a list containing your items
+#' Scalar$create(c(1, 2, 3))
+#'
+#' # Comparisons
+#' my_scalar <- Scalar$create(99)
+#' my_scalar$ApproxEquals(Scalar$create(99.00001)) # FALSE
+#' my_scalar$ApproxEquals(Scalar$create(99.000009)) # TRUE
+#' my_scalar$Equals(Scalar$create(99.000009)) # FALSE
+#' my_scalar$Equals(Scalar$create(99L)) # FALSE (types don't match)
+#'
+#' my_scalar$ToString()
+#' @export
+Scalar <- R6Class("Scalar",
+ inherit = ArrowDatum,
+ # TODO: document the methods
+ public = list(
+ ToString = function() Scalar__ToString(self),
+ type_id = function() Scalar__type(self)$id,
+ as_vector = function() Scalar__as_vector(self),
+ as_array = function(length = 1L) MakeArrayFromScalar(self, as.integer(length)),
+ Equals = function(other, ...) {
+ inherits(other, "Scalar") && Scalar__Equals(self, other)
+ },
+ ApproxEquals = function(other, ...) {
+ inherits(other, "Scalar") && Scalar__ApproxEquals(self, other)
+ }
+ ),
+ active = list(
+ is_valid = function() Scalar__is_valid(self),
+ null_count = function() sum(!self$is_valid),
+ type = function() Scalar__type(self)
+ )
+)
+Scalar$create <- function(x, type = NULL) {
+ if (is.null(x)) {
+ x <- vctrs::unspecified(1)
+ } else if (length(x) != 1 && !is.data.frame(x)) {
+ # Wrap in a list type
+ x <- list(x)
+ }
+ Array__GetScalar(Array$create(x, type = type), 0)
+}
+
+#' @rdname array
+#' @usage NULL
+#' @format NULL
+#' @export
+StructScalar <- R6Class("StructScalar",
+ inherit = Scalar,
+ public = list(
+ field = function(i) StructScalar__field(self, i),
+ GetFieldByName = function(name) StructScalar__GetFieldByName(self, name)
+ )
+)
+
+#' @export
+length.Scalar <- function(x) 1L
+
+#' @export
+sort.Scalar <- function(x, decreasing = FALSE, ...) x
diff --git a/src/arrow/r/R/schema.R b/src/arrow/r/R/schema.R
new file mode 100644
index 000000000..c3dfee5f9
--- /dev/null
+++ b/src/arrow/r/R/schema.R
@@ -0,0 +1,330 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-package.R
+#' @title Schema class
+#'
+#' @description A `Schema` is a list of [Field]s, which map names to
+#' Arrow [data types][data-type]. Create a `Schema` when you
+#' want to convert an R `data.frame` to Arrow but don't want to rely on the
+#' default mapping of R types to Arrow types, such as when you want to choose a
+#' specific numeric precision, or when creating a [Dataset] and you want to
+#' ensure a specific schema rather than inferring it from the various files.
+#'
+#' Many Arrow objects, including [Table] and [Dataset], have a `$schema` method
+#' (active binding) that lets you access their schema.
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @section Methods:
+#'
+#' - `$ToString()`: convert to a string
+#' - `$field(i)`: returns the field at index `i` (0-based)
+#' - `$GetFieldByName(x)`: returns the field with name `x`
+#' - `$WithMetadata(metadata)`: returns a new `Schema` with the key-value
+#' `metadata` set. Note that all list elements in `metadata` will be coerced
+#' to `character`.
+#'
+#' @section Active bindings:
+#'
+#' - `$names`: returns the field names (called in `names(Schema)`)
+#' - `$num_fields`: returns the number of fields (called in `length(Schema)`)
+#' - `$fields`: returns the list of `Field`s in the `Schema`, suitable for
+#' iterating over
+#' - `$HasMetadata`: logical: does this `Schema` have extra metadata?
+#' - `$metadata`: returns the key-value metadata as a named list.
+#' Modify or replace by assigning in (`sch$metadata <- new_metadata`).
+#' All list elements are coerced to string.
+#'
+#' @section R Metadata:
+#'
+#' When converting a data.frame to an Arrow Table or RecordBatch, attributes
+#' from the `data.frame` are saved alongside tables so that the object can be
+#' reconstructed faithfully in R (e.g. with `as.data.frame()`). This metadata
+#' can be both at the top-level of the `data.frame` (e.g. `attributes(df)`) or
+#' at the column (e.g. `attributes(df$col_a)`) or for list columns only:
+#' element level (e.g. `attributes(df[1, "col_a"])`). For example, this allows
+#' for storing `haven` columns in a table and being able to faithfully
+#' re-create them when pulled back into R. This metadata is separate from the
+#' schema (column names and types) which is compatible with other Arrow
+#' clients. The R metadata is only read by R and is ignored by other clients
+#' (e.g. Pandas has its own custom metadata). This metadata is stored in
+#' `$metadata$r`.
+#'
+#' Since Schema metadata keys and values must be strings, this metadata is
+#' saved by serializing R's attribute list structure to a string. If the
+#' serialized metadata exceeds 100Kb in size, by default it is compressed
+#' starting in version 3.0.0. To disable this compression (e.g. for tables
+#' that are compatible with Arrow versions before 3.0.0 and include large
+#' amounts of metadata), set the option `arrow.compress_metadata` to `FALSE`.
+#' Files with compressed metadata are readable by older versions of arrow, but
+#' the metadata is dropped.
+#'
+#' @rdname Schema
+#' @name Schema
+#' @examplesIf arrow_available()
+#' df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5))
+#' tab1 <- arrow_table(df)
+#' tab1$schema
+#' tab2 <- arrow_table(df, schema = schema(col1 = int8(), col2 = float32()))
+#' tab2$schema
+#' @export
+Schema <- R6Class("Schema",
+ inherit = ArrowObject,
+ public = list(
+ ToString = function() {
+ fields <- print_schema_fields(self)
+ if (self$HasMetadata) {
+ fields <- paste0(fields, "\n\nSee $metadata for additional Schema metadata")
+ }
+ fields
+ },
+ field = function(i) Schema__field(self, i),
+ GetFieldByName = function(x) Schema__GetFieldByName(self, x),
+ AddField = function(i, field) {
+ assert_is(field, "Field")
+ Schema__AddField(self, i, field)
+ },
+ SetField = function(i, field) {
+ assert_is(field, "Field")
+ Schema__SetField(self, i, field)
+ },
+ RemoveField = function(i) Schema__RemoveField(self, i),
+ serialize = function() Schema__serialize(self),
+ WithMetadata = function(metadata = NULL) {
+ metadata <- prepare_key_value_metadata(metadata)
+ Schema__WithMetadata(self, metadata)
+ },
+ Equals = function(other, check_metadata = FALSE, ...) {
+ inherits(other, "Schema") && Schema__Equals(self, other, isTRUE(check_metadata))
+ },
+ export_to_c = function(ptr) ExportSchema(self, ptr)
+ ),
+ active = list(
+ names = function() {
+ Schema__field_names(self)
+ },
+ num_fields = function() Schema__num_fields(self),
+ fields = function() Schema__fields(self),
+ HasMetadata = function() Schema__HasMetadata(self),
+ metadata = function(new_metadata) {
+ if (missing(new_metadata)) {
+ Schema__metadata(self)
+ } else {
+ # Set the metadata
+ out <- self$WithMetadata(new_metadata)
+ # $WithMetadata returns a new object but we're modifying in place,
+ # so swap in that new C++ object pointer into our R6 object
+ self$set_pointer(out$pointer())
+ self
+ }
+ },
+ r_metadata = function(new) {
+ # Helper for the R metadata that handles the serialization
+ # See also method on ArrowTabular
+ if (missing(new)) {
+ out <- self$metadata$r
+ if (!is.null(out)) {
+ # Can't unserialize NULL
+ out <- .unserialize_arrow_r_metadata(out)
+ }
+ # Returns either NULL or a named list
+ out
+ } else {
+ # Set the R metadata
+ self$metadata$r <- .serialize_arrow_r_metadata(new)
+ self
+ }
+ }
+ )
+)
+Schema$create <- function(...) {
+ .list <- list2(...)
+ if (all(map_lgl(.list, ~ inherits(., "Field")))) {
+ schema_(.list)
+ } else {
+ schema_(.fields(.list))
+ }
+}
+#' @include arrowExports.R
+Schema$import_from_c <- ImportSchema
+
+prepare_key_value_metadata <- function(metadata) {
+ # key-value-metadata must be a named character vector;
+ # this function validates and coerces
+ if (is.null(metadata)) {
+ # NULL to remove metadata, so equivalent to setting an empty list
+ metadata <- empty_named_list()
+ }
+ if (is.null(names(metadata))) {
+ stop(
+ "Key-value metadata must be a named list or character vector",
+ call. = FALSE
+ )
+ }
+ map_chr(metadata, as.character)
+}
+
+print_schema_fields <- function(s) {
+ # Alternative to Schema__ToString that doesn't print metadata
+ paste(map_chr(s$fields, ~ .$ToString()), collapse = "\n")
+}
+
+#' @param ... named list containing [data types][data-type] or
+#' a list of [fields][field] containing the fields for the schema
+#' @export
+#' @rdname Schema
+schema <- Schema$create
+
+#' @export
+names.Schema <- function(x) x$names
+
+#' @export
+length.Schema <- function(x) x$num_fields
+
+#' @export
+`[[.Schema` <- function(x, i, ...) {
+ if (is.character(i)) {
+ x$GetFieldByName(i)
+ } else if (is.numeric(i)) {
+ x$field(i - 1)
+ } else {
+ stop("'i' must be character or numeric, not ", class(i), call. = FALSE)
+ }
+}
+
+#' @export
+`[[<-.Schema` <- function(x, i, value) {
+ assert_that(length(i) == 1)
+ if (is.character(i)) {
+ field_names <- names(x)
+ if (anyDuplicated(field_names)) {
+ stop("Cannot update field by name with duplicates", call. = FALSE)
+ }
+
+ # If i is character, it's the field name
+ if (!is.null(value) && !inherits(value, "Field")) {
+ value <- field(i, as_type(value, "value"))
+ }
+
+ # No match means we're adding to the end
+ i <- match(i, field_names, nomatch = length(field_names) + 1L)
+ } else {
+ assert_that(is.numeric(i), !is.na(i), i > 0)
+ # If i is numeric and we have a type,
+ # we need to grab the existing field name for the new one
+ if (!is.null(value) && !inherits(value, "Field")) {
+ value <- field(names(x)[i], as_type(value, "value"))
+ }
+ }
+
+ i <- as.integer(i - 1L)
+ if (i >= length(x)) {
+ if (!is.null(value)) {
+ x <- x$AddField(i, value)
+ }
+ } else if (is.null(value)) {
+ x <- x$RemoveField(i)
+ } else {
+ x <- x$SetField(i, value)
+ }
+ x
+}
+
+#' @export
+`$<-.Schema` <- `$<-.ArrowTabular`
+
+#' @export
+`[.Schema` <- function(x, i, ...) {
+ if (is.logical(i)) {
+ i <- rep_len(i, length(x)) # For R recycling behavior
+ i <- which(i)
+ }
+ if (is.numeric(i)) {
+ if (all(i < 0)) {
+ # in R, negative i means "everything but i"
+ i <- setdiff(seq_len(length(x)), -1 * i)
+ }
+ }
+ fields <- map(i, ~ x[[.]])
+ invalid <- map_lgl(fields, is.null)
+ if (any(invalid)) {
+ stop(
+ "Invalid field name", ifelse(sum(invalid) > 1, "s: ", ": "),
+ oxford_paste(i[invalid]),
+ call. = FALSE
+ )
+ }
+ schema_(fields)
+}
+
+#' @export
+`$.Schema` <- function(x, name, ...) {
+ assert_that(is.string(name))
+ if (name %in% ls(x)) {
+ get(name, x)
+ } else {
+ x$GetFieldByName(name)
+ }
+}
+
+#' @export
+as.list.Schema <- function(x, ...) x$fields
+
+#' read a Schema from a stream
+#'
+#' @param stream a `Message`, `InputStream`, or `Buffer`
+#' @param ... currently ignored
+#' @return A [Schema]
+#' @export
+read_schema <- function(stream, ...) {
+ if (inherits(stream, "Message")) {
+ return(ipc___ReadSchema_Message(stream))
+ } else {
+ if (!inherits(stream, "InputStream")) {
+ stream <- BufferReader$create(stream)
+ on.exit(stream$close())
+ }
+ return(ipc___ReadSchema_InputStream(stream))
+ }
+}
+
+#' Combine and harmonize schemas
+#'
+#' @param ... [Schema]s to unify
+#' @param schemas Alternatively, a list of schemas
+#' @return A `Schema` with the union of fields contained in the inputs, or
+#' `NULL` if any of `schemas` is `NULL`
+#' @export
+#' @examplesIf arrow_available()
+#' a <- schema(b = double(), c = bool())
+#' z <- schema(b = double(), k = utf8())
+#' unify_schemas(a, z)
+unify_schemas <- function(..., schemas = list(...)) {
+ if (any(vapply(schemas, is.null, TRUE))) {
+ return(NULL)
+ }
+ arrow__UnifySchemas(schemas)
+}
+
+#' @export
+print.arrow_r_metadata <- function(x, ...) {
+ utils::str(x)
+ utils::str(.unserialize_arrow_r_metadata(x))
+ invisible(x)
+}
diff --git a/src/arrow/r/R/table.R b/src/arrow/r/R/table.R
new file mode 100644
index 000000000..5ae87f7e3
--- /dev/null
+++ b/src/arrow/r/R/table.R
@@ -0,0 +1,170 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include record-batch.R
+#' @title Table class
+#' @description A Table is a sequence of [chunked arrays][ChunkedArray]. They
+#' have a similar interface to [record batches][RecordBatch], but they can be
+#' composed from multiple record batches or chunked arrays.
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section S3 Methods and Usage:
+#' Tables are data-frame-like, and many methods you expect to work on
+#' a `data.frame` are implemented for `Table`. This includes `[`, `[[`,
+#' `$`, `names`, `dim`, `nrow`, `ncol`, `head`, and `tail`. You can also pull
+#' the data from an Arrow table into R with `as.data.frame()`. See the
+#' examples.
+#'
+#' A caveat about the `$` method: because `Table` is an `R6` object,
+#' `$` is also used to access the object's methods (see below). Methods take
+#' precedence over the table's columns. So, `tab$Slice` would return the
+#' "Slice" method function even if there were a column in the table called
+#' "Slice".
+#'
+#' @section R6 Methods:
+#' In addition to the more R-friendly S3 methods, a `Table` object has
+#' the following R6 methods that map onto the underlying C++ methods:
+#'
+#' - `$column(i)`: Extract a `ChunkedArray` by integer position from the table
+#' - `$ColumnNames()`: Get all column names (called by `names(tab)`)
+#' - `$RenameColumns(value)`: Set all column names (called by `names(tab) <- value`)
+#' - `$GetColumnByName(name)`: Extract a `ChunkedArray` by string name
+#' - `$field(i)`: Extract a `Field` from the table schema by integer position
+#' - `$SelectColumns(indices)`: Return new `Table` with specified columns, expressed as 0-based integers.
+#' - `$Slice(offset, length = NULL)`: Create a zero-copy view starting at the
+#' indicated integer offset and going for the given length, or to the end
+#' of the table if `NULL`, the default.
+#' - `$Take(i)`: return an `Table` with rows at positions given by
+#' integers `i`. If `i` is an Arrow `Array` or `ChunkedArray`, it will be
+#' coerced to an R vector before taking.
+#' - `$Filter(i, keep_na = TRUE)`: return an `Table` with rows at positions where logical
+#' vector or Arrow boolean-type `(Chunked)Array` `i` is `TRUE`.
+#' - `$SortIndices(names, descending = FALSE)`: return an `Array` of integer row
+#' positions that can be used to rearrange the `Table` in ascending or descending
+#' order by the first named column, breaking ties with further named columns.
+#' `descending` can be a logical vector of length one or of the same length as
+#' `names`.
+#' - `$serialize(output_stream, ...)`: Write the table to the given
+#' [OutputStream]
+#' - `$cast(target_schema, safe = TRUE, options = cast_options(safe))`: Alter
+#' the schema of the record batch.
+#'
+#' There are also some active bindings:
+#' - `$num_columns`
+#' - `$num_rows`
+#' - `$schema`
+#' - `$metadata`: Returns the key-value metadata of the `Schema` as a named list.
+#' Modify or replace by assigning in (`tab$metadata <- new_metadata`).
+#' All list elements are coerced to string. See `schema()` for more information.
+#' - `$columns`: Returns a list of `ChunkedArray`s
+#' @rdname Table
+#' @name Table
+#' @export
+Table <- R6Class("Table",
+ inherit = ArrowTabular,
+ public = list(
+ column = function(i) Table__column(self, i),
+ ColumnNames = function() Table__ColumnNames(self),
+ RenameColumns = function(value) Table__RenameColumns(self, value),
+ GetColumnByName = function(name) {
+ assert_is(name, "character")
+ assert_that(length(name) == 1)
+ Table__GetColumnByName(self, name)
+ },
+ RemoveColumn = function(i) Table__RemoveColumn(self, i),
+ AddColumn = function(i, new_field, value) Table__AddColumn(self, i, new_field, value),
+ SetColumn = function(i, new_field, value) Table__SetColumn(self, i, new_field, value),
+ ReplaceSchemaMetadata = function(new) {
+ Table__ReplaceSchemaMetadata(self, new)
+ },
+ field = function(i) Table__field(self, i),
+ serialize = function(output_stream, ...) write_table(self, output_stream, ...),
+ to_data_frame = function() {
+ Table__to_dataframe(self, use_threads = option_use_threads())
+ },
+ cast = function(target_schema, safe = TRUE, ..., options = cast_options(safe, ...)) {
+ assert_is(target_schema, "Schema")
+ assert_that(identical(self$schema$names, target_schema$names), msg = "incompatible schemas")
+ Table__cast(self, target_schema, options)
+ },
+ SelectColumns = function(indices) Table__SelectColumns(self, indices),
+ Slice = function(offset, length = NULL) {
+ if (is.null(length)) {
+ Table__Slice1(self, offset)
+ } else {
+ Table__Slice2(self, offset, length)
+ }
+ },
+ # Take, Filter, and SortIndices are methods on ArrowTabular
+ Equals = function(other, check_metadata = FALSE, ...) {
+ inherits(other, "Table") && Table__Equals(self, other, isTRUE(check_metadata))
+ },
+ Validate = function() Table__Validate(self),
+ ValidateFull = function() Table__ValidateFull(self),
+ invalidate = function() {
+ .Call(`_arrow_Table__Reset`, self)
+ super$invalidate()
+ }
+ ),
+ active = list(
+ num_columns = function() Table__num_columns(self),
+ num_rows = function() Table__num_rows(self),
+ schema = function() Table__schema(self),
+ columns = function() Table__columns(self)
+ )
+)
+
+Table$create <- function(..., schema = NULL) {
+ dots <- list2(...)
+ # making sure there are always names
+ if (is.null(names(dots))) {
+ names(dots) <- rep_len("", length(dots))
+ }
+ stopifnot(length(dots) > 0)
+
+ if (all_record_batches(dots)) {
+ return(Table__from_record_batches(dots, schema))
+ }
+
+ # If any arrays are length 1, recycle them
+ dots <- recycle_scalars(dots)
+
+ Table__from_dots(dots, schema, option_use_threads())
+}
+
+#' @export
+names.Table <- function(x) x$ColumnNames()
+
+#' @param ... A `data.frame` or a named set of Arrays or vectors. If given a
+#' mixture of data.frames and named vectors, the inputs will be autospliced together
+#' (see examples). Alternatively, you can provide a single Arrow IPC
+#' `InputStream`, `Message`, `Buffer`, or R `raw` object containing a `Buffer`.
+#' @param schema a [Schema], or `NULL` (the default) to infer the schema from
+#' the data in `...`. When providing an Arrow IPC buffer, `schema` is required.
+#' @rdname Table
+#' @examplesIf arrow_available()
+#' tbl <- arrow_table(name = rownames(mtcars), mtcars)
+#' dim(tbl)
+#' dim(head(tbl))
+#' names(tbl)
+#' tbl$mpg
+#' tbl[["cyl"]]
+#' as.data.frame(tbl[4:8, c("gear", "hp", "wt")])
+#' @export
+arrow_table <- Table$create
diff --git a/src/arrow/r/R/type.R b/src/arrow/r/R/type.R
new file mode 100644
index 000000000..4ef7cefb5
--- /dev/null
+++ b/src/arrow/r/R/type.R
@@ -0,0 +1,541 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-package.R
+#' @title class arrow::DataType
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname DataType
+#' @name DataType
+DataType <- R6Class("DataType",
+ inherit = ArrowObject,
+ public = list(
+ ToString = function() {
+ DataType__ToString(self)
+ },
+ Equals = function(other, ...) {
+ inherits(other, "DataType") && DataType__Equals(self, other)
+ },
+ fields = function() {
+ DataType__fields(self)
+ },
+ export_to_c = function(ptr) ExportType(self, ptr)
+ ),
+ active = list(
+ id = function() DataType__id(self),
+ name = function() DataType__name(self),
+ num_fields = function() DataType__num_fields(self)
+ )
+)
+
+#' @include arrowExports.R
+DataType$import_from_c <- ImportType
+
+INTEGER_TYPES <- as.character(outer(c("uint", "int"), c(8, 16, 32, 64), paste0))
+FLOAT_TYPES <- c("float16", "float32", "float64", "halffloat", "float", "double")
+
+#' infer the arrow Array type from an R vector
+#'
+#' @param x an R vector
+#'
+#' @return an arrow logical type
+#' @examplesIf arrow_available()
+#' type(1:10)
+#' type(1L:10L)
+#' type(c(1, 1.5, 2))
+#' type(c("A", "B", "C"))
+#' type(mtcars)
+#' type(Sys.Date())
+#' @export
+type <- function(x) UseMethod("type")
+
+#' @export
+type.default <- function(x) Array__infer_type(x)
+
+#' @export
+type.ArrowDatum <- function(x) x$type
+
+#----- metadata
+
+#' @title class arrow::FixedWidthType
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname FixedWidthType
+#' @name FixedWidthType
+FixedWidthType <- R6Class("FixedWidthType",
+ inherit = DataType,
+ active = list(
+ bit_width = function() FixedWidthType__bit_width(self)
+ )
+)
+
+Int8 <- R6Class("Int8", inherit = FixedWidthType)
+Int16 <- R6Class("Int16", inherit = FixedWidthType)
+Int32 <- R6Class("Int32", inherit = FixedWidthType)
+Int64 <- R6Class("Int64", inherit = FixedWidthType)
+UInt8 <- R6Class("UInt8", inherit = FixedWidthType)
+UInt16 <- R6Class("UInt16", inherit = FixedWidthType)
+UInt32 <- R6Class("UInt32", inherit = FixedWidthType)
+UInt64 <- R6Class("UInt64", inherit = FixedWidthType)
+Float16 <- R6Class("Float16", inherit = FixedWidthType)
+Float32 <- R6Class("Float32", inherit = FixedWidthType)
+Float64 <- R6Class("Float64", inherit = FixedWidthType)
+Boolean <- R6Class("Boolean", inherit = FixedWidthType)
+Utf8 <- R6Class("Utf8", inherit = DataType)
+LargeUtf8 <- R6Class("LargeUtf8", inherit = DataType)
+Binary <- R6Class("Binary", inherit = DataType)
+FixedSizeBinary <- R6Class("FixedSizeBinary", inherit = FixedWidthType)
+LargeBinary <- R6Class("LargeBinary", inherit = DataType)
+
+DateType <- R6Class("DateType",
+ inherit = FixedWidthType,
+ public = list(
+ unit = function() DateType__unit(self)
+ )
+)
+Date32 <- R6Class("Date32", inherit = DateType)
+Date64 <- R6Class("Date64", inherit = DateType)
+
+TimeType <- R6Class("TimeType",
+ inherit = FixedWidthType,
+ public = list(
+ unit = function() TimeType__unit(self)
+ )
+)
+Time32 <- R6Class("Time32", inherit = TimeType)
+Time64 <- R6Class("Time64", inherit = TimeType)
+
+Null <- R6Class("Null", inherit = DataType)
+
+Timestamp <- R6Class("Timestamp",
+ inherit = FixedWidthType,
+ public = list(
+ timezone = function() TimestampType__timezone(self),
+ unit = function() TimestampType__unit(self)
+ )
+)
+
+DecimalType <- R6Class("DecimalType",
+ inherit = FixedWidthType,
+ public = list(
+ precision = function() DecimalType__precision(self),
+ scale = function() DecimalType__scale(self)
+ )
+)
+Decimal128Type <- R6Class("Decimal128Type", inherit = DecimalType)
+
+NestedType <- R6Class("NestedType", inherit = DataType)
+
+#' Apache Arrow data types
+#'
+#' These functions create type objects corresponding to Arrow types. Use them
+#' when defining a [schema()] or as inputs to other types, like `struct`. Most
+#' of these functions don't take arguments, but a few do.
+#'
+#' A few functions have aliases:
+#'
+#' * `utf8()` and `string()`
+#' * `float16()` and `halffloat()`
+#' * `float32()` and `float()`
+#' * `bool()` and `boolean()`
+#' * When called inside an `arrow` function, such as `schema()` or `cast()`,
+#' `double()` also is supported as a way of creating a `float64()`
+#'
+#' `date32()` creates a datetime type with a "day" unit, like the R `Date`
+#' class. `date64()` has a "ms" unit.
+#'
+#' `uint32` (32 bit unsigned integer), `uint64` (64 bit unsigned integer), and
+#' `int64` (64-bit signed integer) types may contain values that exceed the
+#' range of R's `integer` type (32-bit signed integer). When these arrow objects
+#' are translated to R objects, `uint32` and `uint64` are converted to `double`
+#' ("numeric") and `int64` is converted to `bit64::integer64`. For `int64`
+#' types, this conversion can be disabled (so that `int64` always yields a
+#' `bit64::integer64` object) by setting `options(arrow.int64_downcast =
+#' FALSE)`.
+#'
+#' @param unit For time/timestamp types, the time unit. `time32()` can take
+#' either "s" or "ms", while `time64()` can be "us" or "ns". `timestamp()` can
+#' take any of those four values.
+#' @param timezone For `timestamp()`, an optional time zone string.
+#' @param byte_width byte width for `FixedSizeBinary` type.
+#' @param list_size list size for `FixedSizeList` type.
+#' @param precision For `decimal()`, precision
+#' @param scale For `decimal()`, scale
+#' @param type For `list_of()`, a data type to make a list-of-type
+#' @param ... For `struct()`, a named list of types to define the struct columns
+#'
+#' @name data-type
+#' @return An Arrow type object inheriting from DataType.
+#' @export
+#' @seealso [dictionary()] for creating a dictionary (factor-like) type.
+#' @examplesIf arrow_available()
+#' bool()
+#' struct(a = int32(), b = double())
+#' timestamp("ms", timezone = "CEST")
+#' time64("ns")
+int8 <- function() Int8__initialize()
+
+#' @rdname data-type
+#' @export
+int16 <- function() Int16__initialize()
+
+#' @rdname data-type
+#' @export
+int32 <- function() Int32__initialize()
+
+#' @rdname data-type
+#' @export
+int64 <- function() Int64__initialize()
+
+#' @rdname data-type
+#' @export
+uint8 <- function() UInt8__initialize()
+
+#' @rdname data-type
+#' @export
+uint16 <- function() UInt16__initialize()
+
+#' @rdname data-type
+#' @export
+uint32 <- function() UInt32__initialize()
+
+#' @rdname data-type
+#' @export
+uint64 <- function() UInt64__initialize()
+
+#' @rdname data-type
+#' @export
+float16 <- function() Float16__initialize()
+
+#' @rdname data-type
+#' @export
+halffloat <- float16
+
+#' @rdname data-type
+#' @export
+float32 <- function() Float32__initialize()
+
+#' @rdname data-type
+#' @export
+float <- float32
+
+#' @rdname data-type
+#' @export
+float64 <- function() Float64__initialize()
+
+#' @rdname data-type
+#' @export
+boolean <- function() Boolean__initialize()
+
+#' @rdname data-type
+#' @export
+bool <- boolean
+
+#' @rdname data-type
+#' @export
+utf8 <- function() Utf8__initialize()
+
+#' @rdname data-type
+#' @export
+large_utf8 <- function() LargeUtf8__initialize()
+
+#' @rdname data-type
+#' @export
+binary <- function() Binary__initialize()
+
+#' @rdname data-type
+#' @export
+large_binary <- function() LargeBinary__initialize()
+
+#' @rdname data-type
+#' @export
+fixed_size_binary <- function(byte_width) FixedSizeBinary__initialize(byte_width)
+
+#' @rdname data-type
+#' @export
+string <- utf8
+
+#' @rdname data-type
+#' @export
+date32 <- function() Date32__initialize()
+
+#' @rdname data-type
+#' @export
+date64 <- function() Date64__initialize()
+
+#' @rdname data-type
+#' @export
+time32 <- function(unit = c("ms", "s")) {
+ if (is.character(unit)) {
+ unit <- match.arg(unit)
+ }
+ unit <- make_valid_time_unit(unit, valid_time32_units)
+ Time32__initialize(unit)
+}
+
+valid_time32_units <- c(
+ "ms" = TimeUnit$MILLI,
+ "s" = TimeUnit$SECOND
+)
+
+valid_time64_units <- c(
+ "ns" = TimeUnit$NANO,
+ "us" = TimeUnit$MICRO
+)
+
+make_valid_time_unit <- function(unit, valid_units) {
+ if (is.character(unit)) {
+ unit <- valid_units[match.arg(unit, choices = names(valid_units))]
+ }
+ if (is.numeric(unit)) {
+ # Allow non-integer input for convenience
+ unit <- as.integer(unit)
+ } else {
+ stop('"unit" should be one of ', oxford_paste(names(valid_units), "or"), call. = FALSE)
+ }
+ if (!(unit %in% valid_units)) {
+ stop('"unit" should be one of ', oxford_paste(valid_units, "or"), call. = FALSE)
+ }
+ unit
+}
+
+#' @rdname data-type
+#' @export
+time64 <- function(unit = c("ns", "us")) {
+ if (is.character(unit)) {
+ unit <- match.arg(unit)
+ }
+ unit <- make_valid_time_unit(unit, valid_time64_units)
+ Time64__initialize(unit)
+}
+
+#' @rdname data-type
+#' @export
+null <- function() Null__initialize()
+
+#' @rdname data-type
+#' @export
+timestamp <- function(unit = c("s", "ms", "us", "ns"), timezone = "") {
+ if (is.character(unit)) {
+ unit <- match.arg(unit)
+ }
+ unit <- make_valid_time_unit(unit, c(valid_time64_units, valid_time32_units))
+ assert_that(is.string(timezone))
+ Timestamp__initialize(unit, timezone)
+}
+
+#' @rdname data-type
+#' @export
+decimal <- function(precision, scale) {
+ if (is.numeric(precision)) {
+ precision <- as.integer(precision)
+ } else {
+ stop('"precision" must be an integer', call. = FALSE)
+ }
+ if (is.numeric(scale)) {
+ scale <- as.integer(scale)
+ } else {
+ stop('"scale" must be an integer', call. = FALSE)
+ }
+ Decimal128Type__initialize(precision, scale)
+}
+
+StructType <- R6Class("StructType",
+ inherit = NestedType,
+ public = list(
+ GetFieldByName = function(name) StructType__GetFieldByName(self, name),
+ GetFieldIndex = function(name) StructType__GetFieldIndex(self, name)
+ )
+)
+StructType$create <- function(...) struct__(.fields(list(...)))
+
+#' @rdname data-type
+#' @export
+struct <- StructType$create
+
+ListType <- R6Class("ListType",
+ inherit = NestedType,
+ active = list(
+ value_field = function() ListType__value_field(self),
+ value_type = function() ListType__value_type(self)
+ )
+)
+
+#' @rdname data-type
+#' @export
+list_of <- function(type) list__(type)
+
+LargeListType <- R6Class("LargeListType",
+ inherit = NestedType,
+ active = list(
+ value_field = function() LargeListType__value_field(self),
+ value_type = function() LargeListType__value_type(self)
+ )
+)
+
+#' @rdname data-type
+#' @export
+large_list_of <- function(type) large_list__(type)
+
+#' @rdname data-type
+#' @export
+FixedSizeListType <- R6Class("FixedSizeListType",
+ inherit = NestedType,
+ active = list(
+ value_field = function() FixedSizeListType__value_field(self),
+ value_type = function() FixedSizeListType__value_type(self),
+ list_size = function() FixedSizeListType__list_size(self)
+ )
+)
+
+#' @rdname data-type
+#' @export
+fixed_size_list_of <- function(type, list_size) fixed_size_list__(type, list_size)
+
+as_type <- function(type, name = "type") {
+ # magic so we don't have to mask base::double()
+ if (identical(type, double())) {
+ type <- float64()
+ }
+ if (!inherits(type, "DataType")) {
+ stop(name, " must be a DataType, not ", class(type), call. = FALSE)
+ }
+ type
+}
+
+canonical_type_str <- function(type_str) {
+ # canonicalizes data type strings, converting data type function names and
+ # aliases to match the strings returned by DataType$ToString()
+ assert_that(is.string(type_str))
+ if (grepl("[([<]", type_str)) {
+ stop("Cannot interpret string representations of data types that have parameters", call. = FALSE)
+ }
+ switch(type_str,
+ int8 = "int8",
+ int16 = "int16",
+ int32 = "int32",
+ int64 = "int64",
+ uint8 = "uint8",
+ uint16 = "uint16",
+ uint32 = "uint32",
+ uint64 = "uint64",
+ float16 = "halffloat",
+ halffloat = "halffloat",
+ float32 = "float",
+ float = "float",
+ float64 = "double",
+ double = "double",
+ boolean = "bool",
+ bool = "bool",
+ utf8 = "string",
+ large_utf8 = "large_string",
+ large_string = "large_string",
+ binary = "binary",
+ large_binary = "large_binary",
+ fixed_size_binary = "fixed_size_binary",
+ string = "string",
+ date32 = "date32",
+ date64 = "date64",
+ time32 = "time32",
+ time64 = "time64",
+ null = "null",
+ timestamp = "timestamp",
+ decimal = "decimal128",
+ struct = "struct",
+ list_of = "list",
+ list = "list",
+ large_list_of = "large_list",
+ large_list = "large_list",
+ fixed_size_list_of = "fixed_size_list",
+ fixed_size_list = "fixed_size_list",
+ stop("Unrecognized string representation of data type", call. = FALSE)
+ )
+}
+
+# vctrs support -----------------------------------------------------------
+str_dup <- function(x, times) {
+ paste0(rep(x, times = times), collapse = "")
+}
+
+indent <- function(x, n) {
+ pad <- str_dup(" ", n)
+ sapply(x, gsub, pattern = "(\n+)", replacement = paste0("\\1", pad))
+}
+
+#' @importFrom vctrs vec_ptype_full vec_ptype_abbr
+#' @export
+vec_ptype_full.arrow_fixed_size_binary <- function(x, ...) {
+ paste0("fixed_size_binary<", attr(x, "byte_width"), ">")
+}
+
+#' @export
+vec_ptype_full.arrow_list <- function(x, ...) {
+ param <- vec_ptype_full(attr(x, "ptype"))
+ if (grepl("\n", param)) {
+ param <- paste0(indent(paste0("\n", param), 2), "\n")
+ }
+ paste0("list<", param, ">")
+}
+
+#' @export
+vec_ptype_full.arrow_large_list <- function(x, ...) {
+ param <- vec_ptype_full(attr(x, "ptype"))
+ if (grepl("\n", param)) {
+ param <- paste0(indent(paste0("\n", param), 2), "\n")
+ }
+ paste0("large_list<", param, ">")
+}
+
+#' @export
+vec_ptype_full.arrow_fixed_size_list <- function(x, ...) {
+ param <- vec_ptype_full(attr(x, "ptype"))
+ if (grepl("\n", param)) {
+ param <- paste0(indent(paste0("\n", param), 2), "\n")
+ }
+ paste0("fixed_size_list<", param, ", ", attr(x, "list_size"), ">")
+}
+
+#' @export
+vec_ptype_abbr.arrow_fixed_size_binary <- function(x, ...) {
+ vec_ptype_full(x, ...)
+}
+#' @export
+vec_ptype_abbr.arrow_list <- function(x, ...) {
+ vec_ptype_full(x, ...)
+}
+#' @export
+vec_ptype_abbr.arrow_large_list <- function(x, ...) {
+ vec_ptype_full(x, ...)
+}
+#' @export
+vec_ptype_abbr.arrow_fixed_size_list <- function(x, ...) {
+ vec_ptype_full(x, ...)
+}
diff --git a/src/arrow/r/R/util.R b/src/arrow/r/R/util.R
new file mode 100644
index 000000000..9e3ade6a9
--- /dev/null
+++ b/src/arrow/r/R/util.R
@@ -0,0 +1,195 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# for compatibility with R versions earlier than 4.0.0
+if (!exists("deparse1")) {
+ deparse1 <- function(expr, collapse = " ", width.cutoff = 500L, ...) {
+ paste(deparse(expr, width.cutoff, ...), collapse = collapse)
+ }
+}
+
+# for compatibility with R versions earlier than 3.6.0
+if (!exists("str2lang")) {
+ str2lang <- function(s) {
+ parse(text = s, keep.source = FALSE)[[1]]
+ }
+}
+
+oxford_paste <- function(x, conjunction = "and", quote = TRUE) {
+ if (quote && is.character(x)) {
+ x <- paste0('"', x, '"')
+ }
+ if (length(x) < 2) {
+ return(x)
+ }
+ x[length(x)] <- paste(conjunction, x[length(x)])
+ if (length(x) > 2) {
+ return(paste(x, collapse = ", "))
+ } else {
+ return(paste(x, collapse = " "))
+ }
+}
+
+assert_is <- function(object, class) {
+ msg <- paste(substitute(object), "must be a", oxford_paste(class, "or"))
+ assert_that(inherits(object, class), msg = msg)
+}
+
+assert_is_list_of <- function(object, class) {
+ msg <- paste(substitute(object), "must be a list of", oxford_paste(class, "or"))
+ assert_that(is_list_of(object, class), msg = msg)
+}
+
+is_list_of <- function(object, class) {
+ is.list(object) && all(map_lgl(object, ~ inherits(., class)))
+}
+
+empty_named_list <- function() structure(list(), .Names = character(0))
+
+r_symbolic_constants <- c(
+ "pi", "TRUE", "FALSE", "NULL", "Inf", "NA", "NaN",
+ "NA_integer_", "NA_real_", "NA_complex_", "NA_character_"
+)
+
+is_function <- function(expr, name) {
+ # We could have a quosure here if we have an expression like `sum({{ var }})`
+ if (is_quosure(expr)) {
+ expr <- quo_get_expr(expr)
+ }
+ if (!is.call(expr)) {
+ return(FALSE)
+ } else {
+ if (deparse(expr[[1]]) == name) {
+ return(TRUE)
+ }
+ out <- lapply(expr, is_function, name)
+ }
+ any(map_lgl(out, isTRUE))
+}
+
+all_funs <- function(expr) {
+ # It is not sufficient to simply do: setdiff(all.names, all.vars)
+ # here because that would fail to return the names of functions that
+ # share names with variables.
+ # To preserve duplicates, call `all.names()` not `all_names()` here.
+ if (is_quosure(expr)) {
+ expr <- quo_get_expr(expr)
+ }
+ names <- all.names(expr)
+ names[map_lgl(names, ~ is_function(expr, .))]
+}
+
+all_vars <- function(expr) {
+ setdiff(all.vars(expr), r_symbolic_constants)
+}
+
+all_names <- function(expr) {
+ setdiff(all.names(expr), r_symbolic_constants)
+}
+
+is_constant <- function(expr) {
+ length(all_vars(expr)) == 0
+}
+
+read_compressed_error <- function(e) {
+ msg <- conditionMessage(e)
+ if (grepl(" codec ", msg)) {
+ compression <- sub(".*Support for codec '(.*)'.*", "\\1", msg)
+ e$message <- paste0(
+ msg,
+ "\nIn order to read this file, you will need to reinstall arrow with additional features enabled.",
+ "\nSet one of these environment variables before installing:",
+ sprintf("\n\n * LIBARROW_MINIMAL=false (for all optional features, including '%s')", compression),
+ sprintf("\n * ARROW_WITH_%s=ON (for just '%s')", toupper(compression), compression),
+ "\n\nSee https://arrow.apache.org/docs/r/articles/install.html for details"
+ )
+ }
+ stop(e)
+}
+
+handle_parquet_io_error <- function(e, format) {
+ msg <- conditionMessage(e)
+ if (grepl("Parquet magic bytes not found in footer", msg) && length(format) > 1 && is_character(format)) {
+ # If length(format) > 1, that means it is (almost certainly) the default/not specified value
+ # so let the user know that they should specify the actual (not parquet) format
+ abort(c(
+ msg,
+ i = "Did you mean to specify a 'format' other than the default (parquet)?"
+ ))
+ }
+ stop(e)
+}
+
+is_writable_table <- function(x) {
+ inherits(x, c("data.frame", "ArrowTabular"))
+}
+
+# This attribute is used when is_writable is passed into assert_that, and allows
+# the call to form part of the error message when is_writable is FALSE
+attr(is_writable_table, "fail") <- function(call, env) {
+ paste0(
+ deparse(call$x),
+ " must be an object of class 'data.frame', 'RecordBatch', or 'Table', not '",
+ class(env[[deparse(call$x)]])[[1]],
+ "'."
+ )
+}
+
+#' Recycle scalar values in a list of arrays
+#'
+#' @param arrays List of arrays
+#' @return List of arrays with any vector/Scalar/Array/ChunkedArray values of length 1 recycled
+#' @keywords internal
+recycle_scalars <- function(arrays) {
+ # Get lengths of items in arrays
+ arr_lens <- map_int(arrays, NROW)
+
+ is_scalar <- arr_lens == 1
+
+ if (length(arrays) > 1 && any(is_scalar) && !all(is_scalar)) {
+
+ # Recycling not supported for tibbles and data.frames
+ if (all(map_lgl(arrays, ~ inherits(.x, "data.frame")))) {
+ abort(c(
+ "All input tibbles or data.frames must have the same number of rows",
+ x = paste(
+ "Number of rows in longest and shortest inputs:",
+ oxford_paste(c(max(arr_lens), min(arr_lens)))
+ )
+ ))
+ }
+
+ max_array_len <- max(arr_lens)
+ arrays[is_scalar] <- lapply(arrays[is_scalar], repeat_value_as_array, max_array_len)
+ }
+ arrays
+}
+
+#' Take an object of length 1 and repeat it.
+#'
+#' @param object Object of length 1 to be repeated - vector, `Scalar`, `Array`, or `ChunkedArray`
+#' @param n Number of repetitions
+#'
+#' @return `Array` of length `n`
+#'
+#' @keywords internal
+repeat_value_as_array <- function(object, n) {
+ if (inherits(object, "ChunkedArray")) {
+ return(Scalar$create(object$chunks[[1]])$as_array(n))
+ }
+ return(Scalar$create(object)$as_array(n))
+}