author     Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-21 11:54:28 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-21 11:54:28 +0000
commit     e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree       64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/r/R
parent     Initial commit. (diff)
download   ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
           ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2. (tag: upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/r/R')
60 files changed, 14318 insertions, 0 deletions
diff --git a/src/arrow/r/R/array-data.R b/src/arrow/r/R/array-data.R
new file mode 100644
index 000000000..99c24fdcf
--- /dev/null
+++ b/src/arrow/r/R/array-data.R
@@ -0,0 +1,53 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @title ArrayData class
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @description The `ArrayData` class allows you to get and inspect the data
+#' inside an `arrow::Array`.
+#'
+#' @section Usage:
+#'
+#' ```
+#' data <- Array$create(x)$data()
+#'
+#' data$type
+#' data$length
+#' data$null_count
+#' data$offset
+#' data$buffers
+#' ```
+#'
+#' @section Methods:
+#'
+#' ...
+#'
+#' @rdname ArrayData
+#' @name ArrayData
+#' @include type.R
+ArrayData <- R6Class("ArrayData",
+  inherit = ArrowObject,
+  active = list(
+    type = function() ArrayData__get_type(self),
+    length = function() ArrayData__get_length(self),
+    null_count = function() ArrayData__get_null_count(self),
+    offset = function() ArrayData__get_offset(self),
+    buffers = function() ArrayData__buffers(self)
+  )
+)
diff --git a/src/arrow/r/R/array.R b/src/arrow/r/R/array.R
new file mode 100644
index 000000000..46acc14ff
--- /dev/null
+++ b/src/arrow/r/R/array.R
@@ -0,0 +1,329 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-datum.R
+
+#' @title Arrow Arrays
+#' @description An `Array` is an immutable data array with some logical type
+#' and some length. Most logical types are contained in the base
+#' `Array` class; there are also subclasses for `DictionaryArray`, `ListArray`,
+#' and `StructArray`.
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Factory:
+#' The `Array$create()` factory method instantiates an `Array` and
+#' takes the following arguments:
+#' * `x`: an R vector, list, or `data.frame`
+#' * `type`: an optional [data type][data-type] for `x`. If omitted, the type
+#'   will be inferred from the data.
+#'
+#' `Array$create()` will return the appropriate subclass of `Array`, such as
+#' `DictionaryArray` when given an R factor.
+#' +#' To compose a `DictionaryArray` directly, call `DictionaryArray$create()`, +#' which takes two arguments: +#' * `x`: an R vector or `Array` of integers for the dictionary indices +#' * `dict`: an R vector or `Array` of dictionary values (like R factor levels +#' but not limited to strings only) +#' @section Usage: +#' +#' ``` +#' a <- Array$create(x) +#' length(a) +#' +#' print(a) +#' a == a +#' ``` +#' +#' @section Methods: +#' +#' - `$IsNull(i)`: Return true if value at index is null. Does not boundscheck +#' - `$IsValid(i)`: Return true if value at index is valid. Does not boundscheck +#' - `$length()`: Size in the number of elements this array contains +#' - `$offset`: A relative position into another array's data, to enable zero-copy slicing +#' - `$null_count`: The number of null entries in the array +#' - `$type`: logical type of data +#' - `$type_id()`: type id +#' - `$Equals(other)` : is this array equal to `other` +#' - `$ApproxEquals(other)` : +#' - `$Diff(other)` : return a string expressing the difference between two arrays +#' - `$data()`: return the underlying [ArrayData][ArrayData] +#' - `$as_vector()`: convert to an R vector +#' - `$ToString()`: string representation of the array +#' - `$Slice(offset, length = NULL)`: Construct a zero-copy slice of the array +#' with the indicated offset and length. If length is `NULL`, the slice goes +#' until the end of the array. +#' - `$Take(i)`: return an `Array` with values at positions given by integers +#' (R vector or Array Array) `i`. +#' - `$Filter(i, keep_na = TRUE)`: return an `Array` with values at positions where logical +#' vector (or Arrow boolean Array) `i` is `TRUE`. +#' - `$SortIndices(descending = FALSE)`: return an `Array` of integer positions that can be +#' used to rearrange the `Array` in ascending or descending order +#' - `$RangeEquals(other, start_idx, end_idx, other_start_idx)` : +#' - `$cast(target_type, safe = TRUE, options = cast_options(safe))`: Alter the +#' data in the array to change its type. +#' - `$View(type)`: Construct a zero-copy view of this array with the given type. +#' - `$Validate()` : Perform any validation checks to determine obvious inconsistencies +#' within the array's internal data. This can be an expensive check, potentially `O(length)` +#' +#' @rdname array +#' @name array +#' @examplesIf arrow_available() +#' my_array <- Array$create(1:10) +#' my_array$type +#' my_array$cast(int8()) +#' +#' # Check if value is null; zero-indexed +#' na_array <- Array$create(c(1:5, NA)) +#' na_array$IsNull(0) +#' na_array$IsNull(5) +#' na_array$IsValid(5) +#' na_array$null_count +#' +#' # zero-copy slicing; the offset of the new Array will be the same as the index passed to $Slice +#' new_array <- na_array$Slice(5) +#' new_array$offset +#' +#' # Compare 2 arrays +#' na_array2 <- na_array +#' na_array2 == na_array # element-wise comparison +#' na_array2$Equals(na_array) # overall comparison +#' @export +Array <- R6Class("Array", + inherit = ArrowDatum, + public = list( + IsNull = function(i) Array__IsNull(self, i), + IsValid = function(i) Array__IsValid(self, i), + length = function() Array__length(self), + type_id = function() Array__type_id(self), + Equals = function(other, ...) 
{ + inherits(other, "Array") && Array__Equals(self, other) + }, + ApproxEquals = function(other) { + inherits(other, "Array") && Array__ApproxEquals(self, other) + }, + Diff = function(other) { + if (!inherits(other, "Array")) { + other <- Array$create(other) + } + Array__Diff(self, other) + }, + data = function() Array__data(self), + as_vector = function() Array__as_vector(self), + ToString = function() { + typ <- paste0("<", self$type$ToString(), ">") + paste(typ, Array__ToString(self), sep = "\n") + }, + Slice = function(offset, length = NULL) { + if (is.null(length)) { + Array__Slice1(self, offset) + } else { + Array__Slice2(self, offset, length) + } + }, + Take = function(i) { + if (is.numeric(i)) { + i <- as.integer(i) + } + if (is.integer(i)) { + i <- Array$create(i) + } + call_function("take", self, i) + }, + Filter = function(i, keep_na = TRUE) { + if (is.logical(i)) { + i <- Array$create(i) + } + assert_is(i, "Array") + call_function("filter", self, i, options = list(keep_na = keep_na)) + }, + SortIndices = function(descending = FALSE) { + assert_that(is.logical(descending)) + assert_that(length(descending) == 1L) + assert_that(!is.na(descending)) + call_function("array_sort_indices", self, options = list(order = descending)) + }, + RangeEquals = function(other, start_idx, end_idx, other_start_idx = 0L) { + assert_is(other, "Array") + Array__RangeEquals(self, other, start_idx, end_idx, other_start_idx) + }, + View = function(type) { + Array$create(Array__View(self, as_type(type))) + }, + Validate = function() Array__Validate(self), + export_to_c = function(array_ptr, schema_ptr) ExportArray(self, array_ptr, schema_ptr) + ), + active = list( + null_count = function() Array__null_count(self), + offset = function() Array__offset(self), + type = function() Array__type(self) + ) +) +Array$create <- function(x, type = NULL) { + if (!is.null(type)) { + type <- as_type(type) + } + if (inherits(x, "Scalar")) { + out <- x$as_array() + if (!is.null(type)) { + out <- out$cast(type) + } + return(out) + } + vec_to_arrow(x, type) +} +#' @include arrowExports.R +Array$import_from_c <- ImportArray + +#' @rdname array +#' @usage NULL +#' @format NULL +#' @export +DictionaryArray <- R6Class("DictionaryArray", + inherit = Array, + public = list( + indices = function() DictionaryArray__indices(self), + dictionary = function() DictionaryArray__dictionary(self) + ), + active = list( + ordered = function() self$type$ordered + ) +) +DictionaryArray$create <- function(x, dict = NULL) { + if (is.factor(x)) { + # The simple case: converting a factor. + # Ignoring `dict`; should probably error if dict is not NULL + return(Array$create(x)) + } + + assert_that(!is.null(dict)) + if (!is.Array(x)) { + x <- Array$create(x) + } + if (!is.Array(dict)) { + dict <- Array$create(dict) + } + type <- DictionaryType$create(x$type, dict$type) + DictionaryArray__FromArrays(type, x, dict) +} + +#' @rdname array +#' @usage NULL +#' @format NULL +#' @export +StructArray <- R6Class("StructArray", + inherit = Array, + public = list( + field = function(i) StructArray__field(self, i), + GetFieldByName = function(name) StructArray__GetFieldByName(self, name), + Flatten = function() StructArray__Flatten(self) + ) +) + + +#' @export +`[[.StructArray` <- function(x, i, ...) { + if (is.character(i)) { + x$GetFieldByName(i) + } else if (is.numeric(i)) { + x$field(i - 1) + } else { + stop("'i' must be character or numeric, not ", class(i), call. = FALSE) + } +} + +#' @export +`$.StructArray` <- function(x, name, ...) 
{ + assert_that(is.string(name)) + if (name %in% ls(x)) { + get(name, x) + } else { + x$GetFieldByName(name) + } +} + +#' @export +names.StructArray <- function(x, ...) StructType__field_names(x$type) + +#' @export +dim.StructArray <- function(x, ...) c(length(x), x$type$num_fields) + +#' @export +as.data.frame.StructArray <- function(x, row.names = NULL, optional = FALSE, ...) { + as.vector(x) +} + +#' @rdname array +#' @usage NULL +#' @format NULL +#' @export +ListArray <- R6Class("ListArray", + inherit = Array, + public = list( + values = function() ListArray__values(self), + value_length = function(i) ListArray__value_length(self, i), + value_offset = function(i) ListArray__value_offset(self, i), + raw_value_offsets = function() ListArray__raw_value_offsets(self) + ), + active = list( + value_type = function() ListArray__value_type(self) + ) +) + +#' @rdname array +#' @usage NULL +#' @format NULL +#' @export +LargeListArray <- R6Class("LargeListArray", + inherit = Array, + public = list( + values = function() LargeListArray__values(self), + value_length = function(i) LargeListArray__value_length(self, i), + value_offset = function(i) LargeListArray__value_offset(self, i), + raw_value_offsets = function() LargeListArray__raw_value_offsets(self) + ), + active = list( + value_type = function() LargeListArray__value_type(self) + ) +) + +#' @rdname array +#' @usage NULL +#' @format NULL +#' @export +FixedSizeListArray <- R6Class("FixedSizeListArray", + inherit = Array, + public = list( + values = function() FixedSizeListArray__values(self), + value_length = function(i) FixedSizeListArray__value_length(self, i), + value_offset = function(i) FixedSizeListArray__value_offset(self, i) + ), + active = list( + value_type = function() FixedSizeListArray__value_type(self), + list_size = function() self$type$list_size + ) +) + +is.Array <- function(x, type = NULL) { # nolint + is_it <- inherits(x, c("Array", "ChunkedArray")) + if (is_it && !is.null(type)) { + is_it <- x$type$ToString() %in% type + } + is_it +} diff --git a/src/arrow/r/R/arrow-datum.R b/src/arrow/r/R/arrow-datum.R new file mode 100644 index 000000000..557321f68 --- /dev/null +++ b/src/arrow/r/R/arrow-datum.R @@ -0,0 +1,266 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include arrow-package.R + +# Base class for Array, ChunkedArray, and Scalar, for S3 method dispatch only. +# Does not exist in C++ class hierarchy +ArrowDatum <- R6Class("ArrowDatum", + inherit = ArrowObject, + public = list( + cast = function(target_type, safe = TRUE, ...) { + opts <- cast_options(safe, ...) 
+ opts$to_type <- as_type(target_type) + call_function("cast", self, options = opts) + } + ) +) + +#' @export +length.ArrowDatum <- function(x) x$length() + +#' @export +is.finite.ArrowDatum <- function(x) { + is_fin <- call_function("is_finite", x) + # for compatibility with base::is.finite(), return FALSE for NA_real_ + is_fin & !is.na(is_fin) +} + +#' @export +is.infinite.ArrowDatum <- function(x) { + is_inf <- call_function("is_inf", x) + # for compatibility with base::is.infinite(), return FALSE for NA_real_ + is_inf & !is.na(is_inf) +} + +#' @export +is.na.ArrowDatum <- function(x) { + call_function("is_null", x, options = list(nan_is_null = TRUE)) +} + +#' @export +is.nan.ArrowDatum <- function(x) { + if (x$type_id() %in% TYPES_WITH_NAN) { + # TODO: if an option is added to the is_nan kernel to treat NA as NaN, + # use that to simplify the code here (ARROW-13366) + call_function("is_nan", x) & call_function("is_valid", x) + } else { + Scalar$create(FALSE)$as_array(length(x)) + } +} + +#' @export +as.vector.ArrowDatum <- function(x, mode) { + x$as_vector() +} + +#' @export +Ops.ArrowDatum <- function(e1, e2) { + if (.Generic == "!") { + eval_array_expression(.Generic, e1) + } else if (.Generic %in% names(.array_function_map)) { + eval_array_expression(.Generic, e1, e2) + } else { + stop(paste0("Unsupported operation on `", class(e1)[1L], "` : "), .Generic, call. = FALSE) + } +} + +# Wrapper around call_function that: +# (1) maps R function names to Arrow C++ compute ("/" --> "divide_checked") +# (2) wraps R input args as Array or Scalar +eval_array_expression <- function(FUN, + ..., + args = list(...), + options = empty_named_list()) { + if (FUN == "-" && length(args) == 1L) { + if (inherits(args[[1]], "ArrowObject")) { + return(eval_array_expression("negate_checked", args[[1]])) + } else { + return(-args[[1]]) + } + } + args <- lapply(args, .wrap_arrow, FUN) + + # In Arrow, "divide" is one function, which does integer division on + # integer inputs and floating-point division on floats + if (FUN == "/") { + # TODO: omg so many ways it's wrong to assume these types + args <- map(args, ~ .$cast(float64())) + } else if (FUN == "%/%") { + # In R, integer division works like floor(float division) + out <- eval_array_expression("/", args = args, options = options) + return(out$cast(int32(), allow_float_truncate = TRUE)) + } else if (FUN == "%%") { + # We can't simply do {e1 - e2 * ( e1 %/% e2 )} since Ops.Array evaluates + # eagerly, but we can build that up + quotient <- eval_array_expression("%/%", args = args) + base <- eval_array_expression("*", quotient, args[[2]]) + # this cast is to ensure that the result of this and e1 are the same + # (autocasting only applies to scalars) + base <- base$cast(args[[1]]$type) + return(eval_array_expression("-", args[[1]], base)) + } + + call_function( + .array_function_map[[FUN]] %||% FUN, + args = args, + options = options + ) +} + +.wrap_arrow <- function(arg, fun) { + if (!inherits(arg, "ArrowObject")) { + # TODO: Array$create if lengths are equal? + if (fun == "%in%") { + arg <- Array$create(arg) + } else { + arg <- Scalar$create(arg) + } + } + arg +} + +#' @export +na.omit.ArrowDatum <- function(object, ...) { + object$Filter(!is.na(object)) +} + +#' @export +na.exclude.ArrowDatum <- na.omit.ArrowDatum + +#' @export +na.fail.ArrowDatum <- function(object, ...) { + if (object$null_count > 0) { + stop("missing values in object", call. = FALSE) + } + object +} + +filter_rows <- function(x, i, keep_na = TRUE, ...) 
{ + # General purpose function for [ row subsetting with R semantics + # Based on the input for `i`, calls x$Filter, x$Slice, or x$Take + nrows <- x$num_rows %||% x$length() # Depends on whether Array or Table-like + if (is.logical(i)) { + if (isTRUE(i)) { + # Shortcut without doing any work + x + } else { + i <- rep_len(i, nrows) # For R recycling behavior; consider vctrs::vec_recycle() + x$Filter(i, keep_na) + } + } else if (is.numeric(i)) { + if (all(i < 0)) { + # in R, negative i means "everything but i" + i <- setdiff(seq_len(nrows), -1 * i) + } + if (is.sliceable(i)) { + x$Slice(i[1] - 1, length(i)) + } else if (all(i > 0)) { + x$Take(i - 1) + } else { + stop("Cannot mix positive and negative indices", call. = FALSE) + } + } else if (is.Array(i, INTEGER_TYPES)) { + # NOTE: this doesn't do the - 1 offset + x$Take(i) + } else if (is.Array(i, "bool")) { + x$Filter(i, keep_na) + } else { + # Unsupported cases + if (is.Array(i)) { + stop("Cannot extract rows with an Array of type ", i$type$ToString(), call. = FALSE) + } + stop("Cannot extract rows with an object of class ", class(i), call. = FALSE) + } +} + +#' @export +`[.ArrowDatum` <- filter_rows + +#' @importFrom utils head +#' @export +head.ArrowDatum <- function(x, n = 6L, ...) { + assert_is(n, c("numeric", "integer")) + assert_that(length(n) == 1) + len <- NROW(x) + if (n < 0) { + # head(x, negative) means all but the last n rows + n <- max(len + n, 0) + } else { + n <- min(len, n) + } + if (n == len) { + return(x) + } + x$Slice(0, n) +} + +#' @importFrom utils tail +#' @export +tail.ArrowDatum <- function(x, n = 6L, ...) { + assert_is(n, c("numeric", "integer")) + assert_that(length(n) == 1) + len <- NROW(x) + if (n < 0) { + # tail(x, negative) means all but the first n rows + n <- min(-n, len) + } else { + n <- max(len - n, 0) + } + if (n == 0) { + return(x) + } + x$Slice(n) +} + +is.sliceable <- function(i) { + # Determine whether `i` can be expressed as a $Slice() command + is.numeric(i) && + length(i) > 0 && + all(i > 0) && + i[1] <= i[length(i)] && + identical(as.integer(i), i[1]:i[length(i)]) +} + +#' @export +as.double.ArrowDatum <- function(x, ...) as.double(as.vector(x), ...) + +#' @export +as.integer.ArrowDatum <- function(x, ...) as.integer(as.vector(x), ...) + +#' @export +as.character.ArrowDatum <- function(x, ...) as.character(as.vector(x), ...) + +#' @export +sort.ArrowDatum <- function(x, decreasing = FALSE, na.last = NA, ...) { + # Arrow always sorts nulls at the end of the array. This corresponds to + # sort(na.last = TRUE). For the other two cases (na.last = NA and + # na.last = FALSE) we need to use workarounds. + # TODO: Implement this more cleanly after ARROW-12063 + if (is.na(na.last)) { + # Filter out NAs before sorting + x <- x$Filter(!is.na(x)) + x$Take(x$SortIndices(descending = decreasing)) + } else if (na.last) { + x$Take(x$SortIndices(descending = decreasing)) + } else { + # Create a new array that encodes missing values as 1 and non-missing values + # as 0. Sort descending by that array first to get the NAs at the beginning + tbl <- Table$create(x = x, `is_na` = as.integer(is.na(x))) + tbl$x$Take(tbl$SortIndices(names = c("is_na", "x"), descending = c(TRUE, decreasing))) + } +} diff --git a/src/arrow/r/R/arrow-package.R b/src/arrow/r/R/arrow-package.R new file mode 100644 index 000000000..edc2652b6 --- /dev/null +++ b/src/arrow/r/R/arrow-package.R @@ -0,0 +1,351 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @importFrom stats quantile median na.omit na.exclude na.pass na.fail +#' @importFrom R6 R6Class +#' @importFrom purrr as_mapper map map2 map_chr map2_chr map_dfr map_int map_lgl keep imap imap_chr +#' @importFrom assertthat assert_that is.string +#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos +#' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec +#' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive +#' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure +#' @importFrom tidyselect vars_pull vars_rename vars_select eval_select +#' @useDynLib arrow, .registration = TRUE +#' @keywords internal +"_PACKAGE" + +#' @importFrom vctrs s3_register vec_size vec_cast vec_unique +.onLoad <- function(...) { + dplyr_methods <- paste0( + "dplyr::", + c( + "select", "filter", "collect", "summarise", "group_by", "groups", + "group_vars", "group_by_drop_default", "ungroup", "mutate", "transmute", + "arrange", "rename", "pull", "relocate", "compute", "collapse", + "distinct", "left_join", "right_join", "inner_join", "full_join", + "semi_join", "anti_join", "count", "tally" + ) + ) + for (cl in c("Dataset", "ArrowTabular", "arrow_dplyr_query")) { + for (m in dplyr_methods) { + s3_register(m, cl) + } + } + s3_register("dplyr::tbl_vars", "arrow_dplyr_query") + + for (cl in c( + "Array", "RecordBatch", "ChunkedArray", "Table", "Schema", + "Field", "DataType", "RecordBatchReader" + )) { + s3_register("reticulate::py_to_r", paste0("pyarrow.lib.", cl)) + s3_register("reticulate::r_to_py", cl) + } + + # Create these once, at package build time + if (arrow_available()) { + # Also include all available Arrow Compute functions, + # namespaced as arrow_fun. + # We can't do this at install time because list_compute_functions() may error + all_arrow_funs <- list_compute_functions() + arrow_funcs <- set_names( + lapply(all_arrow_funs, function(fun) { + force(fun) + function(...) build_expr(fun, ...) + }), + paste0("arrow_", all_arrow_funs) + ) + .cache$functions <- c(nse_funcs, arrow_funcs) + } + + if (tolower(Sys.info()[["sysname"]]) == "windows") { + # Disable multithreading on Windows + # See https://issues.apache.org/jira/browse/ARROW-8379 + options(arrow.use_threads = FALSE) + } + + invisible() +} + +.onAttach <- function(libname, pkgname) { + if (!arrow_available()) { + msg <- paste( + "The Arrow C++ library is not available. 
To retry installation with debug output, run:", + " install_arrow(verbose = TRUE)", + "See https://arrow.apache.org/docs/r/articles/install.html for more guidance and troubleshooting.", + sep = "\n" + ) + packageStartupMessage(msg) + } else { + # Just to be extra safe, let's wrap this in a try(); + # we don't a failed startup message to prevent the package from loading + try({ + features <- arrow_info()$capabilities + # That has all of the #ifdef features, plus the compression libs and the + # string libraries (but not the memory allocators, they're added elsewhere) + # + # Let's print a message if some are off + if (some_features_are_off(features)) { + packageStartupMessage("See arrow_info() for available features") + } + }) + } +} + +#' Is the C++ Arrow library available? +#' +#' You won't generally need to call these function, but they're made available +#' for diagnostic purposes. +#' @return `TRUE` or `FALSE` depending on whether the package was installed +#' with: +#' * The Arrow C++ library (check with `arrow_available()`) +#' * Arrow Dataset support enabled (check with `arrow_with_dataset()`) +#' * Parquet support enabled (check with `arrow_with_parquet()`) +#' * JSON support enabled (check with `arrow_with_json()`) +#' * Amazon S3 support enabled (check with `arrow_with_s3()`) +#' @export +#' @examples +#' arrow_available() +#' arrow_with_dataset() +#' arrow_with_parquet() +#' arrow_with_json() +#' arrow_with_s3() +#' @seealso If any of these are `FALSE`, see +#' `vignette("install", package = "arrow")` for guidance on reinstalling the +#' package. +arrow_available <- function() { + tryCatch(.Call(`_arrow_available`), error = function(e) { + return(FALSE) + }) +} + +#' @rdname arrow_available +#' @export +arrow_with_dataset <- function() { + is_32bit <- .Machine$sizeof.pointer < 8 + is_old_r <- getRversion() < "4.0.0" + is_windows <- tolower(Sys.info()[["sysname"]]) == "windows" + if (is_32bit && is_old_r && is_windows) { + # 32-bit rtools 3.5 does not properly implement the std::thread expectations + # but we can't just disable ARROW_DATASET in that build, + # so report it as "off" here. + return(FALSE) + } + tryCatch(.Call(`_dataset_available`), error = function(e) { + return(FALSE) + }) +} + +#' @rdname arrow_available +#' @export +arrow_with_parquet <- function() { + tryCatch(.Call(`_parquet_available`), error = function(e) { + return(FALSE) + }) +} + +#' @rdname arrow_available +#' @export +arrow_with_s3 <- function() { + tryCatch(.Call(`_s3_available`), error = function(e) { + return(FALSE) + }) +} + +#' @rdname arrow_available +#' @export +arrow_with_json <- function() { + tryCatch(.Call(`_json_available`), error = function(e) { + return(FALSE) + }) +} + +option_use_threads <- function() { + !is_false(getOption("arrow.use_threads")) +} + +#' Report information on the package's capabilities +#' +#' This function summarizes a number of build-time configurations and run-time +#' settings for the Arrow package. It may be useful for diagnostics. +#' @return A list including version information, boolean "capabilities", and +#' statistics from Arrow's memory allocator, and also Arrow's run-time +#' information. 
+#' @export +#' @importFrom utils packageVersion +arrow_info <- function() { + opts <- options() + out <- list( + version = packageVersion("arrow"), + libarrow = arrow_available(), + options = opts[grep("^arrow\\.", names(opts))] + ) + if (out$libarrow) { + pool <- default_memory_pool() + runtimeinfo <- runtime_info() + buildinfo <- build_info() + compute_funcs <- list_compute_functions() + out <- c(out, list( + capabilities = c( + dataset = arrow_with_dataset(), + parquet = arrow_with_parquet(), + json = arrow_with_json(), + s3 = arrow_with_s3(), + utf8proc = "utf8_upper" %in% compute_funcs, + re2 = "replace_substring_regex" %in% compute_funcs, + vapply(tolower(names(CompressionType)[-1]), codec_is_available, logical(1)) + ), + memory_pool = list( + backend_name = pool$backend_name, + bytes_allocated = pool$bytes_allocated, + max_memory = pool$max_memory, + available_backends = supported_memory_backends() + ), + runtime_info = list( + simd_level = runtimeinfo[1], + detected_simd_level = runtimeinfo[2] + ), + build_info = list( + cpp_version = buildinfo[1], + cpp_compiler = buildinfo[2], + cpp_compiler_version = buildinfo[3], + cpp_compiler_flags = buildinfo[4], + # git_id is "" if not built from a git checkout + # convert that to NULL + git_id = if (nzchar(buildinfo[5])) buildinfo[5] + ) + )) + } + structure(out, class = "arrow_info") +} + +some_features_are_off <- function(features) { + # `features` is a named logical vector (as in arrow_info()$capabilities) + # Let's exclude some less relevant ones + blocklist <- c("lzo", "bz2", "brotli") + # Return TRUE if any of the other features are FALSE + !all(features[setdiff(names(features), blocklist)]) +} + +#' @export +print.arrow_info <- function(x, ...) { + print_key_values <- function(title, vals, ...) { + # Make a key-value table for printing, no column names + df <- data.frame(vals, stringsAsFactors = FALSE, ...) + names(df) <- "" + + cat(title, ":\n", sep = "") + print(df) + cat("\n") + } + cat("Arrow package version: ", format(x$version), "\n\n", sep = "") + if (x$libarrow) { + print_key_values("Capabilities", c( + x$capabilities, + jemalloc = "jemalloc" %in% x$memory_pool$available_backends, + mimalloc = "mimalloc" %in% x$memory_pool$available_backends + )) + if (some_features_are_off(x$capabilities) && identical(tolower(Sys.info()[["sysname"]]), "linux")) { + # Only on linux because (e.g.) we disable certain features on purpose on rtools35 and solaris + cat( + "To reinstall with more optional capabilities enabled, see\n", + " https://arrow.apache.org/docs/r/articles/install.html\n\n" + ) + } + + if (length(x$options)) { + print_key_values("Arrow options()", map_chr(x$options, format)) + } + + format_bytes <- function(b, units = "auto", digits = 2L, ...) { + format(structure(b, class = "object_size"), units = units, digits = digits, ...) + } + print_key_values("Memory", c( + Allocator = x$memory_pool$backend_name, + # utils:::format.object_size is not properly vectorized + Current = format_bytes(x$memory_pool$bytes_allocated, ...), + Max = format_bytes(x$memory_pool$max_memory, ...) + )) + print_key_values("Runtime", c( + `SIMD Level` = x$runtime_info$simd_level, + `Detected SIMD Level` = x$runtime_info$detected_simd_level + )) + print_key_values("Build", c( + `C++ Library Version` = x$build_info$cpp_version, + `C++ Compiler` = x$build_info$cpp_compiler, + `C++ Compiler Version` = x$build_info$cpp_compiler_version, + `Git ID` = x$build_info$git_id + )) + } else { + cat( + "Arrow C++ library not available. 
See https://arrow.apache.org/docs/r/articles/install.html ", + "for troubleshooting.\n" + ) + } + invisible(x) +} + +option_compress_metadata <- function() { + !is_false(getOption("arrow.compress_metadata")) +} + +#' @include enums.R +ArrowObject <- R6Class("ArrowObject", + public = list( + initialize = function(xp) self$set_pointer(xp), + pointer = function() get(".:xp:.", envir = self), + `.:xp:.` = NULL, + set_pointer = function(xp) { + if (!inherits(xp, "externalptr")) { + stop( + class(self)[1], "$new() requires a pointer as input: ", + "did you mean $create() instead?", + call. = FALSE + ) + } + assign(".:xp:.", xp, envir = self) + }, + print = function(...) { + if (!is.null(self$.class_title)) { + # Allow subclasses to override just printing the class name first + class_title <- self$.class_title() + } else { + class_title <- class(self)[[1]] + } + cat(class_title, "\n", sep = "") + if (!is.null(self$ToString)) { + cat(self$ToString(), "\n", sep = "") + } + invisible(self) + }, + invalidate = function() { + assign(".:xp:.", NULL, envir = self) + } + ) +) + +#' @export +`!=.ArrowObject` <- function(lhs, rhs) !(lhs == rhs) # nolint + +#' @export +`==.ArrowObject` <- function(x, y) { # nolint + x$Equals(y) +} + +#' @export +all.equal.ArrowObject <- function(target, current, ..., check.attributes = TRUE) { + target$Equals(current, check_metadata = check.attributes) +} diff --git a/src/arrow/r/R/arrow-tabular.R b/src/arrow/r/R/arrow-tabular.R new file mode 100644 index 000000000..43110ccf2 --- /dev/null +++ b/src/arrow/r/R/arrow-tabular.R @@ -0,0 +1,272 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include arrow-datum.R + +# Base class for RecordBatch and Table for S3 method dispatch only. 
+# Does not exist in C++ class hierarchy +ArrowTabular <- R6Class("ArrowTabular", + inherit = ArrowObject, + public = list( + ToString = function() { + sch <- unlist(strsplit(self$schema$ToString(), "\n")) + sch <- sub("(.*): (.*)", "$\\1 <\\2>", sch) + dims <- sprintf("%s rows x %s columns", self$num_rows, self$num_columns) + paste(c(dims, sch), collapse = "\n") + }, + Take = function(i) { + if (is.numeric(i)) { + i <- as.integer(i) + } + if (is.integer(i)) { + i <- Array$create(i) + } + assert_that(is.Array(i)) + call_function("take", self, i) + }, + Filter = function(i, keep_na = TRUE) { + if (is.logical(i)) { + i <- Array$create(i) + } + assert_that(is.Array(i, "bool")) + call_function("filter", self, i, options = list(keep_na = keep_na)) + }, + SortIndices = function(names, descending = FALSE) { + assert_that(is.character(names)) + assert_that(length(names) > 0) + assert_that(!any(is.na(names))) + if (length(descending) == 1L) { + descending <- rep_len(descending, length(names)) + } + assert_that(is.logical(descending)) + assert_that(identical(length(names), length(descending))) + assert_that(!any(is.na(descending))) + call_function( + "sort_indices", + self, + # cpp11 does not support logical vectors so convert to integer + options = list(names = names, orders = as.integer(descending)) + ) + } + ), + active = list( + metadata = function(new) { + if (missing(new)) { + # Get the metadata (from the schema) + self$schema$metadata + } else { + # Set the metadata + new <- prepare_key_value_metadata(new) + out <- self$ReplaceSchemaMetadata(new) + # ReplaceSchemaMetadata returns a new object but we're modifying in place, + # so swap in that new C++ object pointer into our R6 object + self$set_pointer(out$pointer()) + self + } + }, + r_metadata = function(new) { + # Helper for the R metadata that handles the serialization + # See also method on Schema + if (missing(new)) { + out <- self$metadata$r + if (!is.null(out)) { + # Can't unserialize NULL + out <- .unserialize_arrow_r_metadata(out) + } + # Returns either NULL or a named list + out + } else { + # Set the R metadata + self$metadata$r <- .serialize_arrow_r_metadata(new) + self + } + } + ) +) + +#' @export +as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) { + df <- x$to_data_frame() + + if (!is.null(r_metadata <- x$metadata$r)) { + df <- apply_arrow_r_metadata(df, .unserialize_arrow_r_metadata(r_metadata)) + } + df +} + +#' @export +`names<-.ArrowTabular` <- function(x, value) x$RenameColumns(value) + +#' @importFrom methods as +#' @export +`[.ArrowTabular` <- function(x, i, j, ..., drop = FALSE) { + if (nargs() == 2L) { + # List-like column extraction (x[i]) + return(x[, i]) + } + if (!missing(j)) { + # Selecting columns is cheaper than filtering rows, so do it first. + # That way, if we're filtering too, we have fewer arrays to filter/slice/take + if (is.character(j)) { + j_new <- match(j, names(x)) + if (any(is.na(j_new))) { + stop("Column not found: ", oxford_paste(j[is.na(j_new)]), call. = FALSE) + } + j <- j_new + } + if (is_integerish(j)) { + if (any(is.na(j))) { + stop("Column indices cannot be NA", call. = FALSE) + } + if (length(j) && all(j < 0)) { + # in R, negative j means "everything but j" + j <- setdiff(seq_len(x$num_columns), -1 * j) + } + x <- x$SelectColumns(as.integer(j) - 1L) + } + + if (drop && ncol(x) == 1L) { + x <- x$column(0) + } + } + if (!missing(i)) { + x <- filter_rows(x, i, ...) + } + x +} + +#' @export +`[[.ArrowTabular` <- function(x, i, ...) 
{ + if (is.character(i)) { + x$GetColumnByName(i) + } else if (is.numeric(i)) { + x$column(i - 1) + } else { + stop("'i' must be character or numeric, not ", class(i), call. = FALSE) + } +} + +#' @export +`$.ArrowTabular` <- function(x, name, ...) { + assert_that(is.string(name)) + if (name %in% ls(x)) { + get(name, x) + } else { + x$GetColumnByName(name) + } +} + +#' @export +`[[<-.ArrowTabular` <- function(x, i, value) { + if (!is.character(i) & !is.numeric(i)) { + stop("'i' must be character or numeric, not ", class(i), call. = FALSE) + } + assert_that(length(i) == 1, !is.na(i)) + + if (is.null(value)) { + if (is.character(i)) { + i <- match(i, names(x)) + } + x <- x$RemoveColumn(i - 1L) + } else { + if (!is.character(i)) { + # get or create a/the column name + if (i <= x$num_columns) { + i <- names(x)[i] + } else { + i <- as.character(i) + } + } + + # auto-magic recycling on non-ArrowObjects + if (!inherits(value, "ArrowObject")) { + value <- vctrs::vec_recycle(value, x$num_rows) + } + + # construct the field + if (inherits(x, "RecordBatch") && !inherits(value, "Array")) { + value <- Array$create(value) + } else if (inherits(x, "Table") && !inherits(value, "ChunkedArray")) { + value <- ChunkedArray$create(value) + } + new_field <- field(i, value$type) + + if (i %in% names(x)) { + i <- match(i, names(x)) - 1L + x <- x$SetColumn(i, new_field, value) + } else { + i <- x$num_columns + x <- x$AddColumn(i, new_field, value) + } + } + x +} + +#' @export +`$<-.ArrowTabular` <- function(x, i, value) { + assert_that(is.string(i)) + # We need to check if `i` is in names in case it is an active binding (e.g. + # `metadata`, in which case we use assign to change the active binding instead + # of the column in the table) + if (i %in% ls(x)) { + assign(i, value, x) + } else { + x[[i]] <- value + } + x +} + +#' @export +dim.ArrowTabular <- function(x) c(x$num_rows, x$num_columns) + +#' @export +length.ArrowTabular <- function(x) x$num_columns + +#' @export +as.list.ArrowTabular <- function(x, ...) as.list(as.data.frame(x, ...)) + +#' @export +row.names.ArrowTabular <- function(x) as.character(seq_len(nrow(x))) + +#' @export +dimnames.ArrowTabular <- function(x) list(row.names(x), names(x)) + +#' @export +head.ArrowTabular <- head.ArrowDatum + +#' @export +tail.ArrowTabular <- tail.ArrowDatum + +#' @export +na.fail.ArrowTabular <- function(object, ...) { + for (col in seq_len(object$num_columns)) { + if (object$column(col - 1L)$null_count > 0) { + stop("missing values in object", call. = FALSE) + } + } + object +} + +#' @export +na.omit.ArrowTabular <- function(object, ...) 
{ + not_na <- map(object$columns, ~ call_function("is_valid", .x)) + not_na_agg <- Reduce("&", not_na) + object$Filter(not_na_agg) +} + +#' @export +na.exclude.ArrowTabular <- na.omit.ArrowTabular diff --git a/src/arrow/r/R/arrowExports.R b/src/arrow/r/R/arrowExports.R new file mode 100644 index 000000000..014b1641f --- /dev/null +++ b/src/arrow/r/R/arrowExports.R @@ -0,0 +1,1801 @@ +# Generated by using data-raw/codegen.R -> do not edit by hand + +test_SET_STRING_ELT <- function(s) { + invisible(.Call(`_arrow_test_SET_STRING_ELT`, s)) +} + +test_same_Array <- function(x, y) { + .Call(`_arrow_test_same_Array`, x, y) +} + +is_arrow_altrep <- function(x) { + .Call(`_arrow_is_arrow_altrep`, x) +} + +Array__Slice1 <- function(array, offset) { + .Call(`_arrow_Array__Slice1`, array, offset) +} + +Array__Slice2 <- function(array, offset, length) { + .Call(`_arrow_Array__Slice2`, array, offset, length) +} + +Array__IsNull <- function(x, i) { + .Call(`_arrow_Array__IsNull`, x, i) +} + +Array__IsValid <- function(x, i) { + .Call(`_arrow_Array__IsValid`, x, i) +} + +Array__length <- function(x) { + .Call(`_arrow_Array__length`, x) +} + +Array__offset <- function(x) { + .Call(`_arrow_Array__offset`, x) +} + +Array__null_count <- function(x) { + .Call(`_arrow_Array__null_count`, x) +} + +Array__type <- function(x) { + .Call(`_arrow_Array__type`, x) +} + +Array__ToString <- function(x) { + .Call(`_arrow_Array__ToString`, x) +} + +Array__type_id <- function(x) { + .Call(`_arrow_Array__type_id`, x) +} + +Array__Equals <- function(lhs, rhs) { + .Call(`_arrow_Array__Equals`, lhs, rhs) +} + +Array__ApproxEquals <- function(lhs, rhs) { + .Call(`_arrow_Array__ApproxEquals`, lhs, rhs) +} + +Array__Diff <- function(lhs, rhs) { + .Call(`_arrow_Array__Diff`, lhs, rhs) +} + +Array__data <- function(array) { + .Call(`_arrow_Array__data`, array) +} + +Array__RangeEquals <- function(self, other, start_idx, end_idx, other_start_idx) { + .Call(`_arrow_Array__RangeEquals`, self, other, start_idx, end_idx, other_start_idx) +} + +Array__View <- function(array, type) { + .Call(`_arrow_Array__View`, array, type) +} + +Array__Validate <- function(array) { + invisible(.Call(`_arrow_Array__Validate`, array)) +} + +DictionaryArray__indices <- function(array) { + .Call(`_arrow_DictionaryArray__indices`, array) +} + +DictionaryArray__dictionary <- function(array) { + .Call(`_arrow_DictionaryArray__dictionary`, array) +} + +StructArray__field <- function(array, i) { + .Call(`_arrow_StructArray__field`, array, i) +} + +StructArray__GetFieldByName <- function(array, name) { + .Call(`_arrow_StructArray__GetFieldByName`, array, name) +} + +StructArray__Flatten <- function(array) { + .Call(`_arrow_StructArray__Flatten`, array) +} + +ListArray__value_type <- function(array) { + .Call(`_arrow_ListArray__value_type`, array) +} + +LargeListArray__value_type <- function(array) { + .Call(`_arrow_LargeListArray__value_type`, array) +} + +ListArray__values <- function(array) { + .Call(`_arrow_ListArray__values`, array) +} + +LargeListArray__values <- function(array) { + .Call(`_arrow_LargeListArray__values`, array) +} + +ListArray__value_length <- function(array, i) { + .Call(`_arrow_ListArray__value_length`, array, i) +} + +LargeListArray__value_length <- function(array, i) { + .Call(`_arrow_LargeListArray__value_length`, array, i) +} + +FixedSizeListArray__value_length <- function(array, i) { + .Call(`_arrow_FixedSizeListArray__value_length`, array, i) +} + +ListArray__value_offset <- function(array, i) { + 
.Call(`_arrow_ListArray__value_offset`, array, i) +} + +LargeListArray__value_offset <- function(array, i) { + .Call(`_arrow_LargeListArray__value_offset`, array, i) +} + +FixedSizeListArray__value_offset <- function(array, i) { + .Call(`_arrow_FixedSizeListArray__value_offset`, array, i) +} + +ListArray__raw_value_offsets <- function(array) { + .Call(`_arrow_ListArray__raw_value_offsets`, array) +} + +LargeListArray__raw_value_offsets <- function(array) { + .Call(`_arrow_LargeListArray__raw_value_offsets`, array) +} + +Array__as_vector <- function(array) { + .Call(`_arrow_Array__as_vector`, array) +} + +ChunkedArray__as_vector <- function(chunked_array, use_threads) { + .Call(`_arrow_ChunkedArray__as_vector`, chunked_array, use_threads) +} + +RecordBatch__to_dataframe <- function(batch, use_threads) { + .Call(`_arrow_RecordBatch__to_dataframe`, batch, use_threads) +} + +Table__to_dataframe <- function(table, use_threads) { + .Call(`_arrow_Table__to_dataframe`, table, use_threads) +} + +ArrayData__get_type <- function(x) { + .Call(`_arrow_ArrayData__get_type`, x) +} + +ArrayData__get_length <- function(x) { + .Call(`_arrow_ArrayData__get_length`, x) +} + +ArrayData__get_null_count <- function(x) { + .Call(`_arrow_ArrayData__get_null_count`, x) +} + +ArrayData__get_offset <- function(x) { + .Call(`_arrow_ArrayData__get_offset`, x) +} + +ArrayData__buffers <- function(x) { + .Call(`_arrow_ArrayData__buffers`, x) +} + +Buffer__is_mutable <- function(buffer) { + .Call(`_arrow_Buffer__is_mutable`, buffer) +} + +Buffer__ZeroPadding <- function(buffer) { + invisible(.Call(`_arrow_Buffer__ZeroPadding`, buffer)) +} + +Buffer__capacity <- function(buffer) { + .Call(`_arrow_Buffer__capacity`, buffer) +} + +Buffer__size <- function(buffer) { + .Call(`_arrow_Buffer__size`, buffer) +} + +r___RBuffer__initialize <- function(x) { + .Call(`_arrow_r___RBuffer__initialize`, x) +} + +Buffer__data <- function(buffer) { + .Call(`_arrow_Buffer__data`, buffer) +} + +Buffer__Equals <- function(x, y) { + .Call(`_arrow_Buffer__Equals`, x, y) +} + +ChunkedArray__length <- function(chunked_array) { + .Call(`_arrow_ChunkedArray__length`, chunked_array) +} + +ChunkedArray__null_count <- function(chunked_array) { + .Call(`_arrow_ChunkedArray__null_count`, chunked_array) +} + +ChunkedArray__num_chunks <- function(chunked_array) { + .Call(`_arrow_ChunkedArray__num_chunks`, chunked_array) +} + +ChunkedArray__chunk <- function(chunked_array, i) { + .Call(`_arrow_ChunkedArray__chunk`, chunked_array, i) +} + +ChunkedArray__chunks <- function(chunked_array) { + .Call(`_arrow_ChunkedArray__chunks`, chunked_array) +} + +ChunkedArray__type <- function(chunked_array) { + .Call(`_arrow_ChunkedArray__type`, chunked_array) +} + +ChunkedArray__Slice1 <- function(chunked_array, offset) { + .Call(`_arrow_ChunkedArray__Slice1`, chunked_array, offset) +} + +ChunkedArray__Slice2 <- function(chunked_array, offset, length) { + .Call(`_arrow_ChunkedArray__Slice2`, chunked_array, offset, length) +} + +ChunkedArray__View <- function(array, type) { + .Call(`_arrow_ChunkedArray__View`, array, type) +} + +ChunkedArray__Validate <- function(chunked_array) { + invisible(.Call(`_arrow_ChunkedArray__Validate`, chunked_array)) +} + +ChunkedArray__Equals <- function(x, y) { + .Call(`_arrow_ChunkedArray__Equals`, x, y) +} + +ChunkedArray__ToString <- function(x) { + .Call(`_arrow_ChunkedArray__ToString`, x) +} + +ChunkedArray__from_list <- function(chunks, s_type) { + .Call(`_arrow_ChunkedArray__from_list`, chunks, s_type) +} + +util___Codec__Create <- 
function(codec, compression_level) { + .Call(`_arrow_util___Codec__Create`, codec, compression_level) +} + +util___Codec__name <- function(codec) { + .Call(`_arrow_util___Codec__name`, codec) +} + +util___Codec__IsAvailable <- function(codec) { + .Call(`_arrow_util___Codec__IsAvailable`, codec) +} + +io___CompressedOutputStream__Make <- function(codec, raw) { + .Call(`_arrow_io___CompressedOutputStream__Make`, codec, raw) +} + +io___CompressedInputStream__Make <- function(codec, raw) { + .Call(`_arrow_io___CompressedInputStream__Make`, codec, raw) +} + +ExecPlan_create <- function(use_threads) { + .Call(`_arrow_ExecPlan_create`, use_threads) +} + +ExecPlan_run <- function(plan, final_node, sort_options, head) { + .Call(`_arrow_ExecPlan_run`, plan, final_node, sort_options, head) +} + +ExecPlan_StopProducing <- function(plan) { + invisible(.Call(`_arrow_ExecPlan_StopProducing`, plan)) +} + +ExecNode_output_schema <- function(node) { + .Call(`_arrow_ExecNode_output_schema`, node) +} + +ExecNode_Scan <- function(plan, dataset, filter, materialized_field_names) { + .Call(`_arrow_ExecNode_Scan`, plan, dataset, filter, materialized_field_names) +} + +ExecNode_Filter <- function(input, filter) { + .Call(`_arrow_ExecNode_Filter`, input, filter) +} + +ExecNode_Project <- function(input, exprs, names) { + .Call(`_arrow_ExecNode_Project`, input, exprs, names) +} + +ExecNode_Aggregate <- function(input, options, target_names, out_field_names, key_names) { + .Call(`_arrow_ExecNode_Aggregate`, input, options, target_names, out_field_names, key_names) +} + +ExecNode_Join <- function(input, type, right_data, left_keys, right_keys, left_output, right_output) { + .Call(`_arrow_ExecNode_Join`, input, type, right_data, left_keys, right_keys, left_output, right_output) +} + +ExecNode_ReadFromRecordBatchReader <- function(plan, reader) { + .Call(`_arrow_ExecNode_ReadFromRecordBatchReader`, plan, reader) +} + +RecordBatch__cast <- function(batch, schema, options) { + .Call(`_arrow_RecordBatch__cast`, batch, schema, options) +} + +Table__cast <- function(table, schema, options) { + .Call(`_arrow_Table__cast`, table, schema, options) +} + +compute__CallFunction <- function(func_name, args, options) { + .Call(`_arrow_compute__CallFunction`, func_name, args, options) +} + +compute__GetFunctionNames <- function() { + .Call(`_arrow_compute__GetFunctionNames`) +} + +build_info <- function() { + .Call(`_arrow_build_info`) +} + +runtime_info <- function() { + .Call(`_arrow_runtime_info`) +} + +csv___WriteOptions__initialize <- function(options) { + .Call(`_arrow_csv___WriteOptions__initialize`, options) +} + +csv___ReadOptions__initialize <- function(options) { + .Call(`_arrow_csv___ReadOptions__initialize`, options) +} + +csv___ParseOptions__initialize <- function(options) { + .Call(`_arrow_csv___ParseOptions__initialize`, options) +} + +csv___ReadOptions__column_names <- function(options) { + .Call(`_arrow_csv___ReadOptions__column_names`, options) +} + +csv___ConvertOptions__initialize <- function(options) { + .Call(`_arrow_csv___ConvertOptions__initialize`, options) +} + +csv___TableReader__Make <- function(input, read_options, parse_options, convert_options) { + .Call(`_arrow_csv___TableReader__Make`, input, read_options, parse_options, convert_options) +} + +csv___TableReader__Read <- function(table_reader) { + .Call(`_arrow_csv___TableReader__Read`, table_reader) +} + +TimestampParser__kind <- function(parser) { + .Call(`_arrow_TimestampParser__kind`, parser) +} + +TimestampParser__format <- function(parser) { + 
.Call(`_arrow_TimestampParser__format`, parser) +} + +TimestampParser__MakeStrptime <- function(format) { + .Call(`_arrow_TimestampParser__MakeStrptime`, format) +} + +TimestampParser__MakeISO8601 <- function() { + .Call(`_arrow_TimestampParser__MakeISO8601`) +} + +csv___WriteCSV__Table <- function(table, write_options, stream) { + invisible(.Call(`_arrow_csv___WriteCSV__Table`, table, write_options, stream)) +} + +csv___WriteCSV__RecordBatch <- function(record_batch, write_options, stream) { + invisible(.Call(`_arrow_csv___WriteCSV__RecordBatch`, record_batch, write_options, stream)) +} + +dataset___Dataset__NewScan <- function(ds) { + .Call(`_arrow_dataset___Dataset__NewScan`, ds) +} + +dataset___Dataset__schema <- function(dataset) { + .Call(`_arrow_dataset___Dataset__schema`, dataset) +} + +dataset___Dataset__type_name <- function(dataset) { + .Call(`_arrow_dataset___Dataset__type_name`, dataset) +} + +dataset___Dataset__ReplaceSchema <- function(dataset, schm) { + .Call(`_arrow_dataset___Dataset__ReplaceSchema`, dataset, schm) +} + +dataset___UnionDataset__create <- function(datasets, schm) { + .Call(`_arrow_dataset___UnionDataset__create`, datasets, schm) +} + +dataset___InMemoryDataset__create <- function(table) { + .Call(`_arrow_dataset___InMemoryDataset__create`, table) +} + +dataset___UnionDataset__children <- function(ds) { + .Call(`_arrow_dataset___UnionDataset__children`, ds) +} + +dataset___FileSystemDataset__format <- function(dataset) { + .Call(`_arrow_dataset___FileSystemDataset__format`, dataset) +} + +dataset___FileSystemDataset__filesystem <- function(dataset) { + .Call(`_arrow_dataset___FileSystemDataset__filesystem`, dataset) +} + +dataset___FileSystemDataset__files <- function(dataset) { + .Call(`_arrow_dataset___FileSystemDataset__files`, dataset) +} + +dataset___DatasetFactory__Finish1 <- function(factory, unify_schemas) { + .Call(`_arrow_dataset___DatasetFactory__Finish1`, factory, unify_schemas) +} + +dataset___DatasetFactory__Finish2 <- function(factory, schema) { + .Call(`_arrow_dataset___DatasetFactory__Finish2`, factory, schema) +} + +dataset___DatasetFactory__Inspect <- function(factory, unify_schemas) { + .Call(`_arrow_dataset___DatasetFactory__Inspect`, factory, unify_schemas) +} + +dataset___UnionDatasetFactory__Make <- function(children) { + .Call(`_arrow_dataset___UnionDatasetFactory__Make`, children) +} + +dataset___FileSystemDatasetFactory__Make0 <- function(fs, paths, format) { + .Call(`_arrow_dataset___FileSystemDatasetFactory__Make0`, fs, paths, format) +} + +dataset___FileSystemDatasetFactory__Make2 <- function(fs, selector, format, partitioning) { + .Call(`_arrow_dataset___FileSystemDatasetFactory__Make2`, fs, selector, format, partitioning) +} + +dataset___FileSystemDatasetFactory__Make1 <- function(fs, selector, format) { + .Call(`_arrow_dataset___FileSystemDatasetFactory__Make1`, fs, selector, format) +} + +dataset___FileSystemDatasetFactory__Make3 <- function(fs, selector, format, factory) { + .Call(`_arrow_dataset___FileSystemDatasetFactory__Make3`, fs, selector, format, factory) +} + +dataset___FileFormat__type_name <- function(format) { + .Call(`_arrow_dataset___FileFormat__type_name`, format) +} + +dataset___FileFormat__DefaultWriteOptions <- function(fmt) { + .Call(`_arrow_dataset___FileFormat__DefaultWriteOptions`, fmt) +} + +dataset___ParquetFileFormat__Make <- function(options, dict_columns) { + .Call(`_arrow_dataset___ParquetFileFormat__Make`, options, dict_columns) +} + +dataset___FileWriteOptions__type_name <- function(options) { + 
.Call(`_arrow_dataset___FileWriteOptions__type_name`, options) +} + +dataset___ParquetFileWriteOptions__update <- function(options, writer_props, arrow_writer_props) { + invisible(.Call(`_arrow_dataset___ParquetFileWriteOptions__update`, options, writer_props, arrow_writer_props)) +} + +dataset___IpcFileWriteOptions__update2 <- function(ipc_options, use_legacy_format, codec, metadata_version) { + invisible(.Call(`_arrow_dataset___IpcFileWriteOptions__update2`, ipc_options, use_legacy_format, codec, metadata_version)) +} + +dataset___IpcFileWriteOptions__update1 <- function(ipc_options, use_legacy_format, metadata_version) { + invisible(.Call(`_arrow_dataset___IpcFileWriteOptions__update1`, ipc_options, use_legacy_format, metadata_version)) +} + +dataset___CsvFileWriteOptions__update <- function(csv_options, write_options) { + invisible(.Call(`_arrow_dataset___CsvFileWriteOptions__update`, csv_options, write_options)) +} + +dataset___IpcFileFormat__Make <- function() { + .Call(`_arrow_dataset___IpcFileFormat__Make`) +} + +dataset___CsvFileFormat__Make <- function(parse_options, convert_options, read_options) { + .Call(`_arrow_dataset___CsvFileFormat__Make`, parse_options, convert_options, read_options) +} + +dataset___FragmentScanOptions__type_name <- function(fragment_scan_options) { + .Call(`_arrow_dataset___FragmentScanOptions__type_name`, fragment_scan_options) +} + +dataset___CsvFragmentScanOptions__Make <- function(convert_options, read_options) { + .Call(`_arrow_dataset___CsvFragmentScanOptions__Make`, convert_options, read_options) +} + +dataset___ParquetFragmentScanOptions__Make <- function(use_buffered_stream, buffer_size, pre_buffer) { + .Call(`_arrow_dataset___ParquetFragmentScanOptions__Make`, use_buffered_stream, buffer_size, pre_buffer) +} + +dataset___DirectoryPartitioning <- function(schm, segment_encoding) { + .Call(`_arrow_dataset___DirectoryPartitioning`, schm, segment_encoding) +} + +dataset___DirectoryPartitioning__MakeFactory <- function(field_names, segment_encoding) { + .Call(`_arrow_dataset___DirectoryPartitioning__MakeFactory`, field_names, segment_encoding) +} + +dataset___HivePartitioning <- function(schm, null_fallback, segment_encoding) { + .Call(`_arrow_dataset___HivePartitioning`, schm, null_fallback, segment_encoding) +} + +dataset___HivePartitioning__MakeFactory <- function(null_fallback, segment_encoding) { + .Call(`_arrow_dataset___HivePartitioning__MakeFactory`, null_fallback, segment_encoding) +} + +dataset___ScannerBuilder__ProjectNames <- function(sb, cols) { + invisible(.Call(`_arrow_dataset___ScannerBuilder__ProjectNames`, sb, cols)) +} + +dataset___ScannerBuilder__ProjectExprs <- function(sb, exprs, names) { + invisible(.Call(`_arrow_dataset___ScannerBuilder__ProjectExprs`, sb, exprs, names)) +} + +dataset___ScannerBuilder__Filter <- function(sb, expr) { + invisible(.Call(`_arrow_dataset___ScannerBuilder__Filter`, sb, expr)) +} + +dataset___ScannerBuilder__UseThreads <- function(sb, threads) { + invisible(.Call(`_arrow_dataset___ScannerBuilder__UseThreads`, sb, threads)) +} + +dataset___ScannerBuilder__UseAsync <- function(sb, use_async) { + invisible(.Call(`_arrow_dataset___ScannerBuilder__UseAsync`, sb, use_async)) +} + +dataset___ScannerBuilder__BatchSize <- function(sb, batch_size) { + invisible(.Call(`_arrow_dataset___ScannerBuilder__BatchSize`, sb, batch_size)) +} + +dataset___ScannerBuilder__FragmentScanOptions <- function(sb, options) { + invisible(.Call(`_arrow_dataset___ScannerBuilder__FragmentScanOptions`, sb, options)) +} + 
+dataset___ScannerBuilder__schema <- function(sb) { + .Call(`_arrow_dataset___ScannerBuilder__schema`, sb) +} + +dataset___ScannerBuilder__Finish <- function(sb) { + .Call(`_arrow_dataset___ScannerBuilder__Finish`, sb) +} + +dataset___ScannerBuilder__FromRecordBatchReader <- function(reader) { + .Call(`_arrow_dataset___ScannerBuilder__FromRecordBatchReader`, reader) +} + +dataset___Scanner__ToTable <- function(scanner) { + .Call(`_arrow_dataset___Scanner__ToTable`, scanner) +} + +dataset___Scanner__ScanBatches <- function(scanner) { + .Call(`_arrow_dataset___Scanner__ScanBatches`, scanner) +} + +dataset___Scanner__ToRecordBatchReader <- function(scanner) { + .Call(`_arrow_dataset___Scanner__ToRecordBatchReader`, scanner) +} + +dataset___Scanner__head <- function(scanner, n) { + .Call(`_arrow_dataset___Scanner__head`, scanner, n) +} + +dataset___Scanner__schema <- function(sc) { + .Call(`_arrow_dataset___Scanner__schema`, sc) +} + +dataset___ScanTask__get_batches <- function(scan_task) { + .Call(`_arrow_dataset___ScanTask__get_batches`, scan_task) +} + +dataset___Dataset__Write <- function(file_write_options, filesystem, base_dir, partitioning, basename_template, scanner, existing_data_behavior) { + invisible(.Call(`_arrow_dataset___Dataset__Write`, file_write_options, filesystem, base_dir, partitioning, basename_template, scanner, existing_data_behavior)) +} + +dataset___Scanner__TakeRows <- function(scanner, indices) { + .Call(`_arrow_dataset___Scanner__TakeRows`, scanner, indices) +} + +dataset___Scanner__CountRows <- function(scanner) { + .Call(`_arrow_dataset___Scanner__CountRows`, scanner) +} + +Int8__initialize <- function() { + .Call(`_arrow_Int8__initialize`) +} + +Int16__initialize <- function() { + .Call(`_arrow_Int16__initialize`) +} + +Int32__initialize <- function() { + .Call(`_arrow_Int32__initialize`) +} + +Int64__initialize <- function() { + .Call(`_arrow_Int64__initialize`) +} + +UInt8__initialize <- function() { + .Call(`_arrow_UInt8__initialize`) +} + +UInt16__initialize <- function() { + .Call(`_arrow_UInt16__initialize`) +} + +UInt32__initialize <- function() { + .Call(`_arrow_UInt32__initialize`) +} + +UInt64__initialize <- function() { + .Call(`_arrow_UInt64__initialize`) +} + +Float16__initialize <- function() { + .Call(`_arrow_Float16__initialize`) +} + +Float32__initialize <- function() { + .Call(`_arrow_Float32__initialize`) +} + +Float64__initialize <- function() { + .Call(`_arrow_Float64__initialize`) +} + +Boolean__initialize <- function() { + .Call(`_arrow_Boolean__initialize`) +} + +Utf8__initialize <- function() { + .Call(`_arrow_Utf8__initialize`) +} + +LargeUtf8__initialize <- function() { + .Call(`_arrow_LargeUtf8__initialize`) +} + +Binary__initialize <- function() { + .Call(`_arrow_Binary__initialize`) +} + +LargeBinary__initialize <- function() { + .Call(`_arrow_LargeBinary__initialize`) +} + +Date32__initialize <- function() { + .Call(`_arrow_Date32__initialize`) +} + +Date64__initialize <- function() { + .Call(`_arrow_Date64__initialize`) +} + +Null__initialize <- function() { + .Call(`_arrow_Null__initialize`) +} + +Decimal128Type__initialize <- function(precision, scale) { + .Call(`_arrow_Decimal128Type__initialize`, precision, scale) +} + +FixedSizeBinary__initialize <- function(byte_width) { + .Call(`_arrow_FixedSizeBinary__initialize`, byte_width) +} + +Timestamp__initialize <- function(unit, timezone) { + .Call(`_arrow_Timestamp__initialize`, unit, timezone) +} + +Time32__initialize <- function(unit) { + .Call(`_arrow_Time32__initialize`, 
unit) +} + +Time64__initialize <- function(unit) { + .Call(`_arrow_Time64__initialize`, unit) +} + +list__ <- function(x) { + .Call(`_arrow_list__`, x) +} + +large_list__ <- function(x) { + .Call(`_arrow_large_list__`, x) +} + +fixed_size_list__ <- function(x, list_size) { + .Call(`_arrow_fixed_size_list__`, x, list_size) +} + +struct__ <- function(fields) { + .Call(`_arrow_struct__`, fields) +} + +DataType__ToString <- function(type) { + .Call(`_arrow_DataType__ToString`, type) +} + +DataType__name <- function(type) { + .Call(`_arrow_DataType__name`, type) +} + +DataType__Equals <- function(lhs, rhs) { + .Call(`_arrow_DataType__Equals`, lhs, rhs) +} + +DataType__num_fields <- function(type) { + .Call(`_arrow_DataType__num_fields`, type) +} + +DataType__fields <- function(type) { + .Call(`_arrow_DataType__fields`, type) +} + +DataType__id <- function(type) { + .Call(`_arrow_DataType__id`, type) +} + +ListType__ToString <- function(type) { + .Call(`_arrow_ListType__ToString`, type) +} + +FixedWidthType__bit_width <- function(type) { + .Call(`_arrow_FixedWidthType__bit_width`, type) +} + +DateType__unit <- function(type) { + .Call(`_arrow_DateType__unit`, type) +} + +TimeType__unit <- function(type) { + .Call(`_arrow_TimeType__unit`, type) +} + +DecimalType__precision <- function(type) { + .Call(`_arrow_DecimalType__precision`, type) +} + +DecimalType__scale <- function(type) { + .Call(`_arrow_DecimalType__scale`, type) +} + +TimestampType__timezone <- function(type) { + .Call(`_arrow_TimestampType__timezone`, type) +} + +TimestampType__unit <- function(type) { + .Call(`_arrow_TimestampType__unit`, type) +} + +DictionaryType__initialize <- function(index_type, value_type, ordered) { + .Call(`_arrow_DictionaryType__initialize`, index_type, value_type, ordered) +} + +DictionaryType__index_type <- function(type) { + .Call(`_arrow_DictionaryType__index_type`, type) +} + +DictionaryType__value_type <- function(type) { + .Call(`_arrow_DictionaryType__value_type`, type) +} + +DictionaryType__name <- function(type) { + .Call(`_arrow_DictionaryType__name`, type) +} + +DictionaryType__ordered <- function(type) { + .Call(`_arrow_DictionaryType__ordered`, type) +} + +StructType__GetFieldByName <- function(type, name) { + .Call(`_arrow_StructType__GetFieldByName`, type, name) +} + +StructType__GetFieldIndex <- function(type, name) { + .Call(`_arrow_StructType__GetFieldIndex`, type, name) +} + +StructType__field_names <- function(type) { + .Call(`_arrow_StructType__field_names`, type) +} + +ListType__value_field <- function(type) { + .Call(`_arrow_ListType__value_field`, type) +} + +ListType__value_type <- function(type) { + .Call(`_arrow_ListType__value_type`, type) +} + +LargeListType__value_field <- function(type) { + .Call(`_arrow_LargeListType__value_field`, type) +} + +LargeListType__value_type <- function(type) { + .Call(`_arrow_LargeListType__value_type`, type) +} + +FixedSizeListType__value_field <- function(type) { + .Call(`_arrow_FixedSizeListType__value_field`, type) +} + +FixedSizeListType__value_type <- function(type) { + .Call(`_arrow_FixedSizeListType__value_type`, type) +} + +FixedSizeListType__list_size <- function(type) { + .Call(`_arrow_FixedSizeListType__list_size`, type) +} + +compute___expr__equals <- function(lhs, rhs) { + .Call(`_arrow_compute___expr__equals`, lhs, rhs) +} + +compute___expr__call <- function(func_name, argument_list, options) { + .Call(`_arrow_compute___expr__call`, func_name, argument_list, options) +} + +field_names_in_expression <- function(x) { + 
.Call(`_arrow_field_names_in_expression`, x) +} + +compute___expr__get_field_ref_name <- function(x) { + .Call(`_arrow_compute___expr__get_field_ref_name`, x) +} + +compute___expr__field_ref <- function(name) { + .Call(`_arrow_compute___expr__field_ref`, name) +} + +compute___expr__scalar <- function(x) { + .Call(`_arrow_compute___expr__scalar`, x) +} + +compute___expr__ToString <- function(x) { + .Call(`_arrow_compute___expr__ToString`, x) +} + +compute___expr__type <- function(x, schema) { + .Call(`_arrow_compute___expr__type`, x, schema) +} + +compute___expr__type_id <- function(x, schema) { + .Call(`_arrow_compute___expr__type_id`, x, schema) +} + +ipc___WriteFeather__Table <- function(stream, table, version, chunk_size, compression, compression_level) { + invisible(.Call(`_arrow_ipc___WriteFeather__Table`, stream, table, version, chunk_size, compression, compression_level)) +} + +ipc___feather___Reader__version <- function(reader) { + .Call(`_arrow_ipc___feather___Reader__version`, reader) +} + +ipc___feather___Reader__Read <- function(reader, columns) { + .Call(`_arrow_ipc___feather___Reader__Read`, reader, columns) +} + +ipc___feather___Reader__Open <- function(stream) { + .Call(`_arrow_ipc___feather___Reader__Open`, stream) +} + +ipc___feather___Reader__schema <- function(reader) { + .Call(`_arrow_ipc___feather___Reader__schema`, reader) +} + +Field__initialize <- function(name, field, nullable) { + .Call(`_arrow_Field__initialize`, name, field, nullable) +} + +Field__ToString <- function(field) { + .Call(`_arrow_Field__ToString`, field) +} + +Field__name <- function(field) { + .Call(`_arrow_Field__name`, field) +} + +Field__Equals <- function(field, other) { + .Call(`_arrow_Field__Equals`, field, other) +} + +Field__nullable <- function(field) { + .Call(`_arrow_Field__nullable`, field) +} + +Field__type <- function(field) { + .Call(`_arrow_Field__type`, field) +} + +fs___FileInfo__type <- function(x) { + .Call(`_arrow_fs___FileInfo__type`, x) +} + +fs___FileInfo__set_type <- function(x, type) { + invisible(.Call(`_arrow_fs___FileInfo__set_type`, x, type)) +} + +fs___FileInfo__path <- function(x) { + .Call(`_arrow_fs___FileInfo__path`, x) +} + +fs___FileInfo__set_path <- function(x, path) { + invisible(.Call(`_arrow_fs___FileInfo__set_path`, x, path)) +} + +fs___FileInfo__size <- function(x) { + .Call(`_arrow_fs___FileInfo__size`, x) +} + +fs___FileInfo__set_size <- function(x, size) { + invisible(.Call(`_arrow_fs___FileInfo__set_size`, x, size)) +} + +fs___FileInfo__base_name <- function(x) { + .Call(`_arrow_fs___FileInfo__base_name`, x) +} + +fs___FileInfo__extension <- function(x) { + .Call(`_arrow_fs___FileInfo__extension`, x) +} + +fs___FileInfo__mtime <- function(x) { + .Call(`_arrow_fs___FileInfo__mtime`, x) +} + +fs___FileInfo__set_mtime <- function(x, time) { + invisible(.Call(`_arrow_fs___FileInfo__set_mtime`, x, time)) +} + +fs___FileSelector__base_dir <- function(selector) { + .Call(`_arrow_fs___FileSelector__base_dir`, selector) +} + +fs___FileSelector__allow_not_found <- function(selector) { + .Call(`_arrow_fs___FileSelector__allow_not_found`, selector) +} + +fs___FileSelector__recursive <- function(selector) { + .Call(`_arrow_fs___FileSelector__recursive`, selector) +} + +fs___FileSelector__create <- function(base_dir, allow_not_found, recursive) { + .Call(`_arrow_fs___FileSelector__create`, base_dir, allow_not_found, recursive) +} + +fs___FileSystem__GetTargetInfos_Paths <- function(file_system, paths) { + .Call(`_arrow_fs___FileSystem__GetTargetInfos_Paths`, 
file_system, paths) +} + +fs___FileSystem__GetTargetInfos_FileSelector <- function(file_system, selector) { + .Call(`_arrow_fs___FileSystem__GetTargetInfos_FileSelector`, file_system, selector) +} + +fs___FileSystem__CreateDir <- function(file_system, path, recursive) { + invisible(.Call(`_arrow_fs___FileSystem__CreateDir`, file_system, path, recursive)) +} + +fs___FileSystem__DeleteDir <- function(file_system, path) { + invisible(.Call(`_arrow_fs___FileSystem__DeleteDir`, file_system, path)) +} + +fs___FileSystem__DeleteDirContents <- function(file_system, path) { + invisible(.Call(`_arrow_fs___FileSystem__DeleteDirContents`, file_system, path)) +} + +fs___FileSystem__DeleteFile <- function(file_system, path) { + invisible(.Call(`_arrow_fs___FileSystem__DeleteFile`, file_system, path)) +} + +fs___FileSystem__DeleteFiles <- function(file_system, paths) { + invisible(.Call(`_arrow_fs___FileSystem__DeleteFiles`, file_system, paths)) +} + +fs___FileSystem__Move <- function(file_system, src, dest) { + invisible(.Call(`_arrow_fs___FileSystem__Move`, file_system, src, dest)) +} + +fs___FileSystem__CopyFile <- function(file_system, src, dest) { + invisible(.Call(`_arrow_fs___FileSystem__CopyFile`, file_system, src, dest)) +} + +fs___FileSystem__OpenInputStream <- function(file_system, path) { + .Call(`_arrow_fs___FileSystem__OpenInputStream`, file_system, path) +} + +fs___FileSystem__OpenInputFile <- function(file_system, path) { + .Call(`_arrow_fs___FileSystem__OpenInputFile`, file_system, path) +} + +fs___FileSystem__OpenOutputStream <- function(file_system, path) { + .Call(`_arrow_fs___FileSystem__OpenOutputStream`, file_system, path) +} + +fs___FileSystem__OpenAppendStream <- function(file_system, path) { + .Call(`_arrow_fs___FileSystem__OpenAppendStream`, file_system, path) +} + +fs___FileSystem__type_name <- function(file_system) { + .Call(`_arrow_fs___FileSystem__type_name`, file_system) +} + +fs___LocalFileSystem__create <- function() { + .Call(`_arrow_fs___LocalFileSystem__create`) +} + +fs___SubTreeFileSystem__create <- function(base_path, base_fs) { + .Call(`_arrow_fs___SubTreeFileSystem__create`, base_path, base_fs) +} + +fs___SubTreeFileSystem__base_fs <- function(file_system) { + .Call(`_arrow_fs___SubTreeFileSystem__base_fs`, file_system) +} + +fs___SubTreeFileSystem__base_path <- function(file_system) { + .Call(`_arrow_fs___SubTreeFileSystem__base_path`, file_system) +} + +fs___FileSystemFromUri <- function(path) { + .Call(`_arrow_fs___FileSystemFromUri`, path) +} + +fs___CopyFiles <- function(source_fs, source_sel, destination_fs, destination_base_dir, chunk_size, use_threads) { + invisible(.Call(`_arrow_fs___CopyFiles`, source_fs, source_sel, destination_fs, destination_base_dir, chunk_size, use_threads)) +} + +fs___S3FileSystem__create <- function(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, background_writes) { + .Call(`_arrow_fs___S3FileSystem__create`, anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, background_writes) +} + +fs___S3FileSystem__region <- function(fs) { + .Call(`_arrow_fs___S3FileSystem__region`, fs) +} + +io___Readable__Read <- function(x, nbytes) { + .Call(`_arrow_io___Readable__Read`, x, nbytes) +} + +io___InputStream__Close <- function(x) { + invisible(.Call(`_arrow_io___InputStream__Close`, x)) +} + +io___OutputStream__Close <- function(x) { + 
invisible(.Call(`_arrow_io___OutputStream__Close`, x)) +} + +io___RandomAccessFile__GetSize <- function(x) { + .Call(`_arrow_io___RandomAccessFile__GetSize`, x) +} + +io___RandomAccessFile__supports_zero_copy <- function(x) { + .Call(`_arrow_io___RandomAccessFile__supports_zero_copy`, x) +} + +io___RandomAccessFile__Seek <- function(x, position) { + invisible(.Call(`_arrow_io___RandomAccessFile__Seek`, x, position)) +} + +io___RandomAccessFile__Tell <- function(x) { + .Call(`_arrow_io___RandomAccessFile__Tell`, x) +} + +io___RandomAccessFile__Read0 <- function(x) { + .Call(`_arrow_io___RandomAccessFile__Read0`, x) +} + +io___RandomAccessFile__ReadAt <- function(x, position, nbytes) { + .Call(`_arrow_io___RandomAccessFile__ReadAt`, x, position, nbytes) +} + +io___MemoryMappedFile__Create <- function(path, size) { + .Call(`_arrow_io___MemoryMappedFile__Create`, path, size) +} + +io___MemoryMappedFile__Open <- function(path, mode) { + .Call(`_arrow_io___MemoryMappedFile__Open`, path, mode) +} + +io___MemoryMappedFile__Resize <- function(x, size) { + invisible(.Call(`_arrow_io___MemoryMappedFile__Resize`, x, size)) +} + +io___ReadableFile__Open <- function(path) { + .Call(`_arrow_io___ReadableFile__Open`, path) +} + +io___BufferReader__initialize <- function(buffer) { + .Call(`_arrow_io___BufferReader__initialize`, buffer) +} + +io___Writable__write <- function(stream, buf) { + invisible(.Call(`_arrow_io___Writable__write`, stream, buf)) +} + +io___OutputStream__Tell <- function(stream) { + .Call(`_arrow_io___OutputStream__Tell`, stream) +} + +io___FileOutputStream__Open <- function(path) { + .Call(`_arrow_io___FileOutputStream__Open`, path) +} + +io___BufferOutputStream__Create <- function(initial_capacity) { + .Call(`_arrow_io___BufferOutputStream__Create`, initial_capacity) +} + +io___BufferOutputStream__capacity <- function(stream) { + .Call(`_arrow_io___BufferOutputStream__capacity`, stream) +} + +io___BufferOutputStream__Finish <- function(stream) { + .Call(`_arrow_io___BufferOutputStream__Finish`, stream) +} + +io___BufferOutputStream__Tell <- function(stream) { + .Call(`_arrow_io___BufferOutputStream__Tell`, stream) +} + +io___BufferOutputStream__Write <- function(stream, bytes) { + invisible(.Call(`_arrow_io___BufferOutputStream__Write`, stream, bytes)) +} + +json___ReadOptions__initialize <- function(use_threads, block_size) { + .Call(`_arrow_json___ReadOptions__initialize`, use_threads, block_size) +} + +json___ParseOptions__initialize1 <- function(newlines_in_values) { + .Call(`_arrow_json___ParseOptions__initialize1`, newlines_in_values) +} + +json___ParseOptions__initialize2 <- function(newlines_in_values, explicit_schema) { + .Call(`_arrow_json___ParseOptions__initialize2`, newlines_in_values, explicit_schema) +} + +json___TableReader__Make <- function(input, read_options, parse_options) { + .Call(`_arrow_json___TableReader__Make`, input, read_options, parse_options) +} + +json___TableReader__Read <- function(table_reader) { + .Call(`_arrow_json___TableReader__Read`, table_reader) +} + +MemoryPool__default <- function() { + .Call(`_arrow_MemoryPool__default`) +} + +MemoryPool__bytes_allocated <- function(pool) { + .Call(`_arrow_MemoryPool__bytes_allocated`, pool) +} + +MemoryPool__max_memory <- function(pool) { + .Call(`_arrow_MemoryPool__max_memory`, pool) +} + +MemoryPool__backend_name <- function(pool) { + .Call(`_arrow_MemoryPool__backend_name`, pool) +} + +supported_memory_backends <- function() { + .Call(`_arrow_supported_memory_backends`) +} + +ipc___Message__body_length 
<- function(message) { + .Call(`_arrow_ipc___Message__body_length`, message) +} + +ipc___Message__metadata <- function(message) { + .Call(`_arrow_ipc___Message__metadata`, message) +} + +ipc___Message__body <- function(message) { + .Call(`_arrow_ipc___Message__body`, message) +} + +ipc___Message__Verify <- function(message) { + .Call(`_arrow_ipc___Message__Verify`, message) +} + +ipc___Message__type <- function(message) { + .Call(`_arrow_ipc___Message__type`, message) +} + +ipc___Message__Equals <- function(x, y) { + .Call(`_arrow_ipc___Message__Equals`, x, y) +} + +ipc___ReadRecordBatch__Message__Schema <- function(message, schema) { + .Call(`_arrow_ipc___ReadRecordBatch__Message__Schema`, message, schema) +} + +ipc___ReadSchema_InputStream <- function(stream) { + .Call(`_arrow_ipc___ReadSchema_InputStream`, stream) +} + +ipc___ReadSchema_Message <- function(message) { + .Call(`_arrow_ipc___ReadSchema_Message`, message) +} + +ipc___MessageReader__Open <- function(stream) { + .Call(`_arrow_ipc___MessageReader__Open`, stream) +} + +ipc___MessageReader__ReadNextMessage <- function(reader) { + .Call(`_arrow_ipc___MessageReader__ReadNextMessage`, reader) +} + +ipc___ReadMessage <- function(stream) { + .Call(`_arrow_ipc___ReadMessage`, stream) +} + +parquet___arrow___ArrowReaderProperties__Make <- function(use_threads) { + .Call(`_arrow_parquet___arrow___ArrowReaderProperties__Make`, use_threads) +} + +parquet___arrow___ArrowReaderProperties__set_use_threads <- function(properties, use_threads) { + invisible(.Call(`_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads`, properties, use_threads)) +} + +parquet___arrow___ArrowReaderProperties__get_use_threads <- function(properties, use_threads) { + .Call(`_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads`, properties, use_threads) +} + +parquet___arrow___ArrowReaderProperties__get_read_dictionary <- function(properties, column_index) { + .Call(`_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary`, properties, column_index) +} + +parquet___arrow___ArrowReaderProperties__set_read_dictionary <- function(properties, column_index, read_dict) { + invisible(.Call(`_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary`, properties, column_index, read_dict)) +} + +parquet___arrow___FileReader__OpenFile <- function(file, props) { + .Call(`_arrow_parquet___arrow___FileReader__OpenFile`, file, props) +} + +parquet___arrow___FileReader__ReadTable1 <- function(reader) { + .Call(`_arrow_parquet___arrow___FileReader__ReadTable1`, reader) +} + +parquet___arrow___FileReader__ReadTable2 <- function(reader, column_indices) { + .Call(`_arrow_parquet___arrow___FileReader__ReadTable2`, reader, column_indices) +} + +parquet___arrow___FileReader__ReadRowGroup1 <- function(reader, i) { + .Call(`_arrow_parquet___arrow___FileReader__ReadRowGroup1`, reader, i) +} + +parquet___arrow___FileReader__ReadRowGroup2 <- function(reader, i, column_indices) { + .Call(`_arrow_parquet___arrow___FileReader__ReadRowGroup2`, reader, i, column_indices) +} + +parquet___arrow___FileReader__ReadRowGroups1 <- function(reader, row_groups) { + .Call(`_arrow_parquet___arrow___FileReader__ReadRowGroups1`, reader, row_groups) +} + +parquet___arrow___FileReader__ReadRowGroups2 <- function(reader, row_groups, column_indices) { + .Call(`_arrow_parquet___arrow___FileReader__ReadRowGroups2`, reader, row_groups, column_indices) +} + +parquet___arrow___FileReader__num_rows <- function(reader) { + .Call(`_arrow_parquet___arrow___FileReader__num_rows`, 
reader) +} + +parquet___arrow___FileReader__num_columns <- function(reader) { + .Call(`_arrow_parquet___arrow___FileReader__num_columns`, reader) +} + +parquet___arrow___FileReader__num_row_groups <- function(reader) { + .Call(`_arrow_parquet___arrow___FileReader__num_row_groups`, reader) +} + +parquet___arrow___FileReader__ReadColumn <- function(reader, i) { + .Call(`_arrow_parquet___arrow___FileReader__ReadColumn`, reader, i) +} + +parquet___ArrowWriterProperties___create <- function(allow_truncated_timestamps, use_deprecated_int96_timestamps, timestamp_unit) { + .Call(`_arrow_parquet___ArrowWriterProperties___create`, allow_truncated_timestamps, use_deprecated_int96_timestamps, timestamp_unit) +} + +parquet___WriterProperties___Builder__create <- function() { + .Call(`_arrow_parquet___WriterProperties___Builder__create`) +} + +parquet___WriterProperties___Builder__version <- function(builder, version) { + invisible(.Call(`_arrow_parquet___WriterProperties___Builder__version`, builder, version)) +} + +parquet___ArrowWriterProperties___Builder__set_compressions <- function(builder, paths, types) { + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_compressions`, builder, paths, types)) +} + +parquet___ArrowWriterProperties___Builder__set_compression_levels <- function(builder, paths, levels) { + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels`, builder, paths, levels)) +} + +parquet___ArrowWriterProperties___Builder__set_use_dictionary <- function(builder, paths, use_dictionary) { + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary`, builder, paths, use_dictionary)) +} + +parquet___ArrowWriterProperties___Builder__set_write_statistics <- function(builder, paths, write_statistics) { + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics`, builder, paths, write_statistics)) +} + +parquet___ArrowWriterProperties___Builder__data_page_size <- function(builder, data_page_size) { + invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__data_page_size`, builder, data_page_size)) +} + +parquet___WriterProperties___Builder__build <- function(builder) { + .Call(`_arrow_parquet___WriterProperties___Builder__build`, builder) +} + +parquet___arrow___ParquetFileWriter__Open <- function(schema, sink, properties, arrow_properties) { + .Call(`_arrow_parquet___arrow___ParquetFileWriter__Open`, schema, sink, properties, arrow_properties) +} + +parquet___arrow___FileWriter__WriteTable <- function(writer, table, chunk_size) { + invisible(.Call(`_arrow_parquet___arrow___FileWriter__WriteTable`, writer, table, chunk_size)) +} + +parquet___arrow___FileWriter__Close <- function(writer) { + invisible(.Call(`_arrow_parquet___arrow___FileWriter__Close`, writer)) +} + +parquet___arrow___WriteTable <- function(table, sink, properties, arrow_properties) { + invisible(.Call(`_arrow_parquet___arrow___WriteTable`, table, sink, properties, arrow_properties)) +} + +parquet___arrow___FileReader__GetSchema <- function(reader) { + .Call(`_arrow_parquet___arrow___FileReader__GetSchema`, reader) +} + +allocate_arrow_schema <- function() { + .Call(`_arrow_allocate_arrow_schema`) +} + +delete_arrow_schema <- function(ptr) { + invisible(.Call(`_arrow_delete_arrow_schema`, ptr)) +} + +allocate_arrow_array <- function() { + .Call(`_arrow_allocate_arrow_array`) +} + +delete_arrow_array <- function(ptr) { + invisible(.Call(`_arrow_delete_arrow_array`, ptr)) +} + +allocate_arrow_array_stream 
<- function() { + .Call(`_arrow_allocate_arrow_array_stream`) +} + +delete_arrow_array_stream <- function(ptr) { + invisible(.Call(`_arrow_delete_arrow_array_stream`, ptr)) +} + +ImportArray <- function(array, schema) { + .Call(`_arrow_ImportArray`, array, schema) +} + +ImportRecordBatch <- function(array, schema) { + .Call(`_arrow_ImportRecordBatch`, array, schema) +} + +ImportSchema <- function(schema) { + .Call(`_arrow_ImportSchema`, schema) +} + +ImportField <- function(field) { + .Call(`_arrow_ImportField`, field) +} + +ImportType <- function(type) { + .Call(`_arrow_ImportType`, type) +} + +ImportRecordBatchReader <- function(stream) { + .Call(`_arrow_ImportRecordBatchReader`, stream) +} + +ExportType <- function(type, ptr) { + invisible(.Call(`_arrow_ExportType`, type, ptr)) +} + +ExportField <- function(field, ptr) { + invisible(.Call(`_arrow_ExportField`, field, ptr)) +} + +ExportSchema <- function(schema, ptr) { + invisible(.Call(`_arrow_ExportSchema`, schema, ptr)) +} + +ExportArray <- function(array, array_ptr, schema_ptr) { + invisible(.Call(`_arrow_ExportArray`, array, array_ptr, schema_ptr)) +} + +ExportRecordBatch <- function(batch, array_ptr, schema_ptr) { + invisible(.Call(`_arrow_ExportRecordBatch`, batch, array_ptr, schema_ptr)) +} + +ExportRecordBatchReader <- function(reader, stream_ptr) { + invisible(.Call(`_arrow_ExportRecordBatchReader`, reader, stream_ptr)) +} + +Table__from_dots <- function(lst, schema_sxp, use_threads) { + .Call(`_arrow_Table__from_dots`, lst, schema_sxp, use_threads) +} + +vec_to_arrow <- function(x, s_type) { + .Call(`_arrow_vec_to_arrow`, x, s_type) +} + +DictionaryArray__FromArrays <- function(type, indices, dict) { + .Call(`_arrow_DictionaryArray__FromArrays`, type, indices, dict) +} + +RecordBatch__num_columns <- function(x) { + .Call(`_arrow_RecordBatch__num_columns`, x) +} + +RecordBatch__num_rows <- function(x) { + .Call(`_arrow_RecordBatch__num_rows`, x) +} + +RecordBatch__schema <- function(x) { + .Call(`_arrow_RecordBatch__schema`, x) +} + +RecordBatch__RenameColumns <- function(batch, names) { + .Call(`_arrow_RecordBatch__RenameColumns`, batch, names) +} + +RecordBatch__ReplaceSchemaMetadata <- function(x, metadata) { + .Call(`_arrow_RecordBatch__ReplaceSchemaMetadata`, x, metadata) +} + +RecordBatch__columns <- function(batch) { + .Call(`_arrow_RecordBatch__columns`, batch) +} + +RecordBatch__column <- function(batch, i) { + .Call(`_arrow_RecordBatch__column`, batch, i) +} + +RecordBatch__GetColumnByName <- function(batch, name) { + .Call(`_arrow_RecordBatch__GetColumnByName`, batch, name) +} + +RecordBatch__SelectColumns <- function(batch, indices) { + .Call(`_arrow_RecordBatch__SelectColumns`, batch, indices) +} + +RecordBatch__Equals <- function(self, other, check_metadata) { + .Call(`_arrow_RecordBatch__Equals`, self, other, check_metadata) +} + +RecordBatch__AddColumn <- function(batch, i, field, column) { + .Call(`_arrow_RecordBatch__AddColumn`, batch, i, field, column) +} + +RecordBatch__SetColumn <- function(batch, i, field, column) { + .Call(`_arrow_RecordBatch__SetColumn`, batch, i, field, column) +} + +RecordBatch__RemoveColumn <- function(batch, i) { + .Call(`_arrow_RecordBatch__RemoveColumn`, batch, i) +} + +RecordBatch__column_name <- function(batch, i) { + .Call(`_arrow_RecordBatch__column_name`, batch, i) +} + +RecordBatch__names <- function(batch) { + .Call(`_arrow_RecordBatch__names`, batch) +} + +RecordBatch__Slice1 <- function(self, offset) { + .Call(`_arrow_RecordBatch__Slice1`, self, offset) +} + 
+RecordBatch__Slice2 <- function(self, offset, length) { + .Call(`_arrow_RecordBatch__Slice2`, self, offset, length) +} + +ipc___SerializeRecordBatch__Raw <- function(batch) { + .Call(`_arrow_ipc___SerializeRecordBatch__Raw`, batch) +} + +ipc___ReadRecordBatch__InputStream__Schema <- function(stream, schema) { + .Call(`_arrow_ipc___ReadRecordBatch__InputStream__Schema`, stream, schema) +} + +RecordBatch__from_arrays <- function(schema_sxp, lst) { + .Call(`_arrow_RecordBatch__from_arrays`, schema_sxp, lst) +} + +RecordBatchReader__schema <- function(reader) { + .Call(`_arrow_RecordBatchReader__schema`, reader) +} + +RecordBatchReader__ReadNext <- function(reader) { + .Call(`_arrow_RecordBatchReader__ReadNext`, reader) +} + +RecordBatchReader__batches <- function(reader) { + .Call(`_arrow_RecordBatchReader__batches`, reader) +} + +Table__from_RecordBatchReader <- function(reader) { + .Call(`_arrow_Table__from_RecordBatchReader`, reader) +} + +ipc___RecordBatchStreamReader__Open <- function(stream) { + .Call(`_arrow_ipc___RecordBatchStreamReader__Open`, stream) +} + +ipc___RecordBatchFileReader__schema <- function(reader) { + .Call(`_arrow_ipc___RecordBatchFileReader__schema`, reader) +} + +ipc___RecordBatchFileReader__num_record_batches <- function(reader) { + .Call(`_arrow_ipc___RecordBatchFileReader__num_record_batches`, reader) +} + +ipc___RecordBatchFileReader__ReadRecordBatch <- function(reader, i) { + .Call(`_arrow_ipc___RecordBatchFileReader__ReadRecordBatch`, reader, i) +} + +ipc___RecordBatchFileReader__Open <- function(file) { + .Call(`_arrow_ipc___RecordBatchFileReader__Open`, file) +} + +Table__from_RecordBatchFileReader <- function(reader) { + .Call(`_arrow_Table__from_RecordBatchFileReader`, reader) +} + +ipc___RecordBatchFileReader__batches <- function(reader) { + .Call(`_arrow_ipc___RecordBatchFileReader__batches`, reader) +} + +ipc___RecordBatchWriter__WriteRecordBatch <- function(batch_writer, batch) { + invisible(.Call(`_arrow_ipc___RecordBatchWriter__WriteRecordBatch`, batch_writer, batch)) +} + +ipc___RecordBatchWriter__WriteTable <- function(batch_writer, table) { + invisible(.Call(`_arrow_ipc___RecordBatchWriter__WriteTable`, batch_writer, table)) +} + +ipc___RecordBatchWriter__Close <- function(batch_writer) { + invisible(.Call(`_arrow_ipc___RecordBatchWriter__Close`, batch_writer)) +} + +ipc___RecordBatchFileWriter__Open <- function(stream, schema, use_legacy_format, metadata_version) { + .Call(`_arrow_ipc___RecordBatchFileWriter__Open`, stream, schema, use_legacy_format, metadata_version) +} + +ipc___RecordBatchStreamWriter__Open <- function(stream, schema, use_legacy_format, metadata_version) { + .Call(`_arrow_ipc___RecordBatchStreamWriter__Open`, stream, schema, use_legacy_format, metadata_version) +} + +Array__GetScalar <- function(x, i) { + .Call(`_arrow_Array__GetScalar`, x, i) +} + +Scalar__ToString <- function(s) { + .Call(`_arrow_Scalar__ToString`, s) +} + +StructScalar__field <- function(s, i) { + .Call(`_arrow_StructScalar__field`, s, i) +} + +StructScalar__GetFieldByName <- function(s, name) { + .Call(`_arrow_StructScalar__GetFieldByName`, s, name) +} + +Scalar__as_vector <- function(scalar) { + .Call(`_arrow_Scalar__as_vector`, scalar) +} + +MakeArrayFromScalar <- function(scalar, n) { + .Call(`_arrow_MakeArrayFromScalar`, scalar, n) +} + +Scalar__is_valid <- function(s) { + .Call(`_arrow_Scalar__is_valid`, s) +} + +Scalar__type <- function(s) { + .Call(`_arrow_Scalar__type`, s) +} + +Scalar__Equals <- function(lhs, rhs) { + 
.Call(`_arrow_Scalar__Equals`, lhs, rhs) +} + +Scalar__ApproxEquals <- function(lhs, rhs) { + .Call(`_arrow_Scalar__ApproxEquals`, lhs, rhs) +} + +schema_ <- function(fields) { + .Call(`_arrow_schema_`, fields) +} + +Schema__ToString <- function(s) { + .Call(`_arrow_Schema__ToString`, s) +} + +Schema__num_fields <- function(s) { + .Call(`_arrow_Schema__num_fields`, s) +} + +Schema__field <- function(s, i) { + .Call(`_arrow_Schema__field`, s, i) +} + +Schema__AddField <- function(s, i, field) { + .Call(`_arrow_Schema__AddField`, s, i, field) +} + +Schema__SetField <- function(s, i, field) { + .Call(`_arrow_Schema__SetField`, s, i, field) +} + +Schema__RemoveField <- function(s, i) { + .Call(`_arrow_Schema__RemoveField`, s, i) +} + +Schema__GetFieldByName <- function(s, x) { + .Call(`_arrow_Schema__GetFieldByName`, s, x) +} + +Schema__fields <- function(schema) { + .Call(`_arrow_Schema__fields`, schema) +} + +Schema__field_names <- function(schema) { + .Call(`_arrow_Schema__field_names`, schema) +} + +Schema__HasMetadata <- function(schema) { + .Call(`_arrow_Schema__HasMetadata`, schema) +} + +Schema__metadata <- function(schema) { + .Call(`_arrow_Schema__metadata`, schema) +} + +Schema__WithMetadata <- function(schema, metadata) { + .Call(`_arrow_Schema__WithMetadata`, schema, metadata) +} + +Schema__serialize <- function(schema) { + .Call(`_arrow_Schema__serialize`, schema) +} + +Schema__Equals <- function(schema, other, check_metadata) { + .Call(`_arrow_Schema__Equals`, schema, other, check_metadata) +} + +arrow__UnifySchemas <- function(schemas) { + .Call(`_arrow_arrow__UnifySchemas`, schemas) +} + +Table__num_columns <- function(x) { + .Call(`_arrow_Table__num_columns`, x) +} + +Table__num_rows <- function(x) { + .Call(`_arrow_Table__num_rows`, x) +} + +Table__schema <- function(x) { + .Call(`_arrow_Table__schema`, x) +} + +Table__ReplaceSchemaMetadata <- function(x, metadata) { + .Call(`_arrow_Table__ReplaceSchemaMetadata`, x, metadata) +} + +Table__column <- function(table, i) { + .Call(`_arrow_Table__column`, table, i) +} + +Table__field <- function(table, i) { + .Call(`_arrow_Table__field`, table, i) +} + +Table__columns <- function(table) { + .Call(`_arrow_Table__columns`, table) +} + +Table__ColumnNames <- function(table) { + .Call(`_arrow_Table__ColumnNames`, table) +} + +Table__RenameColumns <- function(table, names) { + .Call(`_arrow_Table__RenameColumns`, table, names) +} + +Table__Slice1 <- function(table, offset) { + .Call(`_arrow_Table__Slice1`, table, offset) +} + +Table__Slice2 <- function(table, offset, length) { + .Call(`_arrow_Table__Slice2`, table, offset, length) +} + +Table__Equals <- function(lhs, rhs, check_metadata) { + .Call(`_arrow_Table__Equals`, lhs, rhs, check_metadata) +} + +Table__Validate <- function(table) { + .Call(`_arrow_Table__Validate`, table) +} + +Table__ValidateFull <- function(table) { + .Call(`_arrow_Table__ValidateFull`, table) +} + +Table__GetColumnByName <- function(table, name) { + .Call(`_arrow_Table__GetColumnByName`, table, name) +} + +Table__RemoveColumn <- function(table, i) { + .Call(`_arrow_Table__RemoveColumn`, table, i) +} + +Table__AddColumn <- function(table, i, field, column) { + .Call(`_arrow_Table__AddColumn`, table, i, field, column) +} + +Table__SetColumn <- function(table, i, field, column) { + .Call(`_arrow_Table__SetColumn`, table, i, field, column) +} + +Table__SelectColumns <- function(table, indices) { + .Call(`_arrow_Table__SelectColumns`, table, indices) +} + +all_record_batches <- function(lst) { + 
.Call(`_arrow_all_record_batches`, lst) +} + +Table__from_record_batches <- function(batches, schema_sxp) { + .Call(`_arrow_Table__from_record_batches`, batches, schema_sxp) +} + +GetCpuThreadPoolCapacity <- function() { + .Call(`_arrow_GetCpuThreadPoolCapacity`) +} + +SetCpuThreadPoolCapacity <- function(threads) { + invisible(.Call(`_arrow_SetCpuThreadPoolCapacity`, threads)) +} + +GetIOThreadPoolCapacity <- function() { + .Call(`_arrow_GetIOThreadPoolCapacity`) +} + +SetIOThreadPoolCapacity <- function(threads) { + invisible(.Call(`_arrow_SetIOThreadPoolCapacity`, threads)) +} + +Array__infer_type <- function(x) { + .Call(`_arrow_Array__infer_type`, x) +} diff --git a/src/arrow/r/R/buffer.R b/src/arrow/r/R/buffer.R new file mode 100644 index 000000000..a9424fd0d --- /dev/null +++ b/src/arrow/r/R/buffer.R @@ -0,0 +1,78 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @title Buffer class +#' @usage NULL +#' @format NULL +#' @docType class +#' @description A Buffer is an object containing a pointer to a piece of +#' contiguous memory with a particular size. +#' @section Factory: +#' `buffer()` lets you create an `arrow::Buffer` from an R object +#' @section Methods: +#' +#' - `$is_mutable` : is this buffer mutable? +#' - `$ZeroPadding()` : zero bytes in padding, i.e. bytes between size and capacity +#' - `$size` : size in memory, in bytes +#' - `$capacity`: possible capacity, in bytes +#' +#' @rdname buffer +#' @name buffer +#' @examplesIf arrow_available() +#' my_buffer <- buffer(c(1, 2, 3, 4)) +#' my_buffer$is_mutable +#' my_buffer$ZeroPadding() +#' my_buffer$size +#' my_buffer$capacity +#' @export +#' @include arrow-package.R +#' @include enums.R +Buffer <- R6Class("Buffer", + inherit = ArrowObject, + public = list( + ZeroPadding = function() Buffer__ZeroPadding(self), + data = function() Buffer__data(self), + Equals = function(other, ...) { + inherits(other, "Buffer") && Buffer__Equals(self, other) + } + ), + active = list( + is_mutable = function() Buffer__is_mutable(self), + size = function() Buffer__size(self), + capacity = function() Buffer__capacity(self) + ) +) + +Buffer$create <- function(x) { + if (inherits(x, "Buffer")) { + x + } else if (inherits(x, c("raw", "numeric", "integer", "complex"))) { + r___RBuffer__initialize(x) + } else if (inherits(x, "BufferOutputStream")) { + x$finish() + } else { + stop("Cannot convert object of class ", class(x), " to arrow::Buffer") + } +} + +#' @param x R object. 
Only raw, numeric and integer vectors are currently supported +#' @return an instance of `Buffer` that borrows memory from `x` +#' @export +buffer <- Buffer$create + +#' @export +as.raw.Buffer <- function(x) x$data() diff --git a/src/arrow/r/R/chunked-array.R b/src/arrow/r/R/chunked-array.R new file mode 100644 index 000000000..597180ea7 --- /dev/null +++ b/src/arrow/r/R/chunked-array.R @@ -0,0 +1,153 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include arrow-datum.R + +#' @title ChunkedArray class +#' @usage NULL +#' @format NULL +#' @docType class +#' @description A `ChunkedArray` is a data structure managing a list of +#' primitive Arrow [Arrays][Array] logically as one large array. Chunked arrays +#' may be grouped together in a [Table]. +#' @section Factory: +#' The `ChunkedArray$create()` factory method instantiates the object from +#' various Arrays or R vectors. `chunked_array()` is an alias for it. +#' +#' @section Methods: +#' +#' - `$length()`: Size in the number of elements this array contains +#' - `$chunk(i)`: Extract an `Array` chunk by integer position +#' - `$as_vector()`: convert to an R vector +#' - `$Slice(offset, length = NULL)`: Construct a zero-copy slice of the array +#' with the indicated offset and length. If length is `NULL`, the slice goes +#' until the end of the array. +#' - `$Take(i)`: return a `ChunkedArray` with values at positions given by +#' integers `i`. If `i` is an Arrow `Array` or `ChunkedArray`, it will be +#' coerced to an R vector before taking. +#' - `$Filter(i, keep_na = TRUE)`: return a `ChunkedArray` with values at positions where +#' logical vector or Arrow boolean-type `(Chunked)Array` `i` is `TRUE`. +#' - `$SortIndices(descending = FALSE)`: return an `Array` of integer positions that can be +#' used to rearrange the `ChunkedArray` in ascending or descending order +#' - `$cast(target_type, safe = TRUE, options = cast_options(safe))`: Alter the +#' data in the array to change its type. +#' - `$null_count`: The number of null entries in the array +#' - `$chunks`: return a list of `Array`s +#' - `$num_chunks`: integer number of chunks in the `ChunkedArray` +#' - `$type`: logical type of data +#' - `$View(type)`: Construct a zero-copy view of this `ChunkedArray` with the +#' given type. +#' - `$Validate()`: Perform any validation checks to determine obvious inconsistencies +#' within the array's internal data. 
This can be an expensive check, potentially `O(length)` +#' +#' @rdname ChunkedArray +#' @name ChunkedArray +#' @seealso [Array] +#' @examplesIf arrow_available() +#' # Pass items into chunked_array as separate objects to create chunks +#' class_scores <- chunked_array(c(87, 88, 89), c(94, 93, 92), c(71, 72, 73)) +#' class_scores$num_chunks +#' +#' # When taking a Slice from a chunked_array, chunks are preserved +#' class_scores$Slice(2, length = 5) +#' +#' # You can combine Take and SortIndices to return a ChunkedArray with 1 chunk +#' # containing all values, ordered. +#' class_scores$Take(class_scores$SortIndices(descending = TRUE)) +#' +#' # If you pass a list into chunked_array, you get a list of length 1 +#' list_scores <- chunked_array(list(c(9.9, 9.6, 9.5), c(8.2, 8.3, 8.4), c(10.0, 9.9, 9.8))) +#' list_scores$num_chunks +#' +#' # When constructing a ChunkedArray, the first chunk is used to infer type. +#' doubles <- chunked_array(c(1, 2, 3), c(5L, 6L, 7L)) +#' doubles$type +#' @export +ChunkedArray <- R6Class("ChunkedArray", + inherit = ArrowDatum, + public = list( + length = function() ChunkedArray__length(self), + type_id = function() ChunkedArray__type(self)$id, + chunk = function(i) Array$create(ChunkedArray__chunk(self, i)), + as_vector = function() ChunkedArray__as_vector(self, option_use_threads()), + Slice = function(offset, length = NULL) { + if (is.null(length)) { + ChunkedArray__Slice1(self, offset) + } else { + ChunkedArray__Slice2(self, offset, length) + } + }, + Take = function(i) { + if (is.numeric(i)) { + i <- as.integer(i) + } + if (is.integer(i)) { + i <- Array$create(i) + } + call_function("take", self, i) + }, + Filter = function(i, keep_na = TRUE) { + if (is.logical(i)) { + i <- Array$create(i) + } + call_function("filter", self, i, options = list(keep_na = keep_na)) + }, + SortIndices = function(descending = FALSE) { + assert_that(is.logical(descending)) + assert_that(length(descending) == 1L) + assert_that(!is.na(descending)) + # TODO: after ARROW-12042 is closed, review whether this and the + # Array$SortIndices definition can be consolidated + call_function( + "sort_indices", + self, + options = list(names = "", orders = as.integer(descending)) + ) + }, + View = function(type) { + ChunkedArray__View(self, as_type(type)) + }, + Validate = function() { + ChunkedArray__Validate(self) + }, + ToString = function() { + ChunkedArray__ToString(self) + }, + Equals = function(other, ...) { + inherits(other, "ChunkedArray") && ChunkedArray__Equals(self, other) + } + ), + active = list( + null_count = function() ChunkedArray__null_count(self), + num_chunks = function() ChunkedArray__num_chunks(self), + chunks = function() map(ChunkedArray__chunks(self), Array$create), + type = function() ChunkedArray__type(self) + ) +) + +ChunkedArray$create <- function(..., type = NULL) { + if (!is.null(type)) { + type <- as_type(type) + } + ChunkedArray__from_list(list2(...), type) +} + +#' @param \dots Vectors to coerce +#' @param type currently ignored +#' @rdname ChunkedArray +#' @export +chunked_array <- ChunkedArray$create diff --git a/src/arrow/r/R/compression.R b/src/arrow/r/R/compression.R new file mode 100644 index 000000000..7107012d0 --- /dev/null +++ b/src/arrow/r/R/compression.R @@ -0,0 +1,124 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include enums.R +#' @include arrow-package.R +#' @include io.R + +#' @title Compression Codec class +#' @usage NULL +#' @format NULL +#' @docType class +#' @description Codecs allow you to create [compressed input and output +#' streams][compression]. +#' @section Factory: +#' The `Codec$create()` factory method takes the following arguments: +#' * `type`: string name of the compression method. Possible values are +#' "uncompressed", "snappy", "gzip", "brotli", "zstd", "lz4", "lzo", or +#' "bz2". `type` may be upper- or lower-cased. Not all methods may be +#' available; support depends on build-time flags for the C++ library. +#' See [codec_is_available()]. Most builds support at least "snappy" and +#' "gzip". All support "uncompressed". +#' * `compression_level`: compression level, the default value (`NA`) uses the +#' default compression level for the selected compression `type`. +#' @rdname Codec +#' @name Codec +#' @export +Codec <- R6Class("Codec", + inherit = ArrowObject, + active = list( + name = function() util___Codec__name(self), + level = function() abort("Codec$level() not yet implemented") + ) +) +Codec$create <- function(type = "gzip", compression_level = NA) { + if (is.string(type)) { + type <- util___Codec__Create( + compression_from_name(type), compression_level + ) + } + assert_is(type, "Codec") + type +} + +#' Check whether a compression codec is available +#' +#' Support for compression libraries depends on the build-time settings of +#' the Arrow C++ library. This function lets you know which are available for +#' use. +#' @param type A string, one of "uncompressed", "snappy", "gzip", "brotli", +#' "zstd", "lz4", "lzo", or "bz2", case insensitive. +#' @return Logical: is `type` available? +#' @export +#' @examplesIf arrow_available() +#' codec_is_available("gzip") +codec_is_available <- function(type) { + util___Codec__IsAvailable(compression_from_name(type)) +} + +compression_from_name <- function(name) { + map_int(name, ~ CompressionType[[match.arg(toupper(.x), names(CompressionType))]]) +} + +#' @title Compressed stream classes +#' @rdname compression +#' @name compression +#' @aliases CompressedInputStream CompressedOutputStream +#' @docType class +#' @usage NULL +#' @format NULL +#' @description `CompressedInputStream` and `CompressedOutputStream` +#' allow you to apply a compression [Codec] to an +#' input or output stream. 
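[Editorial note, not part of the upstream diff] The factory arguments for these compressed stream classes are documented in the section that follows. As a quick orientation, here is a minimal sketch of how `Codec`, `codec_is_available()`, and the compressed stream factories defined in this file fit together. It assumes the gzip codec was enabled at build time and uses `write_ipc_stream()` / `read_ipc_stream()` from elsewhere in the arrow package (not among the files shown here); treat it as an illustration rather than part of the committed source.

```r
library(arrow)

if (codec_is_available("gzip")) {
  tf <- tempfile(fileext = ".arrows.gz")

  # wrap a file sink in a gzip-compressing output stream and write an IPC stream to it
  sink <- CompressedOutputStream$create(tf, codec = "gzip")
  write_ipc_stream(data.frame(x = 1:5, y = letters[1:5]), sink)
  sink$close()

  # read it back through a decompressing input stream
  src <- CompressedInputStream$create(tf, codec = "gzip")
  df <- read_ipc_stream(src)
  src$close()
  df
}
```

Passing a path string directly also works: the `create()` factory methods defined later in this file wrap it in a `FileOutputStream` or `ReadableFile` for you.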
+#' +#' @section Factory: +#' +#' The `CompressedInputStream$create()` and `CompressedOutputStream$create()` +#' factory methods instantiate the object and take the following arguments: +#' +#' - `stream` An [InputStream] or [OutputStream], respectively +#' - `codec` A `Codec`, either a [Codec][Codec] instance or a string +#' - `compression_level` compression level for when the `codec` argument is given as a string +#' +#' @section Methods: +#' +#' Methods are inherited from [InputStream] and [OutputStream], respectively +#' @export +#' @include arrow-package.R +CompressedOutputStream <- R6Class("CompressedOutputStream", inherit = OutputStream) +CompressedOutputStream$create <- function(stream, codec = "gzip", compression_level = NA) { + codec <- Codec$create(codec, compression_level = compression_level) + if (is.string(stream)) { + stream <- FileOutputStream$create(stream) + } + assert_is(stream, "OutputStream") + io___CompressedOutputStream__Make(codec, stream) +} + +#' @rdname compression +#' @usage NULL +#' @format NULL +#' @export +CompressedInputStream <- R6Class("CompressedInputStream", inherit = InputStream) +CompressedInputStream$create <- function(stream, codec = "gzip", compression_level = NA) { + codec <- Codec$create(codec, compression_level = compression_level) + if (is.string(stream)) { + stream <- ReadableFile$create(stream) + } + assert_is(stream, "InputStream") + io___CompressedInputStream__Make(codec, stream) +} diff --git a/src/arrow/r/R/compute.R b/src/arrow/r/R/compute.R new file mode 100644 index 000000000..0a7d77a09 --- /dev/null +++ b/src/arrow/r/R/compute.R @@ -0,0 +1,309 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Call an Arrow compute function +#' +#' This function provides a lower-level API for calling Arrow functions by their +#' string function name. You won't use it directly for most applications. +#' Many Arrow compute functions are mapped to R methods, +#' and in a `dplyr` evaluation context, [all Arrow functions][list_compute_functions()] +#' are callable with an `arrow_` prefix. +#' @param function_name string Arrow compute function name +#' @param ... Function arguments, which may include `Array`, `ChunkedArray`, `Scalar`, +#' `RecordBatch`, or `Table`. +#' @param args list arguments as an alternative to specifying in `...` +#' @param options named list of C++ function options. +#' @details When passing indices in `...`, `args`, or `options`, express them as +#' 0-based integers (consistent with C++). +#' @return An `Array`, `ChunkedArray`, `Scalar`, `RecordBatch`, or `Table`, whatever the compute function results in. +#' @seealso [Arrow C++ documentation](https://arrow.apache.org/docs/cpp/compute.html) for +#' the functions and their respective options. 
+#' @examplesIf arrow_available() +#' a <- Array$create(c(1L, 2L, 3L, NA, 5L)) +#' s <- Scalar$create(4L) +#' call_function("coalesce", a, s) +#' +#' a <- Array$create(rnorm(10000)) +#' call_function("quantile", a, options = list(q = seq(0, 1, 0.25))) +#' @export +#' @include array.R +#' @include chunked-array.R +#' @include scalar.R +call_function <- function(function_name, ..., args = list(...), options = empty_named_list()) { + assert_that(is.string(function_name)) + assert_that(is.list(options), !is.null(names(options))) + + datum_classes <- c("Array", "ChunkedArray", "RecordBatch", "Table", "Scalar") + valid_args <- map_lgl(args, ~ inherits(., datum_classes)) + if (!all(valid_args)) { + # Lame, just pick one to report + first_bad <- min(which(!valid_args)) + stop( + "Argument ", first_bad, " is of class ", head(class(args[[first_bad]]), 1), + " but it must be one of ", oxford_paste(datum_classes, "or"), + call. = FALSE + ) + } + + compute__CallFunction(function_name, args, options) +} + +#' List available Arrow C++ compute functions +#' +#' This function lists the names of all available Arrow C++ library compute functions. +#' These can be called by passing to [call_function()], or they can be +#' called by name with an `arrow_` prefix inside a `dplyr` verb. +#' +#' The resulting list describes the capabilities of your `arrow` build. +#' Some functions, such as string and regular expression functions, +#' require optional build-time C++ dependencies. If your `arrow` package +#' was not compiled with those features enabled, those functions will +#' not appear in this list. +#' +#' Some functions take options that need to be passed when calling them +#' (in a list called `options`). These options require custom handling +#' in C++; many functions already have that handling set up but not all do. +#' If you encounter one that needs special handling for options, please +#' report an issue. +#' +#' Note that this list does *not* enumerate all of the R bindings for these functions. +#' The package includes Arrow methods for many base R functions that can +#' be called directly on Arrow objects, as well as some tidyverse-flavored versions +#' available inside `dplyr` verbs. +#' +#' @param pattern Optional regular expression to filter the function list +#' @param ... Additional parameters passed to `grep()` +#' @return A character vector of available Arrow C++ function names +#' @examplesIf arrow_available() +#' available_funcs <- list_compute_functions() +#' utf8_funcs <- list_compute_functions(pattern = "^UTF8", ignore.case = TRUE) +#' @export +list_compute_functions <- function(pattern = NULL, ...) { + funcs <- compute__GetFunctionNames() + if (!is.null(pattern)) { + funcs <- grep(pattern, funcs, value = TRUE, ...) 
+ } + # TODO: Filtering of hash funcs will already happen in C++ with ARROW-13943 + funcs <- grep( + "^hash_", + funcs, + value = TRUE, + invert = TRUE + ) + funcs +} + +#' @export +sum.ArrowDatum <- function(..., na.rm = FALSE) { + scalar_aggregate("sum", ..., na.rm = na.rm) +} + +#' @export +mean.ArrowDatum <- function(..., na.rm = FALSE) { + scalar_aggregate("mean", ..., na.rm = na.rm) +} + +#' @export +min.ArrowDatum <- function(..., na.rm = FALSE) { + scalar_aggregate("min_max", ..., na.rm = na.rm)$GetFieldByName("min") +} + +#' @export +max.ArrowDatum <- function(..., na.rm = FALSE) { + scalar_aggregate("min_max", ..., na.rm = na.rm)$GetFieldByName("max") +} + +scalar_aggregate <- function(FUN, ..., na.rm = FALSE, min_count = 0L) { + a <- collect_arrays_from_dots(list(...)) + if (FUN == "min_max" && na.rm && a$null_count == length(a)) { + Array$create(data.frame(min = Inf, max = -Inf)) + # If na.rm == TRUE and all values in array are NA, R returns + # Inf/-Inf, which are type double. Since Arrow is type-stable + # and does not do that, we handle this special case here. + } else { + call_function(FUN, a, options = list(skip_nulls = na.rm, min_count = min_count)) + } +} + +collect_arrays_from_dots <- function(dots) { + # Given a list that may contain both Arrays and ChunkedArrays, + # return a single ChunkedArray containing all of those chunks + # (may return a regular Array if there is only one element in dots) + # If there is only one element and it is a scalar, it returns the scalar + if (length(dots) == 1) { + return(dots[[1]]) + } + + assert_that(all(map_lgl(dots, is.Array))) + arrays <- unlist(lapply(dots, function(x) { + if (inherits(x, "ChunkedArray")) { + x$chunks + } else { + x + } + })) + ChunkedArray$create(!!!arrays) +} + +#' @export +quantile.ArrowDatum <- function(x, + probs = seq(0, 1, 0.25), + na.rm = FALSE, + type = 7, + interpolation = c("linear", "lower", "higher", "nearest", "midpoint"), + ...) { + if (inherits(x, "Scalar")) x <- Array$create(x) + assert_is(probs, c("numeric", "integer")) + assert_that(length(probs) > 0) + assert_that(all(probs >= 0 & probs <= 1)) + if (!na.rm && x$null_count > 0) { + stop("Missing values not allowed if 'na.rm' is FALSE", call. = FALSE) + } + if (type != 7) { + stop( + "Argument `type` not supported in Arrow. To control the quantile ", + "interpolation algorithm, set argument `interpolation` to one of: ", + "\"linear\" (the default), \"lower\", \"higher\", \"nearest\", or ", + "\"midpoint\".", + call. = FALSE + ) + } + interpolation <- QuantileInterpolation[[toupper(match.arg(interpolation))]] + out <- call_function("quantile", x, options = list(q = probs, interpolation = interpolation)) + if (length(out) == 0) { + # When there are no non-missing values in the data, the Arrow quantile + # function returns an empty Array, but for consistency with the R quantile + # function, we want an Array of NA_real_ with the same length as probs + out <- Array$create(rep(NA_real_, length(probs))) + } + out +} + +#' @export +median.ArrowDatum <- function(x, na.rm = FALSE, ...) { + if (!na.rm && x$null_count > 0) { + Scalar$create(NA_real_) + } else { + Scalar$create(quantile(x, probs = 0.5, na.rm = TRUE, ...)) + } +} + +#' @export +unique.ArrowDatum <- function(x, incomparables = FALSE, ...) 
{
+  call_function("unique", x)
+}
+
+#' @export
+any.ArrowDatum <- function(..., na.rm = FALSE) {
+  scalar_aggregate("any", ..., na.rm = na.rm)
+}
+
+#' @export
+all.ArrowDatum <- function(..., na.rm = FALSE) {
+  scalar_aggregate("all", ..., na.rm = na.rm)
+}
+
+#' `match` and `%in%` for Arrow objects
+#'
+#' `base::match()` is not a generic, so we can't just define Arrow methods for
+#' it. These functions expose the analogous functions in the Arrow C++ library.
+#'
+#' @param x `Scalar`, `Array` or `ChunkedArray`
+#' @param table `Scalar`, `Array`, `ChunkedArray`, or R vector lookup table.
+#' @param ... additional arguments, ignored
+#' @return `match_arrow()` returns an `int32`-type Arrow object of the same
+#' length and class as `x`, containing the (0-based) indexes into `table`.
+#' `is_in()` returns a `boolean`-type Arrow object of the same length and class
+#' as `x`, indicating for each element of `x` whether it is present in `table`.
+#' @examplesIf arrow_available()
+#' # note that the returned value is 0-indexed
+#' cars_tbl <- arrow_table(name = rownames(mtcars), mtcars)
+#' match_arrow(Scalar$create("Mazda RX4 Wag"), cars_tbl$name)
+#'
+#' is_in(Array$create("Mazda RX4 Wag"), cars_tbl$name)
+#'
+#' # Although there are multiple matches, only the index of the first match
+#' # is returned, as with the base R equivalent
+#' match(4, mtcars$cyl) # 1-indexed
+#' match_arrow(Scalar$create(4), cars_tbl$cyl) # 0-indexed
+#'
+#' # If `x` contains multiple values, you are returned the indices of the first
+#' # match for each value.
+#' match(c(4, 6, 8), mtcars$cyl)
+#' match_arrow(Array$create(c(4, 6, 8)), cars_tbl$cyl)
+#'
+#' # Return type matches type of `x`
+#' is_in(c(4, 6, 8), mtcars$cyl) # returns vector
+#' is_in(Scalar$create(4), mtcars$cyl) # returns Scalar
+#' is_in(Array$create(c(4, 6, 8)), cars_tbl$cyl) # returns Array
+#' is_in(ChunkedArray$create(c(4, 6), 8), cars_tbl$cyl) # returns ChunkedArray
+#' @export
+match_arrow <- function(x, table, ...) {
+  if (!inherits(x, "ArrowDatum")) {
+    x <- Array$create(x)
+  }
+
+  if (!inherits(table, c("Array", "ChunkedArray"))) {
+    table <- Array$create(table)
+  }
+  call_function("index_in_meta_binary", x, table)
+}
+
+#' @rdname match_arrow
+#' @export
+is_in <- function(x, table, ...) {
+  if (!inherits(x, "ArrowDatum")) {
+    x <- Array$create(x)
+  }
+
+  if (!inherits(table, c("Array", "DictionaryArray", "ChunkedArray"))) {
+    table <- Array$create(table)
+  }
+  call_function("is_in_meta_binary", x, table)
+}
+
+#' `table` for Arrow objects
+#'
+#' This function tabulates the values in the array and returns a table of counts.
+#' @param x `Array` or `ChunkedArray`
+#' @return A `StructArray` containing "values" (same type as `x`) and "counts"
+#' (`Int64`).
+#' @examplesIf arrow_available()
+#' cyl_vals <- Array$create(mtcars$cyl)
+#' counts <- value_counts(cyl_vals)
+#' @export
+value_counts <- function(x) {
+  call_function("value_counts", x)
+}
+
+#' Cast options
+#'
+#' @param safe logical: enforce safe conversion? Default `TRUE`
+#' @param ... additional cast options, such as `allow_int_overflow`,
+#' `allow_time_truncate`, and `allow_float_truncate`, which are set to `!safe`
+#' by default
+#' @return A list
+#' @export
+#' @keywords internal
+cast_options <- function(safe = TRUE, ...)
{ + opts <- list( + allow_int_overflow = !safe, + allow_time_truncate = !safe, + allow_float_truncate = !safe + ) + modifyList(opts, list(...)) +} diff --git a/src/arrow/r/R/config.R b/src/arrow/r/R/config.R new file mode 100644 index 000000000..af07ad9a9 --- /dev/null +++ b/src/arrow/r/R/config.R @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Manage the global CPU thread pool in libarrow +#' +#' @export +cpu_count <- function() { + GetCpuThreadPoolCapacity() +} + +#' @rdname cpu_count +#' @param num_threads integer: New number of threads for thread pool +#' @export +set_cpu_count <- function(num_threads) { + SetCpuThreadPoolCapacity(as.integer(num_threads)) +} + +#' Manage the global I/O thread pool in libarrow +#' +#' @export +io_thread_count <- function() { + GetIOThreadPoolCapacity() +} + +#' @rdname io_thread_count +#' @param num_threads integer: New number of threads for thread pool +#' @export +set_io_thread_count <- function(num_threads) { + SetIOThreadPoolCapacity(as.integer(num_threads)) +} diff --git a/src/arrow/r/R/csv.R b/src/arrow/r/R/csv.R new file mode 100644 index 000000000..ee890578f --- /dev/null +++ b/src/arrow/r/R/csv.R @@ -0,0 +1,644 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Read a CSV or other delimited file with Arrow +#' +#' These functions uses the Arrow C++ CSV reader to read into a `data.frame`. +#' Arrow C++ options have been mapped to argument names that follow those of +#' `readr::read_delim()`, and `col_select` was inspired by `vroom::vroom()`. +#' +#' `read_csv_arrow()` and `read_tsv_arrow()` are wrappers around +#' `read_delim_arrow()` that specify a delimiter. +#' +#' Note that not all `readr` options are currently implemented here. Please file +#' an issue if you encounter one that `arrow` should support. 
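+#'
+#' For example, assuming a comma-delimited file `"data.csv"` with a header row
+#' and with empty strings and `"NA"` marking missing values, a `readr`-style
+#' call maps onto the Arrow C++ reader options described below:
+#'
+#' ```
+#' df <- read_csv_arrow("data.csv", na = c("", "NA"))
+#' ```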
+#'
+#' If you need to control Arrow-specific reader parameters that don't have an
+#' equivalent in `readr::read_csv()`, you can either provide them in the
+#' `parse_options`, `convert_options`, or `read_options` arguments, or you can
+#' use [CsvTableReader] directly for lower-level access.
+#'
+#' @section Specifying column types and names:
+#'
+#' By default, the CSV reader will infer the column names and data types from
+#' the file, but there are a few ways you can specify them directly.
+#'
+#' One way is to provide an Arrow [Schema] in the `schema` argument,
+#' which is an ordered map of column name to type.
+#' When provided, it satisfies both the `col_names` and `col_types` arguments.
+#' This is useful if you know all of this information up front.
+#'
+#' You can also pass a `Schema` to the `col_types` argument. If you do this,
+#' column names will still be inferred from the file unless you also specify
+#' `col_names`. In either case, the column names in the `Schema` must match the
+#' data's column names, whether they are explicitly provided or inferred. That
+#' said, this `Schema` does not have to reference all columns: those omitted
+#' will have their types inferred.
+#'
+#' Alternatively, you can declare column types by providing the compact string
+#' representation that `readr` uses to the `col_types` argument. This means you
+#' provide a single string, one character per column, where the characters map
+#' to Arrow types analogously to the `readr` type mapping:
+#'
+#' * "c": `utf8()`
+#' * "i": `int32()`
+#' * "n": `float64()`
+#' * "d": `float64()`
+#' * "l": `bool()`
+#' * "f": `dictionary()`
+#' * "D": `date32()`
+#' * "T": `timestamp()`
+#' * "t": `time32()`
+#' * "_": `null()`
+#' * "-": `null()`
+#' * "?": infer the type from the data
+#'
+#' If you use the compact string representation for `col_types`, you must also
+#' specify `col_names`.
+#'
+#' Regardless of how types are specified, all columns with a `null()` type will
+#' be dropped.
+#'
+#' Note that if you are specifying column names, whether by `schema` or
+#' `col_names`, and the CSV file has a header row that would otherwise be used
+#' to identify column names, you'll need to add `skip = 1` to skip that row.
+#'
+#' @param file A character file name or URI, `raw` vector, an Arrow input stream,
+#' or a `FileSystem` with path (`SubTreeFileSystem`).
+#' If a file name, a memory-mapped Arrow [InputStream] will be opened and
+#' closed when finished; compression will be detected from the file extension
+#' and handled automatically. If an input stream is provided, it will be left
+#' open.
+#' @param delim Single character used to separate fields within a record.
+#' @param quote Single character used to quote strings.
+#' @param escape_double Does the file escape quotes by doubling them?
+#' i.e. if this option is `TRUE`, the value `""""` represents
+#' a single quote character, `\"`.
+#' @param escape_backslash Does the file use backslashes to escape special
+#' characters? This is more general than `escape_double` as backslashes
+#' can be used to escape the delimiter character, the quote character, or
+#' to add special characters like `\\n`.
+#' @param schema [Schema] that describes the table. If provided, it will be
+#' used to satisfy both `col_names` and `col_types`.
+#' @param col_names If `TRUE`, the first row of the input will be used as the
+#' column names and will not be included in the data frame. If `FALSE`, column
+#' names will be generated by Arrow, starting with "f0", "f1", ..., "fN".
+#' Alternatively, you can specify a character vector of column names.
+#' @param col_types A compact string representation of the column types, or
+#' `NULL` (the default) to infer types from the data.
+#' @param col_select A character vector of column names to keep, as in the
+#' "select" argument to `data.table::fread()`, or a
+#' [tidy selection specification][tidyselect::vars_select()]
+#' of columns, as used in `dplyr::select()`.
+#' @param na A character vector of strings to interpret as missing values.
+#' @param quoted_na Should missing values inside quotes be treated as missing
+#' values (the default) or strings? (Note that this is different from the
+#' Arrow C++ default for the corresponding convert option,
+#' `strings_can_be_null`.)
+#' @param skip_empty_rows Should blank rows be ignored altogether? If
+#' `TRUE`, blank rows will not be represented at all. If `FALSE`, they will be
+#' filled with missing values.
+#' @param skip Number of lines to skip before reading data.
+#' @param timestamp_parsers User-defined timestamp parsers. If more than one
+#' parser is specified, the CSV conversion logic will try parsing values
+#' starting from the beginning of this vector. Possible values are:
+#' - `NULL`: the default, which uses the ISO-8601 parser
+#' - a character vector of [strptime][base::strptime()] parse strings
+#' - a list of [TimestampParser] objects
+#' @param parse_options see [file reader options][CsvReadOptions].
+#' If given, this overrides any
+#' parsing options provided in other arguments (e.g. `delim`, `quote`, etc.).
+#' @param convert_options see [file reader options][CsvReadOptions]
+#' @param read_options see [file reader options][CsvReadOptions]
+#' @param as_data_frame Should the function return a `data.frame` (default) or
+#' an Arrow [Table]?
+#'
+#' @return A `data.frame`, or a Table if `as_data_frame = FALSE`.
+#' @export +#' @examplesIf arrow_available() +#' tf <- tempfile() +#' on.exit(unlink(tf)) +#' write.csv(mtcars, file = tf) +#' df <- read_csv_arrow(tf) +#' dim(df) +#' # Can select columns +#' df <- read_csv_arrow(tf, col_select = starts_with("d")) +read_delim_arrow <- function(file, + delim = ",", + quote = '"', + escape_double = TRUE, + escape_backslash = FALSE, + schema = NULL, + col_names = TRUE, + col_types = NULL, + col_select = NULL, + na = c("", "NA"), + quoted_na = TRUE, + skip_empty_rows = TRUE, + skip = 0L, + parse_options = NULL, + convert_options = NULL, + read_options = NULL, + as_data_frame = TRUE, + timestamp_parsers = NULL) { + if (inherits(schema, "Schema")) { + col_names <- names(schema) + col_types <- schema + } + if (is.null(parse_options)) { + parse_options <- readr_to_csv_parse_options( + delim, + quote, + escape_double, + escape_backslash, + skip_empty_rows + ) + } + if (is.null(read_options)) { + read_options <- readr_to_csv_read_options(skip, col_names) + } + if (is.null(convert_options)) { + convert_options <- readr_to_csv_convert_options( + na, + quoted_na, + col_types = col_types, + col_names = read_options$column_names, + timestamp_parsers = timestamp_parsers + ) + } + + if (!inherits(file, "InputStream")) { + file <- make_readable_file(file) + on.exit(file$close()) + } + reader <- CsvTableReader$create( + file, + read_options = read_options, + parse_options = parse_options, + convert_options = convert_options + ) + + tab <- reader$Read() + + # TODO: move this into convert_options using include_columns + col_select <- enquo(col_select) + if (!quo_is_null(col_select)) { + tab <- tab[vars_select(names(tab), !!col_select)] + } + + if (isTRUE(as_data_frame)) { + tab <- as.data.frame(tab) + } + + tab +} + +#' @rdname read_delim_arrow +#' @export +read_csv_arrow <- function(file, + quote = '"', + escape_double = TRUE, + escape_backslash = FALSE, + schema = NULL, + col_names = TRUE, + col_types = NULL, + col_select = NULL, + na = c("", "NA"), + quoted_na = TRUE, + skip_empty_rows = TRUE, + skip = 0L, + parse_options = NULL, + convert_options = NULL, + read_options = NULL, + as_data_frame = TRUE, + timestamp_parsers = NULL) { + mc <- match.call() + mc$delim <- "," + mc[[1]] <- get("read_delim_arrow", envir = asNamespace("arrow")) + eval.parent(mc) +} + +#' @rdname read_delim_arrow +#' @export +read_tsv_arrow <- function(file, + quote = '"', + escape_double = TRUE, + escape_backslash = FALSE, + schema = NULL, + col_names = TRUE, + col_types = NULL, + col_select = NULL, + na = c("", "NA"), + quoted_na = TRUE, + skip_empty_rows = TRUE, + skip = 0L, + parse_options = NULL, + convert_options = NULL, + read_options = NULL, + as_data_frame = TRUE, + timestamp_parsers = NULL) { + mc <- match.call() + mc$delim <- "\t" + mc[[1]] <- get("read_delim_arrow", envir = asNamespace("arrow")) + eval.parent(mc) +} + +#' @title Arrow CSV and JSON table reader classes +#' @rdname CsvTableReader +#' @name CsvTableReader +#' @docType class +#' @usage NULL +#' @format NULL +#' @description `CsvTableReader` and `JsonTableReader` wrap the Arrow C++ CSV +#' and JSON table readers. See their usage in [read_csv_arrow()] and +#' [read_json_arrow()], respectively. +#' +#' @section Factory: +#' +#' The `CsvTableReader$create()` and `JsonTableReader$create()` factory methods +#' take the following arguments: +#' +#' - `file` An Arrow [InputStream] +#' - `convert_options` (CSV only), `parse_options`, `read_options`: see +#' [CsvReadOptions] +#' - `...` additional parameters. 
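+#'
+#' For example, assuming `tf` is the path to an existing CSV file, a reader can
+#' be constructed and evaluated directly:
+#'
+#' ```
+#' reader <- CsvTableReader$create(mmap_open(tf))
+#' tab <- reader$Read()
+#' ```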
+#' +#' @section Methods: +#' +#' - `$Read()`: returns an Arrow Table. +#' +#' @include arrow-package.R +#' @export +CsvTableReader <- R6Class("CsvTableReader", + inherit = ArrowObject, + public = list( + Read = function() csv___TableReader__Read(self) + ) +) +CsvTableReader$create <- function(file, + read_options = CsvReadOptions$create(), + parse_options = CsvParseOptions$create(), + convert_options = CsvConvertOptions$create(), + ...) { + assert_is(file, "InputStream") + csv___TableReader__Make(file, read_options, parse_options, convert_options) +} + +#' @title File reader options +#' @rdname CsvReadOptions +#' @name CsvReadOptions +#' @docType class +#' @usage NULL +#' @format NULL +#' @description `CsvReadOptions`, `CsvParseOptions`, `CsvConvertOptions`, +#' `JsonReadOptions`, `JsonParseOptions`, and `TimestampParser` are containers for various +#' file reading options. See their usage in [read_csv_arrow()] and +#' [read_json_arrow()], respectively. +#' +#' @section Factory: +#' +#' The `CsvReadOptions$create()` and `JsonReadOptions$create()` factory methods +#' take the following arguments: +#' +#' - `use_threads` Whether to use the global CPU thread pool +#' - `block_size` Block size we request from the IO layer; also determines +#' the size of chunks when use_threads is `TRUE`. NB: if `FALSE`, JSON input +#' must end with an empty line. +#' +#' `CsvReadOptions$create()` further accepts these additional arguments: +#' +#' - `skip_rows` Number of lines to skip before reading data (default 0) +#' - `column_names` Character vector to supply column names. If length-0 +#' (the default), the first non-skipped row will be parsed to generate column +#' names, unless `autogenerate_column_names` is `TRUE`. +#' - `autogenerate_column_names` Logical: generate column names instead of +#' using the first non-skipped row (the default)? If `TRUE`, column names will +#' be "f0", "f1", ..., "fN". +#' +#' `CsvParseOptions$create()` takes the following arguments: +#' +#' - `delimiter` Field delimiting character (default `","`) +#' - `quoting` Logical: are strings quoted? (default `TRUE`) +#' - `quote_char` Quoting character, if `quoting` is `TRUE` +#' - `double_quote` Logical: are quotes inside values double-quoted? (default `TRUE`) +#' - `escaping` Logical: whether escaping is used (default `FALSE`) +#' - `escape_char` Escaping character, if `escaping` is `TRUE` +#' - `newlines_in_values` Logical: are values allowed to contain CR (`0x0d`) +#' and LF (`0x0a`) characters? (default `FALSE`) +#' - `ignore_empty_lines` Logical: should empty lines be ignored (default) or +#' generate a row of missing values (if `FALSE`)? +#' +#' `JsonParseOptions$create()` accepts only the `newlines_in_values` argument. +#' +#' `CsvConvertOptions$create()` takes the following arguments: +#' +#' - `check_utf8` Logical: check UTF8 validity of string columns? (default `TRUE`) +#' - `null_values` character vector of recognized spellings for null values. +#' Analogous to the `na.strings` argument to +#' [`read.csv()`][utils::read.csv()] or `na` in `readr::read_csv()`. +#' - `strings_can_be_null` Logical: can string / binary columns have +#' null values? Similar to the `quoted_na` argument to `readr::read_csv()`. 
+#' (default `FALSE`) +#' - `true_values` character vector of recognized spellings for `TRUE` values +#' - `false_values` character vector of recognized spellings for `FALSE` values +#' - `col_types` A `Schema` or `NULL` to infer types +#' - `auto_dict_encode` Logical: Whether to try to automatically +#' dictionary-encode string / binary data (think `stringsAsFactors`). Default `FALSE`. +#' This setting is ignored for non-inferred columns (those in `col_types`). +#' - `auto_dict_max_cardinality` If `auto_dict_encode`, string/binary columns +#' are dictionary-encoded up to this number of unique values (default 50), +#' after which it switches to regular encoding. +#' - `include_columns` If non-empty, indicates the names of columns from the +#' CSV file that should be actually read and converted (in the vector's order). +#' - `include_missing_columns` Logical: if `include_columns` is provided, should +#' columns named in it but not found in the data be included as a column of +#' type `null()`? The default (`FALSE`) means that the reader will instead +#' raise an error. +#' - `timestamp_parsers` User-defined timestamp parsers. If more than one +#' parser is specified, the CSV conversion logic will try parsing values +#' starting from the beginning of this vector. Possible values are +#' (a) `NULL`, the default, which uses the ISO-8601 parser; +#' (b) a character vector of [strptime][base::strptime()] parse strings; or +#' (c) a list of [TimestampParser] objects. +#' +#' `TimestampParser$create()` takes an optional `format` string argument. +#' See [`strptime()`][base::strptime()] for example syntax. +#' The default is to use an ISO-8601 format parser. +#' +#' The `CsvWriteOptions$create()` factory method takes the following arguments: +#' - `include_header` Whether to write an initial header line with column names +#' - `batch_size` Maximum number of rows processed at a time. Default is 1024. 
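+#'
+#' For example, assuming `tf` is the path to a semicolon-delimited file, the
+#' options objects can be built directly and passed to the reader:
+#'
+#' ```
+#' parse_opts <- CsvParseOptions$create(delimiter = ";")
+#' df <- read_delim_arrow(tf, parse_options = parse_opts)
+#' ```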
+#' +#' @section Active bindings: +#' +#' - `column_names`: from `CsvReadOptions` +#' +#' @export +CsvReadOptions <- R6Class("CsvReadOptions", + inherit = ArrowObject, + active = list( + column_names = function() csv___ReadOptions__column_names(self) + ) +) +CsvReadOptions$create <- function(use_threads = option_use_threads(), + block_size = 1048576L, + skip_rows = 0L, + column_names = character(0), + autogenerate_column_names = FALSE) { + csv___ReadOptions__initialize( + list( + use_threads = use_threads, + block_size = block_size, + skip_rows = skip_rows, + column_names = column_names, + autogenerate_column_names = autogenerate_column_names + ) + ) +} + +#' @rdname CsvReadOptions +#' @export +CsvWriteOptions <- R6Class("CsvWriteOptions", inherit = ArrowObject) +CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L) { + assert_that(is_integerish(batch_size, n = 1, finite = TRUE), batch_size > 0) + csv___WriteOptions__initialize( + list( + include_header = include_header, + batch_size = as.integer(batch_size) + ) + ) +} + +readr_to_csv_read_options <- function(skip, col_names, col_types) { + if (isTRUE(col_names)) { + # C++ default to parse is 0-length string array + col_names <- character(0) + } + if (identical(col_names, FALSE)) { + CsvReadOptions$create(skip_rows = skip, autogenerate_column_names = TRUE) + } else { + CsvReadOptions$create(skip_rows = skip, column_names = col_names) + } +} + +#' @rdname CsvReadOptions +#' @usage NULL +#' @format NULL +#' @docType class +#' @export +CsvParseOptions <- R6Class("CsvParseOptions", inherit = ArrowObject) +CsvParseOptions$create <- function(delimiter = ",", + quoting = TRUE, + quote_char = '"', + double_quote = TRUE, + escaping = FALSE, + escape_char = "\\", + newlines_in_values = FALSE, + ignore_empty_lines = TRUE) { + csv___ParseOptions__initialize( + list( + delimiter = delimiter, + quoting = quoting, + quote_char = quote_char, + double_quote = double_quote, + escaping = escaping, + escape_char = escape_char, + newlines_in_values = newlines_in_values, + ignore_empty_lines = ignore_empty_lines + ) + ) +} + +readr_to_csv_parse_options <- function(delim = ",", + quote = '"', + escape_double = TRUE, + escape_backslash = FALSE, + skip_empty_rows = TRUE) { + # This function translates from the readr argument list to the arrow arg names + # TODO: validate inputs + CsvParseOptions$create( + delimiter = delim, + quoting = nzchar(quote), + quote_char = quote, + double_quote = escape_double, + escaping = escape_backslash, + escape_char = "\\", + newlines_in_values = escape_backslash, + ignore_empty_lines = skip_empty_rows + ) +} + +#' @rdname CsvReadOptions +#' @usage NULL +#' @format NULL +#' @docType class +#' @export +TimestampParser <- R6Class("TimestampParser", + inherit = ArrowObject, + public = list( + kind = function() TimestampParser__kind(self), + format = function() TimestampParser__format(self) + ) +) +TimestampParser$create <- function(format = NULL) { + if (is.null(format)) { + TimestampParser__MakeISO8601() + } else { + TimestampParser__MakeStrptime(format) + } +} + +#' @rdname CsvReadOptions +#' @usage NULL +#' @format NULL +#' @docType class +#' @export +CsvConvertOptions <- R6Class("CsvConvertOptions", inherit = ArrowObject) +CsvConvertOptions$create <- function(check_utf8 = TRUE, + null_values = c("", "NA"), + true_values = c("T", "true", "TRUE"), + false_values = c("F", "false", "FALSE"), + strings_can_be_null = FALSE, + col_types = NULL, + auto_dict_encode = FALSE, + auto_dict_max_cardinality = 50L, + 
include_columns = character(), + include_missing_columns = FALSE, + timestamp_parsers = NULL) { + if (!is.null(col_types) && !inherits(col_types, "Schema")) { + abort(c( + "Unsupported `col_types` specification.", + i = "`col_types` must be NULL, or a <Schema>." + )) + } + + csv___ConvertOptions__initialize( + list( + check_utf8 = check_utf8, + null_values = null_values, + strings_can_be_null = strings_can_be_null, + col_types = col_types, + true_values = true_values, + false_values = false_values, + auto_dict_encode = auto_dict_encode, + auto_dict_max_cardinality = auto_dict_max_cardinality, + include_columns = include_columns, + include_missing_columns = include_missing_columns, + timestamp_parsers = timestamp_parsers + ) + ) +} + +readr_to_csv_convert_options <- function(na, + quoted_na, + col_types = NULL, + col_names = NULL, + timestamp_parsers = NULL) { + include_columns <- character() + + if (is.character(col_types)) { + if (length(col_types) != 1L) { + abort("`col_types` is a character vector that is not of size 1") + } + n <- nchar(col_types) + specs <- substring(col_types, seq_len(n), seq_len(n)) + if (!is_bare_character(col_names, n)) { + abort("Compact specification for `col_types` requires `col_names`") + } + + col_types <- set_names(nm = col_names, map2(specs, col_names, ~ { + switch(.x, + "c" = utf8(), + "i" = int32(), + "n" = float64(), + "d" = float64(), + "l" = bool(), + "f" = dictionary(), + "D" = date32(), + "T" = timestamp(), + "t" = time32(), + "_" = null(), + "-" = null(), + "?" = NULL, + abort("Unsupported compact specification: '", .x, "' for column '", .y, "'") + ) + })) + # To "guess" types, omit them from col_types + col_types <- keep(col_types, ~ !is.null(.x)) + col_types <- schema(!!!col_types) + } + + if (!is.null(col_types)) { + assert_is(col_types, "Schema") + # If any columns are null(), drop them + # (by specifying the other columns in include_columns) + nulls <- map_lgl(col_types$fields, ~ .$type$Equals(null())) + if (any(nulls)) { + include_columns <- setdiff(col_names, names(col_types)[nulls]) + } + } + CsvConvertOptions$create( + null_values = na, + strings_can_be_null = quoted_na, + col_types = col_types, + timestamp_parsers = timestamp_parsers, + include_columns = include_columns + ) +} + +#' Write CSV file to disk +#' +#' @param x `data.frame`, [RecordBatch], or [Table] +#' @param sink A string file path, URI, or [OutputStream], or path in a file +#' system (`SubTreeFileSystem`) +#' @param include_header Whether to write an initial header line with column names +#' @param batch_size Maximum number of rows processed at a time. Default is 1024. +#' +#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream], +#' the stream will be left open. 
+#' @export +#' @examplesIf arrow_available() +#' tf <- tempfile() +#' on.exit(unlink(tf)) +#' write_csv_arrow(mtcars, tf) +#' @include arrow-package.R +write_csv_arrow <- function(x, + sink, + include_header = TRUE, + batch_size = 1024L) { + write_options <- CsvWriteOptions$create(include_header, batch_size) + + x_out <- x + if (is.data.frame(x)) { + x <- Table$create(x) + } + + assert_that(is_writable_table(x)) + + if (!inherits(sink, "OutputStream")) { + sink <- make_output_stream(sink) + on.exit(sink$close()) + } + + if (inherits(x, "RecordBatch")) { + csv___WriteCSV__RecordBatch(x, write_options, sink) + } else if (inherits(x, "Table")) { + csv___WriteCSV__Table(x, write_options, sink) + } + + invisible(x_out) +} diff --git a/src/arrow/r/R/dataset-factory.R b/src/arrow/r/R/dataset-factory.R new file mode 100644 index 000000000..c56a6b181 --- /dev/null +++ b/src/arrow/r/R/dataset-factory.R @@ -0,0 +1,170 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include dataset.R + +#' @usage NULL +#' @format NULL +#' @rdname Dataset +#' @export +DatasetFactory <- R6Class("DatasetFactory", + inherit = ArrowObject, + public = list( + Finish = function(schema = NULL, unify_schemas = FALSE) { + if (is.null(schema)) { + dataset___DatasetFactory__Finish1(self, unify_schemas) + } else { + assert_is(schema, "Schema") + dataset___DatasetFactory__Finish2(self, schema) + } + }, + Inspect = function(unify_schemas = FALSE) { + dataset___DatasetFactory__Inspect(self, unify_schemas) + } + ) +) +DatasetFactory$create <- function(x, + filesystem = NULL, + format = c("parquet", "arrow", "ipc", "feather", "csv", "tsv", "text"), + partitioning = NULL, + ...) { + if (is_list_of(x, "DatasetFactory")) { + return(dataset___UnionDatasetFactory__Make(x)) + } + + if (is.character(format)) { + format <- FileFormat$create(match.arg(format), ...) 
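+    # (format is now a FileFormat object; any format-specific options
+    # were passed through via ...)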
} else {
+    assert_is(format, "FileFormat")
+  }
+
+  path_and_fs <- get_paths_and_filesystem(x, filesystem)
+  info <- path_and_fs$fs$GetFileInfo(path_and_fs$path)
+
+  if (length(info) > 1 || info[[1]]$type == FileType$File) {
+    # x looks like a vector of one or more file paths (not a directory path)
+    return(FileSystemDatasetFactory$create(path_and_fs$fs, NULL, path_and_fs$path, format))
+  }
+
+  if (!is.null(partitioning)) {
+    if (inherits(partitioning, "Schema")) {
+      partitioning <- DirectoryPartitioning$create(partitioning)
+    } else if (is.character(partitioning)) {
+      # These are the column/field names, and we should autodetect their types
+      partitioning <- DirectoryPartitioningFactory$create(partitioning)
+    }
+  }
+
+  selector <- FileSelector$create(path_and_fs$path, allow_not_found = FALSE, recursive = TRUE)
+
+  FileSystemDatasetFactory$create(path_and_fs$fs, selector, NULL, format, partitioning)
+}
+
+#' Create a DatasetFactory
+#'
+#' A [Dataset] can be constructed using one or more [DatasetFactory]s.
+#' This function helps you construct a `DatasetFactory` that you can pass to
+#' [open_dataset()].
+#'
+#' If you only need a single `DatasetFactory` (for example, you have a
+#' single directory containing Parquet files), you can call `open_dataset()`
+#' directly. Use `dataset_factory()` when you
+#' want to combine different directories, file systems, or file formats.
+#'
+#' @param x A string path to a directory containing data files, a vector of
+#' one or more string paths to data files, or a list of `DatasetFactory` objects
+#' whose datasets should be combined. If a list of `DatasetFactory` objects is
+#' given, it will be used to construct a `UnionDatasetFactory` and other
+#' arguments will be ignored.
+#' @param filesystem A [FileSystem] object; if omitted, the `FileSystem` will
+#' be detected from `x`
+#' @param format A [FileFormat] object, or a string identifier of the format of
+#' the files in `x`. Currently supported values:
+#' * "parquet"
+#' * "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
+#' only version 2 files are supported
+#' * "csv"/"text", aliases for the same thing (because comma is the default
+#' delimiter for text files)
+#' * "tsv", equivalent to passing `format = "text", delimiter = "\t"`
+#'
+#' Default is "parquet", unless a `delimiter` is also specified, in which case
+#' it is assumed to be "text".
+#' @param partitioning One of
+#' * A `Schema`, in which case the file paths relative to `sources` will be
+#' parsed, and path segments will be matched with the schema fields. For
+#' example, `schema(year = int16(), month = int8())` would create partitions
+#' for file paths like "2019/01/file.parquet", "2019/02/file.parquet", etc.
+#' * A character vector that defines the field names corresponding to those
+#' path segments (that is, you're providing the names that would correspond
+#' to a `Schema` but the types will be autodetected)
+#' * A `HivePartitioning` or `HivePartitioningFactory`, as returned
+#' by [hive_partition()] which parses explicit or autodetected fields from
+#' Hive-style path segments
+#' * `NULL` for no partitioning
+#' @param ... Additional format-specific options, passed to
+#' `FileFormat$create()`. For CSV options, note that you can specify them either
+#' with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
+#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.).
+#' Not all `readr` options are currently supported; please file an issue if you +#' encounter one that `arrow` should support. +#' @return A `DatasetFactory` object. Pass this to [open_dataset()], +#' in a list potentially with other `DatasetFactory` objects, to create +#' a `Dataset`. +#' @export +dataset_factory <- DatasetFactory$create + +#' @usage NULL +#' @format NULL +#' @rdname Dataset +#' @export +FileSystemDatasetFactory <- R6Class("FileSystemDatasetFactory", + inherit = DatasetFactory +) +FileSystemDatasetFactory$create <- function(filesystem, + selector = NULL, + paths = NULL, + format, + partitioning = NULL) { + assert_is(filesystem, "FileSystem") + is.null(selector) || assert_is(selector, "FileSelector") + is.null(paths) || assert_is(paths, "character") + assert_that( + xor(is.null(selector), is.null(paths)), + msg = "Either selector or paths must be specified" + ) + assert_is(format, "FileFormat") + if (!is.null(paths)) { + assert_that(is.null(partitioning), msg = "Partitioning not supported with paths") + } + + if (!is.null(paths)) { + ptr <- dataset___FileSystemDatasetFactory__Make0(filesystem, paths, format) + } else if (is.null(partitioning)) { + ptr <- dataset___FileSystemDatasetFactory__Make1(filesystem, selector, format) + } else if (inherits(partitioning, "PartitioningFactory")) { + ptr <- dataset___FileSystemDatasetFactory__Make3(filesystem, selector, format, partitioning) + } else if (inherits(partitioning, "Partitioning")) { + ptr <- dataset___FileSystemDatasetFactory__Make2(filesystem, selector, format, partitioning) + } else { + stop( + "Expected 'partitioning' to be NULL, PartitioningFactory or Partitioning", + call. = FALSE + ) + } + + ptr +} diff --git a/src/arrow/r/R/dataset-format.R b/src/arrow/r/R/dataset-format.R new file mode 100644 index 000000000..b0b93219e --- /dev/null +++ b/src/arrow/r/R/dataset-format.R @@ -0,0 +1,353 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Dataset file formats +#' +#' @description +#' A `FileFormat` holds information about how to read and parse the files +#' included in a `Dataset`. There are subclasses corresponding to the supported +#' file formats (`ParquetFileFormat` and `IpcFileFormat`). +#' +#' @section Factory: +#' `FileFormat$create()` takes the following arguments: +#' * `format`: A string identifier of the file format. 
Currently supported values: +#' * "parquet" +#' * "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that +#' only version 2 files are supported +#' * "csv"/"text", aliases for the same thing (because comma is the default +#' delimiter for text files +#' * "tsv", equivalent to passing `format = "text", delimiter = "\t"` +#' * `...`: Additional format-specific options +#' +#' `format = "parquet"``: +#' * `dict_columns`: Names of columns which should be read as dictionaries. +#' * Any Parquet options from [FragmentScanOptions]. +#' +#' `format = "text"`: see [CsvParseOptions]. Note that you can specify them either +#' with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the +#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.). +#' Not all `readr` options are currently supported; please file an issue if +#' you encounter one that `arrow` should support. Also, the following options are +#' supported. From [CsvReadOptions]: +#' * `skip_rows` +#' * `column_names` +#' * `autogenerate_column_names` +#' From [CsvFragmentScanOptions] (these values can be overridden at scan time): +#' * `convert_options`: a [CsvConvertOptions] +#' * `block_size` +#' +#' It returns the appropriate subclass of `FileFormat` (e.g. `ParquetFileFormat`) +#' @rdname FileFormat +#' @name FileFormat +#' @examplesIf arrow_with_dataset() && tolower(Sys.info()[["sysname"]]) != "windows" +#' ## Semi-colon delimited files +#' # Set up directory for examples +#' tf <- tempfile() +#' dir.create(tf) +#' on.exit(unlink(tf)) +#' write.table(mtcars, file.path(tf, "file1.txt"), sep = ";", row.names = FALSE) +#' +#' # Create FileFormat object +#' format <- FileFormat$create(format = "text", delimiter = ";") +#' +#' open_dataset(tf, format = format) +#' @export +FileFormat <- R6Class("FileFormat", + inherit = ArrowObject, + active = list( + # @description + # Return the `FileFormat`'s type + type = function() dataset___FileFormat__type_name(self) + ) +) +FileFormat$create <- function(format, schema = NULL, ...) { + opt_names <- names(list(...)) + if (format %in% c("csv", "text") || any(opt_names %in% c("delim", "delimiter"))) { + CsvFileFormat$create(schema = schema, ...) + } else if (format == c("tsv")) { + CsvFileFormat$create(delimiter = "\t", schema = schema, ...) + } else if (format == "parquet") { + ParquetFileFormat$create(...) + } else if (format %in% c("ipc", "arrow", "feather")) { # These are aliases for the same thing + dataset___IpcFileFormat__Make() + } else { + stop("Unsupported file format: ", format, call. = FALSE) + } +} + +#' @export +as.character.FileFormat <- function(x, ...) { + out <- x$type + # Slight hack: special case IPC -> feather, otherwise is just the type_name + ifelse(out == "ipc", "feather", out) +} + +#' @usage NULL +#' @format NULL +#' @rdname FileFormat +#' @export +ParquetFileFormat <- R6Class("ParquetFileFormat", inherit = FileFormat) +ParquetFileFormat$create <- function(..., + dict_columns = character(0)) { + options <- ParquetFragmentScanOptions$create(...) 
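+  # arguments in ... (e.g. use_buffered_stream, buffer_size, pre_buffer) are
+  # bundled into the fragment scan options used by this format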
+ dataset___ParquetFileFormat__Make(options, dict_columns) +} + +#' @usage NULL +#' @format NULL +#' @rdname FileFormat +#' @export +IpcFileFormat <- R6Class("IpcFileFormat", inherit = FileFormat) + +#' @usage NULL +#' @format NULL +#' @rdname FileFormat +#' @export +CsvFileFormat <- R6Class("CsvFileFormat", inherit = FileFormat) +CsvFileFormat$create <- function(..., + opts = csv_file_format_parse_options(...), + convert_options = csv_file_format_convert_opts(...), + read_options = csv_file_format_read_opts(...)) { + dataset___CsvFileFormat__Make(opts, convert_options, read_options) +} + +# Support both readr-style option names and Arrow C++ option names +csv_file_format_parse_options <- function(...) { + opts <- list(...) + # Filter out arguments meant for CsvConvertOptions/CsvReadOptions + convert_opts <- names(formals(CsvConvertOptions$create)) + read_opts <- names(formals(CsvReadOptions$create)) + opts[convert_opts] <- NULL + opts[read_opts] <- NULL + opts[["schema"]] <- NULL + opt_names <- names(opts) + # Catch any readr-style options specified with full option names that are + # supported by read_delim_arrow() (and its wrappers) but are not yet + # supported here + unsup_readr_opts <- setdiff( + names(formals(read_delim_arrow)), + names(formals(readr_to_csv_parse_options)) + ) + is_unsup_opt <- opt_names %in% unsup_readr_opts + unsup_opts <- opt_names[is_unsup_opt] + if (length(unsup_opts)) { + stop( + "The following ", + ngettext(length(unsup_opts), "option is ", "options are "), + "supported in \"read_delim_arrow\" functions ", + "but not yet supported here: ", + oxford_paste(unsup_opts), + call. = FALSE + ) + } + # Catch any options with full or partial names that do not match any of the + # recognized Arrow C++ option names or readr-style option names + arrow_opts <- names(formals(CsvParseOptions$create)) + readr_opts <- names(formals(readr_to_csv_parse_options)) + is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts)) + is_readr_opt <- !is.na(pmatch(opt_names, readr_opts)) + unrec_opts <- opt_names[!is_arrow_opt & !is_readr_opt] + if (length(unrec_opts)) { + stop( + "Unrecognized ", + ngettext(length(unrec_opts), "option", "options"), + ": ", + oxford_paste(unrec_opts), + call. = FALSE + ) + } + # Catch options with ambiguous partial names (such as "del") that make it + # unclear whether the user is specifying Arrow C++ options ("delimiter") or + # readr-style options ("delim") + is_ambig_opt <- is.na(pmatch(opt_names, c(arrow_opts, readr_opts))) + ambig_opts <- opt_names[is_ambig_opt] + if (length(ambig_opts)) { + stop("Ambiguous ", + ngettext(length(ambig_opts), "option", "options"), + ": ", + oxford_paste(ambig_opts), + ". Use full argument names", + call. = FALSE + ) + } + if (any(is_readr_opt)) { + # Catch cases when the user specifies a mix of Arrow C++ options and + # readr-style options + if (!all(is_readr_opt)) { + stop("Use either Arrow parse options or readr parse options, not both", + call. = FALSE + ) + } + do.call(readr_to_csv_parse_options, opts) # all options have readr-style names + } else { + do.call(CsvParseOptions$create, opts) # all options have Arrow C++ names + } +} + +csv_file_format_convert_opts <- function(...) { + opts <- list(...) 
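+  # The same ... is shared by the parse, convert, and read option helpers;
+  # each helper keeps only the arguments its options class understands.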
+ # Filter out arguments meant for CsvParseOptions/CsvReadOptions + arrow_opts <- names(formals(CsvParseOptions$create)) + readr_opts <- names(formals(readr_to_csv_parse_options)) + read_opts <- names(formals(CsvReadOptions$create)) + opts[arrow_opts] <- NULL + opts[readr_opts] <- NULL + opts[read_opts] <- NULL + opts[["schema"]] <- NULL + do.call(CsvConvertOptions$create, opts) +} + +csv_file_format_read_opts <- function(schema = NULL, ...) { + opts <- list(...) + # Filter out arguments meant for CsvParseOptions/CsvConvertOptions + arrow_opts <- names(formals(CsvParseOptions$create)) + readr_opts <- names(formals(readr_to_csv_parse_options)) + convert_opts <- names(formals(CsvConvertOptions$create)) + opts[arrow_opts] <- NULL + opts[readr_opts] <- NULL + opts[convert_opts] <- NULL + if (!is.null(schema)) { + opts[["column_names"]] <- names(schema) + } + do.call(CsvReadOptions$create, opts) +} + +#' Format-specific scan options +#' +#' @description +#' A `FragmentScanOptions` holds options specific to a `FileFormat` and a scan +#' operation. +#' +#' @section Factory: +#' `FragmentScanOptions$create()` takes the following arguments: +#' * `format`: A string identifier of the file format. Currently supported values: +#' * "parquet" +#' * "csv"/"text", aliases for the same format. +#' * `...`: Additional format-specific options +#' +#' `format = "parquet"``: +#' * `use_buffered_stream`: Read files through buffered input streams rather than +#' loading entire row groups at once. This may be enabled +#' to reduce memory overhead. Disabled by default. +#' * `buffer_size`: Size of buffered stream, if enabled. Default is 8KB. +#' * `pre_buffer`: Pre-buffer the raw Parquet data. This can improve performance +#' on high-latency filesystems. Disabled by default. +# +#' `format = "text"`: see [CsvConvertOptions]. Note that options can only be +#' specified with the Arrow C++ library naming. Also, "block_size" from +#' [CsvReadOptions] may be given. +#' +#' It returns the appropriate subclass of `FragmentScanOptions` +#' (e.g. `CsvFragmentScanOptions`). +#' @rdname FragmentScanOptions +#' @name FragmentScanOptions +#' @export +FragmentScanOptions <- R6Class("FragmentScanOptions", + inherit = ArrowObject, + active = list( + # @description + # Return the `FragmentScanOptions`'s type + type = function() dataset___FragmentScanOptions__type_name(self) + ) +) +FragmentScanOptions$create <- function(format, ...) { + if (format %in% c("csv", "text", "tsv")) { + CsvFragmentScanOptions$create(...) + } else if (format == "parquet") { + ParquetFragmentScanOptions$create(...) + } else { + stop("Unsupported file format: ", format, call. = FALSE) + } +} + +#' @export +as.character.FragmentScanOptions <- function(x, ...) 
{ + x$type +} + +#' @usage NULL +#' @format NULL +#' @rdname FragmentScanOptions +#' @export +CsvFragmentScanOptions <- R6Class("CsvFragmentScanOptions", inherit = FragmentScanOptions) +CsvFragmentScanOptions$create <- function(..., + convert_opts = csv_file_format_convert_opts(...), + read_opts = csv_file_format_read_opts(...)) { + dataset___CsvFragmentScanOptions__Make(convert_opts, read_opts) +} + +#' @usage NULL +#' @format NULL +#' @rdname FragmentScanOptions +#' @export +ParquetFragmentScanOptions <- R6Class("ParquetFragmentScanOptions", inherit = FragmentScanOptions) +ParquetFragmentScanOptions$create <- function(use_buffered_stream = FALSE, + buffer_size = 8196, + pre_buffer = TRUE) { + dataset___ParquetFragmentScanOptions__Make(use_buffered_stream, buffer_size, pre_buffer) +} + +#' Format-specific write options +#' +#' @description +#' A `FileWriteOptions` holds write options specific to a `FileFormat`. +FileWriteOptions <- R6Class("FileWriteOptions", + inherit = ArrowObject, + public = list( + update = function(table, ...) { + if (self$type == "parquet") { + dataset___ParquetFileWriteOptions__update( + self, + ParquetWriterProperties$create(table, ...), + ParquetArrowWriterProperties$create(...) + ) + } else if (self$type == "ipc") { + args <- list(...) + if (is.null(args$codec)) { + dataset___IpcFileWriteOptions__update1( + self, + get_ipc_use_legacy_format(args$use_legacy_format), + get_ipc_metadata_version(args$metadata_version) + ) + } else { + dataset___IpcFileWriteOptions__update2( + self, + get_ipc_use_legacy_format(args$use_legacy_format), + args$codec, + get_ipc_metadata_version(args$metadata_version) + ) + } + } else if (self$type == "csv") { + dataset___CsvFileWriteOptions__update( + self, + CsvWriteOptions$create(...) + ) + } + invisible(self) + } + ), + active = list( + type = function() dataset___FileWriteOptions__type_name(self) + ) +) +FileWriteOptions$create <- function(format, ...) { + if (!inherits(format, "FileFormat")) { + format <- FileFormat$create(format) + } + options <- dataset___FileFormat__DefaultWriteOptions(format) + options$update(...) +} diff --git a/src/arrow/r/R/dataset-partition.R b/src/arrow/r/R/dataset-partition.R new file mode 100644 index 000000000..35d5bc00c --- /dev/null +++ b/src/arrow/r/R/dataset-partition.R @@ -0,0 +1,132 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Define Partitioning for a Dataset +#' +#' @description +#' Pass a `Partitioning` object to a [FileSystemDatasetFactory]'s `$create()` +#' method to indicate how the file's paths should be interpreted to define +#' partitioning. +#' +#' `DirectoryPartitioning` describes how to interpret raw path segments, in +#' order. 
For example, `schema(year = int16(), month = int8())` would define +#' partitions for file paths like "2019/01/file.parquet", +#' "2019/02/file.parquet", etc. In this scheme `NULL` values will be skipped. In +#' the previous example: when writing a dataset if the month was `NA` (or +#' `NULL`), the files would be placed in "2019/file.parquet". When reading, the +#' rows in "2019/file.parquet" would return an `NA` for the month column. An +#' error will be raised if an outer directory is `NULL` and an inner directory +#' is not. +#' +#' `HivePartitioning` is for Hive-style partitioning, which embeds field +#' names and values in path segments, such as +#' "/year=2019/month=2/data.parquet". Because fields are named in the path +#' segments, order does not matter. This partitioning scheme allows `NULL` +#' values. They will be replaced by a configurable `null_fallback` which +#' defaults to the string `"__HIVE_DEFAULT_PARTITION__"` when writing. When +#' reading, the `null_fallback` string will be replaced with `NA`s as +#' appropriate. +#' +#' `PartitioningFactory` subclasses instruct the `DatasetFactory` to detect +#' partition features from the file paths. +#' @section Factory: +#' Both `DirectoryPartitioning$create()` and `HivePartitioning$create()` +#' methods take a [Schema] as a single input argument. The helper +#' function [`hive_partition(...)`][hive_partition] is shorthand for +#' `HivePartitioning$create(schema(...))`. +#' +#' With `DirectoryPartitioningFactory$create()`, you can provide just the +#' names of the path segments (in our example, `c("year", "month")`), and +#' the `DatasetFactory` will infer the data types for those partition variables. +#' `HivePartitioningFactory$create()` takes no arguments: both variable names +#' and their types can be inferred from the file paths. `hive_partition()` with +#' no arguments returns a `HivePartitioningFactory`. +#' @name Partitioning +#' @rdname Partitioning +#' @export +Partitioning <- R6Class("Partitioning", inherit = ArrowObject) +#' @usage NULL +#' @format NULL +#' @rdname Partitioning +#' @export +DirectoryPartitioning <- R6Class("DirectoryPartitioning", inherit = Partitioning) +DirectoryPartitioning$create <- function(schm, segment_encoding = "uri") { + dataset___DirectoryPartitioning(schm, segment_encoding = segment_encoding) +} + +#' @usage NULL +#' @format NULL +#' @rdname Partitioning +#' @export +HivePartitioning <- R6Class("HivePartitioning", inherit = Partitioning) +HivePartitioning$create <- function(schm, null_fallback = NULL, segment_encoding = "uri") { + dataset___HivePartitioning(schm, + null_fallback = null_fallback_or_default(null_fallback), + segment_encoding = segment_encoding + ) +} + +#' Construct Hive partitioning +#' +#' Hive partitioning embeds field names and values in path segments, such as +#' "/year=2019/month=2/data.parquet". +#' +#' Because fields are named in the path segments, order of fields passed to +#' `hive_partition()` does not matter. +#' @param ... named list of [data types][data-type], passed to [schema()] +#' @param null_fallback character to be used in place of missing values (`NA` or `NULL`) +#' in partition columns. Default is `"__HIVE_DEFAULT_PARTITION__"`, +#' which is what Hive uses. +#' @param segment_encoding Decode partition segments after splitting paths. +#' Default is `"uri"` (URI-decode segments). May also be `"none"` (leave as-is). +#' @return A [HivePartitioning][Partitioning], or a `HivePartitioningFactory` if +#' calling `hive_partition()` with no arguments. 
+#' @examplesIf arrow_with_dataset() +#' hive_partition(year = int16(), month = int8()) +#' @export +hive_partition <- function(..., null_fallback = NULL, segment_encoding = "uri") { + schm <- schema(...) + if (length(schm) == 0) { + HivePartitioningFactory$create(null_fallback, segment_encoding) + } else { + HivePartitioning$create(schm, null_fallback, segment_encoding) + } +} + +PartitioningFactory <- R6Class("PartitioningFactory", inherit = ArrowObject) + +#' @usage NULL +#' @format NULL +#' @rdname Partitioning +#' @export +DirectoryPartitioningFactory <- R6Class("DirectoryPartitioningFactory ", inherit = PartitioningFactory) +DirectoryPartitioningFactory$create <- function(field_names, segment_encoding = "uri") { + dataset___DirectoryPartitioning__MakeFactory(field_names, segment_encoding) +} + +#' @usage NULL +#' @format NULL +#' @rdname Partitioning +#' @export +HivePartitioningFactory <- R6Class("HivePartitioningFactory", inherit = PartitioningFactory) +HivePartitioningFactory$create <- function(null_fallback = NULL, segment_encoding = "uri") { + dataset___HivePartitioning__MakeFactory(null_fallback_or_default(null_fallback), segment_encoding) +} + +null_fallback_or_default <- function(null_fallback) { + null_fallback %||% "__HIVE_DEFAULT_PARTITION__" +} diff --git a/src/arrow/r/R/dataset-scan.R b/src/arrow/r/R/dataset-scan.R new file mode 100644 index 000000000..03c926fb4 --- /dev/null +++ b/src/arrow/r/R/dataset-scan.R @@ -0,0 +1,262 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Scan the contents of a dataset +#' +#' @description +#' A `Scanner` iterates over a [Dataset]'s fragments and returns data +#' according to given row filtering and column projection. A `ScannerBuilder` +#' can help create one. +#' +#' @section Factory: +#' `Scanner$create()` wraps the `ScannerBuilder` interface to make a `Scanner`. +#' It takes the following arguments: +#' +#' * `dataset`: A `Dataset` or `arrow_dplyr_query` object, as returned by the +#' `dplyr` methods on `Dataset`. +#' * `projection`: A character vector of column names to select columns or a +#' named list of expressions +#' * `filter`: A `Expression` to filter the scanned rows by, or `TRUE` (default) +#' to keep all rows. +#' * `use_threads`: logical: should scanning use multithreading? Default `TRUE` +#' * `use_async`: logical: should the async scanner (performs better on +#' high-latency/highly parallel filesystems like S3) be used? Default `FALSE` +#' * `...`: Additional arguments, currently ignored +#' @section Methods: +#' `ScannerBuilder` has the following methods: +#' +#' - `$Project(cols)`: Indicate that the scan should only return columns given +#' by `cols`, a character vector of column names +#' - `$Filter(expr)`: Filter rows by an [Expression]. 
+#' - `$UseThreads(threads)`: logical: should the scan use multithreading? +#' The method's default input is `TRUE`, but you must call the method to enable +#' multithreading because the scanner default is `FALSE`. +#' - `$UseAsync(use_async)`: logical: should the async scanner be used? +#' - `$BatchSize(batch_size)`: integer: Maximum row count of scanned record +#' batches, default is 32K. If scanned record batches are overflowing memory +#' then this method can be called to reduce their size. +#' - `$schema`: Active binding, returns the [Schema] of the Dataset +#' - `$Finish()`: Returns a `Scanner` +#' +#' `Scanner` currently has a single method, `$ToTable()`, which evaluates the +#' query and returns an Arrow [Table]. +#' @rdname Scanner +#' @name Scanner +#' @export +Scanner <- R6Class("Scanner", + inherit = ArrowObject, + public = list( + ToTable = function() dataset___Scanner__ToTable(self), + ScanBatches = function() dataset___Scanner__ScanBatches(self), + ToRecordBatchReader = function() dataset___Scanner__ToRecordBatchReader(self), + CountRows = function() dataset___Scanner__CountRows(self) + ), + active = list( + schema = function() dataset___Scanner__schema(self) + ) +) +Scanner$create <- function(dataset, + projection = NULL, + filter = TRUE, + use_threads = option_use_threads(), + use_async = getOption("arrow.use_async", FALSE), + batch_size = NULL, + fragment_scan_options = NULL, + ...) { + if (inherits(dataset, "arrow_dplyr_query")) { + if (is_collapsed(dataset)) { + # TODO: Is there a way to get a RecordBatchReader rather than evaluating? + dataset$.data <- as_adq(dplyr::compute(dataset$.data))$.data + } + + proj <- c(dataset$selected_columns, dataset$temp_columns) + + if (!is.null(projection)) { + if (is.character(projection)) { + stopifnot("attempting to project with unknown columns" = all(projection %in% names(proj))) + proj <- proj[projection] + } else { + # TODO: ARROW-13802 accepting lists of Expressions as a projection + warning( + "Scanner$create(projection = ...) must be a character vector, ", + "ignoring the projection argument." + ) + } + } + + if (!isTRUE(filter)) { + dataset <- set_filters(dataset, filter) + } + + return(Scanner$create( + dataset$.data, + proj, + dataset$filtered_rows, + use_threads, + use_async, + batch_size, + fragment_scan_options, + ... + )) + } + + scanner_builder <- ScannerBuilder$create(dataset) + if (use_threads) { + scanner_builder$UseThreads() + } + if (use_async) { + scanner_builder$UseAsync() + } + if (!is.null(projection)) { + scanner_builder$Project(projection) + } + if (!isTRUE(filter)) { + scanner_builder$Filter(filter) + } + if (is_integerish(batch_size)) { + scanner_builder$BatchSize(batch_size) + } + if (!is.null(fragment_scan_options)) { + scanner_builder$FragmentScanOptions(fragment_scan_options) + } + scanner_builder$Finish() +} + +#' @export +names.Scanner <- function(x) names(x$schema) + +#' @export +head.Scanner <- function(x, n = 6L, ...) { + assert_that(n > 0) # For now + dataset___Scanner__head(x, n) +} + +#' @export +tail.Scanner <- function(x, n = 6L, ...) 
{ + assert_that(n > 0) # For now + result <- list() + batch_num <- 0 + for (batch in rev(dataset___Scanner__ScanBatches(x))) { + batch_num <- batch_num + 1 + result[[batch_num]] <- tail(batch, n) + n <- n - nrow(batch) + if (n <= 0) break + } + Table$create(!!!rev(result)) +} + +ScanTask <- R6Class("ScanTask", + inherit = ArrowObject, + public = list( + Execute = function() dataset___ScanTask__get_batches(self) + ) +) + +#' Apply a function to a stream of RecordBatches +#' +#' As an alternative to calling `collect()` on a `Dataset` query, you can +#' use this function to access the stream of `RecordBatch`es in the `Dataset`. +#' This lets you aggregate on each chunk and pull the intermediate results into +#' a `data.frame` for further aggregation, even if you couldn't fit the whole +#' `Dataset` result in memory. +#' +#' This is experimental and not recommended for production use. +#' +#' @param X A `Dataset` or `arrow_dplyr_query` object, as returned by the +#' `dplyr` methods on `Dataset`. +#' @param FUN A function or `purrr`-style lambda expression to apply to each +#' batch +#' @param ... Additional arguments passed to `FUN` +#' @param .data.frame logical: collect the resulting chunks into a single +#' `data.frame`? Default `TRUE` +#' @export +map_batches <- function(X, FUN, ..., .data.frame = TRUE) { + if (.data.frame) { + lapply <- map_dfr + } + scanner <- Scanner$create(ensure_group_vars(X)) + FUN <- as_mapper(FUN) + lapply(scanner$ScanBatches(), function(batch) { + # TODO: wrap batch in arrow_dplyr_query with X$selected_columns, + # X$temp_columns, and X$group_by_vars + # if X is arrow_dplyr_query, if some other arg (.dplyr?) == TRUE + FUN(batch, ...) + }) +} + +#' @usage NULL +#' @format NULL +#' @rdname Scanner +#' @export +ScannerBuilder <- R6Class("ScannerBuilder", + inherit = ArrowObject, + public = list( + Project = function(cols) { + # cols is either a character vector or a named list of Expressions + if (is.character(cols)) { + dataset___ScannerBuilder__ProjectNames(self, cols) + } else if (length(cols) == 0) { + # Empty projection + dataset___ScannerBuilder__ProjectNames(self, character(0)) + } else { + # List of Expressions + dataset___ScannerBuilder__ProjectExprs(self, cols, names(cols)) + } + self + }, + Filter = function(expr) { + assert_is(expr, "Expression") + dataset___ScannerBuilder__Filter(self, expr) + self + }, + UseThreads = function(threads = option_use_threads()) { + dataset___ScannerBuilder__UseThreads(self, threads) + self + }, + UseAsync = function(use_async = TRUE) { + dataset___ScannerBuilder__UseAsync(self, use_async) + self + }, + BatchSize = function(batch_size) { + dataset___ScannerBuilder__BatchSize(self, batch_size) + self + }, + FragmentScanOptions = function(options) { + dataset___ScannerBuilder__FragmentScanOptions(self, options) + self + }, + Finish = function() dataset___ScannerBuilder__Finish(self) + ), + active = list( + schema = function() dataset___ScannerBuilder__schema(self) + ) +) +ScannerBuilder$create <- function(dataset) { + if (inherits(dataset, "RecordBatchReader")) { + return(dataset___ScannerBuilder__FromRecordBatchReader(dataset)) + } + + if (inherits(dataset, c("data.frame", "ArrowTabular"))) { + dataset <- InMemoryDataset$create(dataset) + } + assert_is(dataset, "Dataset") + + dataset$NewScan() +} + +#' @export +names.ScannerBuilder <- function(x) names(x$schema) diff --git a/src/arrow/r/R/dataset-write.R b/src/arrow/r/R/dataset-write.R new file mode 100644 index 000000000..3a98357b0 --- /dev/null +++ 
b/src/arrow/r/R/dataset-write.R @@ -0,0 +1,144 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Write a dataset +#' +#' This function allows you to write a dataset. By writing to more efficient +#' binary storage formats, and by specifying relevant partitioning, you can +#' make it much faster to read and query. +#' +#' @param dataset [Dataset], [RecordBatch], [Table], `arrow_dplyr_query`, or +#' `data.frame`. If an `arrow_dplyr_query`, the query will be evaluated and +#' the result will be written. This means that you can `select()`, `filter()`, `mutate()`, +#' etc. to transform the data before it is written if you need to. +#' @param path string path, URI, or `SubTreeFileSystem` referencing a directory +#' to write to (directory will be created if it does not exist) +#' @param format a string identifier of the file format. Default is to use +#' "parquet" (see [FileFormat]) +#' @param partitioning `Partitioning` or a character vector of columns to +#' use as partition keys (to be written as path segments). Default is to +#' use the current `group_by()` columns. +#' @param basename_template string template for the names of files to be written. +#' Must contain `"{i}"`, which will be replaced with an autoincremented +#' integer to generate basenames of datafiles. For example, `"part-{i}.feather"` +#' will yield `"part-0.feather", ...`. +#' @param hive_style logical: write partition segments as Hive-style +#' (`key1=value1/key2=value2/file.ext`) or as just bare values. Default is `TRUE`. +#' @param existing_data_behavior The behavior to use when there is already data +#' in the destination directory. Must be one of "overwrite", "error", or +#' "delete_matching". +#' - "overwrite" (the default) then any new files created will overwrite +#' existing files +#' - "error" then the operation will fail if the destination directory is not +#' empty +#' - "delete_matching" then the writer will delete any existing partitions +#' if data is going to be written to those partitions and will leave alone +#' partitions which data is not written to. +#' @param ... additional format-specific arguments. For available Parquet +#' options, see [write_parquet()]. The available Feather options are +#' - `use_legacy_format` logical: write data formatted so that Arrow libraries +#' versions 0.14 and lower can read it. Default is `FALSE`. You can also +#' enable this by setting the environment variable `ARROW_PRE_0_15_IPC_FORMAT=1`. +#' - `metadata_version`: A string like "V5" or the equivalent integer indicating +#' the Arrow IPC MetadataVersion. Default (NULL) will use the latest version, +#' unless the environment variable `ARROW_PRE_1_0_METADATA_VERSION=1`, in +#' which case it will be V4. 
+#' - `codec`: A [Codec] which will be used to compress body buffers of written +#' files. Default (NULL) will not compress body buffers. +#' - `null_fallback`: character to be used in place of missing values (`NA` or +#' `NULL`) when using Hive-style partitioning. See [hive_partition()]. +#' @return The input `dataset`, invisibly +#' @examplesIf arrow_with_dataset() & arrow_with_parquet() & requireNamespace("dplyr", quietly = TRUE) +#' # You can write datasets partitioned by the values in a column (here: "cyl"). +#' # This creates a structure of the form cyl=X/part-Z.parquet. +#' one_level_tree <- tempfile() +#' write_dataset(mtcars, one_level_tree, partitioning = "cyl") +#' list.files(one_level_tree, recursive = TRUE) +#' +#' # You can also partition by the values in multiple columns +#' # (here: "cyl" and "gear"). +#' # This creates a structure of the form cyl=X/gear=Y/part-Z.parquet. +#' two_levels_tree <- tempfile() +#' write_dataset(mtcars, two_levels_tree, partitioning = c("cyl", "gear")) +#' list.files(two_levels_tree, recursive = TRUE) +#' +#' # In the two previous examples we would have: +#' # X = {4,6,8}, the number of cylinders. +#' # Y = {3,4,5}, the number of forward gears. +#' # Z = {0,1,2}, the number of saved parts, starting from 0. +#' +#' # You can obtain the same result as as the previous examples using arrow with +#' # a dplyr pipeline. This will be the same as two_levels_tree above, but the +#' # output directory will be different. +#' library(dplyr) +#' two_levels_tree_2 <- tempfile() +#' mtcars %>% +#' group_by(cyl, gear) %>% +#' write_dataset(two_levels_tree_2) +#' list.files(two_levels_tree_2, recursive = TRUE) +#' +#' # And you can also turn off the Hive-style directory naming where the column +#' # name is included with the values by using `hive_style = FALSE`. +#' +#' # Write a structure X/Y/part-Z.parquet. +#' two_levels_tree_no_hive <- tempfile() +#' mtcars %>% +#' group_by(cyl, gear) %>% +#' write_dataset(two_levels_tree_no_hive, hive_style = FALSE) +#' list.files(two_levels_tree_no_hive, recursive = TRUE) +#' @export +write_dataset <- function(dataset, + path, + format = c("parquet", "feather", "arrow", "ipc", "csv"), + partitioning = dplyr::group_vars(dataset), + basename_template = paste0("part-{i}.", as.character(format)), + hive_style = TRUE, + existing_data_behavior = c("overwrite", "error", "delete_matching"), + ...) { + format <- match.arg(format) + if (inherits(dataset, "arrow_dplyr_query")) { + # partitioning vars need to be in the `select` schema + dataset <- ensure_group_vars(dataset) + } else if (inherits(dataset, "grouped_df")) { + force(partitioning) + # Drop the grouping metadata before writing; we've already consumed it + # now to construct `partitioning` and don't want it in the metadata$r + dataset <- dplyr::ungroup(dataset) + } + + scanner <- Scanner$create(dataset, use_async = TRUE) + if (!inherits(partitioning, "Partitioning")) { + partition_schema <- scanner$schema[partitioning] + if (isTRUE(hive_style)) { + partitioning <- HivePartitioning$create(partition_schema, null_fallback = list(...)$null_fallback) + } else { + partitioning <- DirectoryPartitioning$create(partition_schema) + } + } + + path_and_fs <- get_path_and_filesystem(path) + options <- FileWriteOptions$create(format, table = scanner, ...) 
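+  # Map existing_data_behavior to the 0-based integer that dataset___Dataset__Write() +  # expects: "delete_matching" = 0, "overwrite" = 1, "error" = 2 (note the lookup +  # order below differs from the argument's default order).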
+ + existing_data_behavior_opts <- c("delete_matching", "overwrite", "error") + existing_data_behavior <- match(match.arg(existing_data_behavior), existing_data_behavior_opts) - 1L + + dataset___Dataset__Write( + options, path_and_fs$fs, path_and_fs$path, + partitioning, basename_template, scanner, + existing_data_behavior + ) +} diff --git a/src/arrow/r/R/dataset.R b/src/arrow/r/R/dataset.R new file mode 100644 index 000000000..7207a5543 --- /dev/null +++ b/src/arrow/r/R/dataset.R @@ -0,0 +1,367 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Open a multi-file dataset +#' +#' Arrow Datasets allow you to query against data that has been split across +#' multiple files. This sharding of data may indicate partitioning, which +#' can accelerate queries that only touch some partitions (files). Call +#' `open_dataset()` to point to a directory of data files and return a +#' `Dataset`, then use `dplyr` methods to query it. +#' +#' @param sources One of: +#' * a string path or URI to a directory containing data files +#' * a string path or URI to a single file +#' * a character vector of paths or URIs to individual data files +#' * a list of `Dataset` objects as created by this function +#' * a list of `DatasetFactory` objects as created by [dataset_factory()]. +#' +#' When `sources` is a vector of file URIs, they must all use the same protocol +#' and point to files located in the same file system and having the same +#' format. +#' @param schema [Schema] for the `Dataset`. If `NULL` (the default), the schema +#' will be inferred from the data sources. +#' @param partitioning When `sources` is a directory path/URI, one of: +#' * a `Schema`, in which case the file paths relative to `sources` will be +#' parsed, and path segments will be matched with the schema fields. For +#' example, `schema(year = int16(), month = int8())` would create partitions +#' for file paths like `"2019/01/file.parquet"`, `"2019/02/file.parquet"`, +#' etc. +#' * a character vector that defines the field names corresponding to those +#' path segments (that is, you're providing the names that would correspond +#' to a `Schema` but the types will be autodetected) +#' * a `HivePartitioning` or `HivePartitioningFactory`, as returned +#' by [hive_partition()] which parses explicit or autodetected fields from +#' Hive-style path segments +#' * `NULL` for no partitioning +#' +#' The default is to autodetect Hive-style partitions. When `sources` is not a +#' directory path/URI, `partitioning` is ignored. +#' @param unify_schemas logical: should all data fragments (files, `Dataset`s) +#' be scanned in order to create a unified schema from them? If `FALSE`, only +#' the first fragment will be inspected for its schema. 
Use this fast path +#' when you know and trust that all fragments have an identical schema. +#' The default is `FALSE` when creating a dataset from a directory path/URI or +#' vector of file paths/URIs (because there may be many files and scanning may +#' be slow) but `TRUE` when `sources` is a list of `Dataset`s (because there +#' should be few `Dataset`s in the list and their `Schema`s are already in +#' memory). +#' @param format A [FileFormat] object, or a string identifier of the format of +#' the files in `x`. This argument is ignored when `sources` is a list of `Dataset` objects. +#' Currently supported values: +#' * "parquet" +#' * "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that +#' only version 2 files are supported +#' * "csv"/"text", aliases for the same thing (because comma is the default +#' delimiter for text files +#' * "tsv", equivalent to passing `format = "text", delimiter = "\t"` +#' +#' Default is "parquet", unless a `delimiter` is also specified, in which case +#' it is assumed to be "text". +#' @param ... additional arguments passed to `dataset_factory()` when `sources` +#' is a directory path/URI or vector of file paths/URIs, otherwise ignored. +#' These may include `format` to indicate the file format, or other +#' format-specific options. +#' @return A [Dataset] R6 object. Use `dplyr` methods on it to query the data, +#' or call [`$NewScan()`][Scanner] to construct a query directly. +#' @export +#' @seealso `vignette("dataset", package = "arrow")` +#' @include arrow-package.R +#' @examplesIf arrow_with_dataset() & arrow_with_parquet() +#' # Set up directory for examples +#' tf <- tempfile() +#' dir.create(tf) +#' on.exit(unlink(tf)) +#' +#' data <- dplyr::group_by(mtcars, cyl) +#' write_dataset(data, tf) +#' +#' # You can specify a directory containing the files for your dataset and +#' # open_dataset will scan all files in your directory. +#' open_dataset(tf) +#' +#' # You can also supply a vector of paths +#' open_dataset(c(file.path(tf, "cyl=4/part-0.parquet"), file.path(tf, "cyl=8/part-0.parquet"))) +#' +#' ## You must specify the file format if using a format other than parquet. 
+#' tf2 <- tempfile() +#' dir.create(tf2) +#' on.exit(unlink(tf2)) +#' write_dataset(data, tf2, format = "ipc") +#' # This line will result in errors when you try to work with the data +#' \dontrun{ +#' open_dataset(tf2) +#' } +#' # This line will work +#' open_dataset(tf2, format = "ipc") +#' +#' ## You can specify file partitioning to include it as a field in your dataset +#' # Create a temporary directory and write example dataset +#' tf3 <- tempfile() +#' dir.create(tf3) +#' on.exit(unlink(tf3)) +#' write_dataset(airquality, tf3, partitioning = c("Month", "Day"), hive_style = FALSE) +#' +#' # View files - you can see the partitioning means that files have been written +#' # to folders based on Month/Day values +#' tf3_files <- list.files(tf3, recursive = TRUE) +#' +#' # With no partitioning specified, dataset contains all files but doesn't include +#' # directory names as field names +#' open_dataset(tf3) +#' +#' # Now that partitioning has been specified, your dataset contains columns for Month and Day +#' open_dataset(tf3, partitioning = c("Month", "Day")) +#' +#' # If you want to specify the data types for your fields, you can pass in a Schema +#' open_dataset(tf3, partitioning = schema(Month = int8(), Day = int8())) +open_dataset <- function(sources, + schema = NULL, + partitioning = hive_partition(), + unify_schemas = NULL, + format = c("parquet", "arrow", "ipc", "feather", "csv", "tsv", "text"), + ...) { + if (!arrow_with_dataset()) { + stop("This build of the arrow package does not support Datasets", call. = FALSE) + } + if (is_list_of(sources, "Dataset")) { + if (is.null(schema)) { + if (is.null(unify_schemas) || isTRUE(unify_schemas)) { + # Default is to unify schemas here + schema <- unify_schemas(schemas = map(sources, ~ .$schema)) + } else { + # Take the first one. + schema <- sources[[1]]$schema + } + } + # Enforce that all datasets have the same schema + assert_is(schema, "Schema") + sources <- lapply(sources, function(x) { + x$schema <- schema + x + }) + return(dataset___UnionDataset__create(sources, schema)) + } + + factory <- DatasetFactory$create(sources, partitioning = partitioning, format = format, schema = schema, ...) + tryCatch( + # Default is _not_ to inspect/unify schemas + factory$Finish(schema, isTRUE(unify_schemas)), + error = function(e) { + handle_parquet_io_error(e, format) + } + ) +} + +#' Multi-file datasets +#' +#' @description +#' Arrow Datasets allow you to query against data that has been split across +#' multiple files. This sharding of data may indicate partitioning, which +#' can accelerate queries that only touch some partitions (files). +#' +#' A `Dataset` contains one or more `Fragments`, such as files, of potentially +#' differing type and partitioning. +#' +#' For `Dataset$create()`, see [open_dataset()], which is an alias for it. +#' +#' `DatasetFactory` is used to provide finer control over the creation of `Dataset`s. +#' +#' @section Factory: +#' `DatasetFactory` is used to create a `Dataset`, inspect the [Schema] of the +#' fragments contained in it, and declare a partitioning. +#' `FileSystemDatasetFactory` is a subclass of `DatasetFactory` for +#' discovering files in the local file system, the only currently supported +#' file system. +#' +#' For the `DatasetFactory$create()` factory method, see [dataset_factory()], an +#' alias for it. 
A `DatasetFactory` has: +#' +#' - `$Inspect(unify_schemas)`: If `unify_schemas` is `TRUE`, all fragments +#' will be scanned and a unified [Schema] will be created from them; if `FALSE` +#' (default), only the first fragment will be inspected for its schema. Use this +#' fast path when you know and trust that all fragments have an identical schema. +#' - `$Finish(schema, unify_schemas)`: Returns a `Dataset`. If `schema` is provided, +#' it will be used for the `Dataset`; if omitted, a `Schema` will be created from +#' inspecting the fragments (files) in the dataset, following `unify_schemas` +#' as described above. +#' +#' `FileSystemDatasetFactory$create()` is a lower-level factory method and +#' takes the following arguments: +#' * `filesystem`: A [FileSystem] +#' * `selector`: Either a [FileSelector] or `NULL` +#' * `paths`: Either a character vector of file paths or `NULL` +#' * `format`: A [FileFormat] +#' * `partitioning`: Either `Partitioning`, `PartitioningFactory`, or `NULL` +#' @section Methods: +#' +#' A `Dataset` has the following methods: +#' - `$NewScan()`: Returns a [ScannerBuilder] for building a query +#' - `$schema`: Active binding that returns the [Schema] of the Dataset; you +#' may also replace the dataset's schema by using `ds$schema <- new_schema`. +#' This method currently supports only adding, removing, or reordering +#' fields in the schema: you cannot alter or cast the field types. +#' +#' `FileSystemDataset` has the following methods: +#' - `$files`: Active binding, returns the files of the `FileSystemDataset` +#' - `$format`: Active binding, returns the [FileFormat] of the `FileSystemDataset` +#' +#' `UnionDataset` has the following methods: +#' - `$children`: Active binding, returns all child `Dataset`s. +#' +#' @export +#' @seealso [open_dataset()] for a simple interface to creating a `Dataset` +Dataset <- R6Class("Dataset", + inherit = ArrowObject, + public = list( + # @description + # Start a new scan of the data + # @return A [ScannerBuilder] + NewScan = function() dataset___Dataset__NewScan(self), + ToString = function() self$schema$ToString() + ), + active = list( + schema = function(schema) { + if (missing(schema)) { + dataset___Dataset__schema(self) + } else { + assert_is(schema, "Schema") + invisible(dataset___Dataset__ReplaceSchema(self, schema)) + } + }, + metadata = function() self$schema$metadata, + num_rows = function() self$NewScan()$Finish()$CountRows(), + num_cols = function() length(self$schema), + # @description + # Return the Dataset's type. 
+ type = function() dataset___Dataset__type_name(self) + ) +) +Dataset$create <- open_dataset + +#' @name FileSystemDataset +#' @rdname Dataset +#' @export +FileSystemDataset <- R6Class("FileSystemDataset", + inherit = Dataset, + public = list( + .class_title = function() { + nfiles <- length(self$files) + file_type <- self$format$type + pretty_file_type <- list( + parquet = "Parquet", + ipc = "Feather" + )[[file_type]] + + paste( + class(self)[[1]], + "with", + nfiles, + pretty_file_type %||% file_type, + ifelse(nfiles == 1, "file", "files") + ) + } + ), + active = list( + # @description + # Return the files contained in this `FileSystemDataset` + files = function() dataset___FileSystemDataset__files(self), + # @description + # Return the format of files in this `Dataset` + format = function() { + dataset___FileSystemDataset__format(self) + }, + # @description + # Return the filesystem of files in this `Dataset` + filesystem = function() { + dataset___FileSystemDataset__filesystem(self) + } + ) +) + +#' @name UnionDataset +#' @rdname Dataset +#' @export +UnionDataset <- R6Class("UnionDataset", + inherit = Dataset, + active = list( + # @description + # Return the UnionDataset's child `Dataset`s + children = function() { + dataset___UnionDataset__children(self) + } + ) +) + +#' @name InMemoryDataset +#' @rdname Dataset +#' @export +InMemoryDataset <- R6Class("InMemoryDataset", inherit = Dataset) +InMemoryDataset$create <- function(x) { + if (!arrow_with_dataset()) { + stop("This build of the arrow package does not support Datasets", call. = FALSE) + } + if (!inherits(x, "Table")) { + x <- Table$create(x) + } + dataset___InMemoryDataset__create(x) +} + + +#' @export +names.Dataset <- function(x) names(x$schema) + +#' @export +dim.Dataset <- function(x) c(x$num_rows, x$num_cols) + +#' @export +c.Dataset <- function(...) Dataset$create(list(...)) + +#' @export +head.Dataset <- function(x, n = 6L, ...) { + head(Scanner$create(x), n) +} + +#' @export +tail.Dataset <- function(x, n = 6L, ...) { + tail(Scanner$create(x), n) +} + +#' @export +`[.Dataset` <- function(x, i, j, ..., drop = FALSE) { + if (nargs() == 2L) { + # List-like column extraction (x[i]) + return(x[, i]) + } + if (!missing(j)) { + x <- select.Dataset(x, all_of(j)) + } + + if (!missing(i)) { + x <- take_dataset_rows(x, i) + } + x +} + +take_dataset_rows <- function(x, i) { + if (!is.numeric(i) || any(i < 0)) { + stop("Only slicing with positive indices is supported", call. = FALSE) + } + scanner <- Scanner$create(x) + i <- Array$create(i - 1) + dataset___Scanner__TakeRows(scanner, i) +} diff --git a/src/arrow/r/R/deprecated.R b/src/arrow/r/R/deprecated.R new file mode 100644 index 000000000..e8848c4aa --- /dev/null +++ b/src/arrow/r/R/deprecated.R @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @rdname read_ipc_stream +#' @export +read_arrow <- function(file, ...) { + .Deprecated(msg = "Use 'read_ipc_stream' or 'read_feather' instead.") + if (inherits(file, "raw")) { + read_ipc_stream(file, ...) + } else { + read_feather(file, ...) + } +} + +#' @rdname write_ipc_stream +#' @export +write_arrow <- function(x, sink, ...) { + .Deprecated(msg = "Use 'write_ipc_stream' or 'write_feather' instead.") + if (inherits(sink, "raw")) { + # HACK for sparklyr + # Note that this returns a new R raw vector, not the one passed as `sink` + write_to_raw(x) + } else { + write_feather(x, sink, ...) + } +} diff --git a/src/arrow/r/R/dictionary.R b/src/arrow/r/R/dictionary.R new file mode 100644 index 000000000..b701768d6 --- /dev/null +++ b/src/arrow/r/R/dictionary.R @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include type.R + +#' @title class DictionaryType +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' TODO +#' +#' @rdname DictionaryType +#' @name DictionaryType +DictionaryType <- R6Class("DictionaryType", + inherit = FixedWidthType, + public = list( + ToString = function() { + prettier_dictionary_type(DataType__ToString(self)) + } + ), + active = list( + index_type = function() DictionaryType__index_type(self), + value_type = function() DictionaryType__value_type(self), + name = function() DictionaryType__name(self), + ordered = function() DictionaryType__ordered(self) + ) +) +DictionaryType$create <- function(index_type = int32(), + value_type = utf8(), + ordered = FALSE) { + assert_is(index_type, "DataType") + assert_is(value_type, "DataType") + DictionaryType__initialize(index_type, value_type, ordered) +} + +#' Create a dictionary type +#' +#' @param index_type A DataType for the indices (default [int32()]) +#' @param value_type A DataType for the values (default [utf8()]) +#' @param ordered Is this an ordered dictionary (default `FALSE`)? +#' +#' @return A [DictionaryType] +#' @seealso [Other Arrow data types][data-type] +#' @export +dictionary <- DictionaryType$create + +prettier_dictionary_type <- function(x) { + # Prettier format the "ordered" attribute + x <- sub(", ordered=0", "", x) + sub("ordered=1", "ordered", x) +} diff --git a/src/arrow/r/R/dplyr-arrange.R b/src/arrow/r/R/dplyr-arrange.R new file mode 100644 index 000000000..4c8c687a3 --- /dev/null +++ b/src/arrow/r/R/dplyr-arrange.R @@ -0,0 +1,98 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +# The following S3 methods are registered on load if dplyr is present + +arrange.arrow_dplyr_query <- function(.data, ..., .by_group = FALSE) { + call <- match.call() + exprs <- quos(...) + if (.by_group) { + # when the data is grouped and .by_group is TRUE, order the result by + # the grouping columns first + exprs <- c(quos(!!!dplyr::groups(.data)), exprs) + } + if (length(exprs) == 0) { + # Nothing to do + return(.data) + } + .data <- as_adq(.data) + # find and remove any dplyr::desc() and tidy-eval + # the arrange expressions inside an Arrow data_mask + sorts <- vector("list", length(exprs)) + descs <- logical(0) + mask <- arrow_mask(.data) + for (i in seq_along(exprs)) { + x <- find_and_remove_desc(exprs[[i]]) + exprs[[i]] <- x[["quos"]] + sorts[[i]] <- arrow_eval(exprs[[i]], mask) + names(sorts)[i] <- format_expr(exprs[[i]]) + if (inherits(sorts[[i]], "try-error")) { + msg <- paste("Expression", names(sorts)[i], "not supported in Arrow") + return(abandon_ship(call, .data, msg)) + } + descs[i] <- x[["desc"]] + } + .data$arrange_vars <- c(sorts, .data$arrange_vars) + .data$arrange_desc <- c(descs, .data$arrange_desc) + .data +} +arrange.Dataset <- arrange.ArrowTabular <- arrange.arrow_dplyr_query + +# Helper to handle desc() in arrange() +# * Takes a quosure as input +# * Returns a list with two elements: +# 1. The quosure with any wrapping parentheses and desc() removed +# 2. A logical value indicating whether desc() was found +# * Errors if the expression contains no field names or if desc() is given more than one argument +find_and_remove_desc <- function(quosure) { + expr <- quo_get_expr(quosure) + descending <- FALSE + if (length(all.vars(expr)) < 1L) { + stop( + "Expression in arrange() does not contain any field names: ", + deparse(expr), + call. = FALSE + ) + } + # Use a while loop to remove any number of nested pairs of enclosing + # parentheses and any number of nested desc() calls. In the case of multiple + # nested desc() calls, each one toggles the sort order. + while (identical(typeof(expr), "language") && is.call(expr)) { + if (identical(expr[[1]], quote(`(`))) { + # remove enclosing parentheses + expr <- expr[[2]] + } else if (identical(expr[[1]], quote(desc))) { + # ensure desc() has only one argument (when an R expression is a function + # call, length == 2 means it has exactly one argument) + if (length(expr) > 2) { + stop("desc() expects only one argument", call. = FALSE) + } + # remove desc() and toggle descending + expr <- expr[[2]] + descending <- !descending + } else { + break + } + } + return( + list( + quos = quo_set_expr(quosure, expr), + desc = descending + ) + ) +} diff --git a/src/arrow/r/R/dplyr-collect.R b/src/arrow/r/R/dplyr-collect.R new file mode 100644 index 000000000..13e68f3f4 --- /dev/null +++ b/src/arrow/r/R/dplyr-collect.R @@ -0,0 +1,121 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +# The following S3 methods are registered on load if dplyr is present + +collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) { + # head and tail are not ExecNodes, at best we can handle them via sink node + # so if there are any steps done after head/tail, we need to + # evaluate the query up to then and then do a new query for the rest + if (is_collapsed(x) && has_head_tail(x$.data)) { + x$.data <- as_adq(dplyr::compute(x$.data))$.data + } + + # See query-engine.R for ExecPlan/Nodes + tab <- do_exec_plan(x) + if (as_data_frame) { + df <- as.data.frame(tab) + tab$invalidate() + restore_dplyr_features(df, x) + } else { + restore_dplyr_features(tab, x) + } +} +collect.ArrowTabular <- function(x, as_data_frame = TRUE, ...) { + if (as_data_frame) { + as.data.frame(x, ...) + } else { + x + } +} +collect.Dataset <- function(x, ...) dplyr::collect(as_adq(x), ...) + +compute.arrow_dplyr_query <- function(x, ...) dplyr::collect(x, as_data_frame = FALSE) +compute.ArrowTabular <- function(x, ...) x +compute.Dataset <- compute.arrow_dplyr_query + +pull.arrow_dplyr_query <- function(.data, var = -1) { + .data <- as_adq(.data) + var <- vars_pull(names(.data), !!enquo(var)) + .data$selected_columns <- set_names(.data$selected_columns[var], var) + dplyr::collect(.data)[[1]] +} +pull.Dataset <- pull.ArrowTabular <- pull.arrow_dplyr_query + +restore_dplyr_features <- function(df, query) { + # An arrow_dplyr_query holds some attributes that Arrow doesn't know about + # After calling collect(), make sure these features are carried over + + if (length(query$group_by_vars) > 0) { + # Preserve groupings, if present + if (is.data.frame(df)) { + df <- dplyr::grouped_df( + df, + dplyr::group_vars(query), + drop = dplyr::group_by_drop_default(query) + ) + } else { + # This is a Table, via compute() or collect(as_data_frame = FALSE) + df <- as_adq(df) + df$group_by_vars <- query$group_by_vars + df$drop_empty_groups <- query$drop_empty_groups + } + } + df +} + +collapse.arrow_dplyr_query <- function(x, ...) { + # Figure out what schema will result from the query + x$schema <- implicit_schema(x) + # Nest inside a new arrow_dplyr_query (and keep groups) + restore_dplyr_features(arrow_dplyr_query(x), x) +} +collapse.Dataset <- collapse.ArrowTabular <- function(x, ...) 
{ + arrow_dplyr_query(x) +} + +implicit_schema <- function(.data) { + .data <- ensure_group_vars(.data) + old_schm <- .data$.data$schema + + if (is.null(.data$aggregations)) { + new_fields <- map(.data$selected_columns, ~ .$type(old_schm)) + if (!is.null(.data$join) && !(.data$join$type %in% JoinType[1:4])) { + # Add cols from right side, except for semi/anti joins + right_cols <- .data$join$right_data$selected_columns + new_fields <- c(new_fields, map( + right_cols[setdiff(names(right_cols), .data$join$by)], + ~ .$type(.data$join$right_data$.data$schema) + )) + } + } else { + new_fields <- map(summarize_projection(.data), ~ .$type(old_schm)) + # * Put group_by_vars first (this can't be done by summarize, + # they have to be last per the aggregate node signature, + # and they get projected to this order after aggregation) + # * Infer the output types from the aggregations + group_fields <- new_fields[.data$group_by_vars] + hash <- length(.data$group_by_vars) > 0 + agg_fields <- imap( + new_fields[setdiff(names(new_fields), .data$group_by_vars)], + ~ output_type(.data$aggregations[[.y]][["fun"]], .x, hash) + ) + new_fields <- c(group_fields, agg_fields) + } + schema(!!!new_fields) +} diff --git a/src/arrow/r/R/dplyr-count.R b/src/arrow/r/R/dplyr-count.R new file mode 100644 index 000000000..c567c285f --- /dev/null +++ b/src/arrow/r/R/dplyr-count.R @@ -0,0 +1,60 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
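+ +# A rough usage sketch (hypothetical objects: `ds` is a Dataset or Arrow Table, with dplyr attached): +#   ds %>% count(cyl, sort = TRUE) +# is equivalent to ds %>% group_by(cyl) %>% tally(sort = TRUE); tally() builds the +# count with summarize(n = n()) (or sum({{ wt }})) and sorts with arrange(desc(n)), +# returning a query that you collect() to materialize.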
+ +# The following S3 methods are registered on load if dplyr is present + +count.arrow_dplyr_query <- function(x, ..., wt = NULL, sort = FALSE, name = NULL) { + if (!missing(...)) { + out <- dplyr::group_by(x, ..., .add = TRUE) + } else { + out <- x + } + out <- dplyr::tally(out, wt = {{ wt }}, sort = sort, name = name) + + # Restore original group vars + gv <- dplyr::group_vars(x) + if (length(gv)) { + out$group_by_vars <- gv + } + + out +} + +count.Dataset <- count.ArrowTabular <- count.arrow_dplyr_query + +#' @importFrom rlang sym := +tally.arrow_dplyr_query <- function(x, wt = NULL, sort = FALSE, name = NULL) { + check_name <- utils::getFromNamespace("check_name", "dplyr") + name <- check_name(name, dplyr::group_vars(x)) + + if (quo_is_null(enquo(wt))) { + out <- dplyr::summarize(x, !!name := n()) + } else { + out <- dplyr::summarize(x, !!name := sum({{ wt }}, na.rm = TRUE)) + } + + if (sort) { + dplyr::arrange(out, desc(!!sym(name))) + } else { + out + } +} + +tally.Dataset <- tally.ArrowTabular <- tally.arrow_dplyr_query + +# we don't want to depend on dplyr, but we reference these above +utils::globalVariables(c("n", "desc")) diff --git a/src/arrow/r/R/dplyr-distinct.R b/src/arrow/r/R/dplyr-distinct.R new file mode 100644 index 000000000..5dfcb641f --- /dev/null +++ b/src/arrow/r/R/dplyr-distinct.R @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# The following S3 methods are registered on load if dplyr is present + +distinct.arrow_dplyr_query <- function(.data, ..., .keep_all = FALSE) { + if (.keep_all == TRUE) { + # After ARROW-13993 is merged, we can implement this (ARROW-14045) + arrow_not_supported("`distinct()` with `.keep_all = TRUE`") + } + + original_gv <- dplyr::group_vars(.data) + if (length(quos(...))) { + # group_by() calls mutate() if there are any expressions in ... 
+ .data <- dplyr::group_by(.data, ..., .add = TRUE) + # `data %>% group_by() %>% summarise()` returns cols in order supplied + # but distinct() returns cols in dataset order, so sort group vars + .data$group_by_vars <- names(.data)[names(.data) %in% .data$group_by_vars] + } else { + # distinct() with no vars specified means distinct across all cols + .data <- dplyr::group_by(.data, !!!syms(names(.data))) + } + + out <- dplyr::summarize(.data, .groups = "drop") + # distinct() doesn't modify group by vars, so restore the original ones + if (length(original_gv)) { + out$group_by_vars <- original_gv + } + out +} + +distinct.Dataset <- distinct.ArrowTabular <- distinct.arrow_dplyr_query diff --git a/src/arrow/r/R/dplyr-eval.R b/src/arrow/r/R/dplyr-eval.R new file mode 100644 index 000000000..9d944ab80 --- /dev/null +++ b/src/arrow/r/R/dplyr-eval.R @@ -0,0 +1,123 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +arrow_eval <- function(expr, mask) { + # filter(), mutate(), etc. work by evaluating the quoted `exprs` to generate Expressions + # with references to Arrays (if .data is Table/RecordBatch) or Fields (if + # .data is a Dataset). + + # This yields an Expression as long as the `exprs` are implemented in Arrow. + # Otherwise, it returns a try-error + tryCatch(eval_tidy(expr, mask), error = function(e) { + # Look for the cases where bad input was given, i.e. this would fail + # in regular dplyr anyway, and let those raise those as errors; + # else, for things not supported by Arrow return a "try-error", + # which we'll handle differently + msg <- conditionMessage(e) + if (getOption("arrow.debug", FALSE)) print(msg) + patterns <- .cache$i18ized_error_pattern + if (is.null(patterns)) { + patterns <- i18ize_error_messages() + # Memoize it + .cache$i18ized_error_pattern <- patterns + } + if (grepl(patterns, msg)) { + stop(e) + } + + out <- structure(msg, class = "try-error", condition = e) + if (grepl("not supported.*Arrow", msg) || getOption("arrow.debug", FALSE)) { + # One of ours. 
Mark it so that consumers can handle it differently + class(out) <- c("arrow-try-error", class(out)) + } + invisible(out) + }) +} + +handle_arrow_not_supported <- function(err, lab) { + # Look for informative message from the Arrow function version (see above) + if (inherits(err, "arrow-try-error")) { + # Include it if found + paste0("In ", lab, ", ", as.character(err)) + } else { + # Otherwise be opaque (the original error is probably not useful) + paste("Expression", lab, "not supported in Arrow") + } +} + +i18ize_error_messages <- function() { + # Figure out what the error messages will be with this LANGUAGE + # so that we can look for them + out <- list( + obj = tryCatch(eval(parse(text = "X_____X")), error = function(e) conditionMessage(e)), + fun = tryCatch(eval(parse(text = "X_____X()")), error = function(e) conditionMessage(e)) + ) + paste(map(out, ~ sub("X_____X", ".*", .)), collapse = "|") +} + +# Helper to raise a common error +arrow_not_supported <- function(msg) { + # TODO: raise a classed error? + stop(paste(msg, "not supported by Arrow"), call. = FALSE) +} + +# Create a data mask for evaluating a dplyr expression +arrow_mask <- function(.data, aggregation = FALSE) { + f_env <- new_environment(.cache$functions) + + # Add functions that need to error hard and clear. + # Some R functions will still try to evaluate on an Expression + # and return NA with a warning + fail <- function(...) stop("Not implemented") + for (f in c("mean", "sd")) { + f_env[[f]] <- fail + } + + if (aggregation) { + # This should probably be done with an environment inside an environment + # but a first attempt at that had scoping problems (ARROW-13499) + for (f in names(agg_funcs)) { + f_env[[f]] <- agg_funcs[[f]] + } + } + + # Assign the schema to the expressions + map(.data$selected_columns, ~ (.$schema <- .data$.data$schema)) + + # Add the column references and make the mask + out <- new_data_mask( + new_environment(.data$selected_columns, parent = f_env), + f_env + ) + # Then insert the data pronoun + # TODO: figure out what rlang::as_data_pronoun does/why we should use it + # (because if we do we get `Error: Can't modify the data pronoun` in mutate()) + out$.data <- .data$selected_columns + out +} + +format_expr <- function(x) { + if (is_quosure(x)) { + x <- quo_get_expr(x) + } + out <- deparse(x) + if (length(out) > 1) { + # Add ellipses because we are going to truncate + out[1] <- paste0(out[1], "...") + } + head(out, 1) +} diff --git a/src/arrow/r/R/dplyr-filter.R b/src/arrow/r/R/dplyr-filter.R new file mode 100644 index 000000000..3c8c08ea5 --- /dev/null +++ b/src/arrow/r/R/dplyr-filter.R @@ -0,0 +1,91 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
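+ +# A rough usage sketch (hypothetical objects: `ds` is a Dataset, with dplyr attached): +#   ds %>% filter(cyl > 4, am == 1) %>% collect() +# Each filter expression is evaluated to an Arrow Expression and the results are +# combined with `&`; expressions Arrow cannot translate raise an error for Datasets, +# while for in-memory tables they trigger a warning and fall back to evaluation in R.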
+ + +# The following S3 methods are registered on load if dplyr is present + +filter.arrow_dplyr_query <- function(.data, ..., .preserve = FALSE) { + # TODO something with the .preserve argument + filts <- quos(...) + if (length(filts) == 0) { + # Nothing to do + return(.data) + } + + .data <- as_adq(.data) + # tidy-eval the filter expressions inside an Arrow data_mask + filters <- lapply(filts, arrow_eval, arrow_mask(.data)) + bad_filters <- map_lgl(filters, ~ inherits(., "try-error")) + if (any(bad_filters)) { + # This is similar to abandon_ship() except that the filter eval is + # vectorized, and we apply filters that _did_ work before abandoning ship + # with the rest + expr_labs <- map_chr(filts[bad_filters], format_expr) + if (query_on_dataset(.data)) { + # Abort. We don't want to auto-collect if this is a Dataset because that + # could blow up, too big. + stop( + "Filter expression not supported for Arrow Datasets: ", + oxford_paste(expr_labs, quote = FALSE), + "\nCall collect() first to pull data into R.", + call. = FALSE + ) + } else { + arrow_errors <- map2_chr( + filters[bad_filters], expr_labs, + handle_arrow_not_supported + ) + if (length(arrow_errors) == 1) { + msg <- paste0(arrow_errors, "; ") + } else { + msg <- paste0("* ", arrow_errors, "\n", collapse = "") + } + warning( + msg, "pulling data into R", + immediate. = TRUE, + call. = FALSE + ) + # Set any valid filters first, then collect and then apply the invalid ones in R + .data <- set_filters(.data, filters[!bad_filters]) + return(dplyr::filter(dplyr::collect(.data), !!!filts[bad_filters])) + } + } + + set_filters(.data, filters) +} +filter.Dataset <- filter.ArrowTabular <- filter.arrow_dplyr_query + +set_filters <- function(.data, expressions) { + if (length(expressions)) { + if (is_list_of(expressions, "Expression")) { + # expressions is a list of Expressions. AND them together and set them on .data + new_filter <- Reduce("&", expressions) + } else if (inherits(expressions, "Expression")) { + new_filter <- expressions + } else { + stop("filter expressions must be either an expression or a list of expressions", call. = FALSE) + } + + if (isTRUE(.data$filtered_rows)) { + # TRUE is default (i.e. no filter yet), so we don't need to & with it + .data$filtered_rows <- new_filter + } else { + .data$filtered_rows <- .data$filtered_rows & new_filter + } + } + .data +} diff --git a/src/arrow/r/R/dplyr-functions.R b/src/arrow/r/R/dplyr-functions.R new file mode 100644 index 000000000..717cdae96 --- /dev/null +++ b/src/arrow/r/R/dplyr-functions.R @@ -0,0 +1,1087 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +#' @include expression.R +NULL + +# This environment is an internal cache for things including data mask functions +# We'll populate it at package load time. 
+.cache <- NULL +init_env <- function() { + .cache <<- new.env(hash = TRUE) +} +init_env() + +# nse_funcs is a list of functions that operated on (and return) Expressions +# These will be the basis for a data_mask inside dplyr methods +# and will be added to .cache at package load time + +# Start with mappings from R function name spellings +nse_funcs <- lapply(set_names(names(.array_function_map)), function(operator) { + force(operator) + function(...) build_expr(operator, ...) +}) + +# Now add functions to that list where the mapping from R to Arrow isn't 1:1 +# Each of these functions should have the same signature as the R function +# they're replacing. +# +# When to use `build_expr()` vs. `Expression$create()`? +# +# Use `build_expr()` if you need to +# (1) map R function names to Arrow C++ functions +# (2) wrap R inputs (vectors) as Array/Scalar +# +# `Expression$create()` is lower level. Most of the functions below use it +# because they manage the preparation of the user-provided inputs +# and don't need to wrap scalars + +nse_funcs$cast <- function(x, target_type, safe = TRUE, ...) { + opts <- cast_options(safe, ...) + opts$to_type <- as_type(target_type) + Expression$create("cast", x, options = opts) +} + +nse_funcs$coalesce <- function(...) { + args <- list2(...) + if (length(args) < 1) { + abort("At least one argument must be supplied to coalesce()") + } + + # Treat NaN like NA for consistency with dplyr::coalesce(), but if *all* + # the values are NaN, we should return NaN, not NA, so don't replace + # NaN with NA in the final (or only) argument + # TODO: if an option is added to the coalesce kernel to treat NaN as NA, + # use that to simplify the code here (ARROW-13389) + attr(args[[length(args)]], "last") <- TRUE + args <- lapply(args, function(arg) { + last_arg <- is.null(attr(arg, "last")) + attr(arg, "last") <- NULL + + if (!inherits(arg, "Expression")) { + arg <- Expression$scalar(arg) + } + + # coalesce doesn't yet support factors/dictionaries + # TODO: remove this after ARROW-14167 is merged + if (nse_funcs$is.factor(arg)) { + warning("Dictionaries (in R: factors) are currently converted to strings (characters) in coalesce", call. 
= FALSE) + } + + if (last_arg && arg$type_id() %in% TYPES_WITH_NAN) { + # store the NA_real_ in the same type as arg to avoid avoid casting + # smaller float types to larger float types + NA_expr <- Expression$scalar(Scalar$create(NA_real_, type = arg$type())) + Expression$create("if_else", Expression$create("is_nan", arg), NA_expr, arg) + } else { + arg + } + }) + Expression$create("coalesce", args = args) +} + +nse_funcs$is.na <- function(x) { + build_expr("is_null", x, options = list(nan_is_null = TRUE)) +} + +nse_funcs$is.nan <- function(x) { + if (is.double(x) || (inherits(x, "Expression") && + x$type_id() %in% TYPES_WITH_NAN)) { + # TODO: if an option is added to the is_nan kernel to treat NA as NaN, + # use that to simplify the code here (ARROW-13366) + build_expr("is_nan", x) & build_expr("is_valid", x) + } else { + Expression$scalar(FALSE) + } +} + +nse_funcs$is <- function(object, class2) { + if (is.string(class2)) { + switch(class2, + # for R data types, pass off to is.*() functions + character = nse_funcs$is.character(object), + numeric = nse_funcs$is.numeric(object), + integer = nse_funcs$is.integer(object), + integer64 = nse_funcs$is.integer64(object), + logical = nse_funcs$is.logical(object), + factor = nse_funcs$is.factor(object), + list = nse_funcs$is.list(object), + # for Arrow data types, compare class2 with object$type()$ToString(), + # but first strip off any parameters to only compare the top-level data + # type, and canonicalize class2 + sub("^([^([<]+).*$", "\\1", object$type()$ToString()) == + canonical_type_str(class2) + ) + } else if (inherits(class2, "DataType")) { + object$type() == as_type(class2) + } else { + stop("Second argument to is() is not a string or DataType", call. = FALSE) + } +} + +nse_funcs$dictionary_encode <- function(x, + null_encoding_behavior = c("mask", "encode")) { + behavior <- toupper(match.arg(null_encoding_behavior)) + null_encoding_behavior <- NullEncodingBehavior[[behavior]] + Expression$create( + "dictionary_encode", + x, + options = list(null_encoding_behavior = null_encoding_behavior) + ) +} + +nse_funcs$between <- function(x, left, right) { + x >= left & x <= right +} + +nse_funcs$is.finite <- function(x) { + is_fin <- Expression$create("is_finite", x) + # for compatibility with base::is.finite(), return FALSE for NA_real_ + is_fin & !nse_funcs$is.na(is_fin) +} + +nse_funcs$is.infinite <- function(x) { + is_inf <- Expression$create("is_inf", x) + # for compatibility with base::is.infinite(), return FALSE for NA_real_ + is_inf & !nse_funcs$is.na(is_inf) +} + +# as.* type casting functions +# as.factor() is mapped in expression.R +nse_funcs$as.character <- function(x) { + Expression$create("cast", x, options = cast_options(to_type = string())) +} +nse_funcs$as.double <- function(x) { + Expression$create("cast", x, options = cast_options(to_type = float64())) +} +nse_funcs$as.integer <- function(x) { + Expression$create( + "cast", + x, + options = cast_options( + to_type = int32(), + allow_float_truncate = TRUE, + allow_decimal_truncate = TRUE + ) + ) +} +nse_funcs$as.integer64 <- function(x) { + Expression$create( + "cast", + x, + options = cast_options( + to_type = int64(), + allow_float_truncate = TRUE, + allow_decimal_truncate = TRUE + ) + ) +} +nse_funcs$as.logical <- function(x) { + Expression$create("cast", x, options = cast_options(to_type = boolean())) +} +nse_funcs$as.numeric <- function(x) { + Expression$create("cast", x, options = cast_options(to_type = float64())) +} + +# is.* type functions +nse_funcs$is.character <- 
function(x) { + is.character(x) || (inherits(x, "Expression") && + x$type_id() %in% Type[c("STRING", "LARGE_STRING")]) +} +nse_funcs$is.numeric <- function(x) { + is.numeric(x) || (inherits(x, "Expression") && x$type_id() %in% Type[c( + "UINT8", "INT8", "UINT16", "INT16", "UINT32", "INT32", + "UINT64", "INT64", "HALF_FLOAT", "FLOAT", "DOUBLE", + "DECIMAL", "DECIMAL256" + )]) +} +nse_funcs$is.double <- function(x) { + is.double(x) || (inherits(x, "Expression") && x$type_id() == Type["DOUBLE"]) +} +nse_funcs$is.integer <- function(x) { + is.integer(x) || (inherits(x, "Expression") && x$type_id() %in% Type[c( + "UINT8", "INT8", "UINT16", "INT16", "UINT32", "INT32", + "UINT64", "INT64" + )]) +} +nse_funcs$is.integer64 <- function(x) { + is.integer64(x) || (inherits(x, "Expression") && x$type_id() == Type["INT64"]) +} +nse_funcs$is.logical <- function(x) { + is.logical(x) || (inherits(x, "Expression") && x$type_id() == Type["BOOL"]) +} +nse_funcs$is.factor <- function(x) { + is.factor(x) || (inherits(x, "Expression") && x$type_id() == Type["DICTIONARY"]) +} +nse_funcs$is.list <- function(x) { + is.list(x) || (inherits(x, "Expression") && x$type_id() %in% Type[c( + "LIST", "FIXED_SIZE_LIST", "LARGE_LIST" + )]) +} + +# rlang::is_* type functions +nse_funcs$is_character <- function(x, n = NULL) { + assert_that(is.null(n)) + nse_funcs$is.character(x) +} +nse_funcs$is_double <- function(x, n = NULL, finite = NULL) { + assert_that(is.null(n) && is.null(finite)) + nse_funcs$is.double(x) +} +nse_funcs$is_integer <- function(x, n = NULL) { + assert_that(is.null(n)) + nse_funcs$is.integer(x) +} +nse_funcs$is_list <- function(x, n = NULL) { + assert_that(is.null(n)) + nse_funcs$is.list(x) +} +nse_funcs$is_logical <- function(x, n = NULL) { + assert_that(is.null(n)) + nse_funcs$is.logical(x) +} +nse_funcs$is_timestamp <- function(x, n = NULL) { + assert_that(is.null(n)) + inherits(x, "POSIXt") || (inherits(x, "Expression") && x$type_id() %in% Type[c("TIMESTAMP")]) +} + +# String functions +nse_funcs$nchar <- function(x, type = "chars", allowNA = FALSE, keepNA = NA) { + if (allowNA) { + arrow_not_supported("allowNA = TRUE") + } + if (is.na(keepNA)) { + keepNA <- !identical(type, "width") + } + if (!keepNA) { + # TODO: I think there is a fill_null kernel we could use, set null to 2 + arrow_not_supported("keepNA = TRUE") + } + if (identical(type, "bytes")) { + Expression$create("binary_length", x) + } else { + Expression$create("utf8_length", x) + } +} + +nse_funcs$paste <- function(..., sep = " ", collapse = NULL, recycle0 = FALSE) { + assert_that( + is.null(collapse), + msg = "paste() with the collapse argument is not yet supported in Arrow" + ) + if (!inherits(sep, "Expression")) { + assert_that(!is.na(sep), msg = "Invalid separator") + } + arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")(..., sep) +} + +nse_funcs$paste0 <- function(..., collapse = NULL, recycle0 = FALSE) { + assert_that( + is.null(collapse), + msg = "paste0() with the collapse argument is not yet supported in Arrow" + ) + arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")(..., "") +} + +nse_funcs$str_c <- function(..., sep = "", collapse = NULL) { + assert_that( + is.null(collapse), + msg = "str_c() with the collapse argument is not yet supported in Arrow" + ) + arrow_string_join_function(NullHandlingBehavior$EMIT_NULL)(..., sep) +} + +arrow_string_join_function <- function(null_handling, null_replacement = NULL) { + # the `binary_join_element_wise` Arrow C++ compute kernel takes the separator + # as the last 
argument, so pass `sep` as the last dots arg to this function + function(...) { + args <- lapply(list(...), function(arg) { + # handle scalar literal args, and cast all args to string for + # consistency with base::paste(), base::paste0(), and stringr::str_c() + if (!inherits(arg, "Expression")) { + assert_that( + length(arg) == 1, + msg = "Literal vectors of length != 1 not supported in string concatenation" + ) + Expression$scalar(as.character(arg)) + } else { + nse_funcs$as.character(arg) + } + }) + Expression$create( + "binary_join_element_wise", + args = args, + options = list( + null_handling = null_handling, + null_replacement = null_replacement + ) + ) + } +} + +# Currently, Arrow does not supports a locale option for string case conversion +# functions, contrast to stringr's API, so the 'locale' argument is only valid +# for stringr's default value ("en"). The following are string functions that +# take a 'locale' option as its second argument: +# str_to_lower +# str_to_upper +# str_to_title +# +# Arrow locale will be supported with ARROW-14126 +stop_if_locale_provided <- function(locale) { + if (!identical(locale, "en")) { + stop("Providing a value for 'locale' other than the default ('en') is not supported by Arrow. ", + "To change locale, use 'Sys.setlocale()'", + call. = FALSE + ) + } +} + +nse_funcs$str_to_lower <- function(string, locale = "en") { + stop_if_locale_provided(locale) + Expression$create("utf8_lower", string) +} + +nse_funcs$str_to_upper <- function(string, locale = "en") { + stop_if_locale_provided(locale) + Expression$create("utf8_upper", string) +} + +nse_funcs$str_to_title <- function(string, locale = "en") { + stop_if_locale_provided(locale) + Expression$create("utf8_title", string) +} + +nse_funcs$str_trim <- function(string, side = c("both", "left", "right")) { + side <- match.arg(side) + trim_fun <- switch(side, + left = "utf8_ltrim_whitespace", + right = "utf8_rtrim_whitespace", + both = "utf8_trim_whitespace" + ) + Expression$create(trim_fun, string) +} + +nse_funcs$substr <- function(x, start, stop) { + assert_that( + length(start) == 1, + msg = "`start` must be length 1 - other lengths are not supported in Arrow" + ) + assert_that( + length(stop) == 1, + msg = "`stop` must be length 1 - other lengths are not supported in Arrow" + ) + + # substr treats values as if they're on a continous number line, so values + # 0 are effectively blank characters - set `start` to 1 here so Arrow mimics + # this behavior + if (start <= 0) { + start <- 1 + } + + # if `stop` is lower than `start`, this is invalid, so set `stop` to + # 0 so that an empty string will be returned (consistent with base::substr()) + if (stop < start) { + stop <- 0 + } + + Expression$create( + "utf8_slice_codeunits", + x, + # we don't need to subtract 1 from `stop` as C++ counts exclusively + # which effectively cancels out the difference in indexing between R & C++ + options = list(start = start - 1L, stop = stop) + ) +} + +nse_funcs$substring <- function(text, first, last) { + nse_funcs$substr(x = text, start = first, stop = last) +} + +nse_funcs$str_sub <- function(string, start = 1L, end = -1L) { + assert_that( + length(start) == 1, + msg = "`start` must be length 1 - other lengths are not supported in Arrow" + ) + assert_that( + length(end) == 1, + msg = "`end` must be length 1 - other lengths are not supported in Arrow" + ) + + # In stringr::str_sub, an `end` value of -1 means the end of the string, so + # set it to the maximum integer to match this behavior + if (end == -1) { + end <- 
.Machine$integer.max + } + + # An end value lower than a start value returns an empty string in + # stringr::str_sub so set end to 0 here to match this behavior + if (end < start) { + end <- 0 + } + + # subtract 1 from `start` because C++ is 0-based and R is 1-based + # str_sub treats a `start` value of 0 or 1 as the same thing so don't subtract 1 when `start` == 0 + # when `start` < 0, both str_sub and utf8_slice_codeunits count backwards from the end + if (start > 0) { + start <- start - 1L + } + + Expression$create( + "utf8_slice_codeunits", + string, + options = list(start = start, stop = end) + ) +} + +nse_funcs$grepl <- function(pattern, x, ignore.case = FALSE, fixed = FALSE) { + arrow_fun <- ifelse(fixed, "match_substring", "match_substring_regex") + Expression$create( + arrow_fun, + x, + options = list(pattern = pattern, ignore_case = ignore.case) + ) +} + +nse_funcs$str_detect <- function(string, pattern, negate = FALSE) { + opts <- get_stringr_pattern_options(enexpr(pattern)) + out <- nse_funcs$grepl( + pattern = opts$pattern, + x = string, + ignore.case = opts$ignore_case, + fixed = opts$fixed + ) + if (negate) { + out <- !out + } + out +} + +nse_funcs$str_like <- function(string, pattern, ignore_case = TRUE) { + Expression$create( + "match_like", + string, + options = list(pattern = pattern, ignore_case = ignore_case) + ) +} + +# Encapsulate some common logic for sub/gsub/str_replace/str_replace_all +arrow_r_string_replace_function <- function(max_replacements) { + function(pattern, replacement, x, ignore.case = FALSE, fixed = FALSE) { + Expression$create( + ifelse(fixed && !ignore.case, "replace_substring", "replace_substring_regex"), + x, + options = list( + pattern = format_string_pattern(pattern, ignore.case, fixed), + replacement = format_string_replacement(replacement, ignore.case, fixed), + max_replacements = max_replacements + ) + ) + } +} + +arrow_stringr_string_replace_function <- function(max_replacements) { + function(string, pattern, replacement) { + opts <- get_stringr_pattern_options(enexpr(pattern)) + arrow_r_string_replace_function(max_replacements)( + pattern = opts$pattern, + replacement = replacement, + x = string, + ignore.case = opts$ignore_case, + fixed = opts$fixed + ) + } +} + +nse_funcs$sub <- arrow_r_string_replace_function(1L) +nse_funcs$gsub <- arrow_r_string_replace_function(-1L) +nse_funcs$str_replace <- arrow_stringr_string_replace_function(1L) +nse_funcs$str_replace_all <- arrow_stringr_string_replace_function(-1L) + +nse_funcs$strsplit <- function(x, + split, + fixed = FALSE, + perl = FALSE, + useBytes = FALSE) { + assert_that(is.string(split)) + + arrow_fun <- ifelse(fixed, "split_pattern", "split_pattern_regex") + # warn when the user specifies both fixed = TRUE and perl = TRUE, for + # consistency with the behavior of base::strsplit() + if (fixed && perl) { + warning("Argument 'perl = TRUE' will be ignored", call. 
= FALSE) + } + # since split is not a regex, proceed without any warnings or errors regardless + # of the value of perl, for consistency with the behavior of base::strsplit() + Expression$create( + arrow_fun, + x, + options = list(pattern = split, reverse = FALSE, max_splits = -1L) + ) +} + +nse_funcs$str_split <- function(string, pattern, n = Inf, simplify = FALSE) { + opts <- get_stringr_pattern_options(enexpr(pattern)) + arrow_fun <- ifelse(opts$fixed, "split_pattern", "split_pattern_regex") + if (opts$ignore_case) { + arrow_not_supported("Case-insensitive string splitting") + } + if (n == 0) { + arrow_not_supported("Splitting strings into zero parts") + } + if (identical(n, Inf)) { + n <- 0L + } + if (simplify) { + warning("Argument 'simplify = TRUE' will be ignored", call. = FALSE) + } + # The max_splits option in the Arrow C++ library controls the maximum number + # of places at which the string is split, whereas the argument n to + # str_split() controls the maximum number of pieces to return. So we must + # subtract 1 from n to get max_splits. + Expression$create( + arrow_fun, + string, + options = list( + pattern = opts$pattern, + reverse = FALSE, + max_splits = n - 1L + ) + ) +} + +nse_funcs$pmin <- function(..., na.rm = FALSE) { + build_expr( + "min_element_wise", + ..., + options = list(skip_nulls = na.rm) + ) +} + +nse_funcs$pmax <- function(..., na.rm = FALSE) { + build_expr( + "max_element_wise", + ..., + options = list(skip_nulls = na.rm) + ) +} + +nse_funcs$str_pad <- function(string, width, side = c("left", "right", "both"), pad = " ") { + assert_that(is_integerish(width)) + side <- match.arg(side) + assert_that(is.string(pad)) + + if (side == "left") { + pad_func <- "utf8_lpad" + } else if (side == "right") { + pad_func <- "utf8_rpad" + } else if (side == "both") { + pad_func <- "utf8_center" + } + + Expression$create( + pad_func, + string, + options = list(width = width, padding = pad) + ) +} + +nse_funcs$startsWith <- function(x, prefix) { + Expression$create( + "starts_with", + x, + options = list(pattern = prefix) + ) +} + +nse_funcs$endsWith <- function(x, suffix) { + Expression$create( + "ends_with", + x, + options = list(pattern = suffix) + ) +} + +nse_funcs$str_starts <- function(string, pattern, negate = FALSE) { + opts <- get_stringr_pattern_options(enexpr(pattern)) + if (opts$fixed) { + out <- nse_funcs$startsWith(x = string, prefix = opts$pattern) + } else { + out <- nse_funcs$grepl(pattern = paste0("^", opts$pattern), x = string, fixed = FALSE) + } + + if (negate) { + out <- !out + } + out +} + +nse_funcs$str_ends <- function(string, pattern, negate = FALSE) { + opts <- get_stringr_pattern_options(enexpr(pattern)) + if (opts$fixed) { + out <- nse_funcs$endsWith(x = string, suffix = opts$pattern) + } else { + out <- nse_funcs$grepl(pattern = paste0(opts$pattern, "$"), x = string, fixed = FALSE) + } + + if (negate) { + out <- !out + } + out +} + +nse_funcs$str_count <- function(string, pattern) { + opts <- get_stringr_pattern_options(enexpr(pattern)) + if (!is.string(pattern)) { + arrow_not_supported("`pattern` must be a length 1 character vector; other values") + } + arrow_fun <- ifelse(opts$fixed, "count_substring", "count_substring_regex") + Expression$create( + arrow_fun, + string, + options = list(pattern = opts$pattern, ignore_case = opts$ignore_case) + ) +} + +# String function helpers + +# format `pattern` as needed for case insensitivity and literal matching by RE2 +format_string_pattern <- function(pattern, ignore.case, fixed) { + # Arrow lacks 
native support for case-insensitive literal string matching and + # replacement, so we use the regular expression engine (RE2) to do this. + # https://github.com/google/re2/wiki/Syntax + if (ignore.case) { + if (fixed) { + # Everything between "\Q" and "\E" is treated as literal text. + # If the search text contains any literal "\E" strings, make them + # lowercase so they won't signal the end of the literal text: + pattern <- gsub("\\E", "\\e", pattern, fixed = TRUE) + pattern <- paste0("\\Q", pattern, "\\E") + } + # Prepend "(?i)" for case-insensitive matching + pattern <- paste0("(?i)", pattern) + } + pattern +} + +# format `replacement` as needed for literal replacement by RE2 +format_string_replacement <- function(replacement, ignore.case, fixed) { + # Arrow lacks native support for case-insensitive literal string + # replacement, so we use the regular expression engine (RE2) to do this. + # https://github.com/google/re2/wiki/Syntax + if (ignore.case && fixed) { + # Escape single backslashes in the regex replacement text so they are + # interpreted as literal backslashes: + replacement <- gsub("\\", "\\\\", replacement, fixed = TRUE) + } + replacement +} + +#' Get `stringr` pattern options +#' +#' This function assigns definitions for the `stringr` pattern modifier +#' functions (`fixed()`, `regex()`, etc.) inside itself, and uses them to +#' evaluate the quoted expression `pattern`, returning a list that is used +#' to control pattern matching behavior in internal `arrow` functions. +#' +#' @param pattern Unevaluated expression containing a call to a `stringr` +#' pattern modifier function +#' +#' @return List containing elements `pattern`, `fixed`, and `ignore_case` +#' @keywords internal +get_stringr_pattern_options <- function(pattern) { + fixed <- function(pattern, ignore_case = FALSE, ...) { + check_dots(...) + list(pattern = pattern, fixed = TRUE, ignore_case = ignore_case) + } + regex <- function(pattern, ignore_case = FALSE, ...) { + check_dots(...) + list(pattern = pattern, fixed = FALSE, ignore_case = ignore_case) + } + coll <- function(...) { + arrow_not_supported("Pattern modifier `coll()`") + } + boundary <- function(...) { + arrow_not_supported("Pattern modifier `boundary()`") + } + check_dots <- function(...) { + dots <- list(...) + if (length(dots)) { + warning( + "Ignoring pattern modifier ", + ngettext(length(dots), "argument ", "arguments "), + "not supported in Arrow: ", + oxford_paste(names(dots)), + call. = FALSE + ) + } + } + ensure_opts <- function(opts) { + if (is.character(opts)) { + opts <- list(pattern = opts, fixed = FALSE, ignore_case = FALSE) + } + opts + } + ensure_opts(eval(pattern)) +} + +#' Does this string contain regex metacharacters? +#' +#' @param string String to be tested +#' @keywords internal +#' @return Logical: does `string` contain regex metacharacters? +contains_regex <- function(string) { + grepl("[.\\|()[{^$*+?]", string) +} + +nse_funcs$strptime <- function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, unit = "ms") { + # Arrow uses unit for time parsing, strptime() does not. + # Arrow has no default option for strptime (format, unit), + # we suggest following format = "%Y-%m-%d %H:%M:%S", unit = MILLI/1L/"ms", + # (ARROW-12809) + + # ParseTimestampStrptime currently ignores the timezone information (ARROW-12820). + # Stop if tz is provided. 
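A minimal usage sketch of the string bindings above, assuming the arrow and dplyr packages are attached; the `fruit` column and its values are hypothetical and not part of the upstream sources:

library(arrow)
library(dplyr)

tbl <- Table$create(fruit = c("apple", "banana", NA))

tbl %>%
  mutate(
    has_an  = grepl("an", fruit),     # translated to match_substring_regex
    upper   = str_to_upper(fruit),    # translated to utf8_upper
    first_2 = str_sub(fruit, 1, 2)    # translated to utf8_slice_codeunits
  ) %>%
  collect()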
+ if (is.character(tz)) { + arrow_not_supported("Time zone argument") + } + + unit <- make_valid_time_unit(unit, c(valid_time64_units, valid_time32_units)) + + Expression$create("strptime", x, options = list(format = format, unit = unit)) +} + +nse_funcs$strftime <- function(x, format = "", tz = "", usetz = FALSE) { + if (usetz) { + format <- paste(format, "%Z") + } + if (tz == "") { + tz <- Sys.timezone() + } + # Arrow's strftime prints in timezone of the timestamp. To match R's strftime behavior we first + # cast the timestamp to desired timezone. This is a metadata only change. + if (nse_funcs$is_timestamp(x)) { + ts <- Expression$create("cast", x, options = list(to_type = timestamp(x$type()$unit(), tz))) + } else { + ts <- x + } + Expression$create("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME"))) +} + +nse_funcs$format_ISO8601 <- function(x, usetz = FALSE, precision = NULL, ...) { + ISO8601_precision_map <- + list( + y = "%Y", + ym = "%Y-%m", + ymd = "%Y-%m-%d", + ymdh = "%Y-%m-%dT%H", + ymdhm = "%Y-%m-%dT%H:%M", + ymdhms = "%Y-%m-%dT%H:%M:%S" + ) + + if (is.null(precision)) { + precision <- "ymdhms" + } + if (!precision %in% names(ISO8601_precision_map)) { + abort( + paste( + "`precision` must be one of the following values:", + paste(names(ISO8601_precision_map), collapse = ", "), + "\nValue supplied was: ", + precision + ) + ) + } + format <- ISO8601_precision_map[[precision]] + if (usetz) { + format <- paste0(format, "%z") + } + Expression$create("strftime", x, options = list(format = format, locale = "C")) +} + +nse_funcs$second <- function(x) { + Expression$create("add", Expression$create("second", x), Expression$create("subsecond", x)) +} + +nse_funcs$trunc <- function(x, ...) { + # accepts and ignores ... 
for consistency with base::trunc() + build_expr("trunc", x) +} + +nse_funcs$round <- function(x, digits = 0) { + build_expr( + "round", + x, + options = list(ndigits = digits, round_mode = RoundMode$HALF_TO_EVEN) + ) +} + +nse_funcs$wday <- function(x, + label = FALSE, + abbr = TRUE, + week_start = getOption("lubridate.week.start", 7), + locale = Sys.getlocale("LC_TIME")) { + if (label) { + if (abbr) { + format <- "%a" + } else { + format <- "%A" + } + return(Expression$create("strftime", x, options = list(format = format, locale = locale))) + } + + Expression$create("day_of_week", x, options = list(count_from_zero = FALSE, week_start = week_start)) +} + +nse_funcs$log <- nse_funcs$logb <- function(x, base = exp(1)) { + # like other binary functions, either `x` or `base` can be Expression or double(1) + if (is.numeric(x) && length(x) == 1) { + x <- Expression$scalar(x) + } else if (!inherits(x, "Expression")) { + arrow_not_supported("x must be a column or a length-1 numeric; other values") + } + + # handle `base` differently because we use the simpler ln, log2, and log10 + # functions for specific scalar base values + if (inherits(base, "Expression")) { + return(Expression$create("logb_checked", x, base)) + } + + if (!is.numeric(base) || length(base) != 1) { + arrow_not_supported("base must be a column or a length-1 numeric; other values") + } + + if (base == exp(1)) { + return(Expression$create("ln_checked", x)) + } + + if (base == 2) { + return(Expression$create("log2_checked", x)) + } + + if (base == 10) { + return(Expression$create("log10_checked", x)) + } + + Expression$create("logb_checked", x, Expression$scalar(base)) +} + +nse_funcs$if_else <- function(condition, true, false, missing = NULL) { + if (!is.null(missing)) { + return(nse_funcs$if_else( + nse_funcs$is.na(condition), + missing, + nse_funcs$if_else(condition, true, false) + )) + } + + # if_else doesn't yet support factors/dictionaries + # TODO: remove this after ARROW-13358 is merged + warn_types <- nse_funcs$is.factor(true) | nse_funcs$is.factor(false) + if (warn_types) { + warning( + "Dictionaries (in R: factors) are currently converted to strings (characters) ", + "in if_else and ifelse", + call. = FALSE + ) + } + + build_expr("if_else", condition, true, false) +} + +# Although base R ifelse allows `yes` and `no` to be different classes +nse_funcs$ifelse <- function(test, yes, no) { + nse_funcs$if_else(condition = test, true = yes, false = no) +} + +nse_funcs$case_when <- function(...) { + formulas <- list2(...) 
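An illustrative sketch (not from the upstream sources) of the rounding, conditional, and logarithm bindings above; the `x` column and values are hypothetical:

library(arrow)
library(dplyr)

tbl <- Table$create(x = c(1.234, 2.5, NA))

tbl %>%
  mutate(
    rounded = round(x, 1),                      # uses RoundMode$HALF_TO_EVEN
    size    = if_else(x > 1.5, "big", "small"), # Arrow if_else kernel
    logged  = log(x, base = 10)                 # dispatches to log10_checked
  ) %>%
  collect()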
+ n <- length(formulas) + if (n == 0) { + abort("No cases provided in case_when()") + } + query <- vector("list", n) + value <- vector("list", n) + mask <- caller_env() + for (i in seq_len(n)) { + f <- formulas[[i]] + if (!inherits(f, "formula")) { + abort("Each argument to case_when() must be a two-sided formula") + } + query[[i]] <- arrow_eval(f[[2]], mask) + value[[i]] <- arrow_eval(f[[3]], mask) + if (!nse_funcs$is.logical(query[[i]])) { + abort("Left side of each formula in case_when() must be a logical expression") + } + if (inherits(value[[i]], "try-error")) { + abort(handle_arrow_not_supported(value[[i]], format_expr(f[[3]]))) + } + } + build_expr( + "case_when", + args = c( + build_expr( + "make_struct", + args = query, + options = list(field_names = as.character(seq_along(query))) + ), + value + ) + ) +} + +# Aggregation functions +# These all return a list of: +# @param fun string function name +# @param data Expression (these are all currently a single field) +# @param options list of function options, as passed to call_function +# For group-by aggregation, `hash_` gets prepended to the function name. +# So to see a list of available hash aggregation functions, +# you can use list_compute_functions("^hash_") +agg_funcs <- list() +agg_funcs$sum <- function(..., na.rm = FALSE) { + list( + fun = "sum", + data = ensure_one_arg(list2(...), "sum"), + options = list(skip_nulls = na.rm, min_count = 0L) + ) +} +agg_funcs$any <- function(..., na.rm = FALSE) { + list( + fun = "any", + data = ensure_one_arg(list2(...), "any"), + options = list(skip_nulls = na.rm, min_count = 0L) + ) +} +agg_funcs$all <- function(..., na.rm = FALSE) { + list( + fun = "all", + data = ensure_one_arg(list2(...), "all"), + options = list(skip_nulls = na.rm, min_count = 0L) + ) +} +agg_funcs$mean <- function(x, na.rm = FALSE) { + list( + fun = "mean", + data = x, + options = list(skip_nulls = na.rm, min_count = 0L) + ) +} +agg_funcs$sd <- function(x, na.rm = FALSE, ddof = 1) { + list( + fun = "stddev", + data = x, + options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) + ) +} +agg_funcs$var <- function(x, na.rm = FALSE, ddof = 1) { + list( + fun = "variance", + data = x, + options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) + ) +} +agg_funcs$quantile <- function(x, probs, na.rm = FALSE) { + if (length(probs) != 1) { + arrow_not_supported("quantile() with length(probs) != 1") + } + # TODO: Bind to the Arrow function that returns an exact quantile and remove + # this warning (ARROW-14021) + warn( + "quantile() currently returns an approximate quantile in Arrow", + .frequency = ifelse(is_interactive(), "once", "always"), + .frequency_id = "arrow.quantile.approximate" + ) + list( + fun = "tdigest", + data = x, + options = list(skip_nulls = na.rm, q = probs) + ) +} +agg_funcs$median <- function(x, na.rm = FALSE) { + # TODO: Bind to the Arrow function that returns an exact median and remove + # this warning (ARROW-14021) + warn( + "median() currently returns an approximate median in Arrow", + .frequency = ifelse(is_interactive(), "once", "always"), + .frequency_id = "arrow.median.approximate" + ) + list( + fun = "approximate_median", + data = x, + options = list(skip_nulls = na.rm) + ) +} +agg_funcs$n_distinct <- function(..., na.rm = FALSE) { + list( + fun = "count_distinct", + data = ensure_one_arg(list2(...), "n_distinct"), + options = list(na.rm = na.rm) + ) +} +agg_funcs$n <- function() { + list( + fun = "sum", + data = Expression$scalar(1L), + options = list() + ) +} +agg_funcs$min <- 
function(..., na.rm = FALSE) { + list( + fun = "min", + data = ensure_one_arg(list2(...), "min"), + options = list(skip_nulls = na.rm, min_count = 0L) + ) +} +agg_funcs$max <- function(..., na.rm = FALSE) { + list( + fun = "max", + data = ensure_one_arg(list2(...), "max"), + options = list(skip_nulls = na.rm, min_count = 0L) + ) +} + +ensure_one_arg <- function(args, fun) { + if (length(args) == 0) { + arrow_not_supported(paste0(fun, "() with 0 arguments")) + } else if (length(args) > 1) { + arrow_not_supported(paste0("Multiple arguments to ", fun, "()")) + } + args[[1]] +} + +output_type <- function(fun, input_type, hash) { + # These are quick and dirty heuristics. + if (fun %in% c("any", "all")) { + bool() + } else if (fun %in% "sum") { + # It may upcast to a bigger type but this is close enough + input_type + } else if (fun %in% c("mean", "stddev", "variance", "approximate_median")) { + float64() + } else if (fun %in% "tdigest") { + if (hash) { + fixed_size_list_of(float64(), 1L) + } else { + float64() + } + } else { + # Just so things don't error, assume the resulting type is the same + input_type + } +} diff --git a/src/arrow/r/R/dplyr-group-by.R b/src/arrow/r/R/dplyr-group-by.R new file mode 100644 index 000000000..66b867210 --- /dev/null +++ b/src/arrow/r/R/dplyr-group-by.R @@ -0,0 +1,86 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +# The following S3 methods are registered on load if dplyr is present + +group_by.arrow_dplyr_query <- function(.data, + ..., + .add = FALSE, + add = .add, + .drop = dplyr::group_by_drop_default(.data)) { + .data <- as_adq(.data) + new_groups <- enquos(...) + # ... can contain expressions (i.e. can add (or rename?) columns) and so we + # need to identify those and add them on to the query with mutate. Specifically, + # we want to mark as new: + # * expressions (named or otherwise) + # * variables that have new names + # All others (i.e. simple references to variables) should not be (re)-added + + # Identify any groups with names which aren't in names of .data + new_group_ind <- map_lgl(new_groups, ~ !(quo_name(.x) %in% names(.data))) + # Identify any groups which don't have names + named_group_ind <- map_lgl(names(new_groups), nzchar) + # Retain any new groups identified above + new_groups <- new_groups[new_group_ind | named_group_ind] + if (length(new_groups)) { + # now either use the name that was given in ... 
or if that is "" then use the expr + names(new_groups) <- imap_chr(new_groups, ~ ifelse(.y == "", quo_name(.x), .y)) + + # Add them to the data + .data <- dplyr::mutate(.data, !!!new_groups) + } + if (".add" %in% names(formals(dplyr::group_by))) { + # For compatibility with dplyr >= 1.0 + gv <- dplyr::group_by_prepare(.data, ..., .add = .add)$group_names + } else { + gv <- dplyr::group_by_prepare(.data, ..., add = add)$group_names + } + .data$group_by_vars <- gv + .data$drop_empty_groups <- ifelse(length(gv), .drop, dplyr::group_by_drop_default(.data)) + .data +} +group_by.Dataset <- group_by.ArrowTabular <- group_by.arrow_dplyr_query + +groups.arrow_dplyr_query <- function(x) syms(dplyr::group_vars(x)) +groups.Dataset <- groups.ArrowTabular <- function(x) NULL + +group_vars.arrow_dplyr_query <- function(x) x$group_by_vars +group_vars.Dataset <- function(x) NULL +group_vars.RecordBatchReader <- function(x) NULL +group_vars.ArrowTabular <- function(x) { + x$r_metadata$attributes$.group_vars +} + +# the logical literal in the two functions below controls the default value of +# the .drop argument to group_by() +group_by_drop_default.arrow_dplyr_query <- + function(.tbl) .tbl$drop_empty_groups %||% TRUE +group_by_drop_default.Dataset <- group_by_drop_default.ArrowTabular <- + function(.tbl) TRUE + +ungroup.arrow_dplyr_query <- function(x, ...) { + x$group_by_vars <- character() + x$drop_empty_groups <- NULL + x +} +ungroup.Dataset <- force +ungroup.ArrowTabular <- function(x) { + x$r_metadata$attributes$.group_vars <- NULL + x +} diff --git a/src/arrow/r/R/dplyr-join.R b/src/arrow/r/R/dplyr-join.R new file mode 100644 index 000000000..c14b1a8f3 --- /dev/null +++ b/src/arrow/r/R/dplyr-join.R @@ -0,0 +1,126 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +# The following S3 methods are registered on load if dplyr is present + +do_join <- function(x, + y, + by = NULL, + copy = FALSE, + suffix = c(".x", ".y"), + ..., + keep = FALSE, + na_matches, + join_type) { + # TODO: handle `copy` arg: ignore? + # TODO: handle `suffix` arg: Arrow does prefix + # TODO: handle `keep` arg: "Should the join keys from both ‘x’ and ‘y’ be preserved in the output?" 
+ # TODO: handle `na_matches` arg + x <- as_adq(x) + y <- as_adq(y) + by <- handle_join_by(by, x, y) + + x$join <- list( + type = JoinType[[join_type]], + right_data = y, + by = by + ) + collapse.arrow_dplyr_query(x) +} + +left_join.arrow_dplyr_query <- function(x, + y, + by = NULL, + copy = FALSE, + suffix = c(".x", ".y"), + ..., + keep = FALSE) { + do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "LEFT_OUTER") +} +left_join.Dataset <- left_join.ArrowTabular <- left_join.arrow_dplyr_query + +right_join.arrow_dplyr_query <- function(x, + y, + by = NULL, + copy = FALSE, + suffix = c(".x", ".y"), + ..., + keep = FALSE) { + do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "RIGHT_OUTER") +} +right_join.Dataset <- right_join.ArrowTabular <- right_join.arrow_dplyr_query + +inner_join.arrow_dplyr_query <- function(x, + y, + by = NULL, + copy = FALSE, + suffix = c(".x", ".y"), + ..., + keep = FALSE) { + do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "INNER") +} +inner_join.Dataset <- inner_join.ArrowTabular <- inner_join.arrow_dplyr_query + +full_join.arrow_dplyr_query <- function(x, + y, + by = NULL, + copy = FALSE, + suffix = c(".x", ".y"), + ..., + keep = FALSE) { + do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "FULL_OUTER") +} +full_join.Dataset <- full_join.ArrowTabular <- full_join.arrow_dplyr_query + +semi_join.arrow_dplyr_query <- function(x, + y, + by = NULL, + copy = FALSE, + suffix = c(".x", ".y"), + ..., + keep = FALSE) { + do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "LEFT_SEMI") +} +semi_join.Dataset <- semi_join.ArrowTabular <- semi_join.arrow_dplyr_query + +anti_join.arrow_dplyr_query <- function(x, + y, + by = NULL, + copy = FALSE, + suffix = c(".x", ".y"), + ..., + keep = FALSE) { + do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "LEFT_ANTI") +} +anti_join.Dataset <- anti_join.ArrowTabular <- anti_join.arrow_dplyr_query + +handle_join_by <- function(by, x, y) { + if (is.null(by)) { + return(set_names(intersect(names(x), names(y)))) + } + stopifnot(is.character(by)) + if (is.null(names(by))) { + by <- set_names(by) + } + # TODO: nicer messages? + stopifnot( + all(names(by) %in% names(x)), + all(by %in% names(y)) + ) + by +} diff --git a/src/arrow/r/R/dplyr-mutate.R b/src/arrow/r/R/dplyr-mutate.R new file mode 100644 index 000000000..2e5239484 --- /dev/null +++ b/src/arrow/r/R/dplyr-mutate.R @@ -0,0 +1,140 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
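A usage sketch of the join bindings defined just above (illustrative only; the tables and columns are hypothetical):

library(arrow)
library(dplyr)

orders    <- Table$create(id = c(1L, 2L, 3L), qty = c(10L, 5L, 8L))
customers <- Table$create(id = c(1L, 2L, 4L), name = c("a", "b", "c"))

orders %>%
  left_join(customers, by = "id") %>%   # JoinType LEFT_OUTER
  collect()

# With by = NULL, handle_join_by() falls back to the shared column names ("id")
orders %>%
  semi_join(customers) %>%              # JoinType LEFT_SEMI
  collect()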
+ + +# The following S3 methods are registered on load if dplyr is present + +mutate.arrow_dplyr_query <- function(.data, + ..., + .keep = c("all", "used", "unused", "none"), + .before = NULL, + .after = NULL) { + call <- match.call() + exprs <- ensure_named_exprs(quos(...)) + + .keep <- match.arg(.keep) + .before <- enquo(.before) + .after <- enquo(.after) + + if (.keep %in% c("all", "unused") && length(exprs) == 0) { + # Nothing to do + return(.data) + } + + .data <- as_adq(.data) + + # Restrict the cases we support for now + has_aggregations <- any(unlist(lapply(exprs, all_funs)) %in% names(agg_funcs)) + if (has_aggregations) { + # ARROW-13926 + # mutate() on a grouped dataset does calculations within groups + # This doesn't matter on scalar ops (arithmetic etc.) but it does + # for things with aggregations (e.g. subtracting the mean) + return(abandon_ship(call, .data, "window functions not currently supported in Arrow")) + } + + mask <- arrow_mask(.data) + results <- list() + for (i in seq_along(exprs)) { + # Iterate over the indices and not the names because names may be repeated + # (which overwrites the previous name) + new_var <- names(exprs)[i] + results[[new_var]] <- arrow_eval(exprs[[i]], mask) + if (inherits(results[[new_var]], "try-error")) { + msg <- handle_arrow_not_supported( + results[[new_var]], + format_expr(exprs[[i]]) + ) + return(abandon_ship(call, .data, msg)) + } else if (!inherits(results[[new_var]], "Expression") && + !is.null(results[[new_var]])) { + # We need some wrapping to handle literal values + if (length(results[[new_var]]) != 1) { + msg <- paste0("In ", new_var, " = ", format_expr(exprs[[i]]), ", only values of size one are recycled") + return(abandon_ship(call, .data, msg)) + } + results[[new_var]] <- Expression$scalar(results[[new_var]]) + } + # Put it in the data mask too + mask[[new_var]] <- mask$.data[[new_var]] <- results[[new_var]] + } + + old_vars <- names(.data$selected_columns) + # Note that this is names(exprs) not names(results): + # if results$new_var is NULL, that means we are supposed to remove it + new_vars <- names(exprs) + + # Assign the new columns into the .data$selected_columns + for (new_var in new_vars) { + .data$selected_columns[[new_var]] <- results[[new_var]] + } + + # Deduplicate new_vars and remove NULL columns from new_vars + new_vars <- intersect(new_vars, names(.data$selected_columns)) + + # Respect .before and .after + if (!quo_is_null(.before) || !quo_is_null(.after)) { + new <- setdiff(new_vars, old_vars) + .data <- dplyr::relocate(.data, all_of(new), .before = !!.before, .after = !!.after) + } + + # Respect .keep + if (.keep == "none") { + .data$selected_columns <- .data$selected_columns[new_vars] + } else if (.keep != "all") { + # "used" or "unused" + used_vars <- unlist(lapply(exprs, all.vars), use.names = FALSE) + if (.keep == "used") { + .data$selected_columns[setdiff(old_vars, used_vars)] <- NULL + } else { + # "unused" + .data$selected_columns[intersect(old_vars, used_vars)] <- NULL + } + } + # Even if "none", we still keep group vars + ensure_group_vars(.data) +} +mutate.Dataset <- mutate.ArrowTabular <- mutate.arrow_dplyr_query + +transmute.arrow_dplyr_query <- function(.data, ...) { + dots <- check_transmute_args(...) 
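An illustrative sketch of the mutate() semantics implemented above, including the fallback path for window-style aggregations (column names are hypothetical):

library(arrow)
library(dplyr)

tbl <- Table$create(x = 1:4, g = c("a", "a", "b", "b"))

# Scalar expressions are translated to Arrow expressions
tbl %>%
  mutate(x2 = x * 2L, .keep = "used") %>%
  collect()

# Aggregations inside mutate() are not yet run in Arrow: on a Table this warns
# and falls back to dplyr in R; on a Dataset it errors (see abandon_ship())
tbl %>%
  group_by(g) %>%
  mutate(centered = x - mean(x)) %>%
  collect()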
+ dplyr::mutate(.data, !!!dots, .keep = "none") +} +transmute.Dataset <- transmute.ArrowTabular <- transmute.arrow_dplyr_query + +# This function is a copy of dplyr:::check_transmute_args at +# https://github.com/tidyverse/dplyr/blob/master/R/mutate.R +check_transmute_args <- function(..., .keep, .before, .after) { + if (!missing(.keep)) { + abort("`transmute()` does not support the `.keep` argument") + } + if (!missing(.before)) { + abort("`transmute()` does not support the `.before` argument") + } + if (!missing(.after)) { + abort("`transmute()` does not support the `.after` argument") + } + enquos(...) +} + +ensure_named_exprs <- function(exprs) { + # Check for unnamed expressions and fix if any + unnamed <- !nzchar(names(exprs)) + # Deparse and take the first element in case they're long expressions + names(exprs)[unnamed] <- map_chr(exprs[unnamed], format_expr) + exprs +} diff --git a/src/arrow/r/R/dplyr-select.R b/src/arrow/r/R/dplyr-select.R new file mode 100644 index 000000000..9a867ced9 --- /dev/null +++ b/src/arrow/r/R/dplyr-select.R @@ -0,0 +1,125 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +# The following S3 methods are registered on load if dplyr is present + +tbl_vars.arrow_dplyr_query <- function(x) names(x$selected_columns) + +select.arrow_dplyr_query <- function(.data, ...) { + check_select_helpers(enexprs(...)) + column_select(as_adq(.data), !!!enquos(...)) +} +select.Dataset <- select.ArrowTabular <- select.arrow_dplyr_query + +rename.arrow_dplyr_query <- function(.data, ...) { + check_select_helpers(enexprs(...)) + column_select(as_adq(.data), !!!enquos(...), .FUN = vars_rename) +} +rename.Dataset <- rename.ArrowTabular <- rename.arrow_dplyr_query + +column_select <- function(.data, ..., .FUN = vars_select) { + # .FUN is either tidyselect::vars_select or tidyselect::vars_rename + # It operates on the names() of selected_columns, i.e. 
the column names + # factoring in any renaming that may already have happened + out <- .FUN(names(.data), !!!enquos(...)) + # Make sure that the resulting selected columns map back to the original data, + # as in when there are multiple renaming steps + .data$selected_columns <- set_names(.data$selected_columns[out], names(out)) + + # If we've renamed columns, we need to project that renaming into other + # query parameters we've collected + renamed <- out[names(out) != out] + if (length(renamed)) { + # Massage group_by + gbv <- .data$group_by_vars + renamed_groups <- gbv %in% renamed + gbv[renamed_groups] <- names(renamed)[match(gbv[renamed_groups], renamed)] + .data$group_by_vars <- gbv + # No need to massage filters because those contain references to Arrow objects + } + .data +} + +relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL) { + # The code in this function is adapted from the code in dplyr::relocate.data.frame + # at https://github.com/tidyverse/dplyr/blob/master/R/relocate.R + # TODO: revisit this after https://github.com/tidyverse/dplyr/issues/5829 + + .data <- as_adq(.data) + + # Assign the schema to the expressions + map(.data$selected_columns, ~ (.$schema <- .data$.data$schema)) + + # Create a mask for evaluating expressions in tidyselect helpers + mask <- new_environment(.cache$functions, parent = caller_env()) + + to_move <- eval_select(substitute(c(...)), .data$selected_columns, mask) + + .before <- enquo(.before) + .after <- enquo(.after) + has_before <- !quo_is_null(.before) + has_after <- !quo_is_null(.after) + + if (has_before && has_after) { + abort("Must supply only one of `.before` and `.after`.") + } else if (has_before) { + where <- min(unname(eval_select(quo_get_expr(.before), .data$selected_columns, mask))) + if (!where %in% to_move) { + to_move <- c(to_move, where) + } + } else if (has_after) { + where <- max(unname(eval_select(quo_get_expr(.after), .data$selected_columns, mask))) + if (!where %in% to_move) { + to_move <- c(where, to_move) + } + } else { + where <- 1L + if (!where %in% to_move) { + to_move <- c(to_move, where) + } + } + + lhs <- setdiff(seq2(1, where - 1), to_move) + rhs <- setdiff(seq2(where + 1, length(.data$selected_columns)), to_move) + + pos <- vec_unique(c(lhs, to_move, rhs)) + new_names <- names(pos) + .data$selected_columns <- .data$selected_columns[pos] + + if (!is.null(new_names)) { + names(.data$selected_columns)[new_names != ""] <- new_names[new_names != ""] + } + .data +} +relocate.Dataset <- relocate.ArrowTabular <- relocate.arrow_dplyr_query + +check_select_helpers <- function(exprs) { + # Throw an error if unsupported tidyselect selection helpers in `exprs` + exprs <- lapply(exprs, function(x) if (is_quosure(x)) quo_get_expr(x) else x) + unsup_select_helpers <- "where" + funs_in_exprs <- unlist(lapply(exprs, all_funs)) + unsup_funs <- funs_in_exprs[funs_in_exprs %in% unsup_select_helpers] + if (length(unsup_funs)) { + stop( + "Unsupported selection ", + ngettext(length(unsup_funs), "helper: ", "helpers: "), + oxford_paste(paste0(unsup_funs, "()"), quote = FALSE), + call. = FALSE + ) + } +} diff --git a/src/arrow/r/R/dplyr-summarize.R b/src/arrow/r/R/dplyr-summarize.R new file mode 100644 index 000000000..a6b7a3592 --- /dev/null +++ b/src/arrow/r/R/dplyr-summarize.R @@ -0,0 +1,289 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +# The following S3 methods are registered on load if dplyr is present + +summarise.arrow_dplyr_query <- function(.data, ...) { + call <- match.call() + .data <- as_adq(.data) + exprs <- quos(...) + # Only retain the columns we need to do our aggregations + vars_to_keep <- unique(c( + unlist(lapply(exprs, all.vars)), # vars referenced in summarise + dplyr::group_vars(.data) # vars needed for grouping + )) + # If exprs rely on the results of previous exprs + # (total = sum(x), mean = total / n()) + # then not all vars will correspond to columns in the data, + # so don't try to select() them (use intersect() to exclude them) + # Note that this select() isn't useful for the Arrow summarize implementation + # because it will effectively project to keep what it needs anyway, + # but the data.frame fallback version does benefit from select here + .data <- dplyr::select(.data, intersect(vars_to_keep, names(.data))) + + # Try stuff, if successful return() + out <- try(do_arrow_summarize(.data, ...), silent = TRUE) + if (inherits(out, "try-error")) { + return(abandon_ship(call, .data, format(out))) + } else { + return(out) + } +} +summarise.Dataset <- summarise.ArrowTabular <- summarise.arrow_dplyr_query + +# This is the Arrow summarize implementation +do_arrow_summarize <- function(.data, ..., .groups = NULL) { + exprs <- ensure_named_exprs(quos(...)) + + # Create a stateful environment for recording our evaluated expressions + # It's more complex than other places because a single summarize() expr + # may result in multiple query nodes (Aggregate, Project), + # and we have to walk through the expressions to disentangle them. + ctx <- env( + mask = arrow_mask(.data, aggregation = TRUE), + aggregations = empty_named_list(), + post_mutate = empty_named_list() + ) + for (i in seq_along(exprs)) { + # Iterate over the indices and not the names because names may be repeated + # (which overwrites the previous name) + summarize_eval( + names(exprs)[i], + exprs[[i]], + ctx, + length(.data$group_by_vars) > 0 + ) + } + + # Apply the results to the .data object. 
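A sketch of the summarize() translation described above, where an expression mixing aggregations is split into temporary aggregations followed by a projection (names are hypothetical):

library(arrow)
library(dplyr)

tbl <- Table$create(x = c(1, 2, 3, 4), g = c("a", "a", "b", "b"))

tbl %>%
  group_by(g) %>%
  summarize(
    total = sum(x, na.rm = TRUE),  # a single hash aggregation
    avg   = sum(x) / n()           # split into ..temp aggregations + a mutate
  ) %>%
  collect()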
+ # First, the aggregations + .data$aggregations <- ctx$aggregations + # Then collapse the query so that the resulting query object can have + # additional operations applied to it + out <- collapse.arrow_dplyr_query(.data) + # The expressions may have been translated into + # "first, aggregate, then transform the result further" + # nolint start + # For example, + # summarize(mean = sum(x) / n()) + # is effectively implemented as + # summarize(..temp0 = sum(x), ..temp1 = n()) %>% + # mutate(mean = ..temp0 / ..temp1) %>% + # select(-starts_with("..temp")) + # If this is the case, there will be expressions in post_mutate + # nolint end + if (length(ctx$post_mutate)) { + # Append post_mutate, and make sure order is correct + # according to input exprs (also dropping ..temp columns) + out$selected_columns <- c( + out$selected_columns, + ctx$post_mutate + )[c(.data$group_by_vars, names(exprs))] + } + + # If the object has .drop = FALSE and any group vars are dictionaries, + # we can't (currently) preserve the empty rows that dplyr does, + # so give a warning about that. + if (!dplyr::group_by_drop_default(.data)) { + group_by_exprs <- .data$selected_columns[.data$group_by_vars] + if (any(map_lgl(group_by_exprs, ~ inherits(.$type(), "DictionaryType")))) { + warning( + ".drop = FALSE currently not supported in Arrow aggregation", + call. = FALSE + ) + } + } + + # Handle .groups argument + if (length(.data$group_by_vars)) { + if (is.null(.groups)) { + # dplyr docs say: + # When ‘.groups’ is not specified, it is chosen based on the + # number of rows of the results: + # • If all the results have 1 row, you get "drop_last". + # • If the number of rows varies, you get "keep". + # + # But we don't support anything that returns multiple rows now + .groups <- "drop_last" + } else { + assert_that(is.string(.groups)) + } + if (.groups == "drop_last") { + out$group_by_vars <- head(.data$group_by_vars, -1) + } else if (.groups == "keep") { + out$group_by_vars <- .data$group_by_vars + } else if (.groups == "rowwise") { + stop(arrow_not_supported('.groups = "rowwise"')) + } else if (.groups == "drop") { + # collapse() preserves groups so remove them + out <- dplyr::ungroup(out) + } else { + stop(paste("Invalid .groups argument:", .groups)) + } + # TODO: shouldn't we be doing something with `drop_empty_groups` in summarize? (ARROW-14044) + out$drop_empty_groups <- .data$drop_empty_groups + } + out +} + +arrow_eval_or_stop <- function(expr, mask) { + # TODO: change arrow_eval error handling behavior? + out <- arrow_eval(expr, mask) + if (inherits(out, "try-error")) { + msg <- handle_arrow_not_supported(out, format_expr(expr)) + stop(msg, call. 
= FALSE) + } + out +} + +summarize_projection <- function(.data) { + c( + map(.data$aggregations, ~ .$data), + .data$selected_columns[.data$group_by_vars] + ) +} + +format_aggregation <- function(x) { + paste0(x$fun, "(", x$data$ToString(), ")") +} + +# This function handles each summarize expression and turns it into the +# appropriate combination of (1) aggregations (possibly temporary) and +# (2) post-aggregation transformations (mutate) +# The function returns nothing: it assigns into the `ctx` environment +summarize_eval <- function(name, quosure, ctx, hash, recurse = FALSE) { + expr <- quo_get_expr(quosure) + ctx$quo_env <- quo_get_env(quosure) + + funs_in_expr <- all_funs(expr) + if (length(funs_in_expr) == 0) { + # If it is a scalar or field ref, no special handling required + ctx$aggregations[[name]] <- arrow_eval_or_stop(quosure, ctx$mask) + return() + } + + # For the quantile() binding in the hash aggregation case, we need to mutate + # the list output from the Arrow hash_tdigest kernel to flatten it into a + # column of type float64. We do that by modifying the unevaluated expression + # to replace quantile(...) with arrow_list_element(quantile(...), 0L) + if (hash && "quantile" %in% funs_in_expr) { + expr <- wrap_hash_quantile(expr) + funs_in_expr <- all_funs(expr) + } + + # Start inspecting the expr to see what aggregations it involves + agg_funs <- names(agg_funcs) + outer_agg <- funs_in_expr[1] %in% agg_funs + inner_agg <- funs_in_expr[-1] %in% agg_funs + + # First, pull out any aggregations wrapped in other function calls + if (any(inner_agg)) { + expr <- extract_aggregations(expr, ctx) + } + + # By this point, there are no more aggregation functions in expr + # except for possibly the outer function call: + # they've all been pulled out to ctx$aggregations, and in their place in expr + # there are variable names, which will correspond to field refs in the + # query object after aggregation and collapse(). + # So if we want to know if there are any aggregations inside expr, + # we have to look for them by their new var names + inner_agg_exprs <- all_vars(expr) %in% names(ctx$aggregations) + + if (outer_agg) { + # This is something like agg(fun(x, y) + # It just works by normal arrow_eval, unless there's a mix of aggs and + # columns in the original data like agg(fun(x, agg(x))) + # (but that will have been caught in extract_aggregations()) + ctx$aggregations[[name]] <- arrow_eval_or_stop( + as_quosure(expr, ctx$quo_env), + ctx$mask + ) + return() + } else if (all(inner_agg_exprs)) { + # Something like: fun(agg(x), agg(y)) + # So based on the aggregations that have been extracted, mutate after + mutate_mask <- arrow_mask( + list(selected_columns = make_field_refs(names(ctx$aggregations))) + ) + ctx$post_mutate[[name]] <- arrow_eval_or_stop( + as_quosure(expr, ctx$quo_env), + mutate_mask + ) + return() + } + + # Backstop for any other odd cases, like fun(x, y) (i.e. no aggregation), + # or aggregation functions that aren't supported in Arrow (not in agg_funcs) + stop( + handle_arrow_not_supported(quo_get_expr(quosure), format_expr(quosure)), + call. 
= FALSE + ) +} + +# This function recurses through expr, pulls out any aggregation expressions, +# and inserts a variable name (field ref) in place of the aggregation +extract_aggregations <- function(expr, ctx) { + # Keep the input in case we need to raise an error message with it + original_expr <- expr + funs <- all_funs(expr) + if (length(funs) == 0) { + return(expr) + } else if (length(funs) > 1) { + # Recurse more + expr[-1] <- lapply(expr[-1], extract_aggregations, ctx) + } + if (funs[1] %in% names(agg_funcs)) { + inner_agg_exprs <- all_vars(expr) %in% names(ctx$aggregations) + if (any(inner_agg_exprs) & !all(inner_agg_exprs)) { + # We can't aggregate over a combination of dataset columns and other + # aggregations (e.g. sum(x - mean(x))) + # TODO: support in ARROW-13926 + # TODO: Add "because" arg to explain _why_ it's not supported? + # TODO: this message could also say "not supported in summarize()" + # since some of these expressions may be legal elsewhere + stop( + handle_arrow_not_supported(original_expr, format_expr(original_expr)), + call. = FALSE + ) + } + + # We have an aggregation expression with no other aggregations inside it, + # so arrow_eval the expression on the data and give it a ..temp name prefix, + # then insert that name (symbol) back into the expression so that we can + # mutate() on the result of the aggregation and reference this field. + tmpname <- paste0("..temp", length(ctx$aggregations)) + ctx$aggregations[[tmpname]] <- arrow_eval_or_stop(as_quosure(expr, ctx$quo_env), ctx$mask) + expr <- as.symbol(tmpname) + } + expr +} + +# This function recurses through expr and wraps each call to quantile() with a +# call to arrow_list_element() +wrap_hash_quantile <- function(expr) { + if (length(expr) == 1) { + return(expr) + } else { + if (is.call(expr) && expr[[1]] == quote(quantile)) { + return(str2lang(paste0("arrow_list_element(", deparse1(expr), ", 0L)"))) + } else { + return(as.call(lapply(expr, wrap_hash_quantile))) + } + } +} diff --git a/src/arrow/r/R/dplyr.R b/src/arrow/r/R/dplyr.R new file mode 100644 index 000000000..e6f67c066 --- /dev/null +++ b/src/arrow/r/R/dplyr.R @@ -0,0 +1,259 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include expression.R +#' @include record-batch.R +#' @include table.R + +arrow_dplyr_query <- function(.data) { + # An arrow_dplyr_query is a container for an Arrow data object (Table, + # RecordBatch, or Dataset) and the state of the user's dplyr query--things + # like selected columns, filters, and group vars. 
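A sketch of how the query container described above accumulates select/rename, filter, and grouping state lazily; the dataset is mtcars for illustration, and filter() is defined elsewhere in the package:

library(arrow)
library(dplyr)

ds <- InMemoryDataset$create(mtcars)

q <- ds %>%
  select(mpg, cyl, hp) %>%
  rename(horsepower = hp) %>%
  filter(cyl > 4) %>%
  group_by(cyl)

class(q)    # "arrow_dplyr_query"
print(q)    # shows column types, the filter expression, and the grouping
collect(q)  # evaluation happens only here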
+ # An arrow_dplyr_query can contain another arrow_dplyr_query in .data + gv <- dplyr::group_vars(.data) %||% character() + + if (!inherits(.data, c("Dataset", "arrow_dplyr_query", "RecordBatchReader"))) { + .data <- InMemoryDataset$create(.data) + } + # Evaluating expressions on a dataset with duplicated fieldnames will error + dupes <- duplicated(names(.data)) + if (any(dupes)) { + abort(c( + "Duplicated field names", + x = paste0( + "The following field names were found more than once in the data: ", + oxford_paste(names(.data)[dupes]) + ) + )) + } + structure( + list( + .data = .data, + # selected_columns is a named list: + # * contents are references/expressions pointing to the data + # * names are the names they should be in the end (i.e. this + # records any renaming) + selected_columns = make_field_refs(names(.data$schema)), + # filtered_rows will be an Expression + filtered_rows = TRUE, + # group_by_vars is a character vector of columns (as renamed) + # in the data. They will be kept when data is pulled into R. + group_by_vars = gv, + # drop_empty_groups is a logical value indicating whether to drop + # groups formed by factor levels that don't appear in the data. It + # should be non-null only when the data is grouped. + drop_empty_groups = NULL, + # arrange_vars will be a list of expressions named by their associated + # column names + arrange_vars = list(), + # arrange_desc will be a logical vector indicating the sort order for each + # expression in arrange_vars (FALSE for ascending, TRUE for descending) + arrange_desc = logical() + ), + class = "arrow_dplyr_query" + ) +} + +# The only difference between `arrow_dplyr_query()` and `as_adq()` is that if +# `.data` is already an `arrow_dplyr_query`, `as_adq()`, will return it as is, but +# `arrow_dplyr_query()` will nest it inside a new `arrow_dplyr_query`. The only +# place where `arrow_dplyr_query()` should be called directly is inside +# `collapse()` methods; everywhere else, call `as_adq()`. +as_adq <- function(.data) { + # For most dplyr methods, + # method.Table == method.RecordBatch == method.Dataset == method.arrow_dplyr_query + # This works because the functions all pass .data through as_adq() + if (inherits(.data, "arrow_dplyr_query")) { + return(.data) + } + arrow_dplyr_query(.data) +} + +make_field_refs <- function(field_names) { + set_names(lapply(field_names, Expression$field_ref), field_names) +} + +#' @export +print.arrow_dplyr_query <- function(x, ...) 
{ + schm <- x$.data$schema + types <- map_chr(x$selected_columns, function(expr) { + name <- expr$field_name + if (nzchar(name)) { + # Just a field_ref, so look up in the schema + schm$GetFieldByName(name)$type$ToString() + } else { + # Expression, so get its type and append the expression + paste0( + expr$type(schm)$ToString(), + " (", expr$ToString(), ")" + ) + } + }) + fields <- paste(names(types), types, sep = ": ", collapse = "\n") + cat(class(source_data(x))[1], " (query)\n", sep = "") + cat(fields, "\n", sep = "") + cat("\n") + if (length(x$aggregations)) { + cat("* Aggregations:\n") + aggs <- paste0(names(x$aggregations), ": ", map_chr(x$aggregations, format_aggregation), collapse = "\n") + cat(aggs, "\n", sep = "") + } + if (!isTRUE(x$filtered_rows)) { + filter_string <- x$filtered_rows$ToString() + cat("* Filter: ", filter_string, "\n", sep = "") + } + if (length(x$group_by_vars)) { + cat("* Grouped by ", paste(x$group_by_vars, collapse = ", "), "\n", sep = "") + } + if (length(x$arrange_vars)) { + arrange_strings <- map_chr(x$arrange_vars, function(x) x$ToString()) + cat( + "* Sorted by ", + paste( + paste0( + arrange_strings, + " [", ifelse(x$arrange_desc, "desc", "asc"), "]" + ), + collapse = ", " + ), + "\n", + sep = "" + ) + } + cat("See $.data for the source Arrow object\n") + invisible(x) +} + +# These are the names reflecting all select/rename, not what is in Arrow +#' @export +names.arrow_dplyr_query <- function(x) names(x$selected_columns) + +#' @export +dim.arrow_dplyr_query <- function(x) { + cols <- length(names(x)) + + if (is_collapsed(x)) { + # Don't evaluate just for nrow + rows <- NA_integer_ + } else if (isTRUE(x$filtered_rows)) { + rows <- x$.data$num_rows + } else { + rows <- Scanner$create(x)$CountRows() + } + c(rows, cols) +} + +#' @export +as.data.frame.arrow_dplyr_query <- function(x, row.names = NULL, optional = FALSE, ...) { + collect.arrow_dplyr_query(x, as_data_frame = TRUE, ...) +} + +#' @export +head.arrow_dplyr_query <- function(x, n = 6L, ...) { + x$head <- n + collapse.arrow_dplyr_query(x) +} + +#' @export +tail.arrow_dplyr_query <- function(x, n = 6L, ...) { + x$tail <- n + collapse.arrow_dplyr_query(x) +} + +#' @export +`[.arrow_dplyr_query` <- function(x, i, j, ..., drop = FALSE) { + x <- ensure_group_vars(x) + if (nargs() == 2L) { + # List-like column extraction (x[i]) + return(x[, i]) + } + if (!missing(j)) { + x <- select.arrow_dplyr_query(x, all_of(j)) + } + + if (!missing(i)) { + out <- take_dataset_rows(x, i) + x <- restore_dplyr_features(out, x) + } + x +} + +ensure_group_vars <- function(x) { + if (inherits(x, "arrow_dplyr_query")) { + # Before pulling data from Arrow, make sure all group vars are in the projection + gv <- set_names(setdiff(dplyr::group_vars(x), names(x))) + if (length(gv)) { + # Add them back + x$selected_columns <- c( + x$selected_columns, + make_field_refs(gv) + ) + } + } + x +} + +ensure_arrange_vars <- function(x) { + # The arrange() operation is not performed until later, because: + # - It must be performed after mutate(), to enable sorting by new columns. + # - It should be performed after filter() and select(), for efficiency. + # However, we need users to be able to arrange() by columns and expressions + # that are *not* returned in the query result. To enable this, we must + # *temporarily* include these columns and expressions in the projection. We + # use x$temp_columns to store these. Later, after the arrange() operation has + # been performed, these are omitted from the result. 
This differs from the + # columns in x$group_by_vars which *are* returned in the result. + x$temp_columns <- x$arrange_vars[!names(x$arrange_vars) %in% names(x$selected_columns)] + x +} + +# Helper to handle unsupported dplyr features +# * For Table/RecordBatch, we collect() and then call the dplyr method in R +# * For Dataset, we just error +abandon_ship <- function(call, .data, msg) { + msg <- trimws(msg) + dplyr_fun_name <- sub("^(.*?)\\..*", "\\1", as.character(call[[1]])) + if (query_on_dataset(.data)) { + stop(msg, "\nCall collect() first to pull data into R.", call. = FALSE) + } + # else, collect and call dplyr method + warning(msg, "; pulling data into R", immediate. = TRUE, call. = FALSE) + call$.data <- dplyr::collect(.data) + call[[1]] <- get(dplyr_fun_name, envir = asNamespace("dplyr")) + eval.parent(call, 2) +} + +query_on_dataset <- function(x) !inherits(source_data(x), "InMemoryDataset") + +source_data <- function(x) { + if (is_collapsed(x)) { + source_data(x$.data) + } else { + x$.data + } +} + +is_collapsed <- function(x) inherits(x$.data, "arrow_dplyr_query") + +has_aggregation <- function(x) { + # TODO: update with joins (check right side data too) + !is.null(x$aggregations) || (is_collapsed(x) && has_aggregation(x$.data)) +} + +has_head_tail <- function(x) { + !is.null(x$head) || !is.null(x$tail) || (is_collapsed(x) && has_head_tail(x$.data)) +} diff --git a/src/arrow/r/R/duckdb.R b/src/arrow/r/R/duckdb.R new file mode 100644 index 000000000..c772d4fbd --- /dev/null +++ b/src/arrow/r/R/duckdb.R @@ -0,0 +1,165 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Create a (virtual) DuckDB table from an Arrow object +#' +#' This will do the necessary configuration to create a (virtual) table in DuckDB +#' that is backed by the Arrow object given. No data is copied or modified until +#' `collect()` or `compute()` are called or a query is run against the table. +#' +#' The result is a dbplyr-compatible object that can be used in d(b)plyr pipelines. +#' +#' If `auto_disconnect = TRUE`, the DuckDB table that is created will be configured +#' to be unregistered when the `tbl` object is garbage collected. This is helpful +#' if you don't want to have extra table objects in DuckDB after you've finished +#' using them. Currently, this cleanup can, however, sometimes lead to hangs if +#' tables are created and deleted in quick succession, hence the default value +#' of `FALSE` +#' +#' @param .data the Arrow object (e.g. Dataset, Table) to use for the DuckDB table +#' @param con a DuckDB connection to use (default will create one and store it +#' in `options("arrow_duck_con")`) +#' @param table_name a name to use in DuckDB for this object. The default is a +#' unique string `"arrow_"` followed by numbers. 
+#' @param auto_disconnect should the table be automatically cleaned up when the
+#'   resulting object is removed (and garbage collected)? Default: `FALSE`
+#'
+#' @return A `tbl` of the new table in DuckDB
+#'
+#' @name to_duckdb
+#' @export
+#' @examplesIf getFromNamespace("run_duckdb_examples", "arrow")()
+#' library(dplyr)
+#'
+#' ds <- InMemoryDataset$create(mtcars)
+#'
+#' ds %>%
+#'   filter(mpg < 30) %>%
+#'   to_duckdb() %>%
+#'   group_by(cyl) %>%
+#'   summarize(mean_mpg = mean(mpg, na.rm = TRUE))
+to_duckdb <- function(.data,
+                      con = arrow_duck_connection(),
+                      table_name = unique_arrow_tablename(),
+                      auto_disconnect = FALSE) {
+  .data <- as_adq(.data)
+  duckdb::duckdb_register_arrow(con, table_name, .data)
+
+  tbl <- tbl(con, table_name)
+  groups <- dplyr::groups(.data)
+  if (length(groups)) {
+    tbl <- dplyr::group_by(tbl, !!!groups)
+  }
+
+  if (auto_disconnect) {
+    # this will add the correct clean-up of the table (and connection) when the
+    # tbl is garbage collected.
+    # we should probably confirm that this use of src$disco is kosher.
+    tbl$src$disco <- duckdb_disconnector(con, table_name)
+  }
+
+  tbl
+}
+
+arrow_duck_connection <- function() {
+  con <- getOption("arrow_duck_con")
+  if (is.null(con) || !DBI::dbIsValid(con)) {
+    con <- DBI::dbConnect(duckdb::duckdb())
+    # Use the same CPU count that the arrow library is set to
+    DBI::dbExecute(con, paste0("PRAGMA threads=", cpu_count()))
+    options(arrow_duck_con = con)
+  }
+  con
+}
+
+# helper function to determine if duckdb examples should run
+# see: https://github.com/r-lib/roxygen2/issues/1242
+run_duckdb_examples <- function() {
+  arrow_with_dataset() &&
+    requireNamespace("duckdb", quietly = TRUE) &&
+    packageVersion("duckdb") > "0.2.7" &&
+    requireNamespace("dplyr", quietly = TRUE) &&
+    requireNamespace("dbplyr", quietly = TRUE)
+}
+
+# Adapted from dbplyr
+unique_arrow_tablename <- function() {
+  i <- getOption("arrow_table_name", 0) + 1
+  options(arrow_table_name = i)
+  sprintf("arrow_%03i", i)
+}
+
+# Creates an environment that disconnects the database when it's GC'd
+duckdb_disconnector <- function(con, tbl_name) {
+  reg.finalizer(environment(), function(...) {
+    # remove the table we ephemerally created (though only if the connection is
+    # still valid)
+    if (DBI::dbIsValid(con)) {
+      duckdb::duckdb_unregister_arrow(con, tbl_name)
+    }
+
+    # and, if there are no more tables, we can safely shut the connection down
+    if (length(DBI::dbListTables(con)) == 0) {
+      DBI::dbDisconnect(con, shutdown = TRUE)
+    }
+  })
+  environment()
+}
+
+#' Create an Arrow object from others
+#'
+#' This can be used in pipelines that pass data back and forth between Arrow and
+#' other processes (like DuckDB).
+#'
+#' @param .data the object to be converted
+#'
+#' @return an `arrow_dplyr_query` object, to be used in dplyr pipelines.
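# A short sketch of what an arrow_dplyr_query looks like in practice: dplyr
# verbs on an Arrow object build up the query structure defined earlier in this
# section, and printing it shows the projected fields, filter, and grouping.
# Assumes the dplyr package is installed and Arrow was built with dataset support.
library(dplyr)
q <- arrow::InMemoryDataset$create(mtcars) %>%
  filter(mpg > 20) %>%
  select(mpg, cyl) %>%
  group_by(cyl)
print(q)    # shows the selected columns, the filter expression, and the groups
collect(q)  # evaluates the query and pulls the result into R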
+#' @export +#' +#' @examplesIf getFromNamespace("run_duckdb_examples", "arrow")() +#' library(dplyr) +#' +#' ds <- InMemoryDataset$create(mtcars) +#' +#' ds %>% +#' filter(mpg < 30) %>% +#' to_duckdb() %>% +#' group_by(cyl) %>% +#' summarize(mean_mpg = mean(mpg, na.rm = TRUE)) %>% +#' to_arrow() %>% +#' collect() +to_arrow <- function(.data) { + # If this is an Arrow object already, return quickly since we're already Arrow + if (inherits(.data, c("arrow_dplyr_query", "ArrowObject"))) { + return(.data) + } + + # For now, we only handle .data from duckdb, so check that it is that if we've + # gotten this far + if (!inherits(dbplyr::remote_con(.data), "duckdb_connection")) { + stop( + "to_arrow() currently only supports Arrow tables, Arrow datasets, ", + "Arrow queries, or dbplyr tbls from duckdb connections", + call. = FALSE + ) + } + + # Run the query + res <- DBI::dbSendQuery(dbplyr::remote_con(.data), dbplyr::remote_query(.data), arrow = TRUE) + + # TODO: we shouldn't need $read_table(), but we get segfaults when we do. + arrow_dplyr_query(duckdb::duckdb_fetch_record_batch(res)$read_table()) +} diff --git a/src/arrow/r/R/enums.R b/src/arrow/r/R/enums.R new file mode 100644 index 000000000..4e69b7a19 --- /dev/null +++ b/src/arrow/r/R/enums.R @@ -0,0 +1,178 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @export +`print.arrow-enum` <- function(x, ...) 
{ + NextMethod() +} + +enum <- function(class, ..., .list = list(...)) { + structure( + .list, + class = c(class, "arrow-enum") + ) +} + +#' Arrow enums +#' @name enums +#' @export +#' @keywords internal +TimeUnit <- enum("TimeUnit::type", + SECOND = 0L, MILLI = 1L, MICRO = 2L, NANO = 3L +) + +#' @rdname enums +#' @export +DateUnit <- enum("DateUnit", DAY = 0L, MILLI = 1L) + +#' @rdname enums +#' @export +Type <- enum("Type::type", + "NA" = 0L, + BOOL = 1L, + UINT8 = 2L, + INT8 = 3L, + UINT16 = 4L, + INT16 = 5L, + UINT32 = 6L, + INT32 = 7L, + UINT64 = 8L, + INT64 = 9L, + HALF_FLOAT = 10L, + FLOAT = 11L, + DOUBLE = 12L, + STRING = 13L, + BINARY = 14L, + FIXED_SIZE_BINARY = 15L, + DATE32 = 16L, + DATE64 = 17L, + TIMESTAMP = 18L, + TIME32 = 19L, + TIME64 = 20L, + INTERVAL_MONTHS = 21L, + INTERVAL_DAY_TIME = 22L, + DECIMAL = 23L, + DECIMAL256 = 24L, + LIST = 25L, + STRUCT = 26L, + SPARSE_UNION = 27L, + DENSE_UNION = 28L, + DICTIONARY = 29L, + MAP = 30L, + EXTENSION = 31L, + FIXED_SIZE_LIST = 32L, + DURATION = 33L, + LARGE_STRING = 34L, + LARGE_BINARY = 35L, + LARGE_LIST = 36L +) + +TYPES_WITH_NAN <- Type[c("HALF_FLOAT", "FLOAT", "DOUBLE")] + +#' @rdname enums +#' @export +StatusCode <- enum("StatusCode", + OK = 0L, OutOfMemory = 1L, KeyError = 2L, TypeError = 3L, + Invalid = 4L, IOError = 5L, CapacityError = 6L, IndexError = 7L, + UnknownError = 9L, NotImplemented = 10L, SerializationError = 11L, + PythonError = 12L, RError = 13L, + PlasmaObjectExists = 20L, PlasmaObjectNotFound = 21L, + PlasmaStoreFull = 22L, PlasmaObjectAlreadySealed = 23L +) + +#' @rdname enums +#' @export +FileMode <- enum("FileMode", + READ = 0L, WRITE = 1L, READWRITE = 2L +) + +#' @rdname enums +#' @export +MessageType <- enum("MessageType", + NONE = 0L, SCHEMA = 1L, DICTIONARY_BATCH = 2L, RECORD_BATCH = 3L, TENSOR = 4L +) + +#' @rdname enums +#' @export +CompressionType <- enum("Compression::type", + UNCOMPRESSED = 0L, SNAPPY = 1L, GZIP = 2L, BROTLI = 3L, ZSTD = 4L, LZ4 = 5L, + LZ4_FRAME = 6L, LZO = 7L, BZ2 = 8L +) + +#' @export +#' @rdname enums +FileType <- enum("FileType", + NotFound = 0L, Unknown = 1L, File = 2L, Directory = 3L +) + +#' @export +#' @rdname enums +ParquetVersionType <- enum("ParquetVersionType", + PARQUET_1_0 = 0L, PARQUET_2_0 = 1L +) + +#' @export +#' @rdname enums +MetadataVersion <- enum("MetadataVersion", + V1 = 0L, V2 = 1L, V3 = 2L, V4 = 3L, V5 = 4L +) + +#' @export +#' @rdname enums +QuantileInterpolation <- enum("QuantileInterpolation", + LINEAR = 0L, LOWER = 1L, HIGHER = 2L, NEAREST = 3L, MIDPOINT = 4L +) + +#' @export +#' @rdname enums +NullEncodingBehavior <- enum("NullEncodingBehavior", + ENCODE = 0L, MASK = 1L +) + +#' @export +#' @rdname enums +NullHandlingBehavior <- enum("NullHandlingBehavior", + EMIT_NULL = 0L, SKIP = 1L, REPLACE = 2L +) + +#' @export +#' @rdname enums +RoundMode <- enum("RoundMode", + DOWN = 0L, + UP = 1L, + TOWARDS_ZERO = 2L, + TOWARDS_INFINITY = 3L, + HALF_DOWN = 4L, + HALF_UP = 5L, + HALF_TOWARDS_ZERO = 6L, + HALF_TOWARDS_INFINITY = 7L, + HALF_TO_EVEN = 8L, + HALF_TO_ODD = 9L +) + +#' @export +#' @rdname enums +JoinType <- enum("JoinType", + LEFT_SEMI = 0L, + RIGHT_SEMI = 1L, + LEFT_ANTI = 2L, + RIGHT_ANTI = 3L, + INNER = 4L, + LEFT_OUTER = 5L, + RIGHT_OUTER = 6L, + FULL_OUTER = 7L +) diff --git a/src/arrow/r/R/expression.R b/src/arrow/r/R/expression.R new file mode 100644 index 000000000..b1b6635f5 --- /dev/null +++ b/src/arrow/r/R/expression.R @@ -0,0 +1,240 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license 
agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include arrowExports.R + +.unary_function_map <- list( + # NOTE: Each of the R functions mapped here takes exactly *one* argument, maps + # *directly* to an Arrow C++ compute kernel, and does not require any + # non-default options to be specified. More complex R function mappings are + # defined in dplyr-functions.R. + + # functions are arranged alphabetically by name within categories + + # arithmetic functions + "abs" = "abs_checked", + "ceiling" = "ceil", + "floor" = "floor", + "log10" = "log10_checked", + "log1p" = "log1p_checked", + "log2" = "log2_checked", + "sign" = "sign", + # trunc is defined in dplyr-functions.R + + # trigonometric functions + "acos" = "acos_checked", + "asin" = "asin_checked", + "cos" = "cos_checked", + "sin" = "sin_checked", + "tan" = "tan_checked", + + # logical functions + "!" = "invert", + + # string functions + # nchar is defined in dplyr-functions.R + "str_length" = "utf8_length", + # str_pad is defined in dplyr-functions.R + # str_sub is defined in dplyr-functions.R + # str_to_lower is defined in dplyr-functions.R + # str_to_title is defined in dplyr-functions.R + # str_to_upper is defined in dplyr-functions.R + # str_trim is defined in dplyr-functions.R + "stri_reverse" = "utf8_reverse", + # substr is defined in dplyr-functions.R + # substring is defined in dplyr-functions.R + "tolower" = "utf8_lower", + "toupper" = "utf8_upper", + + # date and time functions + "day" = "day", + "hour" = "hour", + "isoweek" = "iso_week", + "epiweek" = "us_week", + "isoyear" = "iso_year", + "minute" = "minute", + "month" = "month", + "quarter" = "quarter", + # second is defined in dplyr-functions.R + # wday is defined in dplyr-functions.R + "yday" = "day_of_year", + "year" = "year", + + # type conversion functions + "as.factor" = "dictionary_encode" +) + +.binary_function_map <- list( + # NOTE: Each of the R functions/operators mapped here takes exactly *two* + # arguments. Most map *directly* to an Arrow C++ compute kernel and require no + # non-default options, but some are modified by build_expr(). More complex R + # function/operator mappings are defined in dplyr-functions.R. + "==" = "equal", + "!=" = "not_equal", + ">" = "greater", + ">=" = "greater_equal", + "<" = "less", + "<=" = "less_equal", + "&" = "and_kleene", + "|" = "or_kleene", + "+" = "add_checked", + "-" = "subtract_checked", + "*" = "multiply_checked", + "/" = "divide", + "%/%" = "divide_checked", + # we don't actually use divide_checked with `%%`, rather it is rewritten to + # use `%/%` above. + "%%" = "divide_checked", + "^" = "power_checked", + "%in%" = "is_in_meta_binary" +) + +.array_function_map <- c(.unary_function_map, .binary_function_map) + +#' Arrow expressions +#' +#' @description +#' `Expression`s are used to define filter logic for passing to a [Dataset] +#' [Scanner]. 
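# A rough illustration of what these mappings mean: an R call such as x > 5 is
# translated to the Arrow compute kernel named in .binary_function_map
# ("greater"), and abs(x) to "abs_checked". The same kernels can be invoked
# directly with call_function(); this is only a sketch of the mapping, not of
# how dplyr evaluation is wired up.
a <- arrow::Array$create(c(1L, 7L, NA, 12L))
arrow::call_function("greater", a, arrow::Scalar$create(5L))
arrow::call_function("abs_checked", arrow::Array$create(c(-2L, 3L)))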
+#' +#' `Expression$scalar(x)` constructs an `Expression` which always evaluates to +#' the provided scalar (length-1) R value. +#' +#' `Expression$field_ref(name)` is used to construct an `Expression` which +#' evaluates to the named column in the `Dataset` against which it is evaluated. +#' +#' `Expression$create(function_name, ..., options)` builds a function-call +#' `Expression` containing one or more `Expression`s. +#' @name Expression +#' @rdname Expression +#' @export +Expression <- R6Class("Expression", + inherit = ArrowObject, + public = list( + ToString = function() compute___expr__ToString(self), + Equals = function(other, ...) { + inherits(other, "Expression") && compute___expr__equals(self, other) + }, + # TODO: Implement type determination without storing + # schemas in Expression objects (ARROW-13186) + schema = NULL, + type = function(schema = self$schema) { + assert_that(!is.null(schema)) + compute___expr__type(self, schema) + }, + type_id = function(schema = self$schema) { + assert_that(!is.null(schema)) + compute___expr__type_id(self, schema) + }, + cast = function(to_type, safe = TRUE, ...) { + opts <- list( + to_type = to_type, + allow_int_overflow = !safe, + allow_time_truncate = !safe, + allow_float_truncate = !safe + ) + Expression$create("cast", self, options = modifyList(opts, list(...))) + } + ), + active = list( + field_name = function() compute___expr__get_field_ref_name(self) + ) +) +Expression$create <- function(function_name, + ..., + args = list(...), + options = empty_named_list()) { + assert_that(is.string(function_name)) + assert_that(is_list_of(args, "Expression"), msg = "Expression arguments must be Expression objects") + expr <- compute___expr__call(function_name, args, options) + expr$schema <- unify_schemas(schemas = lapply(args, function(x) x$schema)) + expr +} + +Expression$field_ref <- function(name) { + assert_that(is.string(name)) + compute___expr__field_ref(name) +} +Expression$scalar <- function(x) { + expr <- compute___expr__scalar(Scalar$create(x)) + expr$schema <- schema() + expr +} + +# Wrapper around Expression$create that: +# (1) maps R function names to Arrow C++ compute ("/" --> "divide_checked") +# (2) wraps R input args as Array or Scalar +build_expr <- function(FUN, + ..., + args = list(...), + options = empty_named_list()) { + if (FUN == "-" && length(args) == 1L) { + if (inherits(args[[1]], c("ArrowObject", "Expression"))) { + return(build_expr("negate_checked", args[[1]])) + } else { + return(-args[[1]]) + } + } + if (FUN == "%in%") { + # Special-case %in%, which is different from the Array function name + expr <- Expression$create("is_in", args[[1]], + options = list( + # If args[[2]] is already an Arrow object (like a scalar), + # this wouldn't work + value_set = Array$create(args[[2]]), + skip_nulls = TRUE + ) + ) + } else { + args <- lapply(args, function(x) { + if (!inherits(x, "Expression")) { + x <- Expression$scalar(x) + } + x + }) + + # In Arrow, "divide" is one function, which does integer division on + # integer inputs and floating-point division on floats + if (FUN == "/") { + # TODO: omg so many ways it's wrong to assume these types + args <- lapply(args, function(x) x$cast(float64())) + } else if (FUN == "%/%") { + # In R, integer division works like floor(float division) + out <- build_expr("/", args = args) + return(out$cast(int32(), allow_float_truncate = TRUE)) + } else if (FUN == "%%") { + return(args[[1]] - args[[2]] * (args[[1]] %/% args[[2]])) + } + + expr <- Expression$create(.array_function_map[[FUN]] 
%||% FUN, args = args, options = options) + } + expr +} + +#' @export +Ops.Expression <- function(e1, e2) { + if (.Generic == "!") { + build_expr(.Generic, e1) + } else { + build_expr(.Generic, e1, e2) + } +} + +#' @export +is.na.Expression <- function(x) { + Expression$create("is_null", x, options = list(nan_is_null = TRUE)) +} diff --git a/src/arrow/r/R/feather.R b/src/arrow/r/R/feather.R new file mode 100644 index 000000000..70a270bbe --- /dev/null +++ b/src/arrow/r/R/feather.R @@ -0,0 +1,219 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Write data in the Feather format +#' +#' Feather provides binary columnar serialization for data frames. +#' It is designed to make reading and writing data frames efficient, +#' and to make sharing data across data analysis languages easy. +#' This function writes both the original, limited specification of the format +#' and the version 2 specification, which is the Apache Arrow IPC file format. +#' +#' @param x `data.frame`, [RecordBatch], or [Table] +#' @param sink A string file path, URI, or [OutputStream], or path in a file +#' system (`SubTreeFileSystem`) +#' @param version integer Feather file version. Version 2 is the current. +#' Version 1 is the more limited legacy format. +#' @param chunk_size For V2 files, the number of rows that each chunk of data +#' should have in the file. Use a smaller `chunk_size` when you need faster +#' random row access. Default is 64K. This option is not supported for V1. +#' @param compression Name of compression codec to use, if any. Default is +#' "lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise +#' "uncompressed". "zstd" is the other available codec and generally has better +#' compression ratios in exchange for slower read and write performance +#' See [codec_is_available()]. This option is not supported for V1. +#' @param compression_level If `compression` is "zstd", you may +#' specify an integer compression level. If omitted, the compression codec's +#' default compression level is used. +#' +#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream], +#' the stream will be left open. +#' @export +#' @seealso [RecordBatchWriter] for lower-level access to writing Arrow IPC data. +#' @seealso [Schema] for information about schemas and metadata handling. 
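# A brief sketch of the compression options described above. It assumes the
# local build of Arrow C++ has the ZSTD codec enabled, which is why the call is
# guarded with codec_is_available().
tf <- tempfile(fileext = ".feather")
if (arrow::codec_is_available("zstd")) {
  arrow::write_feather(mtcars, tf, compression = "zstd", compression_level = 3)
  df <- arrow::read_feather(tf, col_select = c("mpg", "cyl"))
}
unlink(tf)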
+#' @examplesIf arrow_available() +#' tf <- tempfile() +#' on.exit(unlink(tf)) +#' write_feather(mtcars, tf) +#' @include arrow-package.R +write_feather <- function(x, + sink, + version = 2, + chunk_size = 65536L, + compression = c("default", "lz4", "uncompressed", "zstd"), + compression_level = NULL) { + # Handle and validate options before touching data + version <- as.integer(version) + assert_that(version %in% 1:2) + compression <- match.arg(compression) + chunk_size <- as.integer(chunk_size) + assert_that(chunk_size > 0) + if (compression == "default") { + if (version == 2 && codec_is_available("lz4")) { + compression <- "lz4" + } else { + compression <- "uncompressed" + } + } + if (is.null(compression_level)) { + # Use -1 as sentinal for "default" + compression_level <- -1L + } + compression_level <- as.integer(compression_level) + # Now make sure that options make sense together + if (version == 1) { + if (chunk_size != 65536L) { + stop("Feather version 1 does not support the 'chunk_size' option", call. = FALSE) + } + if (compression != "uncompressed") { + stop("Feather version 1 does not support the 'compression' option", call. = FALSE) + } + if (compression_level != -1L) { + stop("Feather version 1 does not support the 'compression_level' option", call. = FALSE) + } + } + if (compression != "zstd" && compression_level != -1L) { + stop("Can only specify a 'compression_level' when 'compression' is 'zstd'", call. = FALSE) + } + # Finally, add 1 to version because 2 means V1 and 3 means V2 :shrug: + version <- version + 1L + + # "lz4" is the convenience + if (compression == "lz4") { + compression <- "lz4_frame" + } + + compression <- compression_from_name(compression) + + x_out <- x + if (is.data.frame(x) || inherits(x, "RecordBatch")) { + x <- Table$create(x) + } + + assert_that(is_writable_table(x)) + + if (!inherits(sink, "OutputStream")) { + sink <- make_output_stream(sink) + on.exit(sink$close()) + } + ipc___WriteFeather__Table(sink, x, version, chunk_size, compression, compression_level) + invisible(x_out) +} + +#' Read a Feather file +#' +#' Feather provides binary columnar serialization for data frames. +#' It is designed to make reading and writing data frames efficient, +#' and to make sharing data across data analysis languages easy. +#' This function reads both the original, limited specification of the format +#' and the version 2 specification, which is the Apache Arrow IPC file format. +#' +#' @inheritParams read_ipc_stream +#' @inheritParams read_delim_arrow +#' @param ... additional parameters, passed to [make_readable_file()]. +#' +#' @return A `data.frame` if `as_data_frame` is `TRUE` (the default), or an +#' Arrow [Table] otherwise +#' +#' @export +#' @seealso [FeatherReader] and [RecordBatchReader] for lower-level access to reading Arrow IPC data. +#' @examplesIf arrow_available() +#' tf <- tempfile() +#' on.exit(unlink(tf)) +#' write_feather(mtcars, tf) +#' df <- read_feather(tf) +#' dim(df) +#' # Can select columns +#' df <- read_feather(tf, col_select = starts_with("d")) +read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) { + if (!inherits(file, "RandomAccessFile")) { + file <- make_readable_file(file, ...) 
+ on.exit(file$close()) + } + reader <- FeatherReader$create(file) + + col_select <- enquo(col_select) + columns <- if (!quo_is_null(col_select)) { + vars_select(names(reader), !!col_select) + } + + out <- tryCatch( + reader$Read(columns), + error = read_compressed_error + ) + + if (isTRUE(as_data_frame)) { + out <- as.data.frame(out) + } + out +} + +#' @title FeatherReader class +#' @rdname FeatherReader +#' @name FeatherReader +#' @docType class +#' @usage NULL +#' @format NULL +#' @description This class enables you to interact with Feather files. Create +#' one to connect to a file or other InputStream, and call `Read()` on it to +#' make an `arrow::Table`. See its usage in [`read_feather()`]. +#' +#' @section Factory: +#' +#' The `FeatherReader$create()` factory method instantiates the object and +#' takes the following argument: +#' +#' - `file` an Arrow file connection object inheriting from `RandomAccessFile`. +#' +#' @section Methods: +#' +#' - `$Read(columns)`: Returns a `Table` of the selected columns, a vector of +#' integer indices +#' - `$column_names`: Active binding, returns the column names in the Feather file +#' - `$schema`: Active binding, returns the schema of the Feather file +#' - `$version`: Active binding, returns `1` or `2`, according to the Feather +#' file version +#' +#' @export +#' @include arrow-package.R +FeatherReader <- R6Class("FeatherReader", + inherit = ArrowObject, + public = list( + Read = function(columns) { + ipc___feather___Reader__Read(self, columns) + }, + print = function(...) { + cat("FeatherReader:\n") + print(self$schema) + invisible(self) + } + ), + active = list( + # versions are officially 2 for V1 and 3 for V2 :shrug: + version = function() ipc___feather___Reader__version(self) - 1L, + column_names = function() names(self$schema), + schema = function() ipc___feather___Reader__schema(self) + ) +) + +#' @export +names.FeatherReader <- function(x) x$column_names + +FeatherReader$create <- function(file) { + assert_is(file, "RandomAccessFile") + ipc___feather___Reader__Open(file) +} diff --git a/src/arrow/r/R/field.R b/src/arrow/r/R/field.R new file mode 100644 index 000000000..d10ee7818 --- /dev/null +++ b/src/arrow/r/R/field.R @@ -0,0 +1,84 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include arrow-package.R +#' @title Field class +#' @usage NULL +#' @format NULL +#' @docType class +#' @description `field()` lets you create an `arrow::Field` that maps a +#' [DataType][data-type] to a column name. Fields are contained in +#' [Schemas][Schema]. +#' @section Methods: +#' +#' - `f$ToString()`: convert to a string +#' - `f$Equals(other)`: test for equality. 
More naturally called as `f == other` +#' +#' @rdname Field +#' @name Field +#' @export +Field <- R6Class("Field", + inherit = ArrowObject, + public = list( + ToString = function() { + prettier_dictionary_type(Field__ToString(self)) + }, + Equals = function(other, ...) { + inherits(other, "Field") && Field__Equals(self, other) + }, + export_to_c = function(ptr) ExportField(self, ptr) + ), + active = list( + name = function() { + Field__name(self) + }, + nullable = function() { + Field__nullable(self) + }, + type = function() { + Field__type(self) + } + ) +) +Field$create <- function(name, type, metadata, nullable = TRUE) { + assert_that(inherits(name, "character"), length(name) == 1L) + type <- as_type(type, name) + assert_that(missing(metadata), msg = "metadata= is currently ignored") + Field__initialize(enc2utf8(name), type, nullable) +} +#' @include arrowExports.R +Field$import_from_c <- ImportField + +#' @param name field name +#' @param type logical type, instance of [DataType] +#' @param metadata currently ignored +#' @param nullable TRUE if field is nullable +#' +#' @examplesIf arrow_available() +#' field("x", int32()) +#' @rdname Field +#' @export +field <- Field$create + +.fields <- function(.list, nullable = TRUE) { + if (length(.list)) { + assert_that(!is.null(nms <- names(.list))) + map2(nms, .list, field) + } else { + list() + } +} diff --git a/src/arrow/r/R/filesystem.R b/src/arrow/r/R/filesystem.R new file mode 100644 index 000000000..a09d0a51d --- /dev/null +++ b/src/arrow/r/R/filesystem.R @@ -0,0 +1,505 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include arrow-package.R +#' @title FileSystem entry info +#' @usage NULL +#' @format NULL +#' +#' @section Methods: +#' +#' - `base_name()` : The file base name (component after the last directory +#' separator). +#' - `extension()` : The file extension +#' +#' @section Active bindings: +#' +#' - `$type`: The file type +#' - `$path`: The full file path in the filesystem +#' - `$size`: The size in bytes, if available. Only regular files are +#' guaranteed to have a size. +#' - `$mtime`: The time of last modification, if available. 
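# A quick sketch of inspecting FileInfo objects via a LocalFileSystem; the
# fields accessed here are the active bindings listed above.
fs <- arrow::LocalFileSystem$create()
info <- fs$GetFileInfo(tempdir())[[1]]
info$type   # a FileType (a directory in this case)
info$path
info$mtime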
+#' +#' @rdname FileInfo +#' @export +FileInfo <- R6Class("FileInfo", + inherit = ArrowObject, + public = list( + base_name = function() fs___FileInfo__base_name(self), + extension = function() fs___FileInfo__extension(self) + ), + active = list( + type = function(type) { + if (missing(type)) { + fs___FileInfo__type(self) + } else { + fs___FileInfo__set_type(self, type) + } + }, + path = function(path) { + if (missing(path)) { + fs___FileInfo__path(self) + } else { + invisible(fs___FileInfo__set_path(self)) + } + }, + size = function(size) { + if (missing(size)) { + fs___FileInfo__size(self) + } else { + invisible(fs___FileInfo__set_size(self, size)) + } + }, + mtime = function(time) { + if (missing(time)) { + fs___FileInfo__mtime(self) + } else { + if (!inherits(time, "POSIXct") && length(time) == 1L) { + abort("invalid time") + } + invisible(fs___FileInfo__set_mtime(self, time)) + } + } + ) +) + +#' @title file selector +#' @format NULL +#' +#' @section Factory: +#' +#' The `$create()` factory method instantiates a `FileSelector` given the 3 fields +#' described below. +#' +#' @section Fields: +#' +#' - `base_dir`: The directory in which to select files. If the path exists but +#' doesn't point to a directory, this should be an error. +#' - `allow_not_found`: The behavior if `base_dir` doesn't exist in the +#' filesystem. If `FALSE`, an error is returned. If `TRUE`, an empty +#' selection is returned +#' - `recursive`: Whether to recurse into subdirectories. +#' +#' @rdname FileSelector +#' @export +FileSelector <- R6Class("FileSelector", + inherit = ArrowObject, + active = list( + base_dir = function() fs___FileSelector__base_dir(self), + allow_not_found = function() fs___FileSelector__allow_not_found(self), + recursive = function() fs___FileSelector__recursive(self) + ) +) + +FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = FALSE) { + fs___FileSelector__create(clean_path_rel(base_dir), allow_not_found, recursive) +} + +#' @title FileSystem classes +#' @description `FileSystem` is an abstract file system API, +#' `LocalFileSystem` is an implementation accessing files +#' on the local machine. `SubTreeFileSystem` is an implementation that delegates +#' to another implementation after prepending a fixed base path +#' +#' @section Factory: +#' +#' `LocalFileSystem$create()` returns the object and takes no arguments. +#' +#' `SubTreeFileSystem$create()` takes the following arguments: +#' +#' - `base_path`, a string path +#' - `base_fs`, a `FileSystem` object +#' +#' `S3FileSystem$create()` optionally takes arguments: +#' +#' - `anonymous`: logical, default `FALSE`. If true, will not attempt to look up +#' credentials using standard AWS configuration methods. +#' - `access_key`, `secret_key`: authentication credentials. If one is provided, +#' the other must be as well. If both are provided, they will override any +#' AWS configuration set at the environment level. +#' - `session_token`: optional string for authentication along with +#' `access_key` and `secret_key` +#' - `role_arn`: string AWS ARN of an AccessRole. If provided instead of `access_key` and +#' `secret_key`, temporary credentials will be fetched by assuming this role. +#' - `session_name`: optional string identifier for the assumed role session. +#' - `external_id`: optional unique string identifier that might be required +#' when you assume a role in another account. 
+#' - `load_frequency`: integer, frequency (in seconds) with which temporary +#' credentials from an assumed role session will be refreshed. Default is +#' 900 (i.e. 15 minutes) +#' - `region`: AWS region to connect to. If omitted, the AWS library will +#' provide a sensible default based on client configuration, falling back +#' to "us-east-1" if no other alternatives are found. +#' - `endpoint_override`: If non-empty, override region with a connect string +#' such as "localhost:9000". This is useful for connecting to file systems +#' that emulate S3. +#' - `scheme`: S3 connection transport (default "https") +#' - `background_writes`: logical, whether `OutputStream` writes will be issued +#' in the background, without blocking (default `TRUE`) +#' +#' @section Methods: +#' +#' - `$GetFileInfo(x)`: `x` may be a [FileSelector][FileSelector] or a character +#' vector of paths. Returns a list of [FileInfo][FileInfo] +#' - `$CreateDir(path, recursive = TRUE)`: Create a directory and subdirectories. +#' - `$DeleteDir(path)`: Delete a directory and its contents, recursively. +#' - `$DeleteDirContents(path)`: Delete a directory's contents, recursively. +#' Like `$DeleteDir()`, +#' but doesn't delete the directory itself. Passing an empty path (`""`) will +#' wipe the entire filesystem tree. +#' - `$DeleteFile(path)` : Delete a file. +#' - `$DeleteFiles(paths)` : Delete many files. The default implementation +#' issues individual delete operations in sequence. +#' - `$Move(src, dest)`: Move / rename a file or directory. If the destination +#' exists: +#' if it is a non-empty directory, an error is returned +#' otherwise, if it has the same type as the source, it is replaced +#' otherwise, behavior is unspecified (implementation-dependent). +#' - `$CopyFile(src, dest)`: Copy a file. If the destination exists and is a +#' directory, an error is returned. Otherwise, it is replaced. +#' - `$OpenInputStream(path)`: Open an [input stream][InputStream] for +#' sequential reading. +#' - `$OpenInputFile(path)`: Open an [input file][RandomAccessFile] for random +#' access reading. +#' - `$OpenOutputStream(path)`: Open an [output stream][OutputStream] for +#' sequential writing. +#' - `$OpenAppendStream(path)`: Open an [output stream][OutputStream] for +#' appending. +#' +#' @section Active bindings: +#' +#' - `$type_name`: string filesystem type name, such as "local", "s3", etc. +#' - `$region`: string AWS region, for `S3FileSystem` and `SubTreeFileSystem` +#' containing a `S3FileSystem` +#' - `$base_fs`: for `SubTreeFileSystem`, the `FileSystem` it contains +#' - `$base_path`: for `SubTreeFileSystem`, the path in `$base_fs` which is considered +#' root in this `SubTreeFileSystem`. 
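# A compact sketch of the methods listed above, run against the local
# filesystem; the same calls work on S3FileSystem and SubTreeFileSystem objects.
fs <- arrow::LocalFileSystem$create()
d <- file.path(tempdir(), "fs-demo")
fs$CreateDir(d)
out <- fs$OpenOutputStream(file.path(d, "hello.txt"))
out$write(charToRaw("hello arrow\n"))
out$close()
fs$GetFileInfo(file.path(d, "hello.txt"))[[1]]$size
fs$DeleteDir(d)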
+#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @rdname FileSystem +#' @name FileSystem +#' @export +FileSystem <- R6Class("FileSystem", + inherit = ArrowObject, + public = list( + GetFileInfo = function(x) { + if (inherits(x, "FileSelector")) { + fs___FileSystem__GetTargetInfos_FileSelector(self, x) + } else if (is.character(x)) { + fs___FileSystem__GetTargetInfos_Paths(self, clean_path_rel(x)) + } else { + abort("incompatible type for FileSystem$GetFileInfo()") + } + }, + CreateDir = function(path, recursive = TRUE) { + fs___FileSystem__CreateDir(self, clean_path_rel(path), isTRUE(recursive)) + }, + DeleteDir = function(path) { + fs___FileSystem__DeleteDir(self, clean_path_rel(path)) + }, + DeleteDirContents = function(path) { + fs___FileSystem__DeleteDirContents(self, clean_path_rel(path)) + }, + DeleteFile = function(path) { + fs___FileSystem__DeleteFile(self, clean_path_rel(path)) + }, + DeleteFiles = function(paths) { + fs___FileSystem__DeleteFiles(self, clean_path_rel(paths)) + }, + Move = function(src, dest) { + fs___FileSystem__Move(self, clean_path_rel(src), clean_path_rel(dest)) + }, + CopyFile = function(src, dest) { + fs___FileSystem__CopyFile(self, clean_path_rel(src), clean_path_rel(dest)) + }, + OpenInputStream = function(path) { + fs___FileSystem__OpenInputStream(self, clean_path_rel(path)) + }, + OpenInputFile = function(path) { + fs___FileSystem__OpenInputFile(self, clean_path_rel(path)) + }, + OpenOutputStream = function(path) { + fs___FileSystem__OpenOutputStream(self, clean_path_rel(path)) + }, + OpenAppendStream = function(path) { + fs___FileSystem__OpenAppendStream(self, clean_path_rel(path)) + }, + + # Friendlier R user interface + path = function(x) SubTreeFileSystem$create(x, self), + cd = function(x) SubTreeFileSystem$create(x, self), + ls = function(path = "", ...) { + selector <- FileSelector$create(path, ...) # ... for recursive = TRUE + infos <- self$GetFileInfo(selector) + map_chr(infos, ~ .$path) + # TODO: add full.names argument like base::dir() (default right now is TRUE) + # TODO: see fs package for glob/regexp filtering + # TODO: verbose method that shows other attributes as df + # TODO: print methods for FileInfo, SubTreeFileSystem, S3FileSystem + } + ), + active = list( + type_name = function() fs___FileSystem__type_name(self) + ) +) +FileSystem$from_uri <- function(uri) { + assert_that(is.string(uri)) + fs___FileSystemFromUri(uri) +} + +get_paths_and_filesystem <- function(x, filesystem = NULL) { + # Wrapper around FileSystem$from_uri that handles local paths + # and an optional explicit filesystem + if (inherits(x, "SubTreeFileSystem")) { + return(list(fs = x$base_fs, path = x$base_path)) + } + assert_that(is.character(x)) + are_urls <- are_urls(x) + if (any(are_urls)) { + if (!all(are_urls)) { + stop("Vectors of mixed paths and URIs are not supported", call. = FALSE) + } + if (!is.null(filesystem)) { + # Stop? Can't have URL (which yields a fs) and another fs + } + x <- lapply(x, FileSystem$from_uri) + if (length(unique(map(x, ~ class(.$fs)))) > 1) { + stop( + "Vectors of URIs for different file systems are not supported", + call. 
= FALSE + ) + } + fs <- x[[1]]$fs + path <- map_chr(x, ~ .$path) # singular name "path" used for compatibility + } else { + fs <- filesystem %||% LocalFileSystem$create() + if (inherits(fs, "LocalFileSystem")) { + path <- clean_path_abs(x) + } else { + path <- clean_path_rel(x) + } + } + list( + fs = fs, + path = path + ) +} + +# variant of the above function that asserts that x is either a scalar string +# or a SubTreeFileSystem +get_path_and_filesystem <- function(x, filesystem = NULL) { + assert_that(is.string(x) || inherits(x, "SubTreeFileSystem")) + get_paths_and_filesystem(x, filesystem) +} + +is_url <- function(x) is.string(x) && grepl("://", x) +are_urls <- function(x) if (!is.character(x)) FALSE else grepl("://", x) + +#' @usage NULL +#' @format NULL +#' @rdname FileSystem +#' @export +LocalFileSystem <- R6Class("LocalFileSystem", inherit = FileSystem) +LocalFileSystem$create <- function() { + fs___LocalFileSystem__create() +} + +#' @usage NULL +#' @format NULL +#' @rdname FileSystem +#' @importFrom utils modifyList +#' @export +S3FileSystem <- R6Class("S3FileSystem", + inherit = FileSystem, + active = list( + region = function() fs___S3FileSystem__region(self) + ) +) +S3FileSystem$create <- function(anonymous = FALSE, ...) { + args <- list2(...) + if (anonymous) { + invalid_args <- intersect( + c( + "access_key", "secret_key", "session_token", "role_arn", "session_name", + "external_id", "load_frequency" + ), + names(args) + ) + if (length(invalid_args)) { + stop("Cannot specify ", oxford_paste(invalid_args), " when anonymous = TRUE", call. = FALSE) + } + } else { + keys_present <- length(intersect(c("access_key", "secret_key"), names(args))) + if (keys_present == 1) { + stop("Key authentication requires both access_key and secret_key", call. = FALSE) + } + if ("session_token" %in% names(args) && keys_present != 2) { + stop( + "In order to initialize a session with temporary credentials, ", + "both secret_key and access_key must be provided ", + "in addition to session_token.", + call. = FALSE + ) + } + arn <- "role_arn" %in% names(args) + if (keys_present == 2 && arn) { + stop("Cannot provide both key authentication and role_arn", call. = FALSE) + } + arn_extras <- intersect(c("session_name", "external_id", "load_frequency"), names(args)) + if (length(arn_extras) > 0 && !arn) { + stop("Cannot specify ", oxford_paste(arn_extras), " without providing a role_arn string", call. = FALSE) + } + } + args <- c(modifyList(default_s3_options, args), anonymous = anonymous) + exec(fs___S3FileSystem__create, !!!args) +} + +default_s3_options <- list( + access_key = "", + secret_key = "", + session_token = "", + role_arn = "", + session_name = "", + external_id = "", + load_frequency = 900L, + region = "", + endpoint_override = "", + scheme = "", + background_writes = TRUE +) + +#' Connect to an AWS S3 bucket +#' +#' `s3_bucket()` is a convenience function to create an `S3FileSystem` object +#' that automatically detects the bucket's AWS region and holding onto the its +#' relative path. +#' +#' @param bucket string S3 bucket name or path +#' @param ... Additional connection options, passed to `S3FileSystem$create()` +#' @return A `SubTreeFileSystem` containing an `S3FileSystem` and the bucket's +#' relative path. Note that this function's success does not guarantee that you +#' are authorized to access the bucket's contents. +#' @examplesIf arrow_with_s3() +#' bucket <- s3_bucket("ursa-labs-taxi-data") +#' @export +s3_bucket <- function(bucket, ...) 
{ + assert_that(is.string(bucket)) + args <- list2(...) + + # Use FileSystemFromUri to detect the bucket's region + if (!is_url(bucket)) { + bucket <- paste0("s3://", bucket) + } + fs_and_path <- FileSystem$from_uri(bucket) + fs <- fs_and_path$fs + # If there are no additional S3Options, we can use that filesystem + # Otherwise, take the region that was detected and make a new fs with the args + if (length(args)) { + args$region <- fs$region + fs <- exec(S3FileSystem$create, !!!args) + } + # Return a subtree pointing at that bucket path + SubTreeFileSystem$create(fs_and_path$path, fs) +} + +#' @usage NULL +#' @format NULL +#' @rdname FileSystem +#' @export +SubTreeFileSystem <- R6Class("SubTreeFileSystem", + inherit = FileSystem, + public = list( + print = function(...) { + if (inherits(self$base_fs, "LocalFileSystem")) { + cat("SubTreeFileSystem: ", "file://", self$base_path, "\n", sep = "") + } else if (inherits(self$base_fs, "S3FileSystem")) { + cat("SubTreeFileSystem: ", "s3://", self$base_path, "\n", sep = "") + } else { + cat("SubTreeFileSystem", "\n", sep = "") + } + invisible(self) + } + ), + active = list( + base_fs = function() { + fs___SubTreeFileSystem__base_fs(self) + }, + base_path = function() fs___SubTreeFileSystem__base_path(self) + ) +) +SubTreeFileSystem$create <- function(base_path, base_fs = NULL) { + fs_and_path <- get_path_and_filesystem(base_path, base_fs) + fs___SubTreeFileSystem__create(fs_and_path$path, fs_and_path$fs) +} + +#' @export +`$.SubTreeFileSystem` <- function(x, name, ...) { + # This is to allow delegating methods/properties to the base_fs + assert_that(is.string(name)) + if (name %in% ls(envir = x)) { + get(name, x) + } else if (name %in% ls(envir = x$base_fs)) { + get(name, x$base_fs) + } else { + NULL + } +} + +#' Copy files between FileSystems +#' +#' @param from A string path to a local directory or file, a URI, or a +#' `SubTreeFileSystem`. Files will be copied recursively from this path. +#' @param to A string path to a local directory or file, a URI, or a +#' `SubTreeFileSystem`. Directories will be created as necessary +#' @param chunk_size The maximum size of block to read before flushing +#' to the destination file. A larger chunk_size will use more memory while +#' copying but may help accommodate high latency FileSystems. 
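# A minimal local-to-local sketch of copy_files(); S3 sources or destinations
# work the same way through URIs or s3_bucket(), as in the examples below.
src <- file.path(tempdir(), "copy-src")
dir.create(src, showWarnings = FALSE)
write.csv(mtcars, file.path(src, "mtcars.csv"), row.names = FALSE)
arrow::copy_files(src, file.path(tempdir(), "copy-dest"))
list.files(file.path(tempdir(), "copy-dest"))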
+#' @return Nothing: called for side effects in the file system +#' @export +#' @examplesIf FALSE +#' # Copy an S3 bucket's files to a local directory: +#' copy_files("s3://your-bucket-name", "local-directory") +#' # Using a FileSystem object +#' copy_files(s3_bucket("your-bucket-name"), "local-directory") +#' # Or go the other way, from local to S3 +#' copy_files("local-directory", s3_bucket("your-bucket-name")) +copy_files <- function(from, to, chunk_size = 1024L * 1024L) { + from <- get_path_and_filesystem(from) + to <- get_path_and_filesystem(to) + invisible(fs___CopyFiles( + from$fs, + FileSelector$create(from$path, recursive = TRUE), + to$fs, + to$path, + chunk_size, + option_use_threads() + )) +} + +clean_path_abs <- function(path) { + # Make sure we have a valid, absolute, forward-slashed path for passing to Arrow + normalizePath(path, winslash = "/", mustWork = FALSE) +} + +clean_path_rel <- function(path) { + # Make sure all path separators are "/", not "\" as on Windows + path_sep <- ifelse(tolower(Sys.info()[["sysname"]]) == "windows", "\\\\", "/") + gsub(path_sep, "/", path) +} diff --git a/src/arrow/r/R/flight.R b/src/arrow/r/R/flight.R new file mode 100644 index 000000000..cde297853 --- /dev/null +++ b/src/arrow/r/R/flight.R @@ -0,0 +1,124 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Load a Python Flight server +#' +#' @param name string Python module name +#' @param path file system path where the Python module is found. Default is +#' to look in the `inst/` directory for included modules. +#' @export +#' @examplesIf FALSE +#' load_flight_server("demo_flight_server") +load_flight_server <- function(name, path = system.file(package = "arrow")) { + reticulate::import_from_path(name, path) +} + +#' Connect to a Flight server +#' +#' @param host string hostname to connect to +#' @param port integer port to connect on +#' @param scheme URL scheme, default is "grpc+tcp" +#' @return A `pyarrow.flight.FlightClient`. +#' @export +flight_connect <- function(host = "localhost", port, scheme = "grpc+tcp") { + pa <- reticulate::import("pyarrow") + location <- paste0(scheme, "://", host, ":", port) + pa$flight$FlightClient(location) +} + +#' Send data to a Flight server +#' +#' @param client `pyarrow.flight.FlightClient`, as returned by [flight_connect()] +#' @param data `data.frame`, [RecordBatch], or [Table] to upload +#' @param path string identifier to store the data under +#' @param overwrite logical: if `path` exists on `client` already, should we +#' replace it with the contents of `data`? Default is `TRUE`; if `FALSE` and +#' `path` exists, the function will error. +#' @return `client`, invisibly. 
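# A hypothetical end-to-end sketch of the Flight helpers in this file; it
# assumes a Flight server is already listening on localhost:8089 and that
# reticulate can find pyarrow. The port and the "demo/mtcars" path are placeholders.
client <- arrow::flight_connect(port = 8089)
arrow::flight_put(client, mtcars, path = "demo/mtcars")
arrow::list_flights(client)
tab <- arrow::flight_get(client, "demo/mtcars")
as.data.frame(tab)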
+#' @export +flight_put <- function(client, data, path, overwrite = TRUE) { + if (!overwrite && flight_path_exists(client, path)) { + stop(path, " exists.", call. = FALSE) + } + if (is.data.frame(data)) { + data <- Table$create(data) + } + py_data <- reticulate::r_to_py(data) + writer <- client$do_put(descriptor_for_path(path), py_data$schema)[[1]] + if (inherits(data, "RecordBatch")) { + writer$write_batch(py_data) + } else { + writer$write_table(py_data) + } + writer$close() + invisible(client) +} + +#' Get data from a Flight server +#' +#' @param client `pyarrow.flight.FlightClient`, as returned by [flight_connect()] +#' @param path string identifier under which data is stored +#' @return A [Table] +#' @export +flight_get <- function(client, path) { + reader <- flight_reader(client, path) + reader$read_all() +} + +# TODO: could use this as a RecordBatch iterator, call $read_chunk() on this +flight_reader <- function(client, path) { + info <- client$get_flight_info(descriptor_for_path(path)) + # Hack: assume a single ticket, on the same server as client is already connected + ticket <- info$endpoints[[1]]$ticket + client$do_get(ticket) +} + +descriptor_for_path <- function(path) { + pa <- reticulate::import("pyarrow") + pa$flight$FlightDescriptor$for_path(path) +} + +#' See available resources on a Flight server +#' +#' @inheritParams flight_get +#' @return `list_flights()` returns a character vector of paths. +#' `flight_path_exists()` returns a logical value, the equivalent of `path %in% list_flights()` +#' @export +list_flights <- function(client) { + generator <- client$list_flights() + out <- reticulate::iterate(generator, function(x) as.character(x$descriptor$path[[1]])) + out +} + +#' @rdname list_flights +#' @export +flight_path_exists <- function(client, path) { + it_exists <- tryCatch( + expr = { + client$get_flight_info(descriptor_for_path(path)) + TRUE + }, + error = function(e) { + msg <- conditionMessage(e) + if (!any(grepl("ArrowKeyError", msg))) { + # Raise an error if this fails for any reason other than not found + stop(e) + } + FALSE + } + ) +} diff --git a/src/arrow/r/R/install-arrow.R b/src/arrow/r/R/install-arrow.R new file mode 100644 index 000000000..3e295c543 --- /dev/null +++ b/src/arrow/r/R/install-arrow.R @@ -0,0 +1,239 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Install or upgrade the Arrow library +#' +#' Use this function to install the latest release of `arrow`, to switch to or +#' from a nightly development version, or on Linux to try reinstalling with +#' all necessary C++ dependencies. +#' +#' Note that, unlike packages like `tensorflow`, `blogdown`, and others that +#' require external dependencies, you do not need to run `install_arrow()` +#' after a successful `arrow` installation. 
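# A few illustrative invocations of the options documented below; these are
# sketches of typical calls, none of which are required after a normal,
# successful installation.
install_arrow()                 # reinstall the latest release from CRAN
install_arrow(nightly = TRUE)   # switch to a nightly development build
install_arrow(verbose = TRUE)   # print extra output while (re)installing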
+#' +#' @param nightly logical: Should we install a development version of the +#' package, or should we install from CRAN (the default). +#' @param binary On Linux, value to set for the environment variable +#' `LIBARROW_BINARY`, which governs how C++ binaries are used, if at all. +#' The default value, `TRUE`, tells the installation script to detect the +#' Linux distribution and version and find an appropriate C++ library. `FALSE` +#' would tell the script not to retrieve a binary and instead build Arrow C++ +#' from source. Other valid values are strings corresponding to a Linux +#' distribution-version, to override the value that would be detected. +#' See `vignette("install", package = "arrow")` for further details. +#' @param use_system logical: Should we use `pkg-config` to look for Arrow +#' system packages? Default is `FALSE`. If `TRUE`, source installation may be +#' faster, but there is a risk of version mismatch. This sets the +#' `ARROW_USE_PKG_CONFIG` environment variable. +#' @param minimal logical: If building from source, should we build without +#' optional dependencies (compression libraries, for example)? Default is +#' `FALSE`. This sets the `LIBARROW_MINIMAL` environment variable. +#' @param verbose logical: Print more debugging output when installing? Default +#' is `FALSE`. This sets the `ARROW_R_DEV` environment variable. +#' @param repos character vector of base URLs of the repositories to install +#' from (passed to `install.packages()`) +#' @param ... Additional arguments passed to `install.packages()` +#' @export +#' @importFrom utils install.packages +#' @seealso [arrow_available()] to see if the package was configured with +#' necessary C++ dependencies. `vignette("install", package = "arrow")` for +#' more ways to tune installation on Linux. +install_arrow <- function(nightly = FALSE, + binary = Sys.getenv("LIBARROW_BINARY", TRUE), + use_system = Sys.getenv("ARROW_USE_PKG_CONFIG", FALSE), + minimal = Sys.getenv("LIBARROW_MINIMAL", FALSE), + verbose = Sys.getenv("ARROW_R_DEV", FALSE), + repos = getOption("repos"), + ...) { + sysname <- tolower(Sys.info()[["sysname"]]) + conda <- isTRUE(grepl("conda", R.Version()$platform)) + + if (conda) { + if (nightly) { + system("conda install -y -c arrow-nightlies -c conda-forge --strict-channel-priority r-arrow") + } else { + system("conda install -y -c conda-forge --strict-channel-priority r-arrow") + } + } else { + Sys.setenv( + LIBARROW_BINARY = binary, + LIBARROW_MINIMAL = minimal, + ARROW_R_DEV = verbose, + ARROW_USE_PKG_CONFIG = use_system + ) + # On the M1, we can't use the usual autobrew, which pulls Intel dependencies + apple_m1 <- grepl("arm-apple|aarch64.*darwin", R.Version()$platform) + # On Rosetta, we have to build without JEMALLOC, so we also can't autobrew + rosetta <- identical(sysname, "darwin") && identical(system("sysctl -n sysctl.proc_translated", intern = TRUE), "1") + if (rosetta) { + Sys.setenv(ARROW_JEMALLOC = "OFF") + } + if (apple_m1 || rosetta) { + Sys.setenv(FORCE_BUNDLED_BUILD = "true") + } + + opts <- list() + if (apple_m1 || rosetta) { + # Skip binaries (esp. 
for rosetta) + opts$pkgType <- "source" + } else if (isTRUE(binary)) { + # Unless otherwise directed, don't consider newer source packages when + # options(pkgType) == "both" (default on win/mac) + opts$install.packages.check.source <- "no" + opts$install.packages.compile.from.source <- "never" + } + if (length(opts)) { + old <- options(opts) + on.exit(options(old)) + } + install.packages("arrow", repos = arrow_repos(repos, nightly), ...) + } + if ("arrow" %in% loadedNamespaces()) { + # If you've just sourced this file, "arrow" won't be (re)loaded + reload_arrow() + } +} + +arrow_repos <- function(repos = getOption("repos"), nightly = FALSE) { + if (length(repos) == 0 || identical(repos, c(CRAN = "@CRAN@"))) { + # Set the default/CDN + repos <- "https://cloud.r-project.org/" + } + dev_repo <- getOption("arrow.dev_repo", "https://arrow-r-nightly.s3.amazonaws.com") + # Remove it if it's there (so nightly=FALSE won't accidentally pull from it) + repos <- setdiff(repos, dev_repo) + if (nightly) { + # Add it first + repos <- c(dev_repo, repos) + } + repos +} + +reload_arrow <- function() { + if (requireNamespace("pkgload", quietly = TRUE)) { + is_attached <- "package:arrow" %in% search() + pkgload::unload("arrow") + if (is_attached) { + require("arrow", character.only = TRUE, quietly = TRUE) + } else { + requireNamespace("arrow", quietly = TRUE) + } + } else { + message("Please restart R to use the 'arrow' package.") + } +} + + +#' Create a source bundle that includes all thirdparty dependencies +#' +#' @param dest_file File path for the new tar.gz package. Defaults to +#' `arrow_V.V.V_with_deps.tar.gz` in the current directory (`V.V.V` is the version) +#' @param source_file File path for the input tar.gz package. Defaults to +#' downloading the package from CRAN (or whatever you have set as the first in +#' `getOption("repos")`) +#' @return The full path to `dest_file`, invisibly +#' +#' This function is used for setting up an offline build. If it's possible to +#' download at build time, don't use this function. Instead, let `cmake` +#' download the required dependencies for you. +#' These downloaded dependencies are only used in the build if +#' `ARROW_DEPENDENCY_SOURCE` is unset, `BUNDLED`, or `AUTO`. +#' https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds +#' +#' If you're using binary packages you shouldn't need to use this function. You +#' should download the appropriate binary from your package repository, transfer +#' that to the offline computer, and install that. Any OS can create the source +#' bundle, but it cannot be installed on Windows. (Instead, use a standard +#' Windows binary package.) +#' +#' Note if you're using RStudio Package Manager on Linux: If you still want to +#' make a source bundle with this function, make sure to set the first repo in +#' `options("repos")` to be a mirror that contains source packages (that is: +#' something other than the RSPM binary mirror URLs). 
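# A small sketch of how the nightly repository is prepended by arrow_repos()
# (an internal helper defined in this file, hence the ::: below), assuming no
# CRAN mirror has been customized via options("repos").
arrow:::arrow_repos(nightly = TRUE)
# c("https://arrow-r-nightly.s3.amazonaws.com", "https://cloud.r-project.org/")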
+#' +#' ## Steps for an offline install with optional dependencies: +#' +#' ### Using a computer with internet access, pre-download the dependencies: +#' * Install the `arrow` package _or_ run +#' `source("https://raw.githubusercontent.com/apache/arrow/master/r/R/install-arrow.R")` +#' * Run `create_package_with_all_dependencies("my_arrow_pkg.tar.gz")` +#' * Copy the newly created `my_arrow_pkg.tar.gz` to the computer without internet access +#' +#' ### On the computer without internet access, install the prepared package: +#' * Install the `arrow` package from the copied file +#' * `install.packages("my_arrow_pkg.tar.gz", dependencies = c("Depends", "Imports", "LinkingTo"))` +#' * This installation will build from source, so `cmake` must be available +#' * Run [arrow_info()] to check installed capabilities +#' +#' +#' @examples +#' \dontrun{ +#' new_pkg <- create_package_with_all_dependencies() +#' # Note: this works when run in the same R session, but it's meant to be +#' # copied to a different computer. +#' install.packages(new_pkg, dependencies = c("Depends", "Imports", "LinkingTo")) +#' } +#' @export +create_package_with_all_dependencies <- function(dest_file = NULL, source_file = NULL) { + if (is.null(source_file)) { + pkg_download_dir <- tempfile() + dir.create(pkg_download_dir) + on.exit(unlink(pkg_download_dir, recursive = TRUE), add = TRUE) + message("Downloading Arrow source file") + downloaded <- utils::download.packages("arrow", destdir = pkg_download_dir, type = "source") + source_file <- downloaded[1, 2, drop = TRUE] + } + if (!file.exists(source_file) || !endsWith(source_file, "tar.gz")) { + stop("Arrow package .tar.gz file not found") + } + if (is.null(dest_file)) { + # e.g. convert /path/to/arrow_5.0.0.tar.gz to ./arrow_5.0.0_with_deps.tar.gz + # (add 'with_deps' for clarity if the file was downloaded locally) + dest_file <- paste0(gsub(".tar.gz$", "", basename(source_file)), "_with_deps.tar.gz") + } + untar_dir <- tempfile() + on.exit(unlink(untar_dir, recursive = TRUE), add = TRUE) + utils::untar(source_file, exdir = untar_dir) + tools_dir <- file.path(untar_dir, "arrow/tools") + download_dependencies_sh <- file.path(tools_dir, "cpp/thirdparty/download_dependencies.sh") + # If you change this path, also need to edit nixlibs.R + download_dir <- file.path(tools_dir, "thirdparty_dependencies") + dir.create(download_dir) + + message("Downloading files to ", download_dir) + download_successful <- system2(download_dependencies_sh, download_dir, stdout = FALSE) == 0 + if (!download_successful) { + stop("Failed to download thirdparty dependencies") + } + # Need to change directory to untar_dir so tar() will use relative paths. That + # means we'll need a full, non-relative path for dest_file. (extra_flags="-C" + # doesn't work with R's internal tar) + orig_wd <- getwd() + on.exit(setwd(orig_wd), add = TRUE) + # normalizePath() may return the input unchanged if dest_file doesn't exist, + # so create it first. 
+ file.create(dest_file) + dest_file <- normalizePath(dest_file, mustWork = TRUE) + setwd(untar_dir) + + message("Repacking tar.gz file to ", dest_file) + tar_successful <- utils::tar(dest_file, compression = "gz") == 0 + if (!tar_successful) { + stop("Failed to create new tar.gz file") + } + invisible(dest_file) +} diff --git a/src/arrow/r/R/io.R b/src/arrow/r/R/io.R new file mode 100644 index 000000000..898b306a3 --- /dev/null +++ b/src/arrow/r/R/io.R @@ -0,0 +1,295 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include arrow-package.R +#' @include enums.R +#' @include buffer.R + +# OutputStream ------------------------------------------------------------ + +Writable <- R6Class("Writable", + inherit = ArrowObject, + public = list( + write = function(x) io___Writable__write(self, buffer(x)) + ) +) + +#' @title OutputStream classes +#' @description `FileOutputStream` is for writing to a file; +#' `BufferOutputStream` writes to a buffer; +#' You can create one and pass it to any of the table writers, for example. +#' @usage NULL +#' @format NULL +#' @docType class +#' @section Factory: +#' +#' The `$create()` factory methods instantiate the `OutputStream` object and +#' take the following arguments, depending on the subclass: +#' +#' - `path` For `FileOutputStream`, a character file name +#' - `initial_capacity` For `BufferOutputStream`, the size in bytes of the +#' buffer. +#' +#' @section Methods: +#' +#' - `$tell()`: return the position in the stream +#' - `$close()`: close the stream +#' - `$write(x)`: send `x` to the stream +#' - `$capacity()`: for `BufferOutputStream` +#' - `$finish()`: for `BufferOutputStream` +#' - `$GetExtentBytesWritten()`: for `MockOutputStream`, report how many bytes +#' were sent. 
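# A minimal sketch of the stream classes listed above: write an IPC stream into
# an in-memory buffer and pull the bytes back out, mirroring what write_to_raw()
# (defined later in this changeset) does; FileOutputStream$create() would
# target a file on disk instead.
sink <- BufferOutputStream$create()
write_ipc_stream(mtcars, sink)   # any of the table writers accepts an OutputStream
bytes <- as.raw(buffer(sink))    # the accumulated bytes as an R raw vector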
+#' +#' @rdname OutputStream +#' @name OutputStream +OutputStream <- R6Class("OutputStream", + inherit = Writable, + public = list( + close = function() io___OutputStream__Close(self), + tell = function() io___OutputStream__Tell(self) + ) +) + +#' @usage NULL +#' @format NULL +#' @rdname OutputStream +#' @export +FileOutputStream <- R6Class("FileOutputStream", inherit = OutputStream) +FileOutputStream$create <- function(path) { + io___FileOutputStream__Open(clean_path_abs(path)) +} + +#' @usage NULL +#' @format NULL +#' @rdname OutputStream +#' @export +BufferOutputStream <- R6Class("BufferOutputStream", + inherit = OutputStream, + public = list( + capacity = function() io___BufferOutputStream__capacity(self), + finish = function() io___BufferOutputStream__Finish(self), + write = function(bytes) io___BufferOutputStream__Write(self, bytes), + tell = function() io___BufferOutputStream__Tell(self) + ) +) +BufferOutputStream$create <- function(initial_capacity = 0L) { + io___BufferOutputStream__Create(initial_capacity) +} + +# InputStream ------------------------------------------------------------- + + +Readable <- R6Class("Readable", + inherit = ArrowObject, + public = list( + Read = function(nbytes) io___Readable__Read(self, nbytes) + ) +) + +#' @title InputStream classes +#' @description `RandomAccessFile` inherits from `InputStream` and is a base +#' class for: `ReadableFile` for reading from a file; `MemoryMappedFile` for +#' the same but with memory mapping; and `BufferReader` for reading from a +#' buffer. Use these with the various table readers. +#' @usage NULL +#' @format NULL +#' @docType class +#' @section Factory: +#' +#' The `$create()` factory methods instantiate the `InputStream` object and +#' take the following arguments, depending on the subclass: +#' +#' - `path` For `ReadableFile`, a character file name +#' - `x` For `BufferReader`, a [Buffer] or an object that can be +#' made into a buffer via `buffer()`. +#' +#' To instantiate a `MemoryMappedFile`, call [mmap_open()]. 
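# A short sketch of the factories described above (the file is a throwaway
# temp file; mmap_open() would give a MemoryMappedFile over the same path):
tf <- tempfile()
writeBin(as.raw(1:10), tf)
f <- ReadableFile$create(tf)
first4 <- f$Read(4)                       # returns an Arrow Buffer
f$close()
rb <- BufferReader$create(as.raw(1:10))   # read from an in-memory raw vector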
+#' +#' @section Methods: +#' +#' - `$GetSize()`: +#' - `$supports_zero_copy()`: Logical +#' - `$seek(position)`: go to that position in the stream +#' - `$tell()`: return the position in the stream +#' - `$close()`: close the stream +#' - `$Read(nbytes)`: read data from the stream, either a specified `nbytes` or +#' all, if `nbytes` is not provided +#' - `$ReadAt(position, nbytes)`: similar to `$seek(position)$Read(nbytes)` +#' - `$Resize(size)`: for a `MemoryMappedFile` that is writeable +#' +#' @rdname InputStream +#' @name InputStream +InputStream <- R6Class("InputStream", + inherit = Readable, + public = list( + close = function() io___InputStream__Close(self) + ) +) + +#' @usage NULL +#' @format NULL +#' @rdname InputStream +#' @export +RandomAccessFile <- R6Class("RandomAccessFile", + inherit = InputStream, + public = list( + GetSize = function() io___RandomAccessFile__GetSize(self), + supports_zero_copy = function() io___RandomAccessFile__supports_zero_copy(self), + seek = function(position) io___RandomAccessFile__Seek(self, position), + tell = function() io___RandomAccessFile__Tell(self), + Read = function(nbytes = NULL) { + if (is.null(nbytes)) { + io___RandomAccessFile__Read0(self) + } else { + io___Readable__Read(self, nbytes) + } + }, + ReadAt = function(position, nbytes = NULL) { + if (is.null(nbytes)) { + nbytes <- self$GetSize() - position + } + io___RandomAccessFile__ReadAt(self, position, nbytes) + } + ) +) + +#' @usage NULL +#' @format NULL +#' @rdname InputStream +#' @export +MemoryMappedFile <- R6Class("MemoryMappedFile", + inherit = RandomAccessFile, + public = list( + Resize = function(size) io___MemoryMappedFile__Resize(self, size) + ) +) + +#' @usage NULL +#' @format NULL +#' @rdname InputStream +#' @export +ReadableFile <- R6Class("ReadableFile", inherit = RandomAccessFile) +ReadableFile$create <- function(path) { + io___ReadableFile__Open(clean_path_abs(path)) +} + +#' @usage NULL +#' @format NULL +#' @rdname InputStream +#' @export +BufferReader <- R6Class("BufferReader", inherit = RandomAccessFile) +BufferReader$create <- function(x) { + x <- buffer(x) + io___BufferReader__initialize(x) +} + +#' Create a new read/write memory mapped file of a given size +#' +#' @param path file path +#' @param size size in bytes +#' +#' @return a [arrow::io::MemoryMappedFile][MemoryMappedFile] +#' +#' @export +mmap_create <- function(path, size) { + path <- clean_path_abs(path) + io___MemoryMappedFile__Create(path, size) +} + +#' Open a memory mapped file +#' +#' @param path file path +#' @param mode file mode (read/write/readwrite) +#' +#' @export +mmap_open <- function(path, mode = c("read", "write", "readwrite")) { + mode <- match(match.arg(mode), c("read", "write", "readwrite")) - 1L + path <- clean_path_abs(path) + io___MemoryMappedFile__Open(path, mode) +} + +#' Handle a range of possible input sources +#' @param file A character file name, `raw` vector, or an Arrow input stream +#' @param mmap Logical: whether to memory-map the file (default `TRUE`) +#' @param compression If the file is compressed, created a [CompressedInputStream] +#' with this compression codec, either a [Codec] or the string name of one. +#' If `NULL` (default) and `file` is a string file name, the function will try +#' to infer compression from the file extension. +#' @param filesystem If not `NULL`, `file` will be opened via the +#' `filesystem$OpenInputFile()` filesystem method, rather than the `io` module's +#' `MemoryMappedFile` or `ReadableFile` constructors. 
+#' @return An `InputStream` or a subclass of one. +#' @keywords internal +make_readable_file <- function(file, mmap = TRUE, compression = NULL, filesystem = NULL) { + if (inherits(file, "SubTreeFileSystem")) { + filesystem <- file$base_fs + file <- file$base_path + } + if (is.string(file)) { + if (is_url(file)) { + fs_and_path <- FileSystem$from_uri(file) + filesystem <- fs_and_path$fs + file <- fs_and_path$path + } + if (is.null(compression)) { + # Infer compression from the file path + compression <- detect_compression(file) + } + if (!is.null(filesystem)) { + file <- filesystem$OpenInputFile(file) + } else if (isTRUE(mmap)) { + file <- mmap_open(file) + } else { + file <- ReadableFile$create(file) + } + if (!identical(compression, "uncompressed")) { + file <- CompressedInputStream$create(file, compression) + } + } else if (inherits(file, c("raw", "Buffer"))) { + file <- BufferReader$create(file) + } + assert_is(file, "InputStream") + file +} + +make_output_stream <- function(x, filesystem = NULL) { + if (inherits(x, "SubTreeFileSystem")) { + filesystem <- x$base_fs + x <- x$base_path + } else if (is_url(x)) { + fs_and_path <- FileSystem$from_uri(x) + filesystem <- fs_and_path$fs + x <- fs_and_path$path + } + assert_that(is.string(x)) + if (is.null(filesystem)) { + FileOutputStream$create(x) + } else { + filesystem$OpenOutputStream(x) + } +} + +detect_compression <- function(path) { + assert_that(is.string(path)) + switch(tools::file_ext(path), + bz2 = "bz2", + gz = "gzip", + lz4 = "lz4", + zst = "zstd", + "uncompressed" + ) +} diff --git a/src/arrow/r/R/ipc_stream.R b/src/arrow/r/R/ipc_stream.R new file mode 100644 index 000000000..c45d1de6e --- /dev/null +++ b/src/arrow/r/R/ipc_stream.R @@ -0,0 +1,123 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Write Arrow IPC stream format +#' +#' Apache Arrow defines two formats for [serializing data for interprocess +#' communication +#' (IPC)](https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc): +#' a "stream" format and a "file" format, known as Feather. `write_ipc_stream()` +#' and [write_feather()] write those formats, respectively. +#' +#' `write_arrow()`, a wrapper around `write_ipc_stream()` and `write_feather()` +#' with some nonstandard behavior, is deprecated. You should explicitly choose +#' the function that will write the desired IPC format (stream or file) since +#' either can be written to a file or `OutputStream`. +#' +#' @inheritParams write_feather +#' @param ... extra parameters passed to `write_feather()`. +#' +#' @return `x`, invisibly. +#' @seealso [write_feather()] for writing IPC files. [write_to_raw()] to +#' serialize data to a buffer. +#' [RecordBatchWriter] for a lower-level interface. 
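# A round-trip sketch connecting this writer to its reader: serialize to an
# in-memory buffer with write_to_raw() (documented below), then read it back.
raw_bytes <- write_to_raw(mtcars)     # IPC "stream" format by default
df <- read_ipc_stream(raw_bytes)      # read_ipc_stream() accepts raw vectors
identical(dim(df), dim(mtcars))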
+#' @export +#' @examplesIf arrow_available() +#' tf <- tempfile() +#' on.exit(unlink(tf)) +#' write_ipc_stream(mtcars, tf) +write_ipc_stream <- function(x, sink, ...) { + x_out <- x # So we can return the data we got + if (is.data.frame(x)) { + x <- Table$create(x) + } + assert_that(is_writable_table(x)) + if (!inherits(sink, "OutputStream")) { + sink <- make_output_stream(sink) + on.exit(sink$close()) + } + + writer <- RecordBatchStreamWriter$create(sink, x$schema) + writer$write(x) + writer$close() + invisible(x_out) +} + +#' Write Arrow data to a raw vector +#' +#' [write_ipc_stream()] and [write_feather()] write data to a sink and return +#' the data (`data.frame`, `RecordBatch`, or `Table`) they were given. +#' This function wraps those so that you can serialize data to a buffer and +#' access that buffer as a `raw` vector in R. +#' @inheritParams write_feather +#' @param format one of `c("stream", "file")`, indicating the IPC format to use +#' @return A `raw` vector containing the bytes of the IPC serialized data. +#' @examplesIf arrow_available() +#' # The default format is "stream" +#' mtcars_raw <- write_to_raw(mtcars) +#' @export +write_to_raw <- function(x, format = c("stream", "file")) { + sink <- BufferOutputStream$create() + if (match.arg(format) == "stream") { + write_ipc_stream(x, sink) + } else { + write_feather(x, sink) + } + as.raw(buffer(sink)) +} + +#' Read Arrow IPC stream format +#' +#' Apache Arrow defines two formats for [serializing data for interprocess +#' communication +#' (IPC)](https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc): +#' a "stream" format and a "file" format, known as Feather. `read_ipc_stream()` +#' and [read_feather()] read those formats, respectively. +#' +#' `read_arrow()`, a wrapper around `read_ipc_stream()` and `read_feather()`, +#' is deprecated. You should explicitly choose +#' the function that will read the desired IPC format (stream or file) since +#' a file or `InputStream` may contain either. +#' +#' @param file A character file name or URI, `raw` vector, an Arrow input stream, +#' or a `FileSystem` with path (`SubTreeFileSystem`). +#' If a file name or URI, an Arrow [InputStream] will be opened and +#' closed when finished. If an input stream is provided, it will be left +#' open. +#' @param as_data_frame Should the function return a `data.frame` (default) or +#' an Arrow [Table]? +#' @param ... extra parameters passed to `read_feather()`. +#' +#' @return A `data.frame` if `as_data_frame` is `TRUE` (the default), or an +#' Arrow [Table] otherwise +#' @seealso [read_feather()] for writing IPC files. [RecordBatchReader] for a +#' lower-level interface. +#' @export +read_ipc_stream <- function(file, as_data_frame = TRUE, ...) { + if (!inherits(file, "InputStream")) { + file <- make_readable_file(file) + on.exit(file$close()) + } + + # TODO: this could take col_select, like the other readers + # https://issues.apache.org/jira/browse/ARROW-6830 + out <- RecordBatchStreamReader$create(file)$read_table() + if (as_data_frame) { + out <- as.data.frame(out) + } + out +} diff --git a/src/arrow/r/R/json.R b/src/arrow/r/R/json.R new file mode 100644 index 000000000..0d54c8a8a --- /dev/null +++ b/src/arrow/r/R/json.R @@ -0,0 +1,102 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Read a JSON file +#' +#' Using [JsonTableReader] +#' +#' @inheritParams read_delim_arrow +#' @param schema [Schema] that describes the table. +#' @param ... Additional options passed to `JsonTableReader$create()` +#' +#' @return A `data.frame`, or a Table if `as_data_frame = FALSE`. +#' @export +#' @examplesIf arrow_with_json() +#' tf <- tempfile() +#' on.exit(unlink(tf)) +#' writeLines(' +#' { "hello": 3.5, "world": false, "yo": "thing" } +#' { "hello": 3.25, "world": null } +#' { "hello": 0.0, "world": true, "yo": null } +#' ', tf, useBytes = TRUE) +#' df <- read_json_arrow(tf) +read_json_arrow <- function(file, + col_select = NULL, + as_data_frame = TRUE, + schema = NULL, + ...) { + if (!inherits(file, "InputStream")) { + file <- make_readable_file(file) + on.exit(file$close()) + } + tab <- JsonTableReader$create(file, schema = schema, ...)$Read() + + col_select <- enquo(col_select) + if (!quo_is_null(col_select)) { + tab <- tab[vars_select(names(tab), !!col_select)] + } + + if (isTRUE(as_data_frame)) { + tab <- as.data.frame(tab) + } + tab +} + +#' @include arrow-package.R +#' @rdname CsvTableReader +#' @usage NULL +#' @format NULL +#' @docType class +#' @export +JsonTableReader <- R6Class("JsonTableReader", + inherit = ArrowObject, + public = list( + Read = function() json___TableReader__Read(self) + ) +) +JsonTableReader$create <- function(file, + read_options = JsonReadOptions$create(), + parse_options = JsonParseOptions$create(schema = schema), + schema = NULL, + ...) { + assert_is(file, "InputStream") + json___TableReader__Make(file, read_options, parse_options) +} + +#' @rdname CsvReadOptions +#' @usage NULL +#' @format NULL +#' @docType class +#' @export +JsonReadOptions <- R6Class("JsonReadOptions", inherit = ArrowObject) +JsonReadOptions$create <- function(use_threads = option_use_threads(), block_size = 1048576L) { + json___ReadOptions__initialize(use_threads, block_size) +} + +#' @rdname CsvReadOptions +#' @usage NULL +#' @format NULL +#' @docType class +#' @export +JsonParseOptions <- R6Class("JsonParseOptions", inherit = ArrowObject) +JsonParseOptions$create <- function(newlines_in_values = FALSE, schema = NULL) { + if (is.null(schema)) { + json___ParseOptions__initialize1(newlines_in_values) + } else { + json___ParseOptions__initialize2(newlines_in_values, schema) + } +} diff --git a/src/arrow/r/R/memory-pool.R b/src/arrow/r/R/memory-pool.R new file mode 100644 index 000000000..2207ed6be --- /dev/null +++ b/src/arrow/r/R/memory-pool.R @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include arrow-package.R +#' +#' @title class arrow::MemoryPool +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' - `backend_name`: one of "jemalloc", "mimalloc", or "system". Alternative +#' memory allocators are optionally enabled at build time. Windows builds +#' generally have `mimalloc`, and most others have both `jemalloc` (used by +#' default) and `mimalloc`. To change memory allocators at runtime, set the +#' environment variable `ARROW_DEFAULT_MEMORY_POOL` to one of those strings +#' prior to loading the `arrow` library. +#' - `bytes_allocated` +#' - `max_memory` +#' +#' @rdname MemoryPool +#' @name MemoryPool +#' @keywords internal +MemoryPool <- R6Class("MemoryPool", + inherit = ArrowObject, + public = list( + # TODO: Allocate + # TODO: Reallocate + # TODO: Free + ), + active = list( + backend_name = function() MemoryPool__backend_name(self), + bytes_allocated = function() MemoryPool__bytes_allocated(self), + max_memory = function() MemoryPool__max_memory(self) + ) +) + +#' Arrow's default [MemoryPool] +#' +#' @return the default [MemoryPool] +#' @export +#' @keywords internal +default_memory_pool <- function() { + MemoryPool__default() +} diff --git a/src/arrow/r/R/message.R b/src/arrow/r/R/message.R new file mode 100644 index 000000000..ef33f1623 --- /dev/null +++ b/src/arrow/r/R/message.R @@ -0,0 +1,97 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include arrow-package.R + +#' @title class arrow::Message +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' TODO +#' +#' @rdname Message +#' @name Message +Message <- R6Class("Message", + inherit = ArrowObject, + public = list( + Equals = function(other, ...) 
{ + inherits(other, "Message") && ipc___Message__Equals(self, other) + }, + body_length = function() ipc___Message__body_length(self), + Verify = function() ipc___Message__Verify(self) + ), + active = list( + type = function() ipc___Message__type(self), + metadata = function() ipc___Message__metadata(self), + body = function() ipc___Message__body(self) + ) +) + +#' @title class arrow::MessageReader +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' TODO +#' +#' @rdname MessageReader +#' @name MessageReader +#' @export +MessageReader <- R6Class("MessageReader", + inherit = ArrowObject, + public = list( + ReadNextMessage = function() ipc___MessageReader__ReadNextMessage(self) + ) +) + +MessageReader$create <- function(stream) { + if (!inherits(stream, "InputStream")) { + stream <- BufferReader$create(stream) + } + ipc___MessageReader__Open(stream) +} + +#' Read a Message from a stream +#' +#' @param stream an InputStream +#' +#' @export +read_message <- function(stream) { + UseMethod("read_message") +} + +#' @export +read_message.default <- function(stream) { + read_message(BufferReader$create(stream)) +} + +#' @export +read_message.InputStream <- function(stream) { + ipc___ReadMessage(stream) +} + +#' @export +read_message.MessageReader <- function(stream) { + stream$ReadNextMessage() +} diff --git a/src/arrow/r/R/metadata.R b/src/arrow/r/R/metadata.R new file mode 100644 index 000000000..768abeda7 --- /dev/null +++ b/src/arrow/r/R/metadata.R @@ -0,0 +1,210 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @importFrom utils object.size +.serialize_arrow_r_metadata <- function(x) { + assert_is(x, "list") + + # drop problems attributes (most likely from readr) + x[["attributes"]][["problems"]] <- NULL + + out <- serialize(x, NULL, ascii = TRUE) + + # if the metadata is over 100 kB, compress + if (option_compress_metadata() && object.size(out) > 100000) { + out_comp <- serialize(memCompress(out, type = "gzip"), NULL, ascii = TRUE) + + # but ensure that the compression+serialization is effective. + if (object.size(out) > object.size(out_comp)) out <- out_comp + } + + rawToChar(out) +} + +.unserialize_arrow_r_metadata <- function(x) { + tryCatch( + expr = { + out <- unserialize(charToRaw(x)) + + # if this is still raw, try decompressing + if (is.raw(out)) { + out <- unserialize(memDecompress(out, type = "gzip")) + } + out + }, + error = function(e) { + warning("Invalid metadata$r", call. 
= FALSE) + NULL + } + ) +} + +#' @importFrom rlang trace_back +apply_arrow_r_metadata <- function(x, r_metadata) { + tryCatch( + expr = { + columns_metadata <- r_metadata$columns + if (is.data.frame(x)) { + if (length(names(x)) && !is.null(columns_metadata)) { + for (name in intersect(names(columns_metadata), names(x))) { + x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]]) + } + } + } else if (is.list(x) && !inherits(x, "POSIXlt") && !is.null(columns_metadata)) { + # If we have a list and "columns_metadata" this applies row-level metadata + # inside of a column in a dataframe. + + # However, if we are inside of a dplyr collection (including all datasets), + # we cannot apply this row-level metadata, since the order of the rows is + # not guaranteed to be the same, so don't even try, but warn what's going on + trace <- trace_back() + # TODO: remove `trace$calls %||% trace$call` once rlang > 0.4.11 is released + in_dplyr_collect <- any(map_lgl(trace$calls %||% trace$call, function(x) { + grepl("collect.arrow_dplyr_query", x, fixed = TRUE)[[1]] + })) + if (in_dplyr_collect) { + warning( + "Row-level metadata is not compatible with this operation and has ", + "been ignored", + call. = FALSE + ) + } else { + x <- map2(x, columns_metadata, function(.x, .y) { + apply_arrow_r_metadata(.x, .y) + }) + } + x + } + + if (!is.null(r_metadata$attributes)) { + attributes(x)[names(r_metadata$attributes)] <- r_metadata$attributes + if (inherits(x, "POSIXlt")) { + # We store POSIXlt as a StructArray, which is translated back to R + # as a data.frame, but while data frames have a row.names = c(NA, nrow(x)) + # attribute, POSIXlt does not, so since this is now no longer an object + # of class data.frame, remove the extraneous attribute + attr(x, "row.names") <- NULL + } + if (!is.null(attr(x, ".group_vars")) && requireNamespace("dplyr", quietly = TRUE)) { + x <- dplyr::group_by(x, !!!syms(attr(x, ".group_vars"))) + attr(x, ".group_vars") <- NULL + } + } + }, + error = function(e) { + warning("Invalid metadata$r", call. 
= FALSE) + } + ) + x +} + +remove_attributes <- function(x) { + removed_attributes <- character() + if (identical(class(x), c("tbl_df", "tbl", "data.frame"))) { + removed_attributes <- c("class", "row.names", "names") + } else if (inherits(x, "data.frame")) { + removed_attributes <- c("row.names", "names") + } else if (inherits(x, "factor")) { + removed_attributes <- c("class", "levels") + } else if (inherits(x, c("integer64", "Date", "arrow_binary", "arrow_large_binary"))) { + removed_attributes <- c("class") + } else if (inherits(x, "arrow_fixed_size_binary")) { + removed_attributes <- c("class", "byte_width") + } else if (inherits(x, "POSIXct")) { + removed_attributes <- c("class", "tzone") + } else if (inherits(x, "hms") || inherits(x, "difftime")) { + removed_attributes <- c("class", "units") + } + removed_attributes +} + +arrow_attributes <- function(x, only_top_level = FALSE) { + if (inherits(x, "grouped_df")) { + # Keep only the group var names, not the rest of the cached data that dplyr + # uses, which may be large + if (requireNamespace("dplyr", quietly = TRUE)) { + gv <- dplyr::group_vars(x) + x <- dplyr::ungroup(x) + # ungroup() first, then set attribute, bc ungroup() would erase it + attr(x, ".group_vars") <- gv + } else { + # Regardless, we shouldn't keep groups around + attr(x, "groups") <- NULL + } + } + att <- attributes(x) + + removed_attributes <- remove_attributes(x) + + att <- att[setdiff(names(att), removed_attributes)] + if (isTRUE(only_top_level)) { + return(att) + } + + if (is.data.frame(x)) { + columns <- map(x, arrow_attributes) + out <- if (length(att) || !all(map_lgl(columns, is.null))) { + list(attributes = att, columns = columns) + } + return(out) + } + + columns <- NULL + attempt_to_save_row_level <- getOption("arrow.preserve_row_level_metadata", FALSE) && + is.list(x) && !inherits(x, "POSIXlt") + if (attempt_to_save_row_level) { + # However, if we are inside of a dplyr collection (including all datasets), + # we cannot apply this row-level metadata, since the order of the rows is + # not guaranteed to be the same, so don't even try, but warn what's going on + trace <- trace_back() + # TODO: remove `trace$calls %||% trace$call` once rlang > 0.4.11 is released + in_dataset_write <- any(map_lgl(trace$calls %||% trace$call, function(x) { + grepl("write_dataset", x, fixed = TRUE)[[1]] + })) + if (in_dataset_write) { + warning( + "Row-level metadata is not compatible with datasets and will be discarded", + call. = FALSE + ) + } else { + # for list columns, we also keep attributes of each + # element in columns + columns <- map(x, arrow_attributes) + } + if (all(map_lgl(columns, is.null))) { + columns <- NULL + } + } else if (inherits(x, c("sfc", "sf"))) { + # Check if there are any columns that look like sf columns, warn that we will + # not be saving this data for now (but only if arrow.preserve_row_level_metadata + # is set to FALSE) + warning( + "One of the columns given appears to be an `sfc` SF column. Due to their unique ", + "nature, these columns do not convert to Arrow well. We are working on ", + "better ways to do this, but in the interim we recommend converting any `sfc` ", + "columns to WKB (well-known binary) columns before using them with Arrow ", + "(for example, with `sf::st_as_binary(col)`).", + call. 
= FALSE + ) + } + + if (length(att) || !is.null(columns)) { + list(attributes = att, columns = columns) + } else { + NULL + } +} diff --git a/src/arrow/r/R/parquet.R b/src/arrow/r/R/parquet.R new file mode 100644 index 000000000..ee2ed57de --- /dev/null +++ b/src/arrow/r/R/parquet.R @@ -0,0 +1,585 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Read a Parquet file +#' +#' '[Parquet](https://parquet.apache.org/)' is a columnar storage file format. +#' This function enables you to read Parquet files into R. +#' +#' @inheritParams read_feather +#' @param props [ParquetArrowReaderProperties] +#' @param ... Additional arguments passed to `ParquetFileReader$create()` +#' +#' @return A [arrow::Table][Table], or a `data.frame` if `as_data_frame` is +#' `TRUE` (the default). +#' @examplesIf arrow_with_parquet() +#' tf <- tempfile() +#' on.exit(unlink(tf)) +#' write_parquet(mtcars, tf) +#' df <- read_parquet(tf, col_select = starts_with("d")) +#' head(df) +#' @export +read_parquet <- function(file, + col_select = NULL, + as_data_frame = TRUE, + props = ParquetArrowReaderProperties$create(), + ...) { + if (is.string(file)) { + file <- make_readable_file(file) + on.exit(file$close()) + } + reader <- ParquetFileReader$create(file, props = props, ...) + + col_select <- enquo(col_select) + if (!quo_is_null(col_select)) { + # infer which columns to keep from schema + schema <- reader$GetSchema() + names <- names(schema) + indices <- match(vars_select(names, !!col_select), names) - 1L + tab <- tryCatch( + reader$ReadTable(indices), + error = read_compressed_error + ) + } else { + # read all columns + tab <- tryCatch( + reader$ReadTable(), + error = read_compressed_error + ) + } + + if (as_data_frame) { + tab <- as.data.frame(tab) + } + tab +} + +#' Write Parquet file to disk +#' +#' [Parquet](https://parquet.apache.org/) is a columnar storage file format. +#' This function enables you to write Parquet files from R. +#' +#' Due to features of the format, Parquet files cannot be appended to. +#' If you want to use the Parquet format but also want the ability to extend +#' your dataset, you can write to additional Parquet files and then treat +#' the whole directory of files as a [Dataset] you can query. +#' See `vignette("dataset", package = "arrow")` for examples of this. +#' +#' @param x `data.frame`, [RecordBatch], or [Table] +#' @param sink A string file path, URI, or [OutputStream], or path in a file +#' system (`SubTreeFileSystem`) +#' @param chunk_size chunk size in number of rows. If NULL, the total number of rows is used. +#' @param version parquet version, "1.0" or "2.0". Default "1.0". Numeric values +#' are coerced to character. +#' @param compression compression algorithm. Default "snappy". See details. +#' @param compression_level compression level. 
Meaning depends on compression algorithm +#' @param use_dictionary Specify if we should use dictionary encoding. Default `TRUE` +#' @param write_statistics Specify if we should write statistics. Default `TRUE` +#' @param data_page_size Set a target threshold for the approximate encoded +#' size of data pages within a column chunk (in bytes). Default 1 MiB. +#' @param use_deprecated_int96_timestamps Write timestamps to INT96 Parquet format. Default `FALSE`. +#' @param coerce_timestamps Cast timestamps a particular resolution. Can be +#' `NULL`, "ms" or "us". Default `NULL` (no casting) +#' @param allow_truncated_timestamps Allow loss of data when coercing timestamps to a +#' particular resolution. E.g. if microsecond or nanosecond data is lost when coercing +#' to "ms", do not raise an exception +#' @param properties A `ParquetWriterProperties` object, used instead of the options +#' enumerated in this function's signature. Providing `properties` as an argument +#' is deprecated; if you need to assemble `ParquetWriterProperties` outside +#' of `write_parquet()`, use `ParquetFileWriter` instead. +#' @param arrow_properties A `ParquetArrowWriterProperties` object. Like +#' `properties`, this argument is deprecated. +#' +#' @details The parameters `compression`, `compression_level`, `use_dictionary` and +#' `write_statistics` support various patterns: +#' +#' - The default `NULL` leaves the parameter unspecified, and the C++ library +#' uses an appropriate default for each column (defaults listed above) +#' - A single, unnamed, value (e.g. a single string for `compression`) applies to all columns +#' - An unnamed vector, of the same size as the number of columns, to specify a +#' value for each column, in positional order +#' - A named vector, to specify the value for the named columns, the default +#' value for the setting is used when not supplied +#' +#' The `compression` argument can be any of the following (case insensitive): +#' "uncompressed", "snappy", "gzip", "brotli", "zstd", "lz4", "lzo" or "bz2". +#' Only "uncompressed" is guaranteed to be available, but "snappy" and "gzip" +#' are almost always included. See [codec_is_available()]. +#' The default "snappy" is used if available, otherwise "uncompressed". To +#' disable compression, set `compression = "uncompressed"`. +#' Note that "uncompressed" columns may still have dictionary encoding. +#' +#' @return the input `x` invisibly. 
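# A sketch of the per-column compression patterns described in the details
# above (column names are illustrative; gzip availability is checked first):
df <- data.frame(id = 1:5, txt = letters[1:5])
tf <- tempfile(fileext = ".parquet")
if (codec_is_available("gzip")) {
  # named vector: gzip for `txt`, the default for every other column
  write_parquet(df, tf, compression = c(txt = "gzip"))
  # unnamed vector: one value per column, in positional order
  write_parquet(df, tf, compression = c("uncompressed", "gzip"))
}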
+#' +#' @examplesIf arrow_with_parquet() +#' tf1 <- tempfile(fileext = ".parquet") +#' write_parquet(data.frame(x = 1:5), tf1) +#' +#' # using compression +#' if (codec_is_available("gzip")) { +#' tf2 <- tempfile(fileext = ".gz.parquet") +#' write_parquet(data.frame(x = 1:5), tf2, compression = "gzip", compression_level = 5) +#' } +#' @export +write_parquet <- function(x, + sink, + chunk_size = NULL, + # writer properties + version = NULL, + compression = default_parquet_compression(), + compression_level = NULL, + use_dictionary = NULL, + write_statistics = NULL, + data_page_size = NULL, + # arrow writer properties + use_deprecated_int96_timestamps = FALSE, + coerce_timestamps = NULL, + allow_truncated_timestamps = FALSE, + properties = NULL, + arrow_properties = NULL) { + x_out <- x + + if (is.data.frame(x) || inherits(x, "RecordBatch")) { + x <- Table$create(x) + } + + assert_that(is_writable_table(x)) + + if (!inherits(sink, "OutputStream")) { + sink <- make_output_stream(sink) + on.exit(sink$close()) + } + + # Deprecation warnings + if (!is.null(properties)) { + warning( + "Providing 'properties' is deprecated. If you need to assemble properties outside ", + "this function, use ParquetFileWriter instead." + ) + } + if (!is.null(arrow_properties)) { + warning( + "Providing 'arrow_properties' is deprecated. If you need to assemble arrow_properties ", + "outside this function, use ParquetFileWriter instead." + ) + } + + writer <- ParquetFileWriter$create( + x$schema, + sink, + properties = properties %||% ParquetWriterProperties$create( + x, + version = version, + compression = compression, + compression_level = compression_level, + use_dictionary = use_dictionary, + write_statistics = write_statistics, + data_page_size = data_page_size + ), + arrow_properties = arrow_properties %||% ParquetArrowWriterProperties$create( + use_deprecated_int96_timestamps = use_deprecated_int96_timestamps, + coerce_timestamps = coerce_timestamps, + allow_truncated_timestamps = allow_truncated_timestamps + ) + ) + writer$WriteTable(x, chunk_size = chunk_size %||% x$num_rows) + writer$Close() + + invisible(x_out) +} + +default_parquet_compression <- function() { + # Match the pyarrow default (overriding the C++ default) + if (codec_is_available("snappy")) { + "snappy" + } else { + NULL + } +} + +ParquetArrowWriterProperties <- R6Class("ParquetArrowWriterProperties", inherit = ArrowObject) +ParquetArrowWriterProperties$create <- function(use_deprecated_int96_timestamps = FALSE, + coerce_timestamps = NULL, + allow_truncated_timestamps = FALSE, + ...) { + if (is.null(coerce_timestamps)) { + timestamp_unit <- -1L # null sentinel value + } else { + timestamp_unit <- make_valid_time_unit( + coerce_timestamps, + c("ms" = TimeUnit$MILLI, "us" = TimeUnit$MICRO) + ) + } + parquet___ArrowWriterProperties___create( + use_deprecated_int96_timestamps = isTRUE(use_deprecated_int96_timestamps), + timestamp_unit = timestamp_unit, + allow_truncated_timestamps = isTRUE(allow_truncated_timestamps) + ) +} + +valid_parquet_version <- c( + "1.0" = ParquetVersionType$PARQUET_1_0, + "2.0" = ParquetVersionType$PARQUET_2_0 +) + +make_valid_version <- function(version, valid_versions = valid_parquet_version) { + if (is_integerish(version)) { + version <- as.character(version) + } + tryCatch( + valid_versions[[match.arg(version, choices = names(valid_versions))]], + error = function(cond) { + stop('"version" should be one of ', oxford_paste(names(valid_versions), "or"), call. 
= FALSE) + } + ) +} + +#' @title ParquetWriterProperties class +#' @rdname ParquetWriterProperties +#' @name ParquetWriterProperties +#' @docType class +#' @usage NULL +#' @format NULL +#' @description This class holds settings to control how a Parquet file is read +#' by [ParquetFileWriter]. +#' +#' @section Factory: +#' +#' The `ParquetWriterProperties$create()` factory method instantiates the object +#' and takes the following arguments: +#' +#' - `table`: table to write (required) +#' - `version`: Parquet version, "1.0" or "2.0". Default "1.0" +#' - `compression`: Compression type, algorithm `"uncompressed"` +#' - `compression_level`: Compression level; meaning depends on compression algorithm +#' - `use_dictionary`: Specify if we should use dictionary encoding. Default `TRUE` +#' - `write_statistics`: Specify if we should write statistics. Default `TRUE` +#' - `data_page_size`: Set a target threshold for the approximate encoded +#' size of data pages within a column chunk (in bytes). Default 1 MiB. +#' +#' @details The parameters `compression`, `compression_level`, `use_dictionary` +#' and write_statistics` support various patterns: +#' +#' - The default `NULL` leaves the parameter unspecified, and the C++ library +#' uses an appropriate default for each column (defaults listed above) +#' - A single, unnamed, value (e.g. a single string for `compression`) applies to all columns +#' - An unnamed vector, of the same size as the number of columns, to specify a +#' value for each column, in positional order +#' - A named vector, to specify the value for the named columns, the default +#' value for the setting is used when not supplied +#' +#' Unlike the high-level [write_parquet], `ParquetWriterProperties` arguments +#' use the C++ defaults. Currently this means "uncompressed" rather than +#' "snappy" for the `compression` argument. +#' +#' @seealso [write_parquet] +#' @seealso [Schema] for information about schemas and metadata handling. 
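# A sketch of assembling the properties by hand and handing them to
# ParquetFileWriter, as suggested above. Note that the C++ defaults apply
# here, so columns stay uncompressed unless you ask otherwise.
tab <- Table$create(data.frame(x = 1:5))
sink <- FileOutputStream$create(tempfile(fileext = ".parquet"))
props <- ParquetWriterProperties$create(tab, use_dictionary = TRUE)
writer <- ParquetFileWriter$create(tab$schema, sink, properties = props)
writer$WriteTable(tab, chunk_size = tab$num_rows)
writer$Close()
sink$close()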
+#' +#' @export +ParquetWriterProperties <- R6Class("ParquetWriterProperties", inherit = ArrowObject) +ParquetWriterPropertiesBuilder <- R6Class("ParquetWriterPropertiesBuilder", + inherit = ArrowObject, + public = list( + set_version = function(version) { + parquet___WriterProperties___Builder__version(self, make_valid_version(version)) + }, + set_compression = function(table, compression) { + compression <- compression_from_name(compression) + assert_that(is.integer(compression)) + private$.set( + table, compression, + parquet___ArrowWriterProperties___Builder__set_compressions + ) + }, + set_compression_level = function(table, compression_level) { + # cast to integer but keep names + compression_level <- set_names(as.integer(compression_level), names(compression_level)) + private$.set( + table, compression_level, + parquet___ArrowWriterProperties___Builder__set_compression_levels + ) + }, + set_dictionary = function(table, use_dictionary) { + assert_that(is.logical(use_dictionary)) + private$.set( + table, use_dictionary, + parquet___ArrowWriterProperties___Builder__set_use_dictionary + ) + }, + set_write_statistics = function(table, write_statistics) { + assert_that(is.logical(write_statistics)) + private$.set( + table, write_statistics, + parquet___ArrowWriterProperties___Builder__set_write_statistics + ) + }, + set_data_page_size = function(data_page_size) { + parquet___ArrowWriterProperties___Builder__data_page_size(self, data_page_size) + } + ), + private = list( + .set = function(table, value, FUN) { + msg <- paste0("unsupported ", substitute(value), "= specification") + column_names <- names(table) + given_names <- names(value) + if (is.null(given_names)) { + if (length(value) %in% c(1L, length(column_names))) { + # If there's a single, unnamed value, FUN will set it globally + # If there are values for all columns, send them along with the names + FUN(self, column_names, value) + } else { + abort(msg) + } + } else if (all(given_names %in% column_names)) { + # Use the given names + FUN(self, given_names, value) + } else { + abort(msg) + } + } + ) +) + +ParquetWriterProperties$create <- function(table, + version = NULL, + compression = default_parquet_compression(), + compression_level = NULL, + use_dictionary = NULL, + write_statistics = NULL, + data_page_size = NULL, + ...) { + builder <- parquet___WriterProperties___Builder__create() + if (!is.null(version)) { + builder$set_version(version) + } + if (!is.null(compression)) { + builder$set_compression(table, compression = compression) + } + if (!is.null(compression_level)) { + builder$set_compression_level(table, compression_level = compression_level) + } + if (!is.null(use_dictionary)) { + builder$set_dictionary(table, use_dictionary) + } + if (!is.null(write_statistics)) { + builder$set_write_statistics(table, write_statistics) + } + if (!is.null(data_page_size)) { + builder$set_data_page_size(data_page_size) + } + parquet___WriterProperties___Builder__build(builder) +} + +#' @title ParquetFileWriter class +#' @rdname ParquetFileWriter +#' @name ParquetFileWriter +#' @docType class +#' @usage NULL +#' @format NULL +#' @description This class enables you to interact with Parquet files. 
+#' +#' @section Factory: +#' +#' The `ParquetFileWriter$create()` factory method instantiates the object and +#' takes the following arguments: +#' +#' - `schema` A [Schema] +#' - `sink` An [arrow::io::OutputStream][OutputStream] +#' - `properties` An instance of [ParquetWriterProperties] +#' - `arrow_properties` An instance of `ParquetArrowWriterProperties` +#' +#' @section Methods: +#' +#' - `WriteTable` Write a [Table] to `sink` +#' - `Close` Close the writer. Note: does not close the `sink`. +#' [arrow::io::OutputStream][OutputStream] has its own `close()` method. +#' +#' @export +#' @include arrow-package.R +ParquetFileWriter <- R6Class("ParquetFileWriter", + inherit = ArrowObject, + public = list( + WriteTable = function(table, chunk_size) { + parquet___arrow___FileWriter__WriteTable(self, table, chunk_size) + }, + Close = function() parquet___arrow___FileWriter__Close(self) + ) +) +ParquetFileWriter$create <- function(schema, + sink, + properties = ParquetWriterProperties$create(), + arrow_properties = ParquetArrowWriterProperties$create()) { + assert_is(sink, "OutputStream") + parquet___arrow___ParquetFileWriter__Open(schema, sink, properties, arrow_properties) +} + + +#' @title ParquetFileReader class +#' @rdname ParquetFileReader +#' @name ParquetFileReader +#' @docType class +#' @usage NULL +#' @format NULL +#' @description This class enables you to interact with Parquet files. +#' +#' @section Factory: +#' +#' The `ParquetFileReader$create()` factory method instantiates the object and +#' takes the following arguments: +#' +#' - `file` A character file name, raw vector, or Arrow file connection object +#' (e.g. `RandomAccessFile`). +#' - `props` Optional [ParquetArrowReaderProperties] +#' - `mmap` Logical: whether to memory-map the file (default `TRUE`) +#' - `...` Additional arguments, currently ignored +#' +#' @section Methods: +#' +#' - `$ReadTable(column_indices)`: get an `arrow::Table` from the file. The optional +#' `column_indices=` argument is a 0-based integer vector indicating which columns to retain. +#' - `$ReadRowGroup(i, column_indices)`: get an `arrow::Table` by reading the `i`th row group (0-based). +#' The optional `column_indices=` argument is a 0-based integer vector indicating which columns to retain. +#' - `$ReadRowGroups(row_groups, column_indices)`: get an `arrow::Table` by reading several row +#' groups (0-based integers). +#' The optional `column_indices=` argument is a 0-based integer vector indicating which columns to retain. +#' - `$GetSchema()`: get the `arrow::Schema` of the data in the file +#' - `$ReadColumn(i)`: read the `i`th column (0-based) as a [ChunkedArray]. +#' +#' @section Active bindings: +#' +#' - `$num_rows`: number of rows. +#' - `$num_columns`: number of columns. +#' - `$num_row_groups`: number of row groups. 
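# A short sketch of the row-group and column readers listed above, reusing the
# sample file from the example that follows (its data columns are
# snappy-compressed, hence the codec check):
f <- system.file("v0.7.1.parquet", package = "arrow")
pq <- ParquetFileReader$create(f)
pq$num_row_groups
if (codec_is_available("snappy")) {
  first_group <- pq$ReadRowGroup(0)   # 0-based row group index
  first_col <- pq$ReadColumn(0)       # a ChunkedArray
}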
+#' +#' @export +#' @examplesIf arrow_with_parquet() +#' f <- system.file("v0.7.1.parquet", package = "arrow") +#' pq <- ParquetFileReader$create(f) +#' pq$GetSchema() +#' if (codec_is_available("snappy")) { +#' # This file has compressed data columns +#' tab <- pq$ReadTable() +#' tab$schema +#' } +#' @include arrow-package.R +ParquetFileReader <- R6Class("ParquetFileReader", + inherit = ArrowObject, + active = list( + num_rows = function() { + as.integer(parquet___arrow___FileReader__num_rows(self)) + }, + num_columns = function() { + parquet___arrow___FileReader__num_columns(self) + }, + num_row_groups = function() { + parquet___arrow___FileReader__num_row_groups(self) + } + ), + public = list( + ReadTable = function(column_indices = NULL) { + if (is.null(column_indices)) { + parquet___arrow___FileReader__ReadTable1(self) + } else { + column_indices <- vec_cast(column_indices, integer()) + parquet___arrow___FileReader__ReadTable2(self, column_indices) + } + }, + ReadRowGroup = function(i, column_indices = NULL) { + i <- vec_cast(i, integer()) + if (is.null(column_indices)) { + parquet___arrow___FileReader__ReadRowGroup1(self, i) + } else { + column_indices <- vec_cast(column_indices, integer()) + parquet___arrow___FileReader__ReadRowGroup2(self, i, column_indices) + } + }, + ReadRowGroups = function(row_groups, column_indices = NULL) { + row_groups <- vec_cast(row_groups, integer()) + if (is.null(column_indices)) { + parquet___arrow___FileReader__ReadRowGroups1(self, row_groups) + } else { + column_indices <- vec_cast(column_indices, integer()) + parquet___arrow___FileReader__ReadRowGroups2(self, row_groups, column_indices) + } + }, + ReadColumn = function(i) { + i <- vec_cast(i, integer()) + parquet___arrow___FileReader__ReadColumn(self, i) + }, + GetSchema = function() { + parquet___arrow___FileReader__GetSchema(self) + } + ) +) + +ParquetFileReader$create <- function(file, + props = ParquetArrowReaderProperties$create(), + mmap = TRUE, + ...) { + file <- make_readable_file(file, mmap) + assert_is(props, "ParquetArrowReaderProperties") + + parquet___arrow___FileReader__OpenFile(file, props) +} + +#' @title ParquetArrowReaderProperties class +#' @rdname ParquetArrowReaderProperties +#' @name ParquetArrowReaderProperties +#' @docType class +#' @usage NULL +#' @format NULL +#' @description This class holds settings to control how a Parquet file is read +#' by [ParquetFileReader]. 
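# A sketch of passing reader properties through read_parquet()'s `props`
# argument (single-threaded reading here is only an illustration):
props <- ParquetArrowReaderProperties$create(use_threads = FALSE)
tf <- tempfile(fileext = ".parquet")
write_parquet(data.frame(x = 1:5), tf)
df <- read_parquet(tf, props = props)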
+#' +#' @section Factory: +#' +#' The `ParquetArrowReaderProperties$create()` factory method instantiates the object +#' and takes the following arguments: +#' +#' - `use_threads` Logical: whether to use multithreading (default `TRUE`) +#' +#' @section Methods: +#' +#' - `$read_dictionary(column_index)` +#' - `$set_read_dictionary(column_index, read_dict)` +#' - `$use_threads(use_threads)` +#' +#' @export +ParquetArrowReaderProperties <- R6Class("ParquetArrowReaderProperties", + inherit = ArrowObject, + public = list( + read_dictionary = function(column_index) { + parquet___arrow___ArrowReaderProperties__get_read_dictionary(self, column_index) + }, + set_read_dictionary = function(column_index, read_dict) { + parquet___arrow___ArrowReaderProperties__set_read_dictionary(self, column_index, read_dict) + } + ), + active = list( + use_threads = function(use_threads) { + if (missing(use_threads)) { + parquet___arrow___ArrowReaderProperties__get_use_threads(self) + } else { + parquet___arrow___ArrowReaderProperties__set_use_threads(self, use_threads) + } + } + ) +) + +ParquetArrowReaderProperties$create <- function(use_threads = option_use_threads()) { + parquet___arrow___ArrowReaderProperties__Make(isTRUE(use_threads)) +} diff --git a/src/arrow/r/R/python.R b/src/arrow/r/R/python.R new file mode 100644 index 000000000..07cd4456b --- /dev/null +++ b/src/arrow/r/R/python.R @@ -0,0 +1,225 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +py_to_r.pyarrow.lib.Array <- function(x, ...) { + schema_ptr <- allocate_arrow_schema() + array_ptr <- allocate_arrow_array() + on.exit({ + delete_arrow_schema(schema_ptr) + delete_arrow_array(array_ptr) + }) + + x$`_export_to_c`(array_ptr, schema_ptr) + Array$import_from_c(array_ptr, schema_ptr) +} + +r_to_py.Array <- function(x, convert = FALSE) { + schema_ptr <- allocate_arrow_schema() + array_ptr <- allocate_arrow_array() + on.exit({ + delete_arrow_schema(schema_ptr) + delete_arrow_array(array_ptr) + }) + + # Import with convert = FALSE so that `_import_from_c` returns a Python object + pa <- reticulate::import("pyarrow", convert = FALSE) + x$export_to_c(array_ptr, schema_ptr) + out <- pa$Array$`_import_from_c`(array_ptr, schema_ptr) + # But set the convert attribute on the return object to the requested value + assign("convert", convert, out) + out +} + +py_to_r.pyarrow.lib.RecordBatch <- function(x, ...) 
{ + schema_ptr <- allocate_arrow_schema() + array_ptr <- allocate_arrow_array() + on.exit({ + delete_arrow_schema(schema_ptr) + delete_arrow_array(array_ptr) + }) + + x$`_export_to_c`(array_ptr, schema_ptr) + + RecordBatch$import_from_c(array_ptr, schema_ptr) +} + +r_to_py.RecordBatch <- function(x, convert = FALSE) { + schema_ptr <- allocate_arrow_schema() + array_ptr <- allocate_arrow_array() + on.exit({ + delete_arrow_schema(schema_ptr) + delete_arrow_array(array_ptr) + }) + + # Import with convert = FALSE so that `_import_from_c` returns a Python object + pa <- reticulate::import("pyarrow", convert = FALSE) + x$export_to_c(array_ptr, schema_ptr) + out <- pa$RecordBatch$`_import_from_c`(array_ptr, schema_ptr) + # But set the convert attribute on the return object to the requested value + assign("convert", convert, out) + out +} + +r_to_py.ChunkedArray <- function(x, convert = FALSE) { + # Import with convert = FALSE so that `_import_from_c` returns a Python object + pa <- reticulate::import("pyarrow", convert = FALSE) + out <- pa$chunked_array(x$chunks) + # But set the convert attribute on the return object to the requested value + assign("convert", convert, out) + out +} + +py_to_r.pyarrow.lib.ChunkedArray <- function(x, ...) { + ChunkedArray$create(!!!maybe_py_to_r(x$chunks)) +} + +r_to_py.Table <- function(x, convert = FALSE) { + # Import with convert = FALSE so that `_import_from_c` returns a Python object + pa <- reticulate::import("pyarrow", convert = FALSE) + out <- pa$Table$from_arrays(x$columns, schema = x$schema) + # But set the convert attribute on the return object to the requested value + assign("convert", convert, out) + out +} + +py_to_r.pyarrow.lib.Table <- function(x, ...) { + colnames <- maybe_py_to_r(x$column_names) + r_cols <- maybe_py_to_r(x$columns) + names(r_cols) <- colnames + Table$create(!!!r_cols, schema = maybe_py_to_r(x$schema)) +} + +py_to_r.pyarrow.lib.Schema <- function(x, ...) { + schema_ptr <- allocate_arrow_schema() + on.exit(delete_arrow_schema(schema_ptr)) + + x$`_export_to_c`(schema_ptr) + Schema$import_from_c(schema_ptr) +} + +r_to_py.Schema <- function(x, convert = FALSE) { + schema_ptr <- allocate_arrow_schema() + on.exit(delete_arrow_schema(schema_ptr)) + + # Import with convert = FALSE so that `_import_from_c` returns a Python object + pa <- reticulate::import("pyarrow", convert = FALSE) + x$export_to_c(schema_ptr) + out <- pa$Schema$`_import_from_c`(schema_ptr) + # But set the convert attribute on the return object to the requested value + assign("convert", convert, out) + out +} + +py_to_r.pyarrow.lib.Field <- function(x, ...) { + schema_ptr <- allocate_arrow_schema() + on.exit(delete_arrow_schema(schema_ptr)) + + x$`_export_to_c`(schema_ptr) + Field$import_from_c(schema_ptr) +} + +r_to_py.Field <- function(x, convert = FALSE) { + schema_ptr <- allocate_arrow_schema() + on.exit(delete_arrow_schema(schema_ptr)) + + # Import with convert = FALSE so that `_import_from_c` returns a Python object + pa <- reticulate::import("pyarrow", convert = FALSE) + x$export_to_c(schema_ptr) + out <- pa$Field$`_import_from_c`(schema_ptr) + # But set the convert attribute on the return object to the requested value + assign("convert", convert, out) + out +} + +py_to_r.pyarrow.lib.DataType <- function(x, ...) 
{ + schema_ptr <- allocate_arrow_schema() + on.exit(delete_arrow_schema(schema_ptr)) + + x$`_export_to_c`(schema_ptr) + DataType$import_from_c(schema_ptr) +} + +r_to_py.DataType <- function(x, convert = FALSE) { + schema_ptr <- allocate_arrow_schema() + on.exit(delete_arrow_schema(schema_ptr)) + + # Import with convert = FALSE so that `_import_from_c` returns a Python object + pa <- reticulate::import("pyarrow", convert = FALSE) + x$export_to_c(schema_ptr) + out <- pa$DataType$`_import_from_c`(schema_ptr) + # But set the convert attribute on the return object to the requested value + assign("convert", convert, out) + out +} + +py_to_r.pyarrow.lib.RecordBatchReader <- function(x, ...) { + stream_ptr <- allocate_arrow_array_stream() + on.exit(delete_arrow_array_stream(stream_ptr)) + + x$`_export_to_c`(stream_ptr) + RecordBatchReader$import_from_c(stream_ptr) +} + +r_to_py.RecordBatchReader <- function(x, convert = FALSE) { + stream_ptr <- allocate_arrow_array_stream() + on.exit(delete_arrow_array_stream(stream_ptr)) + + # Import with convert = FALSE so that `_import_from_c` returns a Python object + pa <- reticulate::import("pyarrow", convert = FALSE) + x$export_to_c(stream_ptr) + # TODO: handle subclasses of RecordBatchReader? + out <- pa$lib$RecordBatchReader$`_import_from_c`(stream_ptr) + # But set the convert attribute on the return object to the requested value + assign("convert", convert, out) + out +} + + +maybe_py_to_r <- function(x) { + if (inherits(x, "python.builtin.object")) { + # Depending on some auto-convert behavior, x may already be converted + # or it may still be a Python object + x <- reticulate::py_to_r(x) + } + x +} + +#' Install pyarrow for use with reticulate +#' +#' `pyarrow` is the Python package for Apache Arrow. This function helps with +#' installing it for use with `reticulate`. +#' +#' @param envname The name or full path of the Python environment to install +#' into. This can be a virtualenv or conda environment created by `reticulate`. +#' See `reticulate::py_install()`. +#' @param nightly logical: Should we install a development version of the +#' package? Default is to use the official release version. +#' @param ... additional arguments passed to `reticulate::py_install()`. +#' @export +install_pyarrow <- function(envname = NULL, nightly = FALSE, ...) { + if (nightly) { + reticulate::py_install("pyarrow", + envname = envname, ..., + # Nightly for pip + pip_options = "--extra-index-url https://repo.fury.io/arrow-nightlies/ --pre --upgrade", + # Nightly for conda + channel = "arrow-nightlies" + ) + } else { + reticulate::py_install("pyarrow", envname = envname, ...) + } +} diff --git a/src/arrow/r/R/query-engine.R b/src/arrow/r/R/query-engine.R new file mode 100644 index 000000000..234aaf569 --- /dev/null +++ b/src/arrow/r/R/query-engine.R @@ -0,0 +1,298 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +do_exec_plan <- function(.data) { + plan <- ExecPlan$create() + final_node <- plan$Build(.data) + tab <- plan$Run(final_node) + + # TODO (ARROW-14289): make the head/tail methods return RBR not Table + if (inherits(tab, "RecordBatchReader")) { + tab <- tab$read_table() + } + + # If arrange() created $temp_columns, make sure to omit them from the result + # We can't currently handle this in the ExecPlan itself because sorting + # happens in the end (SinkNode) so nothing comes after it. + if (length(final_node$sort$temp_columns) > 0) { + tab <- tab[, setdiff(names(tab), final_node$sort$temp_columns), drop = FALSE] + } + + if (ncol(tab)) { + # Apply any column metadata from the original schema, where appropriate + original_schema <- source_data(.data)$schema + # TODO: do we care about other (non-R) metadata preservation? + # How would we know if it were meaningful? + r_meta <- original_schema$r_metadata + if (!is.null(r_meta)) { + # Filter r_metadata$columns on columns with name _and_ type match + new_schema <- tab$schema + common_names <- intersect(names(r_meta$columns), names(tab)) + keep <- common_names[ + map_lgl(common_names, ~ original_schema[[.]] == new_schema[[.]]) + ] + r_meta$columns <- r_meta$columns[keep] + if (has_aggregation(.data)) { + # dplyr drops top-level attributes if you do summarize + r_meta$attributes <- NULL + } + tab$r_metadata <- r_meta + } + } + + tab +} + +ExecPlan <- R6Class("ExecPlan", + inherit = ArrowObject, + public = list( + Scan = function(dataset) { + # Handle arrow_dplyr_query + if (inherits(dataset, "arrow_dplyr_query")) { + if (inherits(dataset$.data, "RecordBatchReader")) { + return(ExecNode_ReadFromRecordBatchReader(self, dataset$.data)) + } + + filter <- dataset$filtered_rows + if (isTRUE(filter)) { + filter <- Expression$scalar(TRUE) + } + # Use FieldsInExpression to find all from dataset$selected_columns + colnames <- unique(unlist(map( + dataset$selected_columns, + field_names_in_expression + ))) + dataset <- dataset$.data + assert_is(dataset, "Dataset") + } else { + if (inherits(dataset, "ArrowTabular")) { + dataset <- InMemoryDataset$create(dataset) + } + assert_is(dataset, "Dataset") + # Set some defaults + filter <- Expression$scalar(TRUE) + colnames <- names(dataset) + } + # ScanNode needs the filter to do predicate pushdown and skip partitions, + # and it needs to know which fields to materialize (and which are unnecessary) + ExecNode_Scan(self, dataset, filter, colnames %||% character(0)) + }, + Build = function(.data) { + # This method takes an arrow_dplyr_query and chains together the + # ExecNodes that they produce. It does not evaluate them--that is Run(). + group_vars <- dplyr::group_vars(.data) + grouped <- length(group_vars) > 0 + + # Collect the target names first because we have to add back the group vars + target_names <- names(.data) + .data <- ensure_group_vars(.data) + .data <- ensure_arrange_vars(.data) # this sets .data$temp_columns + + if (inherits(.data$.data, "arrow_dplyr_query")) { + # We have a nested query. Recurse. 
+ node <- self$Build(.data$.data) + } else { + node <- self$Scan(.data) + } + + # ARROW-13498: Even though Scan takes the filter, apparently we have to do it again + if (inherits(.data$filtered_rows, "Expression")) { + node <- node$Filter(.data$filtered_rows) + } + + if (!is.null(.data$aggregations)) { + # Project to include just the data required for each aggregation, + # plus group_by_vars (last) + # TODO: validate that none of names(aggregations) are the same as names(group_by_vars) + # dplyr does not error on this but the result it gives isn't great + node <- node$Project(summarize_projection(.data)) + + if (grouped) { + # We need to prefix all of the aggregation function names with "hash_" + .data$aggregations <- lapply(.data$aggregations, function(x) { + x[["fun"]] <- paste0("hash_", x[["fun"]]) + x + }) + } + + node <- node$Aggregate( + options = map(.data$aggregations, ~ .[c("fun", "options")]), + target_names = names(.data$aggregations), + out_field_names = names(.data$aggregations), + key_names = group_vars + ) + + if (grouped) { + # The result will have result columns first then the grouping cols. + # dplyr orders group cols first, so adapt the result to meet that expectation. + node <- node$Project( + make_field_refs(c(group_vars, names(.data$aggregations))) + ) + if (getOption("arrow.summarise.sort", FALSE)) { + # Add sorting instructions for the rows too to match dplyr + # (see below about why sorting isn't itself a Node) + node$sort <- list( + names = group_vars, + orders = rep(0L, length(group_vars)) + ) + } + } + } else { + # If any columns are derived, reordered, or renamed we need to Project + # If there are aggregations, the projection was already handled above + # We have to project at least once to eliminate some junk columns + # that the ExecPlan adds: + # __fragment_index, __batch_index, __last_in_fragment + # Presumably extraneous repeated projection of the same thing + # (as when we've done collapse() and not projected after) is cheap/no-op + projection <- c(.data$selected_columns, .data$temp_columns) + node <- node$Project(projection) + + if (!is.null(.data$join)) { + node <- node$Join( + type = .data$join$type, + right_node = self$Build(.data$join$right_data), + by = .data$join$by, + left_output = names(.data), + right_output = setdiff(names(.data$join$right_data), .data$join$by) + ) + } + } + + # Apply sorting: this is currently not an ExecNode itself, it is a + # sink node option. + # TODO: handle some cases: + # (1) arrange > summarize > arrange + # (2) ARROW-13779: arrange then operation where order matters (e.g. 
cumsum) + if (length(.data$arrange_vars)) { + node$sort <- list( + names = names(.data$arrange_vars), + orders = .data$arrange_desc, + temp_columns = names(.data$temp_columns) + ) + } + + # This is only safe because we are going to evaluate queries that end + # with head/tail first, then evaluate any subsequent query as a new query + if (!is.null(.data$head)) { + node$head <- .data$head + } + if (!is.null(.data$tail)) { + node$tail <- .data$tail + } + + node + }, + Run = function(node) { + assert_is(node, "ExecNode") + + # Sorting and head/tail (if sorted) are handled in the SinkNode, + # created in ExecPlan_run + sorting <- node$sort %||% list() + select_k <- node$head %||% -1L + has_sorting <- length(sorting) > 0 + if (has_sorting) { + if (!is.null(node$tail)) { + # Reverse the sort order and take the top K, then after we'll reverse + # the resulting rows so that it is ordered as expected + sorting$orders <- !sorting$orders + select_k <- node$tail + } + sorting$orders <- as.integer(sorting$orders) + } + + out <- ExecPlan_run(self, node, sorting, select_k) + + if (!has_sorting) { + # Since ExecPlans don't scan in deterministic order, head/tail are both + # essentially taking a random slice from somewhere in the dataset. + # And since the head() implementation is way more efficient than tail(), + # just use it to take the random slice + slice_size <- node$head %||% node$tail + if (!is.null(slice_size)) { + # TODO (ARROW-14289): make the head methods return RBR not Table + out <- head(out, slice_size) + } + # Can we now tell `self$Stop()` to StopProducing? We already have + # everything we need for the head (but it seems to segfault: ARROW-14329) + } else if (!is.null(node$tail)) { + # Reverse the row order to get back what we expect + # TODO: don't return Table, return RecordBatchReader + out <- out$read_table() + out <- out[rev(seq_len(nrow(out))), , drop = FALSE] + } + + out + }, + Stop = function() ExecPlan_StopProducing(self) + ) +) +ExecPlan$create <- function(use_threads = option_use_threads()) { + ExecPlan_create(use_threads) +} + +ExecNode <- R6Class("ExecNode", + inherit = ArrowObject, + public = list( + # `sort` is a slight hack to be able to keep around arrange() params, + # which don't currently yield their own ExecNode but rather are consumed + # in the SinkNode (in ExecPlan$run()) + sort = NULL, + # Similar hacks for head and tail + head = NULL, + tail = NULL, + preserve_sort = function(new_node) { + new_node$sort <- self$sort + new_node$head <- self$head + new_node$tail <- self$tail + new_node + }, + Project = function(cols) { + if (length(cols)) { + assert_is_list_of(cols, "Expression") + self$preserve_sort(ExecNode_Project(self, cols, names(cols))) + } else { + self$preserve_sort(ExecNode_Project(self, character(0), character(0))) + } + }, + Filter = function(expr) { + assert_is(expr, "Expression") + self$preserve_sort(ExecNode_Filter(self, expr)) + }, + Aggregate = function(options, target_names, out_field_names, key_names) { + self$preserve_sort( + ExecNode_Aggregate(self, options, target_names, out_field_names, key_names) + ) + }, + Join = function(type, right_node, by, left_output, right_output) { + self$preserve_sort( + ExecNode_Join( + self, + type, + right_node, + left_keys = names(by), + right_keys = by, + left_output = left_output, + right_output = right_output + ) + ) + } + ), + active = list( + schema = function() ExecNode_output_schema(self) + ) +) diff --git a/src/arrow/r/R/record-batch-reader.R b/src/arrow/r/R/record-batch-reader.R new file mode 100644 index 
000000000..1542e3649 --- /dev/null +++ b/src/arrow/r/R/record-batch-reader.R @@ -0,0 +1,164 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +#' @title RecordBatchReader classes +#' @description Apache Arrow defines two formats for [serializing data for interprocess +#' communication +#' (IPC)](https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc): +#' a "stream" format and a "file" format, known as Feather. +#' `RecordBatchStreamReader` and `RecordBatchFileReader` are +#' interfaces for accessing record batches from input sources in those formats, +#' respectively. +#' +#' For guidance on how to use these classes, see the examples section. +#' +#' @seealso [read_ipc_stream()] and [read_feather()] provide a much simpler interface +#' for reading data from these formats and are sufficient for many use cases. +#' @usage NULL +#' @format NULL +#' @docType class +#' @section Factory: +#' +#' The `RecordBatchFileReader$create()` and `RecordBatchStreamReader$create()` +#' factory methods instantiate the object and +#' take a single argument, named according to the class: +#' +#' - `file` A character file name, raw vector, or Arrow file connection object +#' (e.g. [RandomAccessFile]). +#' - `stream` A raw vector, [Buffer], or [InputStream]. +#' +#' @section Methods: +#' +#' - `$read_next_batch()`: Returns a `RecordBatch`, iterating through the +#' Reader. If there are no further batches in the Reader, it returns `NULL`. +#' - `$schema`: Returns a [Schema] (active binding) +#' - `$batches()`: Returns a list of `RecordBatch`es +#' - `$read_table()`: Collects the reader's `RecordBatch`es into a [Table] +#' - `$get_batch(i)`: For `RecordBatchFileReader`, return a particular batch +#' by an integer index. +#' - `$num_record_batches()`: For `RecordBatchFileReader`, see how many batches +#' are in the file. +#' +#' @rdname RecordBatchReader +#' @name RecordBatchReader +#' @include arrow-package.R +#' @examplesIf arrow_available() +#' tf <- tempfile() +#' on.exit(unlink(tf)) +#' +#' batch <- record_batch(chickwts) +#' +#' # This opens a connection to the file in Arrow +#' file_obj <- FileOutputStream$create(tf) +#' # Pass that to a RecordBatchWriter to write data conforming to a schema +#' writer <- RecordBatchFileWriter$create(file_obj, batch$schema) +#' writer$write(batch) +#' # You may write additional batches to the stream, provided that they have +#' # the same schema. +#' # Call "close" on the writer to indicate end-of-file/stream +#' writer$close() +#' # Then, close the connection--closing the IPC message does not close the file +#' file_obj$close() +#' +#' # Now, we have a file we can read from. 
Same pattern: open file connection, +#' # then pass it to a RecordBatchReader +#' read_file_obj <- ReadableFile$create(tf) +#' reader <- RecordBatchFileReader$create(read_file_obj) +#' # RecordBatchFileReader knows how many batches it has (StreamReader does not) +#' reader$num_record_batches +#' # We could consume the Reader by calling $read_next_batch() until all are, +#' # consumed, or we can call $read_table() to pull them all into a Table +#' tab <- reader$read_table() +#' # Call as.data.frame to turn that Table into an R data.frame +#' df <- as.data.frame(tab) +#' # This should be the same data we sent +#' all.equal(df, chickwts, check.attributes = FALSE) +#' # Unlike the Writers, we don't have to close RecordBatchReaders, +#' # but we do still need to close the file connection +#' read_file_obj$close() +RecordBatchReader <- R6Class("RecordBatchReader", + inherit = ArrowObject, + public = list( + read_next_batch = function() RecordBatchReader__ReadNext(self), + batches = function() RecordBatchReader__batches(self), + read_table = function() Table__from_RecordBatchReader(self), + export_to_c = function(stream_ptr) ExportRecordBatchReader(self, stream_ptr) + ), + active = list( + schema = function() RecordBatchReader__schema(self) + ) +) + +#' @export +head.RecordBatchReader <- function(x, n = 6L, ...) { + head(Scanner$create(x), n) +} + +#' @export +tail.RecordBatchReader <- function(x, n = 6L, ...) { + tail(Scanner$create(x), n) +} + +#' @rdname RecordBatchReader +#' @usage NULL +#' @format NULL +#' @export +RecordBatchStreamReader <- R6Class("RecordBatchStreamReader", inherit = RecordBatchReader) +RecordBatchStreamReader$create <- function(stream) { + if (inherits(stream, c("raw", "Buffer"))) { + # TODO: deprecate this because it doesn't close the connection to the Buffer + # (that's a problem, right?) + stream <- BufferReader$create(stream) + } + assert_is(stream, "InputStream") + ipc___RecordBatchStreamReader__Open(stream) +} +#' @include arrowExports.R +RecordBatchReader$import_from_c <- RecordBatchStreamReader$import_from_c <- ImportRecordBatchReader + +#' @rdname RecordBatchReader +#' @usage NULL +#' @format NULL +#' @export +RecordBatchFileReader <- R6Class("RecordBatchFileReader", + inherit = ArrowObject, + # Why doesn't this inherit from RecordBatchReader in C++? + # Origin: https://github.com/apache/arrow/pull/679 + public = list( + get_batch = function(i) { + ipc___RecordBatchFileReader__ReadRecordBatch(self, i) + }, + batches = function() { + ipc___RecordBatchFileReader__batches(self) + }, + read_table = function() Table__from_RecordBatchFileReader(self) + ), + active = list( + num_record_batches = function() ipc___RecordBatchFileReader__num_record_batches(self), + schema = function() ipc___RecordBatchFileReader__schema(self) + ) +) +RecordBatchFileReader$create <- function(file) { + if (inherits(file, c("raw", "Buffer"))) { + # TODO: deprecate this because it doesn't close the connection to the Buffer + # (that's a problem, right?) + file <- BufferReader$create(file) + } + assert_is(file, "InputStream") + ipc___RecordBatchFileReader__Open(file) +} diff --git a/src/arrow/r/R/record-batch-writer.R b/src/arrow/r/R/record-batch-writer.R new file mode 100644 index 000000000..8675e785a --- /dev/null +++ b/src/arrow/r/R/record-batch-writer.R @@ -0,0 +1,194 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +#' @title RecordBatchWriter classes +#' @description Apache Arrow defines two formats for [serializing data for interprocess +#' communication +#' (IPC)](https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc): +#' a "stream" format and a "file" format, known as Feather. +#' `RecordBatchStreamWriter` and `RecordBatchFileWriter` are +#' interfaces for writing record batches to those formats, respectively. +#' +#' For guidance on how to use these classes, see the examples section. +#' +#' @seealso [write_ipc_stream()] and [write_feather()] provide a much simpler +#' interface for writing data to these formats and are sufficient for many use +#' cases. [write_to_raw()] is a version that serializes data to a buffer. +#' @usage NULL +#' @format NULL +#' @docType class +#' @section Factory: +#' +#' The `RecordBatchFileWriter$create()` and `RecordBatchStreamWriter$create()` +#' factory methods instantiate the object and take the following arguments: +#' +#' - `sink` An `OutputStream` +#' - `schema` A [Schema] for the data to be written +#' - `use_legacy_format` logical: write data formatted so that Arrow libraries +#' versions 0.14 and lower can read it. Default is `FALSE`. You can also +#' enable this by setting the environment variable `ARROW_PRE_0_15_IPC_FORMAT=1`. +#' - `metadata_version`: A string like "V5" or the equivalent integer indicating +#' the Arrow IPC MetadataVersion. Default (NULL) will use the latest version, +#' unless the environment variable `ARROW_PRE_1_0_METADATA_VERSION=1`, in +#' which case it will be V4. +#' +#' @section Methods: +#' +#' - `$write(x)`: Write a [RecordBatch], [Table], or `data.frame`, dispatching +#' to the methods below appropriately +#' - `$write_batch(batch)`: Write a `RecordBatch` to stream +#' - `$write_table(table)`: Write a `Table` to stream +#' - `$close()`: close stream. Note that this indicates end-of-file or +#' end-of-stream--it does not close the connection to the `sink`. That needs +#' to be closed separately. +#' +#' @rdname RecordBatchWriter +#' @name RecordBatchWriter +#' @include arrow-package.R +#' @examplesIf arrow_available() +#' tf <- tempfile() +#' on.exit(unlink(tf)) +#' +#' batch <- record_batch(chickwts) +#' +#' # This opens a connection to the file in Arrow +#' file_obj <- FileOutputStream$create(tf) +#' # Pass that to a RecordBatchWriter to write data conforming to a schema +#' writer <- RecordBatchFileWriter$create(file_obj, batch$schema) +#' writer$write(batch) +#' # You may write additional batches to the stream, provided that they have +#' # the same schema. +#' # Call "close" on the writer to indicate end-of-file/stream +#' writer$close() +#' # Then, close the connection--closing the IPC message does not close the file +#' file_obj$close() +#' +#' # Now, we have a file we can read from. 
Same pattern: open file connection, +#' # then pass it to a RecordBatchReader +#' read_file_obj <- ReadableFile$create(tf) +#' reader <- RecordBatchFileReader$create(read_file_obj) +#' # RecordBatchFileReader knows how many batches it has (StreamReader does not) +#' reader$num_record_batches +#' # We could consume the Reader by calling $read_next_batch() until all are, +#' # consumed, or we can call $read_table() to pull them all into a Table +#' tab <- reader$read_table() +#' # Call as.data.frame to turn that Table into an R data.frame +#' df <- as.data.frame(tab) +#' # This should be the same data we sent +#' all.equal(df, chickwts, check.attributes = FALSE) +#' # Unlike the Writers, we don't have to close RecordBatchReaders, +#' # but we do still need to close the file connection +#' read_file_obj$close() +RecordBatchWriter <- R6Class("RecordBatchWriter", + inherit = ArrowObject, + public = list( + write_batch = function(batch) ipc___RecordBatchWriter__WriteRecordBatch(self, batch), + write_table = function(table) ipc___RecordBatchWriter__WriteTable(self, table), + write = function(x) { + if (inherits(x, "RecordBatch")) { + self$write_batch(x) + } else if (inherits(x, "Table")) { + self$write_table(x) + } else { + self$write_table(Table$create(x)) + } + }, + close = function() ipc___RecordBatchWriter__Close(self) + ) +) + +#' @usage NULL +#' @format NULL +#' @rdname RecordBatchWriter +#' @export +RecordBatchStreamWriter <- R6Class("RecordBatchStreamWriter", inherit = RecordBatchWriter) +RecordBatchStreamWriter$create <- function(sink, + schema, + use_legacy_format = NULL, + metadata_version = NULL) { + if (is.string(sink)) { + stop( + "RecordBatchStreamWriter$create() requires an Arrow InputStream. ", + "Try providing FileOutputStream$create(", substitute(sink), ")", + call. = FALSE + ) + } + assert_is(sink, "OutputStream") + assert_is(schema, "Schema") + + ipc___RecordBatchStreamWriter__Open( + sink, + schema, + get_ipc_use_legacy_format(use_legacy_format), + get_ipc_metadata_version(metadata_version) + ) +} + +#' @usage NULL +#' @format NULL +#' @rdname RecordBatchWriter +#' @export +RecordBatchFileWriter <- R6Class("RecordBatchFileWriter", inherit = RecordBatchStreamWriter) +RecordBatchFileWriter$create <- function(sink, + schema, + use_legacy_format = NULL, + metadata_version = NULL) { + if (is.string(sink)) { + stop( + "RecordBatchFileWriter$create() requires an Arrow InputStream. ", + "Try providing FileOutputStream$create(", substitute(sink), ")", + call. = FALSE + ) + } + assert_is(sink, "OutputStream") + assert_is(schema, "Schema") + + ipc___RecordBatchFileWriter__Open( + sink, + schema, + get_ipc_use_legacy_format(use_legacy_format), + get_ipc_metadata_version(metadata_version) + ) +} + +get_ipc_metadata_version <- function(x) { + input <- x + if (is_integerish(x)) { + # 4 means "V4", which actually happens to be 3L + x <- paste0("V", x) + } else if (is.null(x)) { + if (identical(Sys.getenv("ARROW_PRE_1_0_METADATA_VERSION"), "1") || + identical(Sys.getenv("ARROW_PRE_0_15_IPC_FORMAT"), "1")) { + # PRE_1_0 is specific for this; + # if you already set PRE_0_15, PRE_1_0 should be implied + x <- "V4" + } else { + # Take the latest + x <- length(MetadataVersion) + } + } + out <- MetadataVersion[[x]] + if (is.null(out)) { + stop(deparse(input), " is not a valid IPC MetadataVersion", call. 
= FALSE) + } + out +} + +get_ipc_use_legacy_format <- function(x) { + isTRUE(x %||% identical(Sys.getenv("ARROW_PRE_0_15_IPC_FORMAT"), "1")) +} diff --git a/src/arrow/r/R/record-batch.R b/src/arrow/r/R/record-batch.R new file mode 100644 index 000000000..c66ff7fb0 --- /dev/null +++ b/src/arrow/r/R/record-batch.R @@ -0,0 +1,193 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include arrow-package.R +#' @include array.R +#' @title RecordBatch class +#' @description A record batch is a collection of equal-length arrays matching +#' a particular [Schema]. It is a table-like data structure that is semantically +#' a sequence of [fields][Field], each a contiguous Arrow [Array]. +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section S3 Methods and Usage: +#' Record batches are data-frame-like, and many methods you expect to work on +#' a `data.frame` are implemented for `RecordBatch`. This includes `[`, `[[`, +#' `$`, `names`, `dim`, `nrow`, `ncol`, `head`, and `tail`. You can also pull +#' the data from an Arrow record batch into R with `as.data.frame()`. See the +#' examples. +#' +#' A caveat about the `$` method: because `RecordBatch` is an `R6` object, +#' `$` is also used to access the object's methods (see below). Methods take +#' precedence over the table's columns. So, `batch$Slice` would return the +#' "Slice" method function even if there were a column in the table called +#' "Slice". +#' +#' @section R6 Methods: +#' In addition to the more R-friendly S3 methods, a `RecordBatch` object has +#' the following R6 methods that map onto the underlying C++ methods: +#' +#' - `$Equals(other)`: Returns `TRUE` if the `other` record batch is equal +#' - `$column(i)`: Extract an `Array` by integer position from the batch +#' - `$column_name(i)`: Get a column's name by integer position +#' - `$names()`: Get all column names (called by `names(batch)`) +#' - `$RenameColumns(value)`: Set all column names (called by `names(batch) <- value`) +#' - `$GetColumnByName(name)`: Extract an `Array` by string name +#' - `$RemoveColumn(i)`: Drops a column from the batch by integer position +#' - `$SelectColumns(indices)`: Return a new record batch with a selection of columns, expressed as 0-based integers. +#' - `$Slice(offset, length = NULL)`: Create a zero-copy view starting at the +#' indicated integer offset and going for the given length, or to the end +#' of the table if `NULL`, the default. +#' - `$Take(i)`: return an `RecordBatch` with rows at positions given by +#' integers (R vector or Array Array) `i`. +#' - `$Filter(i, keep_na = TRUE)`: return an `RecordBatch` with rows at positions where logical +#' vector (or Arrow boolean Array) `i` is `TRUE`. 
+#' - `$SortIndices(names, descending = FALSE)`: return an `Array` of integer row +#' positions that can be used to rearrange the `RecordBatch` in ascending or +#' descending order by the first named column, breaking ties with further named +#' columns. `descending` can be a logical vector of length one or of the same +#' length as `names`. +#' - `$serialize()`: Returns a raw vector suitable for interprocess communication +#' - `$cast(target_schema, safe = TRUE, options = cast_options(safe))`: Alter +#' the schema of the record batch. +#' +#' There are also some active bindings +#' - `$num_columns` +#' - `$num_rows` +#' - `$schema` +#' - `$metadata`: Returns the key-value metadata of the `Schema` as a named list. +#' Modify or replace by assigning in (`batch$metadata <- new_metadata`). +#' All list elements are coerced to string. See `schema()` for more information. +#' - `$columns`: Returns a list of `Array`s +#' @rdname RecordBatch +#' @name RecordBatch +#' @export +RecordBatch <- R6Class("RecordBatch", + inherit = ArrowTabular, + public = list( + column = function(i) RecordBatch__column(self, i), + column_name = function(i) RecordBatch__column_name(self, i), + names = function() RecordBatch__names(self), + RenameColumns = function(value) RecordBatch__RenameColumns(self, value), + Equals = function(other, check_metadata = FALSE, ...) { + inherits(other, "RecordBatch") && RecordBatch__Equals(self, other, isTRUE(check_metadata)) + }, + GetColumnByName = function(name) { + assert_that(is.string(name)) + RecordBatch__GetColumnByName(self, name) + }, + SelectColumns = function(indices) RecordBatch__SelectColumns(self, indices), + AddColumn = function(i, new_field, value) { + RecordBatch__AddColumn(self, i, new_field, value) + }, + SetColumn = function(i, new_field, value) { + RecordBatch__SetColumn(self, i, new_field, value) + }, + RemoveColumn = function(i) RecordBatch__RemoveColumn(self, i), + ReplaceSchemaMetadata = function(new) { + RecordBatch__ReplaceSchemaMetadata(self, new) + }, + Slice = function(offset, length = NULL) { + if (is.null(length)) { + RecordBatch__Slice1(self, offset) + } else { + RecordBatch__Slice2(self, offset, length) + } + }, + # Take, Filter, and SortIndices are methods on ArrowTabular + serialize = function() ipc___SerializeRecordBatch__Raw(self), + to_data_frame = function() { + RecordBatch__to_dataframe(self, use_threads = option_use_threads()) + }, + cast = function(target_schema, safe = TRUE, ..., options = cast_options(safe, ...)) { + assert_is(target_schema, "Schema") + assert_that(identical(self$schema$names, target_schema$names), msg = "incompatible schemas") + RecordBatch__cast(self, target_schema, options) + }, + invalidate = function() { + .Call(`_arrow_RecordBatch__Reset`, self) + super$invalidate() + }, + export_to_c = function(array_ptr, schema_ptr) { + ExportRecordBatch(self, array_ptr, schema_ptr) + } + ), + active = list( + num_columns = function() RecordBatch__num_columns(self), + num_rows = function() RecordBatch__num_rows(self), + schema = function() RecordBatch__schema(self), + columns = function() RecordBatch__columns(self) + ) +) + +RecordBatch$create <- function(..., schema = NULL) { + arrays <- list2(...) 
+ if (length(arrays) == 1 && inherits(arrays[[1]], c("raw", "Buffer", "InputStream", "Message"))) { + return(RecordBatch$from_message(arrays[[1]], schema)) + } + + # Else, a list of arrays or data.frames + # making sure there are always names + if (is.null(names(arrays))) { + names(arrays) <- rep_len("", length(arrays)) + } + stopifnot(length(arrays) > 0) + + # If any arrays are length 1, recycle them + arrays <- recycle_scalars(arrays) + + # TODO: should this also assert that they're all Arrays? + RecordBatch__from_arrays(schema, arrays) +} + +RecordBatch$from_message <- function(obj, schema) { + # Message/Buffer readers, previously in read_record_batch() + assert_is(schema, "Schema") + if (inherits(obj, c("raw", "Buffer"))) { + obj <- BufferReader$create(obj) + on.exit(obj$close()) + } + if (inherits(obj, "InputStream")) { + ipc___ReadRecordBatch__InputStream__Schema(obj, schema) + } else { + ipc___ReadRecordBatch__Message__Schema(obj, schema) + } +} +#' @include arrowExports.R +RecordBatch$import_from_c <- ImportRecordBatch + +#' @param ... A `data.frame` or a named set of Arrays or vectors. If given a +#' mixture of data.frames and vectors, the inputs will be autospliced together +#' (see examples). Alternatively, you can provide a single Arrow IPC +#' `InputStream`, `Message`, `Buffer`, or R `raw` object containing a `Buffer`. +#' @param schema a [Schema], or `NULL` (the default) to infer the schema from +#' the data in `...`. When providing an Arrow IPC buffer, `schema` is required. +#' @rdname RecordBatch +#' @examplesIf arrow_available() +#' batch <- record_batch(name = rownames(mtcars), mtcars) +#' dim(batch) +#' dim(head(batch)) +#' names(batch) +#' batch$mpg +#' batch[["cyl"]] +#' as.data.frame(batch[4:8, c("gear", "hp", "wt")]) +#' @export +record_batch <- RecordBatch$create + +#' @export +names.RecordBatch <- function(x) x$names() diff --git a/src/arrow/r/R/reexports-bit64.R b/src/arrow/r/R/reexports-bit64.R new file mode 100644 index 000000000..c89d2b150 --- /dev/null +++ b/src/arrow/r/R/reexports-bit64.R @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @importFrom bit64 print.integer64 +bit64::print.integer64 + +#' @importFrom bit64 str.integer64 +bit64::str.integer64 diff --git a/src/arrow/r/R/reexports-tidyselect.R b/src/arrow/r/R/reexports-tidyselect.R new file mode 100644 index 000000000..cd0de2849 --- /dev/null +++ b/src/arrow/r/R/reexports-tidyselect.R @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Alias required for help links in downstream packages +#' @aliases select_helpers +#' @importFrom tidyselect contains +#' @export +tidyselect::contains +#' @importFrom tidyselect ends_with +#' @export +tidyselect::ends_with +#' @importFrom tidyselect everything +#' @export +tidyselect::everything +#' @importFrom tidyselect matches +#' @export +tidyselect::matches +#' @importFrom tidyselect num_range +#' @export +tidyselect::num_range +#' @importFrom tidyselect one_of +#' @export +tidyselect::one_of +#' @importFrom tidyselect starts_with +#' @export +tidyselect::starts_with +#' @importFrom tidyselect last_col +#' @export +tidyselect::last_col +#' @importFrom tidyselect all_of +#' @export +tidyselect::all_of diff --git a/src/arrow/r/R/scalar.R b/src/arrow/r/R/scalar.R new file mode 100644 index 000000000..4dedc6c12 --- /dev/null +++ b/src/arrow/r/R/scalar.R @@ -0,0 +1,101 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include arrow-datum.R + +#' @title Arrow scalars +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @description A `Scalar` holds a single value of an Arrow type. 
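A minimal sketch of working with a `Scalar` via the methods listed in this class's documentation, assuming the arrow package is attached; the value and length are illustrative only:

```
s <- Scalar$create(42L)
s$type                   # the Arrow type of the scalar (int32 here)
s$as_vector()            # convert back to an R value
s$as_array(length = 3)   # an Arrow Array of length 3 built from this scalar
```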
+#' +#' @section Methods: +#' `$ToString()`: convert to a string +#' `$as_vector()`: convert to an R vector +#' `$as_array()`: convert to an Arrow `Array` +#' `$Equals(other)`: is this Scalar equal to `other` +#' `$ApproxEquals(other)`: is this Scalar approximately equal to `other` +#' `$is_valid`: is this Scalar valid +#' `$null_count`: number of invalid values - 1 or 0 +#' `$type`: Scalar type +#' +#' @name Scalar +#' @rdname Scalar +#' @examplesIf arrow_available() +#' Scalar$create(pi) +#' Scalar$create(404) +#' # If you pass a vector into Scalar$create, you get a list containing your items +#' Scalar$create(c(1, 2, 3)) +#' +#' # Comparisons +#' my_scalar <- Scalar$create(99) +#' my_scalar$ApproxEquals(Scalar$create(99.00001)) # FALSE +#' my_scalar$ApproxEquals(Scalar$create(99.000009)) # TRUE +#' my_scalar$Equals(Scalar$create(99.000009)) # FALSE +#' my_scalar$Equals(Scalar$create(99L)) # FALSE (types don't match) +#' +#' my_scalar$ToString() +#' @export +Scalar <- R6Class("Scalar", + inherit = ArrowDatum, + # TODO: document the methods + public = list( + ToString = function() Scalar__ToString(self), + type_id = function() Scalar__type(self)$id, + as_vector = function() Scalar__as_vector(self), + as_array = function(length = 1L) MakeArrayFromScalar(self, as.integer(length)), + Equals = function(other, ...) { + inherits(other, "Scalar") && Scalar__Equals(self, other) + }, + ApproxEquals = function(other, ...) { + inherits(other, "Scalar") && Scalar__ApproxEquals(self, other) + } + ), + active = list( + is_valid = function() Scalar__is_valid(self), + null_count = function() sum(!self$is_valid), + type = function() Scalar__type(self) + ) +) +Scalar$create <- function(x, type = NULL) { + if (is.null(x)) { + x <- vctrs::unspecified(1) + } else if (length(x) != 1 && !is.data.frame(x)) { + # Wrap in a list type + x <- list(x) + } + Array__GetScalar(Array$create(x, type = type), 0) +} + +#' @rdname array +#' @usage NULL +#' @format NULL +#' @export +StructScalar <- R6Class("StructScalar", + inherit = Scalar, + public = list( + field = function(i) StructScalar__field(self, i), + GetFieldByName = function(name) StructScalar__GetFieldByName(self, name) + ) +) + +#' @export +length.Scalar <- function(x) 1L + +#' @export +sort.Scalar <- function(x, decreasing = FALSE, ...) x diff --git a/src/arrow/r/R/schema.R b/src/arrow/r/R/schema.R new file mode 100644 index 000000000..c3dfee5f9 --- /dev/null +++ b/src/arrow/r/R/schema.R @@ -0,0 +1,330 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include arrow-package.R +#' @title Schema class +#' +#' @description A `Schema` is a list of [Field]s, which map names to +#' Arrow [data types][data-type]. 
Create a `Schema` when you +#' want to convert an R `data.frame` to Arrow but don't want to rely on the +#' default mapping of R types to Arrow types, such as when you want to choose a +#' specific numeric precision, or when creating a [Dataset] and you want to +#' ensure a specific schema rather than inferring it from the various files. +#' +#' Many Arrow objects, including [Table] and [Dataset], have a `$schema` method +#' (active binding) that lets you access their schema. +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' @section Methods: +#' +#' - `$ToString()`: convert to a string +#' - `$field(i)`: returns the field at index `i` (0-based) +#' - `$GetFieldByName(x)`: returns the field with name `x` +#' - `$WithMetadata(metadata)`: returns a new `Schema` with the key-value +#' `metadata` set. Note that all list elements in `metadata` will be coerced +#' to `character`. +#' +#' @section Active bindings: +#' +#' - `$names`: returns the field names (called in `names(Schema)`) +#' - `$num_fields`: returns the number of fields (called in `length(Schema)`) +#' - `$fields`: returns the list of `Field`s in the `Schema`, suitable for +#' iterating over +#' - `$HasMetadata`: logical: does this `Schema` have extra metadata? +#' - `$metadata`: returns the key-value metadata as a named list. +#' Modify or replace by assigning in (`sch$metadata <- new_metadata`). +#' All list elements are coerced to string. +#' +#' @section R Metadata: +#' +#' When converting a data.frame to an Arrow Table or RecordBatch, attributes +#' from the `data.frame` are saved alongside tables so that the object can be +#' reconstructed faithfully in R (e.g. with `as.data.frame()`). This metadata +#' can be both at the top-level of the `data.frame` (e.g. `attributes(df)`) or +#' at the column (e.g. `attributes(df$col_a)`) or for list columns only: +#' element level (e.g. `attributes(df[1, "col_a"])`). For example, this allows +#' for storing `haven` columns in a table and being able to faithfully +#' re-create them when pulled back into R. This metadata is separate from the +#' schema (column names and types) which is compatible with other Arrow +#' clients. The R metadata is only read by R and is ignored by other clients +#' (e.g. Pandas has its own custom metadata). This metadata is stored in +#' `$metadata$r`. +#' +#' Since Schema metadata keys and values must be strings, this metadata is +#' saved by serializing R's attribute list structure to a string. If the +#' serialized metadata exceeds 100Kb in size, by default it is compressed +#' starting in version 3.0.0. To disable this compression (e.g. for tables +#' that are compatible with Arrow versions before 3.0.0 and include large +#' amounts of metadata), set the option `arrow.compress_metadata` to `FALSE`. +#' Files with compressed metadata are readable by older versions of arrow, but +#' the metadata is dropped. 
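A rough sketch of the R metadata round trip described above, assuming the arrow package is attached; the attribute name `my_note` is purely illustrative:

```
df <- data.frame(x = 1:3)
attr(df, "my_note") <- "an attribute to preserve"
tab <- arrow_table(df)
tab$schema$metadata$r   # serialized R attributes (see also the $r_metadata binding)
as.data.frame(tab)      # attributes are restored when converting back to R
# options(arrow.compress_metadata = FALSE) disables compression of large metadata
```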
+#' +#' @rdname Schema +#' @name Schema +#' @examplesIf arrow_available() +#' df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5)) +#' tab1 <- arrow_table(df) +#' tab1$schema +#' tab2 <- arrow_table(df, schema = schema(col1 = int8(), col2 = float32())) +#' tab2$schema +#' @export +Schema <- R6Class("Schema", + inherit = ArrowObject, + public = list( + ToString = function() { + fields <- print_schema_fields(self) + if (self$HasMetadata) { + fields <- paste0(fields, "\n\nSee $metadata for additional Schema metadata") + } + fields + }, + field = function(i) Schema__field(self, i), + GetFieldByName = function(x) Schema__GetFieldByName(self, x), + AddField = function(i, field) { + assert_is(field, "Field") + Schema__AddField(self, i, field) + }, + SetField = function(i, field) { + assert_is(field, "Field") + Schema__SetField(self, i, field) + }, + RemoveField = function(i) Schema__RemoveField(self, i), + serialize = function() Schema__serialize(self), + WithMetadata = function(metadata = NULL) { + metadata <- prepare_key_value_metadata(metadata) + Schema__WithMetadata(self, metadata) + }, + Equals = function(other, check_metadata = FALSE, ...) { + inherits(other, "Schema") && Schema__Equals(self, other, isTRUE(check_metadata)) + }, + export_to_c = function(ptr) ExportSchema(self, ptr) + ), + active = list( + names = function() { + Schema__field_names(self) + }, + num_fields = function() Schema__num_fields(self), + fields = function() Schema__fields(self), + HasMetadata = function() Schema__HasMetadata(self), + metadata = function(new_metadata) { + if (missing(new_metadata)) { + Schema__metadata(self) + } else { + # Set the metadata + out <- self$WithMetadata(new_metadata) + # $WithMetadata returns a new object but we're modifying in place, + # so swap in that new C++ object pointer into our R6 object + self$set_pointer(out$pointer()) + self + } + }, + r_metadata = function(new) { + # Helper for the R metadata that handles the serialization + # See also method on ArrowTabular + if (missing(new)) { + out <- self$metadata$r + if (!is.null(out)) { + # Can't unserialize NULL + out <- .unserialize_arrow_r_metadata(out) + } + # Returns either NULL or a named list + out + } else { + # Set the R metadata + self$metadata$r <- .serialize_arrow_r_metadata(new) + self + } + } + ) +) +Schema$create <- function(...) { + .list <- list2(...) + if (all(map_lgl(.list, ~ inherits(., "Field")))) { + schema_(.list) + } else { + schema_(.fields(.list)) + } +} +#' @include arrowExports.R +Schema$import_from_c <- ImportSchema + +prepare_key_value_metadata <- function(metadata) { + # key-value-metadata must be a named character vector; + # this function validates and coerces + if (is.null(metadata)) { + # NULL to remove metadata, so equivalent to setting an empty list + metadata <- empty_named_list() + } + if (is.null(names(metadata))) { + stop( + "Key-value metadata must be a named list or character vector", + call. = FALSE + ) + } + map_chr(metadata, as.character) +} + +print_schema_fields <- function(s) { + # Alternative to Schema__ToString that doesn't print metadata + paste(map_chr(s$fields, ~ .$ToString()), collapse = "\n") +} + +#' @param ... named list containing [data types][data-type] or +#' a list of [fields][field] containing the fields for the schema +#' @export +#' @rdname Schema +schema <- Schema$create + +#' @export +names.Schema <- function(x) x$names + +#' @export +length.Schema <- function(x) x$num_fields + +#' @export +`[[.Schema` <- function(x, i, ...) 
{ + if (is.character(i)) { + x$GetFieldByName(i) + } else if (is.numeric(i)) { + x$field(i - 1) + } else { + stop("'i' must be character or numeric, not ", class(i), call. = FALSE) + } +} + +#' @export +`[[<-.Schema` <- function(x, i, value) { + assert_that(length(i) == 1) + if (is.character(i)) { + field_names <- names(x) + if (anyDuplicated(field_names)) { + stop("Cannot update field by name with duplicates", call. = FALSE) + } + + # If i is character, it's the field name + if (!is.null(value) && !inherits(value, "Field")) { + value <- field(i, as_type(value, "value")) + } + + # No match means we're adding to the end + i <- match(i, field_names, nomatch = length(field_names) + 1L) + } else { + assert_that(is.numeric(i), !is.na(i), i > 0) + # If i is numeric and we have a type, + # we need to grab the existing field name for the new one + if (!is.null(value) && !inherits(value, "Field")) { + value <- field(names(x)[i], as_type(value, "value")) + } + } + + i <- as.integer(i - 1L) + if (i >= length(x)) { + if (!is.null(value)) { + x <- x$AddField(i, value) + } + } else if (is.null(value)) { + x <- x$RemoveField(i) + } else { + x <- x$SetField(i, value) + } + x +} + +#' @export +`$<-.Schema` <- `$<-.ArrowTabular` + +#' @export +`[.Schema` <- function(x, i, ...) { + if (is.logical(i)) { + i <- rep_len(i, length(x)) # For R recycling behavior + i <- which(i) + } + if (is.numeric(i)) { + if (all(i < 0)) { + # in R, negative i means "everything but i" + i <- setdiff(seq_len(length(x)), -1 * i) + } + } + fields <- map(i, ~ x[[.]]) + invalid <- map_lgl(fields, is.null) + if (any(invalid)) { + stop( + "Invalid field name", ifelse(sum(invalid) > 1, "s: ", ": "), + oxford_paste(i[invalid]), + call. = FALSE + ) + } + schema_(fields) +} + +#' @export +`$.Schema` <- function(x, name, ...) { + assert_that(is.string(name)) + if (name %in% ls(x)) { + get(name, x) + } else { + x$GetFieldByName(name) + } +} + +#' @export +as.list.Schema <- function(x, ...) x$fields + +#' read a Schema from a stream +#' +#' @param stream a `Message`, `InputStream`, or `Buffer` +#' @param ... currently ignored +#' @return A [Schema] +#' @export +read_schema <- function(stream, ...) { + if (inherits(stream, "Message")) { + return(ipc___ReadSchema_Message(stream)) + } else { + if (!inherits(stream, "InputStream")) { + stream <- BufferReader$create(stream) + on.exit(stream$close()) + } + return(ipc___ReadSchema_InputStream(stream)) + } +} + +#' Combine and harmonize schemas +#' +#' @param ... [Schema]s to unify +#' @param schemas Alternatively, a list of schemas +#' @return A `Schema` with the union of fields contained in the inputs, or +#' `NULL` if any of `schemas` is `NULL` +#' @export +#' @examplesIf arrow_available() +#' a <- schema(b = double(), c = bool()) +#' z <- schema(b = double(), k = utf8()) +#' unify_schemas(a, z) +unify_schemas <- function(..., schemas = list(...)) { + if (any(vapply(schemas, is.null, TRUE))) { + return(NULL) + } + arrow__UnifySchemas(schemas) +} + +#' @export +print.arrow_r_metadata <- function(x, ...) { + utils::str(x) + utils::str(.unserialize_arrow_r_metadata(x)) + invisible(x) +} diff --git a/src/arrow/r/R/table.R b/src/arrow/r/R/table.R new file mode 100644 index 000000000..5ae87f7e3 --- /dev/null +++ b/src/arrow/r/R/table.R @@ -0,0 +1,170 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include record-batch.R +#' @title Table class +#' @description A Table is a sequence of [chunked arrays][ChunkedArray]. They +#' have a similar interface to [record batches][RecordBatch], but they can be +#' composed from multiple record batches or chunked arrays. +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section S3 Methods and Usage: +#' Tables are data-frame-like, and many methods you expect to work on +#' a `data.frame` are implemented for `Table`. This includes `[`, `[[`, +#' `$`, `names`, `dim`, `nrow`, `ncol`, `head`, and `tail`. You can also pull +#' the data from an Arrow table into R with `as.data.frame()`. See the +#' examples. +#' +#' A caveat about the `$` method: because `Table` is an `R6` object, +#' `$` is also used to access the object's methods (see below). Methods take +#' precedence over the table's columns. So, `tab$Slice` would return the +#' "Slice" method function even if there were a column in the table called +#' "Slice". +#' +#' @section R6 Methods: +#' In addition to the more R-friendly S3 methods, a `Table` object has +#' the following R6 methods that map onto the underlying C++ methods: +#' +#' - `$column(i)`: Extract a `ChunkedArray` by integer position from the table +#' - `$ColumnNames()`: Get all column names (called by `names(tab)`) +#' - `$RenameColumns(value)`: Set all column names (called by `names(tab) <- value`) +#' - `$GetColumnByName(name)`: Extract a `ChunkedArray` by string name +#' - `$field(i)`: Extract a `Field` from the table schema by integer position +#' - `$SelectColumns(indices)`: Return new `Table` with specified columns, expressed as 0-based integers. +#' - `$Slice(offset, length = NULL)`: Create a zero-copy view starting at the +#' indicated integer offset and going for the given length, or to the end +#' of the table if `NULL`, the default. +#' - `$Take(i)`: return an `Table` with rows at positions given by +#' integers `i`. If `i` is an Arrow `Array` or `ChunkedArray`, it will be +#' coerced to an R vector before taking. +#' - `$Filter(i, keep_na = TRUE)`: return an `Table` with rows at positions where logical +#' vector or Arrow boolean-type `(Chunked)Array` `i` is `TRUE`. +#' - `$SortIndices(names, descending = FALSE)`: return an `Array` of integer row +#' positions that can be used to rearrange the `Table` in ascending or descending +#' order by the first named column, breaking ties with further named columns. +#' `descending` can be a logical vector of length one or of the same length as +#' `names`. +#' - `$serialize(output_stream, ...)`: Write the table to the given +#' [OutputStream] +#' - `$cast(target_schema, safe = TRUE, options = cast_options(safe))`: Alter +#' the schema of the record batch. +#' +#' There are also some active bindings: +#' - `$num_columns` +#' - `$num_rows` +#' - `$schema` +#' - `$metadata`: Returns the key-value metadata of the `Schema` as a named list. 
+#' Modify or replace by assigning in (`tab$metadata <- new_metadata`). +#' All list elements are coerced to string. See `schema()` for more information. +#' - `$columns`: Returns a list of `ChunkedArray`s +#' @rdname Table +#' @name Table +#' @export +Table <- R6Class("Table", + inherit = ArrowTabular, + public = list( + column = function(i) Table__column(self, i), + ColumnNames = function() Table__ColumnNames(self), + RenameColumns = function(value) Table__RenameColumns(self, value), + GetColumnByName = function(name) { + assert_is(name, "character") + assert_that(length(name) == 1) + Table__GetColumnByName(self, name) + }, + RemoveColumn = function(i) Table__RemoveColumn(self, i), + AddColumn = function(i, new_field, value) Table__AddColumn(self, i, new_field, value), + SetColumn = function(i, new_field, value) Table__SetColumn(self, i, new_field, value), + ReplaceSchemaMetadata = function(new) { + Table__ReplaceSchemaMetadata(self, new) + }, + field = function(i) Table__field(self, i), + serialize = function(output_stream, ...) write_table(self, output_stream, ...), + to_data_frame = function() { + Table__to_dataframe(self, use_threads = option_use_threads()) + }, + cast = function(target_schema, safe = TRUE, ..., options = cast_options(safe, ...)) { + assert_is(target_schema, "Schema") + assert_that(identical(self$schema$names, target_schema$names), msg = "incompatible schemas") + Table__cast(self, target_schema, options) + }, + SelectColumns = function(indices) Table__SelectColumns(self, indices), + Slice = function(offset, length = NULL) { + if (is.null(length)) { + Table__Slice1(self, offset) + } else { + Table__Slice2(self, offset, length) + } + }, + # Take, Filter, and SortIndices are methods on ArrowTabular + Equals = function(other, check_metadata = FALSE, ...) { + inherits(other, "Table") && Table__Equals(self, other, isTRUE(check_metadata)) + }, + Validate = function() Table__Validate(self), + ValidateFull = function() Table__ValidateFull(self), + invalidate = function() { + .Call(`_arrow_Table__Reset`, self) + super$invalidate() + } + ), + active = list( + num_columns = function() Table__num_columns(self), + num_rows = function() Table__num_rows(self), + schema = function() Table__schema(self), + columns = function() Table__columns(self) + ) +) + +Table$create <- function(..., schema = NULL) { + dots <- list2(...) + # making sure there are always names + if (is.null(names(dots))) { + names(dots) <- rep_len("", length(dots)) + } + stopifnot(length(dots) > 0) + + if (all_record_batches(dots)) { + return(Table__from_record_batches(dots, schema)) + } + + # If any arrays are length 1, recycle them + dots <- recycle_scalars(dots) + + Table__from_dots(dots, schema, option_use_threads()) +} + +#' @export +names.Table <- function(x) x$ColumnNames() + +#' @param ... A `data.frame` or a named set of Arrays or vectors. If given a +#' mixture of data.frames and named vectors, the inputs will be autospliced together +#' (see examples). Alternatively, you can provide a single Arrow IPC +#' `InputStream`, `Message`, `Buffer`, or R `raw` object containing a `Buffer`. +#' @param schema a [Schema], or `NULL` (the default) to infer the schema from +#' the data in `...`. When providing an Arrow IPC buffer, `schema` is required. 
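Because a Table can be composed from multiple record batches (see the class description above), a minimal sketch with illustrative column names, assuming both batches share the same schema:

```
b1 <- record_batch(x = 1:3, y = c("a", "b", "c"))
b2 <- record_batch(x = 4:6, y = c("d", "e", "f"))
tab <- arrow_table(b1, b2)   # columns are exposed as ChunkedArrays
dim(tab)                     # 6 rows, 2 columns
```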
+#' @rdname Table +#' @examplesIf arrow_available() +#' tbl <- arrow_table(name = rownames(mtcars), mtcars) +#' dim(tbl) +#' dim(head(tbl)) +#' names(tbl) +#' tbl$mpg +#' tbl[["cyl"]] +#' as.data.frame(tbl[4:8, c("gear", "hp", "wt")]) +#' @export +arrow_table <- Table$create diff --git a/src/arrow/r/R/type.R b/src/arrow/r/R/type.R new file mode 100644 index 000000000..4ef7cefb5 --- /dev/null +++ b/src/arrow/r/R/type.R @@ -0,0 +1,541 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include arrow-package.R +#' @title class arrow::DataType +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' TODO +#' +#' @rdname DataType +#' @name DataType +DataType <- R6Class("DataType", + inherit = ArrowObject, + public = list( + ToString = function() { + DataType__ToString(self) + }, + Equals = function(other, ...) { + inherits(other, "DataType") && DataType__Equals(self, other) + }, + fields = function() { + DataType__fields(self) + }, + export_to_c = function(ptr) ExportType(self, ptr) + ), + active = list( + id = function() DataType__id(self), + name = function() DataType__name(self), + num_fields = function() DataType__num_fields(self) + ) +) + +#' @include arrowExports.R +DataType$import_from_c <- ImportType + +INTEGER_TYPES <- as.character(outer(c("uint", "int"), c(8, 16, 32, 64), paste0)) +FLOAT_TYPES <- c("float16", "float32", "float64", "halffloat", "float", "double") + +#' infer the arrow Array type from an R vector +#' +#' @param x an R vector +#' +#' @return an arrow logical type +#' @examplesIf arrow_available() +#' type(1:10) +#' type(1L:10L) +#' type(c(1, 1.5, 2)) +#' type(c("A", "B", "C")) +#' type(mtcars) +#' type(Sys.Date()) +#' @export +type <- function(x) UseMethod("type") + +#' @export +type.default <- function(x) Array__infer_type(x) + +#' @export +type.ArrowDatum <- function(x) x$type + +#----- metadata + +#' @title class arrow::FixedWidthType +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' TODO +#' +#' @rdname FixedWidthType +#' @name FixedWidthType +FixedWidthType <- R6Class("FixedWidthType", + inherit = DataType, + active = list( + bit_width = function() FixedWidthType__bit_width(self) + ) +) + +Int8 <- R6Class("Int8", inherit = FixedWidthType) +Int16 <- R6Class("Int16", inherit = FixedWidthType) +Int32 <- R6Class("Int32", inherit = FixedWidthType) +Int64 <- R6Class("Int64", inherit = FixedWidthType) +UInt8 <- R6Class("UInt8", inherit = FixedWidthType) +UInt16 <- R6Class("UInt16", inherit = FixedWidthType) +UInt32 <- R6Class("UInt32", inherit = FixedWidthType) +UInt64 <- R6Class("UInt64", inherit = FixedWidthType) +Float16 <- R6Class("Float16", inherit = FixedWidthType) +Float32 <- R6Class("Float32", inherit = FixedWidthType) +Float64 <- R6Class("Float64", 
inherit = FixedWidthType) +Boolean <- R6Class("Boolean", inherit = FixedWidthType) +Utf8 <- R6Class("Utf8", inherit = DataType) +LargeUtf8 <- R6Class("LargeUtf8", inherit = DataType) +Binary <- R6Class("Binary", inherit = DataType) +FixedSizeBinary <- R6Class("FixedSizeBinary", inherit = FixedWidthType) +LargeBinary <- R6Class("LargeBinary", inherit = DataType) + +DateType <- R6Class("DateType", + inherit = FixedWidthType, + public = list( + unit = function() DateType__unit(self) + ) +) +Date32 <- R6Class("Date32", inherit = DateType) +Date64 <- R6Class("Date64", inherit = DateType) + +TimeType <- R6Class("TimeType", + inherit = FixedWidthType, + public = list( + unit = function() TimeType__unit(self) + ) +) +Time32 <- R6Class("Time32", inherit = TimeType) +Time64 <- R6Class("Time64", inherit = TimeType) + +Null <- R6Class("Null", inherit = DataType) + +Timestamp <- R6Class("Timestamp", + inherit = FixedWidthType, + public = list( + timezone = function() TimestampType__timezone(self), + unit = function() TimestampType__unit(self) + ) +) + +DecimalType <- R6Class("DecimalType", + inherit = FixedWidthType, + public = list( + precision = function() DecimalType__precision(self), + scale = function() DecimalType__scale(self) + ) +) +Decimal128Type <- R6Class("Decimal128Type", inherit = DecimalType) + +NestedType <- R6Class("NestedType", inherit = DataType) + +#' Apache Arrow data types +#' +#' These functions create type objects corresponding to Arrow types. Use them +#' when defining a [schema()] or as inputs to other types, like `struct`. Most +#' of these functions don't take arguments, but a few do. +#' +#' A few functions have aliases: +#' +#' * `utf8()` and `string()` +#' * `float16()` and `halffloat()` +#' * `float32()` and `float()` +#' * `bool()` and `boolean()` +#' * When called inside an `arrow` function, such as `schema()` or `cast()`, +#' `double()` also is supported as a way of creating a `float64()` +#' +#' `date32()` creates a datetime type with a "day" unit, like the R `Date` +#' class. `date64()` has a "ms" unit. +#' +#' `uint32` (32 bit unsigned integer), `uint64` (64 bit unsigned integer), and +#' `int64` (64-bit signed integer) types may contain values that exceed the +#' range of R's `integer` type (32-bit signed integer). When these arrow objects +#' are translated to R objects, `uint32` and `uint64` are converted to `double` +#' ("numeric") and `int64` is converted to `bit64::integer64`. For `int64` +#' types, this conversion can be disabled (so that `int64` always yields a +#' `bit64::integer64` object) by setting `options(arrow.int64_downcast = +#' FALSE)`. +#' +#' @param unit For time/timestamp types, the time unit. `time32()` can take +#' either "s" or "ms", while `time64()` can be "us" or "ns". `timestamp()` can +#' take any of those four values. +#' @param timezone For `timestamp()`, an optional time zone string. +#' @param byte_width byte width for `FixedSizeBinary` type. +#' @param list_size list size for `FixedSizeList` type. +#' @param precision For `decimal()`, precision +#' @param scale For `decimal()`, scale +#' @param type For `list_of()`, a data type to make a list-of-type +#' @param ... For `struct()`, a named list of types to define the struct columns +#' +#' @name data-type +#' @return An Arrow type object inheriting from DataType. +#' @export +#' @seealso [dictionary()] for creating a dictionary (factor-like) type. 
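Before the individual constructors below, a short sketch of how these type objects are typically used. This is illustrative only; it assumes `schema()` (referenced above) and relies on constructors defined later in this file.

```
library(arrow)

# Aliases construct identical type objects
float()$Equals(float32())    # TRUE
string()$Equals(utf8())      # TRUE
bool()$Equals(boolean())     # TRUE

# Parameterized types take their arguments explicitly
timestamp("us", timezone = "UTC")
time32("s")
decimal(precision = 10, scale = 2)

# Types are usually combined into a schema for a Table or RecordBatch
schema(
  id      = int64(),
  amount  = decimal(10, 2),
  created = timestamp("ms", timezone = "UTC")
)
```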
+#' @examplesIf arrow_available() +#' bool() +#' struct(a = int32(), b = double()) +#' timestamp("ms", timezone = "CEST") +#' time64("ns") +int8 <- function() Int8__initialize() + +#' @rdname data-type +#' @export +int16 <- function() Int16__initialize() + +#' @rdname data-type +#' @export +int32 <- function() Int32__initialize() + +#' @rdname data-type +#' @export +int64 <- function() Int64__initialize() + +#' @rdname data-type +#' @export +uint8 <- function() UInt8__initialize() + +#' @rdname data-type +#' @export +uint16 <- function() UInt16__initialize() + +#' @rdname data-type +#' @export +uint32 <- function() UInt32__initialize() + +#' @rdname data-type +#' @export +uint64 <- function() UInt64__initialize() + +#' @rdname data-type +#' @export +float16 <- function() Float16__initialize() + +#' @rdname data-type +#' @export +halffloat <- float16 + +#' @rdname data-type +#' @export +float32 <- function() Float32__initialize() + +#' @rdname data-type +#' @export +float <- float32 + +#' @rdname data-type +#' @export +float64 <- function() Float64__initialize() + +#' @rdname data-type +#' @export +boolean <- function() Boolean__initialize() + +#' @rdname data-type +#' @export +bool <- boolean + +#' @rdname data-type +#' @export +utf8 <- function() Utf8__initialize() + +#' @rdname data-type +#' @export +large_utf8 <- function() LargeUtf8__initialize() + +#' @rdname data-type +#' @export +binary <- function() Binary__initialize() + +#' @rdname data-type +#' @export +large_binary <- function() LargeBinary__initialize() + +#' @rdname data-type +#' @export +fixed_size_binary <- function(byte_width) FixedSizeBinary__initialize(byte_width) + +#' @rdname data-type +#' @export +string <- utf8 + +#' @rdname data-type +#' @export +date32 <- function() Date32__initialize() + +#' @rdname data-type +#' @export +date64 <- function() Date64__initialize() + +#' @rdname data-type +#' @export +time32 <- function(unit = c("ms", "s")) { + if (is.character(unit)) { + unit <- match.arg(unit) + } + unit <- make_valid_time_unit(unit, valid_time32_units) + Time32__initialize(unit) +} + +valid_time32_units <- c( + "ms" = TimeUnit$MILLI, + "s" = TimeUnit$SECOND +) + +valid_time64_units <- c( + "ns" = TimeUnit$NANO, + "us" = TimeUnit$MICRO +) + +make_valid_time_unit <- function(unit, valid_units) { + if (is.character(unit)) { + unit <- valid_units[match.arg(unit, choices = names(valid_units))] + } + if (is.numeric(unit)) { + # Allow non-integer input for convenience + unit <- as.integer(unit) + } else { + stop('"unit" should be one of ', oxford_paste(names(valid_units), "or"), call. = FALSE) + } + if (!(unit %in% valid_units)) { + stop('"unit" should be one of ', oxford_paste(valid_units, "or"), call. 
= FALSE) + } + unit +} + +#' @rdname data-type +#' @export +time64 <- function(unit = c("ns", "us")) { + if (is.character(unit)) { + unit <- match.arg(unit) + } + unit <- make_valid_time_unit(unit, valid_time64_units) + Time64__initialize(unit) +} + +#' @rdname data-type +#' @export +null <- function() Null__initialize() + +#' @rdname data-type +#' @export +timestamp <- function(unit = c("s", "ms", "us", "ns"), timezone = "") { + if (is.character(unit)) { + unit <- match.arg(unit) + } + unit <- make_valid_time_unit(unit, c(valid_time64_units, valid_time32_units)) + assert_that(is.string(timezone)) + Timestamp__initialize(unit, timezone) +} + +#' @rdname data-type +#' @export +decimal <- function(precision, scale) { + if (is.numeric(precision)) { + precision <- as.integer(precision) + } else { + stop('"precision" must be an integer', call. = FALSE) + } + if (is.numeric(scale)) { + scale <- as.integer(scale) + } else { + stop('"scale" must be an integer', call. = FALSE) + } + Decimal128Type__initialize(precision, scale) +} + +StructType <- R6Class("StructType", + inherit = NestedType, + public = list( + GetFieldByName = function(name) StructType__GetFieldByName(self, name), + GetFieldIndex = function(name) StructType__GetFieldIndex(self, name) + ) +) +StructType$create <- function(...) struct__(.fields(list(...))) + +#' @rdname data-type +#' @export +struct <- StructType$create + +ListType <- R6Class("ListType", + inherit = NestedType, + active = list( + value_field = function() ListType__value_field(self), + value_type = function() ListType__value_type(self) + ) +) + +#' @rdname data-type +#' @export +list_of <- function(type) list__(type) + +LargeListType <- R6Class("LargeListType", + inherit = NestedType, + active = list( + value_field = function() LargeListType__value_field(self), + value_type = function() LargeListType__value_type(self) + ) +) + +#' @rdname data-type +#' @export +large_list_of <- function(type) large_list__(type) + +#' @rdname data-type +#' @export +FixedSizeListType <- R6Class("FixedSizeListType", + inherit = NestedType, + active = list( + value_field = function() FixedSizeListType__value_field(self), + value_type = function() FixedSizeListType__value_type(self), + list_size = function() FixedSizeListType__list_size(self) + ) +) + +#' @rdname data-type +#' @export +fixed_size_list_of <- function(type, list_size) fixed_size_list__(type, list_size) + +as_type <- function(type, name = "type") { + # magic so we don't have to mask base::double() + if (identical(type, double())) { + type <- float64() + } + if (!inherits(type, "DataType")) { + stop(name, " must be a DataType, not ", class(type), call. = FALSE) + } + type +} + +canonical_type_str <- function(type_str) { + # canonicalizes data type strings, converting data type function names and + # aliases to match the strings returned by DataType$ToString() + assert_that(is.string(type_str)) + if (grepl("[([<]", type_str)) { + stop("Cannot interpret string representations of data types that have parameters", call. 
= FALSE) + } + switch(type_str, + int8 = "int8", + int16 = "int16", + int32 = "int32", + int64 = "int64", + uint8 = "uint8", + uint16 = "uint16", + uint32 = "uint32", + uint64 = "uint64", + float16 = "halffloat", + halffloat = "halffloat", + float32 = "float", + float = "float", + float64 = "double", + double = "double", + boolean = "bool", + bool = "bool", + utf8 = "string", + large_utf8 = "large_string", + large_string = "large_string", + binary = "binary", + large_binary = "large_binary", + fixed_size_binary = "fixed_size_binary", + string = "string", + date32 = "date32", + date64 = "date64", + time32 = "time32", + time64 = "time64", + null = "null", + timestamp = "timestamp", + decimal = "decimal128", + struct = "struct", + list_of = "list", + list = "list", + large_list_of = "large_list", + large_list = "large_list", + fixed_size_list_of = "fixed_size_list", + fixed_size_list = "fixed_size_list", + stop("Unrecognized string representation of data type", call. = FALSE) + ) +} + +# vctrs support ----------------------------------------------------------- +str_dup <- function(x, times) { + paste0(rep(x, times = times), collapse = "") +} + +indent <- function(x, n) { + pad <- str_dup(" ", n) + sapply(x, gsub, pattern = "(\n+)", replacement = paste0("\\1", pad)) +} + +#' @importFrom vctrs vec_ptype_full vec_ptype_abbr +#' @export +vec_ptype_full.arrow_fixed_size_binary <- function(x, ...) { + paste0("fixed_size_binary<", attr(x, "byte_width"), ">") +} + +#' @export +vec_ptype_full.arrow_list <- function(x, ...) { + param <- vec_ptype_full(attr(x, "ptype")) + if (grepl("\n", param)) { + param <- paste0(indent(paste0("\n", param), 2), "\n") + } + paste0("list<", param, ">") +} + +#' @export +vec_ptype_full.arrow_large_list <- function(x, ...) { + param <- vec_ptype_full(attr(x, "ptype")) + if (grepl("\n", param)) { + param <- paste0(indent(paste0("\n", param), 2), "\n") + } + paste0("large_list<", param, ">") +} + +#' @export +vec_ptype_full.arrow_fixed_size_list <- function(x, ...) { + param <- vec_ptype_full(attr(x, "ptype")) + if (grepl("\n", param)) { + param <- paste0(indent(paste0("\n", param), 2), "\n") + } + paste0("fixed_size_list<", param, ", ", attr(x, "list_size"), ">") +} + +#' @export +vec_ptype_abbr.arrow_fixed_size_binary <- function(x, ...) { + vec_ptype_full(x, ...) +} +#' @export +vec_ptype_abbr.arrow_list <- function(x, ...) { + vec_ptype_full(x, ...) +} +#' @export +vec_ptype_abbr.arrow_large_list <- function(x, ...) { + vec_ptype_full(x, ...) +} +#' @export +vec_ptype_abbr.arrow_fixed_size_list <- function(x, ...) { + vec_ptype_full(x, ...) +} diff --git a/src/arrow/r/R/util.R b/src/arrow/r/R/util.R new file mode 100644 index 000000000..9e3ade6a9 --- /dev/null +++ b/src/arrow/r/R/util.R @@ -0,0 +1,195 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +# for compatibility with R versions earlier than 4.0.0 +if (!exists("deparse1")) { + deparse1 <- function(expr, collapse = " ", width.cutoff = 500L, ...) { + paste(deparse(expr, width.cutoff, ...), collapse = collapse) + } +} + +# for compatibility with R versions earlier than 3.6.0 +if (!exists("str2lang")) { + str2lang <- function(s) { + parse(text = s, keep.source = FALSE)[[1]] + } +} + +oxford_paste <- function(x, conjunction = "and", quote = TRUE) { + if (quote && is.character(x)) { + x <- paste0('"', x, '"') + } + if (length(x) < 2) { + return(x) + } + x[length(x)] <- paste(conjunction, x[length(x)]) + if (length(x) > 2) { + return(paste(x, collapse = ", ")) + } else { + return(paste(x, collapse = " ")) + } +} + +assert_is <- function(object, class) { + msg <- paste(substitute(object), "must be a", oxford_paste(class, "or")) + assert_that(inherits(object, class), msg = msg) +} + +assert_is_list_of <- function(object, class) { + msg <- paste(substitute(object), "must be a list of", oxford_paste(class, "or")) + assert_that(is_list_of(object, class), msg = msg) +} + +is_list_of <- function(object, class) { + is.list(object) && all(map_lgl(object, ~ inherits(., class))) +} + +empty_named_list <- function() structure(list(), .Names = character(0)) + +r_symbolic_constants <- c( + "pi", "TRUE", "FALSE", "NULL", "Inf", "NA", "NaN", + "NA_integer_", "NA_real_", "NA_complex_", "NA_character_" +) + +is_function <- function(expr, name) { + # We could have a quosure here if we have an expression like `sum({{ var }})` + if (is_quosure(expr)) { + expr <- quo_get_expr(expr) + } + if (!is.call(expr)) { + return(FALSE) + } else { + if (deparse(expr[[1]]) == name) { + return(TRUE) + } + out <- lapply(expr, is_function, name) + } + any(map_lgl(out, isTRUE)) +} + +all_funs <- function(expr) { + # It is not sufficient to simply do: setdiff(all.names, all.vars) + # here because that would fail to return the names of functions that + # share names with variables. + # To preserve duplicates, call `all.names()` not `all_names()` here. 
+ if (is_quosure(expr)) { + expr <- quo_get_expr(expr) + } + names <- all.names(expr) + names[map_lgl(names, ~ is_function(expr, .))] +} + +all_vars <- function(expr) { + setdiff(all.vars(expr), r_symbolic_constants) +} + +all_names <- function(expr) { + setdiff(all.names(expr), r_symbolic_constants) +} + +is_constant <- function(expr) { + length(all_vars(expr)) == 0 +} + +read_compressed_error <- function(e) { + msg <- conditionMessage(e) + if (grepl(" codec ", msg)) { + compression <- sub(".*Support for codec '(.*)'.*", "\\1", msg) + e$message <- paste0( + msg, + "\nIn order to read this file, you will need to reinstall arrow with additional features enabled.", + "\nSet one of these environment variables before installing:", + sprintf("\n\n * LIBARROW_MINIMAL=false (for all optional features, including '%s')", compression), + sprintf("\n * ARROW_WITH_%s=ON (for just '%s')", toupper(compression), compression), + "\n\nSee https://arrow.apache.org/docs/r/articles/install.html for details" + ) + } + stop(e) +} + +handle_parquet_io_error <- function(e, format) { + msg <- conditionMessage(e) + if (grepl("Parquet magic bytes not found in footer", msg) && length(format) > 1 && is_character(format)) { + # If length(format) > 1, that means it is (almost certainly) the default/not specified value + # so let the user know that they should specify the actual (not parquet) format + abort(c( + msg, + i = "Did you mean to specify a 'format' other than the default (parquet)?" + )) + } + stop(e) +} + +is_writable_table <- function(x) { + inherits(x, c("data.frame", "ArrowTabular")) +} + +# This attribute is used when is_writable is passed into assert_that, and allows +# the call to form part of the error message when is_writable is FALSE +attr(is_writable_table, "fail") <- function(call, env) { + paste0( + deparse(call$x), + " must be an object of class 'data.frame', 'RecordBatch', or 'Table', not '", + class(env[[deparse(call$x)]])[[1]], + "'." + ) +} + +#' Recycle scalar values in a list of arrays +#' +#' @param arrays List of arrays +#' @return List of arrays with any vector/Scalar/Array/ChunkedArray values of length 1 recycled +#' @keywords internal +recycle_scalars <- function(arrays) { + # Get lengths of items in arrays + arr_lens <- map_int(arrays, NROW) + + is_scalar <- arr_lens == 1 + + if (length(arrays) > 1 && any(is_scalar) && !all(is_scalar)) { + + # Recycling not supported for tibbles and data.frames + if (all(map_lgl(arrays, ~ inherits(.x, "data.frame")))) { + abort(c( + "All input tibbles or data.frames must have the same number of rows", + x = paste( + "Number of rows in longest and shortest inputs:", + oxford_paste(c(max(arr_lens), min(arr_lens))) + ) + )) + } + + max_array_len <- max(arr_lens) + arrays[is_scalar] <- lapply(arrays[is_scalar], repeat_value_as_array, max_array_len) + } + arrays +} + +#' Take an object of length 1 and repeat it. +#' +#' @param object Object of length 1 to be repeated - vector, `Scalar`, `Array`, or `ChunkedArray` +#' @param n Number of repetitions +#' +#' @return `Array` of length `n` +#' +#' @keywords internal +repeat_value_as_array <- function(object, n) { + if (inherits(object, "ChunkedArray")) { + return(Scalar$create(object$chunks[[1]])$as_array(n)) + } + return(Scalar$create(object)$as_array(n)) +} |
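The recycling and expression helpers in util.R are easiest to understand from a quick sketch. This is illustrative, not part of the package's examples; the internal helpers are reached with `:::` purely for demonstration.

```
library(arrow)

# Length-1 inputs are recycled to the length of the longest column;
# Table$create() routes its inputs through recycle_scalars().
tab <- Table$create(x = 1:5, group = "a")
tab$num_rows              # 5
as.data.frame(tab)        # "a" repeated on every row

# repeat_value_as_array() is what performs the recycling
arrow:::repeat_value_as_array("a", 3)       # an Array of length 3

# oxford_paste() formats the lists used in the assert helpers' messages
arrow:::oxford_paste(c("x", "y", "z"))      # "\"x\", \"y\", and \"z\""

# all_funs() extracts function names from an expression, preserving duplicates
arrow:::all_funs(quote(sum(x) + sum(y)))    # c("+", "sum", "sum")
```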