diff options
Diffstat (limited to 'src/arrow/r/R/array.R')
-rw-r--r-- | src/arrow/r/R/array.R | 329 |
1 files changed, 329 insertions, 0 deletions
diff --git a/src/arrow/r/R/array.R b/src/arrow/r/R/array.R new file mode 100644 index 000000000..46acc14ff --- /dev/null +++ b/src/arrow/r/R/array.R @@ -0,0 +1,329 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include arrow-datum.R + +#' @title Arrow Arrays +#' @description An `Array` is an immutable data array with some logical type +#' and some length. Most logical types are contained in the base +#' `Array` class; there are also subclasses for `DictionaryArray`, `ListArray`, +#' and `StructArray`. +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Factory: +#' The `Array$create()` factory method instantiates an `Array` and +#' takes the following arguments: +#' * `x`: an R vector, list, or `data.frame` +#' * `type`: an optional [data type][data-type] for `x`. If omitted, the type +#' will be inferred from the data. +#' +#' `Array$create()` will return the appropriate subclass of `Array`, such as +#' `DictionaryArray` when given an R factor. +#' +#' To compose a `DictionaryArray` directly, call `DictionaryArray$create()`, +#' which takes two arguments: +#' * `x`: an R vector or `Array` of integers for the dictionary indices +#' * `dict`: an R vector or `Array` of dictionary values (like R factor levels +#' but not limited to strings only) +#' @section Usage: +#' +#' ``` +#' a <- Array$create(x) +#' length(a) +#' +#' print(a) +#' a == a +#' ``` +#' +#' @section Methods: +#' +#' - `$IsNull(i)`: Return true if value at index is null. Does not boundscheck +#' - `$IsValid(i)`: Return true if value at index is valid. Does not boundscheck +#' - `$length()`: Size in the number of elements this array contains +#' - `$offset`: A relative position into another array's data, to enable zero-copy slicing +#' - `$null_count`: The number of null entries in the array +#' - `$type`: logical type of data +#' - `$type_id()`: type id +#' - `$Equals(other)` : is this array equal to `other` +#' - `$ApproxEquals(other)` : +#' - `$Diff(other)` : return a string expressing the difference between two arrays +#' - `$data()`: return the underlying [ArrayData][ArrayData] +#' - `$as_vector()`: convert to an R vector +#' - `$ToString()`: string representation of the array +#' - `$Slice(offset, length = NULL)`: Construct a zero-copy slice of the array +#' with the indicated offset and length. If length is `NULL`, the slice goes +#' until the end of the array. +#' - `$Take(i)`: return an `Array` with values at positions given by integers +#' (R vector or Array Array) `i`. +#' - `$Filter(i, keep_na = TRUE)`: return an `Array` with values at positions where logical +#' vector (or Arrow boolean Array) `i` is `TRUE`. +#' - `$SortIndices(descending = FALSE)`: return an `Array` of integer positions that can be +#' used to rearrange the `Array` in ascending or descending order +#' - `$RangeEquals(other, start_idx, end_idx, other_start_idx)` : +#' - `$cast(target_type, safe = TRUE, options = cast_options(safe))`: Alter the +#' data in the array to change its type. +#' - `$View(type)`: Construct a zero-copy view of this array with the given type. +#' - `$Validate()` : Perform any validation checks to determine obvious inconsistencies +#' within the array's internal data. This can be an expensive check, potentially `O(length)` +#' +#' @rdname array +#' @name array +#' @examplesIf arrow_available() +#' my_array <- Array$create(1:10) +#' my_array$type +#' my_array$cast(int8()) +#' +#' # Check if value is null; zero-indexed +#' na_array <- Array$create(c(1:5, NA)) +#' na_array$IsNull(0) +#' na_array$IsNull(5) +#' na_array$IsValid(5) +#' na_array$null_count +#' +#' # zero-copy slicing; the offset of the new Array will be the same as the index passed to $Slice +#' new_array <- na_array$Slice(5) +#' new_array$offset +#' +#' # Compare 2 arrays +#' na_array2 <- na_array +#' na_array2 == na_array # element-wise comparison +#' na_array2$Equals(na_array) # overall comparison +#' @export +Array <- R6Class("Array", + inherit = ArrowDatum, + public = list( + IsNull = function(i) Array__IsNull(self, i), + IsValid = function(i) Array__IsValid(self, i), + length = function() Array__length(self), + type_id = function() Array__type_id(self), + Equals = function(other, ...) { + inherits(other, "Array") && Array__Equals(self, other) + }, + ApproxEquals = function(other) { + inherits(other, "Array") && Array__ApproxEquals(self, other) + }, + Diff = function(other) { + if (!inherits(other, "Array")) { + other <- Array$create(other) + } + Array__Diff(self, other) + }, + data = function() Array__data(self), + as_vector = function() Array__as_vector(self), + ToString = function() { + typ <- paste0("<", self$type$ToString(), ">") + paste(typ, Array__ToString(self), sep = "\n") + }, + Slice = function(offset, length = NULL) { + if (is.null(length)) { + Array__Slice1(self, offset) + } else { + Array__Slice2(self, offset, length) + } + }, + Take = function(i) { + if (is.numeric(i)) { + i <- as.integer(i) + } + if (is.integer(i)) { + i <- Array$create(i) + } + call_function("take", self, i) + }, + Filter = function(i, keep_na = TRUE) { + if (is.logical(i)) { + i <- Array$create(i) + } + assert_is(i, "Array") + call_function("filter", self, i, options = list(keep_na = keep_na)) + }, + SortIndices = function(descending = FALSE) { + assert_that(is.logical(descending)) + assert_that(length(descending) == 1L) + assert_that(!is.na(descending)) + call_function("array_sort_indices", self, options = list(order = descending)) + }, + RangeEquals = function(other, start_idx, end_idx, other_start_idx = 0L) { + assert_is(other, "Array") + Array__RangeEquals(self, other, start_idx, end_idx, other_start_idx) + }, + View = function(type) { + Array$create(Array__View(self, as_type(type))) + }, + Validate = function() Array__Validate(self), + export_to_c = function(array_ptr, schema_ptr) ExportArray(self, array_ptr, schema_ptr) + ), + active = list( + null_count = function() Array__null_count(self), + offset = function() Array__offset(self), + type = function() Array__type(self) + ) +) +Array$create <- function(x, type = NULL) { + if (!is.null(type)) { + type <- as_type(type) + } + if (inherits(x, "Scalar")) { + out <- x$as_array() + if (!is.null(type)) { + out <- out$cast(type) + } + return(out) + } + vec_to_arrow(x, type) +} +#' @include arrowExports.R +Array$import_from_c <- ImportArray + +#' @rdname array +#' @usage NULL +#' @format NULL +#' @export +DictionaryArray <- R6Class("DictionaryArray", + inherit = Array, + public = list( + indices = function() DictionaryArray__indices(self), + dictionary = function() DictionaryArray__dictionary(self) + ), + active = list( + ordered = function() self$type$ordered + ) +) +DictionaryArray$create <- function(x, dict = NULL) { + if (is.factor(x)) { + # The simple case: converting a factor. + # Ignoring `dict`; should probably error if dict is not NULL + return(Array$create(x)) + } + + assert_that(!is.null(dict)) + if (!is.Array(x)) { + x <- Array$create(x) + } + if (!is.Array(dict)) { + dict <- Array$create(dict) + } + type <- DictionaryType$create(x$type, dict$type) + DictionaryArray__FromArrays(type, x, dict) +} + +#' @rdname array +#' @usage NULL +#' @format NULL +#' @export +StructArray <- R6Class("StructArray", + inherit = Array, + public = list( + field = function(i) StructArray__field(self, i), + GetFieldByName = function(name) StructArray__GetFieldByName(self, name), + Flatten = function() StructArray__Flatten(self) + ) +) + + +#' @export +`[[.StructArray` <- function(x, i, ...) { + if (is.character(i)) { + x$GetFieldByName(i) + } else if (is.numeric(i)) { + x$field(i - 1) + } else { + stop("'i' must be character or numeric, not ", class(i), call. = FALSE) + } +} + +#' @export +`$.StructArray` <- function(x, name, ...) { + assert_that(is.string(name)) + if (name %in% ls(x)) { + get(name, x) + } else { + x$GetFieldByName(name) + } +} + +#' @export +names.StructArray <- function(x, ...) StructType__field_names(x$type) + +#' @export +dim.StructArray <- function(x, ...) c(length(x), x$type$num_fields) + +#' @export +as.data.frame.StructArray <- function(x, row.names = NULL, optional = FALSE, ...) { + as.vector(x) +} + +#' @rdname array +#' @usage NULL +#' @format NULL +#' @export +ListArray <- R6Class("ListArray", + inherit = Array, + public = list( + values = function() ListArray__values(self), + value_length = function(i) ListArray__value_length(self, i), + value_offset = function(i) ListArray__value_offset(self, i), + raw_value_offsets = function() ListArray__raw_value_offsets(self) + ), + active = list( + value_type = function() ListArray__value_type(self) + ) +) + +#' @rdname array +#' @usage NULL +#' @format NULL +#' @export +LargeListArray <- R6Class("LargeListArray", + inherit = Array, + public = list( + values = function() LargeListArray__values(self), + value_length = function(i) LargeListArray__value_length(self, i), + value_offset = function(i) LargeListArray__value_offset(self, i), + raw_value_offsets = function() LargeListArray__raw_value_offsets(self) + ), + active = list( + value_type = function() LargeListArray__value_type(self) + ) +) + +#' @rdname array +#' @usage NULL +#' @format NULL +#' @export +FixedSizeListArray <- R6Class("FixedSizeListArray", + inherit = Array, + public = list( + values = function() FixedSizeListArray__values(self), + value_length = function(i) FixedSizeListArray__value_length(self, i), + value_offset = function(i) FixedSizeListArray__value_offset(self, i) + ), + active = list( + value_type = function() FixedSizeListArray__value_type(self), + list_size = function() self$type$list_size + ) +) + +is.Array <- function(x, type = NULL) { # nolint + is_it <- inherits(x, c("Array", "ChunkedArray")) + if (is_it && !is.null(type)) { + is_it <- x$type$ToString() %in% type + } + is_it +} |