# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. #' @include arrow-datum.R # Base class for RecordBatch and Table for S3 method dispatch only. # Does not exist in C++ class hierarchy ArrowTabular <- R6Class("ArrowTabular", inherit = ArrowObject, public = list( ToString = function() { sch <- unlist(strsplit(self$schema$ToString(), "\n")) sch <- sub("(.*): (.*)", "$\\1 <\\2>", sch) dims <- sprintf("%s rows x %s columns", self$num_rows, self$num_columns) paste(c(dims, sch), collapse = "\n") }, Take = function(i) { if (is.numeric(i)) { i <- as.integer(i) } if (is.integer(i)) { i <- Array$create(i) } assert_that(is.Array(i)) call_function("take", self, i) }, Filter = function(i, keep_na = TRUE) { if (is.logical(i)) { i <- Array$create(i) } assert_that(is.Array(i, "bool")) call_function("filter", self, i, options = list(keep_na = keep_na)) }, SortIndices = function(names, descending = FALSE) { assert_that(is.character(names)) assert_that(length(names) > 0) assert_that(!any(is.na(names))) if (length(descending) == 1L) { descending <- rep_len(descending, length(names)) } assert_that(is.logical(descending)) assert_that(identical(length(names), length(descending))) assert_that(!any(is.na(descending))) call_function( "sort_indices", self, # cpp11 does not support logical vectors so convert to integer options = list(names = names, orders = as.integer(descending)) ) } ), active = list( metadata = function(new) { if (missing(new)) { # Get the metadata (from the schema) self$schema$metadata } else { # Set the metadata new <- prepare_key_value_metadata(new) out <- self$ReplaceSchemaMetadata(new) # ReplaceSchemaMetadata returns a new object but we're modifying in place, # so swap in that new C++ object pointer into our R6 object self$set_pointer(out$pointer()) self } }, r_metadata = function(new) { # Helper for the R metadata that handles the serialization # See also method on Schema if (missing(new)) { out <- self$metadata$r if (!is.null(out)) { # Can't unserialize NULL out <- .unserialize_arrow_r_metadata(out) } # Returns either NULL or a named list out } else { # Set the R metadata self$metadata$r <- .serialize_arrow_r_metadata(new) self } } ) ) #' @export as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) { df <- x$to_data_frame() if (!is.null(r_metadata <- x$metadata$r)) { df <- apply_arrow_r_metadata(df, .unserialize_arrow_r_metadata(r_metadata)) } df } #' @export `names<-.ArrowTabular` <- function(x, value) x$RenameColumns(value) #' @importFrom methods as #' @export `[.ArrowTabular` <- function(x, i, j, ..., drop = FALSE) { if (nargs() == 2L) { # List-like column extraction (x[i]) return(x[, i]) } if (!missing(j)) { # Selecting columns is cheaper than filtering rows, so do it first. # That way, if we're filtering too, we have fewer arrays to filter/slice/take if (is.character(j)) { j_new <- match(j, names(x)) if (any(is.na(j_new))) { stop("Column not found: ", oxford_paste(j[is.na(j_new)]), call. = FALSE) } j <- j_new } if (is_integerish(j)) { if (any(is.na(j))) { stop("Column indices cannot be NA", call. = FALSE) } if (length(j) && all(j < 0)) { # in R, negative j means "everything but j" j <- setdiff(seq_len(x$num_columns), -1 * j) } x <- x$SelectColumns(as.integer(j) - 1L) } if (drop && ncol(x) == 1L) { x <- x$column(0) } } if (!missing(i)) { x <- filter_rows(x, i, ...) } x } #' @export `[[.ArrowTabular` <- function(x, i, ...) { if (is.character(i)) { x$GetColumnByName(i) } else if (is.numeric(i)) { x$column(i - 1) } else { stop("'i' must be character or numeric, not ", class(i), call. = FALSE) } } #' @export `$.ArrowTabular` <- function(x, name, ...) { assert_that(is.string(name)) if (name %in% ls(x)) { get(name, x) } else { x$GetColumnByName(name) } } #' @export `[[<-.ArrowTabular` <- function(x, i, value) { if (!is.character(i) & !is.numeric(i)) { stop("'i' must be character or numeric, not ", class(i), call. = FALSE) } assert_that(length(i) == 1, !is.na(i)) if (is.null(value)) { if (is.character(i)) { i <- match(i, names(x)) } x <- x$RemoveColumn(i - 1L) } else { if (!is.character(i)) { # get or create a/the column name if (i <= x$num_columns) { i <- names(x)[i] } else { i <- as.character(i) } } # auto-magic recycling on non-ArrowObjects if (!inherits(value, "ArrowObject")) { value <- vctrs::vec_recycle(value, x$num_rows) } # construct the field if (inherits(x, "RecordBatch") && !inherits(value, "Array")) { value <- Array$create(value) } else if (inherits(x, "Table") && !inherits(value, "ChunkedArray")) { value <- ChunkedArray$create(value) } new_field <- field(i, value$type) if (i %in% names(x)) { i <- match(i, names(x)) - 1L x <- x$SetColumn(i, new_field, value) } else { i <- x$num_columns x <- x$AddColumn(i, new_field, value) } } x } #' @export `$<-.ArrowTabular` <- function(x, i, value) { assert_that(is.string(i)) # We need to check if `i` is in names in case it is an active binding (e.g. # `metadata`, in which case we use assign to change the active binding instead # of the column in the table) if (i %in% ls(x)) { assign(i, value, x) } else { x[[i]] <- value } x } #' @export dim.ArrowTabular <- function(x) c(x$num_rows, x$num_columns) #' @export length.ArrowTabular <- function(x) x$num_columns #' @export as.list.ArrowTabular <- function(x, ...) as.list(as.data.frame(x, ...)) #' @export row.names.ArrowTabular <- function(x) as.character(seq_len(nrow(x))) #' @export dimnames.ArrowTabular <- function(x) list(row.names(x), names(x)) #' @export head.ArrowTabular <- head.ArrowDatum #' @export tail.ArrowTabular <- tail.ArrowDatum #' @export na.fail.ArrowTabular <- function(object, ...) { for (col in seq_len(object$num_columns)) { if (object$column(col - 1L)$null_count > 0) { stop("missing values in object", call. = FALSE) } } object } #' @export na.omit.ArrowTabular <- function(object, ...) { not_na <- map(object$columns, ~ call_function("is_valid", .x)) not_na_agg <- Reduce("&", not_na) object$Filter(not_na_agg) } #' @export na.exclude.ArrowTabular <- na.omit.ArrowTabular