diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/r/R/table.R | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/r/R/table.R')
-rw-r--r-- | src/arrow/r/R/table.R | 170 |
1 files changed, 170 insertions, 0 deletions
diff --git a/src/arrow/r/R/table.R b/src/arrow/r/R/table.R new file mode 100644 index 000000000..5ae87f7e3 --- /dev/null +++ b/src/arrow/r/R/table.R @@ -0,0 +1,170 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include record-batch.R +#' @title Table class +#' @description A Table is a sequence of [chunked arrays][ChunkedArray]. They +#' have a similar interface to [record batches][RecordBatch], but they can be +#' composed from multiple record batches or chunked arrays. +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section S3 Methods and Usage: +#' Tables are data-frame-like, and many methods you expect to work on +#' a `data.frame` are implemented for `Table`. This includes `[`, `[[`, +#' `$`, `names`, `dim`, `nrow`, `ncol`, `head`, and `tail`. You can also pull +#' the data from an Arrow table into R with `as.data.frame()`. See the +#' examples. +#' +#' A caveat about the `$` method: because `Table` is an `R6` object, +#' `$` is also used to access the object's methods (see below). Methods take +#' precedence over the table's columns. So, `tab$Slice` would return the +#' "Slice" method function even if there were a column in the table called +#' "Slice". 
+#' +#' @section R6 Methods: +#' In addition to the more R-friendly S3 methods, a `Table` object has +#' the following R6 methods that map onto the underlying C++ methods: +#' +#' - `$column(i)`: Extract a `ChunkedArray` by integer position from the table +#' - `$ColumnNames()`: Get all column names (called by `names(tab)`) +#' - `$RenameColumns(value)`: Set all column names (called by `names(tab) <- value`) +#' - `$GetColumnByName(name)`: Extract a `ChunkedArray` by string name +#' - `$field(i)`: Extract a `Field` from the table schema by integer position +#' - `$SelectColumns(indices)`: Return new `Table` with specified columns, expressed as 0-based integers. +#' - `$Slice(offset, length = NULL)`: Create a zero-copy view starting at the +#' indicated integer offset and going for the given length, or to the end +#' of the table if `NULL`, the default. +#' - `$Take(i)`: return a `Table` with rows at positions given by +#' integers `i`. If `i` is an Arrow `Array` or `ChunkedArray`, it will be +#' coerced to an R vector before taking. +#' - `$Filter(i, keep_na = TRUE)`: return a `Table` with rows at positions where logical +#' vector or Arrow boolean-type `(Chunked)Array` `i` is `TRUE`. +#' - `$SortIndices(names, descending = FALSE)`: return an `Array` of integer row +#' positions that can be used to rearrange the `Table` in ascending or descending +#' order by the first named column, breaking ties with further named columns. +#' `descending` can be a logical vector of length one or of the same length as +#' `names`. +#' - `$serialize(output_stream, ...)`: Write the table to the given +#' [OutputStream] +#' - `$cast(target_schema, safe = TRUE, options = cast_options(safe))`: Alter +#' the schema of the table. +#' +#' There are also some active bindings: +#' - `$num_columns` +#' - `$num_rows` +#' - `$schema` +#' - `$metadata`: Returns the key-value metadata of the `Schema` as a named list.
#'    Modify or replace by assigning in (`tab$metadata <- new_metadata`).
#'    All list elements are coerced to string. See `schema()` for more information.
#' - `$columns`: Returns a list of `ChunkedArray`s
#' @rdname Table
#' @name Table
#' @export
Table <- R6Class("Table",
  inherit = ArrowTabular,
  public = list(
    # Unless noted otherwise, each method hands `self` (plus its own
    # arguments) to the like-named `Table__*` binding and returns the result.
    column = function(i) {
      Table__column(self, i)
    },
    ColumnNames = function() {
      Table__ColumnNames(self)
    },
    RenameColumns = function(value) {
      Table__RenameColumns(self, value)
    },
    # Look up a single column; `name` must be a character vector of length 1.
    GetColumnByName = function(name) {
      assert_is(name, "character")
      assert_that(length(name) == 1)
      Table__GetColumnByName(self, name)
    },
    RemoveColumn = function(i) {
      Table__RemoveColumn(self, i)
    },
    AddColumn = function(i, new_field, value) {
      Table__AddColumn(self, i, new_field, value)
    },
    SetColumn = function(i, new_field, value) {
      Table__SetColumn(self, i, new_field, value)
    },
    ReplaceSchemaMetadata = function(new) {
      Table__ReplaceSchemaMetadata(self, new)
    },
    field = function(i) {
      Table__field(self, i)
    },
    # Writing is delegated to `write_table()`; extra arguments pass through.
    serialize = function(output_stream, ...) {
      write_table(self, output_stream, ...)
    },
    # Convert using the threading preference from `option_use_threads()`.
    to_data_frame = function() {
      threaded <- option_use_threads()
      Table__to_dataframe(self, use_threads = threaded)
    },
    # Cast to `target_schema`. The target must be a `Schema` whose field names
    # are identical to the current ones; otherwise the call fails with
    # "incompatible schemas".
    cast = function(target_schema, safe = TRUE, ..., options = cast_options(safe, ...)) {
      assert_is(target_schema, "Schema")
      names_match <- identical(self$schema$names, target_schema$names)
      assert_that(names_match, msg = "incompatible schemas")
      Table__cast(self, target_schema, options)
    },
    SelectColumns = function(indices) {
      Table__SelectColumns(self, indices)
    },
    # With an explicit `length` use the two-argument slice binding; with the
    # default `NULL` use the offset-only one.
    Slice = function(offset, length = NULL) {
      if (!is.null(length)) {
        Table__Slice2(self, offset, length)
      } else {
        Table__Slice1(self, offset)
      }
    },
    # Take, Filter, and SortIndices are methods on ArrowTabular
    # `&&` short-circuits, so the binding is only reached when `other` is a
    # Table. `check_metadata` is reduced to a strict TRUE/FALSE via `isTRUE()`.
    Equals = function(other, check_metadata = FALSE, ...) {
      other_is_table <- inherits(other, "Table")
      other_is_table && Table__Equals(self, other, isTRUE(check_metadata))
    },
    Validate = function() {
      Table__Validate(self)
    },
    ValidateFull = function() {
      Table__ValidateFull(self)
    },
    # Call the `_arrow_Table__Reset` native routine on this object first, then
    # let the parent class run its own `invalidate()`.
    invalidate = function() {
      .Call(`_arrow_Table__Reset`, self)
      super$invalidate()
    }
  ),
  active = list(
    num_columns = function() {
      Table__num_columns(self)
    },
    num_rows = function() {
      Table__num_rows(self)
    },
    schema = function() {
      Table__schema(self)
    },
    columns = function() {
      Table__columns(self)
    }
  )
)

# Build a Table from the inputs in `...`, optionally with an explicit `schema`.
# At least one input is required.
Table$create <- function(..., schema = NULL) {
  dots <- list2(...)
  # Guarantee a names attribute: wholly unnamed input gets empty-string names.
  if (is.null(names(dots))) {
    names(dots) <- character(length(dots))
  }
  stopifnot(length(dots) > 0)

  if (all_record_batches(dots)) {
    # Every input is a record batch: assemble the table from them directly.
    Table__from_record_batches(dots, schema)
  } else {
    # If any arrays are length 1, recycle them
    dots <- recycle_scalars(dots)
    Table__from_dots(dots, schema, option_use_threads())
  }
}

#' @export
names.Table <- function(x) {
  x$ColumnNames()
}

#' @param ... A `data.frame` or a named set of Arrays or vectors. If given a
#' mixture of data.frames and named vectors, the inputs will be autospliced together
#' (see examples). Alternatively, you can provide a single Arrow IPC
#' `InputStream`, `Message`, `Buffer`, or R `raw` object containing a `Buffer`.
#' @param schema a [Schema], or `NULL` (the default) to infer the schema from
#' the data in `...`. When providing an Arrow IPC buffer, `schema` is required.
#' @rdname Table
#' @examplesIf arrow_available()
#' tbl <- arrow_table(name = rownames(mtcars), mtcars)
#' dim(tbl)
#' dim(head(tbl))
#' names(tbl)
#' tbl$mpg
#' tbl[["cyl"]]
#' as.data.frame(tbl[4:8, c("gear", "hp", "wt")])
#' @export
arrow_table <- Table$create