diff options
Diffstat (limited to 'src/arrow/r/R/schema.R')
-rw-r--r-- | src/arrow/r/R/schema.R | 330 |
1 files changed, 330 insertions, 0 deletions
diff --git a/src/arrow/r/R/schema.R b/src/arrow/r/R/schema.R new file mode 100644 index 000000000..c3dfee5f9 --- /dev/null +++ b/src/arrow/r/R/schema.R @@ -0,0 +1,330 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include arrow-package.R +#' @title Schema class +#' +#' @description A `Schema` is a list of [Field]s, which map names to +#' Arrow [data types][data-type]. Create a `Schema` when you +#' want to convert an R `data.frame` to Arrow but don't want to rely on the +#' default mapping of R types to Arrow types, such as when you want to choose a +#' specific numeric precision, or when creating a [Dataset] and you want to +#' ensure a specific schema rather than inferring it from the various files. +#' +#' Many Arrow objects, including [Table] and [Dataset], have a `$schema` method +#' (active binding) that lets you access their schema. +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' @section Methods: +#' +#' - `$ToString()`: convert to a string +#' - `$field(i)`: returns the field at index `i` (0-based) +#' - `$GetFieldByName(x)`: returns the field with name `x` +#' - `$WithMetadata(metadata)`: returns a new `Schema` with the key-value +#' `metadata` set. Note that all list elements in `metadata` will be coerced +#' to `character`. +#' +#' @section Active bindings: +#' +#' - `$names`: returns the field names (called in `names(Schema)`) +#' - `$num_fields`: returns the number of fields (called in `length(Schema)`) +#' - `$fields`: returns the list of `Field`s in the `Schema`, suitable for +#' iterating over +#' - `$HasMetadata`: logical: does this `Schema` have extra metadata? +#' - `$metadata`: returns the key-value metadata as a named list. +#' Modify or replace by assigning in (`sch$metadata <- new_metadata`). +#' All list elements are coerced to string. +#' +#' @section R Metadata: +#' +#' When converting a data.frame to an Arrow Table or RecordBatch, attributes +#' from the `data.frame` are saved alongside tables so that the object can be +#' reconstructed faithfully in R (e.g. with `as.data.frame()`). This metadata +#' can be both at the top-level of the `data.frame` (e.g. `attributes(df)`) or +#' at the column (e.g. `attributes(df$col_a)`) or for list columns only: +#' element level (e.g. `attributes(df[1, "col_a"])`). For example, this allows +#' for storing `haven` columns in a table and being able to faithfully +#' re-create them when pulled back into R. This metadata is separate from the +#' schema (column names and types) which is compatible with other Arrow +#' clients. The R metadata is only read by R and is ignored by other clients +#' (e.g. Pandas has its own custom metadata). This metadata is stored in +#' `$metadata$r`. +#' +#' Since Schema metadata keys and values must be strings, this metadata is +#' saved by serializing R's attribute list structure to a string. If the +#' serialized metadata exceeds 100Kb in size, by default it is compressed +#' starting in version 3.0.0. To disable this compression (e.g. for tables +#' that are compatible with Arrow versions before 3.0.0 and include large +#' amounts of metadata), set the option `arrow.compress_metadata` to `FALSE`. +#' Files with compressed metadata are readable by older versions of arrow, but +#' the metadata is dropped. +#' +#' @rdname Schema +#' @name Schema +#' @examplesIf arrow_available() +#' df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5)) +#' tab1 <- arrow_table(df) +#' tab1$schema +#' tab2 <- arrow_table(df, schema = schema(col1 = int8(), col2 = float32())) +#' tab2$schema +#' @export +Schema <- R6Class("Schema", + inherit = ArrowObject, + public = list( + ToString = function() { + fields <- print_schema_fields(self) + if (self$HasMetadata) { + fields <- paste0(fields, "\n\nSee $metadata for additional Schema metadata") + } + fields + }, + field = function(i) Schema__field(self, i), + GetFieldByName = function(x) Schema__GetFieldByName(self, x), + AddField = function(i, field) { + assert_is(field, "Field") + Schema__AddField(self, i, field) + }, + SetField = function(i, field) { + assert_is(field, "Field") + Schema__SetField(self, i, field) + }, + RemoveField = function(i) Schema__RemoveField(self, i), + serialize = function() Schema__serialize(self), + WithMetadata = function(metadata = NULL) { + metadata <- prepare_key_value_metadata(metadata) + Schema__WithMetadata(self, metadata) + }, + Equals = function(other, check_metadata = FALSE, ...) { + inherits(other, "Schema") && Schema__Equals(self, other, isTRUE(check_metadata)) + }, + export_to_c = function(ptr) ExportSchema(self, ptr) + ), + active = list( + names = function() { + Schema__field_names(self) + }, + num_fields = function() Schema__num_fields(self), + fields = function() Schema__fields(self), + HasMetadata = function() Schema__HasMetadata(self), + metadata = function(new_metadata) { + if (missing(new_metadata)) { + Schema__metadata(self) + } else { + # Set the metadata + out <- self$WithMetadata(new_metadata) + # $WithMetadata returns a new object but we're modifying in place, + # so swap in that new C++ object pointer into our R6 object + self$set_pointer(out$pointer()) + self + } + }, + r_metadata = function(new) { + # Helper for the R metadata that handles the serialization + # See also method on ArrowTabular + if (missing(new)) { + out <- self$metadata$r + if (!is.null(out)) { + # Can't unserialize NULL + out <- .unserialize_arrow_r_metadata(out) + } + # Returns either NULL or a named list + out + } else { + # Set the R metadata + self$metadata$r <- .serialize_arrow_r_metadata(new) + self + } + } + ) +) +Schema$create <- function(...) { + .list <- list2(...) + if (all(map_lgl(.list, ~ inherits(., "Field")))) { + schema_(.list) + } else { + schema_(.fields(.list)) + } +} +#' @include arrowExports.R +Schema$import_from_c <- ImportSchema + +prepare_key_value_metadata <- function(metadata) { + # key-value-metadata must be a named character vector; + # this function validates and coerces + if (is.null(metadata)) { + # NULL to remove metadata, so equivalent to setting an empty list + metadata <- empty_named_list() + } + if (is.null(names(metadata))) { + stop( + "Key-value metadata must be a named list or character vector", + call. = FALSE + ) + } + map_chr(metadata, as.character) +} + +print_schema_fields <- function(s) { + # Alternative to Schema__ToString that doesn't print metadata + paste(map_chr(s$fields, ~ .$ToString()), collapse = "\n") +} + +#' @param ... named list containing [data types][data-type] or +#' a list of [fields][field] containing the fields for the schema +#' @export +#' @rdname Schema +schema <- Schema$create + +#' @export +names.Schema <- function(x) x$names + +#' @export +length.Schema <- function(x) x$num_fields + +#' @export +`[[.Schema` <- function(x, i, ...) { + if (is.character(i)) { + x$GetFieldByName(i) + } else if (is.numeric(i)) { + x$field(i - 1) + } else { + stop("'i' must be character or numeric, not ", class(i), call. = FALSE) + } +} + +#' @export +`[[<-.Schema` <- function(x, i, value) { + assert_that(length(i) == 1) + if (is.character(i)) { + field_names <- names(x) + if (anyDuplicated(field_names)) { + stop("Cannot update field by name with duplicates", call. = FALSE) + } + + # If i is character, it's the field name + if (!is.null(value) && !inherits(value, "Field")) { + value <- field(i, as_type(value, "value")) + } + + # No match means we're adding to the end + i <- match(i, field_names, nomatch = length(field_names) + 1L) + } else { + assert_that(is.numeric(i), !is.na(i), i > 0) + # If i is numeric and we have a type, + # we need to grab the existing field name for the new one + if (!is.null(value) && !inherits(value, "Field")) { + value <- field(names(x)[i], as_type(value, "value")) + } + } + + i <- as.integer(i - 1L) + if (i >= length(x)) { + if (!is.null(value)) { + x <- x$AddField(i, value) + } + } else if (is.null(value)) { + x <- x$RemoveField(i) + } else { + x <- x$SetField(i, value) + } + x +} + +#' @export +`$<-.Schema` <- `$<-.ArrowTabular` + +#' @export +`[.Schema` <- function(x, i, ...) { + if (is.logical(i)) { + i <- rep_len(i, length(x)) # For R recycling behavior + i <- which(i) + } + if (is.numeric(i)) { + if (all(i < 0)) { + # in R, negative i means "everything but i" + i <- setdiff(seq_len(length(x)), -1 * i) + } + } + fields <- map(i, ~ x[[.]]) + invalid <- map_lgl(fields, is.null) + if (any(invalid)) { + stop( + "Invalid field name", ifelse(sum(invalid) > 1, "s: ", ": "), + oxford_paste(i[invalid]), + call. = FALSE + ) + } + schema_(fields) +} + +#' @export +`$.Schema` <- function(x, name, ...) { + assert_that(is.string(name)) + if (name %in% ls(x)) { + get(name, x) + } else { + x$GetFieldByName(name) + } +} + +#' @export +as.list.Schema <- function(x, ...) x$fields + +#' read a Schema from a stream +#' +#' @param stream a `Message`, `InputStream`, or `Buffer` +#' @param ... currently ignored +#' @return A [Schema] +#' @export +read_schema <- function(stream, ...) { + if (inherits(stream, "Message")) { + return(ipc___ReadSchema_Message(stream)) + } else { + if (!inherits(stream, "InputStream")) { + stream <- BufferReader$create(stream) + on.exit(stream$close()) + } + return(ipc___ReadSchema_InputStream(stream)) + } +} + +#' Combine and harmonize schemas +#' +#' @param ... [Schema]s to unify +#' @param schemas Alternatively, a list of schemas +#' @return A `Schema` with the union of fields contained in the inputs, or +#' `NULL` if any of `schemas` is `NULL` +#' @export +#' @examplesIf arrow_available() +#' a <- schema(b = double(), c = bool()) +#' z <- schema(b = double(), k = utf8()) +#' unify_schemas(a, z) +unify_schemas <- function(..., schemas = list(...)) { + if (any(vapply(schemas, is.null, TRUE))) { + return(NULL) + } + arrow__UnifySchemas(schemas) +} + +#' @export +print.arrow_r_metadata <- function(x, ...) { + utils::str(x) + utils::str(.unserialize_arrow_r_metadata(x)) + invisible(x) +} |