diff options
Diffstat (limited to '')
-rw-r--r-- | src/arrow/r/R/type.R | 541 |
1 files changed, 541 insertions, 0 deletions
diff --git a/src/arrow/r/R/type.R b/src/arrow/r/R/type.R new file mode 100644 index 000000000..4ef7cefb5 --- /dev/null +++ b/src/arrow/r/R/type.R @@ -0,0 +1,541 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @include arrow-package.R +#' @title class arrow::DataType +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' TODO +#' +#' @rdname DataType +#' @name DataType +DataType <- R6Class("DataType", + inherit = ArrowObject, + public = list( + ToString = function() { + DataType__ToString(self) + }, + Equals = function(other, ...) { + inherits(other, "DataType") && DataType__Equals(self, other) + }, + fields = function() { + DataType__fields(self) + }, + export_to_c = function(ptr) ExportType(self, ptr) + ), + active = list( + id = function() DataType__id(self), + name = function() DataType__name(self), + num_fields = function() DataType__num_fields(self) + ) +) + +#' @include arrowExports.R +DataType$import_from_c <- ImportType + +INTEGER_TYPES <- as.character(outer(c("uint", "int"), c(8, 16, 32, 64), paste0)) +FLOAT_TYPES <- c("float16", "float32", "float64", "halffloat", "float", "double") + +#' infer the arrow Array type from an R vector +#' +#' @param x an R vector +#' +#' @return an arrow logical type +#' @examplesIf arrow_available() +#' type(1:10) +#' type(1L:10L) +#' type(c(1, 1.5, 2)) +#' type(c("A", "B", "C")) +#' type(mtcars) +#' type(Sys.Date()) +#' @export +type <- function(x) UseMethod("type") + +#' @export +type.default <- function(x) Array__infer_type(x) + +#' @export +type.ArrowDatum <- function(x) x$type + +#----- metadata + +#' @title class arrow::FixedWidthType +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' TODO +#' +#' @rdname FixedWidthType +#' @name FixedWidthType +FixedWidthType <- R6Class("FixedWidthType", + inherit = DataType, + active = list( + bit_width = function() FixedWidthType__bit_width(self) + ) +) + +Int8 <- R6Class("Int8", inherit = FixedWidthType) +Int16 <- R6Class("Int16", inherit = FixedWidthType) +Int32 <- R6Class("Int32", inherit = FixedWidthType) +Int64 <- R6Class("Int64", inherit = FixedWidthType) +UInt8 <- R6Class("UInt8", inherit = FixedWidthType) +UInt16 <- R6Class("UInt16", inherit = FixedWidthType) +UInt32 <- R6Class("UInt32", inherit = FixedWidthType) +UInt64 <- R6Class("UInt64", inherit = FixedWidthType) +Float16 <- R6Class("Float16", inherit = FixedWidthType) +Float32 <- R6Class("Float32", inherit = FixedWidthType) +Float64 <- R6Class("Float64", inherit = FixedWidthType) +Boolean <- R6Class("Boolean", inherit = FixedWidthType) +Utf8 <- R6Class("Utf8", inherit = DataType) +LargeUtf8 <- R6Class("LargeUtf8", inherit = DataType) +Binary <- R6Class("Binary", inherit = DataType) +FixedSizeBinary <- R6Class("FixedSizeBinary", inherit = FixedWidthType) +LargeBinary <- R6Class("LargeBinary", inherit = DataType) + +DateType <- R6Class("DateType", + inherit = FixedWidthType, + public = list( + unit = function() DateType__unit(self) + ) +) +Date32 <- R6Class("Date32", inherit = DateType) +Date64 <- R6Class("Date64", inherit = DateType) + +TimeType <- R6Class("TimeType", + inherit = FixedWidthType, + public = list( + unit = function() TimeType__unit(self) + ) +) +Time32 <- R6Class("Time32", inherit = TimeType) +Time64 <- R6Class("Time64", inherit = TimeType) + +Null <- R6Class("Null", inherit = DataType) + +Timestamp <- R6Class("Timestamp", + inherit = FixedWidthType, + public = list( + timezone = function() TimestampType__timezone(self), + unit = function() TimestampType__unit(self) + ) +) + +DecimalType <- R6Class("DecimalType", + inherit = FixedWidthType, + public = list( + precision = function() DecimalType__precision(self), + scale = function() DecimalType__scale(self) + ) +) +Decimal128Type <- R6Class("Decimal128Type", inherit = DecimalType) + +NestedType <- R6Class("NestedType", inherit = DataType) + +#' Apache Arrow data types +#' +#' These functions create type objects corresponding to Arrow types. Use them +#' when defining a [schema()] or as inputs to other types, like `struct`. Most +#' of these functions don't take arguments, but a few do. +#' +#' A few functions have aliases: +#' +#' * `utf8()` and `string()` +#' * `float16()` and `halffloat()` +#' * `float32()` and `float()` +#' * `bool()` and `boolean()` +#' * When called inside an `arrow` function, such as `schema()` or `cast()`, +#' `double()` also is supported as a way of creating a `float64()` +#' +#' `date32()` creates a datetime type with a "day" unit, like the R `Date` +#' class. `date64()` has a "ms" unit. +#' +#' `uint32` (32 bit unsigned integer), `uint64` (64 bit unsigned integer), and +#' `int64` (64-bit signed integer) types may contain values that exceed the +#' range of R's `integer` type (32-bit signed integer). When these arrow objects +#' are translated to R objects, `uint32` and `uint64` are converted to `double` +#' ("numeric") and `int64` is converted to `bit64::integer64`. For `int64` +#' types, this conversion can be disabled (so that `int64` always yields a +#' `bit64::integer64` object) by setting `options(arrow.int64_downcast = +#' FALSE)`. +#' +#' @param unit For time/timestamp types, the time unit. `time32()` can take +#' either "s" or "ms", while `time64()` can be "us" or "ns". `timestamp()` can +#' take any of those four values. +#' @param timezone For `timestamp()`, an optional time zone string. +#' @param byte_width byte width for `FixedSizeBinary` type. +#' @param list_size list size for `FixedSizeList` type. +#' @param precision For `decimal()`, precision +#' @param scale For `decimal()`, scale +#' @param type For `list_of()`, a data type to make a list-of-type +#' @param ... For `struct()`, a named list of types to define the struct columns +#' +#' @name data-type +#' @return An Arrow type object inheriting from DataType. +#' @export +#' @seealso [dictionary()] for creating a dictionary (factor-like) type. +#' @examplesIf arrow_available() +#' bool() +#' struct(a = int32(), b = double()) +#' timestamp("ms", timezone = "CEST") +#' time64("ns") +int8 <- function() Int8__initialize() + +#' @rdname data-type +#' @export +int16 <- function() Int16__initialize() + +#' @rdname data-type +#' @export +int32 <- function() Int32__initialize() + +#' @rdname data-type +#' @export +int64 <- function() Int64__initialize() + +#' @rdname data-type +#' @export +uint8 <- function() UInt8__initialize() + +#' @rdname data-type +#' @export +uint16 <- function() UInt16__initialize() + +#' @rdname data-type +#' @export +uint32 <- function() UInt32__initialize() + +#' @rdname data-type +#' @export +uint64 <- function() UInt64__initialize() + +#' @rdname data-type +#' @export +float16 <- function() Float16__initialize() + +#' @rdname data-type +#' @export +halffloat <- float16 + +#' @rdname data-type +#' @export +float32 <- function() Float32__initialize() + +#' @rdname data-type +#' @export +float <- float32 + +#' @rdname data-type +#' @export +float64 <- function() Float64__initialize() + +#' @rdname data-type +#' @export +boolean <- function() Boolean__initialize() + +#' @rdname data-type +#' @export +bool <- boolean + +#' @rdname data-type +#' @export +utf8 <- function() Utf8__initialize() + +#' @rdname data-type +#' @export +large_utf8 <- function() LargeUtf8__initialize() + +#' @rdname data-type +#' @export +binary <- function() Binary__initialize() + +#' @rdname data-type +#' @export +large_binary <- function() LargeBinary__initialize() + +#' @rdname data-type +#' @export +fixed_size_binary <- function(byte_width) FixedSizeBinary__initialize(byte_width) + +#' @rdname data-type +#' @export +string <- utf8 + +#' @rdname data-type +#' @export +date32 <- function() Date32__initialize() + +#' @rdname data-type +#' @export +date64 <- function() Date64__initialize() + +#' @rdname data-type +#' @export +time32 <- function(unit = c("ms", "s")) { + if (is.character(unit)) { + unit <- match.arg(unit) + } + unit <- make_valid_time_unit(unit, valid_time32_units) + Time32__initialize(unit) +} + +valid_time32_units <- c( + "ms" = TimeUnit$MILLI, + "s" = TimeUnit$SECOND +) + +valid_time64_units <- c( + "ns" = TimeUnit$NANO, + "us" = TimeUnit$MICRO +) + +make_valid_time_unit <- function(unit, valid_units) { + if (is.character(unit)) { + unit <- valid_units[match.arg(unit, choices = names(valid_units))] + } + if (is.numeric(unit)) { + # Allow non-integer input for convenience + unit <- as.integer(unit) + } else { + stop('"unit" should be one of ', oxford_paste(names(valid_units), "or"), call. = FALSE) + } + if (!(unit %in% valid_units)) { + stop('"unit" should be one of ', oxford_paste(valid_units, "or"), call. = FALSE) + } + unit +} + +#' @rdname data-type +#' @export +time64 <- function(unit = c("ns", "us")) { + if (is.character(unit)) { + unit <- match.arg(unit) + } + unit <- make_valid_time_unit(unit, valid_time64_units) + Time64__initialize(unit) +} + +#' @rdname data-type +#' @export +null <- function() Null__initialize() + +#' @rdname data-type +#' @export +timestamp <- function(unit = c("s", "ms", "us", "ns"), timezone = "") { + if (is.character(unit)) { + unit <- match.arg(unit) + } + unit <- make_valid_time_unit(unit, c(valid_time64_units, valid_time32_units)) + assert_that(is.string(timezone)) + Timestamp__initialize(unit, timezone) +} + +#' @rdname data-type +#' @export +decimal <- function(precision, scale) { + if (is.numeric(precision)) { + precision <- as.integer(precision) + } else { + stop('"precision" must be an integer', call. = FALSE) + } + if (is.numeric(scale)) { + scale <- as.integer(scale) + } else { + stop('"scale" must be an integer', call. = FALSE) + } + Decimal128Type__initialize(precision, scale) +} + +StructType <- R6Class("StructType", + inherit = NestedType, + public = list( + GetFieldByName = function(name) StructType__GetFieldByName(self, name), + GetFieldIndex = function(name) StructType__GetFieldIndex(self, name) + ) +) +StructType$create <- function(...) struct__(.fields(list(...))) + +#' @rdname data-type +#' @export +struct <- StructType$create + +ListType <- R6Class("ListType", + inherit = NestedType, + active = list( + value_field = function() ListType__value_field(self), + value_type = function() ListType__value_type(self) + ) +) + +#' @rdname data-type +#' @export +list_of <- function(type) list__(type) + +LargeListType <- R6Class("LargeListType", + inherit = NestedType, + active = list( + value_field = function() LargeListType__value_field(self), + value_type = function() LargeListType__value_type(self) + ) +) + +#' @rdname data-type +#' @export +large_list_of <- function(type) large_list__(type) + +#' @rdname data-type +#' @export +FixedSizeListType <- R6Class("FixedSizeListType", + inherit = NestedType, + active = list( + value_field = function() FixedSizeListType__value_field(self), + value_type = function() FixedSizeListType__value_type(self), + list_size = function() FixedSizeListType__list_size(self) + ) +) + +#' @rdname data-type +#' @export +fixed_size_list_of <- function(type, list_size) fixed_size_list__(type, list_size) + +as_type <- function(type, name = "type") { + # magic so we don't have to mask base::double() + if (identical(type, double())) { + type <- float64() + } + if (!inherits(type, "DataType")) { + stop(name, " must be a DataType, not ", class(type), call. = FALSE) + } + type +} + +canonical_type_str <- function(type_str) { + # canonicalizes data type strings, converting data type function names and + # aliases to match the strings returned by DataType$ToString() + assert_that(is.string(type_str)) + if (grepl("[([<]", type_str)) { + stop("Cannot interpret string representations of data types that have parameters", call. = FALSE) + } + switch(type_str, + int8 = "int8", + int16 = "int16", + int32 = "int32", + int64 = "int64", + uint8 = "uint8", + uint16 = "uint16", + uint32 = "uint32", + uint64 = "uint64", + float16 = "halffloat", + halffloat = "halffloat", + float32 = "float", + float = "float", + float64 = "double", + double = "double", + boolean = "bool", + bool = "bool", + utf8 = "string", + large_utf8 = "large_string", + large_string = "large_string", + binary = "binary", + large_binary = "large_binary", + fixed_size_binary = "fixed_size_binary", + string = "string", + date32 = "date32", + date64 = "date64", + time32 = "time32", + time64 = "time64", + null = "null", + timestamp = "timestamp", + decimal = "decimal128", + struct = "struct", + list_of = "list", + list = "list", + large_list_of = "large_list", + large_list = "large_list", + fixed_size_list_of = "fixed_size_list", + fixed_size_list = "fixed_size_list", + stop("Unrecognized string representation of data type", call. = FALSE) + ) +} + +# vctrs support ----------------------------------------------------------- +str_dup <- function(x, times) { + paste0(rep(x, times = times), collapse = "") +} + +indent <- function(x, n) { + pad <- str_dup(" ", n) + sapply(x, gsub, pattern = "(\n+)", replacement = paste0("\\1", pad)) +} + +#' @importFrom vctrs vec_ptype_full vec_ptype_abbr +#' @export +vec_ptype_full.arrow_fixed_size_binary <- function(x, ...) { + paste0("fixed_size_binary<", attr(x, "byte_width"), ">") +} + +#' @export +vec_ptype_full.arrow_list <- function(x, ...) { + param <- vec_ptype_full(attr(x, "ptype")) + if (grepl("\n", param)) { + param <- paste0(indent(paste0("\n", param), 2), "\n") + } + paste0("list<", param, ">") +} + +#' @export +vec_ptype_full.arrow_large_list <- function(x, ...) { + param <- vec_ptype_full(attr(x, "ptype")) + if (grepl("\n", param)) { + param <- paste0(indent(paste0("\n", param), 2), "\n") + } + paste0("large_list<", param, ">") +} + +#' @export +vec_ptype_full.arrow_fixed_size_list <- function(x, ...) { + param <- vec_ptype_full(attr(x, "ptype")) + if (grepl("\n", param)) { + param <- paste0(indent(paste0("\n", param), 2), "\n") + } + paste0("fixed_size_list<", param, ", ", attr(x, "list_size"), ">") +} + +#' @export +vec_ptype_abbr.arrow_fixed_size_binary <- function(x, ...) { + vec_ptype_full(x, ...) +} +#' @export +vec_ptype_abbr.arrow_list <- function(x, ...) { + vec_ptype_full(x, ...) +} +#' @export +vec_ptype_abbr.arrow_large_list <- function(x, ...) { + vec_ptype_full(x, ...) +} +#' @export +vec_ptype_abbr.arrow_fixed_size_list <- function(x, ...) { + vec_ptype_full(x, ...) +} |