summaryrefslogtreecommitdiffstats
path: root/src/arrow/r/R/type.R
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/arrow/r/R/type.R541
1 files changed, 541 insertions, 0 deletions
diff --git a/src/arrow/r/R/type.R b/src/arrow/r/R/type.R
new file mode 100644
index 000000000..4ef7cefb5
--- /dev/null
+++ b/src/arrow/r/R/type.R
@@ -0,0 +1,541 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-package.R
+#' @title class arrow::DataType
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname DataType
+#' @name DataType
+DataType <- R6Class("DataType",
+ inherit = ArrowObject,
+ public = list(
+ ToString = function() {
+ DataType__ToString(self)
+ },
+ Equals = function(other, ...) {
+ inherits(other, "DataType") && DataType__Equals(self, other)
+ },
+ fields = function() {
+ DataType__fields(self)
+ },
+ export_to_c = function(ptr) ExportType(self, ptr)
+ ),
+ active = list(
+ id = function() DataType__id(self),
+ name = function() DataType__name(self),
+ num_fields = function() DataType__num_fields(self)
+ )
+)
+
+#' @include arrowExports.R
+DataType$import_from_c <- ImportType
+
+INTEGER_TYPES <- as.character(outer(c("uint", "int"), c(8, 16, 32, 64), paste0))
+FLOAT_TYPES <- c("float16", "float32", "float64", "halffloat", "float", "double")
+
+#' infer the arrow Array type from an R vector
+#'
+#' @param x an R vector
+#'
+#' @return an arrow logical type
+#' @examplesIf arrow_available()
+#' type(1:10)
+#' type(1L:10L)
+#' type(c(1, 1.5, 2))
+#' type(c("A", "B", "C"))
+#' type(mtcars)
+#' type(Sys.Date())
+#' @export
+type <- function(x) UseMethod("type")
+
+#' @export
+type.default <- function(x) Array__infer_type(x)
+
+#' @export
+type.ArrowDatum <- function(x) x$type
+
+#----- metadata
+
+#' @title class arrow::FixedWidthType
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname FixedWidthType
+#' @name FixedWidthType
+FixedWidthType <- R6Class("FixedWidthType",
+ inherit = DataType,
+ active = list(
+ bit_width = function() FixedWidthType__bit_width(self)
+ )
+)
+
+Int8 <- R6Class("Int8", inherit = FixedWidthType)
+Int16 <- R6Class("Int16", inherit = FixedWidthType)
+Int32 <- R6Class("Int32", inherit = FixedWidthType)
+Int64 <- R6Class("Int64", inherit = FixedWidthType)
+UInt8 <- R6Class("UInt8", inherit = FixedWidthType)
+UInt16 <- R6Class("UInt16", inherit = FixedWidthType)
+UInt32 <- R6Class("UInt32", inherit = FixedWidthType)
+UInt64 <- R6Class("UInt64", inherit = FixedWidthType)
+Float16 <- R6Class("Float16", inherit = FixedWidthType)
+Float32 <- R6Class("Float32", inherit = FixedWidthType)
+Float64 <- R6Class("Float64", inherit = FixedWidthType)
+Boolean <- R6Class("Boolean", inherit = FixedWidthType)
+Utf8 <- R6Class("Utf8", inherit = DataType)
+LargeUtf8 <- R6Class("LargeUtf8", inherit = DataType)
+Binary <- R6Class("Binary", inherit = DataType)
+FixedSizeBinary <- R6Class("FixedSizeBinary", inherit = FixedWidthType)
+LargeBinary <- R6Class("LargeBinary", inherit = DataType)
+
+DateType <- R6Class("DateType",
+ inherit = FixedWidthType,
+ public = list(
+ unit = function() DateType__unit(self)
+ )
+)
+Date32 <- R6Class("Date32", inherit = DateType)
+Date64 <- R6Class("Date64", inherit = DateType)
+
+TimeType <- R6Class("TimeType",
+ inherit = FixedWidthType,
+ public = list(
+ unit = function() TimeType__unit(self)
+ )
+)
+Time32 <- R6Class("Time32", inherit = TimeType)
+Time64 <- R6Class("Time64", inherit = TimeType)
+
+Null <- R6Class("Null", inherit = DataType)
+
+Timestamp <- R6Class("Timestamp",
+ inherit = FixedWidthType,
+ public = list(
+ timezone = function() TimestampType__timezone(self),
+ unit = function() TimestampType__unit(self)
+ )
+)
+
+DecimalType <- R6Class("DecimalType",
+ inherit = FixedWidthType,
+ public = list(
+ precision = function() DecimalType__precision(self),
+ scale = function() DecimalType__scale(self)
+ )
+)
+Decimal128Type <- R6Class("Decimal128Type", inherit = DecimalType)
+
+NestedType <- R6Class("NestedType", inherit = DataType)
+
+#' Apache Arrow data types
+#'
+#' These functions create type objects corresponding to Arrow types. Use them
+#' when defining a [schema()] or as inputs to other types, like `struct`. Most
+#' of these functions don't take arguments, but a few do.
+#'
+#' A few functions have aliases:
+#'
+#' * `utf8()` and `string()`
+#' * `float16()` and `halffloat()`
+#' * `float32()` and `float()`
+#' * `bool()` and `boolean()`
+#' * When called inside an `arrow` function, such as `schema()` or `cast()`,
+#' `double()` also is supported as a way of creating a `float64()`
+#'
+#' `date32()` creates a datetime type with a "day" unit, like the R `Date`
+#' class. `date64()` has a "ms" unit.
+#'
+#' `uint32` (32 bit unsigned integer), `uint64` (64 bit unsigned integer), and
+#' `int64` (64-bit signed integer) types may contain values that exceed the
+#' range of R's `integer` type (32-bit signed integer). When these arrow objects
+#' are translated to R objects, `uint32` and `uint64` are converted to `double`
+#' ("numeric") and `int64` is converted to `bit64::integer64`. For `int64`
+#' types, this conversion can be disabled (so that `int64` always yields a
+#' `bit64::integer64` object) by setting `options(arrow.int64_downcast =
+#' FALSE)`.
+#'
+#' @param unit For time/timestamp types, the time unit. `time32()` can take
+#' either "s" or "ms", while `time64()` can be "us" or "ns". `timestamp()` can
+#' take any of those four values.
+#' @param timezone For `timestamp()`, an optional time zone string.
+#' @param byte_width byte width for `FixedSizeBinary` type.
+#' @param list_size list size for `FixedSizeList` type.
+#' @param precision For `decimal()`, precision
+#' @param scale For `decimal()`, scale
+#' @param type For `list_of()`, a data type to make a list-of-type
+#' @param ... For `struct()`, a named list of types to define the struct columns
+#'
+#' @name data-type
+#' @return An Arrow type object inheriting from DataType.
+#' @export
+#' @seealso [dictionary()] for creating a dictionary (factor-like) type.
+#' @examplesIf arrow_available()
+#' bool()
+#' struct(a = int32(), b = double())
+#' timestamp("ms", timezone = "CEST")
+#' time64("ns")
+int8 <- function() Int8__initialize()
+
+#' @rdname data-type
+#' @export
+int16 <- function() Int16__initialize()
+
+#' @rdname data-type
+#' @export
+int32 <- function() Int32__initialize()
+
+#' @rdname data-type
+#' @export
+int64 <- function() Int64__initialize()
+
+#' @rdname data-type
+#' @export
+uint8 <- function() UInt8__initialize()
+
+#' @rdname data-type
+#' @export
+uint16 <- function() UInt16__initialize()
+
+#' @rdname data-type
+#' @export
+uint32 <- function() UInt32__initialize()
+
+#' @rdname data-type
+#' @export
+uint64 <- function() UInt64__initialize()
+
+#' @rdname data-type
+#' @export
+float16 <- function() Float16__initialize()
+
+#' @rdname data-type
+#' @export
+halffloat <- float16
+
+#' @rdname data-type
+#' @export
+float32 <- function() Float32__initialize()
+
+#' @rdname data-type
+#' @export
+float <- float32
+
+#' @rdname data-type
+#' @export
+float64 <- function() Float64__initialize()
+
+#' @rdname data-type
+#' @export
+boolean <- function() Boolean__initialize()
+
+#' @rdname data-type
+#' @export
+bool <- boolean
+
+#' @rdname data-type
+#' @export
+utf8 <- function() Utf8__initialize()
+
+#' @rdname data-type
+#' @export
+large_utf8 <- function() LargeUtf8__initialize()
+
+#' @rdname data-type
+#' @export
+binary <- function() Binary__initialize()
+
+#' @rdname data-type
+#' @export
+large_binary <- function() LargeBinary__initialize()
+
+#' @rdname data-type
+#' @export
+fixed_size_binary <- function(byte_width) FixedSizeBinary__initialize(byte_width)
+
+#' @rdname data-type
+#' @export
+string <- utf8
+
+#' @rdname data-type
+#' @export
+date32 <- function() Date32__initialize()
+
+#' @rdname data-type
+#' @export
+date64 <- function() Date64__initialize()
+
+#' @rdname data-type
+#' @export
+time32 <- function(unit = c("ms", "s")) {
+ if (is.character(unit)) {
+ unit <- match.arg(unit)
+ }
+ unit <- make_valid_time_unit(unit, valid_time32_units)
+ Time32__initialize(unit)
+}
+
+valid_time32_units <- c(
+ "ms" = TimeUnit$MILLI,
+ "s" = TimeUnit$SECOND
+)
+
+valid_time64_units <- c(
+ "ns" = TimeUnit$NANO,
+ "us" = TimeUnit$MICRO
+)
+
+make_valid_time_unit <- function(unit, valid_units) {
+ if (is.character(unit)) {
+ unit <- valid_units[match.arg(unit, choices = names(valid_units))]
+ }
+ if (is.numeric(unit)) {
+ # Allow non-integer input for convenience
+ unit <- as.integer(unit)
+ } else {
+ stop('"unit" should be one of ', oxford_paste(names(valid_units), "or"), call. = FALSE)
+ }
+ if (!(unit %in% valid_units)) {
+ stop('"unit" should be one of ', oxford_paste(valid_units, "or"), call. = FALSE)
+ }
+ unit
+}
+
+#' @rdname data-type
+#' @export
+time64 <- function(unit = c("ns", "us")) {
+ if (is.character(unit)) {
+ unit <- match.arg(unit)
+ }
+ unit <- make_valid_time_unit(unit, valid_time64_units)
+ Time64__initialize(unit)
+}
+
+#' @rdname data-type
+#' @export
+null <- function() Null__initialize()
+
+#' @rdname data-type
+#' @export
+timestamp <- function(unit = c("s", "ms", "us", "ns"), timezone = "") {
+ if (is.character(unit)) {
+ unit <- match.arg(unit)
+ }
+ unit <- make_valid_time_unit(unit, c(valid_time64_units, valid_time32_units))
+ assert_that(is.string(timezone))
+ Timestamp__initialize(unit, timezone)
+}
+
+#' @rdname data-type
+#' @export
+decimal <- function(precision, scale) {
+ if (is.numeric(precision)) {
+ precision <- as.integer(precision)
+ } else {
+ stop('"precision" must be an integer', call. = FALSE)
+ }
+ if (is.numeric(scale)) {
+ scale <- as.integer(scale)
+ } else {
+ stop('"scale" must be an integer', call. = FALSE)
+ }
+ Decimal128Type__initialize(precision, scale)
+}
+
+StructType <- R6Class("StructType",
+ inherit = NestedType,
+ public = list(
+ GetFieldByName = function(name) StructType__GetFieldByName(self, name),
+ GetFieldIndex = function(name) StructType__GetFieldIndex(self, name)
+ )
+)
+StructType$create <- function(...) struct__(.fields(list(...)))
+
+#' @rdname data-type
+#' @export
+struct <- StructType$create
+
+ListType <- R6Class("ListType",
+ inherit = NestedType,
+ active = list(
+ value_field = function() ListType__value_field(self),
+ value_type = function() ListType__value_type(self)
+ )
+)
+
+#' @rdname data-type
+#' @export
+list_of <- function(type) list__(type)
+
+LargeListType <- R6Class("LargeListType",
+ inherit = NestedType,
+ active = list(
+ value_field = function() LargeListType__value_field(self),
+ value_type = function() LargeListType__value_type(self)
+ )
+)
+
+#' @rdname data-type
+#' @export
+large_list_of <- function(type) large_list__(type)
+
+#' @rdname data-type
+#' @export
+FixedSizeListType <- R6Class("FixedSizeListType",
+ inherit = NestedType,
+ active = list(
+ value_field = function() FixedSizeListType__value_field(self),
+ value_type = function() FixedSizeListType__value_type(self),
+ list_size = function() FixedSizeListType__list_size(self)
+ )
+)
+
+#' @rdname data-type
+#' @export
+fixed_size_list_of <- function(type, list_size) fixed_size_list__(type, list_size)
+
+as_type <- function(type, name = "type") {
+ # magic so we don't have to mask base::double()
+ if (identical(type, double())) {
+ type <- float64()
+ }
+ if (!inherits(type, "DataType")) {
+ stop(name, " must be a DataType, not ", class(type), call. = FALSE)
+ }
+ type
+}
+
+canonical_type_str <- function(type_str) {
+ # canonicalizes data type strings, converting data type function names and
+ # aliases to match the strings returned by DataType$ToString()
+ assert_that(is.string(type_str))
+ if (grepl("[([<]", type_str)) {
+ stop("Cannot interpret string representations of data types that have parameters", call. = FALSE)
+ }
+ switch(type_str,
+ int8 = "int8",
+ int16 = "int16",
+ int32 = "int32",
+ int64 = "int64",
+ uint8 = "uint8",
+ uint16 = "uint16",
+ uint32 = "uint32",
+ uint64 = "uint64",
+ float16 = "halffloat",
+ halffloat = "halffloat",
+ float32 = "float",
+ float = "float",
+ float64 = "double",
+ double = "double",
+ boolean = "bool",
+ bool = "bool",
+ utf8 = "string",
+ large_utf8 = "large_string",
+ large_string = "large_string",
+ binary = "binary",
+ large_binary = "large_binary",
+ fixed_size_binary = "fixed_size_binary",
+ string = "string",
+ date32 = "date32",
+ date64 = "date64",
+ time32 = "time32",
+ time64 = "time64",
+ null = "null",
+ timestamp = "timestamp",
+ decimal = "decimal128",
+ struct = "struct",
+ list_of = "list",
+ list = "list",
+ large_list_of = "large_list",
+ large_list = "large_list",
+ fixed_size_list_of = "fixed_size_list",
+ fixed_size_list = "fixed_size_list",
+ stop("Unrecognized string representation of data type", call. = FALSE)
+ )
+}
+
+# vctrs support -----------------------------------------------------------
+str_dup <- function(x, times) {
+ paste0(rep(x, times = times), collapse = "")
+}
+
+indent <- function(x, n) {
+ pad <- str_dup(" ", n)
+ sapply(x, gsub, pattern = "(\n+)", replacement = paste0("\\1", pad))
+}
+
+#' @importFrom vctrs vec_ptype_full vec_ptype_abbr
+#' @export
+vec_ptype_full.arrow_fixed_size_binary <- function(x, ...) {
+ paste0("fixed_size_binary<", attr(x, "byte_width"), ">")
+}
+
+#' @export
+vec_ptype_full.arrow_list <- function(x, ...) {
+ param <- vec_ptype_full(attr(x, "ptype"))
+ if (grepl("\n", param)) {
+ param <- paste0(indent(paste0("\n", param), 2), "\n")
+ }
+ paste0("list<", param, ">")
+}
+
+#' @export
+vec_ptype_full.arrow_large_list <- function(x, ...) {
+ param <- vec_ptype_full(attr(x, "ptype"))
+ if (grepl("\n", param)) {
+ param <- paste0(indent(paste0("\n", param), 2), "\n")
+ }
+ paste0("large_list<", param, ">")
+}
+
+#' @export
+vec_ptype_full.arrow_fixed_size_list <- function(x, ...) {
+ param <- vec_ptype_full(attr(x, "ptype"))
+ if (grepl("\n", param)) {
+ param <- paste0(indent(paste0("\n", param), 2), "\n")
+ }
+ paste0("fixed_size_list<", param, ", ", attr(x, "list_size"), ">")
+}
+
+#' @export
+vec_ptype_abbr.arrow_fixed_size_binary <- function(x, ...) {
+ vec_ptype_full(x, ...)
+}
+#' @export
+vec_ptype_abbr.arrow_list <- function(x, ...) {
+ vec_ptype_full(x, ...)
+}
+#' @export
+vec_ptype_abbr.arrow_large_list <- function(x, ...) {
+ vec_ptype_full(x, ...)
+}
+#' @export
+vec_ptype_abbr.arrow_fixed_size_list <- function(x, ...) {
+ vec_ptype_full(x, ...)
+}