summaryrefslogtreecommitdiffstats
path: root/src/arrow/r/R/metadata.R
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/arrow/r/R/metadata.R210
1 files changed, 210 insertions, 0 deletions
diff --git a/src/arrow/r/R/metadata.R b/src/arrow/r/R/metadata.R
new file mode 100644
index 000000000..768abeda7
--- /dev/null
+++ b/src/arrow/r/R/metadata.R
@@ -0,0 +1,210 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @importFrom utils object.size
+.serialize_arrow_r_metadata <- function(x) {
+ assert_is(x, "list")
+
+ # drop problems attributes (most likely from readr)
+ x[["attributes"]][["problems"]] <- NULL
+
+ out <- serialize(x, NULL, ascii = TRUE)
+
+ # if the metadata is over 100 kB, compress
+ if (option_compress_metadata() && object.size(out) > 100000) {
+ out_comp <- serialize(memCompress(out, type = "gzip"), NULL, ascii = TRUE)
+
+ # but ensure that the compression+serialization is effective.
+ if (object.size(out) > object.size(out_comp)) out <- out_comp
+ }
+
+ rawToChar(out)
+}
+
+.unserialize_arrow_r_metadata <- function(x) {
+ tryCatch(
+ expr = {
+ out <- unserialize(charToRaw(x))
+
+ # if this is still raw, try decompressing
+ if (is.raw(out)) {
+ out <- unserialize(memDecompress(out, type = "gzip"))
+ }
+ out
+ },
+ error = function(e) {
+ warning("Invalid metadata$r", call. = FALSE)
+ NULL
+ }
+ )
+}
+
+#' @importFrom rlang trace_back
+apply_arrow_r_metadata <- function(x, r_metadata) {
+ tryCatch(
+ expr = {
+ columns_metadata <- r_metadata$columns
+ if (is.data.frame(x)) {
+ if (length(names(x)) && !is.null(columns_metadata)) {
+ for (name in intersect(names(columns_metadata), names(x))) {
+ x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]])
+ }
+ }
+ } else if (is.list(x) && !inherits(x, "POSIXlt") && !is.null(columns_metadata)) {
+ # If we have a list and "columns_metadata" this applies row-level metadata
+ # inside of a column in a dataframe.
+
+ # However, if we are inside of a dplyr collection (including all datasets),
+ # we cannot apply this row-level metadata, since the order of the rows is
+ # not guaranteed to be the same, so don't even try, but warn what's going on
+ trace <- trace_back()
+ # TODO: remove `trace$calls %||% trace$call` once rlang > 0.4.11 is released
+ in_dplyr_collect <- any(map_lgl(trace$calls %||% trace$call, function(x) {
+ grepl("collect.arrow_dplyr_query", x, fixed = TRUE)[[1]]
+ }))
+ if (in_dplyr_collect) {
+ warning(
+ "Row-level metadata is not compatible with this operation and has ",
+ "been ignored",
+ call. = FALSE
+ )
+ } else {
+ x <- map2(x, columns_metadata, function(.x, .y) {
+ apply_arrow_r_metadata(.x, .y)
+ })
+ }
+ x
+ }
+
+ if (!is.null(r_metadata$attributes)) {
+ attributes(x)[names(r_metadata$attributes)] <- r_metadata$attributes
+ if (inherits(x, "POSIXlt")) {
+ # We store POSIXlt as a StructArray, which is translated back to R
+ # as a data.frame, but while data frames have a row.names = c(NA, nrow(x))
+ # attribute, POSIXlt does not, so since this is now no longer an object
+ # of class data.frame, remove the extraneous attribute
+ attr(x, "row.names") <- NULL
+ }
+ if (!is.null(attr(x, ".group_vars")) && requireNamespace("dplyr", quietly = TRUE)) {
+ x <- dplyr::group_by(x, !!!syms(attr(x, ".group_vars")))
+ attr(x, ".group_vars") <- NULL
+ }
+ }
+ },
+ error = function(e) {
+ warning("Invalid metadata$r", call. = FALSE)
+ }
+ )
+ x
+}
+
+remove_attributes <- function(x) {
+ removed_attributes <- character()
+ if (identical(class(x), c("tbl_df", "tbl", "data.frame"))) {
+ removed_attributes <- c("class", "row.names", "names")
+ } else if (inherits(x, "data.frame")) {
+ removed_attributes <- c("row.names", "names")
+ } else if (inherits(x, "factor")) {
+ removed_attributes <- c("class", "levels")
+ } else if (inherits(x, c("integer64", "Date", "arrow_binary", "arrow_large_binary"))) {
+ removed_attributes <- c("class")
+ } else if (inherits(x, "arrow_fixed_size_binary")) {
+ removed_attributes <- c("class", "byte_width")
+ } else if (inherits(x, "POSIXct")) {
+ removed_attributes <- c("class", "tzone")
+ } else if (inherits(x, "hms") || inherits(x, "difftime")) {
+ removed_attributes <- c("class", "units")
+ }
+ removed_attributes
+}
+
+arrow_attributes <- function(x, only_top_level = FALSE) {
+ if (inherits(x, "grouped_df")) {
+ # Keep only the group var names, not the rest of the cached data that dplyr
+ # uses, which may be large
+ if (requireNamespace("dplyr", quietly = TRUE)) {
+ gv <- dplyr::group_vars(x)
+ x <- dplyr::ungroup(x)
+ # ungroup() first, then set attribute, bc ungroup() would erase it
+ attr(x, ".group_vars") <- gv
+ } else {
+ # Regardless, we shouldn't keep groups around
+ attr(x, "groups") <- NULL
+ }
+ }
+ att <- attributes(x)
+
+ removed_attributes <- remove_attributes(x)
+
+ att <- att[setdiff(names(att), removed_attributes)]
+ if (isTRUE(only_top_level)) {
+ return(att)
+ }
+
+ if (is.data.frame(x)) {
+ columns <- map(x, arrow_attributes)
+ out <- if (length(att) || !all(map_lgl(columns, is.null))) {
+ list(attributes = att, columns = columns)
+ }
+ return(out)
+ }
+
+ columns <- NULL
+ attempt_to_save_row_level <- getOption("arrow.preserve_row_level_metadata", FALSE) &&
+ is.list(x) && !inherits(x, "POSIXlt")
+ if (attempt_to_save_row_level) {
+ # However, if we are inside of a dplyr collection (including all datasets),
+ # we cannot apply this row-level metadata, since the order of the rows is
+ # not guaranteed to be the same, so don't even try, but warn what's going on
+ trace <- trace_back()
+ # TODO: remove `trace$calls %||% trace$call` once rlang > 0.4.11 is released
+ in_dataset_write <- any(map_lgl(trace$calls %||% trace$call, function(x) {
+ grepl("write_dataset", x, fixed = TRUE)[[1]]
+ }))
+ if (in_dataset_write) {
+ warning(
+ "Row-level metadata is not compatible with datasets and will be discarded",
+ call. = FALSE
+ )
+ } else {
+ # for list columns, we also keep attributes of each
+ # element in columns
+ columns <- map(x, arrow_attributes)
+ }
+ if (all(map_lgl(columns, is.null))) {
+ columns <- NULL
+ }
+ } else if (inherits(x, c("sfc", "sf"))) {
+ # Check if there are any columns that look like sf columns, warn that we will
+ # not be saving this data for now (but only if arrow.preserve_row_level_metadata
+ # is set to FALSE)
+ warning(
+ "One of the columns given appears to be an `sfc` SF column. Due to their unique ",
+ "nature, these columns do not convert to Arrow well. We are working on ",
+ "better ways to do this, but in the interim we recommend converting any `sfc` ",
+ "columns to WKB (well-known binary) columns before using them with Arrow ",
+ "(for example, with `sf::st_as_binary(col)`).",
+ call. = FALSE
+ )
+ }
+
+ if (length(att) || !is.null(columns)) {
+ list(attributes = att, columns = columns)
+ } else {
+ NULL
+ }
+}