1 files changed, 369 insertions, 0 deletions
diff --git a/src/arrow/r/tests/testthat/test-metadata.R b/src/arrow/r/tests/testthat/test-metadata.R
new file mode 100644
index 000000000..4c4d8a767
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-metadata.R
@@ -0,0 +1,369 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("Schema metadata", {
+  s <- schema(b = double())
+  expect_equal(s$metadata, empty_named_list())
+  expect_false(s$HasMetadata)
+  s$metadata <- list(test = TRUE)
+  expect_identical(s$metadata, list(test = "TRUE"))
+  expect_true(s$HasMetadata)
+  s$metadata$foo <- 42
+  expect_identical(s$metadata, list(test = "TRUE", foo = "42"))
+  expect_true(s$HasMetadata)
+  s$metadata$foo <- NULL
+  expect_identical(s$metadata, list(test = "TRUE"))
+  expect_true(s$HasMetadata)
+  s$metadata <- NULL
+  expect_equal(s$metadata, empty_named_list())
+  expect_false(s$HasMetadata)
+  expect_error(
+    s$metadata <- 4,
+    "Key-value metadata must be a named list or character vector"
+  )
+})
+
+test_that("Table metadata", {
+  tab <- Table$create(x = 1:2, y = c("a", "b"))
+  expect_equal(tab$metadata, empty_named_list())
+  tab$metadata <- list(test = TRUE)
+  expect_identical(tab$metadata, list(test = "TRUE"))
+  tab$metadata$foo <- 42
+  expect_identical(tab$metadata, list(test = "TRUE", foo = "42"))
+  tab$metadata$foo <- NULL
+  expect_identical(tab$metadata, list(test = "TRUE"))
+  tab$metadata <- NULL
+  expect_equal(tab$metadata, empty_named_list())
+})
+
+test_that("Table R metadata", {
+  tab <- Table$create(example_with_metadata)
+  expect_output(print(tab$metadata), "arrow_r_metadata")
+  expect_identical(as.data.frame(tab), example_with_metadata)
+})
+
+test_that("R metadata is not stored for types that map to Arrow types (factor, Date, etc.)", {
+  tab <- Table$create(example_data[1:6])
+  expect_null(tab$metadata$r)
+
+  expect_null(Table$create(example_with_times[1:3])$metadata$r)
+})
+
+test_that("classes are not stored for arrow_binary/arrow_large_binary/arrow_fixed_size_binary (ARROW-14140)", {
+  raws <- charToRaw("bonjour")
+
+  binary <- Array$create(list(raws), binary())
+  large_binary <- Array$create(list(raws), large_binary())
+  fixed_size_binary <- Array$create(list(raws), fixed_size_binary(7L))
+
+  expect_null(RecordBatch$create(b = binary)$metadata$r)
+  expect_null(RecordBatch$create(b = large_binary)$metadata$r)
+  expect_null(RecordBatch$create(b = fixed_size_binary)$metadata$r)
+
+  expect_null(Table$create(b = binary)$metadata$r)
+  expect_null(Table$create(b = large_binary)$metadata$r)
+  expect_null(Table$create(b = fixed_size_binary)$metadata$r)
+})
+
+test_that("Garbage R metadata doesn't break things", {
+  tab <- Table$create(example_data[1:6])
+  tab$metadata$r <- "garbage"
+  expect_warning(
+    expect_identical(as.data.frame(tab), example_data[1:6]),
+    "Invalid metadata$r",
+    fixed = TRUE
+  )
+  # serialize data like .serialize_arrow_r_metadata does, but don't call that
+  # directly since it checks to ensure that the data is a list
+  tab$metadata$r <- rawToChar(serialize("garbage", NULL, ascii = TRUE))
+  expect_warning(
+    expect_identical(as.data.frame(tab), example_data[1:6]),
+    "Invalid metadata$r",
+    fixed = TRUE
+  )
+})
+
+test_that("Metadata serialization compression", {
+  # attributes that (when serialized) are just under 100kb are not compressed,
+  # and simply serialized
+  strings <- as.list(rep(make_string_of_size(1), 98))
+  small <- .serialize_arrow_r_metadata(strings)
+  expect_equal(
+    object.size(small),
+    object.size(rawToChar(serialize(strings, NULL, ascii = TRUE)))
+  )
+
+  # Large strings will be compressed
+  large_strings <- as.list(rep(make_string_of_size(1), 100))
+  large <- .serialize_arrow_r_metadata(large_strings)
+  expect_lt(
+    object.size(large),
+    object.size(rawToChar(serialize(large_strings, NULL, ascii = TRUE)))
+  )
+  # and this compression ends up being smaller than even the "small" strings
+  expect_lt(object.size(large), object.size(small))
+
+  # However strings where compression + serialization is not effective are no
+  # worse than only serialization alone
+  large_few_strings <- as.list(rep(make_random_string_of_size(50), 2))
+  large_few <- .serialize_arrow_r_metadata(large_few_strings)
+  expect_equal(
+    object.size(large_few),
+    object.size(rawToChar(serialize(large_few_strings, NULL, ascii = TRUE)))
+  )
+
+  # But we can disable compression
+  op <- options(arrow.compress_metadata = FALSE)
+  on.exit(options(op))
+
+  large_strings <- as.list(rep(make_string_of_size(1), 100))
+  large <- .serialize_arrow_r_metadata(large_strings)
+  expect_equal(
+    object.size(large),
+    object.size(rawToChar(serialize(large_strings, NULL, ascii = TRUE)))
+  )
+})
+
+test_that("RecordBatch metadata", {
+  rb <- RecordBatch$create(x = 1:2, y = c("a", "b"))
+  expect_equal(rb$metadata, empty_named_list())
+  rb$metadata <- list(test = TRUE)
+  expect_identical(rb$metadata, list(test = "TRUE"))
+  rb$metadata$foo <- 42
+  expect_identical(rb$metadata, list(test = "TRUE", foo = "42"))
+  rb$metadata$foo <- NULL
+  expect_identical(rb$metadata, list(test = "TRUE"))
+  rb$metadata <- NULL
+  expect_equal(rb$metadata, empty_named_list())
+})
+
+test_that("RecordBatch R metadata", {
+  expect_identical(as.data.frame(record_batch(example_with_metadata)), example_with_metadata)
+})
+
+test_that("R metadata roundtrip via parquet", {
+  skip_if_not_available("parquet")
+  tf <- tempfile()
+  on.exit(unlink(tf))
+
+  write_parquet(example_with_metadata, tf)
+  expect_identical(read_parquet(tf), example_with_metadata)
+})
+
+test_that("R metadata roundtrip via feather", {
+  tf <- tempfile()
+  on.exit(unlink(tf))
+
+  write_feather(example_with_metadata, tf)
+  expect_identical(read_feather(tf), example_with_metadata)
+})
+
+test_that("haven types roundtrip via feather", {
+  tf <- tempfile()
+  on.exit(unlink(tf))
+
+  write_feather(haven_data, tf)
+  expect_identical(read_feather(tf), haven_data)
+})
+
+test_that("Date/time type roundtrip", {
+  rb <- record_batch(example_with_times)
+  expect_r6_class(rb$schema$posixlt$type, "StructType")
+  expect_identical(as.data.frame(rb), example_with_times)
+})
+
+test_that("metadata keeps attribute of top level data frame", {
+  df <- structure(data.frame(x = 1, y = 2), foo = "bar")
+  tab <- Table$create(df)
+  expect_identical(attr(as.data.frame(tab), "foo"), "bar")
+  expect_identical(as.data.frame(tab), df)
+})
+
+
+test_that("metadata drops readr's problems attribute", {
+  readr_like <- tibble::tibble(
+    dbl = 1.1,
+    not_here = NA_character_
+  )
+  attributes(readr_like) <- append(
+    attributes(readr_like),
+    list(problems = tibble::tibble(
+      row = 1L,
+      col = NA_character_,
+      expected = "2 columns",
+      actual = "1 columns",
+      file = "'test'"
+    ))
+  )
+
+  tab <- Table$create(readr_like)
+  expect_null(attr(as.data.frame(tab), "problems"))
+})
+
+test_that("Row-level metadata (does not by default) roundtrip", {
+  # First tracked at ARROW-10386, though it was later determined that row-level
+  # metadata should be handled separately ARROW-14020, ARROW-12542
+  df <- data.frame(x = I(list(structure(1, foo = "bar"), structure(2, baz = "qux"))))
+  tab <- Table$create(df)
+  r_metadata <- tab$r_metadata
+  expect_type(r_metadata, "list")
+  expect_null(r_metadata$columns$x$columns)
+
+  # But we can re-enable this / read data that has already been written with
+  # row-level metadata
+  withr::with_options(
+    list("arrow.preserve_row_level_metadata" = TRUE), {
+      tab <- Table$create(df)
+      expect_identical(attr(as.data.frame(tab)$x[[1]], "foo"), "bar")
+      expect_identical(attr(as.data.frame(tab)$x[[2]], "baz"), "qux")
+    })
+})
+
+
+test_that("Row-level metadata (does not) roundtrip in datasets", {
+  # First tracked at ARROW-10386, though it was later determined that row-level
+  # metadata should be handled separately ARROW-14020, ARROW-12542
+  skip_if_not_available("dataset")
+  skip_if_not_available("parquet")
+
+  library(dplyr, warn.conflicts = FALSE)
+
+  df <- tibble::tibble(
+    metadata = list(
+      structure(1, my_value_as_attr = 1),
+      structure(2, my_value_as_attr = 2),
+      structure(3, my_value_as_attr = 3),
+      structure(4, my_value_as_attr = 3)
+    ),
+    int = 1L:4L,
+    part = c(1, 3, 2, 1)
+  )
+
+  dst_dir <- make_temp_dir()
+
+  withr::with_options(
+    list("arrow.preserve_row_level_metadata" = TRUE), {
+      expect_warning(
+        write_dataset(df, dst_dir, partitioning = "part"),
+        "Row-level metadata is not compatible with datasets and will be discarded"
+      )
+
+      # Reset directory as previous write will have created some files and the default
+      # behavior is to error on existing
+      dst_dir <- make_temp_dir()
+      # but we need to write a dataset with row-level metadata to make sure when
+      # reading ones that have been written with them we warn appropriately
+      fake_func_name <- write_dataset
+      fake_func_name(df, dst_dir, partitioning = "part")
+
+      ds <- open_dataset(dst_dir)
+      expect_warning(
+        df_from_ds <- collect(ds),
+        "Row-level metadata is not compatible with this operation and has been ignored"
+      )
+      expect_equal(
+        arrange(df_from_ds, int),
+        arrange(df, int),
+        ignore_attr = TRUE
+      )
+
+      # however there is *no* warning if we don't select the metadata column
+      expect_warning(
+        df_from_ds <- ds %>% select(int) %>% collect(),
+        NA
+      )
+    })
+})
+
+test_that("When we encounter SF cols, we warn", {
+  df <- data.frame(x = I(list(structure(1, foo = "bar"), structure(2, baz = "qux"))))
+  class(df$x) <- c("sfc_MULTIPOLYGON", "sfc", "list")
+
+  expect_warning(
+    tab <- Table$create(df),
+    "One of the columns given appears to be an"
+  )
+
+  # but the table was read fine, just sans (row-level) metadata
+  r_metadata <- .unserialize_arrow_r_metadata(tab$metadata$r)
+  expect_null(r_metadata$columns$x$columns)
+
+  # But we can re-enable this / read data that has already been written with
+  # row-level metadata without a warning
+  withr::with_options(
+    list("arrow.preserve_row_level_metadata" = TRUE), {
+      expect_warning(tab <- Table$create(df), NA)
+      expect_identical(attr(as.data.frame(tab)$x[[1]], "foo"), "bar")
+      expect_identical(attr(as.data.frame(tab)$x[[2]], "baz"), "qux")
+    })
+})
+
+test_that("dplyr with metadata", {
+  skip_if_not_available("dataset")
+
+  compare_dplyr_binding(
+    .input %>%
+      collect(),
+    example_with_metadata
+  )
+  compare_dplyr_binding(
+    .input %>%
+      select(a) %>%
+      collect(),
+    example_with_metadata
+  )
+  compare_dplyr_binding(
+    .input %>%
+      mutate(z = b * 4) %>%
+      select(z, a) %>%
+      collect(),
+    example_with_metadata
+  )
+  compare_dplyr_binding(
+    .input %>%
+      mutate(z = nchar(a)) %>%
+      select(z, a) %>%
+      collect(),
+    example_with_metadata
+  )
+  # dplyr drops top-level attributes if you do summarize, though attributes
+  # of grouping columns appear to come through
+  compare_dplyr_binding(
+    .input %>%
+      group_by(a) %>%
+      summarize(n()) %>%
+      collect(),
+    example_with_metadata
+  )
+  # Same name in output but different data, so the column metadata shouldn't
+  # carry through
+  compare_dplyr_binding(
+    .input %>%
+      mutate(a = nchar(a)) %>%
+      select(a) %>%
+      collect(),
+    example_with_metadata
+  )
+})
+
+test_that("grouped_df metadata is recorded (efficiently)", {
+  grouped <- group_by(tibble(a = 1:2, b = 3:4), a)
+  expect_s3_class(grouped, "grouped_df")
+  grouped_tab <- Table$create(grouped)
+  expect_r6_class(grouped_tab, "Table")
+  expect_equal(grouped_tab$r_metadata$attributes$.group_vars, "a")
+})