diff options
Diffstat (limited to 'src/arrow/r/tests/testthat/test-metadata.R')
-rw-r--r-- | src/arrow/r/tests/testthat/test-metadata.R | 369 |
1 files changed, 369 insertions, 0 deletions
diff --git a/src/arrow/r/tests/testthat/test-metadata.R b/src/arrow/r/tests/testthat/test-metadata.R new file mode 100644 index 000000000..4c4d8a767 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-metadata.R @@ -0,0 +1,369 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_that("Schema metadata", { + s <- schema(b = double()) + expect_equal(s$metadata, empty_named_list()) + expect_false(s$HasMetadata) + s$metadata <- list(test = TRUE) + expect_identical(s$metadata, list(test = "TRUE")) + expect_true(s$HasMetadata) + s$metadata$foo <- 42 + expect_identical(s$metadata, list(test = "TRUE", foo = "42")) + expect_true(s$HasMetadata) + s$metadata$foo <- NULL + expect_identical(s$metadata, list(test = "TRUE")) + expect_true(s$HasMetadata) + s$metadata <- NULL + expect_equal(s$metadata, empty_named_list()) + expect_false(s$HasMetadata) + expect_error( + s$metadata <- 4, + "Key-value metadata must be a named list or character vector" + ) +}) + +test_that("Table metadata", { + tab <- Table$create(x = 1:2, y = c("a", "b")) + expect_equal(tab$metadata, empty_named_list()) + tab$metadata <- list(test = TRUE) + expect_identical(tab$metadata, list(test = "TRUE")) + tab$metadata$foo <- 42 + expect_identical(tab$metadata, list(test = "TRUE", foo = "42")) + tab$metadata$foo <- NULL + expect_identical(tab$metadata, list(test = "TRUE")) + tab$metadata <- NULL + expect_equal(tab$metadata, empty_named_list()) +}) + +test_that("Table R metadata", { + tab <- Table$create(example_with_metadata) + expect_output(print(tab$metadata), "arrow_r_metadata") + expect_identical(as.data.frame(tab), example_with_metadata) +}) + +test_that("R metadata is not stored for types that map to Arrow types (factor, Date, etc.)", { + tab <- Table$create(example_data[1:6]) + expect_null(tab$metadata$r) + + expect_null(Table$create(example_with_times[1:3])$metadata$r) +}) + +test_that("classes are not stored for arrow_binary/arrow_large_binary/arrow_fixed_size_binary (ARROW-14140)", { + raws <- charToRaw("bonjour") + + binary <- Array$create(list(raws), binary()) + large_binary <- Array$create(list(raws), large_binary()) + fixed_size_binary <- Array$create(list(raws), fixed_size_binary(7L)) + + expect_null(RecordBatch$create(b = binary)$metadata$r) + expect_null(RecordBatch$create(b = large_binary)$metadata$r) + expect_null(RecordBatch$create(b = fixed_size_binary)$metadata$r) + + expect_null(Table$create(b = binary)$metadata$r) + expect_null(Table$create(b = large_binary)$metadata$r) + expect_null(Table$create(b = fixed_size_binary)$metadata$r) +}) + +test_that("Garbage R metadata doesn't break things", { + tab <- Table$create(example_data[1:6]) + tab$metadata$r <- "garbage" + expect_warning( + expect_identical(as.data.frame(tab), example_data[1:6]), + "Invalid metadata$r", + fixed = TRUE + ) + # serialize data like .serialize_arrow_r_metadata does, but don't call that + # directly since it checks to ensure that the data is a list + tab$metadata$r <- rawToChar(serialize("garbage", NULL, ascii = TRUE)) + expect_warning( + expect_identical(as.data.frame(tab), example_data[1:6]), + "Invalid metadata$r", + fixed = TRUE + ) +}) + +test_that("Metadata serialization compression", { + # attributes that (when serialized) are just under 100kb are not compressed, + # and simply serialized + strings <- as.list(rep(make_string_of_size(1), 98)) + small <- .serialize_arrow_r_metadata(strings) + expect_equal( + object.size(small), + object.size(rawToChar(serialize(strings, NULL, ascii = TRUE))) + ) + + # Large strings will be compressed + large_strings <- as.list(rep(make_string_of_size(1), 100)) + large <- .serialize_arrow_r_metadata(large_strings) + expect_lt( + object.size(large), + object.size(rawToChar(serialize(large_strings, NULL, ascii = TRUE))) + ) + # and this compression ends up being smaller than even the "small" strings + expect_lt(object.size(large), object.size(small)) + + # However strings where compression + serialization is not effective are no + # worse than only serialization alone + large_few_strings <- as.list(rep(make_random_string_of_size(50), 2)) + large_few <- .serialize_arrow_r_metadata(large_few_strings) + expect_equal( + object.size(large_few), + object.size(rawToChar(serialize(large_few_strings, NULL, ascii = TRUE))) + ) + + # But we can disable compression + op <- options(arrow.compress_metadata = FALSE) + on.exit(options(op)) + + large_strings <- as.list(rep(make_string_of_size(1), 100)) + large <- .serialize_arrow_r_metadata(large_strings) + expect_equal( + object.size(large), + object.size(rawToChar(serialize(large_strings, NULL, ascii = TRUE))) + ) +}) + +test_that("RecordBatch metadata", { + rb <- RecordBatch$create(x = 1:2, y = c("a", "b")) + expect_equal(rb$metadata, empty_named_list()) + rb$metadata <- list(test = TRUE) + expect_identical(rb$metadata, list(test = "TRUE")) + rb$metadata$foo <- 42 + expect_identical(rb$metadata, list(test = "TRUE", foo = "42")) + rb$metadata$foo <- NULL + expect_identical(rb$metadata, list(test = "TRUE")) + rb$metadata <- NULL + expect_equal(rb$metadata, empty_named_list()) +}) + +test_that("RecordBatch R metadata", { + expect_identical(as.data.frame(record_batch(example_with_metadata)), example_with_metadata) +}) + +test_that("R metadata roundtrip via parquet", { + skip_if_not_available("parquet") + tf <- tempfile() + on.exit(unlink(tf)) + + write_parquet(example_with_metadata, tf) + expect_identical(read_parquet(tf), example_with_metadata) +}) + +test_that("R metadata roundtrip via feather", { + tf <- tempfile() + on.exit(unlink(tf)) + + write_feather(example_with_metadata, tf) + expect_identical(read_feather(tf), example_with_metadata) +}) + +test_that("haven types roundtrip via feather", { + tf <- tempfile() + on.exit(unlink(tf)) + + write_feather(haven_data, tf) + expect_identical(read_feather(tf), haven_data) +}) + +test_that("Date/time type roundtrip", { + rb <- record_batch(example_with_times) + expect_r6_class(rb$schema$posixlt$type, "StructType") + expect_identical(as.data.frame(rb), example_with_times) +}) + +test_that("metadata keeps attribute of top level data frame", { + df <- structure(data.frame(x = 1, y = 2), foo = "bar") + tab <- Table$create(df) + expect_identical(attr(as.data.frame(tab), "foo"), "bar") + expect_identical(as.data.frame(tab), df) +}) + + +test_that("metadata drops readr's problems attribute", { + readr_like <- tibble::tibble( + dbl = 1.1, + not_here = NA_character_ + ) + attributes(readr_like) <- append( + attributes(readr_like), + list(problems = tibble::tibble( + row = 1L, + col = NA_character_, + expected = "2 columns", + actual = "1 columns", + file = "'test'" + )) + ) + + tab <- Table$create(readr_like) + expect_null(attr(as.data.frame(tab), "problems")) +}) + +test_that("Row-level metadata (does not by default) roundtrip", { + # First tracked at ARROW-10386, though it was later determined that row-level + # metadata should be handled separately ARROW-14020, ARROW-12542 + df <- data.frame(x = I(list(structure(1, foo = "bar"), structure(2, baz = "qux")))) + tab <- Table$create(df) + r_metadata <- tab$r_metadata + expect_type(r_metadata, "list") + expect_null(r_metadata$columns$x$columns) + + # But we can re-enable this / read data that has already been written with + # row-level metadata + withr::with_options( + list("arrow.preserve_row_level_metadata" = TRUE), { + tab <- Table$create(df) + expect_identical(attr(as.data.frame(tab)$x[[1]], "foo"), "bar") + expect_identical(attr(as.data.frame(tab)$x[[2]], "baz"), "qux") + }) +}) + + +test_that("Row-level metadata (does not) roundtrip in datasets", { + # First tracked at ARROW-10386, though it was later determined that row-level + # metadata should be handled separately ARROW-14020, ARROW-12542 + skip_if_not_available("dataset") + skip_if_not_available("parquet") + + library(dplyr, warn.conflicts = FALSE) + + df <- tibble::tibble( + metadata = list( + structure(1, my_value_as_attr = 1), + structure(2, my_value_as_attr = 2), + structure(3, my_value_as_attr = 3), + structure(4, my_value_as_attr = 3) + ), + int = 1L:4L, + part = c(1, 3, 2, 1) + ) + + dst_dir <- make_temp_dir() + + withr::with_options( + list("arrow.preserve_row_level_metadata" = TRUE), { + expect_warning( + write_dataset(df, dst_dir, partitioning = "part"), + "Row-level metadata is not compatible with datasets and will be discarded" + ) + + # Reset directory as previous write will have created some files and the default + # behavior is to error on existing + dst_dir <- make_temp_dir() + # but we need to write a dataset with row-level metadata to make sure when + # reading ones that have been written with them we warn appropriately + fake_func_name <- write_dataset + fake_func_name(df, dst_dir, partitioning = "part") + + ds <- open_dataset(dst_dir) + expect_warning( + df_from_ds <- collect(ds), + "Row-level metadata is not compatible with this operation and has been ignored" + ) + expect_equal( + arrange(df_from_ds, int), + arrange(df, int), + ignore_attr = TRUE + ) + + # however there is *no* warning if we don't select the metadata column + expect_warning( + df_from_ds <- ds %>% select(int) %>% collect(), + NA + ) + }) +}) + +test_that("When we encounter SF cols, we warn", { + df <- data.frame(x = I(list(structure(1, foo = "bar"), structure(2, baz = "qux")))) + class(df$x) <- c("sfc_MULTIPOLYGON", "sfc", "list") + + expect_warning( + tab <- Table$create(df), + "One of the columns given appears to be an" + ) + + # but the table was read fine, just sans (row-level) metadata + r_metadata <- .unserialize_arrow_r_metadata(tab$metadata$r) + expect_null(r_metadata$columns$x$columns) + + # But we can re-enable this / read data that has already been written with + # row-level metadata without a warning + withr::with_options( + list("arrow.preserve_row_level_metadata" = TRUE), { + expect_warning(tab <- Table$create(df), NA) + expect_identical(attr(as.data.frame(tab)$x[[1]], "foo"), "bar") + expect_identical(attr(as.data.frame(tab)$x[[2]], "baz"), "qux") + }) +}) + +test_that("dplyr with metadata", { + skip_if_not_available("dataset") + + compare_dplyr_binding( + .input %>% + collect(), + example_with_metadata + ) + compare_dplyr_binding( + .input %>% + select(a) %>% + collect(), + example_with_metadata + ) + compare_dplyr_binding( + .input %>% + mutate(z = b * 4) %>% + select(z, a) %>% + collect(), + example_with_metadata + ) + compare_dplyr_binding( + .input %>% + mutate(z = nchar(a)) %>% + select(z, a) %>% + collect(), + example_with_metadata + ) + # dplyr drops top-level attributes if you do summarize, though attributes + # of grouping columns appear to come through + compare_dplyr_binding( + .input %>% + group_by(a) %>% + summarize(n()) %>% + collect(), + example_with_metadata + ) + # Same name in output but different data, so the column metadata shouldn't + # carry through + compare_dplyr_binding( + .input %>% + mutate(a = nchar(a)) %>% + select(a) %>% + collect(), + example_with_metadata + ) +}) + +test_that("grouped_df metadata is recorded (efficiently)", { + grouped <- group_by(tibble(a = 1:2, b = 3:4), a) + expect_s3_class(grouped, "grouped_df") + grouped_tab <- Table$create(grouped) + expect_r6_class(grouped_tab, "Table") + expect_equal(grouped_tab$r_metadata$attributes$.group_vars, "a") +}) |