From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/arrow/r/tests/testthat/test-parquet.R | 274 ++++++++++++++++++++++++++++++ 1 file changed, 274 insertions(+) create mode 100644 src/arrow/r/tests/testthat/test-parquet.R (limited to 'src/arrow/r/tests/testthat/test-parquet.R') diff --git a/src/arrow/r/tests/testthat/test-parquet.R b/src/arrow/r/tests/testthat/test-parquet.R new file mode 100644 index 000000000..55d86b532 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-parquet.R @@ -0,0 +1,274 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("parquet") + +pq_file <- system.file("v0.7.1.parquet", package = "arrow") + +test_that("reading a known Parquet file to tibble", { + skip_if_not_available("snappy") + df <- read_parquet(pq_file) + expect_true(tibble::is_tibble(df)) + expect_identical(dim(df), c(10L, 11L)) + # TODO: assert more about the contents +}) + +test_that("simple int column roundtrip", { + df <- tibble::tibble(x = 1:5) + pq_tmp_file <- tempfile() # You can specify the .parquet here but that's probably not necessary + + write_parquet(df, pq_tmp_file) + df_read <- read_parquet(pq_tmp_file) + expect_equal(df, df_read) + # Make sure file connection is cleaned up + expect_error(file.remove(pq_tmp_file), NA) + expect_false(file.exists(pq_tmp_file)) +}) + +test_that("read_parquet() supports col_select", { + skip_if_not_available("snappy") + df <- read_parquet(pq_file, col_select = c(x, y, z)) + expect_equal(names(df), c("x", "y", "z")) + + df <- read_parquet(pq_file, col_select = starts_with("c")) + expect_equal(names(df), c("carat", "cut", "color", "clarity")) +}) + +test_that("read_parquet() with raw data", { + skip_if_not_available("snappy") + test_raw <- readBin(pq_file, what = "raw", n = 5000) + df <- read_parquet(test_raw) + expect_identical(dim(df), c(10L, 11L)) +}) + +test_that("write_parquet() handles various compression= specs", { + skip_if_not_available("snappy") + tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5) + + expect_parquet_roundtrip(tab, compression = "snappy") + expect_parquet_roundtrip(tab, compression = rep("snappy", 3L)) + expect_parquet_roundtrip(tab, compression = c(x1 = "snappy", x2 = "snappy")) +}) + +test_that("write_parquet() handles various compression_level= specs", { + skip_if_not_available("gzip") + tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5) + + expect_parquet_roundtrip(tab, compression = "gzip", compression_level = 4) + expect_parquet_roundtrip(tab, compression = "gzip", compression_level = rep(4L, 3L)) + expect_parquet_roundtrip(tab, compression = "gzip", compression_level = c(x1 = 5L, x2 = 3L)) +}) + +test_that("write_parquet() handles various use_dictionary= specs", { + tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5) + + expect_parquet_roundtrip(tab, use_dictionary = TRUE) + expect_parquet_roundtrip(tab, use_dictionary = c(TRUE, FALSE, TRUE)) + expect_parquet_roundtrip(tab, use_dictionary = c(x1 = TRUE, x2 = TRUE)) + expect_error( + write_parquet(tab, tempfile(), use_dictionary = c(TRUE, FALSE)), + "unsupported use_dictionary= specification" + ) + expect_error( + write_parquet(tab, tempfile(), use_dictionary = 12), + "is.logical(use_dictionary) is not TRUE", + fixed = TRUE + ) +}) + +test_that("write_parquet() handles various write_statistics= specs", { + tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5) + + expect_parquet_roundtrip(tab, write_statistics = TRUE) + expect_parquet_roundtrip(tab, write_statistics = c(TRUE, FALSE, TRUE)) + expect_parquet_roundtrip(tab, write_statistics = c(x1 = TRUE, x2 = TRUE)) +}) + +test_that("write_parquet() accepts RecordBatch too", { + batch <- RecordBatch$create(x1 = 1:5, x2 = 1:5, y = 1:5) + tab <- parquet_roundtrip(batch) + expect_equal(tab, Table$create(batch)) +}) + +test_that("write_parquet() handles grouped_df", { + library(dplyr, warn.conflicts = FALSE) + df <- tibble::tibble(a = 1:4, b = 5) %>% group_by(b) + # Since `df` is a "grouped_df", this test asserts that we get a grouped_df back + expect_parquet_roundtrip(df, as_data_frame = TRUE) +}) + +test_that("write_parquet() with invalid input type", { + bad_input <- Array$create(1:5) + expect_error( + write_parquet(bad_input, tempfile()), + regexp = "x must be an object of class 'data.frame', 'RecordBatch', or 'Table', not 'Array'." + ) +}) + +test_that("write_parquet() can truncate timestamps", { + tab <- Table$create(x1 = as.POSIXct("2020/06/03 18:00:00", tz = "UTC")) + expect_type_equal(tab$x1, timestamp("us", "UTC")) + + tf <- tempfile() + on.exit(unlink(tf)) + + write_parquet(tab, tf, coerce_timestamps = "ms", allow_truncated_timestamps = TRUE) + new <- read_parquet(tf, as_data_frame = FALSE) + expect_type_equal(new$x1, timestamp("ms", "UTC")) + expect_equal(as.data.frame(tab), as.data.frame(new)) +}) + +test_that("make_valid_version()", { + expect_equal(make_valid_version("1.0"), ParquetVersionType$PARQUET_1_0) + expect_equal(make_valid_version("2.0"), ParquetVersionType$PARQUET_2_0) + + expect_equal(make_valid_version(1), ParquetVersionType$PARQUET_1_0) + expect_equal(make_valid_version(2), ParquetVersionType$PARQUET_2_0) + + expect_equal(make_valid_version(1.0), ParquetVersionType$PARQUET_1_0) + expect_equal(make_valid_version(2.0), ParquetVersionType$PARQUET_2_0) +}) + +test_that("write_parquet() defaults to snappy compression", { + skip_if_not_available("snappy") + tmp1 <- tempfile() + tmp2 <- tempfile() + write_parquet(mtcars, tmp1) + write_parquet(mtcars, tmp2, compression = "snappy") + expect_equal(file.size(tmp1), file.size(tmp2)) +}) + +test_that("Factors are preserved when writing/reading from Parquet", { + fct <- factor(c("a", "b"), levels = c("c", "a", "b")) + ord <- factor(c("a", "b"), levels = c("c", "a", "b"), ordered = TRUE) + chr <- c("a", "b") + df <- tibble::tibble(fct = fct, ord = ord, chr = chr) + + pq_tmp_file <- tempfile() + on.exit(unlink(pq_tmp_file)) + + write_parquet(df, pq_tmp_file) + df_read <- read_parquet(pq_tmp_file) + expect_equal(df, df_read) +}) + +test_that("Lists are preserved when writing/reading from Parquet", { + bool <- list(logical(0), NA, c(TRUE, FALSE)) + int <- list(integer(0), NA_integer_, 1:4) + num <- list(numeric(0), NA_real_, c(1, 2)) + char <- list(character(0), NA_character_, c("itsy", "bitsy")) + df <- tibble::tibble(bool = bool, int = int, num = num, char = char) + + pq_tmp_file <- tempfile() + on.exit(unlink(pq_tmp_file)) + + write_parquet(df, pq_tmp_file) + df_read <- read_parquet(pq_tmp_file) + expect_equal(df, df_read, ignore_attr = TRUE) +}) + +test_that("write_parquet() to stream", { + df <- tibble::tibble(x = 1:5) + tf <- tempfile() + con <- FileOutputStream$create(tf) + on.exit(unlink(tf)) + write_parquet(df, con) + con$close() + expect_equal(read_parquet(tf), df) +}) + +test_that("write_parquet() returns its input", { + df <- tibble::tibble(x = 1:5) + tf <- tempfile() + on.exit(unlink(tf)) + df_out <- write_parquet(df, tf) + expect_equal(df, df_out) +}) + +test_that("write_parquet() handles version argument", { + df <- tibble::tibble(x = 1:5) + tf <- tempfile() + on.exit(unlink(tf)) + + purrr::walk(list("1.0", "2.0", 1.0, 2.0, 1L, 2L), ~ { + write_parquet(df, tf, version = .x) + expect_identical(read_parquet(tf), df) + }) + purrr::walk(list("3.0", 3.0, 3L, "A"), ~ { + expect_error(write_parquet(df, tf, version = .x)) + }) +}) + +test_that("ParquetFileWriter raises an error for non-OutputStream sink", { + sch <- schema(a = float32()) + # ARROW-9946 + expect_error( + ParquetFileWriter$create(schema = sch, sink = tempfile()), + regex = "OutputStream" + ) +}) + +test_that("ParquetFileReader $ReadRowGroup(s) methods", { + tab <- Table$create(x = 1:100) + tf <- tempfile() + on.exit(unlink(tf)) + write_parquet(tab, tf, chunk_size = 10) + + reader <- ParquetFileReader$create(tf) + expect_true(reader$ReadRowGroup(0) == Table$create(x = 1:10)) + expect_true(reader$ReadRowGroup(9) == Table$create(x = 91:100)) + expect_error(reader$ReadRowGroup(-1), "Some index in row_group_indices") + expect_error(reader$ReadRowGroup(111), "Some index in row_group_indices") + expect_error(reader$ReadRowGroup(c(1, 2))) + expect_error(reader$ReadRowGroup("a")) + + expect_true(reader$ReadRowGroups(c(0, 1)) == Table$create(x = 1:20)) + expect_error(reader$ReadRowGroups(c(0, 1, -2))) # although it gives a weird error + expect_error(reader$ReadRowGroups(c(0, 1, 31))) # ^^ + expect_error(reader$ReadRowGroups(c("a", "b"))) + + ## -- with column_indices + expect_true(reader$ReadRowGroup(0, 0) == Table$create(x = 1:10)) + expect_error(reader$ReadRowGroup(0, 1)) + + expect_true(reader$ReadRowGroups(c(0, 1), 0) == Table$create(x = 1:20)) + expect_error(reader$ReadRowGroups(c(0, 1), 1)) +}) + +test_that("Error messages are shown when the compression algorithm snappy is not found", { + msg <- paste0( + "NotImplemented: Support for codec 'snappy' not built\nIn order to read this file, ", + "you will need to reinstall arrow with additional features enabled.\nSet one of these ", + "environment variables before installing:\n\n * LIBARROW_MINIMAL=false (for all optional ", + "features, including 'snappy')\n * ARROW_WITH_SNAPPY=ON (for just 'snappy')\n\n", + "See https://arrow.apache.org/docs/r/articles/install.html for details" + ) + + if (codec_is_available("snappy")) { + d <- read_parquet(pq_file) + expect_s3_class(d, "data.frame") + } else { + expect_error(read_parquet(pq_file), msg, fixed = TRUE) + } +}) + +test_that("Error is created when parquet reads a feather file", { + expect_error( + read_parquet(test_path("golden-files/data-arrow_2.0.0_lz4.feather")), + "Parquet magic bytes not found in footer" + ) +}) -- cgit v1.2.3