summary refs log tree commit diff stats
path: root/src/arrow/r/tests/testthat/test-parquet.R
diff options
context:
space:
mode:
Diffstat (limited to 'src/arrow/r/tests/testthat/test-parquet.R')
-rw-r--r-- src/arrow/r/tests/testthat/test-parquet.R 274
1 files changed, 274 insertions, 0 deletions
diff --git a/src/arrow/r/tests/testthat/test-parquet.R b/src/arrow/r/tests/testthat/test-parquet.R
new file mode 100644
index 000000000..55d86b532
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-parquet.R
@@ -0,0 +1,274 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Gate for the whole file: presumably skips every test below when arrow was
+# built without Parquet support (the helper is defined outside this file --
+# confirm its exact behaviour there).
+skip_if_not_available("parquet")
+
+# Path to the `v0.7.1.parquet` fixture, looked up in the installed arrow
+# package. The tests below expect it to hold 10 rows and 11 columns, and
+# reading it requires the snappy codec.
+pq_file <- system.file("v0.7.1.parquet", package = "arrow")
+
+test_that("reading a known Parquet file to tibble", {
+  # Reading the fixture fails without the snappy codec, so skip in that case
+  skip_if_not_available("snappy")
+  fixture <- read_parquet(pq_file)
+  # Default read should produce a tibble with the fixture's known shape
+  expect_true(tibble::is_tibble(fixture))
+  expect_identical(dim(fixture), c(10L, 11L))
+  # TODO: assert more about the contents
+})
+
+test_that("simple int column roundtrip", {
+  original <- tibble::tibble(x = 1:5)
+  # No ".parquet" extension is given here; it is probably not necessary
+  path <- tempfile()
+
+  write_parquet(original, path)
+  roundtripped <- read_parquet(path)
+  expect_equal(original, roundtripped)
+
+  # Deleting the file must work, i.e. the file connection was cleaned up
+  expect_error(file.remove(path), NA)
+  expect_false(file.exists(path))
+})
+
+test_that("read_parquet() supports col_select", {
+  skip_if_not_available("snappy")
+
+  # Selection by bare column names
+  by_name <- read_parquet(pq_file, col_select = c(x, y, z))
+  expect_equal(names(by_name), c("x", "y", "z"))
+
+  # Selection through a tidyselect helper
+  by_prefix <- read_parquet(pq_file, col_select = starts_with("c"))
+  expect_equal(names(by_prefix), c("carat", "cut", "color", "clarity"))
+})
+
+test_that("read_parquet() with raw data", {
+  # Reading the fixture fails without the snappy codec, so skip in that case
+  skip_if_not_available("snappy")
+  # Load the whole file into a raw vector. The read is sized from the file
+  # itself rather than a fixed 5000-byte cap, so the buffer stays complete
+  # even if the fixture is ever replaced by a larger file.
+  test_raw <- readBin(pq_file, what = "raw", n = file.size(pq_file))
+  # read_parquet() is given the in-memory bytes instead of a path
+  df <- read_parquet(test_raw)
+  expect_identical(dim(df), c(10L, 11L))
+})
+
+test_that("write_parquet() handles various compression= specs", {
+  skip_if_not_available("snappy")
+  three_cols <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5)
+
+  # A single codec name
+  expect_parquet_roundtrip(three_cols, compression = "snappy")
+  # One unnamed entry per column
+  expect_parquet_roundtrip(three_cols, compression = rep("snappy", 3L))
+  # Entries named after a subset of the columns
+  expect_parquet_roundtrip(three_cols, compression = c(x1 = "snappy", x2 = "snappy"))
+})
+
+test_that("write_parquet() handles various compression_level= specs", {
+  skip_if_not_available("gzip")
+  three_cols <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5)
+
+  # A single level
+  expect_parquet_roundtrip(three_cols, compression = "gzip", compression_level = 4)
+  # One unnamed level per column
+  expect_parquet_roundtrip(three_cols, compression = "gzip", compression_level = rep(4L, 3L))
+  # Levels named after a subset of the columns
+  expect_parquet_roundtrip(three_cols, compression = "gzip", compression_level = c(x1 = 5L, x2 = 3L))
+})
+
+test_that("write_parquet() handles various use_dictionary= specs", {
+  three_cols <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5)
+
+  # Accepted: a single flag, one flag per column, or flags named by column
+  expect_parquet_roundtrip(three_cols, use_dictionary = TRUE)
+  expect_parquet_roundtrip(three_cols, use_dictionary = c(TRUE, FALSE, TRUE))
+  expect_parquet_roundtrip(three_cols, use_dictionary = c(x1 = TRUE, x2 = TRUE))
+
+  # Rejected: two unnamed flags for a three-column table
+  expect_error(
+    write_parquet(three_cols, tempfile(), use_dictionary = c(TRUE, FALSE)),
+    "unsupported use_dictionary= specification"
+  )
+
+  # Rejected: a value that is not logical
+  expect_error(
+    write_parquet(three_cols, tempfile(), use_dictionary = 12),
+    "is.logical(use_dictionary) is not TRUE",
+    fixed = TRUE
+  )
+})
+
+test_that("write_parquet() handles various write_statistics= specs", {
+  three_cols <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5)
+
+  # A single flag
+  expect_parquet_roundtrip(three_cols, write_statistics = TRUE)
+  # One unnamed flag per column
+  expect_parquet_roundtrip(three_cols, write_statistics = c(TRUE, FALSE, TRUE))
+  # Flags named after a subset of the columns
+  expect_parquet_roundtrip(three_cols, write_statistics = c(x1 = TRUE, x2 = TRUE))
+})
+
+test_that("write_parquet() accepts RecordBatch too", {
+  rb <- RecordBatch$create(x1 = 1:5, x2 = 1:5, y = 1:5)
+  # The roundtrip result is compared against the batch converted to a Table
+  roundtripped <- parquet_roundtrip(rb)
+  expect_equal(roundtripped, Table$create(rb))
+})
+
+test_that("write_parquet() handles grouped_df", {
+  library(dplyr, warn.conflicts = FALSE)
+  grouped <- group_by(tibble::tibble(a = 1:4, b = 5), b)
+  # `grouped` is a "grouped_df", so this asserts that a grouped_df comes back
+  expect_parquet_roundtrip(grouped, as_data_frame = TRUE)
+})
+
+test_that("write_parquet() with invalid input type", {
+  # An Array is none of the input classes named in the expected message
+  not_tabular <- Array$create(1:5)
+  expected_msg <- "x must be an object of class 'data.frame', 'RecordBatch', or 'Table', not 'Array'."
+  expect_error(
+    write_parquet(not_tabular, tempfile()),
+    regexp = expected_msg
+  )
+})
+
+test_that("write_parquet() can truncate timestamps", {
+  # A POSIXct value is expected to arrive in the Table with microsecond unit
+  micros <- Table$create(x1 = as.POSIXct("2020/06/03 18:00:00", tz = "UTC"))
+  expect_type_equal(micros$x1, timestamp("us", "UTC"))
+
+  path <- tempfile()
+  on.exit(unlink(path))
+
+  # Write with millisecond timestamps, explicitly allowing truncation
+  write_parquet(micros, path, coerce_timestamps = "ms", allow_truncated_timestamps = TRUE)
+  millis <- read_parquet(path, as_data_frame = FALSE)
+
+  # The stored unit changed, but the values convert to the same data frame
+  expect_type_equal(millis$x1, timestamp("ms", "UTC"))
+  expect_equal(as.data.frame(micros), as.data.frame(millis))
+})
+
+test_that("make_valid_version()", {
+  v1 <- ParquetVersionType$PARQUET_1_0
+  v2 <- ParquetVersionType$PARQUET_2_0
+
+  # Character input
+  expect_equal(make_valid_version("1.0"), v1)
+  expect_equal(make_valid_version("2.0"), v2)
+
+  # Numeric input written without a decimal part
+  expect_equal(make_valid_version(1), v1)
+  expect_equal(make_valid_version(2), v2)
+
+  # Numeric input written with a decimal part
+  expect_equal(make_valid_version(1.0), v1)
+  expect_equal(make_valid_version(2.0), v2)
+})
+
+test_that("write_parquet() defaults to snappy compression", {
+  skip_if_not_available("snappy")
+  tmp1 <- tempfile()
+  tmp2 <- tempfile()
+  # Remove both output files when the test ends so nothing is left behind
+  # in the session's temp directory
+  on.exit(unlink(c(tmp1, tmp2)))
+
+  # Same data written once with the default settings and once with an
+  # explicit snappy codec
+  write_parquet(mtcars, tmp1)
+  write_parquet(mtcars, tmp2, compression = "snappy")
+
+  # Equal sizes on disk are taken as evidence that the default is snappy
+  expect_equal(file.size(tmp1), file.size(tmp2))
+})
+
+test_that("Factors are preserved when writing/reading from Parquet", {
+  # The level set contains an unused level ("c") and is not in alphabetical order
+  lvls <- c("c", "a", "b")
+  vals <- c("a", "b")
+  df <- tibble::tibble(
+    fct = factor(vals, levels = lvls),
+    ord = factor(vals, levels = lvls, ordered = TRUE),
+    chr = vals
+  )
+
+  path <- tempfile()
+  on.exit(unlink(path))
+
+  write_parquet(df, path)
+  df_read <- read_parquet(path)
+  expect_equal(df, df_read)
+})
+
+test_that("Lists are preserved when writing/reading from Parquet", {
+  # Each list column holds an empty vector, a lone NA and a non-empty vector
+  df <- tibble::tibble(
+    bool = list(logical(0), NA, c(TRUE, FALSE)),
+    int = list(integer(0), NA_integer_, 1:4),
+    num = list(numeric(0), NA_real_, c(1, 2)),
+    char = list(character(0), NA_character_, c("itsy", "bitsy"))
+  )
+
+  path <- tempfile()
+  on.exit(unlink(path))
+
+  write_parquet(df, path)
+  df_read <- read_parquet(path)
+  # Attributes are deliberately left out of the comparison
+  expect_equal(df, df_read, ignore_attr = TRUE)
+})
+
+test_that("write_parquet() to stream", {
+  original <- tibble::tibble(x = 1:5)
+  path <- tempfile()
+  sink <- FileOutputStream$create(path)
+  on.exit(unlink(path))
+
+  # Write through the already-open output stream rather than a file path
+  write_parquet(original, sink)
+  sink$close()
+
+  expect_equal(read_parquet(path), original)
+})
+
+test_that("write_parquet() returns its input", {
+  input <- tibble::tibble(x = 1:5)
+  path <- tempfile()
+  on.exit(unlink(path))
+
+  # The value returned by the writer is compared against what was passed in
+  returned <- write_parquet(input, path)
+  expect_equal(input, returned)
+})
+
+test_that("write_parquet() handles version argument", {
+  df <- tibble::tibble(x = 1:5)
+  tf <- tempfile()
+  on.exit(unlink(tf))
+
+  # Spellings of versions 1 and 2 as character, double and integer
+  accepted <- list("1.0", "2.0", 1.0, 2.0, 1L, 2L)
+  # Values expected to be refused
+  rejected <- list("3.0", 3.0, 3L, "A")
+
+  # Every accepted spelling must write a file that reads back unchanged
+  purrr::walk(accepted, function(version) {
+    write_parquet(df, tf, version = version)
+    expect_identical(read_parquet(tf), df)
+  })
+
+  # Every rejected value must make the write fail
+  purrr::walk(rejected, function(version) {
+    expect_error(write_parquet(df, tf, version = version))
+  })
+})
+
+test_that("ParquetFileWriter raises an error for non-OutputStream sink", {
+  sch <- schema(a = float32())
+  # ARROW-9946
+  # The sink passed here is a plain path string, not an OutputStream; the
+  # error is expected to mention "OutputStream". The argument is spelled
+  # `regexp` in full so the call does not depend on partial argument matching.
+  expect_error(
+    ParquetFileWriter$create(schema = sch, sink = tempfile()),
+    regexp = "OutputStream"
+  )
+})
+
+test_that("ParquetFileReader $ReadRowGroup(s) methods", {
+  path <- tempfile()
+  on.exit(unlink(path))
+  # 100 rows written with chunk_size = 10; the checks below use groups 0 to 9
+  write_parquet(Table$create(x = 1:100), path, chunk_size = 10)
+
+  rdr <- ParquetFileReader$create(path)
+  first_group <- Table$create(x = 1:10)
+  first_two_groups <- Table$create(x = 1:20)
+
+  ## -- single row group: first, last, out of range, malformed index
+  expect_true(rdr$ReadRowGroup(0) == first_group)
+  expect_true(rdr$ReadRowGroup(9) == Table$create(x = 91:100))
+  expect_error(rdr$ReadRowGroup(-1), "Some index in row_group_indices")
+  expect_error(rdr$ReadRowGroup(111), "Some index in row_group_indices")
+  expect_error(rdr$ReadRowGroup(c(1, 2)))
+  expect_error(rdr$ReadRowGroup("a"))
+
+  ## -- several row groups at once
+  expect_true(rdr$ReadRowGroups(c(0, 1)) == first_two_groups)
+  expect_error(rdr$ReadRowGroups(c(0, 1, -2))) # although it gives a weird error
+  expect_error(rdr$ReadRowGroups(c(0, 1, 31))) # ^^
+  expect_error(rdr$ReadRowGroups(c("a", "b")))
+
+  ## -- with column_indices: column 0 can be read, column 1 must fail
+  expect_true(rdr$ReadRowGroup(0, 0) == first_group)
+  expect_error(rdr$ReadRowGroup(0, 1))
+
+  expect_true(rdr$ReadRowGroups(c(0, 1), 0) == first_two_groups)
+  expect_error(rdr$ReadRowGroups(c(0, 1), 1))
+})
+
+test_that("Error messages are shown when the compression algorithm snappy is not found", {
+  # Full text expected when the snappy codec is not available
+  msg <- paste0(
+    "NotImplemented: Support for codec 'snappy' not built\nIn order to read this file, ",
+    "you will need to reinstall arrow with additional features enabled.\nSet one of these ",
+    "environment variables before installing:\n\n * LIBARROW_MINIMAL=false (for all optional ",
+    "features, including 'snappy')\n * ARROW_WITH_SNAPPY=ON (for just 'snappy')\n\n",
+    "See https://arrow.apache.org/docs/r/articles/install.html for details"
+  )
+
+  if (!codec_is_available("snappy")) {
+    # Without the codec, reading must fail with exactly the message above
+    expect_error(read_parquet(pq_file), msg, fixed = TRUE)
+  } else {
+    # With the codec, reading simply succeeds
+    result <- read_parquet(pq_file)
+    expect_s3_class(result, "data.frame")
+  }
+})
+
+test_that("Error is created when parquet reads a feather file", {
+  # A Feather file handed to the Parquet reader must be refused
+  feather_path <- test_path("golden-files/data-arrow_2.0.0_lz4.feather")
+  expect_error(
+    read_parquet(feather_path),
+    "Parquet magic bytes not found in footer"
+  )
+})