summaryrefslogtreecommitdiffstats
path: root/src/arrow/r/tests/testthat/test-dplyr-collapse.R
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/arrow/r/tests/testthat/test-dplyr-collapse.R235
1 files changed, 235 insertions, 0 deletions
diff --git a/src/arrow/r/tests/testthat/test-dplyr-collapse.R b/src/arrow/r/tests/testthat/test-dplyr-collapse.R
new file mode 100644
index 000000000..c7281b62d
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-collapse.R
@@ -0,0 +1,235 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+withr::local_options(list(arrow.summarise.sort = TRUE))
+
+library(dplyr, warn.conflicts = FALSE)
+library(stringr)
+
+tbl <- example_data
+# Add some better string data
+tbl$verses <- verses[[1]]
+# c(" a ", " b ", " c ", ...) increasing padding
+# nchar = 3 5 7 9 11 13 15 17 19 21
+tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both")
+tbl$some_grouping <- rep(c(1, 2), 5)
+
+tab <- Table$create(tbl)
+
+test_that("implicit_schema with select", {
+ expect_equal(
+ tab %>%
+ select(int, lgl) %>%
+ implicit_schema(),
+ schema(int = int32(), lgl = bool())
+ )
+})
+
+test_that("implicit_schema with rename", {
+ expect_equal(
+ tab %>%
+ select(numbers = int, lgl) %>%
+ implicit_schema(),
+ schema(numbers = int32(), lgl = bool())
+ )
+})
+
+test_that("implicit_schema with mutate", {
+ expect_equal(
+ tab %>%
+ transmute(
+ numbers = int * 4,
+ words = as.character(int)
+ ) %>%
+ implicit_schema(),
+ schema(numbers = float64(), words = utf8())
+ )
+})
+
+test_that("implicit_schema with summarize", {
+ expect_equal(
+ tab %>%
+ summarize(
+ avg = mean(int)
+ ) %>%
+ implicit_schema(),
+ schema(avg = float64())
+ )
+})
+
+test_that("implicit_schema with group_by summarize", {
+ expect_equal(
+ tab %>%
+ group_by(some_grouping) %>%
+ summarize(
+ avg = mean(int * 5L)
+ ) %>%
+ implicit_schema(),
+ schema(some_grouping = float64(), avg = float64())
+ )
+})
+
+test_that("collapse", {
+ q <- tab %>%
+ filter(dbl > 2, chr == "d" | chr == "f") %>%
+ select(chr, int, lgl) %>%
+ mutate(twice = int * 2L)
+ expect_false(is_collapsed(q))
+ expect_true(is_collapsed(collapse(q)))
+ expect_false(is_collapsed(collapse(q)$.data))
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl > 2, chr == "d" | chr == "f") %>%
+ select(chr, int, lgl) %>%
+ mutate(twice = int * 2L) %>%
+ collapse() %>%
+ filter(int < 5) %>%
+ select(int, twice) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl > 2, chr == "d" | chr == "f") %>%
+ collapse() %>%
+ select(chr, int, lgl) %>%
+ collapse() %>%
+ filter(int < 5) %>%
+ select(int, chr) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl > 2, chr == "d" | chr == "f") %>%
+ collapse() %>%
+ group_by(chr) %>%
+ select(chr, int, lgl) %>%
+ collapse() %>%
+ filter(int < 5) %>%
+ select(int, chr) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Properties of collapsed query", {
+ q <- tab %>%
+ filter(dbl > 2) %>%
+ select(chr, int, lgl) %>%
+ mutate(twice = int * 2L) %>%
+ group_by(lgl) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ mutate(extra = total * 5)
+
+ # print(tbl %>%
+ # filter(dbl > 2) %>%
+ # select(chr, int, lgl) %>%
+ # mutate(twice = int * 2L) %>%
+ # group_by(lgl) %>%
+ # summarize(total = sum(int, na.rm = TRUE)) %>%
+ # mutate(extra = total * 5))
+
+ # # A tibble: 3 × 3
+ # lgl total extra
+ # <lgl> <int> <dbl>
+ # 1 FALSE 8 40
+ # 2 TRUE 8 40
+ # 3 NA 25 125
+
+ # Avoid evaluating just for nrow
+ expect_identical(dim(q), c(NA_integer_, 3L))
+
+ expect_output(
+ print(q),
+ "InMemoryDataset (query)
+lgl: bool
+total: int32
+extra: double (multiply_checked(total, 5))
+
+See $.data for the source Arrow object",
+ fixed = TRUE
+ )
+ expect_output(
+ print(q$.data),
+ "InMemoryDataset (query)
+int: int32
+lgl: bool
+
+* Aggregations:
+total: sum(int)
+* Filter: (dbl > 2)
+* Grouped by lgl
+See $.data for the source Arrow object",
+ fixed = TRUE
+ )
+
+ skip_if(getRversion() < "3.6.0", "TODO investigate why these aren't equal")
+ # On older R versions:
+ # ── Failure (test-dplyr-collapse.R:172:3): Properties of collapsed query ────────
+ # head(q, 1) %>% collect() not equal to tibble::tibble(lgl = FALSE, total = 8L, extra = 40).
+ # Component "total": Mean relative difference: 0.3846154
+ # Component "extra": Mean relative difference: 0.3846154
+ # ── Failure (test-dplyr-collapse.R:176:3): Properties of collapsed query ────────
+ # tail(q, 1) %>% collect() not equal to tibble::tibble(lgl = NA, total = 25L, extra = 125).
+ # Component "total": Mean relative difference: 0.9230769
+ # Component "extra": Mean relative difference: 0.9230769
+ expect_equal(
+ q %>% head(1) %>% collect(),
+ tibble::tibble(lgl = FALSE, total = 8L, extra = 40)
+ )
+ skip("TODO (ARROW-1XXXX): implement sorting option about where NAs go")
+ expect_equal(
+ q %>% tail(1) %>% collect(),
+ tibble::tibble(lgl = NA, total = 25L, extra = 125)
+ )
+})
+
+test_that("query_on_dataset handles collapse()", {
+ expect_false(query_on_dataset(
+ tab %>%
+ select(int, chr)
+ ))
+ expect_false(query_on_dataset(
+ tab %>%
+ select(int, chr) %>%
+ collapse() %>%
+ select(int)
+ ))
+
+ ds_dir <- tempfile()
+ dir.create(ds_dir)
+ on.exit(unlink(ds_dir))
+ write_parquet(tab, file.path(ds_dir, "file.parquet"))
+ ds <- open_dataset(ds_dir)
+
+ expect_true(query_on_dataset(
+ ds %>%
+ select(int, chr)
+ ))
+ expect_true(query_on_dataset(
+ ds %>%
+ select(int, chr) %>%
+ collapse() %>%
+ select(int)
+ ))
+})