1 files changed, 235 insertions, 0 deletions
diff --git a/src/arrow/r/tests/testthat/test-dplyr-collapse.R b/src/arrow/r/tests/testthat/test-dplyr-collapse.R
new file mode 100644
index 000000000..c7281b62d
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-collapse.R
@@ -0,0 +1,235 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+withr::local_options(list(arrow.summarise.sort = TRUE))
+
+library(dplyr, warn.conflicts = FALSE)
+library(stringr)
+
+tbl <- example_data
+# Add some better string data
+tbl$verses <- verses[[1]]
+# c(" a ", "  b  ", "   c   ", ...) increasing padding
+# nchar =   3  5  7  9 11 13 15 17 19 21
+tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both")
+tbl$some_grouping <- rep(c(1, 2), 5)
+
+tab <- Table$create(tbl)
+
+test_that("implicit_schema with select", {
+  expect_equal(
+    tab %>%
+      select(int, lgl) %>%
+      implicit_schema(),
+    schema(int = int32(), lgl = bool())
+  )
+})
+
+test_that("implicit_schema with rename", {
+  expect_equal(
+    tab %>%
+      select(numbers = int, lgl) %>%
+      implicit_schema(),
+    schema(numbers = int32(), lgl = bool())
+  )
+})
+
+test_that("implicit_schema with mutate", {
+  expect_equal(
+    tab %>%
+      transmute(
+        numbers = int * 4,
+        words = as.character(int)
+      ) %>%
+      implicit_schema(),
+    schema(numbers = float64(), words = utf8())
+  )
+})
+
+test_that("implicit_schema with summarize", {
+  expect_equal(
+    tab %>%
+      summarize(
+        avg = mean(int)
+      ) %>%
+      implicit_schema(),
+    schema(avg = float64())
+  )
+})
+
+test_that("implicit_schema with group_by summarize", {
+  expect_equal(
+    tab %>%
+      group_by(some_grouping) %>%
+      summarize(
+        avg = mean(int * 5L)
+      ) %>%
+      implicit_schema(),
+    schema(some_grouping = float64(), avg = float64())
+  )
+})
+
+test_that("collapse", {
+  q <- tab %>%
+    filter(dbl > 2, chr == "d" | chr == "f") %>%
+    select(chr, int, lgl) %>%
+    mutate(twice = int * 2L)
+  expect_false(is_collapsed(q))
+  expect_true(is_collapsed(collapse(q)))
+  expect_false(is_collapsed(collapse(q)$.data))
+
+  compare_dplyr_binding(
+    .input %>%
+      filter(dbl > 2, chr == "d" | chr == "f") %>%
+      select(chr, int, lgl) %>%
+      mutate(twice = int * 2L) %>%
+      collapse() %>%
+      filter(int < 5) %>%
+      select(int, twice) %>%
+      collect(),
+    tbl
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      filter(dbl > 2, chr == "d" | chr == "f") %>%
+      collapse() %>%
+      select(chr, int, lgl) %>%
+      collapse() %>%
+      filter(int < 5) %>%
+      select(int, chr) %>%
+      collect(),
+    tbl
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      filter(dbl > 2, chr == "d" | chr == "f") %>%
+      collapse() %>%
+      group_by(chr) %>%
+      select(chr, int, lgl) %>%
+      collapse() %>%
+      filter(int < 5) %>%
+      select(int, chr) %>%
+      collect(),
+    tbl
+  )
+})
+
+test_that("Properties of collapsed query", {
+  q <- tab %>%
+    filter(dbl > 2) %>%
+    select(chr, int, lgl) %>%
+    mutate(twice = int * 2L) %>%
+    group_by(lgl) %>%
+    summarize(total = sum(int, na.rm = TRUE)) %>%
+    mutate(extra = total * 5)
+
+  # print(tbl %>%
+  #   filter(dbl > 2) %>%
+  #   select(chr, int, lgl) %>%
+  #   mutate(twice = int * 2L) %>%
+  #   group_by(lgl) %>%
+  #   summarize(total = sum(int, na.rm = TRUE)) %>%
+  #   mutate(extra = total * 5))
+
+  #   # A tibble: 3 × 3
+  #   lgl   total extra
+  #   <lgl> <int> <dbl>
+  # 1 FALSE     8    40
+  # 2 TRUE      8    40
+  # 3 NA       25   125
+
+  # Avoid evaluating just for nrow
+  expect_identical(dim(q), c(NA_integer_, 3L))
+
+  expect_output(
+    print(q),
+    "InMemoryDataset (query)
+lgl: bool
+total: int32
+extra: double (multiply_checked(total, 5))
+
+See $.data for the source Arrow object",
+    fixed = TRUE
+  )
+  expect_output(
+    print(q$.data),
+    "InMemoryDataset (query)
+int: int32
+lgl: bool
+
+* Aggregations:
+total: sum(int)
+* Filter: (dbl > 2)
+* Grouped by lgl
+See $.data for the source Arrow object",
+    fixed = TRUE
+  )
+
+  skip_if(getRversion() < "3.6.0", "TODO investigate why these aren't equal")
+  # On older R versions:
+  #  ── Failure (test-dplyr-collapse.R:172:3): Properties of collapsed query ────────
+  # head(q, 1) %>% collect() not equal to tibble::tibble(lgl = FALSE, total = 8L, extra = 40).
+  # Component "total": Mean relative difference: 0.3846154
+  # Component "extra": Mean relative difference: 0.3846154
+  # ── Failure (test-dplyr-collapse.R:176:3): Properties of collapsed query ────────
+  # tail(q, 1) %>% collect() not equal to tibble::tibble(lgl = NA, total = 25L, extra = 125).
+  # Component "total": Mean relative difference: 0.9230769
+  # Component "extra": Mean relative difference: 0.9230769
+  expect_equal(
+    q %>% head(1) %>% collect(),
+    tibble::tibble(lgl = FALSE, total = 8L, extra = 40)
+  )
+  skip("TODO (ARROW-1XXXX): implement sorting option about where NAs go")
+  expect_equal(
+    q %>% tail(1) %>% collect(),
+    tibble::tibble(lgl = NA, total = 25L, extra = 125)
+  )
+})
+
+test_that("query_on_dataset handles collapse()", {
+  expect_false(query_on_dataset(
+    tab %>%
+      select(int, chr)
+  ))
+  expect_false(query_on_dataset(
+    tab %>%
+      select(int, chr) %>%
+      collapse() %>%
+      select(int)
+  ))
+
+  ds_dir <- tempfile()
+  dir.create(ds_dir)
+  on.exit(unlink(ds_dir))
+  write_parquet(tab, file.path(ds_dir, "file.parquet"))
+  ds <- open_dataset(ds_dir)
+
+  expect_true(query_on_dataset(
+    ds %>%
+      select(int, chr)
+  ))
+  expect_true(query_on_dataset(
+    ds %>%
+      select(int, chr) %>%
+      collapse() %>%
+      select(int)
+  ))
+})