summaryrefslogtreecommitdiffstats
path: root/src/arrow/r/tests/testthat/test-dplyr-query.R
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/arrow/r/tests/testthat/test-dplyr-query.R296
1 files changed, 296 insertions, 0 deletions
diff --git a/src/arrow/r/tests/testthat/test-dplyr-query.R b/src/arrow/r/tests/testthat/test-dplyr-query.R
new file mode 100644
index 000000000..21a55f4b4
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-query.R
@@ -0,0 +1,296 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+library(stringr)
+
+# Shared fixture for the tests below: example_data plus extra string columns
+tbl <- example_data
+tbl[["verses"]] <- verses[[1]]
+# padded_strings holds "a".."j", each padded on both sides to an odd width:
+# nchar = 3 5 7 9 11 13 15 17 19 21
+tbl[["padded_strings"]] <- stringr::str_pad(head(letters, 10), width = 2 * seq_len(10) + 1, side = "both")
+tbl[["another_chr"]] <- letters[17:26]
+
+test_that("basic select/filter/collect", {
+  # Build a query on a RecordBatch, then collect it into a data frame
+  source_batch <- record_batch(tbl)
+  query <- filter(select(source_batch, int, chr), int > 5)
+  expect_s3_class(query, "arrow_dplyr_query")
+
+  # Base-R reference: rows with a non-missing int above 5
+  keep_rows <- !is.na(tbl$int) & tbl$int > 5
+  collected <- collect(query)
+  expect_equal(collected, tbl[keep_rows, c("int", "chr")])
+  # Building the query must leave the source batch untouched
+  expect_identical(collect(source_batch), tbl)
+})
+
+test_that("dim() on query", { # dim() of a filtered, column-selected query
+ compare_dplyr_binding( # NOTE(review): helper defined outside this file; presumably runs the expression via Arrow and via dplyr and compares — confirm
+ .input %>%
+ filter(int > 5) %>%
+ select(int, chr) %>%
+ dim(), # the value under comparison is the dimension vector, nothing is collected explicitly
+ tbl # presumably the data substituted for .input — verify against the helper
+ )
+})
+
+test_that("Print method", { # printed form of a query built from two filter() and two select() steps
+ expect_output(
+ record_batch(tbl) %>%
+ filter(dbl > 2, chr == "d" | chr == "f") %>% # two conditions passed to a single filter() call
+ select(chr, int, lgl) %>%
+ filter(int < 5) %>% # second filter(); the expected text shows it and-ed onto the first
+ select(int, chr) %>% # final projection: only int and chr appear in the expected schema lines
+ print(),
+ 'InMemoryDataset (query)
+int: int32
+chr: string
+
+* Filter: (((dbl > 2) and ((chr == "d") or (chr == "f"))) and (int < 5))
+See $.data for the source Arrow object',
+ fixed = TRUE # match the expected text literally rather than as a regular expression
+ )
+})
+
+test_that("pull", { # pull() with no argument, a position, a name, and after filter/rename
+ compare_dplyr_binding(
+ .input %>% pull(), # no argument: whichever column pull() selects by default
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>% pull(1), # column chosen by position
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>% pull(chr), # column chosen by name
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(int > 4) %>%
+ rename(strng = chr) %>%
+ pull(strng), # pull by the new name introduced by rename() above
+ tbl
+ )
+})
+
+test_that("collect(as_data_frame=FALSE)", {
+  source_batch <- record_batch(tbl)
+  # Reference rows computed in base R: non-missing int above 5
+  keep_rows <- !is.na(tbl$int) & tbl$int > 5
+  expected <- tbl[keep_rows, c("int", "chr")]
+
+  # With no query steps applied, the result is still a RecordBatch
+  untouched <- collect(source_batch, as_data_frame = FALSE)
+  expect_r6_class(untouched, "RecordBatch")
+
+  # collect(as_data_frame = FALSE) always returns Table now
+  plain <- source_batch %>%
+    select(int, chr) %>%
+    filter(int > 5) %>%
+    collect(as_data_frame = FALSE)
+  expect_r6_class(plain, "Table")
+  expect_equal(as.data.frame(plain), expected)
+
+  # Same query, with chr renamed to strng inside select()
+  renamed <- source_batch %>%
+    select(int, strng = chr) %>%
+    filter(int > 5) %>%
+    collect(as_data_frame = FALSE)
+  expect_r6_class(renamed, "Table")
+  expect_equal(as.data.frame(renamed), set_names(expected, c("int", "strng")))
+
+  # A grouped query is expected back as an arrow_dplyr_query
+  grouped <- source_batch %>%
+    select(int, strng = chr) %>%
+    filter(int > 5) %>%
+    group_by(int) %>%
+    collect(as_data_frame = FALSE)
+  expect_s3_class(grouped, "arrow_dplyr_query")
+  # The dplyr reference carries the same grouping
+  expected_grouped <- group_by(rename(expected, strng = chr), int)
+  expect_equal(as.data.frame(grouped), expected_grouped)
+})
+
+test_that("compute()", {
+  source_batch <- record_batch(tbl)
+  # Reference rows computed in base R: non-missing int above 5
+  keep_rows <- !is.na(tbl$int) & tbl$int > 5
+  expected <- tbl[keep_rows, c("int", "chr")]
+
+  # With no query steps applied, the result is still a RecordBatch
+  untouched <- compute(source_batch)
+  expect_r6_class(untouched, "RecordBatch")
+
+  # A select + filter query is expected to compute to a Table
+  plain <- source_batch %>%
+    select(int, chr) %>%
+    filter(int > 5) %>%
+    compute()
+  expect_r6_class(plain, "Table")
+  expect_equal(as.data.frame(plain), expected)
+
+  # Same query, with chr renamed to strng inside select()
+  renamed <- source_batch %>%
+    select(int, strng = chr) %>%
+    filter(int > 5) %>%
+    compute()
+  expect_r6_class(renamed, "Table")
+  expect_equal(as.data.frame(renamed), set_names(expected, c("int", "strng")))
+
+  # A grouped query is expected back as an arrow_dplyr_query
+  grouped <- source_batch %>%
+    select(int, strng = chr) %>%
+    filter(int > 5) %>%
+    group_by(int) %>%
+    compute()
+  expect_s3_class(grouped, "arrow_dplyr_query")
+  expected_grouped <- group_by(rename(expected, strng = chr), int)
+  expect_equal(as.data.frame(grouped), expected_grouped)
+})
+
+test_that("head", {
+  source_batch <- record_batch(tbl)
+  # Reference: first two rows with a non-missing int above 5
+  keep_rows <- !is.na(tbl$int) & tbl$int > 5
+  expected <- tbl[keep_rows, c("int", "chr")][seq_len(2), ]
+  expected_renamed <- rename(expected, strng = chr)
+
+  # head() on a filtered query still yields an arrow_dplyr_query
+  plain <- source_batch %>%
+    select(int, chr) %>%
+    filter(int > 5) %>%
+    head(2)
+  expect_s3_class(plain, "arrow_dplyr_query")
+  expect_equal(collect(plain), expected)
+
+  # Same query, with chr renamed to strng inside select()
+  renamed <- source_batch %>%
+    select(int, strng = chr) %>%
+    filter(int > 5) %>%
+    head(2)
+  expect_s3_class(renamed, "arrow_dplyr_query")
+  expect_equal(as.data.frame(renamed), set_names(expected, c("int", "strng")))
+
+  # Grouped query: the reference is grouped the same way
+  grouped <- source_batch %>%
+    select(int, strng = chr) %>%
+    filter(int > 5) %>%
+    group_by(int) %>%
+    head(2)
+  expect_s3_class(grouped, "arrow_dplyr_query")
+  expect_equal(
+    as.data.frame(grouped),
+    group_by(expected_renamed, int)
+  )
+
+  # A mutate() added after head() sees only the two retained rows
+  head_then_mutate <- source_batch %>%
+    select(int, strng = chr) %>%
+    filter(int > 5) %>%
+    head(2) %>%
+    mutate(twice = int * 2) %>%
+    collect()
+  expect_equal(
+    head_then_mutate,
+    mutate(expected_renamed, twice = int * 2)
+  )
+
+  # This would fail if we evaluated head() after filter()
+  head_then_filter <- source_batch %>%
+    select(int, strng = chr) %>%
+    head(2) %>%
+    filter(int > 5) %>%
+    collect()
+  expect_equal(head_then_filter, filter(expected_renamed, FALSE))
+})
+
+test_that("arrange then head returns the right data (ARROW-14162)", { # first 4 rows after a two-key sort
+
+ compare_dplyr_binding(
+ .input %>%
+ # mpg has ties so we need to sort by two things to get deterministic order
+ arrange(mpg, disp) %>%
+ head(4) %>%
+ collect(),
+ mtcars, # R's built-in mtcars dataset is the input here, not tbl
+ ignore_attr = "row.names" # NOTE(review): presumably forwarded to the comparison so row names are not compared — confirm in the helper
+ )
+})
+
+test_that("arrange then tail returns the right data", { # last 4 rows after a two-key sort
+ compare_dplyr_binding(
+ .input %>%
+ # mpg has ties so we need to sort by two things to get deterministic order
+ arrange(mpg, disp) %>%
+ tail(4) %>%
+ collect(),
+ mtcars, # R's built-in mtcars dataset is the input here, not tbl
+ ignore_attr = "row.names" # NOTE(review): presumably forwarded to the comparison so row names are not compared — confirm in the helper
+ )
+})
+
+test_that("tail", {
+  source_batch <- record_batch(tbl)
+  # Reference: last two rows with a non-missing int above 5
+  keep_rows <- !is.na(tbl$int) & tbl$int > 5
+  expected <- tail(tbl[keep_rows, c("int", "chr")], 2)
+
+  # Each query is sorted by int before tail() is applied
+  plain <- source_batch %>%
+    select(int, chr) %>%
+    filter(int > 5) %>%
+    arrange(int) %>%
+    tail(2)
+  expect_s3_class(plain, "arrow_dplyr_query")
+  expect_equal(as.data.frame(plain), expected)
+
+  # Same query, with chr renamed to strng inside select()
+  renamed <- source_batch %>%
+    select(int, strng = chr) %>%
+    filter(int > 5) %>%
+    arrange(int) %>%
+    tail(2)
+  expect_s3_class(renamed, "arrow_dplyr_query")
+  expect_equal(as.data.frame(renamed), set_names(expected, c("int", "strng")))
+
+  # Grouped query: the reference is grouped the same way
+  grouped <- source_batch %>%
+    select(int, strng = chr) %>%
+    filter(int > 5) %>%
+    group_by(int) %>%
+    arrange(int) %>%
+    tail(2)
+  expect_s3_class(grouped, "arrow_dplyr_query")
+  expected_grouped <- group_by(rename(expected, strng = chr), int)
+  expect_equal(as.data.frame(grouped), expected_grouped)
+})
+
+test_that("No duplicate field names are allowed in an arrow_dplyr_query", {
+  # Expected message; expect_error() matches it as a regular expression
+  duplicated_msg <- paste0(
+    'The following field names were found more than once in the data: "int", "dbl", ',
+    '"dbl2", "lgl", "false", "chr", "fct", "verses", "padded_strings"'
+  )
+
+  # tbl is passed twice to Table$create(); the whole call chain stays inside expect_error()
+  expect_error(filter(Table$create(tbl, tbl), int > 0), regexp = duplicated_msg)
+})