diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/r/tests/testthat/test-dplyr-filter.R | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/r/tests/testthat/test-dplyr-filter.R')
-rw-r--r-- | src/arrow/r/tests/testthat/test-dplyr-filter.R | 412 |
1 files changed, 412 insertions, 0 deletions
diff --git a/src/arrow/r/tests/testthat/test-dplyr-filter.R b/src/arrow/r/tests/testthat/test-dplyr-filter.R new file mode 100644 index 000000000..72a64229c --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dplyr-filter.R @@ -0,0 +1,412 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +library(dplyr, warn.conflicts = FALSE) +library(stringr) + +tbl <- example_data +# Add some better string data +tbl$verses <- verses[[1]] +# c(" a ", " b ", " c ", ...) increasing padding +# nchar = 3 5 7 9 11 13 15 17 19 21 +tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both") +tbl$some_negative <- tbl$int * (-1)^(1:nrow(tbl)) # nolint + +test_that("filter() on is.na()", { + compare_dplyr_binding( + .input %>% + filter(is.na(lgl)) %>% + select(chr, int, lgl) %>% + collect(), + tbl + ) +}) + +test_that("filter() with NAs in selection", { + compare_dplyr_binding( + .input %>% + filter(lgl) %>% + select(chr, int, lgl) %>% + collect(), + tbl + ) +}) + +test_that("Filter returning an empty Table should not segfault (ARROW-8354)", { + compare_dplyr_binding( + .input %>% + filter(false) %>% + select(chr, int, lgl) %>% + collect(), + tbl + ) +}) + +test_that("filtering with expression", { + char_sym <- "b" + compare_dplyr_binding( + .input %>% + filter(chr == char_sym) %>% + select(string = chr, int) %>% + collect(), + tbl + ) +}) + +test_that("filtering with arithmetic", { + compare_dplyr_binding( + .input %>% + filter(dbl + 1 > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(dbl / 2 > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(dbl / 2L > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(int / 2 > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(int / 2L > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(dbl %/% 2 > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(dbl^2 > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) +}) + +test_that("filtering with expression + autocasting", { + compare_dplyr_binding( + .input %>% + filter(dbl + 1 > 3L) %>% # test autocasting with comparison to 3L + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(int + 1 > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(int^2 > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) +}) + +test_that("More complex select/filter", { + compare_dplyr_binding( + .input %>% + filter(dbl > 2, chr == "d" | chr == "f") %>% + select(chr, int, lgl) %>% + filter(int < 5) %>% + select(int, chr) %>% + collect(), + tbl + ) +}) + +test_that("filter() with %in%", { + compare_dplyr_binding( + .input %>% + filter(dbl > 2, chr %in% c("d", "f")) %>% + collect(), + tbl + ) +}) + +test_that("Negative scalar values", { + compare_dplyr_binding( + .input %>% + filter(some_negative > -2) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + filter(some_negative %in% -1) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + filter(int == -some_negative) %>% + collect(), + tbl + ) +}) + +test_that("filter() with between()", { + compare_dplyr_binding( + .input %>% + filter(between(dbl, 1, 2)) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(between(dbl, 0.5, 2)) %>% + collect(), + tbl + ) + + expect_identical( + tbl %>% + record_batch() %>% + filter(between(dbl, int, dbl2)) %>% + collect(), + tbl %>% + filter(dbl >= int, dbl <= dbl2) + ) + + expect_error( + tbl %>% + record_batch() %>% + filter(between(dbl, 1, "2")) %>% + collect() + ) + + expect_error( + tbl %>% + record_batch() %>% + filter(between(dbl, 1, NA)) %>% + collect() + ) + + expect_error( + tbl %>% + record_batch() %>% + filter(between(chr, 1, 2)) %>% + collect() + ) +}) + +test_that("filter() with string ops", { + skip_if_not_available("utf8proc") + compare_dplyr_binding( + .input %>% + filter(dbl > 2, str_length(verses) > 25) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(dbl > 2, str_length(str_trim(padded_strings, "left")) > 5) %>% + collect(), + tbl + ) +}) + +test_that("filter environment scope", { + # "object 'b_var' not found" + compare_dplyr_error(.input %>% filter(chr == b_var), tbl) + + b_var <- "b" + compare_dplyr_binding( + .input %>% + filter(chr == b_var) %>% + collect(), + tbl + ) + # Also for functions + # 'could not find function "isEqualTo"' because we haven't defined it yet + compare_dplyr_error(.input %>% filter(isEqualTo(int, 4)), tbl) + + # This works but only because there are S3 methods for those operations + isEqualTo <- function(x, y) x == y & !is.na(x) + compare_dplyr_binding( + .input %>% + select(-fct) %>% # factor levels aren't identical + filter(isEqualTo(int, 4)) %>% + collect(), + tbl + ) + # Try something that needs to call another nse_func + compare_dplyr_binding( + .input %>% + select(-fct) %>% + filter(nchar(padded_strings) < 10) %>% + collect(), + tbl + ) + isShortString <- function(x) nchar(x) < 10 + skip("TODO: 14071") + compare_dplyr_binding( + .input %>% + select(-fct) %>% + filter(isShortString(padded_strings)) %>% + collect(), + tbl + ) +}) + +test_that("Filtering on a column that doesn't exist errors correctly", { + with_language("fr", { + # expect_warning(., NA) because the usual behavior when it hits a filter + # that it can't evaluate is to raise a warning, collect() to R, and retry + # the filter. But we want this to error the first time because it's + # a user error, not solvable by retrying in R + expect_warning( + expect_error( + tbl %>% record_batch() %>% filter(not_a_col == 42) %>% collect(), + "objet 'not_a_col' introuvable" + ), + NA + ) + }) + with_language("en", { + expect_warning( + expect_error( + tbl %>% record_batch() %>% filter(not_a_col == 42) %>% collect(), + "object 'not_a_col' not found" + ), + NA + ) + }) +}) + +test_that("Filtering with unsupported functions", { + compare_dplyr_binding( + .input %>% + filter(int > 2, pnorm(dbl) > .99) %>% + collect(), + tbl, + warning = "Expression pnorm\\(dbl\\) > 0.99 not supported in Arrow; pulling data into R" + ) + compare_dplyr_binding( + .input %>% + filter( + nchar(chr, type = "bytes", allowNA = TRUE) == 1, # bad, Arrow msg + int > 2, # good + pnorm(dbl) > .99 # bad, opaque + ) %>% + collect(), + tbl, + warning = '\\* In nchar\\(chr, type = "bytes", allowNA = TRUE\\) == 1, allowNA = TRUE not supported by Arrow +\\* Expression pnorm\\(dbl\\) > 0.99 not supported in Arrow +pulling data into R' + ) +}) + +test_that("Calling Arrow compute functions 'directly'", { + expect_equal( + tbl %>% + record_batch() %>% + filter(arrow_add(dbl, 1) > 3L) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl %>% + filter(dbl + 1 > 3L) %>% + select(string = chr, int, dbl) + ) + + compare_dplyr_binding( + tbl %>% + record_batch() %>% + filter(arrow_greater(arrow_add(dbl, 1), 3L)) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl %>% + filter(dbl + 1 > 3L) %>% + select(string = chr, int, dbl) + ) +}) + +test_that("filter() with .data pronoun", { + compare_dplyr_binding( + .input %>% + filter(.data$dbl > 4) %>% + select(.data$chr, .data$int, .data$lgl) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(is.na(.data$lgl)) %>% + select(.data$chr, .data$int, .data$lgl) %>% + collect(), + tbl + ) + + # and the .env pronoun too! + chr <- 4 + compare_dplyr_binding( + .input %>% + filter(.data$dbl > .env$chr) %>% + select(.data$chr, .data$int, .data$lgl) %>% + collect(), + tbl + ) + + skip("test now faulty - code no longer gives error & outputs a empty tibble") + # but there is an error if we don't override the masking with `.env` + compare_dplyr_error( + .input %>% + filter(.data$dbl > chr) %>% + select(.data$chr, .data$int, .data$lgl) %>% + collect(), + tbl + ) +}) |