diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/r/tests/testthat/test-dplyr-collapse.R | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/r/tests/testthat/test-dplyr-collapse.R')
-rw-r--r-- | src/arrow/r/tests/testthat/test-dplyr-collapse.R | 235 |
1 files changed, 235 insertions, 0 deletions
diff --git a/src/arrow/r/tests/testthat/test-dplyr-collapse.R b/src/arrow/r/tests/testthat/test-dplyr-collapse.R new file mode 100644 index 000000000..c7281b62d --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dplyr-collapse.R @@ -0,0 +1,235 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +withr::local_options(list(arrow.summarise.sort = TRUE)) + +library(dplyr, warn.conflicts = FALSE) +library(stringr) + +tbl <- example_data +# Add some better string data +tbl$verses <- verses[[1]] +# c(" a ", " b ", " c ", ...) increasing padding +# nchar = 3 5 7 9 11 13 15 17 19 21 +tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both") +tbl$some_grouping <- rep(c(1, 2), 5) + +tab <- Table$create(tbl) + +test_that("implicit_schema with select", { + expect_equal( + tab %>% + select(int, lgl) %>% + implicit_schema(), + schema(int = int32(), lgl = bool()) + ) +}) + +test_that("implicit_schema with rename", { + expect_equal( + tab %>% + select(numbers = int, lgl) %>% + implicit_schema(), + schema(numbers = int32(), lgl = bool()) + ) +}) + +test_that("implicit_schema with mutate", { + expect_equal( + tab %>% + transmute( + numbers = int * 4, + words = as.character(int) + ) %>% + implicit_schema(), + schema(numbers = float64(), words = utf8()) + ) +}) + +test_that("implicit_schema with summarize", { + expect_equal( + tab %>% + summarize( + avg = mean(int) + ) %>% + implicit_schema(), + schema(avg = float64()) + ) +}) + +test_that("implicit_schema with group_by summarize", { + expect_equal( + tab %>% + group_by(some_grouping) %>% + summarize( + avg = mean(int * 5L) + ) %>% + implicit_schema(), + schema(some_grouping = float64(), avg = float64()) + ) +}) + +test_that("collapse", { + q <- tab %>% + filter(dbl > 2, chr == "d" | chr == "f") %>% + select(chr, int, lgl) %>% + mutate(twice = int * 2L) + expect_false(is_collapsed(q)) + expect_true(is_collapsed(collapse(q))) + expect_false(is_collapsed(collapse(q)$.data)) + + compare_dplyr_binding( + .input %>% + filter(dbl > 2, chr == "d" | chr == "f") %>% + select(chr, int, lgl) %>% + mutate(twice = int * 2L) %>% + collapse() %>% + filter(int < 5) %>% + select(int, twice) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(dbl > 2, chr == "d" | chr == "f") %>% + collapse() %>% + select(chr, int, lgl) %>% + collapse() %>% + filter(int < 5) %>% + select(int, chr) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(dbl > 2, chr == "d" | chr == "f") %>% + collapse() %>% + group_by(chr) %>% + select(chr, int, lgl) %>% + collapse() %>% + filter(int < 5) %>% + select(int, chr) %>% + collect(), + tbl + ) +}) + +test_that("Properties of collapsed query", { + q <- tab %>% + filter(dbl > 2) %>% + select(chr, int, lgl) %>% + mutate(twice = int * 2L) %>% + group_by(lgl) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + mutate(extra = total * 5) + + # print(tbl %>% + # filter(dbl > 2) %>% + # select(chr, int, lgl) %>% + # mutate(twice = int * 2L) %>% + # group_by(lgl) %>% + # summarize(total = sum(int, na.rm = TRUE)) %>% + # mutate(extra = total * 5)) + + # # A tibble: 3 × 3 + # lgl total extra + # <lgl> <int> <dbl> + # 1 FALSE 8 40 + # 2 TRUE 8 40 + # 3 NA 25 125 + + # Avoid evaluating just for nrow + expect_identical(dim(q), c(NA_integer_, 3L)) + + expect_output( + print(q), + "InMemoryDataset (query) +lgl: bool +total: int32 +extra: double (multiply_checked(total, 5)) + +See $.data for the source Arrow object", + fixed = TRUE + ) + expect_output( + print(q$.data), + "InMemoryDataset (query) +int: int32 +lgl: bool + +* Aggregations: +total: sum(int) +* Filter: (dbl > 2) +* Grouped by lgl +See $.data for the source Arrow object", + fixed = TRUE + ) + + skip_if(getRversion() < "3.6.0", "TODO investigate why these aren't equal") + # On older R versions: + # ── Failure (test-dplyr-collapse.R:172:3): Properties of collapsed query ──────── + # head(q, 1) %>% collect() not equal to tibble::tibble(lgl = FALSE, total = 8L, extra = 40). + # Component "total": Mean relative difference: 0.3846154 + # Component "extra": Mean relative difference: 0.3846154 + # ── Failure (test-dplyr-collapse.R:176:3): Properties of collapsed query ──────── + # tail(q, 1) %>% collect() not equal to tibble::tibble(lgl = NA, total = 25L, extra = 125). + # Component "total": Mean relative difference: 0.9230769 + # Component "extra": Mean relative difference: 0.9230769 + expect_equal( + q %>% head(1) %>% collect(), + tibble::tibble(lgl = FALSE, total = 8L, extra = 40) + ) + skip("TODO (ARROW-1XXXX): implement sorting option about where NAs go") + expect_equal( + q %>% tail(1) %>% collect(), + tibble::tibble(lgl = NA, total = 25L, extra = 125) + ) +}) + +test_that("query_on_dataset handles collapse()", { + expect_false(query_on_dataset( + tab %>% + select(int, chr) + )) + expect_false(query_on_dataset( + tab %>% + select(int, chr) %>% + collapse() %>% + select(int) + )) + + ds_dir <- tempfile() + dir.create(ds_dir) + on.exit(unlink(ds_dir)) + write_parquet(tab, file.path(ds_dir, "file.parquet")) + ds <- open_dataset(ds_dir) + + expect_true(query_on_dataset( + ds %>% + select(int, chr) + )) + expect_true(query_on_dataset( + ds %>% + select(int, chr) %>% + collapse() %>% + select(int) + )) +}) |