diff options
Diffstat (limited to 'src/arrow/r/tests/testthat/test-compute-aggregate.R')
-rw-r--r-- | src/arrow/r/tests/testthat/test-compute-aggregate.R | 434 |
1 files changed, 434 insertions, 0 deletions
diff --git a/src/arrow/r/tests/testthat/test-compute-aggregate.R b/src/arrow/r/tests/testthat/test-compute-aggregate.R new file mode 100644 index 000000000..018279d4b --- /dev/null +++ b/src/arrow/r/tests/testthat/test-compute-aggregate.R @@ -0,0 +1,434 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_that("list_compute_functions", { + allfuncs <- list_compute_functions() + expect_false(all(grepl("min", allfuncs))) + justmins <- list_compute_functions("^min") + expect_true(length(justmins) > 0) + expect_true(all(grepl("min", justmins))) + no_hash_funcs <- list_compute_functions("^hash") + expect_true(length(no_hash_funcs) == 0) +}) + +test_that("sum.Array", { + ints <- 1:5 + a <- Array$create(ints) + expect_r6_class(sum(a), "Scalar") + expect_identical(as.integer(sum(a)), sum(ints)) + + floats <- c(1.3, 2.4, 3) + f <- Array$create(floats) + expect_identical(as.numeric(sum(f)), sum(floats)) + + floats <- c(floats, NA) + na <- Array$create(floats) + if (!grepl("devel", R.version.string)) { + # Valgrind on R-devel confuses NaN and NA_real_ + # https://r.789695.n4.nabble.com/Difference-in-NA-behavior-in-R-devel-running-under-valgrind-td4768731.html + expect_identical(as.numeric(sum(na)), sum(floats)) + } + expect_r6_class(sum(na, na.rm = TRUE), "Scalar") + expect_identical(as.numeric(sum(na, na.rm = TRUE)), sum(floats, na.rm = TRUE)) + + bools <- c(TRUE, NA, TRUE, FALSE) + b <- Array$create(bools) + expect_identical(as.integer(sum(b)), sum(bools)) + expect_identical(as.integer(sum(b, na.rm = TRUE)), sum(bools, na.rm = TRUE)) +}) + +test_that("sum.ChunkedArray", { + a <- ChunkedArray$create(1:4, c(1:4, NA), 1:5) + expect_r6_class(sum(a), "Scalar") + expect_true(is.na(as.vector(sum(a)))) + expect_identical(as.numeric(sum(a, na.rm = TRUE)), 35) +}) + +test_that("sum dots", { + a1 <- Array$create(1:4) + a2 <- ChunkedArray$create(1:4, c(1:4, NA), 1:5) + expect_identical(as.numeric(sum(a1, a2, na.rm = TRUE)), 45) +}) + +test_that("sum.Scalar", { + s <- Scalar$create(4) + expect_identical(as.numeric(s), as.numeric(sum(s))) +}) + +test_that("mean.Array", { + ints <- 1:4 + a <- Array$create(ints) + expect_r6_class(mean(a), "Scalar") + expect_identical(as.vector(mean(a)), mean(ints)) + + floats <- c(1.3, 2.4, 3) + f <- Array$create(floats) + expect_identical(as.vector(mean(f)), mean(floats)) + + floats <- c(floats, NA) + na <- Array$create(floats) + if (!grepl("devel", R.version.string)) { + # Valgrind on R-devel confuses NaN and NA_real_ + # https://r.789695.n4.nabble.com/Difference-in-NA-behavior-in-R-devel-running-under-valgrind-td4768731.html + expect_identical(as.vector(mean(na)), mean(floats)) + } + expect_r6_class(mean(na, na.rm = TRUE), "Scalar") + expect_identical(as.vector(mean(na, na.rm = TRUE)), mean(floats, na.rm = TRUE)) + + bools <- c(TRUE, NA, TRUE, FALSE) + b <- Array$create(bools) + expect_identical(as.vector(mean(b)), mean(bools)) + expect_identical(as.integer(sum(b, na.rm = TRUE)), sum(bools, na.rm = TRUE)) +}) + +test_that("mean.ChunkedArray", { + a <- ChunkedArray$create(1:4, c(1:4, NA), 1:5) + expect_r6_class(mean(a), "Scalar") + expect_true(is.na(as.vector(mean(a)))) + expect_identical(as.vector(mean(a, na.rm = TRUE)), 35 / 13) +}) + +test_that("mean.Scalar", { + s <- Scalar$create(4) + expect_equal(s, mean(s)) +}) + +test_that("Bad input handling of call_function", { + expect_error( + call_function("sum", 2, 3), + 'Argument 1 is of class numeric but it must be one of "Array", "ChunkedArray", "RecordBatch", "Table", or "Scalar"' + ) +}) + +test_that("min.Array", { + ints <- 1:4 + a <- Array$create(ints) + expect_r6_class(min(a), "Scalar") + expect_identical(as.vector(min(a)), min(ints)) + + floats <- c(1.3, 3, 2.4) + f <- Array$create(floats) + expect_identical(as.vector(min(f)), min(floats)) + + floats <- c(floats, NA) + na <- Array$create(floats) + expect_identical(as.vector(min(na)), min(floats)) + expect_r6_class(min(na, na.rm = TRUE), "Scalar") + expect_identical(as.vector(min(na, na.rm = TRUE)), min(floats, na.rm = TRUE)) + + bools <- c(TRUE, TRUE, FALSE) + b <- Array$create(bools) + # R is inconsistent here: typeof(min(NA)) == "integer", not "logical" + expect_identical(as.vector(min(b)), as.logical(min(bools))) +}) + +test_that("max.Array", { + ints <- 1:4 + a <- Array$create(ints) + expect_r6_class(max(a), "Scalar") + expect_identical(as.vector(max(a)), max(ints)) + + floats <- c(1.3, 3, 2.4) + f <- Array$create(floats) + expect_identical(as.vector(max(f)), max(floats)) + + floats <- c(floats, NA) + na <- Array$create(floats) + expect_identical(as.vector(max(na)), max(floats)) + expect_r6_class(max(na, na.rm = TRUE), "Scalar") + expect_identical(as.vector(max(na, na.rm = TRUE)), max(floats, na.rm = TRUE)) + + bools <- c(TRUE, TRUE, FALSE) + b <- Array$create(bools) + # R is inconsistent here: typeof(max(NA)) == "integer", not "logical" + expect_identical(as.vector(max(b)), as.logical(max(bools))) +}) + +test_that("min.ChunkedArray", { + ints <- 1:4 + a <- ChunkedArray$create(ints) + expect_r6_class(min(a), "Scalar") + expect_identical(as.vector(min(a)), min(ints)) + + floats <- c(1.3, 3, 2.4) + f <- ChunkedArray$create(floats) + expect_identical(as.vector(min(f)), min(floats)) + + floats <- c(floats, NA) + na <- ChunkedArray$create(floats) + expect_identical(as.vector(min(na)), min(floats)) + expect_r6_class(min(na, na.rm = TRUE), "Scalar") + expect_identical(as.vector(min(na, na.rm = TRUE)), min(floats, na.rm = TRUE)) + + bools <- c(TRUE, TRUE, FALSE) + b <- ChunkedArray$create(bools) + # R is inconsistent here: typeof(min(NA)) == "integer", not "logical" + expect_identical(as.vector(min(b)), as.logical(min(bools))) +}) + +test_that("max.ChunkedArray", { + ints <- 1:4 + a <- ChunkedArray$create(ints) + expect_r6_class(max(a), "Scalar") + expect_identical(as.vector(max(a)), max(ints)) + + floats <- c(1.3, 3, 2.4) + f <- ChunkedArray$create(floats) + expect_identical(as.vector(max(f)), max(floats)) + + floats <- c(floats, NA) + na <- ChunkedArray$create(floats) + expect_identical(as.vector(max(na)), max(floats)) + expect_r6_class(max(na, na.rm = TRUE), "Scalar") + expect_identical(as.vector(max(na, na.rm = TRUE)), max(floats, na.rm = TRUE)) + + bools <- c(TRUE, TRUE, FALSE) + b <- ChunkedArray$create(bools) + # R is inconsistent here: typeof(max(NA)) == "integer", not "logical" + expect_identical(as.vector(max(b)), as.logical(max(bools))) +}) + +test_that("Edge cases", { + a <- Array$create(NA) + for (type in c(int32(), float64(), bool())) { + expect_as_vector(sum(a$cast(type), na.rm = TRUE), sum(NA, na.rm = TRUE)) + expect_as_vector(mean(a$cast(type), na.rm = TRUE), mean(NA, na.rm = TRUE)) + expect_as_vector( + min(a$cast(type), na.rm = TRUE), + # Suppress the base R warning about no non-missing arguments + suppressWarnings(min(NA, na.rm = TRUE)) + ) + expect_as_vector( + max(a$cast(type), na.rm = TRUE), + suppressWarnings(max(NA, na.rm = TRUE)) + ) + } +}) + +test_that("quantile.Array and quantile.ChunkedArray", { + a <- Array$create(c(0, 1, 2, 3)) + ca <- ChunkedArray$create(c(0, 1), c(2, 3)) + probs <- c(0.49, 0.51) + for (ad in list(a, ca)) { + for (type in c(int32(), uint64(), float64())) { + expect_equal( + quantile(ad$cast(type), probs = probs, interpolation = "linear"), + Array$create(c(1.47, 1.53)) + ) + expect_equal( + quantile(ad$cast(type), probs = probs, interpolation = "lower"), + Array$create(c(1, 1))$cast(type) + ) + expect_equal( + quantile(ad$cast(type), probs = probs, interpolation = "higher"), + Array$create(c(2, 2))$cast(type) + ) + expect_equal( + quantile(ad$cast(type), probs = probs, interpolation = "nearest"), + Array$create(c(1, 2))$cast(type) + ) + expect_equal( + quantile(ad$cast(type), probs = probs, interpolation = "midpoint"), + Array$create(c(1.5, 1.5)) + ) + } + } +}) + +test_that("quantile and median NAs, edge cases, and exceptions", { + expect_equal( + quantile(Array$create(c(1, 2)), probs = c(0, 1)), + Array$create(c(1, 2)) + ) + expect_error( + quantile(Array$create(c(1, 2, NA))), + "Missing values not allowed if 'na.rm' is FALSE" + ) + expect_equal( + quantile(Array$create(numeric(0))), + Array$create(rep(NA_real_, 5)) + ) + expect_equal( + quantile(Array$create(rep(NA_integer_, 3)), na.rm = TRUE), + Array$create(rep(NA_real_, 5)) + ) + expect_equal( + quantile(Scalar$create(0L)), + Array$create(rep(0, 5)) + ) + expect_equal( + median(Scalar$create(1L)), + Scalar$create(1) + ) + expect_error( + quantile(Array$create(1:3), type = 9), + "not supported" + ) +}) + +test_that("median passes ... args to quantile", { + skip_if( + !"..." %in% names(formals(median)), + "The median generic lacks dots in R 3.3.0 and earlier" + ) + expect_equal( + median(Array$create(c(1, 2)), interpolation = "higher"), + Scalar$create(2) + ) + expect_error( + median(Array$create(c(1, 2)), probs = c(.25, .75)) + ) +}) + +test_that("median.Array and median.ChunkedArray", { + compare_expression( + median(.input), + 1:4 + ) + compare_expression( + median(.input), + 1:5 + ) + compare_expression( + median(.input), + numeric(0) + ) + compare_expression( + median(.input, na.rm = FALSE), + c(1, 2, NA) + ) + compare_expression( + median(.input, na.rm = TRUE), + c(1, 2, NA) + ) + compare_expression( + median(.input, na.rm = TRUE), + NA_real_ + ) + compare_expression( + median(.input, na.rm = FALSE), + c(1, 2, NA) + ) + compare_expression( + median(.input, na.rm = TRUE), + c(1, 2, NA) + ) + compare_expression( + median(.input, na.rm = TRUE), + NA_real_ + ) +}) + +test_that("unique.Array", { + a <- Array$create(c(1, 4, 3, 1, 1, 3, 4)) + expect_equal(unique(a), Array$create(c(1, 4, 3))) + ca <- ChunkedArray$create(a, a) + expect_equal(unique(ca), Array$create(c(1, 4, 3))) +}) + +test_that("match_arrow", { + a <- Array$create(c(1, 4, 3, 1, 1, 3, 4)) + tab <- c(4, 3, 2, 1) + expect_equal(match_arrow(a, tab), Array$create(c(3L, 0L, 1L, 3L, 3L, 1L, 0L))) + + ca <- ChunkedArray$create(c(1, 4, 3, 1, 1, 3, 4)) + expect_equal(match_arrow(ca, tab), ChunkedArray$create(c(3L, 0L, 1L, 3L, 3L, 1L, 0L))) + + sc <- Scalar$create(3) + expect_equal(match_arrow(sc, tab), Scalar$create(1L)) + + vec <- c(1, 2) + expect_equal(match_arrow(vec, tab), Array$create(c(3L, 2L))) +}) + +test_that("is_in", { + a <- Array$create(c(9, 4, 3)) + tab <- c(4, 3, 2, 1) + expect_equal(is_in(a, tab), Array$create(c(FALSE, TRUE, TRUE))) + + ca <- ChunkedArray$create(c(9, 4, 3)) + expect_equal(is_in(ca, tab), ChunkedArray$create(c(FALSE, TRUE, TRUE))) + + sc <- Scalar$create(3) + expect_equal(is_in(sc, tab), Scalar$create(TRUE)) + + vec <- c(1, 9) + expect_equal(is_in(vec, tab), Array$create(c(TRUE, FALSE))) +}) + +test_that("value_counts", { + a <- Array$create(c(1, 4, 3, 1, 1, 3, 4)) + result_df <- tibble::tibble( + values = c(1, 4, 3), + counts = c(3L, 2L, 2L) + ) + result <- Array$create( + result_df, + type = struct(values = float64(), counts = int64()) + ) + expect_equal(value_counts(a), result) + expect_identical(as.data.frame(value_counts(a)), result_df) + expect_identical(as.vector(value_counts(a)$counts), result_df$counts) +}) + +test_that("any.Array and any.ChunkedArray", { + data <- c(1:10, NA, NA) + + compare_expression(any(.input > 5), data) + compare_expression(any(.input > 5, na.rm = TRUE), data) + compare_expression(any(.input < 1), data) + compare_expression(any(.input < 1, na.rm = TRUE), data) + + data_logical <- c(TRUE, FALSE, TRUE, NA, FALSE) + + compare_expression(any(.input), data_logical) + compare_expression(any(.input, na.rm = FALSE), data_logical) + compare_expression(any(.input, na.rm = TRUE), data_logical) +}) + +test_that("all.Array and all.ChunkedArray", { + data <- c(1:10, NA, NA) + + compare_expression(all(.input > 5), data) + compare_expression(all(.input > 5, na.rm = TRUE), data) + + compare_expression(all(.input < 11), data) + compare_expression(all(.input < 11, na.rm = TRUE), data) + + data_logical <- c(TRUE, TRUE, NA) + + compare_expression(all(.input), data_logical) + compare_expression(all(.input, na.rm = TRUE), data_logical) +}) + +test_that("variance", { + data <- c(-37, 267, 88, -120, 9, 101, -65, -23, NA) + arr <- Array$create(data) + chunked_arr <- ChunkedArray$create(data) + + expect_equal(call_function("variance", arr, options = list(ddof = 5)), Scalar$create(34596)) + expect_equal(call_function("variance", chunked_arr, options = list(ddof = 5)), Scalar$create(34596)) +}) + +test_that("stddev", { + data <- c(-37, 267, 88, -120, 9, 101, -65, -23, NA) + arr <- Array$create(data) + chunked_arr <- ChunkedArray$create(data) + + expect_equal(call_function("stddev", arr, options = list(ddof = 5)), Scalar$create(186)) + expect_equal(call_function("stddev", chunked_arr, options = list(ddof = 5)), Scalar$create(186)) +}) |