summaryrefslogtreecommitdiffstats
path: root/src/arrow/r/tests/testthat/test-compute-aggregate.R
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/arrow/r/tests/testthat/test-compute-aggregate.R434
1 files changed, 434 insertions, 0 deletions
diff --git a/src/arrow/r/tests/testthat/test-compute-aggregate.R b/src/arrow/r/tests/testthat/test-compute-aggregate.R
new file mode 100644
index 000000000..018279d4b
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-compute-aggregate.R
@@ -0,0 +1,434 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("list_compute_functions", {
+ allfuncs <- list_compute_functions()
+ expect_false(all(grepl("min", allfuncs)))
+ justmins <- list_compute_functions("^min")
+ expect_true(length(justmins) > 0)
+ expect_true(all(grepl("min", justmins)))
+ no_hash_funcs <- list_compute_functions("^hash")
+ expect_true(length(no_hash_funcs) == 0)
+})
+
+test_that("sum.Array", {
+ ints <- 1:5
+ a <- Array$create(ints)
+ expect_r6_class(sum(a), "Scalar")
+ expect_identical(as.integer(sum(a)), sum(ints))
+
+ floats <- c(1.3, 2.4, 3)
+ f <- Array$create(floats)
+ expect_identical(as.numeric(sum(f)), sum(floats))
+
+ floats <- c(floats, NA)
+ na <- Array$create(floats)
+ if (!grepl("devel", R.version.string)) {
+ # Valgrind on R-devel confuses NaN and NA_real_
+ # https://r.789695.n4.nabble.com/Difference-in-NA-behavior-in-R-devel-running-under-valgrind-td4768731.html
+ expect_identical(as.numeric(sum(na)), sum(floats))
+ }
+ expect_r6_class(sum(na, na.rm = TRUE), "Scalar")
+ expect_identical(as.numeric(sum(na, na.rm = TRUE)), sum(floats, na.rm = TRUE))
+
+ bools <- c(TRUE, NA, TRUE, FALSE)
+ b <- Array$create(bools)
+ expect_identical(as.integer(sum(b)), sum(bools))
+ expect_identical(as.integer(sum(b, na.rm = TRUE)), sum(bools, na.rm = TRUE))
+})
+
+test_that("sum.ChunkedArray", {
+ a <- ChunkedArray$create(1:4, c(1:4, NA), 1:5)
+ expect_r6_class(sum(a), "Scalar")
+ expect_true(is.na(as.vector(sum(a))))
+ expect_identical(as.numeric(sum(a, na.rm = TRUE)), 35)
+})
+
+test_that("sum dots", {
+ a1 <- Array$create(1:4)
+ a2 <- ChunkedArray$create(1:4, c(1:4, NA), 1:5)
+ expect_identical(as.numeric(sum(a1, a2, na.rm = TRUE)), 45)
+})
+
+test_that("sum.Scalar", {
+ s <- Scalar$create(4)
+ expect_identical(as.numeric(s), as.numeric(sum(s)))
+})
+
+test_that("mean.Array", {
+ ints <- 1:4
+ a <- Array$create(ints)
+ expect_r6_class(mean(a), "Scalar")
+ expect_identical(as.vector(mean(a)), mean(ints))
+
+ floats <- c(1.3, 2.4, 3)
+ f <- Array$create(floats)
+ expect_identical(as.vector(mean(f)), mean(floats))
+
+ floats <- c(floats, NA)
+ na <- Array$create(floats)
+ if (!grepl("devel", R.version.string)) {
+ # Valgrind on R-devel confuses NaN and NA_real_
+ # https://r.789695.n4.nabble.com/Difference-in-NA-behavior-in-R-devel-running-under-valgrind-td4768731.html
+ expect_identical(as.vector(mean(na)), mean(floats))
+ }
+ expect_r6_class(mean(na, na.rm = TRUE), "Scalar")
+ expect_identical(as.vector(mean(na, na.rm = TRUE)), mean(floats, na.rm = TRUE))
+
+ bools <- c(TRUE, NA, TRUE, FALSE)
+ b <- Array$create(bools)
+ expect_identical(as.vector(mean(b)), mean(bools))
+ expect_identical(as.integer(sum(b, na.rm = TRUE)), sum(bools, na.rm = TRUE))
+})
+
+test_that("mean.ChunkedArray", {
+ a <- ChunkedArray$create(1:4, c(1:4, NA), 1:5)
+ expect_r6_class(mean(a), "Scalar")
+ expect_true(is.na(as.vector(mean(a))))
+ expect_identical(as.vector(mean(a, na.rm = TRUE)), 35 / 13)
+})
+
+test_that("mean.Scalar", {
+ s <- Scalar$create(4)
+ expect_equal(s, mean(s))
+})
+
+test_that("Bad input handling of call_function", {
+ expect_error(
+ call_function("sum", 2, 3),
+ 'Argument 1 is of class numeric but it must be one of "Array", "ChunkedArray", "RecordBatch", "Table", or "Scalar"'
+ )
+})
+
+test_that("min.Array", {
+ ints <- 1:4
+ a <- Array$create(ints)
+ expect_r6_class(min(a), "Scalar")
+ expect_identical(as.vector(min(a)), min(ints))
+
+ floats <- c(1.3, 3, 2.4)
+ f <- Array$create(floats)
+ expect_identical(as.vector(min(f)), min(floats))
+
+ floats <- c(floats, NA)
+ na <- Array$create(floats)
+ expect_identical(as.vector(min(na)), min(floats))
+ expect_r6_class(min(na, na.rm = TRUE), "Scalar")
+ expect_identical(as.vector(min(na, na.rm = TRUE)), min(floats, na.rm = TRUE))
+
+ bools <- c(TRUE, TRUE, FALSE)
+ b <- Array$create(bools)
+ # R is inconsistent here: typeof(min(NA)) == "integer", not "logical"
+ expect_identical(as.vector(min(b)), as.logical(min(bools)))
+})
+
+test_that("max.Array", {
+ ints <- 1:4
+ a <- Array$create(ints)
+ expect_r6_class(max(a), "Scalar")
+ expect_identical(as.vector(max(a)), max(ints))
+
+ floats <- c(1.3, 3, 2.4)
+ f <- Array$create(floats)
+ expect_identical(as.vector(max(f)), max(floats))
+
+ floats <- c(floats, NA)
+ na <- Array$create(floats)
+ expect_identical(as.vector(max(na)), max(floats))
+ expect_r6_class(max(na, na.rm = TRUE), "Scalar")
+ expect_identical(as.vector(max(na, na.rm = TRUE)), max(floats, na.rm = TRUE))
+
+ bools <- c(TRUE, TRUE, FALSE)
+ b <- Array$create(bools)
+ # R is inconsistent here: typeof(max(NA)) == "integer", not "logical"
+ expect_identical(as.vector(max(b)), as.logical(max(bools)))
+})
+
+test_that("min.ChunkedArray", {
+ ints <- 1:4
+ a <- ChunkedArray$create(ints)
+ expect_r6_class(min(a), "Scalar")
+ expect_identical(as.vector(min(a)), min(ints))
+
+ floats <- c(1.3, 3, 2.4)
+ f <- ChunkedArray$create(floats)
+ expect_identical(as.vector(min(f)), min(floats))
+
+ floats <- c(floats, NA)
+ na <- ChunkedArray$create(floats)
+ expect_identical(as.vector(min(na)), min(floats))
+ expect_r6_class(min(na, na.rm = TRUE), "Scalar")
+ expect_identical(as.vector(min(na, na.rm = TRUE)), min(floats, na.rm = TRUE))
+
+ bools <- c(TRUE, TRUE, FALSE)
+ b <- ChunkedArray$create(bools)
+ # R is inconsistent here: typeof(min(NA)) == "integer", not "logical"
+ expect_identical(as.vector(min(b)), as.logical(min(bools)))
+})
+
+test_that("max.ChunkedArray", {
+ ints <- 1:4
+ a <- ChunkedArray$create(ints)
+ expect_r6_class(max(a), "Scalar")
+ expect_identical(as.vector(max(a)), max(ints))
+
+ floats <- c(1.3, 3, 2.4)
+ f <- ChunkedArray$create(floats)
+ expect_identical(as.vector(max(f)), max(floats))
+
+ floats <- c(floats, NA)
+ na <- ChunkedArray$create(floats)
+ expect_identical(as.vector(max(na)), max(floats))
+ expect_r6_class(max(na, na.rm = TRUE), "Scalar")
+ expect_identical(as.vector(max(na, na.rm = TRUE)), max(floats, na.rm = TRUE))
+
+ bools <- c(TRUE, TRUE, FALSE)
+ b <- ChunkedArray$create(bools)
+ # R is inconsistent here: typeof(max(NA)) == "integer", not "logical"
+ expect_identical(as.vector(max(b)), as.logical(max(bools)))
+})
+
+test_that("Edge cases", {
+ a <- Array$create(NA)
+ for (type in c(int32(), float64(), bool())) {
+ expect_as_vector(sum(a$cast(type), na.rm = TRUE), sum(NA, na.rm = TRUE))
+ expect_as_vector(mean(a$cast(type), na.rm = TRUE), mean(NA, na.rm = TRUE))
+ expect_as_vector(
+ min(a$cast(type), na.rm = TRUE),
+ # Suppress the base R warning about no non-missing arguments
+ suppressWarnings(min(NA, na.rm = TRUE))
+ )
+ expect_as_vector(
+ max(a$cast(type), na.rm = TRUE),
+ suppressWarnings(max(NA, na.rm = TRUE))
+ )
+ }
+})
+
+test_that("quantile.Array and quantile.ChunkedArray", {
+ a <- Array$create(c(0, 1, 2, 3))
+ ca <- ChunkedArray$create(c(0, 1), c(2, 3))
+ probs <- c(0.49, 0.51)
+ for (ad in list(a, ca)) {
+ for (type in c(int32(), uint64(), float64())) {
+ expect_equal(
+ quantile(ad$cast(type), probs = probs, interpolation = "linear"),
+ Array$create(c(1.47, 1.53))
+ )
+ expect_equal(
+ quantile(ad$cast(type), probs = probs, interpolation = "lower"),
+ Array$create(c(1, 1))$cast(type)
+ )
+ expect_equal(
+ quantile(ad$cast(type), probs = probs, interpolation = "higher"),
+ Array$create(c(2, 2))$cast(type)
+ )
+ expect_equal(
+ quantile(ad$cast(type), probs = probs, interpolation = "nearest"),
+ Array$create(c(1, 2))$cast(type)
+ )
+ expect_equal(
+ quantile(ad$cast(type), probs = probs, interpolation = "midpoint"),
+ Array$create(c(1.5, 1.5))
+ )
+ }
+ }
+})
+
+test_that("quantile and median NAs, edge cases, and exceptions", {
+ expect_equal(
+ quantile(Array$create(c(1, 2)), probs = c(0, 1)),
+ Array$create(c(1, 2))
+ )
+ expect_error(
+ quantile(Array$create(c(1, 2, NA))),
+ "Missing values not allowed if 'na.rm' is FALSE"
+ )
+ expect_equal(
+ quantile(Array$create(numeric(0))),
+ Array$create(rep(NA_real_, 5))
+ )
+ expect_equal(
+ quantile(Array$create(rep(NA_integer_, 3)), na.rm = TRUE),
+ Array$create(rep(NA_real_, 5))
+ )
+ expect_equal(
+ quantile(Scalar$create(0L)),
+ Array$create(rep(0, 5))
+ )
+ expect_equal(
+ median(Scalar$create(1L)),
+ Scalar$create(1)
+ )
+ expect_error(
+ quantile(Array$create(1:3), type = 9),
+ "not supported"
+ )
+})
+
+test_that("median passes ... args to quantile", {
+ skip_if(
+ !"..." %in% names(formals(median)),
+ "The median generic lacks dots in R 3.3.0 and earlier"
+ )
+ expect_equal(
+ median(Array$create(c(1, 2)), interpolation = "higher"),
+ Scalar$create(2)
+ )
+ expect_error(
+ median(Array$create(c(1, 2)), probs = c(.25, .75))
+ )
+})
+
+test_that("median.Array and median.ChunkedArray", {
+ compare_expression(
+ median(.input),
+ 1:4
+ )
+ compare_expression(
+ median(.input),
+ 1:5
+ )
+ compare_expression(
+ median(.input),
+ numeric(0)
+ )
+ compare_expression(
+ median(.input, na.rm = FALSE),
+ c(1, 2, NA)
+ )
+ compare_expression(
+ median(.input, na.rm = TRUE),
+ c(1, 2, NA)
+ )
+ compare_expression(
+ median(.input, na.rm = TRUE),
+ NA_real_
+ )
+ compare_expression(
+ median(.input, na.rm = FALSE),
+ c(1, 2, NA)
+ )
+ compare_expression(
+ median(.input, na.rm = TRUE),
+ c(1, 2, NA)
+ )
+ compare_expression(
+ median(.input, na.rm = TRUE),
+ NA_real_
+ )
+})
+
+test_that("unique.Array", {
+ a <- Array$create(c(1, 4, 3, 1, 1, 3, 4))
+ expect_equal(unique(a), Array$create(c(1, 4, 3)))
+ ca <- ChunkedArray$create(a, a)
+ expect_equal(unique(ca), Array$create(c(1, 4, 3)))
+})
+
+test_that("match_arrow", {
+ a <- Array$create(c(1, 4, 3, 1, 1, 3, 4))
+ tab <- c(4, 3, 2, 1)
+ expect_equal(match_arrow(a, tab), Array$create(c(3L, 0L, 1L, 3L, 3L, 1L, 0L)))
+
+ ca <- ChunkedArray$create(c(1, 4, 3, 1, 1, 3, 4))
+ expect_equal(match_arrow(ca, tab), ChunkedArray$create(c(3L, 0L, 1L, 3L, 3L, 1L, 0L)))
+
+ sc <- Scalar$create(3)
+ expect_equal(match_arrow(sc, tab), Scalar$create(1L))
+
+ vec <- c(1, 2)
+ expect_equal(match_arrow(vec, tab), Array$create(c(3L, 2L)))
+})
+
+test_that("is_in", {
+ a <- Array$create(c(9, 4, 3))
+ tab <- c(4, 3, 2, 1)
+ expect_equal(is_in(a, tab), Array$create(c(FALSE, TRUE, TRUE)))
+
+ ca <- ChunkedArray$create(c(9, 4, 3))
+ expect_equal(is_in(ca, tab), ChunkedArray$create(c(FALSE, TRUE, TRUE)))
+
+ sc <- Scalar$create(3)
+ expect_equal(is_in(sc, tab), Scalar$create(TRUE))
+
+ vec <- c(1, 9)
+ expect_equal(is_in(vec, tab), Array$create(c(TRUE, FALSE)))
+})
+
+test_that("value_counts", {
+ a <- Array$create(c(1, 4, 3, 1, 1, 3, 4))
+ result_df <- tibble::tibble(
+ values = c(1, 4, 3),
+ counts = c(3L, 2L, 2L)
+ )
+ result <- Array$create(
+ result_df,
+ type = struct(values = float64(), counts = int64())
+ )
+ expect_equal(value_counts(a), result)
+ expect_identical(as.data.frame(value_counts(a)), result_df)
+ expect_identical(as.vector(value_counts(a)$counts), result_df$counts)
+})
+
+test_that("any.Array and any.ChunkedArray", {
+ data <- c(1:10, NA, NA)
+
+ compare_expression(any(.input > 5), data)
+ compare_expression(any(.input > 5, na.rm = TRUE), data)
+ compare_expression(any(.input < 1), data)
+ compare_expression(any(.input < 1, na.rm = TRUE), data)
+
+ data_logical <- c(TRUE, FALSE, TRUE, NA, FALSE)
+
+ compare_expression(any(.input), data_logical)
+ compare_expression(any(.input, na.rm = FALSE), data_logical)
+ compare_expression(any(.input, na.rm = TRUE), data_logical)
+})
+
+test_that("all.Array and all.ChunkedArray", {
+ data <- c(1:10, NA, NA)
+
+ compare_expression(all(.input > 5), data)
+ compare_expression(all(.input > 5, na.rm = TRUE), data)
+
+ compare_expression(all(.input < 11), data)
+ compare_expression(all(.input < 11, na.rm = TRUE), data)
+
+ data_logical <- c(TRUE, TRUE, NA)
+
+ compare_expression(all(.input), data_logical)
+ compare_expression(all(.input, na.rm = TRUE), data_logical)
+})
+
+test_that("variance", {
+ data <- c(-37, 267, 88, -120, 9, 101, -65, -23, NA)
+ arr <- Array$create(data)
+ chunked_arr <- ChunkedArray$create(data)
+
+ expect_equal(call_function("variance", arr, options = list(ddof = 5)), Scalar$create(34596))
+ expect_equal(call_function("variance", chunked_arr, options = list(ddof = 5)), Scalar$create(34596))
+})
+
+test_that("stddev", {
+ data <- c(-37, 267, 88, -120, 9, 101, -65, -23, NA)
+ arr <- Array$create(data)
+ chunked_arr <- ChunkedArray$create(data)
+
+ expect_equal(call_function("stddev", arr, options = list(ddof = 5)), Scalar$create(186))
+ expect_equal(call_function("stddev", chunked_arr, options = list(ddof = 5)), Scalar$create(186))
+})