diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/r/tests/testthat | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/r/tests/testthat')
85 files changed, 17787 insertions, 0 deletions
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet b/src/arrow/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet Binary files differnew file mode 100644 index 000000000..3394be241 --- /dev/null +++ b/src/arrow/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_lz4.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_lz4.feather Binary files differnew file mode 100644 index 000000000..d91acd0cc --- /dev/null +++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_lz4.feather diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_uncompressed.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_uncompressed.feather Binary files differnew file mode 100644 index 000000000..0198024ec --- /dev/null +++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_uncompressed.feather diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_zstd.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_zstd.feather Binary files differnew file mode 100644 index 000000000..f6788231c --- /dev/null +++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_zstd.feather diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1.parquet b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1.parquet Binary files differnew file mode 100644 index 000000000..e1d589bf0 --- /dev/null +++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1.parquet diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_lz4.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_lz4.feather Binary files differnew file mode 100644 index 000000000..f3a71435a --- /dev/null +++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_lz4.feather diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_uncompressed.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_uncompressed.feather Binary files differnew file mode 100644 index 000000000..1188ac669 --- /dev/null +++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_uncompressed.feather diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_zstd.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_zstd.feather Binary files differnew file mode 100644 index 000000000..056b26c17 --- /dev/null +++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_zstd.feather diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0.parquet b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0.parquet Binary files differnew file mode 100644 index 000000000..6c5911560 --- /dev/null +++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0.parquet diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_lz4.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_lz4.feather Binary files differnew file mode 100644 index 000000000..b65da7234 --- /dev/null +++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_lz4.feather diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_uncompressed.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_uncompressed.feather Binary files differnew file mode 100644 index 000000000..508903cb4 --- /dev/null +++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_uncompressed.feather diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_zstd.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_zstd.feather Binary files differnew file mode 100644 index 000000000..39c829fda --- /dev/null +++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_zstd.feather diff --git a/src/arrow/r/tests/testthat/helper-arrow.R b/src/arrow/r/tests/testthat/helper-arrow.R new file mode 100644 index 000000000..545f2d044 --- /dev/null +++ b/src/arrow/r/tests/testthat/helper-arrow.R @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Wrap testthat::test_that with a check for the C++ library +options(..skip.tests = !arrow:::arrow_available()) + +set.seed(1) + +MAX_INT <- 2147483647L + +# Make sure this is unset +Sys.setenv(ARROW_PRE_0_15_IPC_FORMAT = "") + +# use the C locale for string collation (ARROW-12046) +Sys.setlocale("LC_COLLATE", "C") + +# Set English language so that error messages aren't internationalized +# (R CMD check does this, but in case you're running outside of check) +Sys.setenv(LANGUAGE = "en") + +with_language <- function(lang, expr) { + old <- Sys.getenv("LANGUAGE") + # Check what this message is before changing languages; this will + # trigger caching the transations if the OS does that (some do). + # If the OS does cache, then we can't test changing languages safely. + before <- i18ize_error_messages() + Sys.setenv(LANGUAGE = lang) + on.exit({ + Sys.setenv(LANGUAGE = old) + .cache$i18ized_error_pattern <<- NULL + }) + if (!identical(before, i18ize_error_messages())) { + skip(paste("This OS either does not support changing languages to", lang, "or it caches translations")) + } + force(expr) +} + +test_that <- function(what, code) { + testthat::test_that(what, { + skip_if(getOption("..skip.tests", TRUE), "arrow C++ library not available") + code + }) +} + +# Wrapper to run tests that only touch R code even when the C++ library isn't +# available (so that at least some tests are run on those platforms) +r_only <- function(code) { + withr::with_options(list(..skip.tests = FALSE), code) +} + +make_temp_dir <- function() { + path <- tempfile() + dir.create(path) + normalizePath(path, winslash = "/") +} diff --git a/src/arrow/r/tests/testthat/helper-data.R b/src/arrow/r/tests/testthat/helper-data.R new file mode 100644 index 000000000..c693e84b2 --- /dev/null +++ b/src/arrow/r/tests/testthat/helper-data.R @@ -0,0 +1,191 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +example_data <- tibble::tibble( + int = c(1:3, NA_integer_, 5:10), + dbl = c(1:8, NA, 10) + .1, + dbl2 = rep(5, 10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + false = logical(10), + chr = letters[c(1:5, NA, 7:10)], + fct = factor(letters[c(1:4, NA, NA, 7:10)]) +) + +example_with_metadata <- tibble::tibble( + a = structure("one", class = "special_string"), + b = 2, + c = tibble::tibble( + c1 = structure("inner", extra_attr = "something"), + c2 = 4, + c3 = 50 + ), + d = "four" +) + +attr(example_with_metadata, "top_level") <- list( + field_one = 12, + field_two = "more stuff" +) + +haven_data <- tibble::tibble( + num = structure(c(5.1, 4.9), + format.spss = "F8.2" + ), + cat_int = structure(c(3, 1), + format.spss = "F8.0", + labels = c(first = 1, second = 2, third = 3), + class = c("haven_labelled", "vctrs_vctr", "double") + ), + cat_chr = structure(c("B", "B"), + labels = c(Alpha = "A", Beta = "B"), + class = c("haven_labelled", "vctrs_vctr", "character") + ) +) + +example_with_times <- tibble::tibble( + date = Sys.Date() + 1:10, + posixct = lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10, + posixct_tz = lubridate::ymd_hms("2018-10-07 19:04:05", tz = "US/Eastern") + 1:10, + posixlt = as.POSIXlt(lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10), + posixlt_tz = as.POSIXlt(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "US/Eastern") + 1:10) +) + +verses <- list( + # Since we tend to test with dataframes with 10 rows, here are verses from + # "Milonga del moro judío", by Jorge Drexler. They are décimas, 10-line + # poems with a particular meter and rhyme scheme. + # (They also have non-ASCII characters, which is nice for testing) + c( + "Por cada muro, un lamento", + "En Jerusalén la dorada", + "Y mil vidas malgastadas", + "Por cada mandamiento", + "Yo soy polvo de tu viento", + "Y aunque sangro de tu herida", + "Y cada piedra querida", + "Guarda mi amor más profundo", + "No hay una piedra en el mundo", + "Que valga lo que una vida" + ), + c( + "No hay muerto que no me duela", + "No hay un bando ganador", + "No hay nada más que dolor", + "Y otra vida que se vuela", + "La guerra es muy mala escuela", + "No importa el disfraz que viste", + "Perdonen que no me aliste", + "Bajo ninguna bandera", + "Vale más cualquier quimera", + "Que un trozo de tela triste" + ), + c( + "Y a nadie le di permiso", + "Para matar en mi nombre", + "Un hombre no es más que un hombre", + "Y si hay Dios, así lo quiso", + "El mismo suelo que piso", + "Seguirá, yo me habré ido", + "Rumbo también del olvido", + "No hay doctrina que no vaya", + "Y no hay pueblo que no se haya", + "Creído el pueblo elegido" + ) +) + +make_big_string <- function() { + # This creates a character vector that would exceed the capacity of BinaryArray + rep(purrr::map_chr(2047:2050, ~ paste(sample(letters, ., replace = TRUE), collapse = "")), 2^18) +} + +make_random_string_of_size <- function(size = 1) { + purrr::map_chr(1000 * size, ~ paste(sample(letters, ., replace = TRUE), collapse = "")) +} + +make_string_of_size <- function(size = 1) { + paste(rep(letters, length.out = 1000 * size), collapse = "") +} + +example_with_extra_metadata <- example_with_metadata +attributes(example_with_extra_metadata$b) <- list(lots = rep(make_string_of_size(1), 100)) + +example_with_logical_factors <- tibble::tibble( + starting_a_fight = factor(c(FALSE, TRUE, TRUE, TRUE)), + consoling_a_child = factor(c(TRUE, FALSE, TRUE, TRUE)), + petting_a_dog = factor(c(TRUE, TRUE, FALSE, TRUE)), + saying = c( + "shhhhh, it's ok", + "you wanna go outside?", + "you want your mommy?", + "hey buddy" + ) +) + +# The values in each column of this tibble are in ascending order. There are +# some ties, so tests should use two or more columns to ensure deterministic +# sort order. The Arrow C++ library orders strings lexicographically as byte +# strings. The order of a string array sorted by Arrow will not match the order +# of an equivalent character vector sorted by R unless you set the R collation +# locale to "C" by running: Sys.setlocale("LC_COLLATE", "C") +# These test scripts set that, but if you are running individual tests you might +# need to set it manually. When finished, you can restore the default +# collation locale by running: Sys.setlocale("LC_COLLATE") +# In the future, the string collation locale used by the Arrow C++ library might +# be configurable (ARROW-12046). +example_data_for_sorting <- tibble::tibble( + int = c(-.Machine$integer.max, -101L, -100L, 0L, 0L, 1L, 100L, 1000L, .Machine$integer.max, NA_integer_), + dbl = c( + -Inf, -.Machine$double.xmax, -.Machine$double.xmin, 0, .Machine$double.xmin, + pi, .Machine$double.xmax, Inf, NaN, NA_real_ + ), + chr = c("", "", "\"", "&", "ABC", "NULL", "a", "abc", "zzz", NA_character_), + lgl = c(rep(FALSE, 4L), rep(TRUE, 5L), NA), + dttm = lubridate::ymd_hms(c( + "0000-01-01 00:00:00", + "1919-05-29 13:08:55", + "1955-06-20 04:10:42", + "1973-06-30 11:38:41", + "1987-03-29 12:49:47", + "1991-06-11 19:07:01", + NA_character_, + "2017-08-21 18:26:40", + "2017-08-21 18:26:40", + "9999-12-31 23:59:59" + )), + grp = c(rep("A", 5), rep("B", 5)) +) + +# For Dataset tests +first_date <- lubridate::ymd_hms("2015-04-29 03:12:39") +df1 <- tibble::tibble( + int = 1:10, + dbl = as.numeric(1:10), + lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2), + chr = letters[1:10], + fct = factor(LETTERS[1:10]), + ts = first_date + lubridate::days(1:10) +) + +second_date <- lubridate::ymd_hms("2017-03-09 07:01:02") +df2 <- tibble::tibble( + int = 101:110, + dbl = c(as.numeric(51:59), NaN), + lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2), + chr = letters[10:1], + fct = factor(LETTERS[10:1]), + ts = second_date + lubridate::days(10:1) +) diff --git a/src/arrow/r/tests/testthat/helper-expectation.R b/src/arrow/r/tests/testthat/helper-expectation.R new file mode 100644 index 000000000..ef6142bb4 --- /dev/null +++ b/src/arrow/r/tests/testthat/helper-expectation.R @@ -0,0 +1,320 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +expect_as_vector <- function(x, y, ...) { + expect_equal(as.vector(x), y, ...) +} + +expect_data_frame <- function(x, y, ...) { + expect_equal(as.data.frame(x), y, ...) +} + +expect_r6_class <- function(object, class) { + expect_s3_class(object, class) + expect_s3_class(object, "R6") +} + +#' Mask `testthat::expect_equal()` in order to compare ArrowObjects using their +#' `Equals` methods from the C++ library. +expect_equal <- function(object, expected, ignore_attr = FALSE, ..., info = NULL, label = NULL) { + if (inherits(object, "ArrowObject") && inherits(expected, "ArrowObject")) { + mc <- match.call() + expect_true( + all.equal(object, expected, check.attributes = !ignore_attr), + info = info, + label = paste(rlang::as_label(mc[["object"]]), "==", rlang::as_label(mc[["expected"]])) + ) + } else { + testthat::expect_equal(object, expected, ignore_attr = ignore_attr, ..., info = info, label = label) + } +} + +expect_type_equal <- function(object, expected, ...) { + if (is.Array(object)) { + object <- object$type + } + if (is.Array(expected)) { + expected <- expected$type + } + expect_equal(object, expected, ...) +} + +expect_match_arg_error <- function(object, values = c()) { + expect_error(object, paste0("'arg' .*", paste(dQuote(values), collapse = ", "))) +} + +expect_deprecated <- expect_warning + +verify_output <- function(...) { + if (isTRUE(grepl("conda", R.Version()$platform))) { + skip("On conda") + } + testthat::verify_output(...) +} + +#' Ensure that dplyr methods on Arrow objects return the same as for data frames +#' +#' This function compares the output of running a dplyr expression on a tibble +#' or data.frame object against the output of the same expression run on +#' Arrow Table and RecordBatch objects. +#' +#' +#' @param expr A dplyr pipeline which must have `.input` as its start +#' @param tbl A tibble or data.frame which will be substituted for `.input` +#' @param skip_record_batch The skip message to show (if you should skip the +#' RecordBatch test) +#' @param skip_table The skip message to show (if you should skip the Table test) +#' @param warning The expected warning from the RecordBatch and Table comparison +#' paths, passed to `expect_warning()`. Special values: +#' * `NA` (the default) for ensuring no warning message +#' * `TRUE` is a special case to mean to check for the +#' "not supported in Arrow; pulling data into R" message. +#' @param ... additional arguments, passed to `expect_equal()` +compare_dplyr_binding <- function(expr, + tbl, + skip_record_batch = NULL, + skip_table = NULL, + warning = NA, + ...) { + + # Quote the contents of `expr` so that we can evaluate it a few different ways + expr <- rlang::enquo(expr) + # Get the expected output by evaluating expr on the .input data.frame using regular dplyr + expected <- rlang::eval_tidy(expr, rlang::new_data_mask(rlang::env(.input = tbl))) + + if (isTRUE(warning)) { + # Special-case the simple warning: + # TODO: ARROW-13362 pick one of in or by and use it everywhere + warning <- "not supported (in|by) Arrow; pulling data into R" + } + + skip_msg <- NULL + + # Evaluate `expr` on a RecordBatch object and compare with `expected` + if (is.null(skip_record_batch)) { + expect_warning( + via_batch <- rlang::eval_tidy( + expr, + rlang::new_data_mask(rlang::env(.input = record_batch(tbl))) + ), + warning + ) + expect_equal(via_batch, expected, ...) + } else { + skip_msg <- c(skip_msg, skip_record_batch) + } + + # Evaluate `expr` on a Table object and compare with `expected` + if (is.null(skip_table)) { + expect_warning( + via_table <- rlang::eval_tidy( + expr, + rlang::new_data_mask(rlang::env(.input = arrow_table(tbl))) + ), + warning + ) + expect_equal(via_table, expected, ...) + } else { + skip_msg <- c(skip_msg, skip_table) + } + + if (!is.null(skip_msg)) { + skip(paste(skip_msg, collapse = "\n")) + } +} + +#' Assert that Arrow dplyr methods error in the same way as methods on data.frame +#' +#' Comparing the error message generated when running expressions on R objects +#' against the error message generated by running the same expression on Arrow +#' Tables and RecordBatches. +#' +#' @param expr A dplyr pipeline which must have `.input` as its start +#' @param tbl A tibble or data.frame which will be substituted for `.input` +#' @param ... additional arguments, passed to `expect_error()` +compare_dplyr_error <- function(expr, + tbl, + ...) { + # ensure we have supplied tbl + force(tbl) + + expr <- rlang::enquo(expr) + msg <- tryCatch( + rlang::eval_tidy(expr, rlang::new_data_mask(rlang::env(.input = tbl))), + error = function(e) { + msg <- conditionMessage(e) + + # The error here is of the form: + # + # Problem with `filter()` .input `..1`. + # x object 'b_var' not found + # ℹ Input `..1` is `chr == b_var`. + # + # but what we really care about is the `x` block + # so (temporarily) let's pull those blocks out when we find them + pattern <- i18ize_error_messages() + + if (grepl(pattern, msg)) { + msg <- sub(paste0("^.*(", pattern, ").*$"), "\\1", msg) + } + msg + } + ) + # make sure msg is a character object (i.e. there has been an error) + # If it did not error, we would get a data.frame or whatever + # This expectation will tell us "dplyr on data.frame errored is not TRUE" + expect_true(identical(typeof(msg), "character"), label = "dplyr on data.frame errored") + + expect_error( + rlang::eval_tidy( + expr, + rlang::new_data_mask(rlang::env(.input = record_batch(tbl))) + ), + msg, + ... + ) + expect_error( + rlang::eval_tidy( + expr, + rlang::new_data_mask(rlang::env(.input = arrow_table(tbl))) + ), + msg, + ... + ) +} + +#' Comparing the output of running expressions on R vectors against the same +#' expression run on Arrow Arrays and ChunkedArrays. +#' +#' @param expr A vectorized R expression which must have `.input` as its start +#' @param vec A vector which will be substituted for `.input` +#' @param skip_array The skip message to show (if you should skip the Array test) +#' @param skip_chunked_array The skip message to show (if you should skip the ChunkedArray test) +#' @param ignore_attr Ignore differences in specified attributes? +#' @param ... additional arguments, passed to `expect_as_vector()` +compare_expression <- function(expr, + vec, + skip_array = NULL, + skip_chunked_array = NULL, + ignore_attr = FALSE, + ...) { + expr <- rlang::enquo(expr) + expected <- rlang::eval_tidy(expr, rlang::new_data_mask(rlang::env(.input = vec))) + skip_msg <- NULL + + if (is.null(skip_array)) { + via_array <- rlang::eval_tidy( + expr, + rlang::new_data_mask(rlang::env(.input = Array$create(vec))) + ) + expect_as_vector(via_array, expected, ignore_attr, ...) + } else { + skip_msg <- c(skip_msg, skip_array) + } + + if (is.null(skip_chunked_array)) { + # split input vector into two to exercise ChunkedArray with >1 chunk + split_vector <- split_vector_as_list(vec) + + via_chunked <- rlang::eval_tidy( + expr, + rlang::new_data_mask(rlang::env(.input = ChunkedArray$create(split_vector[[1]], split_vector[[2]]))) + ) + expect_as_vector(via_chunked, expected, ignore_attr, ...) + } else { + skip_msg <- c(skip_msg, skip_chunked_array) + } + + if (!is.null(skip_msg)) { + skip(paste(skip_msg, collapse = "\n")) + } +} + +#' Comparing the error message generated when running expressions on R objects +#' against the error message generated by running the same expression on Arrow +#' Arrays and ChunkedArrays. +#' +#' @param expr An R expression which must have `.input` as its start +#' @param vec A vector which will be substituted for `.input` +#' @param skip_array The skip message to show (if you should skip the Array test) +#' @param skip_chunked_array The skip message to show (if you should skip the ChunkedArray test) +#' @param ... additional arguments, passed to `expect_error()` +compare_expression_error <- function(expr, + vec, + skip_array = NULL, + skip_chunked_array = NULL, + ...) { + expr <- rlang::enquo(expr) + + msg <- tryCatch( + rlang::eval_tidy(expr, rlang::new_data_mask(rlang::env(.input = vec))), + error = function(e) { + msg <- conditionMessage(e) + + pattern <- i18ize_error_messages() + + if (grepl(pattern, msg)) { + msg <- sub(paste0("^.*(", pattern, ").*$"), "\\1", msg) + } + msg + } + ) + + expect_true(identical(typeof(msg), "character"), label = "vector errored") + + skip_msg <- NULL + + if (is.null(skip_array)) { + expect_error( + rlang::eval_tidy( + expr, + rlang::new_data_mask(rlang::env(.input = Array$create(vec))) + ), + msg, + ... + ) + } else { + skip_msg <- c(skip_msg, skip_array) + } + + if (is.null(skip_chunked_array)) { + # split input vector into two to exercise ChunkedArray with >1 chunk + split_vector <- split_vector_as_list(vec) + + expect_error( + rlang::eval_tidy( + expr, + rlang::new_data_mask(rlang::env(.input = ChunkedArray$create(split_vector[[1]], split_vector[[2]]))) + ), + msg, + ... + ) + } else { + skip_msg <- c(skip_msg, skip_chunked_array) + } + + if (!is.null(skip_msg)) { + skip(paste(skip_msg, collapse = "\n")) + } +} + +split_vector_as_list <- function(vec) { + vec_split <- length(vec) %/% 2 + vec1 <- vec[seq(from = min(1, length(vec) - 1), to = min(length(vec) - 1, vec_split), by = 1)] + vec2 <- vec[seq(from = min(length(vec), vec_split + 1), to = length(vec), by = 1)] + list(vec1, vec2) +} diff --git a/src/arrow/r/tests/testthat/helper-parquet.R b/src/arrow/r/tests/testthat/helper-parquet.R new file mode 100644 index 000000000..a0dd445bb --- /dev/null +++ b/src/arrow/r/tests/testthat/helper-parquet.R @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +expect_parquet_roundtrip <- function(tab, ...) { + expect_equal(parquet_roundtrip(tab, ...), tab) +} + +parquet_roundtrip <- function(x, ..., as_data_frame = FALSE) { + # write/read parquet, returns Table + tf <- tempfile() + on.exit(unlink(tf)) + + write_parquet(x, tf, ...) + read_parquet(tf, as_data_frame = as_data_frame) +} diff --git a/src/arrow/r/tests/testthat/helper-roundtrip.R b/src/arrow/r/tests/testthat/helper-roundtrip.R new file mode 100644 index 000000000..80bcb42f1 --- /dev/null +++ b/src/arrow/r/tests/testthat/helper-roundtrip.R @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +expect_array_roundtrip <- function(x, type, as = NULL) { + a <- Array$create(x, type = as) + expect_equal(a$type, type) + expect_identical(length(a), length(x)) + if (!inherits(type, c("ListType", "LargeListType", "FixedSizeListType"))) { + # TODO: revisit how missingness works with ListArrays + # R list objects don't handle missingness the same way as other vectors. + # Is there some vctrs thing we should do on the roundtrip back to R? + expect_as_vector(is.na(a), is.na(x)) + } + roundtrip <- as.vector(a) + expect_equal(roundtrip, x, ignore_attr = TRUE) + # Make sure the storage mode is the same on roundtrip (esp. integer vs. numeric) + expect_identical(typeof(roundtrip), typeof(x)) + + if (length(x)) { + a_sliced <- a$Slice(1) + x_sliced <- x[-1] + expect_equal(a_sliced$type, type) + expect_identical(length(a_sliced), length(x_sliced)) + if (!inherits(type, c("ListType", "LargeListType", "FixedSizeListType"))) { + expect_as_vector(is.na(a_sliced), is.na(x_sliced)) + } + expect_as_vector(a_sliced, x_sliced, ignore_attr = TRUE) + } + invisible(a) +} diff --git a/src/arrow/r/tests/testthat/helper-skip.R b/src/arrow/r/tests/testthat/helper-skip.R new file mode 100644 index 000000000..4256ec4ab --- /dev/null +++ b/src/arrow/r/tests/testthat/helper-skip.R @@ -0,0 +1,81 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +build_features <- c( + arrow_info()$capabilities, + # Special handling for "uncompressed", for tests that iterate over compressions + uncompressed = TRUE +) + +skip_if_not_available <- function(feature) { + if (feature == "re2") { + # RE2 does not support valgrind (on purpose): https://github.com/google/re2/issues/177 + skip_on_valgrind() + } + + yes <- feature %in% names(build_features) && build_features[feature] + if (!yes) { + skip(paste("Arrow C++ not built with", feature)) + } +} + +skip_if_no_pyarrow <- function() { + skip_on_valgrind() + skip_on_os("windows") + + skip_if_not_installed("reticulate") + if (!reticulate::py_module_available("pyarrow")) { + skip("pyarrow not available for testing") + } +} + +skip_if_not_dev_mode <- function() { + skip_if_not( + identical(tolower(Sys.getenv("ARROW_R_DEV")), "true"), + "environment variable ARROW_R_DEV" + ) +} + +skip_if_not_running_large_memory_tests <- function() { + skip_if_not( + identical(tolower(Sys.getenv("ARROW_LARGE_MEMORY_TESTS")), "true"), + "environment variable ARROW_LARGE_MEMORY_TESTS" + ) +} + +skip_on_valgrind <- function() { + # This does not actually skip on valgrind because we can't exactly detect it. + # Instead, it skips on CRAN when the OS is linux + and the R version is development + # (which is where valgrind is run as of this code) + linux_dev <- identical(tolower(Sys.info()[["sysname"]]), "linux") && + grepl("devel", R.version.string) + + if (linux_dev) { + skip_on_cran() + } +} + +skip_if_r_version <- function(r_version) { + if (getRversion() <= r_version) { + skip(paste("R version:", getRversion())) + } +} + +process_is_running <- function(x) { + cmd <- sprintf("ps aux | grep '%s' | grep -v grep", x) + tryCatch(system(cmd, ignore.stdout = TRUE) == 0, error = function(e) FALSE) +} diff --git a/src/arrow/r/tests/testthat/latin1.R b/src/arrow/r/tests/testthat/latin1.R new file mode 100644 index 000000000..150192d31 --- /dev/null +++ b/src/arrow/r/tests/testthat/latin1.R @@ -0,0 +1,76 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +x <- iconv("Veitingastair", to = "latin1") +df <- tibble::tibble( + chr = x, + fct = as.factor(x) +) +names(df) <- iconv(paste(x, names(df), sep = "_"), to = "latin1") +df_struct <- tibble::tibble(a = df) + +raw_schema <- list(utf8(), dictionary(int8(), utf8())) +names(raw_schema) <- names(df) + +# Confirm setup +expect_identical(Encoding(x), "latin1") +expect_identical(Encoding(names(df)), c("latin1", "latin1")) +expect_identical(Encoding(df[[1]]), "latin1") +expect_identical(Encoding(levels(df[[2]])), "latin1") + +# Array +expect_identical(as.vector(Array$create(x)), x) +# struct +expect_identical(as.vector(Array$create(df)), df) + +# ChunkedArray +expect_identical(as.vector(ChunkedArray$create(x)), x) +# struct +expect_identical(as.vector(ChunkedArray$create(df)), df) + +# Table (including field name) +expect_identical(as.data.frame(Table$create(df)), df) +expect_identical(as.data.frame(Table$create(df_struct)), df_struct) + +# RecordBatch +expect_identical(as.data.frame(record_batch(df)), df) +expect_identical(as.data.frame(record_batch(df_struct)), df_struct) + +# Schema field name +df_schema <- do.call(schema, raw_schema) +expect_identical(names(df_schema), names(df)) + +df_struct_schema <- schema(a = do.call(struct, raw_schema)) +# StructType doesn't expose names (in C++) +# expect_identical(names(df_struct_schema$a), names(df)) + +# Create table/batch with schema +expect_identical(as.data.frame(Table$create(df, schema = df_schema)), df) +expect_identical(as.data.frame(Table$create(df_struct, schema = df_struct_schema)), df_struct) +expect_identical(as.data.frame(record_batch(df, schema = df_schema)), df) +expect_identical(as.data.frame(record_batch(df_struct, schema = df_struct_schema)), df_struct) + +# Serialization +feather_file <- tempfile() +write_feather(df_struct, feather_file) +expect_identical(read_feather(feather_file), df_struct) + +if (arrow_with_parquet()) { + parquet_file <- tempfile() + write_parquet(df, parquet_file) # Parquet doesn't yet support nested types + expect_identical(read_parquet(parquet_file), df) +} diff --git a/src/arrow/r/tests/testthat/test-Array.R b/src/arrow/r/tests/testthat/test-Array.R new file mode 100644 index 000000000..ce23c2609 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-Array.R @@ -0,0 +1,963 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_that("Integer Array", { + ints <- c(1:10, 1:10, 1:5) + x <- expect_array_roundtrip(ints, int32()) +}) + +test_that("binary Array", { + # if the type is given, we just need a list of raw vectors + bin <- list(as.raw(1:10), as.raw(1:10)) + expect_array_roundtrip(bin, binary(), as = binary()) + expect_array_roundtrip(bin, large_binary(), as = large_binary()) + expect_array_roundtrip(bin, fixed_size_binary(10), as = fixed_size_binary(10)) + + bin[[1L]] <- as.raw(1:20) + expect_error(Array$create(bin, fixed_size_binary(10))) + + # otherwise the arrow type is deduced from the R classes + bin <- vctrs::new_vctr( + list(as.raw(1:10), as.raw(11:20)), + class = "arrow_binary" + ) + expect_array_roundtrip(bin, binary()) + + bin <- vctrs::new_vctr( + list(as.raw(1:10), as.raw(11:20)), + class = "arrow_large_binary" + ) + expect_array_roundtrip(bin, large_binary()) + + bin <- vctrs::new_vctr( + list(as.raw(1:10), as.raw(11:20)), + class = "arrow_fixed_size_binary", + byte_width = 10L + ) + expect_array_roundtrip(bin, fixed_size_binary(byte_width = 10)) + + # degenerate cases + skip_on_valgrind() # valgrind errors on these tests ARROW-12638 + bin <- vctrs::new_vctr( + list(1:10), + class = "arrow_binary" + ) + expect_error(Array$create(bin)) + + bin <- vctrs::new_vctr( + list(1:10), + ptype = raw(), + class = "arrow_large_binary" + ) + expect_error(Array$create(bin)) + + bin <- vctrs::new_vctr( + list(1:10), + class = "arrow_fixed_size_binary", + byte_width = 10 + ) + expect_error(Array$create(bin)) + + bin <- vctrs::new_vctr( + list(as.raw(1:5)), + class = "arrow_fixed_size_binary", + byte_width = 10 + ) + expect_error(Array$create(bin)) + + bin <- vctrs::new_vctr( + list(as.raw(1:5)), + class = "arrow_fixed_size_binary" + ) + expect_error(Array$create(bin)) +}) + +test_that("Slice() and RangeEquals()", { + ints <- c(1:10, 101:110, 201:205) + x <- Array$create(ints) + + y <- x$Slice(10) + expect_equal(y$type, int32()) + expect_equal(length(y), 15L) + expect_as_vector(y, c(101:110, 201:205)) + expect_true(x$RangeEquals(y, 10, 24)) + expect_false(x$RangeEquals(y, 9, 23)) + expect_false(x$RangeEquals(y, 11, 24)) + + z <- x$Slice(10, 5) + expect_as_vector(z, c(101:105)) + expect_true(x$RangeEquals(z, 10, 15, 0)) + + # Input validation + expect_error(x$Slice("ten")) + expect_error(x$Slice(NA_integer_), "Slice 'offset' cannot be NA") + expect_error(x$Slice(NA), "Slice 'offset' cannot be NA") + expect_error(x$Slice(10, "ten")) + expect_error(x$Slice(10, NA_integer_), "Slice 'length' cannot be NA") + expect_error(x$Slice(NA_integer_, NA_integer_), "Slice 'offset' cannot be NA") + expect_error(x$Slice(c(10, 10))) + expect_error(x$Slice(10, c(10, 10))) + expect_error(x$Slice(1000), "Slice 'offset' greater than array length") + expect_error(x$Slice(-1), "Slice 'offset' cannot be negative") + expect_error(z$Slice(10, 10), "Slice 'offset' greater than array length") + expect_error(x$Slice(10, -1), "Slice 'length' cannot be negative") + expect_error(x$Slice(-1, 10), "Slice 'offset' cannot be negative") + + expect_warning(x$Slice(10, 15), NA) + expect_warning( + overslice <- x$Slice(10, 16), + "Slice 'length' greater than available length" + ) + expect_equal(length(overslice), 15) + expect_warning(z$Slice(2, 10), "Slice 'length' greater than available length") + + expect_error(x$RangeEquals(10, 24, 0), 'other must be a "Array"') + expect_error(x$RangeEquals(y, NA, 24), "'start_idx' cannot be NA") + expect_error(x$RangeEquals(y, 10, NA), "'end_idx' cannot be NA") + expect_error(x$RangeEquals(y, 10, 24, NA), "'other_start_idx' cannot be NA") + expect_error(x$RangeEquals(y, "ten", 24)) + + skip("TODO: (if anyone uses RangeEquals)") + expect_error(x$RangeEquals(y, 10, 2400, 0)) # does not error + expect_error(x$RangeEquals(y, 1000, 24, 0)) # does not error + expect_error(x$RangeEquals(y, 10, 24, 1000)) # does not error +}) + +test_that("Double Array", { + dbls <- c(1, 2, 3, 4, 5, 6) + x_dbl <- expect_array_roundtrip(dbls, float64()) +}) + +test_that("Array print method includes type", { + x <- Array$create(c(1:10, 1:10, 1:5)) + expect_output(print(x), "Array\n<int32>\n[\n", fixed = TRUE) +}) + +test_that("Array supports NA", { + x_int <- Array$create(as.integer(c(1:10, NA))) + x_dbl <- Array$create(as.numeric(c(1:10, NA))) + expect_true(x_int$IsValid(0)) + expect_true(x_dbl$IsValid(0L)) + expect_true(x_int$IsNull(10L)) + expect_true(x_dbl$IsNull(10)) + + expect_as_vector(is.na(x_int), c(rep(FALSE, 10), TRUE)) + expect_as_vector(is.na(x_dbl), c(rep(FALSE, 10), TRUE)) + + # Input validation + expect_error(x_int$IsValid("ten")) + expect_error(x_int$IsNull("ten")) + expect_error(x_int$IsValid(c(10, 10))) + expect_error(x_int$IsNull(c(10, 10))) + expect_error(x_int$IsValid(NA), "'i' cannot be NA") + expect_error(x_int$IsNull(NA), "'i' cannot be NA") + expect_error(x_int$IsValid(1000), "subscript out of bounds") + expect_error(x_int$IsValid(-1), "subscript out of bounds") + expect_error(x_int$IsNull(1000), "subscript out of bounds") + expect_error(x_int$IsNull(-1), "subscript out of bounds") +}) + +test_that("Array support null type (ARROW-7064)", { + expect_array_roundtrip(vctrs::unspecified(10), null()) +}) + +test_that("Array supports logical vectors (ARROW-3341)", { + # with NA + x <- sample(c(TRUE, FALSE, NA), 1000, replace = TRUE) + expect_array_roundtrip(x, bool()) + + # without NA + x <- sample(c(TRUE, FALSE), 1000, replace = TRUE) + expect_array_roundtrip(x, bool()) +}) + +test_that("Array supports character vectors (ARROW-3339)", { + # without NA + expect_array_roundtrip(c("itsy", "bitsy", "spider"), utf8()) + expect_array_roundtrip(c("itsy", "bitsy", "spider"), large_utf8(), as = large_utf8()) + + # with NA + expect_array_roundtrip(c("itsy", NA, "spider"), utf8()) + expect_array_roundtrip(c("itsy", NA, "spider"), large_utf8(), as = large_utf8()) +}) + +test_that("Character vectors > 2GB become large_utf8", { + skip_on_cran() + skip_if_not_running_large_memory_tests() + big <- make_big_string() + expect_array_roundtrip(big, large_utf8()) +}) + +test_that("empty arrays are supported", { + expect_array_roundtrip(character(), utf8()) + expect_array_roundtrip(character(), large_utf8(), as = large_utf8()) + expect_array_roundtrip(integer(), int32()) + expect_array_roundtrip(numeric(), float64()) + expect_array_roundtrip(factor(character()), dictionary(int8(), utf8())) + expect_array_roundtrip(logical(), bool()) +}) + +test_that("array with all nulls are supported", { + nas <- c(NA, NA) + expect_array_roundtrip(as.character(nas), utf8()) + expect_array_roundtrip(as.integer(nas), int32()) + expect_array_roundtrip(as.numeric(nas), float64()) + expect_array_roundtrip(as.factor(nas), dictionary(int8(), utf8())) + expect_array_roundtrip(as.logical(nas), bool()) +}) + +test_that("Array supports unordered factors (ARROW-3355)", { + # without NA + f <- factor(c("itsy", "bitsy", "spider", "spider")) + expect_array_roundtrip(f, dictionary(int8(), utf8())) + + # with NA + f <- factor(c("itsy", "bitsy", NA, "spider", "spider")) + expect_array_roundtrip(f, dictionary(int8(), utf8())) +}) + +test_that("Array supports ordered factors (ARROW-3355)", { + # without NA + f <- ordered(c("itsy", "bitsy", "spider", "spider")) + arr_fac <- expect_array_roundtrip(f, dictionary(int8(), utf8(), ordered = TRUE)) + expect_true(arr_fac$ordered) + + # with NA + f <- ordered(c("itsy", "bitsy", NA, "spider", "spider")) + expect_array_roundtrip(f, dictionary(int8(), utf8(), ordered = TRUE)) +}) + +test_that("array supports Date (ARROW-3340)", { + d <- Sys.Date() + 1:10 + expect_array_roundtrip(d, date32()) + + d[5] <- NA + expect_array_roundtrip(d, date32()) +}) + +test_that("array supports POSIXct (ARROW-3340)", { + times <- lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10 + expect_array_roundtrip(times, timestamp("us", "UTC")) + + times[5] <- NA + expect_array_roundtrip(times, timestamp("us", "UTC")) + + times2 <- lubridate::ymd_hms("2018-10-07 19:04:05", tz = "US/Eastern") + 1:10 + expect_array_roundtrip(times2, timestamp("us", "US/Eastern")) +}) + +test_that("array supports POSIXct without timezone", { + # Make sure timezone is not set + withr::with_envvar(c(TZ = ""), { + times <- strptime("2019-02-03 12:34:56", format = "%Y-%m-%d %H:%M:%S") + 1:10 + expect_array_roundtrip(times, timestamp("us", "")) + + # Also test the INTSXP code path + skip("Ingest_POSIXct only implemented for REALSXP") + times_int <- as.integer(times) + attributes(times_int) <- attributes(times) + expect_array_roundtrip(times_int, timestamp("us", "")) + }) +}) + +test_that("Timezone handling in Arrow roundtrip (ARROW-3543)", { + # Write a feather file as that's what the initial bug report used + df <- tibble::tibble( + no_tz = lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10, + yes_tz = lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Pacific/Marquesas") + 1:10 + ) + if (!identical(Sys.timezone(), "Pacific/Marquesas")) { + # Confirming that the columns are in fact different + expect_false(any(df$no_tz == df$yes_tz)) + } + feather_file <- tempfile() + on.exit(unlink(feather_file)) + write_feather(df, feather_file) + expect_identical(read_feather(feather_file), df) +}) + +test_that("array supports integer64", { + x <- bit64::as.integer64(1:10) + MAX_INT + expect_array_roundtrip(x, int64()) + + x[4] <- NA + expect_array_roundtrip(x, int64()) + + # all NA int64 (ARROW-3795) + all_na <- Array$create(bit64::as.integer64(NA)) + expect_type_equal(all_na, int64()) + expect_true(as.vector(is.na(all_na))) +}) + +test_that("array supports difftime", { + time <- hms::hms(56, 34, 12) + expect_array_roundtrip(c(time, time), time32("s")) + expect_array_roundtrip(vctrs::vec_c(NA, time), time32("s")) +}) + +test_that("support for NaN (ARROW-3615)", { + x <- c(1, NA, NaN, -1) + y <- Array$create(x) + expect_true(y$IsValid(2)) + expect_equal(y$null_count, 1L) +}) + +test_that("is.nan() evalutes to FALSE on NA (for consistency with base R)", { + x <- c(1.0, NA, NaN, -1.0) + compare_expression(is.nan(.input), x) +}) + +test_that("is.nan() evalutes to FALSE on non-floats (for consistency with base R)", { + x <- c(1L, 2L, 3L) + y <- c("foo", "bar") + compare_expression(is.nan(.input), x) + compare_expression(is.nan(.input), y) +}) + +test_that("is.na() evalutes to TRUE on NaN (for consistency with base R)", { + x <- c(1, NA, NaN, -1) + compare_expression(is.na(.input), x) +}) + +test_that("integer types casts (ARROW-3741)", { + # Defining some type groups for use here and in the following tests + int_types <- c(int8(), int16(), int32(), int64()) + uint_types <- c(uint8(), uint16(), uint32(), uint64()) + float_types <- c(float32(), float64()) # float16() not really supported in C++ yet + + a <- Array$create(c(1:10, NA)) + for (type in c(int_types, uint_types)) { + casted <- a$cast(type) + expect_equal(casted$type, type) + expect_identical(as.vector(is.na(casted)), c(rep(FALSE, 10), TRUE)) + } +}) + +test_that("integer types cast safety (ARROW-3741, ARROW-5541)", { + a <- Array$create(-(1:10)) + for (type in uint_types) { + expect_error(a$cast(type), regexp = "Integer value -1 not in range") + expect_error(a$cast(type, safe = FALSE), NA) + } +}) + +test_that("float types casts (ARROW-3741)", { + x <- c(1, 2, 3, NA) + a <- Array$create(x) + for (type in float_types) { + casted <- a$cast(type) + expect_equal(casted$type, type) + expect_identical(as.vector(is.na(casted)), c(rep(FALSE, 3), TRUE)) + expect_identical(as.vector(casted), x) + } +}) + +test_that("cast to half float works", { + skip("Need halffloat support: https://issues.apache.org/jira/browse/ARROW-3802") + a <- Array$create(1:4) + a_f16 <- a$cast(float16()) + expect_type_equal(a_16$type, float16()) +}) + +test_that("cast input validation", { + a <- Array$create(1:4) + expect_error(a$cast("not a type"), "type must be a DataType, not character") +}) + +test_that("Array$create() supports the type= argument. conversion from INTSXP and int64 to all int types", { + num_int32 <- 12L + num_int64 <- bit64::as.integer64(10) + + types <- c( + int_types, + uint_types, + float_types, + double() # not actually a type, a base R function but should be alias for float64 + ) + for (type in types) { + expect_type_equal(Array$create(num_int32, type = type)$type, as_type(type)) + expect_type_equal(Array$create(num_int64, type = type)$type, as_type(type)) + } + + # Input validation + expect_error( + Array$create(5, type = "not a type"), + "type must be a DataType, not character" + ) +}) + +test_that("Array$create() aborts on overflow", { + expect_error(Array$create(128L, type = int8())) + expect_error(Array$create(-129L, type = int8())) + + expect_error(Array$create(256L, type = uint8())) + expect_error(Array$create(-1L, type = uint8())) + + expect_error(Array$create(32768L, type = int16())) + expect_error(Array$create(-32769L, type = int16())) + + expect_error(Array$create(65536L, type = uint16())) + expect_error(Array$create(-1L, type = uint16())) + + expect_error(Array$create(65536L, type = uint16())) + expect_error(Array$create(-1L, type = uint16())) + + expect_error(Array$create(bit64::as.integer64(2^31), type = int32())) + expect_error(Array$create(bit64::as.integer64(2^32), type = uint32())) +}) + +test_that("Array$create() does not convert doubles to integer", { + for (type in c(int_types, uint_types)) { + a <- Array$create(10, type = type) + expect_type_equal(a$type, type) + expect_true(as.vector(a) == 10L) + } +}) + +test_that("Array$create() converts raw vectors to uint8 arrays (ARROW-3794)", { + expect_type_equal(Array$create(as.raw(1:10))$type, uint8()) +}) + +test_that("Array<int8>$as_vector() converts to integer (ARROW-3794)", { + i8 <- (-128):127 + a <- Array$create(i8)$cast(int8()) + expect_type_equal(a, int8()) + expect_as_vector(a, i8) + + u8 <- 0:255 + a <- Array$create(u8)$cast(uint8()) + expect_type_equal(a, uint8()) + expect_as_vector(a, u8) +}) + +test_that("Arrays of {,u}int{32,64} convert to integer if they can fit", { + u32 <- Array$create(1L)$cast(uint32()) + expect_identical(as.vector(u32), 1L) + + u64 <- Array$create(1L)$cast(uint64()) + expect_identical(as.vector(u64), 1L) + + i64 <- Array$create(bit64::as.integer64(1:10)) + expect_identical(as.vector(i64), 1:10) +}) + +test_that("Arrays of uint{32,64} convert to numeric if they can't fit integer", { + u32 <- Array$create(bit64::as.integer64(1) + MAX_INT)$cast(uint32()) + expect_identical(as.vector(u32), 1 + MAX_INT) + + u64 <- Array$create(bit64::as.integer64(1) + MAX_INT)$cast(uint64()) + expect_identical(as.vector(u64), 1 + MAX_INT) +}) + +test_that("Array$create() recognise arrow::Array (ARROW-3815)", { + a <- Array$create(1:10) + expect_equal(a, Array$create(a)) +}) + +test_that("Array$create() handles data frame -> struct arrays (ARROW-3811)", { + df <- tibble::tibble(x = 1:10, y = x / 2, z = letters[1:10]) + a <- Array$create(df) + expect_type_equal(a$type, struct(x = int32(), y = float64(), z = utf8())) + expect_as_vector(a, df) + + df <- structure( + list(col = structure(list(structure(list(list(structure(1))), class = "inner")), class = "outer")), + class = "data.frame", row.names = c(NA, -1L) + ) + a <- Array$create(df) + expect_type_equal(a$type, struct(col = list_of(list_of(list_of(float64()))))) + expect_as_vector(a, df, ignore_attr = TRUE) +}) + +test_that("StructArray methods", { + df <- tibble::tibble(x = 1:10, y = x / 2, z = letters[1:10]) + a <- Array$create(df) + expect_equal(a$x, Array$create(df$x)) + expect_equal(a[["x"]], Array$create(df$x)) + expect_equal(a[[1]], Array$create(df$x)) + expect_identical(names(a), c("x", "y", "z")) + expect_identical(dim(a), c(10L, 3L)) +}) + +test_that("Array$create() can handle data frame with custom struct type (not inferred)", { + df <- tibble::tibble(x = 1:10, y = 1:10) + type <- struct(x = float64(), y = int16()) + a <- Array$create(df, type = type) + expect_type_equal(a$type, type) + + type <- struct(x = float64(), y = int16(), z = int32()) + expect_error( + Array$create(df, type = type), + regexp = "Number of fields in struct.* incompatible with number of columns in the data frame" + ) + + type <- struct(y = int16(), x = float64()) + expect_error( + Array$create(df, type = type), + regexp = "Field name in position.*does not match the name of the column of the data frame" + ) + + type <- struct(x = float64(), y = utf8()) + expect_error(Array$create(df, type = type), regexp = "Invalid") +}) + +test_that("Array$create() supports tibble with no columns (ARROW-8354)", { + df <- tibble::tibble() + expect_equal(Array$create(df)$as_vector(), df) +}) + +test_that("Array$create() handles vector -> list arrays (ARROW-7662)", { + # Should be able to create an empty list with a type hint. + expect_r6_class(Array$create(list(), list_of(bool())), "ListArray") + + # logical + expect_array_roundtrip(list(NA), list_of(bool())) + expect_array_roundtrip(list(logical(0)), list_of(bool())) + expect_array_roundtrip(list(c(TRUE), c(FALSE), c(FALSE, TRUE)), list_of(bool())) + expect_array_roundtrip(list(c(TRUE), c(FALSE), NA, logical(0), c(FALSE, NA, TRUE)), list_of(bool())) + + # integer + expect_array_roundtrip(list(NA_integer_), list_of(int32())) + expect_array_roundtrip(list(integer(0)), list_of(int32())) + expect_array_roundtrip(list(1:2, 3:4, 12:18), list_of(int32())) + expect_array_roundtrip(list(c(1:2), NA_integer_, integer(0), c(12:18, NA_integer_)), list_of(int32())) + + # numeric + expect_array_roundtrip(list(NA_real_), list_of(float64())) + expect_array_roundtrip(list(numeric(0)), list_of(float64())) + expect_array_roundtrip(list(1, c(2, 3), 4), list_of(float64())) + expect_array_roundtrip(list(1, numeric(0), c(2, 3, NA_real_), 4), list_of(float64())) + + # character + expect_array_roundtrip(list(NA_character_), list_of(utf8())) + expect_array_roundtrip(list(character(0)), list_of(utf8())) + expect_array_roundtrip(list("itsy", c("bitsy", "spider"), c("is")), list_of(utf8())) + expect_array_roundtrip(list("itsy", character(0), c("bitsy", "spider", NA_character_), c("is")), list_of(utf8())) + + # factor + expect_array_roundtrip(list(factor(c("b", "a"), levels = c("a", "b"))), list_of(dictionary(int8(), utf8()))) + expect_array_roundtrip(list(factor(NA, levels = c("a", "b"))), list_of(dictionary(int8(), utf8()))) + + # struct + expect_array_roundtrip( + list(tibble::tibble(a = integer(0), b = integer(0), c = character(0), d = logical(0))), + list_of(struct(a = int32(), b = int32(), c = utf8(), d = bool())) + ) + expect_array_roundtrip( + list(tibble::tibble(a = list(integer()))), + list_of(struct(a = list_of(int32()))) + ) + # degenerated data frame + df <- structure(list(x = 1:2, y = 1), class = "data.frame", row.names = 1:2) + expect_error(Array$create(list(df))) +}) + +test_that("Array$create() handles vector -> large list arrays", { + # Should be able to create an empty list with a type hint. + expect_r6_class(Array$create(list(), type = large_list_of(bool())), "LargeListArray") + + # logical + expect_array_roundtrip(list(NA), large_list_of(bool()), as = large_list_of(bool())) + expect_array_roundtrip(list(logical(0)), large_list_of(bool()), as = large_list_of(bool())) + expect_array_roundtrip(list(c(TRUE), c(FALSE), c(FALSE, TRUE)), large_list_of(bool()), as = large_list_of(bool())) + expect_array_roundtrip( + list(c(TRUE), c(FALSE), NA, logical(0), c(FALSE, NA, TRUE)), + large_list_of(bool()), + as = large_list_of(bool()) + ) + + # integer + expect_array_roundtrip(list(NA_integer_), large_list_of(int32()), as = large_list_of(int32())) + expect_array_roundtrip(list(integer(0)), large_list_of(int32()), as = large_list_of(int32())) + expect_array_roundtrip(list(1:2, 3:4, 12:18), large_list_of(int32()), as = large_list_of(int32())) + expect_array_roundtrip( + list(c(1:2), NA_integer_, integer(0), c(12:18, NA_integer_)), + large_list_of(int32()), + as = large_list_of(int32()) + ) + + # numeric + expect_array_roundtrip(list(NA_real_), large_list_of(float64()), as = large_list_of(float64())) + expect_array_roundtrip(list(numeric(0)), large_list_of(float64()), as = large_list_of(float64())) + expect_array_roundtrip(list(1, c(2, 3), 4), large_list_of(float64()), as = large_list_of(float64())) + expect_array_roundtrip( + list(1, numeric(0), c(2, 3, NA_real_), 4), + large_list_of(float64()), + as = large_list_of(float64()) + ) + + # character + expect_array_roundtrip(list(NA_character_), large_list_of(utf8()), as = large_list_of(utf8())) + expect_array_roundtrip(list(character(0)), large_list_of(utf8()), as = large_list_of(utf8())) + expect_array_roundtrip( + list("itsy", c("bitsy", "spider"), c("is")), + large_list_of(utf8()), + as = large_list_of(utf8()) + ) + expect_array_roundtrip( + list("itsy", character(0), c("bitsy", "spider", NA_character_), c("is")), + large_list_of(utf8()), + as = large_list_of(utf8()) + ) + + # factor + expect_array_roundtrip( + list(factor(c("b", "a"), levels = c("a", "b"))), + large_list_of(dictionary(int8(), utf8())), + as = large_list_of(dictionary(int8(), utf8())) + ) + expect_array_roundtrip( + list(factor(NA, levels = c("a", "b"))), + large_list_of(dictionary(int8(), utf8())), + as = large_list_of(dictionary(int8(), utf8())) + ) + + # struct + expect_array_roundtrip( + list(tibble::tibble(a = integer(0), b = integer(0), c = character(0), d = logical(0))), + large_list_of(struct(a = int32(), b = int32(), c = utf8(), d = bool())), + as = large_list_of(struct(a = int32(), b = int32(), c = utf8(), d = bool())) + ) + expect_array_roundtrip( + list(tibble::tibble(a = list(integer()))), + large_list_of(struct(a = list_of(int32()))), + as = large_list_of(struct(a = list_of(int32()))) + ) +}) + +test_that("Array$create() handles vector -> fixed size list arrays", { + # Should be able to create an empty list with a type hint. + expect_r6_class(Array$create(list(), type = fixed_size_list_of(bool(), 20)), "FixedSizeListArray") + + # logical + expect_array_roundtrip(list(NA), fixed_size_list_of(bool(), 1L), as = fixed_size_list_of(bool(), 1L)) + expect_array_roundtrip( + list(c(TRUE, FALSE), c(FALSE, TRUE)), + fixed_size_list_of(bool(), 2L), + as = fixed_size_list_of(bool(), 2L) + ) + expect_array_roundtrip( + list(c(TRUE), c(FALSE), NA), + fixed_size_list_of(bool(), 1L), + as = fixed_size_list_of(bool(), 1L) + ) + + # integer + expect_array_roundtrip(list(NA_integer_), fixed_size_list_of(int32(), 1L), as = fixed_size_list_of(int32(), 1L)) + expect_array_roundtrip(list(1:2, 3:4, 11:12), fixed_size_list_of(int32(), 2L), as = fixed_size_list_of(int32(), 2L)) + expect_array_roundtrip( + list(c(1:2), c(NA_integer_, 3L)), + fixed_size_list_of(int32(), 2L), + as = fixed_size_list_of(int32(), 2L) + ) + + # numeric + expect_array_roundtrip(list(NA_real_), fixed_size_list_of(float64(), 1L), as = fixed_size_list_of(float64(), 1L)) + expect_array_roundtrip( + list(c(1, 2), c(2, 3)), + fixed_size_list_of(float64(), 2L), + as = fixed_size_list_of(float64(), 2L) + ) + expect_array_roundtrip( + list(c(1, 2), c(NA_real_, 4)), + fixed_size_list_of(float64(), 2L), + as = fixed_size_list_of(float64(), 2L) + ) + + # character + expect_array_roundtrip(list(NA_character_), fixed_size_list_of(utf8(), 1L), as = fixed_size_list_of(utf8(), 1L)) + expect_array_roundtrip( + list(c("itsy", "bitsy"), c("spider", "is"), c(NA_character_, NA_character_), c("", "")), + fixed_size_list_of(utf8(), 2L), + as = fixed_size_list_of(utf8(), 2L) + ) + + # factor + expect_array_roundtrip( + list(factor(c("b", "a"), levels = c("a", "b"))), + fixed_size_list_of(dictionary(int8(), utf8()), 2L), + as = fixed_size_list_of(dictionary(int8(), utf8()), 2L) + ) + + # struct + expect_array_roundtrip( + list(tibble::tibble(a = 1L, b = 1L, c = "", d = TRUE)), + fixed_size_list_of(struct(a = int32(), b = int32(), c = utf8(), d = bool()), 1L), + as = fixed_size_list_of(struct(a = int32(), b = int32(), c = utf8(), d = bool()), 1L) + ) + expect_array_roundtrip( + list(tibble::tibble(a = list(1L))), + fixed_size_list_of(struct(a = list_of(int32())), 1L), + as = fixed_size_list_of(struct(a = list_of(int32())), 1L) + ) + expect_array_roundtrip( + list(tibble::tibble(a = list(1L))), + list_of(struct(a = fixed_size_list_of(int32(), 1L))), + as = list_of(struct(a = fixed_size_list_of(int32(), 1L))) + ) +}) + +test_that("Handling string data with embedded nuls", { + raws <- structure(list( + as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)), + as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)), + as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00 + as.raw(c(0x66, 0x00, 0x00, 0x61, 0x00, 0x6e)), # multiple nuls + as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)), + as.raw(c(0x74, 0x76)) + ), + class = c("arrow_binary", "vctrs_vctr", "list") + ) + expect_error( + rawToChar(raws[[3]]), + "embedded nul in string: 'ma\\0n'", # See? + fixed = TRUE + ) + array_with_nul <- Array$create(raws)$cast(utf8()) + + # The behavior of the warnings/errors is slightly different with and without + # altrep. Without it (i.e. 3.5.0 and below, the error would trigger immediately + # on `as.vector()` where as with it, the error only happens on materialization) + skip_if_r_version("3.5.0") + + # no error on conversion, because altrep laziness + v <- expect_error(as.vector(array_with_nul), NA) + + # attempting materialization -> error + + expect_error(v[], + paste0( + "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow ", + "to R, set options(arrow.skip_nul = TRUE)" + ), + fixed = TRUE + ) + + # also error on materializing v[3] + expect_error(v[3], + paste0( + "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow ", + "to R, set options(arrow.skip_nul = TRUE)" + ), + fixed = TRUE + ) + + withr::with_options(list(arrow.skip_nul = TRUE), { + # no warning yet because altrep laziness + v <- as.vector(array_with_nul) + + expect_warning( + expect_identical( + v[], + c("person", "woman", "man", "fan", "camera", "tv") + ), + "Stripping '\\0' (nul) from character vector", + fixed = TRUE + ) + + v <- as.vector(array_with_nul) + expect_warning( + expect_identical(v[3], "man"), + "Stripping '\\0' (nul) from character vector", + fixed = TRUE + ) + + v <- as.vector(array_with_nul) + expect_warning( + expect_identical(v[4], "fan"), + "Stripping '\\0' (nul) from character vector", + fixed = TRUE + ) + }) +}) + +test_that("Array$create() should have helpful error", { + expect_error(Array$create(list(numeric(0)), list_of(bool())), "Expecting a logical vector") + + lgl <- logical(0) + int <- integer(0) + num <- numeric(0) + char <- character(0) + expect_error(Array$create(list()), "Requires at least one element to infer") + expect_error(Array$create(list(lgl, lgl, int)), "Expecting a logical vector") + expect_error(Array$create(list(char, num, char)), "Expecting a character vector") +}) + +test_that("Array$View() (ARROW-6542)", { + a <- Array$create(1:3) + b <- a$View(float32()) + expect_equal(b$type, float32()) + expect_equal(length(b), 3L) + + # Input validation + expect_error(a$View("not a type"), "type must be a DataType, not character") +}) + +test_that("Array$Validate()", { + a <- Array$create(1:10) + expect_error(a$Validate(), NA) +}) + +test_that("is.Array", { + a <- Array$create(1, type = int32()) + expect_true(is.Array(a)) + expect_true(is.Array(a, "int32")) + expect_true(is.Array(a, c("int32", "int16"))) + expect_false(is.Array(a, "utf8")) + expect_true(is.Array(a$View(float32())), "float32") + expect_false(is.Array(1)) + expect_true(is.Array(ChunkedArray$create(1, 2))) +}) + +test_that("Array$Take()", { + a <- Array$create(10:20) + expect_as_vector(a$Take(c(4, 2)), c(14, 12)) +}) + +test_that("[ method on Array", { + vec <- 11:20 + a <- Array$create(vec) + expect_as_vector(a[5:9], vec[5:9]) + expect_as_vector(a[c(9, 3, 5)], vec[c(9, 3, 5)]) + expect_as_vector(a[rep(c(TRUE, FALSE), 5)], vec[c(1, 3, 5, 7, 9)]) + expect_as_vector(a[rep(c(TRUE, FALSE, NA, FALSE, TRUE), 2)], c(11, NA, 15, 16, NA, 20)) + expect_as_vector(a[-4], vec[-4]) + expect_as_vector(a[-1], vec[-1]) +}) + +test_that("[ accepts Arrays and otherwise handles bad input", { + vec <- 11:20 + a <- Array$create(vec) + ind <- c(9, 3, 5) + expect_error( + a[Array$create(ind)], + "Cannot extract rows with an Array of type double" + ) + expect_as_vector(a[Array$create(ind - 1, type = int8())], vec[ind]) + expect_as_vector(a[Array$create(ind - 1, type = uint8())], vec[ind]) + expect_as_vector(a[ChunkedArray$create(8, 2, 4, type = uint8())], vec[ind]) + + filt <- seq_along(vec) %in% ind + expect_as_vector(a[Array$create(filt)], vec[filt]) + + expect_error( + a["string"], + "Cannot extract rows with an object of class character" + ) +}) + +test_that("%in% works on dictionary arrays", { + a1 <- Array$create(as.factor(c("A", "B", "C"))) + a2 <- DictionaryArray$create(c(0L, 1L, 2L), c(4.5, 3.2, 1.1)) + c1 <- Array$create(c(FALSE, TRUE, FALSE)) + c2 <- Array$create(c(FALSE, FALSE, FALSE)) + b1 <- Array$create("B") + b2 <- Array$create(5.4) + + expect_equal(is_in(a1, b1), c1) + expect_equal(is_in(a2, b2), c2) + expect_error(is_in(a1, b2)) +}) + +test_that("[ accepts Expressions", { + vec <- 11:20 + a <- Array$create(vec) + b <- Array$create(1:10) + expect_as_vector(a[b > 4], vec[5:10]) +}) + +test_that("Array head/tail", { + vec <- 11:20 + a <- Array$create(vec) + expect_as_vector(head(a), head(vec)) + expect_as_vector(head(a, 4), head(vec, 4)) + expect_as_vector(head(a, 40), head(vec, 40)) + expect_as_vector(head(a, -4), head(vec, -4)) + expect_as_vector(head(a, -40), head(vec, -40)) + expect_as_vector(tail(a), tail(vec)) + expect_as_vector(tail(a, 4), tail(vec, 4)) + expect_as_vector(tail(a, 40), tail(vec, 40)) + expect_as_vector(tail(a, -40), tail(vec, -40)) +}) + +test_that("Dictionary array: create from arrays, not factor", { + a <- DictionaryArray$create(c(2L, 1L, 1L, 2L, 0L), c(4.5, 3.2, 1.1)) + expect_equal(a$type, dictionary(int32(), float64())) +}) + +test_that("Dictionary array: translate to R when dict isn't string", { + a <- DictionaryArray$create(c(2L, 1L, 1L, 2L, 0L), c(4.5, 3.2, 1.1)) + expect_warning( + expect_identical( + as.vector(a), + factor(c(3, 2, 2, 3, 1), labels = c("4.5", "3.2", "1.1")) + ) + ) +}) + +test_that("Array$Equals", { + vec <- 11:20 + a <- Array$create(vec) + b <- Array$create(vec) + d <- Array$create(3:4) + expect_equal(a, b) + expect_true(a$Equals(b)) + expect_false(a$Equals(vec)) + expect_false(a$Equals(d)) +}) + +test_that("Array$ApproxEquals", { + vec <- c(1.0000000000001, 2.400000000000001) + a <- Array$create(vec) + b <- Array$create(round(vec, 1)) + expect_false(a$Equals(b)) + expect_true(a$ApproxEquals(b)) + expect_false(a$ApproxEquals(vec)) +}) + +test_that("auto int64 conversion to int can be disabled (ARROW-10093)", { + withr::with_options(list(arrow.int64_downcast = FALSE), { + a <- Array$create(1:10, int64()) + expect_true(inherits(a$as_vector(), "integer64")) + + batch <- RecordBatch$create(x = a) + expect_true(inherits(as.data.frame(batch)$x, "integer64")) + + tab <- Table$create(x = a) + expect_true(inherits(as.data.frame(batch)$x, "integer64")) + }) +}) + + +test_that("Array to C-interface", { + # create a struct array since that's one of the more complicated array types + df <- tibble::tibble(x = 1:10, y = x / 2, z = letters[1:10]) + arr <- Array$create(df) + + # export the array via the C-interface + schema_ptr <- allocate_arrow_schema() + array_ptr <- allocate_arrow_array() + arr$export_to_c(array_ptr, schema_ptr) + + # then import it and check that the roundtripped value is the same + circle <- Array$import_from_c(array_ptr, schema_ptr) + expect_equal(arr, circle) + + # must clean up the pointers or we leak + delete_arrow_schema(schema_ptr) + delete_arrow_array(array_ptr) +}) diff --git a/src/arrow/r/tests/testthat/test-RecordBatch.R b/src/arrow/r/tests/testthat/test-RecordBatch.R new file mode 100644 index 000000000..d280754a3 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-RecordBatch.R @@ -0,0 +1,690 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_that("RecordBatch", { + # Note that we're reusing `tbl` and `batch` throughout the tests in this file + tbl <- tibble::tibble( + int = 1:10, + dbl = as.numeric(1:10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + chr = letters[1:10], + fct = factor(letters[1:10]) + ) + batch <- record_batch(tbl) + + expect_equal(batch, batch) + expect_equal( + batch$schema, + schema( + int = int32(), dbl = float64(), + lgl = boolean(), chr = utf8(), + fct = dictionary(int8(), utf8()) + ) + ) + expect_equal(batch$num_columns, 5L) + expect_equal(batch$num_rows, 10L) + expect_equal(batch$column_name(0), "int") + expect_equal(batch$column_name(1), "dbl") + expect_equal(batch$column_name(2), "lgl") + expect_equal(batch$column_name(3), "chr") + expect_equal(batch$column_name(4), "fct") + expect_equal(names(batch), c("int", "dbl", "lgl", "chr", "fct")) + + # input validation + expect_error(batch$column_name(NA), "'i' cannot be NA") + expect_error(batch$column_name(-1), "subscript out of bounds") + expect_error(batch$column_name(1000), "subscript out of bounds") + expect_error(batch$column_name(1:2)) + expect_error(batch$column_name("one")) + + col_int <- batch$column(0) + expect_true(inherits(col_int, "Array")) + expect_equal(col_int$as_vector(), tbl$int) + expect_equal(col_int$type, int32()) + + col_dbl <- batch$column(1) + expect_true(inherits(col_dbl, "Array")) + expect_equal(col_dbl$as_vector(), tbl$dbl) + expect_equal(col_dbl$type, float64()) + + col_lgl <- batch$column(2) + expect_true(inherits(col_dbl, "Array")) + expect_equal(col_lgl$as_vector(), tbl$lgl) + expect_equal(col_lgl$type, boolean()) + + col_chr <- batch$column(3) + expect_true(inherits(col_chr, "Array")) + expect_equal(col_chr$as_vector(), tbl$chr) + expect_equal(col_chr$type, utf8()) + + col_fct <- batch$column(4) + expect_true(inherits(col_fct, "Array")) + expect_equal(col_fct$as_vector(), tbl$fct) + expect_equal(col_fct$type, dictionary(int8(), utf8())) + + # input validation + expect_error(batch$column(NA), "'i' cannot be NA") + expect_error(batch$column(-1), "subscript out of bounds") + expect_error(batch$column(1000), "subscript out of bounds") + expect_error(batch$column(1:2)) + expect_error(batch$column("one")) + + batch2 <- batch$RemoveColumn(0) + expect_equal( + batch2$schema, + schema(dbl = float64(), lgl = boolean(), chr = utf8(), fct = dictionary(int8(), utf8())) + ) + expect_equal(batch2$column(0), batch$column(1)) + expect_data_frame(batch2, tbl[, -1]) + + # input validation + expect_error(batch$RemoveColumn(NA), "'i' cannot be NA") + expect_error(batch$RemoveColumn(-1), "subscript out of bounds") + expect_error(batch$RemoveColumn(1000), "subscript out of bounds") + expect_error(batch$RemoveColumn(1:2)) + expect_error(batch$RemoveColumn("one")) +}) + +test_that("RecordBatch S3 methods", { + tab <- RecordBatch$create(example_data) + for (f in c("dim", "nrow", "ncol", "dimnames", "colnames", "row.names", "as.list")) { + fun <- get(f) + expect_identical(fun(tab), fun(example_data), info = f) + } +}) + +test_that("RecordBatch$Slice", { + batch3 <- batch$Slice(5) + expect_data_frame(batch3, tbl[6:10, ]) + + batch4 <- batch$Slice(5, 2) + expect_data_frame(batch4, tbl[6:7, ]) + + # Input validation + expect_error(batch$Slice("ten")) + expect_error(batch$Slice(NA_integer_), "Slice 'offset' cannot be NA") + expect_error(batch$Slice(NA), "Slice 'offset' cannot be NA") + expect_error(batch$Slice(10, "ten")) + expect_error(batch$Slice(10, NA_integer_), "Slice 'length' cannot be NA") + expect_error(batch$Slice(NA_integer_, NA_integer_), "Slice 'offset' cannot be NA") + expect_error(batch$Slice(c(10, 10))) + expect_error(batch$Slice(10, c(10, 10))) + expect_error(batch$Slice(1000), "Slice 'offset' greater than array length") + expect_error(batch$Slice(-1), "Slice 'offset' cannot be negative") + expect_error(batch4$Slice(10, 10), "Slice 'offset' greater than array length") + expect_error(batch$Slice(10, -1), "Slice 'length' cannot be negative") + expect_error(batch$Slice(-1, 10), "Slice 'offset' cannot be negative") +}) + +test_that("[ on RecordBatch", { + expect_data_frame(batch[6:7, ], tbl[6:7, ]) + expect_data_frame(batch[c(6, 7), ], tbl[6:7, ]) + expect_data_frame(batch[6:7, 2:4], tbl[6:7, 2:4]) + expect_data_frame(batch[, c("dbl", "fct")], tbl[, c(2, 5)]) + expect_identical(as.vector(batch[, "chr", drop = TRUE]), tbl$chr) + expect_data_frame(batch[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4]) + expect_data_frame( + batch[rep(c(FALSE, TRUE), 5), ], + tbl[c(2, 4, 6, 8, 10), ] + ) + # bool Array + expect_data_frame(batch[batch$lgl, ], tbl[tbl$lgl, ]) + # int Array + expect_data_frame(batch[Array$create(5:6), 2:4], tbl[6:7, 2:4]) + + # input validation + expect_error(batch[, c("dbl", "NOTACOLUMN")], 'Column not found: "NOTACOLUMN"') + expect_error(batch[, c(6, NA)], "Column indices cannot be NA") + expect_error(batch[, c(2, -2)], "Invalid column index") +}) + +test_that("[[ and $ on RecordBatch", { + expect_as_vector(batch[["int"]], tbl$int) + expect_as_vector(batch$int, tbl$int) + expect_as_vector(batch[[4]], tbl$chr) + expect_null(batch$qwerty) + expect_null(batch[["asdf"]]) + expect_error(batch[[c(4, 3)]]) + expect_error(batch[[NA]], "'i' must be character or numeric, not logical") + expect_error(batch[[NULL]], "'i' must be character or numeric, not NULL") + expect_error(batch[[c("asdf", "jkl;")]], "name is not a string", fixed = TRUE) +}) + +test_that("[[<- assignment", { + tbl <- tibble::tibble( + int = 1:10, + dbl = as.numeric(1:10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + chr = letters[1:10], + fct = factor(letters[1:10]) + ) + batch <- RecordBatch$create(tbl) + + # can remove a column + batch[["chr"]] <- NULL + expect_data_frame(batch, tbl[-4]) + + # can remove a column by index + batch[[4]] <- NULL + expect_data_frame(batch, tbl[1:3]) + + # can add a named column + batch[["new"]] <- letters[10:1] + expect_data_frame(batch, dplyr::bind_cols(tbl[1:3], new = letters[10:1])) + + # can replace a column by index + batch[[2]] <- as.numeric(10:1) + expect_as_vector(batch[[2]], as.numeric(10:1)) + + # can add a column by index + batch[[5]] <- as.numeric(10:1) + expect_as_vector(batch[[5]], as.numeric(10:1)) + expect_as_vector(batch[["5"]], as.numeric(10:1)) + + # can replace a column + batch[["int"]] <- 10:1 + expect_as_vector(batch[["int"]], 10:1) + + # can use $ + batch$new <- NULL + expect_null(as.vector(batch$new)) + expect_identical(dim(batch), c(10L, 4L)) + + batch$int <- 1:10 + expect_as_vector(batch$int, 1:10) + + # recycling + batch[["atom"]] <- 1L + expect_as_vector(batch[["atom"]], rep(1L, 10)) + + expect_error( + batch[["atom"]] <- 1:6, + "Can't recycle input of size 6 to size 10." + ) + + # assign Arrow array + array <- Array$create(c(10:1)) + batch$array <- array + expect_as_vector(batch$array, 10:1) + + # nonsense indexes + expect_error(batch[[NA]] <- letters[10:1], "'i' must be character or numeric, not logical") + expect_error(batch[[NULL]] <- letters[10:1], "'i' must be character or numeric, not NULL") + expect_error(batch[[NA_integer_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE) + expect_error(batch[[NA_real_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE) + expect_error(batch[[NA_character_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE) + expect_error(batch[[c(1, 4)]] <- letters[10:1], "length(i) not equal to 1", fixed = TRUE) +}) + +test_that("head and tail on RecordBatch", { + tbl <- tibble::tibble( + int = 1:10, + dbl = as.numeric(1:10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + chr = letters[1:10], + fct = factor(letters[1:10]) + ) + batch <- RecordBatch$create(tbl) + expect_data_frame(head(batch), head(tbl)) + expect_data_frame(head(batch, 4), head(tbl, 4)) + expect_data_frame(head(batch, 40), head(tbl, 40)) + expect_data_frame(head(batch, -4), head(tbl, -4)) + expect_data_frame(head(batch, -40), head(tbl, -40)) + expect_data_frame(tail(batch), tail(tbl)) + expect_data_frame(tail(batch, 4), tail(tbl, 4)) + expect_data_frame(tail(batch, 40), tail(tbl, 40)) + expect_data_frame(tail(batch, -4), tail(tbl, -4)) + expect_data_frame(tail(batch, -40), tail(tbl, -40)) +}) + +test_that("RecordBatch print method", { + expect_output( + print(batch), + paste( + "RecordBatch", + "10 rows x 5 columns", + "$int <int32>", + "$dbl <double>", + "$lgl <bool>", + "$chr <string>", + "$fct <dictionary<values=string, indices=int8>>", + sep = "\n" + ), + fixed = TRUE + ) +}) + +test_that("RecordBatch with 0 rows are supported", { + tbl <- tibble::tibble( + int = integer(), + dbl = numeric(), + lgl = logical(), + chr = character(), + fct = factor(character(), levels = c("a", "b")) + ) + + batch <- record_batch(tbl) + expect_equal(batch$num_columns, 5L) + expect_equal(batch$num_rows, 0L) + expect_equal( + batch$schema, + schema( + int = int32(), + dbl = float64(), + lgl = boolean(), + chr = utf8(), + fct = dictionary(int8(), utf8()) + ) + ) +}) + +test_that("RecordBatch cast (ARROW-3741)", { + batch <- record_batch(x = 1:10, y = 1:10) + + expect_error(batch$cast(schema(x = int32()))) + expect_error(batch$cast(schema(x = int32(), z = int32()))) + + s2 <- schema(x = int16(), y = int64()) + batch2 <- batch$cast(s2) + expect_equal(batch2$schema, s2) + expect_equal(batch2$column(0L)$type, int16()) + expect_equal(batch2$column(1L)$type, int64()) +}) + +test_that("record_batch() handles schema= argument", { + s <- schema(x = int32(), y = int32()) + batch <- record_batch(x = 1:10, y = 1:10, schema = s) + expect_equal(s, batch$schema) + + s <- schema(x = int32(), y = float64()) + batch <- record_batch(x = 1:10, y = 1:10, schema = s) + expect_equal(s, batch$schema) + + s <- schema(x = int32(), y = utf8()) + expect_error(record_batch(x = 1:10, y = 1:10, schema = s)) +}) + +test_that("record_batch(schema=) does some basic consistency checking of the schema", { + s <- schema(x = int32()) + expect_error(record_batch(x = 1:10, y = 1:10, schema = s)) + expect_error(record_batch(z = 1:10, schema = s)) +}) + +test_that("RecordBatch dim() and nrow() (ARROW-3816)", { + batch <- record_batch(x = 1:10, y = 1:10) + expect_equal(dim(batch), c(10L, 2L)) + expect_equal(nrow(batch), 10L) +}) + +test_that("record_batch() handles Array", { + batch <- record_batch(x = 1:10, y = Array$create(1:10)) + expect_equal(batch$schema, schema(x = int32(), y = int32())) +}) + +test_that("record_batch() handles data frame columns", { + tib <- tibble::tibble(x = 1:10, y = 1:10) + # because tib is named here, this becomes a struct array + batch <- record_batch(a = 1:10, b = tib) + expect_equal( + batch$schema, + schema( + a = int32(), + b = struct(x = int32(), y = int32()) + ) + ) + out <- as.data.frame(batch) + expect_equal(out, tibble::tibble(a = 1:10, b = tib)) + + # if not named, columns from tib are auto spliced + batch2 <- record_batch(a = 1:10, tib) + expect_equal( + batch2$schema, + schema(a = int32(), x = int32(), y = int32()) + ) + out <- as.data.frame(batch2) + expect_equal(out, tibble::tibble(a = 1:10, !!!tib)) +}) + +test_that("record_batch() handles data frame columns with schema spec", { + tib <- tibble::tibble(x = 1:10, y = 1:10) + tib_float <- tib + tib_float$y <- as.numeric(tib_float$y) + schema <- schema(a = int32(), b = struct(x = int16(), y = float64())) + batch <- record_batch(a = 1:10, b = tib, schema = schema) + expect_equal(batch$schema, schema) + out <- as.data.frame(batch) + expect_equal(out, tibble::tibble(a = 1:10, b = tib_float)) + + schema <- schema(a = int32(), b = struct(x = int16(), y = utf8())) + expect_error(record_batch(a = 1:10, b = tib, schema = schema)) +}) + +test_that("record_batch() auto splices (ARROW-5718)", { + df <- tibble::tibble(x = 1:10, y = letters[1:10]) + batch1 <- record_batch(df) + batch2 <- record_batch(!!!df) + expect_equal(batch1, batch2) + expect_equal(batch1$schema, schema(x = int32(), y = utf8())) + expect_data_frame(batch1, df) + + batch3 <- record_batch(df, z = 1:10) + batch4 <- record_batch(!!!df, z = 1:10) + expect_equal(batch3, batch4) + expect_equal(batch3$schema, schema(x = int32(), y = utf8(), z = int32())) + expect_equal( + as.data.frame(batch3), + tibble::as_tibble(cbind(df, data.frame(z = 1:10))) + ) + + s <- schema(x = float64(), y = utf8()) + batch5 <- record_batch(df, schema = s) + batch6 <- record_batch(!!!df, schema = s) + expect_equal(batch5, batch6) + expect_equal(batch5$schema, s) + expect_equal(as.data.frame(batch5), df) + + s2 <- schema(x = float64(), y = utf8(), z = int16()) + batch7 <- record_batch(df, z = 1:10, schema = s2) + batch8 <- record_batch(!!!df, z = 1:10, schema = s2) + expect_equal(batch7, batch8) + expect_equal(batch7$schema, s2) + expect_equal( + as.data.frame(batch7), + tibble::as_tibble(cbind(df, data.frame(z = 1:10))) + ) +}) + +test_that("record_batch() only auto splice data frames", { + expect_error( + record_batch(1:10), + regexp = "only data frames are allowed as unnamed arguments to be auto spliced" + ) +}) + +test_that("record_batch() handles null type (ARROW-7064)", { + batch <- record_batch(a = 1:10, n = vctrs::unspecified(10)) + expect_equal( + batch$schema, + schema(a = int32(), n = null()), + ignore_attr = TRUE + ) +}) + +test_that("record_batch() scalar recycling with vectors", { + expect_data_frame( + record_batch(a = 1:10, b = 5), + tibble::tibble(a = 1:10, b = 5) + ) +}) + +test_that("record_batch() scalar recycling with Scalars, Arrays, and ChunkedArrays", { + expect_data_frame( + record_batch(a = Array$create(1:10), b = Scalar$create(5)), + tibble::tibble(a = 1:10, b = 5) + ) + + expect_data_frame( + record_batch(a = Array$create(1:10), b = Array$create(5)), + tibble::tibble(a = 1:10, b = 5) + ) + + expect_data_frame( + record_batch(a = Array$create(1:10), b = ChunkedArray$create(5)), + tibble::tibble(a = 1:10, b = 5) + ) +}) + +test_that("record_batch() no recycling with tibbles", { + expect_error( + record_batch( + tibble::tibble(a = 1:10), + tibble::tibble(a = 1, b = 5) + ), + regexp = "All input tibbles or data.frames must have the same number of rows" + ) + + expect_error( + record_batch( + tibble::tibble(a = 1:10), + tibble::tibble(a = 1) + ), + regexp = "All input tibbles or data.frames must have the same number of rows" + ) +}) + +test_that("RecordBatch$Equals", { + df <- tibble::tibble(x = 1:10, y = letters[1:10]) + a <- record_batch(df) + b <- record_batch(df) + expect_equal(a, b) + expect_true(a$Equals(b)) + expect_false(a$Equals(df)) +}) + +test_that("RecordBatch$Equals(check_metadata)", { + df <- tibble::tibble(x = 1:2, y = c("a", "b")) + rb1 <- record_batch(df) + rb2 <- record_batch(df, schema = rb1$schema$WithMetadata(list(some = "metadata"))) + + expect_r6_class(rb1, "RecordBatch") + expect_r6_class(rb2, "RecordBatch") + expect_false(rb1$schema$HasMetadata) + expect_true(rb2$schema$HasMetadata) + expect_identical(rb2$schema$metadata, list(some = "metadata")) + + expect_true(rb1 == rb2) + expect_true(rb1$Equals(rb2)) + expect_false(rb1$Equals(rb2, check_metadata = TRUE)) + + expect_failure(expect_equal(rb1, rb2)) # expect_equal has check_metadata=TRUE + expect_equal(rb1, rb2, ignore_attr = TRUE) # this passes check_metadata=FALSE + + expect_false(rb1$Equals(24)) # Not a RecordBatch +}) + +test_that("RecordBatch name assignment", { + rb <- record_batch(x = 1:10, y = 1:10) + expect_identical(names(rb), c("x", "y")) + names(rb) <- c("a", "b") + expect_identical(names(rb), c("a", "b")) + expect_error(names(rb) <- "f") + expect_error(names(rb) <- letters) + expect_error(names(rb) <- character(0)) + expect_error(names(rb) <- NULL) + expect_error(names(rb) <- c(TRUE, FALSE)) +}) + +test_that("record_batch() with different length arrays", { + msg <- "All arrays must have the same length" + expect_error(record_batch(a = 1:5, b = 1:6), msg) +}) + +test_that("Handling string data with embedded nuls", { + raws <- Array$create(structure(list( + as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)), + as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)), + as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00 + as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)), + as.raw(c(0x74, 0x76)) + ), + class = c("arrow_binary", "vctrs_vctr", "list") + )) + batch_with_nul <- record_batch(a = 1:5, b = raws) + batch_with_nul$b <- batch_with_nul$b$cast(utf8()) + + # The behavior of the warnings/errors is slightly different with and without + # altrep. Without it (i.e. 3.5.0 and below, the error would trigger immediately + # on `as.vector()` where as with it, the error only happens on materialization) + skip_if_r_version("3.5.0") + df <- as.data.frame(batch_with_nul) + + expect_error( + df$b[], + paste0( + "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, ", + "set options(arrow.skip_nul = TRUE)" + ), + fixed = TRUE + ) + + batch_with_nul <- record_batch(a = 1:5, b = raws) + batch_with_nul$b <- batch_with_nul$b$cast(utf8()) + + withr::with_options(list(arrow.skip_nul = TRUE), { + expect_warning( + expect_equal( + as.data.frame(batch_with_nul)$b, + c("person", "woman", "man", "camera", "tv"), + ignore_attr = TRUE + ), + "Stripping '\\0' (nul) from character vector", + fixed = TRUE + ) + }) +}) + +test_that("ARROW-11769/ARROW-13860 - grouping preserved in record batch creation", { + skip_if_not_available("dataset") + library(dplyr, warn.conflicts = FALSE) + + tbl <- tibble::tibble( + int = 1:10, + fct = factor(rep(c("A", "B"), 5)), + fct2 = factor(rep(c("C", "D"), each = 5)), + ) + + expect_r6_class( + tbl %>% + group_by(fct, fct2) %>% + record_batch(), + "RecordBatch" + ) + expect_identical( + tbl %>% + group_by(fct, fct2) %>% + record_batch() %>% + group_vars(), + c("fct", "fct2") + ) + expect_identical( + tbl %>% + group_by(fct, fct2) %>% + record_batch() %>% + ungroup() %>% + group_vars(), + NULL + ) + expect_identical( + tbl %>% + group_by(fct, fct2) %>% + record_batch() %>% + select(-int) %>% + group_vars(), + c("fct", "fct2") + ) +}) + +test_that("ARROW-12729 - length returns number of columns in RecordBatch", { + tbl <- tibble::tibble( + int = 1:10, + fct = factor(rep(c("A", "B"), 5)), + fct2 = factor(rep(c("C", "D"), each = 5)), + ) + + rb <- record_batch(!!!tbl) + + expect_identical(length(rb), 3L) +}) + +test_that("RecordBatchReader to C-interface", { + skip_if_not_available("dataset") + + tab <- Table$create(example_data) + + # export the RecordBatchReader via the C-interface + stream_ptr <- allocate_arrow_array_stream() + scan <- Scanner$create(tab) + reader <- scan$ToRecordBatchReader() + reader$export_to_c(stream_ptr) + + # then import it and check that the roundtripped value is the same + circle <- RecordBatchStreamReader$import_from_c(stream_ptr) + tab_from_c_new <- circle$read_table() + expect_equal(tab, tab_from_c_new) + + # must clean up the pointer or we leak + delete_arrow_array_stream(stream_ptr) + + # export the RecordBatchStreamReader via the C-interface + stream_ptr_new <- allocate_arrow_array_stream() + bytes <- write_to_raw(example_data) + expect_type(bytes, "raw") + reader_new <- RecordBatchStreamReader$create(bytes) + reader_new$export_to_c(stream_ptr_new) + + # then import it and check that the roundtripped value is the same + circle_new <- RecordBatchStreamReader$import_from_c(stream_ptr_new) + tab_from_c_new <- circle_new$read_table() + expect_equal(tab, tab_from_c_new) + + # must clean up the pointer or we leak + delete_arrow_array_stream(stream_ptr_new) +}) + +test_that("RecordBatch to C-interface", { + batch <- RecordBatch$create(example_data) + + # export the RecordBatch via the C-interface + schema_ptr <- allocate_arrow_schema() + array_ptr <- allocate_arrow_array() + batch$export_to_c(array_ptr, schema_ptr) + + # then import it and check that the roundtripped value is the same + circle <- RecordBatch$import_from_c(array_ptr, schema_ptr) + expect_equal + + # must clean up the pointers or we leak + delete_arrow_schema(schema_ptr) + delete_arrow_array(array_ptr) +}) + + + +test_that("RecordBatchReader to C-interface to arrow_dplyr_query", { + skip_if_not_available("dataset") + + tab <- Table$create(example_data) + + # export the RecordBatchReader via the C-interface + stream_ptr <- allocate_arrow_array_stream() + scan <- Scanner$create(tab) + reader <- scan$ToRecordBatchReader() + reader$export_to_c(stream_ptr) + + # then import it and check that the roundtripped value is the same + circle <- RecordBatchStreamReader$import_from_c(stream_ptr) + + # create an arrow_dplyr_query() from the recordbatch reader + reader_adq <- arrow_dplyr_query(circle) + + tab_from_c_new <- reader_adq %>% + dplyr::compute() + expect_equal(tab_from_c_new, tab) + + # must clean up the pointer or we leak + delete_arrow_array_stream(stream_ptr) +}) diff --git a/src/arrow/r/tests/testthat/test-Table.R b/src/arrow/r/tests/testthat/test-Table.R new file mode 100644 index 000000000..44144c00b --- /dev/null +++ b/src/arrow/r/tests/testthat/test-Table.R @@ -0,0 +1,549 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_that("read_table handles various input streams (ARROW-3450, ARROW-3505)", { + tbl <- tibble::tibble( + int = 1:10, dbl = as.numeric(1:10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + chr = letters[1:10] + ) + tab <- Table$create(!!!tbl) + + tf <- tempfile() + on.exit(unlink(tf)) + expect_deprecated( + write_arrow(tab, tf), + "write_feather" + ) + + tab1 <- read_feather(tf, as_data_frame = FALSE) + tab2 <- read_feather(normalizePath(tf), as_data_frame = FALSE) + + readable_file <- ReadableFile$create(tf) + expect_deprecated( + tab3 <- read_arrow(readable_file, as_data_frame = FALSE), + "read_feather" + ) + readable_file$close() + + mmap_file <- mmap_open(tf) + mmap_file$close() + + expect_equal(tab, tab1) + expect_equal(tab, tab2) + expect_equal(tab, tab3) +}) + +test_that("Table cast (ARROW-3741)", { + tab <- Table$create(x = 1:10, y = 1:10) + + expect_error(tab$cast(schema(x = int32()))) + expect_error(tab$cast(schema(x = int32(), z = int32()))) + + s2 <- schema(x = int16(), y = int64()) + tab2 <- tab$cast(s2) + expect_equal(tab2$schema, s2) + expect_equal(tab2$column(0L)$type, int16()) + expect_equal(tab2$column(1L)$type, int64()) +}) + +test_that("Table S3 methods", { + tab <- Table$create(example_data) + for (f in c("dim", "nrow", "ncol", "dimnames", "colnames", "row.names", "as.list")) { + fun <- get(f) + expect_identical(fun(tab), fun(example_data), info = f) + } +}) + +test_that("Table $column and $field", { + tab <- Table$create(x = 1:10, y = 1:10) + + expect_equal(tab$field(0), field("x", int32())) + + # input validation + expect_error(tab$column(NA), "'i' cannot be NA") + expect_error(tab$column(-1), "subscript out of bounds") + expect_error(tab$column(1000), "subscript out of bounds") + expect_error(tab$column(1:2)) + expect_error(tab$column("one")) + + expect_error(tab$field(NA), "'i' cannot be NA") + expect_error(tab$field(-1), "subscript out of bounds") + expect_error(tab$field(1000), "subscript out of bounds") + expect_error(tab$field(1:2)) + expect_error(tab$field("one")) +}) + +test_that("[, [[, $ for Table", { + tbl <- tibble::tibble( + int = 1:10, + dbl = as.numeric(1:10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + chr = letters[1:10], + fct = factor(letters[1:10]) + ) + tab <- Table$create(tbl) + + expect_identical(names(tab), names(tbl)) + + expect_data_frame(tab[6:7, ], tbl[6:7, ]) + expect_data_frame(tab[6:7, 2:4], tbl[6:7, 2:4]) + expect_data_frame(tab[, c("dbl", "fct")], tbl[, c(2, 5)]) + expect_as_vector(tab[, "chr", drop = TRUE], tbl$chr) + # Take within a single chunk + expect_data_frame(tab[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4]) + expect_data_frame(tab[rep(c(FALSE, TRUE), 5), ], tbl[c(2, 4, 6, 8, 10), ]) + # bool ChunkedArray (with one chunk) + expect_data_frame(tab[tab$lgl, ], tbl[tbl$lgl, ]) + # ChunkedArray with multiple chunks + c1 <- c(TRUE, FALSE, TRUE, TRUE, FALSE) + c2 <- c(FALSE, FALSE, TRUE, TRUE, FALSE) + ca <- ChunkedArray$create(c1, c2) + expect_data_frame(tab[ca, ], tbl[c(1, 3, 4, 8, 9), ]) + # int Array + expect_data_frame(tab[Array$create(5:6), 2:4], tbl[6:7, 2:4]) + # ChunkedArray + expect_data_frame(tab[ChunkedArray$create(5L, 6L), 2:4], tbl[6:7, 2:4]) + # Expression + expect_data_frame(tab[tab$int > 6, ], tbl[tbl$int > 6, ]) + + expect_as_vector(tab[["int"]], tbl$int) + expect_as_vector(tab$int, tbl$int) + expect_as_vector(tab[[4]], tbl$chr) + expect_null(tab$qwerty) + expect_null(tab[["asdf"]]) + # List-like column slicing + expect_data_frame(tab[2:4], tbl[2:4]) + expect_data_frame(tab[c(2, 1)], tbl[c(2, 1)]) + expect_data_frame(tab[-3], tbl[-3]) + + expect_error(tab[[c(4, 3)]]) + expect_error(tab[[NA]], "'i' must be character or numeric, not logical") + expect_error(tab[[NULL]], "'i' must be character or numeric, not NULL") + expect_error(tab[[c("asdf", "jkl;")]], "length(name) not equal to 1", fixed = TRUE) + expect_error(tab[-3:3], "Invalid column index") + expect_error(tab[1000], "Invalid column index") + expect_error(tab[1:1000], "Invalid column index") + + # input validation + expect_error(tab[, c("dbl", "NOTACOLUMN")], 'Column not found: "NOTACOLUMN"') + expect_error(tab[, c(6, NA)], "Column indices cannot be NA") + + skip("Table with 0 cols doesn't know how many rows it should have") + expect_data_frame(tab[0], tbl[0]) +}) + +test_that("[[<- assignment", { + tbl <- tibble::tibble( + int = 1:10, + dbl = as.numeric(1:10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + chr = letters[1:10], + fct = factor(letters[1:10]) + ) + tab <- Table$create(tbl) + + # can remove a column + tab[["chr"]] <- NULL + expect_data_frame(tab, tbl[-4]) + + # can remove a column by index + tab[[4]] <- NULL + expect_data_frame(tab, tbl[1:3]) + + # can add a named column + tab[["new"]] <- letters[10:1] + expect_data_frame(tab, dplyr::bind_cols(tbl[1:3], new = letters[10:1])) + + # can replace a column by index + tab[[2]] <- as.numeric(10:1) + expect_as_vector(tab[[2]], as.numeric(10:1)) + + # can add a column by index + tab[[5]] <- as.numeric(10:1) + expect_as_vector(tab[[5]], as.numeric(10:1)) + expect_as_vector(tab[["5"]], as.numeric(10:1)) + + # can replace a column + tab[["int"]] <- 10:1 + expect_as_vector(tab[["int"]], 10:1) + + # can use $ + tab$new <- NULL + expect_null(as.vector(tab$new)) + expect_identical(dim(tab), c(10L, 4L)) + + tab$int <- 1:10 + expect_as_vector(tab$int, 1:10) + + # recycling + tab[["atom"]] <- 1L + expect_as_vector(tab[["atom"]], rep(1L, 10)) + + expect_error( + tab[["atom"]] <- 1:6, + "Can't recycle input of size 6 to size 10." + ) + + # assign Arrow array and chunked_array + array <- Array$create(c(10:1)) + tab$array <- array + expect_as_vector(tab$array, 10:1) + + tab$chunked <- chunked_array(1:10) + expect_as_vector(tab$chunked, 1:10) + + # nonsense indexes + expect_error(tab[[NA]] <- letters[10:1], "'i' must be character or numeric, not logical") + expect_error(tab[[NULL]] <- letters[10:1], "'i' must be character or numeric, not NULL") + expect_error(tab[[NA_integer_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE) + expect_error(tab[[NA_real_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE) + expect_error(tab[[NA_character_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE) + expect_error(tab[[c(1, 4)]] <- letters[10:1], "length(i) not equal to 1", fixed = TRUE) +}) + +test_that("Table$Slice", { + tbl <- tibble::tibble( + int = 1:10, + dbl = as.numeric(1:10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + chr = letters[1:10], + fct = factor(letters[1:10]) + ) + tab <- Table$create(tbl) + tab2 <- tab$Slice(5) + expect_data_frame(tab2, tbl[6:10, ]) + + tab3 <- tab$Slice(5, 2) + expect_data_frame(tab3, tbl[6:7, ]) + + # Input validation + expect_error(tab$Slice("ten")) + expect_error(tab$Slice(NA_integer_), "Slice 'offset' cannot be NA") + expect_error(tab$Slice(NA), "Slice 'offset' cannot be NA") + expect_error(tab$Slice(10, "ten")) + expect_error(tab$Slice(10, NA_integer_), "Slice 'length' cannot be NA") + expect_error(tab$Slice(NA_integer_, NA_integer_), "Slice 'offset' cannot be NA") + expect_error(tab$Slice(c(10, 10))) + expect_error(tab$Slice(10, c(10, 10))) + expect_error(tab$Slice(1000), "Slice 'offset' greater than array length") + expect_error(tab$Slice(-1), "Slice 'offset' cannot be negative") + expect_error(tab3$Slice(10, 10), "Slice 'offset' greater than array length") + expect_error(tab$Slice(10, -1), "Slice 'length' cannot be negative") + expect_error(tab$Slice(-1, 10), "Slice 'offset' cannot be negative") +}) + +test_that("head and tail on Table", { + tbl <- tibble::tibble( + int = 1:10, + dbl = as.numeric(1:10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + chr = letters[1:10], + fct = factor(letters[1:10]) + ) + tab <- Table$create(tbl) + + expect_data_frame(head(tab), head(tbl)) + expect_data_frame(head(tab, 4), head(tbl, 4)) + expect_data_frame(head(tab, 40), head(tbl, 40)) + expect_data_frame(head(tab, -4), head(tbl, -4)) + expect_data_frame(head(tab, -40), head(tbl, -40)) + expect_data_frame(tail(tab), tail(tbl)) + expect_data_frame(tail(tab, 4), tail(tbl, 4)) + expect_data_frame(tail(tab, 40), tail(tbl, 40)) + expect_data_frame(tail(tab, -4), tail(tbl, -4)) + expect_data_frame(tail(tab, -40), tail(tbl, -40)) +}) + +test_that("Table print method", { + expect_output( + print(tab), + paste( + "Table", + "10 rows x 5 columns", + "$int <int32>", + "$dbl <double>", + "$lgl <bool>", + "$chr <string>", + "$fct <dictionary<values=string, indices=int8>>", + sep = "\n" + ), + fixed = TRUE + ) +}) + +test_that("table active bindings", { + tbl <- tibble::tibble( + int = 1:10, + dbl = as.numeric(1:10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + chr = letters[1:10], + fct = factor(letters[1:10]) + ) + tab <- Table$create(tbl) + + expect_identical(dim(tbl), dim(tab)) + expect_type(tab$columns, "list") + expect_equal(tab$columns[[1]], tab[[1]]) +}) + +test_that("table() handles record batches with splicing", { + batch <- record_batch(x = 1:2, y = letters[1:2]) + tab <- Table$create(batch, batch, batch) + expect_equal(tab$schema, batch$schema) + expect_equal(tab$num_rows, 6L) + expect_equal( + as.data.frame(tab), + vctrs::vec_rbind(as.data.frame(batch), as.data.frame(batch), as.data.frame(batch)) + ) + + batches <- list(batch, batch, batch) + tab <- Table$create(!!!batches) + expect_equal(tab$schema, batch$schema) + expect_equal(tab$num_rows, 6L) + expect_equal( + as.data.frame(tab), + vctrs::vec_rbind(!!!purrr::map(batches, as.data.frame)) + ) +}) + +test_that("table() handles ... of arrays, chunked arrays, vectors", { + a <- Array$create(1:10) + ca <- chunked_array(1:5, 6:10) + v <- rnorm(10) + tbl <- tibble::tibble(x = 1:10, y = letters[1:10]) + + tab <- Table$create(a = a, b = ca, c = v, !!!tbl) + expect_equal( + tab$schema, + schema(a = int32(), b = int32(), c = float64(), x = int32(), y = utf8()) + ) + res <- as.data.frame(tab) + expect_equal(names(res), c("a", "b", "c", "x", "y")) + expect_equal( + res, + tibble::tibble(a = 1:10, b = 1:10, c = v, x = 1:10, y = letters[1:10]) + ) +}) + +test_that("table() auto splices (ARROW-5718)", { + df <- tibble::tibble(x = 1:10, y = letters[1:10]) + + tab1 <- Table$create(df) + tab2 <- Table$create(!!!df) + expect_equal(tab1, tab2) + expect_equal(tab1$schema, schema(x = int32(), y = utf8())) + expect_equal(as.data.frame(tab1), df) + + s <- schema(x = float64(), y = utf8()) + tab3 <- Table$create(df, schema = s) + tab4 <- Table$create(!!!df, schema = s) + expect_equal(tab3, tab4) + expect_equal(tab3$schema, s) + expect_equal(as.data.frame(tab3), df) +}) + +test_that("Validation when creating table with schema (ARROW-10953)", { + expect_error( + Table$create(data.frame(), schema = schema(a = int32())), + "incompatible. schema has 1 fields, and 0 columns are supplied", + fixed = TRUE + ) + expect_error( + Table$create(data.frame(b = 1), schema = schema(a = int32())), + "field at index 1 has name 'a' != 'b'", + fixed = TRUE + ) + expect_error( + Table$create(data.frame(b = 2, c = 3), schema = schema(a = int32())), + "incompatible. schema has 1 fields, and 2 columns are supplied", + fixed = TRUE + ) +}) + +test_that("==.Table", { + tab1 <- Table$create(x = 1:2, y = c("a", "b")) + tab2 <- Table$create(x = 1:2, y = c("a", "b")) + tab3 <- Table$create(x = 1:2) + tab4 <- Table$create(x = 1:2, y = c("a", "b"), z = 3:4) + + expect_true(tab1 == tab2) + expect_true(tab2 == tab1) + + expect_false(tab1 == tab3) + expect_false(tab3 == tab1) + + expect_false(tab1 == tab4) + expect_false(tab4 == tab1) + + expect_true(all.equal(tab1, tab2)) + expect_equal(tab1, tab2) +}) + +test_that("Table$Equals(check_metadata)", { + tab1 <- Table$create(x = 1:2, y = c("a", "b")) + tab2 <- Table$create( + x = 1:2, y = c("a", "b"), + schema = tab1$schema$WithMetadata(list(some = "metadata")) + ) + + expect_r6_class(tab1, "Table") + expect_r6_class(tab2, "Table") + expect_false(tab1$schema$HasMetadata) + expect_true(tab2$schema$HasMetadata) + expect_identical(tab2$schema$metadata, list(some = "metadata")) + + expect_true(tab1 == tab2) + expect_true(tab1$Equals(tab2)) + expect_false(tab1$Equals(tab2, check_metadata = TRUE)) + + expect_failure(expect_equal(tab1, tab2)) # expect_equal has check_metadata=TRUE + expect_equal(tab1, tab2, ignore_attr = TRUE) # this sets check_metadata=FALSE + + expect_false(tab1$Equals(24)) # Not a Table +}) + +test_that("Table handles null type (ARROW-7064)", { + tab <- Table$create(a = 1:10, n = vctrs::unspecified(10)) + expect_equal(tab$schema, schema(a = int32(), n = null()), ignore_attr = TRUE) +}) + +test_that("Can create table with specific dictionary types", { + fact <- example_data[, "fct"] + int_types <- c(int8(), int16(), int32(), int64()) + # TODO: test uint types when format allows + # uint_types <- c(uint8(), uint16(), uint32(), uint64()) # nolint + for (i in int_types) { + sch <- schema(fct = dictionary(i, utf8())) + tab <- Table$create(fact, schema = sch) + expect_equal(sch, tab$schema) + if (i != int64()) { + # TODO: same downcast to int32 as we do for int64() type elsewhere + expect_identical(as.data.frame(tab), fact) + } + } +}) + +test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", { + b1 <- record_batch(f = factor(c("a"), levels = c("a", "b"))) + b2 <- record_batch(f = factor(c("c"), levels = c("c", "d"))) + b3 <- record_batch(f = factor(NA, levels = "a")) + b4 <- record_batch(f = factor()) + + res <- tibble::tibble(f = factor(c("a", "c", NA), levels = c("a", "b", "c", "d"))) + tab <- Table$create(b1, b2, b3, b4) + + expect_identical(as.data.frame(tab), res) +}) + +test_that("Table$SelectColumns()", { + tab <- Table$create(x = 1:10, y = 1:10) + + expect_equal(tab$SelectColumns(0L), Table$create(x = 1:10)) + + expect_error(tab$SelectColumns(2:4)) + expect_error(tab$SelectColumns("")) +}) + +test_that("Table name assignment", { + tab <- Table$create(x = 1:10, y = 1:10) + expect_identical(names(tab), c("x", "y")) + names(tab) <- c("a", "b") + expect_identical(names(tab), c("a", "b")) + expect_error(names(tab) <- "f") + expect_error(names(tab) <- letters) + expect_error(names(tab) <- character(0)) + expect_error(names(tab) <- NULL) + expect_error(names(tab) <- c(TRUE, FALSE)) +}) + +test_that("Table$create() with different length columns", { + msg <- "All columns must have the same length" + expect_error(Table$create(a = 1:5, b = 1:6), msg) +}) + +test_that("Table$create() scalar recycling with vectors", { + expect_data_frame( + Table$create(a = 1:10, b = 5), + tibble::tibble(a = 1:10, b = 5) + ) +}) + +test_that("Table$create() scalar recycling with Scalars, Arrays, and ChunkedArrays", { + expect_data_frame( + Table$create(a = Array$create(1:10), b = Scalar$create(5)), + tibble::tibble(a = 1:10, b = 5) + ) + + expect_data_frame( + Table$create(a = Array$create(1:10), b = Array$create(5)), + tibble::tibble(a = 1:10, b = 5) + ) + + expect_data_frame( + Table$create(a = Array$create(1:10), b = ChunkedArray$create(5)), + tibble::tibble(a = 1:10, b = 5) + ) +}) + +test_that("Table$create() no recycling with tibbles", { + expect_error( + Table$create( + tibble::tibble(a = 1:10, b = 5), + tibble::tibble(a = 1, b = 5) + ), + regexp = "All input tibbles or data.frames must have the same number of rows" + ) + + expect_error( + Table$create( + tibble::tibble(a = 1:10, b = 5), + tibble::tibble(a = 1) + ), + regexp = "All input tibbles or data.frames must have the same number of rows" + ) +}) + +test_that("ARROW-11769 - grouping preserved in table creation", { + skip_if_not_available("dataset") + + tbl <- tibble::tibble( + int = 1:10, + fct = factor(rep(c("A", "B"), 5)), + fct2 = factor(rep(c("C", "D"), each = 5)), + ) + + expect_identical( + tbl %>% + dplyr::group_by(fct, fct2) %>% + Table$create() %>% + dplyr::group_vars(), + c("fct", "fct2") + ) +}) + +test_that("ARROW-12729 - length returns number of columns in Table", { + tbl <- tibble::tibble( + int = 1:10, + fct = factor(rep(c("A", "B"), 5)), + fct2 = factor(rep(c("C", "D"), each = 5)), + ) + + tab <- Table$create(!!!tbl) + + expect_identical(length(tab), 3L) +}) diff --git a/src/arrow/r/tests/testthat/test-altrep.R b/src/arrow/r/tests/testthat/test-altrep.R new file mode 100644 index 000000000..dff369438 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-altrep.R @@ -0,0 +1,243 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_r_version("3.5.0") + +test_that("is_arrow_altrep() does not include base altrep", { + expect_false(is_arrow_altrep(1:10)) +}) + +test_that("altrep vectors from int32 and dbl arrays with no nulls", { + withr::local_options(list(arrow.use_altrep = TRUE)) + v_int <- Array$create(1:1000) + v_dbl <- Array$create(as.numeric(1:1000)) + c_int <- ChunkedArray$create(1:1000) + c_dbl <- ChunkedArray$create(as.numeric(1:1000)) + + expect_true(is_arrow_altrep(as.vector(v_int))) + expect_true(is_arrow_altrep(as.vector(v_int$Slice(1)))) + expect_true(is_arrow_altrep(as.vector(v_dbl))) + expect_true(is_arrow_altrep(as.vector(v_dbl$Slice(1)))) + + expect_equal(c_int$num_chunks, 1L) + expect_true(is_arrow_altrep(as.vector(c_int))) + expect_true(is_arrow_altrep(as.vector(c_int$Slice(1)))) + + expect_equal(c_dbl$num_chunks, 1L) + expect_true(is_arrow_altrep(as.vector(c_dbl))) + expect_true(is_arrow_altrep(as.vector(c_dbl$Slice(1)))) + + withr::local_options(list(arrow.use_altrep = NULL)) + expect_true(is_arrow_altrep(as.vector(v_int))) + expect_true(is_arrow_altrep(as.vector(v_int$Slice(1)))) + expect_true(is_arrow_altrep(as.vector(v_dbl))) + expect_true(is_arrow_altrep(as.vector(v_dbl$Slice(1)))) + + withr::local_options(list(arrow.use_altrep = FALSE)) + expect_false(is_arrow_altrep(as.vector(v_int))) + expect_false(is_arrow_altrep(as.vector(v_int$Slice(1)))) + expect_false(is_arrow_altrep(as.vector(v_dbl))) + expect_false(is_arrow_altrep(as.vector(v_dbl$Slice(1)))) +}) + +test_that("altrep vectors from int32 and dbl arrays with nulls", { + withr::local_options(list(arrow.use_altrep = TRUE)) + v_int <- Array$create(c(1L, NA, 3L)) + v_dbl <- Array$create(c(1, NA, 3)) + c_int <- ChunkedArray$create(c(1L, NA, 3L)) + c_dbl <- ChunkedArray$create(c(1, NA, 3)) + + expect_true(is_arrow_altrep(as.vector(v_int))) + expect_true(is_arrow_altrep(as.vector(v_int$Slice(1)))) + expect_true(is_arrow_altrep(as.vector(v_dbl))) + expect_true(is_arrow_altrep(as.vector(v_dbl$Slice(1)))) + expect_true(is_arrow_altrep(as.vector(c_int))) + expect_true(is_arrow_altrep(as.vector(c_int$Slice(1)))) + expect_true(is_arrow_altrep(as.vector(c_dbl))) + expect_true(is_arrow_altrep(as.vector(c_dbl$Slice(1)))) + + expect_true(is_arrow_altrep(as.vector(v_int$Slice(2)))) + expect_true(is_arrow_altrep(as.vector(v_dbl$Slice(2)))) + expect_true(is_arrow_altrep(as.vector(c_int$Slice(2)))) + expect_true(is_arrow_altrep(as.vector(c_dbl$Slice(2)))) + + # chunked array with 2 chunks cannot be altrep + c_int <- ChunkedArray$create(0L, c(1L, NA, 3L)) + c_dbl <- ChunkedArray$create(0, c(1, NA, 3)) + expect_equal(c_int$num_chunks, 2L) + expect_equal(c_dbl$num_chunks, 2L) + + expect_false(is_arrow_altrep(as.vector(c_int))) + expect_false(is_arrow_altrep(as.vector(c_dbl))) + expect_true(is_arrow_altrep(as.vector(c_int$Slice(3)))) + expect_true(is_arrow_altrep(as.vector(c_dbl$Slice(3)))) +}) + +test_that("empty vectors are not altrep", { + withr::local_options(list(arrow.use_altrep = TRUE)) + v_int <- Array$create(integer()) + v_dbl <- Array$create(numeric()) + + expect_false(is_arrow_altrep(as.vector(v_int))) + expect_false(is_arrow_altrep(as.vector(v_dbl))) +}) + +test_that("as.data.frame(<Table>, <RecordBatch>) can create altrep vectors", { + withr::local_options(list(arrow.use_altrep = TRUE)) + + table <- Table$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3), str = c("un", "deux", "trois")) + df_table <- as.data.frame(table) + expect_true(is_arrow_altrep(df_table$int)) + expect_true(is_arrow_altrep(df_table$dbl)) + expect_true(is_arrow_altrep(df_table$str)) + + batch <- RecordBatch$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3), str = c("un", "deux", "trois")) + df_batch <- as.data.frame(batch) + expect_true(is_arrow_altrep(df_batch$int)) + expect_true(is_arrow_altrep(df_batch$dbl)) + expect_true(is_arrow_altrep(df_batch$str)) +}) + +expect_altrep_rountrip <- function(x, fn, ...) { + alt <- Array$create(x)$as_vector() + + expect_true(is_arrow_altrep(alt)) + expect_identical(fn(x, ...), fn(alt, ...)) + expect_true(is_arrow_altrep(alt)) +} + +test_that("altrep min/max/sum identical to R versions for double", { + x <- c(1, 2, 3) + expect_altrep_rountrip(x, min, na.rm = TRUE) + expect_altrep_rountrip(x, max, na.rm = TRUE) + expect_altrep_rountrip(x, sum, na.rm = TRUE) + + expect_altrep_rountrip(x, min) + expect_altrep_rountrip(x, max) + expect_altrep_rountrip(x, sum) + + x <- c(1, 2, NA_real_) + expect_altrep_rountrip(x, min, na.rm = TRUE) + expect_altrep_rountrip(x, max, na.rm = TRUE) + expect_altrep_rountrip(x, sum, na.rm = TRUE) + + expect_altrep_rountrip(x, min) + expect_altrep_rountrip(x, max) + expect_altrep_rountrip(x, sum) + + x <- rep(NA_real_, 3) + expect_warning( + expect_altrep_rountrip(x, min, na.rm = TRUE), + "no non-missing arguments to min" + ) + expect_warning( + expect_altrep_rountrip(x, max, na.rm = TRUE), + "no non-missing arguments to max" + ) + expect_altrep_rountrip(x, sum, na.rm = TRUE) + + expect_altrep_rountrip(x, min) + expect_altrep_rountrip(x, max) + expect_altrep_rountrip(x, sum) +}) + +test_that("altrep min/max/sum identical to R versions for int", { + x <- c(1L, 2L, 3L) + expect_altrep_rountrip(x, min, na.rm = TRUE) + expect_altrep_rountrip(x, max, na.rm = TRUE) + expect_altrep_rountrip(x, sum, na.rm = TRUE) + + expect_altrep_rountrip(x, min) + expect_altrep_rountrip(x, max) + expect_altrep_rountrip(x, sum) + + x <- c(1L, 2L, NA_integer_) + expect_altrep_rountrip(x, min, na.rm = TRUE) + expect_altrep_rountrip(x, max, na.rm = TRUE) + expect_altrep_rountrip(x, sum, na.rm = TRUE) + + expect_altrep_rountrip(x, min) + expect_altrep_rountrip(x, max) + expect_altrep_rountrip(x, sum) + + x <- rep(NA_integer_, 3) + expect_warning( + expect_altrep_rountrip(x, min, na.rm = TRUE), + "no non-missing arguments to min" + ) + expect_warning( + expect_altrep_rountrip(x, max, na.rm = TRUE), + "no non-missing arguments to max" + ) + expect_altrep_rountrip(x, sum, na.rm = TRUE) + + expect_altrep_rountrip(x, min) + expect_altrep_rountrip(x, max) + expect_altrep_rountrip(x, sum) + + # sum(x) is INT_MIN -> convert to double. + x <- as.integer(c(-2^31 + 1L, -1L)) + expect_altrep_rountrip(x, sum) +}) + +test_that("altrep vectors handle serialization", { + ints <- c(1L, 2L, NA_integer_) + dbls <- c(1, 2, NA_real_) + strs <- c("un", "deux", NA_character_) + + expect_identical(ints, unserialize(serialize(Array$create(ints)$as_vector(), NULL))) + expect_identical(dbls, unserialize(serialize(Array$create(dbls)$as_vector(), NULL))) + expect_identical(strs, unserialize(serialize(Array$create(strs)$as_vector(), NULL))) + expect_identical(strs, unserialize(serialize(Array$create(strs, large_utf8())$as_vector(), NULL))) +}) + +test_that("altrep vectors handle coercion", { + ints <- c(1L, 2L, NA_integer_) + dbls <- c(1, 2, NA_real_) + strs <- c("1", "2", NA_character_) + + expect_identical(ints, as.integer(Array$create(dbls)$as_vector())) + expect_identical(ints, as.integer(Array$create(strs)$as_vector())) + + expect_identical(dbls, as.numeric(Array$create(ints)$as_vector())) + expect_identical(dbls, as.numeric(Array$create(strs)$as_vector())) + + expect_identical(strs, as.character(Array$create(ints)$as_vector())) + expect_identical(strs, as.character(Array$create(dbls)$as_vector())) +}) + +test_that("columns of struct types may be altrep", { + st <- Array$create(data.frame(x = 1:10, y = runif(10))) + df <- st$as_vector() + + expect_true(is_arrow_altrep(df$x)) + expect_true(is_arrow_altrep(df$y)) +}) + +test_that("Conversion from altrep R vector to Array uses the existing Array", { + a_int <- Array$create(c(1L, 2L, 3L)) + b_int <- Array$create(a_int$as_vector()) + expect_true(test_same_Array(a_int$pointer(), b_int$pointer())) + + a_dbl <- Array$create(c(1, 2, 3)) + b_dbl <- Array$create(a_dbl$as_vector()) + expect_true(test_same_Array(a_dbl$pointer(), b_dbl$pointer())) + + a_str <- Array$create(c("un", "deux", "trois")) + b_str <- Array$create(a_str$as_vector()) + expect_true(test_same_Array(a_str$pointer(), b_str$pointer())) +}) diff --git a/src/arrow/r/tests/testthat/test-array-data.R b/src/arrow/r/tests/testthat/test-array-data.R new file mode 100644 index 000000000..05d070d8a --- /dev/null +++ b/src/arrow/r/tests/testthat/test-array-data.R @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_that("string vectors with only empty strings and nulls don't allocate a data buffer (ARROW-3693)", { + a <- Array$create("") + expect_equal(a$length(), 1L) + + buffers <- a$data()$buffers + + # No nulls + expect_equal(buffers[[1]], NULL) + + # Offsets has 2 elements + expect_equal(buffers[[2]]$size, 8L) + + # As per ARROW-2744, values buffer should preferably be non-null. + expect_equal(buffers[[3]]$size, 0L) + expect_equal(buffers[[3]]$capacity, 0L) +}) diff --git a/src/arrow/r/tests/testthat/test-arrow-info.R b/src/arrow/r/tests/testthat/test-arrow-info.R new file mode 100644 index 000000000..9eac60814 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-arrow-info.R @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_that("arrow_info()", { + expect_s3_class(arrow_info(), "arrow_info") + expect_output(print(arrow_info()), "Arrow package version") + options(arrow.foo = FALSE) + expect_output(print(arrow_info()), "arrow.foo") +}) diff --git a/src/arrow/r/tests/testthat/test-arrow.R b/src/arrow/r/tests/testthat/test-arrow.R new file mode 100644 index 000000000..48970ab89 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-arrow.R @@ -0,0 +1,78 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +if (!identical(tolower(Sys.getenv("TEST_R_WITHOUT_LIBARROW")), "true")) { + testthat::test_that("Arrow C++ is available", { + skip_on_cran() + expect_true(arrow_available()) + }) +} + +test_that("Can't $new() an object with anything other than a pointer", { + expect_error( + Array$new(1:5), + "Array$new() requires a pointer as input: did you mean $create() instead?", + fixed = TRUE + ) +}) + +r_only({ + test_that("assert_is", { + x <- 42 + expect_true(assert_is(x, "numeric")) + expect_true(assert_is(x, c("numeric", "character"))) + expect_error(assert_is(x, "factor"), 'x must be a "factor"') + expect_error( + assert_is(x, c("factor", "list")), + 'x must be a "factor" or "list"' + ) + expect_error( + assert_is(x, c("factor", "character", "list")), + 'x must be a "factor", "character", or "list"' + ) + }) +}) + +test_that("arrow gracefully fails to load objects from other sessions (ARROW-10071)", { + a <- Array$create(1:10) + tf <- tempfile() + on.exit(unlink(tf)) + saveRDS(a, tf) + + b <- readRDS(tf) + expect_error(b$length(), "Invalid <Array>") +}) + +test_that("check for an ArrowObject in functions use std::shared_ptr", { + expect_error(Array__length(1), "Invalid R object") +}) + +test_that("MemoryPool calls gc() to free memory when allocation fails (ARROW-10080)", { + # There is a valgrind error on this test because there cannot be memory allocated + # which is exactly what this test is checking, but we quiet this + skip_on_valgrind() + + env <- new.env() + suppressMessages(trace(gc, print = FALSE, tracer = function() { + env$gc_was_called <- TRUE + })) + on.exit(suppressMessages(untrace(gc))) + # We expect this should fail because we don't have this much memory, + # but it should gc() and retry (and fail again) + expect_error(BufferOutputStream$create(2**60)) + expect_true(env$gc_was_called) +}) diff --git a/src/arrow/r/tests/testthat/test-backwards-compatibility.R b/src/arrow/r/tests/testthat/test-backwards-compatibility.R new file mode 100644 index 000000000..32e86d5f6 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-backwards-compatibility.R @@ -0,0 +1,121 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# nolint start +# To write a new version of a test file for a current version: +# write_parquet(example_with_metadata, test_path("golden-files/data-arrow_2.0.0.parquet")) + +# To write a new version of a test file for an old version, use docker(-compose) +# to setup a linux distribution and use RStudio's public package manager binary +# repo to install the old version. The following commands should be run at the +# root of the arrow repo directory and might need slight adjusments. +# R_ORG=rstudio R_IMAGE=r-base R_TAG=4.0-focal docker-compose build --no-cache r +# R_ORG=rstudio R_IMAGE=r-base R_TAG=4.0-focal docker-compose run r /bin/bash +# R +# options(repos = "https://packagemanager.rstudio.com/all/__linux__/focal/latest") +# remotes::install_version("arrow", version = "1.0.1") +# # get example data into the global env +# write_parquet(example_with_metadata, "arrow/r/tests/testthat/golden-files/data-arrow_1.0.1.parquet") +# quit()/exit +# nolint end + +skip_if(getRversion() < "3.5.0", "The serialization format changed in 3.5") + +expect_identical_with_metadata <- function(object, expected, ..., top_level = TRUE) { + attrs_to_keep <- c("names", "class", "row.names") + if (!top_level) { + # remove not-tbl and not-data.frame attributes + for (attribute in names(attributes(expected))) { + if (attribute %in% attrs_to_keep) next + attributes(expected)[[attribute]] <- NULL + } + } + expect_identical(object, expected, ...) +} + +test_that("reading a known Parquet file to dataframe with 3.0.0", { + skip_if_not_available("parquet") + skip_if_not_available("snappy") + pq_file <- test_path("golden-files/data-arrow-extra-meta_3.0.0.parquet") + + df <- read_parquet(pq_file) + # this is equivalent to `expect_identical()` + expect_identical_with_metadata(df, example_with_extra_metadata) +}) + +test_that("reading a known Parquet file to dataframe with 2.0.0", { + skip_if_not_available("parquet") + skip_if_not_available("snappy") + pq_file <- test_path("golden-files/data-arrow_2.0.0.parquet") + + df <- read_parquet(pq_file) + # this is equivalent to `expect_identical()` + expect_identical_with_metadata(df, example_with_metadata) +}) + +test_that("reading a known Parquet file to dataframe with 1.0.1", { + skip_if_not_available("parquet") + skip_if_not_available("snappy") + pq_file <- test_path("golden-files/data-arrow_1.0.1.parquet") + + df <- read_parquet(pq_file) + # 1.0.1 didn't save top-level metadata, so we need to remove it. + expect_identical_with_metadata(df, example_with_metadata, top_level = FALSE) +}) + +for (comp in c("lz4", "uncompressed", "zstd")) { + # nolint start + # write_feather(example_with_metadata, test_path("golden-files/data-arrow_2.0.0_lz4.feather"), compression = "lz4") + # write_feather(example_with_metadata, test_path("golden-files/data-arrow_2.0.0_uncompressed.feather"), compression = "uncompressed") + # write_feather(example_with_metadata, test_path("golden-files/data-arrow_2.0.0_zstd.feather"), compression = "zstd") + # nolint end + test_that("reading a known Feather file to dataframe with 2.0.0", { + skip_if_not_available("parquet") + skip_if_not_available(comp) + feather_file <- test_path(paste0("golden-files/data-arrow_2.0.0_", comp, ".feather")) + + df <- read_feather(feather_file) + expect_identical_with_metadata(df, example_with_metadata) + }) + + test_that("reading a known Feather file to dataframe with 1.0.1", { + skip_if_not_available("parquet") + skip_if_not_available(comp) + feather_file <- test_path(paste0("golden-files/data-arrow_1.0.1_", comp, ".feather")) + + df <- read_feather(feather_file) + # 1.0.1 didn't save top-level metadata, so we need to remove it. + expect_identical_with_metadata(df, example_with_metadata, top_level = FALSE) + }) + + test_that("reading a known Feather file to dataframe with 0.17.0", { + skip_if_not_available("parquet") + skip_if_not_available(comp) + feather_file <- test_path(paste0("golden-files/data-arrow_0.17.0_", comp, ".feather")) + + df <- read_feather(feather_file) + # the metadata from 0.17.0 doesn't have the top level, the special class is + # not maintained and the embedded tibble's attributes are read in a wrong + # order. Since this is prior to 1.0.0 punting on checking the attributes + # though classes are always checked, so that must be removed before checking. + example_with_metadata_sans_special_class <- example_with_metadata + example_with_metadata_sans_special_class$a <- unclass(example_with_metadata_sans_special_class$a) + expect_equal(df, example_with_metadata_sans_special_class, ignore_attr = TRUE) + }) +} + +# TODO: streams(?) diff --git a/src/arrow/r/tests/testthat/test-buffer-reader.R b/src/arrow/r/tests/testthat/test-buffer-reader.R new file mode 100644 index 000000000..b790ed0da --- /dev/null +++ b/src/arrow/r/tests/testthat/test-buffer-reader.R @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_that("BufferReader can be created from R objects", { + num <- BufferReader$create(numeric(13)) + int <- BufferReader$create(integer(13)) + raw <- BufferReader$create(raw(16)) + + expect_r6_class(num, "BufferReader") + expect_r6_class(int, "BufferReader") + expect_r6_class(raw, "BufferReader") + + expect_equal(num$GetSize(), 13 * 8) + expect_equal(int$GetSize(), 13 * 4) + expect_equal(raw$GetSize(), 16) +}) + +test_that("BufferReader can be created from Buffer", { + buf <- buffer(raw(76)) + reader <- BufferReader$create(buf) + + expect_r6_class(reader, "BufferReader") + expect_equal(reader$GetSize(), 76) +}) diff --git a/src/arrow/r/tests/testthat/test-buffer.R b/src/arrow/r/tests/testthat/test-buffer.R new file mode 100644 index 000000000..9b3ebc6de --- /dev/null +++ b/src/arrow/r/tests/testthat/test-buffer.R @@ -0,0 +1,97 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_that("Buffer can be created from raw vector", { + vec <- raw(123) + buf <- buffer(vec) + expect_r6_class(buf, "Buffer") + expect_equal(buf$size, 123) +}) + +test_that("Buffer can be created from integer vector", { + vec <- integer(17) + buf <- buffer(vec) + expect_r6_class(buf, "Buffer") + expect_equal(buf$size, 17 * 4) +}) + +test_that("Buffer can be created from numeric vector", { + vec <- numeric(17) + buf <- buffer(vec) + expect_r6_class(buf, "Buffer") + expect_equal(buf$size, 17 * 8) +}) + +test_that("Buffer can be created from complex vector", { + vec <- complex(3) + buf <- buffer(vec) + expect_r6_class(buf, "Buffer") + expect_equal(buf$size, 3 * 16) +}) + +test_that("buffer buffer buffers buffers", { + expect_r6_class(buffer(buffer(42)), "Buffer") +}) + +test_that("Other types can't be converted to Buffers", { + expect_error( + buffer(data.frame(a = "asdf")), + "Cannot convert object of class data.frame to arrow::Buffer" + ) +}) + +test_that("can convert Buffer to raw", { + buf <- buffer(rnorm(10)) + expect_equal(buf$data(), as.raw(buf)) +}) + +test_that("can read remaining bytes of a RandomAccessFile", { + tbl <- tibble::tibble( + int = 1:10, dbl = as.numeric(1:10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + chr = letters[1:10] + ) + tab <- Table$create(!!!tbl) + + tf <- tempfile() + all_bytes <- write_feather(tab, tf) + + file <- ReadableFile$create(tf) + expect_equal(file$tell(), 0) + x <- file$Read(20)$data() + expect_equal(file$tell(), 20) + y <- file$Read()$data() + + file <- ReadableFile$create(tf) + z <- file$Read()$data() + + file <- ReadableFile$create(tf) + a <- file$ReadAt(20)$data() + + expect_equal(file$GetSize(), length(x) + length(y)) + expect_equal(z, c(x, y)) + expect_equal(a, y) +}) + +test_that("Buffer$Equals", { + vec <- integer(17) + buf1 <- buffer(vec) + buf2 <- buffer(vec) + expect_equal(buf1, buf2) + expect_true(buf1$Equals(buf2)) + expect_false(buf1$Equals(vec)) +}) diff --git a/src/arrow/r/tests/testthat/test-chunked-array.R b/src/arrow/r/tests/testthat/test-chunked-array.R new file mode 100644 index 000000000..c931ddec5 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-chunked-array.R @@ -0,0 +1,468 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +expect_chunked_roundtrip <- function(x, type) { + a <- ChunkedArray$create(!!!x) + flat_x <- unlist(x, recursive = FALSE) + attributes(flat_x) <- attributes(x[[1]]) + expect_type_equal(a$type, type) + expect_identical(a$num_chunks, length(x)) + expect_identical(length(a), length(flat_x)) + if (!inherits(type, "ListType")) { + # TODO: revisit how missingness works with ListArrays + # R list objects don't handle missingness the same way as other vectors. + # Is there some vctrs thing we should do on the roundtrip back to R? + expect_identical(as.vector(is.na(a)), is.na(flat_x)) + } + expect_as_vector(a, flat_x) + expect_as_vector(a$chunk(0), x[[1]]) + + if (length(flat_x)) { + a_sliced <- a$Slice(1) + x_sliced <- flat_x[-1] + expect_type_equal(a_sliced$type, type) + expect_identical(length(a_sliced), length(x_sliced)) + if (!inherits(type, "ListType")) { + expect_identical(as.vector(is.na(a_sliced)), is.na(x_sliced)) + } + expect_as_vector(a_sliced, x_sliced) + } + invisible(a) +} + +test_that("ChunkedArray", { + x <- expect_chunked_roundtrip(list(1:10, 1:10, 1:5), int32()) + + y <- x$Slice(8) + expect_equal(y$type, int32()) + expect_equal(y$num_chunks, 3L) + expect_equal(length(y), 17L) + expect_as_vector(y, c(9:10, 1:10, 1:5)) + + z <- x$Slice(8, 5) + expect_equal(z$type, int32()) + expect_equal(z$num_chunks, 2L) + expect_equal(z$length(), 5L) + expect_equal(z$as_vector(), c(9:10, 1:3)) + + expect_chunked_roundtrip(list(c(1, 2, 3), c(4, 5, 6)), float64()) + + # input validation + expect_error(x$chunk(14), "subscript out of bounds") + expect_error(x$chunk("one")) + expect_error(x$chunk(NA_integer_), "'i' cannot be NA") + expect_error(x$chunk(-1), "subscript out of bounds") + + expect_error(x$Slice("ten")) + expect_error(x$Slice(NA_integer_), "Slice 'offset' cannot be NA") + expect_error(x$Slice(NA), "Slice 'offset' cannot be NA") + expect_error(x$Slice(10, "ten")) + expect_error(x$Slice(10, NA_integer_), "Slice 'length' cannot be NA") + expect_error(x$Slice(NA_integer_, NA_integer_), "Slice 'offset' cannot be NA") + expect_error(x$Slice(c(10, 10))) + expect_error(x$Slice(10, c(10, 10))) + expect_error(x$Slice(1000), "Slice 'offset' greater than array length") + expect_error(x$Slice(-1), "Slice 'offset' cannot be negative") + expect_error(z$Slice(10, 10), "Slice 'offset' greater than array length") + expect_error(x$Slice(10, -1), "Slice 'length' cannot be negative") + expect_error(x$Slice(-1, 10), "Slice 'offset' cannot be negative") + + expect_warning(x$Slice(10, 15), NA) + expect_warning( + overslice <- x$Slice(10, 16), + "Slice 'length' greater than available length" + ) + expect_equal(length(overslice), 15) + expect_warning(z$Slice(2, 10), "Slice 'length' greater than available length") +}) + +test_that("print ChunkedArray", { + verify_output(test_path("test-chunked-array.txt"), { + chunked_array(c(1, 2, 3), c(4, 5, 6)) + chunked_array(1:30, c(4, 5, 6)) + chunked_array(1:30) + chunked_array(factor(c("a", "b")), factor(c("c", "d"))) + }) +}) + +test_that("ChunkedArray handles !!! splicing", { + data <- list(1, 2, 3) + x <- chunked_array(!!!data) + expect_equal(x$type, float64()) + expect_equal(x$num_chunks, 3L) +}) + +test_that("ChunkedArray handles Inf", { + data <- list(c(Inf, 2:10), c(1:3, Inf, 5L), 1:10) + x <- chunked_array(!!!data) + expect_equal(x$type, float64()) + expect_equal(x$num_chunks, 3L) + expect_equal(length(x), 25L) + expect_as_vector(x, c(c(Inf, 2:10), c(1:3, Inf, 5), 1:10)) + + chunks <- x$chunks + expect_as_vector(is.infinite(chunks[[2]]), is.infinite(data[[2]])) + expect_equal( + as.vector(is.infinite(x)), + c(is.infinite(data[[1]]), is.infinite(data[[2]]), is.infinite(data[[3]])) + ) +}) + +test_that("ChunkedArray handles NA", { + data <- list(1:10, c(NA, 2:10), c(1:3, NA, 5L)) + x <- chunked_array(!!!data) + expect_equal(x$type, int32()) + expect_equal(x$num_chunks, 3L) + expect_equal(length(x), 25L) + expect_as_vector(x, c(1:10, c(NA, 2:10), c(1:3, NA, 5))) + + chunks <- x$chunks + expect_as_vector(is.na(chunks[[2]]), is.na(data[[2]])) + expect_as_vector(is.na(x), c(is.na(data[[1]]), is.na(data[[2]]), is.na(data[[3]]))) +}) + +test_that("ChunkedArray handles NaN", { + data <- list(as.numeric(1:10), c(NaN, 2:10), c(1:3, NaN, 5L)) + x <- chunked_array(!!!data) + + expect_equal(x$type, float64()) + expect_equal(x$num_chunks, 3L) + expect_equal(length(x), 25L) + expect_as_vector(x, c(1:10, c(NaN, 2:10), c(1:3, NaN, 5))) + + chunks <- x$chunks + expect_as_vector(is.nan(chunks[[2]]), is.nan(data[[2]])) + expect_as_vector(is.nan(x), c(is.nan(data[[1]]), is.nan(data[[2]]), is.nan(data[[3]]))) +}) + +test_that("ChunkedArray supports logical vectors (ARROW-3341)", { + # with NA + data <- purrr::rerun(3, sample(c(TRUE, FALSE, NA), 100, replace = TRUE)) + expect_chunked_roundtrip(data, bool()) + # without NA + data <- purrr::rerun(3, sample(c(TRUE, FALSE), 100, replace = TRUE)) + expect_chunked_roundtrip(data, bool()) +}) + +test_that("ChunkedArray supports character vectors (ARROW-3339)", { + data <- list( + c("itsy", NA, "spider"), + c("Climbed", "up", "the", "water", "spout"), + c("Down", "came", "the", "rain"), + "And washed the spider out. " + ) + expect_chunked_roundtrip(data, utf8()) +}) + +test_that("ChunkedArray supports factors (ARROW-3716)", { + f <- factor(c("itsy", "bitsy", "spider", "spider")) + expect_chunked_roundtrip(list(f, f, f), dictionary(int8())) +}) + +test_that("ChunkedArray supports dates (ARROW-3716)", { + d <- Sys.Date() + 1:10 + expect_chunked_roundtrip(list(d, d), date32()) +}) + +test_that("ChunkedArray supports POSIXct (ARROW-3716)", { + times <- lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10 + expect_chunked_roundtrip(list(times, times), timestamp("us", "UTC")) +}) + +test_that("ChunkedArray supports integer64 (ARROW-3716)", { + x <- bit64::as.integer64(1:10) + MAX_INT + expect_chunked_roundtrip(list(x, x), int64()) + # Also with a first chunk that would downcast + zero <- Array$create(0L)$cast(int64()) + expect_type_equal(zero, int64()) + ca <- ChunkedArray$create(zero, x) + expect_type_equal(ca, int64()) + expect_s3_class(as.vector(ca), "integer64") + expect_identical(as.vector(ca), c(bit64::as.integer64(0L), x)) +}) + +test_that("ChunkedArray supports difftime", { + time <- hms::hms(56, 34, 12) + expect_chunked_roundtrip(list(time, time), time32("s")) +}) + +test_that("ChunkedArray supports empty arrays (ARROW-13761)", { + types <- c( + int8(), int16(), int32(), int64(), uint8(), uint16(), uint32(), + uint64(), float32(), float64(), timestamp("ns"), binary(), + large_binary(), fixed_size_binary(32), date32(), date64(), + decimal(4, 2), dictionary(), struct(x = int32()) + ) + + empty_filter <- ChunkedArray$create(type = bool()) + for (type in types) { + one_empty_chunk <- ChunkedArray$create(type = type) + expect_type_equal(one_empty_chunk$type, type) + if (type != struct(x = int32())) { + expect_identical(length(one_empty_chunk), length(as.vector(one_empty_chunk))) + } else { + # struct -> tbl and length(tbl) is num_columns instead of num_rows + expect_identical(length(as.vector(one_empty_chunk)), 1L) + } + zero_empty_chunks <- one_empty_chunk$Filter(empty_filter) + expect_equal(zero_empty_chunks$num_chunks, 0) + expect_type_equal(zero_empty_chunks$type, type) + if (type != struct(x = int32())) { + expect_identical(length(zero_empty_chunks), length(as.vector(zero_empty_chunks))) + } else { + expect_identical(length(as.vector(zero_empty_chunks)), 1L) + } + } +}) + +test_that("integer types casts for ChunkedArray (ARROW-3741)", { + int_types <- c(int8(), int16(), int32(), int64()) + uint_types <- c(uint8(), uint16(), uint32(), uint64()) + float_types <- c(float32(), float64()) # float16() not really supported in C++ yet + all_types <- c( + int_types, + uint_types, + float_types + ) + + a <- chunked_array(1:10, 1:10) + for (type in c(int_types, uint_types)) { + casted <- a$cast(type) + expect_r6_class(casted, "ChunkedArray") + expect_type_equal(casted$type, type) + } + # Also test casting to double(), not actually a type, a base R function but should be alias for float64 + dbl <- a$cast(double()) + expect_r6_class(dbl, "ChunkedArray") + expect_type_equal(dbl$type, float64()) +}) + +test_that("chunked_array() supports the type= argument. conversion from INTSXP and int64 to all int types", { + num_int32 <- 12L + num_int64 <- bit64::as.integer64(10) + for (type in all_types) { + expect_type_equal(chunked_array(num_int32, type = type)$type, type) + expect_type_equal(chunked_array(num_int64, type = type)$type, type) + } + # also test creating with double() "type" + expect_type_equal(chunked_array(num_int32, type = double())$type, float64()) +}) + +test_that("ChunkedArray$create() aborts on overflow", { + expect_error(chunked_array(128L, type = int8())$type) + expect_error(chunked_array(-129L, type = int8())$type) + + expect_error(chunked_array(256L, type = uint8())$type) + expect_error(chunked_array(-1L, type = uint8())$type) + + expect_error(chunked_array(32768L, type = int16())$type) + expect_error(chunked_array(-32769L, type = int16())$type) + + expect_error(chunked_array(65536L, type = uint16())$type) + expect_error(chunked_array(-1L, type = uint16())$type) + + expect_error(chunked_array(65536L, type = uint16())$type) + expect_error(chunked_array(-1L, type = uint16())$type) + + expect_error(chunked_array(bit64::as.integer64(2^31), type = int32())) + expect_error(chunked_array(bit64::as.integer64(2^32), type = uint32())) +}) + +test_that("chunked_array() convert doubles to integers", { + for (type in c(int_types, uint_types)) { + a <- chunked_array(10, type = type) + expect_type_equal(a$type, type) + if (type != uint64()) { + # exception for unsigned integer 64 that + # wa cannot handle yet + expect_true(as.vector(a) == 10) + } + } +}) + +test_that("chunked_array() uses the first ... to infer type", { + a <- chunked_array(10, 10L) + expect_type_equal(a$type, float64()) +}) + +test_that("chunked_array() handles downcasting", { + a <- chunked_array(10L, 10) + expect_type_equal(a$type, int32()) + expect_as_vector(a, c(10L, 10L)) +}) + +test_that("chunked_array() makes chunks of the same type", { + a <- chunked_array(10L, bit64::as.integer64(13), type = int64()) + for (chunk in a$chunks) { + expect_type_equal(chunk$type, int64()) + } +}) + +test_that("chunked_array() handles 0 chunks if given a type", { + for (type in all_types) { + a <- chunked_array(type = type) + expect_type_equal(a$type, as_type(type)) + expect_equal(length(a), 0L) + } +}) + +test_that("chunked_array() can ingest arrays (ARROW-3815)", { + expect_equal( + as.vector(chunked_array(1:5, Array$create(6:10))), + 1:10 + ) +}) + +test_that("chunked_array() handles data frame -> struct arrays (ARROW-3811)", { + df <- tibble::tibble(x = 1:10, y = x / 2, z = letters[1:10]) + a <- chunked_array(df, df) + expect_type_equal(a$type, struct(x = int32(), y = float64(), z = utf8())) + expect_equal(a$as_vector(), rbind(df, df), ignore_attr = TRUE) +}) + +test_that("ChunkedArray$View() (ARROW-6542)", { + a <- ChunkedArray$create(1:3, 1:4) + b <- a$View(float32()) + expect_equal(b$type, float32()) + expect_equal(length(b), 7L) + expect_true(all( + sapply(b$chunks, function(.x) .x$type == float32()) + )) + # Input validation + expect_error(a$View("not a type"), "type must be a DataType, not character") +}) + +test_that("ChunkedArray$Validate()", { + a <- ChunkedArray$create(1:10) + expect_error(a$Validate(), NA) +}) + +test_that("[ ChunkedArray", { + one_chunk <- chunked_array(2:11) + x <- chunked_array(1:10, 31:40, 51:55) + # Slice + expect_as_vector(x[8:12], c(8:10, 31:32)) + # Take from same chunk + expect_as_vector(x[c(11, 15, 12)], c(31, 35, 32)) + # Take from multiple chunks (calls Concatenate) + expect_as_vector(x[c(2, 11, 15, 12, 3)], c(2, 31, 35, 32, 3)) + # Take with Array (note these are 0-based) + take1 <- Array$create(c(10L, 14L, 11L)) + expect_as_vector(x[take1], c(31, 35, 32)) + # Take with ChunkedArray + take2 <- ChunkedArray$create(c(10L, 14L), 11L) + expect_as_vector(x[take2], c(31, 35, 32)) + + # Filter (with recycling) + expect_as_vector( + one_chunk[c(FALSE, TRUE, FALSE, FALSE, TRUE)], + c(3, 6, 8, 11) + ) + # Filter where both are 1-chunk + expect_as_vector( + one_chunk[ChunkedArray$create(rep(c(FALSE, TRUE, FALSE, FALSE, TRUE), 2))], + c(3, 6, 8, 11) + ) + # Filter multi-chunk with logical (-> Array) + expect_as_vector( + x[c(FALSE, TRUE, FALSE, FALSE, TRUE)], + c(2, 5, 7, 10, 32, 35, 37, 40, 52, 55) + ) + # Filter with a chunked array with different sized chunks + p1 <- c(FALSE, TRUE, FALSE, FALSE, TRUE) + p2 <- c(TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE) + filt <- ChunkedArray$create(p1, p2, p2) + expect_as_vector( + x[filt], + c(2, 5, 6, 8, 9, 35, 36, 38, 39, 55) + ) +}) + +test_that("ChunkedArray head/tail", { + vec <- 11:20 + a <- ChunkedArray$create(11:15, 16:20) + expect_as_vector(head(a), head(vec)) + expect_as_vector(head(a, 4), head(vec, 4)) + expect_as_vector(head(a, 40), head(vec, 40)) + expect_as_vector(head(a, -4), head(vec, -4)) + expect_as_vector(head(a, -40), head(vec, -40)) + expect_as_vector(tail(a), tail(vec)) + expect_as_vector(tail(a, 4), tail(vec, 4)) + expect_as_vector(tail(a, 40), tail(vec, 40)) + expect_as_vector(tail(a, -40), tail(vec, -40)) +}) + +test_that("ChunkedArray$Equals", { + vec <- 11:20 + a <- ChunkedArray$create(vec[1:5], vec[6:10]) + b <- ChunkedArray$create(vec[1:5], vec[6:10]) + expect_equal(a, b) + expect_true(a$Equals(b)) + expect_false(a$Equals(vec)) +}) + +test_that("Converting a chunked array unifies factors (ARROW-8374)", { + f1 <- factor(c("a"), levels = c("a", "b")) + f2 <- factor(c("c"), levels = c("c", "d")) + f3 <- factor(NA, levels = "a") + f4 <- factor() + + res <- factor(c("a", "c", NA), levels = c("a", "b", "c", "d")) + ca <- ChunkedArray$create(f1, f2, f3, f4) + + expect_identical(ca$as_vector(), res) +}) + +test_that("Handling string data with embedded nuls", { + raws <- structure(list( + as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)), + as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)), + as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00 + as.raw(c(0x66, 0x00, 0x00, 0x61, 0x00, 0x6e)), # multiple nuls + as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)), + as.raw(c(0x74, 0x76)) + ), + class = c("arrow_binary", "vctrs_vctr", "list") + ) + chunked_array_with_nul <- ChunkedArray$create(raws)$cast(utf8()) + + # The behavior of the warnings/errors is slightly different with and without + # altrep. Without it (i.e. 3.5.0 and below, the error would trigger immediately + # on `as.vector()` where as with it, the error only happens on materialization) + skip_if_r_version("3.5.0") + + v <- expect_error(as.vector(chunked_array_with_nul), NA) + + expect_error( + v[], + paste0( + "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, ", + "set options(arrow.skip_nul = TRUE)" + ), + fixed = TRUE + ) + + withr::with_options(list(arrow.skip_nul = TRUE), { + v <- expect_warning(as.vector(chunked_array_with_nul), NA) + expect_warning( + expect_identical(v[3], "man"), + "Stripping '\\0' (nul) from character vector", + fixed = TRUE + ) + }) +}) diff --git a/src/arrow/r/tests/testthat/test-chunked-array.txt b/src/arrow/r/tests/testthat/test-chunked-array.txt new file mode 100644 index 000000000..c7101359d --- /dev/null +++ b/src/arrow/r/tests/testthat/test-chunked-array.txt @@ -0,0 +1,103 @@ +> chunked_array(c(1, 2, 3), c(4, 5, 6)) +ChunkedArray +[ + [ + 1, + 2, + 3 + ], + [ + 4, + 5, + 6 + ] +] + +> chunked_array(1:30, c(4, 5, 6)) +ChunkedArray +[ + [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + ... + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30 + ], + [ + 4, + 5, + 6 + ] +] + +> chunked_array(1:30) +ChunkedArray +[ + [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + ... + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30 + ] +] + +> chunked_array(factor(c("a", "b")), factor(c("c", "d"))) +ChunkedArray +[ + + -- dictionary: + [ + "a", + "b" + ] + -- indices: + [ + 0, + 1 + ], + + -- dictionary: + [ + "c", + "d" + ] + -- indices: + [ + 0, + 1 + ] +] + diff --git a/src/arrow/r/tests/testthat/test-compressed.R b/src/arrow/r/tests/testthat/test-compressed.R new file mode 100644 index 000000000..d796e3e75 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-compressed.R @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_that("codec_is_available", { + expect_true(codec_is_available("uncompressed")) # Always true + expect_match_arg_error(codec_is_available("sdfasdf")) + skip_if_not_available("gzip") + expect_true(codec_is_available("gzip")) + expect_true(codec_is_available("GZIP")) +}) + +if (identical(Sys.getenv("APPVEYOR"), "True")) { + test_that("Compression codecs are included in the Windows build", { + expect_true(codec_is_available("lz4")) + expect_true(codec_is_available("zstd")) + }) +} + +test_that("Codec attributes", { + skip_if_not_available("gzip") + cod <- Codec$create("gzip") + expect_equal(cod$name, "gzip") + # TODO: implement $level + expect_error(cod$level) +}) + +test_that("can write Buffer to CompressedOutputStream and read back in CompressedInputStream", { + skip_if_not_available("gzip") + buf <- buffer(as.raw(sample(0:255, size = 1024, replace = TRUE))) + + tf1 <- tempfile() + stream1 <- CompressedOutputStream$create(tf1) + expect_equal(stream1$tell(), 0) + stream1$write(buf) + expect_equal(stream1$tell(), buf$size) + stream1$close() + + tf2 <- tempfile() + sink2 <- FileOutputStream$create(tf2) + stream2 <- CompressedOutputStream$create(sink2) + expect_equal(stream2$tell(), 0) + stream2$write(buf) + expect_equal(stream2$tell(), buf$size) + stream2$close() + sink2$close() + + input1 <- CompressedInputStream$create(tf1) + buf1 <- input1$Read(1024L) + + file2 <- ReadableFile$create(tf2) + input2 <- CompressedInputStream$create(file2) + buf2 <- input2$Read(1024L) + + expect_equal(buf, buf1) + expect_equal(buf, buf2) + + unlink(tf1) + unlink(tf2) +}) diff --git a/src/arrow/r/tests/testthat/test-compute-aggregate.R b/src/arrow/r/tests/testthat/test-compute-aggregate.R new file mode 100644 index 000000000..018279d4b --- /dev/null +++ b/src/arrow/r/tests/testthat/test-compute-aggregate.R @@ -0,0 +1,434 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_that("list_compute_functions", { + allfuncs <- list_compute_functions() + expect_false(all(grepl("min", allfuncs))) + justmins <- list_compute_functions("^min") + expect_true(length(justmins) > 0) + expect_true(all(grepl("min", justmins))) + no_hash_funcs <- list_compute_functions("^hash") + expect_true(length(no_hash_funcs) == 0) +}) + +test_that("sum.Array", { + ints <- 1:5 + a <- Array$create(ints) + expect_r6_class(sum(a), "Scalar") + expect_identical(as.integer(sum(a)), sum(ints)) + + floats <- c(1.3, 2.4, 3) + f <- Array$create(floats) + expect_identical(as.numeric(sum(f)), sum(floats)) + + floats <- c(floats, NA) + na <- Array$create(floats) + if (!grepl("devel", R.version.string)) { + # Valgrind on R-devel confuses NaN and NA_real_ + # https://r.789695.n4.nabble.com/Difference-in-NA-behavior-in-R-devel-running-under-valgrind-td4768731.html + expect_identical(as.numeric(sum(na)), sum(floats)) + } + expect_r6_class(sum(na, na.rm = TRUE), "Scalar") + expect_identical(as.numeric(sum(na, na.rm = TRUE)), sum(floats, na.rm = TRUE)) + + bools <- c(TRUE, NA, TRUE, FALSE) + b <- Array$create(bools) + expect_identical(as.integer(sum(b)), sum(bools)) + expect_identical(as.integer(sum(b, na.rm = TRUE)), sum(bools, na.rm = TRUE)) +}) + +test_that("sum.ChunkedArray", { + a <- ChunkedArray$create(1:4, c(1:4, NA), 1:5) + expect_r6_class(sum(a), "Scalar") + expect_true(is.na(as.vector(sum(a)))) + expect_identical(as.numeric(sum(a, na.rm = TRUE)), 35) +}) + +test_that("sum dots", { + a1 <- Array$create(1:4) + a2 <- ChunkedArray$create(1:4, c(1:4, NA), 1:5) + expect_identical(as.numeric(sum(a1, a2, na.rm = TRUE)), 45) +}) + +test_that("sum.Scalar", { + s <- Scalar$create(4) + expect_identical(as.numeric(s), as.numeric(sum(s))) +}) + +test_that("mean.Array", { + ints <- 1:4 + a <- Array$create(ints) + expect_r6_class(mean(a), "Scalar") + expect_identical(as.vector(mean(a)), mean(ints)) + + floats <- c(1.3, 2.4, 3) + f <- Array$create(floats) + expect_identical(as.vector(mean(f)), mean(floats)) + + floats <- c(floats, NA) + na <- Array$create(floats) + if (!grepl("devel", R.version.string)) { + # Valgrind on R-devel confuses NaN and NA_real_ + # https://r.789695.n4.nabble.com/Difference-in-NA-behavior-in-R-devel-running-under-valgrind-td4768731.html + expect_identical(as.vector(mean(na)), mean(floats)) + } + expect_r6_class(mean(na, na.rm = TRUE), "Scalar") + expect_identical(as.vector(mean(na, na.rm = TRUE)), mean(floats, na.rm = TRUE)) + + bools <- c(TRUE, NA, TRUE, FALSE) + b <- Array$create(bools) + expect_identical(as.vector(mean(b)), mean(bools)) + expect_identical(as.integer(sum(b, na.rm = TRUE)), sum(bools, na.rm = TRUE)) +}) + +test_that("mean.ChunkedArray", { + a <- ChunkedArray$create(1:4, c(1:4, NA), 1:5) + expect_r6_class(mean(a), "Scalar") + expect_true(is.na(as.vector(mean(a)))) + expect_identical(as.vector(mean(a, na.rm = TRUE)), 35 / 13) +}) + +test_that("mean.Scalar", { + s <- Scalar$create(4) + expect_equal(s, mean(s)) +}) + +test_that("Bad input handling of call_function", { + expect_error( + call_function("sum", 2, 3), + 'Argument 1 is of class numeric but it must be one of "Array", "ChunkedArray", "RecordBatch", "Table", or "Scalar"' + ) +}) + +test_that("min.Array", { + ints <- 1:4 + a <- Array$create(ints) + expect_r6_class(min(a), "Scalar") + expect_identical(as.vector(min(a)), min(ints)) + + floats <- c(1.3, 3, 2.4) + f <- Array$create(floats) + expect_identical(as.vector(min(f)), min(floats)) + + floats <- c(floats, NA) + na <- Array$create(floats) + expect_identical(as.vector(min(na)), min(floats)) + expect_r6_class(min(na, na.rm = TRUE), "Scalar") + expect_identical(as.vector(min(na, na.rm = TRUE)), min(floats, na.rm = TRUE)) + + bools <- c(TRUE, TRUE, FALSE) + b <- Array$create(bools) + # R is inconsistent here: typeof(min(NA)) == "integer", not "logical" + expect_identical(as.vector(min(b)), as.logical(min(bools))) +}) + +test_that("max.Array", { + ints <- 1:4 + a <- Array$create(ints) + expect_r6_class(max(a), "Scalar") + expect_identical(as.vector(max(a)), max(ints)) + + floats <- c(1.3, 3, 2.4) + f <- Array$create(floats) + expect_identical(as.vector(max(f)), max(floats)) + + floats <- c(floats, NA) + na <- Array$create(floats) + expect_identical(as.vector(max(na)), max(floats)) + expect_r6_class(max(na, na.rm = TRUE), "Scalar") + expect_identical(as.vector(max(na, na.rm = TRUE)), max(floats, na.rm = TRUE)) + + bools <- c(TRUE, TRUE, FALSE) + b <- Array$create(bools) + # R is inconsistent here: typeof(max(NA)) == "integer", not "logical" + expect_identical(as.vector(max(b)), as.logical(max(bools))) +}) + +test_that("min.ChunkedArray", { + ints <- 1:4 + a <- ChunkedArray$create(ints) + expect_r6_class(min(a), "Scalar") + expect_identical(as.vector(min(a)), min(ints)) + + floats <- c(1.3, 3, 2.4) + f <- ChunkedArray$create(floats) + expect_identical(as.vector(min(f)), min(floats)) + + floats <- c(floats, NA) + na <- ChunkedArray$create(floats) + expect_identical(as.vector(min(na)), min(floats)) + expect_r6_class(min(na, na.rm = TRUE), "Scalar") + expect_identical(as.vector(min(na, na.rm = TRUE)), min(floats, na.rm = TRUE)) + + bools <- c(TRUE, TRUE, FALSE) + b <- ChunkedArray$create(bools) + # R is inconsistent here: typeof(min(NA)) == "integer", not "logical" + expect_identical(as.vector(min(b)), as.logical(min(bools))) +}) + +test_that("max.ChunkedArray", { + ints <- 1:4 + a <- ChunkedArray$create(ints) + expect_r6_class(max(a), "Scalar") + expect_identical(as.vector(max(a)), max(ints)) + + floats <- c(1.3, 3, 2.4) + f <- ChunkedArray$create(floats) + expect_identical(as.vector(max(f)), max(floats)) + + floats <- c(floats, NA) + na <- ChunkedArray$create(floats) + expect_identical(as.vector(max(na)), max(floats)) + expect_r6_class(max(na, na.rm = TRUE), "Scalar") + expect_identical(as.vector(max(na, na.rm = TRUE)), max(floats, na.rm = TRUE)) + + bools <- c(TRUE, TRUE, FALSE) + b <- ChunkedArray$create(bools) + # R is inconsistent here: typeof(max(NA)) == "integer", not "logical" + expect_identical(as.vector(max(b)), as.logical(max(bools))) +}) + +test_that("Edge cases", { + a <- Array$create(NA) + for (type in c(int32(), float64(), bool())) { + expect_as_vector(sum(a$cast(type), na.rm = TRUE), sum(NA, na.rm = TRUE)) + expect_as_vector(mean(a$cast(type), na.rm = TRUE), mean(NA, na.rm = TRUE)) + expect_as_vector( + min(a$cast(type), na.rm = TRUE), + # Suppress the base R warning about no non-missing arguments + suppressWarnings(min(NA, na.rm = TRUE)) + ) + expect_as_vector( + max(a$cast(type), na.rm = TRUE), + suppressWarnings(max(NA, na.rm = TRUE)) + ) + } +}) + +test_that("quantile.Array and quantile.ChunkedArray", { + a <- Array$create(c(0, 1, 2, 3)) + ca <- ChunkedArray$create(c(0, 1), c(2, 3)) + probs <- c(0.49, 0.51) + for (ad in list(a, ca)) { + for (type in c(int32(), uint64(), float64())) { + expect_equal( + quantile(ad$cast(type), probs = probs, interpolation = "linear"), + Array$create(c(1.47, 1.53)) + ) + expect_equal( + quantile(ad$cast(type), probs = probs, interpolation = "lower"), + Array$create(c(1, 1))$cast(type) + ) + expect_equal( + quantile(ad$cast(type), probs = probs, interpolation = "higher"), + Array$create(c(2, 2))$cast(type) + ) + expect_equal( + quantile(ad$cast(type), probs = probs, interpolation = "nearest"), + Array$create(c(1, 2))$cast(type) + ) + expect_equal( + quantile(ad$cast(type), probs = probs, interpolation = "midpoint"), + Array$create(c(1.5, 1.5)) + ) + } + } +}) + +test_that("quantile and median NAs, edge cases, and exceptions", { + expect_equal( + quantile(Array$create(c(1, 2)), probs = c(0, 1)), + Array$create(c(1, 2)) + ) + expect_error( + quantile(Array$create(c(1, 2, NA))), + "Missing values not allowed if 'na.rm' is FALSE" + ) + expect_equal( + quantile(Array$create(numeric(0))), + Array$create(rep(NA_real_, 5)) + ) + expect_equal( + quantile(Array$create(rep(NA_integer_, 3)), na.rm = TRUE), + Array$create(rep(NA_real_, 5)) + ) + expect_equal( + quantile(Scalar$create(0L)), + Array$create(rep(0, 5)) + ) + expect_equal( + median(Scalar$create(1L)), + Scalar$create(1) + ) + expect_error( + quantile(Array$create(1:3), type = 9), + "not supported" + ) +}) + +test_that("median passes ... args to quantile", { + skip_if( + !"..." %in% names(formals(median)), + "The median generic lacks dots in R 3.3.0 and earlier" + ) + expect_equal( + median(Array$create(c(1, 2)), interpolation = "higher"), + Scalar$create(2) + ) + expect_error( + median(Array$create(c(1, 2)), probs = c(.25, .75)) + ) +}) + +test_that("median.Array and median.ChunkedArray", { + compare_expression( + median(.input), + 1:4 + ) + compare_expression( + median(.input), + 1:5 + ) + compare_expression( + median(.input), + numeric(0) + ) + compare_expression( + median(.input, na.rm = FALSE), + c(1, 2, NA) + ) + compare_expression( + median(.input, na.rm = TRUE), + c(1, 2, NA) + ) + compare_expression( + median(.input, na.rm = TRUE), + NA_real_ + ) + compare_expression( + median(.input, na.rm = FALSE), + c(1, 2, NA) + ) + compare_expression( + median(.input, na.rm = TRUE), + c(1, 2, NA) + ) + compare_expression( + median(.input, na.rm = TRUE), + NA_real_ + ) +}) + +test_that("unique.Array", { + a <- Array$create(c(1, 4, 3, 1, 1, 3, 4)) + expect_equal(unique(a), Array$create(c(1, 4, 3))) + ca <- ChunkedArray$create(a, a) + expect_equal(unique(ca), Array$create(c(1, 4, 3))) +}) + +test_that("match_arrow", { + a <- Array$create(c(1, 4, 3, 1, 1, 3, 4)) + tab <- c(4, 3, 2, 1) + expect_equal(match_arrow(a, tab), Array$create(c(3L, 0L, 1L, 3L, 3L, 1L, 0L))) + + ca <- ChunkedArray$create(c(1, 4, 3, 1, 1, 3, 4)) + expect_equal(match_arrow(ca, tab), ChunkedArray$create(c(3L, 0L, 1L, 3L, 3L, 1L, 0L))) + + sc <- Scalar$create(3) + expect_equal(match_arrow(sc, tab), Scalar$create(1L)) + + vec <- c(1, 2) + expect_equal(match_arrow(vec, tab), Array$create(c(3L, 2L))) +}) + +test_that("is_in", { + a <- Array$create(c(9, 4, 3)) + tab <- c(4, 3, 2, 1) + expect_equal(is_in(a, tab), Array$create(c(FALSE, TRUE, TRUE))) + + ca <- ChunkedArray$create(c(9, 4, 3)) + expect_equal(is_in(ca, tab), ChunkedArray$create(c(FALSE, TRUE, TRUE))) + + sc <- Scalar$create(3) + expect_equal(is_in(sc, tab), Scalar$create(TRUE)) + + vec <- c(1, 9) + expect_equal(is_in(vec, tab), Array$create(c(TRUE, FALSE))) +}) + +test_that("value_counts", { + a <- Array$create(c(1, 4, 3, 1, 1, 3, 4)) + result_df <- tibble::tibble( + values = c(1, 4, 3), + counts = c(3L, 2L, 2L) + ) + result <- Array$create( + result_df, + type = struct(values = float64(), counts = int64()) + ) + expect_equal(value_counts(a), result) + expect_identical(as.data.frame(value_counts(a)), result_df) + expect_identical(as.vector(value_counts(a)$counts), result_df$counts) +}) + +test_that("any.Array and any.ChunkedArray", { + data <- c(1:10, NA, NA) + + compare_expression(any(.input > 5), data) + compare_expression(any(.input > 5, na.rm = TRUE), data) + compare_expression(any(.input < 1), data) + compare_expression(any(.input < 1, na.rm = TRUE), data) + + data_logical <- c(TRUE, FALSE, TRUE, NA, FALSE) + + compare_expression(any(.input), data_logical) + compare_expression(any(.input, na.rm = FALSE), data_logical) + compare_expression(any(.input, na.rm = TRUE), data_logical) +}) + +test_that("all.Array and all.ChunkedArray", { + data <- c(1:10, NA, NA) + + compare_expression(all(.input > 5), data) + compare_expression(all(.input > 5, na.rm = TRUE), data) + + compare_expression(all(.input < 11), data) + compare_expression(all(.input < 11, na.rm = TRUE), data) + + data_logical <- c(TRUE, TRUE, NA) + + compare_expression(all(.input), data_logical) + compare_expression(all(.input, na.rm = TRUE), data_logical) +}) + +test_that("variance", { + data <- c(-37, 267, 88, -120, 9, 101, -65, -23, NA) + arr <- Array$create(data) + chunked_arr <- ChunkedArray$create(data) + + expect_equal(call_function("variance", arr, options = list(ddof = 5)), Scalar$create(34596)) + expect_equal(call_function("variance", chunked_arr, options = list(ddof = 5)), Scalar$create(34596)) +}) + +test_that("stddev", { + data <- c(-37, 267, 88, -120, 9, 101, -65, -23, NA) + arr <- Array$create(data) + chunked_arr <- ChunkedArray$create(data) + + expect_equal(call_function("stddev", arr, options = list(ddof = 5)), Scalar$create(186)) + expect_equal(call_function("stddev", chunked_arr, options = list(ddof = 5)), Scalar$create(186)) +}) diff --git a/src/arrow/r/tests/testthat/test-compute-arith.R b/src/arrow/r/tests/testthat/test-compute-arith.R new file mode 100644 index 000000000..e8674e315 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-compute-arith.R @@ -0,0 +1,129 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_that("Addition", { + a <- Array$create(c(1:4, NA_integer_)) + expect_type_equal(a, int32()) + expect_type_equal(a + 4L, int32()) + expect_type_equal(a + 4, float64()) + expect_equal(a + 4L, Array$create(c(5:8, NA_integer_))) + expect_identical(as.vector(a + 4L), c(5:8, NA_integer_)) + expect_equal(a + 4L, Array$create(c(5:8, NA_integer_))) + expect_as_vector(a + 4L, c(5:8, NA_integer_)) + expect_equal(a + NA_integer_, Array$create(rep(NA_integer_, 5))) + + a8 <- a$cast(int8()) + expect_type_equal(a8 + Scalar$create(1, int8()), int8()) + + # int8 will be promoted to int32 when added to int32 + expect_type_equal(a8 + 127L, int32()) + expect_equal(a8 + 127L, Array$create(c(128:131, NA_integer_))) + + b <- Array$create(c(4:1, NA_integer_)) + expect_type_equal(a8 + b, int32()) + expect_equal(a8 + b, Array$create(c(5L, 5L, 5L, 5L, NA_integer_))) + + expect_type_equal(a + 4.1, float64()) + expect_equal(a + 4.1, Array$create(c(5.1, 6.1, 7.1, 8.1, NA_real_))) +}) + +test_that("Subtraction", { + a <- Array$create(c(1:4, NA_integer_)) + expect_equal(a - 3L, Array$create(c(-2:1, NA_integer_))) + + expect_equal( + Array$create(c(5.1, 6.1, 7.1, 8.1, NA_real_)) - a, + Array$create(c(4.1, 4.1, 4.1, 4.1, NA_real_)) + ) +}) + +test_that("Multiplication", { + a <- Array$create(c(1:4, NA_integer_)) + expect_equal(a * 2L, Array$create(c(1:4 * 2L, NA_integer_))) + + expect_equal( + (a * 0.5) * 3L, + Array$create(c(1.5, 3, 4.5, 6, NA_real_)) + ) +}) + +test_that("Division", { + a <- Array$create(c(1:4, NA_integer_)) + expect_equal(a / 2, Array$create(c(1:4 / 2, NA_real_))) + expect_equal(a %/% 2, Array$create(c(0L, 1L, 1L, 2L, NA_integer_))) + expect_equal(a / 2 / 2, Array$create(c(1:4 / 2 / 2, NA_real_))) + expect_equal(a %/% 2 %/% 2, Array$create(c(0L, 0L, 0L, 1L, NA_integer_))) + expect_equal(a / 0, Array$create(c(Inf, Inf, Inf, Inf, NA_real_))) + # TODO add tests for integer division %/% by 0 + # see https://issues.apache.org/jira/browse/ARROW-14297 + + b <- a$cast(float64()) + expect_equal(b / 2, Array$create(c(1:4 / 2, NA_real_))) + expect_equal(b %/% 2, Array$create(c(0L, 1L, 1L, 2L, NA_integer_))) + expect_equal(b / 0, Array$create(c(Inf, Inf, Inf, Inf, NA_real_))) + # TODO add tests for integer division %/% by 0 + # see https://issues.apache.org/jira/browse/ARROW-14297 + + # the behavior of %/% matches R's (i.e. the integer of the quotient, not + # simply dividing two integers) + expect_equal(b / 2.2, Array$create(c(1:4 / 2.2, NA_real_))) + # nolint start + # c(1:4) %/% 2.2 != c(1:4) %/% as.integer(2.2) + # c(1:4) %/% 2.2 == c(0L, 0L, 1L, 1L) + # c(1:4) %/% as.integer(2.2) == c(0L, 1L, 1L, 2L) + # nolint end + expect_equal(b %/% 2.2, Array$create(c(0L, 0L, 1L, 1L, NA_integer_))) + + expect_equal(a %% 2, Array$create(c(1L, 0L, 1L, 0L, NA_integer_))) + + expect_equal(b %% 2, Array$create(c(1:4 %% 2, NA_real_))) +}) + +test_that("Power", { + a <- Array$create(c(1:4, NA_integer_)) + b <- a$cast(float64()) + c <- a$cast(int64()) + d <- a$cast(uint64()) + + expect_equal(a^0, Array$create(c(1, 1, 1, 1, NA_real_))) + expect_equal(a^2, Array$create(c(1, 4, 9, 16, NA_real_))) + expect_equal(a^(-1), Array$create(c(1, 1 / 2, 1 / 3, 1 / 4, NA_real_))) + expect_equal(a^(.5), Array$create(c(1, sqrt(2), sqrt(3), sqrt(4), NA_real_))) + + expect_equal(b^0, Array$create(c(1, 1, 1, 1, NA_real_))) + expect_equal(b^2, Array$create(c(1, 4, 9, 16, NA_real_))) + expect_equal(b^(-1), Array$create(c(1, 1 / 2, 1 / 3, 1 / 4, NA_real_))) + expect_equal(b^(.5), Array$create(c(1, sqrt(2), sqrt(3), sqrt(4), NA_real_))) + + expect_equal(c^0, Array$create(c(1, 1, 1, 1, NA_real_))) + expect_equal(c^2, Array$create(c(1, 4, 9, 16, NA_real_))) + expect_equal(c^(-1), Array$create(c(1, 1 / 2, 1 / 3, 1 / 4, NA_real_))) + expect_equal(c^(.5), Array$create(c(1, sqrt(2), sqrt(3), sqrt(4), NA_real_))) + + expect_equal(d^0, Array$create(c(1, 1, 1, 1, NA_real_))) + expect_equal(d^2, Array$create(c(1, 4, 9, 16, NA_real_))) + expect_equal(d^(-1), Array$create(c(1, 1 / 2, 1 / 3, 1 / 4, NA_real_))) + expect_equal(d^(.5), Array$create(c(1, sqrt(2), sqrt(3), sqrt(4), NA_real_))) +}) + +test_that("Dates casting", { + a <- Array$create(c(Sys.Date() + 1:4, NA_integer_)) + + skip("ARROW-11090 (date/datetime arithmetic)") + # Error: NotImplemented: Function add_checked has no kernel matching input types (array[date32[day]], scalar[double]) + expect_equal(a + 2, Array$create(c((Sys.Date() + 1:4) + 2), NA_integer_)) +}) diff --git a/src/arrow/r/tests/testthat/test-compute-no-bindings.R b/src/arrow/r/tests/testthat/test-compute-no-bindings.R new file mode 100644 index 000000000..a51d797a4 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-compute-no-bindings.R @@ -0,0 +1,201 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +test_that("non-bound compute kernels using TrimOptions", { + skip_if_not_available("utf8proc") + expect_equal( + call_function( + "utf8_trim", + Scalar$create("abracadabra"), + options = list(characters = "ab") + ), + Scalar$create("racadabr") + ) + + expect_equal( + call_function( + "utf8_ltrim", + Scalar$create("abracadabra"), + options = list(characters = "ab") + ), + Scalar$create("racadabra") + ) + + expect_equal( + call_function( + "utf8_rtrim", + Scalar$create("abracadabra"), + options = list(characters = "ab") + ), + Scalar$create("abracadabr") + ) + + expect_equal( + call_function( + "utf8_rtrim", + Scalar$create("abracadabra"), + options = list(characters = "ab") + ), + Scalar$create("abracadabr") + ) + + expect_equal( + call_function( + "ascii_ltrim", + Scalar$create("abracadabra"), + options = list(characters = "ab") + ), + Scalar$create("racadabra") + ) + + expect_equal( + call_function( + "ascii_rtrim", + Scalar$create("abracadabra"), + options = list(characters = "ab") + ), + Scalar$create("abracadabr") + ) + + expect_equal( + call_function( + "ascii_rtrim", + Scalar$create("abracadabra"), + options = list(characters = "ab") + ), + Scalar$create("abracadabr") + ) +}) + +test_that("non-bound compute kernels using ReplaceSliceOptions", { + skip_if_not_available("utf8proc") + + expect_equal( + call_function( + "binary_replace_slice", + Array$create("I need to fix this string"), + options = list(start = 1, stop = 1, replacement = " don't") + ), + Array$create("I don't need to fix this string") + ) + + expect_equal( + call_function( + "utf8_replace_slice", + Array$create("I need to fix this string"), + options = list(start = 1, stop = 1, replacement = " don't") + ), + Array$create("I don't need to fix this string") + ) +}) + +test_that("non-bound compute kernels using ModeOptions", { + expect_equal( + as.vector( + call_function("mode", Array$create(c(1:10, 10, 9, NA)), options = list(n = 3)) + ), + tibble::tibble("mode" = c(9, 10, 1), "count" = c(2L, 2L, 1L)) + ) + + expect_equal( + as.vector( + call_function("mode", Array$create(c(1:10, 10, 9, NA)), options = list(n = 3, skip_nulls = FALSE)) + ), + tibble::tibble("mode" = numeric(), "count" = integer()) + ) +}) + +test_that("non-bound compute kernels using PartitionNthOptions", { + result <- call_function( + "partition_nth_indices", + Array$create(c(11:20)), + options = list(pivot = 3) + ) + # Order of indices on either side of the pivot is not deterministic + # (depends on C++ standard library implementation) + expect_true(all(as.vector(result[1:3]) < 3)) + expect_true(all(as.vector(result[4:10]) >= 3)) +}) + + +test_that("non-bound compute kernels using MatchSubstringOptions", { + skip_if_not_available("utf8proc") + + # Remove this test when ARROW-13924 has been completed + expect_equal( + call_function( + "starts_with", + Array$create(c("abracadabra", "abacus", "abdicate", "abrasive")), + options = list(pattern = "abr") + ), + Array$create(c(TRUE, FALSE, FALSE, TRUE)) + ) + + # Remove this test when ARROW-13924 has been completed + expect_equal( + call_function( + "ends_with", + Array$create(c("abracadabra", "abacus", "abdicate", "abrasive")), + options = list(pattern = "e") + ), + Array$create(c(FALSE, FALSE, TRUE, TRUE)) + ) + + # Remove this test when ARROW-13156 has been completed + expect_equal( + as.vector( + call_function( + "count_substring", + Array$create(c("abracadabra", "abacus", "abdicate", "abrasive")), + options = list(pattern = "e") + ) + ), + c(0, 0, 1, 1) + ) + + skip_if_not_available("re2") + + # Remove this test when ARROW-13156 has been completed + expect_equal( + as.vector( + call_function( + "count_substring_regex", + Array$create(c("abracadabra", "abacus", "abdicate", "abrasive")), + options = list(pattern = "e") + ) + ), + c(0, 0, 1, 1) + ) +}) + +test_that("non-bound compute kernels using ExtractRegexOptions", { + skip_if_not_available("re2") + expect_equal( + call_function("extract_regex", Scalar$create("abracadabra"), options = list(pattern = "(?P<letter>[a])")), + Scalar$create(tibble::tibble(letter = "a")) + ) +}) + +test_that("non-bound compute kernels using IndexOptions", { + expect_equal( + as.vector( + call_function("index", Array$create(c(10, 20, 30, 40)), options = list(value = Scalar$create(40))) + ), + 3 + ) +}) diff --git a/src/arrow/r/tests/testthat/test-compute-sort.R b/src/arrow/r/tests/testthat/test-compute-sort.R new file mode 100644 index 000000000..e3574d86f --- /dev/null +++ b/src/arrow/r/tests/testthat/test-compute-sort.R @@ -0,0 +1,155 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +library(dplyr, warn.conflicts = FALSE) + +# randomize order of rows in test data +tbl <- slice_sample(example_data_for_sorting, prop = 1L) + +test_that("sort(Scalar) is identity function", { + int <- Scalar$create(42L) + expect_equal(sort(int), int) + dbl <- Scalar$create(3.14) + expect_equal(sort(dbl), dbl) + chr <- Scalar$create("foo") + expect_equal(sort(chr), chr) +}) + +test_that("Array$SortIndices()", { + int <- tbl$int + # Remove ties because they could give non-deterministic sort indices, and this + # test compares sort indices. Other tests compare sorted values, which are + # deterministic in the case of ties. + int <- int[!duplicated(int)] + expect_equal( + Array$create(int)$SortIndices(), + Array$create(order(int) - 1L, type = uint64()) + ) + # Need to remove NAs because ARROW-12063 + int <- na.omit(int) + expect_equal( + Array$create(int)$SortIndices(descending = TRUE), + Array$create(rev(order(int)) - 1, type = uint64()) + ) +}) + +test_that("ChunkedArray$SortIndices()", { + int <- tbl$int + # Remove ties because they could give non-deterministic sort indices, and this + # test compares sort indices. Other tests compare sorted values, which are + # deterministic in the case of ties. + int <- int[!duplicated(int)] + expect_equal( + ChunkedArray$create(int[1:4], int[5:length(int)])$SortIndices(), + Array$create(order(int) - 1L, type = uint64()) + ) + # Need to remove NAs because ARROW-12063 + int <- na.omit(int) + expect_equal( + ChunkedArray$create(int[1:4], int[5:length(int)])$SortIndices(descending = TRUE), + Array$create(rev(order(int)) - 1, type = uint64()) + ) +}) + +test_that("sort(vector), sort(Array), sort(ChunkedArray) give equivalent results on integers", { + compare_expression( + sort(.input), + tbl$int + ) + compare_expression( + sort(.input, na.last = NA), + tbl$int + ) + compare_expression( + sort(.input, na.last = TRUE), + tbl$int + ) + compare_expression( + sort(.input, na.last = FALSE), + tbl$int + ) + compare_expression( + sort(.input, decreasing = TRUE), + tbl$int, + ) + compare_expression( + sort(.input, decreasing = TRUE, na.last = TRUE), + tbl$int, + ) + compare_expression( + sort(.input, decreasing = TRUE, na.last = FALSE), + tbl$int, + ) +}) + +test_that("sort(vector), sort(Array), sort(ChunkedArray) give equivalent results on strings", { + compare_expression( + sort(.input, decreasing = TRUE, na.last = FALSE), + tbl$chr + ) + compare_expression( + sort(.input, decreasing = TRUE, na.last = FALSE), + tbl$chr + ) +}) + +test_that("sort(vector), sort(Array), sort(ChunkedArray) give equivalent results on floats", { + compare_expression( + sort(.input, decreasing = TRUE, na.last = TRUE), + tbl$dbl + ) + compare_expression( + sort(.input, decreasing = FALSE, na.last = TRUE), + tbl$dbl + ) + compare_expression( + sort(.input, decreasing = TRUE, na.last = NA), + tbl$dbl + ) + compare_expression( + sort(.input, decreasing = TRUE, na.last = FALSE), + tbl$dbl, + ) + compare_expression( + sort(.input, decreasing = FALSE, na.last = NA), + tbl$dbl + ) + compare_expression( + sort(.input, decreasing = FALSE, na.last = FALSE), + tbl$dbl, + ) +}) + +test_that("Table$SortIndices()", { + x <- Table$create(tbl) + expect_identical( + as.vector(x$Take(x$SortIndices("chr"))$chr), + sort(tbl$chr, na.last = TRUE) + ) + expect_identical( + as.data.frame(x$Take(x$SortIndices(c("int", "dbl"), c(FALSE, FALSE)))), + tbl %>% arrange(int, dbl) + ) +}) + +test_that("RecordBatch$SortIndices()", { + x <- record_batch(tbl) + expect_identical( + as.data.frame(x$Take(x$SortIndices(c("chr", "int", "dbl"), TRUE))), + tbl %>% arrange(desc(chr), desc(int), desc(dbl)) + ) +}) diff --git a/src/arrow/r/tests/testthat/test-compute-vector.R b/src/arrow/r/tests/testthat/test-compute-vector.R new file mode 100644 index 000000000..345da5656 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-compute-vector.R @@ -0,0 +1,133 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +expect_bool_function_equal <- function(array_exp, r_exp) { + # Assert that the Array operation returns a boolean array + # and that its contents are equal to expected + expect_r6_class(array_exp, "ArrowDatum") + expect_type_equal(array_exp, bool()) + expect_identical(as.vector(array_exp), r_exp) +} + +expect_array_compares <- function(x, compared_to) { + r_values <- as.vector(x) + r_compared_to <- as.vector(compared_to) + # Iterate over all comparison functions + expect_bool_function_equal(x == compared_to, r_values == r_compared_to) + expect_bool_function_equal(x != compared_to, r_values != r_compared_to) + expect_bool_function_equal(x > compared_to, r_values > r_compared_to) + expect_bool_function_equal(x >= compared_to, r_values >= r_compared_to) + expect_bool_function_equal(x < compared_to, r_values < r_compared_to) + expect_bool_function_equal(x <= compared_to, r_values <= r_compared_to) +} + +test_that("compare ops with Array", { + a <- Array$create(1:5) + expect_array_compares(a, 4L) + expect_array_compares(a, 4) # implicit casting + expect_array_compares(a, Scalar$create(4)) + expect_array_compares(Array$create(c(NA, 1:5)), 4) + expect_array_compares(Array$create(as.numeric(c(NA, 1:5))), 4) + expect_array_compares(Array$create(c(NA, 1:5)), Array$create(rev(c(NA, 1:5)))) + expect_array_compares(Array$create(c(NA, 1:5)), Array$create(rev(c(NA, 1:5)), type = double())) +}) + +test_that("compare ops with ChunkedArray", { + expect_array_compares(ChunkedArray$create(1:3, 4:5), 4L) + expect_array_compares(ChunkedArray$create(1:3, 4:5), 4) # implicit casting + expect_array_compares(ChunkedArray$create(1:3, 4:5), Scalar$create(4)) + expect_array_compares(ChunkedArray$create(c(NA, 1:3), 4:5), 4) + expect_array_compares( + ChunkedArray$create(c(NA, 1:3), 4:5), + ChunkedArray$create(4:5, c(NA, 1:3)) + ) + expect_array_compares( + ChunkedArray$create(c(NA, 1:3), 4:5), + Array$create(c(NA, 1:5)) + ) + expect_array_compares( + Array$create(c(NA, 1:5)), + ChunkedArray$create(c(NA, 1:3), 4:5) + ) +}) + +test_that("logic ops with Array", { + truth <- expand.grid(left = c(TRUE, FALSE, NA), right = c(TRUE, FALSE, NA)) + a_left <- Array$create(truth$left) + a_right <- Array$create(truth$right) + expect_bool_function_equal(a_left & a_right, truth$left & truth$right) + expect_bool_function_equal(a_left | a_right, truth$left | truth$right) + expect_bool_function_equal(a_left == a_right, truth$left == truth$right) + expect_bool_function_equal(a_left != a_right, truth$left != truth$right) + expect_bool_function_equal(!a_left, !truth$left) + + # More complexity + isEqualTo <- function(x, y) x == y & !is.na(x) + expect_bool_function_equal( + isEqualTo(a_left, a_right), + isEqualTo(truth$left, truth$right) + ) +}) + +test_that("logic ops with ChunkedArray", { + truth <- expand.grid(left = c(TRUE, FALSE, NA), right = c(TRUE, FALSE, NA)) + a_left <- ChunkedArray$create(truth$left) + a_right <- ChunkedArray$create(truth$right) + expect_bool_function_equal(a_left & a_right, truth$left & truth$right) + expect_bool_function_equal(a_left | a_right, truth$left | truth$right) + expect_bool_function_equal(a_left == a_right, truth$left == truth$right) + expect_bool_function_equal(a_left != a_right, truth$left != truth$right) + expect_bool_function_equal(!a_left, !truth$left) + + # More complexity + isEqualTo <- function(x, y) x == y & !is.na(x) + expect_bool_function_equal( + isEqualTo(a_left, a_right), + isEqualTo(truth$left, truth$right) + ) +}) + +test_that("call_function validation", { + expect_error( + call_function("filter", 4), + 'Argument 1 is of class numeric but it must be one of "Array", "ChunkedArray", "RecordBatch", "Table", or "Scalar"' + ) + expect_error( + call_function("filter", Array$create(1:4), 3), + "Argument 2 is of class numeric" + ) + expect_error( + call_function("filter", + Array$create(1:4), + Array$create(c(TRUE, FALSE, TRUE)), + options = list(keep_na = TRUE) + ), + "Array arguments must all be the same length" + ) + expect_error( + call_function("filter", + record_batch(a = 1:3), + Array$create(c(TRUE, FALSE, TRUE)), + options = list(keep_na = TRUE) + ), + NA + ) + expect_error( + call_function("filter", options = list(keep_na = TRUE)), + "accepts 2 arguments" + ) +}) diff --git a/src/arrow/r/tests/testthat/test-csv.R b/src/arrow/r/tests/testthat/test-csv.R new file mode 100644 index 000000000..023eee92e --- /dev/null +++ b/src/arrow/r/tests/testthat/test-csv.R @@ -0,0 +1,357 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Not all types round trip via CSV 100% identical by default +tbl <- example_data[, c("dbl", "lgl", "false", "chr")] +tbl_no_dates <- tbl +# Add a date to test its parsing +tbl$date <- Sys.Date() + 1:10 + +csv_file <- tempfile() + +test_that("Can read csv file", { + tf <- tempfile() + on.exit(unlink(tf)) + + write.csv(tbl, tf, row.names = FALSE) + + tab0 <- Table$create(tbl) + tab1 <- read_csv_arrow(tf, as_data_frame = FALSE) + expect_equal(tab0, tab1) + tab2 <- read_csv_arrow(mmap_open(tf), as_data_frame = FALSE) + expect_equal(tab0, tab2) + tab3 <- read_csv_arrow(ReadableFile$create(tf), as_data_frame = FALSE) + expect_equal(tab0, tab3) +}) + +test_that("read_csv_arrow(as_data_frame=TRUE)", { + tf <- tempfile() + on.exit(unlink(tf)) + + write.csv(tbl, tf, row.names = FALSE) + tab1 <- read_csv_arrow(tf, as_data_frame = TRUE) + expect_equal(tbl, tab1) +}) + +test_that("read_delim_arrow parsing options: delim", { + tf <- tempfile() + on.exit(unlink(tf)) + + write.table(tbl, tf, sep = "\t", row.names = FALSE) + tab1 <- read_tsv_arrow(tf) + tab2 <- read_delim_arrow(tf, delim = "\t") + expect_equal(tab1, tab2) + expect_equal(tbl, tab1) +}) + +test_that("read_delim_arrow parsing options: quote", { + tf <- tempfile() + on.exit(unlink(tf)) + + df <- data.frame(a = c(1, 2), b = c("'abc'", "'def'")) + write.table(df, sep = ";", tf, row.names = FALSE, quote = FALSE) + tab1 <- read_delim_arrow(tf, delim = ";", quote = "'") + + # Is this a problem? + # Component “a”: target is integer64, current is numeric + tab1$a <- as.numeric(tab1$a) + expect_equal( + tab1, + tibble::tibble(a = c(1, 2), b = c("abc", "def")) + ) +}) + +test_that("read_csv_arrow parsing options: col_names", { + tf <- tempfile() + on.exit(unlink(tf)) + + # Writing the CSV without the header + write.table(tbl, tf, sep = ",", row.names = FALSE, col.names = FALSE) + + # Reading with col_names = FALSE autogenerates names + no_names <- read_csv_arrow(tf, col_names = FALSE) + expect_equal(no_names$f0, tbl[[1]]) + + tab1 <- read_csv_arrow(tf, col_names = names(tbl)) + + expect_identical(names(tab1), names(tbl)) + expect_equal(tbl, tab1) + + # This errors (correctly) because I haven't given enough names + # but the error message is "Invalid: Empty CSV file", which is not accurate + expect_error( + read_csv_arrow(tf, col_names = names(tbl)[1]) + ) + # Same here + expect_error( + read_csv_arrow(tf, col_names = c(names(tbl), names(tbl))) + ) +}) + +test_that("read_csv_arrow parsing options: skip", { + tf <- tempfile() + on.exit(unlink(tf)) + + # Adding two garbage lines to start the csv + cat("asdf\nqwer\n", file = tf) + suppressWarnings(write.table(tbl, tf, sep = ",", row.names = FALSE, append = TRUE)) + + tab1 <- read_csv_arrow(tf, skip = 2) + + expect_identical(names(tab1), names(tbl)) + expect_equal(tbl, tab1) +}) + +test_that("read_csv_arrow parsing options: skip_empty_rows", { + tf <- tempfile() + on.exit(unlink(tf)) + + write.csv(tbl, tf, row.names = FALSE) + cat("\n\n", file = tf, append = TRUE) + + tab1 <- read_csv_arrow(tf, skip_empty_rows = FALSE) + + expect_equal(nrow(tab1), nrow(tbl) + 2) + expect_true(is.na(tail(tab1, 1)[[1]])) +}) + +test_that("read_csv_arrow parsing options: na strings", { + tf <- tempfile() + on.exit(unlink(tf)) + + df <- data.frame( + a = c(1.2, NA, NA, 3.4), + b = c(NA, "B", "C", NA), + stringsAsFactors = FALSE + ) + write.csv(df, tf, row.names = FALSE) + expect_equal(grep("NA", readLines(tf)), 2:5) + + tab1 <- read_csv_arrow(tf) + expect_equal(is.na(tab1$a), is.na(df$a)) + expect_equal(is.na(tab1$b), is.na(df$b)) + + unlink(tf) # Delete and write to the same file name again + + write.csv(df, tf, row.names = FALSE, na = "asdf") + expect_equal(grep("asdf", readLines(tf)), 2:5) + + tab2 <- read_csv_arrow(tf, na = "asdf") + expect_equal(is.na(tab2$a), is.na(df$a)) + expect_equal(is.na(tab2$b), is.na(df$b)) +}) + +test_that("read_csv_arrow() respects col_select", { + tf <- tempfile() + on.exit(unlink(tf)) + + write.csv(tbl, tf, row.names = FALSE, quote = FALSE) + + tab <- read_csv_arrow(tf, col_select = ends_with("l"), as_data_frame = FALSE) + expect_equal(tab, Table$create(example_data[, c("dbl", "lgl")])) + + tib <- read_csv_arrow(tf, col_select = ends_with("l"), as_data_frame = TRUE) + expect_equal(tib, example_data[, c("dbl", "lgl")]) +}) + +test_that("read_csv_arrow() can detect compression from file name", { + skip_if_not_available("gzip") + tf <- tempfile(fileext = ".csv.gz") + on.exit(unlink(tf)) + + write.csv(tbl, gzfile(tf), row.names = FALSE, quote = FALSE) + tab1 <- read_csv_arrow(tf) + expect_equal(tbl, tab1) +}) + +test_that("read_csv_arrow(schema=)", { + tbl <- example_data[, "int"] + tf <- tempfile() + on.exit(unlink(tf)) + write.csv(tbl, tf, row.names = FALSE) + + df <- read_csv_arrow(tf, schema = schema(int = float64()), skip = 1) + expect_identical(df, tibble::tibble(int = as.numeric(tbl$int))) +}) + +test_that("read_csv_arrow(col_types = <Schema>)", { + tbl <- example_data[, "int"] + tf <- tempfile() + on.exit(unlink(tf)) + write.csv(tbl, tf, row.names = FALSE) + + df <- read_csv_arrow(tf, col_types = schema(int = float64())) + expect_identical(df, tibble::tibble(int = as.numeric(tbl$int))) +}) + +test_that("read_csv_arrow(col_types=string, col_names)", { + tbl <- example_data[, "int"] + tf <- tempfile() + on.exit(unlink(tf)) + write.csv(tbl, tf, row.names = FALSE) + + df <- read_csv_arrow(tf, col_names = "int", col_types = "d", skip = 1) + expect_identical(df, tibble::tibble(int = as.numeric(tbl$int))) + + expect_error(read_csv_arrow(tf, col_types = c("i", "d"))) + expect_error(read_csv_arrow(tf, col_types = "d")) + expect_error(read_csv_arrow(tf, col_types = "i", col_names = c("a", "b"))) + expect_error(read_csv_arrow(tf, col_types = "y", col_names = "a")) +}) + +test_that("read_csv_arrow() can read timestamps", { + tbl <- tibble::tibble(time = as.POSIXct("2020-07-20 16:20", tz = "UTC")) + tf <- tempfile() + on.exit(unlink(tf)) + write.csv(tbl, tf, row.names = FALSE) + + df <- read_csv_arrow(tf, col_types = schema(time = timestamp(timezone = "UTC"))) + expect_equal(tbl, df) + + # time zones are being read in as time zone-naive, hence ignore_attr = "tzone" + df <- read_csv_arrow(tf, col_types = "T", col_names = "time", skip = 1) + expect_equal(tbl, df, ignore_attr = "tzone") +}) + +test_that("read_csv_arrow(timestamp_parsers=)", { + tf <- tempfile() + on.exit(unlink(tf)) + tbl <- tibble::tibble(time = "23/09/2020") + write.csv(tbl, tf, row.names = FALSE) + + df <- read_csv_arrow( + tf, + col_types = schema(time = timestamp(timezone = "UTC")), + timestamp_parsers = "%d/%m/%Y" + ) + expect_equal(df$time, as.POSIXct(tbl$time, format = "%d/%m/%Y", tz = "UTC")) +}) + +test_that("Skipping columns with null()", { + tf <- tempfile() + on.exit(unlink(tf)) + cols <- c("dbl", "lgl", "false", "chr") + tbl <- example_data[, cols] + write.csv(tbl, tf, row.names = FALSE) + + df <- read_csv_arrow(tf, col_types = "d-_c", col_names = cols, skip = 1) + expect_identical(df, tbl[, c("dbl", "chr")]) +}) + +test_that("Mix of guessing and declaring types", { + tf <- tempfile() + on.exit(unlink(tf)) + cols <- c("dbl", "lgl", "false", "chr") + tbl <- example_data[, cols] + write.csv(tbl, tf, row.names = FALSE) + + tab <- read_csv_arrow(tf, col_types = schema(dbl = float32()), as_data_frame = FALSE) + expect_equal(tab$schema, schema(dbl = float32(), lgl = bool(), false = bool(), chr = utf8())) + + df <- read_csv_arrow(tf, col_types = "d-?c", col_names = cols, skip = 1) + expect_identical(df, tbl[, c("dbl", "false", "chr")]) +}) + + +test_that("Write a CSV file with header", { + tbl_out <- write_csv_arrow(tbl_no_dates, csv_file) + expect_true(file.exists(csv_file)) + expect_identical(tbl_out, tbl_no_dates) + + tbl_in <- read_csv_arrow(csv_file) + expect_identical(tbl_in, tbl_no_dates) + + tbl_out <- write_csv_arrow(tbl, csv_file) + expect_true(file.exists(csv_file)) + expect_identical(tbl_out, tbl) + + tbl_in <- read_csv_arrow(csv_file) + expect_identical(tbl_in, tbl) +}) + + +test_that("Write a CSV file with no header", { + tbl_out <- write_csv_arrow(tbl_no_dates, csv_file, include_header = FALSE) + expect_true(file.exists(csv_file)) + expect_identical(tbl_out, tbl_no_dates) + tbl_in <- read_csv_arrow(csv_file, col_names = FALSE) + + tbl_expected <- tbl_no_dates + names(tbl_expected) <- c("f0", "f1", "f2", "f3") + + expect_identical(tbl_in, tbl_expected) +}) + +test_that("Write a CSV file with different batch sizes", { + tbl_out1 <- write_csv_arrow(tbl_no_dates, csv_file, batch_size = 1) + expect_true(file.exists(csv_file)) + expect_identical(tbl_out1, tbl_no_dates) + tbl_in1 <- read_csv_arrow(csv_file) + expect_identical(tbl_in1, tbl_no_dates) + + tbl_out2 <- write_csv_arrow(tbl_no_dates, csv_file, batch_size = 2) + expect_true(file.exists(csv_file)) + expect_identical(tbl_out2, tbl_no_dates) + tbl_in2 <- read_csv_arrow(csv_file) + expect_identical(tbl_in2, tbl_no_dates) + + tbl_out3 <- write_csv_arrow(tbl_no_dates, csv_file, batch_size = 12) + expect_true(file.exists(csv_file)) + expect_identical(tbl_out3, tbl_no_dates) + tbl_in3 <- read_csv_arrow(csv_file) + expect_identical(tbl_in3, tbl_no_dates) +}) + +test_that("Write a CSV file with invalid input type", { + bad_input <- Array$create(1:5) + expect_error( + write_csv_arrow(bad_input, csv_file), + regexp = "x must be an object of class 'data.frame', 'RecordBatch', or 'Table', not 'Array'." + ) +}) + +test_that("Write a CSV file with invalid batch size", { + expect_error( + write_csv_arrow(tbl_no_dates, csv_file, batch_size = -1), + regexp = "batch_size not greater than 0" + ) +}) + +test_that("time mapping work as expected (ARROW-13624)", { + tbl <- tibble::tibble( + dt = as.POSIXct(c("2020-07-20 16:20", NA), tz = "UTC"), + time = c(hms::as_hms("16:20:00"), NA) + ) + tf <- tempfile() + on.exit(unlink(tf)) + write.csv(tbl, tf, row.names = FALSE) + + df <- read_csv_arrow(tf, + col_names = c("dt", "time"), + col_types = "Tt", + skip = 1 + ) + + expect_error( + read_csv_arrow(tf, + col_names = c("dt", "time"), + col_types = "tT", skip = 1 + ) + ) + + expect_equal(df, tbl, ignore_attr = "tzone") +}) diff --git a/src/arrow/r/tests/testthat/test-data-type.R b/src/arrow/r/tests/testthat/test-data-type.R new file mode 100644 index 000000000..a9d0879b8 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-data-type.R @@ -0,0 +1,429 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_that("null type works as expected", { + x <- null() + expect_equal(x$id, 0L) + expect_equal(x$name, "null") + expect_equal(x$ToString(), "null") + expect_true(x == x) + expect_false(x == int8()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) +}) + +test_that("boolean type work as expected", { + x <- boolean() + expect_equal(x$id, Type$BOOL) + expect_equal(x$name, "bool") + expect_equal(x$ToString(), "bool") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$bit_width, 1L) +}) + +test_that("int types works as expected", { + x <- uint8() + expect_equal(x$id, Type$UINT8) + expect_equal(x$name, "uint8") + expect_equal(x$ToString(), "uint8") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$bit_width, 8L) + + x <- int8() + expect_equal(x$id, Type$INT8) + expect_equal(x$name, "int8") + expect_equal(x$ToString(), "int8") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$bit_width, 8L) + + x <- uint16() + expect_equal(x$id, Type$UINT16) + expect_equal(x$name, "uint16") + expect_equal(x$ToString(), "uint16") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$bit_width, 16L) + + x <- int16() + expect_equal(x$id, Type$INT16) + expect_equal(x$name, "int16") + expect_equal(x$ToString(), "int16") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$bit_width, 16L) + + x <- uint32() + expect_equal(x$id, Type$UINT32) + expect_equal(x$name, "uint32") + expect_equal(x$ToString(), "uint32") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$bit_width, 32L) + + x <- int32() + expect_equal(x$id, Type$INT32) + expect_equal(x$name, "int32") + expect_equal(x$ToString(), "int32") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$bit_width, 32L) + + x <- uint64() + expect_equal(x$id, Type$UINT64) + expect_equal(x$name, "uint64") + expect_equal(x$ToString(), "uint64") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$bit_width, 64L) + + x <- int64() + expect_equal(x$id, Type$INT64) + expect_equal(x$name, "int64") + expect_equal(x$ToString(), "int64") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$bit_width, 64L) +}) + +test_that("float types work as expected", { + x <- float16() + expect_equal(x$id, Type$HALF_FLOAT) + expect_equal(x$name, "halffloat") + expect_equal(x$ToString(), "halffloat") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$bit_width, 16L) + + x <- float32() + expect_equal(x$id, Type$FLOAT) + expect_equal(x$name, "float") + expect_equal(x$ToString(), "float") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$bit_width, 32L) + + x <- float64() + expect_equal(x$id, Type$DOUBLE) + expect_equal(x$name, "double") + expect_equal(x$ToString(), "double") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$bit_width, 64L) +}) + +test_that("utf8 type works as expected", { + x <- utf8() + expect_equal(x$id, Type$STRING) + expect_equal(x$name, "utf8") + expect_equal(x$ToString(), "string") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) +}) + +test_that("date types work as expected", { + x <- date32() + expect_equal(x$id, Type$DATE32) + expect_equal(x$name, "date32") + expect_equal(x$ToString(), "date32[day]") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$unit(), unclass(DateUnit$DAY)) + + x <- date64() + expect_equal(x$id, Type$DATE64) + expect_equal(x$name, "date64") + expect_equal(x$ToString(), "date64[ms]") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$unit(), unclass(DateUnit$MILLI)) +}) + +test_that("timestamp type works as expected", { + x <- timestamp(TimeUnit$SECOND) + expect_equal(x$id, Type$TIMESTAMP) + expect_equal(x$name, "timestamp") + expect_equal(x$ToString(), "timestamp[s]") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$bit_width, 64L) + expect_equal(x$timezone(), "") + expect_equal(x$unit(), unclass(TimeUnit$SECOND)) + + x <- timestamp(TimeUnit$MILLI) + expect_equal(x$id, Type$TIMESTAMP) + expect_equal(x$name, "timestamp") + expect_equal(x$ToString(), "timestamp[ms]") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$bit_width, 64L) + expect_equal(x$timezone(), "") + expect_equal(x$unit(), unclass(TimeUnit$MILLI)) + + x <- timestamp(TimeUnit$MICRO) + expect_equal(x$id, Type$TIMESTAMP) + expect_equal(x$name, "timestamp") + expect_equal(x$ToString(), "timestamp[us]") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$bit_width, 64L) + expect_equal(x$timezone(), "") + expect_equal(x$unit(), unclass(TimeUnit$MICRO)) + + x <- timestamp(TimeUnit$NANO) + expect_equal(x$id, Type$TIMESTAMP) + expect_equal(x$name, "timestamp") + expect_equal(x$ToString(), "timestamp[ns]") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$bit_width, 64L) + expect_equal(x$timezone(), "") + expect_equal(x$unit(), unclass(TimeUnit$NANO)) +}) + +test_that("timestamp with timezone", { + expect_equal(timestamp(timezone = "EST")$ToString(), "timestamp[s, tz=EST]") +}) + +test_that("time32 types work as expected", { + x <- time32(TimeUnit$SECOND) + expect_equal(x$id, Type$TIME32) + expect_equal(x$name, "time32") + expect_equal(x$ToString(), "time32[s]") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$bit_width, 32L) + expect_equal(x$unit(), unclass(TimeUnit$SECOND)) + + x <- time32(TimeUnit$MILLI) + expect_equal(x$id, Type$TIME32) + expect_equal(x$name, "time32") + expect_equal(x$ToString(), "time32[ms]") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$bit_width, 32L) + expect_equal(x$unit(), unclass(TimeUnit$MILLI)) +}) + +test_that("time64 types work as expected", { + x <- time64(TimeUnit$MICRO) + expect_equal(x$id, Type$TIME64) + expect_equal(x$name, "time64") + expect_equal(x$ToString(), "time64[us]") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$bit_width, 64L) + expect_equal(x$unit(), unclass(TimeUnit$MICRO)) + + x <- time64(TimeUnit$NANO) + expect_equal(x$id, Type$TIME64) + expect_equal(x$name, "time64") + expect_equal(x$ToString(), "time64[ns]") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 0L) + expect_equal(x$fields(), list()) + expect_equal(x$bit_width, 64L) + expect_equal(x$unit(), unclass(TimeUnit$NANO)) +}) + +test_that("time type unit validation", { + expect_equal(time32(TimeUnit$SECOND), time32("s")) + expect_equal(time32(TimeUnit$MILLI), time32("ms")) + expect_equal(time32(), time32(TimeUnit$MILLI)) + expect_error(time32(4), '"unit" should be one of 1 or 0') + expect_error(time32(NULL), '"unit" should be one of "ms" or "s"') + expect_match_arg_error(time32("years")) + + expect_equal(time64(TimeUnit$NANO), time64("n")) + expect_equal(time64(TimeUnit$MICRO), time64("us")) + expect_equal(time64(), time64(TimeUnit$NANO)) + expect_error(time64(4), '"unit" should be one of 3 or 2') + expect_error(time64(NULL), '"unit" should be one of "ns" or "us"') + expect_match_arg_error(time64("years")) +}) + +test_that("timestamp type input validation", { + expect_equal(timestamp("ms"), timestamp(TimeUnit$MILLI)) + expect_equal(timestamp(), timestamp(TimeUnit$SECOND)) + expect_error( + timestamp(NULL), + '"unit" should be one of "ns", "us", "ms", or "s"' + ) + expect_error( + timestamp(timezone = 1231231), + "timezone is not a string" + ) + expect_error( + timestamp(timezone = c("not", "a", "timezone")), + "timezone is not a string" + ) +}) + +test_that("list type works as expected", { + x <- list_of(int32()) + expect_equal(x$id, Type$LIST) + expect_equal(x$name, "list") + expect_equal(x$ToString(), "list<item: int32>") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 1L) + expect_equal( + x$fields()[[1]], + field("item", int32()) + ) + expect_equal(x$value_type, int32()) + expect_equal(x$value_field, field("item", int32())) +}) + +test_that("struct type works as expected", { + x <- struct(x = int32(), y = boolean()) + expect_equal(x$id, Type$STRUCT) + expect_equal(x$name, "struct") + expect_equal(x$ToString(), "struct<x: int32, y: bool>") + expect_true(x == x) + expect_false(x == null()) + expect_equal(x$num_fields, 2L) + expect_equal( + x$fields()[[1]], + field("x", int32()) + ) + expect_equal( + x$fields()[[2]], + field("y", boolean()) + ) + expect_equal(x$GetFieldIndex("x"), 0L) + expect_equal(x$GetFieldIndex("y"), 1L) + expect_equal(x$GetFieldIndex("z"), -1L) + + expect_equal(x$GetFieldByName("x"), field("x", int32())) + expect_equal(x$GetFieldByName("y"), field("y", boolean())) + expect_null(x$GetFieldByName("z")) +}) + +test_that("DictionaryType works as expected (ARROW-3355)", { + d <- dictionary(int32(), utf8()) + expect_equal(d, d) + expect_true(d == d) + expect_false(d == int32()) + expect_equal(d$id, Type$DICTIONARY) + expect_equal(d$bit_width, 32L) + expect_equal(d$ToString(), "dictionary<values=string, indices=int32>") + expect_equal(d$index_type, int32()) + expect_equal(d$value_type, utf8()) + ord <- dictionary(ordered = TRUE) + expect_equal(ord$ToString(), "dictionary<values=string, indices=int32, ordered>") +}) + +test_that("DictionaryType validation", { + expect_error( + dictionary(utf8(), int32()), + "Dictionary index type should be .*integer, got string" + ) + expect_error(dictionary(4, utf8()), 'index_type must be a "DataType"') + expect_error(dictionary(int8(), "strings"), 'value_type must be a "DataType"') +}) + +test_that("decimal type and validation", { + expect_error(decimal()) + expect_error(decimal("four"), '"precision" must be an integer') + expect_error(decimal(4)) + expect_error(decimal(4, "two"), '"scale" must be an integer') + expect_error(decimal(NA, 2), '"precision" must be an integer') + expect_error(decimal(0, 2), "Invalid: Decimal precision out of range: 0") + expect_error(decimal(100, 2), "Invalid: Decimal precision out of range: 100") + expect_error(decimal(4, NA), '"scale" must be an integer') + + expect_r6_class(decimal(4, 2), "Decimal128Type") +}) + +test_that("Binary", { + expect_r6_class(binary(), "Binary") + expect_equal(binary()$ToString(), "binary") +}) + +test_that("FixedSizeBinary", { + expect_r6_class(fixed_size_binary(4), "FixedSizeBinary") + expect_equal(fixed_size_binary(4)$ToString(), "fixed_size_binary[4]") + + # input validation + expect_error(fixed_size_binary(NA), "'byte_width' cannot be NA") + expect_error(fixed_size_binary(-1), "'byte_width' must be > 0") + expect_error(fixed_size_binary("four")) + expect_error(fixed_size_binary(c(2, 4))) +}) + +test_that("DataType to C-interface", { + datatype <- timestamp("ms", timezone = "Pacific/Marquesas") + + # export the datatype via the C-interface + ptr <- allocate_arrow_schema() + datatype$export_to_c(ptr) + + # then import it and check that the roundtripped value is the same + circle <- DataType$import_from_c(ptr) + expect_equal(circle, datatype) + + # must clean up the pointer or we leak + delete_arrow_schema(ptr) +}) diff --git a/src/arrow/r/tests/testthat/test-dataset-csv.R b/src/arrow/r/tests/testthat/test-dataset-csv.R new file mode 100644 index 000000000..ab6693148 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dataset-csv.R @@ -0,0 +1,290 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +library(dplyr, warn.conflicts = FALSE) + +csv_dir <- make_temp_dir() +tsv_dir <- make_temp_dir() + +test_that("Setup (putting data in the dirs)", { + dir.create(file.path(csv_dir, 5)) + dir.create(file.path(csv_dir, 6)) + write.csv(df1, file.path(csv_dir, 5, "file1.csv"), row.names = FALSE) + write.csv(df2, file.path(csv_dir, 6, "file2.csv"), row.names = FALSE) + expect_length(dir(csv_dir, recursive = TRUE), 2) + + # Now, tab-delimited + dir.create(file.path(tsv_dir, 5)) + dir.create(file.path(tsv_dir, 6)) + write.table(df1, file.path(tsv_dir, 5, "file1.tsv"), row.names = FALSE, sep = "\t") + write.table(df2, file.path(tsv_dir, 6, "file2.tsv"), row.names = FALSE, sep = "\t") + expect_length(dir(tsv_dir, recursive = TRUE), 2) +}) + +test_that("CSV dataset", { + ds <- open_dataset(csv_dir, partitioning = "part", format = "csv") + expect_r6_class(ds$format, "CsvFileFormat") + expect_r6_class(ds$filesystem, "LocalFileSystem") + expect_identical(names(ds), c(names(df1), "part")) + if (getRversion() >= "4.0.0") { + # CountRows segfaults on RTools35/R 3.6, so don't test it there + expect_identical(dim(ds), c(20L, 7L)) + } + expect_equal( + ds %>% + select(string = chr, integer = int, part) %>% + filter(integer > 6 & part == 5) %>% + collect() %>% + summarize(mean = mean(as.numeric(integer))), # as.numeric bc they're being parsed as int64 + df1 %>% + select(string = chr, integer = int) %>% + filter(integer > 6) %>% + summarize(mean = mean(integer)) + ) + # Collecting virtual partition column works + expect_equal( + collect(ds) %>% arrange(part) %>% pull(part), + c(rep(5, 10), rep(6, 10)) + ) +}) + +test_that("CSV scan options", { + options <- FragmentScanOptions$create("text") + expect_equal(options$type, "csv") + options <- FragmentScanOptions$create("csv", + null_values = c("mynull"), + strings_can_be_null = TRUE + ) + expect_equal(options$type, "csv") + + dst_dir <- make_temp_dir() + dst_file <- file.path(dst_dir, "data.csv") + df <- tibble(chr = c("foo", "mynull")) + write.csv(df, dst_file, row.names = FALSE, quote = FALSE) + + ds <- open_dataset(dst_dir, format = "csv") + expect_equal(ds %>% collect(), df) + + sb <- ds$NewScan() + sb$FragmentScanOptions(options) + + tab <- sb$Finish()$ToTable() + expect_equal(as.data.frame(tab), tibble(chr = c("foo", NA))) + + # Set default convert options in CsvFileFormat + csv_format <- CsvFileFormat$create( + null_values = c("mynull"), + strings_can_be_null = TRUE + ) + ds <- open_dataset(dst_dir, format = csv_format) + expect_equal(ds %>% collect(), tibble(chr = c("foo", NA))) + + # Set both parse and convert options + df <- tibble(chr = c("foo", "mynull"), chr2 = c("bar", "baz")) + write.table(df, dst_file, row.names = FALSE, quote = FALSE, sep = "\t") + ds <- open_dataset(dst_dir, + format = "csv", + delimiter = "\t", + null_values = c("mynull"), + strings_can_be_null = TRUE + ) + expect_equal(ds %>% collect(), tibble( + chr = c("foo", NA), + chr2 = c("bar", "baz") + )) + expect_equal( + ds %>% + group_by(chr2) %>% + summarize(na = all(is.na(chr))) %>% + arrange(chr2) %>% + collect(), + tibble( + chr2 = c("bar", "baz"), + na = c(FALSE, TRUE) + ) + ) +}) + +test_that("compressed CSV dataset", { + skip_if_not_available("gzip") + dst_dir <- make_temp_dir() + dst_file <- file.path(dst_dir, "data.csv.gz") + write.csv(df1, gzfile(dst_file), row.names = FALSE, quote = FALSE) + format <- FileFormat$create("csv") + ds <- open_dataset(dst_dir, format = format) + expect_r6_class(ds$format, "CsvFileFormat") + expect_r6_class(ds$filesystem, "LocalFileSystem") + + expect_equal( + ds %>% + select(string = chr, integer = int) %>% + filter(integer > 6 & integer < 11) %>% + collect() %>% + summarize(mean = mean(integer)), + df1 %>% + select(string = chr, integer = int) %>% + filter(integer > 6) %>% + summarize(mean = mean(integer)) + ) +}) + +test_that("CSV dataset options", { + dst_dir <- make_temp_dir() + dst_file <- file.path(dst_dir, "data.csv") + df <- tibble(chr = letters[1:10]) + write.csv(df, dst_file, row.names = FALSE, quote = FALSE) + + format <- FileFormat$create("csv", skip_rows = 1) + ds <- open_dataset(dst_dir, format = format) + + expect_equal( + ds %>% + select(string = a) %>% + collect(), + df1[-1, ] %>% + select(string = chr) + ) + + ds <- open_dataset(dst_dir, format = "csv", column_names = c("foo")) + + expect_equal( + ds %>% + select(string = foo) %>% + collect(), + tibble(string = c(c("chr"), letters[1:10])) + ) +}) + +test_that("Other text delimited dataset", { + ds1 <- open_dataset(tsv_dir, partitioning = "part", format = "tsv") + expect_equal( + ds1 %>% + select(string = chr, integer = int, part) %>% + filter(integer > 6 & part == 5) %>% + collect() %>% + summarize(mean = mean(as.numeric(integer))), # as.numeric bc they're being parsed as int64 + df1 %>% + select(string = chr, integer = int) %>% + filter(integer > 6) %>% + summarize(mean = mean(integer)) + ) + + ds2 <- open_dataset(tsv_dir, partitioning = "part", format = "text", delimiter = "\t") + expect_equal( + ds2 %>% + select(string = chr, integer = int, part) %>% + filter(integer > 6 & part == 5) %>% + collect() %>% + summarize(mean = mean(as.numeric(integer))), # as.numeric bc they're being parsed as int64 + df1 %>% + select(string = chr, integer = int) %>% + filter(integer > 6) %>% + summarize(mean = mean(integer)) + ) +}) + +test_that("readr parse options", { + arrow_opts <- names(formals(CsvParseOptions$create)) + readr_opts <- names(formals(readr_to_csv_parse_options)) + + # Arrow and readr parse options must be mutually exclusive, or else the code + # in `csv_file_format_parse_options()` will error or behave incorrectly. A + # failure of this test indicates that these two sets of option names are not + # mutually exclusive. + expect_equal( + intersect(arrow_opts, readr_opts), + character(0) + ) + + # With not yet supported readr parse options (ARROW-8631) + expect_error( + open_dataset(tsv_dir, partitioning = "part", delim = "\t", na = "\\N"), + "supported" + ) + + # With unrecognized (garbage) parse options + expect_error( + open_dataset( + tsv_dir, + partitioning = "part", + format = "text", + asdfg = "\\" + ), + "Unrecognized" + ) + + # With both Arrow and readr parse options (disallowed) + expect_error( + open_dataset( + tsv_dir, + partitioning = "part", + format = "text", + quote = "\"", + quoting = TRUE + ), + "either" + ) + + # With ambiguous partial option names (disallowed) + expect_error( + open_dataset( + tsv_dir, + partitioning = "part", + format = "text", + quo = "\"", + ), + "Ambiguous" + ) + + # With only readr parse options (and omitting format = "text") + ds1 <- open_dataset(tsv_dir, partitioning = "part", delim = "\t") + expect_equal( + ds1 %>% + select(string = chr, integer = int, part) %>% + filter(integer > 6 & part == 5) %>% + collect() %>% + summarize(mean = mean(as.numeric(integer))), # as.numeric bc they're being parsed as int64 + df1 %>% + select(string = chr, integer = int) %>% + filter(integer > 6) %>% + summarize(mean = mean(integer)) + ) +}) + +# see https://issues.apache.org/jira/browse/ARROW-12791 +test_that("Error if no format specified and files are not parquet", { + expect_error( + open_dataset(csv_dir, partitioning = "part"), + "Did you mean to specify a 'format' other than the default (parquet)?", + fixed = TRUE + ) + expect_error( + open_dataset(csv_dir, partitioning = "part", format = "parquet"), + "Parquet magic bytes not found" + ) +}) + +test_that("Column names inferred from schema for headerless CSVs (ARROW-14063)", { + headerless_csv_dir <- make_temp_dir() + tbl <- df1[, c("int", "dbl")] + write.table(tbl, file.path(headerless_csv_dir, "file1.csv"), sep = ",", row.names = FALSE, col.names = FALSE) + + ds <- open_dataset(headerless_csv_dir, format = "csv", schema = schema(int = int32(), dbl = float64())) + expect_equal(ds %>% collect(), tbl) +}) diff --git a/src/arrow/r/tests/testthat/test-dataset-dplyr.R b/src/arrow/r/tests/testthat/test-dataset-dplyr.R new file mode 100644 index 000000000..b4519377c --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dataset-dplyr.R @@ -0,0 +1,340 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") +skip_if_not_available("parquet") + +library(dplyr, warn.conflicts = FALSE) + +dataset_dir <- make_temp_dir() +hive_dir <- make_temp_dir() + +test_that("Setup (putting data in the dir)", { + dir.create(file.path(dataset_dir, 1)) + dir.create(file.path(dataset_dir, 2)) + write_parquet(df1, file.path(dataset_dir, 1, "file1.parquet")) + write_parquet(df2, file.path(dataset_dir, 2, "file2.parquet")) + expect_length(dir(dataset_dir, recursive = TRUE), 2) + + dir.create(file.path(hive_dir, "subdir", "group=1", "other=xxx"), recursive = TRUE) + dir.create(file.path(hive_dir, "subdir", "group=2", "other=yyy"), recursive = TRUE) + write_parquet(df1, file.path(hive_dir, "subdir", "group=1", "other=xxx", "file1.parquet")) + write_parquet(df2, file.path(hive_dir, "subdir", "group=2", "other=yyy", "file2.parquet")) + expect_length(dir(hive_dir, recursive = TRUE), 2) +}) + +test_that("filter() with is.nan()", { + ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) + expect_equal( + ds %>% + select(part, dbl) %>% + filter(!is.nan(dbl), part == 2) %>% + collect(), + tibble(part = 2L, dbl = df2$dbl[!is.nan(df2$dbl)]) + ) +}) + +test_that("filter() with %in%", { + ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) + expect_equal( + ds %>% + select(int, part) %>% + filter(int %in% c(6, 4, 3, 103, 107), part == 1) %>% + collect(), + tibble(int = df1$int[c(3, 4, 6)], part = 1) + ) + + # ARROW-9606: bug in %in% filter on partition column with >1 partition columns + ds <- open_dataset(hive_dir) + expect_equal( + ds %>% + filter(group %in% 2) %>% + select(names(df2)) %>% + collect(), + df2 + ) +}) + +test_that("filter() on timestamp columns", { + ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) + expect_equal( + ds %>% + filter(ts >= lubridate::ymd_hms("2015-05-04 03:12:39")) %>% + filter(part == 1) %>% + select(ts) %>% + collect(), + df1[5:10, c("ts")], + ) + + # Now with Date + expect_equal( + ds %>% + filter(ts >= as.Date("2015-05-04")) %>% + filter(part == 1) %>% + select(ts) %>% + collect(), + df1[5:10, c("ts")], + ) + + # Now with bare string date + skip("Implement more aggressive implicit casting for scalars (ARROW-11402)") + expect_equal( + ds %>% + filter(ts >= "2015-05-04") %>% + filter(part == 1) %>% + select(ts) %>% + collect(), + df1[5:10, c("ts")], + ) +}) + +test_that("filter() on date32 columns", { + tmp <- tempfile() + dir.create(tmp) + df <- data.frame(date = as.Date(c("2020-02-02", "2020-02-03"))) + write_parquet(df, file.path(tmp, "file.parquet")) + + expect_equal( + open_dataset(tmp) %>% + filter(date > as.Date("2020-02-02")) %>% + collect() %>% + nrow(), + 1L + ) + + # Also with timestamp scalar + expect_equal( + open_dataset(tmp) %>% + filter(date > lubridate::ymd_hms("2020-02-02 00:00:00")) %>% + collect() %>% + nrow(), + 1L + ) +}) + + +test_that("mutate()", { + ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) + mutated <- ds %>% + select(chr, dbl, int) %>% + filter(dbl * 2 > 14 & dbl - 50 < 3L) %>% + mutate(twice = int * 2) + expect_output( + print(mutated), + "FileSystemDataset (query) +chr: string +dbl: double +int: int32 +twice: double (multiply_checked(int, 2)) + +* Filter: ((multiply_checked(dbl, 2) > 14) and (subtract_checked(dbl, 50) < 3)) +See $.data for the source Arrow object", + fixed = TRUE + ) + expect_equal( + mutated %>% + collect() %>% + arrange(dbl), + rbind( + df1[8:10, c("chr", "dbl", "int")], + df2[1:2, c("chr", "dbl", "int")] + ) %>% + mutate( + twice = int * 2 + ) + ) +}) + +test_that("mutate() features not yet implemented", { + expect_error( + ds %>% + group_by(int) %>% + mutate(avg = mean(int)), + "window functions not currently supported in Arrow\nCall collect() first to pull data into R.", + fixed = TRUE + ) +}) + +test_that("filter scalar validation doesn't crash (ARROW-7772)", { + expect_error( + ds %>% + filter(int == "fff", part == 1) %>% + collect(), + "equal has no kernel matching input types .array.int32., scalar.string.." + ) +}) + +test_that("collect() on Dataset works (if fits in memory)", { + expect_equal( + collect(open_dataset(dataset_dir)) %>% arrange(int), + rbind(df1, df2) + ) +}) + +test_that("count()", { + ds <- open_dataset(dataset_dir) + df <- rbind(df1, df2) + expect_equal( + ds %>% + filter(int > 6, int < 108) %>% + count(chr) %>% + arrange(chr) %>% + collect(), + df %>% + filter(int > 6, int < 108) %>% + count(chr) + ) +}) + +test_that("arrange()", { + ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) + arranged <- ds %>% + select(chr, dbl, int) %>% + filter(dbl * 2 > 14 & dbl - 50 < 3L) %>% + mutate(twice = int * 2) %>% + arrange(chr, desc(twice), dbl + int) + expect_output( + print(arranged), + "FileSystemDataset (query) +chr: string +dbl: double +int: int32 +twice: double (multiply_checked(int, 2)) + +* Filter: ((multiply_checked(dbl, 2) > 14) and (subtract_checked(dbl, 50) < 3)) +* Sorted by chr [asc], multiply_checked(int, 2) [desc], add_checked(dbl, int) [asc] +See $.data for the source Arrow object", + fixed = TRUE + ) + expect_equal( + arranged %>% + collect(), + rbind( + df1[8, c("chr", "dbl", "int")], + df2[2, c("chr", "dbl", "int")], + df1[9, c("chr", "dbl", "int")], + df2[1, c("chr", "dbl", "int")], + df1[10, c("chr", "dbl", "int")] + ) %>% + mutate( + twice = int * 2 + ) + ) +}) + +test_that("compute()/collect(as_data_frame=FALSE)", { + ds <- open_dataset(dataset_dir) + + tab1 <- ds %>% compute() + expect_r6_class(tab1, "Table") + + tab2 <- ds %>% collect(as_data_frame = FALSE) + expect_r6_class(tab2, "Table") + + tab3 <- ds %>% + mutate(negint = -int) %>% + filter(negint > -100) %>% + arrange(chr) %>% + select(negint) %>% + compute() + + expect_r6_class(tab3, "Table") + + expect_equal( + tab3 %>% collect(), + tibble(negint = -1:-10) + ) + + tab4 <- ds %>% + mutate(negint = -int) %>% + filter(negint > -100) %>% + arrange(chr) %>% + select(negint) %>% + collect(as_data_frame = FALSE) + + expect_r6_class(tab3, "Table") + + expect_equal( + tab4 %>% collect(), + tibble(negint = -1:-10) + ) + + tab5 <- ds %>% + mutate(negint = -int) %>% + group_by(fct) %>% + compute() + + # the group_by() prevents compute() from returning a Table... + expect_s3_class(tab5, "arrow_dplyr_query") + + # ... but $.data is a Table (InMemoryDataset)... + expect_r6_class(tab5$.data, "InMemoryDataset") + # ... and the mutate() was evaluated + expect_true("negint" %in% names(tab5$.data)) +}) + +test_that("head/tail on query on dataset", { + # head/tail on arrow_dplyr_query does not have deterministic order, + # so without sorting we can only assert the correct number of rows + ds <- open_dataset(dataset_dir) + + expect_identical( + ds %>% + filter(int > 6) %>% + head(5) %>% + compute() %>% + nrow(), + 5L + ) + + expect_equal( + ds %>% + filter(int > 6) %>% + arrange(int) %>% + head() %>% + collect(), + rbind(df1[7:10, ], df2[1:2, ]) + ) + + expect_equal( + ds %>% + filter(int < 105) %>% + tail(4) %>% + compute() %>% + nrow(), + 4L + ) + + expect_equal( + ds %>% + filter(int < 105) %>% + arrange(int) %>% + tail() %>% + collect(), + rbind(df1[9:10, ], df2[1:4, ]) + ) +}) + +test_that("dplyr method not implemented messages", { + ds <- open_dataset(dataset_dir) + # This one is more nuanced + expect_error( + ds %>% filter(int > 6, dbl > max(dbl)), + "Filter expression not supported for Arrow Datasets: dbl > max(dbl)\nCall collect() first to pull data into R.", + fixed = TRUE + ) +}) diff --git a/src/arrow/r/tests/testthat/test-dataset-uri.R b/src/arrow/r/tests/testthat/test-dataset-uri.R new file mode 100644 index 000000000..bdcccf282 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dataset-uri.R @@ -0,0 +1,123 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_on_os("windows") +skip_if_not_available("parquet") +skip_if_not_available("dataset") + + +library(dplyr, warn.conflicts = FALSE) + +dataset_dir <- make_temp_dir() + +test_that("Setup (putting data in the dir)", { + dir.create(file.path(dataset_dir, 1)) + dir.create(file.path(dataset_dir, 2)) + write_parquet(df1, file.path(dataset_dir, 1, "file1.parquet")) + write_parquet(df2, file.path(dataset_dir, 2, "file2.parquet")) + expect_length(dir(dataset_dir, recursive = TRUE), 2) +}) + +files <- c( + file.path(dataset_dir, 1, "file1.parquet", fsep = "/"), + file.path(dataset_dir, 2, "file2.parquet", fsep = "/") +) + + +test_that("dataset from single local file path", { + ds <- open_dataset(files[1]) + expect_r6_class(ds, "Dataset") + expect_equal( + ds %>% + select(chr, dbl) %>% + filter(dbl > 7) %>% + collect() %>% + arrange(dbl), + df1[8:10, c("chr", "dbl")] + ) +}) + +test_that("dataset from vector of file paths", { + ds <- open_dataset(files) + expect_r6_class(ds, "Dataset") + expect_equal( + ds %>% + select(chr, dbl) %>% + filter(dbl > 7 & dbl < 53L) %>% + collect() %>% + arrange(dbl), + rbind( + df1[8:10, c("chr", "dbl")], + df2[1:2, c("chr", "dbl")] + ) + ) +}) + +test_that("dataset from directory URI", { + uri <- paste0("file://", dataset_dir) + ds <- open_dataset(uri, partitioning = schema(part = uint8())) + expect_r6_class(ds, "Dataset") + expect_equal( + ds %>% + select(chr, dbl) %>% + filter(dbl > 7 & dbl < 53L) %>% + collect() %>% + arrange(dbl), + rbind( + df1[8:10, c("chr", "dbl")], + df2[1:2, c("chr", "dbl")] + ) + ) +}) + +test_that("dataset from single file URI", { + uri <- paste0("file://", files[1]) + ds <- open_dataset(uri) + expect_r6_class(ds, "Dataset") + expect_equal( + ds %>% + select(chr, dbl) %>% + filter(dbl > 7) %>% + collect() %>% + arrange(dbl), + df1[8:10, c("chr", "dbl")] + ) +}) + +test_that("dataset from vector of file URIs", { + uris <- paste0("file://", files) + ds <- open_dataset(uris) + expect_r6_class(ds, "Dataset") + expect_equal( + ds %>% + select(chr, dbl) %>% + filter(dbl > 7 & dbl < 53L) %>% + collect() %>% + arrange(dbl), + rbind( + df1[8:10, c("chr", "dbl")], + df2[1:2, c("chr", "dbl")] + ) + ) +}) + +test_that("open_dataset errors on mixed paths and URIs", { + expect_error( + open_dataset(c(files[1], paste0("file://", files[2]))), + "Vectors of mixed paths and URIs are not supported" + ) +}) diff --git a/src/arrow/r/tests/testthat/test-dataset-write.R b/src/arrow/r/tests/testthat/test-dataset-write.R new file mode 100644 index 000000000..8e7c077e6 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dataset-write.R @@ -0,0 +1,454 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +library(dplyr, warn.conflicts = FALSE) + + +hive_dir <- make_temp_dir() +csv_dir <- make_temp_dir() + +test_that("Setup (putting data in the dirs)", { + if (arrow_with_parquet()) { + dir.create(file.path(hive_dir, "subdir", "group=1", "other=xxx"), recursive = TRUE) + dir.create(file.path(hive_dir, "subdir", "group=2", "other=yyy"), recursive = TRUE) + write_parquet(df1, file.path(hive_dir, "subdir", "group=1", "other=xxx", "file1.parquet")) + write_parquet(df2, file.path(hive_dir, "subdir", "group=2", "other=yyy", "file2.parquet")) + expect_length(dir(hive_dir, recursive = TRUE), 2) + } + + # Now, CSV + dir.create(file.path(csv_dir, 5)) + dir.create(file.path(csv_dir, 6)) + write.csv(df1, file.path(csv_dir, 5, "file1.csv"), row.names = FALSE) + write.csv(df2, file.path(csv_dir, 6, "file2.csv"), row.names = FALSE) + expect_length(dir(csv_dir, recursive = TRUE), 2) +}) + +test_that("Writing a dataset: CSV->IPC", { + ds <- open_dataset(csv_dir, partitioning = "part", format = "csv") + dst_dir <- make_temp_dir() + write_dataset(ds, dst_dir, format = "feather", partitioning = "int") + expect_true(dir.exists(dst_dir)) + expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "="))) + + new_ds <- open_dataset(dst_dir, format = "feather") + + expect_equal( + new_ds %>% + select(string = chr, integer = int) %>% + filter(integer > 6 & integer < 11) %>% + collect() %>% + summarize(mean = mean(integer)), + df1 %>% + select(string = chr, integer = int) %>% + filter(integer > 6) %>% + summarize(mean = mean(integer)) + ) + + # Check whether "int" is present in the files or just in the dirs + first <- read_feather( + dir(dst_dir, pattern = ".feather$", recursive = TRUE, full.names = TRUE)[1], + as_data_frame = FALSE + ) + # It shouldn't be there + expect_false("int" %in% names(first)) +}) + +test_that("Writing a dataset: Parquet->IPC", { + skip_if_not_available("parquet") + ds <- open_dataset(hive_dir) + dst_dir <- make_temp_dir() + write_dataset(ds, dst_dir, format = "feather", partitioning = "int") + expect_true(dir.exists(dst_dir)) + expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "="))) + + new_ds <- open_dataset(dst_dir, format = "feather") + + expect_equal( + new_ds %>% + select(string = chr, integer = int, group) %>% + filter(integer > 6 & group == 1) %>% + collect() %>% + summarize(mean = mean(integer)), + df1 %>% + select(string = chr, integer = int) %>% + filter(integer > 6) %>% + summarize(mean = mean(integer)) + ) +}) + +test_that("Writing a dataset: CSV->Parquet", { + skip_if_not_available("parquet") + ds <- open_dataset(csv_dir, partitioning = "part", format = "csv") + dst_dir <- make_temp_dir() + write_dataset(ds, dst_dir, format = "parquet", partitioning = "int") + expect_true(dir.exists(dst_dir)) + expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "="))) + + new_ds <- open_dataset(dst_dir) + + expect_equal( + new_ds %>% + select(string = chr, integer = int) %>% + filter(integer > 6 & integer < 11) %>% + collect() %>% + summarize(mean = mean(integer)), + df1 %>% + select(string = chr, integer = int) %>% + filter(integer > 6) %>% + summarize(mean = mean(integer)) + ) +}) + +test_that("Writing a dataset: Parquet->Parquet (default)", { + skip_if_not_available("parquet") + ds <- open_dataset(hive_dir) + dst_dir <- make_temp_dir() + write_dataset(ds, dst_dir, partitioning = "int") + expect_true(dir.exists(dst_dir)) + expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "="))) + + new_ds <- open_dataset(dst_dir) + + expect_equal( + new_ds %>% + select(string = chr, integer = int, group) %>% + filter(integer > 6 & group == 1) %>% + collect() %>% + summarize(mean = mean(integer)), + df1 %>% + select(string = chr, integer = int) %>% + filter(integer > 6) %>% + summarize(mean = mean(integer)) + ) +}) + +test_that("Writing a dataset: existing data behavior", { + # This test does not work on Windows because unlink does not immediately + # delete the data. + skip_on_os("windows") + ds <- open_dataset(csv_dir, partitioning = "part", format = "csv") + dst_dir <- make_temp_dir() + write_dataset(ds, dst_dir, format = "feather", partitioning = "int") + expect_true(dir.exists(dst_dir)) + + check_dataset <- function() { + new_ds <- open_dataset(dst_dir, format = "feather") + + expect_equal( + new_ds %>% + select(string = chr, integer = int) %>% + filter(integer > 6 & integer < 11) %>% + collect() %>% + summarize(mean = mean(integer)), + df1 %>% + select(string = chr, integer = int) %>% + filter(integer > 6) %>% + summarize(mean = mean(integer)) + ) + } + + check_dataset() + # By default we should overwrite + write_dataset(ds, dst_dir, format = "feather", partitioning = "int") + check_dataset() + write_dataset(ds, dst_dir, format = "feather", partitioning = "int", existing_data_behavior = "overwrite") + check_dataset() + expect_error( + write_dataset(ds, dst_dir, format = "feather", partitioning = "int", existing_data_behavior = "error"), + "directory is not empty" + ) + unlink(dst_dir, recursive = TRUE) + write_dataset(ds, dst_dir, format = "feather", partitioning = "int", existing_data_behavior = "error") + check_dataset() +}) + +test_that("Writing a dataset: no format specified", { + dst_dir <- make_temp_dir() + write_dataset(example_data, dst_dir) + new_ds <- open_dataset(dst_dir) + expect_equal( + list.files(dst_dir, pattern = "parquet"), + "part-0.parquet" + ) + expect_true( + inherits(new_ds$format, "ParquetFileFormat") + ) + expect_equal( + new_ds %>% collect(), + example_data + ) +}) + +test_that("Dataset writing: dplyr methods", { + skip_if_not_available("parquet") + ds <- open_dataset(hive_dir) + dst_dir <- tempfile() + # Specify partition vars by group_by + ds %>% + group_by(int) %>% + write_dataset(dst_dir, format = "feather") + expect_true(dir.exists(dst_dir)) + expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "="))) + + # select to specify schema (and rename) + dst_dir2 <- tempfile() + ds %>% + group_by(int) %>% + select(chr, dubs = dbl) %>% + write_dataset(dst_dir2, format = "feather") + new_ds <- open_dataset(dst_dir2, format = "feather") + + expect_equal( + collect(new_ds) %>% arrange(int), + rbind(df1[c("chr", "dbl", "int")], df2[c("chr", "dbl", "int")]) %>% rename(dubs = dbl) + ) + + # filter to restrict written rows + dst_dir3 <- tempfile() + ds %>% + filter(int == 4) %>% + write_dataset(dst_dir3, format = "feather") + new_ds <- open_dataset(dst_dir3, format = "feather") + + expect_equal( + new_ds %>% select(names(df1)) %>% collect(), + df1 %>% filter(int == 4) + ) + + # mutate + dst_dir3 <- tempfile() + ds %>% + filter(int == 4) %>% + mutate(twice = int * 2) %>% + write_dataset(dst_dir3, format = "feather") + new_ds <- open_dataset(dst_dir3, format = "feather") + + expect_equal( + new_ds %>% select(c(names(df1), "twice")) %>% collect(), + df1 %>% filter(int == 4) %>% mutate(twice = int * 2) + ) +}) + +test_that("Dataset writing: non-hive", { + skip_if_not_available("parquet") + ds <- open_dataset(hive_dir) + dst_dir <- tempfile() + write_dataset(ds, dst_dir, format = "feather", partitioning = "int", hive_style = FALSE) + expect_true(dir.exists(dst_dir)) + expect_identical(dir(dst_dir), sort(as.character(c(1:10, 101:110)))) +}) + +test_that("Dataset writing: no partitioning", { + skip_if_not_available("parquet") + ds <- open_dataset(hive_dir) + dst_dir <- tempfile() + write_dataset(ds, dst_dir, format = "feather", partitioning = NULL) + expect_true(dir.exists(dst_dir)) + expect_true(length(dir(dst_dir)) > 0) +}) + +test_that("Dataset writing: partition on null", { + ds <- open_dataset(hive_dir) + + dst_dir <- tempfile() + partitioning <- hive_partition(lgl = boolean()) + write_dataset(ds, dst_dir, partitioning = partitioning) + expect_true(dir.exists(dst_dir)) + expect_identical(dir(dst_dir), c("lgl=__HIVE_DEFAULT_PARTITION__", "lgl=false", "lgl=true")) + + dst_dir <- tempfile() + partitioning <- hive_partition(lgl = boolean(), null_fallback = "xyz") + write_dataset(ds, dst_dir, partitioning = partitioning) + expect_true(dir.exists(dst_dir)) + expect_identical(dir(dst_dir), c("lgl=false", "lgl=true", "lgl=xyz")) + + ds_readback <- open_dataset(dst_dir, partitioning = hive_partition(lgl = boolean(), null_fallback = "xyz")) + + expect_identical( + ds %>% + select(int, lgl) %>% + collect() %>% + arrange(lgl, int), + ds_readback %>% + select(int, lgl) %>% + collect() %>% + arrange(lgl, int) + ) +}) + +test_that("Dataset writing: from data.frame", { + dst_dir <- tempfile() + stacked <- rbind(df1, df2) + stacked %>% + group_by(int) %>% + write_dataset(dst_dir, format = "feather") + expect_true(dir.exists(dst_dir)) + expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "="))) + + new_ds <- open_dataset(dst_dir, format = "feather") + + expect_equal( + new_ds %>% + select(string = chr, integer = int) %>% + filter(integer > 6 & integer < 11) %>% + collect() %>% + summarize(mean = mean(integer)), + df1 %>% + select(string = chr, integer = int) %>% + filter(integer > 6) %>% + summarize(mean = mean(integer)) + ) +}) + +test_that("Dataset writing: from RecordBatch", { + dst_dir <- tempfile() + stacked <- record_batch(rbind(df1, df2)) + stacked %>% + group_by(int) %>% + write_dataset(dst_dir, format = "feather") + expect_true(dir.exists(dst_dir)) + expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "="))) + + new_ds <- open_dataset(dst_dir, format = "feather") + + expect_equal( + new_ds %>% + select(string = chr, integer = int) %>% + filter(integer > 6 & integer < 11) %>% + collect() %>% + summarize(mean = mean(integer)), + df1 %>% + select(string = chr, integer = int) %>% + filter(integer > 6) %>% + summarize(mean = mean(integer)) + ) +}) + +test_that("Writing a dataset: Ipc format options & compression", { + ds <- open_dataset(csv_dir, partitioning = "part", format = "csv") + dst_dir <- make_temp_dir() + + codec <- NULL + if (codec_is_available("zstd")) { + codec <- Codec$create("zstd") + } + + write_dataset(ds, dst_dir, format = "feather", codec = codec) + expect_true(dir.exists(dst_dir)) + + new_ds <- open_dataset(dst_dir, format = "feather") + expect_equal( + new_ds %>% + select(string = chr, integer = int) %>% + filter(integer > 6 & integer < 11) %>% + collect() %>% + summarize(mean = mean(integer)), + df1 %>% + select(string = chr, integer = int) %>% + filter(integer > 6) %>% + summarize(mean = mean(integer)) + ) +}) + +test_that("Writing a dataset: Parquet format options", { + skip_if_not_available("parquet") + ds <- open_dataset(csv_dir, partitioning = "part", format = "csv") + dst_dir <- make_temp_dir() + dst_dir_no_truncated_timestamps <- make_temp_dir() + + # Use trace() to confirm that options are passed in + suppressMessages(trace( + "parquet___ArrowWriterProperties___create", + tracer = quote(warning("allow_truncated_timestamps == ", allow_truncated_timestamps)), + print = FALSE, + where = write_dataset + )) + expect_warning( + write_dataset(ds, dst_dir_no_truncated_timestamps, format = "parquet", partitioning = "int"), + "allow_truncated_timestamps == FALSE" + ) + expect_warning( + write_dataset(ds, dst_dir, format = "parquet", partitioning = "int", allow_truncated_timestamps = TRUE), + "allow_truncated_timestamps == TRUE" + ) + suppressMessages(untrace( + "parquet___ArrowWriterProperties___create", + where = write_dataset + )) + + # Now confirm we can read back what we sent + expect_true(dir.exists(dst_dir)) + expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "="))) + + new_ds <- open_dataset(dst_dir) + + expect_equal( + new_ds %>% + select(string = chr, integer = int) %>% + filter(integer > 6 & integer < 11) %>% + collect() %>% + summarize(mean = mean(integer)), + df1 %>% + select(string = chr, integer = int) %>% + filter(integer > 6) %>% + summarize(mean = mean(integer)) + ) +}) + +test_that("Writing a dataset: CSV format options", { + df <- tibble( + int = 1:10, + dbl = as.numeric(1:10), + lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2), + chr = letters[1:10], + ) + + dst_dir <- make_temp_dir() + write_dataset(df, dst_dir, format = "csv") + expect_true(dir.exists(dst_dir)) + new_ds <- open_dataset(dst_dir, format = "csv") + expect_equal(new_ds %>% collect(), df) + + dst_dir <- make_temp_dir() + write_dataset(df, dst_dir, format = "csv", include_header = FALSE) + expect_true(dir.exists(dst_dir)) + new_ds <- open_dataset(dst_dir, + format = "csv", + column_names = c("int", "dbl", "lgl", "chr") + ) + expect_equal(new_ds %>% collect(), df) +}) + +test_that("Dataset writing: unsupported features/input validation", { + skip_if_not_available("parquet") + expect_error(write_dataset(4), 'dataset must be a "Dataset"') + + ds <- open_dataset(hive_dir) + expect_error( + write_dataset(ds, partitioning = c("int", "NOTACOLUMN"), format = "ipc"), + 'Invalid field name: "NOTACOLUMN"' + ) + expect_error( + write_dataset(ds, tempfile(), basename_template = "something_without_i") + ) + expect_error( + write_dataset(ds, tempfile(), basename_template = NULL) + ) +}) diff --git a/src/arrow/r/tests/testthat/test-dataset.R b/src/arrow/r/tests/testthat/test-dataset.R new file mode 100644 index 000000000..4403b479a --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dataset.R @@ -0,0 +1,696 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +library(dplyr, warn.conflicts = FALSE) + +dataset_dir <- make_temp_dir() +hive_dir <- make_temp_dir() +ipc_dir <- make_temp_dir() + +test_that("Setup (putting data in the dir)", { + if (arrow_with_parquet()) { + dir.create(file.path(dataset_dir, 1)) + dir.create(file.path(dataset_dir, 2)) + write_parquet(df1, file.path(dataset_dir, 1, "file1.parquet")) + write_parquet(df2, file.path(dataset_dir, 2, "file2.parquet")) + expect_length(dir(dataset_dir, recursive = TRUE), 2) + + dir.create(file.path(hive_dir, "subdir", "group=1", "other=xxx"), recursive = TRUE) + dir.create(file.path(hive_dir, "subdir", "group=2", "other=yyy"), recursive = TRUE) + write_parquet(df1, file.path(hive_dir, "subdir", "group=1", "other=xxx", "file1.parquet")) + write_parquet(df2, file.path(hive_dir, "subdir", "group=2", "other=yyy", "file2.parquet")) + expect_length(dir(hive_dir, recursive = TRUE), 2) + } + + # Now, an IPC format dataset + dir.create(file.path(ipc_dir, 3)) + dir.create(file.path(ipc_dir, 4)) + write_feather(df1, file.path(ipc_dir, 3, "file1.arrow")) + write_feather(df2, file.path(ipc_dir, 4, "file2.arrow")) + expect_length(dir(ipc_dir, recursive = TRUE), 2) +}) + +test_that("IPC/Feather format data", { + ds <- open_dataset(ipc_dir, partitioning = "part", format = "feather") + expect_r6_class(ds$format, "IpcFileFormat") + expect_r6_class(ds$filesystem, "LocalFileSystem") + expect_identical(names(ds), c(names(df1), "part")) + expect_identical(dim(ds), c(20L, 7L)) + + expect_equal( + ds %>% + select(string = chr, integer = int, part) %>% + filter(integer > 6 & part == 3) %>% + collect() %>% + summarize(mean = mean(integer)), + df1 %>% + select(string = chr, integer = int) %>% + filter(integer > 6) %>% + summarize(mean = mean(integer)) + ) + + # Collecting virtual partition column works + expect_equal( + ds %>% arrange(part) %>% pull(part), + c(rep(3, 10), rep(4, 10)) + ) +}) + +expect_scan_result <- function(ds, schm) { + sb <- ds$NewScan() + expect_r6_class(sb, "ScannerBuilder") + expect_equal(sb$schema, schm) + + sb$Project(c("chr", "lgl")) + sb$Filter(Expression$field_ref("dbl") == 8) + scn <- sb$Finish() + expect_r6_class(scn, "Scanner") + + tab <- scn$ToTable() + expect_r6_class(tab, "Table") + + expect_equal( + as.data.frame(tab), + df1[8, c("chr", "lgl")] + ) +} + +test_that("URI-decoding with directory partitioning", { + root <- make_temp_dir() + fmt <- FileFormat$create("feather") + fs <- LocalFileSystem$create() + selector <- FileSelector$create(root, recursive = TRUE) + dir1 <- file.path(root, "2021-05-04 00%3A00%3A00", "%24") + dir.create(dir1, recursive = TRUE) + write_feather(df1, file.path(dir1, "data.feather")) + + partitioning <- DirectoryPartitioning$create( + schema(date = timestamp(unit = "s"), string = utf8()) + ) + factory <- FileSystemDatasetFactory$create( + fs, selector, NULL, fmt, + partitioning = partitioning + ) + schm <- factory$Inspect() + ds <- factory$Finish(schm) + expect_scan_result(ds, schm) + + partitioning <- DirectoryPartitioning$create( + schema(date = timestamp(unit = "s"), string = utf8()), + segment_encoding = "none" + ) + factory <- FileSystemDatasetFactory$create( + fs, selector, NULL, fmt, + partitioning = partitioning + ) + schm <- factory$Inspect() + expect_error(factory$Finish(schm), "Invalid: error parsing") + + partitioning_factory <- DirectoryPartitioningFactory$create( + c("date", "string") + ) + factory <- FileSystemDatasetFactory$create( + fs, selector, NULL, fmt, partitioning_factory + ) + schm <- factory$Inspect() + ds <- factory$Finish(schm) + # Can't directly inspect partition expressions, so do it implicitly via scan + expect_equal( + ds %>% + filter(date == "2021-05-04 00:00:00", string == "$") %>% + select(int) %>% + collect(), + df1 %>% select(int) %>% collect() + ) + + partitioning_factory <- DirectoryPartitioningFactory$create( + c("date", "string"), + segment_encoding = "none" + ) + factory <- FileSystemDatasetFactory$create( + fs, selector, NULL, fmt, partitioning_factory + ) + schm <- factory$Inspect() + ds <- factory$Finish(schm) + expect_equal( + ds %>% + filter(date == "2021-05-04 00%3A00%3A00", string == "%24") %>% + select(int) %>% + collect(), + df1 %>% select(int) %>% collect() + ) +}) + +test_that("URI-decoding with hive partitioning", { + root <- make_temp_dir() + fmt <- FileFormat$create("feather") + fs <- LocalFileSystem$create() + selector <- FileSelector$create(root, recursive = TRUE) + dir1 <- file.path(root, "date=2021-05-04 00%3A00%3A00", "string=%24") + dir.create(dir1, recursive = TRUE) + write_feather(df1, file.path(dir1, "data.feather")) + + partitioning <- hive_partition( + date = timestamp(unit = "s"), string = utf8() + ) + factory <- FileSystemDatasetFactory$create( + fs, selector, NULL, fmt, + partitioning = partitioning + ) + ds <- factory$Finish(schm) + expect_scan_result(ds, schm) + + partitioning <- hive_partition( + date = timestamp(unit = "s"), string = utf8(), segment_encoding = "none" + ) + factory <- FileSystemDatasetFactory$create( + fs, selector, NULL, fmt, + partitioning = partitioning + ) + expect_error(factory$Finish(schm), "Invalid: error parsing") + + partitioning_factory <- hive_partition() + factory <- FileSystemDatasetFactory$create( + fs, selector, NULL, fmt, partitioning_factory + ) + schm <- factory$Inspect() + ds <- factory$Finish(schm) + # Can't directly inspect partition expressions, so do it implicitly via scan + expect_equal( + ds %>% + filter(date == "2021-05-04 00:00:00", string == "$") %>% + select(int) %>% + collect(), + df1 %>% select(int) %>% collect() + ) + + partitioning_factory <- hive_partition(segment_encoding = "none") + factory <- FileSystemDatasetFactory$create( + fs, selector, NULL, fmt, partitioning_factory + ) + schm <- factory$Inspect() + ds <- factory$Finish(schm) + expect_equal( + ds %>% + filter(date == "2021-05-04 00%3A00%3A00", string == "%24") %>% + select(int) %>% + collect(), + df1 %>% select(int) %>% collect() + ) +}) + +# Everything else below here is using parquet files +skip_if_not_available("parquet") + +files <- c( + file.path(dataset_dir, 1, "file1.parquet", fsep = "/"), + file.path(dataset_dir, 2, "file2.parquet", fsep = "/") +) + +test_that("Simple interface for datasets", { + ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) + expect_r6_class(ds$format, "ParquetFileFormat") + expect_r6_class(ds$filesystem, "LocalFileSystem") + expect_r6_class(ds, "Dataset") + expect_equal( + ds %>% + select(chr, dbl) %>% + filter(dbl > 7 & dbl < 53L) %>% # Testing the auto-casting of scalars + collect() %>% + arrange(dbl), + rbind( + df1[8:10, c("chr", "dbl")], + df2[1:2, c("chr", "dbl")] + ) + ) + + expect_equal( + ds %>% + select(string = chr, integer = int, part) %>% + filter(integer > 6 & part == 1) %>% # 6 not 6L to test autocasting + collect() %>% + summarize(mean = mean(integer)), + df1 %>% + select(string = chr, integer = int) %>% + filter(integer > 6) %>% + summarize(mean = mean(integer)) + ) + + # Collecting virtual partition column works + expect_equal( + ds %>% arrange(part) %>% pull(part), + c(rep(1, 10), rep(2, 10)) + ) +}) + +test_that("dim method returns the correct number of rows and columns", { + ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) + expect_identical(dim(ds), c(20L, 7L)) +}) + + +test_that("dim() correctly determine numbers of rows and columns on arrow_dplyr_query object", { + ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) + + expect_identical( + ds %>% + filter(chr == "a") %>% + dim(), + c(2L, 7L) + ) + expect_equal( + ds %>% + select(chr, fct, int) %>% + dim(), + c(20L, 3L) + ) + expect_identical( + ds %>% + select(chr, fct, int) %>% + filter(chr == "a") %>% + dim(), + c(2L, 3L) + ) +}) + +test_that("Simple interface for datasets (custom ParquetFileFormat)", { + ds <- open_dataset(dataset_dir, + partitioning = schema(part = uint8()), + format = FileFormat$create("parquet", dict_columns = c("chr")) + ) + expect_type_equal(ds$schema$GetFieldByName("chr")$type, dictionary()) +}) + +test_that("Hive partitioning", { + ds <- open_dataset(hive_dir, partitioning = hive_partition(other = utf8(), group = uint8())) + expect_r6_class(ds, "Dataset") + expect_equal( + ds %>% + filter(group == 2) %>% + select(chr, dbl) %>% + filter(dbl > 7 & dbl < 53) %>% + collect() %>% + arrange(dbl), + df2[1:2, c("chr", "dbl")] + ) +}) + +test_that("input validation", { + expect_error( + open_dataset(hive_dir, hive_partition(other = utf8(), group = uint8())) + ) +}) + +test_that("Partitioning inference", { + # These are the same tests as above, just using the *PartitioningFactory + ds1 <- open_dataset(dataset_dir, partitioning = "part") + expect_identical(names(ds1), c(names(df1), "part")) + expect_equal( + ds1 %>% + select(string = chr, integer = int, part) %>% + filter(integer > 6 & part == 1) %>% + collect() %>% + summarize(mean = mean(integer)), + df1 %>% + select(string = chr, integer = int) %>% + filter(integer > 6) %>% + summarize(mean = mean(integer)) + ) + + ds2 <- open_dataset(hive_dir) + expect_identical(names(ds2), c(names(df1), "group", "other")) + expect_equal( + ds2 %>% + filter(group == 2) %>% + select(chr, dbl) %>% + filter(dbl > 7 & dbl < 53) %>% + collect() %>% + arrange(dbl), + df2[1:2, c("chr", "dbl")] + ) +}) + +test_that("Dataset with multiple file formats", { + skip("https://issues.apache.org/jira/browse/ARROW-7653") + ds <- open_dataset(list( + open_dataset(dataset_dir, format = "parquet", partitioning = "part"), + open_dataset(ipc_dir, format = "arrow", partitioning = "part") + )) + expect_identical(names(ds), c(names(df1), "part")) + expect_equal( + ds %>% + filter(int > 6 & part %in% c(1, 3)) %>% + select(string = chr, integer = int) %>% + collect(), + df1 %>% + select(string = chr, integer = int) %>% + filter(integer > 6) %>% + rbind(., .) # Stack it twice + ) +}) + +test_that("Creating UnionDataset", { + ds1 <- open_dataset(file.path(dataset_dir, 1)) + ds2 <- open_dataset(file.path(dataset_dir, 2)) + union1 <- open_dataset(list(ds1, ds2)) + expect_r6_class(union1, "UnionDataset") + expect_equal( + union1 %>% + select(chr, dbl) %>% + filter(dbl > 7 & dbl < 53L) %>% # Testing the auto-casting of scalars + collect() %>% + arrange(dbl), + rbind( + df1[8:10, c("chr", "dbl")], + df2[1:2, c("chr", "dbl")] + ) + ) + + # Now with the c() method + union2 <- c(ds1, ds2) + expect_r6_class(union2, "UnionDataset") + expect_equal( + union2 %>% + select(chr, dbl) %>% + filter(dbl > 7 & dbl < 53L) %>% # Testing the auto-casting of scalars + collect() %>% + arrange(dbl), + rbind( + df1[8:10, c("chr", "dbl")], + df2[1:2, c("chr", "dbl")] + ) + ) + + # Confirm c() method error handling + expect_error(c(ds1, 42), "character") +}) + +test_that("map_batches", { + skip("map_batches() is broken (ARROW-14029)") + ds <- open_dataset(dataset_dir, partitioning = "part") + expect_equal( + ds %>% + filter(int > 5) %>% + select(int, lgl) %>% + map_batches(~ summarize(., min_int = min(int))), + tibble(min_int = c(6L, 101L)) + ) +}) + +test_that("partitioning = NULL to ignore partition information (but why?)", { + ds <- open_dataset(hive_dir, partitioning = NULL) + expect_identical(names(ds), names(df1)) # i.e. not c(names(df1), "group", "other") +}) + +test_that("head/tail", { + # head/tail with no query are still deterministic order + ds <- open_dataset(dataset_dir) + expect_equal(as.data.frame(head(ds)), head(df1)) + expect_equal( + as.data.frame(head(ds, 12)), + rbind(df1, df2[1:2, ]) + ) + + expect_equal(as.data.frame(tail(ds)), tail(df2)) + expect_equal( + as.data.frame(tail(ds, 12)), + rbind(df1[9:10, ], df2) + ) +}) + +test_that("Dataset [ (take by index)", { + ds <- open_dataset(dataset_dir) + # Taking only from one file + expect_equal( + as.data.frame(ds[c(4, 5, 9), 3:4]), + df1[c(4, 5, 9), 3:4] + ) + # Taking from more than one + expect_equal( + as.data.frame(ds[c(4, 5, 9, 12, 13), 3:4]), + rbind(df1[c(4, 5, 9), 3:4], df2[2:3, 3:4]) + ) + # Taking out of order + expect_equal( + as.data.frame(ds[c(4, 13, 9, 12, 5), ]), + rbind( + df1[4, ], + df2[3, ], + df1[9, ], + df2[2, ], + df1[5, ] + ) + ) + + # Take from a query + ds2 <- ds %>% + filter(int > 6) %>% + select(int, lgl) + expect_equal( + as.data.frame(ds2[c(2, 5), ]), + rbind( + df1[8, c("int", "lgl")], + df2[1, c("int", "lgl")] + ) + ) +}) + +test_that("Dataset and query print methods", { + ds <- open_dataset(hive_dir) + expect_output( + print(ds), + paste( + "FileSystemDataset with 2 Parquet files", + "int: int32", + "dbl: double", + "lgl: bool", + "chr: string", + "fct: dictionary<values=string, indices=int32>", + "ts: timestamp[us, tz=UTC]", + "group: int32", + "other: string", + sep = "\n" + ), + fixed = TRUE + ) + expect_type(ds$metadata, "list") + q <- select(ds, string = chr, lgl, integer = int) + expect_output( + print(q), + paste( + "Dataset (query)", + "string: string", + "lgl: bool", + "integer: int32", + "", + "See $.data for the source Arrow object", + sep = "\n" + ), + fixed = TRUE + ) + expect_output( + print(q %>% filter(integer == 6) %>% group_by(lgl)), + paste( + "Dataset (query)", + "string: string", + "lgl: bool", + "integer: int32", + "", + "* Filter: (int == 6)", + "* Grouped by lgl", + "See $.data for the source Arrow object", + sep = "\n" + ), + fixed = TRUE + ) +}) + +test_that("Scanner$ScanBatches", { + ds <- open_dataset(ipc_dir, format = "feather") + batches <- ds$NewScan()$Finish()$ScanBatches() + table <- Table$create(!!!batches) + expect_equal(as.data.frame(table), rbind(df1, df2)) + + batches <- ds$NewScan()$UseAsync(TRUE)$Finish()$ScanBatches() + table <- Table$create(!!!batches) + expect_equal(as.data.frame(table), rbind(df1, df2)) +}) + +test_that("Scanner$ToRecordBatchReader()", { + ds <- open_dataset(dataset_dir, partitioning = "part") + scan <- ds %>% + filter(part == 1) %>% + select(int, lgl) %>% + filter(int > 6) %>% + Scanner$create() + reader <- scan$ToRecordBatchReader() + expect_r6_class(reader, "RecordBatchReader") + expect_identical( + as.data.frame(reader$read_table()), + df1[df1$int > 6, c("int", "lgl")] + ) +}) + +test_that("Scanner$create() filter/projection pushdown", { + ds <- open_dataset(dataset_dir, partitioning = "part") + + # the standard to compare all Scanner$create()s against + scan_one <- ds %>% + filter(int > 7 & dbl < 57) %>% + select(int, dbl, lgl) %>% + mutate(int_plus = int + 1, dbl_minus = dbl - 1) %>% + Scanner$create() + + # select a column in projection + scan_two <- ds %>% + filter(int > 7 & dbl < 57) %>% + # select an extra column, since we are going to + select(int, dbl, lgl, chr) %>% + mutate(int_plus = int + 1, dbl_minus = dbl - 1) %>% + Scanner$create(projection = c("int", "dbl", "lgl", "int_plus", "dbl_minus")) + expect_identical( + as.data.frame(scan_one$ToRecordBatchReader()$read_table()), + as.data.frame(scan_two$ToRecordBatchReader()$read_table()) + ) + + # adding filters to Scanner$create + scan_three <- ds %>% + filter(int > 7) %>% + select(int, dbl, lgl) %>% + mutate(int_plus = int + 1, dbl_minus = dbl - 1) %>% + Scanner$create( + filter = Expression$create("less", Expression$field_ref("dbl"), Expression$scalar(57)) + ) + expect_identical( + as.data.frame(scan_one$ToRecordBatchReader()$read_table()), + as.data.frame(scan_three$ToRecordBatchReader()$read_table()) + ) + + expect_error( + ds %>% + select(int, dbl, lgl) %>% + Scanner$create(projection = "not_a_col"), + # Full message is "attempting to project with unknown columns" >= 4.0.0, but + # prior versions have a less nice "all(projection %in% names(proj)) is not TRUE" + "project" + ) + + expect_error( + ds %>% + select(int, dbl, lgl) %>% + Scanner$create(filter = list("foo", "bar")), + "filter expressions must be either an expression or a list of expressions" + ) +}) + +test_that("Assembling a Dataset manually and getting a Table", { + fs <- LocalFileSystem$create() + selector <- FileSelector$create(dataset_dir, recursive = TRUE) + partitioning <- DirectoryPartitioning$create(schema(part = double())) + + fmt <- FileFormat$create("parquet") + factory <- FileSystemDatasetFactory$create(fs, selector, NULL, fmt, partitioning = partitioning) + expect_r6_class(factory, "FileSystemDatasetFactory") + + schm <- factory$Inspect() + expect_r6_class(schm, "Schema") + + phys_schm <- ParquetFileReader$create(files[1])$GetSchema() + expect_equal(names(phys_schm), names(df1)) + expect_equal(names(schm), c(names(phys_schm), "part")) + + child <- factory$Finish(schm) + expect_r6_class(child, "FileSystemDataset") + expect_r6_class(child$schema, "Schema") + expect_r6_class(child$format, "ParquetFileFormat") + expect_equal(names(schm), names(child$schema)) + expect_equal(child$files, files) + + ds <- Dataset$create(list(child), schm) + expect_scan_result(ds, schm) +}) + +test_that("Assembling multiple DatasetFactories with DatasetFactory", { + factory1 <- dataset_factory(file.path(dataset_dir, 1), format = "parquet") + expect_r6_class(factory1, "FileSystemDatasetFactory") + factory2 <- dataset_factory(file.path(dataset_dir, 2), format = "parquet") + expect_r6_class(factory2, "FileSystemDatasetFactory") + + factory <- DatasetFactory$create(list(factory1, factory2)) + expect_r6_class(factory, "DatasetFactory") + + schm <- factory$Inspect() + expect_r6_class(schm, "Schema") + + phys_schm <- ParquetFileReader$create(files[1])$GetSchema() + expect_equal(names(phys_schm), names(df1)) + + ds <- factory$Finish(schm) + expect_r6_class(ds, "UnionDataset") + expect_r6_class(ds$schema, "Schema") + expect_equal(names(schm), names(ds$schema)) + expect_equal(unlist(map(ds$children, ~ .$files)), files) + + expect_scan_result(ds, schm) +}) + +# see https://issues.apache.org/jira/browse/ARROW-11328 +test_that("Collecting zero columns from a dataset doesn't return entire dataset", { + tmp <- tempfile() + write_dataset(mtcars, tmp, format = "parquet") + expect_equal( + open_dataset(tmp) %>% select() %>% collect() %>% dim(), + c(32, 0) + ) +}) + + +test_that("dataset RecordBatchReader to C-interface to arrow_dplyr_query", { + ds <- open_dataset(ipc_dir, partitioning = "part", format = "feather") + + # export the RecordBatchReader via the C-interface + stream_ptr <- allocate_arrow_array_stream() + scan <- Scanner$create(ds) + reader <- scan$ToRecordBatchReader() + reader$export_to_c(stream_ptr) + + # then import it and check that the roundtripped value is the same + circle <- RecordBatchStreamReader$import_from_c(stream_ptr) + + # create an arrow_dplyr_query() from the recordbatch reader + reader_adq <- arrow_dplyr_query(circle) + + # TODO: ARROW-14321 should be able to arrange then collect + tab_from_c_new <- reader_adq %>% + filter(int < 8, int > 55) %>% + mutate(part_plus = part + 6) %>% + collect() + expect_equal( + tab_from_c_new %>% + arrange(dbl), + ds %>% + filter(int < 8, int > 55) %>% + mutate(part_plus = part + 6) %>% + collect() %>% + arrange(dbl) + ) + + # must clean up the pointer or we leak + delete_arrow_array_stream(stream_ptr) +}) diff --git a/src/arrow/r/tests/testthat/test-dplyr-arrange.R b/src/arrow/r/tests/testthat/test-dplyr-arrange.R new file mode 100644 index 000000000..d22f64a7c --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dplyr-arrange.R @@ -0,0 +1,205 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +library(dplyr, warn.conflicts = FALSE) + +# randomize order of rows in test data +tbl <- slice_sample(example_data_for_sorting, prop = 1L) + +test_that("arrange() on integer, double, and character columns", { + compare_dplyr_binding( + .input %>% + arrange(int, chr) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + arrange(int, desc(dbl)) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + arrange(int, desc(desc(dbl))) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + arrange(int) %>% + arrange(desc(dbl)) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + arrange(int + dbl, chr) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + mutate(zzz = int + dbl, ) %>% + arrange(zzz, chr) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + mutate(zzz = int + dbl) %>% + arrange(int + dbl, chr) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + mutate(int + dbl) %>% + arrange(int + dbl, chr) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + group_by(grp) %>% + arrange(int, dbl) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + group_by(grp) %>% + arrange(int, dbl, .by_group = TRUE) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + group_by(grp, grp2) %>% + arrange(int, dbl, .by_group = TRUE) %>% + collect(), + tbl %>% + mutate(grp2 = ifelse(is.na(lgl), 1L, as.integer(lgl))) + ) + compare_dplyr_binding( + .input %>% + group_by(grp) %>% + arrange(.by_group = TRUE) %>% + pull(grp), + tbl + ) + compare_dplyr_binding( + .input %>% + arrange() %>% + collect(), + tbl %>% + group_by(grp) + ) + compare_dplyr_binding( + .input %>% + group_by(grp) %>% + arrange() %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + arrange() %>% + collect(), + tbl + ) + test_sort_col <- "chr" + compare_dplyr_binding( + .input %>% + arrange(!!sym(test_sort_col)) %>% + collect(), + tbl %>% + select(chr, lgl) + ) + test_sort_cols <- c("int", "dbl") + compare_dplyr_binding( + .input %>% + arrange(!!!syms(test_sort_cols)) %>% + collect(), + tbl + ) +}) + +test_that("arrange() on datetime columns", { + compare_dplyr_binding( + .input %>% + arrange(dttm, int) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + arrange(dttm) %>% + collect(), + tbl %>% + select(dttm, grp) + ) +}) + +test_that("arrange() on logical columns", { + compare_dplyr_binding( + .input %>% + arrange(lgl, int) %>% + collect(), + tbl + ) +}) + +test_that("arrange() with bad inputs", { + expect_error( + tbl %>% + Table$create() %>% + arrange(1), + "does not contain any field names", + fixed = TRUE + ) + expect_error( + tbl %>% + Table$create() %>% + arrange(2 + 2), + "does not contain any field names", + fixed = TRUE + ) + expect_error( + tbl %>% + Table$create() %>% + arrange(aertidjfgjksertyj), + "not found", + fixed = TRUE + ) + expect_error( + tbl %>% + Table$create() %>% + arrange(desc(aertidjfgjksertyj + iaermxiwerksxsdqq)), + "not found", + fixed = TRUE + ) + expect_error( + tbl %>% + Table$create() %>% + arrange(desc(int, chr)), + "expects only one argument", + fixed = TRUE + ) +}) diff --git a/src/arrow/r/tests/testthat/test-dplyr-collapse.R b/src/arrow/r/tests/testthat/test-dplyr-collapse.R new file mode 100644 index 000000000..c7281b62d --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dplyr-collapse.R @@ -0,0 +1,235 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +withr::local_options(list(arrow.summarise.sort = TRUE)) + +library(dplyr, warn.conflicts = FALSE) +library(stringr) + +tbl <- example_data +# Add some better string data +tbl$verses <- verses[[1]] +# c(" a ", " b ", " c ", ...) increasing padding +# nchar = 3 5 7 9 11 13 15 17 19 21 +tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both") +tbl$some_grouping <- rep(c(1, 2), 5) + +tab <- Table$create(tbl) + +test_that("implicit_schema with select", { + expect_equal( + tab %>% + select(int, lgl) %>% + implicit_schema(), + schema(int = int32(), lgl = bool()) + ) +}) + +test_that("implicit_schema with rename", { + expect_equal( + tab %>% + select(numbers = int, lgl) %>% + implicit_schema(), + schema(numbers = int32(), lgl = bool()) + ) +}) + +test_that("implicit_schema with mutate", { + expect_equal( + tab %>% + transmute( + numbers = int * 4, + words = as.character(int) + ) %>% + implicit_schema(), + schema(numbers = float64(), words = utf8()) + ) +}) + +test_that("implicit_schema with summarize", { + expect_equal( + tab %>% + summarize( + avg = mean(int) + ) %>% + implicit_schema(), + schema(avg = float64()) + ) +}) + +test_that("implicit_schema with group_by summarize", { + expect_equal( + tab %>% + group_by(some_grouping) %>% + summarize( + avg = mean(int * 5L) + ) %>% + implicit_schema(), + schema(some_grouping = float64(), avg = float64()) + ) +}) + +test_that("collapse", { + q <- tab %>% + filter(dbl > 2, chr == "d" | chr == "f") %>% + select(chr, int, lgl) %>% + mutate(twice = int * 2L) + expect_false(is_collapsed(q)) + expect_true(is_collapsed(collapse(q))) + expect_false(is_collapsed(collapse(q)$.data)) + + compare_dplyr_binding( + .input %>% + filter(dbl > 2, chr == "d" | chr == "f") %>% + select(chr, int, lgl) %>% + mutate(twice = int * 2L) %>% + collapse() %>% + filter(int < 5) %>% + select(int, twice) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(dbl > 2, chr == "d" | chr == "f") %>% + collapse() %>% + select(chr, int, lgl) %>% + collapse() %>% + filter(int < 5) %>% + select(int, chr) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(dbl > 2, chr == "d" | chr == "f") %>% + collapse() %>% + group_by(chr) %>% + select(chr, int, lgl) %>% + collapse() %>% + filter(int < 5) %>% + select(int, chr) %>% + collect(), + tbl + ) +}) + +test_that("Properties of collapsed query", { + q <- tab %>% + filter(dbl > 2) %>% + select(chr, int, lgl) %>% + mutate(twice = int * 2L) %>% + group_by(lgl) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + mutate(extra = total * 5) + + # print(tbl %>% + # filter(dbl > 2) %>% + # select(chr, int, lgl) %>% + # mutate(twice = int * 2L) %>% + # group_by(lgl) %>% + # summarize(total = sum(int, na.rm = TRUE)) %>% + # mutate(extra = total * 5)) + + # # A tibble: 3 × 3 + # lgl total extra + # <lgl> <int> <dbl> + # 1 FALSE 8 40 + # 2 TRUE 8 40 + # 3 NA 25 125 + + # Avoid evaluating just for nrow + expect_identical(dim(q), c(NA_integer_, 3L)) + + expect_output( + print(q), + "InMemoryDataset (query) +lgl: bool +total: int32 +extra: double (multiply_checked(total, 5)) + +See $.data for the source Arrow object", + fixed = TRUE + ) + expect_output( + print(q$.data), + "InMemoryDataset (query) +int: int32 +lgl: bool + +* Aggregations: +total: sum(int) +* Filter: (dbl > 2) +* Grouped by lgl +See $.data for the source Arrow object", + fixed = TRUE + ) + + skip_if(getRversion() < "3.6.0", "TODO investigate why these aren't equal") + # On older R versions: + # ── Failure (test-dplyr-collapse.R:172:3): Properties of collapsed query ──────── + # head(q, 1) %>% collect() not equal to tibble::tibble(lgl = FALSE, total = 8L, extra = 40). + # Component "total": Mean relative difference: 0.3846154 + # Component "extra": Mean relative difference: 0.3846154 + # ── Failure (test-dplyr-collapse.R:176:3): Properties of collapsed query ──────── + # tail(q, 1) %>% collect() not equal to tibble::tibble(lgl = NA, total = 25L, extra = 125). + # Component "total": Mean relative difference: 0.9230769 + # Component "extra": Mean relative difference: 0.9230769 + expect_equal( + q %>% head(1) %>% collect(), + tibble::tibble(lgl = FALSE, total = 8L, extra = 40) + ) + skip("TODO (ARROW-1XXXX): implement sorting option about where NAs go") + expect_equal( + q %>% tail(1) %>% collect(), + tibble::tibble(lgl = NA, total = 25L, extra = 125) + ) +}) + +test_that("query_on_dataset handles collapse()", { + expect_false(query_on_dataset( + tab %>% + select(int, chr) + )) + expect_false(query_on_dataset( + tab %>% + select(int, chr) %>% + collapse() %>% + select(int) + )) + + ds_dir <- tempfile() + dir.create(ds_dir) + on.exit(unlink(ds_dir)) + write_parquet(tab, file.path(ds_dir, "file.parquet")) + ds <- open_dataset(ds_dir) + + expect_true(query_on_dataset( + ds %>% + select(int, chr) + )) + expect_true(query_on_dataset( + ds %>% + select(int, chr) %>% + collapse() %>% + select(int) + )) +}) diff --git a/src/arrow/r/tests/testthat/test-dplyr-count.R b/src/arrow/r/tests/testthat/test-dplyr-count.R new file mode 100644 index 000000000..8af9b57aa --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dplyr-count.R @@ -0,0 +1,92 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +skip_if_not_available("dataset") + +library(dplyr, warn.conflicts = FALSE) + +tbl <- example_data +tbl$some_grouping <- rep(c(1, 2), 5) + +test_that("count/tally", { + compare_dplyr_binding( + .input %>% + count() %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + tally() %>% + collect(), + tbl + ) +}) + +test_that("count/tally with wt and grouped data", { + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + count(wt = int) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + tally(wt = int) %>% + collect(), + tbl + ) +}) + +test_that("count/tally with sort", { + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + count(wt = int, sort = TRUE) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + tally(wt = int, sort = TRUE) %>% + collect(), + tbl + ) +}) + +test_that("count/tally with name arg", { + compare_dplyr_binding( + .input %>% + count(name = "new_col") %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + tally(name = "new_col") %>% + collect(), + tbl + ) +}) diff --git a/src/arrow/r/tests/testthat/test-dplyr-distinct.R b/src/arrow/r/tests/testthat/test-dplyr-distinct.R new file mode 100644 index 000000000..3a44c7372 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dplyr-distinct.R @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +library(dplyr, warn.conflicts = FALSE) + +tbl <- example_data +tbl$some_grouping <- rep(c(1, 2), 5) + +test_that("distinct()", { + compare_dplyr_binding( + .input %>% + distinct(some_grouping, lgl) %>% + collect() %>% + arrange(some_grouping, lgl), + tbl + ) +}) + +test_that("distinct() works without any variables", { + compare_dplyr_binding( + .input %>% + distinct() %>% + arrange(int) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + group_by(x = int + 1) %>% + distinct() %>% + # Even though we have group_by(x), all cols (including int) are kept + arrange(int) %>% + collect(), + tbl + ) +}) + +test_that("distinct() can retain groups", { + compare_dplyr_binding( + .input %>% + group_by(some_grouping, int) %>% + distinct(lgl) %>% + collect() %>% + arrange(lgl, int), + tbl + ) + + # With expressions here + compare_dplyr_binding( + .input %>% + group_by(y = some_grouping, int) %>% + distinct(x = lgl) %>% + collect() %>% + arrange(int), + tbl + ) +}) + +test_that("distinct() can contain expressions", { + compare_dplyr_binding( + .input %>% + distinct(lgl, x = some_grouping + 1) %>% + collect() %>% + arrange(lgl, x), + tbl + ) + + compare_dplyr_binding( + .input %>% + group_by(lgl, int) %>% + distinct(x = some_grouping + 1) %>% + collect() %>% + arrange(int), + tbl + ) +}) + +test_that("distinct() can return all columns", { + skip("ARROW-13993 - need this to return correct rows from other cols") + compare_dplyr_binding( + .input %>% + distinct(lgl, .keep_all = TRUE) %>% + collect() %>% + arrange(int), + tbl + ) +}) diff --git a/src/arrow/r/tests/testthat/test-dplyr-filter.R b/src/arrow/r/tests/testthat/test-dplyr-filter.R new file mode 100644 index 000000000..72a64229c --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dplyr-filter.R @@ -0,0 +1,412 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +library(dplyr, warn.conflicts = FALSE) +library(stringr) + +tbl <- example_data +# Add some better string data +tbl$verses <- verses[[1]] +# c(" a ", " b ", " c ", ...) increasing padding +# nchar = 3 5 7 9 11 13 15 17 19 21 +tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both") +tbl$some_negative <- tbl$int * (-1)^(1:nrow(tbl)) # nolint + +test_that("filter() on is.na()", { + compare_dplyr_binding( + .input %>% + filter(is.na(lgl)) %>% + select(chr, int, lgl) %>% + collect(), + tbl + ) +}) + +test_that("filter() with NAs in selection", { + compare_dplyr_binding( + .input %>% + filter(lgl) %>% + select(chr, int, lgl) %>% + collect(), + tbl + ) +}) + +test_that("Filter returning an empty Table should not segfault (ARROW-8354)", { + compare_dplyr_binding( + .input %>% + filter(false) %>% + select(chr, int, lgl) %>% + collect(), + tbl + ) +}) + +test_that("filtering with expression", { + char_sym <- "b" + compare_dplyr_binding( + .input %>% + filter(chr == char_sym) %>% + select(string = chr, int) %>% + collect(), + tbl + ) +}) + +test_that("filtering with arithmetic", { + compare_dplyr_binding( + .input %>% + filter(dbl + 1 > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(dbl / 2 > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(dbl / 2L > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(int / 2 > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(int / 2L > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(dbl %/% 2 > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(dbl^2 > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) +}) + +test_that("filtering with expression + autocasting", { + compare_dplyr_binding( + .input %>% + filter(dbl + 1 > 3L) %>% # test autocasting with comparison to 3L + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(int + 1 > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(int^2 > 3) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl + ) +}) + +test_that("More complex select/filter", { + compare_dplyr_binding( + .input %>% + filter(dbl > 2, chr == "d" | chr == "f") %>% + select(chr, int, lgl) %>% + filter(int < 5) %>% + select(int, chr) %>% + collect(), + tbl + ) +}) + +test_that("filter() with %in%", { + compare_dplyr_binding( + .input %>% + filter(dbl > 2, chr %in% c("d", "f")) %>% + collect(), + tbl + ) +}) + +test_that("Negative scalar values", { + compare_dplyr_binding( + .input %>% + filter(some_negative > -2) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + filter(some_negative %in% -1) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + filter(int == -some_negative) %>% + collect(), + tbl + ) +}) + +test_that("filter() with between()", { + compare_dplyr_binding( + .input %>% + filter(between(dbl, 1, 2)) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(between(dbl, 0.5, 2)) %>% + collect(), + tbl + ) + + expect_identical( + tbl %>% + record_batch() %>% + filter(between(dbl, int, dbl2)) %>% + collect(), + tbl %>% + filter(dbl >= int, dbl <= dbl2) + ) + + expect_error( + tbl %>% + record_batch() %>% + filter(between(dbl, 1, "2")) %>% + collect() + ) + + expect_error( + tbl %>% + record_batch() %>% + filter(between(dbl, 1, NA)) %>% + collect() + ) + + expect_error( + tbl %>% + record_batch() %>% + filter(between(chr, 1, 2)) %>% + collect() + ) +}) + +test_that("filter() with string ops", { + skip_if_not_available("utf8proc") + compare_dplyr_binding( + .input %>% + filter(dbl > 2, str_length(verses) > 25) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(dbl > 2, str_length(str_trim(padded_strings, "left")) > 5) %>% + collect(), + tbl + ) +}) + +test_that("filter environment scope", { + # "object 'b_var' not found" + compare_dplyr_error(.input %>% filter(chr == b_var), tbl) + + b_var <- "b" + compare_dplyr_binding( + .input %>% + filter(chr == b_var) %>% + collect(), + tbl + ) + # Also for functions + # 'could not find function "isEqualTo"' because we haven't defined it yet + compare_dplyr_error(.input %>% filter(isEqualTo(int, 4)), tbl) + + # This works but only because there are S3 methods for those operations + isEqualTo <- function(x, y) x == y & !is.na(x) + compare_dplyr_binding( + .input %>% + select(-fct) %>% # factor levels aren't identical + filter(isEqualTo(int, 4)) %>% + collect(), + tbl + ) + # Try something that needs to call another nse_func + compare_dplyr_binding( + .input %>% + select(-fct) %>% + filter(nchar(padded_strings) < 10) %>% + collect(), + tbl + ) + isShortString <- function(x) nchar(x) < 10 + skip("TODO: 14071") + compare_dplyr_binding( + .input %>% + select(-fct) %>% + filter(isShortString(padded_strings)) %>% + collect(), + tbl + ) +}) + +test_that("Filtering on a column that doesn't exist errors correctly", { + with_language("fr", { + # expect_warning(., NA) because the usual behavior when it hits a filter + # that it can't evaluate is to raise a warning, collect() to R, and retry + # the filter. But we want this to error the first time because it's + # a user error, not solvable by retrying in R + expect_warning( + expect_error( + tbl %>% record_batch() %>% filter(not_a_col == 42) %>% collect(), + "objet 'not_a_col' introuvable" + ), + NA + ) + }) + with_language("en", { + expect_warning( + expect_error( + tbl %>% record_batch() %>% filter(not_a_col == 42) %>% collect(), + "object 'not_a_col' not found" + ), + NA + ) + }) +}) + +test_that("Filtering with unsupported functions", { + compare_dplyr_binding( + .input %>% + filter(int > 2, pnorm(dbl) > .99) %>% + collect(), + tbl, + warning = "Expression pnorm\\(dbl\\) > 0.99 not supported in Arrow; pulling data into R" + ) + compare_dplyr_binding( + .input %>% + filter( + nchar(chr, type = "bytes", allowNA = TRUE) == 1, # bad, Arrow msg + int > 2, # good + pnorm(dbl) > .99 # bad, opaque + ) %>% + collect(), + tbl, + warning = '\\* In nchar\\(chr, type = "bytes", allowNA = TRUE\\) == 1, allowNA = TRUE not supported by Arrow +\\* Expression pnorm\\(dbl\\) > 0.99 not supported in Arrow +pulling data into R' + ) +}) + +test_that("Calling Arrow compute functions 'directly'", { + expect_equal( + tbl %>% + record_batch() %>% + filter(arrow_add(dbl, 1) > 3L) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl %>% + filter(dbl + 1 > 3L) %>% + select(string = chr, int, dbl) + ) + + compare_dplyr_binding( + tbl %>% + record_batch() %>% + filter(arrow_greater(arrow_add(dbl, 1), 3L)) %>% + select(string = chr, int, dbl) %>% + collect(), + tbl %>% + filter(dbl + 1 > 3L) %>% + select(string = chr, int, dbl) + ) +}) + +test_that("filter() with .data pronoun", { + compare_dplyr_binding( + .input %>% + filter(.data$dbl > 4) %>% + select(.data$chr, .data$int, .data$lgl) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(is.na(.data$lgl)) %>% + select(.data$chr, .data$int, .data$lgl) %>% + collect(), + tbl + ) + + # and the .env pronoun too! + chr <- 4 + compare_dplyr_binding( + .input %>% + filter(.data$dbl > .env$chr) %>% + select(.data$chr, .data$int, .data$lgl) %>% + collect(), + tbl + ) + + skip("test now faulty - code no longer gives error & outputs a empty tibble") + # but there is an error if we don't override the masking with `.env` + compare_dplyr_error( + .input %>% + filter(.data$dbl > chr) %>% + select(.data$chr, .data$int, .data$lgl) %>% + collect(), + tbl + ) +}) diff --git a/src/arrow/r/tests/testthat/test-dplyr-funcs-conditional.R b/src/arrow/r/tests/testthat/test-dplyr-funcs-conditional.R new file mode 100644 index 000000000..4f2700795 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dplyr-funcs-conditional.R @@ -0,0 +1,409 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +library(dplyr, warn.conflicts = FALSE) +suppressPackageStartupMessages(library(bit64)) + + +tbl <- example_data +tbl$verses <- verses[[1]] +tbl$another_chr <- tail(letters, 10) + +test_that("if_else and ifelse", { + compare_dplyr_binding( + .input %>% + mutate( + y = if_else(int > 5, 1, 0) + ) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + mutate( + y = if_else(int > 5, int, 0L) + ) %>% + collect(), + tbl + ) + + expect_error( + Table$create(tbl) %>% + mutate( + y = if_else(int > 5, 1, FALSE) + ) %>% + collect(), + "NotImplemented: Function if_else has no kernel matching input types" + ) + + compare_dplyr_binding( + .input %>% + mutate( + y = if_else(int > 5, 1, NA_real_) + ) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + mutate( + y = ifelse(int > 5, 1, 0) + ) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + mutate( + y = if_else(dbl > 5, TRUE, FALSE) + ) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + mutate( + y = if_else(chr %in% letters[1:3], 1L, 3L) + ) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + mutate( + y = if_else(int > 5, "one", "zero") + ) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + mutate( + y = if_else(int > 5, chr, another_chr) + ) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + mutate( + y = if_else(int > 5, "true", chr, missing = "MISSING") + ) %>% + collect(), + tbl + ) + + # TODO: remove the mutate + warning after ARROW-13358 is merged and Arrow + # supports factors in if(_)else + compare_dplyr_binding( + .input %>% + mutate( + y = if_else(int > 5, fct, factor("a")) + ) %>% + collect() %>% + # This is a no-op on the Arrow side, but necessary to make the results equal + mutate(y = as.character(y)), + tbl, + warning = "Dictionaries .* are currently converted to strings .* in if_else and ifelse" + ) + + # detecting NA and NaN works just fine + compare_dplyr_binding( + .input %>% + mutate( + y = if_else(is.na(dbl), chr, "false", missing = "MISSING") + ) %>% + collect(), + example_data_for_sorting + ) + + # However, currently comparisons with NaNs return false and not NaNs or NAs + skip("ARROW-13364") + compare_dplyr_binding( + .input %>% + mutate( + y = if_else(dbl > 5, chr, another_chr, missing = "MISSING") + ) %>% + collect(), + example_data_for_sorting + ) + + skip("TODO: could? should? we support the autocasting in ifelse") + compare_dplyr_binding( + .input %>% + mutate(y = ifelse(int > 5, 1, FALSE)) %>% + collect(), + tbl + ) +}) + +test_that("case_when()", { + compare_dplyr_binding( + .input %>% + transmute(cw = case_when(lgl ~ dbl, !false ~ dbl + dbl2)) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + mutate(cw = case_when(int > 5 ~ 1, TRUE ~ 0)) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + transmute(cw = case_when(chr %in% letters[1:3] ~ 1L) + 41L) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + filter(case_when( + dbl + int - 1.1 == dbl2 ~ TRUE, + NA ~ NA, + TRUE ~ FALSE + ) & !is.na(dbl2)) %>% + collect(), + tbl + ) + + # dplyr::case_when() errors if values on right side of formulas do not have + # exactly the same type, but the Arrow case_when kernel allows compatible types + expect_equal( + tbl %>% + mutate(i64 = as.integer64(1e10)) %>% + Table$create() %>% + transmute(cw = case_when( + is.na(fct) ~ int, + is.na(chr) ~ dbl, + TRUE ~ i64 + )) %>% + collect(), + tbl %>% + transmute( + cw = ifelse(is.na(fct), int, ifelse(is.na(chr), dbl, 1e10)) + ) + ) + + # expected errors (which are caught by abandon_ship() and changed to warnings) + # TODO: Find a way to test these directly without abandon_ship() interfering + expect_error( + # no cases + expect_warning( + tbl %>% + Table$create() %>% + transmute(cw = case_when()), + "case_when" + ) + ) + expect_error( + # argument not a formula + expect_warning( + tbl %>% + Table$create() %>% + transmute(cw = case_when(TRUE ~ FALSE, TRUE)), + "case_when" + ) + ) + expect_error( + # non-logical R scalar on left side of formula + expect_warning( + tbl %>% + Table$create() %>% + transmute(cw = case_when(0L ~ FALSE, TRUE ~ FALSE)), + "case_when" + ) + ) + expect_error( + # non-logical Arrow column reference on left side of formula + expect_warning( + tbl %>% + Table$create() %>% + transmute(cw = case_when(int ~ FALSE)), + "case_when" + ) + ) + expect_error( + # non-logical Arrow expression on left side of formula + expect_warning( + tbl %>% + Table$create() %>% + transmute(cw = case_when(dbl + 3.14159 ~ TRUE)), + "case_when" + ) + ) + + compare_dplyr_binding( + .input %>% + transmute(cw = case_when(lgl ~ "abc")) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + transmute(cw = case_when(lgl ~ verses, !false ~ paste(chr, chr))) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + mutate( + cw = case_when(!(!(!(lgl))) ~ factor(chr), TRUE ~ fct) + ) %>% + collect(), + tbl, + warning = TRUE + ) +}) + +test_that("coalesce()", { + # character + df <- tibble( + w = c(NA_character_, NA_character_, NA_character_), + x = c(NA_character_, NA_character_, "c"), + y = c(NA_character_, "b", "c"), + z = c("a", "b", "c") + ) + compare_dplyr_binding( + .input %>% + mutate( + cw = coalesce(w), + cz = coalesce(z), + cwx = coalesce(w, x), + cwxy = coalesce(w, x, y), + cwxyz = coalesce(w, x, y, z) + ) %>% + collect(), + df + ) + + # integer + df <- tibble( + w = c(NA_integer_, NA_integer_, NA_integer_), + x = c(NA_integer_, NA_integer_, 3L), + y = c(NA_integer_, 2L, 3L), + z = 1:3 + ) + compare_dplyr_binding( + .input %>% + mutate( + cw = coalesce(w), + cz = coalesce(z), + cwx = coalesce(w, x), + cwxy = coalesce(w, x, y), + cwxyz = coalesce(w, x, y, z) + ) %>% + collect(), + df + ) + + # double with NaNs + df <- tibble( + w = c(NA_real_, NaN, NA_real_), + x = c(NA_real_, NaN, 3.3), + y = c(NA_real_, 2.2, 3.3), + z = c(1.1, 2.2, 3.3) + ) + compare_dplyr_binding( + .input %>% + mutate( + cw = coalesce(w), + cz = coalesce(z), + cwx = coalesce(w, x), + cwxy = coalesce(w, x, y), + cwxyz = coalesce(w, x, y, z) + ) %>% + collect(), + df + ) + # NaNs stay NaN and are not converted to NA in the results + # (testing this requires expect_identical()) + expect_identical( + df %>% Table$create() %>% mutate(cwx = coalesce(w, x)) %>% collect(), + df %>% mutate(cwx = coalesce(w, x)) + ) + expect_identical( + df %>% Table$create() %>% transmute(cw = coalesce(w)) %>% collect(), + df %>% transmute(cw = coalesce(w)) + ) + expect_identical( + df %>% Table$create() %>% transmute(cn = coalesce(NaN)) %>% collect(), + df %>% transmute(cn = coalesce(NaN)) + ) + # singles stay single + expect_equal( + (df %>% + Table$create(schema = schema( + w = float32(), + x = float32(), + y = float32(), + z = float32() + )) %>% + transmute(c = coalesce(w, x, y, z)) %>% + compute() + )$schema[[1]]$type, + float32() + ) + # with R literal values + compare_dplyr_binding( + .input %>% + mutate( + c1 = coalesce(4.4), + c2 = coalesce(NA_real_), + c3 = coalesce(NaN), + c4 = coalesce(w, x, y, 5.5), + c5 = coalesce(w, x, y, NA_real_), + c6 = coalesce(w, x, y, NaN) + ) %>% + collect(), + df + ) + + # factors + # TODO: remove the mutate + warning after ARROW-14167 is merged and Arrow + # supports factors in coalesce + df <- tibble( + x = factor("a", levels = c("a", "z")), + y = factor("b", levels = c("a", "b", "c")) + ) + compare_dplyr_binding( + .input %>% + mutate(c = coalesce(x, y)) %>% + collect() %>% + # This is a no-op on the Arrow side, but necessary to make the results equal + mutate(c = as.character(c)), + df, + warning = "Dictionaries .* are currently converted to strings .* in coalesce" + ) + + # no arguments + expect_error( + nse_funcs$coalesce(), + "At least one argument must be supplied to coalesce()", + fixed = TRUE + ) +}) diff --git a/src/arrow/r/tests/testthat/test-dplyr-funcs-datetime.R b/src/arrow/r/tests/testthat/test-dplyr-funcs-datetime.R new file mode 100644 index 000000000..5cb515e69 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -0,0 +1,304 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +library(lubridate, warn.conflicts = FALSE) +library(dplyr, warn.conflicts = FALSE) + +# base::strptime() defaults to local timezone +# but arrow's strptime defaults to UTC. +# So that tests are consistent, set the local timezone to UTC +# TODO: consider reevaluating this workaround after ARROW-12980 +withr::local_timezone("UTC") + +# TODO: We should test on windows once ARROW-13168 is resolved. +if (tolower(Sys.info()[["sysname"]]) == "windows") { + test_date <- as.POSIXct("2017-01-01 00:00:11.3456789", tz = "") +} else { + test_date <- as.POSIXct("2017-01-01 00:00:11.3456789", tz = "Pacific/Marquesas") +} + + +test_df <- tibble::tibble( + # test_date + 1 turns the tzone = "" to NULL, which is functionally equivalent + # so we can run some tests on Windows, but this skirts around + # https://issues.apache.org/jira/browse/ARROW-13588 + # That issue is tough because in C++, "" is the "no timezone" value + # due to static typing, so we can't distinguish a literal "" from NULL + datetime = c(test_date, NA) + 1, + date = c(as.Date("2021-09-09"), NA) +) + +# These tests test component extraction from timestamp objects + +test_that("extract year from timestamp", { + compare_dplyr_binding( + .input %>% + mutate(x = year(datetime)) %>% + collect(), + test_df + ) +}) + +test_that("extract isoyear from timestamp", { + compare_dplyr_binding( + .input %>% + mutate(x = isoyear(datetime)) %>% + collect(), + test_df + ) +}) + +test_that("extract quarter from timestamp", { + compare_dplyr_binding( + .input %>% + mutate(x = quarter(datetime)) %>% + collect(), + test_df + ) +}) + +test_that("extract month from timestamp", { + compare_dplyr_binding( + .input %>% + mutate(x = month(datetime)) %>% + collect(), + test_df + ) +}) + +test_that("extract isoweek from timestamp", { + compare_dplyr_binding( + .input %>% + mutate(x = isoweek(datetime)) %>% + collect(), + test_df + ) +}) + +test_that("extract epiweek from timestamp", { + compare_dplyr_binding( + .input %>% + mutate(x = epiweek(datetime)) %>% + collect(), + test_df + ) +}) + +test_that("extract day from timestamp", { + compare_dplyr_binding( + .input %>% + mutate(x = day(datetime)) %>% + collect(), + test_df + ) +}) + +test_that("extract wday from timestamp", { + compare_dplyr_binding( + .input %>% + mutate(x = wday(datetime)) %>% + collect(), + test_df + ) + + compare_dplyr_binding( + .input %>% + mutate(x = wday(date, week_start = 3)) %>% + collect(), + test_df + ) + + compare_dplyr_binding( + .input %>% + mutate(x = wday(date, week_start = 1)) %>% + collect(), + test_df + ) + + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + + compare_dplyr_binding( + .input %>% + mutate(x = wday(date, label = TRUE)) %>% + mutate(x = as.character(x)) %>% + collect(), + test_df + ) + + compare_dplyr_binding( + .input %>% + mutate(x = wday(datetime, label = TRUE, abbr = TRUE)) %>% + mutate(x = as.character(x)) %>% + collect(), + test_df + ) +}) + +test_that("extract yday from timestamp", { + compare_dplyr_binding( + .input %>% + mutate(x = yday(datetime)) %>% + collect(), + test_df + ) +}) + +test_that("extract hour from timestamp", { + compare_dplyr_binding( + .input %>% + mutate(x = hour(datetime)) %>% + collect(), + test_df + ) +}) + +test_that("extract minute from timestamp", { + compare_dplyr_binding( + .input %>% + mutate(x = minute(datetime)) %>% + collect(), + test_df + ) +}) + +test_that("extract second from timestamp", { + compare_dplyr_binding( + .input %>% + mutate(x = second(datetime)) %>% + collect(), + test_df, + # arrow supports nanosecond resolution but lubridate does not + tolerance = 1e-6 + ) +}) + +# These tests test extraction of components from date32 objects + +test_that("extract year from date", { + compare_dplyr_binding( + .input %>% + mutate(x = year(date)) %>% + collect(), + test_df + ) +}) + +test_that("extract isoyear from date", { + compare_dplyr_binding( + .input %>% + mutate(x = isoyear(date)) %>% + collect(), + test_df + ) +}) + +test_that("extract quarter from date", { + compare_dplyr_binding( + .input %>% + mutate(x = quarter(date)) %>% + collect(), + test_df + ) +}) + +test_that("extract month from date", { + compare_dplyr_binding( + .input %>% + mutate(x = month(date)) %>% + collect(), + test_df + ) +}) + +test_that("extract isoweek from date", { + compare_dplyr_binding( + .input %>% + mutate(x = isoweek(date)) %>% + collect(), + test_df + ) +}) + +test_that("extract epiweek from date", { + compare_dplyr_binding( + .input %>% + mutate(x = epiweek(date)) %>% + collect(), + test_df + ) +}) + +test_that("extract day from date", { + compare_dplyr_binding( + .input %>% + mutate(x = day(date)) %>% + collect(), + test_df + ) +}) + +test_that("extract wday from date", { + compare_dplyr_binding( + .input %>% + mutate(x = wday(date)) %>% + collect(), + test_df + ) + + compare_dplyr_binding( + .input %>% + mutate(x = wday(date, week_start = 3)) %>% + collect(), + test_df + ) + + compare_dplyr_binding( + .input %>% + mutate(x = wday(date, week_start = 1)) %>% + collect(), + test_df + ) + + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + + compare_dplyr_binding( + .input %>% + mutate(x = wday(date, label = TRUE, abbr = TRUE)) %>% + mutate(x = as.character(x)) %>% + collect(), + test_df + ) + + compare_dplyr_binding( + .input %>% + mutate(x = wday(date, label = TRUE)) %>% + mutate(x = as.character(x)) %>% + collect(), + test_df + ) +}) + +test_that("extract yday from date", { + compare_dplyr_binding( + .input %>% + mutate(x = yday(date)) %>% + collect(), + test_df + ) +}) diff --git a/src/arrow/r/tests/testthat/test-dplyr-funcs-math.R b/src/arrow/r/tests/testthat/test-dplyr-funcs-math.R new file mode 100644 index 000000000..b66630675 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dplyr-funcs-math.R @@ -0,0 +1,309 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +library(dplyr, warn.conflicts = FALSE) + + +test_that("abs()", { + df <- tibble(x = c(-127, -10, -1, -0, 0, 1, 10, 127, NA)) + + compare_dplyr_binding( + .input %>% + transmute(abs = abs(x)) %>% + collect(), + df + ) +}) + +test_that("sign()", { + df <- tibble(x = c(-127, -10, -1, -0, 0, 1, 10, 127, NA)) + + compare_dplyr_binding( + .input %>% + transmute(sign = sign(x)) %>% + collect(), + df + ) +}) + +test_that("ceiling(), floor(), trunc(), round()", { + df <- tibble(x = c(-1, -0.55, -0.5, -0.1, 0, 0.1, 0.5, 0.55, 1, NA, NaN)) + + compare_dplyr_binding( + .input %>% + mutate( + c = ceiling(x), + f = floor(x), + t = trunc(x), + r = round(x) + ) %>% + collect(), + df + ) + + # with digits set to 1 + compare_dplyr_binding( + .input %>% + filter(x %% 0.5 == 0) %>% # filter out indeterminate cases (see below) + mutate(r = round(x, 1)) %>% + collect(), + df + ) + + # with digits set to -1 + compare_dplyr_binding( + .input %>% + mutate( + rd = round(floor(x * 111), -1), # double + y = ifelse(is.nan(x), NA_integer_, x), + ri = round(as.integer(y * 111), -1) # integer (with the NaN removed) + ) %>% + collect(), + df + ) + + # round(x, -2) is equivalent to round_to_multiple(x, 100) + expect_equal( + Table$create(x = 1111.1) %>% + mutate(r = round(x, -2)) %>% + collect(), + Table$create(x = 1111.1) %>% + mutate(r = arrow_round_to_multiple(x, options = list(multiple = 100))) %>% + collect() + ) + + # For consistency with base R, the binding for round() uses the Arrow + # library's HALF_TO_EVEN round mode, but the expectations *above* would pass + # even if another round mode were used. The expectations *below* should fail + # with other round modes. However, some decimal numbers cannot be represented + # exactly as floating point numbers, and for the ones that also end in 5 (such + # as 0.55), R's rounding behavior is indeterminate: it will vary depending on + # the OS. In practice, this seems to affect Windows, so we skip these tests + # on Windows and on CRAN. + + skip_on_cran() + skip_on_os("windows") + + compare_dplyr_binding( + .input %>% + mutate(r = round(x, 1)) %>% + collect(), + df + ) + + # Verify that round mode HALF_TO_EVEN, which is what the round() binding uses, + # yields results consistent with R... + expect_equal( + as.vector( + call_function( + "round", + Array$create(df$x), + options = list(ndigits = 1L, round_mode = RoundMode$HALF_TO_EVEN) + ) + ), + round(df$x, 1) + ) + # ...but that the round mode HALF_TOWARDS_ZERO does not. If the expectation + # below fails, it means that the expectation above is not effectively testing + # that Arrow is using the HALF_TO_EVEN mode. + expect_false( + isTRUE(all.equal( + as.vector( + call_function( + "round", + Array$create(df$x), + options = list(ndigits = 1L, round_mode = RoundMode$HALF_TOWARDS_ZERO) + ) + ), + round(df$x, 1) + )) + ) +}) + +test_that("log functions", { + df <- tibble(x = c(1:10, NA, NA)) + + compare_dplyr_binding( + .input %>% + mutate(y = log(x)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = log(x, base = exp(1))) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = log(x, base = 2)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = log(x, base = 10)) %>% + collect(), + df + ) + + # test log(, base = (length == 1)) + compare_dplyr_binding( + .input %>% + mutate(y = log(x, base = 5)) %>% + collect(), + df + ) + + # test log(, base = (length != 1)) + expect_error( + nse_funcs$log(10, base = 5:6), + "base must be a column or a length-1 numeric; other values not supported by Arrow", + fixed = TRUE + ) + + # test log(x = (length != 1)) + expect_error( + nse_funcs$log(10:11), + "x must be a column or a length-1 numeric; other values not supported by Arrow", + fixed = TRUE + ) + + # test log(, base = Expression) + compare_dplyr_binding( + .input %>% + # test cases where base = 1 below + filter(x != 1) %>% + mutate( + y = log(x, base = x), + z = log(2, base = x) + ) %>% + collect(), + df + ) + + # log(1, base = 1) is NaN in both R and Arrow + # suppress the R warning because R warns but Arrow does not + suppressWarnings( + compare_dplyr_binding( + .input %>% + mutate(y = log(x, base = y)) %>% + collect(), + tibble(x = 1, y = 1) + ) + ) + + # log(n != 1, base = 1) is Inf in R and Arrow + compare_dplyr_binding( + .input %>% + mutate(y = log(x, base = y)) %>% + collect(), + tibble(x = 10, y = 1) + ) + + compare_dplyr_binding( + .input %>% + mutate(y = logb(x)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = log1p(x)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = log2(x)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = log10(x)) %>% + collect(), + df + ) +}) + +test_that("trig functions", { + df <- tibble(x = c(seq(from = 0, to = 1, by = 0.1), NA)) + + compare_dplyr_binding( + .input %>% + mutate(y = sin(x)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = cos(x)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = tan(x)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = asin(x)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = acos(x)) %>% + collect(), + df + ) +}) + +test_that("arith functions ", { + df <- tibble(x = c(1:5, NA)) + + compare_dplyr_binding( + .input %>% + transmute( + int_div = x %/% 2, + addition = x + 1, + multiplication = x * 3, + subtraction = x - 5, + division = x / 2, + power = x^3, + modulo = x %% 3 + ) %>% + collect(), + df + ) +}) diff --git a/src/arrow/r/tests/testthat/test-dplyr-funcs-string.R b/src/arrow/r/tests/testthat/test-dplyr-funcs-string.R new file mode 100644 index 000000000..5e092f4e3 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dplyr-funcs-string.R @@ -0,0 +1,1399 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") +skip_if_not_available("utf8proc") + +library(dplyr, warn.conflicts = FALSE) +library(lubridate) +library(stringr) +library(stringi) + +test_that("paste, paste0, and str_c", { + df <- tibble( + v = c("A", "B", "C"), + w = c("a", "b", "c"), + x = c("d", NA_character_, "f"), + y = c(NA_character_, "h", "i"), + z = c(1.1, 2.2, NA) + ) + x <- Expression$field_ref("x") + y <- Expression$field_ref("y") + + # no NAs in data + compare_dplyr_binding( + .input %>% + transmute(paste(v, w)) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + transmute(paste(v, w, sep = "-")) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + transmute(paste0(v, w)) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + transmute(str_c(v, w)) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + transmute(str_c(v, w, sep = "+")) %>% + collect(), + df + ) + + # NAs in data + compare_dplyr_binding( + .input %>% + transmute(paste(x, y)) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + transmute(paste(x, y, sep = "-")) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + transmute(str_c(x, y)) %>% + collect(), + df + ) + + # non-character column in dots + compare_dplyr_binding( + .input %>% + transmute(paste0(x, y, z)) %>% + collect(), + df + ) + + # literal string in dots + compare_dplyr_binding( + .input %>% + transmute(paste(x, "foo", y)) %>% + collect(), + df + ) + + # literal NA in dots + compare_dplyr_binding( + .input %>% + transmute(paste(x, NA, y)) %>% + collect(), + df + ) + + # expressions in dots + compare_dplyr_binding( + .input %>% + transmute(paste0(x, toupper(y), as.character(z))) %>% + collect(), + df + ) + + # sep is literal NA + # errors in paste() (consistent with base::paste()) + expect_error( + nse_funcs$paste(x, y, sep = NA_character_), + "Invalid separator" + ) + # emits null in str_c() (consistent with stringr::str_c()) + compare_dplyr_binding( + .input %>% + transmute(str_c(x, y, sep = NA_character_)) %>% + collect(), + df + ) + + # sep passed in dots to paste0 (which doesn't take a sep argument) + compare_dplyr_binding( + .input %>% + transmute(paste0(x, y, sep = "-")) %>% + collect(), + df + ) + + # known differences + + # arrow allows the separator to be an array + expect_equal( + df %>% + Table$create() %>% + transmute(result = paste(x, y, sep = w)) %>% + collect(), + df %>% + transmute(result = paste(x, w, y, sep = "")) + ) + + # expected errors + + # collapse argument not supported + expect_error( + nse_funcs$paste(x, y, collapse = ""), + "collapse" + ) + expect_error( + nse_funcs$paste0(x, y, collapse = ""), + "collapse" + ) + expect_error( + nse_funcs$str_c(x, y, collapse = ""), + "collapse" + ) + + # literal vectors of length != 1 not supported + expect_error( + nse_funcs$paste(x, character(0), y), + "Literal vectors of length != 1 not supported in string concatenation" + ) + expect_error( + nse_funcs$paste(x, c(",", ";"), y), + "Literal vectors of length != 1 not supported in string concatenation" + ) +}) + +test_that("grepl with ignore.case = FALSE and fixed = TRUE", { + df <- tibble(x = c("Foo", "bar")) + compare_dplyr_binding( + .input %>% + filter(grepl("o", x, fixed = TRUE)) %>% + collect(), + df + ) +}) + +test_that("sub and gsub with ignore.case = FALSE and fixed = TRUE", { + df <- tibble(x = c("Foo", "bar")) + compare_dplyr_binding( + .input %>% + transmute(x = sub("Foo", "baz", x, fixed = TRUE)) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + transmute(x = gsub("o", "u", x, fixed = TRUE)) %>% + collect(), + df + ) +}) + +# many of the remainder of these tests require RE2 +skip_if_not_available("re2") + +test_that("grepl", { + df <- tibble(x = c("Foo", "bar")) + + for (fixed in c(TRUE, FALSE)) { + compare_dplyr_binding( + .input %>% + filter(grepl("Foo", x, fixed = fixed)) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + transmute(x = grepl("^B.+", x, ignore.case = FALSE, fixed = fixed)) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + filter(grepl("Foo", x, ignore.case = FALSE, fixed = fixed)) %>% + collect(), + df + ) + } +}) + +test_that("grepl with ignore.case = TRUE and fixed = TRUE", { + df <- tibble(x = c("Foo", "bar")) + + # base::grepl() ignores ignore.case = TRUE with a warning when fixed = TRUE, + # so we can't use compare_dplyr_binding() for these tests + expect_equal( + df %>% + Table$create() %>% + filter(grepl("O", x, ignore.case = TRUE, fixed = TRUE)) %>% + collect(), + tibble(x = "Foo") + ) + expect_equal( + df %>% + Table$create() %>% + filter(x = grepl("^B.+", x, ignore.case = TRUE, fixed = TRUE)) %>% + collect(), + tibble(x = character(0)) + ) +}) + +test_that("str_detect", { + df <- tibble(x = c("Foo", "bar")) + + compare_dplyr_binding( + .input %>% + filter(str_detect(x, regex("^F"))) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + transmute(x = str_detect(x, regex("^f[A-Z]{2}", ignore_case = TRUE))) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + transmute(x = str_detect(x, regex("^f[A-Z]{2}", ignore_case = TRUE), negate = TRUE)) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + filter(str_detect(x, fixed("o"))) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + filter(str_detect(x, fixed("O"))) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + filter(str_detect(x, fixed("O", ignore_case = TRUE))) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + filter(str_detect(x, fixed("O", ignore_case = TRUE), negate = TRUE)) %>% + collect(), + df + ) +}) + +test_that("sub and gsub", { + df <- tibble(x = c("Foo", "bar")) + + for (fixed in c(TRUE, FALSE)) { + compare_dplyr_binding( + .input %>% + transmute(x = sub("Foo", "baz", x, fixed = fixed)) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + transmute(x = sub("^B.+", "baz", x, ignore.case = FALSE, fixed = fixed)) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + transmute(x = sub("Foo", "baz", x, ignore.case = FALSE, fixed = fixed)) %>% + collect(), + df + ) + } +}) + +test_that("sub and gsub with ignore.case = TRUE and fixed = TRUE", { + df <- tibble(x = c("Foo", "bar")) + + # base::sub() and base::gsub() ignore ignore.case = TRUE with a warning when + # fixed = TRUE, so we can't use compare_dplyr_binding() for these tests + expect_equal( + df %>% + Table$create() %>% + transmute(x = sub("O", "u", x, ignore.case = TRUE, fixed = TRUE)) %>% + collect(), + tibble(x = c("Fuo", "bar")) + ) + expect_equal( + df %>% + Table$create() %>% + transmute(x = gsub("o", "u", x, ignore.case = TRUE, fixed = TRUE)) %>% + collect(), + tibble(x = c("Fuu", "bar")) + ) + expect_equal( + df %>% + Table$create() %>% + transmute(x = sub("^B.+", "baz", x, ignore.case = TRUE, fixed = TRUE)) %>% + collect(), + df # unchanged + ) +}) + +test_that("str_replace and str_replace_all", { + df <- tibble(x = c("Foo", "bar")) + + compare_dplyr_binding( + .input %>% + transmute(x = str_replace_all(x, "^F", "baz")) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + transmute(x = str_replace_all(x, regex("^F"), "baz")) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(x = str_replace(x, "^F[a-z]{2}", "baz")) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + transmute(x = str_replace(x, regex("^f[A-Z]{2}", ignore_case = TRUE), "baz")) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + transmute(x = str_replace_all(x, fixed("o"), "u")) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + transmute(x = str_replace(x, fixed("O"), "u")) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + transmute(x = str_replace(x, fixed("O", ignore_case = TRUE), "u")) %>% + collect(), + df + ) +}) + +test_that("strsplit and str_split", { + df <- tibble(x = c("Foo and bar", "baz and qux and quux")) + + compare_dplyr_binding( + .input %>% + mutate(x = strsplit(x, "and")) %>% + collect(), + df, + # `ignore_attr = TRUE` because the vctr coming back from arrow (ListArray) + # has type information in it, but it's just a bare list from R/dplyr. + ignore_attr = TRUE + ) + compare_dplyr_binding( + .input %>% + mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>% + collect(), + df, + ignore_attr = TRUE + ) + compare_dplyr_binding( + .input %>% + mutate(x = strsplit(x, " +and +")) %>% + collect(), + df, + ignore_attr = TRUE + ) + compare_dplyr_binding( + .input %>% + mutate(x = str_split(x, "and")) %>% + collect(), + df, + ignore_attr = TRUE + ) + compare_dplyr_binding( + .input %>% + mutate(x = str_split(x, "and", n = 2)) %>% + collect(), + df, + ignore_attr = TRUE + ) + compare_dplyr_binding( + .input %>% + mutate(x = str_split(x, fixed("and"), n = 2)) %>% + collect(), + df, + ignore_attr = TRUE + ) + compare_dplyr_binding( + .input %>% + mutate(x = str_split(x, regex("and"), n = 2)) %>% + collect(), + df, + ignore_attr = TRUE + ) + compare_dplyr_binding( + .input %>% + mutate(x = str_split(x, "Foo|bar", n = 2)) %>% + collect(), + df, + ignore_attr = TRUE + ) +}) + +test_that("str_to_lower, str_to_upper, and str_to_title", { + df <- tibble(x = c("foo1", " \tB a R\n", "!apACHe aRroW!")) + compare_dplyr_binding( + .input %>% + transmute( + x_lower = str_to_lower(x), + x_upper = str_to_upper(x), + x_title = str_to_title(x) + ) %>% + collect(), + df + ) + + # Error checking a single function because they all use the same code path. + expect_error( + nse_funcs$str_to_lower("Apache Arrow", locale = "sp"), + "Providing a value for 'locale' other than the default ('en') is not supported by Arrow", + fixed = TRUE + ) +}) + +test_that("arrow_*_split_whitespace functions", { + # use only ASCII whitespace characters + df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux")) + + # use only non-ASCII whitespace characters + df_utf8 <- tibble(x = c("Foo\u00A0and\u2000bar", "baz\u2006and\u1680qux\u3000and\u2008quux")) + + df_split <- tibble(x = list(c("Foo", "and", "bar"), c("baz", "and", "qux", "and", "quux"))) + + # use default option values + expect_equal( + df_ascii %>% + Table$create() %>% + mutate(x = arrow_ascii_split_whitespace(x)) %>% + collect(), + df_split, + ignore_attr = TRUE + ) + expect_equal( + df_utf8 %>% + Table$create() %>% + mutate(x = arrow_utf8_split_whitespace(x)) %>% + collect(), + df_split, + ignore_attr = TRUE + ) + + # specify non-default option values + expect_equal( + df_ascii %>% + Table$create() %>% + mutate( + x = arrow_ascii_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE)) + ) %>% + collect(), + tibble(x = list(c("Foo\nand", "bar"), c("baz\tand qux and", "quux"))), + ignore_attr = TRUE + ) + expect_equal( + df_utf8 %>% + Table$create() %>% + mutate( + x = arrow_utf8_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE)) + ) %>% + collect(), + tibble(x = list(c("Foo\u00A0and", "bar"), c("baz\u2006and\u1680qux\u3000and", "quux"))), + ignore_attr = TRUE + ) +}) + +test_that("errors and warnings in string splitting", { + # These conditions generate an error, but abandon_ship() catches the error, + # issues a warning, and pulls the data into R (if computing on InMemoryDataset) + # Elsewhere we test that abandon_ship() works, + # so here we can just call the functions directly + + x <- Expression$field_ref("x") + expect_error( + nse_funcs$str_split(x, fixed("and", ignore_case = TRUE)), + "Case-insensitive string splitting not supported by Arrow" + ) + expect_error( + nse_funcs$str_split(x, coll("and.?")), + "Pattern modifier `coll()` not supported by Arrow", + fixed = TRUE + ) + expect_error( + nse_funcs$str_split(x, boundary(type = "word")), + "Pattern modifier `boundary()` not supported by Arrow", + fixed = TRUE + ) + expect_error( + nse_funcs$str_split(x, "and", n = 0), + "Splitting strings into zero parts not supported by Arrow" + ) + + # This condition generates a warning + expect_warning( + nse_funcs$str_split(x, fixed("and"), simplify = TRUE), + "Argument 'simplify = TRUE' will be ignored" + ) +}) + +test_that("errors and warnings in string detection and replacement", { + x <- Expression$field_ref("x") + + expect_error( + nse_funcs$str_detect(x, boundary(type = "character")), + "Pattern modifier `boundary()` not supported by Arrow", + fixed = TRUE + ) + expect_error( + nse_funcs$str_replace_all(x, coll("o", locale = "en"), "ó"), + "Pattern modifier `coll()` not supported by Arrow", + fixed = TRUE + ) + + # This condition generates a warning + expect_warning( + nse_funcs$str_replace_all(x, regex("o", multiline = TRUE), "u"), + "Ignoring pattern modifier argument not supported in Arrow: \"multiline\"" + ) +}) + +test_that("backreferences in pattern in string detection", { + skip("RE2 does not support backreferences in pattern (https://github.com/google/re2/issues/101)") + df <- tibble(x = c("Foo", "bar")) + + compare_dplyr_binding( + .input %>% + filter(str_detect(x, regex("F([aeiou])\\1"))) %>% + collect(), + df + ) +}) + +test_that("backreferences (substitutions) in string replacement", { + df <- tibble(x = c("Foo", "bar")) + + compare_dplyr_binding( + .input %>% + transmute(desc = sub( + "(?:https?|ftp)://([^/\r\n]+)(/[^\r\n]*)?", + "path `\\2` on server `\\1`", + url + )) %>% + collect(), + tibble(url = "https://arrow.apache.org/docs/r/") + ) + compare_dplyr_binding( + .input %>% + transmute(x = str_replace(x, "^(\\w)o(.*)", "\\1\\2p")) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + transmute(x = str_replace(x, regex("^(\\w)o(.*)", ignore_case = TRUE), "\\1\\2p")) %>% + collect(), + df + ) + compare_dplyr_binding( + .input %>% + transmute(x = str_replace(x, regex("^(\\w)o(.*)", ignore_case = TRUE), "\\1\\2p")) %>% + collect(), + df + ) +}) + +test_that("edge cases in string detection and replacement", { + # in case-insensitive fixed match/replace, test that "\\E" in the search + # string and backslashes in the replacement string are interpreted literally. + # this test does not use compare_dplyr_binding() because base::sub() and + # base::grepl() do not support ignore.case = TRUE when fixed = TRUE. + expect_equal( + tibble(x = c("\\Q\\e\\D")) %>% + Table$create() %>% + filter(grepl("\\E", x, ignore.case = TRUE, fixed = TRUE)) %>% + collect(), + tibble(x = c("\\Q\\e\\D")) + ) + expect_equal( + tibble(x = c("\\Q\\e\\D")) %>% + Table$create() %>% + transmute(x = sub("\\E", "\\L", x, ignore.case = TRUE, fixed = TRUE)) %>% + collect(), + tibble(x = c("\\Q\\L\\D")) + ) + + # test that a user's "(?i)" prefix does not break the "(?i)" prefix that's + # added in case-insensitive regex match/replace + compare_dplyr_binding( + .input %>% + filter(grepl("(?i)^[abc]{3}$", x, ignore.case = TRUE, fixed = FALSE)) %>% + collect(), + tibble(x = c("ABC")) + ) + compare_dplyr_binding( + .input %>% + transmute(x = sub("(?i)^[abc]{3}$", "123", x, ignore.case = TRUE, fixed = FALSE)) %>% + collect(), + tibble(x = c("ABC")) + ) +}) + +test_that("strptime", { + # base::strptime() defaults to local timezone + # but arrow's strptime defaults to UTC. + # So that tests are consistent, set the local timezone to UTC + # TODO: consider reevaluating this workaround after ARROW-12980 + withr::local_timezone("UTC") + + t_string <- tibble(x = c("2018-10-07 19:04:05", NA)) + t_stamp <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA)) + + expect_equal( + t_string %>% + Table$create() %>% + mutate( + x = strptime(x) + ) %>% + collect(), + t_stamp, + ignore_attr = "tzone" + ) + + expect_equal( + t_string %>% + Table$create() %>% + mutate( + x = strptime(x, format = "%Y-%m-%d %H:%M:%S") + ) %>% + collect(), + t_stamp, + ignore_attr = "tzone" + ) + + expect_equal( + t_string %>% + Table$create() %>% + mutate( + x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "ns") + ) %>% + collect(), + t_stamp, + ignore_attr = "tzone" + ) + + expect_equal( + t_string %>% + Table$create() %>% + mutate( + x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "s") + ) %>% + collect(), + t_stamp, + ignore_attr = "tzone" + ) + + tstring <- tibble(x = c("08-05-2008", NA)) + tstamp <- strptime(c("08-05-2008", NA), format = "%m-%d-%Y") + + expect_equal( + tstring %>% + Table$create() %>% + mutate( + x = strptime(x, format = "%m-%d-%Y") + ) %>% + pull(), + # R's strptime returns POSIXlt (list type) + as.POSIXct(tstamp), + ignore_attr = "tzone" + ) +}) + +test_that("errors in strptime", { + # Error when tz is passed + x <- Expression$field_ref("x") + expect_error( + nse_funcs$strptime(x, tz = "PDT"), + "Time zone argument not supported by Arrow" + ) +}) + +test_that("strftime", { + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + + times <- tibble( + datetime = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA), + date = c(as.Date("2021-01-01"), NA) + ) + formats <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %z %Z %j %U %W %x %X %% %G %V %u" + formats_date <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %j %U %W %x %X %% %G %V %u" + + compare_dplyr_binding( + .input %>% + mutate(x = strftime(datetime, format = formats)) %>% + collect(), + times + ) + + compare_dplyr_binding( + .input %>% + mutate(x = strftime(date, format = formats_date)) %>% + collect(), + times + ) + + compare_dplyr_binding( + .input %>% + mutate(x = strftime(datetime, format = formats, tz = "Pacific/Marquesas")) %>% + collect(), + times + ) + + compare_dplyr_binding( + .input %>% + mutate(x = strftime(datetime, format = formats, tz = "EST", usetz = TRUE)) %>% + collect(), + times + ) + + withr::with_timezone( + "Pacific/Marquesas", + { + compare_dplyr_binding( + .input %>% + mutate( + x = strftime(datetime, format = formats, tz = "EST"), + x_date = strftime(date, format = formats_date, tz = "EST") + ) %>% + collect(), + times + ) + + compare_dplyr_binding( + .input %>% + mutate( + x = strftime(datetime, format = formats), + x_date = strftime(date, format = formats_date) + ) %>% + collect(), + times + ) + } + ) + + # This check is due to differences in the way %c currently works in Arrow and R's strftime. + # We can revisit after https://github.com/HowardHinnant/date/issues/704 is resolved. + expect_error( + times %>% + Table$create() %>% + mutate(x = strftime(datetime, format = "%c")) %>% + collect(), + "%c flag is not supported in non-C locales." + ) + + # Output precision of %S depends on the input timestamp precision. + # Timestamps with second precision are represented as integers while + # milliseconds, microsecond and nanoseconds are represented as fixed floating + # point numbers with 3, 6 and 9 decimal places respectively. + compare_dplyr_binding( + .input %>% + mutate(x = strftime(datetime, format = "%S")) %>% + transmute(as.double(substr(x, 1, 2))) %>% + collect(), + times, + tolerance = 1e-6 + ) +}) + +test_that("format_ISO8601", { + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + times <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA)) + + compare_dplyr_binding( + .input %>% + mutate(x = format_ISO8601(x, precision = "ymd", usetz = FALSE)) %>% + collect(), + times + ) + + if (getRversion() < "3.5") { + # before 3.5, times$x will have no timezone attribute, so Arrow faithfully + # errors that there is no timezone to format: + expect_error( + times %>% + Table$create() %>% + mutate(x = format_ISO8601(x, precision = "ymd", usetz = TRUE)) %>% + collect(), + "Timezone not present, cannot convert to string with timezone: %Y-%m-%d%z" + ) + + # See comment regarding %S flag in strftime tests + expect_error( + times %>% + Table$create() %>% + mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = TRUE)) %>% + mutate(x = gsub("\\.0*", "", x)) %>% + collect(), + "Timezone not present, cannot convert to string with timezone: %Y-%m-%dT%H:%M:%S%z" + ) + } else { + compare_dplyr_binding( + .input %>% + mutate(x = format_ISO8601(x, precision = "ymd", usetz = TRUE)) %>% + collect(), + times + ) + + # See comment regarding %S flag in strftime tests + compare_dplyr_binding( + .input %>% + mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = TRUE)) %>% + mutate(x = gsub("\\.0*", "", x)) %>% + collect(), + times + ) + } + + + # See comment regarding %S flag in strftime tests + compare_dplyr_binding( + .input %>% + mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = FALSE)) %>% + mutate(x = gsub("\\.0*", "", x)) %>% + collect(), + times + ) +}) + +test_that("arrow_find_substring and arrow_find_substring_regex", { + df <- tibble(x = c("Foo and Bar", "baz and qux and quux")) + + expect_equal( + df %>% + Table$create() %>% + mutate(x = arrow_find_substring(x, options = list(pattern = "b"))) %>% + collect(), + tibble(x = c(-1, 0)) + ) + expect_equal( + df %>% + Table$create() %>% + mutate(x = arrow_find_substring( + x, + options = list(pattern = "b", ignore_case = TRUE) + )) %>% + collect(), + tibble(x = c(8, 0)) + ) + expect_equal( + df %>% + Table$create() %>% + mutate(x = arrow_find_substring_regex( + x, + options = list(pattern = "^[fb]") + )) %>% + collect(), + tibble(x = c(-1, 0)) + ) + expect_equal( + df %>% + Table$create() %>% + mutate(x = arrow_find_substring_regex( + x, + options = list(pattern = "[AEIOU]", ignore_case = TRUE) + )) %>% + collect(), + tibble(x = c(1, 1)) + ) +}) + +test_that("stri_reverse and arrow_ascii_reverse functions", { + df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux")) + + df_utf8 <- tibble(x = c("Foo\u00A0\u0061nd\u00A0bar", "\u0062az\u00A0and\u00A0qux\u3000and\u00A0quux")) + + compare_dplyr_binding( + .input %>% + mutate(x = stri_reverse(x)) %>% + collect(), + df_utf8 + ) + + compare_dplyr_binding( + .input %>% + mutate(x = stri_reverse(x)) %>% + collect(), + df_ascii + ) + + expect_equal( + df_ascii %>% + Table$create() %>% + mutate(x = arrow_ascii_reverse(x)) %>% + collect(), + tibble(x = c("rab dna\nooF", "xuuq dna xuq dna\tzab")) + ) + + expect_error( + df_utf8 %>% + Table$create() %>% + mutate(x = arrow_ascii_reverse(x)) %>% + collect(), + "Invalid: Non-ASCII sequence in input" + ) +}) + +test_that("str_like", { + df <- tibble(x = c("Foo and bar", "baz and qux and quux")) + + # TODO: After new version of stringr with str_like has been released, update all + # these tests to use compare_dplyr_binding + + # No match - entire string + expect_equal( + df %>% + Table$create() %>% + mutate(x = str_like(x, "baz")) %>% + collect(), + tibble(x = c(FALSE, FALSE)) + ) + + # Match - entire string + expect_equal( + df %>% + Table$create() %>% + mutate(x = str_like(x, "Foo and bar")) %>% + collect(), + tibble(x = c(TRUE, FALSE)) + ) + + # Wildcard + expect_equal( + df %>% + Table$create() %>% + mutate(x = str_like(x, "f%", ignore_case = TRUE)) %>% + collect(), + tibble(x = c(TRUE, FALSE)) + ) + + # Ignore case + expect_equal( + df %>% + Table$create() %>% + mutate(x = str_like(x, "f%", ignore_case = FALSE)) %>% + collect(), + tibble(x = c(FALSE, FALSE)) + ) + + # Single character + expect_equal( + df %>% + Table$create() %>% + mutate(x = str_like(x, "_a%")) %>% + collect(), + tibble(x = c(FALSE, TRUE)) + ) + + # This will give an error until a new version of stringr with str_like has been released + skip_if_not(packageVersion("stringr") > "1.4.0") + compare_dplyr_binding( + .input %>% + mutate(x = str_like(x, "%baz%")) %>% + collect(), + df + ) +}) + +test_that("str_pad", { + df <- tibble(x = c("Foo and bar", "baz and qux and quux")) + + compare_dplyr_binding( + .input %>% + mutate(x = str_pad(x, width = 31)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(x = str_pad(x, width = 30, side = "right")) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(x = str_pad(x, width = 31, side = "left", pad = "+")) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(x = str_pad(x, width = 10, side = "left", pad = "+")) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(x = str_pad(x, width = 31, side = "both")) %>% + collect(), + df + ) +}) + +test_that("substr", { + df <- tibble(x = "Apache Arrow") + + compare_dplyr_binding( + .input %>% + mutate(y = substr(x, 1, 6)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = substr(x, 0, 6)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = substr(x, -1, 6)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = substr(x, 6, 1)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = substr(x, -1, -2)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = substr(x, 9, 6)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = substr(x, 1, 6)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = substr(x, 8, 12)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = substr(x, -5, -1)) %>% + collect(), + df + ) + + expect_error( + nse_funcs$substr("Apache Arrow", c(1, 2), 3), + "`start` must be length 1 - other lengths are not supported in Arrow" + ) + + expect_error( + nse_funcs$substr("Apache Arrow", 1, c(2, 3)), + "`stop` must be length 1 - other lengths are not supported in Arrow" + ) +}) + +test_that("substring", { + # nse_funcs$substring just calls nse_funcs$substr, tested extensively above + df <- tibble(x = "Apache Arrow") + + compare_dplyr_binding( + .input %>% + mutate(y = substring(x, 1, 6)) %>% + collect(), + df + ) +}) + +test_that("str_sub", { + df <- tibble(x = "Apache Arrow") + + compare_dplyr_binding( + .input %>% + mutate(y = str_sub(x, 1, 6)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = str_sub(x, 0, 6)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = str_sub(x, -1, 6)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = str_sub(x, 6, 1)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = str_sub(x, -1, -2)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = str_sub(x, -1, 3)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = str_sub(x, 9, 6)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = str_sub(x, 1, 6)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = str_sub(x, 8, 12)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(y = str_sub(x, -5, -1)) %>% + collect(), + df + ) + + expect_error( + nse_funcs$str_sub("Apache Arrow", c(1, 2), 3), + "`start` must be length 1 - other lengths are not supported in Arrow" + ) + + expect_error( + nse_funcs$str_sub("Apache Arrow", 1, c(2, 3)), + "`end` must be length 1 - other lengths are not supported in Arrow" + ) +}) + +test_that("str_starts, str_ends, startsWith, endsWith", { + df <- tibble(x = c("Foo", "bar", "baz", "qux")) + + compare_dplyr_binding( + .input %>% + filter(str_starts(x, "b.*")) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + filter(str_starts(x, "b.*", negate = TRUE)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + filter(str_starts(x, fixed("b.*"))) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + filter(str_starts(x, fixed("b"))) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + filter(str_ends(x, "r")) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + filter(str_ends(x, "r", negate = TRUE)) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + filter(str_ends(x, fixed("r$"))) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + filter(str_ends(x, fixed("r"))) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + filter(startsWith(x, "b")) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + filter(endsWith(x, "r")) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + filter(startsWith(x, "b.*")) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + filter(endsWith(x, "r$")) %>% + collect(), + df + ) +}) + +test_that("str_count", { + df <- tibble( + cities = c("Kolkata", "Dar es Salaam", "Tel Aviv", "San Antonio", "Cluj Napoca", "Bern", "Bogota"), + dots = c("a.", "...", ".a.a", "a..a.", "ab...", "dse....", ".f..d..") + ) + + compare_dplyr_binding( + .input %>% + mutate(a_count = str_count(cities, pattern = "a")) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(p_count = str_count(cities, pattern = "d")) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(p_count = str_count(cities, + pattern = regex("d", ignore_case = TRUE) + )) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(e_count = str_count(cities, pattern = "u")) %>% + collect(), + df + ) + + # nse_funcs$str_count() is not vectorised over pattern + compare_dplyr_binding( + .input %>% + mutate(let_count = str_count(cities, pattern = c("a", "b", "e", "g", "p", "n", "s"))) %>% + collect(), + df, + warning = TRUE + ) + + compare_dplyr_binding( + .input %>% + mutate(dots_count = str_count(dots, ".")) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate(dots_count = str_count(dots, fixed("."))) %>% + collect(), + df + ) +}) diff --git a/src/arrow/r/tests/testthat/test-dplyr-funcs-type.R b/src/arrow/r/tests/testthat/test-dplyr-funcs-type.R new file mode 100644 index 000000000..859dc14b9 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dplyr-funcs-type.R @@ -0,0 +1,627 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +library(dplyr, warn.conflicts = FALSE) +suppressPackageStartupMessages(library(bit64)) + + +tbl <- example_data + +test_that("explicit type conversions with cast()", { + num_int32 <- 12L + num_int64 <- bit64::as.integer64(10) + + int_types <- c(int8(), int16(), int32(), int64()) + uint_types <- c(uint8(), uint16(), uint32(), uint64()) + float_types <- c(float32(), float64()) + + types <- c( + int_types, + uint_types, + float_types, + double(), # not actually a type, a base R function but should be alias for float64 + string() + ) + + for (type in types) { + expect_type_equal( + object = { + t1 <- Table$create(x = num_int32) %>% + transmute(x = cast(x, type)) %>% + compute() + t1$schema[[1]]$type + }, + as_type(type) + ) + expect_type_equal( + object = { + t1 <- Table$create(x = num_int64) %>% + transmute(x = cast(x, type)) %>% + compute() + t1$schema[[1]]$type + }, + as_type(type) + ) + } + + # Arrow errors when truncating floats... + expect_error( + expect_type_equal( + object = { + t1 <- Table$create(pi = pi) %>% + transmute(three = cast(pi, int32())) %>% + compute() + t1$schema[[1]]$type + }, + int32() + ), + "truncated" + ) + + # ... unless safe = FALSE (or allow_float_truncate = TRUE) + expect_type_equal( + object = { + t1 <- Table$create(pi = pi) %>% + transmute(three = cast(pi, int32(), safe = FALSE)) %>% + compute() + t1$schema[[1]]$type + }, + int32() + ) +}) + +test_that("explicit type conversions with as.*()", { + library(bit64) + compare_dplyr_binding( + .input %>% + transmute( + int2chr = as.character(int), + int2dbl = as.double(int), + int2int = as.integer(int), + int2num = as.numeric(int), + dbl2chr = as.character(dbl), + dbl2dbl = as.double(dbl), + dbl2int = as.integer(dbl), + dbl2num = as.numeric(dbl), + ) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + transmute( + chr2chr = as.character(chr), + chr2dbl = as.double(chr), + chr2int = as.integer(chr), + chr2num = as.numeric(chr) + ) %>% + collect(), + tibble(chr = c("1", "2", "3")) + ) + compare_dplyr_binding( + .input %>% + transmute( + chr2i64 = as.integer64(chr), + dbl2i64 = as.integer64(dbl), + i642i64 = as.integer64(i64), + ) %>% + collect(), + tibble(chr = "10000000000", dbl = 10000000000, i64 = as.integer64(1e10)) + ) + compare_dplyr_binding( + .input %>% + transmute( + chr2lgl = as.logical(chr), + dbl2lgl = as.logical(dbl), + int2lgl = as.logical(int) + ) %>% + collect(), + tibble( + chr = c("TRUE", "FALSE", "true", "false"), + dbl = c(1, 0, -99, 0), + int = c(1L, 0L, -99L, 0L) + ) + ) + compare_dplyr_binding( + .input %>% + transmute( + dbl2chr = as.character(dbl), + dbl2dbl = as.double(dbl), + dbl2int = as.integer(dbl), + dbl2lgl = as.logical(dbl), + int2chr = as.character(int), + int2dbl = as.double(int), + int2int = as.integer(int), + int2lgl = as.logical(int), + lgl2chr = as.character(lgl), # Arrow returns "true", "false" here ... + lgl2dbl = as.double(lgl), + lgl2int = as.integer(lgl), + lgl2lgl = as.logical(lgl) + ) %>% + collect() %>% + # need to use toupper() *after* collect() or else skip if utf8proc not available + mutate(lgl2chr = toupper(lgl2chr)), # ... but we need "TRUE", "FALSE" + tibble( + dbl = c(1, 0, NA_real_), + int = c(1L, 0L, NA_integer_), + lgl = c(TRUE, FALSE, NA) + ) + ) +}) + +test_that("is.finite(), is.infinite(), is.nan()", { + df <- tibble(x = c( + -4.94065645841246544e-324, 1.79769313486231570e+308, 0, + NA_real_, NaN, Inf, -Inf + )) + compare_dplyr_binding( + .input %>% + transmute( + is_fin = is.finite(x), + is_inf = is.infinite(x) + ) %>% + collect(), + df + ) + # is.nan() evaluates to FALSE on NA_real_ (ARROW-12850) + compare_dplyr_binding( + .input %>% + transmute( + is_nan = is.nan(x) + ) %>% + collect(), + df + ) +}) + +test_that("is.na() evaluates to TRUE on NaN (ARROW-12055)", { + df <- tibble(x = c(1.1, 2.2, NA_real_, 4.4, NaN, 6.6, 7.7)) + compare_dplyr_binding( + .input %>% + transmute( + is_na = is.na(x) + ) %>% + collect(), + df + ) +}) + +test_that("type checks with is() giving Arrow types", { + # with class2=DataType + expect_equal( + Table$create( + i32 = Array$create(1, int32()), + dec = Array$create(pi)$cast(decimal(3, 2)), + f64 = Array$create(1.1, float64()), + str = Array$create("a", arrow::string()) + ) %>% transmute( + i32_is_i32 = is(i32, int32()), + i32_is_dec = is(i32, decimal(3, 2)), + i32_is_i64 = is(i32, float64()), + i32_is_str = is(i32, arrow::string()), + dec_is_i32 = is(dec, int32()), + dec_is_dec = is(dec, decimal(3, 2)), + dec_is_i64 = is(dec, float64()), + dec_is_str = is(dec, arrow::string()), + f64_is_i32 = is(f64, int32()), + f64_is_dec = is(f64, decimal(3, 2)), + f64_is_i64 = is(f64, float64()), + f64_is_str = is(f64, arrow::string()), + str_is_i32 = is(str, int32()), + str_is_dec = is(str, decimal(3, 2)), + str_is_i64 = is(str, float64()), + str_is_str = is(str, arrow::string()) + ) %>% + collect() %>% + t() %>% + as.vector(), + c( + TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, + FALSE, FALSE, FALSE, FALSE, TRUE + ) + ) + # with class2=string + expect_equal( + Table$create( + i32 = Array$create(1, int32()), + f64 = Array$create(1.1, float64()), + str = Array$create("a", arrow::string()) + ) %>% transmute( + i32_is_i32 = is(i32, "int32"), + i32_is_i64 = is(i32, "double"), + i32_is_str = is(i32, "string"), + f64_is_i32 = is(f64, "int32"), + f64_is_i64 = is(f64, "double"), + f64_is_str = is(f64, "string"), + str_is_i32 = is(str, "int32"), + str_is_i64 = is(str, "double"), + str_is_str = is(str, "string") + ) %>% + collect() %>% + t() %>% + as.vector(), + c(TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE) + ) + # with class2=string alias + expect_equal( + Table$create( + f16 = Array$create(NA_real_, halffloat()), + f32 = Array$create(1.1, float()), + f64 = Array$create(2.2, float64()), + lgl = Array$create(TRUE, bool()), + str = Array$create("a", arrow::string()) + ) %>% transmute( + f16_is_f16 = is(f16, "float16"), + f16_is_f32 = is(f16, "float32"), + f16_is_f64 = is(f16, "float64"), + f16_is_lgl = is(f16, "boolean"), + f16_is_str = is(f16, "utf8"), + f32_is_f16 = is(f32, "float16"), + f32_is_f32 = is(f32, "float32"), + f32_is_f64 = is(f32, "float64"), + f32_is_lgl = is(f32, "boolean"), + f32_is_str = is(f32, "utf8"), + f64_is_f16 = is(f64, "float16"), + f64_is_f32 = is(f64, "float32"), + f64_is_f64 = is(f64, "float64"), + f64_is_lgl = is(f64, "boolean"), + f64_is_str = is(f64, "utf8"), + lgl_is_f16 = is(lgl, "float16"), + lgl_is_f32 = is(lgl, "float32"), + lgl_is_f64 = is(lgl, "float64"), + lgl_is_lgl = is(lgl, "boolean"), + lgl_is_str = is(lgl, "utf8"), + str_is_f16 = is(str, "float16"), + str_is_f32 = is(str, "float32"), + str_is_f64 = is(str, "float64"), + str_is_lgl = is(str, "boolean"), + str_is_str = is(str, "utf8") + ) %>% + collect() %>% + t() %>% + as.vector(), + c( + TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, + FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, + FALSE, FALSE, TRUE + ) + ) +}) + +test_that("type checks with is() giving R types", { + library(bit64) + compare_dplyr_binding( + .input %>% + transmute( + chr_is_chr = is(chr, "character"), + chr_is_fct = is(chr, "factor"), + chr_is_int = is(chr, "integer"), + chr_is_i64 = is(chr, "integer64"), + chr_is_lst = is(chr, "list"), + chr_is_lgl = is(chr, "logical"), + chr_is_num = is(chr, "numeric"), + dbl_is_chr = is(dbl, "character"), + dbl_is_fct = is(dbl, "factor"), + dbl_is_int = is(dbl, "integer"), + dbl_is_i64 = is(dbl, "integer64"), + dbl_is_lst = is(dbl, "list"), + dbl_is_lgl = is(dbl, "logical"), + dbl_is_num = is(dbl, "numeric"), + fct_is_chr = is(fct, "character"), + fct_is_fct = is(fct, "factor"), + fct_is_int = is(fct, "integer"), + fct_is_i64 = is(fct, "integer64"), + fct_is_lst = is(fct, "list"), + fct_is_lgl = is(fct, "logical"), + fct_is_num = is(fct, "numeric"), + int_is_chr = is(int, "character"), + int_is_fct = is(int, "factor"), + int_is_int = is(int, "integer"), + int_is_i64 = is(int, "integer64"), + int_is_lst = is(int, "list"), + int_is_lgl = is(int, "logical"), + int_is_num = is(int, "numeric"), + lgl_is_chr = is(lgl, "character"), + lgl_is_fct = is(lgl, "factor"), + lgl_is_int = is(lgl, "integer"), + lgl_is_i64 = is(lgl, "integer64"), + lgl_is_lst = is(lgl, "list"), + lgl_is_lgl = is(lgl, "logical"), + lgl_is_num = is(lgl, "numeric") + ) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + transmute( + i64_is_chr = is(i64, "character"), + i64_is_fct = is(i64, "factor"), + # we want Arrow to return TRUE, but bit64 returns FALSE + # i64_is_int = is(i64, "integer"), + i64_is_i64 = is(i64, "integer64"), + i64_is_lst = is(i64, "list"), + i64_is_lgl = is(i64, "logical"), + # we want Arrow to return TRUE, but bit64 returns FALSE + # i64_is_num = is(i64, "numeric"), + lst_is_chr = is(lst, "character"), + lst_is_fct = is(lst, "factor"), + lst_is_int = is(lst, "integer"), + lst_is_i64 = is(lst, "integer64"), + lst_is_lst = is(lst, "list"), + lst_is_lgl = is(lst, "logical"), + lst_is_num = is(lst, "numeric") + ) %>% + collect(), + tibble( + i64 = as.integer64(1:3), + lst = list(c("a", "b"), c("d", "e"), c("f", "g")) + ) + ) +}) + +test_that("type checks with is.*()", { + library(bit64) + compare_dplyr_binding( + .input %>% + transmute( + chr_is_chr = is.character(chr), + chr_is_dbl = is.double(chr), + chr_is_fct = is.factor(chr), + chr_is_int = is.integer(chr), + chr_is_i64 = is.integer64(chr), + chr_is_lst = is.list(chr), + chr_is_lgl = is.logical(chr), + chr_is_num = is.numeric(chr), + dbl_is_chr = is.character(dbl), + dbl_is_dbl = is.double(dbl), + dbl_is_fct = is.factor(dbl), + dbl_is_int = is.integer(dbl), + dbl_is_i64 = is.integer64(dbl), + dbl_is_lst = is.list(dbl), + dbl_is_lgl = is.logical(dbl), + dbl_is_num = is.numeric(dbl), + fct_is_chr = is.character(fct), + fct_is_dbl = is.double(fct), + fct_is_fct = is.factor(fct), + fct_is_int = is.integer(fct), + fct_is_i64 = is.integer64(fct), + fct_is_lst = is.list(fct), + fct_is_lgl = is.logical(fct), + fct_is_num = is.numeric(fct), + int_is_chr = is.character(int), + int_is_dbl = is.double(int), + int_is_fct = is.factor(int), + int_is_int = is.integer(int), + int_is_i64 = is.integer64(int), + int_is_lst = is.list(int), + int_is_lgl = is.logical(int), + int_is_num = is.numeric(int), + lgl_is_chr = is.character(lgl), + lgl_is_dbl = is.double(lgl), + lgl_is_fct = is.factor(lgl), + lgl_is_int = is.integer(lgl), + lgl_is_i64 = is.integer64(lgl), + lgl_is_lst = is.list(lgl), + lgl_is_lgl = is.logical(lgl), + lgl_is_num = is.numeric(lgl) + ) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + transmute( + i64_is_chr = is.character(i64), + # TODO: investigate why this is not matching when testthat runs it + # i64_is_dbl = is.double(i64), + i64_is_fct = is.factor(i64), + # we want Arrow to return TRUE, but bit64 returns FALSE + # i64_is_int = is.integer(i64), + i64_is_i64 = is.integer64(i64), + i64_is_lst = is.list(i64), + i64_is_lgl = is.logical(i64), + i64_is_num = is.numeric(i64), + lst_is_chr = is.character(lst), + lst_is_dbl = is.double(lst), + lst_is_fct = is.factor(lst), + lst_is_int = is.integer(lst), + lst_is_i64 = is.integer64(lst), + lst_is_lst = is.list(lst), + lst_is_lgl = is.logical(lst), + lst_is_num = is.numeric(lst) + ) %>% + collect(), + tibble( + i64 = as.integer64(1:3), + lst = list(c("a", "b"), c("d", "e"), c("f", "g")) + ) + ) +}) + +test_that("type checks with is_*()", { + library(rlang, warn.conflicts = FALSE) + compare_dplyr_binding( + .input %>% + transmute( + chr_is_chr = is_character(chr), + chr_is_dbl = is_double(chr), + chr_is_int = is_integer(chr), + chr_is_lst = is_list(chr), + chr_is_lgl = is_logical(chr), + dbl_is_chr = is_character(dbl), + dbl_is_dbl = is_double(dbl), + dbl_is_int = is_integer(dbl), + dbl_is_lst = is_list(dbl), + dbl_is_lgl = is_logical(dbl), + int_is_chr = is_character(int), + int_is_dbl = is_double(int), + int_is_int = is_integer(int), + int_is_lst = is_list(int), + int_is_lgl = is_logical(int), + lgl_is_chr = is_character(lgl), + lgl_is_dbl = is_double(lgl), + lgl_is_int = is_integer(lgl), + lgl_is_lst = is_list(lgl), + lgl_is_lgl = is_logical(lgl) + ) %>% + collect(), + tbl + ) +}) + +test_that("type checks on expressions", { + compare_dplyr_binding( + .input %>% + transmute( + a = is.character(as.character(int)), + b = is.integer(as.character(int)), + c = is.integer(int + int), + d = is.double(int + dbl), + e = is.logical(dbl > pi) + ) %>% + collect(), + tbl + ) + + # the code in the expectation below depends on RE2 + skip_if_not_available("re2") + + compare_dplyr_binding( + .input %>% + transmute( + a = is.logical(grepl("[def]", chr)) + ) %>% + collect(), + tbl + ) +}) + +test_that("type checks on R scalar literals", { + compare_dplyr_binding( + .input %>% + transmute( + chr_is_chr = is.character("foo"), + int_is_chr = is.character(42L), + int_is_int = is.integer(42L), + chr_is_int = is.integer("foo"), + dbl_is_num = is.numeric(3.14159), + int_is_num = is.numeric(42L), + chr_is_num = is.numeric("foo"), + dbl_is_dbl = is.double(3.14159), + chr_is_dbl = is.double("foo"), + lgl_is_lgl = is.logical(TRUE), + chr_is_lgl = is.logical("foo"), + fct_is_fct = is.factor(factor("foo", levels = c("foo", "bar", "baz"))), + chr_is_fct = is.factor("foo"), + lst_is_lst = is.list(list(c(a = "foo", b = "bar"))), + chr_is_lst = is.list("foo") + ) %>% + collect(), + tbl + ) +}) + +test_that("as.factor()/dictionary_encode()", { + skip("ARROW-12632: ExecuteScalarExpression cannot Execute non-scalar expression") + df1 <- tibble(x = c("C", "D", "B", NA, "D", "B", "S", "A", "B", "Z", "B")) + df2 <- tibble(x = c(5, 5, 5, NA, 2, 3, 6, 8)) + + compare_dplyr_binding( + .input %>% + transmute(x = as.factor(x)) %>% + collect(), + df1 + ) + + expect_warning( + compare_dplyr_binding( + .input %>% + transmute(x = as.factor(x)) %>% + collect(), + df2 + ), + "Coercing dictionary values to R character factor levels" + ) + + # dictionary values with default null encoding behavior ("mask") omits + # nulls from the dictionary values + expect_equal( + object = { + rb1 <- df1 %>% + record_batch() %>% + transmute(x = dictionary_encode(x)) %>% + compute() + dict <- rb1$x$dictionary() + as.vector(dict$Take(dict$SortIndices())) + }, + sort(unique(df1$x), na.last = NA) + ) + + # dictionary values with "encode" null encoding behavior includes nulls in + # the dictionary values + expect_equal( + object = { + rb1 <- df1 %>% + record_batch() %>% + transmute(x = dictionary_encode(x, null_encoding_behavior = "encode")) %>% + compute() + dict <- rb1$x$dictionary() + as.vector(dict$Take(dict$SortIndices())) + }, + sort(unique(df1$x), na.last = TRUE) + ) +}) + +test_that("bad explicit type conversions with as.*()", { + + # Arrow returns lowercase "true", "false" (instead of "TRUE", "FALSE" like R) + expect_error( + compare_dplyr_binding( + .input %>% + transmute(lgl2chr = as.character(lgl)) %>% + collect(), + tibble(lgl = c(TRUE, FALSE, NA)) + ) + ) + + # Arrow fails to parse these strings as numbers (instead of returning NAs with + # a warning like R does) + expect_error( + expect_warning( + compare_dplyr_binding( + .input %>% + transmute(chr2num = as.numeric(chr)) %>% + collect(), + tibble(chr = c("l.O", "S.S", "")) + ) + ) + ) + + # Arrow fails to parse these strings as Booleans (instead of returning NAs + # like R does) + expect_error( + compare_dplyr_binding( + .input %>% + transmute(chr2lgl = as.logical(chr)) %>% + collect(), + tibble(chr = c("TRU", "FAX", "")) + ) + ) +}) diff --git a/src/arrow/r/tests/testthat/test-dplyr-group-by.R b/src/arrow/r/tests/testthat/test-dplyr-group-by.R new file mode 100644 index 000000000..7cfcfb5c9 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dplyr-group-by.R @@ -0,0 +1,158 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +library(dplyr, warn.conflicts = FALSE) +library(stringr) + +tbl <- example_data + +test_that("group_by groupings are recorded", { + compare_dplyr_binding( + .input %>% + group_by(chr) %>% + select(int, chr) %>% + filter(int > 5) %>% + collect(), + tbl + ) +}) + +test_that("group_by supports creating/renaming", { + compare_dplyr_binding( + .input %>% + group_by(chr, numbers = int) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + group_by(chr, numbers = int * 4) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + group_by(int > 4, lgl, foo = int > 5) %>% + collect(), + tbl + ) +}) + +test_that("ungroup", { + compare_dplyr_binding( + .input %>% + group_by(chr) %>% + select(int, chr) %>% + ungroup() %>% + filter(int > 5) %>% + collect(), + tbl + ) + + # to confirm that the above expectation is actually testing what we think it's + # testing, verify that compare_dplyr_binding() distinguishes between grouped and + # ungrouped tibbles + expect_error( + compare_dplyr_binding( + .input %>% + group_by(chr) %>% + select(int, chr) %>% + (function(x) if (inherits(x, "tbl_df")) ungroup(x) else x) %>% + filter(int > 5) %>% + collect(), + tbl + ) + ) +}) + +test_that("group_by then rename", { + compare_dplyr_binding( + .input %>% + group_by(chr) %>% + select(string = chr, int) %>% + collect(), + tbl + ) +}) + +test_that("group_by with .drop", { + test_groups <- c("starting_a_fight", "consoling_a_child", "petting_a_dog") + compare_dplyr_binding( + .input %>% + group_by(!!!syms(test_groups), .drop = TRUE) %>% + collect(), + example_with_logical_factors + ) + compare_dplyr_binding( + .input %>% + group_by(!!!syms(test_groups), .drop = FALSE) %>% + collect(), + example_with_logical_factors + ) + expect_equal( + example_with_logical_factors %>% + group_by(!!!syms(test_groups), .drop = TRUE) %>% + collect() %>% + n_groups(), + 4L + ) + expect_equal( + example_with_logical_factors %>% + group_by(!!!syms(test_groups), .drop = FALSE) %>% + collect() %>% + n_groups(), + 8L + ) + expect_equal( + example_with_logical_factors %>% + group_by(!!!syms(test_groups), .drop = FALSE) %>% + group_by_drop_default(), + FALSE + ) + expect_equal( + example_with_logical_factors %>% + group_by(!!!syms(test_groups), .drop = TRUE) %>% + group_by_drop_default(), + TRUE + ) + compare_dplyr_binding( + .input %>% + group_by(.drop = FALSE) %>% # no group by vars + group_by_drop_default(), + example_with_logical_factors + ) + compare_dplyr_binding( + .input %>% + group_by_drop_default(), + example_with_logical_factors + ) + compare_dplyr_binding( + .input %>% + group_by(!!!syms(test_groups)) %>% + group_by_drop_default(), + example_with_logical_factors + ) + compare_dplyr_binding( + .input %>% + group_by(!!!syms(test_groups), .drop = FALSE) %>% + ungroup() %>% + group_by_drop_default(), + example_with_logical_factors + ) +}) diff --git a/src/arrow/r/tests/testthat/test-dplyr-join.R b/src/arrow/r/tests/testthat/test-dplyr-join.R new file mode 100644 index 000000000..d8239f810 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dplyr-join.R @@ -0,0 +1,175 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +library(dplyr, warn.conflicts = FALSE) + +left <- example_data +left$some_grouping <- rep(c(1, 2), 5) + +left_tab <- Table$create(left) + +to_join <- tibble::tibble( + some_grouping = c(1, 2), + capital_letters = c("A", "B"), + another_column = TRUE +) +to_join_tab <- Table$create(to_join) + + +test_that("left_join", { + expect_message( + compare_dplyr_binding( + .input %>% + left_join(to_join) %>% + collect(), + left + ), + 'Joining, by = "some_grouping"' + ) +}) + +test_that("left_join `by` args", { + compare_dplyr_binding( + .input %>% + left_join(to_join, by = "some_grouping") %>% + collect(), + left + ) + compare_dplyr_binding( + .input %>% + left_join( + to_join %>% + rename(the_grouping = some_grouping), + by = c(some_grouping = "the_grouping") + ) %>% + collect(), + left + ) + + compare_dplyr_binding( + .input %>% + rename(the_grouping = some_grouping) %>% + left_join( + to_join, + by = c(the_grouping = "some_grouping") + ) %>% + collect(), + left + ) +}) + +test_that("join two tables", { + expect_identical( + left_tab %>% + left_join(to_join_tab, by = "some_grouping") %>% + collect(), + left %>% + left_join(to_join, by = "some_grouping") %>% + collect() + ) +}) + +test_that("Error handling", { + expect_error( + left_tab %>% + left_join(to_join, by = "not_a_col") %>% + collect(), + "all(names(by) %in% names(x)) is not TRUE", + fixed = TRUE + ) +}) + +# TODO: test duplicate col names +# TODO: casting: int and float columns? + +test_that("right_join", { + compare_dplyr_binding( + .input %>% + right_join(to_join, by = "some_grouping") %>% + collect(), + left + ) +}) + +test_that("inner_join", { + compare_dplyr_binding( + .input %>% + inner_join(to_join, by = "some_grouping") %>% + collect(), + left + ) +}) + +test_that("full_join", { + compare_dplyr_binding( + .input %>% + full_join(to_join, by = "some_grouping") %>% + collect(), + left + ) +}) + +test_that("semi_join", { + compare_dplyr_binding( + .input %>% + semi_join(to_join, by = "some_grouping") %>% + collect(), + left + ) +}) + +test_that("anti_join", { + compare_dplyr_binding( + .input %>% + # Factor levels when there are no rows in the data don't match + # TODO: use better anti_join test data + select(-fct) %>% + anti_join(to_join, by = "some_grouping") %>% + collect(), + left + ) +}) + +test_that("mutate then join", { + left <- Table$create( + one = c("a", "b"), + two = 1:2 + ) + right <- Table$create( + three = TRUE, + dos = 2L + ) + + expect_equal( + left %>% + rename(dos = two) %>% + mutate(one = toupper(one)) %>% + left_join( + right %>% + mutate(three = !three) + ) %>% + arrange(dos) %>% + collect(), + tibble( + one = c("A", "B"), + dos = 1:2, + three = c(NA, FALSE) + ) + ) +}) diff --git a/src/arrow/r/tests/testthat/test-dplyr-mutate.R b/src/arrow/r/tests/testthat/test-dplyr-mutate.R new file mode 100644 index 000000000..886ec9e42 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dplyr-mutate.R @@ -0,0 +1,522 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +library(dplyr, warn.conflicts = FALSE) +library(stringr) + +tbl <- example_data +# Add some better string data +tbl$verses <- verses[[1]] +# c(" a ", " b ", " c ", ...) increasing padding +# nchar = 3 5 7 9 11 13 15 17 19 21 +tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both") + +test_that("mutate() is lazy", { + expect_s3_class( + tbl %>% record_batch() %>% mutate(int = int + 6L), + "arrow_dplyr_query" + ) +}) + +test_that("basic mutate", { + compare_dplyr_binding( + .input %>% + select(int, chr) %>% + filter(int > 5) %>% + mutate(int = int + 6L) %>% + collect(), + tbl + ) +}) + +test_that("mutate() with NULL inputs", { + compare_dplyr_binding( + .input %>% + mutate(int = NULL) %>% + collect(), + tbl + ) +}) + +test_that("empty mutate()", { + compare_dplyr_binding( + .input %>% + mutate() %>% + collect(), + tbl + ) +}) + +test_that("transmute", { + compare_dplyr_binding( + .input %>% + select(int, chr) %>% + filter(int > 5) %>% + transmute(int = int + 6L) %>% + collect(), + tbl + ) +}) + +test_that("transmute() with NULL inputs", { + compare_dplyr_binding( + .input %>% + transmute(int = NULL) %>% + collect(), + tbl + ) +}) + +test_that("empty transmute()", { + compare_dplyr_binding( + .input %>% + transmute() %>% + collect(), + tbl + ) +}) + +test_that("transmute() with unsupported arguments", { + expect_error( + tbl %>% + Table$create() %>% + transmute(int = int + 42L, .keep = "all"), + "`transmute()` does not support the `.keep` argument", + fixed = TRUE + ) + expect_error( + tbl %>% + Table$create() %>% + transmute(int = int + 42L, .before = lgl), + "`transmute()` does not support the `.before` argument", + fixed = TRUE + ) + expect_error( + tbl %>% + Table$create() %>% + transmute(int = int + 42L, .after = chr), + "`transmute()` does not support the `.after` argument", + fixed = TRUE + ) +}) + +test_that("transmute() defuses dots arguments (ARROW-13262)", { + expect_warning( + tbl %>% + Table$create() %>% + transmute(stringr::str_c(chr, chr)) %>% + collect(), + "Expression stringr::str_c(chr, chr) not supported in Arrow; pulling data into R", + fixed = TRUE + ) +}) + +test_that("mutate and refer to previous mutants", { + compare_dplyr_binding( + .input %>% + select(int, verses) %>% + mutate( + line_lengths = nchar(verses), + longer = line_lengths * 10 + ) %>% + filter(line_lengths > 15) %>% + collect(), + tbl + ) +}) + +test_that("nchar() arguments", { + compare_dplyr_binding( + .input %>% + select(int, verses) %>% + mutate( + line_lengths = nchar(verses, type = "bytes"), + longer = line_lengths * 10 + ) %>% + filter(line_lengths > 15) %>% + collect(), + tbl + ) + # This tests the whole abandon_ship() machinery + compare_dplyr_binding( + .input %>% + select(int, verses) %>% + mutate( + line_lengths = nchar(verses, type = "bytes", allowNA = TRUE), + longer = line_lengths * 10 + ) %>% + filter(line_lengths > 15) %>% + collect(), + tbl, + warning = paste0( + "In nchar\\(verses, type = \"bytes\", allowNA = TRUE\\), ", + "allowNA = TRUE not supported by Arrow; pulling data into R" + ) + ) +}) + +test_that("mutate with .data pronoun", { + compare_dplyr_binding( + .input %>% + select(int, verses) %>% + mutate( + line_lengths = str_length(verses), + longer = .data$line_lengths * 10 + ) %>% + filter(line_lengths > 15) %>% + collect(), + tbl + ) +}) + +test_that("mutate with unnamed expressions", { + compare_dplyr_binding( + .input %>% + select(int, padded_strings) %>% + mutate( + int, # bare column name + nchar(padded_strings) # expression + ) %>% + filter(int > 5) %>% + collect(), + tbl + ) +}) + +test_that("mutate with reassigning same name", { + compare_dplyr_binding( + .input %>% + transmute( + new = lgl, + new = chr + ) %>% + collect(), + tbl + ) +}) + +test_that("mutate with single value for recycling", { + compare_dplyr_binding( + .input %>% + select(int, padded_strings) %>% + mutate( + dr_bronner = 1 # ALL ONE! + ) %>% + collect(), + tbl + ) +}) + +test_that("dplyr::mutate's examples", { + # Newly created variables are available immediately + compare_dplyr_binding( + .input %>% + select(name, mass) %>% + mutate( + mass2 = mass * 2, + mass2_squared = mass2 * mass2 + ) %>% + collect(), + starwars # this is a test tibble that ships with dplyr + ) + + # As well as adding new variables, you can use mutate() to + # remove variables and modify existing variables. + compare_dplyr_binding( + .input %>% + select(name, height, mass, homeworld) %>% + mutate( + mass = NULL, + height = height * 0.0328084 # convert to feet + ) %>% + collect(), + starwars + ) + + # Examples we don't support should succeed + # but warn that they're pulling data into R to do so + + # across and autosplicing: ARROW-11699 + compare_dplyr_binding( + .input %>% + select(name, homeworld, species) %>% + mutate(across(!name, as.factor)) %>% + collect(), + starwars, + warning = "Expression across.*not supported in Arrow" + ) + + # group_by then mutate + compare_dplyr_binding( + .input %>% + select(name, mass, homeworld) %>% + group_by(homeworld) %>% + mutate(rank = min_rank(desc(mass))) %>% + collect(), + starwars, + warning = TRUE + ) + + # `.before` and `.after` experimental args: ARROW-11701 + df <- tibble(x = 1, y = 2) + compare_dplyr_binding( + .input %>% mutate(z = x + y) %>% collect(), + df + ) + #> # A tibble: 1 x 3 + #> x y z + #> <dbl> <dbl> <dbl> + #> 1 1 2 3 + + compare_dplyr_binding( + .input %>% mutate(z = x + y, .before = 1) %>% collect(), + df + ) + #> # A tibble: 1 x 3 + #> z x y + #> <dbl> <dbl> <dbl> + #> 1 3 1 2 + compare_dplyr_binding( + .input %>% mutate(z = x + y, .after = x) %>% collect(), + df + ) + #> # A tibble: 1 x 3 + #> x z y + #> <dbl> <dbl> <dbl> + #> 1 1 3 2 + + # By default, mutate() keeps all columns from the input data. + # Experimental: You can override with `.keep` + df <- tibble(x = 1, y = 2, a = "a", b = "b") + compare_dplyr_binding( + .input %>% mutate(z = x + y, .keep = "all") %>% collect(), # the default + df + ) + #> # A tibble: 1 x 5 + #> x y a b z + #> <dbl> <dbl> <chr> <chr> <dbl> + #> 1 1 2 a b 3 + compare_dplyr_binding( + .input %>% mutate(z = x + y, .keep = "used") %>% collect(), + df + ) + #> # A tibble: 1 x 3 + #> x y z + #> <dbl> <dbl> <dbl> + #> 1 1 2 3 + compare_dplyr_binding( + .input %>% mutate(z = x + y, .keep = "unused") %>% collect(), + df + ) + #> # A tibble: 1 x 3 + #> a b z + #> <chr> <chr> <dbl> + #> 1 a b 3 + compare_dplyr_binding( + .input %>% mutate(z = x + y, .keep = "none") %>% collect(), # same as transmute() + df + ) + #> # A tibble: 1 x 1 + #> z + #> <dbl> + #> 1 3 + + # Grouping ---------------------------------------- + # The mutate operation may yield different results on grouped + # tibbles because the expressions are computed within groups. + # The following normalises `mass` by the global average: + # TODO: ARROW-13926 + compare_dplyr_binding( + .input %>% + select(name, mass, species) %>% + mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) %>% + collect(), + starwars, + warning = "window function" + ) +}) + +test_that("Can mutate after group_by as long as there are no aggregations", { + compare_dplyr_binding( + .input %>% + select(int, chr) %>% + group_by(chr) %>% + mutate(int = int + 6L) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + select(mean = int, chr) %>% + # rename `int` to `mean` and use `mean` in `mutate()` to test that + # `all_funs()` does not incorrectly identify it as an aggregate function + group_by(chr) %>% + mutate(mean = mean + 6L) %>% + collect(), + tbl + ) + expect_warning( + tbl %>% + Table$create() %>% + select(int, chr) %>% + group_by(chr) %>% + mutate(avg_int = mean(int)) %>% + collect(), + "window functions not currently supported in Arrow; pulling data into R", + fixed = TRUE + ) + expect_warning( + tbl %>% + Table$create() %>% + select(mean = int, chr) %>% + # rename `int` to `mean` and use `mean(mean)` in `mutate()` to test that + # `all_funs()` detects `mean()` despite the collision with a column name + group_by(chr) %>% + mutate(avg_int = mean(mean)) %>% + collect(), + "window functions not currently supported in Arrow; pulling data into R", + fixed = TRUE + ) +}) + +test_that("handle bad expressions", { + # TODO: search for functions other than mean() (see above test) + # that need to be forced to fail because they error ambiguously + + with_language("fr", { + # expect_warning(., NA) because the usual behavior when it hits a filter + # that it can't evaluate is to raise a warning, collect() to R, and retry + # the filter. But we want this to error the first time because it's + # a user error, not solvable by retrying in R + expect_warning( + expect_error( + Table$create(tbl) %>% mutate(newvar = NOTAVAR + 2), + "objet 'NOTAVAR' introuvable" + ), + NA + ) + }) +}) + +test_that("Can't just add a vector column with mutate()", { + expect_warning( + expect_equal( + Table$create(tbl) %>% + select(int) %>% + mutate(again = 1:10), + tibble::tibble(int = tbl$int, again = 1:10) + ), + "In again = 1:10, only values of size one are recycled; pulling data into R" + ) +}) + +test_that("print a mutated table", { + expect_output( + Table$create(tbl) %>% + select(int) %>% + mutate(twice = int * 2) %>% + print(), + "InMemoryDataset (query) +int: int32 +twice: double (multiply_checked(int, 2)) + +See $.data for the source Arrow object", + fixed = TRUE + ) +}) + +test_that("mutate and write_dataset", { + skip_if_not_available("dataset") + # See related test in test-dataset.R + + first_date <- lubridate::ymd_hms("2015-04-29 03:12:39") + df1 <- tibble( + int = 1:10, + dbl = as.numeric(1:10), + lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2), + chr = letters[1:10], + fct = factor(LETTERS[1:10]), + ts = first_date + lubridate::days(1:10) + ) + + second_date <- lubridate::ymd_hms("2017-03-09 07:01:02") + df2 <- tibble( + int = 101:110, + dbl = c(as.numeric(51:59), NaN), + lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2), + chr = letters[10:1], + fct = factor(LETTERS[10:1]), + ts = second_date + lubridate::days(10:1) + ) + + dst_dir <- tempfile() + stacked <- record_batch(rbind(df1, df2)) + stacked %>% + mutate(twice = int * 2) %>% + group_by(int) %>% + write_dataset(dst_dir, format = "feather") + expect_true(dir.exists(dst_dir)) + expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "="))) + + new_ds <- open_dataset(dst_dir, format = "feather") + + expect_equal( + new_ds %>% + select(string = chr, integer = int, twice) %>% + filter(integer > 6 & integer < 11) %>% + collect() %>% + summarize(mean = mean(integer)), + df1 %>% + select(string = chr, integer = int) %>% + mutate(twice = integer * 2) %>% + filter(integer > 6) %>% + summarize(mean = mean(integer)) + ) +}) + +test_that("mutate and pmin/pmax", { + df <- tibble( + city = c("Chillan", "Valdivia", "Osorno"), + val1 = c(200, 300, NA), + val2 = c(100, NA, NA), + val3 = c(0, NA, NA) + ) + + compare_dplyr_binding( + .input %>% + mutate( + max_val_1 = pmax(val1, val2, val3), + max_val_2 = pmax(val1, val2, val3, na.rm = TRUE), + min_val_1 = pmin(val1, val2, val3), + min_val_2 = pmin(val1, val2, val3, na.rm = TRUE) + ) %>% + collect(), + df + ) + + compare_dplyr_binding( + .input %>% + mutate( + max_val_1 = pmax(val1 - 100, 200, val1 * 100, na.rm = TRUE), + min_val_1 = pmin(val1 - 100, 100, val1 * 100, na.rm = TRUE), + ) %>% + collect(), + df + ) +}) diff --git a/src/arrow/r/tests/testthat/test-dplyr-query.R b/src/arrow/r/tests/testthat/test-dplyr-query.R new file mode 100644 index 000000000..21a55f4b4 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dplyr-query.R @@ -0,0 +1,296 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +library(dplyr, warn.conflicts = FALSE) +library(stringr) + +tbl <- example_data +# Add some better string data +tbl$verses <- verses[[1]] +# c(" a ", " b ", " c ", ...) increasing padding +# nchar = 3 5 7 9 11 13 15 17 19 21 +tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both") +tbl$another_chr <- tail(letters, 10) + +test_that("basic select/filter/collect", { + batch <- record_batch(tbl) + + b2 <- batch %>% + select(int, chr) %>% + filter(int > 5) + + expect_s3_class(b2, "arrow_dplyr_query") + t2 <- collect(b2) + expect_equal(t2, tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")]) + # Test that the original object is not affected + expect_identical(collect(batch), tbl) +}) + +test_that("dim() on query", { + compare_dplyr_binding( + .input %>% + filter(int > 5) %>% + select(int, chr) %>% + dim(), + tbl + ) +}) + +test_that("Print method", { + expect_output( + record_batch(tbl) %>% + filter(dbl > 2, chr == "d" | chr == "f") %>% + select(chr, int, lgl) %>% + filter(int < 5) %>% + select(int, chr) %>% + print(), + 'InMemoryDataset (query) +int: int32 +chr: string + +* Filter: (((dbl > 2) and ((chr == "d") or (chr == "f"))) and (int < 5)) +See $.data for the source Arrow object', + fixed = TRUE + ) +}) + +test_that("pull", { + compare_dplyr_binding( + .input %>% pull(), + tbl + ) + compare_dplyr_binding( + .input %>% pull(1), + tbl + ) + compare_dplyr_binding( + .input %>% pull(chr), + tbl + ) + compare_dplyr_binding( + .input %>% + filter(int > 4) %>% + rename(strng = chr) %>% + pull(strng), + tbl + ) +}) + +test_that("collect(as_data_frame=FALSE)", { + batch <- record_batch(tbl) + + b1 <- batch %>% collect(as_data_frame = FALSE) + + expect_r6_class(b1, "RecordBatch") + + b2 <- batch %>% + select(int, chr) %>% + filter(int > 5) %>% + collect(as_data_frame = FALSE) + + # collect(as_data_frame = FALSE) always returns Table now + expect_r6_class(b2, "Table") + expected <- tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")] + expect_equal(as.data.frame(b2), expected) + + b3 <- batch %>% + select(int, strng = chr) %>% + filter(int > 5) %>% + collect(as_data_frame = FALSE) + expect_r6_class(b3, "Table") + expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng"))) + + b4 <- batch %>% + select(int, strng = chr) %>% + filter(int > 5) %>% + group_by(int) %>% + collect(as_data_frame = FALSE) + expect_s3_class(b4, "arrow_dplyr_query") + expect_equal( + as.data.frame(b4), + expected %>% + rename(strng = chr) %>% + group_by(int) + ) +}) + +test_that("compute()", { + batch <- record_batch(tbl) + + b1 <- batch %>% compute() + + expect_r6_class(b1, "RecordBatch") + + b2 <- batch %>% + select(int, chr) %>% + filter(int > 5) %>% + compute() + + expect_r6_class(b2, "Table") + expected <- tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")] + expect_equal(as.data.frame(b2), expected) + + b3 <- batch %>% + select(int, strng = chr) %>% + filter(int > 5) %>% + compute() + expect_r6_class(b3, "Table") + expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng"))) + + b4 <- batch %>% + select(int, strng = chr) %>% + filter(int > 5) %>% + group_by(int) %>% + compute() + expect_s3_class(b4, "arrow_dplyr_query") + expect_equal( + as.data.frame(b4), + expected %>% + rename(strng = chr) %>% + group_by(int) + ) +}) + +test_that("head", { + batch <- record_batch(tbl) + + b2 <- batch %>% + select(int, chr) %>% + filter(int > 5) %>% + head(2) + expect_s3_class(b2, "arrow_dplyr_query") + expected <- tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")][1:2, ] + expect_equal(collect(b2), expected) + + b3 <- batch %>% + select(int, strng = chr) %>% + filter(int > 5) %>% + head(2) + expect_s3_class(b3, "arrow_dplyr_query") + expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng"))) + + b4 <- batch %>% + select(int, strng = chr) %>% + filter(int > 5) %>% + group_by(int) %>% + head(2) + expect_s3_class(b4, "arrow_dplyr_query") + expect_equal( + as.data.frame(b4), + expected %>% + rename(strng = chr) %>% + group_by(int) + ) + + expect_equal( + batch %>% + select(int, strng = chr) %>% + filter(int > 5) %>% + head(2) %>% + mutate(twice = int * 2) %>% + collect(), + expected %>% + rename(strng = chr) %>% + mutate(twice = int * 2) + ) + + # This would fail if we evaluated head() after filter() + expect_equal( + batch %>% + select(int, strng = chr) %>% + head(2) %>% + filter(int > 5) %>% + collect(), + expected %>% + rename(strng = chr) %>% + filter(FALSE) + ) +}) + +test_that("arrange then head returns the right data (ARROW-14162)", { + + compare_dplyr_binding( + .input %>% + # mpg has ties so we need to sort by two things to get deterministic order + arrange(mpg, disp) %>% + head(4) %>% + collect(), + mtcars, + ignore_attr = "row.names" + ) +}) + +test_that("arrange then tail returns the right data", { + compare_dplyr_binding( + .input %>% + # mpg has ties so we need to sort by two things to get deterministic order + arrange(mpg, disp) %>% + tail(4) %>% + collect(), + mtcars, + ignore_attr = "row.names" + ) +}) + +test_that("tail", { + batch <- record_batch(tbl) + + b2 <- batch %>% + select(int, chr) %>% + filter(int > 5) %>% + arrange(int) %>% + tail(2) + + expect_s3_class(b2, "arrow_dplyr_query") + expected <- tail(tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")], 2) + expect_equal(as.data.frame(b2), expected) + + b3 <- batch %>% + select(int, strng = chr) %>% + filter(int > 5) %>% + arrange(int) %>% + tail(2) + expect_s3_class(b3, "arrow_dplyr_query") + expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng"))) + + b4 <- batch %>% + select(int, strng = chr) %>% + filter(int > 5) %>% + group_by(int) %>% + arrange(int) %>% + tail(2) + expect_s3_class(b4, "arrow_dplyr_query") + expect_equal( + as.data.frame(b4), + expected %>% + rename(strng = chr) %>% + group_by(int) + ) +}) + +test_that("No duplicate field names are allowed in an arrow_dplyr_query", { + expect_error( + Table$create(tbl, tbl) %>% + filter(int > 0), + regexp = paste0( + 'The following field names were found more than once in the data: "int", "dbl", ', + '"dbl2", "lgl", "false", "chr", "fct", "verses", "padded_strings"' + ) + ) +}) diff --git a/src/arrow/r/tests/testthat/test-dplyr-select.R b/src/arrow/r/tests/testthat/test-dplyr-select.R new file mode 100644 index 000000000..2ca2b100e --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dplyr-select.R @@ -0,0 +1,146 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +library(dplyr, warn.conflicts = FALSE) +library(stringr) + +tbl <- example_data + +test_that("Empty select returns no columns", { + compare_dplyr_binding( + .input %>% select() %>% collect(), + tbl, + skip_table = "Table with 0 cols doesn't know how many rows it should have" + ) +}) +test_that("Empty select still includes the group_by columns", { + expect_message( + compare_dplyr_binding( + .input %>% group_by(chr) %>% select() %>% collect(), + tbl + ), + "Adding missing grouping variables" + ) +}) + +test_that("select/rename", { + compare_dplyr_binding( + .input %>% + select(string = chr, int) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + rename(string = chr) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + rename(strng = chr) %>% + rename(other = strng) %>% + collect(), + tbl + ) +}) + +test_that("select/rename with selection helpers", { + + # TODO: add some passing tests here + + expect_error( + compare_dplyr_binding( + .input %>% + select(where(is.numeric)) %>% + collect(), + tbl + ), + "Unsupported selection helper" + ) +}) + +test_that("filtering with rename", { + compare_dplyr_binding( + .input %>% + filter(chr == "b") %>% + select(string = chr, int) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + select(string = chr, int) %>% + filter(string == "b") %>% + collect(), + tbl + ) +}) + +test_that("relocate", { + df <- tibble(a = 1, b = 1, c = 1, d = "a", e = "a", f = "a") + compare_dplyr_binding( + .input %>% relocate(f) %>% collect(), + df, + ) + compare_dplyr_binding( + .input %>% relocate(a, .after = c) %>% collect(), + df, + ) + compare_dplyr_binding( + .input %>% relocate(f, .before = b) %>% collect(), + df, + ) + compare_dplyr_binding( + .input %>% relocate(a, .after = last_col()) %>% collect(), + df, + ) + compare_dplyr_binding( + .input %>% relocate(ff = f) %>% collect(), + df, + ) +}) + +test_that("relocate with selection helpers", { + df <- tibble(a = 1, b = 1, c = 1, d = "a", e = "a", f = "a") + compare_dplyr_binding( + .input %>% relocate(any_of(c("a", "e", "i", "o", "u"))) %>% collect(), + df + ) + compare_dplyr_binding( + .input %>% relocate(where(is.character)) %>% collect(), + df + ) + compare_dplyr_binding( + .input %>% relocate(a, b, c, .after = where(is.character)) %>% collect(), + df + ) + compare_dplyr_binding( + .input %>% relocate(d, e, f, .before = where(is.numeric)) %>% collect(), + df + ) + # works after other dplyr verbs + compare_dplyr_binding( + .input %>% + mutate(c = as.character(c)) %>% + relocate(d, e, f, .after = where(is.numeric)) %>% + collect(), + df + ) +}) diff --git a/src/arrow/r/tests/testthat/test-dplyr-summarize.R b/src/arrow/r/tests/testthat/test-dplyr-summarize.R new file mode 100644 index 000000000..3988412b8 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dplyr-summarize.R @@ -0,0 +1,881 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +withr::local_options(list(arrow.summarise.sort = TRUE)) + +library(dplyr, warn.conflicts = FALSE) +library(stringr) + +tbl <- example_data +# Add some better string data +tbl$verses <- verses[[1]] +# c(" a ", " b ", " c ", ...) increasing padding +# nchar = 3 5 7 9 11 13 15 17 19 21 +tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both") +tbl$some_grouping <- rep(c(1, 2), 5) + +test_that("summarize() doesn't evaluate eagerly", { + expect_s3_class( + Table$create(tbl) %>% + summarize(total = sum(int)), + "arrow_dplyr_query" + ) + expect_r6_class( + Table$create(tbl) %>% + summarize(total = sum(int)) %>% + compute(), + "ArrowTabular" + ) +}) + +test_that("Can aggregate in Arrow", { + compare_dplyr_binding( + .input %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + summarize(total = sum(int)) %>% + collect(), + tbl + ) +}) + +test_that("Group by sum on dataset", { + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize(total = sum(int * 4, na.rm = TRUE)) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize(total = sum(int)) %>% + collect(), + tbl, + ) +}) + +test_that("Group by mean on dataset", { + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize(mean = mean(int, na.rm = TRUE)) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize(mean = mean(int, na.rm = FALSE)) %>% + collect(), + tbl + ) +}) + +test_that("Group by sd on dataset", { + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize(sd = sd(int, na.rm = TRUE)) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize(sd = sd(int, na.rm = FALSE)) %>% + collect(), + tbl + ) +}) + +test_that("Group by var on dataset", { + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize(var = var(int, na.rm = TRUE)) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize(var = var(int, na.rm = FALSE)) %>% + collect(), + tbl + ) +}) + +test_that("n()", { + compare_dplyr_binding( + .input %>% + summarize(counts = n()) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize(counts = n()) %>% + arrange(some_grouping) %>% + collect(), + tbl + ) +}) + +test_that("Group by any/all", { + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize(any(lgl, na.rm = TRUE)) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize(all(lgl, na.rm = TRUE)) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize(any(lgl, na.rm = FALSE)) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize(all(lgl, na.rm = FALSE)) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + mutate(has_words = nchar(verses) < 0) %>% + group_by(some_grouping) %>% + summarize(any(has_words, na.rm = TRUE)) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + mutate(has_words = nchar(verses) < 0) %>% + group_by(some_grouping) %>% + summarize(all(has_words, na.rm = TRUE)) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize(has_words = all(nchar(verses) < 0, na.rm = TRUE)) %>% + collect(), + tbl + ) +}) + +test_that("n_distinct() on dataset", { + # With groupby + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize(distinct = n_distinct(lgl, na.rm = FALSE)) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize(distinct = n_distinct(lgl, na.rm = TRUE)) %>% + collect(), + tbl + ) + # Without groupby + compare_dplyr_binding( + .input %>% + summarize(distinct = n_distinct(lgl, na.rm = FALSE)) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + summarize(distinct = n_distinct(lgl, na.rm = TRUE)) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + summarize(distinct = n_distinct(int, lgl)) %>% + collect(), + tbl, + warning = "Multiple arguments" + ) + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize(distinct = n_distinct(int, lgl)) %>% + collect(), + tbl, + warning = "Multiple arguments" + ) +}) + +test_that("Functions that take ... but we only accept a single arg", { + compare_dplyr_binding( + .input %>% + summarize(distinct = n_distinct()) %>% + collect(), + tbl, + warning = "0 arguments" + ) + compare_dplyr_binding( + .input %>% + summarize(distinct = n_distinct(int, lgl)) %>% + collect(), + tbl, + warning = "Multiple arguments" + ) + # Now that we've demonstrated that the whole machinery works, let's test + # the agg_funcs directly + expect_error(agg_funcs$n_distinct(), "n_distinct() with 0 arguments", fixed = TRUE) + expect_error(agg_funcs$sum(), "sum() with 0 arguments", fixed = TRUE) + expect_error(agg_funcs$any(), "any() with 0 arguments", fixed = TRUE) + expect_error(agg_funcs$all(), "all() with 0 arguments", fixed = TRUE) + expect_error(agg_funcs$min(), "min() with 0 arguments", fixed = TRUE) + expect_error(agg_funcs$max(), "max() with 0 arguments", fixed = TRUE) + expect_error(agg_funcs$n_distinct(1, 2), "Multiple arguments to n_distinct()") + expect_error(agg_funcs$sum(1, 2), "Multiple arguments to sum") + expect_error(agg_funcs$any(1, 2), "Multiple arguments to any()") + expect_error(agg_funcs$all(1, 2), "Multiple arguments to all()") + expect_error(agg_funcs$min(1, 2), "Multiple arguments to min()") + expect_error(agg_funcs$max(1, 2), "Multiple arguments to max()") +}) + +test_that("median()", { + # When medians are integer-valued, stats::median() sometimes returns output of + # type integer, whereas whereas the Arrow approx_median kernels always return + # output of type float64. The calls to median(int, ...) in the tests below + # are enclosed in as.double() to work around this known difference. + + # Use old testthat behavior here so we don't have to assert the same warning + # over and over + local_edition(2) + + # with groups + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize( + med_dbl = median(dbl), + med_int = as.double(median(int)), + med_dbl_narmf = median(dbl, FALSE), + med_int_narmf = as.double(median(int, na.rm = FALSE)), + med_dbl_narmt = median(dbl, na.rm = TRUE), + med_int_narmt = as.double(median(int, TRUE)) + ) %>% + arrange(some_grouping) %>% + collect(), + tbl, + warning = "median\\(\\) currently returns an approximate median in Arrow" + ) + # without groups, with na.rm = TRUE + compare_dplyr_binding( + .input %>% + summarize( + med_dbl_narmt = median(dbl, na.rm = TRUE), + med_int_narmt = as.double(median(int, TRUE)) + ) %>% + collect(), + tbl, + warning = "median\\(\\) currently returns an approximate median in Arrow" + ) + # without groups, with na.rm = FALSE (the default) + compare_dplyr_binding( + .input %>% + summarize( + med_dbl = median(dbl), + med_int = as.double(median(int)), + med_dbl_narmf = median(dbl, FALSE), + med_int_narmf = as.double(median(int, na.rm = FALSE)) + ) %>% + collect(), + tbl, + warning = "median\\(\\) currently returns an approximate median in Arrow" + ) + local_edition(3) +}) + +test_that("quantile()", { + # The default method for stats::quantile() throws an error when na.rm = FALSE + # and the input contains NA or NaN, whereas the Arrow tdigest kernels return + # null in this situation. To work around this known difference, the tests + # below always use na.rm = TRUE when the data contains NA or NaN. + + # The default method for stats::quantile() has an argument `names` that + # controls whether the result has a names attribute. It defaults to + # names = TRUE. With Arrow, it is not possible to give the result a names + # attribute, so the quantile() binding in Arrow does not accept a `names` + # argument. Differences in this names attribute cause compare_dplyr_binding() to + # report that the objects are not equal, so we do not use compare_dplyr_binding() + # in the tests below. + + # The tests below all use probs = 0.5 because other values cause differences + # between the exact quantiles returned by R and the approximate quantiles + # returned by Arrow. + + # When quantiles are integer-valued, stats::quantile() sometimes returns + # output of type integer, whereas whereas the Arrow tdigest kernels always + # return output of type float64. The calls to quantile(int, ...) in the tests + # below are enclosed in as.double() to work around this known difference. + + local_edition(2) + # with groups + expect_warning( + expect_equal( + tbl %>% + group_by(some_grouping) %>% + summarize( + q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE, names = FALSE), + q_int = as.double( + quantile(int, probs = 0.5, na.rm = TRUE, names = FALSE) + ) + ) %>% + arrange(some_grouping), + Table$create(tbl) %>% + group_by(some_grouping) %>% + summarize( + q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE), + q_int = as.double(quantile(int, probs = 0.5, na.rm = TRUE)) + ) %>% + arrange(some_grouping) %>% + collect() + ), + "quantile() currently returns an approximate quantile in Arrow", + fixed = TRUE + ) + + # without groups + expect_warning( + expect_equal( + tbl %>% + summarize( + q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE, names = FALSE), + q_int = as.double( + quantile(int, probs = 0.5, na.rm = TRUE, names = FALSE) + ) + ), + Table$create(tbl) %>% + summarize( + q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE), + q_int = as.double(quantile(int, probs = 0.5, na.rm = TRUE)) + ) %>% + collect() + ), + "quantile() currently returns an approximate quantile in Arrow", + fixed = TRUE + ) + + # with missing values and na.rm = FALSE + expect_warning( + expect_equal( + tibble( + q_dbl = NA_real_, + q_int = NA_real_ + ), + Table$create(tbl) %>% + summarize( + q_dbl = quantile(dbl, probs = 0.5, na.rm = FALSE), + q_int = as.double(quantile(int, probs = 0.5, na.rm = FALSE)) + ) %>% + collect() + ), + "quantile() currently returns an approximate quantile in Arrow", + fixed = TRUE + ) + local_edition(3) + + # with a vector of 2+ probs + expect_warning( + Table$create(tbl) %>% + summarize(q = quantile(dbl, probs = c(0.2, 0.8), na.rm = TRUE)), + "quantile() with length(probs) != 1 not supported by Arrow", + fixed = TRUE + ) +}) + +test_that("summarize() with min() and max()", { + compare_dplyr_binding( + .input %>% + select(int, chr) %>% + filter(int > 5) %>% # this filters out the NAs in `int` + summarize(min_int = min(int), max_int = max(int)) %>% + collect(), + tbl, + ) + compare_dplyr_binding( + .input %>% + select(int, chr) %>% + filter(int > 5) %>% # this filters out the NAs in `int` + summarize( + min_int = min(int + 4) / 2, + max_int = 3 / max(42 - int) + ) %>% + collect(), + tbl, + ) + compare_dplyr_binding( + .input %>% + select(int, chr) %>% + summarize(min_int = min(int), max_int = max(int)) %>% + collect(), + tbl, + ) + compare_dplyr_binding( + .input %>% + select(int) %>% + summarize( + min_int = min(int, na.rm = TRUE), + max_int = max(int, na.rm = TRUE) + ) %>% + collect(), + tbl, + ) + compare_dplyr_binding( + .input %>% + select(dbl, int) %>% + summarize( + min_int = -min(log(ceiling(dbl)), na.rm = TRUE), + max_int = log(max(as.double(int), na.rm = TRUE)) + ) %>% + collect(), + tbl, + ) + + # multiple dots arguments to min(), max() not supported + compare_dplyr_binding( + .input %>% + summarize(min_mult = min(dbl, int)) %>% + collect(), + tbl, + warning = "Multiple arguments to min\\(\\) not supported by Arrow" + ) + compare_dplyr_binding( + .input %>% + select(int, dbl, dbl2) %>% + summarize(max_mult = max(int, dbl, dbl2)) %>% + collect(), + tbl, + warning = "Multiple arguments to max\\(\\) not supported by Arrow" + ) + + # min(logical) or max(logical) yields integer in R + # min(Boolean) or max(Boolean) yields Boolean in Arrow + compare_dplyr_binding( + .input %>% + select(lgl) %>% + summarize( + max_lgl = as.logical(max(lgl, na.rm = TRUE)), + min_lgl = as.logical(min(lgl, na.rm = TRUE)) + ) %>% + collect(), + tbl, + ) +}) + +test_that("min() and max() on character strings", { + compare_dplyr_binding( + .input %>% + summarize( + min_chr = min(chr, na.rm = TRUE), + max_chr = max(chr, na.rm = TRUE) + ) %>% + collect(), + tbl, + ) + skip("Strings not supported by hash_min_max (ARROW-13988)") + compare_dplyr_binding( + .input %>% + group_by(fct) %>% + summarize( + min_chr = min(chr, na.rm = TRUE), + max_chr = max(chr, na.rm = TRUE) + ) %>% + collect(), + tbl, + ) +}) + +test_that("summarise() with !!sym()", { + test_chr_col <- "int" + test_dbl_col <- "dbl" + test_lgl_col <- "lgl" + compare_dplyr_binding( + .input %>% + group_by(false) %>% + summarise( + sum = sum(!!sym(test_dbl_col)), + any = any(!!sym(test_lgl_col)), + all = all(!!sym(test_lgl_col)), + mean = mean(!!sym(test_dbl_col)), + sd = sd(!!sym(test_dbl_col)), + var = var(!!sym(test_dbl_col)), + n_distinct = n_distinct(!!sym(test_chr_col)), + min = min(!!sym(test_dbl_col)), + max = max(!!sym(test_dbl_col)) + ) %>% + collect(), + tbl + ) +}) + +test_that("Filter and aggregate", { + compare_dplyr_binding( + .input %>% + filter(some_grouping == 2) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(int > 5) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(some_grouping == 2) %>% + group_by(some_grouping) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(int > 5) %>% + group_by(some_grouping) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + collect(), + tbl + ) +}) + +test_that("Group by edge cases", { + compare_dplyr_binding( + .input %>% + group_by(some_grouping * 2) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + group_by(alt = some_grouping * 2) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + collect(), + tbl + ) +}) + +test_that("Do things after summarize", { + group2_sum <- tbl %>% + group_by(some_grouping) %>% + filter(int > 5) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + pull() %>% + tail(1) + + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + filter(int > 5) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + filter(total == group2_sum) %>% + mutate(extra = total * 5) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + filter(dbl > 2) %>% + select(chr, int, lgl) %>% + mutate(twice = int * 2L) %>% + group_by(lgl) %>% + summarize( + count = n(), + total = sum(twice, na.rm = TRUE) + ) %>% + mutate(mean = total / count) %>% + collect(), + tbl + ) +}) + +test_that("Expressions on aggregations", { + # This is what it effectively is + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize( + any = any(lgl), + all = all(lgl) + ) %>% + ungroup() %>% # TODO: loosen the restriction on mutate after group_by + mutate(some = any & !all) %>% + select(some_grouping, some) %>% + collect(), + tbl + ) + # More concisely: + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize(any(lgl) & !all(lgl)) %>% + collect(), + tbl + ) + + # Save one of the aggregates first + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize( + any_lgl = any(lgl), + some = any_lgl & !all(lgl) + ) %>% + collect(), + tbl + ) + + # Make sure order of columns in result is correct + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize( + any_lgl = any(lgl), + some = any_lgl & !all(lgl), + n() + ) %>% + collect(), + tbl + ) + + # Aggregate on an aggregate (trivial but dplyr allows) + skip("Aggregate on an aggregate not supported") + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize( + any_lgl = any(any(lgl)) + ) %>% + collect(), + tbl + ) +}) + +test_that("Summarize with 0 arguments", { + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize() %>% + collect(), + tbl + ) +}) + +test_that("Not (yet) supported: implicit join", { + withr::local_options(list(arrow.debug = TRUE)) + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize( + sum((dbl - mean(dbl))^2) + ) %>% + collect(), + tbl, + warning = "Expression sum\\(\\(dbl - mean\\(dbl\\)\\)\\^2\\) not supported in Arrow; pulling data into R" + ) + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize( + sum(dbl - mean(dbl)) + ) %>% + collect(), + tbl, + warning = "Expression sum\\(dbl - mean\\(dbl\\)\\) not supported in Arrow; pulling data into R" + ) + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize( + sqrt(sum((dbl - mean(dbl))^2) / (n() - 1L)) + ) %>% + collect(), + tbl, + warning = "Expression sum\\(\\(dbl - mean\\(dbl\\)\\)\\^2\\) not supported in Arrow; pulling data into R" + ) + + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize( + dbl - mean(dbl) + ) %>% + collect(), + tbl, + warning = "Expression dbl - mean\\(dbl\\) not supported in Arrow; pulling data into R" + ) + + # This one could possibly be supported--in mutate() + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize( + dbl - int + ) %>% + collect(), + tbl, + warning = "Expression dbl - int not supported in Arrow; pulling data into R" + ) +}) + +test_that(".groups argument", { + compare_dplyr_binding( + .input %>% + group_by(some_grouping, int < 6) %>% + summarize(count = n()) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + group_by(some_grouping, int < 6) %>% + summarize(count = n(), .groups = "drop_last") %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + group_by(some_grouping, int < 6) %>% + summarize(count = n(), .groups = "keep") %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + group_by(some_grouping, int < 6) %>% + summarize(count = n(), .groups = "drop") %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + group_by(some_grouping, int < 6) %>% + summarize(count = n(), .groups = "rowwise") %>% + collect(), + tbl, + warning = TRUE + ) + + # abandon_ship() raises the warning, then dplyr itself errors + # This isn't ideal but it's fine and won't be an issue on Datasets + expect_error( + expect_warning( + Table$create(tbl) %>% + group_by(some_grouping, int < 6) %>% + summarize(count = n(), .groups = "NOTVALID"), + "Invalid .groups argument" + ), + "NOTVALID" + ) +}) + +test_that("summarize() handles group_by .drop", { + # Error: Type error: Sorting not supported for type dictionary<values=string, indices=int8, ordered=0> + withr::local_options(list(arrow.summarise.sort = FALSE)) + + tbl <- tibble( + x = 1:10, + y = factor(rep(c("a", "c"), each = 5), levels = c("a", "b", "c")) + ) + compare_dplyr_binding( + .input %>% + group_by(y) %>% + count() %>% + collect() %>% + arrange(y), + tbl + ) + # Not supported: check message + compare_dplyr_binding( + .input %>% + group_by(y, .drop = FALSE) %>% + count() %>% + collect() %>% + # Because it's not supported, we have to filter out the (empty) row + # that dplyr keeps, just so we test equal (otherwise) + filter(y != "b") %>% + arrange(y), + tbl, + warning = ".drop = FALSE currently not supported in Arrow aggregation" + ) + + # But this is ok because there is no factor group + compare_dplyr_binding( + .input %>% + group_by(y, .drop = FALSE) %>% + count() %>% + collect() %>% + arrange(y), + tibble( + x = 1:10, + y = rep(c("a", "c"), each = 5) + ) + ) +}) diff --git a/src/arrow/r/tests/testthat/test-duckdb.R b/src/arrow/r/tests/testthat/test-duckdb.R new file mode 100644 index 000000000..decd6e80e --- /dev/null +++ b/src/arrow/r/tests/testthat/test-duckdb.R @@ -0,0 +1,217 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_installed("duckdb", minimum_version = "0.2.8") +skip_if_not_installed("dbplyr") +skip_if_not_available("dataset") +skip_on_cran() + +library(duckdb, quietly = TRUE) +library(dplyr, warn.conflicts = FALSE) + +test_that("to_duckdb", { + ds <- InMemoryDataset$create(example_data) + + expect_identical( + ds %>% + to_duckdb() %>% + collect() %>% + # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879 + select(!fct), + select(example_data, !fct) + ) + + expect_identical( + ds %>% + select(int, lgl, dbl) %>% + to_duckdb() %>% + group_by(lgl) %>% + summarise(mean_int = mean(int, na.rm = TRUE), mean_dbl = mean(dbl, na.rm = TRUE)) %>% + collect(), + tibble::tibble( + lgl = c(TRUE, NA, FALSE), + mean_int = c(3, 6.25, 8.5), + mean_dbl = c(3.1, 6.35, 6.1) + ) + ) + + # can group_by before the to_duckdb + expect_identical( + ds %>% + select(int, lgl, dbl) %>% + group_by(lgl) %>% + to_duckdb() %>% + summarise(mean_int = mean(int, na.rm = TRUE), mean_dbl = mean(dbl, na.rm = TRUE)) %>% + collect(), + tibble::tibble( + lgl = c(TRUE, NA, FALSE), + mean_int = c(3, 6.25, 8.5), + mean_dbl = c(3.1, 6.35, 6.1) + ) + ) +}) + +test_that("to_duckdb then to_arrow", { + ds <- InMemoryDataset$create(example_data) + + ds_rt <- ds %>% + to_duckdb() %>% + # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879 + select(-fct) %>% + to_arrow() + + expect_identical( + collect(ds_rt), + ds %>% + select(-fct) %>% + collect() + ) + + # And we can continue the pipeline + ds_rt <- ds %>% + to_duckdb() %>% + # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879 + select(-fct) %>% + to_arrow() %>% + filter(int > 5) + + expect_identical( + collect(ds_rt), + ds %>% + select(-fct) %>% + filter(int > 5) %>% + collect() + ) + + # Now check errors + ds_rt <- ds %>% + to_duckdb() %>% + # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879 + select(-fct) + + # alter the class of ds_rt's connection to simulate some other database + class(ds_rt$src$con) <- "some_other_connection" + + expect_error( + to_arrow(ds_rt), + "to_arrow\\(\\) currently only supports Arrow tables, Arrow datasets," + ) +}) + +# The next set of tests use an already-extant connection to test features of +# persistence and querying against the table without using the `tbl` itself, so +# we need to create a connection separate from the ephemeral one that is made +# with arrow_duck_connection() +con <- dbConnect(duckdb::duckdb()) +dbExecute(con, "PRAGMA threads=2") +on.exit(dbDisconnect(con, shutdown = TRUE), add = TRUE) + +# write one table to the connection so it is kept open +DBI::dbWriteTable(con, "mtcars", mtcars) + +test_that("Joining, auto-cleanup enabled", { + ds <- InMemoryDataset$create(example_data) + + table_one_name <- "my_arrow_table_1" + table_one <- to_duckdb(ds, con = con, table_name = table_one_name, auto_disconnect = TRUE) + table_two_name <- "my_arrow_table_2" + table_two <- to_duckdb(ds, con = con, table_name = table_two_name, auto_disconnect = TRUE) + + res <- dbGetQuery( + con, + paste0( + "SELECT * FROM ", table_one_name, + " INNER JOIN ", table_two_name, + " ON ", table_one_name, ".int = ", table_two_name, ".int" + ) + ) + expect_identical(dim(res), c(9L, 14L)) + + # clean up cleans up the tables + expect_true(all(c(table_one_name, table_two_name) %in% DBI::dbListTables(con))) + rm(table_one, table_two) + gc() + expect_false(any(c(table_one_name, table_two_name) %in% DBI::dbListTables(con))) +}) + +test_that("Joining, auto-cleanup disabled", { + ds <- InMemoryDataset$create(example_data) + + table_three_name <- "my_arrow_table_3" + table_three <- to_duckdb(ds, con = con, table_name = table_three_name) + + # clean up does *not* clean these tables + expect_true(table_three_name %in% DBI::dbListTables(con)) + rm(table_three) + gc() + # but because we aren't auto_disconnecting then we still have this table. + expect_true(table_three_name %in% DBI::dbListTables(con)) +}) + +test_that("to_duckdb with a table", { + tab <- Table$create(example_data) + + expect_identical( + tab %>% + to_duckdb() %>% + group_by(int > 4) %>% + summarise( + int_mean = mean(int, na.rm = TRUE), + dbl_mean = mean(dbl, na.rm = TRUE) + ) %>% + collect(), + tibble::tibble( + "int > 4" = c(FALSE, NA, TRUE), + int_mean = c(2, NA, 7.5), + dbl_mean = c(2.1, 4.1, 7.3) + ) + ) +}) + +test_that("to_duckdb passing a connection", { + ds <- InMemoryDataset$create(example_data) + + con_separate <- dbConnect(duckdb::duckdb()) + # we always want to test in parallel + dbExecute(con_separate, "PRAGMA threads=2") + on.exit(dbDisconnect(con_separate, shutdown = TRUE), add = TRUE) + + # create a table to join to that we know is in our con_separate + new_df <- data.frame( + int = 1:10, + char = letters[26:17], + stringsAsFactors = FALSE + ) + DBI::dbWriteTable(con_separate, "separate_join_table", new_df) + + table_four <- ds %>% + select(int, lgl, dbl) %>% + to_duckdb(con = con_separate, auto_disconnect = FALSE) + table_four_name <- table_four$ops$x + + result <- DBI::dbGetQuery( + con_separate, + paste0( + "SELECT * FROM ", table_four_name, + " INNER JOIN separate_join_table ", + "ON separate_join_table.int = ", table_four_name, ".int" + ) + ) + + expect_identical(dim(result), c(9L, 5L)) + expect_identical(result$char, new_df[new_df$int != 4, ]$char) +}) diff --git a/src/arrow/r/tests/testthat/test-expression.R b/src/arrow/r/tests/testthat/test-expression.R new file mode 100644 index 000000000..c4aab718d --- /dev/null +++ b/src/arrow/r/tests/testthat/test-expression.R @@ -0,0 +1,128 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +test_that("C++ expressions", { + skip_if_not_available("dataset") + f <- Expression$field_ref("f") + expect_identical(f$field_name, "f") + g <- Expression$field_ref("g") + date <- Expression$scalar(as.Date("2020-01-15")) + ts <- Expression$scalar(as.POSIXct("2020-01-17 11:11:11")) + i64 <- Expression$scalar(bit64::as.integer64(42)) + time <- Expression$scalar(hms::hms(56, 34, 12)) + + expect_r6_class(f == g, "Expression") + expect_r6_class(f == 4, "Expression") + expect_r6_class(f == "", "Expression") + expect_r6_class(f == NULL, "Expression") + expect_r6_class(f == date, "Expression") + expect_r6_class(f == i64, "Expression") + expect_r6_class(f == time, "Expression") + # can't seem to make this work right now because of R Ops.method dispatch + # expect_r6_class(f == as.Date("2020-01-15"), "Expression") # nolint + expect_r6_class(f == ts, "Expression") + expect_r6_class(f <= 2L, "Expression") + expect_r6_class(f != FALSE, "Expression") + expect_r6_class(f > 4, "Expression") + expect_r6_class(f < 4 & f > 2, "Expression") + expect_r6_class(f < 4 | f > 2, "Expression") + expect_r6_class(!(f < 4), "Expression") + expect_output( + print(f > 4), + "Expression\n(f > 4)", + fixed = TRUE + ) + expect_equal( + f$type(schema(f = float64())), + float64() + ) + expect_equal( + (f > 4)$type(schema(f = float64())), + bool() + ) + # Interprets that as a list type + expect_r6_class(f == c(1L, 2L), "Expression") + + expect_error( + Expression$create("add", 1, 2), + "Expression arguments must be Expression objects" + ) +}) + +test_that("Field reference expression schemas and types", { + x <- Expression$field_ref("x") + + # type() throws error when schema is NULL + expect_error(x$type(), "schema") + + # type() returns type when schema is set + x$schema <- Schema$create(x = int32()) + expect_equal(x$type(), int32()) +}) + +test_that("Scalar expression schemas and types", { + # type() works on scalars without setting the schema + expect_equal( + Expression$scalar("foo")$type(), + arrow::string() + ) + expect_equal( + Expression$scalar(42L)$type(), + int32() + ) +}) + +test_that("Expression schemas and types", { + x <- Expression$field_ref("x") + y <- Expression$field_ref("y") + z <- Expression$scalar(42L) + + # type() throws error when both schemas are unset + expect_error( + Expression$create("add_checked", x, y)$type(), + "schema" + ) + + # type() throws error when left schema is unset + y$schema <- Schema$create(y = float64()) + expect_error( + Expression$create("add_checked", x, y)$type(), + "schema" + ) + + # type() throws error when right schema is unset + x$schema <- Schema$create(x = int32()) + y$schema <- NULL + expect_error( + Expression$create("add_checked", x, y)$type(), + "schema" + ) + + # type() returns type when both schemas are set + y$schema <- Schema$create(y = float64()) + expect_equal( + Expression$create("add_checked", x, y)$type(), + float64() + ) + + # type() returns type when one arg has schema set and one is scalar + expect_equal( + Expression$create("add_checked", x, z)$type(), + int32() + ) +}) diff --git a/src/arrow/r/tests/testthat/test-feather.R b/src/arrow/r/tests/testthat/test-feather.R new file mode 100644 index 000000000..136474dea --- /dev/null +++ b/src/arrow/r/tests/testthat/test-feather.R @@ -0,0 +1,256 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +feather_file <- tempfile() +tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10]) + +test_that("Write a feather file", { + tib_out <- write_feather(tib, feather_file) + expect_true(file.exists(feather_file)) + # Input is returned unmodified + expect_identical(tib_out, tib) +}) + +expect_feather_roundtrip <- function(write_fun) { + tf2 <- normalizePath(tempfile(), mustWork = FALSE) + tf3 <- tempfile() + on.exit({ + unlink(tf2) + unlink(tf3) + }) + + # Write two ways. These are what varies with each run + write_fun(tib, tf2) + expect_true(file.exists(tf2)) + + stream <- FileOutputStream$create(tf3) + write_fun(tib, stream) + stream$close() + expect_true(file.exists(tf3)) + + # Read both back + tab2 <- read_feather(tf2) + expect_s3_class(tab2, "data.frame") + + tab3 <- read_feather(tf3) + expect_s3_class(tab3, "data.frame") + + # reading directly from arrow::io::MemoryMappedFile + tab4 <- read_feather(mmap_open(tf3)) + expect_s3_class(tab4, "data.frame") + + # reading directly from arrow::io::ReadableFile + tab5 <- read_feather(ReadableFile$create(tf3)) + expect_s3_class(tab5, "data.frame") + + expect_equal(tib, tab2) + expect_equal(tib, tab3) + expect_equal(tib, tab4) + expect_equal(tib, tab5) +} + +test_that("feather read/write round trip", { + expect_feather_roundtrip(function(x, f) write_feather(x, f, version = 1)) + expect_feather_roundtrip(function(x, f) write_feather(x, f, version = 2)) + expect_feather_roundtrip(function(x, f) write_feather(x, f, chunk_size = 32)) + if (codec_is_available("lz4")) { + expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "lz4")) + } + if (codec_is_available("zstd")) { + expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "zstd")) + expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "zstd", compression_level = 3)) + } + + # Write from Arrow data structures + expect_feather_roundtrip(function(x, f) write_feather(RecordBatch$create(x), f)) + expect_feather_roundtrip(function(x, f) write_feather(Table$create(x), f)) +}) + +test_that("write_feather option error handling", { + tf <- tempfile() + expect_false(file.exists(tf)) + expect_error( + write_feather(tib, tf, version = 1, chunk_size = 1024), + "Feather version 1 does not support the 'chunk_size' option" + ) + expect_error( + write_feather(tib, tf, version = 1, compression = "lz4"), + "Feather version 1 does not support the 'compression' option" + ) + expect_error( + write_feather(tib, tf, version = 1, compression_level = 1024), + "Feather version 1 does not support the 'compression_level' option" + ) + expect_error( + write_feather(tib, tf, compression_level = 1024), + "Can only specify a 'compression_level' when 'compression' is 'zstd'" + ) + expect_match_arg_error(write_feather(tib, tf, compression = "bz2")) + expect_false(file.exists(tf)) +}) + +test_that("write_feather with invalid input type", { + bad_input <- Array$create(1:5) + expect_error( + write_feather(bad_input, feather_file), + regexp = "x must be an object of class 'data.frame', 'RecordBatch', or 'Table', not 'Array'." + ) +}) + +test_that("read_feather supports col_select = <names>", { + tab1 <- read_feather(feather_file, col_select = c("x", "y")) + expect_s3_class(tab1, "data.frame") + + expect_equal(tib$x, tab1$x) + expect_equal(tib$y, tab1$y) +}) + +test_that("feather handles col_select = <integer>", { + tab1 <- read_feather(feather_file, col_select = 1:2) + expect_s3_class(tab1, "data.frame") + + expect_equal(tib$x, tab1$x) + expect_equal(tib$y, tab1$y) +}) + +test_that("feather handles col_select = <tidyselect helper>", { + tab1 <- read_feather(feather_file, col_select = everything()) + expect_identical(tib, tab1) + + tab2 <- read_feather(feather_file, col_select = starts_with("x")) + expect_identical(tab2, tib[, "x", drop = FALSE]) + + tab3 <- read_feather(feather_file, col_select = c(starts_with("x"), contains("y"))) + expect_identical(tab3, tib[, c("x", "y"), drop = FALSE]) + + tab4 <- read_feather(feather_file, col_select = -z) + expect_identical(tab4, tib[, c("x", "y"), drop = FALSE]) +}) + +test_that("feather read/write round trip", { + tab1 <- read_feather(feather_file, as_data_frame = FALSE) + expect_r6_class(tab1, "Table") + + expect_equal(tib, as.data.frame(tab1)) +}) + +test_that("Read feather from raw vector", { + test_raw <- readBin(feather_file, what = "raw", n = 5000) + df <- read_feather(test_raw) + expect_s3_class(df, "data.frame") +}) + +test_that("FeatherReader", { + v1 <- tempfile() + v2 <- tempfile() + on.exit({ + unlink(v1) + unlink(v2) + }) + write_feather(tib, v1, version = 1) + write_feather(tib, v2) + f1 <- make_readable_file(v1) + reader1 <- FeatherReader$create(f1) + f1$close() + expect_identical(reader1$version, 1L) + f2 <- make_readable_file(v2) + reader2 <- FeatherReader$create(f2) + expect_identical(reader2$version, 2L) + f2$close() +}) + +test_that("read_feather requires RandomAccessFile and errors nicely otherwise (ARROW-8615)", { + skip_if_not_available("gzip") + expect_error( + read_feather(CompressedInputStream$create(feather_file)), + 'file must be a "RandomAccessFile"' + ) +}) + +test_that("read_feather closes connection to file", { + tf <- tempfile() + on.exit(unlink(tf)) + write_feather(tib, sink = tf) + expect_true(file.exists(tf)) + read_feather(tf) + expect_error(file.remove(tf), NA) + expect_false(file.exists(tf)) +}) + +test_that("Character vectors > 2GB can write to feather", { + skip_on_cran() + skip_if_not_running_large_memory_tests() + df <- tibble::tibble(big = make_big_string()) + tf <- tempfile() + on.exit(unlink(tf)) + write_feather(df, tf) + expect_identical(read_feather(tf), df) +}) + +test_that("FeatherReader methods", { + # Setup a feather file to use in the test + feather_temp <- tempfile() + on.exit({ + unlink(feather_temp) + }) + write_feather(tib, feather_temp) + feather_temp_RA <- make_readable_file(feather_temp) + + reader <- FeatherReader$create(feather_temp_RA) + feather_temp_RA$close() + + # column_names + expect_identical( + reader$column_names, + c("x", "y", "z") + ) + + # print method + expect_identical( + capture.output(print(reader)), + # TODO: can we get rows/columns? + c("FeatherReader:", "Schema", "x: int32", "y: double", "z: string") + ) +}) + +unlink(feather_file) + +ft_file <- test_path("golden-files/data-arrow_2.0.0_lz4.feather") + +test_that("Error messages are shown when the compression algorithm lz4 is not found", { + msg <- paste0( + "NotImplemented: Support for codec 'lz4' not built\nIn order to read this file, ", + "you will need to reinstall arrow with additional features enabled.\nSet one of ", + "these environment variables before installing:\n\n * LIBARROW_MINIMAL=false ", + "(for all optional features, including 'lz4')\n * ARROW_WITH_LZ4=ON (for just 'lz4')", + "\n\nSee https://arrow.apache.org/docs/r/articles/install.html for details" + ) + + if (codec_is_available("lz4")) { + d <- read_feather(ft_file) + expect_s3_class(d, "data.frame") + } else { + expect_error(read_feather(ft_file), msg, fixed = TRUE) + } +}) + +test_that("Error is created when feather reads a parquet file", { + expect_error( + read_feather(system.file("v0.7.1.parquet", package = "arrow")), + "Not a Feather V1 or Arrow IPC file" + ) +}) diff --git a/src/arrow/r/tests/testthat/test-field.R b/src/arrow/r/tests/testthat/test-field.R new file mode 100644 index 000000000..1be36c064 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-field.R @@ -0,0 +1,67 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +test_that("field() factory", { + x <- field("x", int32()) + expect_equal(x$type, int32()) + expect_equal(x$name, "x") + expect_true(x$nullable) + expect_true(x == x) + expect_false(x == field("x", int64())) +}) + +test_that("Field with nullable values", { + x <- field("x", int32(), nullable = FALSE) + expect_equal(x$type, int32()) + expect_false(x$nullable) + expect_true(x == x) + expect_false(x == field("x", int32())) +}) + +test_that("Field validation", { + expect_error(schema(b = 32), "b must be a DataType, not numeric") +}) + +test_that("Print method for field", { + expect_output(print(field("x", int32())), "Field\nx: int32") + expect_output( + print(field("zz", dictionary())), + "Field\nzz: dictionary<values=string, indices=int32>" + ) + + expect_output( + print(field("x", int32(), nullable = FALSE)), + "Field\nx: int32 not null" + ) + +}) + +test_that("Field to C-interface", { + field <- field("x", time32("s")) + + # export the field via the C-interface + ptr <- allocate_arrow_schema() + field$export_to_c(ptr) + + # then import it and check that the roundtripped value is the same + circle <- Field$import_from_c(ptr) + expect_equal(circle, field) + + # must clean up the pointer or we leak + delete_arrow_schema(ptr) +}) diff --git a/src/arrow/r/tests/testthat/test-filesystem.R b/src/arrow/r/tests/testthat/test-filesystem.R new file mode 100644 index 000000000..5ee096f13 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-filesystem.R @@ -0,0 +1,178 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +test_that("LocalFilesystem", { + fs <- LocalFileSystem$create() + expect_identical(fs$type_name, "local") + DESCRIPTION <- system.file("DESCRIPTION", package = "arrow") + info <- fs$GetFileInfo(DESCRIPTION)[[1]] + expect_equal(info$base_name(), "DESCRIPTION") + expect_equal(info$extension(), "") + expect_equal(info$type, FileType$File) + expect_equal(info$path, DESCRIPTION) + info <- file.info(DESCRIPTION) + + expect_equal(info$size, info$size) + # This fails due to a subsecond difference on Appveyor on Windows with R 3.3 only + # So add a greater tolerance to allow for that + expect_equal(info$mtime, info$mtime, tolerance = 1) + + tf <- tempfile(fileext = ".txt") + fs$CopyFile(DESCRIPTION, tf) + info <- fs$GetFileInfo(tf)[[1]] + expect_equal(info$extension(), "txt") + expect_equal(info$size, info$size) + expect_equal(readLines(DESCRIPTION), readLines(tf)) + + tf2 <- tempfile(fileext = ".txt") + fs$Move(tf, tf2) + infos <- fs$GetFileInfo(c(tf, tf2, dirname(tf))) + expect_equal(infos[[1]]$type, FileType$NotFound) + expect_equal(infos[[2]]$type, FileType$File) + expect_equal(infos[[3]]$type, FileType$Directory) + + fs$DeleteFile(tf2) + expect_equal(fs$GetFileInfo(tf2)[[1L]]$type, FileType$NotFound) + expect_true(!file.exists(tf2)) + + expect_equal(fs$GetFileInfo(tf)[[1L]]$type, FileType$NotFound) + expect_true(!file.exists(tf)) + + td <- tempfile() + fs$CreateDir(td) + expect_equal(fs$GetFileInfo(td)[[1L]]$type, FileType$Directory) + fs$CopyFile(DESCRIPTION, file.path(td, "DESCRIPTION")) + fs$DeleteDirContents(td) + expect_equal(length(dir(td)), 0L) + fs$DeleteDir(td) + expect_equal(fs$GetFileInfo(td)[[1L]]$type, FileType$NotFound) + + tf3 <- tempfile() + os <- fs$OpenOutputStream(path = tf3) + bytes <- as.raw(1:40) + os$write(bytes) + os$close() + + is <- fs$OpenInputStream(tf3) + buf <- is$Read(40) + expect_equal(buf$data(), bytes) + is$close() +}) + +test_that("SubTreeFilesystem", { + dir.create(td <- tempfile()) + DESCRIPTION <- system.file("DESCRIPTION", package = "arrow") + file.copy(DESCRIPTION, file.path(td, "DESCRIPTION")) + + st_fs <- SubTreeFileSystem$create(td) + expect_r6_class(st_fs, "SubTreeFileSystem") + expect_r6_class(st_fs, "FileSystem") + expect_r6_class(st_fs$base_fs, "LocalFileSystem") + expect_identical( + capture.output(print(st_fs)), + paste0("SubTreeFileSystem: ", "file://", st_fs$base_path) + ) + + # FIXME windows has a trailing slash for one but not the other + # expect_identical(normalizePath(st_fs$base_path), normalizePath(td)) # nolint + + st_fs$CreateDir("test") + st_fs$CopyFile("DESCRIPTION", "DESC.txt") + infos <- st_fs$GetFileInfo(c("DESCRIPTION", "test", "nope", "DESC.txt")) + expect_equal(infos[[1L]]$type, FileType$File) + expect_equal(infos[[2L]]$type, FileType$Directory) + expect_equal(infos[[3L]]$type, FileType$NotFound) + expect_equal(infos[[4L]]$type, FileType$File) + expect_equal(infos[[4L]]$extension(), "txt") + + local_fs <- LocalFileSystem$create() + local_fs$DeleteDirContents(td) + infos <- st_fs$GetFileInfo(c("DESCRIPTION", "test", "nope", "DESC.txt")) + expect_equal(infos[[1L]]$type, FileType$NotFound) + expect_equal(infos[[2L]]$type, FileType$NotFound) + expect_equal(infos[[3L]]$type, FileType$NotFound) + expect_equal(infos[[4L]]$type, FileType$NotFound) +}) + +test_that("LocalFileSystem + Selector", { + fs <- LocalFileSystem$create() + dir.create(td <- tempfile()) + writeLines("blah blah", file.path(td, "one.txt")) + writeLines("yada yada", file.path(td, "two.txt")) + dir.create(file.path(td, "dir")) + writeLines("...", file.path(td, "dir", "three.txt")) + + selector <- FileSelector$create(td, recursive = TRUE) + infos <- fs$GetFileInfo(selector) + expect_equal(length(infos), 4L) + types <- sapply(infos, function(.x) .x$type) + expect_equal(sum(types == FileType$File), 3L) + expect_equal(sum(types == FileType$Directory), 1L) + + selector <- FileSelector$create(td, recursive = FALSE) + infos <- fs$GetFileInfo(selector) + expect_equal(length(infos), 3L) + types <- sapply(infos, function(.x) .x$type) + expect_equal(sum(types == FileType$File), 2L) + expect_equal(sum(types == FileType$Directory), 1L) +}) + +test_that("FileSystem$from_uri", { + skip_on_cran() + skip_if_not_available("s3") + skip_if_offline() + fs_and_path <- FileSystem$from_uri("s3://ursa-labs-taxi-data") + expect_r6_class(fs_and_path$fs, "S3FileSystem") + expect_identical(fs_and_path$fs$region, "us-east-2") +}) + +test_that("SubTreeFileSystem$create() with URI", { + skip_on_cran() + skip_if_not_available("s3") + skip_if_offline() + fs <- SubTreeFileSystem$create("s3://ursa-labs-taxi-data") + expect_r6_class(fs, "SubTreeFileSystem") + expect_identical( + capture.output(print(fs)), + "SubTreeFileSystem: s3://ursa-labs-taxi-data/" + ) +}) + +test_that("S3FileSystem", { + skip_on_cran() + skip_if_not_available("s3") + skip_if_offline() + s3fs <- S3FileSystem$create() + expect_r6_class(s3fs, "S3FileSystem") +}) + +test_that("s3_bucket", { + skip_on_cran() + skip_if_not_available("s3") + skip_if_offline() + bucket <- s3_bucket("ursa-labs-r-test") + expect_r6_class(bucket, "SubTreeFileSystem") + expect_r6_class(bucket$base_fs, "S3FileSystem") + expect_identical(bucket$region, "us-west-2") + expect_identical( + capture.output(print(bucket)), + "SubTreeFileSystem: s3://ursa-labs-r-test/" + ) + skip_on_os("windows") # FIXME + expect_identical(bucket$base_path, "ursa-labs-r-test/") +}) diff --git a/src/arrow/r/tests/testthat/test-install-arrow.R b/src/arrow/r/tests/testthat/test-install-arrow.R new file mode 100644 index 000000000..977f9d77d --- /dev/null +++ b/src/arrow/r/tests/testthat/test-install-arrow.R @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +r_only({ + test_that("arrow_repos", { + cran <- "https://cloud.r-project.org/" + ours <- "https://dl.example.com/ursalabs/fake_repo" + other <- "https://cran.fiocruz.br/" + + opts <- list( + repos = c(CRAN = "@CRAN@"), # Restore defaul + arrow.dev_repo = ours + ) + withr::with_options(opts, { + expect_identical(arrow_repos(), cran) + expect_identical(arrow_repos(c(cran, ours)), cran) + expect_identical(arrow_repos(c(ours, other)), other) + expect_identical(arrow_repos(nightly = TRUE), c(ours, cran)) + expect_identical(arrow_repos(c(cran, ours), nightly = TRUE), c(ours, cran)) + expect_identical(arrow_repos(c(ours, other), nightly = TRUE), c(ours, other)) + }) + }) +}) diff --git a/src/arrow/r/tests/testthat/test-json.R b/src/arrow/r/tests/testthat/test-json.R new file mode 100644 index 000000000..825511b97 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-json.R @@ -0,0 +1,255 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("json") + +test_that("Can read json file with scalars columns (ARROW-5503)", { + tf <- tempfile() + on.exit(unlink(tf)) + writeLines(' + { "hello": 3.5, "world": false, "yo": "thing" } + { "hello": 3.25, "world": null } + { "hello": 3.125, "world": null, "yo": "\u5fcd" } + { "hello": 0.0, "world": true, "yo": null } + ', tf, useBytes = TRUE) + + tab1 <- read_json_arrow(tf, as_data_frame = FALSE) + tab2 <- read_json_arrow(mmap_open(tf), as_data_frame = FALSE) + tab3 <- read_json_arrow(ReadableFile$create(tf), as_data_frame = FALSE) + + expect_equal(tab1, tab2) + expect_equal(tab1, tab3) + + expect_equal( + tab1$schema, + schema(hello = float64(), world = boolean(), yo = utf8()) + ) + tib <- as.data.frame(tab1) + expect_equal(tib$hello, c(3.5, 3.25, 3.125, 0)) + expect_equal(tib$world, c(FALSE, NA, NA, TRUE)) + expect_equal(tib$yo, c("thing", NA, "\u5fcd", NA)) +}) + +test_that("read_json_arrow() converts to tibble", { + tf <- tempfile() + on.exit(unlink(tf)) + writeLines(' + { "hello": 3.5, "world": false, "yo": "thing" } + { "hello": 3.25, "world": null } + { "hello": 3.125, "world": null, "yo": "\u5fcd" } + { "hello": 0.0, "world": true, "yo": null } + ', tf, useBytes = TRUE) + + tab1 <- read_json_arrow(tf) + tab2 <- read_json_arrow(mmap_open(tf)) + tab3 <- read_json_arrow(ReadableFile$create(tf)) + + expect_s3_class(tab1, "tbl_df") + expect_s3_class(tab2, "tbl_df") + expect_s3_class(tab3, "tbl_df") + + expect_equal(tab1, tab2) + expect_equal(tab1, tab3) + + expect_equal(tab1$hello, c(3.5, 3.25, 3.125, 0)) + expect_equal(tab1$world, c(FALSE, NA, NA, TRUE)) + expect_equal(tab1$yo, c("thing", NA, "\u5fcd", NA)) +}) + +test_that("read_json_arrow() supports col_select=", { + tf <- tempfile() + writeLines(' + { "hello": 3.5, "world": false, "yo": "thing" } + { "hello": 3.25, "world": null } + { "hello": 3.125, "world": null, "yo": "\u5fcd" } + { "hello": 0.0, "world": true, "yo": null } + ', tf) + + tab1 <- read_json_arrow(tf, col_select = c(hello, world)) + expect_equal(names(tab1), c("hello", "world")) + + tab2 <- read_json_arrow(tf, col_select = 1:2) + expect_equal(names(tab2), c("hello", "world")) +}) + +test_that("read_json_arrow(schema=) with empty schema", { + tf <- tempfile() + writeLines(' + { "hello": 3.5, "world": 2, "third_col": 99} + { "hello": 3.25, "world": 5, "third_col": 98} + { "hello": 3.125, "world": 8, "third_col": 97 } + { "hello": 0.0, "world": 10, "third_col": 96} + ', tf) + + tab1 <- read_json_arrow(tf, schema = schema()) + + expect_identical( + tab1, + tibble::tibble( + hello = c(3.5, 3.25, 3.125, 0), + world = c(2L, 5L, 8L, 10L), + third_col = c(99L, 98L, 97L, 96L) + ) + ) +}) + +test_that("read_json_arrow(schema=) with partial schema", { + tf <- tempfile() + writeLines(' + { "hello": 3.5, "world": 2, "third_col": 99} + { "hello": 3.25, "world": 5, "third_col": 98} + { "hello": 3.125, "world": 8, "third_col": 97 } + { "hello": 0.0, "world": 10, "third_col": 96} + ', tf) + + tab1 <- read_json_arrow(tf, schema = schema(third_col = float64(), world = float64())) + + expect_identical( + tab1, + tibble::tibble( + third_col = c(99, 98, 97, 96), + world = c(2, 5, 8, 10), + hello = c(3.5, 3.25, 3.125, 0) + ) + ) + + tf2 <- tempfile() + writeLines(' + { "hello": 3.5, "world": 2, "third_col": "99"} + { "hello": 3.25, "world": 5, "third_col": "98"} + { "hello": 3.125, "world": 8, "third_col": "97"} + ', tf2) + + tab2 <- read_json_arrow(tf2, schema = schema(third_col = string(), world = float64())) + + expect_identical( + tab2, + tibble::tibble( + third_col = c("99", "98", "97"), + world = c(2, 5, 8), + hello = c(3.5, 3.25, 3.125) + ) + ) +}) + +test_that("read_json_arrow(schema=) with full schema", { + tf <- tempfile() + writeLines(' + { "hello": 3.5, "world": 2, "third_col": 99} + { "hello": 3.25, "world": 5, "third_col": 98} + { "hello": 3.125, "world": 8, "third_col": 97} + { "hello": 0.0, "world": 10, "third_col": 96} + ', tf) + + tab1 <- read_json_arrow( + tf, + schema = schema( + hello = float64(), + third_col = float64(), + world = float64() + ) + ) + + expect_identical( + tab1, + tibble::tibble( + hello = c(3.5, 3.25, 3.125, 0), + third_col = c(99, 98, 97, 96), + world = c(2, 5, 8, 10) + ) + ) +}) + +test_that("Can read json file with nested columns (ARROW-5503)", { + tf <- tempfile() + on.exit(unlink(tf)) + writeLines(' + { "arr": [1.0, 2.0, 3.0], "nuf": {} } + { "arr": [2.0], "nuf": null } + { "arr": [], "nuf": { "ps": 78.0, "hello": "hi" } } + { "arr": null, "nuf": { "ps": 90.0, "hello": "bonjour" } } + { "arr": [5.0], "nuf": { "hello": "ciao" } } + { "arr": [5.0, 6.0], "nuf": { "ps": 19 } } + ', tf) + + tab1 <- read_json_arrow(tf, as_data_frame = FALSE) + tab2 <- read_json_arrow(mmap_open(tf), as_data_frame = FALSE) + tab3 <- read_json_arrow(ReadableFile$create(tf), as_data_frame = FALSE) + + expect_equal(tab1, tab2) + expect_equal(tab1, tab3) + + expect_equal( + tab1$schema, + schema( + arr = list_of(float64()), + nuf = struct(ps = float64(), hello = utf8()) + ) + ) + + struct_array <- tab1$column(1)$chunk(0) + ps <- Array$create(c(NA, NA, 78, 90, NA, 19)) + hello <- Array$create(c(NA, NA, "hi", "bonjour", "ciao", NA)) + expect_equal(struct_array$field(0L), ps) + expect_equal(struct_array$GetFieldByName("ps"), ps) + struct_cols <- struct_array$Flatten() + expect_identical(length(struct_cols), 2L) + expect_equal(struct_cols[[1]], ps) + expect_equal(struct_cols[[2]], hello) + expect_equal( + as.vector(struct_array), + tibble::tibble(ps = ps$as_vector(), hello = hello$as_vector()) + ) + + list_array_r <- list( + c(1, 2, 3), + c(2), + numeric(), + NULL, + 5, + c(5, 6) + ) + list_array <- tab1$column(0) + expect_equal( + list_array$as_vector(), + list_array_r, + ignore_attr = TRUE + ) + + tib <- as.data.frame(tab1) + expect_equal( + tib, + tibble::tibble( + arr = list_array_r, + nuf = tibble::tibble(ps = ps$as_vector(), hello = hello$as_vector()) + ), + ignore_attr = TRUE + ) +}) + +test_that("Can read json file with list<struct<T...>> nested columns (ARROW-7740)", { + tf <- tempfile() + on.exit(unlink(tf)) + writeLines(' + {"a":[{"b":1.0},{"b":2.0}]} + {"a":[{"b":1.0},{"b":2.0}]} + ', tf) + + one <- tibble::tibble(b = c(1, 2)) + expected <- tibble::tibble(a = c(list(one), list(one))) + expect_equal(read_json_arrow(tf), expected, ignore_attr = TRUE) +}) diff --git a/src/arrow/r/tests/testthat/test-memory-pool.R b/src/arrow/r/tests/testthat/test-memory-pool.R new file mode 100644 index 000000000..0aa18aadc --- /dev/null +++ b/src/arrow/r/tests/testthat/test-memory-pool.R @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_that("default_memory_pool and its attributes", { + pool <- default_memory_pool() + # Not integer bc can be >2gb, so we cast to double + expect_type(pool$bytes_allocated, "double") + expect_type(pool$max_memory, "double") + expect_true(pool$backend_name %in% c("system", "jemalloc", "mimalloc")) + + expect_true(all(supported_memory_backends() %in% c("system", "jemalloc", "mimalloc"))) +}) diff --git a/src/arrow/r/tests/testthat/test-message-reader.R b/src/arrow/r/tests/testthat/test-message-reader.R new file mode 100644 index 000000000..44f3fe4f7 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-message-reader.R @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +test_that("MessageReader can be created from raw vectors", { + batch <- record_batch(x = 1:10) + bytes <- batch$serialize() + + reader <- MessageReader$create(bytes) + + message <- reader$ReadNextMessage() + expect_r6_class(message, "Message") + expect_equal(message$type, MessageType$RECORD_BATCH) + expect_r6_class(message$body, "Buffer") + expect_r6_class(message$metadata, "Buffer") + + message <- reader$ReadNextMessage() + expect_null(message) + + schema <- schema(x = int32()) + bytes <- schema$serialize() + + reader <- MessageReader$create(bytes) + + message <- reader$ReadNextMessage() + expect_r6_class(message, "Message") + expect_equal(message$type, MessageType$SCHEMA) + expect_r6_class(message$body, "Buffer") + expect_r6_class(message$metadata, "Buffer") + + message <- reader$ReadNextMessage() + expect_null(message) +}) + +test_that("MessageReader can be created from input stream", { + batch <- record_batch(x = 1:10) + bytes <- batch$serialize() + + stream <- BufferReader$create(bytes) + expect_r6_class(stream, "BufferReader") + + reader <- MessageReader$create(stream) + expect_r6_class(reader, "MessageReader") + + message <- reader$ReadNextMessage() + expect_r6_class(message, "Message") + expect_equal(message$type, MessageType$RECORD_BATCH) + expect_r6_class(message$body, "Buffer") + expect_r6_class(message$metadata, "Buffer") + + message <- reader$ReadNextMessage() + expect_null(message) + + schema <- schema(x = int32()) + bytes <- schema$serialize() + + stream <- BufferReader$create(bytes) + expect_r6_class(stream, "BufferReader") + + reader <- MessageReader$create(stream) + expect_r6_class(reader, "MessageReader") + + message <- reader$ReadNextMessage() + expect_r6_class(message, "Message") + expect_equal(message$type, MessageType$SCHEMA) + expect_r6_class(message$body, "Buffer") + expect_r6_class(message$metadata, "Buffer") + + message <- reader$ReadNextMessage() + expect_null(message) +}) diff --git a/src/arrow/r/tests/testthat/test-message.R b/src/arrow/r/tests/testthat/test-message.R new file mode 100644 index 000000000..c9ee4cb72 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-message.R @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +test_that("read_message can read from input stream", { + batch <- record_batch(x = 1:10) + bytes <- batch$serialize() + stream <- BufferReader$create(bytes) + + message <- read_message(stream) + expect_r6_class(message, "Message") + expect_equal(message$type, MessageType$RECORD_BATCH) + expect_r6_class(message$body, "Buffer") + expect_r6_class(message$metadata, "Buffer") + + message <- read_message(stream) + expect_null(read_message(stream)) +}) + +test_that("read_message() can read Schema messages", { + bytes <- schema(x = int32())$serialize() + stream <- BufferReader$create(bytes) + message <- read_message(stream) + + expect_r6_class(message, "Message") + expect_equal(message$type, MessageType$SCHEMA) + expect_r6_class(message$body, "Buffer") + expect_r6_class(message$metadata, "Buffer") + + message <- read_message(stream) + expect_null(read_message(stream)) +}) + +test_that("read_message() can handle raw vectors", { + batch <- record_batch(x = 1:10) + bytes <- batch$serialize() + stream <- BufferReader$create(bytes) + + message_stream <- read_message(stream) + message_raw <- read_message(bytes) + expect_equal(message_stream, message_raw) + + bytes <- schema(x = int32())$serialize() + stream <- BufferReader$create(bytes) + message_stream <- read_message(stream) + message_raw <- read_message(bytes) + + expect_equal(message_stream, message_raw) +}) diff --git a/src/arrow/r/tests/testthat/test-metadata.R b/src/arrow/r/tests/testthat/test-metadata.R new file mode 100644 index 000000000..4c4d8a767 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-metadata.R @@ -0,0 +1,369 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_that("Schema metadata", { + s <- schema(b = double()) + expect_equal(s$metadata, empty_named_list()) + expect_false(s$HasMetadata) + s$metadata <- list(test = TRUE) + expect_identical(s$metadata, list(test = "TRUE")) + expect_true(s$HasMetadata) + s$metadata$foo <- 42 + expect_identical(s$metadata, list(test = "TRUE", foo = "42")) + expect_true(s$HasMetadata) + s$metadata$foo <- NULL + expect_identical(s$metadata, list(test = "TRUE")) + expect_true(s$HasMetadata) + s$metadata <- NULL + expect_equal(s$metadata, empty_named_list()) + expect_false(s$HasMetadata) + expect_error( + s$metadata <- 4, + "Key-value metadata must be a named list or character vector" + ) +}) + +test_that("Table metadata", { + tab <- Table$create(x = 1:2, y = c("a", "b")) + expect_equal(tab$metadata, empty_named_list()) + tab$metadata <- list(test = TRUE) + expect_identical(tab$metadata, list(test = "TRUE")) + tab$metadata$foo <- 42 + expect_identical(tab$metadata, list(test = "TRUE", foo = "42")) + tab$metadata$foo <- NULL + expect_identical(tab$metadata, list(test = "TRUE")) + tab$metadata <- NULL + expect_equal(tab$metadata, empty_named_list()) +}) + +test_that("Table R metadata", { + tab <- Table$create(example_with_metadata) + expect_output(print(tab$metadata), "arrow_r_metadata") + expect_identical(as.data.frame(tab), example_with_metadata) +}) + +test_that("R metadata is not stored for types that map to Arrow types (factor, Date, etc.)", { + tab <- Table$create(example_data[1:6]) + expect_null(tab$metadata$r) + + expect_null(Table$create(example_with_times[1:3])$metadata$r) +}) + +test_that("classes are not stored for arrow_binary/arrow_large_binary/arrow_fixed_size_binary (ARROW-14140)", { + raws <- charToRaw("bonjour") + + binary <- Array$create(list(raws), binary()) + large_binary <- Array$create(list(raws), large_binary()) + fixed_size_binary <- Array$create(list(raws), fixed_size_binary(7L)) + + expect_null(RecordBatch$create(b = binary)$metadata$r) + expect_null(RecordBatch$create(b = large_binary)$metadata$r) + expect_null(RecordBatch$create(b = fixed_size_binary)$metadata$r) + + expect_null(Table$create(b = binary)$metadata$r) + expect_null(Table$create(b = large_binary)$metadata$r) + expect_null(Table$create(b = fixed_size_binary)$metadata$r) +}) + +test_that("Garbage R metadata doesn't break things", { + tab <- Table$create(example_data[1:6]) + tab$metadata$r <- "garbage" + expect_warning( + expect_identical(as.data.frame(tab), example_data[1:6]), + "Invalid metadata$r", + fixed = TRUE + ) + # serialize data like .serialize_arrow_r_metadata does, but don't call that + # directly since it checks to ensure that the data is a list + tab$metadata$r <- rawToChar(serialize("garbage", NULL, ascii = TRUE)) + expect_warning( + expect_identical(as.data.frame(tab), example_data[1:6]), + "Invalid metadata$r", + fixed = TRUE + ) +}) + +test_that("Metadata serialization compression", { + # attributes that (when serialized) are just under 100kb are not compressed, + # and simply serialized + strings <- as.list(rep(make_string_of_size(1), 98)) + small <- .serialize_arrow_r_metadata(strings) + expect_equal( + object.size(small), + object.size(rawToChar(serialize(strings, NULL, ascii = TRUE))) + ) + + # Large strings will be compressed + large_strings <- as.list(rep(make_string_of_size(1), 100)) + large <- .serialize_arrow_r_metadata(large_strings) + expect_lt( + object.size(large), + object.size(rawToChar(serialize(large_strings, NULL, ascii = TRUE))) + ) + # and this compression ends up being smaller than even the "small" strings + expect_lt(object.size(large), object.size(small)) + + # However strings where compression + serialization is not effective are no + # worse than only serialization alone + large_few_strings <- as.list(rep(make_random_string_of_size(50), 2)) + large_few <- .serialize_arrow_r_metadata(large_few_strings) + expect_equal( + object.size(large_few), + object.size(rawToChar(serialize(large_few_strings, NULL, ascii = TRUE))) + ) + + # But we can disable compression + op <- options(arrow.compress_metadata = FALSE) + on.exit(options(op)) + + large_strings <- as.list(rep(make_string_of_size(1), 100)) + large <- .serialize_arrow_r_metadata(large_strings) + expect_equal( + object.size(large), + object.size(rawToChar(serialize(large_strings, NULL, ascii = TRUE))) + ) +}) + +test_that("RecordBatch metadata", { + rb <- RecordBatch$create(x = 1:2, y = c("a", "b")) + expect_equal(rb$metadata, empty_named_list()) + rb$metadata <- list(test = TRUE) + expect_identical(rb$metadata, list(test = "TRUE")) + rb$metadata$foo <- 42 + expect_identical(rb$metadata, list(test = "TRUE", foo = "42")) + rb$metadata$foo <- NULL + expect_identical(rb$metadata, list(test = "TRUE")) + rb$metadata <- NULL + expect_equal(rb$metadata, empty_named_list()) +}) + +test_that("RecordBatch R metadata", { + expect_identical(as.data.frame(record_batch(example_with_metadata)), example_with_metadata) +}) + +test_that("R metadata roundtrip via parquet", { + skip_if_not_available("parquet") + tf <- tempfile() + on.exit(unlink(tf)) + + write_parquet(example_with_metadata, tf) + expect_identical(read_parquet(tf), example_with_metadata) +}) + +test_that("R metadata roundtrip via feather", { + tf <- tempfile() + on.exit(unlink(tf)) + + write_feather(example_with_metadata, tf) + expect_identical(read_feather(tf), example_with_metadata) +}) + +test_that("haven types roundtrip via feather", { + tf <- tempfile() + on.exit(unlink(tf)) + + write_feather(haven_data, tf) + expect_identical(read_feather(tf), haven_data) +}) + +test_that("Date/time type roundtrip", { + rb <- record_batch(example_with_times) + expect_r6_class(rb$schema$posixlt$type, "StructType") + expect_identical(as.data.frame(rb), example_with_times) +}) + +test_that("metadata keeps attribute of top level data frame", { + df <- structure(data.frame(x = 1, y = 2), foo = "bar") + tab <- Table$create(df) + expect_identical(attr(as.data.frame(tab), "foo"), "bar") + expect_identical(as.data.frame(tab), df) +}) + + +test_that("metadata drops readr's problems attribute", { + readr_like <- tibble::tibble( + dbl = 1.1, + not_here = NA_character_ + ) + attributes(readr_like) <- append( + attributes(readr_like), + list(problems = tibble::tibble( + row = 1L, + col = NA_character_, + expected = "2 columns", + actual = "1 columns", + file = "'test'" + )) + ) + + tab <- Table$create(readr_like) + expect_null(attr(as.data.frame(tab), "problems")) +}) + +test_that("Row-level metadata (does not by default) roundtrip", { + # First tracked at ARROW-10386, though it was later determined that row-level + # metadata should be handled separately ARROW-14020, ARROW-12542 + df <- data.frame(x = I(list(structure(1, foo = "bar"), structure(2, baz = "qux")))) + tab <- Table$create(df) + r_metadata <- tab$r_metadata + expect_type(r_metadata, "list") + expect_null(r_metadata$columns$x$columns) + + # But we can re-enable this / read data that has already been written with + # row-level metadata + withr::with_options( + list("arrow.preserve_row_level_metadata" = TRUE), { + tab <- Table$create(df) + expect_identical(attr(as.data.frame(tab)$x[[1]], "foo"), "bar") + expect_identical(attr(as.data.frame(tab)$x[[2]], "baz"), "qux") + }) +}) + + +test_that("Row-level metadata (does not) roundtrip in datasets", { + # First tracked at ARROW-10386, though it was later determined that row-level + # metadata should be handled separately ARROW-14020, ARROW-12542 + skip_if_not_available("dataset") + skip_if_not_available("parquet") + + library(dplyr, warn.conflicts = FALSE) + + df <- tibble::tibble( + metadata = list( + structure(1, my_value_as_attr = 1), + structure(2, my_value_as_attr = 2), + structure(3, my_value_as_attr = 3), + structure(4, my_value_as_attr = 3) + ), + int = 1L:4L, + part = c(1, 3, 2, 1) + ) + + dst_dir <- make_temp_dir() + + withr::with_options( + list("arrow.preserve_row_level_metadata" = TRUE), { + expect_warning( + write_dataset(df, dst_dir, partitioning = "part"), + "Row-level metadata is not compatible with datasets and will be discarded" + ) + + # Reset directory as previous write will have created some files and the default + # behavior is to error on existing + dst_dir <- make_temp_dir() + # but we need to write a dataset with row-level metadata to make sure when + # reading ones that have been written with them we warn appropriately + fake_func_name <- write_dataset + fake_func_name(df, dst_dir, partitioning = "part") + + ds <- open_dataset(dst_dir) + expect_warning( + df_from_ds <- collect(ds), + "Row-level metadata is not compatible with this operation and has been ignored" + ) + expect_equal( + arrange(df_from_ds, int), + arrange(df, int), + ignore_attr = TRUE + ) + + # however there is *no* warning if we don't select the metadata column + expect_warning( + df_from_ds <- ds %>% select(int) %>% collect(), + NA + ) + }) +}) + +test_that("When we encounter SF cols, we warn", { + df <- data.frame(x = I(list(structure(1, foo = "bar"), structure(2, baz = "qux")))) + class(df$x) <- c("sfc_MULTIPOLYGON", "sfc", "list") + + expect_warning( + tab <- Table$create(df), + "One of the columns given appears to be an" + ) + + # but the table was read fine, just sans (row-level) metadata + r_metadata <- .unserialize_arrow_r_metadata(tab$metadata$r) + expect_null(r_metadata$columns$x$columns) + + # But we can re-enable this / read data that has already been written with + # row-level metadata without a warning + withr::with_options( + list("arrow.preserve_row_level_metadata" = TRUE), { + expect_warning(tab <- Table$create(df), NA) + expect_identical(attr(as.data.frame(tab)$x[[1]], "foo"), "bar") + expect_identical(attr(as.data.frame(tab)$x[[2]], "baz"), "qux") + }) +}) + +test_that("dplyr with metadata", { + skip_if_not_available("dataset") + + compare_dplyr_binding( + .input %>% + collect(), + example_with_metadata + ) + compare_dplyr_binding( + .input %>% + select(a) %>% + collect(), + example_with_metadata + ) + compare_dplyr_binding( + .input %>% + mutate(z = b * 4) %>% + select(z, a) %>% + collect(), + example_with_metadata + ) + compare_dplyr_binding( + .input %>% + mutate(z = nchar(a)) %>% + select(z, a) %>% + collect(), + example_with_metadata + ) + # dplyr drops top-level attributes if you do summarize, though attributes + # of grouping columns appear to come through + compare_dplyr_binding( + .input %>% + group_by(a) %>% + summarize(n()) %>% + collect(), + example_with_metadata + ) + # Same name in output but different data, so the column metadata shouldn't + # carry through + compare_dplyr_binding( + .input %>% + mutate(a = nchar(a)) %>% + select(a) %>% + collect(), + example_with_metadata + ) +}) + +test_that("grouped_df metadata is recorded (efficiently)", { + grouped <- group_by(tibble(a = 1:2, b = 3:4), a) + expect_s3_class(grouped, "grouped_df") + grouped_tab <- Table$create(grouped) + expect_r6_class(grouped_tab, "Table") + expect_equal(grouped_tab$r_metadata$attributes$.group_vars, "a") +}) diff --git a/src/arrow/r/tests/testthat/test-na-omit.R b/src/arrow/r/tests/testthat/test-na-omit.R new file mode 100644 index 000000000..fafebb4ff --- /dev/null +++ b/src/arrow/r/tests/testthat/test-na-omit.R @@ -0,0 +1,94 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +data_no_na <- c(2:10) +data_na <- c(data_no_na, NA_real_) + +test_that("na.fail on Scalar", { + scalar_na <- Scalar$create(NA) + scalar_one <- Scalar$create(1) + expect_as_vector(na.fail(scalar_one), 1) + expect_error(na.fail(scalar_na), "missing values in object") +}) + +test_that("na.omit on Array and ChunkedArray", { + compare_expression(na.omit(.input), data_no_na) + compare_expression(na.omit(.input), data_na, ignore_attr = TRUE) +}) + +test_that("na.exclude on Array and ChunkedArray", { + compare_expression(na.exclude(.input), data_no_na) + compare_expression(na.exclude(.input), data_na, ignore_attr = TRUE) +}) + +test_that("na.fail on Array and ChunkedArray", { + compare_expression(na.fail(.input), data_no_na, ignore_attr = TRUE) + compare_expression_error(na.fail(.input), data_na) +}) + +test_that("na.fail on Scalar", { + scalar_one <- Scalar$create(1) + expect_error(na.fail(scalar_na), regexp = "missing values in object") + expect_as_vector(na.fail(scalar_one), na.fail(1)) +}) + +test_that("na.omit on Table", { + tbl <- Table$create(example_data) + expect_equal( + as.data.frame(na.omit(tbl)), + na.omit(example_data), + # We don't include an attribute with the rows omitted + ignore_attr = "na.action" + ) +}) + +test_that("na.exclude on Table", { + tbl <- Table$create(example_data) + expect_equal( + as.data.frame(na.exclude(tbl)), + na.exclude(example_data), + ignore_attr = "na.action" + ) +}) + +test_that("na.fail on Table", { + tbl <- Table$create(example_data) + expect_error(na.fail(tbl), "missing values in object") +}) + +test_that("na.omit on RecordBatch", { + batch <- record_batch(example_data) + expect_equal( + as.data.frame(na.omit(batch)), + na.omit(example_data), + ignore_attr = "na.action" + ) +}) + +test_that("na.exclude on RecordBatch", { + batch <- record_batch(example_data) + expect_equal( + as.data.frame(na.exclude(batch)), + na.omit(example_data), + ignore_attr = "na.action" + ) +}) + +test_that("na.fail on RecordBatch", { + batch <- record_batch(example_data) + expect_error(na.fail(batch), "missing values in object") +}) diff --git a/src/arrow/r/tests/testthat/test-parquet.R b/src/arrow/r/tests/testthat/test-parquet.R new file mode 100644 index 000000000..55d86b532 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-parquet.R @@ -0,0 +1,274 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("parquet") + +pq_file <- system.file("v0.7.1.parquet", package = "arrow") + +test_that("reading a known Parquet file to tibble", { + skip_if_not_available("snappy") + df <- read_parquet(pq_file) + expect_true(tibble::is_tibble(df)) + expect_identical(dim(df), c(10L, 11L)) + # TODO: assert more about the contents +}) + +test_that("simple int column roundtrip", { + df <- tibble::tibble(x = 1:5) + pq_tmp_file <- tempfile() # You can specify the .parquet here but that's probably not necessary + + write_parquet(df, pq_tmp_file) + df_read <- read_parquet(pq_tmp_file) + expect_equal(df, df_read) + # Make sure file connection is cleaned up + expect_error(file.remove(pq_tmp_file), NA) + expect_false(file.exists(pq_tmp_file)) +}) + +test_that("read_parquet() supports col_select", { + skip_if_not_available("snappy") + df <- read_parquet(pq_file, col_select = c(x, y, z)) + expect_equal(names(df), c("x", "y", "z")) + + df <- read_parquet(pq_file, col_select = starts_with("c")) + expect_equal(names(df), c("carat", "cut", "color", "clarity")) +}) + +test_that("read_parquet() with raw data", { + skip_if_not_available("snappy") + test_raw <- readBin(pq_file, what = "raw", n = 5000) + df <- read_parquet(test_raw) + expect_identical(dim(df), c(10L, 11L)) +}) + +test_that("write_parquet() handles various compression= specs", { + skip_if_not_available("snappy") + tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5) + + expect_parquet_roundtrip(tab, compression = "snappy") + expect_parquet_roundtrip(tab, compression = rep("snappy", 3L)) + expect_parquet_roundtrip(tab, compression = c(x1 = "snappy", x2 = "snappy")) +}) + +test_that("write_parquet() handles various compression_level= specs", { + skip_if_not_available("gzip") + tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5) + + expect_parquet_roundtrip(tab, compression = "gzip", compression_level = 4) + expect_parquet_roundtrip(tab, compression = "gzip", compression_level = rep(4L, 3L)) + expect_parquet_roundtrip(tab, compression = "gzip", compression_level = c(x1 = 5L, x2 = 3L)) +}) + +test_that("write_parquet() handles various use_dictionary= specs", { + tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5) + + expect_parquet_roundtrip(tab, use_dictionary = TRUE) + expect_parquet_roundtrip(tab, use_dictionary = c(TRUE, FALSE, TRUE)) + expect_parquet_roundtrip(tab, use_dictionary = c(x1 = TRUE, x2 = TRUE)) + expect_error( + write_parquet(tab, tempfile(), use_dictionary = c(TRUE, FALSE)), + "unsupported use_dictionary= specification" + ) + expect_error( + write_parquet(tab, tempfile(), use_dictionary = 12), + "is.logical(use_dictionary) is not TRUE", + fixed = TRUE + ) +}) + +test_that("write_parquet() handles various write_statistics= specs", { + tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5) + + expect_parquet_roundtrip(tab, write_statistics = TRUE) + expect_parquet_roundtrip(tab, write_statistics = c(TRUE, FALSE, TRUE)) + expect_parquet_roundtrip(tab, write_statistics = c(x1 = TRUE, x2 = TRUE)) +}) + +test_that("write_parquet() accepts RecordBatch too", { + batch <- RecordBatch$create(x1 = 1:5, x2 = 1:5, y = 1:5) + tab <- parquet_roundtrip(batch) + expect_equal(tab, Table$create(batch)) +}) + +test_that("write_parquet() handles grouped_df", { + library(dplyr, warn.conflicts = FALSE) + df <- tibble::tibble(a = 1:4, b = 5) %>% group_by(b) + # Since `df` is a "grouped_df", this test asserts that we get a grouped_df back + expect_parquet_roundtrip(df, as_data_frame = TRUE) +}) + +test_that("write_parquet() with invalid input type", { + bad_input <- Array$create(1:5) + expect_error( + write_parquet(bad_input, tempfile()), + regexp = "x must be an object of class 'data.frame', 'RecordBatch', or 'Table', not 'Array'." + ) +}) + +test_that("write_parquet() can truncate timestamps", { + tab <- Table$create(x1 = as.POSIXct("2020/06/03 18:00:00", tz = "UTC")) + expect_type_equal(tab$x1, timestamp("us", "UTC")) + + tf <- tempfile() + on.exit(unlink(tf)) + + write_parquet(tab, tf, coerce_timestamps = "ms", allow_truncated_timestamps = TRUE) + new <- read_parquet(tf, as_data_frame = FALSE) + expect_type_equal(new$x1, timestamp("ms", "UTC")) + expect_equal(as.data.frame(tab), as.data.frame(new)) +}) + +test_that("make_valid_version()", { + expect_equal(make_valid_version("1.0"), ParquetVersionType$PARQUET_1_0) + expect_equal(make_valid_version("2.0"), ParquetVersionType$PARQUET_2_0) + + expect_equal(make_valid_version(1), ParquetVersionType$PARQUET_1_0) + expect_equal(make_valid_version(2), ParquetVersionType$PARQUET_2_0) + + expect_equal(make_valid_version(1.0), ParquetVersionType$PARQUET_1_0) + expect_equal(make_valid_version(2.0), ParquetVersionType$PARQUET_2_0) +}) + +test_that("write_parquet() defaults to snappy compression", { + skip_if_not_available("snappy") + tmp1 <- tempfile() + tmp2 <- tempfile() + write_parquet(mtcars, tmp1) + write_parquet(mtcars, tmp2, compression = "snappy") + expect_equal(file.size(tmp1), file.size(tmp2)) +}) + +test_that("Factors are preserved when writing/reading from Parquet", { + fct <- factor(c("a", "b"), levels = c("c", "a", "b")) + ord <- factor(c("a", "b"), levels = c("c", "a", "b"), ordered = TRUE) + chr <- c("a", "b") + df <- tibble::tibble(fct = fct, ord = ord, chr = chr) + + pq_tmp_file <- tempfile() + on.exit(unlink(pq_tmp_file)) + + write_parquet(df, pq_tmp_file) + df_read <- read_parquet(pq_tmp_file) + expect_equal(df, df_read) +}) + +test_that("Lists are preserved when writing/reading from Parquet", { + bool <- list(logical(0), NA, c(TRUE, FALSE)) + int <- list(integer(0), NA_integer_, 1:4) + num <- list(numeric(0), NA_real_, c(1, 2)) + char <- list(character(0), NA_character_, c("itsy", "bitsy")) + df <- tibble::tibble(bool = bool, int = int, num = num, char = char) + + pq_tmp_file <- tempfile() + on.exit(unlink(pq_tmp_file)) + + write_parquet(df, pq_tmp_file) + df_read <- read_parquet(pq_tmp_file) + expect_equal(df, df_read, ignore_attr = TRUE) +}) + +test_that("write_parquet() to stream", { + df <- tibble::tibble(x = 1:5) + tf <- tempfile() + con <- FileOutputStream$create(tf) + on.exit(unlink(tf)) + write_parquet(df, con) + con$close() + expect_equal(read_parquet(tf), df) +}) + +test_that("write_parquet() returns its input", { + df <- tibble::tibble(x = 1:5) + tf <- tempfile() + on.exit(unlink(tf)) + df_out <- write_parquet(df, tf) + expect_equal(df, df_out) +}) + +test_that("write_parquet() handles version argument", { + df <- tibble::tibble(x = 1:5) + tf <- tempfile() + on.exit(unlink(tf)) + + purrr::walk(list("1.0", "2.0", 1.0, 2.0, 1L, 2L), ~ { + write_parquet(df, tf, version = .x) + expect_identical(read_parquet(tf), df) + }) + purrr::walk(list("3.0", 3.0, 3L, "A"), ~ { + expect_error(write_parquet(df, tf, version = .x)) + }) +}) + +test_that("ParquetFileWriter raises an error for non-OutputStream sink", { + sch <- schema(a = float32()) + # ARROW-9946 + expect_error( + ParquetFileWriter$create(schema = sch, sink = tempfile()), + regex = "OutputStream" + ) +}) + +test_that("ParquetFileReader $ReadRowGroup(s) methods", { + tab <- Table$create(x = 1:100) + tf <- tempfile() + on.exit(unlink(tf)) + write_parquet(tab, tf, chunk_size = 10) + + reader <- ParquetFileReader$create(tf) + expect_true(reader$ReadRowGroup(0) == Table$create(x = 1:10)) + expect_true(reader$ReadRowGroup(9) == Table$create(x = 91:100)) + expect_error(reader$ReadRowGroup(-1), "Some index in row_group_indices") + expect_error(reader$ReadRowGroup(111), "Some index in row_group_indices") + expect_error(reader$ReadRowGroup(c(1, 2))) + expect_error(reader$ReadRowGroup("a")) + + expect_true(reader$ReadRowGroups(c(0, 1)) == Table$create(x = 1:20)) + expect_error(reader$ReadRowGroups(c(0, 1, -2))) # although it gives a weird error + expect_error(reader$ReadRowGroups(c(0, 1, 31))) # ^^ + expect_error(reader$ReadRowGroups(c("a", "b"))) + + ## -- with column_indices + expect_true(reader$ReadRowGroup(0, 0) == Table$create(x = 1:10)) + expect_error(reader$ReadRowGroup(0, 1)) + + expect_true(reader$ReadRowGroups(c(0, 1), 0) == Table$create(x = 1:20)) + expect_error(reader$ReadRowGroups(c(0, 1), 1)) +}) + +test_that("Error messages are shown when the compression algorithm snappy is not found", { + msg <- paste0( + "NotImplemented: Support for codec 'snappy' not built\nIn order to read this file, ", + "you will need to reinstall arrow with additional features enabled.\nSet one of these ", + "environment variables before installing:\n\n * LIBARROW_MINIMAL=false (for all optional ", + "features, including 'snappy')\n * ARROW_WITH_SNAPPY=ON (for just 'snappy')\n\n", + "See https://arrow.apache.org/docs/r/articles/install.html for details" + ) + + if (codec_is_available("snappy")) { + d <- read_parquet(pq_file) + expect_s3_class(d, "data.frame") + } else { + expect_error(read_parquet(pq_file), msg, fixed = TRUE) + } +}) + +test_that("Error is created when parquet reads a feather file", { + expect_error( + read_parquet(test_path("golden-files/data-arrow_2.0.0_lz4.feather")), + "Parquet magic bytes not found in footer" + ) +}) diff --git a/src/arrow/r/tests/testthat/test-python-flight.R b/src/arrow/r/tests/testthat/test-python-flight.R new file mode 100644 index 000000000..c87f3a562 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-python-flight.R @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Assumes: +# * We've already done arrow::install_pyarrow() +# * R -e 'arrow::load_flight_server("demo_flight_server")$DemoFlightServer(port = 8089)$serve()' +# TODO: set up CI job to test this, or some way of running a background process +if (process_is_running("demo_flight_server")) { + client <- flight_connect(port = 8089) + flight_obj <- tempfile() + + test_that("flight_path_exists", { + expect_false(flight_path_exists(client, flight_obj)) + expect_false(flight_obj %in% list_flights(client)) + }) + + test_that("flight_put", { + flight_put(client, example_data, path = flight_obj) + expect_true(flight_path_exists(client, flight_obj)) + expect_true(flight_obj %in% list_flights(client)) + }) + + test_that("flight_get", { + expect_identical(as.data.frame(flight_get(client, flight_obj)), example_data) + }) + + test_that("flight_put with RecordBatch", { + flight_obj2 <- tempfile() + flight_put(client, RecordBatch$create(example_data), path = flight_obj2) + expect_identical(as.data.frame(flight_get(client, flight_obj2)), example_data) + }) + + test_that("flight_put with overwrite = FALSE", { + expect_error( + flight_put(client, example_with_times, path = flight_obj, overwrite = FALSE), + "exists" + ) + # Default is TRUE so this will overwrite + flight_put(client, example_with_times, path = flight_obj) + expect_identical(as.data.frame(flight_get(client, flight_obj)), example_with_times) + }) +} else { + # Kinda hacky, let's put a skipped test here, just so we note that the tests + # didn't run + test_that("Flight tests", { + skip("Flight server is not running") + }) +} diff --git a/src/arrow/r/tests/testthat/test-python.R b/src/arrow/r/tests/testthat/test-python.R new file mode 100644 index 000000000..5ad7513fb --- /dev/null +++ b/src/arrow/r/tests/testthat/test-python.R @@ -0,0 +1,145 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_that("install_pyarrow", { + skip_on_cran() + skip_if_offline() + skip_if_not_dev_mode() + # Windows CI machine doesn't pick up the right python or something + skip_on_os("windows") + skip_if_not_installed("reticulate") + + venv <- try(reticulate::virtualenv_create("arrow-test")) + # Bail out if virtualenv isn't available + skip_if(inherits(venv, "try-error")) + expect_error(install_pyarrow("arrow-test", nightly = TRUE), NA) + # Set this up for the following tests + reticulate::use_virtualenv("arrow-test") +}) + +skip_if_no_pyarrow() + +test_that("Array from Python", { + pa <- reticulate::import("pyarrow") + py <- pa$array(c(1, 2, 3)) + expect_equal(py, Array$create(c(1, 2, 3))) +}) + +test_that("Array to Python", { + pa <- reticulate::import("pyarrow", convert = FALSE) + r <- Array$create(c(1, 2, 3)) + py <- pa$concat_arrays(list(r)) + expect_s3_class(py, "pyarrow.lib.Array") + expect_equal(reticulate::py_to_r(py), r) +}) + +test_that("RecordBatch to/from Python", { + pa <- reticulate::import("pyarrow", convert = FALSE) + batch <- record_batch(col1 = c(1, 2, 3), col2 = letters[1:3]) + py <- reticulate::r_to_py(batch) + expect_s3_class(py, "pyarrow.lib.RecordBatch") + expect_equal(reticulate::py_to_r(py), batch) +}) + +test_that("Table and ChunkedArray from Python", { + pa <- reticulate::import("pyarrow", convert = FALSE) + batch <- record_batch(col1 = c(1, 2, 3), col2 = letters[1:3]) + tab <- Table$create(batch, batch) + pybatch <- reticulate::r_to_py(batch) + pytab <- pa$Table$from_batches(list(pybatch, pybatch)) + expect_s3_class(pytab, "pyarrow.lib.Table") + expect_s3_class(pytab[0], "pyarrow.lib.ChunkedArray") + expect_equal(reticulate::py_to_r(pytab[0]), tab$col1) + expect_equal(reticulate::py_to_r(pytab), tab) +}) + +test_that("Table and ChunkedArray to Python", { + batch <- record_batch(col1 = c(1, 2, 3), col2 = letters[1:3]) + tab <- Table$create(batch, batch) + + pychunked <- reticulate::r_to_py(tab$col1) + expect_s3_class(pychunked, "pyarrow.lib.ChunkedArray") + expect_equal(reticulate::py_to_r(pychunked), tab$col1) + + pytab <- reticulate::r_to_py(tab) + expect_s3_class(pytab, "pyarrow.lib.Table") + expect_equal(reticulate::py_to_r(pytab), tab) +}) + +test_that("RecordBatch with metadata roundtrip", { + batch <- RecordBatch$create(example_with_times) + pybatch <- reticulate::r_to_py(batch) + expect_s3_class(pybatch, "pyarrow.lib.RecordBatch") + expect_equal(reticulate::py_to_r(pybatch), batch) + expect_identical(as.data.frame(reticulate::py_to_r(pybatch)), example_with_times) +}) + +test_that("Table with metadata roundtrip", { + tab <- Table$create(example_with_times) + pytab <- reticulate::r_to_py(tab) + expect_s3_class(pytab, "pyarrow.lib.Table") + expect_equal(reticulate::py_to_r(pytab), tab) + expect_identical(as.data.frame(reticulate::py_to_r(pytab)), example_with_times) +}) + +test_that("DataType roundtrip", { + r <- timestamp("ms", timezone = "Pacific/Marquesas") + py <- reticulate::r_to_py(r) + expect_s3_class(py, "pyarrow.lib.DataType") + expect_equal(reticulate::py_to_r(py), r) +}) + +test_that("Field roundtrip", { + r <- field("x", time32("s")) + py <- reticulate::r_to_py(r) + expect_s3_class(py, "pyarrow.lib.Field") + expect_equal(reticulate::py_to_r(py), r) +}) + +test_that("RecordBatchReader to python", { + library(dplyr) + + tab <- Table$create(example_data) + scan <- tab %>% + select(int, lgl) %>% + filter(int > 6) %>% + Scanner$create() + reader <- scan$ToRecordBatchReader() + pyreader <- reticulate::r_to_py(reader) + expect_s3_class(pyreader, "pyarrow.lib.RecordBatchReader") + pytab <- pyreader$read_all() + expect_s3_class(pytab, "pyarrow.lib.Table") + back_to_r <- reticulate::py_to_r(pytab) + expect_r6_class(back_to_r, "Table") + expect_identical( + as.data.frame(back_to_r), + example_data %>% + select(int, lgl) %>% + filter(int > 6) + ) +}) + +test_that("RecordBatchReader from python", { + tab <- Table$create(example_data) + scan <- Scanner$create(tab) + reader <- scan$ToRecordBatchReader() + pyreader <- reticulate::r_to_py(reader) + back_to_r <- reticulate::py_to_r(pyreader) + rt_table <- back_to_r$read_table() + expect_r6_class(rt_table, "Table") + expect_identical(as.data.frame(rt_table), example_data) +}) diff --git a/src/arrow/r/tests/testthat/test-read-record-batch.R b/src/arrow/r/tests/testthat/test-read-record-batch.R new file mode 100644 index 000000000..ba109da6c --- /dev/null +++ b/src/arrow/r/tests/testthat/test-read-record-batch.R @@ -0,0 +1,78 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +test_that("RecordBatchFileWriter / RecordBatchFileReader roundtrips", { + tab <- Table$create( + int = 1:10, + dbl = as.numeric(1:10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + chr = letters[1:10] + ) + + tf <- tempfile() + expect_error( + RecordBatchFileWriter$create(tf, tab$schema), + "RecordBatchFileWriter$create() requires an Arrow InputStream. Try providing FileOutputStream$create(tf)", + fixed = TRUE + ) + + stream <- FileOutputStream$create(tf) + writer <- RecordBatchFileWriter$create(stream, tab$schema) + expect_r6_class(writer, "RecordBatchWriter") + writer$write_table(tab) + writer$close() + stream$close() + + expect_equal(read_feather(tf, as_data_frame = FALSE), tab) + # Make sure connections are closed + expect_error(file.remove(tf), NA) + skip_on_os("windows") # This should pass, we've closed the stream + expect_false(file.exists(tf)) +}) + +test_that("record_batch() handles (raw|Buffer|InputStream, Schema) (ARROW-3450, ARROW-3505)", { + tbl <- tibble::tibble( + int = 1:10, dbl = as.numeric(1:10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + chr = letters[1:10] + ) + batch <- record_batch(!!!tbl) + schema <- batch$schema + + raw <- batch$serialize() + batch2 <- record_batch(raw, schema = schema) + batch3 <- record_batch(buffer(raw), schema = schema) + stream <- BufferReader$create(raw) + stream$close() + + expect_equal(batch, batch2) + expect_equal(batch, batch3) +}) + +test_that("record_batch() can handle (Message, Schema) parameters (ARROW-3499)", { + batch <- record_batch(x = 1:10) + schema <- batch$schema + + raw <- batch$serialize() + stream <- BufferReader$create(raw) + + message <- read_message(stream) + batch2 <- record_batch(message, schema = schema) + expect_equal(batch, batch2) + stream$close() +}) diff --git a/src/arrow/r/tests/testthat/test-read-write.R b/src/arrow/r/tests/testthat/test-read-write.R new file mode 100644 index 000000000..66f6db56d --- /dev/null +++ b/src/arrow/r/tests/testthat/test-read-write.R @@ -0,0 +1,125 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +test_that("table round trip", { + tbl <- tibble::tibble( + int = 1:10, + dbl = as.numeric(1:10), + raw = as.raw(1:10) + ) + + tab <- Table$create(!!!tbl) + expect_equal(tab$num_columns, 3L) + expect_equal(tab$num_rows, 10L) + + # ChunkedArray + chunked_array_int <- tab$column(0) + expect_equal(chunked_array_int$length(), 10L) + expect_equal(chunked_array_int$null_count, 0L) + expect_equal(chunked_array_int$as_vector(), tbl$int) + + # Array + chunks_int <- chunked_array_int$chunks + expect_equal(length(chunks_int), chunked_array_int$num_chunks) + for (i in seq_along(chunks_int)) { + expect_equal(chunked_array_int$chunk(i - 1L), chunks_int[[i]]) + } + + # ChunkedArray + chunked_array_dbl <- tab$column(1) + expect_equal(chunked_array_dbl$length(), 10L) + expect_equal(chunked_array_dbl$null_count, 0L) + expect_equal(chunked_array_dbl$as_vector(), tbl$dbl) + + # Array + chunks_dbl <- chunked_array_dbl$chunks + expect_equal(length(chunks_dbl), chunked_array_dbl$num_chunks) + for (i in seq_along(chunks_dbl)) { + expect_equal(chunked_array_dbl$chunk(i - 1L), chunks_dbl[[i]]) + } + + # ChunkedArray + chunked_array_raw <- tab$column(2) + expect_equal(chunked_array_raw$length(), 10L) + expect_equal(chunked_array_raw$null_count, 0L) + expect_equal(chunked_array_raw$as_vector(), as.integer(tbl$raw)) + + # Array + chunks_raw <- chunked_array_raw$chunks + expect_equal(length(chunks_raw), chunked_array_raw$num_chunks) + for (i in seq_along(chunks_raw)) { + expect_equal(chunked_array_raw$chunk(i - 1L), chunks_raw[[i]]) + } + tf <- tempfile() + write_feather(tbl, tf) + + res <- read_feather(tf) + expect_identical(tbl$int, res$int) + expect_identical(tbl$dbl, res$dbl) + expect_identical(as.integer(tbl$raw), res$raw) + unlink(tf) +}) + +test_that("table round trip handles NA in integer and numeric", { + tbl <- tibble::tibble( + int = c(NA, 2:10), + dbl = as.numeric(c(1:5, NA, 7:9, NA)), + raw = as.raw(1:10) + ) + + tab <- Table$create(!!!tbl) + expect_equal(tab$num_columns, 3L) + expect_equal(tab$num_rows, 10L) + + expect_equal(tab$column(0)$length(), 10L) + expect_equal(tab$column(1)$length(), 10L) + expect_equal(tab$column(2)$length(), 10L) + + expect_equal(tab$column(0)$null_count, 1L) + expect_equal(tab$column(1)$null_count, 2L) + expect_equal(tab$column(2)$null_count, 0L) + + expect_equal(tab$column(0)$type, int32()) + expect_equal(tab$column(1)$type, float64()) + expect_equal(tab$column(2)$type, uint8()) + + tf <- tempfile() + write_feather(tbl, tf) + + res <- read_feather(tf) + expect_identical(tbl$int, res$int) + expect_identical(tbl$dbl, res$dbl) + expect_identical(as.integer(tbl$raw), res$raw) + + expect_true(is.na(res$int[1])) + expect_true(is.na(res$dbl[6])) + expect_true(is.na(res$dbl[10])) + unlink(tf) +}) + +test_that("reading/writing a raw vector (sparklyr integration)", { + # These are effectively what sparklyr calls to get data to/from Spark + read_from_raw_test <- function(x) { + as.data.frame(RecordBatchStreamReader$create(x)$read_next_batch()) + } + bytes <- write_to_raw(example_data) + expect_type(bytes, "raw") + expect_identical(read_from_raw_test(bytes), example_data) + # this could just be `read_ipc_stream(x)`; propose that + expect_identical(read_ipc_stream(bytes), example_data) +}) diff --git a/src/arrow/r/tests/testthat/test-record-batch-reader.R b/src/arrow/r/tests/testthat/test-record-batch-reader.R new file mode 100644 index 000000000..3992670dc --- /dev/null +++ b/src/arrow/r/tests/testthat/test-record-batch-reader.R @@ -0,0 +1,141 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +test_that("RecordBatchStreamReader / Writer", { + tbl <- tibble::tibble( + x = 1:10, + y = letters[1:10] + ) + batch <- record_batch(tbl) + tab <- Table$create(tbl) + + sink <- BufferOutputStream$create() + expect_equal(sink$tell(), 0) + writer <- RecordBatchStreamWriter$create(sink, batch$schema) + expect_r6_class(writer, "RecordBatchWriter") + writer$write(batch) + writer$write(tab) + writer$write(tbl) + expect_true(sink$tell() > 0) + writer$close() + + buf <- sink$finish() + expect_r6_class(buf, "Buffer") + + reader <- RecordBatchStreamReader$create(buf) + expect_r6_class(reader, "RecordBatchStreamReader") + + batch1 <- reader$read_next_batch() + expect_r6_class(batch1, "RecordBatch") + expect_equal(batch, batch1) + batch2 <- reader$read_next_batch() + expect_r6_class(batch2, "RecordBatch") + expect_equal(batch, batch2) + batch3 <- reader$read_next_batch() + expect_r6_class(batch3, "RecordBatch") + expect_equal(batch, batch3) + expect_null(reader$read_next_batch()) +}) + +test_that("RecordBatchFileReader / Writer", { + sink <- BufferOutputStream$create() + writer <- RecordBatchFileWriter$create(sink, batch$schema) + expect_r6_class(writer, "RecordBatchWriter") + writer$write(batch) + writer$write(tab) + writer$write(tbl) + writer$close() + + buf <- sink$finish() + expect_r6_class(buf, "Buffer") + + reader <- RecordBatchFileReader$create(buf) + expect_r6_class(reader, "RecordBatchFileReader") + + batch1 <- reader$get_batch(0) + expect_r6_class(batch1, "RecordBatch") + expect_equal(batch, batch1) + + expect_equal(reader$num_record_batches, 3) +}) + +test_that("StreamReader read_table", { + sink <- BufferOutputStream$create() + writer <- RecordBatchStreamWriter$create(sink, batch$schema) + expect_r6_class(writer, "RecordBatchWriter") + writer$write(batch) + writer$write(tab) + writer$write(tbl) + writer$close() + buf <- sink$finish() + + reader <- RecordBatchStreamReader$create(buf) + out <- reader$read_table() + expect_identical(dim(out), c(30L, 2L)) +}) + +test_that("FileReader read_table", { + sink <- BufferOutputStream$create() + writer <- RecordBatchFileWriter$create(sink, batch$schema) + expect_r6_class(writer, "RecordBatchWriter") + writer$write(batch) + writer$write(tab) + writer$write(tbl) + writer$close() + buf <- sink$finish() + + reader <- RecordBatchFileReader$create(buf) + out <- reader$read_table() + expect_identical(dim(out), c(30L, 2L)) +}) + +test_that("MetadataFormat", { + expect_identical(get_ipc_metadata_version(5), 4L) + expect_identical(get_ipc_metadata_version("V4"), 3L) + expect_identical(get_ipc_metadata_version(NULL), 4L) + Sys.setenv(ARROW_PRE_0_15_IPC_FORMAT = 1) + expect_identical(get_ipc_metadata_version(NULL), 3L) + Sys.setenv(ARROW_PRE_0_15_IPC_FORMAT = "") + + expect_identical(get_ipc_metadata_version(NULL), 4L) + Sys.setenv(ARROW_PRE_1_0_METADATA_VERSION = 1) + expect_identical(get_ipc_metadata_version(NULL), 3L) + Sys.setenv(ARROW_PRE_1_0_METADATA_VERSION = "") + + expect_error( + get_ipc_metadata_version(99), + "99 is not a valid IPC MetadataVersion" + ) + expect_error( + get_ipc_metadata_version("45"), + '"45" is not a valid IPC MetadataVersion' + ) +}) + +test_that("reader with 0 batches", { + # IPC stream containing only a schema (ARROW-10642) + sink <- BufferOutputStream$create() + writer <- RecordBatchStreamWriter$create(sink, schema(a = int32())) + writer$close() + buf <- sink$finish() + + reader <- RecordBatchStreamReader$create(buf) + tab <- reader$read_table() + expect_r6_class(tab, "Table") + expect_identical(dim(tab), c(0L, 1L)) +}) diff --git a/src/arrow/r/tests/testthat/test-s3-minio.R b/src/arrow/r/tests/testthat/test-s3-minio.R new file mode 100644 index 000000000..e2c1dc2e7 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-s3-minio.R @@ -0,0 +1,228 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +if (arrow_with_s3() && process_is_running("minio server")) { + # Get minio config, with expected defaults + minio_key <- Sys.getenv("MINIO_ACCESS_KEY", "minioadmin") + minio_secret <- Sys.getenv("MINIO_SECRET_KEY", "minioadmin") + minio_port <- Sys.getenv("MINIO_PORT", "9000") + + # Helper function for minio URIs + minio_uri <- function(...) { + template <- "s3://%s:%s@%s?scheme=http&endpoint_override=localhost%s%s" + sprintf(template, minio_key, minio_secret, minio_path(...), "%3A", minio_port) + } + minio_path <- function(...) paste(now, ..., sep = "/") + + test_that("minio setup", { + # Create a "bucket" on minio for this test run, which we'll delete when done. + fs <- S3FileSystem$create( + access_key = minio_key, + secret_key = minio_secret, + scheme = "http", + endpoint_override = paste0("localhost:", minio_port) + ) + expect_r6_class(fs, "S3FileSystem") + now <- as.character(as.numeric(Sys.time())) + # If minio isn't running, this will hang for a few seconds and fail with a + # curl timeout, causing `run_these` to be set to FALSE and skipping the tests + fs$CreateDir(now) + }) + # Clean up when we're all done + on.exit(fs$DeleteDir(now)) + + test_that("read/write Feather on minio", { + write_feather(example_data, minio_uri("test.feather")) + expect_identical(read_feather(minio_uri("test.feather")), example_data) + }) + + test_that("read/write Feather by filesystem, not URI", { + write_feather(example_data, fs$path(minio_path("test2.feather"))) + expect_identical( + read_feather(fs$path(minio_path("test2.feather"))), + example_data + ) + }) + + test_that("read/write stream", { + write_ipc_stream(example_data, fs$path(minio_path("test3.ipc"))) + expect_identical( + read_ipc_stream(fs$path(minio_path("test3.ipc"))), + example_data + ) + }) + + test_that("read/write Parquet on minio", { + skip_if_not_available("parquet") + write_parquet(example_data, fs$path(minio_uri("test.parquet"))) + expect_identical(read_parquet(minio_uri("test.parquet")), example_data) + }) + + if (arrow_with_dataset()) { + library(dplyr) + + make_temp_dir <- function() { + path <- tempfile() + dir.create(path) + normalizePath(path, winslash = "/") + } + + test_that("open_dataset with an S3 file (not directory) URI", { + skip_if_not_available("parquet") + expect_identical( + open_dataset(minio_uri("test.parquet")) %>% collect() %>% arrange(int), + example_data %>% arrange(int) + ) + }) + + test_that("open_dataset with vector of S3 file URIs", { + expect_identical( + open_dataset( + c(minio_uri("test.feather"), minio_uri("test2.feather")), + format = "feather" + ) %>% + arrange(int) %>% + collect(), + rbind(example_data, example_data) %>% arrange(int) + ) + }) + + test_that("open_dataset errors on URIs for different file systems", { + td <- make_temp_dir() + expect_error( + open_dataset( + c( + minio_uri("test.feather"), + paste0("file://", file.path(td, "fake.feather")) + ), + format = "feather" + ), + "Vectors of URIs for different file systems are not supported" + ) + }) + + # Dataset test setup, cf. test-dataset.R + first_date <- lubridate::ymd_hms("2015-04-29 03:12:39") + df1 <- tibble( + int = 1:10, + dbl = as.numeric(1:10), + lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2), + chr = letters[1:10], + fct = factor(LETTERS[1:10]), + ts = first_date + lubridate::days(1:10) + ) + + second_date <- lubridate::ymd_hms("2017-03-09 07:01:02") + df2 <- tibble( + int = 101:110, + dbl = as.numeric(51:60), + lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2), + chr = letters[10:1], + fct = factor(LETTERS[10:1]), + ts = second_date + lubridate::days(10:1) + ) + + # This is also to set up the dataset tests + test_that("write_parquet with filesystem arg", { + skip_if_not_available("parquet") + fs$CreateDir(minio_path("hive_dir", "group=1", "other=xxx")) + fs$CreateDir(minio_path("hive_dir", "group=2", "other=yyy")) + expect_length(fs$ls(minio_path("hive_dir")), 2) + write_parquet(df1, fs$path(minio_path("hive_dir", "group=1", "other=xxx", "file1.parquet"))) + write_parquet(df2, fs$path(minio_path("hive_dir", "group=2", "other=yyy", "file2.parquet"))) + expect_identical( + read_parquet(fs$path(minio_path("hive_dir", "group=1", "other=xxx", "file1.parquet"))), + df1 + ) + }) + + test_that("open_dataset with fs", { + ds <- open_dataset(fs$path(minio_path("hive_dir"))) + expect_identical( + ds %>% select(int, dbl, lgl) %>% collect() %>% arrange(int), + rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int) + ) + }) + + test_that("write_dataset with fs", { + ds <- open_dataset(fs$path(minio_path("hive_dir"))) + write_dataset(ds, fs$path(minio_path("new_dataset_dir"))) + expect_length(fs$ls(minio_path("new_dataset_dir")), 1) + }) + + test_that("Let's test copy_files too", { + td <- make_temp_dir() + copy_files(minio_uri("hive_dir"), td) + expect_length(dir(td), 2) + ds <- open_dataset(td) + expect_identical( + ds %>% select(int, dbl, lgl) %>% collect() %>% arrange(int), + rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int) + ) + + # Let's copy the other way and use a SubTreeFileSystem rather than URI + copy_files(td, fs$path(minio_path("hive_dir2"))) + ds2 <- open_dataset(fs$path(minio_path("hive_dir2"))) + expect_identical( + ds2 %>% select(int, dbl, lgl) %>% collect() %>% arrange(int), + rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int) + ) + }) + } + + test_that("S3FileSystem input validation", { + expect_error( + S3FileSystem$create(access_key = "foo"), + "Key authentication requires both access_key and secret_key" + ) + expect_error( + S3FileSystem$create(secret_key = "foo"), + "Key authentication requires both access_key and secret_key" + ) + expect_error( + S3FileSystem$create(session_token = "foo"), + paste0( + "In order to initialize a session with temporary credentials, ", + "both secret_key and access_key must be provided ", + "in addition to session_token." + ) + ) + expect_error( + S3FileSystem$create(access_key = "foo", secret_key = "asdf", anonymous = TRUE), + 'Cannot specify "access_key" and "secret_key" when anonymous = TRUE' + ) + expect_error( + S3FileSystem$create(access_key = "foo", secret_key = "asdf", role_arn = "qwer"), + "Cannot provide both key authentication and role_arn" + ) + expect_error( + S3FileSystem$create(access_key = "foo", secret_key = "asdf", external_id = "qwer"), + 'Cannot specify "external_id" without providing a role_arn string' + ) + expect_error( + S3FileSystem$create(external_id = "foo"), + 'Cannot specify "external_id" without providing a role_arn string' + ) + }) +} else { + # Kinda hacky, let's put a skipped test here, just so we note that the tests + # didn't run + test_that("S3FileSystem tests with Minio", { + skip("Minio is not running") + }) +} diff --git a/src/arrow/r/tests/testthat/test-s3.R b/src/arrow/r/tests/testthat/test-s3.R new file mode 100644 index 000000000..298b15bb8 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-s3.R @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +run_these <- tryCatch( + expr = { + if (arrow_with_s3() && + identical(tolower(Sys.getenv("ARROW_R_DEV")), "true") && + !identical(Sys.getenv("AWS_ACCESS_KEY_ID"), "") && + !identical(Sys.getenv("AWS_SECRET_ACCESS_KEY"), "")) { + # See if we have access to the test bucket + bucket <- s3_bucket("ursa-labs-r-test") + bucket$GetFileInfo("") + TRUE + } else { + FALSE + } + }, + error = function(e) FALSE +) + +bucket_uri <- function(..., bucket = "s3://ursa-labs-r-test/%s?region=us-west-2") { + segments <- paste(..., sep = "/") + sprintf(bucket, segments) +} + +if (run_these) { + now <- as.numeric(Sys.time()) + on.exit(bucket$DeleteDir(now)) + + test_that("read/write Feather on S3", { + write_feather(example_data, bucket_uri(now, "test.feather")) + expect_identical(read_feather(bucket_uri(now, "test.feather")), example_data) + }) + + test_that("read/write Parquet on S3", { + skip_if_not_available("parquet") + write_parquet(example_data, bucket_uri(now, "test.parquet")) + expect_identical(read_parquet(bucket_uri(now, "test.parquet")), example_data) + }) +} diff --git a/src/arrow/r/tests/testthat/test-scalar.R b/src/arrow/r/tests/testthat/test-scalar.R new file mode 100644 index 000000000..3afccf743 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-scalar.R @@ -0,0 +1,112 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +expect_scalar_roundtrip <- function(x, type) { + s <- Scalar$create(x) + expect_r6_class(s, "Scalar") + expect_equal(s$type, type) + expect_identical(length(s), 1L) + if (inherits(type, "NestedType")) { + # Should this be? Missing if all elements are missing? + # expect_identical(is.na(s), all(is.na(x))) # nolint + } else { + expect_identical(as.vector(is.na(s)), is.na(x)) + # MakeArrayFromScalar not implemented for list types + expect_as_vector(s, x) + } +} + +test_that("Scalar object roundtrip", { + expect_scalar_roundtrip(2, float64()) + expect_scalar_roundtrip(2L, int32()) + expect_scalar_roundtrip(c(2, 4), list_of(float64())) + expect_scalar_roundtrip(c(NA, NA), list_of(bool())) + expect_scalar_roundtrip(data.frame(a = 2, b = 4L), struct(a = double(), b = int32())) +}) + +test_that("Scalar print", { + expect_output(print(Scalar$create(4)), "Scalar\n4") +}) + +test_that("Creating Scalars of a different type and casting them", { + expect_equal(Scalar$create(4L, int8())$type, int8()) + expect_equal(Scalar$create(4L)$cast(float32())$type, float32()) +}) + +test_that("Scalar to Array", { + a <- Scalar$create(42) + expect_equal(a$as_array(), Array$create(42)) + expect_equal(Array$create(a), Array$create(42)) +}) + +test_that("Scalar$Equals", { + a <- Scalar$create(42) + aa <- Array$create(42) + b <- Scalar$create(42) + d <- Scalar$create(43) + expect_equal(a, b) + expect_true(a$Equals(b)) + expect_false(a$Equals(d)) + expect_false(a$Equals(aa)) +}) + +test_that("Scalar$ApproxEquals", { + a <- Scalar$create(1.0000000000001) + aa <- Array$create(1.0000000000001) + b <- Scalar$create(1.0) + d <- 2.400000000000001 + expect_false(a$Equals(b)) + expect_true(a$ApproxEquals(b)) + expect_false(a$ApproxEquals(d)) + expect_false(a$ApproxEquals(aa)) +}) + +test_that("Handling string data with embedded nuls", { + raws <- as.raw(c(0x6d, 0x61, 0x00, 0x6e)) + expect_error( + rawToChar(raws), + "embedded nul in string: 'ma\\0n'", # See? + fixed = TRUE + ) + scalar_with_nul <- Scalar$create(raws, binary())$cast(utf8()) + + # The behavior of the warnings/errors is slightly different with and without + # altrep. Without it (i.e. 3.5.0 and below, the error would trigger immediately + # on `as.vector()` where as with it, the error only happens on materialization) + skip_if_r_version("3.5.0") + v <- expect_error(as.vector(scalar_with_nul), NA) + expect_error( + v[1], + paste0( + "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, ", + "set options(arrow.skip_nul = TRUE)" + ), + fixed = TRUE + ) + + withr::with_options(list(arrow.skip_nul = TRUE), { + expect_warning( + expect_identical( + as.vector(scalar_with_nul)[], + "man" + ), + "Stripping '\\0' (nul) from character vector", + fixed = TRUE + ) + }) +}) diff --git a/src/arrow/r/tests/testthat/test-schema.R b/src/arrow/r/tests/testthat/test-schema.R new file mode 100644 index 000000000..8473550df --- /dev/null +++ b/src/arrow/r/tests/testthat/test-schema.R @@ -0,0 +1,220 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +test_that("Alternate type names are supported", { + expect_equal( + schema(b = double(), c = bool(), d = string(), e = float(), f = halffloat()), + schema(b = float64(), c = boolean(), d = utf8(), e = float32(), f = float16()) + ) + expect_equal(names(schema(b = double(), c = bool(), d = string())), c("b", "c", "d")) +}) + +test_that("Schema print method", { + expect_output( + print(schema(b = double(), c = bool(), d = string())), + paste( + "Schema", + "b: double", + "c: bool", + "d: string", + sep = "\n" + ), + fixed = TRUE + ) +}) + +test_that("Schema with non-nullable fields", { + expect_output( + print(schema(field("b", double()), + field("c", bool(), nullable = FALSE), + field("d", string()))), + paste( + "Schema", + "b: double", + "c: bool not null", + "d: string", + sep = "\n" + ), + fixed = TRUE + ) +}) + +test_that("Schema $GetFieldByName", { + schm <- schema(b = double(), c = string()) + expect_equal(schm$GetFieldByName("b"), field("b", double())) + expect_null(schm$GetFieldByName("f")) + # TODO: schema(b = double(), b = string())$GetFieldByName("b") # nolint + # also returns NULL and probably should error bc duplicated names +}) + +test_that("Schema extract (returns Field)", { + # TODO: should this return a Field or the Type? + # I think of Schema like list(name = type, name = type, ...) + # but in practice it is more like list(list(name, type), list(name, type), ...) + # -> Field names in a Schema may be duplicated + # -> Fields may have metadata (though we don't really handle that in R) + schm <- schema(b = double(), c = string()) + expect_equal(schm$b, field("b", double())) + expect_equal(schm[["b"]], field("b", double())) + expect_equal(schm[[1]], field("b", double())) + + expect_null(schm[["ZZZ"]]) + expect_error(schm[[42]]) # Should have better error message +}) + +test_that("Schema slicing", { + schm <- schema(b = double(), c = string(), d = int8()) + expect_equal(schm[2:3], schema(c = string(), d = int8())) + expect_equal(schm[-1], schema(c = string(), d = int8())) + expect_equal(schm[c("d", "c")], schema(d = int8(), c = string())) + expect_equal(schm[c(FALSE, TRUE, TRUE)], schema(c = string(), d = int8())) + expect_error(schm[c("c", "ZZZ")], 'Invalid field name: "ZZZ"') + expect_error(schm[c("XXX", "c", "ZZZ")], 'Invalid field names: "XXX" and "ZZZ"') +}) + +test_that("Schema modification", { + schm <- schema(b = double(), c = string(), d = int8()) + schm$c <- boolean() + expect_equal(schm, schema(b = double(), c = boolean(), d = int8())) + schm[["d"]] <- int16() + expect_equal(schm, schema(b = double(), c = boolean(), d = int16())) + schm$b <- NULL + expect_equal(schm, schema(c = boolean(), d = int16())) + # NULL assigning something that doesn't exist doesn't modify + schm$zzzz <- NULL + expect_equal(schm, schema(c = boolean(), d = int16())) + # Adding a field + schm$fff <- int32() + expect_equal(schm, schema(c = boolean(), d = int16(), fff = int32())) + + # By index + schm <- schema(b = double(), c = string(), d = int8()) + schm[[2]] <- int32() + expect_equal(schm, schema(b = double(), c = int32(), d = int8())) + + # Adding actual Fields + # If assigning by name, note that this can modify the resulting name + schm <- schema(b = double(), c = string(), d = int8()) + schm$c <- field("x", int32()) + expect_equal(schm, schema(b = double(), x = int32(), d = int8())) + schm[[2]] <- field("y", int64()) + expect_equal(schm, schema(b = double(), y = int64(), d = int8())) + + # Error handling + expect_error(schm$c <- 4, "value must be a DataType") + expect_error(schm[[-3]] <- int32(), "i not greater than 0") + expect_error(schm[[0]] <- int32(), "i not greater than 0") + expect_error(schm[[NA_integer_]] <- int32(), "!is.na(i) is not TRUE", fixed = TRUE) + expect_error(schm[[TRUE]] <- int32(), "i is not a numeric or integer vector") + expect_error(schm[[c(2, 4)]] <- int32(), "length(i) not equal to 1", fixed = TRUE) +}) + +test_that("Metadata is preserved when modifying Schema", { + schm <- schema(b = double(), c = string(), d = int8()) + schm$metadata$foo <- "bar" + expect_identical(schm$metadata, list(foo = "bar")) + schm$c <- field("x", int32()) + expect_identical(schm$metadata, list(foo = "bar")) +}) + +test_that("reading schema from Buffer", { + # TODO: this uses the streaming format, i.e. from RecordBatchStreamWriter + # maybe there is an easier way to serialize a schema + batch <- record_batch(x = 1:10) + expect_r6_class(batch, "RecordBatch") + + stream <- BufferOutputStream$create() + writer <- RecordBatchStreamWriter$create(stream, batch$schema) + expect_r6_class(writer, "RecordBatchWriter") + writer$close() + + buffer <- stream$finish() + expect_r6_class(buffer, "Buffer") + + reader <- MessageReader$create(buffer) + expect_r6_class(reader, "MessageReader") + + message <- reader$ReadNextMessage() + expect_r6_class(message, "Message") + expect_equal(message$type, MessageType$SCHEMA) + + stream <- BufferReader$create(buffer) + expect_r6_class(stream, "BufferReader") + message <- read_message(stream) + expect_r6_class(message, "Message") + expect_equal(message$type, MessageType$SCHEMA) +}) + +test_that("Input validation when creating a table with a schema", { + expect_error( + Table$create(b = 1, schema = c(b = float64())), # list not Schema + "`schema` must be an arrow::Schema or NULL" + ) +}) + +test_that("Schema$Equals", { + a <- schema(b = double(), c = bool()) + b <- a$WithMetadata(list(some = "metadata")) + + # different metadata + expect_failure(expect_equal(a, b)) + expect_false(a$Equals(b, check_metadata = TRUE)) + + # Metadata not checked + expect_equal(a, b, ignore_attr = TRUE) + + # Non-schema object + expect_false(a$Equals(42)) +}) + +test_that("unify_schemas", { + a <- schema(b = double(), c = bool()) + z <- schema(b = double(), k = utf8()) + expect_equal( + unify_schemas(a, z), + schema(b = double(), c = bool(), k = utf8()) + ) + # returns NULL when any arg is NULL + expect_null( + unify_schemas(a, NULL, z) + ) + # returns NULL when all args are NULL + expect_null( + unify_schemas(NULL, NULL) + ) + # errors when no args + expect_error( + unify_schemas(), + "Must provide at least one schema to unify" + ) +}) + +test_that("Schema to C-interface", { + schema <- schema(b = double(), c = bool()) + + # export the schema via the C-interface + ptr <- allocate_arrow_schema() + schema$export_to_c(ptr) + + # then import it and check that the roundtripped value is the same + circle <- Schema$import_from_c(ptr) + expect_equal(circle, schema) + + # must clean up the pointer or we leak + delete_arrow_schema(ptr) +}) diff --git a/src/arrow/r/tests/testthat/test-thread-pool.R b/src/arrow/r/tests/testthat/test-thread-pool.R new file mode 100644 index 000000000..baf410368 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-thread-pool.R @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +test_that("can set/get cpu thread pool capacity", { + old <- cpu_count() + set_cpu_count(19) + expect_equal(cpu_count(), 19L) + set_cpu_count(old) + expect_equal(cpu_count(), old) +}) + +test_that("can set/get I/O thread pool capacity", { + old <- io_thread_count() + set_io_thread_count(19) + expect_equal(io_thread_count(), 19L) + set_io_thread_count(old) + expect_equal(io_thread_count(), old) +}) diff --git a/src/arrow/r/tests/testthat/test-type.R b/src/arrow/r/tests/testthat/test-type.R new file mode 100644 index 000000000..3821fb450 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-type.R @@ -0,0 +1,211 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +test_that("type() gets the right type for arrow::Array", { + a <- Array$create(1:10) + expect_equal(type(a), a$type) +}) + +test_that("type() gets the right type for ChunkedArray", { + a <- chunked_array(1:10, 1:10) + expect_equal(type(a), a$type) +}) + +test_that("type() infers from R type", { + expect_equal(type(1:10), int32()) + expect_equal(type(1), float64()) + expect_equal(type(TRUE), boolean()) + expect_equal(type(raw()), uint8()) + expect_equal(type(""), utf8()) + expect_equal( + type(example_data$fct), + dictionary(int8(), utf8(), FALSE) + ) + expect_equal( + type(lubridate::ymd_hms("2019-02-14 13:55:05")), + timestamp(TimeUnit$MICRO, "UTC") + ) + expect_equal( + type(hms::hms(56, 34, 12)), + time32(unit = TimeUnit$SECOND) + ) + expect_equal( + type(bit64::integer64()), + int64() + ) +}) + +test_that("type() can infer struct types from data frames", { + df <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10]) + expect_equal(type(df), struct(x = int32(), y = float64(), z = utf8())) +}) + +test_that("DataType$Equals", { + a <- int32() + b <- int32() + z <- float64() + expect_true(a == b) + expect_true(a$Equals(b)) + expect_false(a == z) + expect_equal(a, b) + expect_failure(expect_equal(a, z)) + expect_failure(expect_equal(a, z)) + expect_false(a$Equals(32L)) +}) + +test_that("Masked data type functions still work", { + skip("Work around masking of data type functions (ARROW-12322)") + + # Works when type function is masked + string <- rlang::string + expect_equal( + Array$create("abc", type = string()), + arrow::string() + ) + rm(string) + + # Works when with non-Arrow function that returns an Arrow type + # when the non-Arrow function has the same name as a base R function... + str <- arrow::string + expect_equal( + Array$create("abc", type = str()), + arrow::string() + ) + rm(str) + + # ... and when it has the same name as an Arrow function + type <- arrow::string + expect_equal( + Array$create("abc", type = type()), + arrow::string() + ) + rm(type) + + # Works with local variable whose value is an Arrow type + type <- arrow::string() + expect_equal( + Array$create("abc", type = type), + arrow::string() + ) + rm(type) +}) + +test_that("Type strings are correctly canonicalized", { + # data types without arguments + expect_equal(canonical_type_str("int8"), int8()$ToString()) + expect_equal(canonical_type_str("int16"), int16()$ToString()) + expect_equal(canonical_type_str("int32"), int32()$ToString()) + expect_equal(canonical_type_str("int64"), int64()$ToString()) + expect_equal(canonical_type_str("uint8"), uint8()$ToString()) + expect_equal(canonical_type_str("uint16"), uint16()$ToString()) + expect_equal(canonical_type_str("uint32"), uint32()$ToString()) + expect_equal(canonical_type_str("uint64"), uint64()$ToString()) + expect_equal(canonical_type_str("float16"), float16()$ToString()) + expect_equal(canonical_type_str("halffloat"), halffloat()$ToString()) + expect_equal(canonical_type_str("float32"), float32()$ToString()) + expect_equal(canonical_type_str("float"), float()$ToString()) + expect_equal(canonical_type_str("float64"), float64()$ToString()) + expect_equal(canonical_type_str("double"), float64()$ToString()) + expect_equal(canonical_type_str("boolean"), boolean()$ToString()) + expect_equal(canonical_type_str("bool"), bool()$ToString()) + expect_equal(canonical_type_str("utf8"), utf8()$ToString()) + expect_equal(canonical_type_str("large_utf8"), large_utf8()$ToString()) + expect_equal(canonical_type_str("large_string"), large_utf8()$ToString()) + expect_equal(canonical_type_str("binary"), binary()$ToString()) + expect_equal(canonical_type_str("large_binary"), large_binary()$ToString()) + expect_equal(canonical_type_str("string"), arrow::string()$ToString()) + expect_equal(canonical_type_str("null"), null()$ToString()) + + # data types with arguments + expect_equal( + canonical_type_str("fixed_size_binary"), + sub("^([^([<]+).*$", "\\1", fixed_size_binary(42)$ToString()) + ) + expect_equal( + canonical_type_str("date32"), + sub("^([^([<]+).*$", "\\1", date32()$ToString()) + ) + expect_equal( + canonical_type_str("date64"), + sub("^([^([<]+).*$", "\\1", date64()$ToString()) + ) + expect_equal( + canonical_type_str("time32"), + sub("^([^([<]+).*$", "\\1", time32()$ToString()) + ) + expect_equal( + canonical_type_str("time64"), + sub("^([^([<]+).*$", "\\1", time64()$ToString()) + ) + expect_equal( + canonical_type_str("timestamp"), + sub("^([^([<]+).*$", "\\1", timestamp()$ToString()) + ) + expect_equal( + canonical_type_str("decimal"), + sub("^([^([<]+).*$", "\\1", decimal(3, 2)$ToString()) + ) + expect_equal( + canonical_type_str("struct"), + sub("^([^([<]+).*$", "\\1", struct(foo = int32())$ToString()) + ) + expect_equal( + canonical_type_str("list_of"), + sub("^([^([<]+).*$", "\\1", list_of(int32())$ToString()) + ) + expect_equal( + canonical_type_str("list"), + sub("^([^([<]+).*$", "\\1", list_of(int32())$ToString()) + ) + expect_equal( + canonical_type_str("large_list_of"), + sub("^([^([<]+).*$", "\\1", large_list_of(int32())$ToString()) + ) + expect_equal( + canonical_type_str("large_list"), + sub("^([^([<]+).*$", "\\1", large_list_of(int32())$ToString()) + ) + expect_equal( + canonical_type_str("fixed_size_list_of"), + sub("^([^([<]+).*$", "\\1", fixed_size_list_of(int32(), 42)$ToString()) + ) + expect_equal( + canonical_type_str("fixed_size_list"), + sub("^([^([<]+).*$", "\\1", fixed_size_list_of(int32(), 42)$ToString()) + ) + + # unsupported data types + expect_error( + canonical_type_str("decimal128(3, 2)"), + "parameters" + ) + expect_error( + canonical_type_str("list<item: int32>"), + "parameters" + ) + expect_error( + canonical_type_str("time32[s]"), + "parameters" + ) + + # unrecognized data types + expect_error( + canonical_type_str("foo"), + "Unrecognized" + ) +}) diff --git a/src/arrow/r/tests/testthat/test-utf.R b/src/arrow/r/tests/testthat/test-utf.R new file mode 100644 index 000000000..69d196274 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-utf.R @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +test_that("We handle non-UTF strings", { + # Move the code with non-UTF strings to a separate file so that we don't + # get a parse error on *cough* certain platforms + skip_on_cran() + source("latin1.R", encoding = "latin1") +}) |