author    Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-21 11:54:28 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-21 11:54:28 +0000
commit    e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree      64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/r/tests
parent    Initial commit. (diff)
Adding upstream version 18.2.2.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat
-rw-r--r-- src/arrow/r/tests/testthat.R | 27
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet | bin 0 -> 7862 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_lz4.feather | bin 0 -> 1650 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_uncompressed.feather | bin 0 -> 1354 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_zstd.feather | bin 0 -> 1626 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1.parquet | bin 0 -> 3603 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_lz4.feather | bin 0 -> 2858 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_uncompressed.feather | bin 0 -> 2626 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_zstd.feather | bin 0 -> 2842 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0.parquet | bin 0 -> 3965 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_lz4.feather | bin 0 -> 3162 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_uncompressed.feather | bin 0 -> 2930 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_zstd.feather | bin 0 -> 3146 bytes
-rw-r--r-- src/arrow/r/tests/testthat/helper-arrow.R | 69
-rw-r--r-- src/arrow/r/tests/testthat/helper-data.R | 191
-rw-r--r-- src/arrow/r/tests/testthat/helper-expectation.R | 320
-rw-r--r-- src/arrow/r/tests/testthat/helper-parquet.R | 29
-rw-r--r-- src/arrow/r/tests/testthat/helper-roundtrip.R | 44
-rw-r--r-- src/arrow/r/tests/testthat/helper-skip.R | 81
-rw-r--r-- src/arrow/r/tests/testthat/latin1.R | 76
-rw-r--r-- src/arrow/r/tests/testthat/test-Array.R | 963
-rw-r--r-- src/arrow/r/tests/testthat/test-RecordBatch.R | 690
-rw-r--r-- src/arrow/r/tests/testthat/test-Table.R | 549
-rw-r--r-- src/arrow/r/tests/testthat/test-altrep.R | 243
-rw-r--r-- src/arrow/r/tests/testthat/test-array-data.R | 33
-rw-r--r-- src/arrow/r/tests/testthat/test-arrow-info.R | 23
-rw-r--r-- src/arrow/r/tests/testthat/test-arrow.R | 78
-rw-r--r-- src/arrow/r/tests/testthat/test-backwards-compatibility.R | 121
-rw-r--r-- src/arrow/r/tests/testthat/test-buffer-reader.R | 38
-rw-r--r-- src/arrow/r/tests/testthat/test-buffer.R | 97
-rw-r--r-- src/arrow/r/tests/testthat/test-chunked-array.R | 468
-rw-r--r-- src/arrow/r/tests/testthat/test-chunked-array.txt | 103
-rw-r--r-- src/arrow/r/tests/testthat/test-compressed.R | 73
-rw-r--r-- src/arrow/r/tests/testthat/test-compute-aggregate.R | 434
-rw-r--r-- src/arrow/r/tests/testthat/test-compute-arith.R | 129
-rw-r--r-- src/arrow/r/tests/testthat/test-compute-no-bindings.R | 201
-rw-r--r-- src/arrow/r/tests/testthat/test-compute-sort.R | 155
-rw-r--r-- src/arrow/r/tests/testthat/test-compute-vector.R | 133
-rw-r--r-- src/arrow/r/tests/testthat/test-csv.R | 357
-rw-r--r-- src/arrow/r/tests/testthat/test-data-type.R | 429
-rw-r--r-- src/arrow/r/tests/testthat/test-dataset-csv.R | 290
-rw-r--r-- src/arrow/r/tests/testthat/test-dataset-dplyr.R | 340
-rw-r--r-- src/arrow/r/tests/testthat/test-dataset-uri.R | 123
-rw-r--r-- src/arrow/r/tests/testthat/test-dataset-write.R | 454
-rw-r--r-- src/arrow/r/tests/testthat/test-dataset.R | 696
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-arrange.R | 205
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-collapse.R | 235
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-count.R | 92
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-distinct.R | 104
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-filter.R | 412
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-funcs-conditional.R | 409
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-funcs-datetime.R | 304
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-funcs-math.R | 309
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-funcs-string.R | 1399
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-funcs-type.R | 627
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-group-by.R | 158
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-join.R | 175
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-mutate.R | 522
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-query.R | 296
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-select.R | 146
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-summarize.R | 881
-rw-r--r-- src/arrow/r/tests/testthat/test-duckdb.R | 217
-rw-r--r-- src/arrow/r/tests/testthat/test-expression.R | 128
-rw-r--r-- src/arrow/r/tests/testthat/test-feather.R | 256
-rw-r--r-- src/arrow/r/tests/testthat/test-field.R | 67
-rw-r--r-- src/arrow/r/tests/testthat/test-filesystem.R | 178
-rw-r--r-- src/arrow/r/tests/testthat/test-install-arrow.R | 37
-rw-r--r-- src/arrow/r/tests/testthat/test-json.R | 255
-rw-r--r-- src/arrow/r/tests/testthat/test-memory-pool.R | 26
-rw-r--r-- src/arrow/r/tests/testthat/test-message-reader.R | 85
-rw-r--r-- src/arrow/r/tests/testthat/test-message.R | 63
-rw-r--r-- src/arrow/r/tests/testthat/test-metadata.R | 369
-rw-r--r-- src/arrow/r/tests/testthat/test-na-omit.R | 94
-rw-r--r-- src/arrow/r/tests/testthat/test-parquet.R | 274
-rw-r--r-- src/arrow/r/tests/testthat/test-python-flight.R | 62
-rw-r--r-- src/arrow/r/tests/testthat/test-python.R | 145
-rw-r--r-- src/arrow/r/tests/testthat/test-read-record-batch.R | 78
-rw-r--r-- src/arrow/r/tests/testthat/test-read-write.R | 125
-rw-r--r-- src/arrow/r/tests/testthat/test-record-batch-reader.R | 141
-rw-r--r-- src/arrow/r/tests/testthat/test-s3-minio.R | 228
-rw-r--r-- src/arrow/r/tests/testthat/test-s3.R | 55
-rw-r--r-- src/arrow/r/tests/testthat/test-scalar.R | 112
-rw-r--r-- src/arrow/r/tests/testthat/test-schema.R | 220
-rw-r--r-- src/arrow/r/tests/testthat/test-thread-pool.R | 33
-rw-r--r-- src/arrow/r/tests/testthat/test-type.R | 211
-rw-r--r-- src/arrow/r/tests/testthat/test-utf.R | 24
86 files changed, 17814 insertions(+), 0 deletions(-)
diff --git a/src/arrow/r/tests/testthat.R b/src/arrow/r/tests/testthat.R
new file mode 100644
index 000000000..d0f5b1e0d
--- /dev/null
+++ b/src/arrow/r/tests/testthat.R
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+library(testthat)
+library(arrow)
+library(tibble)
+
+if (identical(tolower(Sys.getenv("ARROW_R_DEV", "false")), "true")) {
+ arrow_reporter <- MultiReporter$new(list(CheckReporter$new(), LocationReporter$new()))
+} else {
+ arrow_reporter <- check_reporter()
+}
+test_check("arrow", reporter = arrow_reporter)
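
The reporter choice above is keyed entirely to the ARROW_R_DEV environment variable; a minimal sketch of opting in before the tests run (R CMD check is the usual entry point that executes this script):

# Sketch: enable the verbose dev-mode reporter selected in testthat.R.
Sys.setenv(ARROW_R_DEV = "true")
# The check above then evaluates to TRUE and picks the MultiReporter:
identical(tolower(Sys.getenv("ARROW_R_DEV", "false")), "true")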
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet b/src/arrow/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet
new file mode 100644
index 000000000..3394be241
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_lz4.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_lz4.feather
new file mode 100644
index 000000000..d91acd0cc
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_lz4.feather
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_uncompressed.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_uncompressed.feather
new file mode 100644
index 000000000..0198024ec
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_uncompressed.feather
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_zstd.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_zstd.feather
new file mode 100644
index 000000000..f6788231c
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_zstd.feather
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1.parquet b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1.parquet
new file mode 100644
index 000000000..e1d589bf0
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1.parquet
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_lz4.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_lz4.feather
new file mode 100644
index 000000000..f3a71435a
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_lz4.feather
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_uncompressed.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_uncompressed.feather
new file mode 100644
index 000000000..1188ac669
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_uncompressed.feather
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_zstd.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_zstd.feather
new file mode 100644
index 000000000..056b26c17
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_zstd.feather
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0.parquet b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0.parquet
new file mode 100644
index 000000000..6c5911560
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0.parquet
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_lz4.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_lz4.feather
new file mode 100644
index 000000000..b65da7234
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_lz4.feather
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_uncompressed.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_uncompressed.feather
new file mode 100644
index 000000000..508903cb4
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_uncompressed.feather
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_zstd.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_zstd.feather
new file mode 100644
index 000000000..39c829fda
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_zstd.feather
Binary files differ
diff --git a/src/arrow/r/tests/testthat/helper-arrow.R b/src/arrow/r/tests/testthat/helper-arrow.R
new file mode 100644
index 000000000..545f2d044
--- /dev/null
+++ b/src/arrow/r/tests/testthat/helper-arrow.R
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Wrap testthat::test_that with a check for the C++ library
+options(..skip.tests = !arrow:::arrow_available())
+
+set.seed(1)
+
+MAX_INT <- 2147483647L
+
+# Make sure this is unset
+Sys.setenv(ARROW_PRE_0_15_IPC_FORMAT = "")
+
+# use the C locale for string collation (ARROW-12046)
+Sys.setlocale("LC_COLLATE", "C")
+
+# Set English language so that error messages aren't internationalized
+# (R CMD check does this, but in case you're running outside of check)
+Sys.setenv(LANGUAGE = "en")
+
+with_language <- function(lang, expr) {
+ old <- Sys.getenv("LANGUAGE")
+ # Check what this message is before changing languages; this will
+ # trigger caching the translations if the OS does that (some do).
+ # If the OS does cache, then we can't test changing languages safely.
+ before <- i18ize_error_messages()
+ Sys.setenv(LANGUAGE = lang)
+ on.exit({
+ Sys.setenv(LANGUAGE = old)
+ .cache$i18ized_error_pattern <<- NULL
+ })
+ if (!identical(before, i18ize_error_messages())) {
+ skip(paste("This OS either does not support changing languages to", lang, "or it caches translations"))
+ }
+ force(expr)
+}
+
+test_that <- function(what, code) {
+ testthat::test_that(what, {
+ skip_if(getOption("..skip.tests", TRUE), "arrow C++ library not available")
+ code
+ })
+}
+
+# Wrapper to run tests that only touch R code even when the C++ library isn't
+# available (so that at least some tests are run on those platforms)
+r_only <- function(code) {
+ withr::with_options(list(..skip.tests = FALSE), code)
+}
+
+make_temp_dir <- function() {
+ path <- tempfile()
+ dir.create(path)
+ normalizePath(path, winslash = "/")
+}
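
Because this file masks testthat::test_that() with a skipping wrapper, tests that exercise only R code re-enable themselves with r_only(); a short sketch built from the helpers above:

# Sketch: r_only() clears ..skip.tests so the wrapped test_that() above
# runs even when the Arrow C++ library is unavailable.
r_only({
  test_that("make_temp_dir() returns an existing directory", {
    td <- make_temp_dir()
    expect_true(dir.exists(td))
  })
})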
diff --git a/src/arrow/r/tests/testthat/helper-data.R b/src/arrow/r/tests/testthat/helper-data.R
new file mode 100644
index 000000000..c693e84b2
--- /dev/null
+++ b/src/arrow/r/tests/testthat/helper-data.R
@@ -0,0 +1,191 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+example_data <- tibble::tibble(
+ int = c(1:3, NA_integer_, 5:10),
+ dbl = c(1:8, NA, 10) + .1,
+ dbl2 = rep(5, 10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ false = logical(10),
+ chr = letters[c(1:5, NA, 7:10)],
+ fct = factor(letters[c(1:4, NA, NA, 7:10)])
+)
+
+example_with_metadata <- tibble::tibble(
+ a = structure("one", class = "special_string"),
+ b = 2,
+ c = tibble::tibble(
+ c1 = structure("inner", extra_attr = "something"),
+ c2 = 4,
+ c3 = 50
+ ),
+ d = "four"
+)
+
+attr(example_with_metadata, "top_level") <- list(
+ field_one = 12,
+ field_two = "more stuff"
+)
+
+haven_data <- tibble::tibble(
+ num = structure(c(5.1, 4.9),
+ format.spss = "F8.2"
+ ),
+ cat_int = structure(c(3, 1),
+ format.spss = "F8.0",
+ labels = c(first = 1, second = 2, third = 3),
+ class = c("haven_labelled", "vctrs_vctr", "double")
+ ),
+ cat_chr = structure(c("B", "B"),
+ labels = c(Alpha = "A", Beta = "B"),
+ class = c("haven_labelled", "vctrs_vctr", "character")
+ )
+)
+
+example_with_times <- tibble::tibble(
+ date = Sys.Date() + 1:10,
+ posixct = lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10,
+ posixct_tz = lubridate::ymd_hms("2018-10-07 19:04:05", tz = "US/Eastern") + 1:10,
+ posixlt = as.POSIXlt(lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10),
+ posixlt_tz = as.POSIXlt(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "US/Eastern") + 1:10)
+)
+
+verses <- list(
+ # Since we tend to test with dataframes with 10 rows, here are verses from
+ # "Milonga del moro judío", by Jorge Drexler. They are décimas, 10-line
+ # poems with a particular meter and rhyme scheme.
+ # (They also have non-ASCII characters, which is nice for testing)
+ c(
+ "Por cada muro, un lamento",
+ "En Jerusalén la dorada",
+ "Y mil vidas malgastadas",
+ "Por cada mandamiento",
+ "Yo soy polvo de tu viento",
+ "Y aunque sangro de tu herida",
+ "Y cada piedra querida",
+ "Guarda mi amor más profundo",
+ "No hay una piedra en el mundo",
+ "Que valga lo que una vida"
+ ),
+ c(
+ "No hay muerto que no me duela",
+ "No hay un bando ganador",
+ "No hay nada más que dolor",
+ "Y otra vida que se vuela",
+ "La guerra es muy mala escuela",
+ "No importa el disfraz que viste",
+ "Perdonen que no me aliste",
+ "Bajo ninguna bandera",
+ "Vale más cualquier quimera",
+ "Que un trozo de tela triste"
+ ),
+ c(
+ "Y a nadie le di permiso",
+ "Para matar en mi nombre",
+ "Un hombre no es más que un hombre",
+ "Y si hay Dios, así lo quiso",
+ "El mismo suelo que piso",
+ "Seguirá, yo me habré ido",
+ "Rumbo también del olvido",
+ "No hay doctrina que no vaya",
+ "Y no hay pueblo que no se haya",
+ "Creído el pueblo elegido"
+ )
+)
+
+make_big_string <- function() {
+ # This creates a character vector that would exceed the capacity of BinaryArray
+ rep(purrr::map_chr(2047:2050, ~ paste(sample(letters, ., replace = TRUE), collapse = "")), 2^18)
+}
+
+make_random_string_of_size <- function(size = 1) {
+ purrr::map_chr(1000 * size, ~ paste(sample(letters, ., replace = TRUE), collapse = ""))
+}
+
+make_string_of_size <- function(size = 1) {
+ paste(rep(letters, length.out = 1000 * size), collapse = "")
+}
+
+example_with_extra_metadata <- example_with_metadata
+attributes(example_with_extra_metadata$b) <- list(lots = rep(make_string_of_size(1), 100))
+
+example_with_logical_factors <- tibble::tibble(
+ starting_a_fight = factor(c(FALSE, TRUE, TRUE, TRUE)),
+ consoling_a_child = factor(c(TRUE, FALSE, TRUE, TRUE)),
+ petting_a_dog = factor(c(TRUE, TRUE, FALSE, TRUE)),
+ saying = c(
+ "shhhhh, it's ok",
+ "you wanna go outside?",
+ "you want your mommy?",
+ "hey buddy"
+ )
+)
+
+# The values in each column of this tibble are in ascending order. There are
+# some ties, so tests should use two or more columns to ensure deterministic
+# sort order. The Arrow C++ library orders strings lexicographically as byte
+# strings. The order of a string array sorted by Arrow will not match the order
+# of an equivalent character vector sorted by R unless you set the R collation
+# locale to "C" by running: Sys.setlocale("LC_COLLATE", "C")
+# These test scripts set that, but if you are running individual tests you might
+# need to set it manually. When finished, you can restore the default
+# collation locale by running: Sys.setlocale("LC_COLLATE")
+# In the future, the string collation locale used by the Arrow C++ library might
+# be configurable (ARROW-12046).
+example_data_for_sorting <- tibble::tibble(
+ int = c(-.Machine$integer.max, -101L, -100L, 0L, 0L, 1L, 100L, 1000L, .Machine$integer.max, NA_integer_),
+ dbl = c(
+ -Inf, -.Machine$double.xmax, -.Machine$double.xmin, 0, .Machine$double.xmin,
+ pi, .Machine$double.xmax, Inf, NaN, NA_real_
+ ),
+ chr = c("", "", "\"", "&", "ABC", "NULL", "a", "abc", "zzz", NA_character_),
+ lgl = c(rep(FALSE, 4L), rep(TRUE, 5L), NA),
+ dttm = lubridate::ymd_hms(c(
+ "0000-01-01 00:00:00",
+ "1919-05-29 13:08:55",
+ "1955-06-20 04:10:42",
+ "1973-06-30 11:38:41",
+ "1987-03-29 12:49:47",
+ "1991-06-11 19:07:01",
+ NA_character_,
+ "2017-08-21 18:26:40",
+ "2017-08-21 18:26:40",
+ "9999-12-31 23:59:59"
+ )),
+ grp = c(rep("A", 5), rep("B", 5))
+)
+
+# For Dataset tests
+first_date <- lubridate::ymd_hms("2015-04-29 03:12:39")
+df1 <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+ chr = letters[1:10],
+ fct = factor(LETTERS[1:10]),
+ ts = first_date + lubridate::days(1:10)
+)
+
+second_date <- lubridate::ymd_hms("2017-03-09 07:01:02")
+df2 <- tibble::tibble(
+ int = 101:110,
+ dbl = c(as.numeric(51:59), NaN),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+ chr = letters[10:1],
+ fct = factor(LETTERS[10:1]),
+ ts = second_date + lubridate::days(10:1)
+)
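
The collation caveat documented alongside example_data_for_sorting above is easy to reproduce in base R; a small illustration (values drawn from its chr column; a non-C default locale is assumed for the difference to show):

# Sketch: Arrow sorts strings byte-wise, like R under the "C" collation
# locale; a natural-language locale can order these differently.
chr <- c("\"", "&", "ABC", "NULL", "a", "abc", "zzz")
old <- Sys.getlocale("LC_COLLATE")
Sys.setlocale("LC_COLLATE", "C")
sort(chr)  # punctuation and uppercase sort before lowercase under "C"
Sys.setlocale("LC_COLLATE", old)  # restore the previous collation locale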
diff --git a/src/arrow/r/tests/testthat/helper-expectation.R b/src/arrow/r/tests/testthat/helper-expectation.R
new file mode 100644
index 000000000..ef6142bb4
--- /dev/null
+++ b/src/arrow/r/tests/testthat/helper-expectation.R
@@ -0,0 +1,320 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+expect_as_vector <- function(x, y, ...) {
+ expect_equal(as.vector(x), y, ...)
+}
+
+expect_data_frame <- function(x, y, ...) {
+ expect_equal(as.data.frame(x), y, ...)
+}
+
+expect_r6_class <- function(object, class) {
+ expect_s3_class(object, class)
+ expect_s3_class(object, "R6")
+}
+
+#' Mask `testthat::expect_equal()` in order to compare ArrowObjects using their
+#' `Equals` methods from the C++ library.
+expect_equal <- function(object, expected, ignore_attr = FALSE, ..., info = NULL, label = NULL) {
+ if (inherits(object, "ArrowObject") && inherits(expected, "ArrowObject")) {
+ mc <- match.call()
+ expect_true(
+ all.equal(object, expected, check.attributes = !ignore_attr),
+ info = info,
+ label = paste(rlang::as_label(mc[["object"]]), "==", rlang::as_label(mc[["expected"]]))
+ )
+ } else {
+ testthat::expect_equal(object, expected, ignore_attr = ignore_attr, ..., info = info, label = label)
+ }
+}
+
+expect_type_equal <- function(object, expected, ...) {
+ if (is.Array(object)) {
+ object <- object$type
+ }
+ if (is.Array(expected)) {
+ expected <- expected$type
+ }
+ expect_equal(object, expected, ...)
+}
+
+expect_match_arg_error <- function(object, values = c()) {
+ expect_error(object, paste0("'arg' .*", paste(dQuote(values), collapse = ", ")))
+}
+
+expect_deprecated <- expect_warning
+
+verify_output <- function(...) {
+ if (isTRUE(grepl("conda", R.Version()$platform))) {
+ skip("On conda")
+ }
+ testthat::verify_output(...)
+}
+
+#' Ensure that dplyr methods on Arrow objects return the same as for data frames
+#'
+#' This function compares the output of running a dplyr expression on a tibble
+#' or data.frame object against the output of the same expression run on
+#' Arrow Table and RecordBatch objects.
+#'
+#'
+#' @param expr A dplyr pipeline which must have `.input` as its start
+#' @param tbl A tibble or data.frame which will be substituted for `.input`
+#' @param skip_record_batch The skip message to show (if you should skip the
+#' RecordBatch test)
+#' @param skip_table The skip message to show (if you should skip the Table test)
+#' @param warning The expected warning from the RecordBatch and Table comparison
+#' paths, passed to `expect_warning()`. Special values:
+#' * `NA` (the default) for ensuring no warning message
+#' * `TRUE` is a special case to mean to check for the
+#' "not supported in Arrow; pulling data into R" message.
+#' @param ... additional arguments, passed to `expect_equal()`
+compare_dplyr_binding <- function(expr,
+ tbl,
+ skip_record_batch = NULL,
+ skip_table = NULL,
+ warning = NA,
+ ...) {
+
+ # Quote the contents of `expr` so that we can evaluate it a few different ways
+ expr <- rlang::enquo(expr)
+ # Get the expected output by evaluating expr on the .input data.frame using regular dplyr
+ expected <- rlang::eval_tidy(expr, rlang::new_data_mask(rlang::env(.input = tbl)))
+
+ if (isTRUE(warning)) {
+ # Special-case the simple warning:
+ # TODO: ARROW-13362 pick one of in or by and use it everywhere
+ warning <- "not supported (in|by) Arrow; pulling data into R"
+ }
+
+ skip_msg <- NULL
+
+ # Evaluate `expr` on a RecordBatch object and compare with `expected`
+ if (is.null(skip_record_batch)) {
+ expect_warning(
+ via_batch <- rlang::eval_tidy(
+ expr,
+ rlang::new_data_mask(rlang::env(.input = record_batch(tbl)))
+ ),
+ warning
+ )
+ expect_equal(via_batch, expected, ...)
+ } else {
+ skip_msg <- c(skip_msg, skip_record_batch)
+ }
+
+ # Evaluate `expr` on a Table object and compare with `expected`
+ if (is.null(skip_table)) {
+ expect_warning(
+ via_table <- rlang::eval_tidy(
+ expr,
+ rlang::new_data_mask(rlang::env(.input = arrow_table(tbl)))
+ ),
+ warning
+ )
+ expect_equal(via_table, expected, ...)
+ } else {
+ skip_msg <- c(skip_msg, skip_table)
+ }
+
+ if (!is.null(skip_msg)) {
+ skip(paste(skip_msg, collapse = "\n"))
+ }
+}
+
+#' Assert that Arrow dplyr methods error in the same way as methods on data.frame
+#'
+#' Comparing the error message generated when running expressions on R objects
+#' against the error message generated by running the same expression on Arrow
+#' Tables and RecordBatches.
+#'
+#' @param expr A dplyr pipeline which must have `.input` as its start
+#' @param tbl A tibble or data.frame which will be substituted for `.input`
+#' @param ... additional arguments, passed to `expect_error()`
+compare_dplyr_error <- function(expr,
+ tbl,
+ ...) {
+ # ensure we have supplied tbl
+ force(tbl)
+
+ expr <- rlang::enquo(expr)
+ msg <- tryCatch(
+ rlang::eval_tidy(expr, rlang::new_data_mask(rlang::env(.input = tbl))),
+ error = function(e) {
+ msg <- conditionMessage(e)
+
+ # The error here is of the form:
+ #
+ # Problem with `filter()` .input `..1`.
+ # x object 'b_var' not found
+ # ℹ Input `..1` is `chr == b_var`.
+ #
+ # but what we really care about is the `x` block
+ # so (temporarily) let's pull those blocks out when we find them
+ pattern <- i18ize_error_messages()
+
+ if (grepl(pattern, msg)) {
+ msg <- sub(paste0("^.*(", pattern, ").*$"), "\\1", msg)
+ }
+ msg
+ }
+ )
+ # make sure msg is a character object (i.e. there has been an error)
+ # If it did not error, we would get a data.frame or whatever
+ # This expectation will tell us "dplyr on data.frame errored is not TRUE"
+ expect_true(identical(typeof(msg), "character"), label = "dplyr on data.frame errored")
+
+ expect_error(
+ rlang::eval_tidy(
+ expr,
+ rlang::new_data_mask(rlang::env(.input = record_batch(tbl)))
+ ),
+ msg,
+ ...
+ )
+ expect_error(
+ rlang::eval_tidy(
+ expr,
+ rlang::new_data_mask(rlang::env(.input = arrow_table(tbl)))
+ ),
+ msg,
+ ...
+ )
+}
+
+#' Comparing the output of running expressions on R vectors against the same
+#' expression run on Arrow Arrays and ChunkedArrays.
+#'
+#' @param expr A vectorized R expression which must have `.input` as its start
+#' @param vec A vector which will be substituted for `.input`
+#' @param skip_array The skip message to show (if you should skip the Array test)
+#' @param skip_chunked_array The skip message to show (if you should skip the ChunkedArray test)
+#' @param ignore_attr Ignore differences in specified attributes?
+#' @param ... additional arguments, passed to `expect_as_vector()`
+compare_expression <- function(expr,
+ vec,
+ skip_array = NULL,
+ skip_chunked_array = NULL,
+ ignore_attr = FALSE,
+ ...) {
+ expr <- rlang::enquo(expr)
+ expected <- rlang::eval_tidy(expr, rlang::new_data_mask(rlang::env(.input = vec)))
+ skip_msg <- NULL
+
+ if (is.null(skip_array)) {
+ via_array <- rlang::eval_tidy(
+ expr,
+ rlang::new_data_mask(rlang::env(.input = Array$create(vec)))
+ )
+ expect_as_vector(via_array, expected, ignore_attr, ...)
+ } else {
+ skip_msg <- c(skip_msg, skip_array)
+ }
+
+ if (is.null(skip_chunked_array)) {
+ # split input vector into two to exercise ChunkedArray with >1 chunk
+ split_vector <- split_vector_as_list(vec)
+
+ via_chunked <- rlang::eval_tidy(
+ expr,
+ rlang::new_data_mask(rlang::env(.input = ChunkedArray$create(split_vector[[1]], split_vector[[2]])))
+ )
+ expect_as_vector(via_chunked, expected, ignore_attr, ...)
+ } else {
+ skip_msg <- c(skip_msg, skip_chunked_array)
+ }
+
+ if (!is.null(skip_msg)) {
+ skip(paste(skip_msg, collapse = "\n"))
+ }
+}
+
+#' Comparing the error message generated when running expressions on R objects
+#' against the error message generated by running the same expression on Arrow
+#' Arrays and ChunkedArrays.
+#'
+#' @param expr An R expression which must have `.input` as its start
+#' @param vec A vector which will be substituted for `.input`
+#' @param skip_array The skip message to show (if you should skip the Array test)
+#' @param skip_chunked_array The skip message to show (if you should skip the ChunkedArray test)
+#' @param ... additional arguments, passed to `expect_error()`
+compare_expression_error <- function(expr,
+ vec,
+ skip_array = NULL,
+ skip_chunked_array = NULL,
+ ...) {
+ expr <- rlang::enquo(expr)
+
+ msg <- tryCatch(
+ rlang::eval_tidy(expr, rlang::new_data_mask(rlang::env(.input = vec))),
+ error = function(e) {
+ msg <- conditionMessage(e)
+
+ pattern <- i18ize_error_messages()
+
+ if (grepl(pattern, msg)) {
+ msg <- sub(paste0("^.*(", pattern, ").*$"), "\\1", msg)
+ }
+ msg
+ }
+ )
+
+ expect_true(identical(typeof(msg), "character"), label = "vector errored")
+
+ skip_msg <- NULL
+
+ if (is.null(skip_array)) {
+ expect_error(
+ rlang::eval_tidy(
+ expr,
+ rlang::new_data_mask(rlang::env(.input = Array$create(vec)))
+ ),
+ msg,
+ ...
+ )
+ } else {
+ skip_msg <- c(skip_msg, skip_array)
+ }
+
+ if (is.null(skip_chunked_array)) {
+ # split input vector into two to exercise ChunkedArray with >1 chunk
+ split_vector <- split_vector_as_list(vec)
+
+ expect_error(
+ rlang::eval_tidy(
+ expr,
+ rlang::new_data_mask(rlang::env(.input = ChunkedArray$create(split_vector[[1]], split_vector[[2]])))
+ ),
+ msg,
+ ...
+ )
+ } else {
+ skip_msg <- c(skip_msg, skip_chunked_array)
+ }
+
+ if (!is.null(skip_msg)) {
+ skip(paste(skip_msg, collapse = "\n"))
+ }
+}
+
+split_vector_as_list <- function(vec) {
+ vec_split <- length(vec) %/% 2
+ vec1 <- vec[seq(from = min(1, length(vec) - 1), to = min(length(vec) - 1, vec_split), by = 1)]
+ vec2 <- vec[seq(from = min(length(vec), vec_split + 1), to = length(vec), by = 1)]
+ list(vec1, vec2)
+}
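
A typical compare_dplyr_binding() call, per its documentation above, starts the pipeline at .input and ends with collect() so all three results are comparable as data frames; a hedged sketch (the column names come from example_data in helper-data.R):

# Sketch: the same pipeline runs on a tibble, a RecordBatch, and a
# Table, and all three results are expected to match.
library(dplyr)
compare_dplyr_binding(
  .input %>%
    filter(int > 5) %>%
    select(int, chr) %>%
    collect(),
  example_data
)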
diff --git a/src/arrow/r/tests/testthat/helper-parquet.R b/src/arrow/r/tests/testthat/helper-parquet.R
new file mode 100644
index 000000000..a0dd445bb
--- /dev/null
+++ b/src/arrow/r/tests/testthat/helper-parquet.R
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+expect_parquet_roundtrip <- function(tab, ...) {
+ expect_equal(parquet_roundtrip(tab, ...), tab)
+}
+
+parquet_roundtrip <- function(x, ..., as_data_frame = FALSE) {
+ # write/read parquet, returns Table
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write_parquet(x, tf, ...)
+ read_parquet(tf, as_data_frame = as_data_frame)
+}
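
In use, the helpers above round-trip a Table through a temporary Parquet file; a minimal sketch (the compression codec shown is just one example of what the ... argument forwards to write_parquet()):

# Sketch: round-trip a small Table through Parquet.
tab <- arrow_table(x = 1:3, y = c("a", "b", "c"))
expect_parquet_roundtrip(tab)
# Extra arguments flow through to write_parquet(), e.g. a codec:
expect_parquet_roundtrip(tab, compression = "snappy")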
diff --git a/src/arrow/r/tests/testthat/helper-roundtrip.R b/src/arrow/r/tests/testthat/helper-roundtrip.R
new file mode 100644
index 000000000..80bcb42f1
--- /dev/null
+++ b/src/arrow/r/tests/testthat/helper-roundtrip.R
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+expect_array_roundtrip <- function(x, type, as = NULL) {
+ a <- Array$create(x, type = as)
+ expect_equal(a$type, type)
+ expect_identical(length(a), length(x))
+ if (!inherits(type, c("ListType", "LargeListType", "FixedSizeListType"))) {
+ # TODO: revisit how missingness works with ListArrays
+ # R list objects don't handle missingness the same way as other vectors.
+ # Is there some vctrs thing we should do on the roundtrip back to R?
+ expect_as_vector(is.na(a), is.na(x))
+ }
+ roundtrip <- as.vector(a)
+ expect_equal(roundtrip, x, ignore_attr = TRUE)
+ # Make sure the storage mode is the same on roundtrip (esp. integer vs. numeric)
+ expect_identical(typeof(roundtrip), typeof(x))
+
+ if (length(x)) {
+ a_sliced <- a$Slice(1)
+ x_sliced <- x[-1]
+ expect_equal(a_sliced$type, type)
+ expect_identical(length(a_sliced), length(x_sliced))
+ if (!inherits(type, c("ListType", "LargeListType", "FixedSizeListType"))) {
+ expect_as_vector(is.na(a_sliced), is.na(x_sliced))
+ }
+ expect_as_vector(a_sliced, x_sliced, ignore_attr = TRUE)
+ }
+ invisible(a)
+}
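
Since expect_array_roundtrip() returns the Array invisibly, callers can chain further assertions on it; a short sketch:

# Sketch: assert the round-trip, then inspect the returned Array.
a <- expect_array_roundtrip(c(1:3, NA_integer_), int32())
expect_equal(a$null_count, 1L)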
diff --git a/src/arrow/r/tests/testthat/helper-skip.R b/src/arrow/r/tests/testthat/helper-skip.R
new file mode 100644
index 000000000..4256ec4ab
--- /dev/null
+++ b/src/arrow/r/tests/testthat/helper-skip.R
@@ -0,0 +1,81 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+build_features <- c(
+ arrow_info()$capabilities,
+ # Special handling for "uncompressed", for tests that iterate over compressions
+ uncompressed = TRUE
+)
+
+skip_if_not_available <- function(feature) {
+ if (feature == "re2") {
+ # RE2 does not support valgrind (on purpose): https://github.com/google/re2/issues/177
+ skip_on_valgrind()
+ }
+
+ yes <- feature %in% names(build_features) && build_features[feature]
+ if (!yes) {
+ skip(paste("Arrow C++ not built with", feature))
+ }
+}
+
+skip_if_no_pyarrow <- function() {
+ skip_on_valgrind()
+ skip_on_os("windows")
+
+ skip_if_not_installed("reticulate")
+ if (!reticulate::py_module_available("pyarrow")) {
+ skip("pyarrow not available for testing")
+ }
+}
+
+skip_if_not_dev_mode <- function() {
+ skip_if_not(
+ identical(tolower(Sys.getenv("ARROW_R_DEV")), "true"),
+ "environment variable ARROW_R_DEV"
+ )
+}
+
+skip_if_not_running_large_memory_tests <- function() {
+ skip_if_not(
+ identical(tolower(Sys.getenv("ARROW_LARGE_MEMORY_TESTS")), "true"),
+ "environment variable ARROW_LARGE_MEMORY_TESTS"
+ )
+}
+
+skip_on_valgrind <- function() {
+ # This does not actually skip on valgrind because we can't exactly detect it.
+ # Instead, it skips on CRAN when the OS is Linux and the R version is development
+ # (which is where valgrind is run as of this code)
+ linux_dev <- identical(tolower(Sys.info()[["sysname"]]), "linux") &&
+ grepl("devel", R.version.string)
+
+ if (linux_dev) {
+ skip_on_cran()
+ }
+}
+
+skip_if_r_version <- function(r_version) {
+ if (getRversion() <= r_version) {
+ skip(paste("R version:", getRversion()))
+ }
+}
+
+process_is_running <- function(x) {
+ cmd <- sprintf("ps aux | grep '%s' | grep -v grep", x)
+ tryCatch(system(cmd, ignore.stdout = TRUE) == 0, error = function(e) FALSE)
+}
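
Tests gate on optional C++ features by passing a capability name from arrow_info() to skip_if_not_available(); a hedged sketch ("zstd" as the feature name here is illustrative):

# Sketch: skip unless the C++ build was compiled with zstd support.
test_that("zstd capability is reported by the build", {
  skip_if_not_available("zstd")
  expect_true(build_features[["zstd"]])
})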
diff --git a/src/arrow/r/tests/testthat/latin1.R b/src/arrow/r/tests/testthat/latin1.R
new file mode 100644
index 000000000..150192d31
--- /dev/null
+++ b/src/arrow/r/tests/testthat/latin1.R
@@ -0,0 +1,76 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+x <- iconv("Veitingasta\u00f0ir", to = "latin1")
+df <- tibble::tibble(
+ chr = x,
+ fct = as.factor(x)
+)
+names(df) <- iconv(paste(x, names(df), sep = "_"), to = "latin1")
+df_struct <- tibble::tibble(a = df)
+
+raw_schema <- list(utf8(), dictionary(int8(), utf8()))
+names(raw_schema) <- names(df)
+
+# Confirm setup
+expect_identical(Encoding(x), "latin1")
+expect_identical(Encoding(names(df)), c("latin1", "latin1"))
+expect_identical(Encoding(df[[1]]), "latin1")
+expect_identical(Encoding(levels(df[[2]])), "latin1")
+
+# Array
+expect_identical(as.vector(Array$create(x)), x)
+# struct
+expect_identical(as.vector(Array$create(df)), df)
+
+# ChunkedArray
+expect_identical(as.vector(ChunkedArray$create(x)), x)
+# struct
+expect_identical(as.vector(ChunkedArray$create(df)), df)
+
+# Table (including field name)
+expect_identical(as.data.frame(Table$create(df)), df)
+expect_identical(as.data.frame(Table$create(df_struct)), df_struct)
+
+# RecordBatch
+expect_identical(as.data.frame(record_batch(df)), df)
+expect_identical(as.data.frame(record_batch(df_struct)), df_struct)
+
+# Schema field name
+df_schema <- do.call(schema, raw_schema)
+expect_identical(names(df_schema), names(df))
+
+df_struct_schema <- schema(a = do.call(struct, raw_schema))
+# StructType doesn't expose names (in C++)
+# expect_identical(names(df_struct_schema$a), names(df))
+
+# Create table/batch with schema
+expect_identical(as.data.frame(Table$create(df, schema = df_schema)), df)
+expect_identical(as.data.frame(Table$create(df_struct, schema = df_struct_schema)), df_struct)
+expect_identical(as.data.frame(record_batch(df, schema = df_schema)), df)
+expect_identical(as.data.frame(record_batch(df_struct, schema = df_struct_schema)), df_struct)
+
+# Serialization
+feather_file <- tempfile()
+write_feather(df_struct, feather_file)
+expect_identical(read_feather(feather_file), df_struct)
+
+if (arrow_with_parquet()) {
+ parquet_file <- tempfile()
+ write_parquet(df, parquet_file) # Parquet doesn't yet support nested types
+ expect_identical(read_parquet(parquet_file), df)
+}
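
The setup at the top of this script relies on iconv() marking non-ASCII strings as latin1; a small illustration of why the input must contain non-ASCII characters (the café example is illustrative):

# Sketch: ASCII-only strings stay Encoding "unknown", so only strings with
# non-ASCII characters get the "latin1" mark the assertions above check.
Encoding(iconv("restaurant", to = "latin1"))  # "unknown"
Encoding(iconv("caf\u00e9", to = "latin1"))   # "latin1"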
diff --git a/src/arrow/r/tests/testthat/test-Array.R b/src/arrow/r/tests/testthat/test-Array.R
new file mode 100644
index 000000000..ce23c2609
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-Array.R
@@ -0,0 +1,963 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("Integer Array", {
+ ints <- c(1:10, 1:10, 1:5)
+ x <- expect_array_roundtrip(ints, int32())
+})
+
+test_that("binary Array", {
+ # if the type is given, we just need a list of raw vectors
+ bin <- list(as.raw(1:10), as.raw(1:10))
+ expect_array_roundtrip(bin, binary(), as = binary())
+ expect_array_roundtrip(bin, large_binary(), as = large_binary())
+ expect_array_roundtrip(bin, fixed_size_binary(10), as = fixed_size_binary(10))
+
+ bin[[1L]] <- as.raw(1:20)
+ expect_error(Array$create(bin, fixed_size_binary(10)))
+
+ # otherwise the arrow type is deduced from the R classes
+ bin <- vctrs::new_vctr(
+ list(as.raw(1:10), as.raw(11:20)),
+ class = "arrow_binary"
+ )
+ expect_array_roundtrip(bin, binary())
+
+ bin <- vctrs::new_vctr(
+ list(as.raw(1:10), as.raw(11:20)),
+ class = "arrow_large_binary"
+ )
+ expect_array_roundtrip(bin, large_binary())
+
+ bin <- vctrs::new_vctr(
+ list(as.raw(1:10), as.raw(11:20)),
+ class = "arrow_fixed_size_binary",
+ byte_width = 10L
+ )
+ expect_array_roundtrip(bin, fixed_size_binary(byte_width = 10))
+
+ # degenerate cases
+ skip_on_valgrind() # valgrind errors on these tests ARROW-12638
+ bin <- vctrs::new_vctr(
+ list(1:10),
+ class = "arrow_binary"
+ )
+ expect_error(Array$create(bin))
+
+ bin <- vctrs::new_vctr(
+ list(1:10),
+ ptype = raw(),
+ class = "arrow_large_binary"
+ )
+ expect_error(Array$create(bin))
+
+ bin <- vctrs::new_vctr(
+ list(1:10),
+ class = "arrow_fixed_size_binary",
+ byte_width = 10
+ )
+ expect_error(Array$create(bin))
+
+ bin <- vctrs::new_vctr(
+ list(as.raw(1:5)),
+ class = "arrow_fixed_size_binary",
+ byte_width = 10
+ )
+ expect_error(Array$create(bin))
+
+ bin <- vctrs::new_vctr(
+ list(as.raw(1:5)),
+ class = "arrow_fixed_size_binary"
+ )
+ expect_error(Array$create(bin))
+})
+
+test_that("Slice() and RangeEquals()", {
+ ints <- c(1:10, 101:110, 201:205)
+ x <- Array$create(ints)
+
+ y <- x$Slice(10)
+ expect_equal(y$type, int32())
+ expect_equal(length(y), 15L)
+ expect_as_vector(y, c(101:110, 201:205))
+ expect_true(x$RangeEquals(y, 10, 24))
+ expect_false(x$RangeEquals(y, 9, 23))
+ expect_false(x$RangeEquals(y, 11, 24))
+
+ z <- x$Slice(10, 5)
+ expect_as_vector(z, c(101:105))
+ expect_true(x$RangeEquals(z, 10, 15, 0))
+
+ # Input validation
+ expect_error(x$Slice("ten"))
+ expect_error(x$Slice(NA_integer_), "Slice 'offset' cannot be NA")
+ expect_error(x$Slice(NA), "Slice 'offset' cannot be NA")
+ expect_error(x$Slice(10, "ten"))
+ expect_error(x$Slice(10, NA_integer_), "Slice 'length' cannot be NA")
+ expect_error(x$Slice(NA_integer_, NA_integer_), "Slice 'offset' cannot be NA")
+ expect_error(x$Slice(c(10, 10)))
+ expect_error(x$Slice(10, c(10, 10)))
+ expect_error(x$Slice(1000), "Slice 'offset' greater than array length")
+ expect_error(x$Slice(-1), "Slice 'offset' cannot be negative")
+ expect_error(z$Slice(10, 10), "Slice 'offset' greater than array length")
+ expect_error(x$Slice(10, -1), "Slice 'length' cannot be negative")
+ expect_error(x$Slice(-1, 10), "Slice 'offset' cannot be negative")
+
+ expect_warning(x$Slice(10, 15), NA)
+ expect_warning(
+ overslice <- x$Slice(10, 16),
+ "Slice 'length' greater than available length"
+ )
+ expect_equal(length(overslice), 15)
+ expect_warning(z$Slice(2, 10), "Slice 'length' greater than available length")
+
+ expect_error(x$RangeEquals(10, 24, 0), 'other must be a "Array"')
+ expect_error(x$RangeEquals(y, NA, 24), "'start_idx' cannot be NA")
+ expect_error(x$RangeEquals(y, 10, NA), "'end_idx' cannot be NA")
+ expect_error(x$RangeEquals(y, 10, 24, NA), "'other_start_idx' cannot be NA")
+ expect_error(x$RangeEquals(y, "ten", 24))
+
+ skip("TODO: (if anyone uses RangeEquals)")
+ expect_error(x$RangeEquals(y, 10, 2400, 0)) # does not error
+ expect_error(x$RangeEquals(y, 1000, 24, 0)) # does not error
+ expect_error(x$RangeEquals(y, 10, 24, 1000)) # does not error
+})
+
+test_that("Double Array", {
+ dbls <- c(1, 2, 3, 4, 5, 6)
+ x_dbl <- expect_array_roundtrip(dbls, float64())
+})
+
+test_that("Array print method includes type", {
+ x <- Array$create(c(1:10, 1:10, 1:5))
+ expect_output(print(x), "Array\n<int32>\n[\n", fixed = TRUE)
+})
+
+test_that("Array supports NA", {
+ x_int <- Array$create(as.integer(c(1:10, NA)))
+ x_dbl <- Array$create(as.numeric(c(1:10, NA)))
+ expect_true(x_int$IsValid(0))
+ expect_true(x_dbl$IsValid(0L))
+ expect_true(x_int$IsNull(10L))
+ expect_true(x_dbl$IsNull(10))
+
+ expect_as_vector(is.na(x_int), c(rep(FALSE, 10), TRUE))
+ expect_as_vector(is.na(x_dbl), c(rep(FALSE, 10), TRUE))
+
+ # Input validation
+ expect_error(x_int$IsValid("ten"))
+ expect_error(x_int$IsNull("ten"))
+ expect_error(x_int$IsValid(c(10, 10)))
+ expect_error(x_int$IsNull(c(10, 10)))
+ expect_error(x_int$IsValid(NA), "'i' cannot be NA")
+ expect_error(x_int$IsNull(NA), "'i' cannot be NA")
+ expect_error(x_int$IsValid(1000), "subscript out of bounds")
+ expect_error(x_int$IsValid(-1), "subscript out of bounds")
+ expect_error(x_int$IsNull(1000), "subscript out of bounds")
+ expect_error(x_int$IsNull(-1), "subscript out of bounds")
+})
+
+test_that("Array support null type (ARROW-7064)", {
+ expect_array_roundtrip(vctrs::unspecified(10), null())
+})
+
+test_that("Array supports logical vectors (ARROW-3341)", {
+ # with NA
+ x <- sample(c(TRUE, FALSE, NA), 1000, replace = TRUE)
+ expect_array_roundtrip(x, bool())
+
+ # without NA
+ x <- sample(c(TRUE, FALSE), 1000, replace = TRUE)
+ expect_array_roundtrip(x, bool())
+})
+
+test_that("Array supports character vectors (ARROW-3339)", {
+ # without NA
+ expect_array_roundtrip(c("itsy", "bitsy", "spider"), utf8())
+ expect_array_roundtrip(c("itsy", "bitsy", "spider"), large_utf8(), as = large_utf8())
+
+ # with NA
+ expect_array_roundtrip(c("itsy", NA, "spider"), utf8())
+ expect_array_roundtrip(c("itsy", NA, "spider"), large_utf8(), as = large_utf8())
+})
+
+test_that("Character vectors > 2GB become large_utf8", {
+ skip_on_cran()
+ skip_if_not_running_large_memory_tests()
+ big <- make_big_string()
+ expect_array_roundtrip(big, large_utf8())
+})
+
+test_that("empty arrays are supported", {
+ expect_array_roundtrip(character(), utf8())
+ expect_array_roundtrip(character(), large_utf8(), as = large_utf8())
+ expect_array_roundtrip(integer(), int32())
+ expect_array_roundtrip(numeric(), float64())
+ expect_array_roundtrip(factor(character()), dictionary(int8(), utf8()))
+ expect_array_roundtrip(logical(), bool())
+})
+
+test_that("array with all nulls are supported", {
+ nas <- c(NA, NA)
+ expect_array_roundtrip(as.character(nas), utf8())
+ expect_array_roundtrip(as.integer(nas), int32())
+ expect_array_roundtrip(as.numeric(nas), float64())
+ expect_array_roundtrip(as.factor(nas), dictionary(int8(), utf8()))
+ expect_array_roundtrip(as.logical(nas), bool())
+})
+
+test_that("Array supports unordered factors (ARROW-3355)", {
+ # without NA
+ f <- factor(c("itsy", "bitsy", "spider", "spider"))
+ expect_array_roundtrip(f, dictionary(int8(), utf8()))
+
+ # with NA
+ f <- factor(c("itsy", "bitsy", NA, "spider", "spider"))
+ expect_array_roundtrip(f, dictionary(int8(), utf8()))
+})
+
+test_that("Array supports ordered factors (ARROW-3355)", {
+ # without NA
+ f <- ordered(c("itsy", "bitsy", "spider", "spider"))
+ arr_fac <- expect_array_roundtrip(f, dictionary(int8(), utf8(), ordered = TRUE))
+ expect_true(arr_fac$ordered)
+
+ # with NA
+ f <- ordered(c("itsy", "bitsy", NA, "spider", "spider"))
+ expect_array_roundtrip(f, dictionary(int8(), utf8(), ordered = TRUE))
+})
+
+test_that("array supports Date (ARROW-3340)", {
+ d <- Sys.Date() + 1:10
+ expect_array_roundtrip(d, date32())
+
+ d[5] <- NA
+ expect_array_roundtrip(d, date32())
+})
+
+test_that("array supports POSIXct (ARROW-3340)", {
+ times <- lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10
+ expect_array_roundtrip(times, timestamp("us", "UTC"))
+
+ times[5] <- NA
+ expect_array_roundtrip(times, timestamp("us", "UTC"))
+
+ times2 <- lubridate::ymd_hms("2018-10-07 19:04:05", tz = "US/Eastern") + 1:10
+ expect_array_roundtrip(times2, timestamp("us", "US/Eastern"))
+})
+
+test_that("array supports POSIXct without timezone", {
+ # Make sure timezone is not set
+ withr::with_envvar(c(TZ = ""), {
+ times <- strptime("2019-02-03 12:34:56", format = "%Y-%m-%d %H:%M:%S") + 1:10
+ expect_array_roundtrip(times, timestamp("us", ""))
+
+ # Also test the INTSXP code path
+ skip("Ingest_POSIXct only implemented for REALSXP")
+ times_int <- as.integer(times)
+ attributes(times_int) <- attributes(times)
+ expect_array_roundtrip(times_int, timestamp("us", ""))
+ })
+})
+
+test_that("Timezone handling in Arrow roundtrip (ARROW-3543)", {
+ # Write a feather file as that's what the initial bug report used
+ df <- tibble::tibble(
+ no_tz = lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10,
+ yes_tz = lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Pacific/Marquesas") + 1:10
+ )
+ if (!identical(Sys.timezone(), "Pacific/Marquesas")) {
+ # Confirming that the columns are in fact different
+ expect_false(any(df$no_tz == df$yes_tz))
+ }
+ feather_file <- tempfile()
+ on.exit(unlink(feather_file))
+ write_feather(df, feather_file)
+ expect_identical(read_feather(feather_file), df)
+})
+
+test_that("array supports integer64", {
+ x <- bit64::as.integer64(1:10) + MAX_INT
+ expect_array_roundtrip(x, int64())
+
+ x[4] <- NA
+ expect_array_roundtrip(x, int64())
+
+ # all NA int64 (ARROW-3795)
+ all_na <- Array$create(bit64::as.integer64(NA))
+ expect_type_equal(all_na, int64())
+ expect_true(as.vector(is.na(all_na)))
+})
+
+test_that("array supports difftime", {
+ time <- hms::hms(56, 34, 12)
+ expect_array_roundtrip(c(time, time), time32("s"))
+ expect_array_roundtrip(vctrs::vec_c(NA, time), time32("s"))
+})
+
+test_that("support for NaN (ARROW-3615)", {
+ x <- c(1, NA, NaN, -1)
+ y <- Array$create(x)
+ expect_true(y$IsValid(2))
+ expect_equal(y$null_count, 1L)
+})
+
+test_that("is.nan() evalutes to FALSE on NA (for consistency with base R)", {
+ x <- c(1.0, NA, NaN, -1.0)
+ compare_expression(is.nan(.input), x)
+})
+
+test_that("is.nan() evalutes to FALSE on non-floats (for consistency with base R)", {
+ x <- c(1L, 2L, 3L)
+ y <- c("foo", "bar")
+ compare_expression(is.nan(.input), x)
+ compare_expression(is.nan(.input), y)
+})
+
+test_that("is.na() evalutes to TRUE on NaN (for consistency with base R)", {
+ x <- c(1, NA, NaN, -1)
+ compare_expression(is.na(.input), x)
+})
+
+test_that("integer types casts (ARROW-3741)", {
+ # Defining some type groups for use here and in the following tests
+ int_types <- c(int8(), int16(), int32(), int64())
+ uint_types <- c(uint8(), uint16(), uint32(), uint64())
+ float_types <- c(float32(), float64()) # float16() not really supported in C++ yet
+
+ a <- Array$create(c(1:10, NA))
+ for (type in c(int_types, uint_types)) {
+ casted <- a$cast(type)
+ expect_equal(casted$type, type)
+ expect_identical(as.vector(is.na(casted)), c(rep(FALSE, 10), TRUE))
+ }
+})
+
+test_that("integer types cast safety (ARROW-3741, ARROW-5541)", {
+ a <- Array$create(-(1:10))
+ for (type in uint_types) {
+ expect_error(a$cast(type), regexp = "Integer value -1 not in range")
+ expect_error(a$cast(type, safe = FALSE), NA)
+ }
+})
+
+test_that("float types casts (ARROW-3741)", {
+ x <- c(1, 2, 3, NA)
+ a <- Array$create(x)
+ for (type in float_types) {
+ casted <- a$cast(type)
+ expect_equal(casted$type, type)
+ expect_identical(as.vector(is.na(casted)), c(rep(FALSE, 3), TRUE))
+ expect_identical(as.vector(casted), x)
+ }
+})
+
+test_that("cast to half float works", {
+ skip("Need halffloat support: https://issues.apache.org/jira/browse/ARROW-3802")
+ a <- Array$create(1:4)
+ a_f16 <- a$cast(float16())
+ expect_type_equal(a_f16$type, float16())
+})
+
+test_that("cast input validation", {
+ a <- Array$create(1:4)
+ expect_error(a$cast("not a type"), "type must be a DataType, not character")
+})
+
+test_that("Array$create() supports the type= argument. conversion from INTSXP and int64 to all int types", {
+ num_int32 <- 12L
+ num_int64 <- bit64::as.integer64(10)
+
+ types <- c(
+ int_types,
+ uint_types,
+ float_types,
+ double() # not actually a type, a base R function but should be alias for float64
+ )
+ for (type in types) {
+ expect_type_equal(Array$create(num_int32, type = type)$type, as_type(type))
+ expect_type_equal(Array$create(num_int64, type = type)$type, as_type(type))
+ }
+
+ # Input validation
+ expect_error(
+ Array$create(5, type = "not a type"),
+ "type must be a DataType, not character"
+ )
+})
+
+test_that("Array$create() aborts on overflow", {
+ expect_error(Array$create(128L, type = int8()))
+ expect_error(Array$create(-129L, type = int8()))
+
+ expect_error(Array$create(256L, type = uint8()))
+ expect_error(Array$create(-1L, type = uint8()))
+
+ expect_error(Array$create(32768L, type = int16()))
+ expect_error(Array$create(-32769L, type = int16()))
+
+ expect_error(Array$create(65536L, type = uint16()))
+ expect_error(Array$create(-1L, type = uint16()))
+
+ expect_error(Array$create(bit64::as.integer64(2^31), type = int32()))
+ expect_error(Array$create(bit64::as.integer64(2^32), type = uint32()))
+})
+
+test_that("Array$create() does not convert doubles to integer", {
+ for (type in c(int_types, uint_types)) {
+ a <- Array$create(10, type = type)
+ expect_type_equal(a$type, type)
+ expect_true(as.vector(a) == 10L)
+ }
+})
+
+test_that("Array$create() converts raw vectors to uint8 arrays (ARROW-3794)", {
+ expect_type_equal(Array$create(as.raw(1:10))$type, uint8())
+})
+
+test_that("Array<int8>$as_vector() converts to integer (ARROW-3794)", {
+ i8 <- (-128):127
+ a <- Array$create(i8)$cast(int8())
+ expect_type_equal(a, int8())
+ expect_as_vector(a, i8)
+
+ u8 <- 0:255
+ a <- Array$create(u8)$cast(uint8())
+ expect_type_equal(a, uint8())
+ expect_as_vector(a, u8)
+})
+
+test_that("Arrays of {,u}int{32,64} convert to integer if they can fit", {
+ u32 <- Array$create(1L)$cast(uint32())
+ expect_identical(as.vector(u32), 1L)
+
+ u64 <- Array$create(1L)$cast(uint64())
+ expect_identical(as.vector(u64), 1L)
+
+ i64 <- Array$create(bit64::as.integer64(1:10))
+ expect_identical(as.vector(i64), 1:10)
+})
+
+test_that("Arrays of uint{32,64} convert to numeric if they can't fit integer", {
+ u32 <- Array$create(bit64::as.integer64(1) + MAX_INT)$cast(uint32())
+ expect_identical(as.vector(u32), 1 + MAX_INT)
+
+ u64 <- Array$create(bit64::as.integer64(1) + MAX_INT)$cast(uint64())
+ expect_identical(as.vector(u64), 1 + MAX_INT)
+})
+
+test_that("Array$create() recognise arrow::Array (ARROW-3815)", {
+ a <- Array$create(1:10)
+ expect_equal(a, Array$create(a))
+})
+
+test_that("Array$create() handles data frame -> struct arrays (ARROW-3811)", {
+ df <- tibble::tibble(x = 1:10, y = x / 2, z = letters[1:10])
+ a <- Array$create(df)
+ expect_type_equal(a$type, struct(x = int32(), y = float64(), z = utf8()))
+ expect_as_vector(a, df)
+
+ df <- structure(
+ list(col = structure(list(structure(list(list(structure(1))), class = "inner")), class = "outer")),
+ class = "data.frame", row.names = c(NA, -1L)
+ )
+ a <- Array$create(df)
+ expect_type_equal(a$type, struct(col = list_of(list_of(list_of(float64())))))
+ expect_as_vector(a, df, ignore_attr = TRUE)
+})
+
+test_that("StructArray methods", {
+ df <- tibble::tibble(x = 1:10, y = x / 2, z = letters[1:10])
+ a <- Array$create(df)
+ expect_equal(a$x, Array$create(df$x))
+ expect_equal(a[["x"]], Array$create(df$x))
+ expect_equal(a[[1]], Array$create(df$x))
+ expect_identical(names(a), c("x", "y", "z"))
+ expect_identical(dim(a), c(10L, 3L))
+})
+
+test_that("Array$create() can handle data frame with custom struct type (not inferred)", {
+ df <- tibble::tibble(x = 1:10, y = 1:10)
+ type <- struct(x = float64(), y = int16())
+ a <- Array$create(df, type = type)
+ expect_type_equal(a$type, type)
+
+ type <- struct(x = float64(), y = int16(), z = int32())
+ expect_error(
+ Array$create(df, type = type),
+ regexp = "Number of fields in struct.* incompatible with number of columns in the data frame"
+ )
+
+ type <- struct(y = int16(), x = float64())
+ expect_error(
+ Array$create(df, type = type),
+ regexp = "Field name in position.*does not match the name of the column of the data frame"
+ )
+
+ type <- struct(x = float64(), y = utf8())
+ expect_error(Array$create(df, type = type), regexp = "Invalid")
+})
+
+test_that("Array$create() supports tibble with no columns (ARROW-8354)", {
+ df <- tibble::tibble()
+ expect_equal(Array$create(df)$as_vector(), df)
+})
+
+test_that("Array$create() handles vector -> list arrays (ARROW-7662)", {
+ # Should be able to create an empty list with a type hint.
+ expect_r6_class(Array$create(list(), list_of(bool())), "ListArray")
+
+ # logical
+ expect_array_roundtrip(list(NA), list_of(bool()))
+ expect_array_roundtrip(list(logical(0)), list_of(bool()))
+ expect_array_roundtrip(list(c(TRUE), c(FALSE), c(FALSE, TRUE)), list_of(bool()))
+ expect_array_roundtrip(list(c(TRUE), c(FALSE), NA, logical(0), c(FALSE, NA, TRUE)), list_of(bool()))
+
+ # integer
+ expect_array_roundtrip(list(NA_integer_), list_of(int32()))
+ expect_array_roundtrip(list(integer(0)), list_of(int32()))
+ expect_array_roundtrip(list(1:2, 3:4, 12:18), list_of(int32()))
+ expect_array_roundtrip(list(c(1:2), NA_integer_, integer(0), c(12:18, NA_integer_)), list_of(int32()))
+
+ # numeric
+ expect_array_roundtrip(list(NA_real_), list_of(float64()))
+ expect_array_roundtrip(list(numeric(0)), list_of(float64()))
+ expect_array_roundtrip(list(1, c(2, 3), 4), list_of(float64()))
+ expect_array_roundtrip(list(1, numeric(0), c(2, 3, NA_real_), 4), list_of(float64()))
+
+ # character
+ expect_array_roundtrip(list(NA_character_), list_of(utf8()))
+ expect_array_roundtrip(list(character(0)), list_of(utf8()))
+ expect_array_roundtrip(list("itsy", c("bitsy", "spider"), c("is")), list_of(utf8()))
+ expect_array_roundtrip(list("itsy", character(0), c("bitsy", "spider", NA_character_), c("is")), list_of(utf8()))
+
+ # factor
+ expect_array_roundtrip(list(factor(c("b", "a"), levels = c("a", "b"))), list_of(dictionary(int8(), utf8())))
+ expect_array_roundtrip(list(factor(NA, levels = c("a", "b"))), list_of(dictionary(int8(), utf8())))
+
+ # struct
+ expect_array_roundtrip(
+ list(tibble::tibble(a = integer(0), b = integer(0), c = character(0), d = logical(0))),
+ list_of(struct(a = int32(), b = int32(), c = utf8(), d = bool()))
+ )
+ expect_array_roundtrip(
+ list(tibble::tibble(a = list(integer()))),
+ list_of(struct(a = list_of(int32())))
+ )
+  # malformed data frame: column lengths disagree with the row count, so creation should error
+ df <- structure(list(x = 1:2, y = 1), class = "data.frame", row.names = 1:2)
+ expect_error(Array$create(list(df)))
+})
+
+test_that("Array$create() handles vector -> large list arrays", {
+ # Should be able to create an empty list with a type hint.
+ expect_r6_class(Array$create(list(), type = large_list_of(bool())), "LargeListArray")
+
+ # logical
+ expect_array_roundtrip(list(NA), large_list_of(bool()), as = large_list_of(bool()))
+ expect_array_roundtrip(list(logical(0)), large_list_of(bool()), as = large_list_of(bool()))
+ expect_array_roundtrip(list(c(TRUE), c(FALSE), c(FALSE, TRUE)), large_list_of(bool()), as = large_list_of(bool()))
+ expect_array_roundtrip(
+ list(c(TRUE), c(FALSE), NA, logical(0), c(FALSE, NA, TRUE)),
+ large_list_of(bool()),
+ as = large_list_of(bool())
+ )
+
+ # integer
+ expect_array_roundtrip(list(NA_integer_), large_list_of(int32()), as = large_list_of(int32()))
+ expect_array_roundtrip(list(integer(0)), large_list_of(int32()), as = large_list_of(int32()))
+ expect_array_roundtrip(list(1:2, 3:4, 12:18), large_list_of(int32()), as = large_list_of(int32()))
+ expect_array_roundtrip(
+ list(c(1:2), NA_integer_, integer(0), c(12:18, NA_integer_)),
+ large_list_of(int32()),
+ as = large_list_of(int32())
+ )
+
+ # numeric
+ expect_array_roundtrip(list(NA_real_), large_list_of(float64()), as = large_list_of(float64()))
+ expect_array_roundtrip(list(numeric(0)), large_list_of(float64()), as = large_list_of(float64()))
+ expect_array_roundtrip(list(1, c(2, 3), 4), large_list_of(float64()), as = large_list_of(float64()))
+ expect_array_roundtrip(
+ list(1, numeric(0), c(2, 3, NA_real_), 4),
+ large_list_of(float64()),
+ as = large_list_of(float64())
+ )
+
+ # character
+ expect_array_roundtrip(list(NA_character_), large_list_of(utf8()), as = large_list_of(utf8()))
+ expect_array_roundtrip(list(character(0)), large_list_of(utf8()), as = large_list_of(utf8()))
+ expect_array_roundtrip(
+ list("itsy", c("bitsy", "spider"), c("is")),
+ large_list_of(utf8()),
+ as = large_list_of(utf8())
+ )
+ expect_array_roundtrip(
+ list("itsy", character(0), c("bitsy", "spider", NA_character_), c("is")),
+ large_list_of(utf8()),
+ as = large_list_of(utf8())
+ )
+
+ # factor
+ expect_array_roundtrip(
+ list(factor(c("b", "a"), levels = c("a", "b"))),
+ large_list_of(dictionary(int8(), utf8())),
+ as = large_list_of(dictionary(int8(), utf8()))
+ )
+ expect_array_roundtrip(
+ list(factor(NA, levels = c("a", "b"))),
+ large_list_of(dictionary(int8(), utf8())),
+ as = large_list_of(dictionary(int8(), utf8()))
+ )
+
+ # struct
+ expect_array_roundtrip(
+ list(tibble::tibble(a = integer(0), b = integer(0), c = character(0), d = logical(0))),
+ large_list_of(struct(a = int32(), b = int32(), c = utf8(), d = bool())),
+ as = large_list_of(struct(a = int32(), b = int32(), c = utf8(), d = bool()))
+ )
+ expect_array_roundtrip(
+ list(tibble::tibble(a = list(integer()))),
+ large_list_of(struct(a = list_of(int32()))),
+ as = large_list_of(struct(a = list_of(int32())))
+ )
+})
+
+test_that("Array$create() handles vector -> fixed size list arrays", {
+ # Should be able to create an empty list with a type hint.
+ expect_r6_class(Array$create(list(), type = fixed_size_list_of(bool(), 20)), "FixedSizeListArray")
+
+ # logical
+ expect_array_roundtrip(list(NA), fixed_size_list_of(bool(), 1L), as = fixed_size_list_of(bool(), 1L))
+ expect_array_roundtrip(
+ list(c(TRUE, FALSE), c(FALSE, TRUE)),
+ fixed_size_list_of(bool(), 2L),
+ as = fixed_size_list_of(bool(), 2L)
+ )
+ expect_array_roundtrip(
+ list(c(TRUE), c(FALSE), NA),
+ fixed_size_list_of(bool(), 1L),
+ as = fixed_size_list_of(bool(), 1L)
+ )
+
+ # integer
+ expect_array_roundtrip(list(NA_integer_), fixed_size_list_of(int32(), 1L), as = fixed_size_list_of(int32(), 1L))
+ expect_array_roundtrip(list(1:2, 3:4, 11:12), fixed_size_list_of(int32(), 2L), as = fixed_size_list_of(int32(), 2L))
+ expect_array_roundtrip(
+ list(c(1:2), c(NA_integer_, 3L)),
+ fixed_size_list_of(int32(), 2L),
+ as = fixed_size_list_of(int32(), 2L)
+ )
+
+ # numeric
+ expect_array_roundtrip(list(NA_real_), fixed_size_list_of(float64(), 1L), as = fixed_size_list_of(float64(), 1L))
+ expect_array_roundtrip(
+ list(c(1, 2), c(2, 3)),
+ fixed_size_list_of(float64(), 2L),
+ as = fixed_size_list_of(float64(), 2L)
+ )
+ expect_array_roundtrip(
+ list(c(1, 2), c(NA_real_, 4)),
+ fixed_size_list_of(float64(), 2L),
+ as = fixed_size_list_of(float64(), 2L)
+ )
+
+ # character
+ expect_array_roundtrip(list(NA_character_), fixed_size_list_of(utf8(), 1L), as = fixed_size_list_of(utf8(), 1L))
+ expect_array_roundtrip(
+ list(c("itsy", "bitsy"), c("spider", "is"), c(NA_character_, NA_character_), c("", "")),
+ fixed_size_list_of(utf8(), 2L),
+ as = fixed_size_list_of(utf8(), 2L)
+ )
+
+ # factor
+ expect_array_roundtrip(
+ list(factor(c("b", "a"), levels = c("a", "b"))),
+ fixed_size_list_of(dictionary(int8(), utf8()), 2L),
+ as = fixed_size_list_of(dictionary(int8(), utf8()), 2L)
+ )
+
+ # struct
+ expect_array_roundtrip(
+ list(tibble::tibble(a = 1L, b = 1L, c = "", d = TRUE)),
+ fixed_size_list_of(struct(a = int32(), b = int32(), c = utf8(), d = bool()), 1L),
+ as = fixed_size_list_of(struct(a = int32(), b = int32(), c = utf8(), d = bool()), 1L)
+ )
+ expect_array_roundtrip(
+ list(tibble::tibble(a = list(1L))),
+ fixed_size_list_of(struct(a = list_of(int32())), 1L),
+ as = fixed_size_list_of(struct(a = list_of(int32())), 1L)
+ )
+ expect_array_roundtrip(
+ list(tibble::tibble(a = list(1L))),
+ list_of(struct(a = fixed_size_list_of(int32(), 1L))),
+ as = list_of(struct(a = fixed_size_list_of(int32(), 1L)))
+ )
+})
+
+test_that("Handling string data with embedded nuls", {
+ raws <- structure(list(
+ as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)),
+ as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)),
+ as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00
+ as.raw(c(0x66, 0x00, 0x00, 0x61, 0x00, 0x6e)), # multiple nuls
+ as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)),
+ as.raw(c(0x74, 0x76))
+ ),
+ class = c("arrow_binary", "vctrs_vctr", "list")
+ )
+ expect_error(
+ rawToChar(raws[[3]]),
+    "embedded nul in string: 'ma\\0n'", # the nul is visible in the message
+ fixed = TRUE
+ )
+ array_with_nul <- Array$create(raws)$cast(utf8())
+
+  # The behavior of the warnings/errors is slightly different with and without
+  # altrep. Without it (i.e. R 3.5.0 and below), the error triggers immediately
+  # on `as.vector()`, whereas with it, the error only happens on materialization.
+ skip_if_r_version("3.5.0")
+
+  # no error on conversion, because of altrep laziness
+ v <- expect_error(as.vector(array_with_nul), NA)
+
+  # attempting materialization -> error
+ expect_error(v[],
+ paste0(
+ "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow ",
+ "to R, set options(arrow.skip_nul = TRUE)"
+ ),
+ fixed = TRUE
+ )
+
+ # also error on materializing v[3]
+ expect_error(v[3],
+ paste0(
+ "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow ",
+ "to R, set options(arrow.skip_nul = TRUE)"
+ ),
+ fixed = TRUE
+ )
+
+ withr::with_options(list(arrow.skip_nul = TRUE), {
+    # no warning yet because of altrep laziness
+ v <- as.vector(array_with_nul)
+
+ expect_warning(
+ expect_identical(
+ v[],
+ c("person", "woman", "man", "fan", "camera", "tv")
+ ),
+ "Stripping '\\0' (nul) from character vector",
+ fixed = TRUE
+ )
+
+ v <- as.vector(array_with_nul)
+ expect_warning(
+ expect_identical(v[3], "man"),
+ "Stripping '\\0' (nul) from character vector",
+ fixed = TRUE
+ )
+
+ v <- as.vector(array_with_nul)
+ expect_warning(
+ expect_identical(v[4], "fan"),
+ "Stripping '\\0' (nul) from character vector",
+ fixed = TRUE
+ )
+ })
+})
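+
+# A minimal sketch of the user-facing recipe the test above exercises, assuming
+# `dirty` stands in for any utf8 Array whose data carries embedded nuls (left
+# commented out so it does not run as part of the suite):
+#
+#   withr::with_options(list(arrow.skip_nul = TRUE), {
+#     v <- as.vector(dirty) # altrep: still lazy, no warning yet
+#     v[]                   # materializing strips the nuls, with a warning
+#   })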
+
+test_that("Array$create() gives helpful errors for inconsistent list input", {
+ expect_error(Array$create(list(numeric(0)), list_of(bool())), "Expecting a logical vector")
+
+ lgl <- logical(0)
+ int <- integer(0)
+ num <- numeric(0)
+ char <- character(0)
+ expect_error(Array$create(list()), "Requires at least one element to infer")
+ expect_error(Array$create(list(lgl, lgl, int)), "Expecting a logical vector")
+ expect_error(Array$create(list(char, num, char)), "Expecting a character vector")
+})
+
+test_that("Array$View() (ARROW-6542)", {
+ a <- Array$create(1:3)
+ b <- a$View(float32())
+ expect_equal(b$type, float32())
+ expect_equal(length(b), 3L)
+
+ # Input validation
+ expect_error(a$View("not a type"), "type must be a DataType, not character")
+})
+
+test_that("Array$Validate()", {
+ a <- Array$create(1:10)
+ expect_error(a$Validate(), NA)
+})
+
+test_that("is.Array", {
+ a <- Array$create(1, type = int32())
+ expect_true(is.Array(a))
+ expect_true(is.Array(a, "int32"))
+ expect_true(is.Array(a, c("int32", "int16")))
+ expect_false(is.Array(a, "utf8"))
+  expect_true(is.Array(a$View(float32()), "float32"))
+ expect_false(is.Array(1))
+ expect_true(is.Array(ChunkedArray$create(1, 2)))
+})
+
+test_that("Array$Take()", {
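+  # Take() selects by 0-based position, in the order requested (here 14, then 12)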
+ a <- Array$create(10:20)
+ expect_as_vector(a$Take(c(4, 2)), c(14, 12))
+})
+
+test_that("[ method on Array", {
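+  # [ follows R semantics: 1-based positions, reordering, logical masks (NA -> NA),
+  # and negative indices for exclusion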
+ vec <- 11:20
+ a <- Array$create(vec)
+ expect_as_vector(a[5:9], vec[5:9])
+ expect_as_vector(a[c(9, 3, 5)], vec[c(9, 3, 5)])
+ expect_as_vector(a[rep(c(TRUE, FALSE), 5)], vec[c(1, 3, 5, 7, 9)])
+ expect_as_vector(a[rep(c(TRUE, FALSE, NA, FALSE, TRUE), 2)], c(11, NA, 15, 16, NA, 20))
+ expect_as_vector(a[-4], vec[-4])
+ expect_as_vector(a[-1], vec[-1])
+})
+
+test_that("[ accepts Arrays and otherwise handles bad input", {
+ vec <- 11:20
+ a <- Array$create(vec)
+ ind <- c(9, 3, 5)
+ expect_error(
+ a[Array$create(ind)],
+ "Cannot extract rows with an Array of type double"
+ )
+ expect_as_vector(a[Array$create(ind - 1, type = int8())], vec[ind])
+ expect_as_vector(a[Array$create(ind - 1, type = uint8())], vec[ind])
+ expect_as_vector(a[ChunkedArray$create(8, 2, 4, type = uint8())], vec[ind])
+
+ filt <- seq_along(vec) %in% ind
+ expect_as_vector(a[Array$create(filt)], vec[filt])
+
+ expect_error(
+ a["string"],
+ "Cannot extract rows with an object of class character"
+ )
+})
+
+test_that("%in% works on dictionary arrays", {
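+  # is_in() matches dictionary arrays by their dictionary values, not indices;
+  # comparing against a mismatched value type is an error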
+ a1 <- Array$create(as.factor(c("A", "B", "C")))
+ a2 <- DictionaryArray$create(c(0L, 1L, 2L), c(4.5, 3.2, 1.1))
+ c1 <- Array$create(c(FALSE, TRUE, FALSE))
+ c2 <- Array$create(c(FALSE, FALSE, FALSE))
+ b1 <- Array$create("B")
+ b2 <- Array$create(5.4)
+
+ expect_equal(is_in(a1, b1), c1)
+ expect_equal(is_in(a2, b2), c2)
+ expect_error(is_in(a1, b2))
+})
+
+test_that("[ accepts Expressions", {
+ vec <- 11:20
+ a <- Array$create(vec)
+ b <- Array$create(1:10)
+ expect_as_vector(a[b > 4], vec[5:10])
+})
+
+test_that("Array head/tail", {
+ vec <- 11:20
+ a <- Array$create(vec)
+ expect_as_vector(head(a), head(vec))
+ expect_as_vector(head(a, 4), head(vec, 4))
+ expect_as_vector(head(a, 40), head(vec, 40))
+ expect_as_vector(head(a, -4), head(vec, -4))
+ expect_as_vector(head(a, -40), head(vec, -40))
+ expect_as_vector(tail(a), tail(vec))
+ expect_as_vector(tail(a, 4), tail(vec, 4))
+ expect_as_vector(tail(a, 40), tail(vec, 40))
+ expect_as_vector(tail(a, -40), tail(vec, -40))
+})
+
+test_that("Dictionary array: create from arrays, not factor", {
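+  # indices are 0-based, and the default index type is int32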
+ a <- DictionaryArray$create(c(2L, 1L, 1L, 2L, 0L), c(4.5, 3.2, 1.1))
+ expect_equal(a$type, dictionary(int32(), float64()))
+})
+
+test_that("Dictionary array: translate to R when dict isn't string", {
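+  # R factors require character levels, so a numeric dictionary is coerced to
+  # string labels, with a warning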
+ a <- DictionaryArray$create(c(2L, 1L, 1L, 2L, 0L), c(4.5, 3.2, 1.1))
+ expect_warning(
+ expect_identical(
+ as.vector(a),
+ factor(c(3, 2, 2, 3, 1), labels = c("4.5", "3.2", "1.1"))
+ )
+ )
+})
+
+test_that("Array$Equals", {
+ vec <- 11:20
+ a <- Array$create(vec)
+ b <- Array$create(vec)
+ d <- Array$create(3:4)
+ expect_equal(a, b)
+ expect_true(a$Equals(b))
+ expect_false(a$Equals(vec))
+ expect_false(a$Equals(d))
+})
+
+test_that("Array$ApproxEquals", {
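+  # ApproxEquals() compares floating-point values within a small tolerance, unlike exact Equals()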
+ vec <- c(1.0000000000001, 2.400000000000001)
+ a <- Array$create(vec)
+ b <- Array$create(round(vec, 1))
+ expect_false(a$Equals(b))
+ expect_true(a$ApproxEquals(b))
+ expect_false(a$ApproxEquals(vec))
+})
+
+test_that("auto int64 conversion to int can be disabled (ARROW-10093)", {
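+  # by default, int64 values that fit are downcast to R's integer type;
+  # this option keeps them as bit64::integer64 instead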
+ withr::with_options(list(arrow.int64_downcast = FALSE), {
+ a <- Array$create(1:10, int64())
+ expect_true(inherits(a$as_vector(), "integer64"))
+
+ batch <- RecordBatch$create(x = a)
+ expect_true(inherits(as.data.frame(batch)$x, "integer64"))
+
+ tab <- Table$create(x = a)
+    expect_true(inherits(as.data.frame(tab)$x, "integer64"))
+ })
+})
+
+test_that("Array to C-interface", {
+ # create a struct array since that's one of the more complicated array types
+ df <- tibble::tibble(x = 1:10, y = x / 2, z = letters[1:10])
+ arr <- Array$create(df)
+
+ # export the array via the C-interface
+ schema_ptr <- allocate_arrow_schema()
+ array_ptr <- allocate_arrow_array()
+ arr$export_to_c(array_ptr, schema_ptr)
+
+ # then import it and check that the roundtripped value is the same
+ circle <- Array$import_from_c(array_ptr, schema_ptr)
+ expect_equal(arr, circle)
+
+ # must clean up the pointers or we leak
+ delete_arrow_schema(schema_ptr)
+ delete_arrow_array(array_ptr)
+})
diff --git a/src/arrow/r/tests/testthat/test-RecordBatch.R b/src/arrow/r/tests/testthat/test-RecordBatch.R
new file mode 100644
index 000000000..d280754a3
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-RecordBatch.R
@@ -0,0 +1,690 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("RecordBatch", {
+ # Note that we're reusing `tbl` and `batch` throughout the tests in this file
+ tbl <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10],
+ fct = factor(letters[1:10])
+ )
+ batch <- record_batch(tbl)
+
+ expect_equal(batch, batch)
+ expect_equal(
+ batch$schema,
+ schema(
+ int = int32(), dbl = float64(),
+ lgl = boolean(), chr = utf8(),
+ fct = dictionary(int8(), utf8())
+ )
+ )
+ expect_equal(batch$num_columns, 5L)
+ expect_equal(batch$num_rows, 10L)
+ expect_equal(batch$column_name(0), "int")
+ expect_equal(batch$column_name(1), "dbl")
+ expect_equal(batch$column_name(2), "lgl")
+ expect_equal(batch$column_name(3), "chr")
+ expect_equal(batch$column_name(4), "fct")
+ expect_equal(names(batch), c("int", "dbl", "lgl", "chr", "fct"))
+
+ # input validation
+ expect_error(batch$column_name(NA), "'i' cannot be NA")
+ expect_error(batch$column_name(-1), "subscript out of bounds")
+ expect_error(batch$column_name(1000), "subscript out of bounds")
+ expect_error(batch$column_name(1:2))
+ expect_error(batch$column_name("one"))
+
+ col_int <- batch$column(0)
+ expect_true(inherits(col_int, "Array"))
+ expect_equal(col_int$as_vector(), tbl$int)
+ expect_equal(col_int$type, int32())
+
+ col_dbl <- batch$column(1)
+ expect_true(inherits(col_dbl, "Array"))
+ expect_equal(col_dbl$as_vector(), tbl$dbl)
+ expect_equal(col_dbl$type, float64())
+
+ col_lgl <- batch$column(2)
+  expect_true(inherits(col_lgl, "Array"))
+ expect_equal(col_lgl$as_vector(), tbl$lgl)
+ expect_equal(col_lgl$type, boolean())
+
+ col_chr <- batch$column(3)
+ expect_true(inherits(col_chr, "Array"))
+ expect_equal(col_chr$as_vector(), tbl$chr)
+ expect_equal(col_chr$type, utf8())
+
+ col_fct <- batch$column(4)
+ expect_true(inherits(col_fct, "Array"))
+ expect_equal(col_fct$as_vector(), tbl$fct)
+ expect_equal(col_fct$type, dictionary(int8(), utf8()))
+
+ # input validation
+ expect_error(batch$column(NA), "'i' cannot be NA")
+ expect_error(batch$column(-1), "subscript out of bounds")
+ expect_error(batch$column(1000), "subscript out of bounds")
+ expect_error(batch$column(1:2))
+ expect_error(batch$column("one"))
+
+ batch2 <- batch$RemoveColumn(0)
+ expect_equal(
+ batch2$schema,
+ schema(dbl = float64(), lgl = boolean(), chr = utf8(), fct = dictionary(int8(), utf8()))
+ )
+ expect_equal(batch2$column(0), batch$column(1))
+ expect_data_frame(batch2, tbl[, -1])
+
+ # input validation
+ expect_error(batch$RemoveColumn(NA), "'i' cannot be NA")
+ expect_error(batch$RemoveColumn(-1), "subscript out of bounds")
+ expect_error(batch$RemoveColumn(1000), "subscript out of bounds")
+ expect_error(batch$RemoveColumn(1:2))
+ expect_error(batch$RemoveColumn("one"))
+})
+
+test_that("RecordBatch S3 methods", {
+ tab <- RecordBatch$create(example_data)
+ for (f in c("dim", "nrow", "ncol", "dimnames", "colnames", "row.names", "as.list")) {
+ fun <- get(f)
+ expect_identical(fun(tab), fun(example_data), info = f)
+ }
+})
+
+test_that("RecordBatch$Slice", {
+ batch3 <- batch$Slice(5)
+ expect_data_frame(batch3, tbl[6:10, ])
+
+ batch4 <- batch$Slice(5, 2)
+ expect_data_frame(batch4, tbl[6:7, ])
+
+ # Input validation
+ expect_error(batch$Slice("ten"))
+ expect_error(batch$Slice(NA_integer_), "Slice 'offset' cannot be NA")
+ expect_error(batch$Slice(NA), "Slice 'offset' cannot be NA")
+ expect_error(batch$Slice(10, "ten"))
+ expect_error(batch$Slice(10, NA_integer_), "Slice 'length' cannot be NA")
+ expect_error(batch$Slice(NA_integer_, NA_integer_), "Slice 'offset' cannot be NA")
+ expect_error(batch$Slice(c(10, 10)))
+ expect_error(batch$Slice(10, c(10, 10)))
+ expect_error(batch$Slice(1000), "Slice 'offset' greater than array length")
+ expect_error(batch$Slice(-1), "Slice 'offset' cannot be negative")
+ expect_error(batch4$Slice(10, 10), "Slice 'offset' greater than array length")
+ expect_error(batch$Slice(10, -1), "Slice 'length' cannot be negative")
+ expect_error(batch$Slice(-1, 10), "Slice 'offset' cannot be negative")
+})
+
+test_that("[ on RecordBatch", {
+ expect_data_frame(batch[6:7, ], tbl[6:7, ])
+ expect_data_frame(batch[c(6, 7), ], tbl[6:7, ])
+ expect_data_frame(batch[6:7, 2:4], tbl[6:7, 2:4])
+ expect_data_frame(batch[, c("dbl", "fct")], tbl[, c(2, 5)])
+ expect_identical(as.vector(batch[, "chr", drop = TRUE]), tbl$chr)
+ expect_data_frame(batch[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4])
+ expect_data_frame(
+ batch[rep(c(FALSE, TRUE), 5), ],
+ tbl[c(2, 4, 6, 8, 10), ]
+ )
+ # bool Array
+ expect_data_frame(batch[batch$lgl, ], tbl[tbl$lgl, ])
+ # int Array
+ expect_data_frame(batch[Array$create(5:6), 2:4], tbl[6:7, 2:4])
+
+ # input validation
+ expect_error(batch[, c("dbl", "NOTACOLUMN")], 'Column not found: "NOTACOLUMN"')
+ expect_error(batch[, c(6, NA)], "Column indices cannot be NA")
+ expect_error(batch[, c(2, -2)], "Invalid column index")
+})
+
+test_that("[[ and $ on RecordBatch", {
+ expect_as_vector(batch[["int"]], tbl$int)
+ expect_as_vector(batch$int, tbl$int)
+ expect_as_vector(batch[[4]], tbl$chr)
+ expect_null(batch$qwerty)
+ expect_null(batch[["asdf"]])
+ expect_error(batch[[c(4, 3)]])
+ expect_error(batch[[NA]], "'i' must be character or numeric, not logical")
+ expect_error(batch[[NULL]], "'i' must be character or numeric, not NULL")
+ expect_error(batch[[c("asdf", "jkl;")]], "name is not a string", fixed = TRUE)
+})
+
+test_that("[[<- assignment", {
+ tbl <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10],
+ fct = factor(letters[1:10])
+ )
+ batch <- RecordBatch$create(tbl)
+
+ # can remove a column
+ batch[["chr"]] <- NULL
+ expect_data_frame(batch, tbl[-4])
+
+ # can remove a column by index
+ batch[[4]] <- NULL
+ expect_data_frame(batch, tbl[1:3])
+
+ # can add a named column
+ batch[["new"]] <- letters[10:1]
+ expect_data_frame(batch, dplyr::bind_cols(tbl[1:3], new = letters[10:1]))
+
+ # can replace a column by index
+ batch[[2]] <- as.numeric(10:1)
+ expect_as_vector(batch[[2]], as.numeric(10:1))
+
+ # can add a column by index
+ batch[[5]] <- as.numeric(10:1)
+ expect_as_vector(batch[[5]], as.numeric(10:1))
+ expect_as_vector(batch[["5"]], as.numeric(10:1))
+
+ # can replace a column
+ batch[["int"]] <- 10:1
+ expect_as_vector(batch[["int"]], 10:1)
+
+ # can use $
+ batch$new <- NULL
+ expect_null(as.vector(batch$new))
+ expect_identical(dim(batch), c(10L, 4L))
+
+ batch$int <- 1:10
+ expect_as_vector(batch$int, 1:10)
+
+ # recycling
+ batch[["atom"]] <- 1L
+ expect_as_vector(batch[["atom"]], rep(1L, 10))
+
+ expect_error(
+ batch[["atom"]] <- 1:6,
+ "Can't recycle input of size 6 to size 10."
+ )
+
+ # assign Arrow array
+ array <- Array$create(c(10:1))
+ batch$array <- array
+ expect_as_vector(batch$array, 10:1)
+
+ # nonsense indexes
+ expect_error(batch[[NA]] <- letters[10:1], "'i' must be character or numeric, not logical")
+ expect_error(batch[[NULL]] <- letters[10:1], "'i' must be character or numeric, not NULL")
+ expect_error(batch[[NA_integer_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE)
+ expect_error(batch[[NA_real_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE)
+ expect_error(batch[[NA_character_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE)
+ expect_error(batch[[c(1, 4)]] <- letters[10:1], "length(i) not equal to 1", fixed = TRUE)
+})
+
+test_that("head and tail on RecordBatch", {
+ tbl <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10],
+ fct = factor(letters[1:10])
+ )
+ batch <- RecordBatch$create(tbl)
+ expect_data_frame(head(batch), head(tbl))
+ expect_data_frame(head(batch, 4), head(tbl, 4))
+ expect_data_frame(head(batch, 40), head(tbl, 40))
+ expect_data_frame(head(batch, -4), head(tbl, -4))
+ expect_data_frame(head(batch, -40), head(tbl, -40))
+ expect_data_frame(tail(batch), tail(tbl))
+ expect_data_frame(tail(batch, 4), tail(tbl, 4))
+ expect_data_frame(tail(batch, 40), tail(tbl, 40))
+ expect_data_frame(tail(batch, -4), tail(tbl, -4))
+ expect_data_frame(tail(batch, -40), tail(tbl, -40))
+})
+
+test_that("RecordBatch print method", {
+ expect_output(
+ print(batch),
+ paste(
+ "RecordBatch",
+ "10 rows x 5 columns",
+ "$int <int32>",
+ "$dbl <double>",
+ "$lgl <bool>",
+ "$chr <string>",
+ "$fct <dictionary<values=string, indices=int8>>",
+ sep = "\n"
+ ),
+ fixed = TRUE
+ )
+})
+
+test_that("RecordBatch with 0 rows are supported", {
+ tbl <- tibble::tibble(
+ int = integer(),
+ dbl = numeric(),
+ lgl = logical(),
+ chr = character(),
+ fct = factor(character(), levels = c("a", "b"))
+ )
+
+ batch <- record_batch(tbl)
+ expect_equal(batch$num_columns, 5L)
+ expect_equal(batch$num_rows, 0L)
+ expect_equal(
+ batch$schema,
+ schema(
+ int = int32(),
+ dbl = float64(),
+ lgl = boolean(),
+ chr = utf8(),
+ fct = dictionary(int8(), utf8())
+ )
+ )
+})
+
+test_that("RecordBatch cast (ARROW-3741)", {
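+  # cast() requires a schema with exactly the same field names and count; only the types may change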
+ batch <- record_batch(x = 1:10, y = 1:10)
+
+ expect_error(batch$cast(schema(x = int32())))
+ expect_error(batch$cast(schema(x = int32(), z = int32())))
+
+ s2 <- schema(x = int16(), y = int64())
+ batch2 <- batch$cast(s2)
+ expect_equal(batch2$schema, s2)
+ expect_equal(batch2$column(0L)$type, int16())
+ expect_equal(batch2$column(1L)$type, int64())
+})
+
+test_that("record_batch() handles schema= argument", {
+ s <- schema(x = int32(), y = int32())
+ batch <- record_batch(x = 1:10, y = 1:10, schema = s)
+ expect_equal(s, batch$schema)
+
+ s <- schema(x = int32(), y = float64())
+ batch <- record_batch(x = 1:10, y = 1:10, schema = s)
+ expect_equal(s, batch$schema)
+
+ s <- schema(x = int32(), y = utf8())
+ expect_error(record_batch(x = 1:10, y = 1:10, schema = s))
+})
+
+test_that("record_batch(schema=) does some basic consistency checking of the schema", {
+ s <- schema(x = int32())
+ expect_error(record_batch(x = 1:10, y = 1:10, schema = s))
+ expect_error(record_batch(z = 1:10, schema = s))
+})
+
+test_that("RecordBatch dim() and nrow() (ARROW-3816)", {
+ batch <- record_batch(x = 1:10, y = 1:10)
+ expect_equal(dim(batch), c(10L, 2L))
+ expect_equal(nrow(batch), 10L)
+})
+
+test_that("record_batch() handles Array", {
+ batch <- record_batch(x = 1:10, y = Array$create(1:10))
+ expect_equal(batch$schema, schema(x = int32(), y = int32()))
+})
+
+test_that("record_batch() handles data frame columns", {
+ tib <- tibble::tibble(x = 1:10, y = 1:10)
+  # because tib is passed as the named argument b here, it becomes a struct array
+ batch <- record_batch(a = 1:10, b = tib)
+ expect_equal(
+ batch$schema,
+ schema(
+ a = int32(),
+ b = struct(x = int32(), y = int32())
+ )
+ )
+ out <- as.data.frame(batch)
+ expect_equal(out, tibble::tibble(a = 1:10, b = tib))
+
+ # if not named, columns from tib are auto spliced
+ batch2 <- record_batch(a = 1:10, tib)
+ expect_equal(
+ batch2$schema,
+ schema(a = int32(), x = int32(), y = int32())
+ )
+ out <- as.data.frame(batch2)
+ expect_equal(out, tibble::tibble(a = 1:10, !!!tib))
+})
+
+test_that("record_batch() handles data frame columns with schema spec", {
+ tib <- tibble::tibble(x = 1:10, y = 1:10)
+ tib_float <- tib
+ tib_float$y <- as.numeric(tib_float$y)
+ schema <- schema(a = int32(), b = struct(x = int16(), y = float64()))
+ batch <- record_batch(a = 1:10, b = tib, schema = schema)
+ expect_equal(batch$schema, schema)
+ out <- as.data.frame(batch)
+ expect_equal(out, tibble::tibble(a = 1:10, b = tib_float))
+
+ schema <- schema(a = int32(), b = struct(x = int16(), y = utf8()))
+ expect_error(record_batch(a = 1:10, b = tib, schema = schema))
+})
+
+test_that("record_batch() auto splices (ARROW-5718)", {
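+  # !!! is rlang's splice operator: !!!df passes each column of df as its own named argument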
+ df <- tibble::tibble(x = 1:10, y = letters[1:10])
+ batch1 <- record_batch(df)
+ batch2 <- record_batch(!!!df)
+ expect_equal(batch1, batch2)
+ expect_equal(batch1$schema, schema(x = int32(), y = utf8()))
+ expect_data_frame(batch1, df)
+
+ batch3 <- record_batch(df, z = 1:10)
+ batch4 <- record_batch(!!!df, z = 1:10)
+ expect_equal(batch3, batch4)
+ expect_equal(batch3$schema, schema(x = int32(), y = utf8(), z = int32()))
+ expect_equal(
+ as.data.frame(batch3),
+ tibble::as_tibble(cbind(df, data.frame(z = 1:10)))
+ )
+
+ s <- schema(x = float64(), y = utf8())
+ batch5 <- record_batch(df, schema = s)
+ batch6 <- record_batch(!!!df, schema = s)
+ expect_equal(batch5, batch6)
+ expect_equal(batch5$schema, s)
+ expect_equal(as.data.frame(batch5), df)
+
+ s2 <- schema(x = float64(), y = utf8(), z = int16())
+ batch7 <- record_batch(df, z = 1:10, schema = s2)
+ batch8 <- record_batch(!!!df, z = 1:10, schema = s2)
+ expect_equal(batch7, batch8)
+ expect_equal(batch7$schema, s2)
+ expect_equal(
+ as.data.frame(batch7),
+ tibble::as_tibble(cbind(df, data.frame(z = 1:10)))
+ )
+})
+
+test_that("record_batch() only auto splice data frames", {
+ expect_error(
+ record_batch(1:10),
+ regexp = "only data frames are allowed as unnamed arguments to be auto spliced"
+ )
+})
+
+test_that("record_batch() handles null type (ARROW-7064)", {
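+  # vctrs::unspecified() yields an all-NA placeholder vector, which arrow maps to the null type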
+ batch <- record_batch(a = 1:10, n = vctrs::unspecified(10))
+ expect_equal(
+ batch$schema,
+ schema(a = int32(), n = null()),
+ ignore_attr = TRUE
+ )
+})
+
+test_that("record_batch() scalar recycling with vectors", {
+ expect_data_frame(
+ record_batch(a = 1:10, b = 5),
+ tibble::tibble(a = 1:10, b = 5)
+ )
+})
+
+test_that("record_batch() scalar recycling with Scalars, Arrays, and ChunkedArrays", {
+ expect_data_frame(
+ record_batch(a = Array$create(1:10), b = Scalar$create(5)),
+ tibble::tibble(a = 1:10, b = 5)
+ )
+
+ expect_data_frame(
+ record_batch(a = Array$create(1:10), b = Array$create(5)),
+ tibble::tibble(a = 1:10, b = 5)
+ )
+
+ expect_data_frame(
+ record_batch(a = Array$create(1:10), b = ChunkedArray$create(5)),
+ tibble::tibble(a = 1:10, b = 5)
+ )
+})
+
+test_that("record_batch() no recycling with tibbles", {
+ expect_error(
+ record_batch(
+ tibble::tibble(a = 1:10),
+ tibble::tibble(a = 1, b = 5)
+ ),
+ regexp = "All input tibbles or data.frames must have the same number of rows"
+ )
+
+ expect_error(
+ record_batch(
+ tibble::tibble(a = 1:10),
+ tibble::tibble(a = 1)
+ ),
+ regexp = "All input tibbles or data.frames must have the same number of rows"
+ )
+})
+
+test_that("RecordBatch$Equals", {
+ df <- tibble::tibble(x = 1:10, y = letters[1:10])
+ a <- record_batch(df)
+ b <- record_batch(df)
+ expect_equal(a, b)
+ expect_true(a$Equals(b))
+ expect_false(a$Equals(df))
+})
+
+test_that("RecordBatch$Equals(check_metadata)", {
+ df <- tibble::tibble(x = 1:2, y = c("a", "b"))
+ rb1 <- record_batch(df)
+ rb2 <- record_batch(df, schema = rb1$schema$WithMetadata(list(some = "metadata")))
+
+ expect_r6_class(rb1, "RecordBatch")
+ expect_r6_class(rb2, "RecordBatch")
+ expect_false(rb1$schema$HasMetadata)
+ expect_true(rb2$schema$HasMetadata)
+ expect_identical(rb2$schema$metadata, list(some = "metadata"))
+
+ expect_true(rb1 == rb2)
+ expect_true(rb1$Equals(rb2))
+ expect_false(rb1$Equals(rb2, check_metadata = TRUE))
+
+ expect_failure(expect_equal(rb1, rb2)) # expect_equal has check_metadata=TRUE
+ expect_equal(rb1, rb2, ignore_attr = TRUE) # this passes check_metadata=FALSE
+
+ expect_false(rb1$Equals(24)) # Not a RecordBatch
+})
+
+test_that("RecordBatch name assignment", {
+ rb <- record_batch(x = 1:10, y = 1:10)
+ expect_identical(names(rb), c("x", "y"))
+ names(rb) <- c("a", "b")
+ expect_identical(names(rb), c("a", "b"))
+ expect_error(names(rb) <- "f")
+ expect_error(names(rb) <- letters)
+ expect_error(names(rb) <- character(0))
+ expect_error(names(rb) <- NULL)
+ expect_error(names(rb) <- c(TRUE, FALSE))
+})
+
+test_that("record_batch() with different length arrays", {
+ msg <- "All arrays must have the same length"
+ expect_error(record_batch(a = 1:5, b = 1:6), msg)
+})
+
+test_that("Handling string data with embedded nuls", {
+ raws <- Array$create(structure(list(
+ as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)),
+ as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)),
+ as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00
+ as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)),
+ as.raw(c(0x74, 0x76))
+ ),
+ class = c("arrow_binary", "vctrs_vctr", "list")
+ ))
+ batch_with_nul <- record_batch(a = 1:5, b = raws)
+ batch_with_nul$b <- batch_with_nul$b$cast(utf8())
+
+  # The behavior of the warnings/errors is slightly different with and without
+  # altrep. Without it (i.e. R 3.5.0 and below), the error triggers immediately
+  # on `as.vector()`, whereas with it, the error only happens on materialization.
+ skip_if_r_version("3.5.0")
+ df <- as.data.frame(batch_with_nul)
+
+ expect_error(
+ df$b[],
+ paste0(
+ "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, ",
+ "set options(arrow.skip_nul = TRUE)"
+ ),
+ fixed = TRUE
+ )
+
+ batch_with_nul <- record_batch(a = 1:5, b = raws)
+ batch_with_nul$b <- batch_with_nul$b$cast(utf8())
+
+ withr::with_options(list(arrow.skip_nul = TRUE), {
+ expect_warning(
+ expect_equal(
+ as.data.frame(batch_with_nul)$b,
+ c("person", "woman", "man", "camera", "tv"),
+ ignore_attr = TRUE
+ ),
+ "Stripping '\\0' (nul) from character vector",
+ fixed = TRUE
+ )
+ })
+})
+
+test_that("ARROW-11769/ARROW-13860 - grouping preserved in record batch creation", {
+ skip_if_not_available("dataset")
+ library(dplyr, warn.conflicts = FALSE)
+
+ tbl <- tibble::tibble(
+ int = 1:10,
+ fct = factor(rep(c("A", "B"), 5)),
+ fct2 = factor(rep(c("C", "D"), each = 5)),
+ )
+
+ expect_r6_class(
+ tbl %>%
+ group_by(fct, fct2) %>%
+ record_batch(),
+ "RecordBatch"
+ )
+ expect_identical(
+ tbl %>%
+ group_by(fct, fct2) %>%
+ record_batch() %>%
+ group_vars(),
+ c("fct", "fct2")
+ )
+ expect_identical(
+ tbl %>%
+ group_by(fct, fct2) %>%
+ record_batch() %>%
+ ungroup() %>%
+ group_vars(),
+ NULL
+ )
+ expect_identical(
+ tbl %>%
+ group_by(fct, fct2) %>%
+ record_batch() %>%
+ select(-int) %>%
+ group_vars(),
+ c("fct", "fct2")
+ )
+})
+
+test_that("ARROW-12729 - length returns number of columns in RecordBatch", {
+ tbl <- tibble::tibble(
+ int = 1:10,
+ fct = factor(rep(c("A", "B"), 5)),
+ fct2 = factor(rep(c("C", "D"), each = 5)),
+ )
+
+ rb <- record_batch(!!!tbl)
+
+ expect_identical(length(rb), 3L)
+})
+
+test_that("RecordBatchReader to C-interface", {
+ skip_if_not_available("dataset")
+
+ tab <- Table$create(example_data)
+
+ # export the RecordBatchReader via the C-interface
+ stream_ptr <- allocate_arrow_array_stream()
+ scan <- Scanner$create(tab)
+ reader <- scan$ToRecordBatchReader()
+ reader$export_to_c(stream_ptr)
+
+ # then import it and check that the roundtripped value is the same
+ circle <- RecordBatchStreamReader$import_from_c(stream_ptr)
+ tab_from_c_new <- circle$read_table()
+ expect_equal(tab, tab_from_c_new)
+
+ # must clean up the pointer or we leak
+ delete_arrow_array_stream(stream_ptr)
+
+ # export the RecordBatchStreamReader via the C-interface
+ stream_ptr_new <- allocate_arrow_array_stream()
+ bytes <- write_to_raw(example_data)
+ expect_type(bytes, "raw")
+ reader_new <- RecordBatchStreamReader$create(bytes)
+ reader_new$export_to_c(stream_ptr_new)
+
+ # then import it and check that the roundtripped value is the same
+ circle_new <- RecordBatchStreamReader$import_from_c(stream_ptr_new)
+ tab_from_c_new <- circle_new$read_table()
+ expect_equal(tab, tab_from_c_new)
+
+ # must clean up the pointer or we leak
+ delete_arrow_array_stream(stream_ptr_new)
+})
+
+test_that("RecordBatch to C-interface", {
+ batch <- RecordBatch$create(example_data)
+
+ # export the RecordBatch via the C-interface
+ schema_ptr <- allocate_arrow_schema()
+ array_ptr <- allocate_arrow_array()
+ batch$export_to_c(array_ptr, schema_ptr)
+
+ # then import it and check that the roundtripped value is the same
+ circle <- RecordBatch$import_from_c(array_ptr, schema_ptr)
+  expect_equal(batch, circle)
+
+ # must clean up the pointers or we leak
+ delete_arrow_schema(schema_ptr)
+ delete_arrow_array(array_ptr)
+})
+
+
+test_that("RecordBatchReader to C-interface to arrow_dplyr_query", {
+ skip_if_not_available("dataset")
+
+ tab <- Table$create(example_data)
+
+ # export the RecordBatchReader via the C-interface
+ stream_ptr <- allocate_arrow_array_stream()
+ scan <- Scanner$create(tab)
+ reader <- scan$ToRecordBatchReader()
+ reader$export_to_c(stream_ptr)
+
+ # then import it and check that the roundtripped value is the same
+ circle <- RecordBatchStreamReader$import_from_c(stream_ptr)
+
+  # create an arrow_dplyr_query from the RecordBatchReader
+ reader_adq <- arrow_dplyr_query(circle)
+
+ tab_from_c_new <- reader_adq %>%
+ dplyr::compute()
+ expect_equal(tab_from_c_new, tab)
+
+ # must clean up the pointer or we leak
+ delete_arrow_array_stream(stream_ptr)
+})
diff --git a/src/arrow/r/tests/testthat/test-Table.R b/src/arrow/r/tests/testthat/test-Table.R
new file mode 100644
index 000000000..44144c00b
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-Table.R
@@ -0,0 +1,549 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("read_table handles various input streams (ARROW-3450, ARROW-3505)", {
+ tbl <- tibble::tibble(
+ int = 1:10, dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10]
+ )
+ tab <- Table$create(!!!tbl)
+
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ expect_deprecated(
+ write_arrow(tab, tf),
+ "write_feather"
+ )
+
+ tab1 <- read_feather(tf, as_data_frame = FALSE)
+ tab2 <- read_feather(normalizePath(tf), as_data_frame = FALSE)
+
+ readable_file <- ReadableFile$create(tf)
+ expect_deprecated(
+ tab3 <- read_arrow(readable_file, as_data_frame = FALSE),
+ "read_feather"
+ )
+ readable_file$close()
+
+ mmap_file <- mmap_open(tf)
+ mmap_file$close()
+
+ expect_equal(tab, tab1)
+ expect_equal(tab, tab2)
+ expect_equal(tab, tab3)
+})
+
+test_that("Table cast (ARROW-3741)", {
+ tab <- Table$create(x = 1:10, y = 1:10)
+
+ expect_error(tab$cast(schema(x = int32())))
+ expect_error(tab$cast(schema(x = int32(), z = int32())))
+
+ s2 <- schema(x = int16(), y = int64())
+ tab2 <- tab$cast(s2)
+ expect_equal(tab2$schema, s2)
+ expect_equal(tab2$column(0L)$type, int16())
+ expect_equal(tab2$column(1L)$type, int64())
+})
+
+test_that("Table S3 methods", {
+ tab <- Table$create(example_data)
+ for (f in c("dim", "nrow", "ncol", "dimnames", "colnames", "row.names", "as.list")) {
+ fun <- get(f)
+ expect_identical(fun(tab), fun(example_data), info = f)
+ }
+})
+
+test_that("Table $column and $field", {
+ tab <- Table$create(x = 1:10, y = 1:10)
+
+ expect_equal(tab$field(0), field("x", int32()))
+
+ # input validation
+ expect_error(tab$column(NA), "'i' cannot be NA")
+ expect_error(tab$column(-1), "subscript out of bounds")
+ expect_error(tab$column(1000), "subscript out of bounds")
+ expect_error(tab$column(1:2))
+ expect_error(tab$column("one"))
+
+ expect_error(tab$field(NA), "'i' cannot be NA")
+ expect_error(tab$field(-1), "subscript out of bounds")
+ expect_error(tab$field(1000), "subscript out of bounds")
+ expect_error(tab$field(1:2))
+ expect_error(tab$field("one"))
+})
+
+test_that("[, [[, $ for Table", {
+ tbl <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10],
+ fct = factor(letters[1:10])
+ )
+ tab <- Table$create(tbl)
+
+ expect_identical(names(tab), names(tbl))
+
+ expect_data_frame(tab[6:7, ], tbl[6:7, ])
+ expect_data_frame(tab[6:7, 2:4], tbl[6:7, 2:4])
+ expect_data_frame(tab[, c("dbl", "fct")], tbl[, c(2, 5)])
+ expect_as_vector(tab[, "chr", drop = TRUE], tbl$chr)
+ # Take within a single chunk
+ expect_data_frame(tab[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4])
+ expect_data_frame(tab[rep(c(FALSE, TRUE), 5), ], tbl[c(2, 4, 6, 8, 10), ])
+ # bool ChunkedArray (with one chunk)
+ expect_data_frame(tab[tab$lgl, ], tbl[tbl$lgl, ])
+ # ChunkedArray with multiple chunks
+ c1 <- c(TRUE, FALSE, TRUE, TRUE, FALSE)
+ c2 <- c(FALSE, FALSE, TRUE, TRUE, FALSE)
+ ca <- ChunkedArray$create(c1, c2)
+ expect_data_frame(tab[ca, ], tbl[c(1, 3, 4, 8, 9), ])
+ # int Array
+ expect_data_frame(tab[Array$create(5:6), 2:4], tbl[6:7, 2:4])
+ # ChunkedArray
+ expect_data_frame(tab[ChunkedArray$create(5L, 6L), 2:4], tbl[6:7, 2:4])
+ # Expression
+ expect_data_frame(tab[tab$int > 6, ], tbl[tbl$int > 6, ])
+
+ expect_as_vector(tab[["int"]], tbl$int)
+ expect_as_vector(tab$int, tbl$int)
+ expect_as_vector(tab[[4]], tbl$chr)
+ expect_null(tab$qwerty)
+ expect_null(tab[["asdf"]])
+ # List-like column slicing
+ expect_data_frame(tab[2:4], tbl[2:4])
+ expect_data_frame(tab[c(2, 1)], tbl[c(2, 1)])
+ expect_data_frame(tab[-3], tbl[-3])
+
+ expect_error(tab[[c(4, 3)]])
+ expect_error(tab[[NA]], "'i' must be character or numeric, not logical")
+ expect_error(tab[[NULL]], "'i' must be character or numeric, not NULL")
+ expect_error(tab[[c("asdf", "jkl;")]], "length(name) not equal to 1", fixed = TRUE)
+ expect_error(tab[-3:3], "Invalid column index")
+ expect_error(tab[1000], "Invalid column index")
+ expect_error(tab[1:1000], "Invalid column index")
+
+ # input validation
+ expect_error(tab[, c("dbl", "NOTACOLUMN")], 'Column not found: "NOTACOLUMN"')
+ expect_error(tab[, c(6, NA)], "Column indices cannot be NA")
+
+ skip("Table with 0 cols doesn't know how many rows it should have")
+ expect_data_frame(tab[0], tbl[0])
+})
+
+test_that("[[<- assignment", {
+ tbl <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10],
+ fct = factor(letters[1:10])
+ )
+ tab <- Table$create(tbl)
+
+ # can remove a column
+ tab[["chr"]] <- NULL
+ expect_data_frame(tab, tbl[-4])
+
+ # can remove a column by index
+ tab[[4]] <- NULL
+ expect_data_frame(tab, tbl[1:3])
+
+ # can add a named column
+ tab[["new"]] <- letters[10:1]
+ expect_data_frame(tab, dplyr::bind_cols(tbl[1:3], new = letters[10:1]))
+
+ # can replace a column by index
+ tab[[2]] <- as.numeric(10:1)
+ expect_as_vector(tab[[2]], as.numeric(10:1))
+
+ # can add a column by index
+ tab[[5]] <- as.numeric(10:1)
+ expect_as_vector(tab[[5]], as.numeric(10:1))
+ expect_as_vector(tab[["5"]], as.numeric(10:1))
+
+ # can replace a column
+ tab[["int"]] <- 10:1
+ expect_as_vector(tab[["int"]], 10:1)
+
+ # can use $
+ tab$new <- NULL
+ expect_null(as.vector(tab$new))
+ expect_identical(dim(tab), c(10L, 4L))
+
+ tab$int <- 1:10
+ expect_as_vector(tab$int, 1:10)
+
+ # recycling
+ tab[["atom"]] <- 1L
+ expect_as_vector(tab[["atom"]], rep(1L, 10))
+
+ expect_error(
+ tab[["atom"]] <- 1:6,
+ "Can't recycle input of size 6 to size 10."
+ )
+
+ # assign Arrow array and chunked_array
+ array <- Array$create(c(10:1))
+ tab$array <- array
+ expect_as_vector(tab$array, 10:1)
+
+ tab$chunked <- chunked_array(1:10)
+ expect_as_vector(tab$chunked, 1:10)
+
+ # nonsense indexes
+ expect_error(tab[[NA]] <- letters[10:1], "'i' must be character or numeric, not logical")
+ expect_error(tab[[NULL]] <- letters[10:1], "'i' must be character or numeric, not NULL")
+ expect_error(tab[[NA_integer_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE)
+ expect_error(tab[[NA_real_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE)
+ expect_error(tab[[NA_character_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE)
+ expect_error(tab[[c(1, 4)]] <- letters[10:1], "length(i) not equal to 1", fixed = TRUE)
+})
+
+test_that("Table$Slice", {
+ tbl <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10],
+ fct = factor(letters[1:10])
+ )
+ tab <- Table$create(tbl)
+ tab2 <- tab$Slice(5)
+ expect_data_frame(tab2, tbl[6:10, ])
+
+ tab3 <- tab$Slice(5, 2)
+ expect_data_frame(tab3, tbl[6:7, ])
+
+ # Input validation
+ expect_error(tab$Slice("ten"))
+ expect_error(tab$Slice(NA_integer_), "Slice 'offset' cannot be NA")
+ expect_error(tab$Slice(NA), "Slice 'offset' cannot be NA")
+ expect_error(tab$Slice(10, "ten"))
+ expect_error(tab$Slice(10, NA_integer_), "Slice 'length' cannot be NA")
+ expect_error(tab$Slice(NA_integer_, NA_integer_), "Slice 'offset' cannot be NA")
+ expect_error(tab$Slice(c(10, 10)))
+ expect_error(tab$Slice(10, c(10, 10)))
+ expect_error(tab$Slice(1000), "Slice 'offset' greater than array length")
+ expect_error(tab$Slice(-1), "Slice 'offset' cannot be negative")
+ expect_error(tab3$Slice(10, 10), "Slice 'offset' greater than array length")
+ expect_error(tab$Slice(10, -1), "Slice 'length' cannot be negative")
+ expect_error(tab$Slice(-1, 10), "Slice 'offset' cannot be negative")
+})
+
+test_that("head and tail on Table", {
+ tbl <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10],
+ fct = factor(letters[1:10])
+ )
+ tab <- Table$create(tbl)
+
+ expect_data_frame(head(tab), head(tbl))
+ expect_data_frame(head(tab, 4), head(tbl, 4))
+ expect_data_frame(head(tab, 40), head(tbl, 40))
+ expect_data_frame(head(tab, -4), head(tbl, -4))
+ expect_data_frame(head(tab, -40), head(tbl, -40))
+ expect_data_frame(tail(tab), tail(tbl))
+ expect_data_frame(tail(tab, 4), tail(tbl, 4))
+ expect_data_frame(tail(tab, 40), tail(tbl, 40))
+ expect_data_frame(tail(tab, -4), tail(tbl, -4))
+ expect_data_frame(tail(tab, -40), tail(tbl, -40))
+})
+
+test_that("Table print method", {
+ expect_output(
+ print(tab),
+ paste(
+ "Table",
+ "10 rows x 5 columns",
+ "$int <int32>",
+ "$dbl <double>",
+ "$lgl <bool>",
+ "$chr <string>",
+ "$fct <dictionary<values=string, indices=int8>>",
+ sep = "\n"
+ ),
+ fixed = TRUE
+ )
+})
+
+test_that("table active bindings", {
+ tbl <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10],
+ fct = factor(letters[1:10])
+ )
+ tab <- Table$create(tbl)
+
+ expect_identical(dim(tbl), dim(tab))
+ expect_type(tab$columns, "list")
+ expect_equal(tab$columns[[1]], tab[[1]])
+})
+
+test_that("Table$create() handles record batches with splicing", {
+ batch <- record_batch(x = 1:2, y = letters[1:2])
+ tab <- Table$create(batch, batch, batch)
+ expect_equal(tab$schema, batch$schema)
+ expect_equal(tab$num_rows, 6L)
+ expect_equal(
+ as.data.frame(tab),
+ vctrs::vec_rbind(as.data.frame(batch), as.data.frame(batch), as.data.frame(batch))
+ )
+
+ batches <- list(batch, batch, batch)
+ tab <- Table$create(!!!batches)
+ expect_equal(tab$schema, batch$schema)
+ expect_equal(tab$num_rows, 6L)
+ expect_equal(
+ as.data.frame(tab),
+ vctrs::vec_rbind(!!!purrr::map(batches, as.data.frame))
+ )
+})
+
+test_that("Table$create() handles ... of arrays, chunked arrays, vectors", {
+ a <- Array$create(1:10)
+ ca <- chunked_array(1:5, 6:10)
+ v <- rnorm(10)
+ tbl <- tibble::tibble(x = 1:10, y = letters[1:10])
+
+ tab <- Table$create(a = a, b = ca, c = v, !!!tbl)
+ expect_equal(
+ tab$schema,
+ schema(a = int32(), b = int32(), c = float64(), x = int32(), y = utf8())
+ )
+ res <- as.data.frame(tab)
+ expect_equal(names(res), c("a", "b", "c", "x", "y"))
+ expect_equal(
+ res,
+ tibble::tibble(a = 1:10, b = 1:10, c = v, x = 1:10, y = letters[1:10])
+ )
+})
+
+test_that("Table$create() auto splices (ARROW-5718)", {
+ df <- tibble::tibble(x = 1:10, y = letters[1:10])
+
+ tab1 <- Table$create(df)
+ tab2 <- Table$create(!!!df)
+ expect_equal(tab1, tab2)
+ expect_equal(tab1$schema, schema(x = int32(), y = utf8()))
+ expect_equal(as.data.frame(tab1), df)
+
+ s <- schema(x = float64(), y = utf8())
+ tab3 <- Table$create(df, schema = s)
+ tab4 <- Table$create(!!!df, schema = s)
+ expect_equal(tab3, tab4)
+ expect_equal(tab3$schema, s)
+ expect_equal(as.data.frame(tab3), df)
+})
+
+test_that("Validation when creating table with schema (ARROW-10953)", {
+ expect_error(
+ Table$create(data.frame(), schema = schema(a = int32())),
+ "incompatible. schema has 1 fields, and 0 columns are supplied",
+ fixed = TRUE
+ )
+ expect_error(
+ Table$create(data.frame(b = 1), schema = schema(a = int32())),
+ "field at index 1 has name 'a' != 'b'",
+ fixed = TRUE
+ )
+ expect_error(
+ Table$create(data.frame(b = 2, c = 3), schema = schema(a = int32())),
+ "incompatible. schema has 1 fields, and 2 columns are supplied",
+ fixed = TRUE
+ )
+})
+
+test_that("==.Table", {
+ tab1 <- Table$create(x = 1:2, y = c("a", "b"))
+ tab2 <- Table$create(x = 1:2, y = c("a", "b"))
+ tab3 <- Table$create(x = 1:2)
+ tab4 <- Table$create(x = 1:2, y = c("a", "b"), z = 3:4)
+
+ expect_true(tab1 == tab2)
+ expect_true(tab2 == tab1)
+
+ expect_false(tab1 == tab3)
+ expect_false(tab3 == tab1)
+
+ expect_false(tab1 == tab4)
+ expect_false(tab4 == tab1)
+
+ expect_true(all.equal(tab1, tab2))
+ expect_equal(tab1, tab2)
+})
+
+test_that("Table$Equals(check_metadata)", {
+ tab1 <- Table$create(x = 1:2, y = c("a", "b"))
+ tab2 <- Table$create(
+ x = 1:2, y = c("a", "b"),
+ schema = tab1$schema$WithMetadata(list(some = "metadata"))
+ )
+
+ expect_r6_class(tab1, "Table")
+ expect_r6_class(tab2, "Table")
+ expect_false(tab1$schema$HasMetadata)
+ expect_true(tab2$schema$HasMetadata)
+ expect_identical(tab2$schema$metadata, list(some = "metadata"))
+
+ expect_true(tab1 == tab2)
+ expect_true(tab1$Equals(tab2))
+ expect_false(tab1$Equals(tab2, check_metadata = TRUE))
+
+ expect_failure(expect_equal(tab1, tab2)) # expect_equal has check_metadata=TRUE
+ expect_equal(tab1, tab2, ignore_attr = TRUE) # this sets check_metadata=FALSE
+
+ expect_false(tab1$Equals(24)) # Not a Table
+})
+
+test_that("Table handles null type (ARROW-7064)", {
+ tab <- Table$create(a = 1:10, n = vctrs::unspecified(10))
+ expect_equal(tab$schema, schema(a = int32(), n = null()), ignore_attr = TRUE)
+})
+
+test_that("Can create table with specific dictionary types", {
+ fact <- example_data[, "fct"]
+ int_types <- c(int8(), int16(), int32(), int64())
+ # TODO: test uint types when format allows
+ # uint_types <- c(uint8(), uint16(), uint32(), uint64()) # nolint
+ for (i in int_types) {
+ sch <- schema(fct = dictionary(i, utf8()))
+ tab <- Table$create(fact, schema = sch)
+ expect_equal(sch, tab$schema)
+ if (i != int64()) {
+ # TODO: same downcast to int32 as we do for int64() type elsewhere
+ expect_identical(as.data.frame(tab), fact)
+ }
+ }
+})
+
+test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", {
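+  # each chunk may carry its own dictionary; conversion back to R unifies them
+  # into one set of factor levels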
+ b1 <- record_batch(f = factor(c("a"), levels = c("a", "b")))
+ b2 <- record_batch(f = factor(c("c"), levels = c("c", "d")))
+ b3 <- record_batch(f = factor(NA, levels = "a"))
+ b4 <- record_batch(f = factor())
+
+ res <- tibble::tibble(f = factor(c("a", "c", NA), levels = c("a", "b", "c", "d")))
+ tab <- Table$create(b1, b2, b3, b4)
+
+ expect_identical(as.data.frame(tab), res)
+})
+
+test_that("Table$SelectColumns()", {
+ tab <- Table$create(x = 1:10, y = 1:10)
+
+ expect_equal(tab$SelectColumns(0L), Table$create(x = 1:10))
+
+ expect_error(tab$SelectColumns(2:4))
+ expect_error(tab$SelectColumns(""))
+})
+
+test_that("Table name assignment", {
+ tab <- Table$create(x = 1:10, y = 1:10)
+ expect_identical(names(tab), c("x", "y"))
+ names(tab) <- c("a", "b")
+ expect_identical(names(tab), c("a", "b"))
+ expect_error(names(tab) <- "f")
+ expect_error(names(tab) <- letters)
+ expect_error(names(tab) <- character(0))
+ expect_error(names(tab) <- NULL)
+ expect_error(names(tab) <- c(TRUE, FALSE))
+})
+
+test_that("Table$create() with different length columns", {
+ msg <- "All columns must have the same length"
+ expect_error(Table$create(a = 1:5, b = 1:6), msg)
+})
+
+test_that("Table$create() scalar recycling with vectors", {
+ expect_data_frame(
+ Table$create(a = 1:10, b = 5),
+ tibble::tibble(a = 1:10, b = 5)
+ )
+})
+
+test_that("Table$create() scalar recycling with Scalars, Arrays, and ChunkedArrays", {
+ expect_data_frame(
+ Table$create(a = Array$create(1:10), b = Scalar$create(5)),
+ tibble::tibble(a = 1:10, b = 5)
+ )
+
+ expect_data_frame(
+ Table$create(a = Array$create(1:10), b = Array$create(5)),
+ tibble::tibble(a = 1:10, b = 5)
+ )
+
+ expect_data_frame(
+ Table$create(a = Array$create(1:10), b = ChunkedArray$create(5)),
+ tibble::tibble(a = 1:10, b = 5)
+ )
+})
+
+test_that("Table$create() no recycling with tibbles", {
+ expect_error(
+ Table$create(
+ tibble::tibble(a = 1:10, b = 5),
+ tibble::tibble(a = 1, b = 5)
+ ),
+ regexp = "All input tibbles or data.frames must have the same number of rows"
+ )
+
+ expect_error(
+ Table$create(
+ tibble::tibble(a = 1:10, b = 5),
+ tibble::tibble(a = 1)
+ ),
+ regexp = "All input tibbles or data.frames must have the same number of rows"
+ )
+})
+
+test_that("ARROW-11769 - grouping preserved in table creation", {
+ skip_if_not_available("dataset")
+
+ tbl <- tibble::tibble(
+ int = 1:10,
+ fct = factor(rep(c("A", "B"), 5)),
+ fct2 = factor(rep(c("C", "D"), each = 5)),
+ )
+
+ expect_identical(
+ tbl %>%
+ dplyr::group_by(fct, fct2) %>%
+ Table$create() %>%
+ dplyr::group_vars(),
+ c("fct", "fct2")
+ )
+})
+
+test_that("ARROW-12729 - length returns number of columns in Table", {
+ tbl <- tibble::tibble(
+ int = 1:10,
+ fct = factor(rep(c("A", "B"), 5)),
+ fct2 = factor(rep(c("C", "D"), each = 5)),
+ )
+
+ tab <- Table$create(!!!tbl)
+
+ expect_identical(length(tab), 3L)
+})
diff --git a/src/arrow/r/tests/testthat/test-altrep.R b/src/arrow/r/tests/testthat/test-altrep.R
new file mode 100644
index 000000000..dff369438
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-altrep.R
@@ -0,0 +1,243 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
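+# These tests exercise arrow's ALTREP vectors, which are only created on R versions newer than 3.5.0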
+skip_if_r_version("3.5.0")
+
+test_that("is_arrow_altrep() does not include base altrep", {
+ expect_false(is_arrow_altrep(1:10))
+})
+
+test_that("altrep vectors from int32 and dbl arrays with no nulls", {
+ withr::local_options(list(arrow.use_altrep = TRUE))
+ v_int <- Array$create(1:1000)
+ v_dbl <- Array$create(as.numeric(1:1000))
+ c_int <- ChunkedArray$create(1:1000)
+ c_dbl <- ChunkedArray$create(as.numeric(1:1000))
+
+ expect_true(is_arrow_altrep(as.vector(v_int)))
+ expect_true(is_arrow_altrep(as.vector(v_int$Slice(1))))
+ expect_true(is_arrow_altrep(as.vector(v_dbl)))
+ expect_true(is_arrow_altrep(as.vector(v_dbl$Slice(1))))
+
+ expect_equal(c_int$num_chunks, 1L)
+ expect_true(is_arrow_altrep(as.vector(c_int)))
+ expect_true(is_arrow_altrep(as.vector(c_int$Slice(1))))
+
+ expect_equal(c_dbl$num_chunks, 1L)
+ expect_true(is_arrow_altrep(as.vector(c_dbl)))
+ expect_true(is_arrow_altrep(as.vector(c_dbl$Slice(1))))
+
+ withr::local_options(list(arrow.use_altrep = NULL))
+ expect_true(is_arrow_altrep(as.vector(v_int)))
+ expect_true(is_arrow_altrep(as.vector(v_int$Slice(1))))
+ expect_true(is_arrow_altrep(as.vector(v_dbl)))
+ expect_true(is_arrow_altrep(as.vector(v_dbl$Slice(1))))
+
+ withr::local_options(list(arrow.use_altrep = FALSE))
+ expect_false(is_arrow_altrep(as.vector(v_int)))
+ expect_false(is_arrow_altrep(as.vector(v_int$Slice(1))))
+ expect_false(is_arrow_altrep(as.vector(v_dbl)))
+ expect_false(is_arrow_altrep(as.vector(v_dbl$Slice(1))))
+})
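+
+# As exercised above, the default (arrow.use_altrep = NULL) behaves like
+# TRUE: altrep vectors are produced unless the option is explicitly FALSE.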
+
+test_that("altrep vectors from int32 and dbl arrays with nulls", {
+ withr::local_options(list(arrow.use_altrep = TRUE))
+ v_int <- Array$create(c(1L, NA, 3L))
+ v_dbl <- Array$create(c(1, NA, 3))
+ c_int <- ChunkedArray$create(c(1L, NA, 3L))
+ c_dbl <- ChunkedArray$create(c(1, NA, 3))
+
+ expect_true(is_arrow_altrep(as.vector(v_int)))
+ expect_true(is_arrow_altrep(as.vector(v_int$Slice(1))))
+ expect_true(is_arrow_altrep(as.vector(v_dbl)))
+ expect_true(is_arrow_altrep(as.vector(v_dbl$Slice(1))))
+ expect_true(is_arrow_altrep(as.vector(c_int)))
+ expect_true(is_arrow_altrep(as.vector(c_int$Slice(1))))
+ expect_true(is_arrow_altrep(as.vector(c_dbl)))
+ expect_true(is_arrow_altrep(as.vector(c_dbl$Slice(1))))
+
+ expect_true(is_arrow_altrep(as.vector(v_int$Slice(2))))
+ expect_true(is_arrow_altrep(as.vector(v_dbl$Slice(2))))
+ expect_true(is_arrow_altrep(as.vector(c_int$Slice(2))))
+ expect_true(is_arrow_altrep(as.vector(c_dbl$Slice(2))))
+
+ # chunked array with 2 chunks cannot be altrep
+ c_int <- ChunkedArray$create(0L, c(1L, NA, 3L))
+ c_dbl <- ChunkedArray$create(0, c(1, NA, 3))
+ expect_equal(c_int$num_chunks, 2L)
+ expect_equal(c_dbl$num_chunks, 2L)
+
+ expect_false(is_arrow_altrep(as.vector(c_int)))
+ expect_false(is_arrow_altrep(as.vector(c_dbl)))
+ expect_true(is_arrow_altrep(as.vector(c_int$Slice(3))))
+ expect_true(is_arrow_altrep(as.vector(c_dbl$Slice(3))))
+})
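+
+# (Slice(3) above lands entirely within the second chunk, so the result is
+# again backed by a single contiguous Array and can be altrep.)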
+
+test_that("empty vectors are not altrep", {
+ withr::local_options(list(arrow.use_altrep = TRUE))
+ v_int <- Array$create(integer())
+ v_dbl <- Array$create(numeric())
+
+ expect_false(is_arrow_altrep(as.vector(v_int)))
+ expect_false(is_arrow_altrep(as.vector(v_dbl)))
+})
+
+test_that("as.data.frame(<Table>, <RecordBatch>) can create altrep vectors", {
+ withr::local_options(list(arrow.use_altrep = TRUE))
+
+ table <- Table$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3), str = c("un", "deux", "trois"))
+ df_table <- as.data.frame(table)
+ expect_true(is_arrow_altrep(df_table$int))
+ expect_true(is_arrow_altrep(df_table$dbl))
+ expect_true(is_arrow_altrep(df_table$str))
+
+ batch <- RecordBatch$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3), str = c("un", "deux", "trois"))
+ df_batch <- as.data.frame(batch)
+ expect_true(is_arrow_altrep(df_batch$int))
+ expect_true(is_arrow_altrep(df_batch$dbl))
+ expect_true(is_arrow_altrep(df_batch$str))
+})
+
+expect_altrep_rountrip <- function(x, fn, ...) {
+ alt <- Array$create(x)$as_vector()
+
+ expect_true(is_arrow_altrep(alt))
+ expect_identical(fn(x, ...), fn(alt, ...))
+ expect_true(is_arrow_altrep(alt))
+}
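+
+# The helper above calls is_arrow_altrep() both before and after fn(): the
+# result must match base R's, and the second check asserts that computing
+# fn(alt) did not force the altrep vector to materialize.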
+
+test_that("altrep min/max/sum identical to R versions for double", {
+ x <- c(1, 2, 3)
+ expect_altrep_rountrip(x, min, na.rm = TRUE)
+ expect_altrep_rountrip(x, max, na.rm = TRUE)
+ expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+ expect_altrep_rountrip(x, min)
+ expect_altrep_rountrip(x, max)
+ expect_altrep_rountrip(x, sum)
+
+ x <- c(1, 2, NA_real_)
+ expect_altrep_rountrip(x, min, na.rm = TRUE)
+ expect_altrep_rountrip(x, max, na.rm = TRUE)
+ expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+ expect_altrep_rountrip(x, min)
+ expect_altrep_rountrip(x, max)
+ expect_altrep_rountrip(x, sum)
+
+ x <- rep(NA_real_, 3)
+ expect_warning(
+ expect_altrep_rountrip(x, min, na.rm = TRUE),
+ "no non-missing arguments to min"
+ )
+ expect_warning(
+ expect_altrep_rountrip(x, max, na.rm = TRUE),
+ "no non-missing arguments to max"
+ )
+ expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+ expect_altrep_rountrip(x, min)
+ expect_altrep_rountrip(x, max)
+ expect_altrep_rountrip(x, sum)
+})
+
+test_that("altrep min/max/sum identical to R versions for int", {
+ x <- c(1L, 2L, 3L)
+ expect_altrep_rountrip(x, min, na.rm = TRUE)
+ expect_altrep_rountrip(x, max, na.rm = TRUE)
+ expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+ expect_altrep_rountrip(x, min)
+ expect_altrep_rountrip(x, max)
+ expect_altrep_rountrip(x, sum)
+
+ x <- c(1L, 2L, NA_integer_)
+ expect_altrep_rountrip(x, min, na.rm = TRUE)
+ expect_altrep_rountrip(x, max, na.rm = TRUE)
+ expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+ expect_altrep_rountrip(x, min)
+ expect_altrep_rountrip(x, max)
+ expect_altrep_rountrip(x, sum)
+
+ x <- rep(NA_integer_, 3)
+ expect_warning(
+ expect_altrep_rountrip(x, min, na.rm = TRUE),
+ "no non-missing arguments to min"
+ )
+ expect_warning(
+ expect_altrep_rountrip(x, max, na.rm = TRUE),
+ "no non-missing arguments to max"
+ )
+ expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+ expect_altrep_rountrip(x, min)
+ expect_altrep_rountrip(x, max)
+ expect_altrep_rountrip(x, sum)
+
+ # sum(x) is INT_MIN -> convert to double.
+ x <- as.integer(c(-2^31 + 1L, -1L))
+ expect_altrep_rountrip(x, sum)
+})
+
+test_that("altrep vectors handle serialization", {
+ ints <- c(1L, 2L, NA_integer_)
+ dbls <- c(1, 2, NA_real_)
+ strs <- c("un", "deux", NA_character_)
+
+ expect_identical(ints, unserialize(serialize(Array$create(ints)$as_vector(), NULL)))
+ expect_identical(dbls, unserialize(serialize(Array$create(dbls)$as_vector(), NULL)))
+ expect_identical(strs, unserialize(serialize(Array$create(strs)$as_vector(), NULL)))
+ expect_identical(strs, unserialize(serialize(Array$create(strs, large_utf8())$as_vector(), NULL)))
+})
+
+test_that("altrep vectors handle coercion", {
+ ints <- c(1L, 2L, NA_integer_)
+ dbls <- c(1, 2, NA_real_)
+ strs <- c("1", "2", NA_character_)
+
+ expect_identical(ints, as.integer(Array$create(dbls)$as_vector()))
+ expect_identical(ints, as.integer(Array$create(strs)$as_vector()))
+
+ expect_identical(dbls, as.numeric(Array$create(ints)$as_vector()))
+ expect_identical(dbls, as.numeric(Array$create(strs)$as_vector()))
+
+ expect_identical(strs, as.character(Array$create(ints)$as_vector()))
+ expect_identical(strs, as.character(Array$create(dbls)$as_vector()))
+})
+
+test_that("columns of struct types may be altrep", {
+ st <- Array$create(data.frame(x = 1:10, y = runif(10)))
+ df <- st$as_vector()
+
+ expect_true(is_arrow_altrep(df$x))
+ expect_true(is_arrow_altrep(df$y))
+})
+
+test_that("Conversion from altrep R vector to Array uses the existing Array", {
+ a_int <- Array$create(c(1L, 2L, 3L))
+ b_int <- Array$create(a_int$as_vector())
+ expect_true(test_same_Array(a_int$pointer(), b_int$pointer()))
+
+ a_dbl <- Array$create(c(1, 2, 3))
+ b_dbl <- Array$create(a_dbl$as_vector())
+ expect_true(test_same_Array(a_dbl$pointer(), b_dbl$pointer()))
+
+ a_str <- Array$create(c("un", "deux", "trois"))
+ b_str <- Array$create(a_str$as_vector())
+ expect_true(test_same_Array(a_str$pointer(), b_str$pointer()))
+})
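+
+# A rough sketch of the zero-copy round trip exercised above:
+#   a <- Array$create(1:3)
+#   v <- a$as_vector()   # altrep vector backed by `a`
+#   b <- Array$create(v) # reuses the backing Array instead of re-converting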
diff --git a/src/arrow/r/tests/testthat/test-array-data.R b/src/arrow/r/tests/testthat/test-array-data.R
new file mode 100644
index 000000000..05d070d8a
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-array-data.R
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("string vectors with only empty strings and nulls don't allocate a data buffer (ARROW-3693)", {
+ a <- Array$create("")
+ expect_equal(a$length(), 1L)
+
+ buffers <- a$data()$buffers
+
+ # No nulls
+ expect_equal(buffers[[1]], NULL)
+
+ # Offsets has 2 elements
+ expect_equal(buffers[[2]]$size, 8L)
+
+ # As per ARROW-2744, values buffer should preferably be non-null.
+ expect_equal(buffers[[3]]$size, 0L)
+ expect_equal(buffers[[3]]$capacity, 0L)
+})
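+
+# For reference, a utf8 Array has three buffers, in order: the validity
+# bitmap (NULL here because there are no nulls), int32 offsets (two offsets
+# for one string = 8 bytes), and the character data itself (empty for "").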
diff --git a/src/arrow/r/tests/testthat/test-arrow-info.R b/src/arrow/r/tests/testthat/test-arrow-info.R
new file mode 100644
index 000000000..9eac60814
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-arrow-info.R
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("arrow_info()", {
+ expect_s3_class(arrow_info(), "arrow_info")
+ expect_output(print(arrow_info()), "Arrow package version")
+ options(arrow.foo = FALSE)
+ expect_output(print(arrow_info()), "arrow.foo")
+})
diff --git a/src/arrow/r/tests/testthat/test-arrow.R b/src/arrow/r/tests/testthat/test-arrow.R
new file mode 100644
index 000000000..48970ab89
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-arrow.R
@@ -0,0 +1,78 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+if (!identical(tolower(Sys.getenv("TEST_R_WITHOUT_LIBARROW")), "true")) {
+ testthat::test_that("Arrow C++ is available", {
+ skip_on_cran()
+ expect_true(arrow_available())
+ })
+}
+
+test_that("Can't $new() an object with anything other than a pointer", {
+ expect_error(
+ Array$new(1:5),
+ "Array$new() requires a pointer as input: did you mean $create() instead?",
+ fixed = TRUE
+ )
+})
+
+r_only({
+ test_that("assert_is", {
+ x <- 42
+ expect_true(assert_is(x, "numeric"))
+ expect_true(assert_is(x, c("numeric", "character")))
+ expect_error(assert_is(x, "factor"), 'x must be a "factor"')
+ expect_error(
+ assert_is(x, c("factor", "list")),
+ 'x must be a "factor" or "list"'
+ )
+ expect_error(
+ assert_is(x, c("factor", "character", "list")),
+ 'x must be a "factor", "character", or "list"'
+ )
+ })
+})
+
+test_that("arrow gracefully fails to load objects from other sessions (ARROW-10071)", {
+ a <- Array$create(1:10)
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ saveRDS(a, tf)
+
+ b <- readRDS(tf)
+ expect_error(b$length(), "Invalid <Array>")
+})
+
+test_that("check for an ArrowObject in functions use std::shared_ptr", {
+ expect_error(Array__length(1), "Invalid R object")
+})
+
+test_that("MemoryPool calls gc() to free memory when allocation fails (ARROW-10080)", {
+ # Valgrind reports an error on this test because the requested memory cannot
+ # be allocated, which is exactly what this test is checking, so we quiet it
+ skip_on_valgrind()
+
+ env <- new.env()
+ suppressMessages(trace(gc, print = FALSE, tracer = function() {
+ env$gc_was_called <- TRUE
+ }))
+ on.exit(suppressMessages(untrace(gc)))
+ # We expect this should fail because we don't have this much memory,
+ # but it should gc() and retry (and fail again)
+ expect_error(BufferOutputStream$create(2**60))
+ expect_true(env$gc_was_called)
+})
diff --git a/src/arrow/r/tests/testthat/test-backwards-compatibility.R b/src/arrow/r/tests/testthat/test-backwards-compatibility.R
new file mode 100644
index 000000000..32e86d5f6
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-backwards-compatibility.R
@@ -0,0 +1,121 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# nolint start
+# To write a new version of a test file for a current version:
+# write_parquet(example_with_metadata, test_path("golden-files/data-arrow_2.0.0.parquet"))
+
+# To write a new version of a test file for an old version, use docker(-compose)
+# to set up a Linux distribution and use RStudio's public package manager binary
+# repo to install the old version. The following commands should be run at the
+# root of the arrow repo directory and might need slight adjustments.
+# R_ORG=rstudio R_IMAGE=r-base R_TAG=4.0-focal docker-compose build --no-cache r
+# R_ORG=rstudio R_IMAGE=r-base R_TAG=4.0-focal docker-compose run r /bin/bash
+# R
+# options(repos = "https://packagemanager.rstudio.com/all/__linux__/focal/latest")
+# remotes::install_version("arrow", version = "1.0.1")
+# # get example data into the global env
+# write_parquet(example_with_metadata, "arrow/r/tests/testthat/golden-files/data-arrow_1.0.1.parquet")
+# quit()/exit
+# nolint end
+
+skip_if(getRversion() < "3.5.0", "The serialization format changed in 3.5")
+
+expect_identical_with_metadata <- function(object, expected, ..., top_level = TRUE) {
+ attrs_to_keep <- c("names", "class", "row.names")
+ if (!top_level) {
+ # remove not-tbl and not-data.frame attributes
+ for (attribute in names(attributes(expected))) {
+ if (attribute %in% attrs_to_keep) next
+ attributes(expected)[[attribute]] <- NULL
+ }
+ }
+ expect_identical(object, expected, ...)
+}
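+
+# Example (a sketch): expect_identical_with_metadata(df, expected, top_level = FALSE)
+# compares df to expected after stripping all attributes except names, class,
+# and row.names, for files written by versions that didn't save top-level
+# metadata.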
+
+test_that("reading a known Parquet file to dataframe with 3.0.0", {
+ skip_if_not_available("parquet")
+ skip_if_not_available("snappy")
+ pq_file <- test_path("golden-files/data-arrow-extra-meta_3.0.0.parquet")
+
+ df <- read_parquet(pq_file)
+ # this is equivalent to `expect_identical()`
+ expect_identical_with_metadata(df, example_with_extra_metadata)
+})
+
+test_that("reading a known Parquet file to dataframe with 2.0.0", {
+ skip_if_not_available("parquet")
+ skip_if_not_available("snappy")
+ pq_file <- test_path("golden-files/data-arrow_2.0.0.parquet")
+
+ df <- read_parquet(pq_file)
+ # this is equivalent to `expect_identical()`
+ expect_identical_with_metadata(df, example_with_metadata)
+})
+
+test_that("reading a known Parquet file to dataframe with 1.0.1", {
+ skip_if_not_available("parquet")
+ skip_if_not_available("snappy")
+ pq_file <- test_path("golden-files/data-arrow_1.0.1.parquet")
+
+ df <- read_parquet(pq_file)
+ # 1.0.1 didn't save top-level metadata, so we need to remove it.
+ expect_identical_with_metadata(df, example_with_metadata, top_level = FALSE)
+})
+
+for (comp in c("lz4", "uncompressed", "zstd")) {
+ # nolint start
+ # write_feather(example_with_metadata, test_path("golden-files/data-arrow_2.0.0_lz4.feather"), compression = "lz4")
+ # write_feather(example_with_metadata, test_path("golden-files/data-arrow_2.0.0_uncompressed.feather"), compression = "uncompressed")
+ # write_feather(example_with_metadata, test_path("golden-files/data-arrow_2.0.0_zstd.feather"), compression = "zstd")
+ # nolint end
+ test_that("reading a known Feather file to dataframe with 2.0.0", {
+ skip_if_not_available("parquet")
+ skip_if_not_available(comp)
+ feather_file <- test_path(paste0("golden-files/data-arrow_2.0.0_", comp, ".feather"))
+
+ df <- read_feather(feather_file)
+ expect_identical_with_metadata(df, example_with_metadata)
+ })
+
+ test_that("reading a known Feather file to dataframe with 1.0.1", {
+ skip_if_not_available("parquet")
+ skip_if_not_available(comp)
+ feather_file <- test_path(paste0("golden-files/data-arrow_1.0.1_", comp, ".feather"))
+
+ df <- read_feather(feather_file)
+ # 1.0.1 didn't save top-level metadata, so we need to remove it.
+ expect_identical_with_metadata(df, example_with_metadata, top_level = FALSE)
+ })
+
+ test_that("reading a known Feather file to dataframe with 0.17.0", {
+ skip_if_not_available("parquet")
+ skip_if_not_available(comp)
+ feather_file <- test_path(paste0("golden-files/data-arrow_0.17.0_", comp, ".feather"))
+
+ df <- read_feather(feather_file)
+ # The 0.17.0 metadata lacks the top level, the special class is not
+ # maintained, and the embedded tibble's attributes are read back in the
+ # wrong order. Since this predates 1.0.0 we punt on checking the attributes;
+ # classes are always checked, though, so the special class must be removed
+ # before comparing.
+ example_with_metadata_sans_special_class <- example_with_metadata
+ example_with_metadata_sans_special_class$a <- unclass(example_with_metadata_sans_special_class$a)
+ expect_equal(df, example_with_metadata_sans_special_class, ignore_attr = TRUE)
+ })
+}
+
+# TODO: streams(?)
diff --git a/src/arrow/r/tests/testthat/test-buffer-reader.R b/src/arrow/r/tests/testthat/test-buffer-reader.R
new file mode 100644
index 000000000..b790ed0da
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-buffer-reader.R
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("BufferReader can be created from R objects", {
+ num <- BufferReader$create(numeric(13))
+ int <- BufferReader$create(integer(13))
+ raw <- BufferReader$create(raw(16))
+
+ expect_r6_class(num, "BufferReader")
+ expect_r6_class(int, "BufferReader")
+ expect_r6_class(raw, "BufferReader")
+
+ expect_equal(num$GetSize(), 13 * 8)
+ expect_equal(int$GetSize(), 13 * 4)
+ expect_equal(raw$GetSize(), 16)
+})
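+
+# (GetSize() is in bytes: R doubles take 8 bytes each and integers 4, hence
+# 13 * 8 and 13 * 4 above.)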
+
+test_that("BufferReader can be created from Buffer", {
+ buf <- buffer(raw(76))
+ reader <- BufferReader$create(buf)
+
+ expect_r6_class(reader, "BufferReader")
+ expect_equal(reader$GetSize(), 76)
+})
diff --git a/src/arrow/r/tests/testthat/test-buffer.R b/src/arrow/r/tests/testthat/test-buffer.R
new file mode 100644
index 000000000..9b3ebc6de
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-buffer.R
@@ -0,0 +1,97 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("Buffer can be created from raw vector", {
+ vec <- raw(123)
+ buf <- buffer(vec)
+ expect_r6_class(buf, "Buffer")
+ expect_equal(buf$size, 123)
+})
+
+test_that("Buffer can be created from integer vector", {
+ vec <- integer(17)
+ buf <- buffer(vec)
+ expect_r6_class(buf, "Buffer")
+ expect_equal(buf$size, 17 * 4)
+})
+
+test_that("Buffer can be created from numeric vector", {
+ vec <- numeric(17)
+ buf <- buffer(vec)
+ expect_r6_class(buf, "Buffer")
+ expect_equal(buf$size, 17 * 8)
+})
+
+test_that("Buffer can be created from complex vector", {
+ vec <- complex(3)
+ buf <- buffer(vec)
+ expect_r6_class(buf, "Buffer")
+ expect_equal(buf$size, 3 * 16)
+})
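+
+# (Buffer sizes are in bytes: 4 per integer, 8 per double, and 16 per
+# complex, a complex value being a pair of doubles.)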
+
+test_that("buffer buffer buffers buffers", {
+ expect_r6_class(buffer(buffer(42)), "Buffer")
+})
+
+test_that("Other types can't be converted to Buffers", {
+ expect_error(
+ buffer(data.frame(a = "asdf")),
+ "Cannot convert object of class data.frame to arrow::Buffer"
+ )
+})
+
+test_that("can convert Buffer to raw", {
+ buf <- buffer(rnorm(10))
+ expect_equal(buf$data(), as.raw(buf))
+})
+
+test_that("can read remaining bytes of a RandomAccessFile", {
+ tbl <- tibble::tibble(
+ int = 1:10, dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10]
+ )
+ tab <- Table$create(!!!tbl)
+
+ tf <- tempfile()
+ all_bytes <- write_feather(tab, tf)
+
+ file <- ReadableFile$create(tf)
+ expect_equal(file$tell(), 0)
+ x <- file$Read(20)$data()
+ expect_equal(file$tell(), 20)
+ y <- file$Read()$data()
+
+ file <- ReadableFile$create(tf)
+ z <- file$Read()$data()
+
+ file <- ReadableFile$create(tf)
+ a <- file$ReadAt(20)$data()
+
+ expect_equal(file$GetSize(), length(x) + length(y))
+ expect_equal(z, c(x, y))
+ expect_equal(a, y)
+})
+
+test_that("Buffer$Equals", {
+ vec <- integer(17)
+ buf1 <- buffer(vec)
+ buf2 <- buffer(vec)
+ expect_equal(buf1, buf2)
+ expect_true(buf1$Equals(buf2))
+ expect_false(buf1$Equals(vec))
+})
diff --git a/src/arrow/r/tests/testthat/test-chunked-array.R b/src/arrow/r/tests/testthat/test-chunked-array.R
new file mode 100644
index 000000000..c931ddec5
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-chunked-array.R
@@ -0,0 +1,468 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+expect_chunked_roundtrip <- function(x, type) {
+ a <- ChunkedArray$create(!!!x)
+ flat_x <- unlist(x, recursive = FALSE)
+ attributes(flat_x) <- attributes(x[[1]])
+ expect_type_equal(a$type, type)
+ expect_identical(a$num_chunks, length(x))
+ expect_identical(length(a), length(flat_x))
+ if (!inherits(type, "ListType")) {
+ # TODO: revisit how missingness works with ListArrays
+ # R list objects don't handle missingness the same way as other vectors.
+ # Is there some vctrs thing we should do on the roundtrip back to R?
+ expect_identical(as.vector(is.na(a)), is.na(flat_x))
+ }
+ expect_as_vector(a, flat_x)
+ expect_as_vector(a$chunk(0), x[[1]])
+
+ if (length(flat_x)) {
+ a_sliced <- a$Slice(1)
+ x_sliced <- flat_x[-1]
+ expect_type_equal(a_sliced$type, type)
+ expect_identical(length(a_sliced), length(x_sliced))
+ if (!inherits(type, "ListType")) {
+ expect_identical(as.vector(is.na(a_sliced)), is.na(x_sliced))
+ }
+ expect_as_vector(a_sliced, x_sliced)
+ }
+ invisible(a)
+}
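+
+# Example usage (a sketch with two chunks):
+#   expect_chunked_roundtrip(list(1:10, 1:5), int32())
+# builds the ChunkedArray via !!! splicing, then checks type, num_chunks,
+# length, missingness, and conversion back to R for both the full array and
+# its Slice(1).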
+
+test_that("ChunkedArray", {
+ x <- expect_chunked_roundtrip(list(1:10, 1:10, 1:5), int32())
+
+ y <- x$Slice(8)
+ expect_equal(y$type, int32())
+ expect_equal(y$num_chunks, 3L)
+ expect_equal(length(y), 17L)
+ expect_as_vector(y, c(9:10, 1:10, 1:5))
+
+ z <- x$Slice(8, 5)
+ expect_equal(z$type, int32())
+ expect_equal(z$num_chunks, 2L)
+ expect_equal(z$length(), 5L)
+ expect_equal(z$as_vector(), c(9:10, 1:3))
+
+ expect_chunked_roundtrip(list(c(1, 2, 3), c(4, 5, 6)), float64())
+
+ # input validation
+ expect_error(x$chunk(14), "subscript out of bounds")
+ expect_error(x$chunk("one"))
+ expect_error(x$chunk(NA_integer_), "'i' cannot be NA")
+ expect_error(x$chunk(-1), "subscript out of bounds")
+
+ expect_error(x$Slice("ten"))
+ expect_error(x$Slice(NA_integer_), "Slice 'offset' cannot be NA")
+ expect_error(x$Slice(NA), "Slice 'offset' cannot be NA")
+ expect_error(x$Slice(10, "ten"))
+ expect_error(x$Slice(10, NA_integer_), "Slice 'length' cannot be NA")
+ expect_error(x$Slice(NA_integer_, NA_integer_), "Slice 'offset' cannot be NA")
+ expect_error(x$Slice(c(10, 10)))
+ expect_error(x$Slice(10, c(10, 10)))
+ expect_error(x$Slice(1000), "Slice 'offset' greater than array length")
+ expect_error(x$Slice(-1), "Slice 'offset' cannot be negative")
+ expect_error(z$Slice(10, 10), "Slice 'offset' greater than array length")
+ expect_error(x$Slice(10, -1), "Slice 'length' cannot be negative")
+ expect_error(x$Slice(-1, 10), "Slice 'offset' cannot be negative")
+
+ expect_warning(x$Slice(10, 15), NA)
+ expect_warning(
+ overslice <- x$Slice(10, 16),
+ "Slice 'length' greater than available length"
+ )
+ expect_equal(length(overslice), 15)
+ expect_warning(z$Slice(2, 10), "Slice 'length' greater than available length")
+})
+
+test_that("print ChunkedArray", {
+ verify_output(test_path("test-chunked-array.txt"), {
+ chunked_array(c(1, 2, 3), c(4, 5, 6))
+ chunked_array(1:30, c(4, 5, 6))
+ chunked_array(1:30)
+ chunked_array(factor(c("a", "b")), factor(c("c", "d")))
+ })
+})
+
+test_that("ChunkedArray handles !!! splicing", {
+ data <- list(1, 2, 3)
+ x <- chunked_array(!!!data)
+ expect_equal(x$type, float64())
+ expect_equal(x$num_chunks, 3L)
+})
+
+test_that("ChunkedArray handles Inf", {
+ data <- list(c(Inf, 2:10), c(1:3, Inf, 5L), 1:10)
+ x <- chunked_array(!!!data)
+ expect_equal(x$type, float64())
+ expect_equal(x$num_chunks, 3L)
+ expect_equal(length(x), 25L)
+ expect_as_vector(x, c(c(Inf, 2:10), c(1:3, Inf, 5), 1:10))
+
+ chunks <- x$chunks
+ expect_as_vector(is.infinite(chunks[[2]]), is.infinite(data[[2]]))
+ expect_equal(
+ as.vector(is.infinite(x)),
+ c(is.infinite(data[[1]]), is.infinite(data[[2]]), is.infinite(data[[3]]))
+ )
+})
+
+test_that("ChunkedArray handles NA", {
+ data <- list(1:10, c(NA, 2:10), c(1:3, NA, 5L))
+ x <- chunked_array(!!!data)
+ expect_equal(x$type, int32())
+ expect_equal(x$num_chunks, 3L)
+ expect_equal(length(x), 25L)
+ expect_as_vector(x, c(1:10, c(NA, 2:10), c(1:3, NA, 5)))
+
+ chunks <- x$chunks
+ expect_as_vector(is.na(chunks[[2]]), is.na(data[[2]]))
+ expect_as_vector(is.na(x), c(is.na(data[[1]]), is.na(data[[2]]), is.na(data[[3]])))
+})
+
+test_that("ChunkedArray handles NaN", {
+ data <- list(as.numeric(1:10), c(NaN, 2:10), c(1:3, NaN, 5L))
+ x <- chunked_array(!!!data)
+
+ expect_equal(x$type, float64())
+ expect_equal(x$num_chunks, 3L)
+ expect_equal(length(x), 25L)
+ expect_as_vector(x, c(1:10, c(NaN, 2:10), c(1:3, NaN, 5)))
+
+ chunks <- x$chunks
+ expect_as_vector(is.nan(chunks[[2]]), is.nan(data[[2]]))
+ expect_as_vector(is.nan(x), c(is.nan(data[[1]]), is.nan(data[[2]]), is.nan(data[[3]])))
+})
+
+test_that("ChunkedArray supports logical vectors (ARROW-3341)", {
+ # with NA
+ data <- purrr::rerun(3, sample(c(TRUE, FALSE, NA), 100, replace = TRUE))
+ expect_chunked_roundtrip(data, bool())
+ # without NA
+ data <- purrr::rerun(3, sample(c(TRUE, FALSE), 100, replace = TRUE))
+ expect_chunked_roundtrip(data, bool())
+})
+
+test_that("ChunkedArray supports character vectors (ARROW-3339)", {
+ data <- list(
+ c("itsy", NA, "spider"),
+ c("Climbed", "up", "the", "water", "spout"),
+ c("Down", "came", "the", "rain"),
+ "And washed the spider out. "
+ )
+ expect_chunked_roundtrip(data, utf8())
+})
+
+test_that("ChunkedArray supports factors (ARROW-3716)", {
+ f <- factor(c("itsy", "bitsy", "spider", "spider"))
+ expect_chunked_roundtrip(list(f, f, f), dictionary(int8()))
+})
+
+test_that("ChunkedArray supports dates (ARROW-3716)", {
+ d <- Sys.Date() + 1:10
+ expect_chunked_roundtrip(list(d, d), date32())
+})
+
+test_that("ChunkedArray supports POSIXct (ARROW-3716)", {
+ times <- lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10
+ expect_chunked_roundtrip(list(times, times), timestamp("us", "UTC"))
+})
+
+test_that("ChunkedArray supports integer64 (ARROW-3716)", {
+ x <- bit64::as.integer64(1:10) + MAX_INT
+ expect_chunked_roundtrip(list(x, x), int64())
+ # Also with a first chunk that would downcast
+ zero <- Array$create(0L)$cast(int64())
+ expect_type_equal(zero, int64())
+ ca <- ChunkedArray$create(zero, x)
+ expect_type_equal(ca, int64())
+ expect_s3_class(as.vector(ca), "integer64")
+ expect_identical(as.vector(ca), c(bit64::as.integer64(0L), x))
+})
+
+test_that("ChunkedArray supports difftime", {
+ time <- hms::hms(56, 34, 12)
+ expect_chunked_roundtrip(list(time, time), time32("s"))
+})
+
+test_that("ChunkedArray supports empty arrays (ARROW-13761)", {
+ types <- c(
+ int8(), int16(), int32(), int64(), uint8(), uint16(), uint32(),
+ uint64(), float32(), float64(), timestamp("ns"), binary(),
+ large_binary(), fixed_size_binary(32), date32(), date64(),
+ decimal(4, 2), dictionary(), struct(x = int32())
+ )
+
+ empty_filter <- ChunkedArray$create(type = bool())
+ for (type in types) {
+ one_empty_chunk <- ChunkedArray$create(type = type)
+ expect_type_equal(one_empty_chunk$type, type)
+ if (type != struct(x = int32())) {
+ expect_identical(length(one_empty_chunk), length(as.vector(one_empty_chunk)))
+ } else {
+ # struct -> tbl and length(tbl) is num_columns instead of num_rows
+ expect_identical(length(as.vector(one_empty_chunk)), 1L)
+ }
+ zero_empty_chunks <- one_empty_chunk$Filter(empty_filter)
+ expect_equal(zero_empty_chunks$num_chunks, 0)
+ expect_type_equal(zero_empty_chunks$type, type)
+ if (type != struct(x = int32())) {
+ expect_identical(length(zero_empty_chunks), length(as.vector(zero_empty_chunks)))
+ } else {
+ expect_identical(length(as.vector(zero_empty_chunks)), 1L)
+ }
+ }
+})
+
+test_that("integer types casts for ChunkedArray (ARROW-3741)", {
+ int_types <- c(int8(), int16(), int32(), int64())
+ uint_types <- c(uint8(), uint16(), uint32(), uint64())
+ float_types <- c(float32(), float64()) # float16() not really supported in C++ yet
+ all_types <- c(
+ int_types,
+ uint_types,
+ float_types
+ )
+
+ a <- chunked_array(1:10, 1:10)
+ for (type in c(int_types, uint_types)) {
+ casted <- a$cast(type)
+ expect_r6_class(casted, "ChunkedArray")
+ expect_type_equal(casted$type, type)
+ }
+ # Also test casting with double(), which is not an Arrow type but a base R function; it should act as an alias for float64
+ dbl <- a$cast(double())
+ expect_r6_class(dbl, "ChunkedArray")
+ expect_type_equal(dbl$type, float64())
+})
+
+test_that("chunked_array() supports the type= argument. conversion from INTSXP and int64 to all int types", {
+ num_int32 <- 12L
+ num_int64 <- bit64::as.integer64(10)
+ for (type in all_types) {
+ expect_type_equal(chunked_array(num_int32, type = type)$type, type)
+ expect_type_equal(chunked_array(num_int64, type = type)$type, type)
+ }
+ # also test creating with double() "type"
+ expect_type_equal(chunked_array(num_int32, type = double())$type, float64())
+})
+
+test_that("ChunkedArray$create() aborts on overflow", {
+ expect_error(chunked_array(128L, type = int8())$type)
+ expect_error(chunked_array(-129L, type = int8())$type)
+
+ expect_error(chunked_array(256L, type = uint8())$type)
+ expect_error(chunked_array(-1L, type = uint8())$type)
+
+ expect_error(chunked_array(32768L, type = int16())$type)
+ expect_error(chunked_array(-32769L, type = int16())$type)
+
+ expect_error(chunked_array(65536L, type = uint16())$type)
+ expect_error(chunked_array(-1L, type = uint16())$type)
+
+ expect_error(chunked_array(bit64::as.integer64(2^31), type = int32()))
+ expect_error(chunked_array(bit64::as.integer64(2^32), type = uint32()))
+})
+
+test_that("chunked_array() convert doubles to integers", {
+ for (type in c(int_types, uint_types)) {
+ a <- chunked_array(10, type = type)
+ expect_type_equal(a$type, type)
+ if (type != uint64()) {
+ # exception for unsigned 64-bit integers,
+ # which we cannot handle yet
+ expect_true(as.vector(a) == 10)
+ }
+ }
+})
+
+test_that("chunked_array() uses the first ... to infer type", {
+ a <- chunked_array(10, 10L)
+ expect_type_equal(a$type, float64())
+})
+
+test_that("chunked_array() handles downcasting", {
+ a <- chunked_array(10L, 10)
+ expect_type_equal(a$type, int32())
+ expect_as_vector(a, c(10L, 10L))
+})
+
+test_that("chunked_array() makes chunks of the same type", {
+ a <- chunked_array(10L, bit64::as.integer64(13), type = int64())
+ for (chunk in a$chunks) {
+ expect_type_equal(chunk$type, int64())
+ }
+})
+
+test_that("chunked_array() handles 0 chunks if given a type", {
+ for (type in all_types) {
+ a <- chunked_array(type = type)
+ expect_type_equal(a$type, as_type(type))
+ expect_equal(length(a), 0L)
+ }
+})
+
+test_that("chunked_array() can ingest arrays (ARROW-3815)", {
+ expect_equal(
+ as.vector(chunked_array(1:5, Array$create(6:10))),
+ 1:10
+ )
+})
+
+test_that("chunked_array() handles data frame -> struct arrays (ARROW-3811)", {
+ df <- tibble::tibble(x = 1:10, y = x / 2, z = letters[1:10])
+ a <- chunked_array(df, df)
+ expect_type_equal(a$type, struct(x = int32(), y = float64(), z = utf8()))
+ expect_equal(a$as_vector(), rbind(df, df), ignore_attr = TRUE)
+})
+
+test_that("ChunkedArray$View() (ARROW-6542)", {
+ a <- ChunkedArray$create(1:3, 1:4)
+ b <- a$View(float32())
+ expect_equal(b$type, float32())
+ expect_equal(length(b), 7L)
+ expect_true(all(
+ sapply(b$chunks, function(.x) .x$type == float32())
+ ))
+ # Input validation
+ expect_error(a$View("not a type"), "type must be a DataType, not character")
+})
+
+test_that("ChunkedArray$Validate()", {
+ a <- ChunkedArray$create(1:10)
+ expect_error(a$Validate(), NA)
+})
+
+test_that("[ ChunkedArray", {
+ one_chunk <- chunked_array(2:11)
+ x <- chunked_array(1:10, 31:40, 51:55)
+ # Slice
+ expect_as_vector(x[8:12], c(8:10, 31:32))
+ # Take from same chunk
+ expect_as_vector(x[c(11, 15, 12)], c(31, 35, 32))
+ # Take from multiple chunks (calls Concatenate)
+ expect_as_vector(x[c(2, 11, 15, 12, 3)], c(2, 31, 35, 32, 3))
+ # Take with Array (note these are 0-based)
+ take1 <- Array$create(c(10L, 14L, 11L))
+ expect_as_vector(x[take1], c(31, 35, 32))
+ # Take with ChunkedArray
+ take2 <- ChunkedArray$create(c(10L, 14L), 11L)
+ expect_as_vector(x[take2], c(31, 35, 32))
+
+ # Filter (with recycling)
+ expect_as_vector(
+ one_chunk[c(FALSE, TRUE, FALSE, FALSE, TRUE)],
+ c(3, 6, 8, 11)
+ )
+ # Filter where both are 1-chunk
+ expect_as_vector(
+ one_chunk[ChunkedArray$create(rep(c(FALSE, TRUE, FALSE, FALSE, TRUE), 2))],
+ c(3, 6, 8, 11)
+ )
+ # Filter multi-chunk with logical (-> Array)
+ expect_as_vector(
+ x[c(FALSE, TRUE, FALSE, FALSE, TRUE)],
+ c(2, 5, 7, 10, 32, 35, 37, 40, 52, 55)
+ )
+ # Filter with a chunked array with different sized chunks
+ p1 <- c(FALSE, TRUE, FALSE, FALSE, TRUE)
+ p2 <- c(TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE)
+ filt <- ChunkedArray$create(p1, p2, p2)
+ expect_as_vector(
+ x[filt],
+ c(2, 5, 6, 8, 9, 35, 36, 38, 39, 55)
+ )
+})
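+
+# As the cases above show, `[` on a ChunkedArray dispatches on the index:
+# contiguous ranges become a Slice, integer positions a Take (0-based when
+# given as Array/ChunkedArray), and logicals a Filter, with short logical
+# masks recycled as in base R.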
+
+test_that("ChunkedArray head/tail", {
+ vec <- 11:20
+ a <- ChunkedArray$create(11:15, 16:20)
+ expect_as_vector(head(a), head(vec))
+ expect_as_vector(head(a, 4), head(vec, 4))
+ expect_as_vector(head(a, 40), head(vec, 40))
+ expect_as_vector(head(a, -4), head(vec, -4))
+ expect_as_vector(head(a, -40), head(vec, -40))
+ expect_as_vector(tail(a), tail(vec))
+ expect_as_vector(tail(a, 4), tail(vec, 4))
+ expect_as_vector(tail(a, 40), tail(vec, 40))
+ expect_as_vector(tail(a, -40), tail(vec, -40))
+})
+
+test_that("ChunkedArray$Equals", {
+ vec <- 11:20
+ a <- ChunkedArray$create(vec[1:5], vec[6:10])
+ b <- ChunkedArray$create(vec[1:5], vec[6:10])
+ expect_equal(a, b)
+ expect_true(a$Equals(b))
+ expect_false(a$Equals(vec))
+})
+
+test_that("Converting a chunked array unifies factors (ARROW-8374)", {
+ f1 <- factor(c("a"), levels = c("a", "b"))
+ f2 <- factor(c("c"), levels = c("c", "d"))
+ f3 <- factor(NA, levels = "a")
+ f4 <- factor()
+
+ res <- factor(c("a", "c", NA), levels = c("a", "b", "c", "d"))
+ ca <- ChunkedArray$create(f1, f2, f3, f4)
+
+ expect_identical(ca$as_vector(), res)
+})
+
+test_that("Handling string data with embedded nuls", {
+ raws <- structure(list(
+ as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)),
+ as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)),
+ as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00
+ as.raw(c(0x66, 0x00, 0x00, 0x61, 0x00, 0x6e)), # multiple nuls
+ as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)),
+ as.raw(c(0x74, 0x76))
+ ),
+ class = c("arrow_binary", "vctrs_vctr", "list")
+ )
+ chunked_array_with_nul <- ChunkedArray$create(raws)$cast(utf8())
+
+ # The behavior of the warnings/errors is slightly different with and without
+ # altrep. Without it (i.e. on R 3.5.0 and below), the error triggers
+ # immediately on `as.vector()`, whereas with altrep it only happens on
+ # materialization.
+ skip_if_r_version("3.5.0")
+
+ v <- expect_error(as.vector(chunked_array_with_nul), NA)
+
+ expect_error(
+ v[],
+ paste0(
+ "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, ",
+ "set options(arrow.skip_nul = TRUE)"
+ ),
+ fixed = TRUE
+ )
+
+ withr::with_options(list(arrow.skip_nul = TRUE), {
+ v <- expect_warning(as.vector(chunked_array_with_nul), NA)
+ expect_warning(
+ expect_identical(v[3], "man"),
+ "Stripping '\\0' (nul) from character vector",
+ fixed = TRUE
+ )
+ })
+})
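+
+# (With altrep, conversion is lazy: as.vector() succeeds and the embedded-nul
+# error is only raised when elements are materialized, e.g. by v[]; setting
+# options(arrow.skip_nul = TRUE) downgrades this to a warning and strips the
+# nuls.)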
diff --git a/src/arrow/r/tests/testthat/test-chunked-array.txt b/src/arrow/r/tests/testthat/test-chunked-array.txt
new file mode 100644
index 000000000..c7101359d
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-chunked-array.txt
@@ -0,0 +1,103 @@
+> chunked_array(c(1, 2, 3), c(4, 5, 6))
+ChunkedArray
+[
+ [
+ 1,
+ 2,
+ 3
+ ],
+ [
+ 4,
+ 5,
+ 6
+ ]
+]
+
+> chunked_array(1:30, c(4, 5, 6))
+ChunkedArray
+[
+ [
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 8,
+ 9,
+ 10,
+ ...
+ 21,
+ 22,
+ 23,
+ 24,
+ 25,
+ 26,
+ 27,
+ 28,
+ 29,
+ 30
+ ],
+ [
+ 4,
+ 5,
+ 6
+ ]
+]
+
+> chunked_array(1:30)
+ChunkedArray
+[
+ [
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 8,
+ 9,
+ 10,
+ ...
+ 21,
+ 22,
+ 23,
+ 24,
+ 25,
+ 26,
+ 27,
+ 28,
+ 29,
+ 30
+ ]
+]
+
+> chunked_array(factor(c("a", "b")), factor(c("c", "d")))
+ChunkedArray
+[
+
+ -- dictionary:
+ [
+ "a",
+ "b"
+ ]
+ -- indices:
+ [
+ 0,
+ 1
+ ],
+
+ -- dictionary:
+ [
+ "c",
+ "d"
+ ]
+ -- indices:
+ [
+ 0,
+ 1
+ ]
+]
+
diff --git a/src/arrow/r/tests/testthat/test-compressed.R b/src/arrow/r/tests/testthat/test-compressed.R
new file mode 100644
index 000000000..d796e3e75
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-compressed.R
@@ -0,0 +1,73 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("codec_is_available", {
+ expect_true(codec_is_available("uncompressed")) # Always true
+ expect_match_arg_error(codec_is_available("sdfasdf"))
+ skip_if_not_available("gzip")
+ expect_true(codec_is_available("gzip"))
+ expect_true(codec_is_available("GZIP"))
+})
+
+if (identical(Sys.getenv("APPVEYOR"), "True")) {
+ test_that("Compression codecs are included in the Windows build", {
+ expect_true(codec_is_available("lz4"))
+ expect_true(codec_is_available("zstd"))
+ })
+}
+
+test_that("Codec attributes", {
+ skip_if_not_available("gzip")
+ cod <- Codec$create("gzip")
+ expect_equal(cod$name, "gzip")
+ # TODO: implement $level
+ expect_error(cod$level)
+})
+
+test_that("can write Buffer to CompressedOutputStream and read back in CompressedInputStream", {
+ skip_if_not_available("gzip")
+ buf <- buffer(as.raw(sample(0:255, size = 1024, replace = TRUE)))
+
+ tf1 <- tempfile()
+ stream1 <- CompressedOutputStream$create(tf1)
+ expect_equal(stream1$tell(), 0)
+ stream1$write(buf)
+ expect_equal(stream1$tell(), buf$size)
+ stream1$close()
+
+ tf2 <- tempfile()
+ sink2 <- FileOutputStream$create(tf2)
+ stream2 <- CompressedOutputStream$create(sink2)
+ expect_equal(stream2$tell(), 0)
+ stream2$write(buf)
+ expect_equal(stream2$tell(), buf$size)
+ stream2$close()
+ sink2$close()
+
+ input1 <- CompressedInputStream$create(tf1)
+ buf1 <- input1$Read(1024L)
+
+ file2 <- ReadableFile$create(tf2)
+ input2 <- CompressedInputStream$create(file2)
+ buf2 <- input2$Read(1024L)
+
+ expect_equal(buf, buf1)
+ expect_equal(buf, buf2)
+
+ unlink(tf1)
+ unlink(tf2)
+})
diff --git a/src/arrow/r/tests/testthat/test-compute-aggregate.R b/src/arrow/r/tests/testthat/test-compute-aggregate.R
new file mode 100644
index 000000000..018279d4b
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-compute-aggregate.R
@@ -0,0 +1,434 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("list_compute_functions", {
+ allfuncs <- list_compute_functions()
+ expect_false(all(grepl("min", allfuncs)))
+ justmins <- list_compute_functions("^min")
+ expect_true(length(justmins) > 0)
+ expect_true(all(grepl("min", justmins)))
+ no_hash_funcs <- list_compute_functions("^hash")
+ expect_true(length(no_hash_funcs) == 0)
+})
+
+test_that("sum.Array", {
+ ints <- 1:5
+ a <- Array$create(ints)
+ expect_r6_class(sum(a), "Scalar")
+ expect_identical(as.integer(sum(a)), sum(ints))
+
+ floats <- c(1.3, 2.4, 3)
+ f <- Array$create(floats)
+ expect_identical(as.numeric(sum(f)), sum(floats))
+
+ floats <- c(floats, NA)
+ na <- Array$create(floats)
+ if (!grepl("devel", R.version.string)) {
+ # Valgrind on R-devel confuses NaN and NA_real_
+ # https://r.789695.n4.nabble.com/Difference-in-NA-behavior-in-R-devel-running-under-valgrind-td4768731.html
+ expect_identical(as.numeric(sum(na)), sum(floats))
+ }
+ expect_r6_class(sum(na, na.rm = TRUE), "Scalar")
+ expect_identical(as.numeric(sum(na, na.rm = TRUE)), sum(floats, na.rm = TRUE))
+
+ bools <- c(TRUE, NA, TRUE, FALSE)
+ b <- Array$create(bools)
+ expect_identical(as.integer(sum(b)), sum(bools))
+ expect_identical(as.integer(sum(b, na.rm = TRUE)), sum(bools, na.rm = TRUE))
+})
+
+test_that("sum.ChunkedArray", {
+ a <- ChunkedArray$create(1:4, c(1:4, NA), 1:5)
+ expect_r6_class(sum(a), "Scalar")
+ expect_true(is.na(as.vector(sum(a))))
+ expect_identical(as.numeric(sum(a, na.rm = TRUE)), 35)
+})
+
+test_that("sum dots", {
+ a1 <- Array$create(1:4)
+ a2 <- ChunkedArray$create(1:4, c(1:4, NA), 1:5)
+ expect_identical(as.numeric(sum(a1, a2, na.rm = TRUE)), 45)
+})
+
+test_that("sum.Scalar", {
+ s <- Scalar$create(4)
+ expect_identical(as.numeric(s), as.numeric(sum(s)))
+})
+
+test_that("mean.Array", {
+ ints <- 1:4
+ a <- Array$create(ints)
+ expect_r6_class(mean(a), "Scalar")
+ expect_identical(as.vector(mean(a)), mean(ints))
+
+ floats <- c(1.3, 2.4, 3)
+ f <- Array$create(floats)
+ expect_identical(as.vector(mean(f)), mean(floats))
+
+ floats <- c(floats, NA)
+ na <- Array$create(floats)
+ if (!grepl("devel", R.version.string)) {
+ # Valgrind on R-devel confuses NaN and NA_real_
+ # https://r.789695.n4.nabble.com/Difference-in-NA-behavior-in-R-devel-running-under-valgrind-td4768731.html
+ expect_identical(as.vector(mean(na)), mean(floats))
+ }
+ expect_r6_class(mean(na, na.rm = TRUE), "Scalar")
+ expect_identical(as.vector(mean(na, na.rm = TRUE)), mean(floats, na.rm = TRUE))
+
+ bools <- c(TRUE, NA, TRUE, FALSE)
+ b <- Array$create(bools)
+ expect_identical(as.vector(mean(b)), mean(bools))
+ expect_identical(as.integer(sum(b, na.rm = TRUE)), sum(bools, na.rm = TRUE))
+})
+
+test_that("mean.ChunkedArray", {
+ a <- ChunkedArray$create(1:4, c(1:4, NA), 1:5)
+ expect_r6_class(mean(a), "Scalar")
+ expect_true(is.na(as.vector(mean(a))))
+ expect_identical(as.vector(mean(a, na.rm = TRUE)), 35 / 13)
+})
+
+test_that("mean.Scalar", {
+ s <- Scalar$create(4)
+ expect_equal(s, mean(s))
+})
+
+test_that("Bad input handling of call_function", {
+ expect_error(
+ call_function("sum", 2, 3),
+ 'Argument 1 is of class numeric but it must be one of "Array", "ChunkedArray", "RecordBatch", "Table", or "Scalar"'
+ )
+})
+
+test_that("min.Array", {
+ ints <- 1:4
+ a <- Array$create(ints)
+ expect_r6_class(min(a), "Scalar")
+ expect_identical(as.vector(min(a)), min(ints))
+
+ floats <- c(1.3, 3, 2.4)
+ f <- Array$create(floats)
+ expect_identical(as.vector(min(f)), min(floats))
+
+ floats <- c(floats, NA)
+ na <- Array$create(floats)
+ expect_identical(as.vector(min(na)), min(floats))
+ expect_r6_class(min(na, na.rm = TRUE), "Scalar")
+ expect_identical(as.vector(min(na, na.rm = TRUE)), min(floats, na.rm = TRUE))
+
+ bools <- c(TRUE, TRUE, FALSE)
+ b <- Array$create(bools)
+ # R is inconsistent here: typeof(min(NA)) == "integer", not "logical"
+ expect_identical(as.vector(min(b)), as.logical(min(bools)))
+})
+
+test_that("max.Array", {
+ ints <- 1:4
+ a <- Array$create(ints)
+ expect_r6_class(max(a), "Scalar")
+ expect_identical(as.vector(max(a)), max(ints))
+
+ floats <- c(1.3, 3, 2.4)
+ f <- Array$create(floats)
+ expect_identical(as.vector(max(f)), max(floats))
+
+ floats <- c(floats, NA)
+ na <- Array$create(floats)
+ expect_identical(as.vector(max(na)), max(floats))
+ expect_r6_class(max(na, na.rm = TRUE), "Scalar")
+ expect_identical(as.vector(max(na, na.rm = TRUE)), max(floats, na.rm = TRUE))
+
+ bools <- c(TRUE, TRUE, FALSE)
+ b <- Array$create(bools)
+ # R is inconsistent here: typeof(max(NA)) == "integer", not "logical"
+ expect_identical(as.vector(max(b)), as.logical(max(bools)))
+})
+
+test_that("min.ChunkedArray", {
+ ints <- 1:4
+ a <- ChunkedArray$create(ints)
+ expect_r6_class(min(a), "Scalar")
+ expect_identical(as.vector(min(a)), min(ints))
+
+ floats <- c(1.3, 3, 2.4)
+ f <- ChunkedArray$create(floats)
+ expect_identical(as.vector(min(f)), min(floats))
+
+ floats <- c(floats, NA)
+ na <- ChunkedArray$create(floats)
+ expect_identical(as.vector(min(na)), min(floats))
+ expect_r6_class(min(na, na.rm = TRUE), "Scalar")
+ expect_identical(as.vector(min(na, na.rm = TRUE)), min(floats, na.rm = TRUE))
+
+ bools <- c(TRUE, TRUE, FALSE)
+ b <- ChunkedArray$create(bools)
+ # R is inconsistent here: typeof(min(NA)) == "integer", not "logical"
+ expect_identical(as.vector(min(b)), as.logical(min(bools)))
+})
+
+test_that("max.ChunkedArray", {
+ ints <- 1:4
+ a <- ChunkedArray$create(ints)
+ expect_r6_class(max(a), "Scalar")
+ expect_identical(as.vector(max(a)), max(ints))
+
+ floats <- c(1.3, 3, 2.4)
+ f <- ChunkedArray$create(floats)
+ expect_identical(as.vector(max(f)), max(floats))
+
+ floats <- c(floats, NA)
+ na <- ChunkedArray$create(floats)
+ expect_identical(as.vector(max(na)), max(floats))
+ expect_r6_class(max(na, na.rm = TRUE), "Scalar")
+ expect_identical(as.vector(max(na, na.rm = TRUE)), max(floats, na.rm = TRUE))
+
+ bools <- c(TRUE, TRUE, FALSE)
+ b <- ChunkedArray$create(bools)
+ # R is inconsistent here: typeof(max(NA)) == "integer", not "logical"
+ expect_identical(as.vector(max(b)), as.logical(max(bools)))
+})
+
+test_that("Edge cases", {
+ a <- Array$create(NA)
+ for (type in c(int32(), float64(), bool())) {
+ expect_as_vector(sum(a$cast(type), na.rm = TRUE), sum(NA, na.rm = TRUE))
+ expect_as_vector(mean(a$cast(type), na.rm = TRUE), mean(NA, na.rm = TRUE))
+ expect_as_vector(
+ min(a$cast(type), na.rm = TRUE),
+ # Suppress the base R warning about no non-missing arguments
+ suppressWarnings(min(NA, na.rm = TRUE))
+ )
+ expect_as_vector(
+ max(a$cast(type), na.rm = TRUE),
+ suppressWarnings(max(NA, na.rm = TRUE))
+ )
+ }
+})
+
+test_that("quantile.Array and quantile.ChunkedArray", {
+ a <- Array$create(c(0, 1, 2, 3))
+ ca <- ChunkedArray$create(c(0, 1), c(2, 3))
+ probs <- c(0.49, 0.51)
+ for (ad in list(a, ca)) {
+ for (type in c(int32(), uint64(), float64())) {
+ expect_equal(
+ quantile(ad$cast(type), probs = probs, interpolation = "linear"),
+ Array$create(c(1.47, 1.53))
+ )
+ expect_equal(
+ quantile(ad$cast(type), probs = probs, interpolation = "lower"),
+ Array$create(c(1, 1))$cast(type)
+ )
+ expect_equal(
+ quantile(ad$cast(type), probs = probs, interpolation = "higher"),
+ Array$create(c(2, 2))$cast(type)
+ )
+ expect_equal(
+ quantile(ad$cast(type), probs = probs, interpolation = "nearest"),
+ Array$create(c(1, 2))$cast(type)
+ )
+ expect_equal(
+ quantile(ad$cast(type), probs = probs, interpolation = "midpoint"),
+ Array$create(c(1.5, 1.5))
+ )
+ }
+ }
+})
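+
+# A worked example for the interpolation options above: for c(0, 1, 2, 3) and
+# p = 0.49 the quantile position is p * (n - 1) = 1.47, so "linear" gives
+# 1.47, "lower" 1, "higher" 2, "nearest" 1, and "midpoint" 1.5.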
+
+test_that("quantile and median NAs, edge cases, and exceptions", {
+ expect_equal(
+ quantile(Array$create(c(1, 2)), probs = c(0, 1)),
+ Array$create(c(1, 2))
+ )
+ expect_error(
+ quantile(Array$create(c(1, 2, NA))),
+ "Missing values not allowed if 'na.rm' is FALSE"
+ )
+ expect_equal(
+ quantile(Array$create(numeric(0))),
+ Array$create(rep(NA_real_, 5))
+ )
+ expect_equal(
+ quantile(Array$create(rep(NA_integer_, 3)), na.rm = TRUE),
+ Array$create(rep(NA_real_, 5))
+ )
+ expect_equal(
+ quantile(Scalar$create(0L)),
+ Array$create(rep(0, 5))
+ )
+ expect_equal(
+ median(Scalar$create(1L)),
+ Scalar$create(1)
+ )
+ expect_error(
+ quantile(Array$create(1:3), type = 9),
+ "not supported"
+ )
+})
+
+test_that("median passes ... args to quantile", {
+ skip_if(
+ !"..." %in% names(formals(median)),
+ "The median generic lacks dots in R 3.3.0 and earlier"
+ )
+ expect_equal(
+ median(Array$create(c(1, 2)), interpolation = "higher"),
+ Scalar$create(2)
+ )
+ expect_error(
+ median(Array$create(c(1, 2)), probs = c(.25, .75))
+ )
+})
+
+test_that("median.Array and median.ChunkedArray", {
+ compare_expression(
+ median(.input),
+ 1:4
+ )
+ compare_expression(
+ median(.input),
+ 1:5
+ )
+ compare_expression(
+ median(.input),
+ numeric(0)
+ )
+ compare_expression(
+ median(.input, na.rm = FALSE),
+ c(1, 2, NA)
+ )
+ compare_expression(
+ median(.input, na.rm = TRUE),
+ c(1, 2, NA)
+ )
+ compare_expression(
+ median(.input, na.rm = TRUE),
+ NA_real_
+ )
+})
+
+test_that("unique.Array", {
+ a <- Array$create(c(1, 4, 3, 1, 1, 3, 4))
+ expect_equal(unique(a), Array$create(c(1, 4, 3)))
+ ca <- ChunkedArray$create(a, a)
+ expect_equal(unique(ca), Array$create(c(1, 4, 3)))
+})
+
+test_that("match_arrow", {
+ a <- Array$create(c(1, 4, 3, 1, 1, 3, 4))
+ tab <- c(4, 3, 2, 1)
+ expect_equal(match_arrow(a, tab), Array$create(c(3L, 0L, 1L, 3L, 3L, 1L, 0L)))
+
+ ca <- ChunkedArray$create(c(1, 4, 3, 1, 1, 3, 4))
+ expect_equal(match_arrow(ca, tab), ChunkedArray$create(c(3L, 0L, 1L, 3L, 3L, 1L, 0L)))
+
+ sc <- Scalar$create(3)
+ expect_equal(match_arrow(sc, tab), Scalar$create(1L))
+
+ vec <- c(1, 2)
+ expect_equal(match_arrow(vec, tab), Array$create(c(3L, 2L)))
+})
+
+test_that("is_in", {
+ a <- Array$create(c(9, 4, 3))
+ tab <- c(4, 3, 2, 1)
+ expect_equal(is_in(a, tab), Array$create(c(FALSE, TRUE, TRUE)))
+
+ ca <- ChunkedArray$create(c(9, 4, 3))
+ expect_equal(is_in(ca, tab), ChunkedArray$create(c(FALSE, TRUE, TRUE)))
+
+ sc <- Scalar$create(3)
+ expect_equal(is_in(sc, tab), Scalar$create(TRUE))
+
+ vec <- c(1, 9)
+ expect_equal(is_in(vec, tab), Array$create(c(TRUE, FALSE)))
+})
+
+test_that("value_counts", {
+ a <- Array$create(c(1, 4, 3, 1, 1, 3, 4))
+ result_df <- tibble::tibble(
+ values = c(1, 4, 3),
+ counts = c(3L, 2L, 2L)
+ )
+ result <- Array$create(
+ result_df,
+ type = struct(values = float64(), counts = int64())
+ )
+ expect_equal(value_counts(a), result)
+ expect_identical(as.data.frame(value_counts(a)), result_df)
+ expect_identical(as.vector(value_counts(a)$counts), result_df$counts)
+})
+
+test_that("any.Array and any.ChunkedArray", {
+ data <- c(1:10, NA, NA)
+
+ compare_expression(any(.input > 5), data)
+ compare_expression(any(.input > 5, na.rm = TRUE), data)
+ compare_expression(any(.input < 1), data)
+ compare_expression(any(.input < 1, na.rm = TRUE), data)
+
+ data_logical <- c(TRUE, FALSE, TRUE, NA, FALSE)
+
+ compare_expression(any(.input), data_logical)
+ compare_expression(any(.input, na.rm = FALSE), data_logical)
+ compare_expression(any(.input, na.rm = TRUE), data_logical)
+})
+
+test_that("all.Array and all.ChunkedArray", {
+ data <- c(1:10, NA, NA)
+
+ compare_expression(all(.input > 5), data)
+ compare_expression(all(.input > 5, na.rm = TRUE), data)
+
+ compare_expression(all(.input < 11), data)
+ compare_expression(all(.input < 11, na.rm = TRUE), data)
+
+ data_logical <- c(TRUE, TRUE, NA)
+
+ compare_expression(all(.input), data_logical)
+ compare_expression(all(.input, na.rm = TRUE), data_logical)
+})
+
+test_that("variance", {
+ data <- c(-37, 267, 88, -120, 9, 101, -65, -23, NA)
+ arr <- Array$create(data)
+ chunked_arr <- ChunkedArray$create(data)
+
+ expect_equal(call_function("variance", arr, options = list(ddof = 5)), Scalar$create(34596))
+ expect_equal(call_function("variance", chunked_arr, options = list(ddof = 5)), Scalar$create(34596))
+})
+
+test_that("stddev", {
+ data <- c(-37, 267, 88, -120, 9, 101, -65, -23, NA)
+ arr <- Array$create(data)
+ chunked_arr <- ChunkedArray$create(data)
+
+ expect_equal(call_function("stddev", arr, options = list(ddof = 5)), Scalar$create(186))
+ expect_equal(call_function("stddev", chunked_arr, options = list(ddof = 5)), Scalar$create(186))
+})
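+
+# In both tests above, ddof shifts the denominator to n - ddof: with the 8
+# non-missing values (the NA is skipped), ddof = 5 gives
+# sum((x - mean(x))^2) / 3 = 34596, and stddev = sqrt(34596) = 186.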
diff --git a/src/arrow/r/tests/testthat/test-compute-arith.R b/src/arrow/r/tests/testthat/test-compute-arith.R
new file mode 100644
index 000000000..e8674e315
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-compute-arith.R
@@ -0,0 +1,129 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("Addition", {
+ a <- Array$create(c(1:4, NA_integer_))
+ expect_type_equal(a, int32())
+ expect_type_equal(a + 4L, int32())
+ expect_type_equal(a + 4, float64())
+ expect_equal(a + 4L, Array$create(c(5:8, NA_integer_)))
+ expect_identical(as.vector(a + 4L), c(5:8, NA_integer_))
+ expect_as_vector(a + 4L, c(5:8, NA_integer_))
+ expect_equal(a + NA_integer_, Array$create(rep(NA_integer_, 5)))
+
+ a8 <- a$cast(int8())
+ expect_type_equal(a8 + Scalar$create(1, int8()), int8())
+
+ # int8 will be promoted to int32 when added to int32
+ expect_type_equal(a8 + 127L, int32())
+ expect_equal(a8 + 127L, Array$create(c(128:131, NA_integer_)))
+
+ b <- Array$create(c(4:1, NA_integer_))
+ expect_type_equal(a8 + b, int32())
+ expect_equal(a8 + b, Array$create(c(5L, 5L, 5L, 5L, NA_integer_)))
+
+ expect_type_equal(a + 4.1, float64())
+ expect_equal(a + 4.1, Array$create(c(5.1, 6.1, 7.1, 8.1, NA_real_)))
+})
+
+test_that("Subtraction", {
+ a <- Array$create(c(1:4, NA_integer_))
+ expect_equal(a - 3L, Array$create(c(-2:1, NA_integer_)))
+
+ expect_equal(
+ Array$create(c(5.1, 6.1, 7.1, 8.1, NA_real_)) - a,
+ Array$create(c(4.1, 4.1, 4.1, 4.1, NA_real_))
+ )
+})
+
+test_that("Multiplication", {
+ a <- Array$create(c(1:4, NA_integer_))
+ expect_equal(a * 2L, Array$create(c(1:4 * 2L, NA_integer_)))
+
+ expect_equal(
+ (a * 0.5) * 3L,
+ Array$create(c(1.5, 3, 4.5, 6, NA_real_))
+ )
+})
+
+test_that("Division", {
+ a <- Array$create(c(1:4, NA_integer_))
+ expect_equal(a / 2, Array$create(c(1:4 / 2, NA_real_)))
+ expect_equal(a %/% 2, Array$create(c(0L, 1L, 1L, 2L, NA_integer_)))
+ expect_equal(a / 2 / 2, Array$create(c(1:4 / 2 / 2, NA_real_)))
+ expect_equal(a %/% 2 %/% 2, Array$create(c(0L, 0L, 0L, 1L, NA_integer_)))
+ expect_equal(a / 0, Array$create(c(Inf, Inf, Inf, Inf, NA_real_)))
+ # TODO add tests for integer division %/% by 0
+ # see https://issues.apache.org/jira/browse/ARROW-14297
+
+ b <- a$cast(float64())
+ expect_equal(b / 2, Array$create(c(1:4 / 2, NA_real_)))
+ expect_equal(b %/% 2, Array$create(c(0L, 1L, 1L, 2L, NA_integer_)))
+ expect_equal(b / 0, Array$create(c(Inf, Inf, Inf, Inf, NA_real_)))
+ # TODO add tests for integer division %/% by 0
+ # see https://issues.apache.org/jira/browse/ARROW-14297
+
+ # the behavior of %/% matches R's (i.e. the integer of the quotient, not
+ # simply dividing two integers)
+ expect_equal(b / 2.2, Array$create(c(1:4 / 2.2, NA_real_)))
+ # nolint start
+ # c(1:4) %/% 2.2 != c(1:4) %/% as.integer(2.2)
+ # c(1:4) %/% 2.2 == c(0L, 0L, 1L, 1L)
+ # c(1:4) %/% as.integer(2.2) == c(0L, 1L, 1L, 2L)
+ # nolint end
+ expect_equal(b %/% 2.2, Array$create(c(0L, 0L, 1L, 1L, NA_integer_)))
+
+ expect_equal(a %% 2, Array$create(c(1L, 0L, 1L, 0L, NA_integer_)))
+
+ expect_equal(b %% 2, Array$create(c(1:4 %% 2, NA_real_)))
+})
+
+test_that("Power", {
+ a <- Array$create(c(1:4, NA_integer_))
+ b <- a$cast(float64())
+ c <- a$cast(int64())
+ d <- a$cast(uint64())
+
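+  # exponentiation returns float64 for all of these input types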
+ expect_equal(a^0, Array$create(c(1, 1, 1, 1, NA_real_)))
+ expect_equal(a^2, Array$create(c(1, 4, 9, 16, NA_real_)))
+ expect_equal(a^(-1), Array$create(c(1, 1 / 2, 1 / 3, 1 / 4, NA_real_)))
+ expect_equal(a^(.5), Array$create(c(1, sqrt(2), sqrt(3), sqrt(4), NA_real_)))
+
+ expect_equal(b^0, Array$create(c(1, 1, 1, 1, NA_real_)))
+ expect_equal(b^2, Array$create(c(1, 4, 9, 16, NA_real_)))
+ expect_equal(b^(-1), Array$create(c(1, 1 / 2, 1 / 3, 1 / 4, NA_real_)))
+ expect_equal(b^(.5), Array$create(c(1, sqrt(2), sqrt(3), sqrt(4), NA_real_)))
+
+ expect_equal(c^0, Array$create(c(1, 1, 1, 1, NA_real_)))
+ expect_equal(c^2, Array$create(c(1, 4, 9, 16, NA_real_)))
+ expect_equal(c^(-1), Array$create(c(1, 1 / 2, 1 / 3, 1 / 4, NA_real_)))
+ expect_equal(c^(.5), Array$create(c(1, sqrt(2), sqrt(3), sqrt(4), NA_real_)))
+
+ expect_equal(d^0, Array$create(c(1, 1, 1, 1, NA_real_)))
+ expect_equal(d^2, Array$create(c(1, 4, 9, 16, NA_real_)))
+ expect_equal(d^(-1), Array$create(c(1, 1 / 2, 1 / 3, 1 / 4, NA_real_)))
+ expect_equal(d^(.5), Array$create(c(1, sqrt(2), sqrt(3), sqrt(4), NA_real_)))
+})
+
+test_that("Dates casting", {
+ a <- Array$create(c(Sys.Date() + 1:4, NA_integer_))
+
+ skip("ARROW-11090 (date/datetime arithmetic)")
+ # Error: NotImplemented: Function add_checked has no kernel matching input types (array[date32[day]], scalar[double])
+  expect_equal(a + 2, Array$create(c((Sys.Date() + 1:4) + 2, NA_integer_)))
+})
diff --git a/src/arrow/r/tests/testthat/test-compute-no-bindings.R b/src/arrow/r/tests/testthat/test-compute-no-bindings.R
new file mode 100644
index 000000000..a51d797a4
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-compute-no-bindings.R
@@ -0,0 +1,201 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("non-bound compute kernels using TrimOptions", {
+ skip_if_not_available("utf8proc")
+ expect_equal(
+ call_function(
+ "utf8_trim",
+ Scalar$create("abracadabra"),
+ options = list(characters = "ab")
+ ),
+ Scalar$create("racadabr")
+ )
+
+ expect_equal(
+ call_function(
+ "utf8_ltrim",
+ Scalar$create("abracadabra"),
+ options = list(characters = "ab")
+ ),
+ Scalar$create("racadabra")
+ )
+
+ expect_equal(
+ call_function(
+ "utf8_rtrim",
+ Scalar$create("abracadabra"),
+ options = list(characters = "ab")
+ ),
+ Scalar$create("abracadabr")
+ )
+
+ expect_equal(
+ call_function(
+ "ascii_ltrim",
+ Scalar$create("abracadabra"),
+ options = list(characters = "ab")
+ ),
+ Scalar$create("racadabra")
+ )
+
+ expect_equal(
+ call_function(
+ "ascii_rtrim",
+ Scalar$create("abracadabra"),
+ options = list(characters = "ab")
+ ),
+ Scalar$create("abracadabr")
+ )
+})
+
+test_that("non-bound compute kernels using ReplaceSliceOptions", {
+ skip_if_not_available("utf8proc")
+
+ expect_equal(
+ call_function(
+ "binary_replace_slice",
+ Array$create("I need to fix this string"),
+ options = list(start = 1, stop = 1, replacement = " don't")
+ ),
+ Array$create("I don't need to fix this string")
+ )
+
+ expect_equal(
+ call_function(
+ "utf8_replace_slice",
+ Array$create("I need to fix this string"),
+ options = list(start = 1, stop = 1, replacement = " don't")
+ ),
+ Array$create("I don't need to fix this string")
+ )
+})
+
+test_that("non-bound compute kernels using ModeOptions", {
+ expect_equal(
+ as.vector(
+ call_function("mode", Array$create(c(1:10, 10, 9, NA)), options = list(n = 3))
+ ),
+ tibble::tibble("mode" = c(9, 10, 1), "count" = c(2L, 2L, 1L))
+ )
+
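+  # with skip_nulls = FALSE, the null in the input makes the result empty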
+ expect_equal(
+ as.vector(
+ call_function("mode", Array$create(c(1:10, 10, 9, NA)), options = list(n = 3, skip_nulls = FALSE))
+ ),
+ tibble::tibble("mode" = numeric(), "count" = integer())
+ )
+})
+
+test_that("non-bound compute kernels using PartitionNthOptions", {
+ result <- call_function(
+ "partition_nth_indices",
+ Array$create(c(11:20)),
+ options = list(pivot = 3)
+ )
+ # Order of indices on either side of the pivot is not deterministic
+ # (depends on C++ standard library implementation)
+ expect_true(all(as.vector(result[1:3]) < 3))
+ expect_true(all(as.vector(result[4:10]) >= 3))
+})
+
+
+test_that("non-bound compute kernels using MatchSubstringOptions", {
+ skip_if_not_available("utf8proc")
+
+ # Remove this test when ARROW-13924 has been completed
+ expect_equal(
+ call_function(
+ "starts_with",
+ Array$create(c("abracadabra", "abacus", "abdicate", "abrasive")),
+ options = list(pattern = "abr")
+ ),
+ Array$create(c(TRUE, FALSE, FALSE, TRUE))
+ )
+
+ # Remove this test when ARROW-13924 has been completed
+ expect_equal(
+ call_function(
+ "ends_with",
+ Array$create(c("abracadabra", "abacus", "abdicate", "abrasive")),
+ options = list(pattern = "e")
+ ),
+ Array$create(c(FALSE, FALSE, TRUE, TRUE))
+ )
+
+ # Remove this test when ARROW-13156 has been completed
+ expect_equal(
+ as.vector(
+ call_function(
+ "count_substring",
+ Array$create(c("abracadabra", "abacus", "abdicate", "abrasive")),
+ options = list(pattern = "e")
+ )
+ ),
+ c(0, 0, 1, 1)
+ )
+
+ skip_if_not_available("re2")
+
+ # Remove this test when ARROW-13156 has been completed
+ expect_equal(
+ as.vector(
+ call_function(
+ "count_substring_regex",
+ Array$create(c("abracadabra", "abacus", "abdicate", "abrasive")),
+ options = list(pattern = "e")
+ )
+ ),
+ c(0, 0, 1, 1)
+ )
+})
+
+test_that("non-bound compute kernels using ExtractRegexOptions", {
+ skip_if_not_available("re2")
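+  # each named capture group becomes a field of the resulting struct scalar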
+ expect_equal(
+ call_function("extract_regex", Scalar$create("abracadabra"), options = list(pattern = "(?P<letter>[a])")),
+ Scalar$create(tibble::tibble(letter = "a"))
+ )
+})
+
+test_that("non-bound compute kernels using IndexOptions", {
+ expect_equal(
+ as.vector(
+ call_function("index", Array$create(c(10, 20, 30, 40)), options = list(value = Scalar$create(40)))
+ ),
+ 3
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-compute-sort.R b/src/arrow/r/tests/testthat/test-compute-sort.R
new file mode 100644
index 000000000..e3574d86f
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-compute-sort.R
@@ -0,0 +1,155 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+library(dplyr, warn.conflicts = FALSE)
+
+# randomize order of rows in test data
+tbl <- slice_sample(example_data_for_sorting, prop = 1L)
+
+test_that("sort(Scalar) is identity function", {
+ int <- Scalar$create(42L)
+ expect_equal(sort(int), int)
+ dbl <- Scalar$create(3.14)
+ expect_equal(sort(dbl), dbl)
+ chr <- Scalar$create("foo")
+ expect_equal(sort(chr), chr)
+})
+
+test_that("Array$SortIndices()", {
+ int <- tbl$int
+ # Remove ties because they could give non-deterministic sort indices, and this
+ # test compares sort indices. Other tests compare sorted values, which are
+ # deterministic in the case of ties.
+ int <- int[!duplicated(int)]
+ expect_equal(
+ Array$create(int)$SortIndices(),
+ Array$create(order(int) - 1L, type = uint64())
+ )
+ # Need to remove NAs because ARROW-12063
+ int <- na.omit(int)
+ expect_equal(
+ Array$create(int)$SortIndices(descending = TRUE),
+ Array$create(rev(order(int)) - 1, type = uint64())
+ )
+})
+
+test_that("ChunkedArray$SortIndices()", {
+ int <- tbl$int
+ # Remove ties because they could give non-deterministic sort indices, and this
+ # test compares sort indices. Other tests compare sorted values, which are
+ # deterministic in the case of ties.
+ int <- int[!duplicated(int)]
+ expect_equal(
+ ChunkedArray$create(int[1:4], int[5:length(int)])$SortIndices(),
+ Array$create(order(int) - 1L, type = uint64())
+ )
+ # Need to remove NAs because ARROW-12063
+ int <- na.omit(int)
+ expect_equal(
+ ChunkedArray$create(int[1:4], int[5:length(int)])$SortIndices(descending = TRUE),
+ Array$create(rev(order(int)) - 1, type = uint64())
+ )
+})
+
+test_that("sort(vector), sort(Array), sort(ChunkedArray) give equivalent results on integers", {
+ compare_expression(
+ sort(.input),
+ tbl$int
+ )
+ compare_expression(
+ sort(.input, na.last = NA),
+ tbl$int
+ )
+ compare_expression(
+ sort(.input, na.last = TRUE),
+ tbl$int
+ )
+ compare_expression(
+ sort(.input, na.last = FALSE),
+ tbl$int
+ )
+ compare_expression(
+ sort(.input, decreasing = TRUE),
+    tbl$int
+ )
+ compare_expression(
+ sort(.input, decreasing = TRUE, na.last = TRUE),
+    tbl$int
+ )
+ compare_expression(
+ sort(.input, decreasing = TRUE, na.last = FALSE),
+    tbl$int
+ )
+})
+
+test_that("sort(vector), sort(Array), sort(ChunkedArray) give equivalent results on strings", {
+ compare_expression(
+ sort(.input, decreasing = TRUE, na.last = FALSE),
+ tbl$chr
+ )
+})
+
+test_that("sort(vector), sort(Array), sort(ChunkedArray) give equivalent results on floats", {
+ compare_expression(
+ sort(.input, decreasing = TRUE, na.last = TRUE),
+ tbl$dbl
+ )
+ compare_expression(
+ sort(.input, decreasing = FALSE, na.last = TRUE),
+ tbl$dbl
+ )
+ compare_expression(
+ sort(.input, decreasing = TRUE, na.last = NA),
+ tbl$dbl
+ )
+ compare_expression(
+ sort(.input, decreasing = TRUE, na.last = FALSE),
+    tbl$dbl
+ )
+ compare_expression(
+ sort(.input, decreasing = FALSE, na.last = NA),
+ tbl$dbl
+ )
+ compare_expression(
+ sort(.input, decreasing = FALSE, na.last = FALSE),
+    tbl$dbl
+ )
+})
+
+test_that("Table$SortIndices()", {
+ x <- Table$create(tbl)
+ expect_identical(
+ as.vector(x$Take(x$SortIndices("chr"))$chr),
+ sort(tbl$chr, na.last = TRUE)
+ )
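+  # the second argument supplies per-column descending flags (FALSE = ascending)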
+ expect_identical(
+ as.data.frame(x$Take(x$SortIndices(c("int", "dbl"), c(FALSE, FALSE)))),
+ tbl %>% arrange(int, dbl)
+ )
+})
+
+test_that("RecordBatch$SortIndices()", {
+ x <- record_batch(tbl)
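+  # a single descending flag (TRUE) is recycled across all three sort keys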
+ expect_identical(
+ as.data.frame(x$Take(x$SortIndices(c("chr", "int", "dbl"), TRUE))),
+ tbl %>% arrange(desc(chr), desc(int), desc(dbl))
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-compute-vector.R b/src/arrow/r/tests/testthat/test-compute-vector.R
new file mode 100644
index 000000000..345da5656
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-compute-vector.R
@@ -0,0 +1,133 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+expect_bool_function_equal <- function(array_exp, r_exp) {
+ # Assert that the Array operation returns a boolean array
+ # and that its contents are equal to expected
+ expect_r6_class(array_exp, "ArrowDatum")
+ expect_type_equal(array_exp, bool())
+ expect_identical(as.vector(array_exp), r_exp)
+}
+
+expect_array_compares <- function(x, compared_to) {
+ r_values <- as.vector(x)
+ r_compared_to <- as.vector(compared_to)
+ # Iterate over all comparison functions
+ expect_bool_function_equal(x == compared_to, r_values == r_compared_to)
+ expect_bool_function_equal(x != compared_to, r_values != r_compared_to)
+ expect_bool_function_equal(x > compared_to, r_values > r_compared_to)
+ expect_bool_function_equal(x >= compared_to, r_values >= r_compared_to)
+ expect_bool_function_equal(x < compared_to, r_values < r_compared_to)
+ expect_bool_function_equal(x <= compared_to, r_values <= r_compared_to)
+}
+
+test_that("compare ops with Array", {
+ a <- Array$create(1:5)
+ expect_array_compares(a, 4L)
+ expect_array_compares(a, 4) # implicit casting
+ expect_array_compares(a, Scalar$create(4))
+ expect_array_compares(Array$create(c(NA, 1:5)), 4)
+ expect_array_compares(Array$create(as.numeric(c(NA, 1:5))), 4)
+ expect_array_compares(Array$create(c(NA, 1:5)), Array$create(rev(c(NA, 1:5))))
+ expect_array_compares(Array$create(c(NA, 1:5)), Array$create(rev(c(NA, 1:5)), type = double()))
+})
+
+test_that("compare ops with ChunkedArray", {
+ expect_array_compares(ChunkedArray$create(1:3, 4:5), 4L)
+ expect_array_compares(ChunkedArray$create(1:3, 4:5), 4) # implicit casting
+ expect_array_compares(ChunkedArray$create(1:3, 4:5), Scalar$create(4))
+ expect_array_compares(ChunkedArray$create(c(NA, 1:3), 4:5), 4)
+ expect_array_compares(
+ ChunkedArray$create(c(NA, 1:3), 4:5),
+ ChunkedArray$create(4:5, c(NA, 1:3))
+ )
+ expect_array_compares(
+ ChunkedArray$create(c(NA, 1:3), 4:5),
+ Array$create(c(NA, 1:5))
+ )
+ expect_array_compares(
+ Array$create(c(NA, 1:5)),
+ ChunkedArray$create(c(NA, 1:3), 4:5)
+ )
+})
+
+test_that("logic ops with Array", {
+ truth <- expand.grid(left = c(TRUE, FALSE, NA), right = c(TRUE, FALSE, NA))
+ a_left <- Array$create(truth$left)
+ a_right <- Array$create(truth$right)
+ expect_bool_function_equal(a_left & a_right, truth$left & truth$right)
+ expect_bool_function_equal(a_left | a_right, truth$left | truth$right)
+ expect_bool_function_equal(a_left == a_right, truth$left == truth$right)
+ expect_bool_function_equal(a_left != a_right, truth$left != truth$right)
+ expect_bool_function_equal(!a_left, !truth$left)
+
+ # More complexity
+ isEqualTo <- function(x, y) x == y & !is.na(x)
+ expect_bool_function_equal(
+ isEqualTo(a_left, a_right),
+ isEqualTo(truth$left, truth$right)
+ )
+})
+
+test_that("logic ops with ChunkedArray", {
+ truth <- expand.grid(left = c(TRUE, FALSE, NA), right = c(TRUE, FALSE, NA))
+ a_left <- ChunkedArray$create(truth$left)
+ a_right <- ChunkedArray$create(truth$right)
+ expect_bool_function_equal(a_left & a_right, truth$left & truth$right)
+ expect_bool_function_equal(a_left | a_right, truth$left | truth$right)
+ expect_bool_function_equal(a_left == a_right, truth$left == truth$right)
+ expect_bool_function_equal(a_left != a_right, truth$left != truth$right)
+ expect_bool_function_equal(!a_left, !truth$left)
+
+ # More complexity
+ isEqualTo <- function(x, y) x == y & !is.na(x)
+ expect_bool_function_equal(
+ isEqualTo(a_left, a_right),
+ isEqualTo(truth$left, truth$right)
+ )
+})
+
+test_that("call_function validation", {
+ expect_error(
+ call_function("filter", 4),
+ 'Argument 1 is of class numeric but it must be one of "Array", "ChunkedArray", "RecordBatch", "Table", or "Scalar"'
+ )
+ expect_error(
+ call_function("filter", Array$create(1:4), 3),
+ "Argument 2 is of class numeric"
+ )
+ expect_error(
+ call_function("filter",
+ Array$create(1:4),
+ Array$create(c(TRUE, FALSE, TRUE)),
+ options = list(keep_na = TRUE)
+ ),
+ "Array arguments must all be the same length"
+ )
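+  # expect_error(..., NA) asserts that the call raises no error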
+ expect_error(
+ call_function("filter",
+ record_batch(a = 1:3),
+ Array$create(c(TRUE, FALSE, TRUE)),
+ options = list(keep_na = TRUE)
+ ),
+ NA
+ )
+ expect_error(
+ call_function("filter", options = list(keep_na = TRUE)),
+ "accepts 2 arguments"
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-csv.R b/src/arrow/r/tests/testthat/test-csv.R
new file mode 100644
index 000000000..023eee92e
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-csv.R
@@ -0,0 +1,357 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Not all types round-trip via CSV 100% identically by default
+tbl <- example_data[, c("dbl", "lgl", "false", "chr")]
+tbl_no_dates <- tbl
+# Add a date to test its parsing
+tbl$date <- Sys.Date() + 1:10
+
+csv_file <- tempfile()
+
+test_that("Can read csv file", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write.csv(tbl, tf, row.names = FALSE)
+
+ tab0 <- Table$create(tbl)
+ tab1 <- read_csv_arrow(tf, as_data_frame = FALSE)
+ expect_equal(tab0, tab1)
+ tab2 <- read_csv_arrow(mmap_open(tf), as_data_frame = FALSE)
+ expect_equal(tab0, tab2)
+ tab3 <- read_csv_arrow(ReadableFile$create(tf), as_data_frame = FALSE)
+ expect_equal(tab0, tab3)
+})
+
+test_that("read_csv_arrow(as_data_frame=TRUE)", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write.csv(tbl, tf, row.names = FALSE)
+ tab1 <- read_csv_arrow(tf, as_data_frame = TRUE)
+ expect_equal(tbl, tab1)
+})
+
+test_that("read_delim_arrow parsing options: delim", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write.table(tbl, tf, sep = "\t", row.names = FALSE)
+ tab1 <- read_tsv_arrow(tf)
+ tab2 <- read_delim_arrow(tf, delim = "\t")
+ expect_equal(tab1, tab2)
+ expect_equal(tbl, tab1)
+})
+
+test_that("read_delim_arrow parsing options: quote", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ df <- data.frame(a = c(1, 2), b = c("'abc'", "'def'"))
+ write.table(df, sep = ";", tf, row.names = FALSE, quote = FALSE)
+ tab1 <- read_delim_arrow(tf, delim = ";", quote = "'")
+
+ # Is this a problem?
+ # Component “a”: target is integer64, current is numeric
+ tab1$a <- as.numeric(tab1$a)
+ expect_equal(
+ tab1,
+ tibble::tibble(a = c(1, 2), b = c("abc", "def"))
+ )
+})
+
+test_that("read_csv_arrow parsing options: col_names", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ # Writing the CSV without the header
+ write.table(tbl, tf, sep = ",", row.names = FALSE, col.names = FALSE)
+
+ # Reading with col_names = FALSE autogenerates names
+ no_names <- read_csv_arrow(tf, col_names = FALSE)
+ expect_equal(no_names$f0, tbl[[1]])
+
+ tab1 <- read_csv_arrow(tf, col_names = names(tbl))
+
+ expect_identical(names(tab1), names(tbl))
+ expect_equal(tbl, tab1)
+
+ # This errors (correctly) because I haven't given enough names
+ # but the error message is "Invalid: Empty CSV file", which is not accurate
+ expect_error(
+ read_csv_arrow(tf, col_names = names(tbl)[1])
+ )
+ # Same here
+ expect_error(
+ read_csv_arrow(tf, col_names = c(names(tbl), names(tbl)))
+ )
+})
+
+test_that("read_csv_arrow parsing options: skip", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ # Adding two garbage lines to start the csv
+ cat("asdf\nqwer\n", file = tf)
+ suppressWarnings(write.table(tbl, tf, sep = ",", row.names = FALSE, append = TRUE))
+
+ tab1 <- read_csv_arrow(tf, skip = 2)
+
+ expect_identical(names(tab1), names(tbl))
+ expect_equal(tbl, tab1)
+})
+
+test_that("read_csv_arrow parsing options: skip_empty_rows", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write.csv(tbl, tf, row.names = FALSE)
+ cat("\n\n", file = tf, append = TRUE)
+
+ tab1 <- read_csv_arrow(tf, skip_empty_rows = FALSE)
+
+ expect_equal(nrow(tab1), nrow(tbl) + 2)
+ expect_true(is.na(tail(tab1, 1)[[1]]))
+})
+
+test_that("read_csv_arrow parsing options: na strings", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ df <- data.frame(
+ a = c(1.2, NA, NA, 3.4),
+ b = c(NA, "B", "C", NA),
+ stringsAsFactors = FALSE
+ )
+ write.csv(df, tf, row.names = FALSE)
+ expect_equal(grep("NA", readLines(tf)), 2:5)
+
+ tab1 <- read_csv_arrow(tf)
+ expect_equal(is.na(tab1$a), is.na(df$a))
+ expect_equal(is.na(tab1$b), is.na(df$b))
+
+ unlink(tf) # Delete and write to the same file name again
+
+ write.csv(df, tf, row.names = FALSE, na = "asdf")
+ expect_equal(grep("asdf", readLines(tf)), 2:5)
+
+ tab2 <- read_csv_arrow(tf, na = "asdf")
+ expect_equal(is.na(tab2$a), is.na(df$a))
+ expect_equal(is.na(tab2$b), is.na(df$b))
+})
+
+test_that("read_csv_arrow() respects col_select", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write.csv(tbl, tf, row.names = FALSE, quote = FALSE)
+
+ tab <- read_csv_arrow(tf, col_select = ends_with("l"), as_data_frame = FALSE)
+ expect_equal(tab, Table$create(example_data[, c("dbl", "lgl")]))
+
+ tib <- read_csv_arrow(tf, col_select = ends_with("l"), as_data_frame = TRUE)
+ expect_equal(tib, example_data[, c("dbl", "lgl")])
+})
+
+test_that("read_csv_arrow() can detect compression from file name", {
+ skip_if_not_available("gzip")
+ tf <- tempfile(fileext = ".csv.gz")
+ on.exit(unlink(tf))
+
+ write.csv(tbl, gzfile(tf), row.names = FALSE, quote = FALSE)
+ tab1 <- read_csv_arrow(tf)
+ expect_equal(tbl, tab1)
+})
+
+test_that("read_csv_arrow(schema=)", {
+ tbl <- example_data[, "int"]
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ write.csv(tbl, tf, row.names = FALSE)
+
+ df <- read_csv_arrow(tf, schema = schema(int = float64()), skip = 1)
+ expect_identical(df, tibble::tibble(int = as.numeric(tbl$int)))
+})
+
+test_that("read_csv_arrow(col_types = <Schema>)", {
+ tbl <- example_data[, "int"]
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ write.csv(tbl, tf, row.names = FALSE)
+
+ df <- read_csv_arrow(tf, col_types = schema(int = float64()))
+ expect_identical(df, tibble::tibble(int = as.numeric(tbl$int)))
+})
+
+test_that("read_csv_arrow(col_types=string, col_names)", {
+ tbl <- example_data[, "int"]
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ write.csv(tbl, tf, row.names = FALSE)
+
+ df <- read_csv_arrow(tf, col_names = "int", col_types = "d", skip = 1)
+ expect_identical(df, tibble::tibble(int = as.numeric(tbl$int)))
+
+ expect_error(read_csv_arrow(tf, col_types = c("i", "d")))
+ expect_error(read_csv_arrow(tf, col_types = "d"))
+ expect_error(read_csv_arrow(tf, col_types = "i", col_names = c("a", "b")))
+ expect_error(read_csv_arrow(tf, col_types = "y", col_names = "a"))
+})
+
+test_that("read_csv_arrow() can read timestamps", {
+ tbl <- tibble::tibble(time = as.POSIXct("2020-07-20 16:20", tz = "UTC"))
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ write.csv(tbl, tf, row.names = FALSE)
+
+ df <- read_csv_arrow(tf, col_types = schema(time = timestamp(timezone = "UTC")))
+ expect_equal(tbl, df)
+
+  # timestamps are read in as timezone-naive, hence ignore_attr = "tzone"
+ df <- read_csv_arrow(tf, col_types = "T", col_names = "time", skip = 1)
+ expect_equal(tbl, df, ignore_attr = "tzone")
+})
+
+test_that("read_csv_arrow(timestamp_parsers=)", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ tbl <- tibble::tibble(time = "23/09/2020")
+ write.csv(tbl, tf, row.names = FALSE)
+
+ df <- read_csv_arrow(
+ tf,
+ col_types = schema(time = timestamp(timezone = "UTC")),
+ timestamp_parsers = "%d/%m/%Y"
+ )
+ expect_equal(df$time, as.POSIXct(tbl$time, format = "%d/%m/%Y", tz = "UTC"))
+})
+
+test_that("Skipping columns with null()", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ cols <- c("dbl", "lgl", "false", "chr")
+ tbl <- example_data[, cols]
+ write.csv(tbl, tf, row.names = FALSE)
+
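+  # in the compact col_types string, "-" and "_" both mean "skip this column"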
+ df <- read_csv_arrow(tf, col_types = "d-_c", col_names = cols, skip = 1)
+ expect_identical(df, tbl[, c("dbl", "chr")])
+})
+
+test_that("Mix of guessing and declaring types", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ cols <- c("dbl", "lgl", "false", "chr")
+ tbl <- example_data[, cols]
+ write.csv(tbl, tf, row.names = FALSE)
+
+ tab <- read_csv_arrow(tf, col_types = schema(dbl = float32()), as_data_frame = FALSE)
+ expect_equal(tab$schema, schema(dbl = float32(), lgl = bool(), false = bool(), chr = utf8()))
+
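+  # "?" in the col_types string leaves that column's type to be guessed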
+ df <- read_csv_arrow(tf, col_types = "d-?c", col_names = cols, skip = 1)
+ expect_identical(df, tbl[, c("dbl", "false", "chr")])
+})
+
+
+test_that("Write a CSV file with header", {
+ tbl_out <- write_csv_arrow(tbl_no_dates, csv_file)
+ expect_true(file.exists(csv_file))
+ expect_identical(tbl_out, tbl_no_dates)
+
+ tbl_in <- read_csv_arrow(csv_file)
+ expect_identical(tbl_in, tbl_no_dates)
+
+ tbl_out <- write_csv_arrow(tbl, csv_file)
+ expect_true(file.exists(csv_file))
+ expect_identical(tbl_out, tbl)
+
+ tbl_in <- read_csv_arrow(csv_file)
+ expect_identical(tbl_in, tbl)
+})
+
+
+test_that("Write a CSV file with no header", {
+ tbl_out <- write_csv_arrow(tbl_no_dates, csv_file, include_header = FALSE)
+ expect_true(file.exists(csv_file))
+ expect_identical(tbl_out, tbl_no_dates)
+ tbl_in <- read_csv_arrow(csv_file, col_names = FALSE)
+
+ tbl_expected <- tbl_no_dates
+ names(tbl_expected) <- c("f0", "f1", "f2", "f3")
+
+ expect_identical(tbl_in, tbl_expected)
+})
+
+test_that("Write a CSV file with different batch sizes", {
+ tbl_out1 <- write_csv_arrow(tbl_no_dates, csv_file, batch_size = 1)
+ expect_true(file.exists(csv_file))
+ expect_identical(tbl_out1, tbl_no_dates)
+ tbl_in1 <- read_csv_arrow(csv_file)
+ expect_identical(tbl_in1, tbl_no_dates)
+
+ tbl_out2 <- write_csv_arrow(tbl_no_dates, csv_file, batch_size = 2)
+ expect_true(file.exists(csv_file))
+ expect_identical(tbl_out2, tbl_no_dates)
+ tbl_in2 <- read_csv_arrow(csv_file)
+ expect_identical(tbl_in2, tbl_no_dates)
+
+ tbl_out3 <- write_csv_arrow(tbl_no_dates, csv_file, batch_size = 12)
+ expect_true(file.exists(csv_file))
+ expect_identical(tbl_out3, tbl_no_dates)
+ tbl_in3 <- read_csv_arrow(csv_file)
+ expect_identical(tbl_in3, tbl_no_dates)
+})
+
+test_that("Write a CSV file with invalid input type", {
+ bad_input <- Array$create(1:5)
+ expect_error(
+ write_csv_arrow(bad_input, csv_file),
+ regexp = "x must be an object of class 'data.frame', 'RecordBatch', or 'Table', not 'Array'."
+ )
+})
+
+test_that("Write a CSV file with invalid batch size", {
+ expect_error(
+ write_csv_arrow(tbl_no_dates, csv_file, batch_size = -1),
+ regexp = "batch_size not greater than 0"
+ )
+})
+
+test_that("time mapping work as expected (ARROW-13624)", {
+ tbl <- tibble::tibble(
+ dt = as.POSIXct(c("2020-07-20 16:20", NA), tz = "UTC"),
+ time = c(hms::as_hms("16:20:00"), NA)
+ )
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ write.csv(tbl, tf, row.names = FALSE)
+
+ df <- read_csv_arrow(tf,
+ col_names = c("dt", "time"),
+ col_types = "Tt",
+ skip = 1
+ )
+
+ expect_error(
+ read_csv_arrow(tf,
+ col_names = c("dt", "time"),
+ col_types = "tT", skip = 1
+ )
+ )
+
+ expect_equal(df, tbl, ignore_attr = "tzone")
+})
diff --git a/src/arrow/r/tests/testthat/test-data-type.R b/src/arrow/r/tests/testthat/test-data-type.R
new file mode 100644
index 000000000..a9d0879b8
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-data-type.R
@@ -0,0 +1,429 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("null type works as expected", {
+ x <- null()
+ expect_equal(x$id, 0L)
+ expect_equal(x$name, "null")
+ expect_equal(x$ToString(), "null")
+ expect_true(x == x)
+ expect_false(x == int8())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+})
+
+test_that("boolean type work as expected", {
+ x <- boolean()
+ expect_equal(x$id, Type$BOOL)
+ expect_equal(x$name, "bool")
+ expect_equal(x$ToString(), "bool")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 1L)
+})
+
+test_that("int types works as expected", {
+ x <- uint8()
+ expect_equal(x$id, Type$UINT8)
+ expect_equal(x$name, "uint8")
+ expect_equal(x$ToString(), "uint8")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 8L)
+
+ x <- int8()
+ expect_equal(x$id, Type$INT8)
+ expect_equal(x$name, "int8")
+ expect_equal(x$ToString(), "int8")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 8L)
+
+ x <- uint16()
+ expect_equal(x$id, Type$UINT16)
+ expect_equal(x$name, "uint16")
+ expect_equal(x$ToString(), "uint16")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 16L)
+
+ x <- int16()
+ expect_equal(x$id, Type$INT16)
+ expect_equal(x$name, "int16")
+ expect_equal(x$ToString(), "int16")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 16L)
+
+ x <- uint32()
+ expect_equal(x$id, Type$UINT32)
+ expect_equal(x$name, "uint32")
+ expect_equal(x$ToString(), "uint32")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 32L)
+
+ x <- int32()
+ expect_equal(x$id, Type$INT32)
+ expect_equal(x$name, "int32")
+ expect_equal(x$ToString(), "int32")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 32L)
+
+ x <- uint64()
+ expect_equal(x$id, Type$UINT64)
+ expect_equal(x$name, "uint64")
+ expect_equal(x$ToString(), "uint64")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 64L)
+
+ x <- int64()
+ expect_equal(x$id, Type$INT64)
+ expect_equal(x$name, "int64")
+ expect_equal(x$ToString(), "int64")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 64L)
+})
+
+test_that("float types work as expected", {
+ x <- float16()
+ expect_equal(x$id, Type$HALF_FLOAT)
+ expect_equal(x$name, "halffloat")
+ expect_equal(x$ToString(), "halffloat")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 16L)
+
+ x <- float32()
+ expect_equal(x$id, Type$FLOAT)
+ expect_equal(x$name, "float")
+ expect_equal(x$ToString(), "float")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 32L)
+
+ x <- float64()
+ expect_equal(x$id, Type$DOUBLE)
+ expect_equal(x$name, "double")
+ expect_equal(x$ToString(), "double")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 64L)
+})
+
+test_that("utf8 type works as expected", {
+ x <- utf8()
+ expect_equal(x$id, Type$STRING)
+ expect_equal(x$name, "utf8")
+ expect_equal(x$ToString(), "string")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+})
+
+test_that("date types work as expected", {
+ x <- date32()
+ expect_equal(x$id, Type$DATE32)
+ expect_equal(x$name, "date32")
+ expect_equal(x$ToString(), "date32[day]")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$unit(), unclass(DateUnit$DAY))
+
+ x <- date64()
+ expect_equal(x$id, Type$DATE64)
+ expect_equal(x$name, "date64")
+ expect_equal(x$ToString(), "date64[ms]")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$unit(), unclass(DateUnit$MILLI))
+})
+
+test_that("timestamp type works as expected", {
+ x <- timestamp(TimeUnit$SECOND)
+ expect_equal(x$id, Type$TIMESTAMP)
+ expect_equal(x$name, "timestamp")
+ expect_equal(x$ToString(), "timestamp[s]")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 64L)
+ expect_equal(x$timezone(), "")
+ expect_equal(x$unit(), unclass(TimeUnit$SECOND))
+
+ x <- timestamp(TimeUnit$MILLI)
+ expect_equal(x$id, Type$TIMESTAMP)
+ expect_equal(x$name, "timestamp")
+ expect_equal(x$ToString(), "timestamp[ms]")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 64L)
+ expect_equal(x$timezone(), "")
+ expect_equal(x$unit(), unclass(TimeUnit$MILLI))
+
+ x <- timestamp(TimeUnit$MICRO)
+ expect_equal(x$id, Type$TIMESTAMP)
+ expect_equal(x$name, "timestamp")
+ expect_equal(x$ToString(), "timestamp[us]")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 64L)
+ expect_equal(x$timezone(), "")
+ expect_equal(x$unit(), unclass(TimeUnit$MICRO))
+
+ x <- timestamp(TimeUnit$NANO)
+ expect_equal(x$id, Type$TIMESTAMP)
+ expect_equal(x$name, "timestamp")
+ expect_equal(x$ToString(), "timestamp[ns]")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 64L)
+ expect_equal(x$timezone(), "")
+ expect_equal(x$unit(), unclass(TimeUnit$NANO))
+})
+
+test_that("timestamp with timezone", {
+ expect_equal(timestamp(timezone = "EST")$ToString(), "timestamp[s, tz=EST]")
+})
+
+test_that("time32 types work as expected", {
+ x <- time32(TimeUnit$SECOND)
+ expect_equal(x$id, Type$TIME32)
+ expect_equal(x$name, "time32")
+ expect_equal(x$ToString(), "time32[s]")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 32L)
+ expect_equal(x$unit(), unclass(TimeUnit$SECOND))
+
+ x <- time32(TimeUnit$MILLI)
+ expect_equal(x$id, Type$TIME32)
+ expect_equal(x$name, "time32")
+ expect_equal(x$ToString(), "time32[ms]")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 32L)
+ expect_equal(x$unit(), unclass(TimeUnit$MILLI))
+})
+
+test_that("time64 types work as expected", {
+ x <- time64(TimeUnit$MICRO)
+ expect_equal(x$id, Type$TIME64)
+ expect_equal(x$name, "time64")
+ expect_equal(x$ToString(), "time64[us]")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 64L)
+ expect_equal(x$unit(), unclass(TimeUnit$MICRO))
+
+ x <- time64(TimeUnit$NANO)
+ expect_equal(x$id, Type$TIME64)
+ expect_equal(x$name, "time64")
+ expect_equal(x$ToString(), "time64[ns]")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 64L)
+ expect_equal(x$unit(), unclass(TimeUnit$NANO))
+})
+
+test_that("time type unit validation", {
+ expect_equal(time32(TimeUnit$SECOND), time32("s"))
+ expect_equal(time32(TimeUnit$MILLI), time32("ms"))
+ expect_equal(time32(), time32(TimeUnit$MILLI))
+ expect_error(time32(4), '"unit" should be one of 1 or 0')
+ expect_error(time32(NULL), '"unit" should be one of "ms" or "s"')
+ expect_match_arg_error(time32("years"))
+
+ expect_equal(time64(TimeUnit$NANO), time64("n"))
+ expect_equal(time64(TimeUnit$MICRO), time64("us"))
+ expect_equal(time64(), time64(TimeUnit$NANO))
+ expect_error(time64(4), '"unit" should be one of 3 or 2')
+ expect_error(time64(NULL), '"unit" should be one of "ns" or "us"')
+ expect_match_arg_error(time64("years"))
+})
+
+test_that("timestamp type input validation", {
+ expect_equal(timestamp("ms"), timestamp(TimeUnit$MILLI))
+ expect_equal(timestamp(), timestamp(TimeUnit$SECOND))
+ expect_error(
+ timestamp(NULL),
+ '"unit" should be one of "ns", "us", "ms", or "s"'
+ )
+ expect_error(
+ timestamp(timezone = 1231231),
+ "timezone is not a string"
+ )
+ expect_error(
+ timestamp(timezone = c("not", "a", "timezone")),
+ "timezone is not a string"
+ )
+})
+
+test_that("list type works as expected", {
+ x <- list_of(int32())
+ expect_equal(x$id, Type$LIST)
+ expect_equal(x$name, "list")
+ expect_equal(x$ToString(), "list<item: int32>")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 1L)
+ expect_equal(
+ x$fields()[[1]],
+ field("item", int32())
+ )
+ expect_equal(x$value_type, int32())
+ expect_equal(x$value_field, field("item", int32()))
+})
+
+test_that("struct type works as expected", {
+ x <- struct(x = int32(), y = boolean())
+ expect_equal(x$id, Type$STRUCT)
+ expect_equal(x$name, "struct")
+ expect_equal(x$ToString(), "struct<x: int32, y: bool>")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 2L)
+ expect_equal(
+ x$fields()[[1]],
+ field("x", int32())
+ )
+ expect_equal(
+ x$fields()[[2]],
+ field("y", boolean())
+ )
+ expect_equal(x$GetFieldIndex("x"), 0L)
+ expect_equal(x$GetFieldIndex("y"), 1L)
+ expect_equal(x$GetFieldIndex("z"), -1L)
+
+ expect_equal(x$GetFieldByName("x"), field("x", int32()))
+ expect_equal(x$GetFieldByName("y"), field("y", boolean()))
+ expect_null(x$GetFieldByName("z"))
+})
+
+test_that("DictionaryType works as expected (ARROW-3355)", {
+ d <- dictionary(int32(), utf8())
+ expect_equal(d, d)
+ expect_true(d == d)
+ expect_false(d == int32())
+ expect_equal(d$id, Type$DICTIONARY)
+ expect_equal(d$bit_width, 32L)
+ expect_equal(d$ToString(), "dictionary<values=string, indices=int32>")
+ expect_equal(d$index_type, int32())
+ expect_equal(d$value_type, utf8())
+ ord <- dictionary(ordered = TRUE)
+ expect_equal(ord$ToString(), "dictionary<values=string, indices=int32, ordered>")
+})
+
+test_that("DictionaryType validation", {
+ expect_error(
+ dictionary(utf8(), int32()),
+ "Dictionary index type should be .*integer, got string"
+ )
+ expect_error(dictionary(4, utf8()), 'index_type must be a "DataType"')
+ expect_error(dictionary(int8(), "strings"), 'value_type must be a "DataType"')
+})
+
+test_that("decimal type and validation", {
+ expect_error(decimal())
+ expect_error(decimal("four"), '"precision" must be an integer')
+ expect_error(decimal(4))
+ expect_error(decimal(4, "two"), '"scale" must be an integer')
+ expect_error(decimal(NA, 2), '"precision" must be an integer')
+ expect_error(decimal(0, 2), "Invalid: Decimal precision out of range: 0")
+ expect_error(decimal(100, 2), "Invalid: Decimal precision out of range: 100")
+ expect_error(decimal(4, NA), '"scale" must be an integer')
+
+ expect_r6_class(decimal(4, 2), "Decimal128Type")
+})
+
+test_that("Binary", {
+ expect_r6_class(binary(), "Binary")
+ expect_equal(binary()$ToString(), "binary")
+})
+
+test_that("FixedSizeBinary", {
+ expect_r6_class(fixed_size_binary(4), "FixedSizeBinary")
+ expect_equal(fixed_size_binary(4)$ToString(), "fixed_size_binary[4]")
+
+ # input validation
+ expect_error(fixed_size_binary(NA), "'byte_width' cannot be NA")
+ expect_error(fixed_size_binary(-1), "'byte_width' must be > 0")
+ expect_error(fixed_size_binary("four"))
+ expect_error(fixed_size_binary(c(2, 4)))
+})
+
+test_that("DataType to C-interface", {
+ datatype <- timestamp("ms", timezone = "Pacific/Marquesas")
+
+ # export the datatype via the C-interface
+ ptr <- allocate_arrow_schema()
+ datatype$export_to_c(ptr)
+
+ # then import it and check that the roundtripped value is the same
+ circle <- DataType$import_from_c(ptr)
+ expect_equal(circle, datatype)
+
+ # must clean up the pointer or we leak
+ delete_arrow_schema(ptr)
+})
diff --git a/src/arrow/r/tests/testthat/test-dataset-csv.R b/src/arrow/r/tests/testthat/test-dataset-csv.R
new file mode 100644
index 000000000..ab6693148
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dataset-csv.R
@@ -0,0 +1,290 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+
+csv_dir <- make_temp_dir()
+tsv_dir <- make_temp_dir()
+
+test_that("Setup (putting data in the dirs)", {
+ dir.create(file.path(csv_dir, 5))
+ dir.create(file.path(csv_dir, 6))
+ write.csv(df1, file.path(csv_dir, 5, "file1.csv"), row.names = FALSE)
+ write.csv(df2, file.path(csv_dir, 6, "file2.csv"), row.names = FALSE)
+ expect_length(dir(csv_dir, recursive = TRUE), 2)
+
+ # Now, tab-delimited
+ dir.create(file.path(tsv_dir, 5))
+ dir.create(file.path(tsv_dir, 6))
+ write.table(df1, file.path(tsv_dir, 5, "file1.tsv"), row.names = FALSE, sep = "\t")
+ write.table(df2, file.path(tsv_dir, 6, "file2.tsv"), row.names = FALSE, sep = "\t")
+ expect_length(dir(tsv_dir, recursive = TRUE), 2)
+})
+
+test_that("CSV dataset", {
+ ds <- open_dataset(csv_dir, partitioning = "part", format = "csv")
+ expect_r6_class(ds$format, "CsvFileFormat")
+ expect_r6_class(ds$filesystem, "LocalFileSystem")
+ expect_identical(names(ds), c(names(df1), "part"))
+ if (getRversion() >= "4.0.0") {
+ # CountRows segfaults on RTools35/R 3.6, so don't test it there
+ expect_identical(dim(ds), c(20L, 7L))
+ }
+ expect_equal(
+ ds %>%
+ select(string = chr, integer = int, part) %>%
+ filter(integer > 6 & part == 5) %>%
+ collect() %>%
+      summarize(mean = mean(as.numeric(integer))), # as.numeric because integers are parsed as int64
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+ # Collecting virtual partition column works
+ expect_equal(
+ collect(ds) %>% arrange(part) %>% pull(part),
+ c(rep(5, 10), rep(6, 10))
+ )
+})
+
+test_that("CSV scan options", {
+ options <- FragmentScanOptions$create("text")
+ expect_equal(options$type, "csv")
+ options <- FragmentScanOptions$create("csv",
+ null_values = c("mynull"),
+ strings_can_be_null = TRUE
+ )
+ expect_equal(options$type, "csv")
+
+ dst_dir <- make_temp_dir()
+ dst_file <- file.path(dst_dir, "data.csv")
+ df <- tibble(chr = c("foo", "mynull"))
+ write.csv(df, dst_file, row.names = FALSE, quote = FALSE)
+
+ ds <- open_dataset(dst_dir, format = "csv")
+ expect_equal(ds %>% collect(), df)
+
+ sb <- ds$NewScan()
+ sb$FragmentScanOptions(options)
+
+ tab <- sb$Finish()$ToTable()
+ expect_equal(as.data.frame(tab), tibble(chr = c("foo", NA)))
+
+ # Set default convert options in CsvFileFormat
+ csv_format <- CsvFileFormat$create(
+ null_values = c("mynull"),
+ strings_can_be_null = TRUE
+ )
+ ds <- open_dataset(dst_dir, format = csv_format)
+ expect_equal(ds %>% collect(), tibble(chr = c("foo", NA)))
+
+ # Set both parse and convert options
+ df <- tibble(chr = c("foo", "mynull"), chr2 = c("bar", "baz"))
+ write.table(df, dst_file, row.names = FALSE, quote = FALSE, sep = "\t")
+ ds <- open_dataset(dst_dir,
+ format = "csv",
+ delimiter = "\t",
+ null_values = c("mynull"),
+ strings_can_be_null = TRUE
+ )
+ expect_equal(ds %>% collect(), tibble(
+ chr = c("foo", NA),
+ chr2 = c("bar", "baz")
+ ))
+ expect_equal(
+ ds %>%
+ group_by(chr2) %>%
+ summarize(na = all(is.na(chr))) %>%
+ arrange(chr2) %>%
+ collect(),
+ tibble(
+ chr2 = c("bar", "baz"),
+ na = c(FALSE, TRUE)
+ )
+ )
+})
+
+test_that("compressed CSV dataset", {
+ skip_if_not_available("gzip")
+ dst_dir <- make_temp_dir()
+ dst_file <- file.path(dst_dir, "data.csv.gz")
+ write.csv(df1, gzfile(dst_file), row.names = FALSE, quote = FALSE)
+ format <- FileFormat$create("csv")
+ ds <- open_dataset(dst_dir, format = format)
+ expect_r6_class(ds$format, "CsvFileFormat")
+ expect_r6_class(ds$filesystem, "LocalFileSystem")
+
+ expect_equal(
+ ds %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6 & integer < 11) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+test_that("CSV dataset options", {
+ dst_dir <- make_temp_dir()
+ dst_file <- file.path(dst_dir, "data.csv")
+ df <- tibble(chr = letters[1:10])
+ write.csv(df, dst_file, row.names = FALSE, quote = FALSE)
+
+ format <- FileFormat$create("csv", skip_rows = 1)
+ ds <- open_dataset(dst_dir, format = format)
+
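+  # skip_rows = 1 drops the file's header line, so the first data row ("a")
+  # becomes the column name and the remaining rows equal df1$chr[-1]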
+ expect_equal(
+ ds %>%
+ select(string = a) %>%
+ collect(),
+ df1[-1, ] %>%
+ select(string = chr)
+ )
+
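+  # with column_names supplied and no rows skipped, the original header row
+  # ("chr") is read back as data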
+ ds <- open_dataset(dst_dir, format = "csv", column_names = c("foo"))
+
+ expect_equal(
+ ds %>%
+ select(string = foo) %>%
+ collect(),
+    tibble(string = c("chr", letters[1:10]))
+ )
+})
+
+test_that("Other text delimited dataset", {
+ ds1 <- open_dataset(tsv_dir, partitioning = "part", format = "tsv")
+ expect_equal(
+ ds1 %>%
+ select(string = chr, integer = int, part) %>%
+ filter(integer > 6 & part == 5) %>%
+ collect() %>%
+      summarize(mean = mean(as.numeric(integer))), # as.numeric because integers are parsed as int64
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+
+ ds2 <- open_dataset(tsv_dir, partitioning = "part", format = "text", delimiter = "\t")
+ expect_equal(
+ ds2 %>%
+ select(string = chr, integer = int, part) %>%
+ filter(integer > 6 & part == 5) %>%
+ collect() %>%
+      summarize(mean = mean(as.numeric(integer))), # as.numeric because integers are parsed as int64
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+test_that("readr parse options", {
+ arrow_opts <- names(formals(CsvParseOptions$create))
+ readr_opts <- names(formals(readr_to_csv_parse_options))
+
+ # Arrow and readr parse options must be mutually exclusive, or else the code
+ # in `csv_file_format_parse_options()` will error or behave incorrectly. A
+ # failure of this test indicates that these two sets of option names are not
+ # mutually exclusive.
+ expect_equal(
+ intersect(arrow_opts, readr_opts),
+ character(0)
+ )
+
+ # With not yet supported readr parse options (ARROW-8631)
+ expect_error(
+ open_dataset(tsv_dir, partitioning = "part", delim = "\t", na = "\\N"),
+ "supported"
+ )
+
+ # With unrecognized (garbage) parse options
+ expect_error(
+ open_dataset(
+ tsv_dir,
+ partitioning = "part",
+ format = "text",
+ asdfg = "\\"
+ ),
+ "Unrecognized"
+ )
+
+ # With both Arrow and readr parse options (disallowed)
+ expect_error(
+ open_dataset(
+ tsv_dir,
+ partitioning = "part",
+ format = "text",
+ quote = "\"",
+ quoting = TRUE
+ ),
+ "either"
+ )
+
+ # With ambiguous partial option names (disallowed)
+ expect_error(
+ open_dataset(
+ tsv_dir,
+ partitioning = "part",
+ format = "text",
+      quo = "\""
+ ),
+ "Ambiguous"
+ )
+
+ # With only readr parse options (and omitting format = "text")
+ ds1 <- open_dataset(tsv_dir, partitioning = "part", delim = "\t")
+ expect_equal(
+ ds1 %>%
+ select(string = chr, integer = int, part) %>%
+ filter(integer > 6 & part == 5) %>%
+ collect() %>%
+      summarize(mean = mean(as.numeric(integer))), # as.numeric because integers are parsed as int64
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+# see https://issues.apache.org/jira/browse/ARROW-12791
+test_that("Error if no format specified and files are not parquet", {
+ expect_error(
+ open_dataset(csv_dir, partitioning = "part"),
+ "Did you mean to specify a 'format' other than the default (parquet)?",
+ fixed = TRUE
+ )
+ expect_error(
+ open_dataset(csv_dir, partitioning = "part", format = "parquet"),
+ "Parquet magic bytes not found"
+ )
+})
+
+test_that("Column names inferred from schema for headerless CSVs (ARROW-14063)", {
+ headerless_csv_dir <- make_temp_dir()
+ tbl <- df1[, c("int", "dbl")]
+ write.table(tbl, file.path(headerless_csv_dir, "file1.csv"), sep = ",", row.names = FALSE, col.names = FALSE)
+
+ ds <- open_dataset(headerless_csv_dir, format = "csv", schema = schema(int = int32(), dbl = float64()))
+ expect_equal(ds %>% collect(), tbl)
+})
diff --git a/src/arrow/r/tests/testthat/test-dataset-dplyr.R b/src/arrow/r/tests/testthat/test-dataset-dplyr.R
new file mode 100644
index 000000000..b4519377c
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dataset-dplyr.R
@@ -0,0 +1,340 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+skip_if_not_available("parquet")
+
+library(dplyr, warn.conflicts = FALSE)
+
+dataset_dir <- make_temp_dir()
+hive_dir <- make_temp_dir()
+
+test_that("Setup (putting data in the dir)", {
+ dir.create(file.path(dataset_dir, 1))
+ dir.create(file.path(dataset_dir, 2))
+ write_parquet(df1, file.path(dataset_dir, 1, "file1.parquet"))
+ write_parquet(df2, file.path(dataset_dir, 2, "file2.parquet"))
+ expect_length(dir(dataset_dir, recursive = TRUE), 2)
+
+ dir.create(file.path(hive_dir, "subdir", "group=1", "other=xxx"), recursive = TRUE)
+ dir.create(file.path(hive_dir, "subdir", "group=2", "other=yyy"), recursive = TRUE)
+ write_parquet(df1, file.path(hive_dir, "subdir", "group=1", "other=xxx", "file1.parquet"))
+ write_parquet(df2, file.path(hive_dir, "subdir", "group=2", "other=yyy", "file2.parquet"))
+ expect_length(dir(hive_dir, recursive = TRUE), 2)
+})
+
+test_that("filter() with is.nan()", {
+ ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
+ expect_equal(
+ ds %>%
+ select(part, dbl) %>%
+ filter(!is.nan(dbl), part == 2) %>%
+ collect(),
+ tibble(part = 2L, dbl = df2$dbl[!is.nan(df2$dbl)])
+ )
+})
+
+test_that("filter() with %in%", {
+ ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
+ expect_equal(
+ ds %>%
+ select(int, part) %>%
+ filter(int %in% c(6, 4, 3, 103, 107), part == 1) %>%
+ collect(),
+ tibble(int = df1$int[c(3, 4, 6)], part = 1)
+ )
+
+ # ARROW-9606: bug in %in% filter on partition column with >1 partition columns
+ ds <- open_dataset(hive_dir)
+ expect_equal(
+ ds %>%
+ filter(group %in% 2) %>%
+ select(names(df2)) %>%
+ collect(),
+ df2
+ )
+})
+
+test_that("filter() on timestamp columns", {
+ ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
+ expect_equal(
+ ds %>%
+ filter(ts >= lubridate::ymd_hms("2015-05-04 03:12:39")) %>%
+ filter(part == 1) %>%
+ select(ts) %>%
+ collect(),
+    df1[5:10, c("ts")]
+ )
+
+ # Now with Date
+ expect_equal(
+ ds %>%
+ filter(ts >= as.Date("2015-05-04")) %>%
+ filter(part == 1) %>%
+ select(ts) %>%
+ collect(),
+    df1[5:10, c("ts")]
+ )
+
+ # Now with bare string date
+ skip("Implement more aggressive implicit casting for scalars (ARROW-11402)")
+ expect_equal(
+ ds %>%
+ filter(ts >= "2015-05-04") %>%
+ filter(part == 1) %>%
+ select(ts) %>%
+ collect(),
+    df1[5:10, c("ts")]
+ )
+})
+
+test_that("filter() on date32 columns", {
+ tmp <- tempfile()
+ dir.create(tmp)
+ df <- data.frame(date = as.Date(c("2020-02-02", "2020-02-03")))
+ write_parquet(df, file.path(tmp, "file.parquet"))
+
+ expect_equal(
+ open_dataset(tmp) %>%
+ filter(date > as.Date("2020-02-02")) %>%
+ collect() %>%
+ nrow(),
+ 1L
+ )
+
+ # Also with timestamp scalar
+ expect_equal(
+ open_dataset(tmp) %>%
+ filter(date > lubridate::ymd_hms("2020-02-02 00:00:00")) %>%
+ collect() %>%
+ nrow(),
+ 1L
+ )
+})
+
+
+test_that("mutate()", {
+ ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
+ mutated <- ds %>%
+ select(chr, dbl, int) %>%
+ filter(dbl * 2 > 14 & dbl - 50 < 3L) %>%
+ mutate(twice = int * 2)
+ expect_output(
+ print(mutated),
+ "FileSystemDataset (query)
+chr: string
+dbl: double
+int: int32
+twice: double (multiply_checked(int, 2))
+
+* Filter: ((multiply_checked(dbl, 2) > 14) and (subtract_checked(dbl, 50) < 3))
+See $.data for the source Arrow object",
+ fixed = TRUE
+ )
+ expect_equal(
+ mutated %>%
+ collect() %>%
+ arrange(dbl),
+ rbind(
+ df1[8:10, c("chr", "dbl", "int")],
+ df2[1:2, c("chr", "dbl", "int")]
+ ) %>%
+ mutate(
+ twice = int * 2
+ )
+ )
+})
+
+test_that("mutate() features not yet implemented", {
+ expect_error(
+ ds %>%
+ group_by(int) %>%
+ mutate(avg = mean(int)),
+ "window functions not currently supported in Arrow\nCall collect() first to pull data into R.",
+ fixed = TRUE
+ )
+})
+
+test_that("filter scalar validation doesn't crash (ARROW-7772)", {
+ expect_error(
+ ds %>%
+ filter(int == "fff", part == 1) %>%
+ collect(),
+ "equal has no kernel matching input types .array.int32., scalar.string.."
+ )
+})
+
+test_that("collect() on Dataset works (if fits in memory)", {
+ expect_equal(
+ collect(open_dataset(dataset_dir)) %>% arrange(int),
+ rbind(df1, df2)
+ )
+})
+
+test_that("count()", {
+ ds <- open_dataset(dataset_dir)
+ df <- rbind(df1, df2)
+ expect_equal(
+ ds %>%
+ filter(int > 6, int < 108) %>%
+ count(chr) %>%
+ arrange(chr) %>%
+ collect(),
+ df %>%
+ filter(int > 6, int < 108) %>%
+ count(chr)
+ )
+})
+
+test_that("arrange()", {
+ ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
+ arranged <- ds %>%
+ select(chr, dbl, int) %>%
+ filter(dbl * 2 > 14 & dbl - 50 < 3L) %>%
+ mutate(twice = int * 2) %>%
+ arrange(chr, desc(twice), dbl + int)
+ expect_output(
+ print(arranged),
+ "FileSystemDataset (query)
+chr: string
+dbl: double
+int: int32
+twice: double (multiply_checked(int, 2))
+
+* Filter: ((multiply_checked(dbl, 2) > 14) and (subtract_checked(dbl, 50) < 3))
+* Sorted by chr [asc], multiply_checked(int, 2) [desc], add_checked(dbl, int) [asc]
+See $.data for the source Arrow object",
+ fixed = TRUE
+ )
+ expect_equal(
+ arranged %>%
+ collect(),
+ rbind(
+ df1[8, c("chr", "dbl", "int")],
+ df2[2, c("chr", "dbl", "int")],
+ df1[9, c("chr", "dbl", "int")],
+ df2[1, c("chr", "dbl", "int")],
+ df1[10, c("chr", "dbl", "int")]
+ ) %>%
+ mutate(
+ twice = int * 2
+ )
+ )
+})
+
+test_that("compute()/collect(as_data_frame=FALSE)", {
+ ds <- open_dataset(dataset_dir)
+
+ tab1 <- ds %>% compute()
+ expect_r6_class(tab1, "Table")
+
+ tab2 <- ds %>% collect(as_data_frame = FALSE)
+ expect_r6_class(tab2, "Table")
+
+ tab3 <- ds %>%
+ mutate(negint = -int) %>%
+ filter(negint > -100) %>%
+ arrange(chr) %>%
+ select(negint) %>%
+ compute()
+
+ expect_r6_class(tab3, "Table")
+
+ expect_equal(
+ tab3 %>% collect(),
+ tibble(negint = -1:-10)
+ )
+
+ tab4 <- ds %>%
+ mutate(negint = -int) %>%
+ filter(negint > -100) %>%
+ arrange(chr) %>%
+ select(negint) %>%
+ collect(as_data_frame = FALSE)
+
+  expect_r6_class(tab4, "Table")
+
+ expect_equal(
+ tab4 %>% collect(),
+ tibble(negint = -1:-10)
+ )
+
+ tab5 <- ds %>%
+ mutate(negint = -int) %>%
+ group_by(fct) %>%
+ compute()
+
+ # the group_by() prevents compute() from returning a Table...
+ expect_s3_class(tab5, "arrow_dplyr_query")
+
+ # ... but $.data is a Table (InMemoryDataset)...
+ expect_r6_class(tab5$.data, "InMemoryDataset")
+ # ... and the mutate() was evaluated
+ expect_true("negint" %in% names(tab5$.data))
+})
+
+test_that("head/tail on query on dataset", {
+ # head/tail on arrow_dplyr_query does not have deterministic order,
+ # so without sorting we can only assert the correct number of rows
+ ds <- open_dataset(dataset_dir)
+
+ expect_identical(
+ ds %>%
+ filter(int > 6) %>%
+ head(5) %>%
+ compute() %>%
+ nrow(),
+ 5L
+ )
+
+ expect_equal(
+ ds %>%
+ filter(int > 6) %>%
+ arrange(int) %>%
+ head() %>%
+ collect(),
+ rbind(df1[7:10, ], df2[1:2, ])
+ )
+
+ expect_equal(
+ ds %>%
+ filter(int < 105) %>%
+ tail(4) %>%
+ compute() %>%
+ nrow(),
+ 4L
+ )
+
+ expect_equal(
+ ds %>%
+ filter(int < 105) %>%
+ arrange(int) %>%
+ tail() %>%
+ collect(),
+ rbind(df1[9:10, ], df2[1:4, ])
+ )
+})
+
+test_that("dplyr method not implemented messages", {
+ ds <- open_dataset(dataset_dir)
+  # Nuanced case: the filter expression itself (an aggregate inside
+  # filter()) is what's unsupported, so the error should point to collect()
+ expect_error(
+ ds %>% filter(int > 6, dbl > max(dbl)),
+ "Filter expression not supported for Arrow Datasets: dbl > max(dbl)\nCall collect() first to pull data into R.",
+ fixed = TRUE
+ )
+})
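+
+test_that("collect() first enables the unsupported filter (illustrative sketch)", {
+  # Not part of the upstream suite: once the data is collect()ed, dplyr
+  # itself evaluates max(dbl), so the aggregate-in-filter expression works.
+  # No row can exceed the column max (and NA/NaN propagate), so 0 rows match.
+  ds <- open_dataset(dataset_dir)
+  expect_identical(
+    ds %>%
+      collect() %>%
+      filter(int > 6, dbl > max(dbl)) %>%
+      nrow(),
+    0L
+  )
+})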
diff --git a/src/arrow/r/tests/testthat/test-dataset-uri.R b/src/arrow/r/tests/testthat/test-dataset-uri.R
new file mode 100644
index 000000000..bdcccf282
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dataset-uri.R
@@ -0,0 +1,123 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_on_os("windows")
+skip_if_not_available("parquet")
+skip_if_not_available("dataset")
+
+
+library(dplyr, warn.conflicts = FALSE)
+
+dataset_dir <- make_temp_dir()
+
+test_that("Setup (putting data in the dir)", {
+ dir.create(file.path(dataset_dir, 1))
+ dir.create(file.path(dataset_dir, 2))
+ write_parquet(df1, file.path(dataset_dir, 1, "file1.parquet"))
+ write_parquet(df2, file.path(dataset_dir, 2, "file2.parquet"))
+ expect_length(dir(dataset_dir, recursive = TRUE), 2)
+})
+
+files <- c(
+ file.path(dataset_dir, 1, "file1.parquet", fsep = "/"),
+ file.path(dataset_dir, 2, "file2.parquet", fsep = "/")
+)
+
+
+test_that("dataset from single local file path", {
+ ds <- open_dataset(files[1])
+ expect_r6_class(ds, "Dataset")
+ expect_equal(
+ ds %>%
+ select(chr, dbl) %>%
+ filter(dbl > 7) %>%
+ collect() %>%
+ arrange(dbl),
+ df1[8:10, c("chr", "dbl")]
+ )
+})
+
+test_that("dataset from vector of file paths", {
+ ds <- open_dataset(files)
+ expect_r6_class(ds, "Dataset")
+ expect_equal(
+ ds %>%
+ select(chr, dbl) %>%
+ filter(dbl > 7 & dbl < 53L) %>%
+ collect() %>%
+ arrange(dbl),
+ rbind(
+ df1[8:10, c("chr", "dbl")],
+ df2[1:2, c("chr", "dbl")]
+ )
+ )
+})
+
+test_that("dataset from directory URI", {
+ uri <- paste0("file://", dataset_dir)
+ ds <- open_dataset(uri, partitioning = schema(part = uint8()))
+ expect_r6_class(ds, "Dataset")
+ expect_equal(
+ ds %>%
+ select(chr, dbl) %>%
+ filter(dbl > 7 & dbl < 53L) %>%
+ collect() %>%
+ arrange(dbl),
+ rbind(
+ df1[8:10, c("chr", "dbl")],
+ df2[1:2, c("chr", "dbl")]
+ )
+ )
+})
+
+test_that("dataset from single file URI", {
+ uri <- paste0("file://", files[1])
+ ds <- open_dataset(uri)
+ expect_r6_class(ds, "Dataset")
+ expect_equal(
+ ds %>%
+ select(chr, dbl) %>%
+ filter(dbl > 7) %>%
+ collect() %>%
+ arrange(dbl),
+ df1[8:10, c("chr", "dbl")]
+ )
+})
+
+test_that("dataset from vector of file URIs", {
+ uris <- paste0("file://", files)
+ ds <- open_dataset(uris)
+ expect_r6_class(ds, "Dataset")
+ expect_equal(
+ ds %>%
+ select(chr, dbl) %>%
+ filter(dbl > 7 & dbl < 53L) %>%
+ collect() %>%
+ arrange(dbl),
+ rbind(
+ df1[8:10, c("chr", "dbl")],
+ df2[1:2, c("chr", "dbl")]
+ )
+ )
+})
+
+test_that("open_dataset errors on mixed paths and URIs", {
+ expect_error(
+ open_dataset(c(files[1], paste0("file://", files[2]))),
+ "Vectors of mixed paths and URIs are not supported"
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dataset-write.R b/src/arrow/r/tests/testthat/test-dataset-write.R
new file mode 100644
index 000000000..8e7c077e6
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dataset-write.R
@@ -0,0 +1,454 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+
+
+hive_dir <- make_temp_dir()
+csv_dir <- make_temp_dir()
+
+test_that("Setup (putting data in the dirs)", {
+ if (arrow_with_parquet()) {
+ dir.create(file.path(hive_dir, "subdir", "group=1", "other=xxx"), recursive = TRUE)
+ dir.create(file.path(hive_dir, "subdir", "group=2", "other=yyy"), recursive = TRUE)
+ write_parquet(df1, file.path(hive_dir, "subdir", "group=1", "other=xxx", "file1.parquet"))
+ write_parquet(df2, file.path(hive_dir, "subdir", "group=2", "other=yyy", "file2.parquet"))
+ expect_length(dir(hive_dir, recursive = TRUE), 2)
+ }
+
+ # Now, CSV
+ dir.create(file.path(csv_dir, 5))
+ dir.create(file.path(csv_dir, 6))
+ write.csv(df1, file.path(csv_dir, 5, "file1.csv"), row.names = FALSE)
+ write.csv(df2, file.path(csv_dir, 6, "file2.csv"), row.names = FALSE)
+ expect_length(dir(csv_dir, recursive = TRUE), 2)
+})
+
+test_that("Writing a dataset: CSV->IPC", {
+ ds <- open_dataset(csv_dir, partitioning = "part", format = "csv")
+ dst_dir <- make_temp_dir()
+ write_dataset(ds, dst_dir, format = "feather", partitioning = "int")
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "=")))
+
+ new_ds <- open_dataset(dst_dir, format = "feather")
+
+ expect_equal(
+ new_ds %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6 & integer < 11) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+
+ # Check whether "int" is present in the files or just in the dirs
+ first <- read_feather(
+ dir(dst_dir, pattern = ".feather$", recursive = TRUE, full.names = TRUE)[1],
+ as_data_frame = FALSE
+ )
+ # It shouldn't be there
+ expect_false("int" %in% names(first))
+})
+
+test_that("Writing a dataset: Parquet->IPC", {
+ skip_if_not_available("parquet")
+ ds <- open_dataset(hive_dir)
+ dst_dir <- make_temp_dir()
+ write_dataset(ds, dst_dir, format = "feather", partitioning = "int")
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "=")))
+
+ new_ds <- open_dataset(dst_dir, format = "feather")
+
+ expect_equal(
+ new_ds %>%
+ select(string = chr, integer = int, group) %>%
+ filter(integer > 6 & group == 1) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+test_that("Writing a dataset: CSV->Parquet", {
+ skip_if_not_available("parquet")
+ ds <- open_dataset(csv_dir, partitioning = "part", format = "csv")
+ dst_dir <- make_temp_dir()
+ write_dataset(ds, dst_dir, format = "parquet", partitioning = "int")
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "=")))
+
+ new_ds <- open_dataset(dst_dir)
+
+ expect_equal(
+ new_ds %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6 & integer < 11) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+test_that("Writing a dataset: Parquet->Parquet (default)", {
+ skip_if_not_available("parquet")
+ ds <- open_dataset(hive_dir)
+ dst_dir <- make_temp_dir()
+ write_dataset(ds, dst_dir, partitioning = "int")
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "=")))
+
+ new_ds <- open_dataset(dst_dir)
+
+ expect_equal(
+ new_ds %>%
+ select(string = chr, integer = int, group) %>%
+ filter(integer > 6 & group == 1) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+test_that("Writing a dataset: existing data behavior", {
+ # This test does not work on Windows because unlink does not immediately
+ # delete the data.
+ skip_on_os("windows")
+ ds <- open_dataset(csv_dir, partitioning = "part", format = "csv")
+ dst_dir <- make_temp_dir()
+ write_dataset(ds, dst_dir, format = "feather", partitioning = "int")
+ expect_true(dir.exists(dst_dir))
+
+ check_dataset <- function() {
+ new_ds <- open_dataset(dst_dir, format = "feather")
+
+ expect_equal(
+ new_ds %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6 & integer < 11) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+ }
+
+ check_dataset()
+ # By default we should overwrite
+ write_dataset(ds, dst_dir, format = "feather", partitioning = "int")
+ check_dataset()
+ write_dataset(ds, dst_dir, format = "feather", partitioning = "int", existing_data_behavior = "overwrite")
+ check_dataset()
+ expect_error(
+ write_dataset(ds, dst_dir, format = "feather", partitioning = "int", existing_data_behavior = "error"),
+ "directory is not empty"
+ )
+ unlink(dst_dir, recursive = TRUE)
+ write_dataset(ds, dst_dir, format = "feather", partitioning = "int", existing_data_behavior = "error")
+ check_dataset()
+})
+
+test_that("Writing a dataset: no format specified", {
+ dst_dir <- make_temp_dir()
+ write_dataset(example_data, dst_dir)
+ new_ds <- open_dataset(dst_dir)
+ expect_equal(
+ list.files(dst_dir, pattern = "parquet"),
+ "part-0.parquet"
+ )
+ expect_true(
+ inherits(new_ds$format, "ParquetFileFormat")
+ )
+ expect_equal(
+ new_ds %>% collect(),
+ example_data
+ )
+})
+
+test_that("Dataset writing: dplyr methods", {
+ skip_if_not_available("parquet")
+ ds <- open_dataset(hive_dir)
+ dst_dir <- tempfile()
+ # Specify partition vars by group_by
+ ds %>%
+ group_by(int) %>%
+ write_dataset(dst_dir, format = "feather")
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "=")))
+
+ # select to specify schema (and rename)
+ dst_dir2 <- tempfile()
+ ds %>%
+ group_by(int) %>%
+ select(chr, dubs = dbl) %>%
+ write_dataset(dst_dir2, format = "feather")
+ new_ds <- open_dataset(dst_dir2, format = "feather")
+
+ expect_equal(
+ collect(new_ds) %>% arrange(int),
+ rbind(df1[c("chr", "dbl", "int")], df2[c("chr", "dbl", "int")]) %>% rename(dubs = dbl)
+ )
+
+ # filter to restrict written rows
+ dst_dir3 <- tempfile()
+ ds %>%
+ filter(int == 4) %>%
+ write_dataset(dst_dir3, format = "feather")
+ new_ds <- open_dataset(dst_dir3, format = "feather")
+
+ expect_equal(
+ new_ds %>% select(names(df1)) %>% collect(),
+ df1 %>% filter(int == 4)
+ )
+
+ # mutate
+ dst_dir3 <- tempfile()
+ ds %>%
+ filter(int == 4) %>%
+ mutate(twice = int * 2) %>%
+ write_dataset(dst_dir3, format = "feather")
+ new_ds <- open_dataset(dst_dir3, format = "feather")
+
+ expect_equal(
+ new_ds %>% select(c(names(df1), "twice")) %>% collect(),
+ df1 %>% filter(int == 4) %>% mutate(twice = int * 2)
+ )
+})
+
+test_that("Dataset writing: non-hive", {
+ skip_if_not_available("parquet")
+ ds <- open_dataset(hive_dir)
+ dst_dir <- tempfile()
+ write_dataset(ds, dst_dir, format = "feather", partitioning = "int", hive_style = FALSE)
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), sort(as.character(c(1:10, 101:110))))
+})
+
+test_that("Dataset writing: no partitioning", {
+ skip_if_not_available("parquet")
+ ds <- open_dataset(hive_dir)
+ dst_dir <- tempfile()
+ write_dataset(ds, dst_dir, format = "feather", partitioning = NULL)
+ expect_true(dir.exists(dst_dir))
+ expect_true(length(dir(dst_dir)) > 0)
+})
+
+test_that("Dataset writing: partition on null", {
+ ds <- open_dataset(hive_dir)
+
+ dst_dir <- tempfile()
+ partitioning <- hive_partition(lgl = boolean())
+ write_dataset(ds, dst_dir, partitioning = partitioning)
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), c("lgl=__HIVE_DEFAULT_PARTITION__", "lgl=false", "lgl=true"))
+
+ dst_dir <- tempfile()
+ partitioning <- hive_partition(lgl = boolean(), null_fallback = "xyz")
+ write_dataset(ds, dst_dir, partitioning = partitioning)
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), c("lgl=false", "lgl=true", "lgl=xyz"))
+
+ ds_readback <- open_dataset(dst_dir, partitioning = hive_partition(lgl = boolean(), null_fallback = "xyz"))
+
+ expect_identical(
+ ds %>%
+ select(int, lgl) %>%
+ collect() %>%
+ arrange(lgl, int),
+ ds_readback %>%
+ select(int, lgl) %>%
+ collect() %>%
+ arrange(lgl, int)
+ )
+})
+
+test_that("Dataset writing: from data.frame", {
+ dst_dir <- tempfile()
+ stacked <- rbind(df1, df2)
+ stacked %>%
+ group_by(int) %>%
+ write_dataset(dst_dir, format = "feather")
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "=")))
+
+ new_ds <- open_dataset(dst_dir, format = "feather")
+
+ expect_equal(
+ new_ds %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6 & integer < 11) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+test_that("Dataset writing: from RecordBatch", {
+ dst_dir <- tempfile()
+ stacked <- record_batch(rbind(df1, df2))
+ stacked %>%
+ group_by(int) %>%
+ write_dataset(dst_dir, format = "feather")
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "=")))
+
+ new_ds <- open_dataset(dst_dir, format = "feather")
+
+ expect_equal(
+ new_ds %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6 & integer < 11) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+test_that("Writing a dataset: Ipc format options & compression", {
+ ds <- open_dataset(csv_dir, partitioning = "part", format = "csv")
+ dst_dir <- make_temp_dir()
+
+ codec <- NULL
+ if (codec_is_available("zstd")) {
+ codec <- Codec$create("zstd")
+ }
+
+ write_dataset(ds, dst_dir, format = "feather", codec = codec)
+ expect_true(dir.exists(dst_dir))
+
+ new_ds <- open_dataset(dst_dir, format = "feather")
+ expect_equal(
+ new_ds %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6 & integer < 11) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+test_that("Writing a dataset: Parquet format options", {
+ skip_if_not_available("parquet")
+ ds <- open_dataset(csv_dir, partitioning = "part", format = "csv")
+ dst_dir <- make_temp_dir()
+ dst_dir_no_truncated_timestamps <- make_temp_dir()
+
+ # Use trace() to confirm that options are passed in
+ suppressMessages(trace(
+ "parquet___ArrowWriterProperties___create",
+ tracer = quote(warning("allow_truncated_timestamps == ", allow_truncated_timestamps)),
+ print = FALSE,
+ where = write_dataset
+ ))
+ expect_warning(
+ write_dataset(ds, dst_dir_no_truncated_timestamps, format = "parquet", partitioning = "int"),
+ "allow_truncated_timestamps == FALSE"
+ )
+ expect_warning(
+ write_dataset(ds, dst_dir, format = "parquet", partitioning = "int", allow_truncated_timestamps = TRUE),
+ "allow_truncated_timestamps == TRUE"
+ )
+ suppressMessages(untrace(
+ "parquet___ArrowWriterProperties___create",
+ where = write_dataset
+ ))
+
+ # Now confirm we can read back what we sent
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "=")))
+
+ new_ds <- open_dataset(dst_dir)
+
+ expect_equal(
+ new_ds %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6 & integer < 11) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+test_that("Writing a dataset: CSV format options", {
+ df <- tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+    chr = letters[1:10]
+ )
+
+ dst_dir <- make_temp_dir()
+ write_dataset(df, dst_dir, format = "csv")
+ expect_true(dir.exists(dst_dir))
+ new_ds <- open_dataset(dst_dir, format = "csv")
+ expect_equal(new_ds %>% collect(), df)
+
+ dst_dir <- make_temp_dir()
+ write_dataset(df, dst_dir, format = "csv", include_header = FALSE)
+ expect_true(dir.exists(dst_dir))
+ new_ds <- open_dataset(dst_dir,
+ format = "csv",
+ column_names = c("int", "dbl", "lgl", "chr")
+ )
+ expect_equal(new_ds %>% collect(), df)
+})
+
+test_that("Dataset writing: unsupported features/input validation", {
+ skip_if_not_available("parquet")
+ expect_error(write_dataset(4), 'dataset must be a "Dataset"')
+
+ ds <- open_dataset(hive_dir)
+ expect_error(
+ write_dataset(ds, partitioning = c("int", "NOTACOLUMN"), format = "ipc"),
+ 'Invalid field name: "NOTACOLUMN"'
+ )
+ expect_error(
+ write_dataset(ds, tempfile(), basename_template = "something_without_i")
+ )
+ expect_error(
+ write_dataset(ds, tempfile(), basename_template = NULL)
+ )
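+
+  # For contrast (sketch, not upstream): a template containing "{i}" is the
+  # accepted form, and writing with it should succeed
+  valid_dir <- tempfile()
+  write_dataset(ds, valid_dir, basename_template = "chunk-{i}.parquet")
+  expect_true(length(dir(valid_dir, recursive = TRUE)) > 0)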
+})
diff --git a/src/arrow/r/tests/testthat/test-dataset.R b/src/arrow/r/tests/testthat/test-dataset.R
new file mode 100644
index 000000000..4403b479a
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dataset.R
@@ -0,0 +1,696 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+
+dataset_dir <- make_temp_dir()
+hive_dir <- make_temp_dir()
+ipc_dir <- make_temp_dir()
+
+test_that("Setup (putting data in the dir)", {
+ if (arrow_with_parquet()) {
+ dir.create(file.path(dataset_dir, 1))
+ dir.create(file.path(dataset_dir, 2))
+ write_parquet(df1, file.path(dataset_dir, 1, "file1.parquet"))
+ write_parquet(df2, file.path(dataset_dir, 2, "file2.parquet"))
+ expect_length(dir(dataset_dir, recursive = TRUE), 2)
+
+ dir.create(file.path(hive_dir, "subdir", "group=1", "other=xxx"), recursive = TRUE)
+ dir.create(file.path(hive_dir, "subdir", "group=2", "other=yyy"), recursive = TRUE)
+ write_parquet(df1, file.path(hive_dir, "subdir", "group=1", "other=xxx", "file1.parquet"))
+ write_parquet(df2, file.path(hive_dir, "subdir", "group=2", "other=yyy", "file2.parquet"))
+ expect_length(dir(hive_dir, recursive = TRUE), 2)
+ }
+
+ # Now, an IPC format dataset
+ dir.create(file.path(ipc_dir, 3))
+ dir.create(file.path(ipc_dir, 4))
+ write_feather(df1, file.path(ipc_dir, 3, "file1.arrow"))
+ write_feather(df2, file.path(ipc_dir, 4, "file2.arrow"))
+ expect_length(dir(ipc_dir, recursive = TRUE), 2)
+})
+
+test_that("IPC/Feather format data", {
+ ds <- open_dataset(ipc_dir, partitioning = "part", format = "feather")
+ expect_r6_class(ds$format, "IpcFileFormat")
+ expect_r6_class(ds$filesystem, "LocalFileSystem")
+ expect_identical(names(ds), c(names(df1), "part"))
+ expect_identical(dim(ds), c(20L, 7L))
+
+ expect_equal(
+ ds %>%
+ select(string = chr, integer = int, part) %>%
+ filter(integer > 6 & part == 3) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+
+ # Collecting virtual partition column works
+ expect_equal(
+ ds %>% arrange(part) %>% pull(part),
+ c(rep(3, 10), rep(4, 10))
+ )
+})
+
+expect_scan_result <- function(ds, schm) {
+ sb <- ds$NewScan()
+ expect_r6_class(sb, "ScannerBuilder")
+ expect_equal(sb$schema, schm)
+
+ sb$Project(c("chr", "lgl"))
+ sb$Filter(Expression$field_ref("dbl") == 8)
+ scn <- sb$Finish()
+ expect_r6_class(scn, "Scanner")
+
+ tab <- scn$ToTable()
+ expect_r6_class(tab, "Table")
+
+ expect_equal(
+ as.data.frame(tab),
+ df1[8, c("chr", "lgl")]
+ )
+}
+
+test_that("URI-decoding with directory partitioning", {
+ root <- make_temp_dir()
+ fmt <- FileFormat$create("feather")
+ fs <- LocalFileSystem$create()
+ selector <- FileSelector$create(root, recursive = TRUE)
+ dir1 <- file.path(root, "2021-05-04 00%3A00%3A00", "%24")
+ dir.create(dir1, recursive = TRUE)
+ write_feather(df1, file.path(dir1, "data.feather"))
+
+ partitioning <- DirectoryPartitioning$create(
+ schema(date = timestamp(unit = "s"), string = utf8())
+ )
+ factory <- FileSystemDatasetFactory$create(
+ fs, selector, NULL, fmt,
+ partitioning = partitioning
+ )
+ schm <- factory$Inspect()
+ ds <- factory$Finish(schm)
+ expect_scan_result(ds, schm)
+
+ partitioning <- DirectoryPartitioning$create(
+ schema(date = timestamp(unit = "s"), string = utf8()),
+ segment_encoding = "none"
+ )
+ factory <- FileSystemDatasetFactory$create(
+ fs, selector, NULL, fmt,
+ partitioning = partitioning
+ )
+ schm <- factory$Inspect()
+ expect_error(factory$Finish(schm), "Invalid: error parsing")
+
+ partitioning_factory <- DirectoryPartitioningFactory$create(
+ c("date", "string")
+ )
+ factory <- FileSystemDatasetFactory$create(
+ fs, selector, NULL, fmt, partitioning_factory
+ )
+ schm <- factory$Inspect()
+ ds <- factory$Finish(schm)
+ # Can't directly inspect partition expressions, so do it implicitly via scan
+ expect_equal(
+ ds %>%
+ filter(date == "2021-05-04 00:00:00", string == "$") %>%
+ select(int) %>%
+ collect(),
+ df1 %>% select(int) %>% collect()
+ )
+
+ partitioning_factory <- DirectoryPartitioningFactory$create(
+ c("date", "string"),
+ segment_encoding = "none"
+ )
+ factory <- FileSystemDatasetFactory$create(
+ fs, selector, NULL, fmt, partitioning_factory
+ )
+ schm <- factory$Inspect()
+ ds <- factory$Finish(schm)
+ expect_equal(
+ ds %>%
+ filter(date == "2021-05-04 00%3A00%3A00", string == "%24") %>%
+ select(int) %>%
+ collect(),
+ df1 %>% select(int) %>% collect()
+ )
+})
+
+test_that("URI-decoding with hive partitioning", {
+ root <- make_temp_dir()
+ fmt <- FileFormat$create("feather")
+ fs <- LocalFileSystem$create()
+ selector <- FileSelector$create(root, recursive = TRUE)
+ dir1 <- file.path(root, "date=2021-05-04 00%3A00%3A00", "string=%24")
+ dir.create(dir1, recursive = TRUE)
+ write_feather(df1, file.path(dir1, "data.feather"))
+
+ partitioning <- hive_partition(
+ date = timestamp(unit = "s"), string = utf8()
+ )
+ factory <- FileSystemDatasetFactory$create(
+ fs, selector, NULL, fmt,
+ partitioning = partitioning
+ )
+  schm <- factory$Inspect()
+  ds <- factory$Finish(schm)
+ expect_scan_result(ds, schm)
+
+ partitioning <- hive_partition(
+ date = timestamp(unit = "s"), string = utf8(), segment_encoding = "none"
+ )
+ factory <- FileSystemDatasetFactory$create(
+ fs, selector, NULL, fmt,
+ partitioning = partitioning
+ )
+ expect_error(factory$Finish(schm), "Invalid: error parsing")
+
+ partitioning_factory <- hive_partition()
+ factory <- FileSystemDatasetFactory$create(
+ fs, selector, NULL, fmt, partitioning_factory
+ )
+ schm <- factory$Inspect()
+ ds <- factory$Finish(schm)
+ # Can't directly inspect partition expressions, so do it implicitly via scan
+ expect_equal(
+ ds %>%
+ filter(date == "2021-05-04 00:00:00", string == "$") %>%
+ select(int) %>%
+ collect(),
+ df1 %>% select(int) %>% collect()
+ )
+
+ partitioning_factory <- hive_partition(segment_encoding = "none")
+ factory <- FileSystemDatasetFactory$create(
+ fs, selector, NULL, fmt, partitioning_factory
+ )
+ schm <- factory$Inspect()
+ ds <- factory$Finish(schm)
+ expect_equal(
+ ds %>%
+ filter(date == "2021-05-04 00%3A00%3A00", string == "%24") %>%
+ select(int) %>%
+ collect(),
+ df1 %>% select(int) %>% collect()
+ )
+})
+
+# Everything else below here is using parquet files
+skip_if_not_available("parquet")
+
+files <- c(
+ file.path(dataset_dir, 1, "file1.parquet", fsep = "/"),
+ file.path(dataset_dir, 2, "file2.parquet", fsep = "/")
+)
+
+test_that("Simple interface for datasets", {
+ ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
+ expect_r6_class(ds$format, "ParquetFileFormat")
+ expect_r6_class(ds$filesystem, "LocalFileSystem")
+ expect_r6_class(ds, "Dataset")
+ expect_equal(
+ ds %>%
+ select(chr, dbl) %>%
+ filter(dbl > 7 & dbl < 53L) %>% # Testing the auto-casting of scalars
+ collect() %>%
+ arrange(dbl),
+ rbind(
+ df1[8:10, c("chr", "dbl")],
+ df2[1:2, c("chr", "dbl")]
+ )
+ )
+
+ expect_equal(
+ ds %>%
+ select(string = chr, integer = int, part) %>%
+ filter(integer > 6 & part == 1) %>% # 6 not 6L to test autocasting
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+
+ # Collecting virtual partition column works
+ expect_equal(
+ ds %>% arrange(part) %>% pull(part),
+ c(rep(1, 10), rep(2, 10))
+ )
+})
+
+test_that("dim method returns the correct number of rows and columns", {
+ ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
+ expect_identical(dim(ds), c(20L, 7L))
+})
+
+
+test_that("dim() correctly determine numbers of rows and columns on arrow_dplyr_query object", {
+ ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
+
+ expect_identical(
+ ds %>%
+ filter(chr == "a") %>%
+ dim(),
+ c(2L, 7L)
+ )
+ expect_equal(
+ ds %>%
+ select(chr, fct, int) %>%
+ dim(),
+ c(20L, 3L)
+ )
+ expect_identical(
+ ds %>%
+ select(chr, fct, int) %>%
+ filter(chr == "a") %>%
+ dim(),
+ c(2L, 3L)
+ )
+})
+
+test_that("Simple interface for datasets (custom ParquetFileFormat)", {
+ ds <- open_dataset(dataset_dir,
+ partitioning = schema(part = uint8()),
+ format = FileFormat$create("parquet", dict_columns = c("chr"))
+ )
+ expect_type_equal(ds$schema$GetFieldByName("chr")$type, dictionary())
+})
+
+test_that("Hive partitioning", {
+ ds <- open_dataset(hive_dir, partitioning = hive_partition(other = utf8(), group = uint8()))
+ expect_r6_class(ds, "Dataset")
+ expect_equal(
+ ds %>%
+ filter(group == 2) %>%
+ select(chr, dbl) %>%
+ filter(dbl > 7 & dbl < 53) %>%
+ collect() %>%
+ arrange(dbl),
+ df2[1:2, c("chr", "dbl")]
+ )
+})
+
+test_that("input validation", {
+  # hive_partition() passed positionally lands in the `schema` argument
+  expect_error(
+ open_dataset(hive_dir, hive_partition(other = utf8(), group = uint8()))
+ )
+})
+
+test_that("Partitioning inference", {
+ # These are the same tests as above, just using the *PartitioningFactory
+ ds1 <- open_dataset(dataset_dir, partitioning = "part")
+ expect_identical(names(ds1), c(names(df1), "part"))
+ expect_equal(
+ ds1 %>%
+ select(string = chr, integer = int, part) %>%
+ filter(integer > 6 & part == 1) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+
+ ds2 <- open_dataset(hive_dir)
+ expect_identical(names(ds2), c(names(df1), "group", "other"))
+ expect_equal(
+ ds2 %>%
+ filter(group == 2) %>%
+ select(chr, dbl) %>%
+ filter(dbl > 7 & dbl < 53) %>%
+ collect() %>%
+ arrange(dbl),
+ df2[1:2, c("chr", "dbl")]
+ )
+})
+
+test_that("Dataset with multiple file formats", {
+ skip("https://issues.apache.org/jira/browse/ARROW-7653")
+ ds <- open_dataset(list(
+ open_dataset(dataset_dir, format = "parquet", partitioning = "part"),
+ open_dataset(ipc_dir, format = "arrow", partitioning = "part")
+ ))
+ expect_identical(names(ds), c(names(df1), "part"))
+ expect_equal(
+ ds %>%
+ filter(int > 6 & part %in% c(1, 3)) %>%
+ select(string = chr, integer = int) %>%
+ collect(),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ rbind(., .) # Stack it twice
+ )
+})
+
+test_that("Creating UnionDataset", {
+ ds1 <- open_dataset(file.path(dataset_dir, 1))
+ ds2 <- open_dataset(file.path(dataset_dir, 2))
+ union1 <- open_dataset(list(ds1, ds2))
+ expect_r6_class(union1, "UnionDataset")
+ expect_equal(
+ union1 %>%
+ select(chr, dbl) %>%
+ filter(dbl > 7 & dbl < 53L) %>% # Testing the auto-casting of scalars
+ collect() %>%
+ arrange(dbl),
+ rbind(
+ df1[8:10, c("chr", "dbl")],
+ df2[1:2, c("chr", "dbl")]
+ )
+ )
+
+ # Now with the c() method
+ union2 <- c(ds1, ds2)
+ expect_r6_class(union2, "UnionDataset")
+ expect_equal(
+ union2 %>%
+ select(chr, dbl) %>%
+ filter(dbl > 7 & dbl < 53L) %>% # Testing the auto-casting of scalars
+ collect() %>%
+ arrange(dbl),
+ rbind(
+ df1[8:10, c("chr", "dbl")],
+ df2[1:2, c("chr", "dbl")]
+ )
+ )
+
+ # Confirm c() method error handling
+ expect_error(c(ds1, 42), "character")
+})
+
+test_that("map_batches", {
+ skip("map_batches() is broken (ARROW-14029)")
+ ds <- open_dataset(dataset_dir, partitioning = "part")
+ expect_equal(
+ ds %>%
+ filter(int > 5) %>%
+ select(int, lgl) %>%
+ map_batches(~ summarize(., min_int = min(int))),
+ tibble(min_int = c(6L, 101L))
+ )
+})
+
+test_that("partitioning = NULL to ignore partition information (but why?)", {
+ ds <- open_dataset(hive_dir, partitioning = NULL)
+ expect_identical(names(ds), names(df1)) # i.e. not c(names(df1), "group", "other")
+})
+
+test_that("head/tail", {
+ # head/tail with no query are still deterministic order
+ ds <- open_dataset(dataset_dir)
+ expect_equal(as.data.frame(head(ds)), head(df1))
+ expect_equal(
+ as.data.frame(head(ds, 12)),
+ rbind(df1, df2[1:2, ])
+ )
+
+ expect_equal(as.data.frame(tail(ds)), tail(df2))
+ expect_equal(
+ as.data.frame(tail(ds, 12)),
+ rbind(df1[9:10, ], df2)
+ )
+})
+
+test_that("Dataset [ (take by index)", {
+ ds <- open_dataset(dataset_dir)
+ # Taking only from one file
+ expect_equal(
+ as.data.frame(ds[c(4, 5, 9), 3:4]),
+ df1[c(4, 5, 9), 3:4]
+ )
+ # Taking from more than one
+ expect_equal(
+ as.data.frame(ds[c(4, 5, 9, 12, 13), 3:4]),
+ rbind(df1[c(4, 5, 9), 3:4], df2[2:3, 3:4])
+ )
+ # Taking out of order
+ expect_equal(
+ as.data.frame(ds[c(4, 13, 9, 12, 5), ]),
+ rbind(
+ df1[4, ],
+ df2[3, ],
+ df1[9, ],
+ df2[2, ],
+ df1[5, ]
+ )
+ )
+
+ # Take from a query
+ ds2 <- ds %>%
+ filter(int > 6) %>%
+ select(int, lgl)
+ expect_equal(
+ as.data.frame(ds2[c(2, 5), ]),
+ rbind(
+ df1[8, c("int", "lgl")],
+ df2[1, c("int", "lgl")]
+ )
+ )
+})
+
+test_that("Dataset and query print methods", {
+ ds <- open_dataset(hive_dir)
+ expect_output(
+ print(ds),
+ paste(
+ "FileSystemDataset with 2 Parquet files",
+ "int: int32",
+ "dbl: double",
+ "lgl: bool",
+ "chr: string",
+ "fct: dictionary<values=string, indices=int32>",
+ "ts: timestamp[us, tz=UTC]",
+ "group: int32",
+ "other: string",
+ sep = "\n"
+ ),
+ fixed = TRUE
+ )
+ expect_type(ds$metadata, "list")
+ q <- select(ds, string = chr, lgl, integer = int)
+ expect_output(
+ print(q),
+ paste(
+ "Dataset (query)",
+ "string: string",
+ "lgl: bool",
+ "integer: int32",
+ "",
+ "See $.data for the source Arrow object",
+ sep = "\n"
+ ),
+ fixed = TRUE
+ )
+ expect_output(
+ print(q %>% filter(integer == 6) %>% group_by(lgl)),
+ paste(
+ "Dataset (query)",
+ "string: string",
+ "lgl: bool",
+ "integer: int32",
+ "",
+ "* Filter: (int == 6)",
+ "* Grouped by lgl",
+ "See $.data for the source Arrow object",
+ sep = "\n"
+ ),
+ fixed = TRUE
+ )
+})
+
+test_that("Scanner$ScanBatches", {
+ ds <- open_dataset(ipc_dir, format = "feather")
+ batches <- ds$NewScan()$Finish()$ScanBatches()
+ table <- Table$create(!!!batches)
+ expect_equal(as.data.frame(table), rbind(df1, df2))
+
+ batches <- ds$NewScan()$UseAsync(TRUE)$Finish()$ScanBatches()
+ table <- Table$create(!!!batches)
+ expect_equal(as.data.frame(table), rbind(df1, df2))
+})
+
+test_that("Scanner$ToRecordBatchReader()", {
+ ds <- open_dataset(dataset_dir, partitioning = "part")
+ scan <- ds %>%
+ filter(part == 1) %>%
+ select(int, lgl) %>%
+ filter(int > 6) %>%
+ Scanner$create()
+ reader <- scan$ToRecordBatchReader()
+ expect_r6_class(reader, "RecordBatchReader")
+ expect_identical(
+ as.data.frame(reader$read_table()),
+ df1[df1$int > 6, c("int", "lgl")]
+ )
+})
+
+test_that("Scanner$create() filter/projection pushdown", {
+ ds <- open_dataset(dataset_dir, partitioning = "part")
+
+ # the standard to compare all Scanner$create()s against
+ scan_one <- ds %>%
+ filter(int > 7 & dbl < 57) %>%
+ select(int, dbl, lgl) %>%
+ mutate(int_plus = int + 1, dbl_minus = dbl - 1) %>%
+ Scanner$create()
+
+ # select a column in projection
+ scan_two <- ds %>%
+ filter(int > 7 & dbl < 57) %>%
+ # select an extra column, since we are going to
+ select(int, dbl, lgl, chr) %>%
+ mutate(int_plus = int + 1, dbl_minus = dbl - 1) %>%
+ Scanner$create(projection = c("int", "dbl", "lgl", "int_plus", "dbl_minus"))
+ expect_identical(
+ as.data.frame(scan_one$ToRecordBatchReader()$read_table()),
+ as.data.frame(scan_two$ToRecordBatchReader()$read_table())
+ )
+
+ # adding filters to Scanner$create
+ scan_three <- ds %>%
+ filter(int > 7) %>%
+ select(int, dbl, lgl) %>%
+ mutate(int_plus = int + 1, dbl_minus = dbl - 1) %>%
+ Scanner$create(
+ filter = Expression$create("less", Expression$field_ref("dbl"), Expression$scalar(57))
+ )
+ expect_identical(
+ as.data.frame(scan_one$ToRecordBatchReader()$read_table()),
+ as.data.frame(scan_three$ToRecordBatchReader()$read_table())
+ )
+
+ expect_error(
+ ds %>%
+ select(int, dbl, lgl) %>%
+ Scanner$create(projection = "not_a_col"),
+    # The full message in >= 4.0.0 is "attempting to project with unknown columns";
+    # prior versions have the less helpful "all(projection %in% names(proj)) is not TRUE"
+ "project"
+ )
+
+ expect_error(
+ ds %>%
+ select(int, dbl, lgl) %>%
+ Scanner$create(filter = list("foo", "bar")),
+ "filter expressions must be either an expression or a list of expressions"
+ )
+})
+
+test_that("Assembling a Dataset manually and getting a Table", {
+ fs <- LocalFileSystem$create()
+ selector <- FileSelector$create(dataset_dir, recursive = TRUE)
+ partitioning <- DirectoryPartitioning$create(schema(part = double()))
+
+ fmt <- FileFormat$create("parquet")
+ factory <- FileSystemDatasetFactory$create(fs, selector, NULL, fmt, partitioning = partitioning)
+ expect_r6_class(factory, "FileSystemDatasetFactory")
+
+ schm <- factory$Inspect()
+ expect_r6_class(schm, "Schema")
+
+ phys_schm <- ParquetFileReader$create(files[1])$GetSchema()
+ expect_equal(names(phys_schm), names(df1))
+ expect_equal(names(schm), c(names(phys_schm), "part"))
+
+ child <- factory$Finish(schm)
+ expect_r6_class(child, "FileSystemDataset")
+ expect_r6_class(child$schema, "Schema")
+ expect_r6_class(child$format, "ParquetFileFormat")
+ expect_equal(names(schm), names(child$schema))
+ expect_equal(child$files, files)
+
+ ds <- Dataset$create(list(child), schm)
+ expect_scan_result(ds, schm)
+})
+
+test_that("Assembling multiple DatasetFactories with DatasetFactory", {
+ factory1 <- dataset_factory(file.path(dataset_dir, 1), format = "parquet")
+ expect_r6_class(factory1, "FileSystemDatasetFactory")
+ factory2 <- dataset_factory(file.path(dataset_dir, 2), format = "parquet")
+ expect_r6_class(factory2, "FileSystemDatasetFactory")
+
+ factory <- DatasetFactory$create(list(factory1, factory2))
+ expect_r6_class(factory, "DatasetFactory")
+
+ schm <- factory$Inspect()
+ expect_r6_class(schm, "Schema")
+
+ phys_schm <- ParquetFileReader$create(files[1])$GetSchema()
+ expect_equal(names(phys_schm), names(df1))
+
+ ds <- factory$Finish(schm)
+ expect_r6_class(ds, "UnionDataset")
+ expect_r6_class(ds$schema, "Schema")
+ expect_equal(names(schm), names(ds$schema))
+ expect_equal(unlist(map(ds$children, ~ .$files)), files)
+
+ expect_scan_result(ds, schm)
+})
+
+# see https://issues.apache.org/jira/browse/ARROW-11328
+test_that("Collecting zero columns from a dataset doesn't return entire dataset", {
+ tmp <- tempfile()
+ write_dataset(mtcars, tmp, format = "parquet")
+ expect_equal(
+ open_dataset(tmp) %>% select() %>% collect() %>% dim(),
+ c(32, 0)
+ )
+})
+
+
+test_that("dataset RecordBatchReader to C-interface to arrow_dplyr_query", {
+ ds <- open_dataset(ipc_dir, partitioning = "part", format = "feather")
+
+ # export the RecordBatchReader via the C-interface
+ stream_ptr <- allocate_arrow_array_stream()
+ scan <- Scanner$create(ds)
+ reader <- scan$ToRecordBatchReader()
+ reader$export_to_c(stream_ptr)
+
+ # then import it and check that the roundtripped value is the same
+ circle <- RecordBatchStreamReader$import_from_c(stream_ptr)
+
+ # create an arrow_dplyr_query() from the recordbatch reader
+ reader_adq <- arrow_dplyr_query(circle)
+
+  # TODO (ARROW-14321): we should be able to arrange() and then collect() directly
+ tab_from_c_new <- reader_adq %>%
+ filter(int < 8, int > 55) %>%
+ mutate(part_plus = part + 6) %>%
+ collect()
+ expect_equal(
+ tab_from_c_new %>%
+ arrange(dbl),
+ ds %>%
+ filter(int < 8, int > 55) %>%
+ mutate(part_plus = part + 6) %>%
+ collect() %>%
+ arrange(dbl)
+ )
+
+ # must clean up the pointer or we leak
+ delete_arrow_array_stream(stream_ptr)
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-arrange.R b/src/arrow/r/tests/testthat/test-dplyr-arrange.R
new file mode 100644
index 000000000..d22f64a7c
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-arrange.R
@@ -0,0 +1,205 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+
+# randomize order of rows in test data
+tbl <- slice_sample(example_data_for_sorting, prop = 1L)
+
+test_that("arrange() on integer, double, and character columns", {
+ compare_dplyr_binding(
+ .input %>%
+ arrange(int, chr) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ arrange(int, desc(dbl)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ arrange(int, desc(desc(dbl))) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ arrange(int) %>%
+ arrange(desc(dbl)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ arrange(int + dbl, chr) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+      mutate(zzz = int + dbl) %>%
+ arrange(zzz, chr) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(zzz = int + dbl) %>%
+ arrange(int + dbl, chr) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(int + dbl) %>%
+ arrange(int + dbl, chr) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(grp) %>%
+ arrange(int, dbl) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(grp) %>%
+ arrange(int, dbl, .by_group = TRUE) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(grp, grp2) %>%
+ arrange(int, dbl, .by_group = TRUE) %>%
+ collect(),
+ tbl %>%
+ mutate(grp2 = ifelse(is.na(lgl), 1L, as.integer(lgl)))
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(grp) %>%
+ arrange(.by_group = TRUE) %>%
+ pull(grp),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ arrange() %>%
+ collect(),
+ tbl %>%
+ group_by(grp)
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(grp) %>%
+ arrange() %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ arrange() %>%
+ collect(),
+ tbl
+ )
+ test_sort_col <- "chr"
+ compare_dplyr_binding(
+ .input %>%
+ arrange(!!sym(test_sort_col)) %>%
+ collect(),
+ tbl %>%
+ select(chr, lgl)
+ )
+ test_sort_cols <- c("int", "dbl")
+ compare_dplyr_binding(
+ .input %>%
+ arrange(!!!syms(test_sort_cols)) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("arrange() on datetime columns", {
+ compare_dplyr_binding(
+ .input %>%
+ arrange(dttm, int) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ arrange(dttm) %>%
+ collect(),
+ tbl %>%
+ select(dttm, grp)
+ )
+})
+
+test_that("arrange() on logical columns", {
+ compare_dplyr_binding(
+ .input %>%
+ arrange(lgl, int) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("arrange() with bad inputs", {
+ expect_error(
+ tbl %>%
+ Table$create() %>%
+ arrange(1),
+ "does not contain any field names",
+ fixed = TRUE
+ )
+ expect_error(
+ tbl %>%
+ Table$create() %>%
+ arrange(2 + 2),
+ "does not contain any field names",
+ fixed = TRUE
+ )
+ expect_error(
+ tbl %>%
+ Table$create() %>%
+ arrange(aertidjfgjksertyj),
+ "not found",
+ fixed = TRUE
+ )
+ expect_error(
+ tbl %>%
+ Table$create() %>%
+ arrange(desc(aertidjfgjksertyj + iaermxiwerksxsdqq)),
+ "not found",
+ fixed = TRUE
+ )
+ expect_error(
+ tbl %>%
+ Table$create() %>%
+ arrange(desc(int, chr)),
+ "expects only one argument",
+ fixed = TRUE
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-collapse.R b/src/arrow/r/tests/testthat/test-dplyr-collapse.R
new file mode 100644
index 000000000..c7281b62d
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-collapse.R
@@ -0,0 +1,235 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+withr::local_options(list(arrow.summarise.sort = TRUE))
+
+library(dplyr, warn.conflicts = FALSE)
+library(stringr)
+
+tbl <- example_data
+# Add some better string data
+tbl$verses <- verses[[1]]
+# c(" a ", " b ", " c ", ...) increasing padding
+# nchar = 3 5 7 9 11 13 15 17 19 21
+tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both")
+tbl$some_grouping <- rep(c(1, 2), 5)
+
+tab <- Table$create(tbl)
+
+test_that("implicit_schema with select", {
+ expect_equal(
+ tab %>%
+ select(int, lgl) %>%
+ implicit_schema(),
+ schema(int = int32(), lgl = bool())
+ )
+})
+
+test_that("implicit_schema with rename", {
+ expect_equal(
+ tab %>%
+ select(numbers = int, lgl) %>%
+ implicit_schema(),
+ schema(numbers = int32(), lgl = bool())
+ )
+})
+
+test_that("implicit_schema with mutate", {
+ expect_equal(
+ tab %>%
+ transmute(
+ numbers = int * 4,
+ words = as.character(int)
+ ) %>%
+ implicit_schema(),
+ schema(numbers = float64(), words = utf8())
+ )
+})
+
+test_that("implicit_schema with summarize", {
+ expect_equal(
+ tab %>%
+ summarize(
+ avg = mean(int)
+ ) %>%
+ implicit_schema(),
+ schema(avg = float64())
+ )
+})
+
+test_that("implicit_schema with group_by summarize", {
+ expect_equal(
+ tab %>%
+ group_by(some_grouping) %>%
+ summarize(
+ avg = mean(int * 5L)
+ ) %>%
+ implicit_schema(),
+ schema(some_grouping = float64(), avg = float64())
+ )
+})
+
+test_that("collapse", {
+ q <- tab %>%
+ filter(dbl > 2, chr == "d" | chr == "f") %>%
+ select(chr, int, lgl) %>%
+ mutate(twice = int * 2L)
+ expect_false(is_collapsed(q))
+ expect_true(is_collapsed(collapse(q)))
+ expect_false(is_collapsed(collapse(q)$.data))
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl > 2, chr == "d" | chr == "f") %>%
+ select(chr, int, lgl) %>%
+ mutate(twice = int * 2L) %>%
+ collapse() %>%
+ filter(int < 5) %>%
+ select(int, twice) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl > 2, chr == "d" | chr == "f") %>%
+ collapse() %>%
+ select(chr, int, lgl) %>%
+ collapse() %>%
+ filter(int < 5) %>%
+ select(int, chr) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl > 2, chr == "d" | chr == "f") %>%
+ collapse() %>%
+ group_by(chr) %>%
+ select(chr, int, lgl) %>%
+ collapse() %>%
+ filter(int < 5) %>%
+ select(int, chr) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Properties of collapsed query", {
+ q <- tab %>%
+ filter(dbl > 2) %>%
+ select(chr, int, lgl) %>%
+ mutate(twice = int * 2L) %>%
+ group_by(lgl) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ mutate(extra = total * 5)
+
+ # print(tbl %>%
+ # filter(dbl > 2) %>%
+ # select(chr, int, lgl) %>%
+ # mutate(twice = int * 2L) %>%
+ # group_by(lgl) %>%
+ # summarize(total = sum(int, na.rm = TRUE)) %>%
+ # mutate(extra = total * 5))
+
+ # # A tibble: 3 × 3
+ # lgl total extra
+ # <lgl> <int> <dbl>
+ # 1 FALSE 8 40
+ # 2 TRUE 8 40
+ # 3 NA 25 125
+
+  # dim() should not evaluate the query: nrow is NA until collected,
+  # while ncol is known from the implicit schema
+ expect_identical(dim(q), c(NA_integer_, 3L))
+
+ expect_output(
+ print(q),
+ "InMemoryDataset (query)
+lgl: bool
+total: int32
+extra: double (multiply_checked(total, 5))
+
+See $.data for the source Arrow object",
+ fixed = TRUE
+ )
+ expect_output(
+ print(q$.data),
+ "InMemoryDataset (query)
+int: int32
+lgl: bool
+
+* Aggregations:
+total: sum(int)
+* Filter: (dbl > 2)
+* Grouped by lgl
+See $.data for the source Arrow object",
+ fixed = TRUE
+ )
+
+ skip_if(getRversion() < "3.6.0", "TODO investigate why these aren't equal")
+ # On older R versions:
+ # ── Failure (test-dplyr-collapse.R:172:3): Properties of collapsed query ────────
+ # head(q, 1) %>% collect() not equal to tibble::tibble(lgl = FALSE, total = 8L, extra = 40).
+ # Component "total": Mean relative difference: 0.3846154
+ # Component "extra": Mean relative difference: 0.3846154
+ # ── Failure (test-dplyr-collapse.R:176:3): Properties of collapsed query ────────
+ # tail(q, 1) %>% collect() not equal to tibble::tibble(lgl = NA, total = 25L, extra = 125).
+ # Component "total": Mean relative difference: 0.9230769
+ # Component "extra": Mean relative difference: 0.9230769
+ expect_equal(
+ q %>% head(1) %>% collect(),
+ tibble::tibble(lgl = FALSE, total = 8L, extra = 40)
+ )
+ skip("TODO (ARROW-1XXXX): implement sorting option about where NAs go")
+ expect_equal(
+ q %>% tail(1) %>% collect(),
+ tibble::tibble(lgl = NA, total = 25L, extra = 125)
+ )
+})
+
+test_that("query_on_dataset handles collapse()", {
+ expect_false(query_on_dataset(
+ tab %>%
+ select(int, chr)
+ ))
+ expect_false(query_on_dataset(
+ tab %>%
+ select(int, chr) %>%
+ collapse() %>%
+ select(int)
+ ))
+
+ ds_dir <- tempfile()
+ dir.create(ds_dir)
+ on.exit(unlink(ds_dir))
+ write_parquet(tab, file.path(ds_dir, "file.parquet"))
+ ds <- open_dataset(ds_dir)
+
+ expect_true(query_on_dataset(
+ ds %>%
+ select(int, chr)
+ ))
+ expect_true(query_on_dataset(
+ ds %>%
+ select(int, chr) %>%
+ collapse() %>%
+ select(int)
+ ))
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-count.R b/src/arrow/r/tests/testthat/test-dplyr-count.R
new file mode 100644
index 000000000..8af9b57aa
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-count.R
@@ -0,0 +1,92 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+
+tbl <- example_data
+tbl$some_grouping <- rep(c(1, 2), 5)
+
+test_that("count/tally", {
+ compare_dplyr_binding(
+ .input %>%
+ count() %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ tally() %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("count/tally with wt and grouped data", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ count(wt = int) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ tally(wt = int) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("count/tally with sort", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ count(wt = int, sort = TRUE) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ tally(wt = int, sort = TRUE) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("count/tally with name arg", {
+ compare_dplyr_binding(
+ .input %>%
+ count(name = "new_col") %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ tally(name = "new_col") %>%
+ collect(),
+ tbl
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-distinct.R b/src/arrow/r/tests/testthat/test-dplyr-distinct.R
new file mode 100644
index 000000000..3a44c7372
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-distinct.R
@@ -0,0 +1,104 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+
+tbl <- example_data
+tbl$some_grouping <- rep(c(1, 2), 5)
+
+test_that("distinct()", {
+ compare_dplyr_binding(
+ .input %>%
+ distinct(some_grouping, lgl) %>%
+ collect() %>%
+ arrange(some_grouping, lgl),
+ tbl
+ )
+})
+
+test_that("distinct() works without any variables", {
+ compare_dplyr_binding(
+ .input %>%
+ distinct() %>%
+ arrange(int) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(x = int + 1) %>%
+ distinct() %>%
+ # Even though we have group_by(x), all cols (including int) are kept
+ arrange(int) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("distinct() can retain groups", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping, int) %>%
+ distinct(lgl) %>%
+ collect() %>%
+ arrange(lgl, int),
+ tbl
+ )
+
+ # With expressions here
+ compare_dplyr_binding(
+ .input %>%
+ group_by(y = some_grouping, int) %>%
+ distinct(x = lgl) %>%
+ collect() %>%
+ arrange(int),
+ tbl
+ )
+})
+
+test_that("distinct() can contain expressions", {
+ compare_dplyr_binding(
+ .input %>%
+ distinct(lgl, x = some_grouping + 1) %>%
+ collect() %>%
+ arrange(lgl, x),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(lgl, int) %>%
+ distinct(x = some_grouping + 1) %>%
+ collect() %>%
+ arrange(int),
+ tbl
+ )
+})
+
+test_that("distinct() can return all columns", {
+ skip("ARROW-13993 - need this to return correct rows from other cols")
+ compare_dplyr_binding(
+ .input %>%
+ distinct(lgl, .keep_all = TRUE) %>%
+ collect() %>%
+ arrange(int),
+ tbl
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-filter.R b/src/arrow/r/tests/testthat/test-dplyr-filter.R
new file mode 100644
index 000000000..72a64229c
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-filter.R
@@ -0,0 +1,412 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+library(stringr)
+
+tbl <- example_data
+# Add some better string data
+tbl$verses <- verses[[1]]
+# c(" a ", " b ", " c ", ...) increasing padding
+# nchar = 3 5 7 9 11 13 15 17 19 21
+tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both")
+tbl$some_negative <- tbl$int * (-1)^(1:nrow(tbl)) # nolint
+
+test_that("filter() on is.na()", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(is.na(lgl)) %>%
+ select(chr, int, lgl) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("filter() with NAs in selection", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(lgl) %>%
+ select(chr, int, lgl) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Filter returning an empty Table should not segfault (ARROW-8354)", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(false) %>%
+ select(chr, int, lgl) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("filtering with expression", {
+ char_sym <- "b"
+ compare_dplyr_binding(
+ .input %>%
+ filter(chr == char_sym) %>%
+ select(string = chr, int) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("filtering with arithmetic", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl + 1 > 3) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl / 2 > 3) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl / 2L > 3) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(int / 2 > 3) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(int / 2L > 3) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl %/% 2 > 3) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl^2 > 3) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("filtering with expression + autocasting", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl + 1 > 3L) %>% # test autocasting with comparison to 3L
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(int + 1 > 3) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(int^2 > 3) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("More complex select/filter", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl > 2, chr == "d" | chr == "f") %>%
+ select(chr, int, lgl) %>%
+ filter(int < 5) %>%
+ select(int, chr) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("filter() with %in%", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl > 2, chr %in% c("d", "f")) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Negative scalar values", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(some_negative > -2) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(some_negative %in% -1) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(int == -some_negative) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("filter() with between()", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(between(dbl, 1, 2)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(between(dbl, 0.5, 2)) %>%
+ collect(),
+ tbl
+ )
+
+ expect_identical(
+ tbl %>%
+ record_batch() %>%
+ filter(between(dbl, int, dbl2)) %>%
+ collect(),
+ tbl %>%
+ filter(dbl >= int, dbl <= dbl2)
+ )
+
+ expect_error(
+ tbl %>%
+ record_batch() %>%
+ filter(between(dbl, 1, "2")) %>%
+ collect()
+ )
+
+ expect_error(
+ tbl %>%
+ record_batch() %>%
+ filter(between(dbl, 1, NA)) %>%
+ collect()
+ )
+
+ expect_error(
+ tbl %>%
+ record_batch() %>%
+ filter(between(chr, 1, 2)) %>%
+ collect()
+ )
+})
+
+test_that("filter() with string ops", {
+ skip_if_not_available("utf8proc")
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl > 2, str_length(verses) > 25) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl > 2, str_length(str_trim(padded_strings, "left")) > 5) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("filter environment scope", {
+ # "object 'b_var' not found"
+ compare_dplyr_error(.input %>% filter(chr == b_var), tbl)
+
+ b_var <- "b"
+ compare_dplyr_binding(
+ .input %>%
+ filter(chr == b_var) %>%
+ collect(),
+ tbl
+ )
+ # Also for functions
+ # 'could not find function "isEqualTo"' because we haven't defined it yet
+ compare_dplyr_error(.input %>% filter(isEqualTo(int, 4)), tbl)
+
+ # This works but only because there are S3 methods for those operations
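+ # (on Arrow inputs, ==, &, and is.na() dispatch to Expression methods, so
+ # isEqualTo() transparently builds an Arrow expression)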
+ isEqualTo <- function(x, y) x == y & !is.na(x)
+ compare_dplyr_binding(
+ .input %>%
+ select(-fct) %>% # factor levels aren't identical
+ filter(isEqualTo(int, 4)) %>%
+ collect(),
+ tbl
+ )
+ # Try something that needs to call another nse_func
+ compare_dplyr_binding(
+ .input %>%
+ select(-fct) %>%
+ filter(nchar(padded_strings) < 10) %>%
+ collect(),
+ tbl
+ )
+ isShortString <- function(x) nchar(x) < 10
+ skip("TODO: 14071")
+ compare_dplyr_binding(
+ .input %>%
+ select(-fct) %>%
+ filter(isShortString(padded_strings)) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Filtering on a column that doesn't exist errors correctly", {
+ with_language("fr", {
+ # expect_warning(., NA) because the usual behavior when it hits a filter
+ # that it can't evaluate is to raise a warning, collect() to R, and retry
+ # the filter. But we want this to error the first time because it's
+ # a user error, not solvable by retrying in R
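+ # (in testthat, expect_warning(expr, NA) asserts that no warning is emitted)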
+ expect_warning(
+ expect_error(
+ tbl %>% record_batch() %>% filter(not_a_col == 42) %>% collect(),
+ "objet 'not_a_col' introuvable"
+ ),
+ NA
+ )
+ })
+ with_language("en", {
+ expect_warning(
+ expect_error(
+ tbl %>% record_batch() %>% filter(not_a_col == 42) %>% collect(),
+ "object 'not_a_col' not found"
+ ),
+ NA
+ )
+ })
+})
+
+test_that("Filtering with unsupported functions", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(int > 2, pnorm(dbl) > .99) %>%
+ collect(),
+ tbl,
+ warning = "Expression pnorm\\(dbl\\) > 0.99 not supported in Arrow; pulling data into R"
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(
+ nchar(chr, type = "bytes", allowNA = TRUE) == 1, # unsupported; Arrow emits a specific message
+ int > 2, # supported
+ pnorm(dbl) > .99 # unsupported; only the generic message
+ ) %>%
+ collect(),
+ tbl,
+ warning = '\\* In nchar\\(chr, type = "bytes", allowNA = TRUE\\) == 1, allowNA = TRUE not supported by Arrow
+\\* Expression pnorm\\(dbl\\) > 0.99 not supported in Arrow
+pulling data into R'
+ )
+})
+
+test_that("Calling Arrow compute functions 'directly'", {
+ expect_equal(
+ tbl %>%
+ record_batch() %>%
+ filter(arrow_add(dbl, 1) > 3L) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl %>%
+ filter(dbl + 1 > 3L) %>%
+ select(string = chr, int, dbl)
+ )
+
+ expect_equal(
+ tbl %>%
+ record_batch() %>%
+ filter(arrow_greater(arrow_add(dbl, 1), 3L)) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl %>%
+ filter(dbl + 1 > 3L) %>%
+ select(string = chr, int, dbl)
+ )
+})
+
+test_that("filter() with .data pronoun", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(.data$dbl > 4) %>%
+ select(.data$chr, .data$int, .data$lgl) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(is.na(.data$lgl)) %>%
+ select(.data$chr, .data$int, .data$lgl) %>%
+ collect(),
+ tbl
+ )
+
+ # and the .env pronoun too!
+ chr <- 4
+ compare_dplyr_binding(
+ .input %>%
+ filter(.data$dbl > .env$chr) %>%
+ select(.data$chr, .data$int, .data$lgl) %>%
+ collect(),
+ tbl
+ )
+
+ skip("test now faulty - code no longer gives error & outputs a empty tibble")
+ # but there is an error if we don't override the masking with `.env`
+ compare_dplyr_error(
+ .input %>%
+ filter(.data$dbl > chr) %>%
+ select(.data$chr, .data$int, .data$lgl) %>%
+ collect(),
+ tbl
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-funcs-conditional.R b/src/arrow/r/tests/testthat/test-dplyr-funcs-conditional.R
new file mode 100644
index 000000000..4f2700795
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-funcs-conditional.R
@@ -0,0 +1,409 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+suppressPackageStartupMessages(library(bit64))
+
+tbl <- example_data
+tbl$verses <- verses[[1]]
+tbl$another_chr <- tail(letters, 10)
+
+test_that("if_else and ifelse", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(int > 5, 1, 0)
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(int > 5, int, 0L)
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ expect_error(
+ Table$create(tbl) %>%
+ mutate(
+ y = if_else(int > 5, 1, FALSE)
+ ) %>%
+ collect(),
+ "NotImplemented: Function if_else has no kernel matching input types"
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(int > 5, 1, NA_real_)
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = ifelse(int > 5, 1, 0)
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(dbl > 5, TRUE, FALSE)
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(chr %in% letters[1:3], 1L, 3L)
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(int > 5, "one", "zero")
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(int > 5, chr, another_chr)
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(int > 5, "true", chr, missing = "MISSING")
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ # TODO: remove the mutate + warning after ARROW-13358 is merged and Arrow
+ # supports factors in if(_)else
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(int > 5, fct, factor("a"))
+ ) %>%
+ collect() %>%
+ # This is a no-op on the Arrow side, but necessary to make the results equal
+ mutate(y = as.character(y)),
+ tbl,
+ warning = "Dictionaries .* are currently converted to strings .* in if_else and ifelse"
+ )
+
+ # detecting NA and NaN works just fine
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(is.na(dbl), chr, "false", missing = "MISSING")
+ ) %>%
+ collect(),
+ example_data_for_sorting
+ )
+
+ # However, currently comparisons with NaNs return false and not NaNs or NAs
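+ # A minimal base-R illustration (not Arrow-specific): comparisons with NaN
+ # evaluate to NA in R, which is what routes those rows to `missing`
+ expect_true(is.na(NaN > 5))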
+ skip("ARROW-13364")
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(dbl > 5, chr, another_chr, missing = "MISSING")
+ ) %>%
+ collect(),
+ example_data_for_sorting
+ )
+
+ skip("TODO: could? should? we support the autocasting in ifelse")
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = ifelse(int > 5, 1, FALSE)) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("case_when()", {
+ compare_dplyr_binding(
+ .input %>%
+ transmute(cw = case_when(lgl ~ dbl, !false ~ dbl + dbl2)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(cw = case_when(int > 5 ~ 1, TRUE ~ 0)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(cw = case_when(chr %in% letters[1:3] ~ 1L) + 41L) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(case_when(
+ dbl + int - 1.1 == dbl2 ~ TRUE,
+ NA ~ NA,
+ TRUE ~ FALSE
+ ) & !is.na(dbl2)) %>%
+ collect(),
+ tbl
+ )
+
+ # dplyr::case_when() errors if values on right side of formulas do not have
+ # exactly the same type, but the Arrow case_when kernel allows compatible types
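+ # (here the int32, float64, and int64 branches are promoted to float64)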
+ expect_equal(
+ tbl %>%
+ mutate(i64 = as.integer64(1e10)) %>%
+ Table$create() %>%
+ transmute(cw = case_when(
+ is.na(fct) ~ int,
+ is.na(chr) ~ dbl,
+ TRUE ~ i64
+ )) %>%
+ collect(),
+ tbl %>%
+ transmute(
+ cw = ifelse(is.na(fct), int, ifelse(is.na(chr), dbl, 1e10))
+ )
+ )
+
+ # expected errors (which are caught by abandon_ship() and changed to warnings)
+ # TODO: Find a way to test these directly without abandon_ship() interfering
+ expect_error(
+ # no cases
+ expect_warning(
+ tbl %>%
+ Table$create() %>%
+ transmute(cw = case_when()),
+ "case_when"
+ )
+ )
+ expect_error(
+ # argument not a formula
+ expect_warning(
+ tbl %>%
+ Table$create() %>%
+ transmute(cw = case_when(TRUE ~ FALSE, TRUE)),
+ "case_when"
+ )
+ )
+ expect_error(
+ # non-logical R scalar on left side of formula
+ expect_warning(
+ tbl %>%
+ Table$create() %>%
+ transmute(cw = case_when(0L ~ FALSE, TRUE ~ FALSE)),
+ "case_when"
+ )
+ )
+ expect_error(
+ # non-logical Arrow column reference on left side of formula
+ expect_warning(
+ tbl %>%
+ Table$create() %>%
+ transmute(cw = case_when(int ~ FALSE)),
+ "case_when"
+ )
+ )
+ expect_error(
+ # non-logical Arrow expression on left side of formula
+ expect_warning(
+ tbl %>%
+ Table$create() %>%
+ transmute(cw = case_when(dbl + 3.14159 ~ TRUE)),
+ "case_when"
+ )
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ transmute(cw = case_when(lgl ~ "abc")) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(cw = case_when(lgl ~ verses, !false ~ paste(chr, chr))) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ cw = case_when(!(!(!(lgl))) ~ factor(chr), TRUE ~ fct)
+ ) %>%
+ collect(),
+ tbl,
+ warning = TRUE
+ )
+})
+
+test_that("coalesce()", {
+ # character
+ df <- tibble(
+ w = c(NA_character_, NA_character_, NA_character_),
+ x = c(NA_character_, NA_character_, "c"),
+ y = c(NA_character_, "b", "c"),
+ z = c("a", "b", "c")
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ cw = coalesce(w),
+ cz = coalesce(z),
+ cwx = coalesce(w, x),
+ cwxy = coalesce(w, x, y),
+ cwxyz = coalesce(w, x, y, z)
+ ) %>%
+ collect(),
+ df
+ )
+
+ # integer
+ df <- tibble(
+ w = c(NA_integer_, NA_integer_, NA_integer_),
+ x = c(NA_integer_, NA_integer_, 3L),
+ y = c(NA_integer_, 2L, 3L),
+ z = 1:3
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ cw = coalesce(w),
+ cz = coalesce(z),
+ cwx = coalesce(w, x),
+ cwxy = coalesce(w, x, y),
+ cwxyz = coalesce(w, x, y, z)
+ ) %>%
+ collect(),
+ df
+ )
+
+ # double with NaNs
+ df <- tibble(
+ w = c(NA_real_, NaN, NA_real_),
+ x = c(NA_real_, NaN, 3.3),
+ y = c(NA_real_, 2.2, 3.3),
+ z = c(1.1, 2.2, 3.3)
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ cw = coalesce(w),
+ cz = coalesce(z),
+ cwx = coalesce(w, x),
+ cwxy = coalesce(w, x, y),
+ cwxyz = coalesce(w, x, y, z)
+ ) %>%
+ collect(),
+ df
+ )
+ # NaNs stay NaN and are not converted to NA in the results
+ # (testing this requires expect_identical())
+ expect_identical(
+ df %>% Table$create() %>% mutate(cwx = coalesce(w, x)) %>% collect(),
+ df %>% mutate(cwx = coalesce(w, x))
+ )
+ expect_identical(
+ df %>% Table$create() %>% transmute(cw = coalesce(w)) %>% collect(),
+ df %>% transmute(cw = coalesce(w))
+ )
+ expect_identical(
+ df %>% Table$create() %>% transmute(cn = coalesce(NaN)) %>% collect(),
+ df %>% transmute(cn = coalesce(NaN))
+ )
+ # single-precision (float32) inputs stay float32 in the result
+ expect_equal(
+ (df %>%
+ Table$create(schema = schema(
+ w = float32(),
+ x = float32(),
+ y = float32(),
+ z = float32()
+ )) %>%
+ transmute(c = coalesce(w, x, y, z)) %>%
+ compute()
+ )$schema[[1]]$type,
+ float32()
+ )
+ # with R literal values
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ c1 = coalesce(4.4),
+ c2 = coalesce(NA_real_),
+ c3 = coalesce(NaN),
+ c4 = coalesce(w, x, y, 5.5),
+ c5 = coalesce(w, x, y, NA_real_),
+ c6 = coalesce(w, x, y, NaN)
+ ) %>%
+ collect(),
+ df
+ )
+
+ # factors
+ # TODO: remove the mutate + warning after ARROW-14167 is merged and Arrow
+ # supports factors in coalesce
+ df <- tibble(
+ x = factor("a", levels = c("a", "z")),
+ y = factor("b", levels = c("a", "b", "c"))
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(c = coalesce(x, y)) %>%
+ collect() %>%
+ # This is a no-op on the Arrow side, but necessary to make the results equal
+ mutate(c = as.character(c)),
+ df,
+ warning = "Dictionaries .* are currently converted to strings .* in coalesce"
+ )
+
+ # no arguments
+ expect_error(
+ nse_funcs$coalesce(),
+ "At least one argument must be supplied to coalesce()",
+ fixed = TRUE
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-funcs-datetime.R b/src/arrow/r/tests/testthat/test-dplyr-funcs-datetime.R
new file mode 100644
index 000000000..5cb515e69
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-funcs-datetime.R
@@ -0,0 +1,304 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(lubridate, warn.conflicts = FALSE)
+library(dplyr, warn.conflicts = FALSE)
+
+# base::strptime() defaults to local timezone
+# but arrow's strptime defaults to UTC.
+# So that tests are consistent, set the local timezone to UTC
+# TODO: consider reevaluating this workaround after ARROW-12980
+withr::local_timezone("UTC")
+
+# TODO: We should test on windows once ARROW-13168 is resolved.
+if (tolower(Sys.info()[["sysname"]]) == "windows") {
+ test_date <- as.POSIXct("2017-01-01 00:00:11.3456789", tz = "")
+} else {
+ test_date <- as.POSIXct("2017-01-01 00:00:11.3456789", tz = "Pacific/Marquesas")
+}
+
+test_df <- tibble::tibble(
+ # test_date + 1 turns tzone = "" into NULL, which is functionally equivalent,
+ # so we can run some tests on Windows, but this skirts around
+ # https://issues.apache.org/jira/browse/ARROW-13588
+ # That issue is tough because in C++, "" is the "no timezone" value
+ # due to static typing, so we can't distinguish a literal "" from NULL
+ datetime = c(test_date, NA) + 1,
+ date = c(as.Date("2021-09-09"), NA)
+)
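+# (Concretely: attr(test_date, "tzone") is "", while attr(test_date + 1, "tzone")
+# is NULL, because the arithmetic drops the attribute.)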
+
+# These tests cover component extraction from timestamp objects
+
+test_that("extract year from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = year(datetime)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract isoyear from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = isoyear(datetime)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract quarter from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = quarter(datetime)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract month from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = month(datetime)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract isoweek from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = isoweek(datetime)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract epiweek from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = epiweek(datetime)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract day from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = day(datetime)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract wday from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = wday(datetime)) %>%
+ collect(),
+ test_df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = wday(datetime, week_start = 3)) %>%
+ collect(),
+ test_df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = wday(datetime, week_start = 1)) %>%
+ collect(),
+ test_df
+ )
+
+ skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = wday(datetime, label = TRUE)) %>%
+ mutate(x = as.character(x)) %>%
+ collect(),
+ test_df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = wday(datetime, label = TRUE, abbr = TRUE)) %>%
+ mutate(x = as.character(x)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract yday from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = yday(datetime)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract hour from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = hour(datetime)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract minute from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = minute(datetime)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract second from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = second(datetime)) %>%
+ collect(),
+ test_df,
+ # arrow supports nanosecond resolution but lubridate does not
+ tolerance = 1e-6
+ )
+})
+
+# These tests cover extraction of components from date32 objects
+
+test_that("extract year from date", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = year(date)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract isoyear from date", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = isoyear(date)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract quarter from date", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = quarter(date)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract month from date", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = month(date)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract isoweek from date", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = isoweek(date)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract epiweek from date", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = epiweek(date)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract day from date", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = day(date)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract wday from date", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = wday(date)) %>%
+ collect(),
+ test_df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = wday(date, week_start = 3)) %>%
+ collect(),
+ test_df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = wday(date, week_start = 1)) %>%
+ collect(),
+ test_df
+ )
+
+ skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = wday(date, label = TRUE, abbr = TRUE)) %>%
+ mutate(x = as.character(x)) %>%
+ collect(),
+ test_df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = wday(date, label = TRUE)) %>%
+ mutate(x = as.character(x)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract yday from date", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = yday(date)) %>%
+ collect(),
+ test_df
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-funcs-math.R b/src/arrow/r/tests/testthat/test-dplyr-funcs-math.R
new file mode 100644
index 000000000..b66630675
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-funcs-math.R
@@ -0,0 +1,309 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+
+test_that("abs()", {
+ df <- tibble(x = c(-127, -10, -1, -0, 0, 1, 10, 127, NA))
+
+ compare_dplyr_binding(
+ .input %>%
+ transmute(abs = abs(x)) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("sign()", {
+ df <- tibble(x = c(-127, -10, -1, -0, 0, 1, 10, 127, NA))
+
+ compare_dplyr_binding(
+ .input %>%
+ transmute(sign = sign(x)) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("ceiling(), floor(), trunc(), round()", {
+ df <- tibble(x = c(-1, -0.55, -0.5, -0.1, 0, 0.1, 0.5, 0.55, 1, NA, NaN))
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ c = ceiling(x),
+ f = floor(x),
+ t = trunc(x),
+ r = round(x)
+ ) %>%
+ collect(),
+ df
+ )
+
+ # with digits set to 1
+ compare_dplyr_binding(
+ .input %>%
+ filter(x %% 0.5 == 0) %>% # filter out indeterminate cases (see below)
+ mutate(r = round(x, 1)) %>%
+ collect(),
+ df
+ )
+
+ # with digits set to -1
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ rd = round(floor(x * 111), -1), # double
+ y = ifelse(is.nan(x), NA_integer_, x),
+ ri = round(as.integer(y * 111), -1) # integer (with the NaN removed)
+ ) %>%
+ collect(),
+ df
+ )
+
+ # round(x, -2) is equivalent to round_to_multiple(x, 100)
+ expect_equal(
+ Table$create(x = 1111.1) %>%
+ mutate(r = round(x, -2)) %>%
+ collect(),
+ Table$create(x = 1111.1) %>%
+ mutate(r = arrow_round_to_multiple(x, options = list(multiple = 100))) %>%
+ collect()
+ )
+
+ # For consistency with base R, the binding for round() uses the Arrow
+ # library's HALF_TO_EVEN round mode, but the expectations *above* would pass
+ # even if another round mode were used. The expectations *below* should fail
+ # with other round modes. However, some decimal numbers cannot be represented
+ # exactly as floating point numbers, and for the ones that also end in 5 (such
+ # as 0.55), R's rounding behavior is indeterminate: it will vary depending on
+ # the OS. In practice, this seems to affect Windows, so we skip these tests
+ # on Windows and on CRAN.
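+
+ # A minimal base-R illustration of half-to-even (these .5 ties are exactly
+ # representable, so the result is portable): ties go to the nearest even
+ expect_equal(round(c(0.5, 1.5, 2.5)), c(0, 2, 2))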
+
+ skip_on_cran()
+ skip_on_os("windows")
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(r = round(x, 1)) %>%
+ collect(),
+ df
+ )
+
+ # Verify that round mode HALF_TO_EVEN, which is what the round() binding uses,
+ # yields results consistent with R...
+ expect_equal(
+ as.vector(
+ call_function(
+ "round",
+ Array$create(df$x),
+ options = list(ndigits = 1L, round_mode = RoundMode$HALF_TO_EVEN)
+ )
+ ),
+ round(df$x, 1)
+ )
+ # ...but that the round mode HALF_TOWARDS_ZERO does not. If the expectation
+ # below fails, it means that the expectation above is not effectively testing
+ # that Arrow is using the HALF_TO_EVEN mode.
+ expect_false(
+ isTRUE(all.equal(
+ as.vector(
+ call_function(
+ "round",
+ Array$create(df$x),
+ options = list(ndigits = 1L, round_mode = RoundMode$HALF_TOWARDS_ZERO)
+ )
+ ),
+ round(df$x, 1)
+ ))
+ )
+})
+
+test_that("log functions", {
+ df <- tibble(x = c(1:10, NA, NA))
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = log(x)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = log(x, base = exp(1))) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = log(x, base = 2)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = log(x, base = 10)) %>%
+ collect(),
+ df
+ )
+
+ # test log() with a length-1 base
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = log(x, base = 5)) %>%
+ collect(),
+ df
+ )
+
+ # log() with a base of length != 1 is an error
+ expect_error(
+ nse_funcs$log(10, base = 5:6),
+ "base must be a column or a length-1 numeric; other values not supported by Arrow",
+ fixed = TRUE
+ )
+
+ # log() with an x of length != 1 is an error
+ expect_error(
+ nse_funcs$log(10:11),
+ "x must be a column or a length-1 numeric; other values not supported by Arrow",
+ fixed = TRUE
+ )
+
+ # test log() with an Expression as the base
+ compare_dplyr_binding(
+ .input %>%
+ # test cases where base = 1 below
+ filter(x != 1) %>%
+ mutate(
+ y = log(x, base = x),
+ z = log(2, base = x)
+ ) %>%
+ collect(),
+ df
+ )
+
+ # log(1, base = 1) is NaN in both R and Arrow
+ # suppress the R warning because R warns but Arrow does not
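+ # (log(x, base) is log(x) / log(base): log(1)/log(1) = 0/0 = NaN here, and
+ # in the next case log(10)/log(1) = log(10)/0 = Inf)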
+ suppressWarnings(
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = log(x, base = y)) %>%
+ collect(),
+ tibble(x = 1, y = 1)
+ )
+ )
+
+ # log(n != 1, base = 1) is Inf in R and Arrow
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = log(x, base = y)) %>%
+ collect(),
+ tibble(x = 10, y = 1)
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = logb(x)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = log1p(x)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = log2(x)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = log10(x)) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("trig functions", {
+ df <- tibble(x = c(seq(from = 0, to = 1, by = 0.1), NA))
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = sin(x)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = cos(x)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = tan(x)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = asin(x)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = acos(x)) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("arith functions ", {
+ df <- tibble(x = c(1:5, NA))
+
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ int_div = x %/% 2,
+ addition = x + 1,
+ multiplication = x * 3,
+ subtraction = x - 5,
+ division = x / 2,
+ power = x^3,
+ modulo = x %% 3
+ ) %>%
+ collect(),
+ df
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-funcs-string.R b/src/arrow/r/tests/testthat/test-dplyr-funcs-string.R
new file mode 100644
index 000000000..5e092f4e3
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-funcs-string.R
@@ -0,0 +1,1399 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+skip_if_not_available("utf8proc")
+
+library(dplyr, warn.conflicts = FALSE)
+library(lubridate)
+library(stringr)
+library(stringi)
+
+test_that("paste, paste0, and str_c", {
+ df <- tibble(
+ v = c("A", "B", "C"),
+ w = c("a", "b", "c"),
+ x = c("d", NA_character_, "f"),
+ y = c(NA_character_, "h", "i"),
+ z = c(1.1, 2.2, NA)
+ )
+ x <- Expression$field_ref("x")
+ y <- Expression$field_ref("y")
+
+ # no NAs in data
+ compare_dplyr_binding(
+ .input %>%
+ transmute(paste(v, w)) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(paste(v, w, sep = "-")) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(paste0(v, w)) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(str_c(v, w)) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(str_c(v, w, sep = "+")) %>%
+ collect(),
+ df
+ )
+
+ # NAs in data
+ compare_dplyr_binding(
+ .input %>%
+ transmute(paste(x, y)) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(paste(x, y, sep = "-")) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(str_c(x, y)) %>%
+ collect(),
+ df
+ )
+
+ # non-character column in dots
+ compare_dplyr_binding(
+ .input %>%
+ transmute(paste0(x, y, z)) %>%
+ collect(),
+ df
+ )
+
+ # literal string in dots
+ compare_dplyr_binding(
+ .input %>%
+ transmute(paste(x, "foo", y)) %>%
+ collect(),
+ df
+ )
+
+ # literal NA in dots
+ compare_dplyr_binding(
+ .input %>%
+ transmute(paste(x, NA, y)) %>%
+ collect(),
+ df
+ )
+
+ # expressions in dots
+ compare_dplyr_binding(
+ .input %>%
+ transmute(paste0(x, toupper(y), as.character(z))) %>%
+ collect(),
+ df
+ )
+
+ # sep is literal NA
+ # errors in paste() (consistent with base::paste())
+ expect_error(
+ nse_funcs$paste(x, y, sep = NA_character_),
+ "Invalid separator"
+ )
+ # emits null in str_c() (consistent with stringr::str_c())
+ compare_dplyr_binding(
+ .input %>%
+ transmute(str_c(x, y, sep = NA_character_)) %>%
+ collect(),
+ df
+ )
+
+ # sep passed in dots to paste0 (which doesn't take a sep argument)
+ compare_dplyr_binding(
+ .input %>%
+ transmute(paste0(x, y, sep = "-")) %>%
+ collect(),
+ df
+ )
+
+ # known differences
+
+ # arrow allows the separator to be an array
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ transmute(result = paste(x, y, sep = w)) %>%
+ collect(),
+ df %>%
+ transmute(result = paste(x, w, y, sep = ""))
+ )
+
+ # expected errors
+
+ # collapse argument not supported
+ expect_error(
+ nse_funcs$paste(x, y, collapse = ""),
+ "collapse"
+ )
+ expect_error(
+ nse_funcs$paste0(x, y, collapse = ""),
+ "collapse"
+ )
+ expect_error(
+ nse_funcs$str_c(x, y, collapse = ""),
+ "collapse"
+ )
+
+ # literal vectors of length != 1 not supported
+ expect_error(
+ nse_funcs$paste(x, character(0), y),
+ "Literal vectors of length != 1 not supported in string concatenation"
+ )
+ expect_error(
+ nse_funcs$paste(x, c(",", ";"), y),
+ "Literal vectors of length != 1 not supported in string concatenation"
+ )
+})
+
+test_that("grepl with ignore.case = FALSE and fixed = TRUE", {
+ df <- tibble(x = c("Foo", "bar"))
+ compare_dplyr_binding(
+ .input %>%
+ filter(grepl("o", x, fixed = TRUE)) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("sub and gsub with ignore.case = FALSE and fixed = TRUE", {
+ df <- tibble(x = c("Foo", "bar"))
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = sub("Foo", "baz", x, fixed = TRUE)) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = gsub("o", "u", x, fixed = TRUE)) %>%
+ collect(),
+ df
+ )
+})
+
+# many of the remainder of these tests require RE2
+skip_if_not_available("re2")
+
+test_that("grepl", {
+ df <- tibble(x = c("Foo", "bar"))
+
+ for (fixed in c(TRUE, FALSE)) {
+ compare_dplyr_binding(
+ .input %>%
+ filter(grepl("Foo", x, fixed = fixed)) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = grepl("^B.+", x, ignore.case = FALSE, fixed = fixed)) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(grepl("Foo", x, ignore.case = FALSE, fixed = fixed)) %>%
+ collect(),
+ df
+ )
+ }
+})
+
+test_that("grepl with ignore.case = TRUE and fixed = TRUE", {
+ df <- tibble(x = c("Foo", "bar"))
+
+ # base::grepl() ignores ignore.case = TRUE with a warning when fixed = TRUE,
+ # so we can't use compare_dplyr_binding() for these tests
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ filter(grepl("O", x, ignore.case = TRUE, fixed = TRUE)) %>%
+ collect(),
+ tibble(x = "Foo")
+ )
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ filter(grepl("^B.+", x, ignore.case = TRUE, fixed = TRUE)) %>%
+ collect(),
+ tibble(x = character(0))
+ )
+})
+
+test_that("str_detect", {
+ df <- tibble(x = c("Foo", "bar"))
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_detect(x, regex("^F"))) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_detect(x, regex("^f[A-Z]{2}", ignore_case = TRUE))) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_detect(x, regex("^f[A-Z]{2}", ignore_case = TRUE), negate = TRUE)) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_detect(x, fixed("o"))) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_detect(x, fixed("O"))) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_detect(x, fixed("O", ignore_case = TRUE))) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_detect(x, fixed("O", ignore_case = TRUE), negate = TRUE)) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("sub and gsub", {
+ df <- tibble(x = c("Foo", "bar"))
+
+ for (fixed in c(TRUE, FALSE)) {
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = sub("Foo", "baz", x, fixed = fixed)) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = sub("^B.+", "baz", x, ignore.case = FALSE, fixed = fixed)) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = sub("Foo", "baz", x, ignore.case = FALSE, fixed = fixed)) %>%
+ collect(),
+ df
+ )
+ }
+})
+
+test_that("sub and gsub with ignore.case = TRUE and fixed = TRUE", {
+ df <- tibble(x = c("Foo", "bar"))
+
+ # base::sub() and base::gsub() ignore ignore.case = TRUE with a warning when
+ # fixed = TRUE, so we can't use compare_dplyr_binding() for these tests
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ transmute(x = sub("O", "u", x, ignore.case = TRUE, fixed = TRUE)) %>%
+ collect(),
+ tibble(x = c("Fuo", "bar"))
+ )
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ transmute(x = gsub("o", "u", x, ignore.case = TRUE, fixed = TRUE)) %>%
+ collect(),
+ tibble(x = c("Fuu", "bar"))
+ )
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ transmute(x = sub("^B.+", "baz", x, ignore.case = TRUE, fixed = TRUE)) %>%
+ collect(),
+ df # unchanged
+ )
+})
+
+test_that("str_replace and str_replace_all", {
+ df <- tibble(x = c("Foo", "bar"))
+
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_replace_all(x, "^F", "baz")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_replace_all(x, regex("^F"), "baz")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_replace(x, "^F[a-z]{2}", "baz")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_replace(x, regex("^f[A-Z]{2}", ignore_case = TRUE), "baz")) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_replace_all(x, fixed("o"), "u")) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_replace(x, fixed("O"), "u")) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_replace(x, fixed("O", ignore_case = TRUE), "u")) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("strsplit and str_split", {
+ df <- tibble(x = c("Foo and bar", "baz and qux and quux"))
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = strsplit(x, "and")) %>%
+ collect(),
+ df,
+ # `ignore_attr = TRUE` because the vctr coming back from arrow (ListArray)
+ # has type information in it, but it's just a bare list from R/dplyr.
+ ignore_attr = TRUE
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>%
+ collect(),
+ df,
+ ignore_attr = TRUE
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = strsplit(x, " +and +")) %>%
+ collect(),
+ df,
+ ignore_attr = TRUE
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_split(x, "and")) %>%
+ collect(),
+ df,
+ ignore_attr = TRUE
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_split(x, "and", n = 2)) %>%
+ collect(),
+ df,
+ ignore_attr = TRUE
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_split(x, fixed("and"), n = 2)) %>%
+ collect(),
+ df,
+ ignore_attr = TRUE
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_split(x, regex("and"), n = 2)) %>%
+ collect(),
+ df,
+ ignore_attr = TRUE
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_split(x, "Foo|bar", n = 2)) %>%
+ collect(),
+ df,
+ ignore_attr = TRUE
+ )
+})
+
+test_that("str_to_lower, str_to_upper, and str_to_title", {
+ df <- tibble(x = c("foo1", " \tB a R\n", "!apACHe aRroW!"))
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ x_lower = str_to_lower(x),
+ x_upper = str_to_upper(x),
+ x_title = str_to_title(x)
+ ) %>%
+ collect(),
+ df
+ )
+
+ # Error checking a single function because they all use the same code path.
+ expect_error(
+ nse_funcs$str_to_lower("Apache Arrow", locale = "sp"),
+ "Providing a value for 'locale' other than the default ('en') is not supported by Arrow",
+ fixed = TRUE
+ )
+})
+
+test_that("arrow_*_split_whitespace functions", {
+ # use only ASCII whitespace characters
+ df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux"))
+
+ # use only non-ASCII whitespace characters
+ df_utf8 <- tibble(x = c("Foo\u00A0and\u2000bar", "baz\u2006and\u1680qux\u3000and\u2008quux"))
+
+ df_split <- tibble(x = list(c("Foo", "and", "bar"), c("baz", "and", "qux", "and", "quux")))
+
+ # use default option values
+ expect_equal(
+ df_ascii %>%
+ Table$create() %>%
+ mutate(x = arrow_ascii_split_whitespace(x)) %>%
+ collect(),
+ df_split,
+ ignore_attr = TRUE
+ )
+ expect_equal(
+ df_utf8 %>%
+ Table$create() %>%
+ mutate(x = arrow_utf8_split_whitespace(x)) %>%
+ collect(),
+ df_split,
+ ignore_attr = TRUE
+ )
+
+ # specify non-default option values
+ expect_equal(
+ df_ascii %>%
+ Table$create() %>%
+ mutate(
+ x = arrow_ascii_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE))
+ ) %>%
+ collect(),
+ tibble(x = list(c("Foo\nand", "bar"), c("baz\tand qux and", "quux"))),
+ ignore_attr = TRUE
+ )
+ expect_equal(
+ df_utf8 %>%
+ Table$create() %>%
+ mutate(
+ x = arrow_utf8_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE))
+ ) %>%
+ collect(),
+ tibble(x = list(c("Foo\u00A0and", "bar"), c("baz\u2006and\u1680qux\u3000and", "quux"))),
+ ignore_attr = TRUE
+ )
+})
+
+test_that("errors and warnings in string splitting", {
+ # These conditions generate an error, but abandon_ship() catches the error,
+ # issues a warning, and pulls the data into R (if computing on InMemoryDataset)
+ # Elsewhere we test that abandon_ship() works,
+ # so here we can just call the functions directly
+
+ x <- Expression$field_ref("x")
+ expect_error(
+ nse_funcs$str_split(x, fixed("and", ignore_case = TRUE)),
+ "Case-insensitive string splitting not supported by Arrow"
+ )
+ expect_error(
+ nse_funcs$str_split(x, coll("and.?")),
+ "Pattern modifier `coll()` not supported by Arrow",
+ fixed = TRUE
+ )
+ expect_error(
+ nse_funcs$str_split(x, boundary(type = "word")),
+ "Pattern modifier `boundary()` not supported by Arrow",
+ fixed = TRUE
+ )
+ expect_error(
+ nse_funcs$str_split(x, "and", n = 0),
+ "Splitting strings into zero parts not supported by Arrow"
+ )
+
+ # This condition generates a warning
+ expect_warning(
+ nse_funcs$str_split(x, fixed("and"), simplify = TRUE),
+ "Argument 'simplify = TRUE' will be ignored"
+ )
+})
+
+test_that("errors and warnings in string detection and replacement", {
+ x <- Expression$field_ref("x")
+
+ expect_error(
+ nse_funcs$str_detect(x, boundary(type = "character")),
+ "Pattern modifier `boundary()` not supported by Arrow",
+ fixed = TRUE
+ )
+ expect_error(
+ nse_funcs$str_replace_all(x, coll("o", locale = "en"), "ó"),
+ "Pattern modifier `coll()` not supported by Arrow",
+ fixed = TRUE
+ )
+
+ # This condition generates a warning
+ expect_warning(
+ nse_funcs$str_replace_all(x, regex("o", multiline = TRUE), "u"),
+ "Ignoring pattern modifier argument not supported in Arrow: \"multiline\""
+ )
+})
+
+test_that("backreferences in pattern in string detection", {
+ skip("RE2 does not support backreferences in pattern (https://github.com/google/re2/issues/101)")
+ df <- tibble(x = c("Foo", "bar"))
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_detect(x, regex("F([aeiou])\\1"))) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("backreferences (substitutions) in string replacement", {
+ df <- tibble(x = c("Foo", "bar"))
+
+ compare_dplyr_binding(
+ .input %>%
+ transmute(desc = sub(
+ "(?:https?|ftp)://([^/\r\n]+)(/[^\r\n]*)?",
+ "path `\\2` on server `\\1`",
+ url
+ )) %>%
+ collect(),
+ tibble(url = "https://arrow.apache.org/docs/r/")
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_replace(x, "^(\\w)o(.*)", "\\1\\2p")) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_replace(x, regex("^(\\w)o(.*)", ignore_case = TRUE), "\\1\\2p")) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_replace(x, regex("^(\\w)o(.*)", ignore_case = TRUE), "\\1\\2p")) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("edge cases in string detection and replacement", {
+ # in case-insensitive fixed match/replace, test that "\\E" in the search
+ # string and backslashes in the replacement string are interpreted literally.
+ # this test does not use compare_dplyr_binding() because base::sub() and
+ # base::grepl() do not support ignore.case = TRUE when fixed = TRUE.
+ expect_equal(
+ tibble(x = c("\\Q\\e\\D")) %>%
+ Table$create() %>%
+ filter(grepl("\\E", x, ignore.case = TRUE, fixed = TRUE)) %>%
+ collect(),
+ tibble(x = c("\\Q\\e\\D"))
+ )
+ expect_equal(
+ tibble(x = c("\\Q\\e\\D")) %>%
+ Table$create() %>%
+ transmute(x = sub("\\E", "\\L", x, ignore.case = TRUE, fixed = TRUE)) %>%
+ collect(),
+ tibble(x = c("\\Q\\L\\D"))
+ )
+
+ # test that a user's "(?i)" prefix does not break the "(?i)" prefix that's
+ # added in case-insensitive regex match/replace
+ compare_dplyr_binding(
+ .input %>%
+ filter(grepl("(?i)^[abc]{3}$", x, ignore.case = TRUE, fixed = FALSE)) %>%
+ collect(),
+ tibble(x = c("ABC"))
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = sub("(?i)^[abc]{3}$", "123", x, ignore.case = TRUE, fixed = FALSE)) %>%
+ collect(),
+ tibble(x = c("ABC"))
+ )
+})
+
+test_that("strptime", {
+ # base::strptime() defaults to local timezone
+ # but arrow's strptime defaults to UTC.
+ # So that tests are consistent, set the local timezone to UTC
+ # TODO: consider reevaluating this workaround after ARROW-12980
+ withr::local_timezone("UTC")
+
+ t_string <- tibble(x = c("2018-10-07 19:04:05", NA))
+ t_stamp <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA))
+
+ expect_equal(
+ t_string %>%
+ Table$create() %>%
+ mutate(
+ x = strptime(x)
+ ) %>%
+ collect(),
+ t_stamp,
+ ignore_attr = "tzone"
+ )
+
+ expect_equal(
+ t_string %>%
+ Table$create() %>%
+ mutate(
+ x = strptime(x, format = "%Y-%m-%d %H:%M:%S")
+ ) %>%
+ collect(),
+ t_stamp,
+ ignore_attr = "tzone"
+ )
+
+ expect_equal(
+ t_string %>%
+ Table$create() %>%
+ mutate(
+ x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "ns")
+ ) %>%
+ collect(),
+ t_stamp,
+ ignore_attr = "tzone"
+ )
+
+ expect_equal(
+ t_string %>%
+ Table$create() %>%
+ mutate(
+ x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "s")
+ ) %>%
+ collect(),
+ t_stamp,
+ ignore_attr = "tzone"
+ )
+
+ tstring <- tibble(x = c("08-05-2008", NA))
+ tstamp <- strptime(c("08-05-2008", NA), format = "%m-%d-%Y")
+
+ expect_equal(
+ tstring %>%
+ Table$create() %>%
+ mutate(
+ x = strptime(x, format = "%m-%d-%Y")
+ ) %>%
+ pull(),
+ # R's strptime returns POSIXlt (list type)
+ as.POSIXct(tstamp),
+ ignore_attr = "tzone"
+ )
+})
+
+test_that("errors in strptime", {
+ # Error when tz is passed
+ x <- Expression$field_ref("x")
+ expect_error(
+ nse_funcs$strptime(x, tz = "PDT"),
+ "Time zone argument not supported by Arrow"
+ )
+})
+
+test_that("strftime", {
+ skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+ times <- tibble(
+ datetime = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA),
+ date = c(as.Date("2021-01-01"), NA)
+ )
+ formats <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %z %Z %j %U %W %x %X %% %G %V %u"
+ formats_date <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %j %U %W %x %X %% %G %V %u"
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = strftime(datetime, format = formats)) %>%
+ collect(),
+ times
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = strftime(date, format = formats_date)) %>%
+ collect(),
+ times
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = strftime(datetime, format = formats, tz = "Pacific/Marquesas")) %>%
+ collect(),
+ times
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = strftime(datetime, format = formats, tz = "EST", usetz = TRUE)) %>%
+ collect(),
+ times
+ )
+
+ withr::with_timezone(
+ "Pacific/Marquesas",
+ {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ x = strftime(datetime, format = formats, tz = "EST"),
+ x_date = strftime(date, format = formats_date, tz = "EST")
+ ) %>%
+ collect(),
+ times
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ x = strftime(datetime, format = formats),
+ x_date = strftime(date, format = formats_date)
+ ) %>%
+ collect(),
+ times
+ )
+ }
+ )
+
+ # This check is due to differences in the way %c currently works in Arrow and R's strftime.
+ # We can revisit after https://github.com/HowardHinnant/date/issues/704 is resolved.
+ expect_error(
+ times %>%
+ Table$create() %>%
+ mutate(x = strftime(datetime, format = "%c")) %>%
+ collect(),
+ "%c flag is not supported in non-C locales."
+ )
+
+ # The output precision of %S depends on the input timestamp precision:
+ # timestamps with second precision are formatted as integers, while
+ # millisecond, microsecond, and nanosecond timestamps are formatted as
+ # floating point numbers with 3, 6, and 9 decimal places respectively.
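+ # (e.g. second precision formats as "05" but nanosecond as "05.000000000";
+ # substr(x, 1, 2) keeps only the integer seconds so both sides compare as
+ # numbers within tolerance)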
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = strftime(datetime, format = "%S")) %>%
+ transmute(as.double(substr(x, 1, 2))) %>%
+ collect(),
+ times,
+ tolerance = 1e-6
+ )
+})
+
+test_that("format_ISO8601", {
+ skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+ times <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA))
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = format_ISO8601(x, precision = "ymd", usetz = FALSE)) %>%
+ collect(),
+ times
+ )
+
+ if (getRversion() < "3.5") {
+ # before 3.5, times$x will have no timezone attribute, so Arrow faithfully
+ # errors that there is no timezone to format:
+ expect_error(
+ times %>%
+ Table$create() %>%
+ mutate(x = format_ISO8601(x, precision = "ymd", usetz = TRUE)) %>%
+ collect(),
+ "Timezone not present, cannot convert to string with timezone: %Y-%m-%d%z"
+ )
+
+ # See comment regarding %S flag in strftime tests
+ expect_error(
+ times %>%
+ Table$create() %>%
+ mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = TRUE)) %>%
+ mutate(x = gsub("\\.0*", "", x)) %>%
+ collect(),
+ "Timezone not present, cannot convert to string with timezone: %Y-%m-%dT%H:%M:%S%z"
+ )
+ } else {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = format_ISO8601(x, precision = "ymd", usetz = TRUE)) %>%
+ collect(),
+ times
+ )
+
+ # See comment regarding %S flag in strftime tests
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = TRUE)) %>%
+ mutate(x = gsub("\\.0*", "", x)) %>%
+ collect(),
+ times
+ )
+ }
+
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = FALSE)) %>%
+ mutate(x = gsub("\\.0*", "", x)) %>%
+ collect(),
+ times
+ )
+})
+
+test_that("arrow_find_substring and arrow_find_substring_regex", {
+ df <- tibble(x = c("Foo and Bar", "baz and qux and quux"))
+
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ mutate(x = arrow_find_substring(x, options = list(pattern = "b"))) %>%
+ collect(),
+ tibble(x = c(-1, 0))
+ )
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ mutate(x = arrow_find_substring(
+ x,
+ options = list(pattern = "b", ignore_case = TRUE)
+ )) %>%
+ collect(),
+ tibble(x = c(8, 0))
+ )
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ mutate(x = arrow_find_substring_regex(
+ x,
+ options = list(pattern = "^[fb]")
+ )) %>%
+ collect(),
+ tibble(x = c(-1, 0))
+ )
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ mutate(x = arrow_find_substring_regex(
+ x,
+ options = list(pattern = "[AEIOU]", ignore_case = TRUE)
+ )) %>%
+ collect(),
+ tibble(x = c(1, 1))
+ )
+})
+
+test_that("stri_reverse and arrow_ascii_reverse functions", {
+ df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux"))
+
+ df_utf8 <- tibble(x = c("Foo\u00A0\u0061nd\u00A0bar", "\u0062az\u00A0and\u00A0qux\u3000and\u00A0quux"))
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = stri_reverse(x)) %>%
+ collect(),
+ df_utf8
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = stri_reverse(x)) %>%
+ collect(),
+ df_ascii
+ )
+
+ expect_equal(
+ df_ascii %>%
+ Table$create() %>%
+ mutate(x = arrow_ascii_reverse(x)) %>%
+ collect(),
+ tibble(x = c("rab dna\nooF", "xuuq dna xuq dna\tzab"))
+ )
+
+ expect_error(
+ df_utf8 %>%
+ Table$create() %>%
+ mutate(x = arrow_ascii_reverse(x)) %>%
+ collect(),
+ "Invalid: Non-ASCII sequence in input"
+ )
+})
+
+test_that("str_like", {
+ df <- tibble(x = c("Foo and bar", "baz and qux and quux"))
+
+ # TODO: after a new version of stringr that includes str_like is released,
+ # update all these tests to use compare_dplyr_binding()
+
+ # No match - entire string
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ mutate(x = str_like(x, "baz")) %>%
+ collect(),
+ tibble(x = c(FALSE, FALSE))
+ )
+
+ # Match - entire string
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ mutate(x = str_like(x, "Foo and bar")) %>%
+ collect(),
+ tibble(x = c(TRUE, FALSE))
+ )
+
+ # Wildcard
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ mutate(x = str_like(x, "f%", ignore_case = TRUE)) %>%
+ collect(),
+ tibble(x = c(TRUE, FALSE))
+ )
+
+ # Case-sensitive (ignore_case = FALSE)
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ mutate(x = str_like(x, "f%", ignore_case = FALSE)) %>%
+ collect(),
+ tibble(x = c(FALSE, FALSE))
+ )
+
+ # Single character
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ mutate(x = str_like(x, "_a%")) %>%
+ collect(),
+ tibble(x = c(FALSE, TRUE))
+ )
+
+ # This will give an error until a new version of stringr with str_like has been released
+ skip_if_not(packageVersion("stringr") > "1.4.0")
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_like(x, "%baz%")) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("str_pad", {
+ df <- tibble(x = c("Foo and bar", "baz and qux and quux"))
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_pad(x, width = 31)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_pad(x, width = 30, side = "right")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_pad(x, width = 31, side = "left", pad = "+")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_pad(x, width = 10, side = "left", pad = "+")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_pad(x, width = 31, side = "both")) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("substr", {
+ df <- tibble(x = "Apache Arrow")
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = substr(x, 1, 6)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = substr(x, 0, 6)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = substr(x, -1, 6)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = substr(x, 6, 1)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = substr(x, -1, -2)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = substr(x, 9, 6)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = substr(x, 1, 6)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = substr(x, 8, 12)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = substr(x, -5, -1)) %>%
+ collect(),
+ df
+ )
+
+ expect_error(
+ nse_funcs$substr("Apache Arrow", c(1, 2), 3),
+ "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ expect_error(
+ nse_funcs$substr("Apache Arrow", 1, c(2, 3)),
+ "`stop` must be length 1 - other lengths are not supported in Arrow"
+ )
+})
+
+test_that("substring", {
+ # nse_funcs$substring just calls nse_funcs$substr, tested extensively above
+ df <- tibble(x = "Apache Arrow")
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = substring(x, 1, 6)) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("str_sub", {
+ df <- tibble(x = "Apache Arrow")
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = str_sub(x, 1, 6)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = str_sub(x, 0, 6)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = str_sub(x, -1, 6)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = str_sub(x, 6, 1)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = str_sub(x, -1, -2)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = str_sub(x, -1, 3)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = str_sub(x, 9, 6)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = str_sub(x, 1, 6)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = str_sub(x, 8, 12)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = str_sub(x, -5, -1)) %>%
+ collect(),
+ df
+ )
+
+ expect_error(
+ nse_funcs$str_sub("Apache Arrow", c(1, 2), 3),
+ "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ expect_error(
+ nse_funcs$str_sub("Apache Arrow", 1, c(2, 3)),
+ "`end` must be length 1 - other lengths are not supported in Arrow"
+ )
+})
+
+test_that("str_starts, str_ends, startsWith, endsWith", {
+ df <- tibble(x = c("Foo", "bar", "baz", "qux"))
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_starts(x, "b.*")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_starts(x, "b.*", negate = TRUE)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_starts(x, fixed("b.*"))) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_starts(x, fixed("b"))) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_ends(x, "r")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_ends(x, "r", negate = TRUE)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_ends(x, fixed("r$"))) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_ends(x, fixed("r"))) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(startsWith(x, "b")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(endsWith(x, "r")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(startsWith(x, "b.*")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(endsWith(x, "r$")) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("str_count", {
+ df <- tibble(
+ cities = c("Kolkata", "Dar es Salaam", "Tel Aviv", "San Antonio", "Cluj Napoca", "Bern", "Bogota"),
+ dots = c("a.", "...", ".a.a", "a..a.", "ab...", "dse....", ".f..d..")
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(a_count = str_count(cities, pattern = "a")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(p_count = str_count(cities, pattern = "d")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+      mutate(d_count = str_count(cities,
+ pattern = regex("d", ignore_case = TRUE)
+ )) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(e_count = str_count(cities, pattern = "u")) %>%
+ collect(),
+ df
+ )
+
+ # nse_funcs$str_count() is not vectorised over pattern
+ compare_dplyr_binding(
+ .input %>%
+ mutate(let_count = str_count(cities, pattern = c("a", "b", "e", "g", "p", "n", "s"))) %>%
+ collect(),
+ df,
+ warning = TRUE
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(dots_count = str_count(dots, ".")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(dots_count = str_count(dots, fixed("."))) %>%
+ collect(),
+ df
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-funcs-type.R b/src/arrow/r/tests/testthat/test-dplyr-funcs-type.R
new file mode 100644
index 000000000..859dc14b9
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-funcs-type.R
@@ -0,0 +1,627 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+suppressPackageStartupMessages(library(bit64))
+
+
+tbl <- example_data
+
+test_that("explicit type conversions with cast()", {
+ num_int32 <- 12L
+ num_int64 <- bit64::as.integer64(10)
+
+ int_types <- c(int8(), int16(), int32(), int64())
+ uint_types <- c(uint8(), uint16(), uint32(), uint64())
+ float_types <- c(float32(), float64())
+
+ types <- c(
+ int_types,
+ uint_types,
+ float_types,
+    double(), # not an Arrow type but a base R function; should be treated as an alias for float64
+ string()
+ )
+
+ for (type in types) {
+ expect_type_equal(
+ object = {
+ t1 <- Table$create(x = num_int32) %>%
+ transmute(x = cast(x, type)) %>%
+ compute()
+ t1$schema[[1]]$type
+ },
+ as_type(type)
+ )
+ expect_type_equal(
+ object = {
+ t1 <- Table$create(x = num_int64) %>%
+ transmute(x = cast(x, type)) %>%
+ compute()
+ t1$schema[[1]]$type
+ },
+ as_type(type)
+ )
+ }
+
+ # Arrow errors when truncating floats...
+ expect_error(
+ expect_type_equal(
+ object = {
+ t1 <- Table$create(pi = pi) %>%
+ transmute(three = cast(pi, int32())) %>%
+ compute()
+ t1$schema[[1]]$type
+ },
+ int32()
+ ),
+ "truncated"
+ )
+
+ # ... unless safe = FALSE (or allow_float_truncate = TRUE)
+ expect_type_equal(
+ object = {
+ t1 <- Table$create(pi = pi) %>%
+ transmute(three = cast(pi, int32(), safe = FALSE)) %>%
+ compute()
+ t1$schema[[1]]$type
+ },
+ int32()
+ )
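+
+  # presumably also achievable via the cast option mentioned in the comment
+  # above, e.g. cast(pi, int32(), allow_float_truncate = TRUE) (not exercised here)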
+})
+
+test_that("explicit type conversions with as.*()", {
+ library(bit64)
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ int2chr = as.character(int),
+ int2dbl = as.double(int),
+ int2int = as.integer(int),
+ int2num = as.numeric(int),
+ dbl2chr = as.character(dbl),
+ dbl2dbl = as.double(dbl),
+ dbl2int = as.integer(dbl),
+ dbl2num = as.numeric(dbl),
+ ) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ chr2chr = as.character(chr),
+ chr2dbl = as.double(chr),
+ chr2int = as.integer(chr),
+ chr2num = as.numeric(chr)
+ ) %>%
+ collect(),
+ tibble(chr = c("1", "2", "3"))
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ chr2i64 = as.integer64(chr),
+ dbl2i64 = as.integer64(dbl),
+ i642i64 = as.integer64(i64),
+ ) %>%
+ collect(),
+ tibble(chr = "10000000000", dbl = 10000000000, i64 = as.integer64(1e10))
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ chr2lgl = as.logical(chr),
+ dbl2lgl = as.logical(dbl),
+ int2lgl = as.logical(int)
+ ) %>%
+ collect(),
+ tibble(
+ chr = c("TRUE", "FALSE", "true", "false"),
+ dbl = c(1, 0, -99, 0),
+ int = c(1L, 0L, -99L, 0L)
+ )
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ dbl2chr = as.character(dbl),
+ dbl2dbl = as.double(dbl),
+ dbl2int = as.integer(dbl),
+ dbl2lgl = as.logical(dbl),
+ int2chr = as.character(int),
+ int2dbl = as.double(int),
+ int2int = as.integer(int),
+ int2lgl = as.logical(int),
+ lgl2chr = as.character(lgl), # Arrow returns "true", "false" here ...
+ lgl2dbl = as.double(lgl),
+ lgl2int = as.integer(lgl),
+ lgl2lgl = as.logical(lgl)
+ ) %>%
+ collect() %>%
+      # toupper() must run *after* collect(); doing it in Arrow would require skipping when utf8proc is unavailable
+ mutate(lgl2chr = toupper(lgl2chr)), # ... but we need "TRUE", "FALSE"
+ tibble(
+ dbl = c(1, 0, NA_real_),
+ int = c(1L, 0L, NA_integer_),
+ lgl = c(TRUE, FALSE, NA)
+ )
+ )
+})
+
+test_that("is.finite(), is.infinite(), is.nan()", {
+ df <- tibble(x = c(
+ -4.94065645841246544e-324, 1.79769313486231570e+308, 0,
+ NA_real_, NaN, Inf, -Inf
+ ))
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ is_fin = is.finite(x),
+ is_inf = is.infinite(x)
+ ) %>%
+ collect(),
+ df
+ )
+ # is.nan() evaluates to FALSE on NA_real_ (ARROW-12850)
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ is_nan = is.nan(x)
+ ) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("is.na() evaluates to TRUE on NaN (ARROW-12055)", {
+ df <- tibble(x = c(1.1, 2.2, NA_real_, 4.4, NaN, 6.6, 7.7))
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ is_na = is.na(x)
+ ) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("type checks with is() giving Arrow types", {
+ # with class2=DataType
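+  # (transmute() yields a one-row tibble of logicals; t() %>% as.vector()
+  # flattens it for comparison against a plain logical vector)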
+ expect_equal(
+ Table$create(
+ i32 = Array$create(1, int32()),
+ dec = Array$create(pi)$cast(decimal(3, 2)),
+ f64 = Array$create(1.1, float64()),
+ str = Array$create("a", arrow::string())
+ ) %>% transmute(
+ i32_is_i32 = is(i32, int32()),
+ i32_is_dec = is(i32, decimal(3, 2)),
+ i32_is_i64 = is(i32, float64()),
+ i32_is_str = is(i32, arrow::string()),
+ dec_is_i32 = is(dec, int32()),
+ dec_is_dec = is(dec, decimal(3, 2)),
+ dec_is_i64 = is(dec, float64()),
+ dec_is_str = is(dec, arrow::string()),
+ f64_is_i32 = is(f64, int32()),
+ f64_is_dec = is(f64, decimal(3, 2)),
+ f64_is_i64 = is(f64, float64()),
+ f64_is_str = is(f64, arrow::string()),
+ str_is_i32 = is(str, int32()),
+ str_is_dec = is(str, decimal(3, 2)),
+ str_is_i64 = is(str, float64()),
+ str_is_str = is(str, arrow::string())
+ ) %>%
+ collect() %>%
+ t() %>%
+ as.vector(),
+ c(
+ TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE,
+ FALSE, FALSE, FALSE, FALSE, TRUE
+ )
+ )
+ # with class2=string
+ expect_equal(
+ Table$create(
+ i32 = Array$create(1, int32()),
+ f64 = Array$create(1.1, float64()),
+ str = Array$create("a", arrow::string())
+ ) %>% transmute(
+ i32_is_i32 = is(i32, "int32"),
+ i32_is_i64 = is(i32, "double"),
+ i32_is_str = is(i32, "string"),
+ f64_is_i32 = is(f64, "int32"),
+ f64_is_i64 = is(f64, "double"),
+ f64_is_str = is(f64, "string"),
+ str_is_i32 = is(str, "int32"),
+ str_is_i64 = is(str, "double"),
+ str_is_str = is(str, "string")
+ ) %>%
+ collect() %>%
+ t() %>%
+ as.vector(),
+ c(TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE)
+ )
+ # with class2=string alias
+ expect_equal(
+ Table$create(
+ f16 = Array$create(NA_real_, halffloat()),
+ f32 = Array$create(1.1, float()),
+ f64 = Array$create(2.2, float64()),
+ lgl = Array$create(TRUE, bool()),
+ str = Array$create("a", arrow::string())
+ ) %>% transmute(
+ f16_is_f16 = is(f16, "float16"),
+ f16_is_f32 = is(f16, "float32"),
+ f16_is_f64 = is(f16, "float64"),
+ f16_is_lgl = is(f16, "boolean"),
+ f16_is_str = is(f16, "utf8"),
+ f32_is_f16 = is(f32, "float16"),
+ f32_is_f32 = is(f32, "float32"),
+ f32_is_f64 = is(f32, "float64"),
+ f32_is_lgl = is(f32, "boolean"),
+ f32_is_str = is(f32, "utf8"),
+ f64_is_f16 = is(f64, "float16"),
+ f64_is_f32 = is(f64, "float32"),
+ f64_is_f64 = is(f64, "float64"),
+ f64_is_lgl = is(f64, "boolean"),
+ f64_is_str = is(f64, "utf8"),
+ lgl_is_f16 = is(lgl, "float16"),
+ lgl_is_f32 = is(lgl, "float32"),
+ lgl_is_f64 = is(lgl, "float64"),
+ lgl_is_lgl = is(lgl, "boolean"),
+ lgl_is_str = is(lgl, "utf8"),
+ str_is_f16 = is(str, "float16"),
+ str_is_f32 = is(str, "float32"),
+ str_is_f64 = is(str, "float64"),
+ str_is_lgl = is(str, "boolean"),
+ str_is_str = is(str, "utf8")
+ ) %>%
+ collect() %>%
+ t() %>%
+ as.vector(),
+ c(
+ TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE,
+ FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,
+ FALSE, FALSE, TRUE
+ )
+ )
+})
+
+test_that("type checks with is() giving R types", {
+ library(bit64)
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ chr_is_chr = is(chr, "character"),
+ chr_is_fct = is(chr, "factor"),
+ chr_is_int = is(chr, "integer"),
+ chr_is_i64 = is(chr, "integer64"),
+ chr_is_lst = is(chr, "list"),
+ chr_is_lgl = is(chr, "logical"),
+ chr_is_num = is(chr, "numeric"),
+ dbl_is_chr = is(dbl, "character"),
+ dbl_is_fct = is(dbl, "factor"),
+ dbl_is_int = is(dbl, "integer"),
+ dbl_is_i64 = is(dbl, "integer64"),
+ dbl_is_lst = is(dbl, "list"),
+ dbl_is_lgl = is(dbl, "logical"),
+ dbl_is_num = is(dbl, "numeric"),
+ fct_is_chr = is(fct, "character"),
+ fct_is_fct = is(fct, "factor"),
+ fct_is_int = is(fct, "integer"),
+ fct_is_i64 = is(fct, "integer64"),
+ fct_is_lst = is(fct, "list"),
+ fct_is_lgl = is(fct, "logical"),
+ fct_is_num = is(fct, "numeric"),
+ int_is_chr = is(int, "character"),
+ int_is_fct = is(int, "factor"),
+ int_is_int = is(int, "integer"),
+ int_is_i64 = is(int, "integer64"),
+ int_is_lst = is(int, "list"),
+ int_is_lgl = is(int, "logical"),
+ int_is_num = is(int, "numeric"),
+ lgl_is_chr = is(lgl, "character"),
+ lgl_is_fct = is(lgl, "factor"),
+ lgl_is_int = is(lgl, "integer"),
+ lgl_is_i64 = is(lgl, "integer64"),
+ lgl_is_lst = is(lgl, "list"),
+ lgl_is_lgl = is(lgl, "logical"),
+ lgl_is_num = is(lgl, "numeric")
+ ) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ i64_is_chr = is(i64, "character"),
+ i64_is_fct = is(i64, "factor"),
+ # we want Arrow to return TRUE, but bit64 returns FALSE
+ # i64_is_int = is(i64, "integer"),
+ i64_is_i64 = is(i64, "integer64"),
+ i64_is_lst = is(i64, "list"),
+ i64_is_lgl = is(i64, "logical"),
+ # we want Arrow to return TRUE, but bit64 returns FALSE
+ # i64_is_num = is(i64, "numeric"),
+ lst_is_chr = is(lst, "character"),
+ lst_is_fct = is(lst, "factor"),
+ lst_is_int = is(lst, "integer"),
+ lst_is_i64 = is(lst, "integer64"),
+ lst_is_lst = is(lst, "list"),
+ lst_is_lgl = is(lst, "logical"),
+ lst_is_num = is(lst, "numeric")
+ ) %>%
+ collect(),
+ tibble(
+ i64 = as.integer64(1:3),
+ lst = list(c("a", "b"), c("d", "e"), c("f", "g"))
+ )
+ )
+})
+
+test_that("type checks with is.*()", {
+ library(bit64)
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ chr_is_chr = is.character(chr),
+ chr_is_dbl = is.double(chr),
+ chr_is_fct = is.factor(chr),
+ chr_is_int = is.integer(chr),
+ chr_is_i64 = is.integer64(chr),
+ chr_is_lst = is.list(chr),
+ chr_is_lgl = is.logical(chr),
+ chr_is_num = is.numeric(chr),
+ dbl_is_chr = is.character(dbl),
+ dbl_is_dbl = is.double(dbl),
+ dbl_is_fct = is.factor(dbl),
+ dbl_is_int = is.integer(dbl),
+ dbl_is_i64 = is.integer64(dbl),
+ dbl_is_lst = is.list(dbl),
+ dbl_is_lgl = is.logical(dbl),
+ dbl_is_num = is.numeric(dbl),
+ fct_is_chr = is.character(fct),
+ fct_is_dbl = is.double(fct),
+ fct_is_fct = is.factor(fct),
+ fct_is_int = is.integer(fct),
+ fct_is_i64 = is.integer64(fct),
+ fct_is_lst = is.list(fct),
+ fct_is_lgl = is.logical(fct),
+ fct_is_num = is.numeric(fct),
+ int_is_chr = is.character(int),
+ int_is_dbl = is.double(int),
+ int_is_fct = is.factor(int),
+ int_is_int = is.integer(int),
+ int_is_i64 = is.integer64(int),
+ int_is_lst = is.list(int),
+ int_is_lgl = is.logical(int),
+ int_is_num = is.numeric(int),
+ lgl_is_chr = is.character(lgl),
+ lgl_is_dbl = is.double(lgl),
+ lgl_is_fct = is.factor(lgl),
+ lgl_is_int = is.integer(lgl),
+ lgl_is_i64 = is.integer64(lgl),
+ lgl_is_lst = is.list(lgl),
+ lgl_is_lgl = is.logical(lgl),
+ lgl_is_num = is.numeric(lgl)
+ ) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ i64_is_chr = is.character(i64),
+ # TODO: investigate why this is not matching when testthat runs it
+ # i64_is_dbl = is.double(i64),
+ i64_is_fct = is.factor(i64),
+ # we want Arrow to return TRUE, but bit64 returns FALSE
+ # i64_is_int = is.integer(i64),
+ i64_is_i64 = is.integer64(i64),
+ i64_is_lst = is.list(i64),
+ i64_is_lgl = is.logical(i64),
+ i64_is_num = is.numeric(i64),
+ lst_is_chr = is.character(lst),
+ lst_is_dbl = is.double(lst),
+ lst_is_fct = is.factor(lst),
+ lst_is_int = is.integer(lst),
+ lst_is_i64 = is.integer64(lst),
+ lst_is_lst = is.list(lst),
+ lst_is_lgl = is.logical(lst),
+ lst_is_num = is.numeric(lst)
+ ) %>%
+ collect(),
+ tibble(
+ i64 = as.integer64(1:3),
+ lst = list(c("a", "b"), c("d", "e"), c("f", "g"))
+ )
+ )
+})
+
+test_that("type checks with is_*()", {
+ library(rlang, warn.conflicts = FALSE)
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ chr_is_chr = is_character(chr),
+ chr_is_dbl = is_double(chr),
+ chr_is_int = is_integer(chr),
+ chr_is_lst = is_list(chr),
+ chr_is_lgl = is_logical(chr),
+ dbl_is_chr = is_character(dbl),
+ dbl_is_dbl = is_double(dbl),
+ dbl_is_int = is_integer(dbl),
+ dbl_is_lst = is_list(dbl),
+ dbl_is_lgl = is_logical(dbl),
+ int_is_chr = is_character(int),
+ int_is_dbl = is_double(int),
+ int_is_int = is_integer(int),
+ int_is_lst = is_list(int),
+ int_is_lgl = is_logical(int),
+ lgl_is_chr = is_character(lgl),
+ lgl_is_dbl = is_double(lgl),
+ lgl_is_int = is_integer(lgl),
+ lgl_is_lst = is_list(lgl),
+ lgl_is_lgl = is_logical(lgl)
+ ) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("type checks on expressions", {
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ a = is.character(as.character(int)),
+ b = is.integer(as.character(int)),
+ c = is.integer(int + int),
+ d = is.double(int + dbl),
+ e = is.logical(dbl > pi)
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ # the code in the expectation below depends on RE2
+ skip_if_not_available("re2")
+
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ a = is.logical(grepl("[def]", chr))
+ ) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("type checks on R scalar literals", {
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ chr_is_chr = is.character("foo"),
+ int_is_chr = is.character(42L),
+ int_is_int = is.integer(42L),
+ chr_is_int = is.integer("foo"),
+ dbl_is_num = is.numeric(3.14159),
+ int_is_num = is.numeric(42L),
+ chr_is_num = is.numeric("foo"),
+ dbl_is_dbl = is.double(3.14159),
+ chr_is_dbl = is.double("foo"),
+ lgl_is_lgl = is.logical(TRUE),
+ chr_is_lgl = is.logical("foo"),
+ fct_is_fct = is.factor(factor("foo", levels = c("foo", "bar", "baz"))),
+ chr_is_fct = is.factor("foo"),
+ lst_is_lst = is.list(list(c(a = "foo", b = "bar"))),
+ chr_is_lst = is.list("foo")
+ ) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("as.factor()/dictionary_encode()", {
+ skip("ARROW-12632: ExecuteScalarExpression cannot Execute non-scalar expression")
+ df1 <- tibble(x = c("C", "D", "B", NA, "D", "B", "S", "A", "B", "Z", "B"))
+ df2 <- tibble(x = c(5, 5, 5, NA, 2, 3, 6, 8))
+
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = as.factor(x)) %>%
+ collect(),
+ df1
+ )
+
+ expect_warning(
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = as.factor(x)) %>%
+ collect(),
+ df2
+ ),
+ "Coercing dictionary values to R character factor levels"
+ )
+
+ # dictionary values with default null encoding behavior ("mask") omits
+ # nulls from the dictionary values
+ expect_equal(
+ object = {
+ rb1 <- df1 %>%
+ record_batch() %>%
+ transmute(x = dictionary_encode(x)) %>%
+ compute()
+ dict <- rb1$x$dictionary()
+ as.vector(dict$Take(dict$SortIndices()))
+ },
+ sort(unique(df1$x), na.last = NA)
+ )
+
+ # dictionary values with "encode" null encoding behavior includes nulls in
+ # the dictionary values
+ expect_equal(
+ object = {
+ rb1 <- df1 %>%
+ record_batch() %>%
+ transmute(x = dictionary_encode(x, null_encoding_behavior = "encode")) %>%
+ compute()
+ dict <- rb1$x$dictionary()
+ as.vector(dict$Take(dict$SortIndices()))
+ },
+ sort(unique(df1$x), na.last = TRUE)
+ )
+})
+
+test_that("bad explicit type conversions with as.*()", {
+
+ # Arrow returns lowercase "true", "false" (instead of "TRUE", "FALSE" like R)
+ expect_error(
+ compare_dplyr_binding(
+ .input %>%
+ transmute(lgl2chr = as.character(lgl)) %>%
+ collect(),
+ tibble(lgl = c(TRUE, FALSE, NA))
+ )
+ )
+
+ # Arrow fails to parse these strings as numbers (instead of returning NAs with
+ # a warning like R does)
+ expect_error(
+ expect_warning(
+ compare_dplyr_binding(
+ .input %>%
+ transmute(chr2num = as.numeric(chr)) %>%
+ collect(),
+ tibble(chr = c("l.O", "S.S", ""))
+ )
+ )
+ )
+
+ # Arrow fails to parse these strings as Booleans (instead of returning NAs
+ # like R does)
+ expect_error(
+ compare_dplyr_binding(
+ .input %>%
+ transmute(chr2lgl = as.logical(chr)) %>%
+ collect(),
+ tibble(chr = c("TRU", "FAX", ""))
+ )
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-group-by.R b/src/arrow/r/tests/testthat/test-dplyr-group-by.R
new file mode 100644
index 000000000..7cfcfb5c9
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-group-by.R
@@ -0,0 +1,158 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+library(stringr)
+
+tbl <- example_data
+
+test_that("group_by groupings are recorded", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(chr) %>%
+ select(int, chr) %>%
+ filter(int > 5) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("group_by supports creating/renaming", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(chr, numbers = int) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(chr, numbers = int * 4) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(int > 4, lgl, foo = int > 5) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("ungroup", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(chr) %>%
+ select(int, chr) %>%
+ ungroup() %>%
+ filter(int > 5) %>%
+ collect(),
+ tbl
+ )
+
+ # to confirm that the above expectation is actually testing what we think it's
+ # testing, verify that compare_dplyr_binding() distinguishes between grouped and
+ # ungrouped tibbles
+ expect_error(
+ compare_dplyr_binding(
+ .input %>%
+ group_by(chr) %>%
+ select(int, chr) %>%
+ (function(x) if (inherits(x, "tbl_df")) ungroup(x) else x) %>%
+ filter(int > 5) %>%
+ collect(),
+ tbl
+ )
+ )
+})
+
+test_that("group_by then rename", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(chr) %>%
+ select(string = chr, int) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("group_by with .drop", {
+ test_groups <- c("starting_a_fight", "consoling_a_child", "petting_a_dog")
+ compare_dplyr_binding(
+ .input %>%
+ group_by(!!!syms(test_groups), .drop = TRUE) %>%
+ collect(),
+ example_with_logical_factors
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(!!!syms(test_groups), .drop = FALSE) %>%
+ collect(),
+ example_with_logical_factors
+ )
+ expect_equal(
+ example_with_logical_factors %>%
+ group_by(!!!syms(test_groups), .drop = TRUE) %>%
+ collect() %>%
+ n_groups(),
+ 4L
+ )
+ expect_equal(
+ example_with_logical_factors %>%
+ group_by(!!!syms(test_groups), .drop = FALSE) %>%
+ collect() %>%
+ n_groups(),
+ 8L
+ )
+ expect_equal(
+ example_with_logical_factors %>%
+ group_by(!!!syms(test_groups), .drop = FALSE) %>%
+ group_by_drop_default(),
+ FALSE
+ )
+ expect_equal(
+ example_with_logical_factors %>%
+ group_by(!!!syms(test_groups), .drop = TRUE) %>%
+ group_by_drop_default(),
+ TRUE
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(.drop = FALSE) %>% # no group by vars
+ group_by_drop_default(),
+ example_with_logical_factors
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by_drop_default(),
+ example_with_logical_factors
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(!!!syms(test_groups)) %>%
+ group_by_drop_default(),
+ example_with_logical_factors
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(!!!syms(test_groups), .drop = FALSE) %>%
+ ungroup() %>%
+ group_by_drop_default(),
+ example_with_logical_factors
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-join.R b/src/arrow/r/tests/testthat/test-dplyr-join.R
new file mode 100644
index 000000000..d8239f810
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-join.R
@@ -0,0 +1,175 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+
+left <- example_data
+left$some_grouping <- rep(c(1, 2), 5)
+
+left_tab <- Table$create(left)
+
+to_join <- tibble::tibble(
+ some_grouping = c(1, 2),
+ capital_letters = c("A", "B"),
+ another_column = TRUE
+)
+to_join_tab <- Table$create(to_join)
+
+
+test_that("left_join", {
+ expect_message(
+ compare_dplyr_binding(
+ .input %>%
+ left_join(to_join) %>%
+ collect(),
+ left
+ ),
+ 'Joining, by = "some_grouping"'
+ )
+})
+
+test_that("left_join `by` args", {
+ compare_dplyr_binding(
+ .input %>%
+ left_join(to_join, by = "some_grouping") %>%
+ collect(),
+ left
+ )
+ compare_dplyr_binding(
+ .input %>%
+ left_join(
+ to_join %>%
+ rename(the_grouping = some_grouping),
+ by = c(some_grouping = "the_grouping")
+ ) %>%
+ collect(),
+ left
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ rename(the_grouping = some_grouping) %>%
+ left_join(
+ to_join,
+ by = c(the_grouping = "some_grouping")
+ ) %>%
+ collect(),
+ left
+ )
+})
+
+test_that("join two tables", {
+ expect_identical(
+ left_tab %>%
+ left_join(to_join_tab, by = "some_grouping") %>%
+ collect(),
+ left %>%
+ left_join(to_join, by = "some_grouping") %>%
+ collect()
+ )
+})
+
+test_that("Error handling", {
+ expect_error(
+ left_tab %>%
+ left_join(to_join, by = "not_a_col") %>%
+ collect(),
+ "all(names(by) %in% names(x)) is not TRUE",
+ fixed = TRUE
+ )
+})
+
+# TODO: test duplicate col names
+# TODO: casting: int and float columns?
+
+test_that("right_join", {
+ compare_dplyr_binding(
+ .input %>%
+ right_join(to_join, by = "some_grouping") %>%
+ collect(),
+ left
+ )
+})
+
+test_that("inner_join", {
+ compare_dplyr_binding(
+ .input %>%
+ inner_join(to_join, by = "some_grouping") %>%
+ collect(),
+ left
+ )
+})
+
+test_that("full_join", {
+ compare_dplyr_binding(
+ .input %>%
+ full_join(to_join, by = "some_grouping") %>%
+ collect(),
+ left
+ )
+})
+
+test_that("semi_join", {
+ compare_dplyr_binding(
+ .input %>%
+ semi_join(to_join, by = "some_grouping") %>%
+ collect(),
+ left
+ )
+})
+
+test_that("anti_join", {
+ compare_dplyr_binding(
+ .input %>%
+      # Factor levels don't match when there are no rows in the data
+ # TODO: use better anti_join test data
+ select(-fct) %>%
+ anti_join(to_join, by = "some_grouping") %>%
+ collect(),
+ left
+ )
+})
+
+test_that("mutate then join", {
+ left <- Table$create(
+ one = c("a", "b"),
+ two = 1:2
+ )
+ right <- Table$create(
+ three = TRUE,
+ dos = 2L
+ )
+
+ expect_equal(
+ left %>%
+ rename(dos = two) %>%
+ mutate(one = toupper(one)) %>%
+ left_join(
+ right %>%
+ mutate(three = !three)
+ ) %>%
+ arrange(dos) %>%
+ collect(),
+ tibble(
+ one = c("A", "B"),
+ dos = 1:2,
+ three = c(NA, FALSE)
+ )
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-mutate.R b/src/arrow/r/tests/testthat/test-dplyr-mutate.R
new file mode 100644
index 000000000..886ec9e42
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-mutate.R
@@ -0,0 +1,522 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+library(stringr)
+
+tbl <- example_data
+# Add some better string data
+tbl$verses <- verses[[1]]
+# c(" a ", " b ", " c ", ...) increasing padding
+# nchar = 3 5 7 9 11 13 15 17 19 21
+tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both")
+
+test_that("mutate() is lazy", {
+ expect_s3_class(
+ tbl %>% record_batch() %>% mutate(int = int + 6L),
+ "arrow_dplyr_query"
+ )
+})
+
+test_that("basic mutate", {
+ compare_dplyr_binding(
+ .input %>%
+ select(int, chr) %>%
+ filter(int > 5) %>%
+ mutate(int = int + 6L) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("mutate() with NULL inputs", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(int = NULL) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("empty mutate()", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate() %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("transmute", {
+ compare_dplyr_binding(
+ .input %>%
+ select(int, chr) %>%
+ filter(int > 5) %>%
+ transmute(int = int + 6L) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("transmute() with NULL inputs", {
+ compare_dplyr_binding(
+ .input %>%
+ transmute(int = NULL) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("empty transmute()", {
+ compare_dplyr_binding(
+ .input %>%
+ transmute() %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("transmute() with unsupported arguments", {
+ expect_error(
+ tbl %>%
+ Table$create() %>%
+ transmute(int = int + 42L, .keep = "all"),
+ "`transmute()` does not support the `.keep` argument",
+ fixed = TRUE
+ )
+ expect_error(
+ tbl %>%
+ Table$create() %>%
+ transmute(int = int + 42L, .before = lgl),
+ "`transmute()` does not support the `.before` argument",
+ fixed = TRUE
+ )
+ expect_error(
+ tbl %>%
+ Table$create() %>%
+ transmute(int = int + 42L, .after = chr),
+ "`transmute()` does not support the `.after` argument",
+ fixed = TRUE
+ )
+})
+
+test_that("transmute() defuses dots arguments (ARROW-13262)", {
+ expect_warning(
+ tbl %>%
+ Table$create() %>%
+ transmute(stringr::str_c(chr, chr)) %>%
+ collect(),
+ "Expression stringr::str_c(chr, chr) not supported in Arrow; pulling data into R",
+ fixed = TRUE
+ )
+})
+
+test_that("mutate and refer to previous mutants", {
+ compare_dplyr_binding(
+ .input %>%
+ select(int, verses) %>%
+ mutate(
+ line_lengths = nchar(verses),
+ longer = line_lengths * 10
+ ) %>%
+ filter(line_lengths > 15) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("nchar() arguments", {
+ compare_dplyr_binding(
+ .input %>%
+ select(int, verses) %>%
+ mutate(
+ line_lengths = nchar(verses, type = "bytes"),
+ longer = line_lengths * 10
+ ) %>%
+ filter(line_lengths > 15) %>%
+ collect(),
+ tbl
+ )
+ # This tests the whole abandon_ship() machinery
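+  # (abandon_ship() is the internal fallback that emits the warning checked
+  # below and re-runs the expression in R after pulling the data out of Arrow)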
+ compare_dplyr_binding(
+ .input %>%
+ select(int, verses) %>%
+ mutate(
+ line_lengths = nchar(verses, type = "bytes", allowNA = TRUE),
+ longer = line_lengths * 10
+ ) %>%
+ filter(line_lengths > 15) %>%
+ collect(),
+ tbl,
+ warning = paste0(
+ "In nchar\\(verses, type = \"bytes\", allowNA = TRUE\\), ",
+ "allowNA = TRUE not supported by Arrow; pulling data into R"
+ )
+ )
+})
+
+test_that("mutate with .data pronoun", {
+ compare_dplyr_binding(
+ .input %>%
+ select(int, verses) %>%
+ mutate(
+ line_lengths = str_length(verses),
+ longer = .data$line_lengths * 10
+ ) %>%
+ filter(line_lengths > 15) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("mutate with unnamed expressions", {
+ compare_dplyr_binding(
+ .input %>%
+ select(int, padded_strings) %>%
+ mutate(
+ int, # bare column name
+ nchar(padded_strings) # expression
+ ) %>%
+ filter(int > 5) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("mutate with reassigning same name", {
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ new = lgl,
+ new = chr
+ ) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("mutate with single value for recycling", {
+ compare_dplyr_binding(
+ .input %>%
+ select(int, padded_strings) %>%
+ mutate(
+ dr_bronner = 1 # ALL ONE!
+ ) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("dplyr::mutate's examples", {
+ # Newly created variables are available immediately
+ compare_dplyr_binding(
+ .input %>%
+ select(name, mass) %>%
+ mutate(
+ mass2 = mass * 2,
+ mass2_squared = mass2 * mass2
+ ) %>%
+ collect(),
+ starwars # this is a test tibble that ships with dplyr
+ )
+
+ # As well as adding new variables, you can use mutate() to
+ # remove variables and modify existing variables.
+ compare_dplyr_binding(
+ .input %>%
+ select(name, height, mass, homeworld) %>%
+ mutate(
+ mass = NULL,
+ height = height * 0.0328084 # convert to feet
+ ) %>%
+ collect(),
+ starwars
+ )
+
+ # Examples we don't support should succeed
+ # but warn that they're pulling data into R to do so
+
+ # across and autosplicing: ARROW-11699
+ compare_dplyr_binding(
+ .input %>%
+ select(name, homeworld, species) %>%
+ mutate(across(!name, as.factor)) %>%
+ collect(),
+ starwars,
+ warning = "Expression across.*not supported in Arrow"
+ )
+
+ # group_by then mutate
+ compare_dplyr_binding(
+ .input %>%
+ select(name, mass, homeworld) %>%
+ group_by(homeworld) %>%
+ mutate(rank = min_rank(desc(mass))) %>%
+ collect(),
+ starwars,
+ warning = TRUE
+ )
+
+ # `.before` and `.after` experimental args: ARROW-11701
+ df <- tibble(x = 1, y = 2)
+ compare_dplyr_binding(
+ .input %>% mutate(z = x + y) %>% collect(),
+ df
+ )
+ #> # A tibble: 1 x 3
+ #> x y z
+ #> <dbl> <dbl> <dbl>
+ #> 1 1 2 3
+
+ compare_dplyr_binding(
+ .input %>% mutate(z = x + y, .before = 1) %>% collect(),
+ df
+ )
+ #> # A tibble: 1 x 3
+ #> z x y
+ #> <dbl> <dbl> <dbl>
+ #> 1 3 1 2
+ compare_dplyr_binding(
+ .input %>% mutate(z = x + y, .after = x) %>% collect(),
+ df
+ )
+ #> # A tibble: 1 x 3
+ #> x z y
+ #> <dbl> <dbl> <dbl>
+ #> 1 1 3 2
+
+ # By default, mutate() keeps all columns from the input data.
+ # Experimental: You can override with `.keep`
+ df <- tibble(x = 1, y = 2, a = "a", b = "b")
+ compare_dplyr_binding(
+ .input %>% mutate(z = x + y, .keep = "all") %>% collect(), # the default
+ df
+ )
+ #> # A tibble: 1 x 5
+ #> x y a b z
+ #> <dbl> <dbl> <chr> <chr> <dbl>
+ #> 1 1 2 a b 3
+ compare_dplyr_binding(
+ .input %>% mutate(z = x + y, .keep = "used") %>% collect(),
+ df
+ )
+ #> # A tibble: 1 x 3
+ #> x y z
+ #> <dbl> <dbl> <dbl>
+ #> 1 1 2 3
+ compare_dplyr_binding(
+ .input %>% mutate(z = x + y, .keep = "unused") %>% collect(),
+ df
+ )
+ #> # A tibble: 1 x 3
+ #> a b z
+ #> <chr> <chr> <dbl>
+ #> 1 a b 3
+ compare_dplyr_binding(
+ .input %>% mutate(z = x + y, .keep = "none") %>% collect(), # same as transmute()
+ df
+ )
+ #> # A tibble: 1 x 1
+ #> z
+ #> <dbl>
+ #> 1 3
+
+ # Grouping ----------------------------------------
+ # The mutate operation may yield different results on grouped
+ # tibbles because the expressions are computed within groups.
+ # The following normalises `mass` by the global average:
+ # TODO: ARROW-13926
+ compare_dplyr_binding(
+ .input %>%
+ select(name, mass, species) %>%
+ mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) %>%
+ collect(),
+ starwars,
+ warning = "window function"
+ )
+})
+
+test_that("Can mutate after group_by as long as there are no aggregations", {
+ compare_dplyr_binding(
+ .input %>%
+ select(int, chr) %>%
+ group_by(chr) %>%
+ mutate(int = int + 6L) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ select(mean = int, chr) %>%
+ # rename `int` to `mean` and use `mean` in `mutate()` to test that
+ # `all_funs()` does not incorrectly identify it as an aggregate function
+ group_by(chr) %>%
+ mutate(mean = mean + 6L) %>%
+ collect(),
+ tbl
+ )
+ expect_warning(
+ tbl %>%
+ Table$create() %>%
+ select(int, chr) %>%
+ group_by(chr) %>%
+ mutate(avg_int = mean(int)) %>%
+ collect(),
+ "window functions not currently supported in Arrow; pulling data into R",
+ fixed = TRUE
+ )
+ expect_warning(
+ tbl %>%
+ Table$create() %>%
+ select(mean = int, chr) %>%
+ # rename `int` to `mean` and use `mean(mean)` in `mutate()` to test that
+ # `all_funs()` detects `mean()` despite the collision with a column name
+ group_by(chr) %>%
+ mutate(avg_int = mean(mean)) %>%
+ collect(),
+ "window functions not currently supported in Arrow; pulling data into R",
+ fixed = TRUE
+ )
+})
+
+test_that("handle bad expressions", {
+ # TODO: search for functions other than mean() (see above test)
+ # that need to be forced to fail because they error ambiguously
+
+ with_language("fr", {
+    # expect_warning(., NA): the usual behavior when Arrow hits an expression
+    # it can't evaluate is to raise a warning, collect() the data into R, and
+    # retry there. But we want this to error the first time because it's
+    # a user error, not solvable by retrying in R
+ expect_warning(
+ expect_error(
+ Table$create(tbl) %>% mutate(newvar = NOTAVAR + 2),
+ "objet 'NOTAVAR' introuvable"
+ ),
+ NA
+ )
+ })
+})
+
+test_that("Can't just add a vector column with mutate()", {
+ expect_warning(
+ expect_equal(
+ Table$create(tbl) %>%
+ select(int) %>%
+ mutate(again = 1:10),
+ tibble::tibble(int = tbl$int, again = 1:10)
+ ),
+ "In again = 1:10, only values of size one are recycled; pulling data into R"
+ )
+})
+
+test_that("print a mutated table", {
+ expect_output(
+ Table$create(tbl) %>%
+ select(int) %>%
+ mutate(twice = int * 2) %>%
+ print(),
+ "InMemoryDataset (query)
+int: int32
+twice: double (multiply_checked(int, 2))
+
+See $.data for the source Arrow object",
+ fixed = TRUE
+ )
+})
+
+test_that("mutate and write_dataset", {
+ skip_if_not_available("dataset")
+ # See related test in test-dataset.R
+
+ first_date <- lubridate::ymd_hms("2015-04-29 03:12:39")
+ df1 <- tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+ chr = letters[1:10],
+ fct = factor(LETTERS[1:10]),
+ ts = first_date + lubridate::days(1:10)
+ )
+
+ second_date <- lubridate::ymd_hms("2017-03-09 07:01:02")
+ df2 <- tibble(
+ int = 101:110,
+ dbl = c(as.numeric(51:59), NaN),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+ chr = letters[10:1],
+ fct = factor(LETTERS[10:1]),
+ ts = second_date + lubridate::days(10:1)
+ )
+
+ dst_dir <- tempfile()
+ stacked <- record_batch(rbind(df1, df2))
+ stacked %>%
+ mutate(twice = int * 2) %>%
+ group_by(int) %>%
+ write_dataset(dst_dir, format = "feather")
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "=")))
+
+ new_ds <- open_dataset(dst_dir, format = "feather")
+
+ expect_equal(
+ new_ds %>%
+ select(string = chr, integer = int, twice) %>%
+ filter(integer > 6 & integer < 11) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ mutate(twice = integer * 2) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+test_that("mutate and pmin/pmax", {
+ df <- tibble(
+ city = c("Chillan", "Valdivia", "Osorno"),
+ val1 = c(200, 300, NA),
+ val2 = c(100, NA, NA),
+ val3 = c(0, NA, NA)
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ max_val_1 = pmax(val1, val2, val3),
+ max_val_2 = pmax(val1, val2, val3, na.rm = TRUE),
+ min_val_1 = pmin(val1, val2, val3),
+ min_val_2 = pmin(val1, val2, val3, na.rm = TRUE)
+ ) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ max_val_1 = pmax(val1 - 100, 200, val1 * 100, na.rm = TRUE),
+ min_val_1 = pmin(val1 - 100, 100, val1 * 100, na.rm = TRUE),
+ ) %>%
+ collect(),
+ df
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-query.R b/src/arrow/r/tests/testthat/test-dplyr-query.R
new file mode 100644
index 000000000..21a55f4b4
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-query.R
@@ -0,0 +1,296 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+library(stringr)
+
+tbl <- example_data
+# Add some better string data
+tbl$verses <- verses[[1]]
+# c(" a ", " b ", " c ", ...) increasing padding
+# nchar = 3 5 7 9 11 13 15 17 19 21
+tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both")
+tbl$another_chr <- tail(letters, 10)
+
+test_that("basic select/filter/collect", {
+ batch <- record_batch(tbl)
+
+ b2 <- batch %>%
+ select(int, chr) %>%
+ filter(int > 5)
+
+ expect_s3_class(b2, "arrow_dplyr_query")
+ t2 <- collect(b2)
+ expect_equal(t2, tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")])
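+  # (like dplyr, Arrow's filter() drops rows where the condition is NA, hence
+  # the !is.na() in the base-R subset above)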
+ # Test that the original object is not affected
+ expect_identical(collect(batch), tbl)
+})
+
+test_that("dim() on query", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(int > 5) %>%
+ select(int, chr) %>%
+ dim(),
+ tbl
+ )
+})
+
+test_that("Print method", {
+ expect_output(
+ record_batch(tbl) %>%
+ filter(dbl > 2, chr == "d" | chr == "f") %>%
+ select(chr, int, lgl) %>%
+ filter(int < 5) %>%
+ select(int, chr) %>%
+ print(),
+ 'InMemoryDataset (query)
+int: int32
+chr: string
+
+* Filter: (((dbl > 2) and ((chr == "d") or (chr == "f"))) and (int < 5))
+See $.data for the source Arrow object',
+ fixed = TRUE
+ )
+})
+
+test_that("pull", {
+ compare_dplyr_binding(
+ .input %>% pull(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>% pull(1),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>% pull(chr),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(int > 4) %>%
+ rename(strng = chr) %>%
+ pull(strng),
+ tbl
+ )
+})
+
+test_that("collect(as_data_frame=FALSE)", {
+ batch <- record_batch(tbl)
+
+ b1 <- batch %>% collect(as_data_frame = FALSE)
+
+ expect_r6_class(b1, "RecordBatch")
+
+ b2 <- batch %>%
+ select(int, chr) %>%
+ filter(int > 5) %>%
+ collect(as_data_frame = FALSE)
+
+ # collect(as_data_frame = FALSE) always returns Table now
+ expect_r6_class(b2, "Table")
+ expected <- tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")]
+ expect_equal(as.data.frame(b2), expected)
+
+ b3 <- batch %>%
+ select(int, strng = chr) %>%
+ filter(int > 5) %>%
+ collect(as_data_frame = FALSE)
+ expect_r6_class(b3, "Table")
+ expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng")))
+
+ b4 <- batch %>%
+ select(int, strng = chr) %>%
+ filter(int > 5) %>%
+ group_by(int) %>%
+ collect(as_data_frame = FALSE)
+ expect_s3_class(b4, "arrow_dplyr_query")
+ expect_equal(
+ as.data.frame(b4),
+ expected %>%
+ rename(strng = chr) %>%
+ group_by(int)
+ )
+})
+
+test_that("compute()", {
+ batch <- record_batch(tbl)
+
+ b1 <- batch %>% compute()
+
+ expect_r6_class(b1, "RecordBatch")
+
+ b2 <- batch %>%
+ select(int, chr) %>%
+ filter(int > 5) %>%
+ compute()
+
+ expect_r6_class(b2, "Table")
+ expected <- tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")]
+ expect_equal(as.data.frame(b2), expected)
+
+ b3 <- batch %>%
+ select(int, strng = chr) %>%
+ filter(int > 5) %>%
+ compute()
+ expect_r6_class(b3, "Table")
+ expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng")))
+
+ b4 <- batch %>%
+ select(int, strng = chr) %>%
+ filter(int > 5) %>%
+ group_by(int) %>%
+ compute()
+ expect_s3_class(b4, "arrow_dplyr_query")
+ expect_equal(
+ as.data.frame(b4),
+ expected %>%
+ rename(strng = chr) %>%
+ group_by(int)
+ )
+})
+
+test_that("head", {
+ batch <- record_batch(tbl)
+
+ b2 <- batch %>%
+ select(int, chr) %>%
+ filter(int > 5) %>%
+ head(2)
+ expect_s3_class(b2, "arrow_dplyr_query")
+ expected <- tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")][1:2, ]
+ expect_equal(collect(b2), expected)
+
+ b3 <- batch %>%
+ select(int, strng = chr) %>%
+ filter(int > 5) %>%
+ head(2)
+ expect_s3_class(b3, "arrow_dplyr_query")
+ expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng")))
+
+ b4 <- batch %>%
+ select(int, strng = chr) %>%
+ filter(int > 5) %>%
+ group_by(int) %>%
+ head(2)
+ expect_s3_class(b4, "arrow_dplyr_query")
+ expect_equal(
+ as.data.frame(b4),
+ expected %>%
+ rename(strng = chr) %>%
+ group_by(int)
+ )
+
+ expect_equal(
+ batch %>%
+ select(int, strng = chr) %>%
+ filter(int > 5) %>%
+ head(2) %>%
+ mutate(twice = int * 2) %>%
+ collect(),
+ expected %>%
+ rename(strng = chr) %>%
+ mutate(twice = int * 2)
+ )
+
+ # This would fail if we evaluated head() after filter()
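+  # (head(2) takes the first two rows of the unfiltered batch, which
+  # filter(int > 5) then removes entirely, so zero rows survive)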
+ expect_equal(
+ batch %>%
+ select(int, strng = chr) %>%
+ head(2) %>%
+ filter(int > 5) %>%
+ collect(),
+ expected %>%
+ rename(strng = chr) %>%
+ filter(FALSE)
+ )
+})
+
+test_that("arrange then head returns the right data (ARROW-14162)", {
+
+ compare_dplyr_binding(
+ .input %>%
+ # mpg has ties so we need to sort by two things to get deterministic order
+ arrange(mpg, disp) %>%
+ head(4) %>%
+ collect(),
+ mtcars,
+ ignore_attr = "row.names"
+ )
+})
+
+test_that("arrange then tail returns the right data", {
+ compare_dplyr_binding(
+ .input %>%
+ # mpg has ties so we need to sort by two things to get deterministic order
+ arrange(mpg, disp) %>%
+ tail(4) %>%
+ collect(),
+ mtcars,
+ ignore_attr = "row.names"
+ )
+})
+
+test_that("tail", {
+ batch <- record_batch(tbl)
+
+ b2 <- batch %>%
+ select(int, chr) %>%
+ filter(int > 5) %>%
+ arrange(int) %>%
+ tail(2)
+
+ expect_s3_class(b2, "arrow_dplyr_query")
+ expected <- tail(tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")], 2)
+ expect_equal(as.data.frame(b2), expected)
+
+ b3 <- batch %>%
+ select(int, strng = chr) %>%
+ filter(int > 5) %>%
+ arrange(int) %>%
+ tail(2)
+ expect_s3_class(b3, "arrow_dplyr_query")
+ expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng")))
+
+ b4 <- batch %>%
+ select(int, strng = chr) %>%
+ filter(int > 5) %>%
+ group_by(int) %>%
+ arrange(int) %>%
+ tail(2)
+ expect_s3_class(b4, "arrow_dplyr_query")
+ expect_equal(
+ as.data.frame(b4),
+ expected %>%
+ rename(strng = chr) %>%
+ group_by(int)
+ )
+})
+
+test_that("No duplicate field names are allowed in an arrow_dplyr_query", {
+ expect_error(
+ Table$create(tbl, tbl) %>%
+ filter(int > 0),
+ regexp = paste0(
+ 'The following field names were found more than once in the data: "int", "dbl", ',
+ '"dbl2", "lgl", "false", "chr", "fct", "verses", "padded_strings"'
+ )
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-select.R b/src/arrow/r/tests/testthat/test-dplyr-select.R
new file mode 100644
index 000000000..2ca2b100e
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-select.R
@@ -0,0 +1,146 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+library(stringr)
+
+tbl <- example_data
+
+test_that("Empty select returns no columns", {
+ compare_dplyr_binding(
+ .input %>% select() %>% collect(),
+ tbl,
+ skip_table = "Table with 0 cols doesn't know how many rows it should have"
+ )
+})
+test_that("Empty select still includes the group_by columns", {
+ expect_message(
+ compare_dplyr_binding(
+ .input %>% group_by(chr) %>% select() %>% collect(),
+ tbl
+ ),
+ "Adding missing grouping variables"
+ )
+})
+
+test_that("select/rename", {
+ compare_dplyr_binding(
+ .input %>%
+ select(string = chr, int) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ rename(string = chr) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ rename(strng = chr) %>%
+ rename(other = strng) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("select/rename with selection helpers", {
+
+ # TODO: add some passing tests here
+
+ expect_error(
+ compare_dplyr_binding(
+ .input %>%
+ select(where(is.numeric)) %>%
+ collect(),
+ tbl
+ ),
+ "Unsupported selection helper"
+ )
+})
+
+test_that("filtering with rename", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(chr == "b") %>%
+ select(string = chr, int) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ select(string = chr, int) %>%
+ filter(string == "b") %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("relocate", {
+ df <- tibble(a = 1, b = 1, c = 1, d = "a", e = "a", f = "a")
+ compare_dplyr_binding(
+ .input %>% relocate(f) %>% collect(),
+ df,
+ )
+ compare_dplyr_binding(
+ .input %>% relocate(a, .after = c) %>% collect(),
+ df,
+ )
+ compare_dplyr_binding(
+ .input %>% relocate(f, .before = b) %>% collect(),
+ df,
+ )
+ compare_dplyr_binding(
+ .input %>% relocate(a, .after = last_col()) %>% collect(),
+ df,
+ )
+ compare_dplyr_binding(
+ .input %>% relocate(ff = f) %>% collect(),
+ df,
+ )
+})
+
+test_that("relocate with selection helpers", {
+ df <- tibble(a = 1, b = 1, c = 1, d = "a", e = "a", f = "a")
+ compare_dplyr_binding(
+ .input %>% relocate(any_of(c("a", "e", "i", "o", "u"))) %>% collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>% relocate(where(is.character)) %>% collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>% relocate(a, b, c, .after = where(is.character)) %>% collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>% relocate(d, e, f, .before = where(is.numeric)) %>% collect(),
+ df
+ )
+ # works after other dplyr verbs
+ compare_dplyr_binding(
+ .input %>%
+ mutate(c = as.character(c)) %>%
+ relocate(d, e, f, .after = where(is.numeric)) %>%
+ collect(),
+ df
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-summarize.R b/src/arrow/r/tests/testthat/test-dplyr-summarize.R
new file mode 100644
index 000000000..3988412b8
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-summarize.R
@@ -0,0 +1,881 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+withr::local_options(list(arrow.summarise.sort = TRUE))
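+# arrow.summarise.sort makes Arrow sort summarize() results by the grouping
+# keys, presumably so that row order matches dplyr's and the comparisons in
+# compare_dplyr_binding() hold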
+
+library(dplyr, warn.conflicts = FALSE)
+library(stringr)
+
+tbl <- example_data
+# Add some better string data
+tbl$verses <- verses[[1]]
+# c(" a ", " b ", " c ", ...) increasing padding
+# nchar = 3 5 7 9 11 13 15 17 19 21
+tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both")
+tbl$some_grouping <- rep(c(1, 2), 5)
+
+test_that("summarize() doesn't evaluate eagerly", {
+ expect_s3_class(
+ Table$create(tbl) %>%
+ summarize(total = sum(int)),
+ "arrow_dplyr_query"
+ )
+ expect_r6_class(
+ Table$create(tbl) %>%
+ summarize(total = sum(int)) %>%
+ compute(),
+ "ArrowTabular"
+ )
+})
+
+test_that("Can aggregate in Arrow", {
+ compare_dplyr_binding(
+ .input %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ summarize(total = sum(int)) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Group by sum on dataset", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(total = sum(int * 4, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(total = sum(int)) %>%
+ collect(),
+ tbl,
+ )
+})
+
+test_that("Group by mean on dataset", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(mean = mean(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(mean = mean(int, na.rm = FALSE)) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Group by sd on dataset", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(sd = sd(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(sd = sd(int, na.rm = FALSE)) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Group by var on dataset", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(var = var(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(var = var(int, na.rm = FALSE)) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("n()", {
+ compare_dplyr_binding(
+ .input %>%
+ summarize(counts = n()) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(counts = n()) %>%
+ arrange(some_grouping) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Group by any/all", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(any(lgl, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(all(lgl, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(any(lgl, na.rm = FALSE)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(all(lgl, na.rm = FALSE)) %>%
+ collect(),
+ tbl
+ )
+
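+  # has_words is trivially all-FALSE (nchar() is never negative); the point
+  # here is just to exercise any()/all() on a computed column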
+ compare_dplyr_binding(
+ .input %>%
+ mutate(has_words = nchar(verses) < 0) %>%
+ group_by(some_grouping) %>%
+ summarize(any(has_words, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(has_words = nchar(verses) < 0) %>%
+ group_by(some_grouping) %>%
+ summarize(all(has_words, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(has_words = all(nchar(verses) < 0, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("n_distinct() on dataset", {
+ # With groupby
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(distinct = n_distinct(lgl, na.rm = FALSE)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(distinct = n_distinct(lgl, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+ # Without groupby
+ compare_dplyr_binding(
+ .input %>%
+ summarize(distinct = n_distinct(lgl, na.rm = FALSE)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ summarize(distinct = n_distinct(lgl, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ summarize(distinct = n_distinct(int, lgl)) %>%
+ collect(),
+ tbl,
+ warning = "Multiple arguments"
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(distinct = n_distinct(int, lgl)) %>%
+ collect(),
+ tbl,
+ warning = "Multiple arguments"
+ )
+})
+
+test_that("Functions that take ... but we only accept a single arg", {
+ compare_dplyr_binding(
+ .input %>%
+ summarize(distinct = n_distinct()) %>%
+ collect(),
+ tbl,
+ warning = "0 arguments"
+ )
+ compare_dplyr_binding(
+ .input %>%
+ summarize(distinct = n_distinct(int, lgl)) %>%
+ collect(),
+ tbl,
+ warning = "Multiple arguments"
+ )
+ # Now that we've demonstrated that the whole machinery works, let's test
+ # the agg_funcs directly
+ expect_error(agg_funcs$n_distinct(), "n_distinct() with 0 arguments", fixed = TRUE)
+ expect_error(agg_funcs$sum(), "sum() with 0 arguments", fixed = TRUE)
+ expect_error(agg_funcs$any(), "any() with 0 arguments", fixed = TRUE)
+ expect_error(agg_funcs$all(), "all() with 0 arguments", fixed = TRUE)
+ expect_error(agg_funcs$min(), "min() with 0 arguments", fixed = TRUE)
+ expect_error(agg_funcs$max(), "max() with 0 arguments", fixed = TRUE)
+ expect_error(agg_funcs$n_distinct(1, 2), "Multiple arguments to n_distinct()")
+ expect_error(agg_funcs$sum(1, 2), "Multiple arguments to sum")
+ expect_error(agg_funcs$any(1, 2), "Multiple arguments to any()")
+ expect_error(agg_funcs$all(1, 2), "Multiple arguments to all()")
+ expect_error(agg_funcs$min(1, 2), "Multiple arguments to min()")
+ expect_error(agg_funcs$max(1, 2), "Multiple arguments to max()")
+})
+
+test_that("median()", {
+ # When medians are integer-valued, stats::median() sometimes returns output of
+  # type integer, whereas the Arrow approx_median kernels always return
+ # output of type float64. The calls to median(int, ...) in the tests below
+ # are enclosed in as.double() to work around this known difference.
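+  # For example, in plain base R (illustrative values only):
+  #   median(c(1L, 2L, 3L))             # 2L, type integer
+  #   as.double(median(c(1L, 2L, 3L)))  # 2, type double, like Arrow's float64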
+
+ # Use old testthat behavior here so we don't have to assert the same warning
+ # over and over
+ local_edition(2)
+
+ # with groups
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(
+ med_dbl = median(dbl),
+ med_int = as.double(median(int)),
+ med_dbl_narmf = median(dbl, FALSE),
+ med_int_narmf = as.double(median(int, na.rm = FALSE)),
+ med_dbl_narmt = median(dbl, na.rm = TRUE),
+ med_int_narmt = as.double(median(int, TRUE))
+ ) %>%
+ arrange(some_grouping) %>%
+ collect(),
+ tbl,
+ warning = "median\\(\\) currently returns an approximate median in Arrow"
+ )
+ # without groups, with na.rm = TRUE
+ compare_dplyr_binding(
+ .input %>%
+ summarize(
+ med_dbl_narmt = median(dbl, na.rm = TRUE),
+ med_int_narmt = as.double(median(int, TRUE))
+ ) %>%
+ collect(),
+ tbl,
+ warning = "median\\(\\) currently returns an approximate median in Arrow"
+ )
+ # without groups, with na.rm = FALSE (the default)
+ compare_dplyr_binding(
+ .input %>%
+ summarize(
+ med_dbl = median(dbl),
+ med_int = as.double(median(int)),
+ med_dbl_narmf = median(dbl, FALSE),
+ med_int_narmf = as.double(median(int, na.rm = FALSE))
+ ) %>%
+ collect(),
+ tbl,
+ warning = "median\\(\\) currently returns an approximate median in Arrow"
+ )
+ local_edition(3)
+})
+
+test_that("quantile()", {
+ # The default method for stats::quantile() throws an error when na.rm = FALSE
+ # and the input contains NA or NaN, whereas the Arrow tdigest kernels return
+ # null in this situation. To work around this known difference, the tests
+ # below always use na.rm = TRUE when the data contains NA or NaN.
+
+ # The default method for stats::quantile() has an argument `names` that
+ # controls whether the result has a names attribute. It defaults to
+ # names = TRUE. With Arrow, it is not possible to give the result a names
+ # attribute, so the quantile() binding in Arrow does not accept a `names`
+ # argument. Differences in this names attribute cause compare_dplyr_binding() to
+ # report that the objects are not equal, so we do not use compare_dplyr_binding()
+ # in the tests below.
+
+ # The tests below all use probs = 0.5 because other values cause differences
+ # between the exact quantiles returned by R and the approximate quantiles
+ # returned by Arrow.
+
+ # When quantiles are integer-valued, stats::quantile() sometimes returns
+  # output of type integer, whereas the Arrow tdigest kernels always
+ # return output of type float64. The calls to quantile(int, ...) in the tests
+ # below are enclosed in as.double() to work around this known difference.
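+  # For example, in plain base R (illustrative values only):
+  #   quantile(1:10, probs = 0.5)                 # named: c("50%" = 5.5)
+  #   quantile(1:10, probs = 0.5, names = FALSE)  # bare 5.5, matching the
+  #                                               # unnamed shape Arrow returns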
+
+ local_edition(2)
+ # with groups
+ expect_warning(
+ expect_equal(
+ tbl %>%
+ group_by(some_grouping) %>%
+ summarize(
+ q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE, names = FALSE),
+ q_int = as.double(
+ quantile(int, probs = 0.5, na.rm = TRUE, names = FALSE)
+ )
+ ) %>%
+ arrange(some_grouping),
+ Table$create(tbl) %>%
+ group_by(some_grouping) %>%
+ summarize(
+ q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE),
+ q_int = as.double(quantile(int, probs = 0.5, na.rm = TRUE))
+ ) %>%
+ arrange(some_grouping) %>%
+ collect()
+ ),
+ "quantile() currently returns an approximate quantile in Arrow",
+ fixed = TRUE
+ )
+
+ # without groups
+ expect_warning(
+ expect_equal(
+ tbl %>%
+ summarize(
+ q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE, names = FALSE),
+ q_int = as.double(
+ quantile(int, probs = 0.5, na.rm = TRUE, names = FALSE)
+ )
+ ),
+ Table$create(tbl) %>%
+ summarize(
+ q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE),
+ q_int = as.double(quantile(int, probs = 0.5, na.rm = TRUE))
+ ) %>%
+ collect()
+ ),
+ "quantile() currently returns an approximate quantile in Arrow",
+ fixed = TRUE
+ )
+
+ # with missing values and na.rm = FALSE
+ expect_warning(
+ expect_equal(
+ tibble(
+ q_dbl = NA_real_,
+ q_int = NA_real_
+ ),
+ Table$create(tbl) %>%
+ summarize(
+ q_dbl = quantile(dbl, probs = 0.5, na.rm = FALSE),
+ q_int = as.double(quantile(int, probs = 0.5, na.rm = FALSE))
+ ) %>%
+ collect()
+ ),
+ "quantile() currently returns an approximate quantile in Arrow",
+ fixed = TRUE
+ )
+ local_edition(3)
+
+ # with a vector of 2+ probs
+ expect_warning(
+ Table$create(tbl) %>%
+ summarize(q = quantile(dbl, probs = c(0.2, 0.8), na.rm = TRUE)),
+ "quantile() with length(probs) != 1 not supported by Arrow",
+ fixed = TRUE
+ )
+})
+
+test_that("summarize() with min() and max()", {
+ compare_dplyr_binding(
+ .input %>%
+ select(int, chr) %>%
+ filter(int > 5) %>% # this filters out the NAs in `int`
+ summarize(min_int = min(int), max_int = max(int)) %>%
+ collect(),
+    tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ select(int, chr) %>%
+ filter(int > 5) %>% # this filters out the NAs in `int`
+ summarize(
+ min_int = min(int + 4) / 2,
+ max_int = 3 / max(42 - int)
+ ) %>%
+ collect(),
+    tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ select(int, chr) %>%
+ summarize(min_int = min(int), max_int = max(int)) %>%
+ collect(),
+    tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ select(int) %>%
+ summarize(
+ min_int = min(int, na.rm = TRUE),
+ max_int = max(int, na.rm = TRUE)
+ ) %>%
+ collect(),
+    tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ select(dbl, int) %>%
+ summarize(
+ min_int = -min(log(ceiling(dbl)), na.rm = TRUE),
+ max_int = log(max(as.double(int), na.rm = TRUE))
+ ) %>%
+ collect(),
+    tbl
+ )
+
+ # multiple dots arguments to min(), max() not supported
+ compare_dplyr_binding(
+ .input %>%
+ summarize(min_mult = min(dbl, int)) %>%
+ collect(),
+ tbl,
+ warning = "Multiple arguments to min\\(\\) not supported by Arrow"
+ )
+ compare_dplyr_binding(
+ .input %>%
+ select(int, dbl, dbl2) %>%
+ summarize(max_mult = max(int, dbl, dbl2)) %>%
+ collect(),
+ tbl,
+ warning = "Multiple arguments to max\\(\\) not supported by Arrow"
+ )
+
+ # min(logical) or max(logical) yields integer in R
+ # min(Boolean) or max(Boolean) yields Boolean in Arrow
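+  # e.g. min(c(TRUE, FALSE)) is 0L in base R, while Arrow's kernel yields
+  # false, hence the as.logical() wrappers below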
+ compare_dplyr_binding(
+ .input %>%
+ select(lgl) %>%
+ summarize(
+ max_lgl = as.logical(max(lgl, na.rm = TRUE)),
+ min_lgl = as.logical(min(lgl, na.rm = TRUE))
+ ) %>%
+ collect(),
+    tbl
+ )
+})
+
+test_that("min() and max() on character strings", {
+ compare_dplyr_binding(
+ .input %>%
+ summarize(
+ min_chr = min(chr, na.rm = TRUE),
+ max_chr = max(chr, na.rm = TRUE)
+ ) %>%
+ collect(),
+    tbl
+ )
+ skip("Strings not supported by hash_min_max (ARROW-13988)")
+ compare_dplyr_binding(
+ .input %>%
+ group_by(fct) %>%
+ summarize(
+ min_chr = min(chr, na.rm = TRUE),
+ max_chr = max(chr, na.rm = TRUE)
+ ) %>%
+ collect(),
+    tbl
+ )
+})
+
+test_that("summarise() with !!sym()", {
+ test_chr_col <- "int"
+ test_dbl_col <- "dbl"
+ test_lgl_col <- "lgl"
+ compare_dplyr_binding(
+ .input %>%
+ group_by(false) %>%
+ summarise(
+ sum = sum(!!sym(test_dbl_col)),
+ any = any(!!sym(test_lgl_col)),
+ all = all(!!sym(test_lgl_col)),
+ mean = mean(!!sym(test_dbl_col)),
+ sd = sd(!!sym(test_dbl_col)),
+ var = var(!!sym(test_dbl_col)),
+ n_distinct = n_distinct(!!sym(test_chr_col)),
+ min = min(!!sym(test_dbl_col)),
+ max = max(!!sym(test_dbl_col))
+ ) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Filter and aggregate", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(some_grouping == 2) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(int > 5) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(some_grouping == 2) %>%
+ group_by(some_grouping) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(int > 5) %>%
+ group_by(some_grouping) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Group by edge cases", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping * 2) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(alt = some_grouping * 2) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Do things after summarize", {
+ group2_sum <- tbl %>%
+ group_by(some_grouping) %>%
+ filter(int > 5) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ pull() %>%
+ tail(1)
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ filter(int > 5) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ filter(total == group2_sum) %>%
+ mutate(extra = total * 5) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl > 2) %>%
+ select(chr, int, lgl) %>%
+ mutate(twice = int * 2L) %>%
+ group_by(lgl) %>%
+ summarize(
+ count = n(),
+ total = sum(twice, na.rm = TRUE)
+ ) %>%
+ mutate(mean = total / count) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Expressions on aggregations", {
+  # This is what the more concise version further below effectively does
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(
+ any = any(lgl),
+ all = all(lgl)
+ ) %>%
+ ungroup() %>% # TODO: loosen the restriction on mutate after group_by
+ mutate(some = any & !all) %>%
+ select(some_grouping, some) %>%
+ collect(),
+ tbl
+ )
+ # More concisely:
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(any(lgl) & !all(lgl)) %>%
+ collect(),
+ tbl
+ )
+
+ # Save one of the aggregates first
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(
+ any_lgl = any(lgl),
+ some = any_lgl & !all(lgl)
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ # Make sure order of columns in result is correct
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(
+ any_lgl = any(lgl),
+ some = any_lgl & !all(lgl),
+ n()
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ # Aggregate on an aggregate (trivial but dplyr allows)
+ skip("Aggregate on an aggregate not supported")
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(
+ any_lgl = any(any(lgl))
+ ) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Summarize with 0 arguments", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize() %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Not (yet) supported: implicit join", {
+ withr::local_options(list(arrow.debug = TRUE))
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(
+ sum((dbl - mean(dbl))^2)
+ ) %>%
+ collect(),
+ tbl,
+ warning = "Expression sum\\(\\(dbl - mean\\(dbl\\)\\)\\^2\\) not supported in Arrow; pulling data into R"
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(
+ sum(dbl - mean(dbl))
+ ) %>%
+ collect(),
+ tbl,
+ warning = "Expression sum\\(dbl - mean\\(dbl\\)\\) not supported in Arrow; pulling data into R"
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(
+ sqrt(sum((dbl - mean(dbl))^2) / (n() - 1L))
+ ) %>%
+ collect(),
+ tbl,
+ warning = "Expression sum\\(\\(dbl - mean\\(dbl\\)\\)\\^2\\) not supported in Arrow; pulling data into R"
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(
+ dbl - mean(dbl)
+ ) %>%
+ collect(),
+ tbl,
+ warning = "Expression dbl - mean\\(dbl\\) not supported in Arrow; pulling data into R"
+ )
+
+ # This one could possibly be supported--in mutate()
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(
+ dbl - int
+ ) %>%
+ collect(),
+ tbl,
+ warning = "Expression dbl - int not supported in Arrow; pulling data into R"
+ )
+})
+
+test_that(".groups argument", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping, int < 6) %>%
+ summarize(count = n()) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping, int < 6) %>%
+ summarize(count = n(), .groups = "drop_last") %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping, int < 6) %>%
+ summarize(count = n(), .groups = "keep") %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping, int < 6) %>%
+ summarize(count = n(), .groups = "drop") %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping, int < 6) %>%
+ summarize(count = n(), .groups = "rowwise") %>%
+ collect(),
+ tbl,
+ warning = TRUE
+ )
+
+ # abandon_ship() raises the warning, then dplyr itself errors
+ # This isn't ideal but it's fine and won't be an issue on Datasets
+ expect_error(
+ expect_warning(
+ Table$create(tbl) %>%
+ group_by(some_grouping, int < 6) %>%
+ summarize(count = n(), .groups = "NOTVALID"),
+ "Invalid .groups argument"
+ ),
+ "NOTVALID"
+ )
+})
+
+test_that("summarize() handles group_by .drop", {
+ # Error: Type error: Sorting not supported for type dictionary<values=string, indices=int8, ordered=0>
+ withr::local_options(list(arrow.summarise.sort = FALSE))
+
+ tbl <- tibble(
+ x = 1:10,
+ y = factor(rep(c("a", "c"), each = 5), levels = c("a", "b", "c"))
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(y) %>%
+ count() %>%
+ collect() %>%
+ arrange(y),
+ tbl
+ )
+ # Not supported: check message
+ compare_dplyr_binding(
+ .input %>%
+ group_by(y, .drop = FALSE) %>%
+ count() %>%
+ collect() %>%
+ # Because it's not supported, we have to filter out the (empty) row
+      # that dplyr keeps, so that the two results compare equal
+ filter(y != "b") %>%
+ arrange(y),
+ tbl,
+ warning = ".drop = FALSE currently not supported in Arrow aggregation"
+ )
+
+ # But this is ok because there is no factor group
+ compare_dplyr_binding(
+ .input %>%
+ group_by(y, .drop = FALSE) %>%
+ count() %>%
+ collect() %>%
+ arrange(y),
+ tibble(
+ x = 1:10,
+ y = rep(c("a", "c"), each = 5)
+ )
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-duckdb.R b/src/arrow/r/tests/testthat/test-duckdb.R
new file mode 100644
index 000000000..decd6e80e
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-duckdb.R
@@ -0,0 +1,217 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_installed("duckdb", minimum_version = "0.2.8")
+skip_if_not_installed("dbplyr")
+skip_if_not_available("dataset")
+skip_on_cran()
+
+library(duckdb, quietly = TRUE)
+library(dplyr, warn.conflicts = FALSE)
+
+test_that("to_duckdb", {
+ ds <- InMemoryDataset$create(example_data)
+
+ expect_identical(
+ ds %>%
+ to_duckdb() %>%
+ collect() %>%
+ # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879
+ select(!fct),
+ select(example_data, !fct)
+ )
+
+ expect_identical(
+ ds %>%
+ select(int, lgl, dbl) %>%
+ to_duckdb() %>%
+ group_by(lgl) %>%
+ summarise(mean_int = mean(int, na.rm = TRUE), mean_dbl = mean(dbl, na.rm = TRUE)) %>%
+ collect(),
+ tibble::tibble(
+ lgl = c(TRUE, NA, FALSE),
+ mean_int = c(3, 6.25, 8.5),
+ mean_dbl = c(3.1, 6.35, 6.1)
+ )
+ )
+
+ # can group_by before the to_duckdb
+ expect_identical(
+ ds %>%
+ select(int, lgl, dbl) %>%
+ group_by(lgl) %>%
+ to_duckdb() %>%
+ summarise(mean_int = mean(int, na.rm = TRUE), mean_dbl = mean(dbl, na.rm = TRUE)) %>%
+ collect(),
+ tibble::tibble(
+ lgl = c(TRUE, NA, FALSE),
+ mean_int = c(3, 6.25, 8.5),
+ mean_dbl = c(3.1, 6.35, 6.1)
+ )
+ )
+})
+
+test_that("to_duckdb then to_arrow", {
+ ds <- InMemoryDataset$create(example_data)
+
+ ds_rt <- ds %>%
+ to_duckdb() %>%
+ # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879
+ select(-fct) %>%
+ to_arrow()
+
+ expect_identical(
+ collect(ds_rt),
+ ds %>%
+ select(-fct) %>%
+ collect()
+ )
+
+ # And we can continue the pipeline
+ ds_rt <- ds %>%
+ to_duckdb() %>%
+ # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879
+ select(-fct) %>%
+ to_arrow() %>%
+ filter(int > 5)
+
+ expect_identical(
+ collect(ds_rt),
+ ds %>%
+ select(-fct) %>%
+ filter(int > 5) %>%
+ collect()
+ )
+
+ # Now check errors
+ ds_rt <- ds %>%
+ to_duckdb() %>%
+ # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879
+ select(-fct)
+
+ # alter the class of ds_rt's connection to simulate some other database
+ class(ds_rt$src$con) <- "some_other_connection"
+
+ expect_error(
+ to_arrow(ds_rt),
+ "to_arrow\\(\\) currently only supports Arrow tables, Arrow datasets,"
+ )
+})
+
+# The next set of tests uses a pre-existing connection to test persistence and
+# querying against the table without using the `tbl` itself, so we need to
+# create a connection separate from the ephemeral one that is made with
+# arrow_duck_connection()
+con <- dbConnect(duckdb::duckdb())
+dbExecute(con, "PRAGMA threads=2")
+on.exit(dbDisconnect(con, shutdown = TRUE), add = TRUE)
+
+# write one table to the connection so it is kept open
+DBI::dbWriteTable(con, "mtcars", mtcars)
+
+test_that("Joining, auto-cleanup enabled", {
+ ds <- InMemoryDataset$create(example_data)
+
+ table_one_name <- "my_arrow_table_1"
+ table_one <- to_duckdb(ds, con = con, table_name = table_one_name, auto_disconnect = TRUE)
+ table_two_name <- "my_arrow_table_2"
+ table_two <- to_duckdb(ds, con = con, table_name = table_two_name, auto_disconnect = TRUE)
+
+ res <- dbGetQuery(
+ con,
+ paste0(
+ "SELECT * FROM ", table_one_name,
+ " INNER JOIN ", table_two_name,
+ " ON ", table_one_name, ".int = ", table_two_name, ".int"
+ )
+ )
+ expect_identical(dim(res), c(9L, 14L))
+
+  # removing the R objects (and garbage-collecting) drops the DuckDB tables
+ expect_true(all(c(table_one_name, table_two_name) %in% DBI::dbListTables(con)))
+ rm(table_one, table_two)
+ gc()
+ expect_false(any(c(table_one_name, table_two_name) %in% DBI::dbListTables(con)))
+})
+
+test_that("Joining, auto-cleanup disabled", {
+ ds <- InMemoryDataset$create(example_data)
+
+ table_three_name <- "my_arrow_table_3"
+ table_three <- to_duckdb(ds, con = con, table_name = table_three_name)
+
+ # clean up does *not* clean these tables
+ expect_true(table_three_name %in% DBI::dbListTables(con))
+ rm(table_three)
+ gc()
+  # but because we aren't auto_disconnecting, the table is still there.
+ expect_true(table_three_name %in% DBI::dbListTables(con))
+})
+
+test_that("to_duckdb with a table", {
+ tab <- Table$create(example_data)
+
+ expect_identical(
+ tab %>%
+ to_duckdb() %>%
+ group_by(int > 4) %>%
+ summarise(
+ int_mean = mean(int, na.rm = TRUE),
+ dbl_mean = mean(dbl, na.rm = TRUE)
+ ) %>%
+ collect(),
+ tibble::tibble(
+ "int > 4" = c(FALSE, NA, TRUE),
+ int_mean = c(2, NA, 7.5),
+ dbl_mean = c(2.1, 4.1, 7.3)
+ )
+ )
+})
+
+test_that("to_duckdb passing a connection", {
+ ds <- InMemoryDataset$create(example_data)
+
+ con_separate <- dbConnect(duckdb::duckdb())
+ # we always want to test in parallel
+ dbExecute(con_separate, "PRAGMA threads=2")
+ on.exit(dbDisconnect(con_separate, shutdown = TRUE), add = TRUE)
+
+ # create a table to join to that we know is in our con_separate
+ new_df <- data.frame(
+ int = 1:10,
+ char = letters[26:17],
+ stringsAsFactors = FALSE
+ )
+ DBI::dbWriteTable(con_separate, "separate_join_table", new_df)
+
+ table_four <- ds %>%
+ select(int, lgl, dbl) %>%
+ to_duckdb(con = con_separate, auto_disconnect = FALSE)
+ table_four_name <- table_four$ops$x
+
+ result <- DBI::dbGetQuery(
+ con_separate,
+ paste0(
+ "SELECT * FROM ", table_four_name,
+ " INNER JOIN separate_join_table ",
+ "ON separate_join_table.int = ", table_four_name, ".int"
+ )
+ )
+
+ expect_identical(dim(result), c(9L, 5L))
+ expect_identical(result$char, new_df[new_df$int != 4, ]$char)
+})
diff --git a/src/arrow/r/tests/testthat/test-expression.R b/src/arrow/r/tests/testthat/test-expression.R
new file mode 100644
index 000000000..c4aab718d
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-expression.R
@@ -0,0 +1,128 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("C++ expressions", {
+ skip_if_not_available("dataset")
+ f <- Expression$field_ref("f")
+ expect_identical(f$field_name, "f")
+ g <- Expression$field_ref("g")
+ date <- Expression$scalar(as.Date("2020-01-15"))
+ ts <- Expression$scalar(as.POSIXct("2020-01-17 11:11:11"))
+ i64 <- Expression$scalar(bit64::as.integer64(42))
+ time <- Expression$scalar(hms::hms(56, 34, 12))
+
+ expect_r6_class(f == g, "Expression")
+ expect_r6_class(f == 4, "Expression")
+ expect_r6_class(f == "", "Expression")
+ expect_r6_class(f == NULL, "Expression")
+ expect_r6_class(f == date, "Expression")
+ expect_r6_class(f == i64, "Expression")
+ expect_r6_class(f == time, "Expression")
+ # can't seem to make this work right now because of R Ops.method dispatch
+ # expect_r6_class(f == as.Date("2020-01-15"), "Expression") # nolint
+ expect_r6_class(f == ts, "Expression")
+ expect_r6_class(f <= 2L, "Expression")
+ expect_r6_class(f != FALSE, "Expression")
+ expect_r6_class(f > 4, "Expression")
+ expect_r6_class(f < 4 & f > 2, "Expression")
+ expect_r6_class(f < 4 | f > 2, "Expression")
+ expect_r6_class(!(f < 4), "Expression")
+ expect_output(
+ print(f > 4),
+ "Expression\n(f > 4)",
+ fixed = TRUE
+ )
+ expect_equal(
+ f$type(schema(f = float64())),
+ float64()
+ )
+ expect_equal(
+ (f > 4)$type(schema(f = float64())),
+ bool()
+ )
+  # An R vector on the right-hand side is interpreted as a list-type scalar
+ expect_r6_class(f == c(1L, 2L), "Expression")
+
+ expect_error(
+ Expression$create("add", 1, 2),
+ "Expression arguments must be Expression objects"
+ )
+})
+
+test_that("Field reference expression schemas and types", {
+ x <- Expression$field_ref("x")
+
+ # type() throws error when schema is NULL
+ expect_error(x$type(), "schema")
+
+ # type() returns type when schema is set
+ x$schema <- Schema$create(x = int32())
+ expect_equal(x$type(), int32())
+})
+
+test_that("Scalar expression schemas and types", {
+ # type() works on scalars without setting the schema
+ expect_equal(
+ Expression$scalar("foo")$type(),
+ arrow::string()
+ )
+ expect_equal(
+ Expression$scalar(42L)$type(),
+ int32()
+ )
+})
+
+test_that("Expression schemas and types", {
+ x <- Expression$field_ref("x")
+ y <- Expression$field_ref("y")
+ z <- Expression$scalar(42L)
+
+ # type() throws error when both schemas are unset
+ expect_error(
+ Expression$create("add_checked", x, y)$type(),
+ "schema"
+ )
+
+ # type() throws error when left schema is unset
+ y$schema <- Schema$create(y = float64())
+ expect_error(
+ Expression$create("add_checked", x, y)$type(),
+ "schema"
+ )
+
+ # type() throws error when right schema is unset
+ x$schema <- Schema$create(x = int32())
+ y$schema <- NULL
+ expect_error(
+ Expression$create("add_checked", x, y)$type(),
+ "schema"
+ )
+
+ # type() returns type when both schemas are set
+ y$schema <- Schema$create(y = float64())
+ expect_equal(
+ Expression$create("add_checked", x, y)$type(),
+ float64()
+ )
+
+ # type() returns type when one arg has schema set and one is scalar
+ expect_equal(
+ Expression$create("add_checked", x, z)$type(),
+ int32()
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-feather.R b/src/arrow/r/tests/testthat/test-feather.R
new file mode 100644
index 000000000..136474dea
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-feather.R
@@ -0,0 +1,256 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+feather_file <- tempfile()
+tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10])
+
+test_that("Write a feather file", {
+ tib_out <- write_feather(tib, feather_file)
+ expect_true(file.exists(feather_file))
+ # Input is returned unmodified
+ expect_identical(tib_out, tib)
+})
+
+expect_feather_roundtrip <- function(write_fun) {
+ tf2 <- normalizePath(tempfile(), mustWork = FALSE)
+ tf3 <- tempfile()
+ on.exit({
+ unlink(tf2)
+ unlink(tf3)
+ })
+
+  # Write two ways; this is what varies with each run
+ write_fun(tib, tf2)
+ expect_true(file.exists(tf2))
+
+ stream <- FileOutputStream$create(tf3)
+ write_fun(tib, stream)
+ stream$close()
+ expect_true(file.exists(tf3))
+
+ # Read both back
+ tab2 <- read_feather(tf2)
+ expect_s3_class(tab2, "data.frame")
+
+ tab3 <- read_feather(tf3)
+ expect_s3_class(tab3, "data.frame")
+
+ # reading directly from arrow::io::MemoryMappedFile
+ tab4 <- read_feather(mmap_open(tf3))
+ expect_s3_class(tab4, "data.frame")
+
+ # reading directly from arrow::io::ReadableFile
+ tab5 <- read_feather(ReadableFile$create(tf3))
+ expect_s3_class(tab5, "data.frame")
+
+ expect_equal(tib, tab2)
+ expect_equal(tib, tab3)
+ expect_equal(tib, tab4)
+ expect_equal(tib, tab5)
+}
+
+test_that("feather read/write round trip", {
+ expect_feather_roundtrip(function(x, f) write_feather(x, f, version = 1))
+ expect_feather_roundtrip(function(x, f) write_feather(x, f, version = 2))
+ expect_feather_roundtrip(function(x, f) write_feather(x, f, chunk_size = 32))
+ if (codec_is_available("lz4")) {
+ expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "lz4"))
+ }
+ if (codec_is_available("zstd")) {
+ expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "zstd"))
+ expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "zstd", compression_level = 3))
+ }
+
+ # Write from Arrow data structures
+ expect_feather_roundtrip(function(x, f) write_feather(RecordBatch$create(x), f))
+ expect_feather_roundtrip(function(x, f) write_feather(Table$create(x), f))
+})
+
+test_that("write_feather option error handling", {
+ tf <- tempfile()
+ expect_false(file.exists(tf))
+ expect_error(
+ write_feather(tib, tf, version = 1, chunk_size = 1024),
+ "Feather version 1 does not support the 'chunk_size' option"
+ )
+ expect_error(
+ write_feather(tib, tf, version = 1, compression = "lz4"),
+ "Feather version 1 does not support the 'compression' option"
+ )
+ expect_error(
+ write_feather(tib, tf, version = 1, compression_level = 1024),
+ "Feather version 1 does not support the 'compression_level' option"
+ )
+ expect_error(
+ write_feather(tib, tf, compression_level = 1024),
+ "Can only specify a 'compression_level' when 'compression' is 'zstd'"
+ )
+ expect_match_arg_error(write_feather(tib, tf, compression = "bz2"))
+ expect_false(file.exists(tf))
+})
+
+test_that("write_feather with invalid input type", {
+ bad_input <- Array$create(1:5)
+ expect_error(
+ write_feather(bad_input, feather_file),
+ regexp = "x must be an object of class 'data.frame', 'RecordBatch', or 'Table', not 'Array'."
+ )
+})
+
+test_that("read_feather supports col_select = <names>", {
+ tab1 <- read_feather(feather_file, col_select = c("x", "y"))
+ expect_s3_class(tab1, "data.frame")
+
+ expect_equal(tib$x, tab1$x)
+ expect_equal(tib$y, tab1$y)
+})
+
+test_that("feather handles col_select = <integer>", {
+ tab1 <- read_feather(feather_file, col_select = 1:2)
+ expect_s3_class(tab1, "data.frame")
+
+ expect_equal(tib$x, tab1$x)
+ expect_equal(tib$y, tab1$y)
+})
+
+test_that("feather handles col_select = <tidyselect helper>", {
+ tab1 <- read_feather(feather_file, col_select = everything())
+ expect_identical(tib, tab1)
+
+ tab2 <- read_feather(feather_file, col_select = starts_with("x"))
+ expect_identical(tab2, tib[, "x", drop = FALSE])
+
+ tab3 <- read_feather(feather_file, col_select = c(starts_with("x"), contains("y")))
+ expect_identical(tab3, tib[, c("x", "y"), drop = FALSE])
+
+ tab4 <- read_feather(feather_file, col_select = -z)
+ expect_identical(tab4, tib[, c("x", "y"), drop = FALSE])
+})
+
+test_that("feather read/write round trip", {
+ tab1 <- read_feather(feather_file, as_data_frame = FALSE)
+ expect_r6_class(tab1, "Table")
+
+ expect_equal(tib, as.data.frame(tab1))
+})
+
+test_that("Read feather from raw vector", {
+ test_raw <- readBin(feather_file, what = "raw", n = 5000)
+ df <- read_feather(test_raw)
+ expect_s3_class(df, "data.frame")
+})
+
+test_that("FeatherReader", {
+ v1 <- tempfile()
+ v2 <- tempfile()
+ on.exit({
+ unlink(v1)
+ unlink(v2)
+ })
+ write_feather(tib, v1, version = 1)
+ write_feather(tib, v2)
+ f1 <- make_readable_file(v1)
+ reader1 <- FeatherReader$create(f1)
+ f1$close()
+ expect_identical(reader1$version, 1L)
+ f2 <- make_readable_file(v2)
+ reader2 <- FeatherReader$create(f2)
+ expect_identical(reader2$version, 2L)
+ f2$close()
+})
+
+test_that("read_feather requires RandomAccessFile and errors nicely otherwise (ARROW-8615)", {
+ skip_if_not_available("gzip")
+ expect_error(
+ read_feather(CompressedInputStream$create(feather_file)),
+ 'file must be a "RandomAccessFile"'
+ )
+})
+
+test_that("read_feather closes connection to file", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ write_feather(tib, sink = tf)
+ expect_true(file.exists(tf))
+ read_feather(tf)
+ expect_error(file.remove(tf), NA)
+ expect_false(file.exists(tf))
+})
+
+test_that("Character vectors > 2GB can write to feather", {
+ skip_on_cran()
+ skip_if_not_running_large_memory_tests()
+ df <- tibble::tibble(big = make_big_string())
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ write_feather(df, tf)
+ expect_identical(read_feather(tf), df)
+})
+
+test_that("FeatherReader methods", {
+ # Setup a feather file to use in the test
+ feather_temp <- tempfile()
+ on.exit({
+ unlink(feather_temp)
+ })
+ write_feather(tib, feather_temp)
+ feather_temp_RA <- make_readable_file(feather_temp)
+
+ reader <- FeatherReader$create(feather_temp_RA)
+ feather_temp_RA$close()
+
+ # column_names
+ expect_identical(
+ reader$column_names,
+ c("x", "y", "z")
+ )
+
+ # print method
+ expect_identical(
+ capture.output(print(reader)),
+ # TODO: can we get rows/columns?
+ c("FeatherReader:", "Schema", "x: int32", "y: double", "z: string")
+ )
+})
+
+unlink(feather_file)
+
+ft_file <- test_path("golden-files/data-arrow_2.0.0_lz4.feather")
+
+test_that("Error messages are shown when the compression algorithm lz4 is not found", {
+ msg <- paste0(
+ "NotImplemented: Support for codec 'lz4' not built\nIn order to read this file, ",
+ "you will need to reinstall arrow with additional features enabled.\nSet one of ",
+ "these environment variables before installing:\n\n * LIBARROW_MINIMAL=false ",
+ "(for all optional features, including 'lz4')\n * ARROW_WITH_LZ4=ON (for just 'lz4')",
+ "\n\nSee https://arrow.apache.org/docs/r/articles/install.html for details"
+ )
+
+ if (codec_is_available("lz4")) {
+ d <- read_feather(ft_file)
+ expect_s3_class(d, "data.frame")
+ } else {
+ expect_error(read_feather(ft_file), msg, fixed = TRUE)
+ }
+})
+
+test_that("Error is created when feather reads a parquet file", {
+ expect_error(
+ read_feather(system.file("v0.7.1.parquet", package = "arrow")),
+ "Not a Feather V1 or Arrow IPC file"
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-field.R b/src/arrow/r/tests/testthat/test-field.R
new file mode 100644
index 000000000..1be36c064
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-field.R
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("field() factory", {
+ x <- field("x", int32())
+ expect_equal(x$type, int32())
+ expect_equal(x$name, "x")
+ expect_true(x$nullable)
+ expect_true(x == x)
+ expect_false(x == field("x", int64()))
+})
+
+test_that("Field with nullable values", {
+ x <- field("x", int32(), nullable = FALSE)
+ expect_equal(x$type, int32())
+ expect_false(x$nullable)
+ expect_true(x == x)
+ expect_false(x == field("x", int32()))
+})
+
+test_that("Field validation", {
+ expect_error(schema(b = 32), "b must be a DataType, not numeric")
+})
+
+test_that("Print method for field", {
+ expect_output(print(field("x", int32())), "Field\nx: int32")
+ expect_output(
+ print(field("zz", dictionary())),
+ "Field\nzz: dictionary<values=string, indices=int32>"
+ )
+
+ expect_output(
+ print(field("x", int32(), nullable = FALSE)),
+ "Field\nx: int32 not null"
+ )
+})
+
+test_that("Field to C-interface", {
+ field <- field("x", time32("s"))
+
+ # export the field via the C-interface
+ ptr <- allocate_arrow_schema()
+ field$export_to_c(ptr)
+
+ # then import it and check that the roundtripped value is the same
+ circle <- Field$import_from_c(ptr)
+ expect_equal(circle, field)
+
+ # must clean up the pointer or we leak
+ delete_arrow_schema(ptr)
+})
diff --git a/src/arrow/r/tests/testthat/test-filesystem.R b/src/arrow/r/tests/testthat/test-filesystem.R
new file mode 100644
index 000000000..5ee096f13
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-filesystem.R
@@ -0,0 +1,178 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("LocalFilesystem", {
+ fs <- LocalFileSystem$create()
+ expect_identical(fs$type_name, "local")
+ DESCRIPTION <- system.file("DESCRIPTION", package = "arrow")
+ info <- fs$GetFileInfo(DESCRIPTION)[[1]]
+ expect_equal(info$base_name(), "DESCRIPTION")
+ expect_equal(info$extension(), "")
+ expect_equal(info$type, FileType$File)
+ expect_equal(info$path, DESCRIPTION)
+ info <- file.info(DESCRIPTION)
+
+ expect_equal(info$size, info$size)
+ # This fails due to a subsecond difference on Appveyor on Windows with R 3.3 only
+ # So add a greater tolerance to allow for that
+ expect_equal(info$mtime, info$mtime, tolerance = 1)
+
+ tf <- tempfile(fileext = ".txt")
+ fs$CopyFile(DESCRIPTION, tf)
+ info <- fs$GetFileInfo(tf)[[1]]
+ expect_equal(info$extension(), "txt")
+  expect_equal(info$size, file.size(tf))
+ expect_equal(readLines(DESCRIPTION), readLines(tf))
+
+ tf2 <- tempfile(fileext = ".txt")
+ fs$Move(tf, tf2)
+ infos <- fs$GetFileInfo(c(tf, tf2, dirname(tf)))
+ expect_equal(infos[[1]]$type, FileType$NotFound)
+ expect_equal(infos[[2]]$type, FileType$File)
+ expect_equal(infos[[3]]$type, FileType$Directory)
+
+ fs$DeleteFile(tf2)
+ expect_equal(fs$GetFileInfo(tf2)[[1L]]$type, FileType$NotFound)
+ expect_true(!file.exists(tf2))
+
+ expect_equal(fs$GetFileInfo(tf)[[1L]]$type, FileType$NotFound)
+ expect_true(!file.exists(tf))
+
+ td <- tempfile()
+ fs$CreateDir(td)
+ expect_equal(fs$GetFileInfo(td)[[1L]]$type, FileType$Directory)
+ fs$CopyFile(DESCRIPTION, file.path(td, "DESCRIPTION"))
+ fs$DeleteDirContents(td)
+ expect_equal(length(dir(td)), 0L)
+ fs$DeleteDir(td)
+ expect_equal(fs$GetFileInfo(td)[[1L]]$type, FileType$NotFound)
+
+ tf3 <- tempfile()
+ os <- fs$OpenOutputStream(path = tf3)
+ bytes <- as.raw(1:40)
+ os$write(bytes)
+ os$close()
+
+ is <- fs$OpenInputStream(tf3)
+ buf <- is$Read(40)
+ expect_equal(buf$data(), bytes)
+ is$close()
+})
+
+test_that("SubTreeFilesystem", {
+ dir.create(td <- tempfile())
+ DESCRIPTION <- system.file("DESCRIPTION", package = "arrow")
+ file.copy(DESCRIPTION, file.path(td, "DESCRIPTION"))
+
+ st_fs <- SubTreeFileSystem$create(td)
+ expect_r6_class(st_fs, "SubTreeFileSystem")
+ expect_r6_class(st_fs, "FileSystem")
+ expect_r6_class(st_fs$base_fs, "LocalFileSystem")
+ expect_identical(
+ capture.output(print(st_fs)),
+ paste0("SubTreeFileSystem: ", "file://", st_fs$base_path)
+ )
+
+ # FIXME windows has a trailing slash for one but not the other
+ # expect_identical(normalizePath(st_fs$base_path), normalizePath(td)) # nolint
+
+ st_fs$CreateDir("test")
+ st_fs$CopyFile("DESCRIPTION", "DESC.txt")
+ infos <- st_fs$GetFileInfo(c("DESCRIPTION", "test", "nope", "DESC.txt"))
+ expect_equal(infos[[1L]]$type, FileType$File)
+ expect_equal(infos[[2L]]$type, FileType$Directory)
+ expect_equal(infos[[3L]]$type, FileType$NotFound)
+ expect_equal(infos[[4L]]$type, FileType$File)
+ expect_equal(infos[[4L]]$extension(), "txt")
+
+ local_fs <- LocalFileSystem$create()
+ local_fs$DeleteDirContents(td)
+ infos <- st_fs$GetFileInfo(c("DESCRIPTION", "test", "nope", "DESC.txt"))
+ expect_equal(infos[[1L]]$type, FileType$NotFound)
+ expect_equal(infos[[2L]]$type, FileType$NotFound)
+ expect_equal(infos[[3L]]$type, FileType$NotFound)
+ expect_equal(infos[[4L]]$type, FileType$NotFound)
+})
+
+test_that("LocalFileSystem + Selector", {
+ fs <- LocalFileSystem$create()
+ dir.create(td <- tempfile())
+ writeLines("blah blah", file.path(td, "one.txt"))
+ writeLines("yada yada", file.path(td, "two.txt"))
+ dir.create(file.path(td, "dir"))
+ writeLines("...", file.path(td, "dir", "three.txt"))
+
+ selector <- FileSelector$create(td, recursive = TRUE)
+ infos <- fs$GetFileInfo(selector)
+ expect_equal(length(infos), 4L)
+ types <- sapply(infos, function(.x) .x$type)
+ expect_equal(sum(types == FileType$File), 3L)
+ expect_equal(sum(types == FileType$Directory), 1L)
+
+ selector <- FileSelector$create(td, recursive = FALSE)
+ infos <- fs$GetFileInfo(selector)
+ expect_equal(length(infos), 3L)
+ types <- sapply(infos, function(.x) .x$type)
+ expect_equal(sum(types == FileType$File), 2L)
+ expect_equal(sum(types == FileType$Directory), 1L)
+})
+
+test_that("FileSystem$from_uri", {
+ skip_on_cran()
+ skip_if_not_available("s3")
+ skip_if_offline()
+ fs_and_path <- FileSystem$from_uri("s3://ursa-labs-taxi-data")
+ expect_r6_class(fs_and_path$fs, "S3FileSystem")
+ expect_identical(fs_and_path$fs$region, "us-east-2")
+})
+
+test_that("SubTreeFileSystem$create() with URI", {
+ skip_on_cran()
+ skip_if_not_available("s3")
+ skip_if_offline()
+ fs <- SubTreeFileSystem$create("s3://ursa-labs-taxi-data")
+ expect_r6_class(fs, "SubTreeFileSystem")
+ expect_identical(
+ capture.output(print(fs)),
+ "SubTreeFileSystem: s3://ursa-labs-taxi-data/"
+ )
+})
+
+test_that("S3FileSystem", {
+ skip_on_cran()
+ skip_if_not_available("s3")
+ skip_if_offline()
+ s3fs <- S3FileSystem$create()
+ expect_r6_class(s3fs, "S3FileSystem")
+})
+
+test_that("s3_bucket", {
+ skip_on_cran()
+ skip_if_not_available("s3")
+ skip_if_offline()
+ bucket <- s3_bucket("ursa-labs-r-test")
+ expect_r6_class(bucket, "SubTreeFileSystem")
+ expect_r6_class(bucket$base_fs, "S3FileSystem")
+ expect_identical(bucket$region, "us-west-2")
+ expect_identical(
+ capture.output(print(bucket)),
+ "SubTreeFileSystem: s3://ursa-labs-r-test/"
+ )
+ skip_on_os("windows") # FIXME
+ expect_identical(bucket$base_path, "ursa-labs-r-test/")
+})
diff --git a/src/arrow/r/tests/testthat/test-install-arrow.R b/src/arrow/r/tests/testthat/test-install-arrow.R
new file mode 100644
index 000000000..977f9d77d
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-install-arrow.R
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+r_only({
+ test_that("arrow_repos", {
+ cran <- "https://cloud.r-project.org/"
+ ours <- "https://dl.example.com/ursalabs/fake_repo"
+ other <- "https://cran.fiocruz.br/"
+
+ opts <- list(
+ repos = c(CRAN = "@CRAN@"), # Restore defaul
+ arrow.dev_repo = ours
+ )
+ withr::with_options(opts, {
+ expect_identical(arrow_repos(), cran)
+ expect_identical(arrow_repos(c(cran, ours)), cran)
+ expect_identical(arrow_repos(c(ours, other)), other)
+ expect_identical(arrow_repos(nightly = TRUE), c(ours, cran))
+ expect_identical(arrow_repos(c(cran, ours), nightly = TRUE), c(ours, cran))
+ expect_identical(arrow_repos(c(ours, other), nightly = TRUE), c(ours, other))
+ })
+ })
+})
diff --git a/src/arrow/r/tests/testthat/test-json.R b/src/arrow/r/tests/testthat/test-json.R
new file mode 100644
index 000000000..825511b97
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-json.R
@@ -0,0 +1,255 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("json")
+
+test_that("Can read json file with scalars columns (ARROW-5503)", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ writeLines('
+ { "hello": 3.5, "world": false, "yo": "thing" }
+ { "hello": 3.25, "world": null }
+ { "hello": 3.125, "world": null, "yo": "\u5fcd" }
+ { "hello": 0.0, "world": true, "yo": null }
+ ', tf, useBytes = TRUE)
+
+ tab1 <- read_json_arrow(tf, as_data_frame = FALSE)
+ tab2 <- read_json_arrow(mmap_open(tf), as_data_frame = FALSE)
+ tab3 <- read_json_arrow(ReadableFile$create(tf), as_data_frame = FALSE)
+
+ expect_equal(tab1, tab2)
+ expect_equal(tab1, tab3)
+
+ expect_equal(
+ tab1$schema,
+ schema(hello = float64(), world = boolean(), yo = utf8())
+ )
+ tib <- as.data.frame(tab1)
+ expect_equal(tib$hello, c(3.5, 3.25, 3.125, 0))
+ expect_equal(tib$world, c(FALSE, NA, NA, TRUE))
+ expect_equal(tib$yo, c("thing", NA, "\u5fcd", NA))
+})
+
+test_that("read_json_arrow() converts to tibble", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ writeLines('
+ { "hello": 3.5, "world": false, "yo": "thing" }
+ { "hello": 3.25, "world": null }
+ { "hello": 3.125, "world": null, "yo": "\u5fcd" }
+ { "hello": 0.0, "world": true, "yo": null }
+ ', tf, useBytes = TRUE)
+
+ tab1 <- read_json_arrow(tf)
+ tab2 <- read_json_arrow(mmap_open(tf))
+ tab3 <- read_json_arrow(ReadableFile$create(tf))
+
+ expect_s3_class(tab1, "tbl_df")
+ expect_s3_class(tab2, "tbl_df")
+ expect_s3_class(tab3, "tbl_df")
+
+ expect_equal(tab1, tab2)
+ expect_equal(tab1, tab3)
+
+ expect_equal(tab1$hello, c(3.5, 3.25, 3.125, 0))
+ expect_equal(tab1$world, c(FALSE, NA, NA, TRUE))
+ expect_equal(tab1$yo, c("thing", NA, "\u5fcd", NA))
+})
+
+test_that("read_json_arrow() supports col_select=", {
+ tf <- tempfile()
+ writeLines('
+ { "hello": 3.5, "world": false, "yo": "thing" }
+ { "hello": 3.25, "world": null }
+ { "hello": 3.125, "world": null, "yo": "\u5fcd" }
+ { "hello": 0.0, "world": true, "yo": null }
+ ', tf)
+
+ tab1 <- read_json_arrow(tf, col_select = c(hello, world))
+ expect_equal(names(tab1), c("hello", "world"))
+
+ tab2 <- read_json_arrow(tf, col_select = 1:2)
+ expect_equal(names(tab2), c("hello", "world"))
+})
+
+test_that("read_json_arrow(schema=) with empty schema", {
+ tf <- tempfile()
+ writeLines('
+ { "hello": 3.5, "world": 2, "third_col": 99}
+ { "hello": 3.25, "world": 5, "third_col": 98}
+ { "hello": 3.125, "world": 8, "third_col": 97 }
+ { "hello": 0.0, "world": 10, "third_col": 96}
+ ', tf)
+
+ tab1 <- read_json_arrow(tf, schema = schema())
+
+ expect_identical(
+ tab1,
+ tibble::tibble(
+ hello = c(3.5, 3.25, 3.125, 0),
+ world = c(2L, 5L, 8L, 10L),
+ third_col = c(99L, 98L, 97L, 96L)
+ )
+ )
+})
+
+test_that("read_json_arrow(schema=) with partial schema", {
+ tf <- tempfile()
+ writeLines('
+ { "hello": 3.5, "world": 2, "third_col": 99}
+ { "hello": 3.25, "world": 5, "third_col": 98}
+ { "hello": 3.125, "world": 8, "third_col": 97 }
+ { "hello": 0.0, "world": 10, "third_col": 96}
+ ', tf)
+
+ tab1 <- read_json_arrow(tf, schema = schema(third_col = float64(), world = float64()))
+
+ expect_identical(
+ tab1,
+ tibble::tibble(
+ third_col = c(99, 98, 97, 96),
+ world = c(2, 5, 8, 10),
+ hello = c(3.5, 3.25, 3.125, 0)
+ )
+ )
+
+ tf2 <- tempfile()
+ writeLines('
+ { "hello": 3.5, "world": 2, "third_col": "99"}
+ { "hello": 3.25, "world": 5, "third_col": "98"}
+ { "hello": 3.125, "world": 8, "third_col": "97"}
+ ', tf2)
+
+ tab2 <- read_json_arrow(tf2, schema = schema(third_col = string(), world = float64()))
+
+ expect_identical(
+ tab2,
+ tibble::tibble(
+ third_col = c("99", "98", "97"),
+ world = c(2, 5, 8),
+ hello = c(3.5, 3.25, 3.125)
+ )
+ )
+})
+
+test_that("read_json_arrow(schema=) with full schema", {
+ tf <- tempfile()
+ writeLines('
+ { "hello": 3.5, "world": 2, "third_col": 99}
+ { "hello": 3.25, "world": 5, "third_col": 98}
+ { "hello": 3.125, "world": 8, "third_col": 97}
+ { "hello": 0.0, "world": 10, "third_col": 96}
+ ', tf)
+
+ tab1 <- read_json_arrow(
+ tf,
+ schema = schema(
+ hello = float64(),
+ third_col = float64(),
+ world = float64()
+ )
+ )
+
+ expect_identical(
+ tab1,
+ tibble::tibble(
+ hello = c(3.5, 3.25, 3.125, 0),
+ third_col = c(99, 98, 97, 96),
+ world = c(2, 5, 8, 10)
+ )
+ )
+})
+
+test_that("Can read json file with nested columns (ARROW-5503)", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ writeLines('
+ { "arr": [1.0, 2.0, 3.0], "nuf": {} }
+ { "arr": [2.0], "nuf": null }
+ { "arr": [], "nuf": { "ps": 78.0, "hello": "hi" } }
+ { "arr": null, "nuf": { "ps": 90.0, "hello": "bonjour" } }
+ { "arr": [5.0], "nuf": { "hello": "ciao" } }
+ { "arr": [5.0, 6.0], "nuf": { "ps": 19 } }
+ ', tf)
+
+ tab1 <- read_json_arrow(tf, as_data_frame = FALSE)
+ tab2 <- read_json_arrow(mmap_open(tf), as_data_frame = FALSE)
+ tab3 <- read_json_arrow(ReadableFile$create(tf), as_data_frame = FALSE)
+
+ expect_equal(tab1, tab2)
+ expect_equal(tab1, tab3)
+
+ expect_equal(
+ tab1$schema,
+ schema(
+ arr = list_of(float64()),
+ nuf = struct(ps = float64(), hello = utf8())
+ )
+ )
+
+ struct_array <- tab1$column(1)$chunk(0)
+ ps <- Array$create(c(NA, NA, 78, 90, NA, 19))
+ hello <- Array$create(c(NA, NA, "hi", "bonjour", "ciao", NA))
+ expect_equal(struct_array$field(0L), ps)
+ expect_equal(struct_array$GetFieldByName("ps"), ps)
+ struct_cols <- struct_array$Flatten()
+ expect_identical(length(struct_cols), 2L)
+ expect_equal(struct_cols[[1]], ps)
+ expect_equal(struct_cols[[2]], hello)
+ expect_equal(
+ as.vector(struct_array),
+ tibble::tibble(ps = ps$as_vector(), hello = hello$as_vector())
+ )
+
+ list_array_r <- list(
+ c(1, 2, 3),
+ c(2),
+ numeric(),
+ NULL,
+ 5,
+ c(5, 6)
+ )
+ list_array <- tab1$column(0)
+ expect_equal(
+ list_array$as_vector(),
+ list_array_r,
+ ignore_attr = TRUE
+ )
+
+ tib <- as.data.frame(tab1)
+ expect_equal(
+ tib,
+ tibble::tibble(
+ arr = list_array_r,
+ nuf = tibble::tibble(ps = ps$as_vector(), hello = hello$as_vector())
+ ),
+ ignore_attr = TRUE
+ )
+})
+
+test_that("Can read json file with list<struct<T...>> nested columns (ARROW-7740)", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ writeLines('
+ {"a":[{"b":1.0},{"b":2.0}]}
+ {"a":[{"b":1.0},{"b":2.0}]}
+ ', tf)
+
+ one <- tibble::tibble(b = c(1, 2))
+ expected <- tibble::tibble(a = c(list(one), list(one)))
+ expect_equal(read_json_arrow(tf), expected, ignore_attr = TRUE)
+})
diff --git a/src/arrow/r/tests/testthat/test-memory-pool.R b/src/arrow/r/tests/testthat/test-memory-pool.R
new file mode 100644
index 000000000..0aa18aadc
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-memory-pool.R
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("default_memory_pool and its attributes", {
+ pool <- default_memory_pool()
+ # Not integer bc can be >2gb, so we cast to double
+ expect_type(pool$bytes_allocated, "double")
+ expect_type(pool$max_memory, "double")
+ expect_true(pool$backend_name %in% c("system", "jemalloc", "mimalloc"))
+
+ expect_true(all(supported_memory_backends() %in% c("system", "jemalloc", "mimalloc")))
+})
diff --git a/src/arrow/r/tests/testthat/test-message-reader.R b/src/arrow/r/tests/testthat/test-message-reader.R
new file mode 100644
index 000000000..44f3fe4f7
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-message-reader.R
@@ -0,0 +1,85 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("MessageReader can be created from raw vectors", {
+ batch <- record_batch(x = 1:10)
+ bytes <- batch$serialize()
+
+ reader <- MessageReader$create(bytes)
+
+ message <- reader$ReadNextMessage()
+ expect_r6_class(message, "Message")
+ expect_equal(message$type, MessageType$RECORD_BATCH)
+ expect_r6_class(message$body, "Buffer")
+ expect_r6_class(message$metadata, "Buffer")
+
+ message <- reader$ReadNextMessage()
+ expect_null(message)
+
+ schema <- schema(x = int32())
+ bytes <- schema$serialize()
+
+ reader <- MessageReader$create(bytes)
+
+ message <- reader$ReadNextMessage()
+ expect_r6_class(message, "Message")
+ expect_equal(message$type, MessageType$SCHEMA)
+ expect_r6_class(message$body, "Buffer")
+ expect_r6_class(message$metadata, "Buffer")
+
+ message <- reader$ReadNextMessage()
+ expect_null(message)
+})
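+
+# A consumption sketch using only the methods exercised above: a reader is
+# typically drained by looping until ReadNextMessage() returns NULL.
+test_that("MessageReader can be drained in a loop", {
+  batch <- record_batch(x = 1:10)
+  reader <- MessageReader$create(batch$serialize())
+  n_messages <- 0
+  while (!is.null(reader$ReadNextMessage())) {
+    n_messages <- n_messages + 1
+  }
+  # a serialized record batch yields exactly one message, as asserted above
+  expect_equal(n_messages, 1)
+})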
+
+test_that("MessageReader can be created from input stream", {
+ batch <- record_batch(x = 1:10)
+ bytes <- batch$serialize()
+
+ stream <- BufferReader$create(bytes)
+ expect_r6_class(stream, "BufferReader")
+
+ reader <- MessageReader$create(stream)
+ expect_r6_class(reader, "MessageReader")
+
+ message <- reader$ReadNextMessage()
+ expect_r6_class(message, "Message")
+ expect_equal(message$type, MessageType$RECORD_BATCH)
+ expect_r6_class(message$body, "Buffer")
+ expect_r6_class(message$metadata, "Buffer")
+
+ message <- reader$ReadNextMessage()
+ expect_null(message)
+
+ schema <- schema(x = int32())
+ bytes <- schema$serialize()
+
+ stream <- BufferReader$create(bytes)
+ expect_r6_class(stream, "BufferReader")
+
+ reader <- MessageReader$create(stream)
+ expect_r6_class(reader, "MessageReader")
+
+ message <- reader$ReadNextMessage()
+ expect_r6_class(message, "Message")
+ expect_equal(message$type, MessageType$SCHEMA)
+ expect_r6_class(message$body, "Buffer")
+ expect_r6_class(message$metadata, "Buffer")
+
+ message <- reader$ReadNextMessage()
+ expect_null(message)
+})
diff --git a/src/arrow/r/tests/testthat/test-message.R b/src/arrow/r/tests/testthat/test-message.R
new file mode 100644
index 000000000..c9ee4cb72
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-message.R
@@ -0,0 +1,63 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("read_message can read from input stream", {
+ batch <- record_batch(x = 1:10)
+ bytes <- batch$serialize()
+ stream <- BufferReader$create(bytes)
+
+ message <- read_message(stream)
+ expect_r6_class(message, "Message")
+ expect_equal(message$type, MessageType$RECORD_BATCH)
+ expect_r6_class(message$body, "Buffer")
+ expect_r6_class(message$metadata, "Buffer")
+
+  expect_null(read_message(stream))
+})
+
+test_that("read_message() can read Schema messages", {
+ bytes <- schema(x = int32())$serialize()
+ stream <- BufferReader$create(bytes)
+ message <- read_message(stream)
+
+ expect_r6_class(message, "Message")
+ expect_equal(message$type, MessageType$SCHEMA)
+ expect_r6_class(message$body, "Buffer")
+ expect_r6_class(message$metadata, "Buffer")
+
+  expect_null(read_message(stream))
+})
+
+test_that("read_message() can handle raw vectors", {
+ batch <- record_batch(x = 1:10)
+ bytes <- batch$serialize()
+ stream <- BufferReader$create(bytes)
+
+ message_stream <- read_message(stream)
+ message_raw <- read_message(bytes)
+ expect_equal(message_stream, message_raw)
+
+ bytes <- schema(x = int32())$serialize()
+ stream <- BufferReader$create(bytes)
+ message_stream <- read_message(stream)
+ message_raw <- read_message(bytes)
+
+ expect_equal(message_stream, message_raw)
+})
diff --git a/src/arrow/r/tests/testthat/test-metadata.R b/src/arrow/r/tests/testthat/test-metadata.R
new file mode 100644
index 000000000..4c4d8a767
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-metadata.R
@@ -0,0 +1,369 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("Schema metadata", {
+ s <- schema(b = double())
+ expect_equal(s$metadata, empty_named_list())
+ expect_false(s$HasMetadata)
+ s$metadata <- list(test = TRUE)
+ expect_identical(s$metadata, list(test = "TRUE"))
+ expect_true(s$HasMetadata)
+ s$metadata$foo <- 42
+ expect_identical(s$metadata, list(test = "TRUE", foo = "42"))
+ expect_true(s$HasMetadata)
+ s$metadata$foo <- NULL
+ expect_identical(s$metadata, list(test = "TRUE"))
+ expect_true(s$HasMetadata)
+ s$metadata <- NULL
+ expect_equal(s$metadata, empty_named_list())
+ expect_false(s$HasMetadata)
+ expect_error(
+ s$metadata <- 4,
+ "Key-value metadata must be a named list or character vector"
+ )
+})
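+
+# A worked example of the coercion asserted above: metadata values are stored
+# as strings, so non-character values must be converted back by the caller
+# (a sketch using only behavior shown in the previous test).
+test_that("non-character metadata values come back as strings", {
+  s <- schema(b = double())
+  s$metadata <- list(n = 42)
+  expect_identical(s$metadata$n, "42")
+  expect_identical(as.numeric(s$metadata$n), 42)
+})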
+
+test_that("Table metadata", {
+ tab <- Table$create(x = 1:2, y = c("a", "b"))
+ expect_equal(tab$metadata, empty_named_list())
+ tab$metadata <- list(test = TRUE)
+ expect_identical(tab$metadata, list(test = "TRUE"))
+ tab$metadata$foo <- 42
+ expect_identical(tab$metadata, list(test = "TRUE", foo = "42"))
+ tab$metadata$foo <- NULL
+ expect_identical(tab$metadata, list(test = "TRUE"))
+ tab$metadata <- NULL
+ expect_equal(tab$metadata, empty_named_list())
+})
+
+test_that("Table R metadata", {
+ tab <- Table$create(example_with_metadata)
+ expect_output(print(tab$metadata), "arrow_r_metadata")
+ expect_identical(as.data.frame(tab), example_with_metadata)
+})
+
+test_that("R metadata is not stored for types that map to Arrow types (factor, Date, etc.)", {
+ tab <- Table$create(example_data[1:6])
+ expect_null(tab$metadata$r)
+
+ expect_null(Table$create(example_with_times[1:3])$metadata$r)
+})
+
+test_that("classes are not stored for arrow_binary/arrow_large_binary/arrow_fixed_size_binary (ARROW-14140)", {
+ raws <- charToRaw("bonjour")
+
+ binary <- Array$create(list(raws), binary())
+ large_binary <- Array$create(list(raws), large_binary())
+ fixed_size_binary <- Array$create(list(raws), fixed_size_binary(7L))
+
+ expect_null(RecordBatch$create(b = binary)$metadata$r)
+ expect_null(RecordBatch$create(b = large_binary)$metadata$r)
+ expect_null(RecordBatch$create(b = fixed_size_binary)$metadata$r)
+
+ expect_null(Table$create(b = binary)$metadata$r)
+ expect_null(Table$create(b = large_binary)$metadata$r)
+ expect_null(Table$create(b = fixed_size_binary)$metadata$r)
+})
+
+test_that("Garbage R metadata doesn't break things", {
+ tab <- Table$create(example_data[1:6])
+ tab$metadata$r <- "garbage"
+ expect_warning(
+ expect_identical(as.data.frame(tab), example_data[1:6]),
+ "Invalid metadata$r",
+ fixed = TRUE
+ )
+ # serialize data like .serialize_arrow_r_metadata does, but don't call that
+ # directly since it checks to ensure that the data is a list
+ tab$metadata$r <- rawToChar(serialize("garbage", NULL, ascii = TRUE))
+ expect_warning(
+ expect_identical(as.data.frame(tab), example_data[1:6]),
+ "Invalid metadata$r",
+ fixed = TRUE
+ )
+})
+
+test_that("Metadata serialization compression", {
+  # Attributes that serialize to just under 100kb are not compressed, only
+  # serialized
+ strings <- as.list(rep(make_string_of_size(1), 98))
+ small <- .serialize_arrow_r_metadata(strings)
+ expect_equal(
+ object.size(small),
+ object.size(rawToChar(serialize(strings, NULL, ascii = TRUE)))
+ )
+
+ # Large strings will be compressed
+ large_strings <- as.list(rep(make_string_of_size(1), 100))
+ large <- .serialize_arrow_r_metadata(large_strings)
+ expect_lt(
+ object.size(large),
+ object.size(rawToChar(serialize(large_strings, NULL, ascii = TRUE)))
+ )
+ # and this compression ends up being smaller than even the "small" strings
+ expect_lt(object.size(large), object.size(small))
+
+  # However, strings for which compression is not effective are no worse
+  # than serialization alone
+ large_few_strings <- as.list(rep(make_random_string_of_size(50), 2))
+ large_few <- .serialize_arrow_r_metadata(large_few_strings)
+ expect_equal(
+ object.size(large_few),
+ object.size(rawToChar(serialize(large_few_strings, NULL, ascii = TRUE)))
+ )
+
+ # But we can disable compression
+ op <- options(arrow.compress_metadata = FALSE)
+ on.exit(options(op))
+
+ large_strings <- as.list(rep(make_string_of_size(1), 100))
+ large <- .serialize_arrow_r_metadata(large_strings)
+ expect_equal(
+ object.size(large),
+ object.size(rawToChar(serialize(large_strings, NULL, ascii = TRUE)))
+ )
+})
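+
+# Sanity sketch: the internal helpers used throughout this file are inverses
+# of each other (assumes only .serialize_arrow_r_metadata and
+# .unserialize_arrow_r_metadata, both already exercised here).
+test_that("serialize/unserialize of R metadata round trips", {
+  meta <- list(attributes = list(foo = "bar"), columns = NULL)
+  expect_identical(
+    .unserialize_arrow_r_metadata(.serialize_arrow_r_metadata(meta)),
+    meta
+  )
+})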
+
+test_that("RecordBatch metadata", {
+ rb <- RecordBatch$create(x = 1:2, y = c("a", "b"))
+ expect_equal(rb$metadata, empty_named_list())
+ rb$metadata <- list(test = TRUE)
+ expect_identical(rb$metadata, list(test = "TRUE"))
+ rb$metadata$foo <- 42
+ expect_identical(rb$metadata, list(test = "TRUE", foo = "42"))
+ rb$metadata$foo <- NULL
+ expect_identical(rb$metadata, list(test = "TRUE"))
+ rb$metadata <- NULL
+ expect_equal(rb$metadata, empty_named_list())
+})
+
+test_that("RecordBatch R metadata", {
+ expect_identical(as.data.frame(record_batch(example_with_metadata)), example_with_metadata)
+})
+
+test_that("R metadata roundtrip via parquet", {
+ skip_if_not_available("parquet")
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write_parquet(example_with_metadata, tf)
+ expect_identical(read_parquet(tf), example_with_metadata)
+})
+
+test_that("R metadata roundtrip via feather", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write_feather(example_with_metadata, tf)
+ expect_identical(read_feather(tf), example_with_metadata)
+})
+
+test_that("haven types roundtrip via feather", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write_feather(haven_data, tf)
+ expect_identical(read_feather(tf), haven_data)
+})
+
+test_that("Date/time type roundtrip", {
+ rb <- record_batch(example_with_times)
+ expect_r6_class(rb$schema$posixlt$type, "StructType")
+ expect_identical(as.data.frame(rb), example_with_times)
+})
+
+test_that("metadata keeps attribute of top level data frame", {
+ df <- structure(data.frame(x = 1, y = 2), foo = "bar")
+ tab <- Table$create(df)
+ expect_identical(attr(as.data.frame(tab), "foo"), "bar")
+ expect_identical(as.data.frame(tab), df)
+})
+
+
+test_that("metadata drops readr's problems attribute", {
+ readr_like <- tibble::tibble(
+ dbl = 1.1,
+ not_here = NA_character_
+ )
+ attributes(readr_like) <- append(
+ attributes(readr_like),
+ list(problems = tibble::tibble(
+ row = 1L,
+ col = NA_character_,
+ expected = "2 columns",
+ actual = "1 columns",
+ file = "'test'"
+ ))
+ )
+
+ tab <- Table$create(readr_like)
+ expect_null(attr(as.data.frame(tab), "problems"))
+})
+
+test_that("Row-level metadata (does not by default) roundtrip", {
+ # First tracked at ARROW-10386, though it was later determined that row-level
+ # metadata should be handled separately ARROW-14020, ARROW-12542
+ df <- data.frame(x = I(list(structure(1, foo = "bar"), structure(2, baz = "qux"))))
+ tab <- Table$create(df)
+ r_metadata <- tab$r_metadata
+ expect_type(r_metadata, "list")
+ expect_null(r_metadata$columns$x$columns)
+
+ # But we can re-enable this / read data that has already been written with
+ # row-level metadata
+ withr::with_options(
+ list("arrow.preserve_row_level_metadata" = TRUE), {
+ tab <- Table$create(df)
+ expect_identical(attr(as.data.frame(tab)$x[[1]], "foo"), "bar")
+ expect_identical(attr(as.data.frame(tab)$x[[2]], "baz"), "qux")
+ })
+})
+
+
+test_that("Row-level metadata (does not) roundtrip in datasets", {
+ # First tracked at ARROW-10386, though it was later determined that row-level
+ # metadata should be handled separately ARROW-14020, ARROW-12542
+ skip_if_not_available("dataset")
+ skip_if_not_available("parquet")
+
+ library(dplyr, warn.conflicts = FALSE)
+
+ df <- tibble::tibble(
+ metadata = list(
+ structure(1, my_value_as_attr = 1),
+ structure(2, my_value_as_attr = 2),
+ structure(3, my_value_as_attr = 3),
+ structure(4, my_value_as_attr = 3)
+ ),
+ int = 1L:4L,
+ part = c(1, 3, 2, 1)
+ )
+
+ dst_dir <- make_temp_dir()
+
+ withr::with_options(
+ list("arrow.preserve_row_level_metadata" = TRUE), {
+ expect_warning(
+ write_dataset(df, dst_dir, partitioning = "part"),
+ "Row-level metadata is not compatible with datasets and will be discarded"
+ )
+
+ # Reset directory as previous write will have created some files and the default
+ # behavior is to error on existing
+ dst_dir <- make_temp_dir()
+      # But we still need a dataset with row-level metadata on disk to check
+      # that reading such data warns appropriately. Calling write_dataset
+      # through an alias presumably sidesteps the warning above (which appears
+      # to match on the calling function's name), so the metadata is kept.
+      fake_func_name <- write_dataset
+      fake_func_name(df, dst_dir, partitioning = "part")
+
+ ds <- open_dataset(dst_dir)
+ expect_warning(
+ df_from_ds <- collect(ds),
+ "Row-level metadata is not compatible with this operation and has been ignored"
+ )
+ expect_equal(
+ arrange(df_from_ds, int),
+ arrange(df, int),
+ ignore_attr = TRUE
+ )
+
+ # however there is *no* warning if we don't select the metadata column
+ expect_warning(
+ df_from_ds <- ds %>% select(int) %>% collect(),
+ NA
+ )
+ })
+})
+
+test_that("When we encounter SF cols, we warn", {
+ df <- data.frame(x = I(list(structure(1, foo = "bar"), structure(2, baz = "qux"))))
+ class(df$x) <- c("sfc_MULTIPOLYGON", "sfc", "list")
+
+ expect_warning(
+ tab <- Table$create(df),
+ "One of the columns given appears to be an"
+ )
+
+ # but the table was read fine, just sans (row-level) metadata
+ r_metadata <- .unserialize_arrow_r_metadata(tab$metadata$r)
+ expect_null(r_metadata$columns$x$columns)
+
+ # But we can re-enable this / read data that has already been written with
+ # row-level metadata without a warning
+ withr::with_options(
+ list("arrow.preserve_row_level_metadata" = TRUE), {
+ expect_warning(tab <- Table$create(df), NA)
+ expect_identical(attr(as.data.frame(tab)$x[[1]], "foo"), "bar")
+ expect_identical(attr(as.data.frame(tab)$x[[2]], "baz"), "qux")
+ })
+})
+
+test_that("dplyr with metadata", {
+ skip_if_not_available("dataset")
+
+ compare_dplyr_binding(
+ .input %>%
+ collect(),
+ example_with_metadata
+ )
+ compare_dplyr_binding(
+ .input %>%
+ select(a) %>%
+ collect(),
+ example_with_metadata
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(z = b * 4) %>%
+ select(z, a) %>%
+ collect(),
+ example_with_metadata
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(z = nchar(a)) %>%
+ select(z, a) %>%
+ collect(),
+ example_with_metadata
+ )
+ # dplyr drops top-level attributes if you do summarize, though attributes
+ # of grouping columns appear to come through
+ compare_dplyr_binding(
+ .input %>%
+ group_by(a) %>%
+ summarize(n()) %>%
+ collect(),
+ example_with_metadata
+ )
+ # Same name in output but different data, so the column metadata shouldn't
+ # carry through
+ compare_dplyr_binding(
+ .input %>%
+ mutate(a = nchar(a)) %>%
+ select(a) %>%
+ collect(),
+ example_with_metadata
+ )
+})
+
+test_that("grouped_df metadata is recorded (efficiently)", {
+ grouped <- group_by(tibble(a = 1:2, b = 3:4), a)
+ expect_s3_class(grouped, "grouped_df")
+ grouped_tab <- Table$create(grouped)
+ expect_r6_class(grouped_tab, "Table")
+ expect_equal(grouped_tab$r_metadata$attributes$.group_vars, "a")
+})
diff --git a/src/arrow/r/tests/testthat/test-na-omit.R b/src/arrow/r/tests/testthat/test-na-omit.R
new file mode 100644
index 000000000..fafebb4ff
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-na-omit.R
@@ -0,0 +1,94 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+data_no_na <- 2:10
+data_na <- c(data_no_na, NA_real_)
+
+test_that("na.fail on Scalar", {
+ scalar_na <- Scalar$create(NA)
+ scalar_one <- Scalar$create(1)
+ expect_as_vector(na.fail(scalar_one), 1)
+ expect_error(na.fail(scalar_na), "missing values in object")
+})
+
+test_that("na.omit on Array and ChunkedArray", {
+ compare_expression(na.omit(.input), data_no_na)
+ compare_expression(na.omit(.input), data_na, ignore_attr = TRUE)
+})
+
+test_that("na.exclude on Array and ChunkedArray", {
+ compare_expression(na.exclude(.input), data_no_na)
+ compare_expression(na.exclude(.input), data_na, ignore_attr = TRUE)
+})
+
+test_that("na.fail on Array and ChunkedArray", {
+ compare_expression(na.fail(.input), data_no_na, ignore_attr = TRUE)
+ compare_expression_error(na.fail(.input), data_na)
+})
+
+test_that("na.fail on Scalar", {
+ scalar_one <- Scalar$create(1)
+ expect_error(na.fail(scalar_na), regexp = "missing values in object")
+ expect_as_vector(na.fail(scalar_one), na.fail(1))
+})
+
+test_that("na.omit on Table", {
+ tbl <- Table$create(example_data)
+ expect_equal(
+ as.data.frame(na.omit(tbl)),
+ na.omit(example_data),
+ # We don't include an attribute with the rows omitted
+ ignore_attr = "na.action"
+ )
+})
+
+test_that("na.exclude on Table", {
+ tbl <- Table$create(example_data)
+ expect_equal(
+ as.data.frame(na.exclude(tbl)),
+ na.exclude(example_data),
+ ignore_attr = "na.action"
+ )
+})
+
+test_that("na.fail on Table", {
+ tbl <- Table$create(example_data)
+ expect_error(na.fail(tbl), "missing values in object")
+})
+
+test_that("na.omit on RecordBatch", {
+ batch <- record_batch(example_data)
+ expect_equal(
+ as.data.frame(na.omit(batch)),
+ na.omit(example_data),
+ ignore_attr = "na.action"
+ )
+})
+
+test_that("na.exclude on RecordBatch", {
+ batch <- record_batch(example_data)
+ expect_equal(
+ as.data.frame(na.exclude(batch)),
+    na.exclude(example_data),
+ ignore_attr = "na.action"
+ )
+})
+
+test_that("na.fail on RecordBatch", {
+ batch <- record_batch(example_data)
+ expect_error(na.fail(batch), "missing values in object")
+})
diff --git a/src/arrow/r/tests/testthat/test-parquet.R b/src/arrow/r/tests/testthat/test-parquet.R
new file mode 100644
index 000000000..55d86b532
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-parquet.R
@@ -0,0 +1,274 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("parquet")
+
+pq_file <- system.file("v0.7.1.parquet", package = "arrow")
+
+test_that("reading a known Parquet file to tibble", {
+ skip_if_not_available("snappy")
+ df <- read_parquet(pq_file)
+ expect_true(tibble::is_tibble(df))
+ expect_identical(dim(df), c(10L, 11L))
+ # TODO: assert more about the contents
+})
+
+test_that("simple int column roundtrip", {
+ df <- tibble::tibble(x = 1:5)
+  pq_tmp_file <- tempfile() # A ".parquet" extension could be given here, but it is not required
+
+ write_parquet(df, pq_tmp_file)
+ df_read <- read_parquet(pq_tmp_file)
+ expect_equal(df, df_read)
+ # Make sure file connection is cleaned up
+ expect_error(file.remove(pq_tmp_file), NA)
+ expect_false(file.exists(pq_tmp_file))
+})
+
+test_that("read_parquet() supports col_select", {
+ skip_if_not_available("snappy")
+ df <- read_parquet(pq_file, col_select = c(x, y, z))
+ expect_equal(names(df), c("x", "y", "z"))
+
+ df <- read_parquet(pq_file, col_select = starts_with("c"))
+ expect_equal(names(df), c("carat", "cut", "color", "clarity"))
+})
+
+test_that("read_parquet() with raw data", {
+ skip_if_not_available("snappy")
+ test_raw <- readBin(pq_file, what = "raw", n = 5000)
+ df <- read_parquet(test_raw)
+ expect_identical(dim(df), c(10L, 11L))
+})
+
+test_that("write_parquet() handles various compression= specs", {
+ skip_if_not_available("snappy")
+ tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5)
+
+ expect_parquet_roundtrip(tab, compression = "snappy")
+ expect_parquet_roundtrip(tab, compression = rep("snappy", 3L))
+ expect_parquet_roundtrip(tab, compression = c(x1 = "snappy", x2 = "snappy"))
+})
+
+test_that("write_parquet() handles various compression_level= specs", {
+ skip_if_not_available("gzip")
+ tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5)
+
+ expect_parquet_roundtrip(tab, compression = "gzip", compression_level = 4)
+ expect_parquet_roundtrip(tab, compression = "gzip", compression_level = rep(4L, 3L))
+ expect_parquet_roundtrip(tab, compression = "gzip", compression_level = c(x1 = 5L, x2 = 3L))
+})
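+
+# A combined sketch of the two spec styles above (nothing assumed beyond the
+# helpers already used in this file): name columns to configure them
+# individually; unnamed columns fall back to the writer's defaults.
+test_that("compression and compression_level can be combined per column", {
+  skip_if_not_available("gzip")
+  tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5)
+  expect_parquet_roundtrip(
+    tab,
+    compression = c(x1 = "gzip", x2 = "gzip"),
+    compression_level = c(x1 = 9L, x2 = 1L)
+  )
+})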
+
+test_that("write_parquet() handles various use_dictionary= specs", {
+ tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5)
+
+ expect_parquet_roundtrip(tab, use_dictionary = TRUE)
+ expect_parquet_roundtrip(tab, use_dictionary = c(TRUE, FALSE, TRUE))
+ expect_parquet_roundtrip(tab, use_dictionary = c(x1 = TRUE, x2 = TRUE))
+ expect_error(
+ write_parquet(tab, tempfile(), use_dictionary = c(TRUE, FALSE)),
+ "unsupported use_dictionary= specification"
+ )
+ expect_error(
+ write_parquet(tab, tempfile(), use_dictionary = 12),
+ "is.logical(use_dictionary) is not TRUE",
+ fixed = TRUE
+ )
+})
+
+test_that("write_parquet() handles various write_statistics= specs", {
+ tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5)
+
+ expect_parquet_roundtrip(tab, write_statistics = TRUE)
+ expect_parquet_roundtrip(tab, write_statistics = c(TRUE, FALSE, TRUE))
+ expect_parquet_roundtrip(tab, write_statistics = c(x1 = TRUE, x2 = TRUE))
+})
+
+test_that("write_parquet() accepts RecordBatch too", {
+ batch <- RecordBatch$create(x1 = 1:5, x2 = 1:5, y = 1:5)
+ tab <- parquet_roundtrip(batch)
+ expect_equal(tab, Table$create(batch))
+})
+
+test_that("write_parquet() handles grouped_df", {
+ library(dplyr, warn.conflicts = FALSE)
+ df <- tibble::tibble(a = 1:4, b = 5) %>% group_by(b)
+ # Since `df` is a "grouped_df", this test asserts that we get a grouped_df back
+ expect_parquet_roundtrip(df, as_data_frame = TRUE)
+})
+
+test_that("write_parquet() with invalid input type", {
+ bad_input <- Array$create(1:5)
+ expect_error(
+ write_parquet(bad_input, tempfile()),
+ regexp = "x must be an object of class 'data.frame', 'RecordBatch', or 'Table', not 'Array'."
+ )
+})
+
+test_that("write_parquet() can truncate timestamps", {
+ tab <- Table$create(x1 = as.POSIXct("2020/06/03 18:00:00", tz = "UTC"))
+ expect_type_equal(tab$x1, timestamp("us", "UTC"))
+
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write_parquet(tab, tf, coerce_timestamps = "ms", allow_truncated_timestamps = TRUE)
+ new <- read_parquet(tf, as_data_frame = FALSE)
+ expect_type_equal(new$x1, timestamp("ms", "UTC"))
+ expect_equal(as.data.frame(tab), as.data.frame(new))
+})
+
+test_that("make_valid_version()", {
+ expect_equal(make_valid_version("1.0"), ParquetVersionType$PARQUET_1_0)
+ expect_equal(make_valid_version("2.0"), ParquetVersionType$PARQUET_2_0)
+
+ expect_equal(make_valid_version(1), ParquetVersionType$PARQUET_1_0)
+ expect_equal(make_valid_version(2), ParquetVersionType$PARQUET_2_0)
+
+ expect_equal(make_valid_version(1.0), ParquetVersionType$PARQUET_1_0)
+ expect_equal(make_valid_version(2.0), ParquetVersionType$PARQUET_2_0)
+})
+
+test_that("write_parquet() defaults to snappy compression", {
+ skip_if_not_available("snappy")
+ tmp1 <- tempfile()
+ tmp2 <- tempfile()
+ write_parquet(mtcars, tmp1)
+ write_parquet(mtcars, tmp2, compression = "snappy")
+ expect_equal(file.size(tmp1), file.size(tmp2))
+})
+
+test_that("Factors are preserved when writing/reading from Parquet", {
+ fct <- factor(c("a", "b"), levels = c("c", "a", "b"))
+ ord <- factor(c("a", "b"), levels = c("c", "a", "b"), ordered = TRUE)
+ chr <- c("a", "b")
+ df <- tibble::tibble(fct = fct, ord = ord, chr = chr)
+
+ pq_tmp_file <- tempfile()
+ on.exit(unlink(pq_tmp_file))
+
+ write_parquet(df, pq_tmp_file)
+ df_read <- read_parquet(pq_tmp_file)
+ expect_equal(df, df_read)
+})
+
+test_that("Lists are preserved when writing/reading from Parquet", {
+ bool <- list(logical(0), NA, c(TRUE, FALSE))
+ int <- list(integer(0), NA_integer_, 1:4)
+ num <- list(numeric(0), NA_real_, c(1, 2))
+ char <- list(character(0), NA_character_, c("itsy", "bitsy"))
+ df <- tibble::tibble(bool = bool, int = int, num = num, char = char)
+
+ pq_tmp_file <- tempfile()
+ on.exit(unlink(pq_tmp_file))
+
+ write_parquet(df, pq_tmp_file)
+ df_read <- read_parquet(pq_tmp_file)
+ expect_equal(df, df_read, ignore_attr = TRUE)
+})
+
+test_that("write_parquet() to stream", {
+ df <- tibble::tibble(x = 1:5)
+ tf <- tempfile()
+ con <- FileOutputStream$create(tf)
+ on.exit(unlink(tf))
+ write_parquet(df, con)
+ con$close()
+ expect_equal(read_parquet(tf), df)
+})
+
+test_that("write_parquet() returns its input", {
+ df <- tibble::tibble(x = 1:5)
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ df_out <- write_parquet(df, tf)
+ expect_equal(df, df_out)
+})
+
+test_that("write_parquet() handles version argument", {
+ df <- tibble::tibble(x = 1:5)
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ purrr::walk(list("1.0", "2.0", 1.0, 2.0, 1L, 2L), ~ {
+ write_parquet(df, tf, version = .x)
+ expect_identical(read_parquet(tf), df)
+ })
+ purrr::walk(list("3.0", 3.0, 3L, "A"), ~ {
+ expect_error(write_parquet(df, tf, version = .x))
+ })
+})
+
+test_that("ParquetFileWriter raises an error for non-OutputStream sink", {
+ sch <- schema(a = float32())
+ # ARROW-9946
+ expect_error(
+ ParquetFileWriter$create(schema = sch, sink = tempfile()),
+ regex = "OutputStream"
+ )
+})
+
+test_that("ParquetFileReader $ReadRowGroup(s) methods", {
+ tab <- Table$create(x = 1:100)
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ write_parquet(tab, tf, chunk_size = 10)
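+  # chunk_size = 10 over 100 rows yields 10 row groups, addressed below as 0..9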
+
+ reader <- ParquetFileReader$create(tf)
+ expect_true(reader$ReadRowGroup(0) == Table$create(x = 1:10))
+ expect_true(reader$ReadRowGroup(9) == Table$create(x = 91:100))
+ expect_error(reader$ReadRowGroup(-1), "Some index in row_group_indices")
+ expect_error(reader$ReadRowGroup(111), "Some index in row_group_indices")
+ expect_error(reader$ReadRowGroup(c(1, 2)))
+ expect_error(reader$ReadRowGroup("a"))
+
+ expect_true(reader$ReadRowGroups(c(0, 1)) == Table$create(x = 1:20))
+  expect_error(reader$ReadRowGroups(c(0, 1, -2))) # errors, though the message is unhelpful
+  expect_error(reader$ReadRowGroups(c(0, 1, 31))) # same here
+ expect_error(reader$ReadRowGroups(c("a", "b")))
+
+ ## -- with column_indices
+ expect_true(reader$ReadRowGroup(0, 0) == Table$create(x = 1:10))
+ expect_error(reader$ReadRowGroup(0, 1))
+
+ expect_true(reader$ReadRowGroups(c(0, 1), 0) == Table$create(x = 1:20))
+ expect_error(reader$ReadRowGroups(c(0, 1), 1))
+})
+
+test_that("Error messages are shown when the compression algorithm snappy is not found", {
+ msg <- paste0(
+ "NotImplemented: Support for codec 'snappy' not built\nIn order to read this file, ",
+ "you will need to reinstall arrow with additional features enabled.\nSet one of these ",
+ "environment variables before installing:\n\n * LIBARROW_MINIMAL=false (for all optional ",
+ "features, including 'snappy')\n * ARROW_WITH_SNAPPY=ON (for just 'snappy')\n\n",
+ "See https://arrow.apache.org/docs/r/articles/install.html for details"
+ )
+
+ if (codec_is_available("snappy")) {
+ d <- read_parquet(pq_file)
+ expect_s3_class(d, "data.frame")
+ } else {
+ expect_error(read_parquet(pq_file), msg, fixed = TRUE)
+ }
+})
+
+test_that("Error is created when parquet reads a feather file", {
+ expect_error(
+ read_parquet(test_path("golden-files/data-arrow_2.0.0_lz4.feather")),
+ "Parquet magic bytes not found in footer"
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-python-flight.R b/src/arrow/r/tests/testthat/test-python-flight.R
new file mode 100644
index 000000000..c87f3a562
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-python-flight.R
@@ -0,0 +1,62 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Assumes:
+# * We've already done arrow::install_pyarrow()
+# * R -e 'arrow::load_flight_server("demo_flight_server")$DemoFlightServer(port = 8089)$serve()'
+# TODO: set up CI job to test this, or some way of running a background process
+if (process_is_running("demo_flight_server")) {
+ client <- flight_connect(port = 8089)
+ flight_obj <- tempfile()
+
+ test_that("flight_path_exists", {
+ expect_false(flight_path_exists(client, flight_obj))
+ expect_false(flight_obj %in% list_flights(client))
+ })
+
+ test_that("flight_put", {
+ flight_put(client, example_data, path = flight_obj)
+ expect_true(flight_path_exists(client, flight_obj))
+ expect_true(flight_obj %in% list_flights(client))
+ })
+
+ test_that("flight_get", {
+ expect_identical(as.data.frame(flight_get(client, flight_obj)), example_data)
+ })
+
+ test_that("flight_put with RecordBatch", {
+ flight_obj2 <- tempfile()
+ flight_put(client, RecordBatch$create(example_data), path = flight_obj2)
+ expect_identical(as.data.frame(flight_get(client, flight_obj2)), example_data)
+ })
+
+ test_that("flight_put with overwrite = FALSE", {
+ expect_error(
+ flight_put(client, example_with_times, path = flight_obj, overwrite = FALSE),
+ "exists"
+ )
+ # Default is TRUE so this will overwrite
+ flight_put(client, example_with_times, path = flight_obj)
+ expect_identical(as.data.frame(flight_get(client, flight_obj)), example_with_times)
+ })
+} else {
+  # Register a skipped test so the suite records that these tests did not run
+ test_that("Flight tests", {
+ skip("Flight server is not running")
+ })
+}
diff --git a/src/arrow/r/tests/testthat/test-python.R b/src/arrow/r/tests/testthat/test-python.R
new file mode 100644
index 000000000..5ad7513fb
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-python.R
@@ -0,0 +1,145 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("install_pyarrow", {
+ skip_on_cran()
+ skip_if_offline()
+ skip_if_not_dev_mode()
+ # Windows CI machine doesn't pick up the right python or something
+ skip_on_os("windows")
+ skip_if_not_installed("reticulate")
+
+ venv <- try(reticulate::virtualenv_create("arrow-test"))
+ # Bail out if virtualenv isn't available
+ skip_if(inherits(venv, "try-error"))
+ expect_error(install_pyarrow("arrow-test", nightly = TRUE), NA)
+ # Set this up for the following tests
+ reticulate::use_virtualenv("arrow-test")
+})
+
+skip_if_no_pyarrow()
+
+test_that("Array from Python", {
+ pa <- reticulate::import("pyarrow")
+ py <- pa$array(c(1, 2, 3))
+ expect_equal(py, Array$create(c(1, 2, 3)))
+})
+
+test_that("Array to Python", {
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ r <- Array$create(c(1, 2, 3))
+ py <- pa$concat_arrays(list(r))
+ expect_s3_class(py, "pyarrow.lib.Array")
+ expect_equal(reticulate::py_to_r(py), r)
+})
+
+test_that("RecordBatch to/from Python", {
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ batch <- record_batch(col1 = c(1, 2, 3), col2 = letters[1:3])
+ py <- reticulate::r_to_py(batch)
+ expect_s3_class(py, "pyarrow.lib.RecordBatch")
+ expect_equal(reticulate::py_to_r(py), batch)
+})
+
+test_that("Table and ChunkedArray from Python", {
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ batch <- record_batch(col1 = c(1, 2, 3), col2 = letters[1:3])
+ tab <- Table$create(batch, batch)
+ pybatch <- reticulate::r_to_py(batch)
+ pytab <- pa$Table$from_batches(list(pybatch, pybatch))
+ expect_s3_class(pytab, "pyarrow.lib.Table")
+ expect_s3_class(pytab[0], "pyarrow.lib.ChunkedArray")
+ expect_equal(reticulate::py_to_r(pytab[0]), tab$col1)
+ expect_equal(reticulate::py_to_r(pytab), tab)
+})
+
+test_that("Table and ChunkedArray to Python", {
+ batch <- record_batch(col1 = c(1, 2, 3), col2 = letters[1:3])
+ tab <- Table$create(batch, batch)
+
+ pychunked <- reticulate::r_to_py(tab$col1)
+ expect_s3_class(pychunked, "pyarrow.lib.ChunkedArray")
+ expect_equal(reticulate::py_to_r(pychunked), tab$col1)
+
+ pytab <- reticulate::r_to_py(tab)
+ expect_s3_class(pytab, "pyarrow.lib.Table")
+ expect_equal(reticulate::py_to_r(pytab), tab)
+})
+
+test_that("RecordBatch with metadata roundtrip", {
+ batch <- RecordBatch$create(example_with_times)
+ pybatch <- reticulate::r_to_py(batch)
+ expect_s3_class(pybatch, "pyarrow.lib.RecordBatch")
+ expect_equal(reticulate::py_to_r(pybatch), batch)
+ expect_identical(as.data.frame(reticulate::py_to_r(pybatch)), example_with_times)
+})
+
+test_that("Table with metadata roundtrip", {
+ tab <- Table$create(example_with_times)
+ pytab <- reticulate::r_to_py(tab)
+ expect_s3_class(pytab, "pyarrow.lib.Table")
+ expect_equal(reticulate::py_to_r(pytab), tab)
+ expect_identical(as.data.frame(reticulate::py_to_r(pytab)), example_with_times)
+})
+
+test_that("DataType roundtrip", {
+ r <- timestamp("ms", timezone = "Pacific/Marquesas")
+ py <- reticulate::r_to_py(r)
+ expect_s3_class(py, "pyarrow.lib.DataType")
+ expect_equal(reticulate::py_to_r(py), r)
+})
+
+test_that("Field roundtrip", {
+ r <- field("x", time32("s"))
+ py <- reticulate::r_to_py(r)
+ expect_s3_class(py, "pyarrow.lib.Field")
+ expect_equal(reticulate::py_to_r(py), r)
+})
+
+test_that("RecordBatchReader to python", {
+  library(dplyr, warn.conflicts = FALSE)
+
+ tab <- Table$create(example_data)
+ scan <- tab %>%
+ select(int, lgl) %>%
+ filter(int > 6) %>%
+ Scanner$create()
+ reader <- scan$ToRecordBatchReader()
+ pyreader <- reticulate::r_to_py(reader)
+ expect_s3_class(pyreader, "pyarrow.lib.RecordBatchReader")
+ pytab <- pyreader$read_all()
+ expect_s3_class(pytab, "pyarrow.lib.Table")
+ back_to_r <- reticulate::py_to_r(pytab)
+ expect_r6_class(back_to_r, "Table")
+ expect_identical(
+ as.data.frame(back_to_r),
+ example_data %>%
+ select(int, lgl) %>%
+ filter(int > 6)
+ )
+})
+
+test_that("RecordBatchReader from python", {
+ tab <- Table$create(example_data)
+ scan <- Scanner$create(tab)
+ reader <- scan$ToRecordBatchReader()
+ pyreader <- reticulate::r_to_py(reader)
+ back_to_r <- reticulate::py_to_r(pyreader)
+ rt_table <- back_to_r$read_table()
+ expect_r6_class(rt_table, "Table")
+ expect_identical(as.data.frame(rt_table), example_data)
+})
diff --git a/src/arrow/r/tests/testthat/test-read-record-batch.R b/src/arrow/r/tests/testthat/test-read-record-batch.R
new file mode 100644
index 000000000..ba109da6c
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-read-record-batch.R
@@ -0,0 +1,78 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("RecordBatchFileWriter / RecordBatchFileReader roundtrips", {
+ tab <- Table$create(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10]
+ )
+
+ tf <- tempfile()
+ expect_error(
+ RecordBatchFileWriter$create(tf, tab$schema),
+ "RecordBatchFileWriter$create() requires an Arrow InputStream. Try providing FileOutputStream$create(tf)",
+ fixed = TRUE
+ )
+
+ stream <- FileOutputStream$create(tf)
+ writer <- RecordBatchFileWriter$create(stream, tab$schema)
+ expect_r6_class(writer, "RecordBatchWriter")
+ writer$write_table(tab)
+ writer$close()
+ stream$close()
+
+ expect_equal(read_feather(tf, as_data_frame = FALSE), tab)
+ # Make sure connections are closed
+ expect_error(file.remove(tf), NA)
+ skip_on_os("windows") # This should pass, we've closed the stream
+ expect_false(file.exists(tf))
+})
+
+test_that("record_batch() handles (raw|Buffer|InputStream, Schema) (ARROW-3450, ARROW-3505)", {
+ tbl <- tibble::tibble(
+ int = 1:10, dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10]
+ )
+ batch <- record_batch(!!!tbl)
+ schema <- batch$schema
+
+ raw <- batch$serialize()
+ batch2 <- record_batch(raw, schema = schema)
+ batch3 <- record_batch(buffer(raw), schema = schema)
+  stream <- BufferReader$create(raw)
+  batch4 <- record_batch(stream, schema = schema)
+  stream$close()
+
+  expect_equal(batch, batch2)
+  expect_equal(batch, batch3)
+  expect_equal(batch, batch4)
+})
+
+test_that("record_batch() can handle (Message, Schema) parameters (ARROW-3499)", {
+ batch <- record_batch(x = 1:10)
+ schema <- batch$schema
+
+ raw <- batch$serialize()
+ stream <- BufferReader$create(raw)
+
+ message <- read_message(stream)
+ batch2 <- record_batch(message, schema = schema)
+ expect_equal(batch, batch2)
+ stream$close()
+})
diff --git a/src/arrow/r/tests/testthat/test-read-write.R b/src/arrow/r/tests/testthat/test-read-write.R
new file mode 100644
index 000000000..66f6db56d
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-read-write.R
@@ -0,0 +1,125 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("table round trip", {
+ tbl <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ raw = as.raw(1:10)
+ )
+
+ tab <- Table$create(!!!tbl)
+ expect_equal(tab$num_columns, 3L)
+ expect_equal(tab$num_rows, 10L)
+
+ # ChunkedArray
+ chunked_array_int <- tab$column(0)
+ expect_equal(chunked_array_int$length(), 10L)
+ expect_equal(chunked_array_int$null_count, 0L)
+ expect_equal(chunked_array_int$as_vector(), tbl$int)
+
+ # Array
+ chunks_int <- chunked_array_int$chunks
+ expect_equal(length(chunks_int), chunked_array_int$num_chunks)
+ for (i in seq_along(chunks_int)) {
+ expect_equal(chunked_array_int$chunk(i - 1L), chunks_int[[i]])
+ }
+
+ # ChunkedArray
+ chunked_array_dbl <- tab$column(1)
+ expect_equal(chunked_array_dbl$length(), 10L)
+ expect_equal(chunked_array_dbl$null_count, 0L)
+ expect_equal(chunked_array_dbl$as_vector(), tbl$dbl)
+
+ # Array
+ chunks_dbl <- chunked_array_dbl$chunks
+ expect_equal(length(chunks_dbl), chunked_array_dbl$num_chunks)
+ for (i in seq_along(chunks_dbl)) {
+ expect_equal(chunked_array_dbl$chunk(i - 1L), chunks_dbl[[i]])
+ }
+
+ # ChunkedArray
+ chunked_array_raw <- tab$column(2)
+ expect_equal(chunked_array_raw$length(), 10L)
+ expect_equal(chunked_array_raw$null_count, 0L)
+ expect_equal(chunked_array_raw$as_vector(), as.integer(tbl$raw))
+
+ # Array
+ chunks_raw <- chunked_array_raw$chunks
+ expect_equal(length(chunks_raw), chunked_array_raw$num_chunks)
+ for (i in seq_along(chunks_raw)) {
+ expect_equal(chunked_array_raw$chunk(i - 1L), chunks_raw[[i]])
+ }
+ tf <- tempfile()
+ write_feather(tbl, tf)
+
+ res <- read_feather(tf)
+ expect_identical(tbl$int, res$int)
+ expect_identical(tbl$dbl, res$dbl)
+ expect_identical(as.integer(tbl$raw), res$raw)
+ unlink(tf)
+})
+
+test_that("table round trip handles NA in integer and numeric", {
+ tbl <- tibble::tibble(
+ int = c(NA, 2:10),
+ dbl = as.numeric(c(1:5, NA, 7:9, NA)),
+ raw = as.raw(1:10)
+ )
+
+ tab <- Table$create(!!!tbl)
+ expect_equal(tab$num_columns, 3L)
+ expect_equal(tab$num_rows, 10L)
+
+ expect_equal(tab$column(0)$length(), 10L)
+ expect_equal(tab$column(1)$length(), 10L)
+ expect_equal(tab$column(2)$length(), 10L)
+
+ expect_equal(tab$column(0)$null_count, 1L)
+ expect_equal(tab$column(1)$null_count, 2L)
+ expect_equal(tab$column(2)$null_count, 0L)
+
+ expect_equal(tab$column(0)$type, int32())
+ expect_equal(tab$column(1)$type, float64())
+ expect_equal(tab$column(2)$type, uint8())
+
+ tf <- tempfile()
+ write_feather(tbl, tf)
+
+ res <- read_feather(tf)
+ expect_identical(tbl$int, res$int)
+ expect_identical(tbl$dbl, res$dbl)
+ expect_identical(as.integer(tbl$raw), res$raw)
+
+ expect_true(is.na(res$int[1]))
+ expect_true(is.na(res$dbl[6]))
+ expect_true(is.na(res$dbl[10]))
+ unlink(tf)
+})
+
+test_that("reading/writing a raw vector (sparklyr integration)", {
+ # These are effectively what sparklyr calls to get data to/from Spark
+ read_from_raw_test <- function(x) {
+ as.data.frame(RecordBatchStreamReader$create(x)$read_next_batch())
+ }
+ bytes <- write_to_raw(example_data)
+ expect_type(bytes, "raw")
+ expect_identical(read_from_raw_test(bytes), example_data)
+  # read_from_raw_test() could simply be read_ipc_stream(); check they agree
+ expect_identical(read_ipc_stream(bytes), example_data)
+})
diff --git a/src/arrow/r/tests/testthat/test-record-batch-reader.R b/src/arrow/r/tests/testthat/test-record-batch-reader.R
new file mode 100644
index 000000000..3992670dc
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-record-batch-reader.R
@@ -0,0 +1,141 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("RecordBatchStreamReader / Writer", {
+ tbl <- tibble::tibble(
+ x = 1:10,
+ y = letters[1:10]
+ )
+ batch <- record_batch(tbl)
+ tab <- Table$create(tbl)
+
+ sink <- BufferOutputStream$create()
+ expect_equal(sink$tell(), 0)
+ writer <- RecordBatchStreamWriter$create(sink, batch$schema)
+ expect_r6_class(writer, "RecordBatchWriter")
+ writer$write(batch)
+ writer$write(tab)
+ writer$write(tbl)
+ expect_true(sink$tell() > 0)
+ writer$close()
+
+ buf <- sink$finish()
+ expect_r6_class(buf, "Buffer")
+
+ reader <- RecordBatchStreamReader$create(buf)
+ expect_r6_class(reader, "RecordBatchStreamReader")
+
+ batch1 <- reader$read_next_batch()
+ expect_r6_class(batch1, "RecordBatch")
+ expect_equal(batch, batch1)
+ batch2 <- reader$read_next_batch()
+ expect_r6_class(batch2, "RecordBatch")
+ expect_equal(batch, batch2)
+ batch3 <- reader$read_next_batch()
+ expect_r6_class(batch3, "RecordBatch")
+ expect_equal(batch, batch3)
+ expect_null(reader$read_next_batch())
+})
+
+test_that("RecordBatchFileReader / Writer", {
+ sink <- BufferOutputStream$create()
+ writer <- RecordBatchFileWriter$create(sink, batch$schema)
+ expect_r6_class(writer, "RecordBatchWriter")
+ writer$write(batch)
+ writer$write(tab)
+ writer$write(tbl)
+ writer$close()
+
+ buf <- sink$finish()
+ expect_r6_class(buf, "Buffer")
+
+ reader <- RecordBatchFileReader$create(buf)
+ expect_r6_class(reader, "RecordBatchFileReader")
+
+ batch1 <- reader$get_batch(0)
+ expect_r6_class(batch1, "RecordBatch")
+ expect_equal(batch, batch1)
+
+ expect_equal(reader$num_record_batches, 3)
+})
+
+test_that("StreamReader read_table", {
+ sink <- BufferOutputStream$create()
+ writer <- RecordBatchStreamWriter$create(sink, batch$schema)
+ expect_r6_class(writer, "RecordBatchWriter")
+ writer$write(batch)
+ writer$write(tab)
+ writer$write(tbl)
+ writer$close()
+ buf <- sink$finish()
+
+ reader <- RecordBatchStreamReader$create(buf)
+ out <- reader$read_table()
+ expect_identical(dim(out), c(30L, 2L))
+})
+
+test_that("FileReader read_table", {
+ sink <- BufferOutputStream$create()
+ writer <- RecordBatchFileWriter$create(sink, batch$schema)
+ expect_r6_class(writer, "RecordBatchWriter")
+ writer$write(batch)
+ writer$write(tab)
+ writer$write(tbl)
+ writer$close()
+ buf <- sink$finish()
+
+ reader <- RecordBatchFileReader$create(buf)
+ out <- reader$read_table()
+ expect_identical(dim(out), c(30L, 2L))
+})
+
+test_that("MetadataFormat", {
+ expect_identical(get_ipc_metadata_version(5), 4L)
+ expect_identical(get_ipc_metadata_version("V4"), 3L)
+ expect_identical(get_ipc_metadata_version(NULL), 4L)
+ Sys.setenv(ARROW_PRE_0_15_IPC_FORMAT = 1)
+ expect_identical(get_ipc_metadata_version(NULL), 3L)
+ Sys.setenv(ARROW_PRE_0_15_IPC_FORMAT = "")
+
+ expect_identical(get_ipc_metadata_version(NULL), 4L)
+ Sys.setenv(ARROW_PRE_1_0_METADATA_VERSION = 1)
+ expect_identical(get_ipc_metadata_version(NULL), 3L)
+ Sys.setenv(ARROW_PRE_1_0_METADATA_VERSION = "")
+
+ expect_error(
+ get_ipc_metadata_version(99),
+ "99 is not a valid IPC MetadataVersion"
+ )
+ expect_error(
+ get_ipc_metadata_version("45"),
+ '"45" is not a valid IPC MetadataVersion'
+ )
+})
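+
+# The same env-var toggling could be scoped with withr so cleanup is
+# guaranteed (a sketch; withr is already used elsewhere in these tests).
+test_that("IPC metadata version responds to env vars (withr sketch)", {
+  withr::with_envvar(c(ARROW_PRE_0_15_IPC_FORMAT = "1"), {
+    expect_identical(get_ipc_metadata_version(NULL), 3L)
+  })
+  expect_identical(get_ipc_metadata_version(NULL), 4L)
+})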
+
+test_that("reader with 0 batches", {
+ # IPC stream containing only a schema (ARROW-10642)
+ sink <- BufferOutputStream$create()
+ writer <- RecordBatchStreamWriter$create(sink, schema(a = int32()))
+ writer$close()
+ buf <- sink$finish()
+
+ reader <- RecordBatchStreamReader$create(buf)
+ tab <- reader$read_table()
+ expect_r6_class(tab, "Table")
+ expect_identical(dim(tab), c(0L, 1L))
+})
diff --git a/src/arrow/r/tests/testthat/test-s3-minio.R b/src/arrow/r/tests/testthat/test-s3-minio.R
new file mode 100644
index 000000000..e2c1dc2e7
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-s3-minio.R
@@ -0,0 +1,228 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+if (arrow_with_s3() && process_is_running("minio server")) {
+ # Get minio config, with expected defaults
+ minio_key <- Sys.getenv("MINIO_ACCESS_KEY", "minioadmin")
+ minio_secret <- Sys.getenv("MINIO_SECRET_KEY", "minioadmin")
+ minio_port <- Sys.getenv("MINIO_PORT", "9000")
+
+ # Helper function for minio URIs
+ minio_uri <- function(...) {
+ template <- "s3://%s:%s@%s?scheme=http&endpoint_override=localhost%s%s"
+ sprintf(template, minio_key, minio_secret, minio_path(...), "%3A", minio_port)
+ }
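+  # The literal "%3A" above keeps the colon of "localhost:<port>" URL-encoded,
+  # so the template renders e.g. "...endpoint_override=localhost%3A9000"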
+ minio_path <- function(...) paste(now, ..., sep = "/")
+
+ test_that("minio setup", {
+ # Create a "bucket" on minio for this test run, which we'll delete when done.
+ fs <- S3FileSystem$create(
+ access_key = minio_key,
+ secret_key = minio_secret,
+ scheme = "http",
+ endpoint_override = paste0("localhost:", minio_port)
+ )
+ expect_r6_class(fs, "S3FileSystem")
+ now <- as.character(as.numeric(Sys.time()))
+ # If minio isn't running, this will hang for a few seconds and fail with a
+ # curl timeout, causing `run_these` to be set to FALSE and skipping the tests
+ fs$CreateDir(now)
+ })
+ # Clean up when we're all done
+ on.exit(fs$DeleteDir(now))
+
+ test_that("read/write Feather on minio", {
+ write_feather(example_data, minio_uri("test.feather"))
+ expect_identical(read_feather(minio_uri("test.feather")), example_data)
+ })
+
+ test_that("read/write Feather by filesystem, not URI", {
+ write_feather(example_data, fs$path(minio_path("test2.feather")))
+ expect_identical(
+ read_feather(fs$path(minio_path("test2.feather"))),
+ example_data
+ )
+ })
+
+ test_that("read/write stream", {
+ write_ipc_stream(example_data, fs$path(minio_path("test3.ipc")))
+ expect_identical(
+ read_ipc_stream(fs$path(minio_path("test3.ipc"))),
+ example_data
+ )
+ })
+
+ test_that("read/write Parquet on minio", {
+ skip_if_not_available("parquet")
+    write_parquet(example_data, fs$path(minio_path("test.parquet")))
+ expect_identical(read_parquet(minio_uri("test.parquet")), example_data)
+ })
+
+ if (arrow_with_dataset()) {
+ library(dplyr)
+
+ make_temp_dir <- function() {
+ path <- tempfile()
+ dir.create(path)
+ normalizePath(path, winslash = "/")
+ }
+
+ test_that("open_dataset with an S3 file (not directory) URI", {
+ skip_if_not_available("parquet")
+ expect_identical(
+ open_dataset(minio_uri("test.parquet")) %>% collect() %>% arrange(int),
+ example_data %>% arrange(int)
+ )
+ })
+
+ test_that("open_dataset with vector of S3 file URIs", {
+ expect_identical(
+ open_dataset(
+ c(minio_uri("test.feather"), minio_uri("test2.feather")),
+ format = "feather"
+ ) %>%
+ arrange(int) %>%
+ collect(),
+ rbind(example_data, example_data) %>% arrange(int)
+ )
+ })
+
+ test_that("open_dataset errors on URIs for different file systems", {
+ td <- make_temp_dir()
+ expect_error(
+ open_dataset(
+ c(
+ minio_uri("test.feather"),
+ paste0("file://", file.path(td, "fake.feather"))
+ ),
+ format = "feather"
+ ),
+ "Vectors of URIs for different file systems are not supported"
+ )
+ })
+
+ # Dataset test setup, cf. test-dataset.R
+ first_date <- lubridate::ymd_hms("2015-04-29 03:12:39")
+ df1 <- tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+ chr = letters[1:10],
+ fct = factor(LETTERS[1:10]),
+ ts = first_date + lubridate::days(1:10)
+ )
+
+ second_date <- lubridate::ymd_hms("2017-03-09 07:01:02")
+ df2 <- tibble(
+ int = 101:110,
+ dbl = as.numeric(51:60),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+ chr = letters[10:1],
+ fct = factor(LETTERS[10:1]),
+ ts = second_date + lubridate::days(10:1)
+ )
+
+ # This is also to set up the dataset tests
+ test_that("write_parquet with filesystem arg", {
+ skip_if_not_available("parquet")
+ fs$CreateDir(minio_path("hive_dir", "group=1", "other=xxx"))
+ fs$CreateDir(minio_path("hive_dir", "group=2", "other=yyy"))
+ expect_length(fs$ls(minio_path("hive_dir")), 2)
+ write_parquet(df1, fs$path(minio_path("hive_dir", "group=1", "other=xxx", "file1.parquet")))
+ write_parquet(df2, fs$path(minio_path("hive_dir", "group=2", "other=yyy", "file2.parquet")))
+ expect_identical(
+ read_parquet(fs$path(minio_path("hive_dir", "group=1", "other=xxx", "file1.parquet"))),
+ df1
+ )
+ })
+
+ test_that("open_dataset with fs", {
+ ds <- open_dataset(fs$path(minio_path("hive_dir")))
+ expect_identical(
+ ds %>% select(int, dbl, lgl) %>% collect() %>% arrange(int),
+ rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int)
+ )
+ })
+
+ test_that("write_dataset with fs", {
+ ds <- open_dataset(fs$path(minio_path("hive_dir")))
+ write_dataset(ds, fs$path(minio_path("new_dataset_dir")))
+ expect_length(fs$ls(minio_path("new_dataset_dir")), 1)
+ })
+
+ test_that("Let's test copy_files too", {
+ td <- make_temp_dir()
+ copy_files(minio_uri("hive_dir"), td)
+ expect_length(dir(td), 2)
+ ds <- open_dataset(td)
+ expect_identical(
+ ds %>% select(int, dbl, lgl) %>% collect() %>% arrange(int),
+ rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int)
+ )
+
+ # Let's copy the other way and use a SubTreeFileSystem rather than URI
+ copy_files(td, fs$path(minio_path("hive_dir2")))
+ ds2 <- open_dataset(fs$path(minio_path("hive_dir2")))
+ expect_identical(
+ ds2 %>% select(int, dbl, lgl) %>% collect() %>% arrange(int),
+ rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int)
+ )
+ })
+ }
+
+ test_that("S3FileSystem input validation", {
+ expect_error(
+ S3FileSystem$create(access_key = "foo"),
+ "Key authentication requires both access_key and secret_key"
+ )
+ expect_error(
+ S3FileSystem$create(secret_key = "foo"),
+ "Key authentication requires both access_key and secret_key"
+ )
+ expect_error(
+ S3FileSystem$create(session_token = "foo"),
+ paste0(
+ "In order to initialize a session with temporary credentials, ",
+ "both secret_key and access_key must be provided ",
+ "in addition to session_token."
+ )
+ )
+ expect_error(
+ S3FileSystem$create(access_key = "foo", secret_key = "asdf", anonymous = TRUE),
+ 'Cannot specify "access_key" and "secret_key" when anonymous = TRUE'
+ )
+ expect_error(
+ S3FileSystem$create(access_key = "foo", secret_key = "asdf", role_arn = "qwer"),
+ "Cannot provide both key authentication and role_arn"
+ )
+ expect_error(
+ S3FileSystem$create(access_key = "foo", secret_key = "asdf", external_id = "qwer"),
+ 'Cannot specify "external_id" without providing a role_arn string'
+ )
+ expect_error(
+ S3FileSystem$create(external_id = "foo"),
+ 'Cannot specify "external_id" without providing a role_arn string'
+ )
+ })
+} else {
+  # Register a skipped test so the suite records that these tests did not run
+ test_that("S3FileSystem tests with Minio", {
+ skip("Minio is not running")
+ })
+}
diff --git a/src/arrow/r/tests/testthat/test-s3.R b/src/arrow/r/tests/testthat/test-s3.R
new file mode 100644
index 000000000..298b15bb8
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-s3.R
@@ -0,0 +1,55 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+run_these <- tryCatch(
+ expr = {
+ if (arrow_with_s3() &&
+ identical(tolower(Sys.getenv("ARROW_R_DEV")), "true") &&
+ !identical(Sys.getenv("AWS_ACCESS_KEY_ID"), "") &&
+ !identical(Sys.getenv("AWS_SECRET_ACCESS_KEY"), "")) {
+ # See if we have access to the test bucket
+ bucket <- s3_bucket("ursa-labs-r-test")
+ bucket$GetFileInfo("")
+ TRUE
+ } else {
+ FALSE
+ }
+ },
+ error = function(e) FALSE
+)
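+
+# run_these is TRUE only when S3 support is compiled in, ARROW_R_DEV is
+# enabled, AWS credentials are set, and the test bucket is reachable.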
+
+bucket_uri <- function(..., bucket = "s3://ursa-labs-r-test/%s?region=us-west-2") {
+ segments <- paste(..., sep = "/")
+ sprintf(bucket, segments)
+}
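+
+# For illustration: bucket_uri("dir", "file.feather") expands to
+# "s3://ursa-labs-r-test/dir/file.feather?region=us-west-2"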
+
+if (run_these) {
+ now <- as.numeric(Sys.time())
+ on.exit(bucket$DeleteDir(now))
+
+ test_that("read/write Feather on S3", {
+ write_feather(example_data, bucket_uri(now, "test.feather"))
+ expect_identical(read_feather(bucket_uri(now, "test.feather")), example_data)
+ })
+
+ test_that("read/write Parquet on S3", {
+ skip_if_not_available("parquet")
+ write_parquet(example_data, bucket_uri(now, "test.parquet"))
+ expect_identical(read_parquet(bucket_uri(now, "test.parquet")), example_data)
+ })
+}
diff --git a/src/arrow/r/tests/testthat/test-scalar.R b/src/arrow/r/tests/testthat/test-scalar.R
new file mode 100644
index 000000000..3afccf743
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-scalar.R
@@ -0,0 +1,112 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+expect_scalar_roundtrip <- function(x, type) {
+ s <- Scalar$create(x)
+ expect_r6_class(s, "Scalar")
+ expect_equal(s$type, type)
+ expect_identical(length(s), 1L)
+ if (inherits(type, "NestedType")) {
+ # Open question: should a nested scalar be NA when all of its elements are NA?
+ # expect_identical(is.na(s), all(is.na(x))) # nolint
+ } else {
+ expect_identical(as.vector(is.na(s)), is.na(x))
+ # MakeArrayFromScalar not implemented for list types
+ expect_as_vector(s, x)
+ }
+}
+
+test_that("Scalar object roundtrip", {
+ expect_scalar_roundtrip(2, float64())
+ expect_scalar_roundtrip(2L, int32())
+ expect_scalar_roundtrip(c(2, 4), list_of(float64()))
+ expect_scalar_roundtrip(c(NA, NA), list_of(bool()))
+ expect_scalar_roundtrip(data.frame(a = 2, b = 4L), struct(a = double(), b = int32()))
+})
+
+test_that("Scalar print", {
+ expect_output(print(Scalar$create(4)), "Scalar\n4")
+})
+
+test_that("Creating Scalars of a different type and casting them", {
+ expect_equal(Scalar$create(4L, int8())$type, int8())
+ expect_equal(Scalar$create(4L)$cast(float32())$type, float32())
+})
+
+test_that("Scalar to Array", {
+ a <- Scalar$create(42)
+ expect_equal(a$as_array(), Array$create(42))
+ expect_equal(Array$create(a), Array$create(42))
+})
+
+test_that("Scalar$Equals", {
+ a <- Scalar$create(42)
+ aa <- Array$create(42)
+ b <- Scalar$create(42)
+ d <- Scalar$create(43)
+ expect_equal(a, b)
+ expect_true(a$Equals(b))
+ expect_false(a$Equals(d))
+ expect_false(a$Equals(aa))
+})
+
+test_that("Scalar$ApproxEquals", {
+ a <- Scalar$create(1.0000000000001)
+ aa <- Array$create(1.0000000000001)
+ b <- Scalar$create(1.0)
+ d <- 2.400000000000001
+ expect_false(a$Equals(b))
+ expect_true(a$ApproxEquals(b))
+ expect_false(a$ApproxEquals(d))
+ expect_false(a$ApproxEquals(aa))
+})
+
+test_that("Handling string data with embedded nuls", {
+ raws <- as.raw(c(0x6d, 0x61, 0x00, 0x6e))
+ expect_error(
+ rawToChar(raws),
+ "embedded nul in string: 'ma\\0n'", # See?
+ fixed = TRUE
+ )
+ scalar_with_nul <- Scalar$create(raws, binary())$cast(utf8())
+
+ # The behavior of the warnings/errors is slightly different with and without
+ # altrep. Without it (i.e. on R 3.5.0 and below), the error triggers
+ # immediately on `as.vector()`, whereas with it, the error only happens on
+ # materialization.
+ skip_if_r_version("3.5.0")
+ v <- expect_error(as.vector(scalar_with_nul), NA)
+ expect_error(
+ v[1],
+ paste0(
+ "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, ",
+ "set options(arrow.skip_nul = TRUE)"
+ ),
+ fixed = TRUE
+ )
+
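+ # An assumption about the subset below: the trailing [] forces full
+ # materialization of the ALTREP vector, so the nul-stripping warning is
+ # raised inside expect_warning() rather than later.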
+ withr::with_options(list(arrow.skip_nul = TRUE), {
+ expect_warning(
+ expect_identical(
+ as.vector(scalar_with_nul)[],
+ "man"
+ ),
+ "Stripping '\\0' (nul) from character vector",
+ fixed = TRUE
+ )
+ })
+})
diff --git a/src/arrow/r/tests/testthat/test-schema.R b/src/arrow/r/tests/testthat/test-schema.R
new file mode 100644
index 000000000..8473550df
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-schema.R
@@ -0,0 +1,220 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("Alternate type names are supported", {
+ expect_equal(
+ schema(b = double(), c = bool(), d = string(), e = float(), f = halffloat()),
+ schema(b = float64(), c = boolean(), d = utf8(), e = float32(), f = float16())
+ )
+ expect_equal(names(schema(b = double(), c = bool(), d = string())), c("b", "c", "d"))
+})
+
+test_that("Schema print method", {
+ expect_output(
+ print(schema(b = double(), c = bool(), d = string())),
+ paste(
+ "Schema",
+ "b: double",
+ "c: bool",
+ "d: string",
+ sep = "\n"
+ ),
+ fixed = TRUE
+ )
+})
+
+test_that("Schema with non-nullable fields", {
+ expect_output(
+ print(schema(field("b", double()),
+ field("c", bool(), nullable = FALSE),
+ field("d", string()))),
+ paste(
+ "Schema",
+ "b: double",
+ "c: bool not null",
+ "d: string",
+ sep = "\n"
+ ),
+ fixed = TRUE
+ )
+})
+
+test_that("Schema $GetFieldByName", {
+ schm <- schema(b = double(), c = string())
+ expect_equal(schm$GetFieldByName("b"), field("b", double()))
+ expect_null(schm$GetFieldByName("f"))
+ # TODO: schema(b = double(), b = string())$GetFieldByName("b") # nolint
+ # also returns NULL and probably should error because of duplicated names
+})
+
+test_that("Schema extract (returns Field)", {
+ # TODO: should this return a Field or the Type?
+ # I think of Schema like list(name = type, name = type, ...)
+ # but in practice it is more like list(list(name, type), list(name, type), ...)
+ # -> Field names in a Schema may be duplicated
+ # -> Fields may have metadata (though we don't really handle that in R)
+ schm <- schema(b = double(), c = string())
+ expect_equal(schm$b, field("b", double()))
+ expect_equal(schm[["b"]], field("b", double()))
+ expect_equal(schm[[1]], field("b", double()))
+
+ expect_null(schm[["ZZZ"]])
+ expect_error(schm[[42]]) # TODO: give this a better error message
+})
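+
+# A sketch of the duplicated-names behavior noted above (illustrative,
+# mirroring the TODO in the previous test):
+# schm_dup <- schema(b = double(), b = string()) # constructs without error
+# schm_dup$GetFieldByName("b") # returns NULL rather than erroring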
+
+test_that("Schema slicing", {
+ schm <- schema(b = double(), c = string(), d = int8())
+ expect_equal(schm[2:3], schema(c = string(), d = int8()))
+ expect_equal(schm[-1], schema(c = string(), d = int8()))
+ expect_equal(schm[c("d", "c")], schema(d = int8(), c = string()))
+ expect_equal(schm[c(FALSE, TRUE, TRUE)], schema(c = string(), d = int8()))
+ expect_error(schm[c("c", "ZZZ")], 'Invalid field name: "ZZZ"')
+ expect_error(schm[c("XXX", "c", "ZZZ")], 'Invalid field names: "XXX" and "ZZZ"')
+})
+
+test_that("Schema modification", {
+ schm <- schema(b = double(), c = string(), d = int8())
+ schm$c <- boolean()
+ expect_equal(schm, schema(b = double(), c = boolean(), d = int8()))
+ schm[["d"]] <- int16()
+ expect_equal(schm, schema(b = double(), c = boolean(), d = int16()))
+ schm$b <- NULL
+ expect_equal(schm, schema(c = boolean(), d = int16()))
+ # Assigning NULL to a field that doesn't exist is a no-op
+ schm$zzzz <- NULL
+ expect_equal(schm, schema(c = boolean(), d = int16()))
+ # Adding a field
+ schm$fff <- int32()
+ expect_equal(schm, schema(c = boolean(), d = int16(), fff = int32()))
+
+ # By index
+ schm <- schema(b = double(), c = string(), d = int8())
+ schm[[2]] <- int32()
+ expect_equal(schm, schema(b = double(), c = int32(), d = int8()))
+
+ # Adding actual Fields
+ # If assigning by name, note that this can modify the resulting name
+ schm <- schema(b = double(), c = string(), d = int8())
+ schm$c <- field("x", int32())
+ expect_equal(schm, schema(b = double(), x = int32(), d = int8()))
+ schm[[2]] <- field("y", int64())
+ expect_equal(schm, schema(b = double(), y = int64(), d = int8()))
+
+ # Error handling
+ expect_error(schm$c <- 4, "value must be a DataType")
+ expect_error(schm[[-3]] <- int32(), "i not greater than 0")
+ expect_error(schm[[0]] <- int32(), "i not greater than 0")
+ expect_error(schm[[NA_integer_]] <- int32(), "!is.na(i) is not TRUE", fixed = TRUE)
+ expect_error(schm[[TRUE]] <- int32(), "i is not a numeric or integer vector")
+ expect_error(schm[[c(2, 4)]] <- int32(), "length(i) not equal to 1", fixed = TRUE)
+})
+
+test_that("Metadata is preserved when modifying Schema", {
+ schm <- schema(b = double(), c = string(), d = int8())
+ schm$metadata$foo <- "bar"
+ expect_identical(schm$metadata, list(foo = "bar"))
+ schm$c <- field("x", int32())
+ expect_identical(schm$metadata, list(foo = "bar"))
+})
+
+test_that("reading schema from Buffer", {
+ # TODO: this uses the streaming format, i.e. from RecordBatchStreamWriter
+ # maybe there is an easier way to serialize a schema
+ batch <- record_batch(x = 1:10)
+ expect_r6_class(batch, "RecordBatch")
+
+ stream <- BufferOutputStream$create()
+ writer <- RecordBatchStreamWriter$create(stream, batch$schema)
+ expect_r6_class(writer, "RecordBatchWriter")
+ writer$close()
+
+ buffer <- stream$finish()
+ expect_r6_class(buffer, "Buffer")
+
+ reader <- MessageReader$create(buffer)
+ expect_r6_class(reader, "MessageReader")
+
+ message <- reader$ReadNextMessage()
+ expect_r6_class(message, "Message")
+ expect_equal(message$type, MessageType$SCHEMA)
+
+ stream <- BufferReader$create(buffer)
+ expect_r6_class(stream, "BufferReader")
+ message <- read_message(stream)
+ expect_r6_class(message, "Message")
+ expect_equal(message$type, MessageType$SCHEMA)
+})
+
+test_that("Input validation when creating a table with a schema", {
+ expect_error(
+ Table$create(b = 1, schema = c(b = float64())), # list not Schema
+ "`schema` must be an arrow::Schema or NULL"
+ )
+})
+
+test_that("Schema$Equals", {
+ a <- schema(b = double(), c = bool())
+ b <- a$WithMetadata(list(some = "metadata"))
+
+ # different metadata
+ expect_failure(expect_equal(a, b))
+ expect_false(a$Equals(b, check_metadata = TRUE))
+
+ # Metadata not checked
+ expect_equal(a, b, ignore_attr = TRUE)
+
+ # Non-schema object
+ expect_false(a$Equals(42))
+})
+
+test_that("unify_schemas", {
+ a <- schema(b = double(), c = bool())
+ z <- schema(b = double(), k = utf8())
+ expect_equal(
+ unify_schemas(a, z),
+ schema(b = double(), c = bool(), k = utf8())
+ )
+ # returns NULL when any arg is NULL
+ expect_null(
+ unify_schemas(a, NULL, z)
+ )
+ # returns NULL when all args are NULL
+ expect_null(
+ unify_schemas(NULL, NULL)
+ )
+ # errors when no args
+ expect_error(
+ unify_schemas(),
+ "Must provide at least one schema to unify"
+ )
+})
+
+test_that("Schema to C-interface", {
+ schema <- schema(b = double(), c = bool())
+
+ # export the schema via the C-interface
+ ptr <- allocate_arrow_schema()
+ schema$export_to_c(ptr)
+
+ # then import it and check that the roundtripped value is the same
+ circle <- Schema$import_from_c(ptr)
+ expect_equal(circle, schema)
+
+ # must clean up the pointer or we leak
+ delete_arrow_schema(ptr)
+})
diff --git a/src/arrow/r/tests/testthat/test-thread-pool.R b/src/arrow/r/tests/testthat/test-thread-pool.R
new file mode 100644
index 000000000..baf410368
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-thread-pool.R
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("can set/get cpu thread pool capacity", {
+ old <- cpu_count()
+ set_cpu_count(19)
+ expect_equal(cpu_count(), 19L)
+ set_cpu_count(old)
+ expect_equal(cpu_count(), old)
+})
+
+test_that("can set/get I/O thread pool capacity", {
+ old <- io_thread_count()
+ set_io_thread_count(19)
+ expect_equal(io_thread_count(), 19L)
+ set_io_thread_count(old)
+ expect_equal(io_thread_count(), old)
+})
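+
+# Both pools are global to the process, which is why each test restores the
+# original capacity once the setter has been checked.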
diff --git a/src/arrow/r/tests/testthat/test-type.R b/src/arrow/r/tests/testthat/test-type.R
new file mode 100644
index 000000000..3821fb450
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-type.R
@@ -0,0 +1,211 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("type() gets the right type for arrow::Array", {
+ a <- Array$create(1:10)
+ expect_equal(type(a), a$type)
+})
+
+test_that("type() gets the right type for ChunkedArray", {
+ a <- chunked_array(1:10, 1:10)
+ expect_equal(type(a), a$type)
+})
+
+test_that("type() infers from R type", {
+ expect_equal(type(1:10), int32())
+ expect_equal(type(1), float64())
+ expect_equal(type(TRUE), boolean())
+ expect_equal(type(raw()), uint8())
+ expect_equal(type(""), utf8())
+ expect_equal(
+ type(example_data$fct),
+ dictionary(int8(), utf8(), FALSE)
+ )
+ expect_equal(
+ type(lubridate::ymd_hms("2019-02-14 13:55:05")),
+ timestamp(TimeUnit$MICRO, "UTC")
+ )
+ expect_equal(
+ type(hms::hms(56, 34, 12)),
+ time32(unit = TimeUnit$SECOND)
+ )
+ expect_equal(
+ type(bit64::integer64()),
+ int64()
+ )
+})
+
+test_that("type() can infer struct types from data frames", {
+ df <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10])
+ expect_equal(type(df), struct(x = int32(), y = float64(), z = utf8()))
+})
+
+test_that("DataType$Equals", {
+ a <- int32()
+ b <- int32()
+ z <- float64()
+ expect_true(a == b)
+ expect_true(a$Equals(b))
+ expect_false(a == z)
+ expect_equal(a, b)
+ expect_failure(expect_equal(a, z))
+ expect_false(a$Equals(32L))
+})
+
+test_that("Masked data type functions still work", {
+ skip("Work around masking of data type functions (ARROW-12322)")
+
+ # Works when type function is masked
+ string <- rlang::string
+ expect_equal(
+ Array$create("abc", type = string()),
+ arrow::string()
+ )
+ rm(string)
+
+ # Works with a non-Arrow function that returns an Arrow type
+ # when the non-Arrow function has the same name as a base R function...
+ str <- arrow::string
+ expect_equal(
+ Array$create("abc", type = str()),
+ arrow::string()
+ )
+ rm(str)
+
+ # ... and when it has the same name as an Arrow function
+ type <- arrow::string
+ expect_equal(
+ Array$create("abc", type = type()),
+ arrow::string()
+ )
+ rm(type)
+
+ # Works with local variable whose value is an Arrow type
+ type <- arrow::string()
+ expect_equal(
+ Array$create("abc", type = type),
+ arrow::string()
+ )
+ rm(type)
+})
+
+test_that("Type strings are correctly canonicalized", {
+ # data types without arguments
+ expect_equal(canonical_type_str("int8"), int8()$ToString())
+ expect_equal(canonical_type_str("int16"), int16()$ToString())
+ expect_equal(canonical_type_str("int32"), int32()$ToString())
+ expect_equal(canonical_type_str("int64"), int64()$ToString())
+ expect_equal(canonical_type_str("uint8"), uint8()$ToString())
+ expect_equal(canonical_type_str("uint16"), uint16()$ToString())
+ expect_equal(canonical_type_str("uint32"), uint32()$ToString())
+ expect_equal(canonical_type_str("uint64"), uint64()$ToString())
+ expect_equal(canonical_type_str("float16"), float16()$ToString())
+ expect_equal(canonical_type_str("halffloat"), halffloat()$ToString())
+ expect_equal(canonical_type_str("float32"), float32()$ToString())
+ expect_equal(canonical_type_str("float"), float()$ToString())
+ expect_equal(canonical_type_str("float64"), float64()$ToString())
+ expect_equal(canonical_type_str("double"), float64()$ToString())
+ expect_equal(canonical_type_str("boolean"), boolean()$ToString())
+ expect_equal(canonical_type_str("bool"), bool()$ToString())
+ expect_equal(canonical_type_str("utf8"), utf8()$ToString())
+ expect_equal(canonical_type_str("large_utf8"), large_utf8()$ToString())
+ expect_equal(canonical_type_str("large_string"), large_utf8()$ToString())
+ expect_equal(canonical_type_str("binary"), binary()$ToString())
+ expect_equal(canonical_type_str("large_binary"), large_binary()$ToString())
+ expect_equal(canonical_type_str("string"), arrow::string()$ToString())
+ expect_equal(canonical_type_str("null"), null()$ToString())
+
+ # data types with arguments
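+ # The sub() calls below strip the parameters from a type's printed form so
+ # that only the bare name is compared, e.g. "time32[s]" reduces to "time32".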
+ expect_equal(
+ canonical_type_str("fixed_size_binary"),
+ sub("^([^([<]+).*$", "\\1", fixed_size_binary(42)$ToString())
+ )
+ expect_equal(
+ canonical_type_str("date32"),
+ sub("^([^([<]+).*$", "\\1", date32()$ToString())
+ )
+ expect_equal(
+ canonical_type_str("date64"),
+ sub("^([^([<]+).*$", "\\1", date64()$ToString())
+ )
+ expect_equal(
+ canonical_type_str("time32"),
+ sub("^([^([<]+).*$", "\\1", time32()$ToString())
+ )
+ expect_equal(
+ canonical_type_str("time64"),
+ sub("^([^([<]+).*$", "\\1", time64()$ToString())
+ )
+ expect_equal(
+ canonical_type_str("timestamp"),
+ sub("^([^([<]+).*$", "\\1", timestamp()$ToString())
+ )
+ expect_equal(
+ canonical_type_str("decimal"),
+ sub("^([^([<]+).*$", "\\1", decimal(3, 2)$ToString())
+ )
+ expect_equal(
+ canonical_type_str("struct"),
+ sub("^([^([<]+).*$", "\\1", struct(foo = int32())$ToString())
+ )
+ expect_equal(
+ canonical_type_str("list_of"),
+ sub("^([^([<]+).*$", "\\1", list_of(int32())$ToString())
+ )
+ expect_equal(
+ canonical_type_str("list"),
+ sub("^([^([<]+).*$", "\\1", list_of(int32())$ToString())
+ )
+ expect_equal(
+ canonical_type_str("large_list_of"),
+ sub("^([^([<]+).*$", "\\1", large_list_of(int32())$ToString())
+ )
+ expect_equal(
+ canonical_type_str("large_list"),
+ sub("^([^([<]+).*$", "\\1", large_list_of(int32())$ToString())
+ )
+ expect_equal(
+ canonical_type_str("fixed_size_list_of"),
+ sub("^([^([<]+).*$", "\\1", fixed_size_list_of(int32(), 42)$ToString())
+ )
+ expect_equal(
+ canonical_type_str("fixed_size_list"),
+ sub("^([^([<]+).*$", "\\1", fixed_size_list_of(int32(), 42)$ToString())
+ )
+
+ # unsupported data types
+ expect_error(
+ canonical_type_str("decimal128(3, 2)"),
+ "parameters"
+ )
+ expect_error(
+ canonical_type_str("list<item: int32>"),
+ "parameters"
+ )
+ expect_error(
+ canonical_type_str("time32[s]"),
+ "parameters"
+ )
+
+ # unrecognized data types
+ expect_error(
+ canonical_type_str("foo"),
+ "Unrecognized"
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-utf.R b/src/arrow/r/tests/testthat/test-utf.R
new file mode 100644
index 000000000..69d196274
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-utf.R
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("We handle non-UTF strings", {
+ # The code with non-UTF strings lives in a separate file so that we don't
+ # get a parse error on certain platforms
+ skip_on_cran()
+ source("latin1.R", encoding = "latin1")
+})