summaryrefslogtreecommitdiffstats
path: root/src/arrow/r/tests/testthat/helper-data.R
diff options
context:
space:
mode:
Diffstat (limited to 'src/arrow/r/tests/testthat/helper-data.R')
-rw-r--r--src/arrow/r/tests/testthat/helper-data.R191
1 files changed, 191 insertions, 0 deletions
diff --git a/src/arrow/r/tests/testthat/helper-data.R b/src/arrow/r/tests/testthat/helper-data.R
new file mode 100644
index 000000000..c693e84b2
--- /dev/null
+++ b/src/arrow/r/tests/testthat/helper-data.R
@@ -0,0 +1,191 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+example_data <- tibble::tibble(
+ int = c(1:3, NA_integer_, 5:10),
+ dbl = c(1:8, NA, 10) + .1,
+ dbl2 = rep(5, 10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ false = logical(10),
+ chr = letters[c(1:5, NA, 7:10)],
+ fct = factor(letters[c(1:4, NA, NA, 7:10)])
+)
+
+example_with_metadata <- tibble::tibble(
+ a = structure("one", class = "special_string"),
+ b = 2,
+ c = tibble::tibble(
+ c1 = structure("inner", extra_attr = "something"),
+ c2 = 4,
+ c3 = 50
+ ),
+ d = "four"
+)
+
+attr(example_with_metadata, "top_level") <- list(
+ field_one = 12,
+ field_two = "more stuff"
+)
+
+haven_data <- tibble::tibble(
+ num = structure(c(5.1, 4.9),
+ format.spss = "F8.2"
+ ),
+ cat_int = structure(c(3, 1),
+ format.spss = "F8.0",
+ labels = c(first = 1, second = 2, third = 3),
+ class = c("haven_labelled", "vctrs_vctr", "double")
+ ),
+ cat_chr = structure(c("B", "B"),
+ labels = c(Alpha = "A", Beta = "B"),
+ class = c("haven_labelled", "vctrs_vctr", "character")
+ )
+)
+
+example_with_times <- tibble::tibble(
+ date = Sys.Date() + 1:10,
+ posixct = lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10,
+ posixct_tz = lubridate::ymd_hms("2018-10-07 19:04:05", tz = "US/Eastern") + 1:10,
+ posixlt = as.POSIXlt(lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10),
+ posixlt_tz = as.POSIXlt(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "US/Eastern") + 1:10)
+)
+
+verses <- list(
+ # Since we tend to test with dataframes with 10 rows, here are verses from
+ # "Milonga del moro judío", by Jorge Drexler. They are décimas, 10-line
+ # poems with a particular meter and rhyme scheme.
+ # (They also have non-ASCII characters, which is nice for testing)
+ c(
+ "Por cada muro, un lamento",
+ "En Jerusalén la dorada",
+ "Y mil vidas malgastadas",
+ "Por cada mandamiento",
+ "Yo soy polvo de tu viento",
+ "Y aunque sangro de tu herida",
+ "Y cada piedra querida",
+ "Guarda mi amor más profundo",
+ "No hay una piedra en el mundo",
+ "Que valga lo que una vida"
+ ),
+ c(
+ "No hay muerto que no me duela",
+ "No hay un bando ganador",
+ "No hay nada más que dolor",
+ "Y otra vida que se vuela",
+ "La guerra es muy mala escuela",
+ "No importa el disfraz que viste",
+ "Perdonen que no me aliste",
+ "Bajo ninguna bandera",
+ "Vale más cualquier quimera",
+ "Que un trozo de tela triste"
+ ),
+ c(
+ "Y a nadie le di permiso",
+ "Para matar en mi nombre",
+ "Un hombre no es más que un hombre",
+ "Y si hay Dios, así lo quiso",
+ "El mismo suelo que piso",
+ "Seguirá, yo me habré ido",
+ "Rumbo también del olvido",
+ "No hay doctrina que no vaya",
+ "Y no hay pueblo que no se haya",
+ "Creído el pueblo elegido"
+ )
+)
+
+make_big_string <- function() {
+ # This creates a character vector that would exceed the capacity of BinaryArray
+ rep(purrr::map_chr(2047:2050, ~ paste(sample(letters, ., replace = TRUE), collapse = "")), 2^18)
+}
+
+make_random_string_of_size <- function(size = 1) {
+ purrr::map_chr(1000 * size, ~ paste(sample(letters, ., replace = TRUE), collapse = ""))
+}
+
+make_string_of_size <- function(size = 1) {
+ paste(rep(letters, length.out = 1000 * size), collapse = "")
+}
+
+example_with_extra_metadata <- example_with_metadata
+attributes(example_with_extra_metadata$b) <- list(lots = rep(make_string_of_size(1), 100))
+
+example_with_logical_factors <- tibble::tibble(
+ starting_a_fight = factor(c(FALSE, TRUE, TRUE, TRUE)),
+ consoling_a_child = factor(c(TRUE, FALSE, TRUE, TRUE)),
+ petting_a_dog = factor(c(TRUE, TRUE, FALSE, TRUE)),
+ saying = c(
+ "shhhhh, it's ok",
+ "you wanna go outside?",
+ "you want your mommy?",
+ "hey buddy"
+ )
+)
+
+# The values in each column of this tibble are in ascending order. There are
+# some ties, so tests should use two or more columns to ensure deterministic
+# sort order. The Arrow C++ library orders strings lexicographically as byte
+# strings. The order of a string array sorted by Arrow will not match the order
+# of an equivalent character vector sorted by R unless you set the R collation
+# locale to "C" by running: Sys.setlocale("LC_COLLATE", "C")
+# These test scripts set that, but if you are running individual tests you might
+# need to set it manually. When finished, you can restore the default
+# collation locale by running: Sys.setlocale("LC_COLLATE")
+# In the future, the string collation locale used by the Arrow C++ library might
+# be configurable (ARROW-12046).
+example_data_for_sorting <- tibble::tibble(
+ int = c(-.Machine$integer.max, -101L, -100L, 0L, 0L, 1L, 100L, 1000L, .Machine$integer.max, NA_integer_),
+ dbl = c(
+ -Inf, -.Machine$double.xmax, -.Machine$double.xmin, 0, .Machine$double.xmin,
+ pi, .Machine$double.xmax, Inf, NaN, NA_real_
+ ),
+ chr = c("", "", "\"", "&", "ABC", "NULL", "a", "abc", "zzz", NA_character_),
+ lgl = c(rep(FALSE, 4L), rep(TRUE, 5L), NA),
+ dttm = lubridate::ymd_hms(c(
+ "0000-01-01 00:00:00",
+ "1919-05-29 13:08:55",
+ "1955-06-20 04:10:42",
+ "1973-06-30 11:38:41",
+ "1987-03-29 12:49:47",
+ "1991-06-11 19:07:01",
+ NA_character_,
+ "2017-08-21 18:26:40",
+ "2017-08-21 18:26:40",
+ "9999-12-31 23:59:59"
+ )),
+ grp = c(rep("A", 5), rep("B", 5))
+)
+
+# For Dataset tests
+first_date <- lubridate::ymd_hms("2015-04-29 03:12:39")
+df1 <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+ chr = letters[1:10],
+ fct = factor(LETTERS[1:10]),
+ ts = first_date + lubridate::days(1:10)
+)
+
+second_date <- lubridate::ymd_hms("2017-03-09 07:01:02")
+df2 <- tibble::tibble(
+ int = 101:110,
+ dbl = c(as.numeric(51:59), NaN),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+ chr = letters[10:1],
+ fct = factor(LETTERS[10:1]),
+ ts = second_date + lubridate::days(10:1)
+)