1 files changed, 191 insertions, 0 deletions
diff --git a/src/arrow/r/tests/testthat/helper-data.R b/src/arrow/r/tests/testthat/helper-data.R
new file mode 100644
index 000000000..c693e84b2
--- /dev/null
+++ b/src/arrow/r/tests/testthat/helper-data.R
@@ -0,0 +1,191 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+example_data <- tibble::tibble(
+  int = c(1:3, NA_integer_, 5:10),
+  dbl = c(1:8, NA, 10) + .1,
+  dbl2 = rep(5, 10),
+  lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+  false = logical(10),
+  chr = letters[c(1:5, NA, 7:10)],
+  fct = factor(letters[c(1:4, NA, NA, 7:10)])
+)
+
+example_with_metadata <- tibble::tibble(
+  a = structure("one", class = "special_string"),
+  b = 2,
+  c = tibble::tibble(
+    c1 = structure("inner", extra_attr = "something"),
+    c2 = 4,
+    c3 = 50
+  ),
+  d = "four"
+)
+
+attr(example_with_metadata, "top_level") <- list(
+  field_one = 12,
+  field_two = "more stuff"
+)
+
+haven_data <- tibble::tibble(
+  num = structure(c(5.1, 4.9),
+    format.spss = "F8.2"
+  ),
+  cat_int = structure(c(3, 1),
+    format.spss = "F8.0",
+    labels = c(first = 1, second = 2, third = 3),
+    class = c("haven_labelled", "vctrs_vctr", "double")
+  ),
+  cat_chr = structure(c("B", "B"),
+    labels = c(Alpha = "A", Beta = "B"),
+    class = c("haven_labelled", "vctrs_vctr", "character")
+  )
+)
+
+example_with_times <- tibble::tibble(
+  date = Sys.Date() + 1:10,
+  posixct = lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10,
+  posixct_tz = lubridate::ymd_hms("2018-10-07 19:04:05", tz = "US/Eastern") + 1:10,
+  posixlt = as.POSIXlt(lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10),
+  posixlt_tz = as.POSIXlt(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "US/Eastern") + 1:10)
+)
+
+verses <- list(
+  # Since we tend to test with dataframes with 10 rows, here are verses from
+  # "Milonga del moro judío", by Jorge Drexler. They are décimas, 10-line
+  # poems with a particular meter and rhyme scheme.
+  # (They also have non-ASCII characters, which is nice for testing)
+  c(
+    "Por cada muro, un lamento",
+    "En Jerusalén la dorada",
+    "Y mil vidas malgastadas",
+    "Por cada mandamiento",
+    "Yo soy polvo de tu viento",
+    "Y aunque sangro de tu herida",
+    "Y cada piedra querida",
+    "Guarda mi amor más profundo",
+    "No hay una piedra en el mundo",
+    "Que valga lo que una vida"
+  ),
+  c(
+    "No hay muerto que no me duela",
+    "No hay un bando ganador",
+    "No hay nada más que dolor",
+    "Y otra vida que se vuela",
+    "La guerra es muy mala escuela",
+    "No importa el disfraz que viste",
+    "Perdonen que no me aliste",
+    "Bajo ninguna bandera",
+    "Vale más cualquier quimera",
+    "Que un trozo de tela triste"
+  ),
+  c(
+    "Y a nadie le di permiso",
+    "Para matar en mi nombre",
+    "Un hombre no es más que un hombre",
+    "Y si hay Dios, así lo quiso",
+    "El mismo suelo que piso",
+    "Seguirá, yo me habré ido",
+    "Rumbo también del olvido",
+    "No hay doctrina que no vaya",
+    "Y no hay pueblo que no se haya",
+    "Creído el pueblo elegido"
+  )
+)
+
+make_big_string <- function() {
+  # This creates a character vector that would exceed the capacity of BinaryArray
+  rep(purrr::map_chr(2047:2050, ~ paste(sample(letters, ., replace = TRUE), collapse = "")), 2^18)
+}
+
+make_random_string_of_size <- function(size = 1) {
+  purrr::map_chr(1000 * size, ~ paste(sample(letters, ., replace = TRUE), collapse = ""))
+}
+
+make_string_of_size <- function(size = 1) {
+  paste(rep(letters, length.out = 1000 * size), collapse = "")
+}
+
+example_with_extra_metadata <- example_with_metadata
+attributes(example_with_extra_metadata$b) <- list(lots = rep(make_string_of_size(1), 100))
+
+example_with_logical_factors <- tibble::tibble(
+  starting_a_fight = factor(c(FALSE, TRUE, TRUE, TRUE)),
+  consoling_a_child = factor(c(TRUE, FALSE, TRUE, TRUE)),
+  petting_a_dog = factor(c(TRUE, TRUE, FALSE, TRUE)),
+  saying = c(
+    "shhhhh, it's ok",
+    "you wanna go outside?",
+    "you want your mommy?",
+    "hey buddy"
+  )
+)
+
+# The values in each column of this tibble are in ascending order. There are
+# some ties, so tests should use two or more columns to ensure deterministic
+# sort order. The Arrow C++ library orders strings lexicographically as byte
+# strings. The order of a string array sorted by Arrow will not match the order
+# of an equivalent character vector sorted by R unless you set the R collation
+# locale to "C" by running:   Sys.setlocale("LC_COLLATE", "C")
+# These test scripts set that, but if you are running individual tests you might
+# need to set it manually. When finished, you can restore the default
+# collation locale by running:   Sys.setlocale("LC_COLLATE")
+# In the future, the string collation locale used by the Arrow C++ library might
+# be configurable (ARROW-12046).
+example_data_for_sorting <- tibble::tibble(
+  int = c(-.Machine$integer.max, -101L, -100L, 0L, 0L, 1L, 100L, 1000L, .Machine$integer.max, NA_integer_),
+  dbl = c(
+    -Inf, -.Machine$double.xmax, -.Machine$double.xmin, 0, .Machine$double.xmin,
+    pi, .Machine$double.xmax, Inf, NaN, NA_real_
+  ),
+  chr = c("", "", "\"", "&", "ABC", "NULL", "a", "abc", "zzz", NA_character_),
+  lgl = c(rep(FALSE, 4L), rep(TRUE, 5L), NA),
+  dttm = lubridate::ymd_hms(c(
+    "0000-01-01 00:00:00",
+    "1919-05-29 13:08:55",
+    "1955-06-20 04:10:42",
+    "1973-06-30 11:38:41",
+    "1987-03-29 12:49:47",
+    "1991-06-11 19:07:01",
+    NA_character_,
+    "2017-08-21 18:26:40",
+    "2017-08-21 18:26:40",
+    "9999-12-31 23:59:59"
+  )),
+  grp = c(rep("A", 5), rep("B", 5))
+)
+
+# For Dataset tests
+first_date <- lubridate::ymd_hms("2015-04-29 03:12:39")
+df1 <- tibble::tibble(
+  int = 1:10,
+  dbl = as.numeric(1:10),
+  lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+  chr = letters[1:10],
+  fct = factor(LETTERS[1:10]),
+  ts = first_date + lubridate::days(1:10)
+)
+
+second_date <- lubridate::ymd_hms("2017-03-09 07:01:02")
+df2 <- tibble::tibble(
+  int = 101:110,
+  dbl = c(as.numeric(51:59), NaN),
+  lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+  chr = letters[10:1],
+  fct = factor(LETTERS[10:1]),
+  ts = second_date + lubridate::days(10:1)
+)