diff options
Diffstat (limited to '')
-rw-r--r-- | src/arrow/r/tests/testthat/test-dplyr-funcs-type.R | 627 |
1 files changed, 627 insertions, 0 deletions
diff --git a/src/arrow/r/tests/testthat/test-dplyr-funcs-type.R b/src/arrow/r/tests/testthat/test-dplyr-funcs-type.R new file mode 100644 index 000000000..859dc14b9 --- /dev/null +++ b/src/arrow/r/tests/testthat/test-dplyr-funcs-type.R @@ -0,0 +1,627 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +library(dplyr, warn.conflicts = FALSE) +suppressPackageStartupMessages(library(bit64)) + + +tbl <- example_data + +test_that("explicit type conversions with cast()", { + num_int32 <- 12L + num_int64 <- bit64::as.integer64(10) + + int_types <- c(int8(), int16(), int32(), int64()) + uint_types <- c(uint8(), uint16(), uint32(), uint64()) + float_types <- c(float32(), float64()) + + types <- c( + int_types, + uint_types, + float_types, + double(), # not actually a type, a base R function but should be alias for float64 + string() + ) + + for (type in types) { + expect_type_equal( + object = { + t1 <- Table$create(x = num_int32) %>% + transmute(x = cast(x, type)) %>% + compute() + t1$schema[[1]]$type + }, + as_type(type) + ) + expect_type_equal( + object = { + t1 <- Table$create(x = num_int64) %>% + transmute(x = cast(x, type)) %>% + compute() + t1$schema[[1]]$type + }, + as_type(type) + ) + } + + # Arrow errors when truncating floats... + expect_error( + expect_type_equal( + object = { + t1 <- Table$create(pi = pi) %>% + transmute(three = cast(pi, int32())) %>% + compute() + t1$schema[[1]]$type + }, + int32() + ), + "truncated" + ) + + # ... unless safe = FALSE (or allow_float_truncate = TRUE) + expect_type_equal( + object = { + t1 <- Table$create(pi = pi) %>% + transmute(three = cast(pi, int32(), safe = FALSE)) %>% + compute() + t1$schema[[1]]$type + }, + int32() + ) +}) + +test_that("explicit type conversions with as.*()", { + library(bit64) + compare_dplyr_binding( + .input %>% + transmute( + int2chr = as.character(int), + int2dbl = as.double(int), + int2int = as.integer(int), + int2num = as.numeric(int), + dbl2chr = as.character(dbl), + dbl2dbl = as.double(dbl), + dbl2int = as.integer(dbl), + dbl2num = as.numeric(dbl), + ) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + transmute( + chr2chr = as.character(chr), + chr2dbl = as.double(chr), + chr2int = as.integer(chr), + chr2num = as.numeric(chr) + ) %>% + collect(), + tibble(chr = c("1", "2", "3")) + ) + compare_dplyr_binding( + .input %>% + transmute( + chr2i64 = as.integer64(chr), + dbl2i64 = as.integer64(dbl), + i642i64 = as.integer64(i64), + ) %>% + collect(), + tibble(chr = "10000000000", dbl = 10000000000, i64 = as.integer64(1e10)) + ) + compare_dplyr_binding( + .input %>% + transmute( + chr2lgl = as.logical(chr), + dbl2lgl = as.logical(dbl), + int2lgl = as.logical(int) + ) %>% + collect(), + tibble( + chr = c("TRUE", "FALSE", "true", "false"), + dbl = c(1, 0, -99, 0), + int = c(1L, 0L, -99L, 0L) + ) + ) + compare_dplyr_binding( + .input %>% + transmute( + dbl2chr = as.character(dbl), + dbl2dbl = as.double(dbl), + dbl2int = as.integer(dbl), + dbl2lgl = as.logical(dbl), + int2chr = as.character(int), + int2dbl = as.double(int), + int2int = as.integer(int), + int2lgl = as.logical(int), + lgl2chr = as.character(lgl), # Arrow returns "true", "false" here ... + lgl2dbl = as.double(lgl), + lgl2int = as.integer(lgl), + lgl2lgl = as.logical(lgl) + ) %>% + collect() %>% + # need to use toupper() *after* collect() or else skip if utf8proc not available + mutate(lgl2chr = toupper(lgl2chr)), # ... but we need "TRUE", "FALSE" + tibble( + dbl = c(1, 0, NA_real_), + int = c(1L, 0L, NA_integer_), + lgl = c(TRUE, FALSE, NA) + ) + ) +}) + +test_that("is.finite(), is.infinite(), is.nan()", { + df <- tibble(x = c( + -4.94065645841246544e-324, 1.79769313486231570e+308, 0, + NA_real_, NaN, Inf, -Inf + )) + compare_dplyr_binding( + .input %>% + transmute( + is_fin = is.finite(x), + is_inf = is.infinite(x) + ) %>% + collect(), + df + ) + # is.nan() evaluates to FALSE on NA_real_ (ARROW-12850) + compare_dplyr_binding( + .input %>% + transmute( + is_nan = is.nan(x) + ) %>% + collect(), + df + ) +}) + +test_that("is.na() evaluates to TRUE on NaN (ARROW-12055)", { + df <- tibble(x = c(1.1, 2.2, NA_real_, 4.4, NaN, 6.6, 7.7)) + compare_dplyr_binding( + .input %>% + transmute( + is_na = is.na(x) + ) %>% + collect(), + df + ) +}) + +test_that("type checks with is() giving Arrow types", { + # with class2=DataType + expect_equal( + Table$create( + i32 = Array$create(1, int32()), + dec = Array$create(pi)$cast(decimal(3, 2)), + f64 = Array$create(1.1, float64()), + str = Array$create("a", arrow::string()) + ) %>% transmute( + i32_is_i32 = is(i32, int32()), + i32_is_dec = is(i32, decimal(3, 2)), + i32_is_i64 = is(i32, float64()), + i32_is_str = is(i32, arrow::string()), + dec_is_i32 = is(dec, int32()), + dec_is_dec = is(dec, decimal(3, 2)), + dec_is_i64 = is(dec, float64()), + dec_is_str = is(dec, arrow::string()), + f64_is_i32 = is(f64, int32()), + f64_is_dec = is(f64, decimal(3, 2)), + f64_is_i64 = is(f64, float64()), + f64_is_str = is(f64, arrow::string()), + str_is_i32 = is(str, int32()), + str_is_dec = is(str, decimal(3, 2)), + str_is_i64 = is(str, float64()), + str_is_str = is(str, arrow::string()) + ) %>% + collect() %>% + t() %>% + as.vector(), + c( + TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, + FALSE, FALSE, FALSE, FALSE, TRUE + ) + ) + # with class2=string + expect_equal( + Table$create( + i32 = Array$create(1, int32()), + f64 = Array$create(1.1, float64()), + str = Array$create("a", arrow::string()) + ) %>% transmute( + i32_is_i32 = is(i32, "int32"), + i32_is_i64 = is(i32, "double"), + i32_is_str = is(i32, "string"), + f64_is_i32 = is(f64, "int32"), + f64_is_i64 = is(f64, "double"), + f64_is_str = is(f64, "string"), + str_is_i32 = is(str, "int32"), + str_is_i64 = is(str, "double"), + str_is_str = is(str, "string") + ) %>% + collect() %>% + t() %>% + as.vector(), + c(TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE) + ) + # with class2=string alias + expect_equal( + Table$create( + f16 = Array$create(NA_real_, halffloat()), + f32 = Array$create(1.1, float()), + f64 = Array$create(2.2, float64()), + lgl = Array$create(TRUE, bool()), + str = Array$create("a", arrow::string()) + ) %>% transmute( + f16_is_f16 = is(f16, "float16"), + f16_is_f32 = is(f16, "float32"), + f16_is_f64 = is(f16, "float64"), + f16_is_lgl = is(f16, "boolean"), + f16_is_str = is(f16, "utf8"), + f32_is_f16 = is(f32, "float16"), + f32_is_f32 = is(f32, "float32"), + f32_is_f64 = is(f32, "float64"), + f32_is_lgl = is(f32, "boolean"), + f32_is_str = is(f32, "utf8"), + f64_is_f16 = is(f64, "float16"), + f64_is_f32 = is(f64, "float32"), + f64_is_f64 = is(f64, "float64"), + f64_is_lgl = is(f64, "boolean"), + f64_is_str = is(f64, "utf8"), + lgl_is_f16 = is(lgl, "float16"), + lgl_is_f32 = is(lgl, "float32"), + lgl_is_f64 = is(lgl, "float64"), + lgl_is_lgl = is(lgl, "boolean"), + lgl_is_str = is(lgl, "utf8"), + str_is_f16 = is(str, "float16"), + str_is_f32 = is(str, "float32"), + str_is_f64 = is(str, "float64"), + str_is_lgl = is(str, "boolean"), + str_is_str = is(str, "utf8") + ) %>% + collect() %>% + t() %>% + as.vector(), + c( + TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, + FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, + FALSE, FALSE, TRUE + ) + ) +}) + +test_that("type checks with is() giving R types", { + library(bit64) + compare_dplyr_binding( + .input %>% + transmute( + chr_is_chr = is(chr, "character"), + chr_is_fct = is(chr, "factor"), + chr_is_int = is(chr, "integer"), + chr_is_i64 = is(chr, "integer64"), + chr_is_lst = is(chr, "list"), + chr_is_lgl = is(chr, "logical"), + chr_is_num = is(chr, "numeric"), + dbl_is_chr = is(dbl, "character"), + dbl_is_fct = is(dbl, "factor"), + dbl_is_int = is(dbl, "integer"), + dbl_is_i64 = is(dbl, "integer64"), + dbl_is_lst = is(dbl, "list"), + dbl_is_lgl = is(dbl, "logical"), + dbl_is_num = is(dbl, "numeric"), + fct_is_chr = is(fct, "character"), + fct_is_fct = is(fct, "factor"), + fct_is_int = is(fct, "integer"), + fct_is_i64 = is(fct, "integer64"), + fct_is_lst = is(fct, "list"), + fct_is_lgl = is(fct, "logical"), + fct_is_num = is(fct, "numeric"), + int_is_chr = is(int, "character"), + int_is_fct = is(int, "factor"), + int_is_int = is(int, "integer"), + int_is_i64 = is(int, "integer64"), + int_is_lst = is(int, "list"), + int_is_lgl = is(int, "logical"), + int_is_num = is(int, "numeric"), + lgl_is_chr = is(lgl, "character"), + lgl_is_fct = is(lgl, "factor"), + lgl_is_int = is(lgl, "integer"), + lgl_is_i64 = is(lgl, "integer64"), + lgl_is_lst = is(lgl, "list"), + lgl_is_lgl = is(lgl, "logical"), + lgl_is_num = is(lgl, "numeric") + ) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + transmute( + i64_is_chr = is(i64, "character"), + i64_is_fct = is(i64, "factor"), + # we want Arrow to return TRUE, but bit64 returns FALSE + # i64_is_int = is(i64, "integer"), + i64_is_i64 = is(i64, "integer64"), + i64_is_lst = is(i64, "list"), + i64_is_lgl = is(i64, "logical"), + # we want Arrow to return TRUE, but bit64 returns FALSE + # i64_is_num = is(i64, "numeric"), + lst_is_chr = is(lst, "character"), + lst_is_fct = is(lst, "factor"), + lst_is_int = is(lst, "integer"), + lst_is_i64 = is(lst, "integer64"), + lst_is_lst = is(lst, "list"), + lst_is_lgl = is(lst, "logical"), + lst_is_num = is(lst, "numeric") + ) %>% + collect(), + tibble( + i64 = as.integer64(1:3), + lst = list(c("a", "b"), c("d", "e"), c("f", "g")) + ) + ) +}) + +test_that("type checks with is.*()", { + library(bit64) + compare_dplyr_binding( + .input %>% + transmute( + chr_is_chr = is.character(chr), + chr_is_dbl = is.double(chr), + chr_is_fct = is.factor(chr), + chr_is_int = is.integer(chr), + chr_is_i64 = is.integer64(chr), + chr_is_lst = is.list(chr), + chr_is_lgl = is.logical(chr), + chr_is_num = is.numeric(chr), + dbl_is_chr = is.character(dbl), + dbl_is_dbl = is.double(dbl), + dbl_is_fct = is.factor(dbl), + dbl_is_int = is.integer(dbl), + dbl_is_i64 = is.integer64(dbl), + dbl_is_lst = is.list(dbl), + dbl_is_lgl = is.logical(dbl), + dbl_is_num = is.numeric(dbl), + fct_is_chr = is.character(fct), + fct_is_dbl = is.double(fct), + fct_is_fct = is.factor(fct), + fct_is_int = is.integer(fct), + fct_is_i64 = is.integer64(fct), + fct_is_lst = is.list(fct), + fct_is_lgl = is.logical(fct), + fct_is_num = is.numeric(fct), + int_is_chr = is.character(int), + int_is_dbl = is.double(int), + int_is_fct = is.factor(int), + int_is_int = is.integer(int), + int_is_i64 = is.integer64(int), + int_is_lst = is.list(int), + int_is_lgl = is.logical(int), + int_is_num = is.numeric(int), + lgl_is_chr = is.character(lgl), + lgl_is_dbl = is.double(lgl), + lgl_is_fct = is.factor(lgl), + lgl_is_int = is.integer(lgl), + lgl_is_i64 = is.integer64(lgl), + lgl_is_lst = is.list(lgl), + lgl_is_lgl = is.logical(lgl), + lgl_is_num = is.numeric(lgl) + ) %>% + collect(), + tbl + ) + compare_dplyr_binding( + .input %>% + transmute( + i64_is_chr = is.character(i64), + # TODO: investigate why this is not matching when testthat runs it + # i64_is_dbl = is.double(i64), + i64_is_fct = is.factor(i64), + # we want Arrow to return TRUE, but bit64 returns FALSE + # i64_is_int = is.integer(i64), + i64_is_i64 = is.integer64(i64), + i64_is_lst = is.list(i64), + i64_is_lgl = is.logical(i64), + i64_is_num = is.numeric(i64), + lst_is_chr = is.character(lst), + lst_is_dbl = is.double(lst), + lst_is_fct = is.factor(lst), + lst_is_int = is.integer(lst), + lst_is_i64 = is.integer64(lst), + lst_is_lst = is.list(lst), + lst_is_lgl = is.logical(lst), + lst_is_num = is.numeric(lst) + ) %>% + collect(), + tibble( + i64 = as.integer64(1:3), + lst = list(c("a", "b"), c("d", "e"), c("f", "g")) + ) + ) +}) + +test_that("type checks with is_*()", { + library(rlang, warn.conflicts = FALSE) + compare_dplyr_binding( + .input %>% + transmute( + chr_is_chr = is_character(chr), + chr_is_dbl = is_double(chr), + chr_is_int = is_integer(chr), + chr_is_lst = is_list(chr), + chr_is_lgl = is_logical(chr), + dbl_is_chr = is_character(dbl), + dbl_is_dbl = is_double(dbl), + dbl_is_int = is_integer(dbl), + dbl_is_lst = is_list(dbl), + dbl_is_lgl = is_logical(dbl), + int_is_chr = is_character(int), + int_is_dbl = is_double(int), + int_is_int = is_integer(int), + int_is_lst = is_list(int), + int_is_lgl = is_logical(int), + lgl_is_chr = is_character(lgl), + lgl_is_dbl = is_double(lgl), + lgl_is_int = is_integer(lgl), + lgl_is_lst = is_list(lgl), + lgl_is_lgl = is_logical(lgl) + ) %>% + collect(), + tbl + ) +}) + +test_that("type checks on expressions", { + compare_dplyr_binding( + .input %>% + transmute( + a = is.character(as.character(int)), + b = is.integer(as.character(int)), + c = is.integer(int + int), + d = is.double(int + dbl), + e = is.logical(dbl > pi) + ) %>% + collect(), + tbl + ) + + # the code in the expectation below depends on RE2 + skip_if_not_available("re2") + + compare_dplyr_binding( + .input %>% + transmute( + a = is.logical(grepl("[def]", chr)) + ) %>% + collect(), + tbl + ) +}) + +test_that("type checks on R scalar literals", { + compare_dplyr_binding( + .input %>% + transmute( + chr_is_chr = is.character("foo"), + int_is_chr = is.character(42L), + int_is_int = is.integer(42L), + chr_is_int = is.integer("foo"), + dbl_is_num = is.numeric(3.14159), + int_is_num = is.numeric(42L), + chr_is_num = is.numeric("foo"), + dbl_is_dbl = is.double(3.14159), + chr_is_dbl = is.double("foo"), + lgl_is_lgl = is.logical(TRUE), + chr_is_lgl = is.logical("foo"), + fct_is_fct = is.factor(factor("foo", levels = c("foo", "bar", "baz"))), + chr_is_fct = is.factor("foo"), + lst_is_lst = is.list(list(c(a = "foo", b = "bar"))), + chr_is_lst = is.list("foo") + ) %>% + collect(), + tbl + ) +}) + +test_that("as.factor()/dictionary_encode()", { + skip("ARROW-12632: ExecuteScalarExpression cannot Execute non-scalar expression") + df1 <- tibble(x = c("C", "D", "B", NA, "D", "B", "S", "A", "B", "Z", "B")) + df2 <- tibble(x = c(5, 5, 5, NA, 2, 3, 6, 8)) + + compare_dplyr_binding( + .input %>% + transmute(x = as.factor(x)) %>% + collect(), + df1 + ) + + expect_warning( + compare_dplyr_binding( + .input %>% + transmute(x = as.factor(x)) %>% + collect(), + df2 + ), + "Coercing dictionary values to R character factor levels" + ) + + # dictionary values with default null encoding behavior ("mask") omits + # nulls from the dictionary values + expect_equal( + object = { + rb1 <- df1 %>% + record_batch() %>% + transmute(x = dictionary_encode(x)) %>% + compute() + dict <- rb1$x$dictionary() + as.vector(dict$Take(dict$SortIndices())) + }, + sort(unique(df1$x), na.last = NA) + ) + + # dictionary values with "encode" null encoding behavior includes nulls in + # the dictionary values + expect_equal( + object = { + rb1 <- df1 %>% + record_batch() %>% + transmute(x = dictionary_encode(x, null_encoding_behavior = "encode")) %>% + compute() + dict <- rb1$x$dictionary() + as.vector(dict$Take(dict$SortIndices())) + }, + sort(unique(df1$x), na.last = TRUE) + ) +}) + +test_that("bad explicit type conversions with as.*()", { + + # Arrow returns lowercase "true", "false" (instead of "TRUE", "FALSE" like R) + expect_error( + compare_dplyr_binding( + .input %>% + transmute(lgl2chr = as.character(lgl)) %>% + collect(), + tibble(lgl = c(TRUE, FALSE, NA)) + ) + ) + + # Arrow fails to parse these strings as numbers (instead of returning NAs with + # a warning like R does) + expect_error( + expect_warning( + compare_dplyr_binding( + .input %>% + transmute(chr2num = as.numeric(chr)) %>% + collect(), + tibble(chr = c("l.O", "S.S", "")) + ) + ) + ) + + # Arrow fails to parse these strings as Booleans (instead of returning NAs + # like R does) + expect_error( + compare_dplyr_binding( + .input %>% + transmute(chr2lgl = as.logical(chr)) %>% + collect(), + tibble(chr = c("TRU", "FAX", "")) + ) + ) +}) |