1 files changed, 1399 insertions, 0 deletions
diff --git a/src/arrow/r/tests/testthat/test-dplyr-funcs-string.R b/src/arrow/r/tests/testthat/test-dplyr-funcs-string.R
new file mode 100644
index 000000000..5e092f4e3
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-funcs-string.R
@@ -0,0 +1,1399 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+skip_if_not_available("utf8proc")
+
+library(dplyr, warn.conflicts = FALSE)
+library(lubridate)
+library(stringr)
+library(stringi)
+
+test_that("paste, paste0, and str_c", {
+  df <- tibble(
+    v = c("A", "B", "C"),
+    w = c("a", "b", "c"),
+    x = c("d", NA_character_, "f"),
+    y = c(NA_character_, "h", "i"),
+    z = c(1.1, 2.2, NA)
+  )
+  x <- Expression$field_ref("x")
+  y <- Expression$field_ref("y")
+
+  # no NAs in data
+  compare_dplyr_binding(
+    .input %>%
+      transmute(paste(v, w)) %>%
+      collect(),
+    df
+  )
+  compare_dplyr_binding(
+    .input %>%
+      transmute(paste(v, w, sep = "-")) %>%
+      collect(),
+    df
+  )
+  compare_dplyr_binding(
+    .input %>%
+      transmute(paste0(v, w)) %>%
+      collect(),
+    df
+  )
+  compare_dplyr_binding(
+    .input %>%
+      transmute(str_c(v, w)) %>%
+      collect(),
+    df
+  )
+  compare_dplyr_binding(
+    .input %>%
+      transmute(str_c(v, w, sep = "+")) %>%
+      collect(),
+    df
+  )
+
+  # NAs in data
+  compare_dplyr_binding(
+    .input %>%
+      transmute(paste(x, y)) %>%
+      collect(),
+    df
+  )
+  compare_dplyr_binding(
+    .input %>%
+      transmute(paste(x, y, sep = "-")) %>%
+      collect(),
+    df
+  )
+  compare_dplyr_binding(
+    .input %>%
+      transmute(str_c(x, y)) %>%
+      collect(),
+    df
+  )
+
+  # non-character column in dots
+  compare_dplyr_binding(
+    .input %>%
+      transmute(paste0(x, y, z)) %>%
+      collect(),
+    df
+  )
+
+  # literal string in dots
+  compare_dplyr_binding(
+    .input %>%
+      transmute(paste(x, "foo", y)) %>%
+      collect(),
+    df
+  )
+
+  # literal NA in dots
+  compare_dplyr_binding(
+    .input %>%
+      transmute(paste(x, NA, y)) %>%
+      collect(),
+    df
+  )
+
+  # expressions in dots
+  compare_dplyr_binding(
+    .input %>%
+      transmute(paste0(x, toupper(y), as.character(z))) %>%
+      collect(),
+    df
+  )
+
+  # sep is literal NA
+  # errors in paste() (consistent with base::paste())
+  expect_error(
+    nse_funcs$paste(x, y, sep = NA_character_),
+    "Invalid separator"
+  )
+  # emits null in str_c() (consistent with stringr::str_c())
+  compare_dplyr_binding(
+    .input %>%
+      transmute(str_c(x, y, sep = NA_character_)) %>%
+      collect(),
+    df
+  )
+
+  # sep passed in dots to paste0 (which doesn't take a sep argument)
+  compare_dplyr_binding(
+    .input %>%
+      transmute(paste0(x, y, sep = "-")) %>%
+      collect(),
+    df
+  )
+
+  # known differences
+
+  # arrow allows the separator to be an array
+  expect_equal(
+    df %>%
+      Table$create() %>%
+      transmute(result = paste(x, y, sep = w)) %>%
+      collect(),
+    df %>%
+      transmute(result = paste(x, w, y, sep = ""))
+  )
+
+  # expected errors
+
+  # collapse argument not supported
+  expect_error(
+    nse_funcs$paste(x, y, collapse = ""),
+    "collapse"
+  )
+  expect_error(
+    nse_funcs$paste0(x, y, collapse = ""),
+    "collapse"
+  )
+  expect_error(
+    nse_funcs$str_c(x, y, collapse = ""),
+    "collapse"
+  )
+
+  # literal vectors of length != 1 not supported
+  expect_error(
+    nse_funcs$paste(x, character(0), y),
+    "Literal vectors of length != 1 not supported in string concatenation"
+  )
+  expect_error(
+    nse_funcs$paste(x, c(",", ";"), y),
+    "Literal vectors of length != 1 not supported in string concatenation"
+  )
+})
+
+test_that("grepl with ignore.case = FALSE and fixed = TRUE", {
+  df <- tibble(x = c("Foo", "bar"))
+  compare_dplyr_binding(
+    .input %>%
+      filter(grepl("o", x, fixed = TRUE)) %>%
+      collect(),
+    df
+  )
+})
+
+test_that("sub and gsub with ignore.case = FALSE and fixed = TRUE", {
+  df <- tibble(x = c("Foo", "bar"))
+  compare_dplyr_binding(
+    .input %>%
+      transmute(x = sub("Foo", "baz", x, fixed = TRUE)) %>%
+      collect(),
+    df
+  )
+  compare_dplyr_binding(
+    .input %>%
+      transmute(x = gsub("o", "u", x, fixed = TRUE)) %>%
+      collect(),
+    df
+  )
+})
+
+# many of the remainder of these tests require RE2
+skip_if_not_available("re2")
+
+test_that("grepl", {
+  df <- tibble(x = c("Foo", "bar"))
+
+  for (fixed in c(TRUE, FALSE)) {
+    compare_dplyr_binding(
+      .input %>%
+        filter(grepl("Foo", x, fixed = fixed)) %>%
+        collect(),
+      df
+    )
+    compare_dplyr_binding(
+      .input %>%
+        transmute(x = grepl("^B.+", x, ignore.case = FALSE, fixed = fixed)) %>%
+        collect(),
+      df
+    )
+    compare_dplyr_binding(
+      .input %>%
+        filter(grepl("Foo", x, ignore.case = FALSE, fixed = fixed)) %>%
+        collect(),
+      df
+    )
+  }
+})
+
+test_that("grepl with ignore.case = TRUE and fixed = TRUE", {
+  df <- tibble(x = c("Foo", "bar"))
+
+  # base::grepl() ignores ignore.case = TRUE with a warning when fixed = TRUE,
+  # so we can't use compare_dplyr_binding() for these tests
+  expect_equal(
+    df %>%
+      Table$create() %>%
+      filter(grepl("O", x, ignore.case = TRUE, fixed = TRUE)) %>%
+      collect(),
+    tibble(x = "Foo")
+  )
+  expect_equal(
+    df %>%
+      Table$create() %>%
+      filter(x = grepl("^B.+", x, ignore.case = TRUE, fixed = TRUE)) %>%
+      collect(),
+    tibble(x = character(0))
+  )
+})
+
+test_that("str_detect", {
+  df <- tibble(x = c("Foo", "bar"))
+
+  compare_dplyr_binding(
+    .input %>%
+      filter(str_detect(x, regex("^F"))) %>%
+      collect(),
+    df
+  )
+  compare_dplyr_binding(
+    .input %>%
+      transmute(x = str_detect(x, regex("^f[A-Z]{2}", ignore_case = TRUE))) %>%
+      collect(),
+    df
+  )
+  compare_dplyr_binding(
+    .input %>%
+      transmute(x = str_detect(x, regex("^f[A-Z]{2}", ignore_case = TRUE), negate = TRUE)) %>%
+      collect(),
+    df
+  )
+  compare_dplyr_binding(
+    .input %>%
+      filter(str_detect(x, fixed("o"))) %>%
+      collect(),
+    df
+  )
+  compare_dplyr_binding(
+    .input %>%
+      filter(str_detect(x, fixed("O"))) %>%
+      collect(),
+    df
+  )
+  compare_dplyr_binding(
+    .input %>%
+      filter(str_detect(x, fixed("O", ignore_case = TRUE))) %>%
+      collect(),
+    df
+  )
+  compare_dplyr_binding(
+    .input %>%
+      filter(str_detect(x, fixed("O", ignore_case = TRUE), negate = TRUE)) %>%
+      collect(),
+    df
+  )
+})
+
+test_that("sub and gsub", {
+  df <- tibble(x = c("Foo", "bar"))
+
+  for (fixed in c(TRUE, FALSE)) {
+    compare_dplyr_binding(
+      .input %>%
+        transmute(x = sub("Foo", "baz", x, fixed = fixed)) %>%
+        collect(),
+      df
+    )
+    compare_dplyr_binding(
+      .input %>%
+        transmute(x = sub("^B.+", "baz", x, ignore.case = FALSE, fixed = fixed)) %>%
+        collect(),
+      df
+    )
+    compare_dplyr_binding(
+      .input %>%
+        transmute(x = sub("Foo", "baz", x, ignore.case = FALSE, fixed = fixed)) %>%
+        collect(),
+      df
+    )
+  }
+})
+
+test_that("sub and gsub with ignore.case = TRUE and fixed = TRUE", {
+  df <- tibble(x = c("Foo", "bar"))
+
+  # base::sub() and base::gsub() ignore ignore.case = TRUE with a warning when
+  # fixed = TRUE, so we can't use compare_dplyr_binding() for these tests
+  expect_equal(
+    df %>%
+      Table$create() %>%
+      transmute(x = sub("O", "u", x, ignore.case = TRUE, fixed = TRUE)) %>%
+      collect(),
+    tibble(x = c("Fuo", "bar"))
+  )
+  expect_equal(
+    df %>%
+      Table$create() %>%
+      transmute(x = gsub("o", "u", x, ignore.case = TRUE, fixed = TRUE)) %>%
+      collect(),
+    tibble(x = c("Fuu", "bar"))
+  )
+  expect_equal(
+    df %>%
+      Table$create() %>%
+      transmute(x = sub("^B.+", "baz", x, ignore.case = TRUE, fixed = TRUE)) %>%
+      collect(),
+    df # unchanged
+  )
+})
+
+test_that("str_replace and str_replace_all", {
+  df <- tibble(x = c("Foo", "bar"))
+
+  compare_dplyr_binding(
+    .input %>%
+      transmute(x = str_replace_all(x, "^F", "baz")) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      transmute(x = str_replace_all(x, regex("^F"), "baz")) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = str_replace(x, "^F[a-z]{2}", "baz")) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      transmute(x = str_replace(x, regex("^f[A-Z]{2}", ignore_case = TRUE), "baz")) %>%
+      collect(),
+    df
+  )
+  compare_dplyr_binding(
+    .input %>%
+      transmute(x = str_replace_all(x, fixed("o"), "u")) %>%
+      collect(),
+    df
+  )
+  compare_dplyr_binding(
+    .input %>%
+      transmute(x = str_replace(x, fixed("O"), "u")) %>%
+      collect(),
+    df
+  )
+  compare_dplyr_binding(
+    .input %>%
+      transmute(x = str_replace(x, fixed("O", ignore_case = TRUE), "u")) %>%
+      collect(),
+    df
+  )
+})
+
+test_that("strsplit and str_split", {
+  df <- tibble(x = c("Foo and bar", "baz and qux and quux"))
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = strsplit(x, "and")) %>%
+      collect(),
+    df,
+    # `ignore_attr = TRUE` because the vctr coming back from arrow (ListArray)
+    # has type information in it, but it's just a bare list from R/dplyr.
+    ignore_attr = TRUE
+  )
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>%
+      collect(),
+    df,
+    ignore_attr = TRUE
+  )
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = strsplit(x, " +and +")) %>%
+      collect(),
+    df,
+    ignore_attr = TRUE
+  )
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = str_split(x, "and")) %>%
+      collect(),
+    df,
+    ignore_attr = TRUE
+  )
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = str_split(x, "and", n = 2)) %>%
+      collect(),
+    df,
+    ignore_attr = TRUE
+  )
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = str_split(x, fixed("and"), n = 2)) %>%
+      collect(),
+    df,
+    ignore_attr = TRUE
+  )
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = str_split(x, regex("and"), n = 2)) %>%
+      collect(),
+    df,
+    ignore_attr = TRUE
+  )
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = str_split(x, "Foo|bar", n = 2)) %>%
+      collect(),
+    df,
+    ignore_attr = TRUE
+  )
+})
+
+test_that("str_to_lower, str_to_upper, and str_to_title", {
+  df <- tibble(x = c("foo1", " \tB a R\n", "!apACHe aRroW!"))
+  compare_dplyr_binding(
+    .input %>%
+      transmute(
+        x_lower = str_to_lower(x),
+        x_upper = str_to_upper(x),
+        x_title = str_to_title(x)
+      ) %>%
+      collect(),
+    df
+  )
+
+  # Error checking a single function because they all use the same code path.
+  expect_error(
+    nse_funcs$str_to_lower("Apache Arrow", locale = "sp"),
+    "Providing a value for 'locale' other than the default ('en') is not supported by Arrow",
+    fixed = TRUE
+  )
+})
+
+test_that("arrow_*_split_whitespace functions", {
+  # use only ASCII whitespace characters
+  df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux"))
+
+  # use only non-ASCII whitespace characters
+  df_utf8 <- tibble(x = c("Foo\u00A0and\u2000bar", "baz\u2006and\u1680qux\u3000and\u2008quux"))
+
+  df_split <- tibble(x = list(c("Foo", "and", "bar"), c("baz", "and", "qux", "and", "quux")))
+
+  # use default option values
+  expect_equal(
+    df_ascii %>%
+      Table$create() %>%
+      mutate(x = arrow_ascii_split_whitespace(x)) %>%
+      collect(),
+    df_split,
+    ignore_attr = TRUE
+  )
+  expect_equal(
+    df_utf8 %>%
+      Table$create() %>%
+      mutate(x = arrow_utf8_split_whitespace(x)) %>%
+      collect(),
+    df_split,
+    ignore_attr = TRUE
+  )
+
+  # specify non-default option values
+  expect_equal(
+    df_ascii %>%
+      Table$create() %>%
+      mutate(
+        x = arrow_ascii_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE))
+      ) %>%
+      collect(),
+    tibble(x = list(c("Foo\nand", "bar"), c("baz\tand qux and", "quux"))),
+    ignore_attr = TRUE
+  )
+  expect_equal(
+    df_utf8 %>%
+      Table$create() %>%
+      mutate(
+        x = arrow_utf8_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE))
+      ) %>%
+      collect(),
+    tibble(x = list(c("Foo\u00A0and", "bar"), c("baz\u2006and\u1680qux\u3000and", "quux"))),
+    ignore_attr = TRUE
+  )
+})
+
+test_that("errors and warnings in string splitting", {
+  # These conditions generate an error, but abandon_ship() catches the error,
+  # issues a warning, and pulls the data into R (if computing on InMemoryDataset)
+  # Elsewhere we test that abandon_ship() works,
+  # so here we can just call the functions directly
+
+  x <- Expression$field_ref("x")
+  expect_error(
+    nse_funcs$str_split(x, fixed("and", ignore_case = TRUE)),
+    "Case-insensitive string splitting not supported by Arrow"
+  )
+  expect_error(
+    nse_funcs$str_split(x, coll("and.?")),
+    "Pattern modifier `coll()` not supported by Arrow",
+    fixed = TRUE
+  )
+  expect_error(
+    nse_funcs$str_split(x, boundary(type = "word")),
+    "Pattern modifier `boundary()` not supported by Arrow",
+    fixed = TRUE
+  )
+  expect_error(
+    nse_funcs$str_split(x, "and", n = 0),
+    "Splitting strings into zero parts not supported by Arrow"
+  )
+
+  # This condition generates a warning
+  expect_warning(
+    nse_funcs$str_split(x, fixed("and"), simplify = TRUE),
+    "Argument 'simplify = TRUE' will be ignored"
+  )
+})
+
+test_that("errors and warnings in string detection and replacement", {
+  x <- Expression$field_ref("x")
+
+  expect_error(
+    nse_funcs$str_detect(x, boundary(type = "character")),
+    "Pattern modifier `boundary()` not supported by Arrow",
+    fixed = TRUE
+  )
+  expect_error(
+    nse_funcs$str_replace_all(x, coll("o", locale = "en"), "ó"),
+    "Pattern modifier `coll()` not supported by Arrow",
+    fixed = TRUE
+  )
+
+  # This condition generates a warning
+  expect_warning(
+    nse_funcs$str_replace_all(x, regex("o", multiline = TRUE), "u"),
+    "Ignoring pattern modifier argument not supported in Arrow: \"multiline\""
+  )
+})
+
+test_that("backreferences in pattern in string detection", {
+  skip("RE2 does not support backreferences in pattern (https://github.com/google/re2/issues/101)")
+  df <- tibble(x = c("Foo", "bar"))
+
+  compare_dplyr_binding(
+    .input %>%
+      filter(str_detect(x, regex("F([aeiou])\\1"))) %>%
+      collect(),
+    df
+  )
+})
+
+test_that("backreferences (substitutions) in string replacement", {
+  df <- tibble(x = c("Foo", "bar"))
+
+  compare_dplyr_binding(
+    .input %>%
+      transmute(desc = sub(
+        "(?:https?|ftp)://([^/\r\n]+)(/[^\r\n]*)?",
+        "path `\\2` on server `\\1`",
+        url
+      )) %>%
+      collect(),
+    tibble(url = "https://arrow.apache.org/docs/r/")
+  )
+  compare_dplyr_binding(
+    .input %>%
+      transmute(x = str_replace(x, "^(\\w)o(.*)", "\\1\\2p")) %>%
+      collect(),
+    df
+  )
+  compare_dplyr_binding(
+    .input %>%
+      transmute(x = str_replace(x, regex("^(\\w)o(.*)", ignore_case = TRUE), "\\1\\2p")) %>%
+      collect(),
+    df
+  )
+  compare_dplyr_binding(
+    .input %>%
+      transmute(x = str_replace(x, regex("^(\\w)o(.*)", ignore_case = TRUE), "\\1\\2p")) %>%
+      collect(),
+    df
+  )
+})
+
+test_that("edge cases in string detection and replacement", {
+  # in case-insensitive fixed match/replace, test that "\\E" in the search
+  # string and backslashes in the replacement string are interpreted literally.
+  # this test does not use compare_dplyr_binding() because base::sub() and
+  # base::grepl() do not support ignore.case = TRUE when fixed = TRUE.
+  expect_equal(
+    tibble(x = c("\\Q\\e\\D")) %>%
+      Table$create() %>%
+      filter(grepl("\\E", x, ignore.case = TRUE, fixed = TRUE)) %>%
+      collect(),
+    tibble(x = c("\\Q\\e\\D"))
+  )
+  expect_equal(
+    tibble(x = c("\\Q\\e\\D")) %>%
+      Table$create() %>%
+      transmute(x = sub("\\E", "\\L", x, ignore.case = TRUE, fixed = TRUE)) %>%
+      collect(),
+    tibble(x = c("\\Q\\L\\D"))
+  )
+
+  # test that a user's "(?i)" prefix does not break the "(?i)" prefix that's
+  # added in case-insensitive regex match/replace
+  compare_dplyr_binding(
+    .input %>%
+      filter(grepl("(?i)^[abc]{3}$", x, ignore.case = TRUE, fixed = FALSE)) %>%
+      collect(),
+    tibble(x = c("ABC"))
+  )
+  compare_dplyr_binding(
+    .input %>%
+      transmute(x = sub("(?i)^[abc]{3}$", "123", x, ignore.case = TRUE, fixed = FALSE)) %>%
+      collect(),
+    tibble(x = c("ABC"))
+  )
+})
+
+test_that("strptime", {
+  # base::strptime() defaults to local timezone
+  # but arrow's strptime defaults to UTC.
+  # So that tests are consistent, set the local timezone to UTC
+  # TODO: consider reevaluating this workaround after ARROW-12980
+  withr::local_timezone("UTC")
+
+  t_string <- tibble(x = c("2018-10-07 19:04:05", NA))
+  t_stamp <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA))
+
+  expect_equal(
+    t_string %>%
+      Table$create() %>%
+      mutate(
+        x = strptime(x)
+      ) %>%
+      collect(),
+    t_stamp,
+    ignore_attr = "tzone"
+  )
+
+  expect_equal(
+    t_string %>%
+      Table$create() %>%
+      mutate(
+        x = strptime(x, format = "%Y-%m-%d %H:%M:%S")
+      ) %>%
+      collect(),
+    t_stamp,
+    ignore_attr = "tzone"
+  )
+
+  expect_equal(
+    t_string %>%
+      Table$create() %>%
+      mutate(
+        x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "ns")
+      ) %>%
+      collect(),
+    t_stamp,
+    ignore_attr = "tzone"
+  )
+
+  expect_equal(
+    t_string %>%
+      Table$create() %>%
+      mutate(
+        x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "s")
+      ) %>%
+      collect(),
+    t_stamp,
+    ignore_attr = "tzone"
+  )
+
+  tstring <- tibble(x = c("08-05-2008", NA))
+  tstamp <- strptime(c("08-05-2008", NA), format = "%m-%d-%Y")
+
+  expect_equal(
+    tstring %>%
+      Table$create() %>%
+      mutate(
+        x = strptime(x, format = "%m-%d-%Y")
+      ) %>%
+      pull(),
+    # R's strptime returns POSIXlt (list type)
+    as.POSIXct(tstamp),
+    ignore_attr = "tzone"
+  )
+})
+
+test_that("errors in strptime", {
+  # Error when tz is passed
+  x <- Expression$field_ref("x")
+  expect_error(
+    nse_funcs$strptime(x, tz = "PDT"),
+    "Time zone argument not supported by Arrow"
+  )
+})
+
+test_that("strftime", {
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  times <- tibble(
+    datetime = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA),
+    date = c(as.Date("2021-01-01"), NA)
+  )
+  formats <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %z %Z %j %U %W %x %X %% %G %V %u"
+  formats_date <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %j %U %W %x %X %% %G %V %u"
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = strftime(datetime, format = formats)) %>%
+      collect(),
+    times
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = strftime(date, format = formats_date)) %>%
+      collect(),
+    times
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = strftime(datetime, format = formats, tz = "Pacific/Marquesas")) %>%
+      collect(),
+    times
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = strftime(datetime, format = formats, tz = "EST", usetz = TRUE)) %>%
+      collect(),
+    times
+  )
+
+  withr::with_timezone(
+    "Pacific/Marquesas",
+    {
+      compare_dplyr_binding(
+        .input %>%
+          mutate(
+            x = strftime(datetime, format = formats, tz = "EST"),
+            x_date = strftime(date, format = formats_date, tz = "EST")
+          ) %>%
+          collect(),
+        times
+      )
+
+      compare_dplyr_binding(
+        .input %>%
+          mutate(
+            x = strftime(datetime, format = formats),
+            x_date = strftime(date, format = formats_date)
+          ) %>%
+          collect(),
+        times
+      )
+    }
+  )
+
+  # This check is due to differences in the way %c currently works in Arrow and R's strftime.
+  # We can revisit after https://github.com/HowardHinnant/date/issues/704 is resolved.
+  expect_error(
+    times %>%
+      Table$create() %>%
+      mutate(x = strftime(datetime, format = "%c")) %>%
+      collect(),
+    "%c flag is not supported in non-C locales."
+  )
+
+  # Output precision of %S depends on the input timestamp precision.
+  # Timestamps with second precision are represented as integers while
+  # milliseconds, microsecond and nanoseconds are represented as fixed floating
+  # point numbers with 3, 6 and 9 decimal places respectively.
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = strftime(datetime, format = "%S")) %>%
+      transmute(as.double(substr(x, 1, 2))) %>%
+      collect(),
+    times,
+    tolerance = 1e-6
+  )
+})
+
+test_that("format_ISO8601", {
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+  times <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA))
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = format_ISO8601(x, precision = "ymd", usetz = FALSE)) %>%
+      collect(),
+    times
+  )
+
+  if (getRversion() < "3.5") {
+    # before 3.5, times$x will have no timezone attribute, so Arrow faithfully
+    # errors that there is no timezone to format:
+    expect_error(
+      times %>%
+        Table$create() %>%
+        mutate(x = format_ISO8601(x, precision = "ymd", usetz = TRUE)) %>%
+        collect(),
+      "Timezone not present, cannot convert to string with timezone: %Y-%m-%d%z"
+    )
+
+    # See comment regarding %S flag in strftime tests
+    expect_error(
+      times %>%
+        Table$create() %>%
+        mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = TRUE)) %>%
+        mutate(x = gsub("\\.0*", "", x)) %>%
+        collect(),
+      "Timezone not present, cannot convert to string with timezone: %Y-%m-%dT%H:%M:%S%z"
+    )
+  } else {
+    compare_dplyr_binding(
+      .input %>%
+        mutate(x = format_ISO8601(x, precision = "ymd", usetz = TRUE)) %>%
+        collect(),
+      times
+    )
+
+    # See comment regarding %S flag in strftime tests
+    compare_dplyr_binding(
+      .input %>%
+        mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = TRUE)) %>%
+        mutate(x = gsub("\\.0*", "", x)) %>%
+        collect(),
+      times
+    )
+  }
+
+
+  # See comment regarding %S flag in strftime tests
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = FALSE)) %>%
+      mutate(x = gsub("\\.0*", "", x)) %>%
+      collect(),
+    times
+  )
+})
+
+test_that("arrow_find_substring and arrow_find_substring_regex", {
+  df <- tibble(x = c("Foo and Bar", "baz and qux and quux"))
+
+  expect_equal(
+    df %>%
+      Table$create() %>%
+      mutate(x = arrow_find_substring(x, options = list(pattern = "b"))) %>%
+      collect(),
+    tibble(x = c(-1, 0))
+  )
+  expect_equal(
+    df %>%
+      Table$create() %>%
+      mutate(x = arrow_find_substring(
+        x,
+        options = list(pattern = "b", ignore_case = TRUE)
+      )) %>%
+      collect(),
+    tibble(x = c(8, 0))
+  )
+  expect_equal(
+    df %>%
+      Table$create() %>%
+      mutate(x = arrow_find_substring_regex(
+        x,
+        options = list(pattern = "^[fb]")
+      )) %>%
+      collect(),
+    tibble(x = c(-1, 0))
+  )
+  expect_equal(
+    df %>%
+      Table$create() %>%
+      mutate(x = arrow_find_substring_regex(
+        x,
+        options = list(pattern = "[AEIOU]", ignore_case = TRUE)
+      )) %>%
+      collect(),
+    tibble(x = c(1, 1))
+  )
+})
+
+test_that("stri_reverse and arrow_ascii_reverse functions", {
+  df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux"))
+
+  df_utf8 <- tibble(x = c("Foo\u00A0\u0061nd\u00A0bar", "\u0062az\u00A0and\u00A0qux\u3000and\u00A0quux"))
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = stri_reverse(x)) %>%
+      collect(),
+    df_utf8
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = stri_reverse(x)) %>%
+      collect(),
+    df_ascii
+  )
+
+  expect_equal(
+    df_ascii %>%
+      Table$create() %>%
+      mutate(x = arrow_ascii_reverse(x)) %>%
+      collect(),
+    tibble(x = c("rab dna\nooF", "xuuq dna xuq dna\tzab"))
+  )
+
+  expect_error(
+    df_utf8 %>%
+      Table$create() %>%
+      mutate(x = arrow_ascii_reverse(x)) %>%
+      collect(),
+    "Invalid: Non-ASCII sequence in input"
+  )
+})
+
+test_that("str_like", {
+  df <- tibble(x = c("Foo and bar", "baz and qux and quux"))
+
+  # TODO: After new version of stringr with str_like has been released, update all
+  # these tests to use compare_dplyr_binding
+
+  # No match - entire string
+  expect_equal(
+    df %>%
+      Table$create() %>%
+      mutate(x = str_like(x, "baz")) %>%
+      collect(),
+    tibble(x = c(FALSE, FALSE))
+  )
+
+  # Match - entire string
+  expect_equal(
+    df %>%
+      Table$create() %>%
+      mutate(x = str_like(x, "Foo and bar")) %>%
+      collect(),
+    tibble(x = c(TRUE, FALSE))
+  )
+
+  # Wildcard
+  expect_equal(
+    df %>%
+      Table$create() %>%
+      mutate(x = str_like(x, "f%", ignore_case = TRUE)) %>%
+      collect(),
+    tibble(x = c(TRUE, FALSE))
+  )
+
+  # Ignore case
+  expect_equal(
+    df %>%
+      Table$create() %>%
+      mutate(x = str_like(x, "f%", ignore_case = FALSE)) %>%
+      collect(),
+    tibble(x = c(FALSE, FALSE))
+  )
+
+  # Single character
+  expect_equal(
+    df %>%
+      Table$create() %>%
+      mutate(x = str_like(x, "_a%")) %>%
+      collect(),
+    tibble(x = c(FALSE, TRUE))
+  )
+
+  # This will give an error until a new version of stringr with str_like has been released
+  skip_if_not(packageVersion("stringr") > "1.4.0")
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = str_like(x, "%baz%")) %>%
+      collect(),
+    df
+  )
+})
+
+test_that("str_pad", {
+  df <- tibble(x = c("Foo and bar", "baz and qux and quux"))
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = str_pad(x, width = 31)) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = str_pad(x, width = 30, side = "right")) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = str_pad(x, width = 31, side = "left", pad = "+")) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = str_pad(x, width = 10, side = "left", pad = "+")) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(x = str_pad(x, width = 31, side = "both")) %>%
+      collect(),
+    df
+  )
+})
+
+test_that("substr", {
+  df <- tibble(x = "Apache Arrow")
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(y = substr(x, 1, 6)) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(y = substr(x, 0, 6)) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(y = substr(x, -1, 6)) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(y = substr(x, 6, 1)) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(y = substr(x, -1, -2)) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(y = substr(x, 9, 6)) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(y = substr(x, 1, 6)) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(y = substr(x, 8, 12)) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(y = substr(x, -5, -1)) %>%
+      collect(),
+    df
+  )
+
+  expect_error(
+    nse_funcs$substr("Apache Arrow", c(1, 2), 3),
+    "`start` must be length 1 - other lengths are not supported in Arrow"
+  )
+
+  expect_error(
+    nse_funcs$substr("Apache Arrow", 1, c(2, 3)),
+    "`stop` must be length 1 - other lengths are not supported in Arrow"
+  )
+})
+
+test_that("substring", {
+  # nse_funcs$substring just calls nse_funcs$substr, tested extensively above
+  df <- tibble(x = "Apache Arrow")
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(y = substring(x, 1, 6)) %>%
+      collect(),
+    df
+  )
+})
+
+test_that("str_sub", {
+  df <- tibble(x = "Apache Arrow")
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(y = str_sub(x, 1, 6)) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(y = str_sub(x, 0, 6)) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(y = str_sub(x, -1, 6)) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(y = str_sub(x, 6, 1)) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(y = str_sub(x, -1, -2)) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(y = str_sub(x, -1, 3)) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(y = str_sub(x, 9, 6)) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(y = str_sub(x, 1, 6)) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(y = str_sub(x, 8, 12)) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(y = str_sub(x, -5, -1)) %>%
+      collect(),
+    df
+  )
+
+  expect_error(
+    nse_funcs$str_sub("Apache Arrow", c(1, 2), 3),
+    "`start` must be length 1 - other lengths are not supported in Arrow"
+  )
+
+  expect_error(
+    nse_funcs$str_sub("Apache Arrow", 1, c(2, 3)),
+    "`end` must be length 1 - other lengths are not supported in Arrow"
+  )
+})
+
+test_that("str_starts, str_ends, startsWith, endsWith", {
+  df <- tibble(x = c("Foo", "bar", "baz", "qux"))
+
+  compare_dplyr_binding(
+    .input %>%
+      filter(str_starts(x, "b.*")) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      filter(str_starts(x, "b.*", negate = TRUE)) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      filter(str_starts(x, fixed("b.*"))) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      filter(str_starts(x, fixed("b"))) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      filter(str_ends(x, "r")) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      filter(str_ends(x, "r", negate = TRUE)) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      filter(str_ends(x, fixed("r$"))) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      filter(str_ends(x, fixed("r"))) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      filter(startsWith(x, "b")) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      filter(endsWith(x, "r")) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      filter(startsWith(x, "b.*")) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      filter(endsWith(x, "r$")) %>%
+      collect(),
+    df
+  )
+})
+
+test_that("str_count", {
+  df <- tibble(
+    cities = c("Kolkata", "Dar es Salaam", "Tel Aviv", "San Antonio", "Cluj Napoca", "Bern", "Bogota"),
+    dots = c("a.", "...", ".a.a", "a..a.", "ab...", "dse....", ".f..d..")
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(a_count = str_count(cities, pattern = "a")) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(p_count = str_count(cities, pattern = "d")) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(p_count = str_count(cities,
+        pattern = regex("d", ignore_case = TRUE)
+      )) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(e_count = str_count(cities, pattern = "u")) %>%
+      collect(),
+    df
+  )
+
+  # nse_funcs$str_count() is not vectorised over pattern
+  compare_dplyr_binding(
+    .input %>%
+      mutate(let_count = str_count(cities, pattern = c("a", "b", "e", "g", "p", "n", "s"))) %>%
+      collect(),
+    df,
+    warning = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(dots_count = str_count(dots, ".")) %>%
+      collect(),
+    df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(dots_count = str_count(dots, fixed("."))) %>%
+      collect(),
+    df
+  )
+})