summaryrefslogtreecommitdiffstats
path: root/src/arrow/r/R/arrow-package.R
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/arrow/r/R/arrow-package.R351
1 files changed, 351 insertions, 0 deletions
diff --git a/src/arrow/r/R/arrow-package.R b/src/arrow/r/R/arrow-package.R
new file mode 100644
index 000000000..edc2652b6
--- /dev/null
+++ b/src/arrow/r/R/arrow-package.R
@@ -0,0 +1,351 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @importFrom stats quantile median na.omit na.exclude na.pass na.fail
+#' @importFrom R6 R6Class
+#' @importFrom purrr as_mapper map map2 map_chr map2_chr map_dfr map_int map_lgl keep imap imap_chr
+#' @importFrom assertthat assert_that is.string
+#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos
+#' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec
+#' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive
+#' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure
+#' @importFrom tidyselect vars_pull vars_rename vars_select eval_select
+#' @useDynLib arrow, .registration = TRUE
+#' @keywords internal
+"_PACKAGE"
+
+#' @importFrom vctrs s3_register vec_size vec_cast vec_unique
+.onLoad <- function(...) {
+ dplyr_methods <- paste0(
+ "dplyr::",
+ c(
+ "select", "filter", "collect", "summarise", "group_by", "groups",
+ "group_vars", "group_by_drop_default", "ungroup", "mutate", "transmute",
+ "arrange", "rename", "pull", "relocate", "compute", "collapse",
+ "distinct", "left_join", "right_join", "inner_join", "full_join",
+ "semi_join", "anti_join", "count", "tally"
+ )
+ )
+ for (cl in c("Dataset", "ArrowTabular", "arrow_dplyr_query")) {
+ for (m in dplyr_methods) {
+ s3_register(m, cl)
+ }
+ }
+ s3_register("dplyr::tbl_vars", "arrow_dplyr_query")
+
+ for (cl in c(
+ "Array", "RecordBatch", "ChunkedArray", "Table", "Schema",
+ "Field", "DataType", "RecordBatchReader"
+ )) {
+ s3_register("reticulate::py_to_r", paste0("pyarrow.lib.", cl))
+ s3_register("reticulate::r_to_py", cl)
+ }
+
+ # Create these once, at package build time
+ if (arrow_available()) {
+ # Also include all available Arrow Compute functions,
+ # namespaced as arrow_fun.
+ # We can't do this at install time because list_compute_functions() may error
+ all_arrow_funs <- list_compute_functions()
+ arrow_funcs <- set_names(
+ lapply(all_arrow_funs, function(fun) {
+ force(fun)
+ function(...) build_expr(fun, ...)
+ }),
+ paste0("arrow_", all_arrow_funs)
+ )
+ .cache$functions <- c(nse_funcs, arrow_funcs)
+ }
+
+ if (tolower(Sys.info()[["sysname"]]) == "windows") {
+ # Disable multithreading on Windows
+ # See https://issues.apache.org/jira/browse/ARROW-8379
+ options(arrow.use_threads = FALSE)
+ }
+
+ invisible()
+}
+
+.onAttach <- function(libname, pkgname) {
+ if (!arrow_available()) {
+ msg <- paste(
+ "The Arrow C++ library is not available. To retry installation with debug output, run:",
+ " install_arrow(verbose = TRUE)",
+ "See https://arrow.apache.org/docs/r/articles/install.html for more guidance and troubleshooting.",
+ sep = "\n"
+ )
+ packageStartupMessage(msg)
+ } else {
+ # Just to be extra safe, let's wrap this in a try();
+ # we don't a failed startup message to prevent the package from loading
+ try({
+ features <- arrow_info()$capabilities
+ # That has all of the #ifdef features, plus the compression libs and the
+ # string libraries (but not the memory allocators, they're added elsewhere)
+ #
+ # Let's print a message if some are off
+ if (some_features_are_off(features)) {
+ packageStartupMessage("See arrow_info() for available features")
+ }
+ })
+ }
+}
+
+#' Is the C++ Arrow library available?
+#'
+#' You won't generally need to call these function, but they're made available
+#' for diagnostic purposes.
+#' @return `TRUE` or `FALSE` depending on whether the package was installed
+#' with:
+#' * The Arrow C++ library (check with `arrow_available()`)
+#' * Arrow Dataset support enabled (check with `arrow_with_dataset()`)
+#' * Parquet support enabled (check with `arrow_with_parquet()`)
+#' * JSON support enabled (check with `arrow_with_json()`)
+#' * Amazon S3 support enabled (check with `arrow_with_s3()`)
+#' @export
+#' @examples
+#' arrow_available()
+#' arrow_with_dataset()
+#' arrow_with_parquet()
+#' arrow_with_json()
+#' arrow_with_s3()
+#' @seealso If any of these are `FALSE`, see
+#' `vignette("install", package = "arrow")` for guidance on reinstalling the
+#' package.
+arrow_available <- function() {
+ tryCatch(.Call(`_arrow_available`), error = function(e) {
+ return(FALSE)
+ })
+}
+
+#' @rdname arrow_available
+#' @export
+arrow_with_dataset <- function() {
+ is_32bit <- .Machine$sizeof.pointer < 8
+ is_old_r <- getRversion() < "4.0.0"
+ is_windows <- tolower(Sys.info()[["sysname"]]) == "windows"
+ if (is_32bit && is_old_r && is_windows) {
+ # 32-bit rtools 3.5 does not properly implement the std::thread expectations
+ # but we can't just disable ARROW_DATASET in that build,
+ # so report it as "off" here.
+ return(FALSE)
+ }
+ tryCatch(.Call(`_dataset_available`), error = function(e) {
+ return(FALSE)
+ })
+}
+
+#' @rdname arrow_available
+#' @export
+arrow_with_parquet <- function() {
+ tryCatch(.Call(`_parquet_available`), error = function(e) {
+ return(FALSE)
+ })
+}
+
+#' @rdname arrow_available
+#' @export
+arrow_with_s3 <- function() {
+ tryCatch(.Call(`_s3_available`), error = function(e) {
+ return(FALSE)
+ })
+}
+
+#' @rdname arrow_available
+#' @export
+arrow_with_json <- function() {
+ tryCatch(.Call(`_json_available`), error = function(e) {
+ return(FALSE)
+ })
+}
+
+option_use_threads <- function() {
+ !is_false(getOption("arrow.use_threads"))
+}
+
+#' Report information on the package's capabilities
+#'
+#' This function summarizes a number of build-time configurations and run-time
+#' settings for the Arrow package. It may be useful for diagnostics.
+#' @return A list including version information, boolean "capabilities", and
+#' statistics from Arrow's memory allocator, and also Arrow's run-time
+#' information.
+#' @export
+#' @importFrom utils packageVersion
+arrow_info <- function() {
+ opts <- options()
+ out <- list(
+ version = packageVersion("arrow"),
+ libarrow = arrow_available(),
+ options = opts[grep("^arrow\\.", names(opts))]
+ )
+ if (out$libarrow) {
+ pool <- default_memory_pool()
+ runtimeinfo <- runtime_info()
+ buildinfo <- build_info()
+ compute_funcs <- list_compute_functions()
+ out <- c(out, list(
+ capabilities = c(
+ dataset = arrow_with_dataset(),
+ parquet = arrow_with_parquet(),
+ json = arrow_with_json(),
+ s3 = arrow_with_s3(),
+ utf8proc = "utf8_upper" %in% compute_funcs,
+ re2 = "replace_substring_regex" %in% compute_funcs,
+ vapply(tolower(names(CompressionType)[-1]), codec_is_available, logical(1))
+ ),
+ memory_pool = list(
+ backend_name = pool$backend_name,
+ bytes_allocated = pool$bytes_allocated,
+ max_memory = pool$max_memory,
+ available_backends = supported_memory_backends()
+ ),
+ runtime_info = list(
+ simd_level = runtimeinfo[1],
+ detected_simd_level = runtimeinfo[2]
+ ),
+ build_info = list(
+ cpp_version = buildinfo[1],
+ cpp_compiler = buildinfo[2],
+ cpp_compiler_version = buildinfo[3],
+ cpp_compiler_flags = buildinfo[4],
+ # git_id is "" if not built from a git checkout
+ # convert that to NULL
+ git_id = if (nzchar(buildinfo[5])) buildinfo[5]
+ )
+ ))
+ }
+ structure(out, class = "arrow_info")
+}
+
+some_features_are_off <- function(features) {
+ # `features` is a named logical vector (as in arrow_info()$capabilities)
+ # Let's exclude some less relevant ones
+ blocklist <- c("lzo", "bz2", "brotli")
+ # Return TRUE if any of the other features are FALSE
+ !all(features[setdiff(names(features), blocklist)])
+}
+
+#' @export
+print.arrow_info <- function(x, ...) {
+ print_key_values <- function(title, vals, ...) {
+ # Make a key-value table for printing, no column names
+ df <- data.frame(vals, stringsAsFactors = FALSE, ...)
+ names(df) <- ""
+
+ cat(title, ":\n", sep = "")
+ print(df)
+ cat("\n")
+ }
+ cat("Arrow package version: ", format(x$version), "\n\n", sep = "")
+ if (x$libarrow) {
+ print_key_values("Capabilities", c(
+ x$capabilities,
+ jemalloc = "jemalloc" %in% x$memory_pool$available_backends,
+ mimalloc = "mimalloc" %in% x$memory_pool$available_backends
+ ))
+ if (some_features_are_off(x$capabilities) && identical(tolower(Sys.info()[["sysname"]]), "linux")) {
+ # Only on linux because (e.g.) we disable certain features on purpose on rtools35 and solaris
+ cat(
+ "To reinstall with more optional capabilities enabled, see\n",
+ " https://arrow.apache.org/docs/r/articles/install.html\n\n"
+ )
+ }
+
+ if (length(x$options)) {
+ print_key_values("Arrow options()", map_chr(x$options, format))
+ }
+
+ format_bytes <- function(b, units = "auto", digits = 2L, ...) {
+ format(structure(b, class = "object_size"), units = units, digits = digits, ...)
+ }
+ print_key_values("Memory", c(
+ Allocator = x$memory_pool$backend_name,
+ # utils:::format.object_size is not properly vectorized
+ Current = format_bytes(x$memory_pool$bytes_allocated, ...),
+ Max = format_bytes(x$memory_pool$max_memory, ...)
+ ))
+ print_key_values("Runtime", c(
+ `SIMD Level` = x$runtime_info$simd_level,
+ `Detected SIMD Level` = x$runtime_info$detected_simd_level
+ ))
+ print_key_values("Build", c(
+ `C++ Library Version` = x$build_info$cpp_version,
+ `C++ Compiler` = x$build_info$cpp_compiler,
+ `C++ Compiler Version` = x$build_info$cpp_compiler_version,
+ `Git ID` = x$build_info$git_id
+ ))
+ } else {
+ cat(
+ "Arrow C++ library not available. See https://arrow.apache.org/docs/r/articles/install.html ",
+ "for troubleshooting.\n"
+ )
+ }
+ invisible(x)
+}
+
+option_compress_metadata <- function() {
+ !is_false(getOption("arrow.compress_metadata"))
+}
+
+#' @include enums.R
+ArrowObject <- R6Class("ArrowObject",
+ public = list(
+ initialize = function(xp) self$set_pointer(xp),
+ pointer = function() get(".:xp:.", envir = self),
+ `.:xp:.` = NULL,
+ set_pointer = function(xp) {
+ if (!inherits(xp, "externalptr")) {
+ stop(
+ class(self)[1], "$new() requires a pointer as input: ",
+ "did you mean $create() instead?",
+ call. = FALSE
+ )
+ }
+ assign(".:xp:.", xp, envir = self)
+ },
+ print = function(...) {
+ if (!is.null(self$.class_title)) {
+ # Allow subclasses to override just printing the class name first
+ class_title <- self$.class_title()
+ } else {
+ class_title <- class(self)[[1]]
+ }
+ cat(class_title, "\n", sep = "")
+ if (!is.null(self$ToString)) {
+ cat(self$ToString(), "\n", sep = "")
+ }
+ invisible(self)
+ },
+ invalidate = function() {
+ assign(".:xp:.", NULL, envir = self)
+ }
+ )
+)
+
+#' @export
+`!=.ArrowObject` <- function(lhs, rhs) !(lhs == rhs) # nolint
+
+#' @export
+`==.ArrowObject` <- function(x, y) { # nolint
+ x$Equals(y)
+}
+
+#' @export
+all.equal.ArrowObject <- function(target, current, ..., check.attributes = TRUE) {
+ target$Equals(current, check_metadata = check.attributes)
+}