diff options
Diffstat (limited to 'src/arrow/r/R/arrow-package.R')
-rw-r--r-- | src/arrow/r/R/arrow-package.R | 351 |
1 files changed, 351 insertions, 0 deletions
diff --git a/src/arrow/r/R/arrow-package.R b/src/arrow/r/R/arrow-package.R new file mode 100644 index 000000000..edc2652b6 --- /dev/null +++ b/src/arrow/r/R/arrow-package.R @@ -0,0 +1,351 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' @importFrom stats quantile median na.omit na.exclude na.pass na.fail +#' @importFrom R6 R6Class +#' @importFrom purrr as_mapper map map2 map_chr map2_chr map_dfr map_int map_lgl keep imap imap_chr +#' @importFrom assertthat assert_that is.string +#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos +#' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec +#' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive +#' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure +#' @importFrom tidyselect vars_pull vars_rename vars_select eval_select +#' @useDynLib arrow, .registration = TRUE +#' @keywords internal +"_PACKAGE" + +#' @importFrom vctrs s3_register vec_size vec_cast vec_unique +.onLoad <- function(...) { + dplyr_methods <- paste0( + "dplyr::", + c( + "select", "filter", "collect", "summarise", "group_by", "groups", + "group_vars", "group_by_drop_default", "ungroup", "mutate", "transmute", + "arrange", "rename", "pull", "relocate", "compute", "collapse", + "distinct", "left_join", "right_join", "inner_join", "full_join", + "semi_join", "anti_join", "count", "tally" + ) + ) + for (cl in c("Dataset", "ArrowTabular", "arrow_dplyr_query")) { + for (m in dplyr_methods) { + s3_register(m, cl) + } + } + s3_register("dplyr::tbl_vars", "arrow_dplyr_query") + + for (cl in c( + "Array", "RecordBatch", "ChunkedArray", "Table", "Schema", + "Field", "DataType", "RecordBatchReader" + )) { + s3_register("reticulate::py_to_r", paste0("pyarrow.lib.", cl)) + s3_register("reticulate::r_to_py", cl) + } + + # Create these once, at package build time + if (arrow_available()) { + # Also include all available Arrow Compute functions, + # namespaced as arrow_fun. + # We can't do this at install time because list_compute_functions() may error + all_arrow_funs <- list_compute_functions() + arrow_funcs <- set_names( + lapply(all_arrow_funs, function(fun) { + force(fun) + function(...) build_expr(fun, ...) + }), + paste0("arrow_", all_arrow_funs) + ) + .cache$functions <- c(nse_funcs, arrow_funcs) + } + + if (tolower(Sys.info()[["sysname"]]) == "windows") { + # Disable multithreading on Windows + # See https://issues.apache.org/jira/browse/ARROW-8379 + options(arrow.use_threads = FALSE) + } + + invisible() +} + +.onAttach <- function(libname, pkgname) { + if (!arrow_available()) { + msg <- paste( + "The Arrow C++ library is not available. To retry installation with debug output, run:", + " install_arrow(verbose = TRUE)", + "See https://arrow.apache.org/docs/r/articles/install.html for more guidance and troubleshooting.", + sep = "\n" + ) + packageStartupMessage(msg) + } else { + # Just to be extra safe, let's wrap this in a try(); + # we don't a failed startup message to prevent the package from loading + try({ + features <- arrow_info()$capabilities + # That has all of the #ifdef features, plus the compression libs and the + # string libraries (but not the memory allocators, they're added elsewhere) + # + # Let's print a message if some are off + if (some_features_are_off(features)) { + packageStartupMessage("See arrow_info() for available features") + } + }) + } +} + +#' Is the C++ Arrow library available? +#' +#' You won't generally need to call these function, but they're made available +#' for diagnostic purposes. +#' @return `TRUE` or `FALSE` depending on whether the package was installed +#' with: +#' * The Arrow C++ library (check with `arrow_available()`) +#' * Arrow Dataset support enabled (check with `arrow_with_dataset()`) +#' * Parquet support enabled (check with `arrow_with_parquet()`) +#' * JSON support enabled (check with `arrow_with_json()`) +#' * Amazon S3 support enabled (check with `arrow_with_s3()`) +#' @export +#' @examples +#' arrow_available() +#' arrow_with_dataset() +#' arrow_with_parquet() +#' arrow_with_json() +#' arrow_with_s3() +#' @seealso If any of these are `FALSE`, see +#' `vignette("install", package = "arrow")` for guidance on reinstalling the +#' package. +arrow_available <- function() { + tryCatch(.Call(`_arrow_available`), error = function(e) { + return(FALSE) + }) +} + +#' @rdname arrow_available +#' @export +arrow_with_dataset <- function() { + is_32bit <- .Machine$sizeof.pointer < 8 + is_old_r <- getRversion() < "4.0.0" + is_windows <- tolower(Sys.info()[["sysname"]]) == "windows" + if (is_32bit && is_old_r && is_windows) { + # 32-bit rtools 3.5 does not properly implement the std::thread expectations + # but we can't just disable ARROW_DATASET in that build, + # so report it as "off" here. + return(FALSE) + } + tryCatch(.Call(`_dataset_available`), error = function(e) { + return(FALSE) + }) +} + +#' @rdname arrow_available +#' @export +arrow_with_parquet <- function() { + tryCatch(.Call(`_parquet_available`), error = function(e) { + return(FALSE) + }) +} + +#' @rdname arrow_available +#' @export +arrow_with_s3 <- function() { + tryCatch(.Call(`_s3_available`), error = function(e) { + return(FALSE) + }) +} + +#' @rdname arrow_available +#' @export +arrow_with_json <- function() { + tryCatch(.Call(`_json_available`), error = function(e) { + return(FALSE) + }) +} + +option_use_threads <- function() { + !is_false(getOption("arrow.use_threads")) +} + +#' Report information on the package's capabilities +#' +#' This function summarizes a number of build-time configurations and run-time +#' settings for the Arrow package. It may be useful for diagnostics. +#' @return A list including version information, boolean "capabilities", and +#' statistics from Arrow's memory allocator, and also Arrow's run-time +#' information. +#' @export +#' @importFrom utils packageVersion +arrow_info <- function() { + opts <- options() + out <- list( + version = packageVersion("arrow"), + libarrow = arrow_available(), + options = opts[grep("^arrow\\.", names(opts))] + ) + if (out$libarrow) { + pool <- default_memory_pool() + runtimeinfo <- runtime_info() + buildinfo <- build_info() + compute_funcs <- list_compute_functions() + out <- c(out, list( + capabilities = c( + dataset = arrow_with_dataset(), + parquet = arrow_with_parquet(), + json = arrow_with_json(), + s3 = arrow_with_s3(), + utf8proc = "utf8_upper" %in% compute_funcs, + re2 = "replace_substring_regex" %in% compute_funcs, + vapply(tolower(names(CompressionType)[-1]), codec_is_available, logical(1)) + ), + memory_pool = list( + backend_name = pool$backend_name, + bytes_allocated = pool$bytes_allocated, + max_memory = pool$max_memory, + available_backends = supported_memory_backends() + ), + runtime_info = list( + simd_level = runtimeinfo[1], + detected_simd_level = runtimeinfo[2] + ), + build_info = list( + cpp_version = buildinfo[1], + cpp_compiler = buildinfo[2], + cpp_compiler_version = buildinfo[3], + cpp_compiler_flags = buildinfo[4], + # git_id is "" if not built from a git checkout + # convert that to NULL + git_id = if (nzchar(buildinfo[5])) buildinfo[5] + ) + )) + } + structure(out, class = "arrow_info") +} + +some_features_are_off <- function(features) { + # `features` is a named logical vector (as in arrow_info()$capabilities) + # Let's exclude some less relevant ones + blocklist <- c("lzo", "bz2", "brotli") + # Return TRUE if any of the other features are FALSE + !all(features[setdiff(names(features), blocklist)]) +} + +#' @export +print.arrow_info <- function(x, ...) { + print_key_values <- function(title, vals, ...) { + # Make a key-value table for printing, no column names + df <- data.frame(vals, stringsAsFactors = FALSE, ...) + names(df) <- "" + + cat(title, ":\n", sep = "") + print(df) + cat("\n") + } + cat("Arrow package version: ", format(x$version), "\n\n", sep = "") + if (x$libarrow) { + print_key_values("Capabilities", c( + x$capabilities, + jemalloc = "jemalloc" %in% x$memory_pool$available_backends, + mimalloc = "mimalloc" %in% x$memory_pool$available_backends + )) + if (some_features_are_off(x$capabilities) && identical(tolower(Sys.info()[["sysname"]]), "linux")) { + # Only on linux because (e.g.) we disable certain features on purpose on rtools35 and solaris + cat( + "To reinstall with more optional capabilities enabled, see\n", + " https://arrow.apache.org/docs/r/articles/install.html\n\n" + ) + } + + if (length(x$options)) { + print_key_values("Arrow options()", map_chr(x$options, format)) + } + + format_bytes <- function(b, units = "auto", digits = 2L, ...) { + format(structure(b, class = "object_size"), units = units, digits = digits, ...) + } + print_key_values("Memory", c( + Allocator = x$memory_pool$backend_name, + # utils:::format.object_size is not properly vectorized + Current = format_bytes(x$memory_pool$bytes_allocated, ...), + Max = format_bytes(x$memory_pool$max_memory, ...) + )) + print_key_values("Runtime", c( + `SIMD Level` = x$runtime_info$simd_level, + `Detected SIMD Level` = x$runtime_info$detected_simd_level + )) + print_key_values("Build", c( + `C++ Library Version` = x$build_info$cpp_version, + `C++ Compiler` = x$build_info$cpp_compiler, + `C++ Compiler Version` = x$build_info$cpp_compiler_version, + `Git ID` = x$build_info$git_id + )) + } else { + cat( + "Arrow C++ library not available. See https://arrow.apache.org/docs/r/articles/install.html ", + "for troubleshooting.\n" + ) + } + invisible(x) +} + +option_compress_metadata <- function() { + !is_false(getOption("arrow.compress_metadata")) +} + +#' @include enums.R +ArrowObject <- R6Class("ArrowObject", + public = list( + initialize = function(xp) self$set_pointer(xp), + pointer = function() get(".:xp:.", envir = self), + `.:xp:.` = NULL, + set_pointer = function(xp) { + if (!inherits(xp, "externalptr")) { + stop( + class(self)[1], "$new() requires a pointer as input: ", + "did you mean $create() instead?", + call. = FALSE + ) + } + assign(".:xp:.", xp, envir = self) + }, + print = function(...) { + if (!is.null(self$.class_title)) { + # Allow subclasses to override just printing the class name first + class_title <- self$.class_title() + } else { + class_title <- class(self)[[1]] + } + cat(class_title, "\n", sep = "") + if (!is.null(self$ToString)) { + cat(self$ToString(), "\n", sep = "") + } + invisible(self) + }, + invalidate = function() { + assign(".:xp:.", NULL, envir = self) + } + ) +) + +#' @export +`!=.ArrowObject` <- function(lhs, rhs) !(lhs == rhs) # nolint + +#' @export +`==.ArrowObject` <- function(x, y) { # nolint + x$Equals(y) +} + +#' @export +all.equal.ArrowObject <- function(target, current, ..., check.attributes = TRUE) { + target$Equals(current, check_metadata = check.attributes) +} |