From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/arrow/r/man/ArrayData.Rd | 27 +++ src/arrow/r/man/ChunkedArray.Rd | 80 ++++++++ src/arrow/r/man/Codec.Rd | 24 +++ src/arrow/r/man/CsvReadOptions.Rd | 107 ++++++++++ src/arrow/r/man/CsvTableReader.Rd | 32 +++ src/arrow/r/man/DataType.Rd | 15 ++ src/arrow/r/man/Dataset.Rd | 81 ++++++++ src/arrow/r/man/DictionaryType.Rd | 15 ++ src/arrow/r/man/Expression.Rd | 18 ++ src/arrow/r/man/FeatherReader.Rd | 33 ++++ src/arrow/r/man/Field.Rd | 37 ++++ src/arrow/r/man/FileFormat.Rd | 68 +++++++ src/arrow/r/man/FileInfo.Rd | 28 +++ src/arrow/r/man/FileSelector.Rd | 27 +++ src/arrow/r/man/FileSystem.Rd | 99 ++++++++++ src/arrow/r/man/FileWriteOptions.Rd | 8 + src/arrow/r/man/FixedWidthType.Rd | 15 ++ src/arrow/r/man/FragmentScanOptions.Rd | 40 ++++ src/arrow/r/man/InputStream.Rd | 45 +++++ src/arrow/r/man/MemoryPool.Rd | 24 +++ src/arrow/r/man/Message.Rd | 15 ++ src/arrow/r/man/MessageReader.Rd | 15 ++ src/arrow/r/man/OutputStream.Rd | 38 ++++ src/arrow/r/man/ParquetArrowReaderProperties.Rd | 29 +++ src/arrow/r/man/ParquetFileReader.Rd | 59 ++++++ src/arrow/r/man/ParquetFileWriter.Rd | 31 +++ src/arrow/r/man/ParquetWriterProperties.Rd | 49 +++++ src/arrow/r/man/Partitioning.Rd | 51 +++++ src/arrow/r/man/RecordBatch.Rd | 92 +++++++++ src/arrow/r/man/RecordBatchReader.Rd | 86 ++++++++ src/arrow/r/man/RecordBatchWriter.Rd | 89 +++++++++ src/arrow/r/man/Scalar.Rd | 38 ++++ src/arrow/r/man/Scanner.Rd | 51 +++++ src/arrow/r/man/Schema.Rd | 86 ++++++++ src/arrow/r/man/Table.Rd | 92 +++++++++ src/arrow/r/man/array.Rd | 107 ++++++++++ src/arrow/r/man/arrow-package.Rd | 45 +++++ src/arrow/r/man/arrow_available.Rd | 47 +++++ src/arrow/r/man/arrow_info.Rd | 17 ++ src/arrow/r/man/buffer.Rd | 44 +++++ src/arrow/r/man/call_function.Rd | 51 +++++ src/arrow/r/man/cast_options.Rd | 22 +++ src/arrow/r/man/codec_is_available.Rd | 25 +++ src/arrow/r/man/compression.Rd | 31 +++ src/arrow/r/man/contains_regex.Rd | 18 ++ src/arrow/r/man/copy_files.Rd | 35 ++++ src/arrow/r/man/cpu_count.Rd | 17 ++ .../r/man/create_package_with_all_dependencies.Rd | 70 +++++++ src/arrow/r/man/data-type.Rd | 163 +++++++++++++++ src/arrow/r/man/dataset_factory.Rd | 76 +++++++ src/arrow/r/man/default_memory_pool.Rd | 15 ++ src/arrow/r/man/dictionary.Rd | 24 +++ src/arrow/r/man/enums.Rd | 88 +++++++++ src/arrow/r/man/flight_connect.Rd | 21 ++ src/arrow/r/man/flight_get.Rd | 19 ++ src/arrow/r/man/flight_put.Rd | 25 +++ src/arrow/r/man/get_stringr_pattern_options.Rd | 22 +++ src/arrow/r/man/hive_partition.Rd | 35 ++++ src/arrow/r/man/install_arrow.Rd | 61 ++++++ src/arrow/r/man/install_pyarrow.Rd | 22 +++ src/arrow/r/man/io_thread_count.Rd | 17 ++ src/arrow/r/man/list_compute_functions.Rd | 45 +++++ src/arrow/r/man/list_flights.Rd | 23 +++ src/arrow/r/man/load_flight_server.Rd | 22 +++ src/arrow/r/man/make_readable_file.Rd | 29 +++ src/arrow/r/man/map_batches.Rd | 30 +++ src/arrow/r/man/match_arrow.Rd | 53 +++++ src/arrow/r/man/mmap_create.Rd | 19 ++ src/arrow/r/man/mmap_open.Rd | 16 ++ src/arrow/r/man/open_dataset.Rd | 146 ++++++++++++++ src/arrow/r/man/read_delim_arrow.Rd | 218 +++++++++++++++++++++ src/arrow/r/man/read_feather.Rd | 50 +++++ src/arrow/r/man/read_ipc_stream.Rd | 42 ++++ src/arrow/r/man/read_json_arrow.Rd | 52 +++++ src/arrow/r/man/read_message.Rd | 14 ++ src/arrow/r/man/read_parquet.Rd | 50 +++++ src/arrow/r/man/read_schema.Rd | 
19 ++ src/arrow/r/man/recycle_scalars.Rd | 18 ++ src/arrow/r/man/reexports.Rd | 29 +++ src/arrow/r/man/repeat_value_as_array.Rd | 20 ++ src/arrow/r/man/s3_bucket.Rd | 28 +++ src/arrow/r/man/to_arrow.Rd | 33 ++++ src/arrow/r/man/to_duckdb.Rd | 56 ++++++ src/arrow/r/man/type.Rd | 27 +++ src/arrow/r/man/unify_schemas.Rd | 27 +++ src/arrow/r/man/value_counts.Rd | 24 +++ src/arrow/r/man/write_csv_arrow.Rd | 32 +++ src/arrow/r/man/write_dataset.Rd | 115 +++++++++++ src/arrow/r/man/write_feather.Rd | 61 ++++++ src/arrow/r/man/write_ipc_stream.Rd | 45 +++++ src/arrow/r/man/write_parquet.Rd | 108 ++++++++++ src/arrow/r/man/write_to_raw.Rd | 28 +++ 92 files changed, 4240 insertions(+) create mode 100644 src/arrow/r/man/ArrayData.Rd create mode 100644 src/arrow/r/man/ChunkedArray.Rd create mode 100644 src/arrow/r/man/Codec.Rd create mode 100644 src/arrow/r/man/CsvReadOptions.Rd create mode 100644 src/arrow/r/man/CsvTableReader.Rd create mode 100644 src/arrow/r/man/DataType.Rd create mode 100644 src/arrow/r/man/Dataset.Rd create mode 100644 src/arrow/r/man/DictionaryType.Rd create mode 100644 src/arrow/r/man/Expression.Rd create mode 100644 src/arrow/r/man/FeatherReader.Rd create mode 100644 src/arrow/r/man/Field.Rd create mode 100644 src/arrow/r/man/FileFormat.Rd create mode 100644 src/arrow/r/man/FileInfo.Rd create mode 100644 src/arrow/r/man/FileSelector.Rd create mode 100644 src/arrow/r/man/FileSystem.Rd create mode 100644 src/arrow/r/man/FileWriteOptions.Rd create mode 100644 src/arrow/r/man/FixedWidthType.Rd create mode 100644 src/arrow/r/man/FragmentScanOptions.Rd create mode 100644 src/arrow/r/man/InputStream.Rd create mode 100644 src/arrow/r/man/MemoryPool.Rd create mode 100644 src/arrow/r/man/Message.Rd create mode 100644 src/arrow/r/man/MessageReader.Rd create mode 100644 src/arrow/r/man/OutputStream.Rd create mode 100644 src/arrow/r/man/ParquetArrowReaderProperties.Rd create mode 100644 src/arrow/r/man/ParquetFileReader.Rd create mode 100644 src/arrow/r/man/ParquetFileWriter.Rd create mode 100644 src/arrow/r/man/ParquetWriterProperties.Rd create mode 100644 src/arrow/r/man/Partitioning.Rd create mode 100644 src/arrow/r/man/RecordBatch.Rd create mode 100644 src/arrow/r/man/RecordBatchReader.Rd create mode 100644 src/arrow/r/man/RecordBatchWriter.Rd create mode 100644 src/arrow/r/man/Scalar.Rd create mode 100644 src/arrow/r/man/Scanner.Rd create mode 100644 src/arrow/r/man/Schema.Rd create mode 100644 src/arrow/r/man/Table.Rd create mode 100644 src/arrow/r/man/array.Rd create mode 100644 src/arrow/r/man/arrow-package.Rd create mode 100644 src/arrow/r/man/arrow_available.Rd create mode 100644 src/arrow/r/man/arrow_info.Rd create mode 100644 src/arrow/r/man/buffer.Rd create mode 100644 src/arrow/r/man/call_function.Rd create mode 100644 src/arrow/r/man/cast_options.Rd create mode 100644 src/arrow/r/man/codec_is_available.Rd create mode 100644 src/arrow/r/man/compression.Rd create mode 100644 src/arrow/r/man/contains_regex.Rd create mode 100644 src/arrow/r/man/copy_files.Rd create mode 100644 src/arrow/r/man/cpu_count.Rd create mode 100644 src/arrow/r/man/create_package_with_all_dependencies.Rd create mode 100644 src/arrow/r/man/data-type.Rd create mode 100644 src/arrow/r/man/dataset_factory.Rd create mode 100644 src/arrow/r/man/default_memory_pool.Rd create mode 100644 src/arrow/r/man/dictionary.Rd create mode 100644 src/arrow/r/man/enums.Rd create mode 100644 src/arrow/r/man/flight_connect.Rd create mode 100644 src/arrow/r/man/flight_get.Rd create mode 100644 src/arrow/r/man/flight_put.Rd 
create mode 100644 src/arrow/r/man/get_stringr_pattern_options.Rd create mode 100644 src/arrow/r/man/hive_partition.Rd create mode 100644 src/arrow/r/man/install_arrow.Rd create mode 100644 src/arrow/r/man/install_pyarrow.Rd create mode 100644 src/arrow/r/man/io_thread_count.Rd create mode 100644 src/arrow/r/man/list_compute_functions.Rd create mode 100644 src/arrow/r/man/list_flights.Rd create mode 100644 src/arrow/r/man/load_flight_server.Rd create mode 100644 src/arrow/r/man/make_readable_file.Rd create mode 100644 src/arrow/r/man/map_batches.Rd create mode 100644 src/arrow/r/man/match_arrow.Rd create mode 100644 src/arrow/r/man/mmap_create.Rd create mode 100644 src/arrow/r/man/mmap_open.Rd create mode 100644 src/arrow/r/man/open_dataset.Rd create mode 100644 src/arrow/r/man/read_delim_arrow.Rd create mode 100644 src/arrow/r/man/read_feather.Rd create mode 100644 src/arrow/r/man/read_ipc_stream.Rd create mode 100644 src/arrow/r/man/read_json_arrow.Rd create mode 100644 src/arrow/r/man/read_message.Rd create mode 100644 src/arrow/r/man/read_parquet.Rd create mode 100644 src/arrow/r/man/read_schema.Rd create mode 100644 src/arrow/r/man/recycle_scalars.Rd create mode 100644 src/arrow/r/man/reexports.Rd create mode 100644 src/arrow/r/man/repeat_value_as_array.Rd create mode 100644 src/arrow/r/man/s3_bucket.Rd create mode 100644 src/arrow/r/man/to_arrow.Rd create mode 100644 src/arrow/r/man/to_duckdb.Rd create mode 100644 src/arrow/r/man/type.Rd create mode 100644 src/arrow/r/man/unify_schemas.Rd create mode 100644 src/arrow/r/man/value_counts.Rd create mode 100644 src/arrow/r/man/write_csv_arrow.Rd create mode 100644 src/arrow/r/man/write_dataset.Rd create mode 100644 src/arrow/r/man/write_feather.Rd create mode 100644 src/arrow/r/man/write_ipc_stream.Rd create mode 100644 src/arrow/r/man/write_parquet.Rd create mode 100644 src/arrow/r/man/write_to_raw.Rd (limited to 'src/arrow/r/man') diff --git a/src/arrow/r/man/ArrayData.Rd b/src/arrow/r/man/ArrayData.Rd new file mode 100644 index 000000000..383ab317d --- /dev/null +++ b/src/arrow/r/man/ArrayData.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/array-data.R +\docType{class} +\name{ArrayData} +\alias{ArrayData} +\title{ArrayData class} +\description{ +The \code{ArrayData} class allows you to get and inspect the data +inside an \code{arrow::Array}. +} +\section{Usage}{ +\preformatted{data <- Array$create(x)$data() + +data$type +data$length +data$null_count +data$offset +data$buffers +} +} + +\section{Methods}{ + + +... +} + diff --git a/src/arrow/r/man/ChunkedArray.Rd b/src/arrow/r/man/ChunkedArray.Rd new file mode 100644 index 000000000..3a504f014 --- /dev/null +++ b/src/arrow/r/man/ChunkedArray.Rd @@ -0,0 +1,80 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chunked-array.R +\docType{class} +\name{ChunkedArray} +\alias{ChunkedArray} +\alias{chunked_array} +\title{ChunkedArray class} +\usage{ +chunked_array(..., type = NULL) +} +\arguments{ +\item{\dots}{Vectors to coerce} + +\item{type}{currently ignored} +} +\description{ +A \code{ChunkedArray} is a data structure managing a list of +primitive Arrow \link[=Array]{Arrays} logically as one large array. Chunked arrays +may be grouped together in a \link{Table}. +} +\section{Factory}{ + +The \code{ChunkedArray$create()} factory method instantiates the object from +various Arrays or R vectors. \code{chunked_array()} is an alias for it. 
+} + +\section{Methods}{ + +\itemize{ +\item \verb{$length()}: Number of elements this array contains +\item \verb{$chunk(i)}: Extract an \code{Array} chunk by integer position +\item \verb{$as_vector()}: convert to an R vector +\item \verb{$Slice(offset, length = NULL)}: Construct a zero-copy slice of the array +with the indicated offset and length. If \code{length} is \code{NULL}, the slice goes +until the end of the array. +\item \verb{$Take(i)}: return a \code{ChunkedArray} with values at positions given by +integers \code{i}. If \code{i} is an Arrow \code{Array} or \code{ChunkedArray}, it will be +coerced to an R vector before taking. +\item \verb{$Filter(i, keep_na = TRUE)}: return a \code{ChunkedArray} with values at positions where +the logical vector or Arrow boolean-type \verb{(Chunked)Array} \code{i} is \code{TRUE}. +\item \verb{$SortIndices(descending = FALSE)}: return an \code{Array} of integer positions that can be +used to rearrange the \code{ChunkedArray} in ascending or descending order +\item \verb{$cast(target_type, safe = TRUE, options = cast_options(safe))}: Alter the +data in the array to change its type. +\item \verb{$null_count}: The number of null entries in the array +\item \verb{$chunks}: return a list of \code{Array}s +\item \verb{$num_chunks}: integer number of chunks in the \code{ChunkedArray} +\item \verb{$type}: logical type of data +\item \verb{$View(type)}: Construct a zero-copy view of this \code{ChunkedArray} with the +given type. +\item \verb{$Validate()}: Perform any validation checks to determine obvious inconsistencies +within the array's internal data. This can be an expensive check, potentially \code{O(length)} +} +} + +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# Pass items into chunked_array as separate objects to create chunks +class_scores <- chunked_array(c(87, 88, 89), c(94, 93, 92), c(71, 72, 73)) +class_scores$num_chunks + +# When taking a Slice from a chunked_array, chunks are preserved +class_scores$Slice(2, length = 5) + +# You can combine Take and SortIndices to return a ChunkedArray with 1 chunk +# containing all values, ordered. +class_scores$Take(class_scores$SortIndices(descending = TRUE)) + +# If you pass a list into chunked_array, you get a list of length 1 +list_scores <- chunked_array(list(c(9.9, 9.6, 9.5), c(8.2, 8.3, 8.4), c(10.0, 9.9, 9.8))) +list_scores$num_chunks + +# When constructing a ChunkedArray, the first chunk is used to infer type. +doubles <- chunked_array(c(1, 2, 3), c(5L, 6L, 7L)) +doubles$type +\dontshow{\}) # examplesIf} +} +\seealso{ +\link{Array} +} diff --git a/src/arrow/r/man/Codec.Rd b/src/arrow/r/man/Codec.Rd new file mode 100644 index 000000000..86723aed5 --- /dev/null +++ b/src/arrow/r/man/Codec.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compression.R +\docType{class} +\name{Codec} +\alias{Codec} +\title{Compression Codec class} +\description{ +Codecs allow you to create \link[=compression]{compressed input and output streams}. +} +\section{Factory}{ + +The \code{Codec$create()} factory method takes the following arguments: +\itemize{ +\item \code{type}: string name of the compression method. Possible values are +"uncompressed", "snappy", "gzip", "brotli", "zstd", "lz4", "lzo", or +"bz2". \code{type} may be upper- or lower-cased. Not all methods may be +available; support depends on build-time flags for the C++ library.
+See \code{\link[=codec_is_available]{codec_is_available()}}. Most builds support at least "snappy" and +"gzip". All support "uncompressed". +\item \code{compression_level}: compression level, the default value (\code{NA}) uses the +default compression level for the selected compression \code{type}. +} +} + diff --git a/src/arrow/r/man/CsvReadOptions.Rd b/src/arrow/r/man/CsvReadOptions.Rd new file mode 100644 index 000000000..d08869270 --- /dev/null +++ b/src/arrow/r/man/CsvReadOptions.Rd @@ -0,0 +1,107 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/csv.R, R/json.R +\docType{class} +\name{CsvReadOptions} +\alias{CsvReadOptions} +\alias{CsvWriteOptions} +\alias{CsvParseOptions} +\alias{TimestampParser} +\alias{CsvConvertOptions} +\alias{JsonReadOptions} +\alias{JsonParseOptions} +\title{File reader options} +\description{ +\code{CsvReadOptions}, \code{CsvParseOptions}, \code{CsvConvertOptions}, +\code{JsonReadOptions}, \code{JsonParseOptions}, and \code{TimestampParser} are containers for various +file reading options. See their usage in \code{\link[=read_csv_arrow]{read_csv_arrow()}} and +\code{\link[=read_json_arrow]{read_json_arrow()}}, respectively. +} +\section{Factory}{ + + +The \code{CsvReadOptions$create()} and \code{JsonReadOptions$create()} factory methods +take the following arguments: +\itemize{ +\item \code{use_threads} Whether to use the global CPU thread pool +\item \code{block_size} Block size we request from the IO layer; also determines +the size of chunks when \code{use_threads} is \code{TRUE}. NB: if \code{use_threads} is \code{FALSE}, JSON input +must end with an empty line. +} + +\code{CsvReadOptions$create()} further accepts these additional arguments: +\itemize{ +\item \code{skip_rows} Number of lines to skip before reading data (default 0) +\item \code{column_names} Character vector to supply column names. If length-0 +(the default), the first non-skipped row will be parsed to generate column +names, unless \code{autogenerate_column_names} is \code{TRUE}. +\item \code{autogenerate_column_names} Logical: generate column names instead of +using the first non-skipped row (the default)? If \code{TRUE}, column names will +be "f0", "f1", ..., "fN". +} + +\code{CsvParseOptions$create()} takes the following arguments: +\itemize{ +\item \code{delimiter} Field delimiting character (default \code{","}) +\item \code{quoting} Logical: are strings quoted? (default \code{TRUE}) +\item \code{quote_char} Quoting character, if \code{quoting} is \code{TRUE} +\item \code{double_quote} Logical: are quotes inside values double-quoted? (default \code{TRUE}) +\item \code{escaping} Logical: whether escaping is used (default \code{FALSE}) +\item \code{escape_char} Escaping character, if \code{escaping} is \code{TRUE} +\item \code{newlines_in_values} Logical: are values allowed to contain CR (\code{0x0d}) +and LF (\code{0x0a}) characters? (default \code{FALSE}) +\item \code{ignore_empty_lines} Logical: should empty lines be ignored (default) or +generate a row of missing values (if \code{FALSE})? +} + +\code{JsonParseOptions$create()} accepts only the \code{newlines_in_values} argument. + +\code{CsvConvertOptions$create()} takes the following arguments: +\itemize{ +\item \code{check_utf8} Logical: check UTF8 validity of string columns? (default \code{TRUE}) +\item \code{null_values} character vector of recognized spellings for null values. +Analogous to the \code{na.strings} argument to +\code{\link[utils:read.table]{read.csv()}} or \code{na} in \code{readr::read_csv()}.
+\item \code{strings_can_be_null} Logical: can string / binary columns have +null values? Similar to the \code{quoted_na} argument to \code{readr::read_csv()}. +(default \code{FALSE}) +\item \code{true_values} character vector of recognized spellings for \code{TRUE} values +\item \code{false_values} character vector of recognized spellings for \code{FALSE} values +\item \code{col_types} A \code{Schema} or \code{NULL} to infer types +\item \code{auto_dict_encode} Logical: Whether to try to automatically +dictionary-encode string / binary data (think \code{stringsAsFactors}). Default \code{FALSE}. +This setting is ignored for non-inferred columns (those in \code{col_types}). +\item \code{auto_dict_max_cardinality} If \code{auto_dict_encode}, string/binary columns +are dictionary-encoded up to this number of unique values (default 50), +after which it switches to regular encoding. +\item \code{include_columns} If non-empty, indicates the names of columns from the +CSV file that should be actually read and converted (in the vector's order). +\item \code{include_missing_columns} Logical: if \code{include_columns} is provided, should +columns named in it but not found in the data be included as a column of +type \code{null()}? The default (\code{FALSE}) means that the reader will instead +raise an error. +\item \code{timestamp_parsers} User-defined timestamp parsers. If more than one +parser is specified, the CSV conversion logic will try parsing values +starting from the beginning of this vector. Possible values are +(a) \code{NULL}, the default, which uses the ISO-8601 parser; +(b) a character vector of \link[base:strptime]{strptime} parse strings; or +(c) a list of \link{TimestampParser} objects. +} + +\code{TimestampParser$create()} takes an optional \code{format} string argument. +See \code{\link[base:strptime]{strptime()}} for example syntax. +The default is to use an ISO-8601 format parser. + +The \code{CsvWriteOptions$create()} factory method takes the following arguments: +\itemize{ +\item \code{include_header} Whether to write an initial header line with column names +\item \code{batch_size} Maximum number of rows processed at a time. Default is 1024. +} +} + +\section{Active bindings}{ + +\itemize{ +\item \code{column_names}: from \code{CsvReadOptions} +} +} + diff --git a/src/arrow/r/man/CsvTableReader.Rd b/src/arrow/r/man/CsvTableReader.Rd new file mode 100644 index 000000000..1afa9d020 --- /dev/null +++ b/src/arrow/r/man/CsvTableReader.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/csv.R, R/json.R +\docType{class} +\name{CsvTableReader} +\alias{CsvTableReader} +\alias{JsonTableReader} +\title{Arrow CSV and JSON table reader classes} +\description{ +\code{CsvTableReader} and \code{JsonTableReader} wrap the Arrow C++ CSV +and JSON table readers. See their usage in \code{\link[=read_csv_arrow]{read_csv_arrow()}} and +\code{\link[=read_json_arrow]{read_json_arrow()}}, respectively. +} +\section{Factory}{ + + +The \code{CsvTableReader$create()} and \code{JsonTableReader$create()} factory methods +take the following arguments: +\itemize{ +\item \code{file} An Arrow \link{InputStream} +\item \code{convert_options} (CSV only), \code{parse_options}, \code{read_options}: see +\link{CsvReadOptions} +\item \code{...} additional parameters. +} +} + +\section{Methods}{ + +\itemize{ +\item \verb{$Read()}: returns an Arrow Table. 
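As an editorial aside, a minimal sketch of how the factory and $Read() fit together, assuming an arrow build with CSV support; the file name, delimiter, and use of write.table are arbitrary illustrations:

    library(arrow)

    # Write a small semicolon-delimited file to read back
    tf <- tempfile(fileext = ".csv")
    write.table(mtcars, tf, sep = ";", row.names = FALSE)

    # Open the file as an Arrow InputStream and configure the parser
    stream <- ReadableFile$create(tf)
    reader <- CsvTableReader$create(
      stream,
      parse_options = CsvParseOptions$create(delimiter = ";")
    )

    # $Read() materializes the whole input as an Arrow Table
    tab <- reader$Read()
    tab$num_rows

    stream$close()
    unlink(tf)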
+} +} + diff --git a/src/arrow/r/man/DataType.Rd b/src/arrow/r/man/DataType.Rd new file mode 100644 index 000000000..8c96141be --- /dev/null +++ b/src/arrow/r/man/DataType.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/type.R +\docType{class} +\name{DataType} +\alias{DataType} +\title{class arrow::DataType} +\description{ +class arrow::DataType +} +\section{Methods}{ + + +TODO +} + diff --git a/src/arrow/r/man/Dataset.Rd b/src/arrow/r/man/Dataset.Rd new file mode 100644 index 000000000..c19a0df6c --- /dev/null +++ b/src/arrow/r/man/Dataset.Rd @@ -0,0 +1,81 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset.R, R/dataset-factory.R +\name{Dataset} +\alias{Dataset} +\alias{FileSystemDataset} +\alias{UnionDataset} +\alias{InMemoryDataset} +\alias{DatasetFactory} +\alias{FileSystemDatasetFactory} +\title{Multi-file datasets} +\description{ +Arrow Datasets allow you to query against data that has been split across +multiple files. This sharding of data may indicate partitioning, which +can accelerate queries that only touch some partitions (files). + +A \code{Dataset} contains one or more \code{Fragments}, such as files, of potentially +differing type and partitioning. + +For \code{Dataset$create()}, see \code{\link[=open_dataset]{open_dataset()}}, which is an alias for it. + +\code{DatasetFactory} is used to provide finer control over the creation of \code{Dataset}s. +} +\section{Factory}{ + +\code{DatasetFactory} is used to create a \code{Dataset}, inspect the \link{Schema} of the +fragments contained in it, and declare a partitioning. +\code{FileSystemDatasetFactory} is a subclass of \code{DatasetFactory} for +discovering files in the local file system, the only currently supported +file system. + +For the \code{DatasetFactory$create()} factory method, see \code{\link[=dataset_factory]{dataset_factory()}}, an +alias for it. A \code{DatasetFactory} has: +\itemize{ +\item \verb{$Inspect(unify_schemas)}: If \code{unify_schemas} is \code{TRUE}, all fragments +will be scanned and a unified \link{Schema} will be created from them; if \code{FALSE} +(default), only the first fragment will be inspected for its schema. Use this +fast path when you know and trust that all fragments have an identical schema. +\item \verb{$Finish(schema, unify_schemas)}: Returns a \code{Dataset}. If \code{schema} is provided, +it will be used for the \code{Dataset}; if omitted, a \code{Schema} will be created from +inspecting the fragments (files) in the dataset, following \code{unify_schemas} +as described above. +} + +\code{FileSystemDatasetFactory$create()} is a lower-level factory method and +takes the following arguments: +\itemize{ +\item \code{filesystem}: A \link{FileSystem} +\item \code{selector}: Either a \link{FileSelector} or \code{NULL} +\item \code{paths}: Either a character vector of file paths or \code{NULL} +\item \code{format}: A \link{FileFormat} +\item \code{partitioning}: Either \code{Partitioning}, \code{PartitioningFactory}, or \code{NULL} +} +} + +\section{Methods}{ + + +A \code{Dataset} has the following methods: +\itemize{ +\item \verb{$NewScan()}: Returns a \link{ScannerBuilder} for building a query +\item \verb{$schema}: Active binding that returns the \link{Schema} of the Dataset; you +may also replace the dataset's schema by using \code{ds$schema <- new_schema}. +This method currently supports only adding, removing, or reordering +fields in the schema: you cannot alter or cast the field types. 
+} + +\code{FileSystemDataset} has the following methods: +\itemize{ +\item \verb{$files}: Active binding, returns the files of the \code{FileSystemDataset} +\item \verb{$format}: Active binding, returns the \link{FileFormat} of the \code{FileSystemDataset} +} + +\code{UnionDataset} has the following methods: +\itemize{ +\item \verb{$children}: Active binding, returns all child \code{Dataset}s. +} +} + +\seealso{ +\code{\link[=open_dataset]{open_dataset()}} for a simple interface to creating a \code{Dataset} +} diff --git a/src/arrow/r/man/DictionaryType.Rd b/src/arrow/r/man/DictionaryType.Rd new file mode 100644 index 000000000..8c9087f1a --- /dev/null +++ b/src/arrow/r/man/DictionaryType.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dictionary.R +\docType{class} +\name{DictionaryType} +\alias{DictionaryType} +\title{class DictionaryType} +\description{ +class DictionaryType +} +\section{Methods}{ + + +TODO +} + diff --git a/src/arrow/r/man/Expression.Rd b/src/arrow/r/man/Expression.Rd new file mode 100644 index 000000000..58a6a44c0 --- /dev/null +++ b/src/arrow/r/man/Expression.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/expression.R +\name{Expression} +\alias{Expression} +\title{Arrow expressions} +\description{ +\code{Expression}s are used to define filter logic for passing to a \link{Dataset} +\link{Scanner}. + +\code{Expression$scalar(x)} constructs an \code{Expression} which always evaluates to +the provided scalar (length-1) R value. + +\code{Expression$field_ref(name)} is used to construct an \code{Expression} which +evaluates to the named column in the \code{Dataset} against which it is evaluated. + +\code{Expression$create(function_name, ..., options)} builds a function-call +\code{Expression} containing one or more \code{Expression}s. +} diff --git a/src/arrow/r/man/FeatherReader.Rd b/src/arrow/r/man/FeatherReader.Rd new file mode 100644 index 000000000..64a307fcf --- /dev/null +++ b/src/arrow/r/man/FeatherReader.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feather.R +\docType{class} +\name{FeatherReader} +\alias{FeatherReader} +\title{FeatherReader class} +\description{ +This class enables you to interact with Feather files. Create +one to connect to a file or other InputStream, and call \code{Read()} on it to +make an \code{arrow::Table}. See its usage in \code{\link[=read_feather]{read_feather()}}. +} +\section{Factory}{ + + +The \code{FeatherReader$create()} factory method instantiates the object and +takes the following argument: +\itemize{ +\item \code{file} an Arrow file connection object inheriting from \code{RandomAccessFile}. 
+} +} + +\section{Methods}{ + +\itemize{ +\item \verb{$Read(columns)}: Returns a \code{Table} of the selected columns; \code{columns} is a vector of +integer indices +\item \verb{$column_names}: Active binding, returns the column names in the Feather file +\item \verb{$schema}: Active binding, returns the schema of the Feather file +\item \verb{$version}: Active binding, returns \code{1} or \code{2}, according to the Feather +file version +} +} + diff --git a/src/arrow/r/man/Field.Rd b/src/arrow/r/man/Field.Rd new file mode 100644 index 000000000..3b709e879 --- /dev/null +++ b/src/arrow/r/man/Field.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/field.R +\docType{class} +\name{Field} +\alias{Field} +\alias{field} +\title{Field class} +\usage{ +field(name, type, metadata, nullable = TRUE) +} +\arguments{ +\item{name}{field name} + +\item{type}{logical type, instance of \link{DataType}} + +\item{metadata}{currently ignored} + +\item{nullable}{\code{TRUE} if the field is nullable} +} +\description{ +\code{field()} lets you create an \code{arrow::Field} that maps a +\link[=data-type]{DataType} to a column name. Fields are contained in +\link[=Schema]{Schemas}. +} +\section{Methods}{ + +\itemize{ +\item \code{f$ToString()}: convert to a string +\item \code{f$Equals(other)}: test for equality. More naturally called as \code{f == other} +} +} + +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +field("x", int32()) +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/FileFormat.Rd b/src/arrow/r/man/FileFormat.Rd new file mode 100644 index 000000000..cabacc937 --- /dev/null +++ b/src/arrow/r/man/FileFormat.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset-format.R +\name{FileFormat} +\alias{FileFormat} +\alias{ParquetFileFormat} +\alias{IpcFileFormat} +\alias{CsvFileFormat} +\title{Dataset file formats} +\description{ +A \code{FileFormat} holds information about how to read and parse the files +included in a \code{Dataset}. There are subclasses corresponding to the supported +file formats (\code{ParquetFileFormat} and \code{IpcFileFormat}). +} +\section{Factory}{ +\code{FileFormat$create()} takes the following arguments: +\itemize{ +\item \code{format}: A string identifier of the file format. Currently supported values: +\itemize{ +\item "parquet" +\item "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that +only version 2 files are supported +\item "csv"/"text", aliases for the same thing (because comma is the default +delimiter for text files) +\item "tsv", equivalent to passing \verb{format = "text", delimiter = "\\t"} +} +\item \code{...}: Additional format-specific options + +\code{format = "parquet"}: +\itemize{ +\item \code{dict_columns}: Names of columns which should be read as dictionaries. +\item Any Parquet options from \link{FragmentScanOptions}. +} + +\code{format = "text"}: see \link{CsvParseOptions}. Note that you can specify them either +with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the +\code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.). +Not all \code{readr} options are currently supported; please file an issue if +you encounter one that \code{arrow} should support. Also, the following options are +supported.
From \link{CsvReadOptions}: +\itemize{ +\item \code{skip_rows} +\item \code{column_names} +\item \code{autogenerate_column_names} +} + +From \link{CsvFragmentScanOptions} (these values can be overridden at scan time): +\itemize{ +\item \code{convert_options}: a \link{CsvConvertOptions} +\item \code{block_size} +} +} + +It returns the appropriate subclass of \code{FileFormat} (e.g. \code{ParquetFileFormat}). +} + +\examples{ +\dontshow{if (arrow_with_dataset() && tolower(Sys.info()[["sysname"]]) != "windows") (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +## Semi-colon delimited files +# Set up directory for examples +tf <- tempfile() +dir.create(tf) +on.exit(unlink(tf)) +write.table(mtcars, file.path(tf, "file1.txt"), sep = ";", row.names = FALSE) + +# Create FileFormat object +format <- FileFormat$create(format = "text", delimiter = ";") + +open_dataset(tf, format = format) +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/FileInfo.Rd b/src/arrow/r/man/FileInfo.Rd new file mode 100644 index 000000000..ef6182e4e --- /dev/null +++ b/src/arrow/r/man/FileInfo.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filesystem.R +\name{FileInfo} +\alias{FileInfo} +\title{FileSystem entry info} +\description{ +FileSystem entry info +} +\section{Methods}{ + +\itemize{ +\item \code{base_name()} : The file base name (component after the last directory +separator). +\item \code{extension()} : The file extension +} +} + +\section{Active bindings}{ + +\itemize{ +\item \verb{$type}: The file type +\item \verb{$path}: The full file path in the filesystem +\item \verb{$size}: The size in bytes, if available. Only regular files are +guaranteed to have a size. +\item \verb{$mtime}: The time of last modification, if available. +} +} + diff --git a/src/arrow/r/man/FileSelector.Rd b/src/arrow/r/man/FileSelector.Rd new file mode 100644 index 000000000..a3c6deefc --- /dev/null +++ b/src/arrow/r/man/FileSelector.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filesystem.R +\name{FileSelector} +\alias{FileSelector} +\title{file selector} +\description{ +file selector +} +\section{Factory}{ + + +The \verb{$create()} factory method instantiates a \code{FileSelector} given the 3 fields +described below. +} + +\section{Fields}{ + +\itemize{ +\item \code{base_dir}: The directory in which to select files. If the path exists but +doesn't point to a directory, this should be an error. +\item \code{allow_not_found}: The behavior if \code{base_dir} doesn't exist in the +filesystem. If \code{FALSE}, an error is returned. If \code{TRUE}, an empty +selection is returned +\item \code{recursive}: Whether to recurse into subdirectories. +} +} + diff --git a/src/arrow/r/man/FileSystem.Rd b/src/arrow/r/man/FileSystem.Rd new file mode 100644 index 000000000..2f3dcff67 --- /dev/null +++ b/src/arrow/r/man/FileSystem.Rd @@ -0,0 +1,99 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filesystem.R +\docType{class} +\name{FileSystem} +\alias{FileSystem} +\alias{LocalFileSystem} +\alias{S3FileSystem} +\alias{SubTreeFileSystem} +\title{FileSystem classes} +\description{ +\code{FileSystem} is an abstract file system API. +\code{LocalFileSystem} is an implementation accessing files +on the local machine.
\code{SubTreeFileSystem} is an implementation that delegates +to another implementation after prepending a fixed base path. +} +\section{Factory}{ + + +\code{LocalFileSystem$create()} returns the object and takes no arguments. + +\code{SubTreeFileSystem$create()} takes the following arguments: +\itemize{ +\item \code{base_path}, a string path +\item \code{base_fs}, a \code{FileSystem} object +} + +\code{S3FileSystem$create()} optionally takes arguments: +\itemize{ +\item \code{anonymous}: logical, default \code{FALSE}. If \code{TRUE}, will not attempt to look up +credentials using standard AWS configuration methods. +\item \code{access_key}, \code{secret_key}: authentication credentials. If one is provided, +the other must be as well. If both are provided, they will override any +AWS configuration set at the environment level. +\item \code{session_token}: optional string for authentication along with +\code{access_key} and \code{secret_key} +\item \code{role_arn}: string AWS ARN of an AccessRole. If provided instead of \code{access_key} and +\code{secret_key}, temporary credentials will be fetched by assuming this role. +\item \code{session_name}: optional string identifier for the assumed role session. +\item \code{external_id}: optional unique string identifier that might be required +when you assume a role in another account. +\item \code{load_frequency}: integer, frequency (in seconds) with which temporary +credentials from an assumed role session will be refreshed. Default is +900 (i.e. 15 minutes) +\item \code{region}: AWS region to connect to. If omitted, the AWS library will +provide a sensible default based on client configuration, falling back +to "us-east-1" if no other alternatives are found. +\item \code{endpoint_override}: If non-empty, override region with a connect string +such as "localhost:9000". This is useful for connecting to file systems +that emulate S3. +\item \code{scheme}: S3 connection transport (default "https") +\item \code{background_writes}: logical, whether \code{OutputStream} writes will be issued +in the background, without blocking (default \code{TRUE}) +} +} + +\section{Methods}{ + +\itemize{ +\item \verb{$GetFileInfo(x)}: \code{x} may be a \link{FileSelector} or a character +vector of paths. Returns a list of \link{FileInfo} +\item \verb{$CreateDir(path, recursive = TRUE)}: Create a directory and subdirectories. +\item \verb{$DeleteDir(path)}: Delete a directory and its contents, recursively. +\item \verb{$DeleteDirContents(path)}: Delete a directory's contents, recursively. +Like \verb{$DeleteDir()}, +but doesn't delete the directory itself. Passing an empty path (\code{""}) will +wipe the entire filesystem tree. +\item \verb{$DeleteFile(path)} : Delete a file. +\item \verb{$DeleteFiles(paths)} : Delete many files. The default implementation +issues individual delete operations in sequence. +\item \verb{$Move(src, dest)}: Move / rename a file or directory. If the destination +exists: if it is a non-empty directory, an error is returned; +otherwise, if it has the same type as the source, it is replaced; +otherwise, behavior is unspecified (implementation-dependent). +\item \verb{$CopyFile(src, dest)}: Copy a file. If the destination exists and is a +directory, an error is returned. Otherwise, it is replaced. +\item \verb{$OpenInputStream(path)}: Open an \link[=InputStream]{input stream} for +sequential reading. +\item \verb{$OpenInputFile(path)}: Open an \link[=RandomAccessFile]{input file} for random +access reading.
+\item \verb{$OpenOutputStream(path)}: Open an \link[=OutputStream]{output stream} for +sequential writing. +\item \verb{$OpenAppendStream(path)}: Open an \link[=OutputStream]{output stream} for +appending. +} +} + +\section{Active bindings}{ + +\itemize{ +\item \verb{$type_name}: string filesystem type name, such as "local", "s3", etc. +\item \verb{$region}: string AWS region, for \code{S3FileSystem} and \code{SubTreeFileSystem} +containing an \code{S3FileSystem} +\item \verb{$base_fs}: for \code{SubTreeFileSystem}, the \code{FileSystem} it contains +\item \verb{$base_path}: for \code{SubTreeFileSystem}, the path in \verb{$base_fs} which is considered +root in this \code{SubTreeFileSystem}. +} +} + diff --git a/src/arrow/r/man/FileWriteOptions.Rd b/src/arrow/r/man/FileWriteOptions.Rd new file mode 100644 index 000000000..661393c8e --- /dev/null +++ b/src/arrow/r/man/FileWriteOptions.Rd @@ -0,0 +1,8 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset-format.R +\name{FileWriteOptions} +\alias{FileWriteOptions} +\title{Format-specific write options} +\description{ +A \code{FileWriteOptions} holds write options specific to a \code{FileFormat}. +} diff --git a/src/arrow/r/man/FixedWidthType.Rd b/src/arrow/r/man/FixedWidthType.Rd new file mode 100644 index 000000000..28578268d --- /dev/null +++ b/src/arrow/r/man/FixedWidthType.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/type.R +\docType{class} +\name{FixedWidthType} +\alias{FixedWidthType} +\title{class arrow::FixedWidthType} +\description{ +class arrow::FixedWidthType +} +\section{Methods}{ + + +TODO +} + diff --git a/src/arrow/r/man/FragmentScanOptions.Rd b/src/arrow/r/man/FragmentScanOptions.Rd new file mode 100644 index 000000000..103d05895 --- /dev/null +++ b/src/arrow/r/man/FragmentScanOptions.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset-format.R +\name{FragmentScanOptions} +\alias{FragmentScanOptions} +\alias{CsvFragmentScanOptions} +\alias{ParquetFragmentScanOptions} +\title{Format-specific scan options} +\description{ +A \code{FragmentScanOptions} holds options specific to a \code{FileFormat} and a scan +operation. +} +\section{Factory}{ + +\code{FragmentScanOptions$create()} takes the following arguments: +\itemize{ +\item \code{format}: A string identifier of the file format. Currently supported values: +\itemize{ +\item "parquet" +\item "csv"/"text", aliases for the same format. +} +\item \code{...}: Additional format-specific options + +\code{format = "parquet"}: +\itemize{ +\item \code{use_buffered_stream}: Read files through buffered input streams rather than +loading entire row groups at once. This may be enabled +to reduce memory overhead. Disabled by default. +\item \code{buffer_size}: Size of buffered stream, if enabled. Default is 8KB. +\item \code{pre_buffer}: Pre-buffer the raw Parquet data. This can improve performance +on high-latency filesystems. Disabled by default. +} + +\code{format = "text"}: see \link{CsvConvertOptions}. Note that options can only be +specified with the Arrow C++ library naming. Also, "block_size" from +\link{CsvReadOptions} may be given. +} + +It returns the appropriate subclass of \code{FragmentScanOptions} +(e.g. \code{CsvFragmentScanOptions}).
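A brief illustrative sketch of constructing both subclasses; the option values are arbitrary and assume Parquet and CSV support in the build:

    library(arrow)

    # Parquet options: buffered reads with a 64 KiB buffer
    parquet_opts <- FragmentScanOptions$create(
      format = "parquet",
      use_buffered_stream = TRUE,
      buffer_size = 64 * 1024
    )
    class(parquet_opts)  # ParquetFragmentScanOptions

    # CSV options, using the Arrow C++ naming as noted above
    csv_opts <- FragmentScanOptions$create(
      format = "csv",
      null_values = c("", "NA", "-"),
      strings_can_be_null = TRUE
    )
    class(csv_opts)  # CsvFragmentScanOptions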
+} + diff --git a/src/arrow/r/man/InputStream.Rd b/src/arrow/r/man/InputStream.Rd new file mode 100644 index 000000000..b909a77a1 --- /dev/null +++ b/src/arrow/r/man/InputStream.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\docType{class} +\name{InputStream} +\alias{InputStream} +\alias{RandomAccessFile} +\alias{MemoryMappedFile} +\alias{ReadableFile} +\alias{BufferReader} +\title{InputStream classes} +\description{ +\code{RandomAccessFile} inherits from \code{InputStream} and is a base +class for: \code{ReadableFile} for reading from a file; \code{MemoryMappedFile} for +the same but with memory mapping; and \code{BufferReader} for reading from a +buffer. Use these with the various table readers. +} +\section{Factory}{ + + +The \verb{$create()} factory methods instantiate the \code{InputStream} object and +take the following arguments, depending on the subclass: +\itemize{ +\item \code{path} For \code{ReadableFile}, a character file name +\item \code{x} For \code{BufferReader}, a \link{Buffer} or an object that can be +made into a buffer via \code{buffer()}. +} + +To instantiate a \code{MemoryMappedFile}, call \code{\link[=mmap_open]{mmap_open()}}. +} + +\section{Methods}{ + +\itemize{ +\item \verb{$GetSize()}: return the total size of the file or buffer, in bytes +\item \verb{$supports_zero_copy()}: Logical +\item \verb{$seek(position)}: go to that position in the stream +\item \verb{$tell()}: return the position in the stream +\item \verb{$close()}: close the stream +\item \verb{$Read(nbytes)}: read data from the stream, either a specified \code{nbytes} or +all, if \code{nbytes} is not provided +\item \verb{$ReadAt(position, nbytes)}: similar to \verb{$seek(position)$Read(nbytes)} +\item \verb{$Resize(size)}: for a \code{MemoryMappedFile} that is writeable +} +} + diff --git a/src/arrow/r/man/MemoryPool.Rd b/src/arrow/r/man/MemoryPool.Rd new file mode 100644 index 000000000..75f1882d2 --- /dev/null +++ b/src/arrow/r/man/MemoryPool.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/memory-pool.R +\docType{class} +\name{MemoryPool} +\alias{MemoryPool} +\title{class arrow::MemoryPool} +\description{ +class arrow::MemoryPool +} +\section{Methods}{ + +\itemize{ +\item \code{backend_name}: one of "jemalloc", "mimalloc", or "system". Alternative +memory allocators are optionally enabled at build time. Windows builds +generally have \code{mimalloc}, and most others have both \code{jemalloc} (used by +default) and \code{mimalloc}. To change memory allocators at runtime, set the +environment variable \code{ARROW_DEFAULT_MEMORY_POOL} to one of those strings +prior to loading the \code{arrow} library.
+\item \code{bytes_allocated}: the number of bytes currently allocated from this pool +\item \code{max_memory}: the peak number of bytes allocated from this pool +} +} + +\keyword{internal} diff --git a/src/arrow/r/man/Message.Rd b/src/arrow/r/man/Message.Rd new file mode 100644 index 000000000..84dd90a64 --- /dev/null +++ b/src/arrow/r/man/Message.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/message.R +\docType{class} +\name{Message} +\alias{Message} +\title{class arrow::Message} +\description{ +class arrow::Message +} +\section{Methods}{ + + +TODO +} + diff --git a/src/arrow/r/man/MessageReader.Rd b/src/arrow/r/man/MessageReader.Rd new file mode 100644 index 000000000..d198c185e --- /dev/null +++ b/src/arrow/r/man/MessageReader.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/message.R +\docType{class} +\name{MessageReader} +\alias{MessageReader} +\title{class arrow::MessageReader} +\description{ +class arrow::MessageReader +} +\section{Methods}{ + + +TODO +} + diff --git a/src/arrow/r/man/OutputStream.Rd b/src/arrow/r/man/OutputStream.Rd new file mode 100644 index 000000000..f7c71b192 --- /dev/null +++ b/src/arrow/r/man/OutputStream.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\docType{class} +\name{OutputStream} +\alias{OutputStream} +\alias{FileOutputStream} +\alias{BufferOutputStream} +\title{OutputStream classes} +\description{ +\code{FileOutputStream} is for writing to a file; +\code{BufferOutputStream} writes to a buffer. +You can create one and pass it to any of the table writers, for example. +} +\section{Factory}{ + + +The \verb{$create()} factory methods instantiate the \code{OutputStream} object and +take the following arguments, depending on the subclass: +\itemize{ +\item \code{path} For \code{FileOutputStream}, a character file name +\item \code{initial_capacity} For \code{BufferOutputStream}, the size in bytes of the +buffer. +} +} + +\section{Methods}{ + +\itemize{ +\item \verb{$tell()}: return the position in the stream +\item \verb{$close()}: close the stream +\item \verb{$write(x)}: send \code{x} to the stream +\item \verb{$capacity()}: for \code{BufferOutputStream} +\item \verb{$finish()}: for \code{BufferOutputStream} +\item \verb{$GetExtentBytesWritten()}: for \code{MockOutputStream}, report how many bytes +were sent. +} +} + diff --git a/src/arrow/r/man/ParquetArrowReaderProperties.Rd b/src/arrow/r/man/ParquetArrowReaderProperties.Rd new file mode 100644 index 000000000..33a50f712 --- /dev/null +++ b/src/arrow/r/man/ParquetArrowReaderProperties.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parquet.R +\docType{class} +\name{ParquetArrowReaderProperties} +\alias{ParquetArrowReaderProperties} +\title{ParquetArrowReaderProperties class} +\description{ +This class holds settings to control how a Parquet file is read +by \link{ParquetFileReader}.
+} +\section{Factory}{ + + +The \code{ParquetArrowReaderProperties$create()} factory method instantiates the object +and takes the following arguments: +\itemize{ +\item \code{use_threads} Logical: whether to use multithreading (default \code{TRUE}) +} +} + +\section{Methods}{ + +\itemize{ +\item \verb{$read_dictionary(column_index)} +\item \verb{$set_read_dictionary(column_index, read_dict)} +\item \verb{$use_threads(use_threads)} +} +} + diff --git a/src/arrow/r/man/ParquetFileReader.Rd b/src/arrow/r/man/ParquetFileReader.Rd new file mode 100644 index 000000000..30d0725a4 --- /dev/null +++ b/src/arrow/r/man/ParquetFileReader.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parquet.R +\docType{class} +\name{ParquetFileReader} +\alias{ParquetFileReader} +\title{ParquetFileReader class} +\description{ +This class enables you to interact with Parquet files. +} +\section{Factory}{ + + +The \code{ParquetFileReader$create()} factory method instantiates the object and +takes the following arguments: +\itemize{ +\item \code{file} A character file name, raw vector, or Arrow file connection object +(e.g. \code{RandomAccessFile}). +\item \code{props} Optional \link{ParquetArrowReaderProperties} +\item \code{mmap} Logical: whether to memory-map the file (default \code{TRUE}) +\item \code{...} Additional arguments, currently ignored +} +} + +\section{Methods}{ + +\itemize{ +\item \verb{$ReadTable(column_indices)}: get an \code{arrow::Table} from the file. The optional +\verb{column_indices=} argument is a 0-based integer vector indicating which columns to retain. +\item \verb{$ReadRowGroup(i, column_indices)}: get an \code{arrow::Table} by reading the \code{i}th row group (0-based). +The optional \verb{column_indices=} argument is a 0-based integer vector indicating which columns to retain. +\item \verb{$ReadRowGroups(row_groups, column_indices)}: get an \code{arrow::Table} by reading several row +groups (0-based integers). +The optional \verb{column_indices=} argument is a 0-based integer vector indicating which columns to retain. +\item \verb{$GetSchema()}: get the \code{arrow::Schema} of the data in the file +\item \verb{$ReadColumn(i)}: read the \code{i}th column (0-based) as a \link{ChunkedArray}. +} +} + +\section{Active bindings}{ + +\itemize{ +\item \verb{$num_rows}: number of rows. +\item \verb{$num_columns}: number of columns. +\item \verb{$num_row_groups}: number of row groups. +} +} + +\examples{ +\dontshow{if (arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +f <- system.file("v0.7.1.parquet", package = "arrow") +pq <- ParquetFileReader$create(f) +pq$GetSchema() +if (codec_is_available("snappy")) { + # This file has compressed data columns + tab <- pq$ReadTable() + tab$schema +} +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/ParquetFileWriter.Rd b/src/arrow/r/man/ParquetFileWriter.Rd new file mode 100644 index 000000000..f36e85ab6 --- /dev/null +++ b/src/arrow/r/man/ParquetFileWriter.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parquet.R +\docType{class} +\name{ParquetFileWriter} +\alias{ParquetFileWriter} +\title{ParquetFileWriter class} +\description{ +This class enables you to write Parquet files.
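An illustrative sketch of the write-and-close cycle, based on the factory arguments and methods documented below; the chunk_size argument to WriteTable is an assumption about its signature:

    library(arrow)

    tab <- Table$create(x = 1:5, y = letters[1:5])

    tf <- tempfile(fileext = ".parquet")
    sink <- FileOutputStream$create(tf)

    writer <- ParquetFileWriter$create(
      schema = tab$schema,
      sink = sink,
      properties = ParquetWriterProperties$create(tab)
    )
    writer$WriteTable(tab, chunk_size = tab$num_rows)
    writer$Close()
    sink$close()  # Close() does not close the sink; do that separately

    read_parquet(tf)  # round-trip check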
+} +\section{Factory}{ + + +The \code{ParquetFileWriter$create()} factory method instantiates the object and +takes the following arguments: +\itemize{ +\item \code{schema} A \link{Schema} +\item \code{sink} An \link[=OutputStream]{arrow::io::OutputStream} +\item \code{properties} An instance of \link{ParquetWriterProperties} +\item \code{arrow_properties} An instance of \code{ParquetArrowWriterProperties} +} +} + +\section{Methods}{ + +\itemize{ +\item \code{WriteTable} Write a \link{Table} to \code{sink} +\item \code{Close} Close the writer. Note: does not close the \code{sink}. +\link[=OutputStream]{arrow::io::OutputStream} has its own \code{close()} method. +} +} + diff --git a/src/arrow/r/man/ParquetWriterProperties.Rd b/src/arrow/r/man/ParquetWriterProperties.Rd new file mode 100644 index 000000000..7beb8a82a --- /dev/null +++ b/src/arrow/r/man/ParquetWriterProperties.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parquet.R +\docType{class} +\name{ParquetWriterProperties} +\alias{ParquetWriterProperties} +\title{ParquetWriterProperties class} +\description{ +This class holds settings to control how a Parquet file is written +by \link{ParquetFileWriter}. +} +\details{ +The parameters \code{compression}, \code{compression_level}, \code{use_dictionary} +and \code{write_statistics} support various patterns: +\itemize{ +\item The default \code{NULL} leaves the parameter unspecified, and the C++ library +uses an appropriate default for each column (defaults listed in the Factory section below) +\item A single, unnamed value (e.g. a single string for \code{compression}) applies to all columns +\item An unnamed vector, of the same size as the number of columns, to specify a +value for each column, in positional order +\item A named vector, to specify the value for the named columns; the default +value for the setting is used when not supplied +} + +Unlike the high-level \link{write_parquet}, \code{ParquetWriterProperties} arguments +use the C++ defaults. Currently this means "uncompressed" rather than +"snappy" for the \code{compression} argument. +} +\section{Factory}{ + + +The \code{ParquetWriterProperties$create()} factory method instantiates the object +and takes the following arguments: +\itemize{ +\item \code{table}: table to write (required) +\item \code{version}: Parquet version, "1.0" or "2.0". Default "1.0" +\item \code{compression}: Compression type. Default \code{"uncompressed"} +\item \code{compression_level}: Compression level; meaning depends on compression algorithm +\item \code{use_dictionary}: Specify if we should use dictionary encoding. Default \code{TRUE} +\item \code{write_statistics}: Specify if we should write statistics. Default \code{TRUE} +\item \code{data_page_size}: Set a target threshold for the approximate encoded +size of data pages within a column chunk (in bytes). Default 1 MiB. +} +} + +\seealso{ +\link{write_parquet} + +\link{Schema} for information about schemas and metadata handling.
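A short sketch of the per-column patterns described in the details above; codec availability (e.g. "zstd") depends on the build:

    library(arrow)

    tab <- Table$create(id = 1:10, payload = rnorm(10))

    # One unnamed value applies to every column
    all_zstd <- ParquetWriterProperties$create(tab, compression = "zstd")

    # A named vector targets specific columns; columns not named keep
    # the C++ defaults ("uncompressed", dictionary encoding enabled)
    mixed <- ParquetWriterProperties$create(
      tab,
      compression = c(payload = "zstd"),
      use_dictionary = c(id = FALSE)
    )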
+} diff --git a/src/arrow/r/man/Partitioning.Rd b/src/arrow/r/man/Partitioning.Rd new file mode 100644 index 000000000..cfe374155 --- /dev/null +++ b/src/arrow/r/man/Partitioning.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset-partition.R +\name{Partitioning} +\alias{Partitioning} +\alias{DirectoryPartitioning} +\alias{HivePartitioning} +\alias{DirectoryPartitioningFactory} +\alias{HivePartitioningFactory} +\title{Define Partitioning for a Dataset} +\description{ +Pass a \code{Partitioning} object to a \link{FileSystemDatasetFactory}'s \verb{$create()} +method to indicate how the file's paths should be interpreted to define +partitioning. + +\code{DirectoryPartitioning} describes how to interpret raw path segments, in +order. For example, \code{schema(year = int16(), month = int8())} would define +partitions for file paths like "2019/01/file.parquet", +"2019/02/file.parquet", etc. In this scheme \code{NULL} values will be skipped. In +the previous example: when writing a dataset if the month was \code{NA} (or +\code{NULL}), the files would be placed in "2019/file.parquet". When reading, the +rows in "2019/file.parquet" would return an \code{NA} for the month column. An +error will be raised if an outer directory is \code{NULL} and an inner directory +is not. + +\code{HivePartitioning} is for Hive-style partitioning, which embeds field +names and values in path segments, such as +"/year=2019/month=2/data.parquet". Because fields are named in the path +segments, order does not matter. This partitioning scheme allows \code{NULL} +values. They will be replaced by a configurable \code{null_fallback} which +defaults to the string \code{"__HIVE_DEFAULT_PARTITION__"} when writing. When +reading, the \code{null_fallback} string will be replaced with \code{NA}s as +appropriate. + +\code{PartitioningFactory} subclasses instruct the \code{DatasetFactory} to detect +partition features from the file paths. +} +\section{Factory}{ + +Both \code{DirectoryPartitioning$create()} and \code{HivePartitioning$create()} +methods take a \link{Schema} as a single input argument. The helper +function \code{\link[=hive_partition]{hive_partition(...)}} is shorthand for +\code{HivePartitioning$create(schema(...))}. + +With \code{DirectoryPartitioningFactory$create()}, you can provide just the +names of the path segments (in our example, \code{c("year", "month")}), and +the \code{DatasetFactory} will infer the data types for those partition variables. +\code{HivePartitioningFactory$create()} takes no arguments: both variable names +and their types can be inferred from the file paths. \code{hive_partition()} with +no arguments returns a \code{HivePartitioningFactory}. +} + diff --git a/src/arrow/r/man/RecordBatch.Rd b/src/arrow/r/man/RecordBatch.Rd new file mode 100644 index 000000000..ff08c2158 --- /dev/null +++ b/src/arrow/r/man/RecordBatch.Rd @@ -0,0 +1,92 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/record-batch.R +\docType{class} +\name{RecordBatch} +\alias{RecordBatch} +\alias{record_batch} +\title{RecordBatch class} +\usage{ +record_batch(..., schema = NULL) +} +\arguments{ +\item{...}{A \code{data.frame} or a named set of Arrays or vectors. If given a +mixture of data.frames and vectors, the inputs will be autospliced together +(see examples). 
Alternatively, you can provide a single Arrow IPC +\code{InputStream}, \code{Message}, \code{Buffer}, or R \code{raw} object containing a \code{Buffer}.} + +\item{schema}{a \link{Schema}, or \code{NULL} (the default) to infer the schema from +the data in \code{...}. When providing an Arrow IPC buffer, \code{schema} is required.} +} +\description{ +A record batch is a collection of equal-length arrays matching +a particular \link{Schema}. It is a table-like data structure that is semantically +a sequence of \link[=Field]{fields}, each a contiguous Arrow \link{Array}. +} +\section{S3 Methods and Usage}{ + +Record batches are data-frame-like, and many methods you expect to work on +a \code{data.frame} are implemented for \code{RecordBatch}. This includes \code{[}, \code{[[}, +\code{$}, \code{names}, \code{dim}, \code{nrow}, \code{ncol}, \code{head}, and \code{tail}. You can also pull +the data from an Arrow record batch into R with \code{as.data.frame()}. See the +examples. + +A caveat about the \code{$} method: because \code{RecordBatch} is an \code{R6} object, +\code{$} is also used to access the object's methods (see below). Methods take +precedence over the table's columns. So, \code{batch$Slice} would return the +"Slice" method function even if there were a column in the table called +"Slice". +} + +\section{R6 Methods}{ + +In addition to the more R-friendly S3 methods, a \code{RecordBatch} object has +the following R6 methods that map onto the underlying C++ methods: +\itemize{ +\item \verb{$Equals(other)}: Returns \code{TRUE} if the \code{other} record batch is equal +\item \verb{$column(i)}: Extract an \code{Array} by integer position from the batch +\item \verb{$column_name(i)}: Get a column's name by integer position +\item \verb{$names()}: Get all column names (called by \code{names(batch)}) +\item \verb{$RenameColumns(value)}: Set all column names (called by \code{names(batch) <- value}) +\item \verb{$GetColumnByName(name)}: Extract an \code{Array} by string name +\item \verb{$RemoveColumn(i)}: Drops a column from the batch by integer position +\item \verb{$SelectColumns(indices)}: Return a new record batch with a selection of columns, expressed as 0-based integers. +\item \verb{$Slice(offset, length = NULL)}: Create a zero-copy view starting at the +indicated integer offset and going for the given length, or to the end +of the table if \code{NULL}, the default. +\item \verb{$Take(i)}: return a \code{RecordBatch} with rows at positions given by +integers (R vector or Arrow Array) \code{i}. +\item \verb{$Filter(i, keep_na = TRUE)}: return a \code{RecordBatch} with rows at positions where logical +vector (or Arrow boolean Array) \code{i} is \code{TRUE}. +\item \verb{$SortIndices(names, descending = FALSE)}: return an \code{Array} of integer row +positions that can be used to rearrange the \code{RecordBatch} in ascending or +descending order by the first named column, breaking ties with further named +columns. \code{descending} can be a logical vector of length one or of the same +length as \code{names}. +\item \verb{$serialize()}: Returns a raw vector suitable for interprocess communication +\item \verb{$cast(target_schema, safe = TRUE, options = cast_options(safe))}: Alter +the schema of the record batch. +} + +There are also some active bindings: +\itemize{ +\item \verb{$num_columns} +\item \verb{$num_rows} +\item \verb{$schema} +\item \verb{$metadata}: Returns the key-value metadata of the \code{Schema} as a named list.
+Modify or replace by assigning in (\code{batch$metadata <- new_metadata}). +All list elements are coerced to string. See \code{schema()} for more information. +\item \verb{$columns}: Returns a list of \code{Array}s +} +} + +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +batch <- record_batch(name = rownames(mtcars), mtcars) +dim(batch) +dim(head(batch)) +names(batch) +batch$mpg +batch[["cyl"]] +as.data.frame(batch[4:8, c("gear", "hp", "wt")]) +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/RecordBatchReader.Rd b/src/arrow/r/man/RecordBatchReader.Rd new file mode 100644 index 000000000..90c796a66 --- /dev/null +++ b/src/arrow/r/man/RecordBatchReader.Rd @@ -0,0 +1,86 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/record-batch-reader.R +\docType{class} +\name{RecordBatchReader} +\alias{RecordBatchReader} +\alias{RecordBatchStreamReader} +\alias{RecordBatchFileReader} +\title{RecordBatchReader classes} +\description{ +Apache Arrow defines two formats for \href{https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc}{serializing data for interprocess communication (IPC)}: +a "stream" format and a "file" format, known as Feather. +\code{RecordBatchStreamReader} and \code{RecordBatchFileReader} are +interfaces for accessing record batches from input sources in those formats, +respectively. + +For guidance on how to use these classes, see the examples section. +} +\section{Factory}{ + + +The \code{RecordBatchFileReader$create()} and \code{RecordBatchStreamReader$create()} +factory methods instantiate the object and +take a single argument, named according to the class: +\itemize{ +\item \code{file} A character file name, raw vector, or Arrow file connection object +(e.g. \link{RandomAccessFile}). +\item \code{stream} A raw vector, \link{Buffer}, or \link{InputStream}. +} +} + +\section{Methods}{ + +\itemize{ +\item \verb{$read_next_batch()}: Returns a \code{RecordBatch}, iterating through the +Reader. If there are no further batches in the Reader, it returns \code{NULL}. +\item \verb{$schema}: Returns a \link{Schema} (active binding) +\item \verb{$batches()}: Returns a list of \code{RecordBatch}es +\item \verb{$read_table()}: Collects the reader's \code{RecordBatch}es into a \link{Table} +\item \verb{$get_batch(i)}: For \code{RecordBatchFileReader}, return a particular batch +by an integer index. +\item \verb{$num_record_batches()}: For \code{RecordBatchFileReader}, see how many batches +are in the file. +} +} + +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +tf <- tempfile() +on.exit(unlink(tf)) + +batch <- record_batch(chickwts) + +# This opens a connection to the file in Arrow +file_obj <- FileOutputStream$create(tf) +# Pass that to a RecordBatchWriter to write data conforming to a schema +writer <- RecordBatchFileWriter$create(file_obj, batch$schema) +writer$write(batch) +# You may write additional batches to the stream, provided that they have +# the same schema. +# Call "close" on the writer to indicate end-of-file/stream +writer$close() +# Then, close the connection--closing the IPC message does not close the file +file_obj$close() + +# Now, we have a file we can read from. 
Same pattern: open file connection, +# then pass it to a RecordBatchReader +read_file_obj <- ReadableFile$create(tf) +reader <- RecordBatchFileReader$create(read_file_obj) +# RecordBatchFileReader knows how many batches it has (StreamReader does not) +reader$num_record_batches +# We could consume the Reader by calling $read_next_batch() until all are, +# consumed, or we can call $read_table() to pull them all into a Table +tab <- reader$read_table() +# Call as.data.frame to turn that Table into an R data.frame +df <- as.data.frame(tab) +# This should be the same data we sent +all.equal(df, chickwts, check.attributes = FALSE) +# Unlike the Writers, we don't have to close RecordBatchReaders, +# but we do still need to close the file connection +read_file_obj$close() +\dontshow{\}) # examplesIf} +} +\seealso{ +\code{\link[=read_ipc_stream]{read_ipc_stream()}} and \code{\link[=read_feather]{read_feather()}} provide a much simpler interface +for reading data from these formats and are sufficient for many use cases. +} diff --git a/src/arrow/r/man/RecordBatchWriter.Rd b/src/arrow/r/man/RecordBatchWriter.Rd new file mode 100644 index 000000000..219c150e6 --- /dev/null +++ b/src/arrow/r/man/RecordBatchWriter.Rd @@ -0,0 +1,89 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/record-batch-writer.R +\docType{class} +\name{RecordBatchWriter} +\alias{RecordBatchWriter} +\alias{RecordBatchStreamWriter} +\alias{RecordBatchFileWriter} +\title{RecordBatchWriter classes} +\description{ +Apache Arrow defines two formats for \href{https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc}{serializing data for interprocess communication (IPC)}: +a "stream" format and a "file" format, known as Feather. +\code{RecordBatchStreamWriter} and \code{RecordBatchFileWriter} are +interfaces for writing record batches to those formats, respectively. + +For guidance on how to use these classes, see the examples section. +} +\section{Factory}{ + + +The \code{RecordBatchFileWriter$create()} and \code{RecordBatchStreamWriter$create()} +factory methods instantiate the object and take the following arguments: +\itemize{ +\item \code{sink} An \code{OutputStream} +\item \code{schema} A \link{Schema} for the data to be written +\item \code{use_legacy_format} logical: write data formatted so that Arrow libraries +versions 0.14 and lower can read it. Default is \code{FALSE}. You can also +enable this by setting the environment variable \code{ARROW_PRE_0_15_IPC_FORMAT=1}. +\item \code{metadata_version}: A string like "V5" or the equivalent integer indicating +the Arrow IPC MetadataVersion. Default (NULL) will use the latest version, +unless the environment variable \code{ARROW_PRE_1_0_METADATA_VERSION=1}, in +which case it will be V4. +} +} + +\section{Methods}{ + +\itemize{ +\item \verb{$write(x)}: Write a \link{RecordBatch}, \link{Table}, or \code{data.frame}, dispatching +to the methods below appropriately +\item \verb{$write_batch(batch)}: Write a \code{RecordBatch} to stream +\item \verb{$write_table(table)}: Write a \code{Table} to stream +\item \verb{$close()}: close stream. Note that this indicates end-of-file or +end-of-stream--it does not close the connection to the \code{sink}. That needs +to be closed separately. 
+} +} + +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +tf <- tempfile() +on.exit(unlink(tf)) + +batch <- record_batch(chickwts) + +# This opens a connection to the file in Arrow +file_obj <- FileOutputStream$create(tf) +# Pass that to a RecordBatchWriter to write data conforming to a schema +writer <- RecordBatchFileWriter$create(file_obj, batch$schema) +writer$write(batch) +# You may write additional batches to the stream, provided that they have +# the same schema. +# Call "close" on the writer to indicate end-of-file/stream +writer$close() +# Then, close the connection--closing the IPC message does not close the file +file_obj$close() + +# Now, we have a file we can read from. Same pattern: open file connection, +# then pass it to a RecordBatchReader +read_file_obj <- ReadableFile$create(tf) +reader <- RecordBatchFileReader$create(read_file_obj) +# RecordBatchFileReader knows how many batches it has (StreamReader does not) +reader$num_record_batches +# We could consume the Reader by calling $read_next_batch() until all are, +# consumed, or we can call $read_table() to pull them all into a Table +tab <- reader$read_table() +# Call as.data.frame to turn that Table into an R data.frame +df <- as.data.frame(tab) +# This should be the same data we sent +all.equal(df, chickwts, check.attributes = FALSE) +# Unlike the Writers, we don't have to close RecordBatchReaders, +# but we do still need to close the file connection +read_file_obj$close() +\dontshow{\}) # examplesIf} +} +\seealso{ +\code{\link[=write_ipc_stream]{write_ipc_stream()}} and \code{\link[=write_feather]{write_feather()}} provide a much simpler +interface for writing data to these formats and are sufficient for many use +cases. \code{\link[=write_to_raw]{write_to_raw()}} is a version that serializes data to a buffer. +} diff --git a/src/arrow/r/man/Scalar.Rd b/src/arrow/r/man/Scalar.Rd new file mode 100644 index 000000000..21e04c12e --- /dev/null +++ b/src/arrow/r/man/Scalar.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/scalar.R +\docType{class} +\name{Scalar} +\alias{Scalar} +\title{Arrow scalars} +\description{ +A \code{Scalar} holds a single value of an Arrow type. 
+} +\section{Methods}{ + +\itemize{ +\item \verb{$ToString()}: convert to a string +\item \verb{$as_vector()}: convert to an R vector +\item \verb{$as_array()}: convert to an Arrow \code{Array} +\item \verb{$Equals(other)}: is this Scalar equal to \code{other} +\item \verb{$ApproxEquals(other)}: is this Scalar approximately equal to \code{other} +\item \verb{$is_valid}: is this Scalar valid +\item \verb{$null_count}: number of invalid values (1 or 0) +\item \verb{$type}: Scalar type +} +} + +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +Scalar$create(pi) +Scalar$create(404) +# If you pass a vector into Scalar$create, you get a list containing your items +Scalar$create(c(1, 2, 3)) + +# Comparisons +my_scalar <- Scalar$create(99) +my_scalar$ApproxEquals(Scalar$create(99.00001)) # FALSE +my_scalar$ApproxEquals(Scalar$create(99.000009)) # TRUE +my_scalar$Equals(Scalar$create(99.000009)) # FALSE +my_scalar$Equals(Scalar$create(99L)) # FALSE (types don't match) + +my_scalar$ToString() +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/Scanner.Rd b/src/arrow/r/man/Scanner.Rd new file mode 100644 index 000000000..db6488f50 --- /dev/null +++ b/src/arrow/r/man/Scanner.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset-scan.R +\name{Scanner} +\alias{Scanner} +\alias{ScannerBuilder} +\title{Scan the contents of a dataset} +\description{ +A \code{Scanner} iterates over a \link{Dataset}'s fragments and returns data +according to given row filtering and column projection. A \code{ScannerBuilder} +can help create one. +} +\section{Factory}{ + +\code{Scanner$create()} wraps the \code{ScannerBuilder} interface to make a \code{Scanner}. +It takes the following arguments: +\itemize{ +\item \code{dataset}: A \code{Dataset} or \code{arrow_dplyr_query} object, as returned by the +\code{dplyr} methods on \code{Dataset}. +\item \code{projection}: A character vector of column names to select columns, or a +named list of expressions +\item \code{filter}: An \code{Expression} to filter the scanned rows by, or \code{TRUE} (default) +to keep all rows. +\item \code{use_threads}: logical: should scanning use multithreading? Default \code{TRUE} +\item \code{use_async}: logical: should the async scanner (performs better on +high-latency/highly parallel filesystems like S3) be used? Default \code{FALSE} +\item \code{...}: Additional arguments, currently ignored +} +} + +\section{Methods}{ + +\code{ScannerBuilder} has the following methods: +\itemize{ +\item \verb{$Project(cols)}: Indicate that the scan should only return columns given +by \code{cols}, a character vector of column names +\item \verb{$Filter(expr)}: Filter rows by an \link{Expression}. +\item \verb{$UseThreads(threads)}: logical: should the scan use multithreading? +The method's default input is \code{TRUE}, but you must call the method to enable +multithreading because the scanner default is \code{FALSE}. +\item \verb{$UseAsync(use_async)}: logical: should the async scanner be used? +\item \verb{$BatchSize(batch_size)}: integer: Maximum row count of scanned record +batches, default is 32K. If scanned record batches are overflowing memory, +this method can be called to reduce their size. +\item \verb{$schema}: Active binding, returns the \link{Schema} of the Dataset +\item \verb{$Finish()}: Returns a \code{Scanner} +} + +\code{Scanner} currently has a single method, \verb{$ToTable()}, which evaluates the +query and returns an Arrow \link{Table}.
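A minimal sketch of that workflow, assuming a \code{Dataset} \code{ds} opened elsewhere (for example with \code{open_dataset()}) and hypothetical column names:

\preformatted{# Build a Scanner via the factory; filter = TRUE (the default) keeps all rows
scanner <- Scanner$create(ds, projection = c("year", "month"))
# Evaluate the scan and collect the results into an Arrow Table
tab <- scanner$ToTable()
}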
+} + diff --git a/src/arrow/r/man/Schema.Rd b/src/arrow/r/man/Schema.Rd new file mode 100644 index 000000000..7322c70f2 --- /dev/null +++ b/src/arrow/r/man/Schema.Rd @@ -0,0 +1,86 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/schema.R +\docType{class} +\name{Schema} +\alias{Schema} +\alias{schema} +\title{Schema class} +\usage{ +schema(...) +} +\arguments{ +\item{...}{named list containing \link[=data-type]{data types} or +a list of \link[=field]{fields} containing the fields for the schema} +} +\description{ +A \code{Schema} is a list of \link{Field}s, which map names to +Arrow \link[=data-type]{data types}. Create a \code{Schema} when you +want to convert an R \code{data.frame} to Arrow but don't want to rely on the +default mapping of R types to Arrow types, such as when you want to choose a +specific numeric precision, or when creating a \link{Dataset} and you want to +ensure a specific schema rather than inferring it from the various files. + +Many Arrow objects, including \link{Table} and \link{Dataset}, have a \verb{$schema} method +(active binding) that lets you access their schema. +} +\section{Methods}{ + +\itemize{ +\item \verb{$ToString()}: convert to a string +\item \verb{$field(i)}: returns the field at index \code{i} (0-based) +\item \verb{$GetFieldByName(x)}: returns the field with name \code{x} +\item \verb{$WithMetadata(metadata)}: returns a new \code{Schema} with the key-value +\code{metadata} set. Note that all list elements in \code{metadata} will be coerced +to \code{character}. +} +} + +\section{Active bindings}{ + +\itemize{ +\item \verb{$names}: returns the field names (called in \code{names(Schema)}) +\item \verb{$num_fields}: returns the number of fields (called in \code{length(Schema)}) +\item \verb{$fields}: returns the list of \code{Field}s in the \code{Schema}, suitable for +iterating over +\item \verb{$HasMetadata}: logical: does this \code{Schema} have extra metadata? +\item \verb{$metadata}: returns the key-value metadata as a named list. +Modify or replace by assigning in (\code{sch$metadata <- new_metadata}). +All list elements are coerced to string. +} +} + +\section{R Metadata}{ + + +When converting a data.frame to an Arrow Table or RecordBatch, attributes +from the \code{data.frame} are saved alongside tables so that the object can be +reconstructed faithfully in R (e.g. with \code{as.data.frame()}). This metadata +can be both at the top-level of the \code{data.frame} (e.g. \code{attributes(df)}) or +at the column (e.g. \code{attributes(df$col_a)}) or for list columns only: +element level (e.g. \code{attributes(df[1, "col_a"])}). For example, this allows +for storing \code{haven} columns in a table and being able to faithfully +re-create them when pulled back into R. This metadata is separate from the +schema (column names and types) which is compatible with other Arrow +clients. The R metadata is only read by R and is ignored by other clients +(e.g. Pandas has its own custom metadata). This metadata is stored in +\verb{$metadata$r}. + +Since Schema metadata keys and values must be strings, this metadata is +saved by serializing R's attribute list structure to a string. If the +serialized metadata exceeds 100Kb in size, by default it is compressed +starting in version 3.0.0. To disable this compression (e.g. for tables +that are compatible with Arrow versions before 3.0.0 and include large +amounts of metadata), set the option \code{arrow.compress_metadata} to \code{FALSE}. 
+Files with compressed metadata are readable by older versions of arrow, but +the metadata is dropped. +} + +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5)) +tab1 <- arrow_table(df) +tab1$schema +tab2 <- arrow_table(df, schema = schema(col1 = int8(), col2 = float32())) +tab2$schema +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/Table.Rd b/src/arrow/r/man/Table.Rd new file mode 100644 index 000000000..d5654bf93 --- /dev/null +++ b/src/arrow/r/man/Table.Rd @@ -0,0 +1,92 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/table.R +\docType{class} +\name{Table} +\alias{Table} +\alias{arrow_table} +\title{Table class} +\usage{ +arrow_table(..., schema = NULL) +} +\arguments{ +\item{...}{A \code{data.frame} or a named set of Arrays or vectors. If given a +mixture of data.frames and named vectors, the inputs will be autospliced together +(see examples). Alternatively, you can provide a single Arrow IPC +\code{InputStream}, \code{Message}, \code{Buffer}, or R \code{raw} object containing a \code{Buffer}.} + +\item{schema}{a \link{Schema}, or \code{NULL} (the default) to infer the schema from +the data in \code{...}. When providing an Arrow IPC buffer, \code{schema} is required.} +} +\description{ +A Table is a sequence of \link[=ChunkedArray]{chunked arrays}. Tables +have a similar interface to \link[=RecordBatch]{record batches}, but they can be +composed from multiple record batches or chunked arrays. +} +\section{S3 Methods and Usage}{ + +Tables are data-frame-like, and many methods you expect to work on +a \code{data.frame} are implemented for \code{Table}. This includes \code{[}, \code{[[}, +\code{$}, \code{names}, \code{dim}, \code{nrow}, \code{ncol}, \code{head}, and \code{tail}. You can also pull +the data from an Arrow table into R with \code{as.data.frame()}. See the +examples. + +A caveat about the \code{$} method: because \code{Table} is an \code{R6} object, +\code{$} is also used to access the object's methods (see below). Methods take +precedence over the table's columns. So, \code{tab$Slice} would return the +"Slice" method function even if there were a column in the table called +"Slice". +} + +\section{R6 Methods}{ + +In addition to the more R-friendly S3 methods, a \code{Table} object has +the following R6 methods that map onto the underlying C++ methods: +\itemize{ +\item \verb{$column(i)}: Extract a \code{ChunkedArray} by integer position from the table +\item \verb{$ColumnNames()}: Get all column names (called by \code{names(tab)}) +\item \verb{$RenameColumns(value)}: Set all column names (called by \code{names(tab) <- value}) +\item \verb{$GetColumnByName(name)}: Extract a \code{ChunkedArray} by string name +\item \verb{$field(i)}: Extract a \code{Field} from the table schema by integer position +\item \verb{$SelectColumns(indices)}: Return new \code{Table} with specified columns, expressed as 0-based integers. +\item \verb{$Slice(offset, length = NULL)}: Create a zero-copy view starting at the +indicated integer offset and going for the given length, or to the end +of the table if \code{NULL}, the default. +\item \verb{$Take(i)}: return a \code{Table} with rows at positions given by +integers \code{i}. If \code{i} is an Arrow \code{Array} or \code{ChunkedArray}, it will be +coerced to an R vector before taking. +\item \verb{$Filter(i, keep_na = TRUE)}: return a \code{Table} with rows at positions where the logical +vector or Arrow boolean-type \verb{(Chunked)Array} \code{i} is \code{TRUE}. +\item \verb{$SortIndices(names, descending = FALSE)}: return an \code{Array} of integer row +positions that can be used to rearrange the \code{Table} in ascending or descending +order by the first named column, breaking ties with further named columns. +\code{descending} can be a logical vector of length one or of the same length as +\code{names}. +\item \verb{$serialize(output_stream, ...)}: Write the table to the given +\link{OutputStream} +\item \verb{$cast(target_schema, safe = TRUE, options = cast_options(safe))}: Alter +the schema of the table. +} + +There are also some active bindings: +\itemize{ +\item \verb{$num_columns} +\item \verb{$num_rows} +\item \verb{$schema} +\item \verb{$metadata}: Returns the key-value metadata of the \code{Schema} as a named list. +Modify or replace by assigning in (\code{tab$metadata <- new_metadata}). +All list elements are coerced to string. See \code{schema()} for more information. +\item \verb{$columns}: Returns a list of \code{ChunkedArray}s +} +} + +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +tbl <- arrow_table(name = rownames(mtcars), mtcars) +dim(tbl) +dim(head(tbl)) +names(tbl) +tbl$mpg +tbl[["cyl"]] +as.data.frame(tbl[4:8, c("gear", "hp", "wt")]) +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/array.Rd b/src/arrow/r/man/array.Rd new file mode 100644 index 000000000..78d3eaff6 --- /dev/null +++ b/src/arrow/r/man/array.Rd @@ -0,0 +1,107 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/array.R, R/scalar.R +\docType{class} +\name{array} +\alias{array} +\alias{Array} +\alias{DictionaryArray} +\alias{StructArray} +\alias{ListArray} +\alias{LargeListArray} +\alias{FixedSizeListArray} +\alias{StructScalar} +\title{Arrow Arrays} +\description{ +An \code{Array} is an immutable data array with some logical type +and some length. Most logical types are contained in the base +\code{Array} class; there are also subclasses for \code{DictionaryArray}, \code{ListArray}, +and \code{StructArray}. +} +\section{Factory}{ + +The \code{Array$create()} factory method instantiates an \code{Array} and +takes the following arguments: +\itemize{ +\item \code{x}: an R vector, list, or \code{data.frame} +\item \code{type}: an optional \link[=data-type]{data type} for \code{x}. If omitted, the type +will be inferred from the data. +} + +\code{Array$create()} will return the appropriate subclass of \code{Array}, such as +\code{DictionaryArray} when given an R factor. + +To compose a \code{DictionaryArray} directly, call \code{DictionaryArray$create()}, +which takes two arguments: +\itemize{ +\item \code{x}: an R vector or \code{Array} of integers for the dictionary indices +\item \code{dict}: an R vector or \code{Array} of dictionary values (like R factor levels +but not limited to strings only) +} +} + +\section{Usage}{ +\preformatted{a <- Array$create(x) +length(a) + +print(a) +a == a +} +} + +\section{Methods}{ + +\itemize{ +\item \verb{$IsNull(i)}: Return true if value at index is null. Does not boundscheck +\item \verb{$IsValid(i)}: Return true if value at index is valid.
Does not boundscheck +\item \verb{$length()}: Number of elements this array contains +\item \verb{$offset}: A relative position into another array's data, to enable zero-copy slicing +\item \verb{$null_count}: The number of null entries in the array +\item \verb{$type}: logical type of data +\item \verb{$type_id()}: type id +\item \verb{$Equals(other)} : is this array equal to \code{other} +\item \verb{$ApproxEquals(other)} : is this array approximately equal to \code{other} +\item \verb{$Diff(other)} : return a string expressing the difference between two arrays +\item \verb{$data()}: return the underlying \link{ArrayData} +\item \verb{$as_vector()}: convert to an R vector +\item \verb{$ToString()}: string representation of the array +\item \verb{$Slice(offset, length = NULL)}: Construct a zero-copy slice of the array +with the indicated offset and length. If length is \code{NULL}, the slice goes +until the end of the array. +\item \verb{$Take(i)}: return an \code{Array} with values at positions given by integers +(R vector or Arrow Array) \code{i}. +\item \verb{$Filter(i, keep_na = TRUE)}: return an \code{Array} with values at positions where the logical +vector (or Arrow boolean Array) \code{i} is \code{TRUE}. +\item \verb{$SortIndices(descending = FALSE)}: return an \code{Array} of integer positions that can be +used to rearrange the \code{Array} in ascending or descending order +\item \verb{$RangeEquals(other, start_idx, end_idx, other_start_idx)} : does the given range of this array equal the corresponding slice of \code{other} +\item \verb{$cast(target_type, safe = TRUE, options = cast_options(safe))}: Alter the +data in the array to change its type. +\item \verb{$View(type)}: Construct a zero-copy view of this array with the given type. +\item \verb{$Validate()} : Perform any validation checks to determine obvious inconsistencies +within the array's internal data. This can be an expensive check, potentially \code{O(length)} +} +} + +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +my_array <- Array$create(1:10) +my_array$type +my_array$cast(int8()) + +# Check if value is null; zero-indexed +na_array <- Array$create(c(1:5, NA)) +na_array$IsNull(0) +na_array$IsNull(5) +na_array$IsValid(5) +na_array$null_count + +# zero-copy slicing; the offset of the new Array will be the same as the index passed to $Slice
new_array <- na_array$Slice(5) +new_array$offset + +# Compare 2 arrays +na_array2 <- na_array +na_array2 == na_array # element-wise comparison +na_array2$Equals(na_array) # overall comparison +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/arrow-package.Rd b/src/arrow/r/man/arrow-package.Rd new file mode 100644 index 000000000..021762162 --- /dev/null +++ b/src/arrow/r/man/arrow-package.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/arrow-package.R +\docType{package} +\name{arrow-package} +\alias{arrow} +\alias{arrow-package} +\title{arrow: Integration to 'Apache' 'Arrow'} +\description{ +'Apache' 'Arrow' is a cross-language + development platform for in-memory data. It specifies a standardized + language-independent columnar memory format for flat and hierarchical data, + organized for efficient analytic operations on modern hardware. This + package provides an interface to the 'Arrow C++' library.
+} +\seealso{ +Useful links: +\itemize{ + \item \url{https://github.com/apache/arrow/} + \item \url{https://arrow.apache.org/docs/r/} + \item Report bugs at \url{https://issues.apache.org/jira/projects/ARROW/issues} +} + +} +\author{ +\strong{Maintainer}: Neal Richardson \email{neal@ursalabs.org} + +Authors: +\itemize{ + \item Ian Cook \email{ianmcook@gmail.com} + \item Nic Crane \email{thisisnic@gmail.com} + \item Jonathan Keane \email{jkeane@gmail.com} + \item Romain François \email{romain@rstudio.com} (\href{https://orcid.org/0000-0002-2444-4226}{ORCID}) + \item Jeroen Ooms \email{jeroen@berkeley.edu} + \item Apache Arrow \email{dev@arrow.apache.org} [copyright holder] +} + +Other contributors: +\itemize{ + \item Javier Luraschi \email{javier@rstudio.com} [contributor] + \item Karl Dunkle Werner \email{karldw@users.noreply.github.com} (\href{https://orcid.org/0000-0003-0523-7309}{ORCID}) [contributor] + \item Jeffrey Wong \email{jeffreyw@netflix.com} [contributor] +} + +} +\keyword{internal} diff --git a/src/arrow/r/man/arrow_available.Rd b/src/arrow/r/man/arrow_available.Rd new file mode 100644 index 000000000..3061d10dc --- /dev/null +++ b/src/arrow/r/man/arrow_available.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/arrow-package.R +\name{arrow_available} +\alias{arrow_available} +\alias{arrow_with_dataset} +\alias{arrow_with_parquet} +\alias{arrow_with_s3} +\alias{arrow_with_json} +\title{Is the C++ Arrow library available?} +\usage{ +arrow_available() + +arrow_with_dataset() + +arrow_with_parquet() + +arrow_with_s3() + +arrow_with_json() +} +\value{ +\code{TRUE} or \code{FALSE} depending on whether the package was installed +with: +\itemize{ +\item The Arrow C++ library (check with \code{arrow_available()}) +\item Arrow Dataset support enabled (check with \code{arrow_with_dataset()}) +\item Parquet support enabled (check with \code{arrow_with_parquet()}) +\item JSON support enabled (check with \code{arrow_with_json()}) +\item Amazon S3 support enabled (check with \code{arrow_with_s3()}) +} +} +\description{ +You won't generally need to call these functions, but they're made available +for diagnostic purposes. +} +\examples{ +arrow_available() +arrow_with_dataset() +arrow_with_parquet() +arrow_with_json() +arrow_with_s3() +} +\seealso{ +If any of these are \code{FALSE}, see +\code{vignette("install", package = "arrow")} for guidance on reinstalling the +package. +} diff --git a/src/arrow/r/man/arrow_info.Rd b/src/arrow/r/man/arrow_info.Rd new file mode 100644 index 000000000..95444a8bb --- /dev/null +++ b/src/arrow/r/man/arrow_info.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/arrow-package.R +\name{arrow_info} +\alias{arrow_info} +\title{Report information on the package's capabilities} +\usage{ +arrow_info() +} +\value{ +A list including version information, boolean "capabilities", +statistics from Arrow's memory allocator, and Arrow's run-time +information. +} +\description{ +This function summarizes a number of build-time configurations and run-time +settings for the Arrow package. It may be useful for diagnostics.
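A brief usage sketch (the element name below follows the Value description above and is an assumption, not a stable contract):

\preformatted{info <- arrow_info()
# Inspect which optional features this build was compiled with
info$capabilities
}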
+} diff --git a/src/arrow/r/man/buffer.Rd b/src/arrow/r/man/buffer.Rd new file mode 100644 index 000000000..a3ca1fc2f --- /dev/null +++ b/src/arrow/r/man/buffer.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/buffer.R +\docType{class} +\name{buffer} +\alias{buffer} +\alias{Buffer} +\title{Buffer class} +\usage{ +buffer(x) +} +\arguments{ +\item{x}{R object. Only raw, numeric and integer vectors are currently supported} +} +\value{ +an instance of \code{Buffer} that borrows memory from \code{x} +} +\description{ +A Buffer is an object containing a pointer to a piece of +contiguous memory with a particular size. +} +\section{Factory}{ + +\code{buffer()} lets you create an \code{arrow::Buffer} from an R object +} + +\section{Methods}{ + +\itemize{ +\item \verb{$is_mutable} : is this buffer mutable? +\item \verb{$ZeroPadding()} : zero bytes in padding, i.e. bytes between size and capacity +\item \verb{$size} : size in memory, in bytes +\item \verb{$capacity}: possible capacity, in bytes +} +} + +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +my_buffer <- buffer(c(1, 2, 3, 4)) +my_buffer$is_mutable +my_buffer$ZeroPadding() +my_buffer$size +my_buffer$capacity +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/call_function.Rd b/src/arrow/r/man/call_function.Rd new file mode 100644 index 000000000..c216af06f --- /dev/null +++ b/src/arrow/r/man/call_function.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compute.R +\name{call_function} +\alias{call_function} +\title{Call an Arrow compute function} +\usage{ +call_function( + function_name, + ..., + args = list(...), + options = empty_named_list() +) +} +\arguments{ +\item{function_name}{string Arrow compute function name} + +\item{...}{Function arguments, which may include \code{Array}, \code{ChunkedArray}, \code{Scalar}, +\code{RecordBatch}, or \code{Table}.} + +\item{args}{list arguments as an alternative to specifying in \code{...}} + +\item{options}{named list of C++ function options.} +} +\value{ +An \code{Array}, \code{ChunkedArray}, \code{Scalar}, \code{RecordBatch}, or \code{Table}, whatever the compute function results in. +} +\description{ +This function provides a lower-level API for calling Arrow functions by their +string function name. You won't use it directly for most applications. +Many Arrow compute functions are mapped to R methods, +and in a \code{dplyr} evaluation context, \link[=list_compute_functions]{all Arrow functions} +are callable with an \code{arrow_} prefix. +} +\details{ +When passing indices in \code{...}, \code{args}, or \code{options}, express them as +0-based integers (consistent with C++). +} +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +a <- Array$create(c(1L, 2L, 3L, NA, 5L)) +s <- Scalar$create(4L) +call_function("coalesce", a, s) + +a <- Array$create(rnorm(10000)) +call_function("quantile", a, options = list(q = seq(0, 1, 0.25))) +\dontshow{\}) # examplesIf} +} +\seealso{ +\href{https://arrow.apache.org/docs/cpp/compute.html}{Arrow C++ documentation} for +the functions and their respective options. 
+} diff --git a/src/arrow/r/man/cast_options.Rd b/src/arrow/r/man/cast_options.Rd new file mode 100644 index 000000000..40d78052c --- /dev/null +++ b/src/arrow/r/man/cast_options.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compute.R +\name{cast_options} +\alias{cast_options} +\title{Cast options} +\usage{ +cast_options(safe = TRUE, ...) +} +\arguments{ +\item{safe}{logical: enforce safe conversion? Default \code{TRUE}} + +\item{...}{additional cast options, such as \code{allow_int_overflow}, +\code{allow_time_truncate}, and \code{allow_float_truncate}, which are set to \code{!safe} +by default} +} +\value{ +A list +} +\description{ +Cast options +} +\keyword{internal} diff --git a/src/arrow/r/man/codec_is_available.Rd b/src/arrow/r/man/codec_is_available.Rd new file mode 100644 index 000000000..b3238ff1d --- /dev/null +++ b/src/arrow/r/man/codec_is_available.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compression.R +\name{codec_is_available} +\alias{codec_is_available} +\title{Check whether a compression codec is available} +\usage{ +codec_is_available(type) +} +\arguments{ +\item{type}{A string, one of "uncompressed", "snappy", "gzip", "brotli", +"zstd", "lz4", "lzo", or "bz2", case insensitive.} +} +\value{ +Logical: is \code{type} available? +} +\description{ +Support for compression libraries depends on the build-time settings of +the Arrow C++ library. This function lets you know which are available for +use. +} +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +codec_is_available("gzip") +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/compression.Rd b/src/arrow/r/man/compression.Rd new file mode 100644 index 000000000..7cdb320d6 --- /dev/null +++ b/src/arrow/r/man/compression.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compression.R +\docType{class} +\name{compression} +\alias{compression} +\alias{CompressedOutputStream} +\alias{CompressedInputStream} +\title{Compressed stream classes} +\description{ +\code{CompressedInputStream} and \code{CompressedOutputStream} +allow you to apply a compression \link{Codec} to an +input or output stream. +} +\section{Factory}{ + + +The \code{CompressedInputStream$create()} and \code{CompressedOutputStream$create()} +factory methods instantiate the object and take the following arguments: +\itemize{ +\item \code{stream} An \link{InputStream} or \link{OutputStream}, respectively +\item \code{codec} A \code{Codec}, either a \link{Codec} instance or a string +\item \code{compression_level} compression level for when the \code{codec} argument is given as a string +} +} + +\section{Methods}{ + + +Methods are inherited from \link{InputStream} and \link{OutputStream}, respectively +} + diff --git a/src/arrow/r/man/contains_regex.Rd b/src/arrow/r/man/contains_regex.Rd new file mode 100644 index 000000000..f05f11d02 --- /dev/null +++ b/src/arrow/r/man/contains_regex.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr-functions.R +\name{contains_regex} +\alias{contains_regex} +\title{Does this string contain regex metacharacters?} +\usage{ +contains_regex(string) +} +\arguments{ +\item{string}{String to be tested} +} +\value{ +Logical: does \code{string} contain regex metacharacters? +} +\description{ +Does this string contain regex metacharacters? 
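A rough sketch of the expected behavior; the function is internal, so it is reached via \code{:::}, and the inputs are hypothetical:

\preformatted{arrow:::contains_regex("a.b")    # TRUE: "." is a regex metacharacter
arrow:::contains_regex("plain")  # FALSE: no metacharacters present
}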
+} +\keyword{internal} diff --git a/src/arrow/r/man/copy_files.Rd b/src/arrow/r/man/copy_files.Rd new file mode 100644 index 000000000..1b83703f1 --- /dev/null +++ b/src/arrow/r/man/copy_files.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filesystem.R +\name{copy_files} +\alias{copy_files} +\title{Copy files between FileSystems} +\usage{ +copy_files(from, to, chunk_size = 1024L * 1024L) +} +\arguments{ +\item{from}{A string path to a local directory or file, a URI, or a +\code{SubTreeFileSystem}. Files will be copied recursively from this path.} + +\item{to}{A string path to a local directory or file, a URI, or a +\code{SubTreeFileSystem}. Directories will be created as necessary} + +\item{chunk_size}{The maximum size of block to read before flushing +to the destination file. A larger chunk_size will use more memory while +copying but may help accommodate high latency FileSystems.} +} +\value{ +Nothing: called for side effects in the file system +} +\description{ +Copy files between FileSystems +} +\examples{ +\dontshow{if (FALSE) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# Copy an S3 bucket's files to a local directory: +copy_files("s3://your-bucket-name", "local-directory") +# Using a FileSystem object +copy_files(s3_bucket("your-bucket-name"), "local-directory") +# Or go the other way, from local to S3 +copy_files("local-directory", s3_bucket("your-bucket-name")) +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/cpu_count.Rd b/src/arrow/r/man/cpu_count.Rd new file mode 100644 index 000000000..f2abfc197 --- /dev/null +++ b/src/arrow/r/man/cpu_count.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{cpu_count} +\alias{cpu_count} +\alias{set_cpu_count} +\title{Manage the global CPU thread pool in libarrow} +\usage{ +cpu_count() + +set_cpu_count(num_threads) +} +\arguments{ +\item{num_threads}{integer: New number of threads for thread pool} +} +\description{ +Manage the global CPU thread pool in libarrow +} diff --git a/src/arrow/r/man/create_package_with_all_dependencies.Rd b/src/arrow/r/man/create_package_with_all_dependencies.Rd new file mode 100644 index 000000000..b2da8c249 --- /dev/null +++ b/src/arrow/r/man/create_package_with_all_dependencies.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/install-arrow.R +\name{create_package_with_all_dependencies} +\alias{create_package_with_all_dependencies} +\title{Create a source bundle that includes all thirdparty dependencies} +\usage{ +create_package_with_all_dependencies(dest_file = NULL, source_file = NULL) +} +\arguments{ +\item{dest_file}{File path for the new tar.gz package. Defaults to +\code{arrow_V.V.V_with_deps.tar.gz} in the current directory (\code{V.V.V} is the version)} + +\item{source_file}{File path for the input tar.gz package. Defaults to +downloading the package from CRAN (or whatever you have set as the first in +\code{getOption("repos")})} +} +\value{ +The full path to \code{dest_file}, invisibly + +This function is used for setting up an offline build. If it's possible to +download at build time, don't use this function. Instead, let \code{cmake} +download the required dependencies for you. +These downloaded dependencies are only used in the build if +\code{ARROW_DEPENDENCY_SOURCE} is unset, \code{BUNDLED}, or \code{AUTO}. 
+https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds + +If you're using binary packages you shouldn't need to use this function. You +should download the appropriate binary from your package repository, transfer +that to the offline computer, and install that. Any OS can create the source +bundle, but it cannot be installed on Windows. (Instead, use a standard +Windows binary package.) + +Note if you're using RStudio Package Manager on Linux: If you still want to +make a source bundle with this function, make sure to set the first repo in +\code{options("repos")} to be a mirror that contains source packages (that is: +something other than the RSPM binary mirror URLs). +\subsection{Steps for an offline install with optional dependencies:}{ +\subsection{Using a computer with internet access, pre-download the dependencies:}{ +\itemize{ +\item Install the \code{arrow} package \emph{or} run +\code{source("https://raw.githubusercontent.com/apache/arrow/master/r/R/install-arrow.R")} +\item Run \code{create_package_with_all_dependencies("my_arrow_pkg.tar.gz")} +\item Copy the newly created \code{my_arrow_pkg.tar.gz} to the computer without internet access +} +} + +\subsection{On the computer without internet access, install the prepared package:}{ +\itemize{ +\item Install the \code{arrow} package from the copied file +\itemize{ +\item \code{install.packages("my_arrow_pkg.tar.gz", dependencies = c("Depends", "Imports", "LinkingTo"))} +\item This installation will build from source, so \code{cmake} must be available +} +\item Run \code{\link[=arrow_info]{arrow_info()}} to check installed capabilities +} +} + +} +} +\description{ +Create a source bundle that includes all thirdparty dependencies +} +\examples{ +\dontrun{ +new_pkg <- create_package_with_all_dependencies() +# Note: this works when run in the same R session, but it's meant to be +# copied to a different computer. +install.packages(new_pkg, dependencies = c("Depends", "Imports", "LinkingTo")) +} +} diff --git a/src/arrow/r/man/data-type.Rd b/src/arrow/r/man/data-type.Rd new file mode 100644 index 000000000..a06318975 --- /dev/null +++ b/src/arrow/r/man/data-type.Rd @@ -0,0 +1,163 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/type.R +\name{data-type} +\alias{data-type} +\alias{int8} +\alias{int16} +\alias{int32} +\alias{int64} +\alias{uint8} +\alias{uint16} +\alias{uint32} +\alias{uint64} +\alias{float16} +\alias{halffloat} +\alias{float32} +\alias{float} +\alias{float64} +\alias{boolean} +\alias{bool} +\alias{utf8} +\alias{large_utf8} +\alias{binary} +\alias{large_binary} +\alias{fixed_size_binary} +\alias{string} +\alias{date32} +\alias{date64} +\alias{time32} +\alias{time64} +\alias{null} +\alias{timestamp} +\alias{decimal} +\alias{struct} +\alias{list_of} +\alias{large_list_of} +\alias{FixedSizeListType} +\alias{fixed_size_list_of} +\title{Apache Arrow data types} +\usage{ +int8() + +int16() + +int32() + +int64() + +uint8() + +uint16() + +uint32() + +uint64() + +float16() + +halffloat() + +float32() + +float() + +float64() + +boolean() + +bool() + +utf8() + +large_utf8() + +binary() + +large_binary() + +fixed_size_binary(byte_width) + +string() + +date32() + +date64() + +time32(unit = c("ms", "s")) + +time64(unit = c("ns", "us")) + +null() + +timestamp(unit = c("s", "ms", "us", "ns"), timezone = "") + +decimal(precision, scale) + +struct(...) 
+ +list_of(type) + +large_list_of(type) + +fixed_size_list_of(type, list_size) +} +\arguments{ +\item{byte_width}{byte width for \code{FixedSizeBinary} type.} + +\item{unit}{For time/timestamp types, the time unit. \code{time32()} can take +either "s" or "ms", while \code{time64()} can be "us" or "ns". \code{timestamp()} can +take any of those four values.} + +\item{timezone}{For \code{timestamp()}, an optional time zone string.} + +\item{precision}{For \code{decimal()}, precision} + +\item{scale}{For \code{decimal()}, scale} + +\item{...}{For \code{struct()}, a named list of types to define the struct columns} + +\item{type}{For \code{list_of()}, a data type to make a list-of-type} + +\item{list_size}{list size for \code{FixedSizeList} type.} +} +\value{ +An Arrow type object inheriting from DataType. +} +\description{ +These functions create type objects corresponding to Arrow types. Use them +when defining a \code{\link[=schema]{schema()}} or as inputs to other types, like \code{struct}. Most +of these functions don't take arguments, but a few do. +} +\details{ +A few functions have aliases: +\itemize{ +\item \code{utf8()} and \code{string()} +\item \code{float16()} and \code{halffloat()} +\item \code{float32()} and \code{float()} +\item \code{bool()} and \code{boolean()} +\item When called inside an \code{arrow} function, such as \code{schema()} or \code{cast()}, +\code{double()} is also supported as a way of creating a \code{float64()} +} + +\code{date32()} creates a datetime type with a "day" unit, like the R \code{Date} +class. \code{date64()} has a "ms" unit. + +\code{uint32} (32-bit unsigned integer), \code{uint64} (64-bit unsigned integer), and +\code{int64} (64-bit signed integer) types may contain values that exceed the +range of R's \code{integer} type (32-bit signed integer). When these arrow objects +are translated to R objects, \code{uint32} and \code{uint64} are converted to \code{double} +("numeric") and \code{int64} is converted to \code{bit64::integer64}. For \code{int64} +types, this conversion can be disabled (so that \code{int64} always yields a +\code{bit64::integer64} object) by setting \code{options(arrow.int64_downcast = FALSE)}. +} +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +bool() +struct(a = int32(), b = double()) +timestamp("ms", timezone = "CEST") +time64("ns") +\dontshow{\}) # examplesIf} +} +\seealso{ +\code{\link[=dictionary]{dictionary()}} for creating a dictionary (factor-like) type. +} diff --git a/src/arrow/r/man/dataset_factory.Rd b/src/arrow/r/man/dataset_factory.Rd new file mode 100644 index 000000000..d119c150b --- /dev/null +++ b/src/arrow/r/man/dataset_factory.Rd @@ -0,0 +1,76 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset-factory.R +\name{dataset_factory} +\alias{dataset_factory} +\title{Create a DatasetFactory} +\usage{ +dataset_factory( + x, + filesystem = NULL, + format = c("parquet", "arrow", "ipc", "feather", "csv", "tsv", "text"), + partitioning = NULL, + ... +) +} +\arguments{ +\item{x}{A string path to a directory containing data files, a vector of +one or more string paths to data files, or a list of \code{DatasetFactory} objects +whose datasets should be combined.
If this argument is specified it will be +used to construct a \code{UnionDatasetFactory} and other arguments will be +ignored.} + +\item{filesystem}{A \link{FileSystem} object; if omitted, the \code{FileSystem} will +be detected from \code{x}} + +\item{format}{A \link{FileFormat} object, or a string identifier of the format of +the files in \code{x}. Currently supported values: +\itemize{ +\item "parquet" +\item "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that +only version 2 files are supported +\item "csv"/"text", aliases for the same thing (because comma is the default +delimiter for text files) +\item "tsv", equivalent to passing \verb{format = "text", delimiter = "\\t"} +} + +Default is "parquet", unless a \code{delimiter} is also specified, in which case +it is assumed to be "text".} + +\item{partitioning}{One of +\itemize{ +\item A \code{Schema}, in which case the file paths relative to \code{sources} will be +parsed, and path segments will be matched with the schema fields. For +example, \code{schema(year = int16(), month = int8())} would create partitions +for file paths like "2019/01/file.parquet", "2019/02/file.parquet", etc. +\item A character vector that defines the field names corresponding to those +path segments (that is, you're providing the names that would correspond +to a \code{Schema} but the types will be autodetected) +\item A \code{HivePartitioning} or \code{HivePartitioningFactory}, as returned +by \code{\link[=hive_partition]{hive_partition()}} which parses explicit or autodetected fields from +Hive-style path segments +\item \code{NULL} for no partitioning +}} + +\item{...}{Additional format-specific options, passed to +\code{FileFormat$create()}. For CSV options, note that you can specify them either +with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the +\code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.). +Not all \code{readr} options are currently supported; please file an issue if you +encounter one that \code{arrow} should support.} +} +\value{ +A \code{DatasetFactory} object. Pass this to \code{\link[=open_dataset]{open_dataset()}}, +in a list potentially with other \code{DatasetFactory} objects, to create +a \code{Dataset}. +} +\description{ +A \link{Dataset} can be constructed using one or more \link{DatasetFactory}s. +This function helps you construct a \code{DatasetFactory} that you can pass to +\code{\link[=open_dataset]{open_dataset()}}. +} +\details{ +If you have only a single \code{DatasetFactory} (for example, you have a +single directory containing Parquet files), you can call \code{open_dataset()} +directly. Use \code{dataset_factory()} when you +want to combine different directories, file systems, or file formats, as in the sketch below.
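A minimal sketch under assumed local paths (hypothetical directories, one of Parquet files and one of CSVs with compatible columns):

\preformatted{parquet_files <- dataset_factory("data/parquet_dir", format = "parquet")
csv_files <- dataset_factory("data/csv_dir", format = "csv")
# Per the Value section above, pass the factories as a list to open_dataset()
ds <- open_dataset(list(parquet_files, csv_files))
}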
+} diff --git a/src/arrow/r/man/default_memory_pool.Rd b/src/arrow/r/man/default_memory_pool.Rd new file mode 100644 index 000000000..232a89e6a --- /dev/null +++ b/src/arrow/r/man/default_memory_pool.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/memory-pool.R +\name{default_memory_pool} +\alias{default_memory_pool} +\title{Arrow's default \link{MemoryPool}} +\usage{ +default_memory_pool() +} +\value{ +the default \link{MemoryPool} +} +\description{ +Arrow's default \link{MemoryPool} +} +\keyword{internal} diff --git a/src/arrow/r/man/dictionary.Rd b/src/arrow/r/man/dictionary.Rd new file mode 100644 index 000000000..d4b934954 --- /dev/null +++ b/src/arrow/r/man/dictionary.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dictionary.R +\name{dictionary} +\alias{dictionary} +\title{Create a dictionary type} +\usage{ +dictionary(index_type = int32(), value_type = utf8(), ordered = FALSE) +} +\arguments{ +\item{index_type}{A DataType for the indices (default \code{\link[=int32]{int32()}})} + +\item{value_type}{A DataType for the values (default \code{\link[=utf8]{utf8()}})} + +\item{ordered}{Is this an ordered dictionary (default \code{FALSE})?} +} +\value{ +A \link{DictionaryType} +} +\description{ +Create a dictionary type +} +\seealso{ +\link[=data-type]{Other Arrow data types} +} diff --git a/src/arrow/r/man/enums.Rd b/src/arrow/r/man/enums.Rd new file mode 100644 index 000000000..7ec126a01 --- /dev/null +++ b/src/arrow/r/man/enums.Rd @@ -0,0 +1,88 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/enums.R +\docType{data} +\name{enums} +\alias{enums} +\alias{TimeUnit} +\alias{DateUnit} +\alias{Type} +\alias{StatusCode} +\alias{FileMode} +\alias{MessageType} +\alias{CompressionType} +\alias{FileType} +\alias{ParquetVersionType} +\alias{MetadataVersion} +\alias{QuantileInterpolation} +\alias{NullEncodingBehavior} +\alias{NullHandlingBehavior} +\alias{RoundMode} +\alias{JoinType} +\title{Arrow enums} +\format{ +An object of class \code{TimeUnit::type} (inherits from \code{arrow-enum}) of length 4. + +An object of class \code{DateUnit} (inherits from \code{arrow-enum}) of length 2. + +An object of class \code{Type::type} (inherits from \code{arrow-enum}) of length 37. + +An object of class \code{StatusCode} (inherits from \code{arrow-enum}) of length 17. + +An object of class \code{FileMode} (inherits from \code{arrow-enum}) of length 3. + +An object of class \code{MessageType} (inherits from \code{arrow-enum}) of length 5. + +An object of class \code{Compression::type} (inherits from \code{arrow-enum}) of length 9. + +An object of class \code{FileType} (inherits from \code{arrow-enum}) of length 4. + +An object of class \code{ParquetVersionType} (inherits from \code{arrow-enum}) of length 2. + +An object of class \code{MetadataVersion} (inherits from \code{arrow-enum}) of length 5. + +An object of class \code{QuantileInterpolation} (inherits from \code{arrow-enum}) of length 5. + +An object of class \code{NullEncodingBehavior} (inherits from \code{arrow-enum}) of length 2. + +An object of class \code{NullHandlingBehavior} (inherits from \code{arrow-enum}) of length 3. + +An object of class \code{RoundMode} (inherits from \code{arrow-enum}) of length 10. + +An object of class \code{JoinType} (inherits from \code{arrow-enum}) of length 8. 
+} +\usage{ +TimeUnit + +DateUnit + +Type + +StatusCode + +FileMode + +MessageType + +CompressionType + +FileType + +ParquetVersionType + +MetadataVersion + +QuantileInterpolation + +NullEncodingBehavior + +NullHandlingBehavior + +RoundMode + +JoinType +} +\description{ +Arrow enums +} +\keyword{datasets} +\keyword{internal} diff --git a/src/arrow/r/man/flight_connect.Rd b/src/arrow/r/man/flight_connect.Rd new file mode 100644 index 000000000..9da7fad75 --- /dev/null +++ b/src/arrow/r/man/flight_connect.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/flight.R +\name{flight_connect} +\alias{flight_connect} +\title{Connect to a Flight server} +\usage{ +flight_connect(host = "localhost", port, scheme = "grpc+tcp") +} +\arguments{ +\item{host}{string hostname to connect to} + +\item{port}{integer port to connect on} + +\item{scheme}{URL scheme, default is "grpc+tcp"} +} +\value{ +A \code{pyarrow.flight.FlightClient}. +} +\description{ +Connect to a Flight server +} diff --git a/src/arrow/r/man/flight_get.Rd b/src/arrow/r/man/flight_get.Rd new file mode 100644 index 000000000..a79c4d727 --- /dev/null +++ b/src/arrow/r/man/flight_get.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/flight.R +\name{flight_get} +\alias{flight_get} +\title{Get data from a Flight server} +\usage{ +flight_get(client, path) +} +\arguments{ +\item{client}{\code{pyarrow.flight.FlightClient}, as returned by \code{\link[=flight_connect]{flight_connect()}}} + +\item{path}{string identifier under which data is stored} +} +\value{ +A \link{Table} +} +\description{ +Get data from a Flight server +} diff --git a/src/arrow/r/man/flight_put.Rd b/src/arrow/r/man/flight_put.Rd new file mode 100644 index 000000000..13a8da16f --- /dev/null +++ b/src/arrow/r/man/flight_put.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/flight.R +\name{flight_put} +\alias{flight_put} +\title{Send data to a Flight server} +\usage{ +flight_put(client, data, path, overwrite = TRUE) +} +\arguments{ +\item{client}{\code{pyarrow.flight.FlightClient}, as returned by \code{\link[=flight_connect]{flight_connect()}}} + +\item{data}{\code{data.frame}, \link{RecordBatch}, or \link{Table} to upload} + +\item{path}{string identifier to store the data under} + +\item{overwrite}{logical: if \code{path} exists on \code{client} already, should we +replace it with the contents of \code{data}? Default is \code{TRUE}; if \code{FALSE} and +\code{path} exists, the function will error.} +} +\value{ +\code{client}, invisibly. +} +\description{ +Send data to a Flight server +} diff --git a/src/arrow/r/man/get_stringr_pattern_options.Rd b/src/arrow/r/man/get_stringr_pattern_options.Rd new file mode 100644 index 000000000..7107b9060 --- /dev/null +++ b/src/arrow/r/man/get_stringr_pattern_options.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr-functions.R +\name{get_stringr_pattern_options} +\alias{get_stringr_pattern_options} +\title{Get \code{stringr} pattern options} +\usage{ +get_stringr_pattern_options(pattern) +} +\arguments{ +\item{pattern}{Unevaluated expression containing a call to a \code{stringr} +pattern modifier function} +} +\value{ +List containing elements \code{pattern}, \code{fixed}, and \code{ignore_case} +} +\description{ +This function assigns definitions for the \code{stringr} pattern modifier +functions (\code{fixed()}, \code{regex()}, etc.) 
inside itself, and uses them to +evaluate the quoted expression \code{pattern}, returning a list that is used +to control pattern matching behavior in internal \code{arrow} functions. +} +\keyword{internal} diff --git a/src/arrow/r/man/hive_partition.Rd b/src/arrow/r/man/hive_partition.Rd new file mode 100644 index 000000000..eef9f9157 --- /dev/null +++ b/src/arrow/r/man/hive_partition.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset-partition.R +\name{hive_partition} +\alias{hive_partition} +\title{Construct Hive partitioning} +\usage{ +hive_partition(..., null_fallback = NULL, segment_encoding = "uri") +} +\arguments{ +\item{...}{named list of \link[=data-type]{data types}, passed to \code{\link[=schema]{schema()}}} + +\item{null_fallback}{character to be used in place of missing values (\code{NA} or \code{NULL}) +in partition columns. Default is \code{"__HIVE_DEFAULT_PARTITION__"}, +which is what Hive uses.} + +\item{segment_encoding}{Decode partition segments after splitting paths. +Default is \code{"uri"} (URI-decode segments). May also be \code{"none"} (leave as-is).} +} +\value{ +A \link[=Partitioning]{HivePartitioning}, or a \code{HivePartitioningFactory} if +calling \code{hive_partition()} with no arguments. +} +\description{ +Hive partitioning embeds field names and values in path segments, such as +"/year=2019/month=2/data.parquet". +} +\details{ +Because fields are named in the path segments, order of fields passed to +\code{hive_partition()} does not matter. +} +\examples{ +\dontshow{if (arrow_with_dataset()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +hive_partition(year = int16(), month = int8()) +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/install_arrow.Rd b/src/arrow/r/man/install_arrow.Rd new file mode 100644 index 000000000..bf94650b3 --- /dev/null +++ b/src/arrow/r/man/install_arrow.Rd @@ -0,0 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/install-arrow.R +\name{install_arrow} +\alias{install_arrow} +\title{Install or upgrade the Arrow library} +\usage{ +install_arrow( + nightly = FALSE, + binary = Sys.getenv("LIBARROW_BINARY", TRUE), + use_system = Sys.getenv("ARROW_USE_PKG_CONFIG", FALSE), + minimal = Sys.getenv("LIBARROW_MINIMAL", FALSE), + verbose = Sys.getenv("ARROW_R_DEV", FALSE), + repos = getOption("repos"), + ... +) +} +\arguments{ +\item{nightly}{logical: Should we install a development version of the +package, or should we install from CRAN (the default).} + +\item{binary}{On Linux, value to set for the environment variable +\code{LIBARROW_BINARY}, which governs how C++ binaries are used, if at all. +The default value, \code{TRUE}, tells the installation script to detect the +Linux distribution and version and find an appropriate C++ library. \code{FALSE} +would tell the script not to retrieve a binary and instead build Arrow C++ +from source. Other valid values are strings corresponding to a Linux +distribution-version, to override the value that would be detected. +See \code{vignette("install", package = "arrow")} for further details.} + +\item{use_system}{logical: Should we use \code{pkg-config} to look for Arrow +system packages? Default is \code{FALSE}. If \code{TRUE}, source installation may be +faster, but there is a risk of version mismatch. 
This sets the +\code{ARROW_USE_PKG_CONFIG} environment variable.} + +\item{minimal}{logical: If building from source, should we build without +optional dependencies (compression libraries, for example)? Default is +\code{FALSE}. This sets the \code{LIBARROW_MINIMAL} environment variable.} + +\item{verbose}{logical: Print more debugging output when installing? Default +is \code{FALSE}. This sets the \code{ARROW_R_DEV} environment variable.} + +\item{repos}{character vector of base URLs of the repositories to install +from (passed to \code{install.packages()})} + +\item{...}{Additional arguments passed to \code{install.packages()}} +} +\description{ +Use this function to install the latest release of \code{arrow}, to switch to or +from a nightly development version, or on Linux to try reinstalling with +all necessary C++ dependencies. +} +\details{ +Note that, unlike packages like \code{tensorflow}, \code{blogdown}, and others that +require external dependencies, you do not need to run \code{install_arrow()} +after a successful \code{arrow} installation. +} +\seealso{ +\code{\link[=arrow_available]{arrow_available()}} to see if the package was configured with +necessary C++ dependencies. \code{vignette("install", package = "arrow")} for +more ways to tune installation on Linux. +} diff --git a/src/arrow/r/man/install_pyarrow.Rd b/src/arrow/r/man/install_pyarrow.Rd new file mode 100644 index 000000000..223a26754 --- /dev/null +++ b/src/arrow/r/man/install_pyarrow.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/python.R +\name{install_pyarrow} +\alias{install_pyarrow} +\title{Install pyarrow for use with reticulate} +\usage{ +install_pyarrow(envname = NULL, nightly = FALSE, ...) +} +\arguments{ +\item{envname}{The name or full path of the Python environment to install +into. This can be a virtualenv or conda environment created by \code{reticulate}. +See \code{reticulate::py_install()}.} + +\item{nightly}{logical: Should we install a development version of the +package? Default is to use the official release version.} + +\item{...}{additional arguments passed to \code{reticulate::py_install()}.} +} +\description{ +\code{pyarrow} is the Python package for Apache Arrow. This function helps with +installing it for use with \code{reticulate}. +} diff --git a/src/arrow/r/man/io_thread_count.Rd b/src/arrow/r/man/io_thread_count.Rd new file mode 100644 index 000000000..b1dfa0ba7 --- /dev/null +++ b/src/arrow/r/man/io_thread_count.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{io_thread_count} +\alias{io_thread_count} +\alias{set_io_thread_count} +\title{Manage the global I/O thread pool in libarrow} +\usage{ +io_thread_count() + +set_io_thread_count(num_threads) +} +\arguments{ +\item{num_threads}{integer: New number of threads for thread pool} +} +\description{ +Manage the global I/O thread pool in libarrow +} diff --git a/src/arrow/r/man/list_compute_functions.Rd b/src/arrow/r/man/list_compute_functions.Rd new file mode 100644 index 000000000..45e033836 --- /dev/null +++ b/src/arrow/r/man/list_compute_functions.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compute.R +\name{list_compute_functions} +\alias{list_compute_functions} +\title{List available Arrow C++ compute functions} +\usage{ +list_compute_functions(pattern = NULL, ...) 
+} +\arguments{ +\item{pattern}{Optional regular expression to filter the function list} + +\item{...}{Additional parameters passed to \code{grep()}} +} +\value{ +A character vector of available Arrow C++ function names +} +\description{ +This function lists the names of all available Arrow C++ library compute functions. +These can be called by passing their name to \code{\link[=call_function]{call_function()}}, or they can be +called by name with an \code{arrow_} prefix inside a \code{dplyr} verb. +} +\details{ +The resulting list describes the capabilities of your \code{arrow} build. +Some functions, such as string and regular expression functions, +require optional build-time C++ dependencies. If your \code{arrow} package +was not compiled with those features enabled, those functions will +not appear in this list. + +Some functions take options that need to be passed when calling them +(in a list called \code{options}). These options require custom handling +in C++; many functions already have that handling set up but not all do. +If you encounter one that needs special handling for options, please +report an issue. + +Note that this list does \emph{not} enumerate all of the R bindings for these functions. +The package includes Arrow methods for many base R functions that can +be called directly on Arrow objects, as well as some tidyverse-flavored versions +available inside \code{dplyr} verbs. +} +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +available_funcs <- list_compute_functions() +utf8_funcs <- list_compute_functions(pattern = "^UTF8", ignore.case = TRUE) +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/list_flights.Rd b/src/arrow/r/man/list_flights.Rd new file mode 100644 index 000000000..d8ebb0d02 --- /dev/null +++ b/src/arrow/r/man/list_flights.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/flight.R +\name{list_flights} +\alias{list_flights} +\alias{flight_path_exists} +\title{See available resources on a Flight server} +\usage{ +list_flights(client) + +flight_path_exists(client, path) +} +\arguments{ +\item{client}{\code{pyarrow.flight.FlightClient}, as returned by \code{\link[=flight_connect]{flight_connect()}}} + +\item{path}{string identifier under which data is stored} +} +\value{ +\code{list_flights()} returns a character vector of paths. +\code{flight_path_exists()} returns a logical value, the equivalent of \code{path \%in\% list_flights()}. +} +\description{ +See available resources on a Flight server +} diff --git a/src/arrow/r/man/load_flight_server.Rd b/src/arrow/r/man/load_flight_server.Rd new file mode 100644 index 000000000..66d30f391 --- /dev/null +++ b/src/arrow/r/man/load_flight_server.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/flight.R +\name{load_flight_server} +\alias{load_flight_server} +\title{Load a Python Flight server} +\usage{ +load_flight_server(name, path = system.file(package = "arrow")) +} +\arguments{ +\item{name}{string Python module name} + +\item{path}{file system path where the Python module is found.
Default is +to look in the \verb{inst/} directory for included modules.} +} +\description{ +Load a Python Flight server +} +\examples{ +\dontshow{if (FALSE) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +load_flight_server("demo_flight_server") +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/make_readable_file.Rd b/src/arrow/r/man/make_readable_file.Rd new file mode 100644 index 000000000..fe2e29826 --- /dev/null +++ b/src/arrow/r/man/make_readable_file.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\name{make_readable_file} +\alias{make_readable_file} +\title{Handle a range of possible input sources} +\usage{ +make_readable_file(file, mmap = TRUE, compression = NULL, filesystem = NULL) +} +\arguments{ +\item{file}{A character file name, \code{raw} vector, or an Arrow input stream} + +\item{mmap}{Logical: whether to memory-map the file (default \code{TRUE})} + +\item{compression}{If the file is compressed, create a \link{CompressedInputStream} +with this compression codec, either a \link{Codec} or the string name of one. +If \code{NULL} (default) and \code{file} is a string file name, the function will try +to infer compression from the file extension.} + +\item{filesystem}{If not \code{NULL}, \code{file} will be opened via the +\code{filesystem$OpenInputFile()} filesystem method, rather than the \code{io} module's +\code{MemoryMappedFile} or \code{ReadableFile} constructors.} +} +\value{ +An \code{InputStream} or a subclass of one. +} +\description{ +Handle a range of possible input sources +} +\keyword{internal} diff --git a/src/arrow/r/man/map_batches.Rd b/src/arrow/r/man/map_batches.Rd new file mode 100644 index 000000000..08e7b86c0 --- /dev/null +++ b/src/arrow/r/man/map_batches.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset-scan.R +\name{map_batches} +\alias{map_batches} +\title{Apply a function to a stream of RecordBatches} +\usage{ +map_batches(X, FUN, ..., .data.frame = TRUE) +} +\arguments{ +\item{X}{A \code{Dataset} or \code{arrow_dplyr_query} object, as returned by the +\code{dplyr} methods on \code{Dataset}.} + +\item{FUN}{A function or \code{purrr}-style lambda expression to apply to each +batch} + +\item{...}{Additional arguments passed to \code{FUN}} + +\item{.data.frame}{logical: collect the resulting chunks into a single +\code{data.frame}? Default \code{TRUE}} +} +\description{ +As an alternative to calling \code{collect()} on a \code{Dataset} query, you can +use this function to access the stream of \code{RecordBatch}es in the \code{Dataset}. +This lets you aggregate on each chunk and pull the intermediate results into +a \code{data.frame} for further aggregation, even if you couldn't fit the whole +\code{Dataset} result in memory. +} +\details{ +This is experimental and not recommended for production use. +} diff --git a/src/arrow/r/man/match_arrow.Rd b/src/arrow/r/man/match_arrow.Rd new file mode 100644 index 000000000..877a41926 --- /dev/null +++ b/src/arrow/r/man/match_arrow.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compute.R +\name{match_arrow} +\alias{match_arrow} +\alias{is_in} +\title{\code{match} and \code{\%in\%} for Arrow objects} +\usage{ +match_arrow(x, table, ...) + +is_in(x, table, ...)
+} +\arguments{ +\item{x}{\code{Scalar}, \code{Array} or \code{ChunkedArray}} + +\item{table}{\code{Scalar}, \code{Array}, \code{ChunkedArray}, or R vector lookup table.} + +\item{...}{additional arguments, ignored} +} +\value{ +\code{match_arrow()} returns an \code{int32}-type Arrow object of the same length +and type as \code{x} with the (0-based) indexes into \code{table}. \code{is_in()} returns a +\code{boolean}-type Arrow object of the same length and type as \code{x} with values indicating, +for each element of \code{x}, whether it is present in \code{table}. +} +\description{ +\code{base::match()} is not a generic, so we can't just define Arrow methods for +it. This function exposes the analogous functions in the Arrow C++ library. +} +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# note that the returned value is 0-indexed +cars_tbl <- arrow_table(name = rownames(mtcars), mtcars) +match_arrow(Scalar$create("Mazda RX4 Wag"), cars_tbl$name) + +is_in(Array$create("Mazda RX4 Wag"), cars_tbl$name) + +# Although there are multiple matches, you are returned the index of the first +# match, as with the base R equivalent +match(4, mtcars$cyl) # 1-indexed +match_arrow(Scalar$create(4), cars_tbl$cyl) # 0-indexed + +# If `x` contains multiple values, you are returned the indices of the first +# match for each value. +match(c(4, 6, 8), mtcars$cyl) +match_arrow(Array$create(c(4, 6, 8)), cars_tbl$cyl) + +# Return type matches type of `x` +is_in(c(4, 6, 8), mtcars$cyl) # returns vector +is_in(Scalar$create(4), mtcars$cyl) # returns Scalar +is_in(Array$create(c(4, 6, 8)), cars_tbl$cyl) # returns Array +is_in(ChunkedArray$create(c(4, 6), 8), cars_tbl$cyl) # returns ChunkedArray +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/mmap_create.Rd b/src/arrow/r/man/mmap_create.Rd new file mode 100644 index 000000000..b85519348 --- /dev/null +++ b/src/arrow/r/man/mmap_create.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\name{mmap_create} +\alias{mmap_create} +\title{Create a new read/write memory mapped file of a given size} +\usage{ +mmap_create(path, size) +} +\arguments{ +\item{path}{file path} + +\item{size}{size in bytes} +} +\value{ +a \link[=MemoryMappedFile]{arrow::io::MemoryMappedFile} +} +\description{ +Create a new read/write memory mapped file of a given size +} diff --git a/src/arrow/r/man/mmap_open.Rd b/src/arrow/r/man/mmap_open.Rd new file mode 100644 index 000000000..d0047a72c --- /dev/null +++ b/src/arrow/r/man/mmap_open.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\name{mmap_open} +\alias{mmap_open} +\title{Open a memory mapped file} +\usage{ +mmap_open(path, mode = c("read", "write", "readwrite")) +} +\arguments{ +\item{path}{file path} + +\item{mode}{file mode (read/write/readwrite)} +} +\description{ +Open a memory mapped file +} diff --git a/src/arrow/r/man/open_dataset.Rd b/src/arrow/r/man/open_dataset.Rd new file mode 100644 index 000000000..4d6b492e3 --- /dev/null +++ b/src/arrow/r/man/open_dataset.Rd @@ -0,0 +1,146 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset.R +\name{open_dataset} +\alias{open_dataset} +\title{Open a multi-file dataset} +\usage{ +open_dataset( + sources, + schema = NULL, + partitioning = hive_partition(), + unify_schemas = NULL, + format = c("parquet", "arrow", "ipc", "feather", "csv", "tsv", "text"), + ...
+) +} +\arguments{ +\item{sources}{One of: +\itemize{ +\item a string path or URI to a directory containing data files +\item a string path or URI to a single file +\item a character vector of paths or URIs to individual data files +\item a list of \code{Dataset} objects as created by this function +\item a list of \code{DatasetFactory} objects as created by \code{\link[=dataset_factory]{dataset_factory()}}. +} + +When \code{sources} is a vector of file URIs, they must all use the same protocol +and point to files located in the same file system and having the same +format.} + +\item{schema}{\link{Schema} for the \code{Dataset}. If \code{NULL} (the default), the schema +will be inferred from the data sources.} + +\item{partitioning}{When \code{sources} is a directory path/URI, one of: +\itemize{ +\item a \code{Schema}, in which case the file paths relative to \code{sources} will be +parsed, and path segments will be matched with the schema fields. For +example, \code{schema(year = int16(), month = int8())} would create partitions +for file paths like \code{"2019/01/file.parquet"}, \code{"2019/02/file.parquet"}, +etc. +\item a character vector that defines the field names corresponding to those +path segments (that is, you're providing the names that would correspond +to a \code{Schema} but the types will be autodetected) +\item a \code{HivePartitioning} or \code{HivePartitioningFactory}, as returned +by \code{\link[=hive_partition]{hive_partition()}} which parses explicit or autodetected fields from +Hive-style path segments +\item \code{NULL} for no partitioning +} + +The default is to autodetect Hive-style partitions. When \code{sources} is not a +directory path/URI, \code{partitioning} is ignored.} + +\item{unify_schemas}{logical: should all data fragments (files, \code{Dataset}s) +be scanned in order to create a unified schema from them? If \code{FALSE}, only +the first fragment will be inspected for its schema. Use this fast path +when you know and trust that all fragments have an identical schema. +The default is \code{FALSE} when creating a dataset from a directory path/URI or +vector of file paths/URIs (because there may be many files and scanning may +be slow) but \code{TRUE} when \code{sources} is a list of \code{Dataset}s (because there +should be few \code{Dataset}s in the list and their \code{Schema}s are already in +memory).} + +\item{format}{A \link{FileFormat} object, or a string identifier of the format of +the files in \code{sources}. This argument is ignored when \code{sources} is a list of \code{Dataset} objects. +Currently supported values: +\itemize{ +\item "parquet" +\item "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that +only version 2 files are supported +\item "csv"/"text", aliases for the same thing (because comma is the default +delimiter for text files) +\item "tsv", equivalent to passing \verb{format = "text", delimiter = "\\t"} +} + +Default is "parquet", unless a \code{delimiter} is also specified, in which case +it is assumed to be "text".} + +\item{...}{additional arguments passed to \code{dataset_factory()} when \code{sources} +is a directory path/URI or vector of file paths/URIs, otherwise ignored. +These may include \code{format} to indicate the file format, or other +format-specific options.} +} +\value{ +A \link{Dataset} R6 object. Use \code{dplyr} methods on it to query the data, +or call \code{\link[=Scanner]{$NewScan()}} to construct a query directly.
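A minimal sketch of the three partitioning forms described above; "my_data_dir" is a hypothetical directory of Parquet files laid out in year/month subdirectories:

library(arrow)
# 1. Explicit Schema: path segments are parsed with these names and types
ds1 <- open_dataset("my_data_dir",
                    partitioning = schema(year = int16(), month = int8()))
# 2. Character vector: same field names, but types autodetected from the paths
ds2 <- open_dataset("my_data_dir", partitioning = c("year", "month"))
# 3. Hive-style: names and values parsed from segments like "year=2019"
ds3 <- open_dataset("my_data_dir", partitioning = hive_partition())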
+} +\description{ +Arrow Datasets allow you to query against data that has been split across +multiple files. This sharding of data may indicate partitioning, which +can accelerate queries that only touch some partitions (files). Call +\code{open_dataset()} to point to a directory of data files and return a +\code{Dataset}, then use \code{dplyr} methods to query it. +} +\examples{ +\dontshow{if (arrow_with_dataset() & arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# Set up directory for examples +tf <- tempfile() +dir.create(tf) +on.exit(unlink(tf)) + +data <- dplyr::group_by(mtcars, cyl) +write_dataset(data, tf) + +# You can specify a directory containing the files for your dataset and +# open_dataset will scan all files in your directory. +open_dataset(tf) + +# You can also supply a vector of paths +open_dataset(c(file.path(tf, "cyl=4/part-0.parquet"), file.path(tf, "cyl=8/part-0.parquet"))) + +## You must specify the file format if using a format other than parquet. +tf2 <- tempfile() +dir.create(tf2) +on.exit(unlink(tf2)) +write_dataset(data, tf2, format = "ipc") +# This line will result in errors when you try to work with the data +\dontrun{ +open_dataset(tf2) +} +# This line will work +open_dataset(tf2, format = "ipc") + +## You can specify file partitioning to include it as a field in your dataset +# Create a temporary directory and write example dataset +tf3 <- tempfile() +dir.create(tf3) +on.exit(unlink(tf3)) +write_dataset(airquality, tf3, partitioning = c("Month", "Day"), hive_style = FALSE) + +# View files - you can see the partitioning means that files have been written +# to folders based on Month/Day values +tf3_files <- list.files(tf3, recursive = TRUE) + +# With no partitioning specified, dataset contains all files but doesn't include +# directory names as field names +open_dataset(tf3) + +# Now that partitioning has been specified, your dataset contains columns for Month and Day +open_dataset(tf3, partitioning = c("Month", "Day")) + +# If you want to specify the data types for your fields, you can pass in a Schema +open_dataset(tf3, partitioning = schema(Month = int8(), Day = int8())) +\dontshow{\}) # examplesIf} +} +\seealso{ +\code{vignette("dataset", package = "arrow")} +} diff --git a/src/arrow/r/man/read_delim_arrow.Rd b/src/arrow/r/man/read_delim_arrow.Rd new file mode 100644 index 000000000..7bfda29b8 --- /dev/null +++ b/src/arrow/r/man/read_delim_arrow.Rd @@ -0,0 +1,218 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/csv.R +\name{read_delim_arrow} +\alias{read_delim_arrow} +\alias{read_csv_arrow} +\alias{read_tsv_arrow} +\title{Read a CSV or other delimited file with Arrow} +\usage{ +read_delim_arrow( + file, + delim = ",", + quote = "\\"", + escape_double = TRUE, + escape_backslash = FALSE, + schema = NULL, + col_names = TRUE, + col_types = NULL, + col_select = NULL, + na = c("", "NA"), + quoted_na = TRUE, + skip_empty_rows = TRUE, + skip = 0L, + parse_options = NULL, + convert_options = NULL, + read_options = NULL, + as_data_frame = TRUE, + timestamp_parsers = NULL +) + +read_csv_arrow( + file, + quote = "\\"", + escape_double = TRUE, + escape_backslash = FALSE, + schema = NULL, + col_names = TRUE, + col_types = NULL, + col_select = NULL, + na = c("", "NA"), + quoted_na = TRUE, + skip_empty_rows = TRUE, + skip = 0L, + parse_options = NULL, + convert_options = NULL, + read_options = NULL, + as_data_frame = TRUE, + timestamp_parsers = NULL +) + +read_tsv_arrow( + file, + quote =
"\\"", + escape_double = TRUE, + escape_backslash = FALSE, + schema = NULL, + col_names = TRUE, + col_types = NULL, + col_select = NULL, + na = c("", "NA"), + quoted_na = TRUE, + skip_empty_rows = TRUE, + skip = 0L, + parse_options = NULL, + convert_options = NULL, + read_options = NULL, + as_data_frame = TRUE, + timestamp_parsers = NULL +) +} +\arguments{ +\item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream, +or a \code{FileSystem} with path (\code{SubTreeFileSystem}). +If a file name, a memory-mapped Arrow \link{InputStream} will be opened and +closed when finished; compression will be detected from the file extension +and handled automatically. If an input stream is provided, it will be left +open.} + +\item{delim}{Single character used to separate fields within a record.} + +\item{quote}{Single character used to quote strings.} + +\item{escape_double}{Does the file escape quotes by doubling them? +i.e. If this option is \code{TRUE}, the value \verb{""""} represents +a single quote, \verb{\\"}.} + +\item{escape_backslash}{Does the file use backslashes to escape special +characters? This is more general than \code{escape_double} as backslashes +can be used to escape the delimiter character, the quote character, or +to add special characters like \verb{\\\\n}.} + +\item{schema}{\link{Schema} that describes the table. If provided, it will be +used to satisfy both \code{col_names} and \code{col_types}.} + +\item{col_names}{If \code{TRUE}, the first row of the input will be used as the +column names and will not be included in the data frame. If \code{FALSE}, column +names will be generated by Arrow, starting with "f0", "f1", ..., "fN". +Alternatively, you can specify a character vector of column names.} + +\item{col_types}{A compact string representation of the column types, or +\code{NULL} (the default) to infer types from the data.} + +\item{col_select}{A character vector of column names to keep, as in the +"select" argument to \code{data.table::fread()}, or a +\link[tidyselect:vars_select]{tidy selection specification} +of columns, as used in \code{dplyr::select()}.} + +\item{na}{A character vector of strings to interpret as missing values.} + +\item{quoted_na}{Should missing values inside quotes be treated as missing +values (the default) or strings. (Note that this is different from the +the Arrow C++ default for the corresponding convert option, +\code{strings_can_be_null}.)} + +\item{skip_empty_rows}{Should blank rows be ignored altogether? If +\code{TRUE}, blank rows will not be represented at all. If \code{FALSE}, they will be +filled with missings.} + +\item{skip}{Number of lines to skip before reading data.} + +\item{parse_options}{see \link[=CsvReadOptions]{file reader options}. +If given, this overrides any +parsing options provided in other arguments (e.g. \code{delim}, \code{quote}, etc.).} + +\item{convert_options}{see \link[=CsvReadOptions]{file reader options}} + +\item{read_options}{see \link[=CsvReadOptions]{file reader options}} + +\item{as_data_frame}{Should the function return a \code{data.frame} (default) or +an Arrow \link{Table}?} + +\item{timestamp_parsers}{User-defined timestamp parsers. If more than one +parser is specified, the CSV conversion logic will try parsing values +starting from the beginning of this vector. 
Possible values are: +\itemize{ +\item \code{NULL}: the default, which uses the ISO-8601 parser +\item a character vector of \link[base:strptime]{strptime} parse strings +\item a list of \link{TimestampParser} objects +}} +} +\value{ +A \code{data.frame}, or a Table if \code{as_data_frame = FALSE}. +} +\description{ +These functions use the Arrow C++ CSV reader to read data into a \code{data.frame}. +Arrow C++ options have been mapped to argument names that follow those of +\code{readr::read_delim()}, and \code{col_select} was inspired by \code{vroom::vroom()}. +} +\details{ +\code{read_csv_arrow()} and \code{read_tsv_arrow()} are wrappers around +\code{read_delim_arrow()} that specify a delimiter. + +Note that not all \code{readr} options are currently implemented here. Please file +an issue if you encounter one that \code{arrow} should support. + +If you need to control Arrow-specific reader parameters that don't have an +equivalent in \code{readr::read_csv()}, you can either provide them in the +\code{parse_options}, \code{convert_options}, or \code{read_options} arguments, or you can +use \link{CsvTableReader} directly for lower-level access. +} +\section{Specifying column types and names}{ + + +By default, the CSV reader will infer the column names and data types from the file, but there +are a few ways you can specify them directly. + +One way is to provide an Arrow \link{Schema} in the \code{schema} argument, +which is an ordered map of column name to type. +When provided, it satisfies both the \code{col_names} and \code{col_types} arguments. +This is good if you know all of this information up front. + +You can also pass a \code{Schema} to the \code{col_types} argument. If you do this, +column names will still be inferred from the file unless you also specify +\code{col_names}. In either case, the column names in the \code{Schema} must match the +data's column names, whether they are explicitly provided or inferred. That +said, this \code{Schema} does not have to reference all columns: those omitted +will have their types inferred. + +Alternatively, you can declare column types by providing the compact string representation +that \code{readr} uses to the \code{col_types} argument. This means you provide a +single string, one character per column, where the characters map to Arrow +types analogously to the \code{readr} type mapping: +\itemize{ +\item "c": \code{utf8()} +\item "i": \code{int32()} +\item "n": \code{float64()} +\item "d": \code{float64()} +\item "l": \code{bool()} +\item "f": \code{dictionary()} +\item "D": \code{date32()} +\item "T": \code{timestamp()} +\item "t": \code{time32()} +\item "_": \code{null()} +\item "-": \code{null()} +\item "?": infer the type from the data +} + +If you use the compact string representation for \code{col_types}, you must also +specify \code{col_names}. + +Regardless of how types are specified, all columns with a \code{null()} type will +be dropped. + +Note that if you are specifying column names, whether by \code{schema} or +\code{col_names}, and the CSV file has a header row that would otherwise be used +to identify column names, you'll need to add \code{skip = 1} to skip that row.
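A short sketch of the compact-string form described above; the file name and column names here are hypothetical. Because a compact col_types string is given, col_names must also be supplied, and skip = 1 drops the file's own header row:

library(arrow)
df <- read_csv_arrow(
  "data.csv",                            # hypothetical input file
  col_names = c("id", "name", "price"),  # required with a compact col_types string
  col_types = "icn",                     # id: int32, name: utf8, price: float64
  skip = 1                               # skip the header row already in the file
)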
+} + +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +tf <- tempfile() +on.exit(unlink(tf)) +write.csv(mtcars, file = tf) +df <- read_csv_arrow(tf) +dim(df) +# Can select columns +df <- read_csv_arrow(tf, col_select = starts_with("d")) +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/read_feather.Rd b/src/arrow/r/man/read_feather.Rd new file mode 100644 index 000000000..95f4d1d12 --- /dev/null +++ b/src/arrow/r/man/read_feather.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feather.R +\name{read_feather} +\alias{read_feather} +\title{Read a Feather file} +\usage{ +read_feather(file, col_select = NULL, as_data_frame = TRUE, ...) +} +\arguments{ +\item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream, +or a \code{FileSystem} with path (\code{SubTreeFileSystem}). +If a file name or URI, an Arrow \link{InputStream} will be opened and +closed when finished. If an input stream is provided, it will be left +open.} + +\item{col_select}{A character vector of column names to keep, as in the +"select" argument to \code{data.table::fread()}, or a +\link[tidyselect:vars_select]{tidy selection specification} +of columns, as used in \code{dplyr::select()}.} + +\item{as_data_frame}{Should the function return a \code{data.frame} (default) or +an Arrow \link{Table}?} + +\item{...}{additional parameters, passed to \code{\link[=make_readable_file]{make_readable_file()}}.} +} +\value{ +A \code{data.frame} if \code{as_data_frame} is \code{TRUE} (the default), or an +Arrow \link{Table} otherwise +} +\description{ +Feather provides binary columnar serialization for data frames. +It is designed to make reading and writing data frames efficient, +and to make sharing data across data analysis languages easy. +This function reads both the original, limited specification of the format +and the version 2 specification, which is the Apache Arrow IPC file format. +} +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +tf <- tempfile() +on.exit(unlink(tf)) +write_feather(mtcars, tf) +df <- read_feather(tf) +dim(df) +# Can select columns +df <- read_feather(tf, col_select = starts_with("d")) +\dontshow{\}) # examplesIf} +} +\seealso{ +\link{FeatherReader} and \link{RecordBatchReader} for lower-level access to reading Arrow IPC data. +} diff --git a/src/arrow/r/man/read_ipc_stream.Rd b/src/arrow/r/man/read_ipc_stream.Rd new file mode 100644 index 000000000..d4dd78314 --- /dev/null +++ b/src/arrow/r/man/read_ipc_stream.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/deprecated.R, R/ipc_stream.R +\name{read_arrow} +\alias{read_arrow} +\alias{read_ipc_stream} +\title{Read Arrow IPC stream format} +\usage{ +read_arrow(file, ...) + +read_ipc_stream(file, as_data_frame = TRUE, ...) +} +\arguments{ +\item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream, +or a \code{FileSystem} with path (\code{SubTreeFileSystem}). +If a file name or URI, an Arrow \link{InputStream} will be opened and +closed when finished. 
If an input stream is provided, it will be left +open.} + +\item{...}{extra parameters passed to \code{read_feather()}.} + +\item{as_data_frame}{Should the function return a \code{data.frame} (default) or +an Arrow \link{Table}?} +} +\value{ +A \code{data.frame} if \code{as_data_frame} is \code{TRUE} (the default), or an +Arrow \link{Table} otherwise +} +\description{ +Apache Arrow defines two formats for \href{https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc}{serializing data for interprocess communication (IPC)}: +a "stream" format and a "file" format, known as Feather. \code{read_ipc_stream()} +and \code{\link[=read_feather]{read_feather()}} read those formats, respectively. +} +\details{ +\code{read_arrow()}, a wrapper around \code{read_ipc_stream()} and \code{read_feather()}, +is deprecated. You should explicitly choose +the function that will read the desired IPC format (stream or file) since +a file or \code{InputStream} may contain either. +} +\seealso{ +\code{\link[=read_feather]{read_feather()}} for reading IPC files. \link{RecordBatchReader} for a +lower-level interface. +} diff --git a/src/arrow/r/man/read_json_arrow.Rd b/src/arrow/r/man/read_json_arrow.Rd new file mode 100644 index 000000000..610867ca4 --- /dev/null +++ b/src/arrow/r/man/read_json_arrow.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/json.R +\name{read_json_arrow} +\alias{read_json_arrow} +\title{Read a JSON file} +\usage{ +read_json_arrow( + file, + col_select = NULL, + as_data_frame = TRUE, + schema = NULL, + ... +) +} +\arguments{ +\item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream, +or a \code{FileSystem} with path (\code{SubTreeFileSystem}). +If a file name, a memory-mapped Arrow \link{InputStream} will be opened and +closed when finished; compression will be detected from the file extension +and handled automatically. If an input stream is provided, it will be left +open.} + +\item{col_select}{A character vector of column names to keep, as in the +"select" argument to \code{data.table::fread()}, or a +\link[tidyselect:vars_select]{tidy selection specification} +of columns, as used in \code{dplyr::select()}.} + +\item{as_data_frame}{Should the function return a \code{data.frame} (default) or +an Arrow \link{Table}?} + +\item{schema}{\link{Schema} that describes the table.} + +\item{...}{Additional options passed to \code{JsonTableReader$create()}} +} +\value{ +A \code{data.frame}, or a Table if \code{as_data_frame = FALSE}.
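A minimal round-trip sketch for the IPC stream format described above, using only a temporary file:

library(arrow)
tf <- tempfile()
write_ipc_stream(mtcars, tf)                       # write the "stream" IPC format
df <- read_ipc_stream(tf)                          # a data.frame by default
tbl <- read_ipc_stream(tf, as_data_frame = FALSE)  # or an Arrow Table
unlink(tf)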
+} +\description{ +Read a JSON file using \link{JsonTableReader}. +} +\examples{ +\dontshow{if (arrow_with_json()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +tf <- tempfile() +on.exit(unlink(tf)) +writeLines(' + { "hello": 3.5, "world": false, "yo": "thing" } + { "hello": 3.25, "world": null } + { "hello": 0.0, "world": true, "yo": null } + ', tf, useBytes = TRUE) +df <- read_json_arrow(tf) +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/read_message.Rd b/src/arrow/r/man/read_message.Rd new file mode 100644 index 000000000..444c76c86 --- /dev/null +++ b/src/arrow/r/man/read_message.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/message.R +\name{read_message} +\alias{read_message} +\title{Read a Message from a stream} +\usage{ +read_message(stream) +} +\arguments{ +\item{stream}{an InputStream} +} +\description{ +Read a Message from a stream +} diff --git a/src/arrow/r/man/read_parquet.Rd b/src/arrow/r/man/read_parquet.Rd new file mode 100644 index 000000000..056e86447 --- /dev/null +++ b/src/arrow/r/man/read_parquet.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parquet.R +\name{read_parquet} +\alias{read_parquet} +\title{Read a Parquet file} +\usage{ +read_parquet( + file, + col_select = NULL, + as_data_frame = TRUE, + props = ParquetArrowReaderProperties$create(), + ... +) +} +\arguments{ +\item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream, +or a \code{FileSystem} with path (\code{SubTreeFileSystem}). +If a file name or URI, an Arrow \link{InputStream} will be opened and +closed when finished. If an input stream is provided, it will be left +open.} + +\item{col_select}{A character vector of column names to keep, as in the +"select" argument to \code{data.table::fread()}, or a +\link[tidyselect:vars_select]{tidy selection specification} +of columns, as used in \code{dplyr::select()}.} + +\item{as_data_frame}{Should the function return a \code{data.frame} (default) or +an Arrow \link{Table}?} + +\item{props}{\link{ParquetArrowReaderProperties}} + +\item{...}{Additional arguments passed to \code{ParquetFileReader$create()}} +} +\value{ +A \link[=Table]{arrow::Table}, or a \code{data.frame} if \code{as_data_frame} is +\code{TRUE} (the default). +} +\description{ +'\href{https://parquet.apache.org/}{Parquet}' is a columnar storage file format. +This function enables you to read Parquet files into R. +} +\examples{ +\dontshow{if (arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +tf <- tempfile() +on.exit(unlink(tf)) +write_parquet(mtcars, tf) +df <- read_parquet(tf, col_select = starts_with("d")) +head(df) +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/read_schema.Rd b/src/arrow/r/man/read_schema.Rd new file mode 100644 index 000000000..8738b8aeb --- /dev/null +++ b/src/arrow/r/man/read_schema.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/schema.R +\name{read_schema} +\alias{read_schema} +\title{read a Schema from a stream} +\usage{ +read_schema(stream, ...)
+} +\arguments{ +\item{stream}{a \code{Message}, \code{InputStream}, or \code{Buffer}} + +\item{...}{currently ignored} +} +\value{ +A \link{Schema} +} +\description{ +read a Schema from a stream +} diff --git a/src/arrow/r/man/recycle_scalars.Rd b/src/arrow/r/man/recycle_scalars.Rd new file mode 100644 index 000000000..3d97ecfd7 --- /dev/null +++ b/src/arrow/r/man/recycle_scalars.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/util.R +\name{recycle_scalars} +\alias{recycle_scalars} +\title{Recycle scalar values in a list of arrays} +\usage{ +recycle_scalars(arrays) +} +\arguments{ +\item{arrays}{List of arrays} +} +\value{ +List of arrays with any vector/Scalar/Array/ChunkedArray values of length 1 recycled +} +\description{ +Recycle scalar values in a list of arrays +} +\keyword{internal} diff --git a/src/arrow/r/man/reexports.Rd b/src/arrow/r/man/reexports.Rd new file mode 100644 index 000000000..591158c72 --- /dev/null +++ b/src/arrow/r/man/reexports.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/reexports-bit64.R, R/reexports-tidyselect.R +\docType{import} +\name{reexports} +\alias{reexports} +\alias{print.integer64} +\alias{str.integer64} +\alias{contains} +\alias{select_helpers} +\alias{ends_with} +\alias{everything} +\alias{matches} +\alias{num_range} +\alias{one_of} +\alias{starts_with} +\alias{last_col} +\alias{all_of} +\title{Objects exported from other packages} +\keyword{internal} +\description{ +These objects are imported from other packages. Follow the links +below to see their documentation. + +\describe{ + \item{bit64}{\code{\link[bit64:bit64-package]{print.integer64}}, \code{\link[bit64:bit64-package]{str.integer64}}} + + \item{tidyselect}{\code{\link[tidyselect]{all_of}}, \code{\link[tidyselect:starts_with]{contains}}, \code{\link[tidyselect:starts_with]{ends_with}}, \code{\link[tidyselect]{everything}}, \code{\link[tidyselect:everything]{last_col}}, \code{\link[tidyselect:starts_with]{matches}}, \code{\link[tidyselect:starts_with]{num_range}}, \code{\link[tidyselect]{one_of}}, \code{\link[tidyselect]{starts_with}}} +}} + diff --git a/src/arrow/r/man/repeat_value_as_array.Rd b/src/arrow/r/man/repeat_value_as_array.Rd new file mode 100644 index 000000000..a4937326e --- /dev/null +++ b/src/arrow/r/man/repeat_value_as_array.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/util.R +\name{repeat_value_as_array} +\alias{repeat_value_as_array} +\title{Take an object of length 1 and repeat it.} +\usage{ +repeat_value_as_array(object, n) +} +\arguments{ +\item{object}{Object of length 1 to be repeated - vector, \code{Scalar}, \code{Array}, or \code{ChunkedArray}} + +\item{n}{Number of repetitions} +} +\value{ +\code{Array} of length \code{n} +} +\description{ +Take an object of length 1 and repeat it. +} +\keyword{internal} diff --git a/src/arrow/r/man/s3_bucket.Rd b/src/arrow/r/man/s3_bucket.Rd new file mode 100644 index 000000000..95a086dea --- /dev/null +++ b/src/arrow/r/man/s3_bucket.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filesystem.R +\name{s3_bucket} +\alias{s3_bucket} +\title{Connect to an AWS S3 bucket} +\usage{ +s3_bucket(bucket, ...) 
+} +\arguments{ +\item{bucket}{string S3 bucket name or path} + +\item{...}{Additional connection options, passed to \code{S3FileSystem$create()}} +} +\value{ +A \code{SubTreeFileSystem} containing an \code{S3FileSystem} and the bucket's +relative path. Note that this function's success does not guarantee that you +are authorized to access the bucket's contents. +} +\description{ +\code{s3_bucket()} is a convenience function to create an \code{S3FileSystem} object +that automatically detects the bucket's AWS region and holds onto its +relative path. +} +\examples{ +\dontshow{if (arrow_with_s3()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +bucket <- s3_bucket("ursa-labs-taxi-data") +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/to_arrow.Rd b/src/arrow/r/man/to_arrow.Rd new file mode 100644 index 000000000..e0c31b8dc --- /dev/null +++ b/src/arrow/r/man/to_arrow.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/duckdb.R +\name{to_arrow} +\alias{to_arrow} +\title{Create an Arrow object from others} +\usage{ +to_arrow(.data) +} +\arguments{ +\item{.data}{the object to be converted} +} +\value{ +an \code{arrow_dplyr_query} object, to be used in dplyr pipelines. +} +\description{ +This can be used in pipelines that pass data back and forth between Arrow and +other processes (like DuckDB). +} +\examples{ +\dontshow{if (getFromNamespace("run_duckdb_examples", "arrow")()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +library(dplyr) + +ds <- InMemoryDataset$create(mtcars) + +ds \%>\% + filter(mpg < 30) \%>\% + to_duckdb() \%>\% + group_by(cyl) \%>\% + summarize(mean_mpg = mean(mpg, na.rm = TRUE)) \%>\% + to_arrow() \%>\% + collect() +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/to_duckdb.Rd b/src/arrow/r/man/to_duckdb.Rd new file mode 100644 index 000000000..12186d432 --- /dev/null +++ b/src/arrow/r/man/to_duckdb.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/duckdb.R +\name{to_duckdb} +\alias{to_duckdb} +\title{Create a (virtual) DuckDB table from an Arrow object} +\usage{ +to_duckdb( + .data, + con = arrow_duck_connection(), + table_name = unique_arrow_tablename(), + auto_disconnect = FALSE +) +} +\arguments{ +\item{.data}{the Arrow object (e.g. Dataset, Table) to use for the DuckDB table} + +\item{con}{a DuckDB connection to use (default will create one and store it +in \code{options("arrow_duck_con")})} + +\item{table_name}{a name to use in DuckDB for this object. The default is a +unique string \code{"arrow_"} followed by numbers.} + +\item{auto_disconnect}{should the table be automatically cleaned up when the +resulting object is removed (and garbage collected)? Default: \code{FALSE}} +} +\value{ +A \code{tbl} of the new table in DuckDB +} +\description{ +This will do the necessary configuration to create a (virtual) table in DuckDB +that is backed by the Arrow object given. No data is copied or modified until +\code{collect()} or \code{compute()} are called or a query is run against the table. +} +\details{ +The result is a dbplyr-compatible object that can be used in d(b)plyr pipelines. + +If \code{auto_disconnect = TRUE}, the DuckDB table that is created will be configured +to be unregistered when the \code{tbl} object is garbage collected. This is helpful +if you don't want to have extra table objects in DuckDB after you've finished +using them.
Currently, this cleanup can, however, sometimes lead to hangs if +tables are created and deleted in quick succession, hence the default value +of \code{FALSE}. +} +\examples{ +\dontshow{if (getFromNamespace("run_duckdb_examples", "arrow")()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +library(dplyr) + +ds <- InMemoryDataset$create(mtcars) + +ds \%>\% + filter(mpg < 30) \%>\% + to_duckdb() \%>\% + group_by(cyl) \%>\% + summarize(mean_mpg = mean(mpg, na.rm = TRUE)) +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/type.Rd b/src/arrow/r/man/type.Rd new file mode 100644 index 000000000..d55bbe24b --- /dev/null +++ b/src/arrow/r/man/type.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/type.R +\name{type} +\alias{type} +\title{infer the arrow Array type from an R vector} +\usage{ +type(x) +} +\arguments{ +\item{x}{an R vector} +} +\value{ +an arrow logical type +} +\description{ +infer the arrow Array type from an R vector +} +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +type(1:10) +type(1L:10L) +type(c(1, 1.5, 2)) +type(c("A", "B", "C")) +type(mtcars) +type(Sys.Date()) +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/unify_schemas.Rd b/src/arrow/r/man/unify_schemas.Rd new file mode 100644 index 000000000..50c80c2dd --- /dev/null +++ b/src/arrow/r/man/unify_schemas.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/schema.R +\name{unify_schemas} +\alias{unify_schemas} +\title{Combine and harmonize schemas} +\usage{ +unify_schemas(..., schemas = list(...)) +} +\arguments{ +\item{...}{\link{Schema}s to unify} + +\item{schemas}{Alternatively, a list of schemas} +} +\value{ +A \code{Schema} with the union of fields contained in the inputs, or +\code{NULL} if any of \code{schemas} is \code{NULL} +} +\description{ +Combine and harmonize schemas +} +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +a <- schema(b = double(), c = bool()) +z <- schema(b = double(), k = utf8()) +unify_schemas(a, z) +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/value_counts.Rd b/src/arrow/r/man/value_counts.Rd new file mode 100644 index 000000000..7e64d1550 --- /dev/null +++ b/src/arrow/r/man/value_counts.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compute.R +\name{value_counts} +\alias{value_counts} +\title{\code{table} for Arrow objects} +\usage{ +value_counts(x) +} +\arguments{ +\item{x}{\code{Array} or \code{ChunkedArray}} +} +\value{ +A \code{StructArray} containing "values" (same type as \code{x}) and "counts" +\code{Int64}. +} +\description{ +This function tabulates the values in the array and returns a table of counts.
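A small sketch of inspecting the result in R; this assumes as.vector() converts the returned StructArray to a data.frame, as it does for other Arrow arrays:

library(arrow)
counts <- value_counts(Array$create(mtcars$cyl))
# Assumed conversion: a data.frame with "values" and "counts" columns
df <- as.vector(counts)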
+} +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +cyl_vals <- Array$create(mtcars$cyl) +counts <- value_counts(cyl_vals) +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/write_csv_arrow.Rd b/src/arrow/r/man/write_csv_arrow.Rd new file mode 100644 index 000000000..55a239ca9 --- /dev/null +++ b/src/arrow/r/man/write_csv_arrow.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/csv.R +\name{write_csv_arrow} +\alias{write_csv_arrow} +\title{Write CSV file to disk} +\usage{ +write_csv_arrow(x, sink, include_header = TRUE, batch_size = 1024L) +} +\arguments{ +\item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}} + +\item{sink}{A string file path, URI, or \link{OutputStream}, or path in a file +system (\code{SubTreeFileSystem})} + +\item{include_header}{Whether to write an initial header line with column names} + +\item{batch_size}{Maximum number of rows processed at a time. Default is 1024.} +} +\value{ +The input \code{x}, invisibly. Note that if \code{sink} is an \link{OutputStream}, +the stream will be left open. +} +\description{ +Write CSV file to disk +} +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +tf <- tempfile() +on.exit(unlink(tf)) +write_csv_arrow(mtcars, tf) +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/write_dataset.Rd b/src/arrow/r/man/write_dataset.Rd new file mode 100644 index 000000000..76bbaf7c7 --- /dev/null +++ b/src/arrow/r/man/write_dataset.Rd @@ -0,0 +1,115 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset-write.R +\name{write_dataset} +\alias{write_dataset} +\title{Write a dataset} +\usage{ +write_dataset( + dataset, + path, + format = c("parquet", "feather", "arrow", "ipc", "csv"), + partitioning = dplyr::group_vars(dataset), + basename_template = paste0("part-{i}.", as.character(format)), + hive_style = TRUE, + existing_data_behavior = c("overwrite", "error", "delete_matching"), + ... +) +} +\arguments{ +\item{dataset}{\link{Dataset}, \link{RecordBatch}, \link{Table}, \code{arrow_dplyr_query}, or +\code{data.frame}. If an \code{arrow_dplyr_query}, the query will be evaluated and +the result will be written. This means that you can \code{select()}, \code{filter()}, \code{mutate()}, +etc. to transform the data before it is written if you need to.} + +\item{path}{string path, URI, or \code{SubTreeFileSystem} referencing a directory +to write to (directory will be created if it does not exist)} + +\item{format}{a string identifier of the file format. Default is to use +"parquet" (see \link{FileFormat})} + +\item{partitioning}{\code{Partitioning} or a character vector of columns to +use as partition keys (to be written as path segments). Default is to +use the current \code{group_by()} columns.} + +\item{basename_template}{string template for the names of files to be written. +Must contain \code{"{i}"}, which will be replaced with an autoincremented +integer to generate basenames of datafiles. For example, \code{"part-{i}.feather"} +will yield \verb{"part-0.feather", ...}.} + +\item{hive_style}{logical: write partition segments as Hive-style +(\code{key1=value1/key2=value2/file.ext}) or as just bare values. Default is \code{TRUE}.} + +\item{existing_data_behavior}{The behavior to use when there is already data +in the destination directory. Must be one of overwrite, error, or +delete_matching. 
When this is set to "overwrite" (the default) then any +new files created will overwrite existing files. When this is set to +"error" then the operation will fail if the destination directory is not +empty. When this is set to "delete_matching" then the writer will delete +any existing partitions if data is going to be written to those partitions +and will leave alone partitions to which no data is written.} + +\item{...}{additional format-specific arguments. For available Parquet +options, see \code{\link[=write_parquet]{write_parquet()}}. The available Feather options are +\itemize{ +\item \code{use_legacy_format} logical: write data formatted so that Arrow libraries +versions 0.14 and lower can read it. Default is \code{FALSE}. You can also +enable this by setting the environment variable \code{ARROW_PRE_0_15_IPC_FORMAT=1}. +\item \code{metadata_version}: A string like "V5" or the equivalent integer indicating +the Arrow IPC MetadataVersion. Default (NULL) will use the latest version, +unless the environment variable \code{ARROW_PRE_1_0_METADATA_VERSION=1}, in +which case it will be V4. +\item \code{codec}: A \link{Codec} which will be used to compress body buffers of written +files. Default (NULL) will not compress body buffers. +\item \code{null_fallback}: character to be used in place of missing values (\code{NA} or +\code{NULL}) when using Hive-style partitioning. See \code{\link[=hive_partition]{hive_partition()}}. +}} +} +\value{ +The input \code{dataset}, invisibly. +} +\description{ +This function allows you to write a dataset. By writing to more efficient +binary storage formats, and by specifying relevant partitioning, you can +make it much faster to read and query. +} +\examples{ +\dontshow{if (arrow_with_dataset() & arrow_with_parquet() & requireNamespace("dplyr", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# You can write datasets partitioned by the values in a column (here: "cyl"). +# This creates a structure of the form cyl=X/part-Z.parquet. +one_level_tree <- tempfile() +write_dataset(mtcars, one_level_tree, partitioning = "cyl") +list.files(one_level_tree, recursive = TRUE) + +# You can also partition by the values in multiple columns +# (here: "cyl" and "gear"). +# This creates a structure of the form cyl=X/gear=Y/part-Z.parquet. +two_levels_tree <- tempfile() +write_dataset(mtcars, two_levels_tree, partitioning = c("cyl", "gear")) +list.files(two_levels_tree, recursive = TRUE) + +# In the two previous examples we would have: +# X = {4,6,8}, the number of cylinders. +# Y = {3,4,5}, the number of forward gears. +# Z = {0,1,2}, the number of saved parts, starting from 0. + +# You can obtain the same result as the previous examples using arrow with +# a dplyr pipeline. This will be the same as two_levels_tree above, but the +# output directory will be different. +library(dplyr) +two_levels_tree_2 <- tempfile() +mtcars \%>\% + group_by(cyl, gear) \%>\% + write_dataset(two_levels_tree_2) +list.files(two_levels_tree_2, recursive = TRUE) + +# And you can also turn off the Hive-style directory naming where the column +# name is included with the values by using `hive_style = FALSE`. + +# Write a structure X/Y/part-Z.parquet.
+two_levels_tree_no_hive <- tempfile() +mtcars \%>\% + group_by(cyl, gear) \%>\% + write_dataset(two_levels_tree_no_hive, hive_style = FALSE) +list.files(two_levels_tree_no_hive, recursive = TRUE) +\dontshow{\}) # examplesIf} +} diff --git a/src/arrow/r/man/write_feather.Rd b/src/arrow/r/man/write_feather.Rd new file mode 100644 index 000000000..c6273b61b --- /dev/null +++ b/src/arrow/r/man/write_feather.Rd @@ -0,0 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feather.R +\name{write_feather} +\alias{write_feather} +\title{Write data in the Feather format} +\usage{ +write_feather( + x, + sink, + version = 2, + chunk_size = 65536L, + compression = c("default", "lz4", "uncompressed", "zstd"), + compression_level = NULL +) +} +\arguments{ +\item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}} + +\item{sink}{A string file path, URI, or \link{OutputStream}, or path in a file +system (\code{SubTreeFileSystem})} + +\item{version}{integer Feather file version. Version 2 is the current version. +Version 1 is the more limited legacy format.} + +\item{chunk_size}{For V2 files, the number of rows that each chunk of data +should have in the file. Use a smaller \code{chunk_size} when you need faster +random row access. Default is 64K. This option is not supported for V1.} + +\item{compression}{Name of compression codec to use, if any. Default is +"lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise +"uncompressed". "zstd" is the other available codec and generally has better +compression ratios in exchange for slower read and write performance. +See \code{\link[=codec_is_available]{codec_is_available()}}. This option is not supported for V1.} + +\item{compression_level}{If \code{compression} is "zstd", you may +specify an integer compression level. If omitted, the compression codec's +default compression level is used.} +} +\value{ +The input \code{x}, invisibly. Note that if \code{sink} is an \link{OutputStream}, +the stream will be left open. +} +\description{ +Feather provides binary columnar serialization for data frames. +It is designed to make reading and writing data frames efficient, +and to make sharing data across data analysis languages easy. +This function writes both the original, limited specification of the format +and the version 2 specification, which is the Apache Arrow IPC file format. +} +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +tf <- tempfile() +on.exit(unlink(tf)) +write_feather(mtcars, tf) +\dontshow{\}) # examplesIf} +} +\seealso{ +\link{RecordBatchWriter} for lower-level access to writing Arrow IPC data. + +\link{Schema} for information about schemas and metadata handling. +} diff --git a/src/arrow/r/man/write_ipc_stream.Rd b/src/arrow/r/man/write_ipc_stream.Rd new file mode 100644 index 000000000..2f215f25f --- /dev/null +++ b/src/arrow/r/man/write_ipc_stream.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/deprecated.R, R/ipc_stream.R +\name{write_arrow} +\alias{write_arrow} +\alias{write_ipc_stream} +\title{Write Arrow IPC stream format} +\usage{ +write_arrow(x, sink, ...) + +write_ipc_stream(x, sink, ...)
+} +\arguments{ +\item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}} + +\item{sink}{A string file path, URI, or \link{OutputStream}, or path in a file +system (\code{SubTreeFileSystem})} + +\item{...}{extra parameters passed to \code{write_feather()}.} +} +\value{ +\code{x}, invisibly. +} +\description{ +Apache Arrow defines two formats for \href{https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc}{serializing data for interprocess communication (IPC)}: +a "stream" format and a "file" format, known as Feather. \code{write_ipc_stream()} +and \code{\link[=write_feather]{write_feather()}} write those formats, respectively. +} +\details{ +\code{write_arrow()}, a wrapper around \code{write_ipc_stream()} and \code{write_feather()} +with some nonstandard behavior, is deprecated. You should explicitly choose +the function that will write the desired IPC format (stream or file) since +either can be written to a file or \code{OutputStream}. +} +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +tf <- tempfile() +on.exit(unlink(tf)) +write_ipc_stream(mtcars, tf) +\dontshow{\}) # examplesIf} +} +\seealso{ +\code{\link[=write_feather]{write_feather()}} for writing IPC files. \code{\link[=write_to_raw]{write_to_raw()}} to +serialize data to a buffer. +\link{RecordBatchWriter} for a lower-level interface. +} diff --git a/src/arrow/r/man/write_parquet.Rd b/src/arrow/r/man/write_parquet.Rd new file mode 100644 index 000000000..d7147f7e8 --- /dev/null +++ b/src/arrow/r/man/write_parquet.Rd @@ -0,0 +1,108 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parquet.R +\name{write_parquet} +\alias{write_parquet} +\title{Write Parquet file to disk} +\usage{ +write_parquet( + x, + sink, + chunk_size = NULL, + version = NULL, + compression = default_parquet_compression(), + compression_level = NULL, + use_dictionary = NULL, + write_statistics = NULL, + data_page_size = NULL, + use_deprecated_int96_timestamps = FALSE, + coerce_timestamps = NULL, + allow_truncated_timestamps = FALSE, + properties = NULL, + arrow_properties = NULL +) +} +\arguments{ +\item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}} + +\item{sink}{A string file path, URI, or \link{OutputStream}, or path in a file +system (\code{SubTreeFileSystem})} + +\item{chunk_size}{chunk size in number of rows. If NULL, the total number of rows is used.} + +\item{version}{parquet version, "1.0" or "2.0". Default "1.0". Numeric values +are coerced to character.} + +\item{compression}{compression algorithm. Default "snappy". See details.} + +\item{compression_level}{compression level. Meaning depends on compression algorithm.} + +\item{use_dictionary}{Specify if we should use dictionary encoding. Default \code{TRUE}} + +\item{write_statistics}{Specify if we should write statistics. Default \code{TRUE}} + +\item{data_page_size}{Set a target threshold for the approximate encoded +size of data pages within a column chunk (in bytes). Default 1 MiB.} + +\item{use_deprecated_int96_timestamps}{Write timestamps to INT96 Parquet format. Default \code{FALSE}.} + +\item{coerce_timestamps}{Cast timestamps to a particular resolution. Can be +\code{NULL}, "ms" or "us". Default \code{NULL} (no casting)} + +\item{allow_truncated_timestamps}{Allow loss of data when coercing timestamps to a +particular resolution. E.g.
+\dontshow{\}) # examplesIf}
+}
+\seealso{
+\code{\link[=write_feather]{write_feather()}} for writing IPC files. \code{\link[=write_to_raw]{write_to_raw()}} to
+serialize data to a buffer.
+\link{RecordBatchWriter} for a lower-level interface.
+}
diff --git a/src/arrow/r/man/write_parquet.Rd b/src/arrow/r/man/write_parquet.Rd
new file mode 100644
index 000000000..d7147f7e8
--- /dev/null
+++ b/src/arrow/r/man/write_parquet.Rd
@@ -0,0 +1,108 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/parquet.R
+\name{write_parquet}
+\alias{write_parquet}
+\title{Write Parquet file to disk}
+\usage{
+write_parquet(
+  x,
+  sink,
+  chunk_size = NULL,
+  version = NULL,
+  compression = default_parquet_compression(),
+  compression_level = NULL,
+  use_dictionary = NULL,
+  write_statistics = NULL,
+  data_page_size = NULL,
+  use_deprecated_int96_timestamps = FALSE,
+  coerce_timestamps = NULL,
+  allow_truncated_timestamps = FALSE,
+  properties = NULL,
+  arrow_properties = NULL
+)
+}
+\arguments{
+\item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}}
+
+\item{sink}{A string file path, URI, or \link{OutputStream}, or path in a file
+system (\code{SubTreeFileSystem})}
+
+\item{chunk_size}{Chunk size in number of rows. If \code{NULL}, the total number
+of rows is used.}
+
+\item{version}{Parquet version, "1.0" or "2.0". Default "1.0". Numeric values
+are coerced to character.}
+
+\item{compression}{Compression algorithm. Default "snappy". See details.}
+
+\item{compression_level}{Compression level; meaning depends on the compression
+algorithm.}
+
+\item{use_dictionary}{Specify whether to use dictionary encoding. Default \code{TRUE}}
+
+\item{write_statistics}{Specify whether to write statistics. Default \code{TRUE}}
+
+\item{data_page_size}{Set a target threshold for the approximate encoded
+size of data pages within a column chunk (in bytes). Default 1 MiB.}
+
+\item{use_deprecated_int96_timestamps}{Write timestamps to INT96 Parquet format. Default \code{FALSE}.}
+
+\item{coerce_timestamps}{Cast timestamps to a particular resolution. Can be
+\code{NULL}, "ms" or "us". Default \code{NULL} (no casting)}
+
+\item{allow_truncated_timestamps}{Allow loss of data when coercing timestamps to a
+particular resolution. E.g. if microsecond or nanosecond data is lost when coercing
+to "ms", do not raise an exception.}
+
+\item{properties}{A \code{ParquetWriterProperties} object, used instead of the options
+enumerated in this function's signature. Providing \code{properties} as an argument
+is deprecated; if you need to assemble \code{ParquetWriterProperties} outside
+of \code{write_parquet()}, use \code{ParquetFileWriter} instead.}
+
+\item{arrow_properties}{A \code{ParquetArrowWriterProperties} object. Like
+\code{properties}, this argument is deprecated.}
+}
+\value{
+The input \code{x}, invisibly.
+}
+\description{
+\href{https://parquet.apache.org/}{Parquet} is a columnar storage file format.
+This function enables you to write Parquet files from R.
+}
+\details{
+Due to features of the format, Parquet files cannot be appended to.
+If you want to use the Parquet format but also want the ability to extend
+your dataset, you can write additional Parquet files and then treat
+the whole directory of files as a \link{Dataset} you can query.
+See \code{vignette("dataset", package = "arrow")} for examples of this.
+
+The parameters \code{compression}, \code{compression_level}, \code{use_dictionary} and
+\code{write_statistics} support various patterns (see the sketch in the examples
+below):
+\itemize{
+\item The default \code{NULL} leaves the parameter unspecified, and the C++ library
+uses an appropriate default for each column (defaults listed above)
+\item A single, unnamed value (e.g. a single string for \code{compression}) applies to all columns
+\item An unnamed vector of the same size as the number of columns, to specify a
+value for each column, in positional order
+\item A named vector, to specify the value for the named columns; the default
+value for the setting is used for any column not named
+}
+
+The \code{compression} argument can be any of the following (case insensitive):
+"uncompressed", "snappy", "gzip", "brotli", "zstd", "lz4", "lzo" or "bz2".
+Only "uncompressed" is guaranteed to be available, but "snappy" and "gzip"
+are almost always included. See \code{\link[=codec_is_available]{codec_is_available()}}.
+The default "snappy" is used if available, otherwise "uncompressed". To
+disable compression, set \code{compression = "uncompressed"}.
+Note that "uncompressed" columns may still have dictionary encoding.
+}
+\examples{
+\dontshow{if (arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf1 <- tempfile(fileext = ".parquet")
+write_parquet(data.frame(x = 1:5), tf1)
+
+# using compression
+if (codec_is_available("gzip")) {
+  tf2 <- tempfile(fileext = ".gz.parquet")
+  write_parquet(data.frame(x = 1:5), tf2, compression = "gzip", compression_level = 5)
+}
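+# A sketch of the per-column patterns described in Details (an editorial
+# addition): a named vector applies a setting only to the named column(s),
+# here disabling dictionary encoding for column "x" while "y" keeps the default
+tf3 <- tempfile(fileext = ".parquet")
+write_parquet(data.frame(x = 1:5, y = 6:10), tf3, use_dictionary = c(x = FALSE))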
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/write_to_raw.Rd b/src/arrow/r/man/write_to_raw.Rd
new file mode 100644
index 000000000..a3c6e324b
--- /dev/null
+++ b/src/arrow/r/man/write_to_raw.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ipc_stream.R
+\name{write_to_raw}
+\alias{write_to_raw}
+\title{Write Arrow data to a raw vector}
+\usage{
+write_to_raw(x, format = c("stream", "file"))
+}
+\arguments{
+\item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}}
+
+\item{format}{One of \code{c("stream", "file")}, indicating the IPC format to use}
+}
+\value{
+A \code{raw} vector containing the bytes of the IPC serialized data.
+}
+\description{
+\code{\link[=write_ipc_stream]{write_ipc_stream()}} and \code{\link[=write_feather]{write_feather()}} write data to a sink and return
+the data (\code{data.frame}, \code{RecordBatch}, or \code{Table}) they were given.
+This function wraps those so that you can serialize data to a buffer and
+access that buffer as a \code{raw} vector in R.
+}
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# The default format is "stream"
+mtcars_raw <- write_to_raw(mtcars)
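+# A sketch (an editorial addition): the buffer round-trips, since
+# read_ipc_stream() also accepts a raw vector
+head(read_ipc_stream(mtcars_raw))
+\dontshow{\}) # examplesIf}
+}
-- 
cgit v1.2.3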