diff options
Diffstat (limited to 'src/arrow/r/man/open_dataset.Rd')
-rw-r--r-- | src/arrow/r/man/open_dataset.Rd | 146 |
1 file changed, 146 insertions, 0 deletions
diff --git a/src/arrow/r/man/open_dataset.Rd b/src/arrow/r/man/open_dataset.Rd new file mode 100644 index 000000000..4d6b492e3 --- /dev/null +++ b/src/arrow/r/man/open_dataset.Rd @@ -0,0 +1,146 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset.R +\name{open_dataset} +\alias{open_dataset} +\title{Open a multi-file dataset} +\usage{ +open_dataset( + sources, + schema = NULL, + partitioning = hive_partition(), + unify_schemas = NULL, + format = c("parquet", "arrow", "ipc", "feather", "csv", "tsv", "text"), + ... +) +} +\arguments{ +\item{sources}{One of: +\itemize{ +\item a string path or URI to a directory containing data files +\item a string path or URI to a single file +\item a character vector of paths or URIs to individual data files +\item a list of \code{Dataset} objects as created by this function +\item a list of \code{DatasetFactory} objects as created by \code{\link[=dataset_factory]{dataset_factory()}}. +} + +When \code{sources} is a vector of file URIs, they must all use the same protocol +and point to files located in the same file system and having the same +format.} + +\item{schema}{\link{Schema} for the \code{Dataset}. If \code{NULL} (the default), the schema +will be inferred from the data sources.} + +\item{partitioning}{When \code{sources} is a directory path/URI, one of: +\itemize{ +\item a \code{Schema}, in which case the file paths relative to \code{sources} will be +parsed, and path segments will be matched with the schema fields. For +example, \code{schema(year = int16(), month = int8())} would create partitions +for file paths like \code{"2019/01/file.parquet"}, \code{"2019/02/file.parquet"}, +etc. 
+\item a character vector that defines the field names corresponding to those +path segments (that is, you're providing the names that would correspond +to a \code{Schema} but the types will be autodetected) +\item a \code{HivePartitioning} or \code{HivePartitioningFactory}, as returned +by \code{\link[=hive_partition]{hive_partition()}} which parses explicit or autodetected fields from +Hive-style path segments +\item \code{NULL} for no partitioning +} + +The default is to autodetect Hive-style partitions. When \code{sources} is not a +directory path/URI, \code{partitioning} is ignored.} + +\item{unify_schemas}{logical: should all data fragments (files, \code{Dataset}s) +be scanned in order to create a unified schema from them? If \code{FALSE}, only +the first fragment will be inspected for its schema. Use this fast path +when you know and trust that all fragments have an identical schema. +The default is \code{FALSE} when creating a dataset from a directory path/URI or +vector of file paths/URIs (because there may be many files and scanning may +be slow) but \code{TRUE} when \code{sources} is a list of \code{Dataset}s (because there +should be few \code{Dataset}s in the list and their \code{Schema}s are already in +memory).} + +\item{format}{A \link{FileFormat} object, or a string identifier of the format of +the files in \code{x}. This argument is ignored when \code{sources} is a list of \code{Dataset} objects. 
+Currently supported values: +\itemize{ +\item "parquet" +\item "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that +only version 2 files are supported +\item "csv"/"text", aliases for the same thing (because comma is the default +delimiter for text files) +\item "tsv", equivalent to passing \verb{format = "text", delimiter = "\\t"} +} + +Default is "parquet", unless a \code{delimiter} is also specified, in which case +it is assumed to be "text".} + +\item{...}{additional arguments passed to \code{dataset_factory()} when \code{sources} +is a directory path/URI or vector of file paths/URIs, otherwise ignored. +These may include \code{format} to indicate the file format, or other +format-specific options.} +} +\value{ +A \link{Dataset} R6 object. Use \code{dplyr} methods on it to query the data, +or call \code{\link[=Scanner]{$NewScan()}} to construct a query directly. +} +\description{ +Arrow Datasets allow you to query against data that has been split across +multiple files. This sharding of data may indicate partitioning, which +can accelerate queries that only touch some partitions (files). Call +\code{open_dataset()} to point to a directory of data files and return a +\code{Dataset}, then use \code{dplyr} methods to query it. +} +\examples{ +\dontshow{if (arrow_with_dataset() & arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# Set up directory for examples +tf <- tempfile() +dir.create(tf) +on.exit(unlink(tf)) + +data <- dplyr::group_by(mtcars, cyl) +write_dataset(data, tf) + +# You can specify a directory containing the files for your dataset and +# open_dataset will scan all files in your directory. +open_dataset(tf) + +# You can also supply a vector of paths +open_dataset(c(file.path(tf, "cyl=4/part-0.parquet"), file.path(tf, "cyl=8/part-0.parquet"))) + +## You must specify the file format if using a format other than parquet. 
+tf2 <- tempfile() +dir.create(tf2) +on.exit(unlink(tf2)) +write_dataset(data, tf2, format = "ipc") +# This line will result in errors when you try to work with the data +\dontrun{ +open_dataset(tf2) +} +# This line will work +open_dataset(tf2, format = "ipc") + +## You can specify file partitioning to include it as a field in your dataset +# Create a temporary directory and write example dataset +tf3 <- tempfile() +dir.create(tf3) +on.exit(unlink(tf3)) +write_dataset(airquality, tf3, partitioning = c("Month", "Day"), hive_style = FALSE) + +# View files - you can see the partitioning means that files have been written +# to folders based on Month/Day values +tf3_files <- list.files(tf3, recursive = TRUE) + +# With no partitioning specified, dataset contains all files but doesn't include +# directory names as field names +open_dataset(tf3) + +# Now that partitioning has been specified, your dataset contains columns for Month and Day +open_dataset(tf3, partitioning = c("Month", "Day")) + +# If you want to specify the data types for your fields, you can pass in a Schema +open_dataset(tf3, partitioning = schema(Month = int8(), Day = int8())) +\dontshow{\}) # examplesIf} +} +\seealso{ +\code{vignette("dataset", package = "arrow")} +} |