diff options
Diffstat (limited to '')
-rw-r--r-- | src/arrow/r/man/read_delim_arrow.Rd | 218 |
1 files changed, 218 insertions, 0 deletions
diff --git a/src/arrow/r/man/read_delim_arrow.Rd b/src/arrow/r/man/read_delim_arrow.Rd new file mode 100644 index 000000000..7bfda29b8 --- /dev/null +++ b/src/arrow/r/man/read_delim_arrow.Rd @@ -0,0 +1,218 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/csv.R +\name{read_delim_arrow} +\alias{read_delim_arrow} +\alias{read_csv_arrow} +\alias{read_tsv_arrow} +\title{Read a CSV or other delimited file with Arrow} +\usage{ +read_delim_arrow( + file, + delim = ",", + quote = "\\"", + escape_double = TRUE, + escape_backslash = FALSE, + schema = NULL, + col_names = TRUE, + col_types = NULL, + col_select = NULL, + na = c("", "NA"), + quoted_na = TRUE, + skip_empty_rows = TRUE, + skip = 0L, + parse_options = NULL, + convert_options = NULL, + read_options = NULL, + as_data_frame = TRUE, + timestamp_parsers = NULL +) + +read_csv_arrow( + file, + quote = "\\"", + escape_double = TRUE, + escape_backslash = FALSE, + schema = NULL, + col_names = TRUE, + col_types = NULL, + col_select = NULL, + na = c("", "NA"), + quoted_na = TRUE, + skip_empty_rows = TRUE, + skip = 0L, + parse_options = NULL, + convert_options = NULL, + read_options = NULL, + as_data_frame = TRUE, + timestamp_parsers = NULL +) + +read_tsv_arrow( + file, + quote = "\\"", + escape_double = TRUE, + escape_backslash = FALSE, + schema = NULL, + col_names = TRUE, + col_types = NULL, + col_select = NULL, + na = c("", "NA"), + quoted_na = TRUE, + skip_empty_rows = TRUE, + skip = 0L, + parse_options = NULL, + convert_options = NULL, + read_options = NULL, + as_data_frame = TRUE, + timestamp_parsers = NULL +) +} +\arguments{ +\item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream, +or a \code{FileSystem} with path (\code{SubTreeFileSystem}). +If a file name, a memory-mapped Arrow \link{InputStream} will be opened and +closed when finished; compression will be detected from the file extension +and handled automatically. 
If an input stream is provided, it will be left +open.} + +\item{delim}{Single character used to separate fields within a record.} + +\item{quote}{Single character used to quote strings.} + +\item{escape_double}{Does the file escape quotes by doubling them? +i.e. If this option is \code{TRUE}, the value \verb{""""} represents +a single quote, \verb{\\"}.} + +\item{escape_backslash}{Does the file use backslashes to escape special +characters? This is more general than \code{escape_double} as backslashes +can be used to escape the delimiter character, the quote character, or +to add special characters like \verb{\\\\n}.} + +\item{schema}{\link{Schema} that describes the table. If provided, it will be +used to satisfy both \code{col_names} and \code{col_types}.} + +\item{col_names}{If \code{TRUE}, the first row of the input will be used as the +column names and will not be included in the data frame. If \code{FALSE}, column +names will be generated by Arrow, starting with "f0", "f1", ..., "fN". +Alternatively, you can specify a character vector of column names.} + +\item{col_types}{A compact string representation of the column types, or +\code{NULL} (the default) to infer types from the data.} + +\item{col_select}{A character vector of column names to keep, as in the +"select" argument to \code{data.table::fread()}, or a +\link[tidyselect:vars_select]{tidy selection specification} +of columns, as used in \code{dplyr::select()}.} + +\item{na}{A character vector of strings to interpret as missing values.} + +\item{quoted_na}{Should missing values inside quotes be treated as missing +values (the default) or strings. (Note that this is different from +the Arrow C++ default for the corresponding convert option, +\code{strings_can_be_null}.)} + +\item{skip_empty_rows}{Should blank rows be ignored altogether? If +\code{TRUE}, blank rows will not be represented at all. 
If \code{FALSE}, they will be +filled with missings.} + +\item{skip}{Number of lines to skip before reading data.} + +\item{parse_options}{see \link[=CsvReadOptions]{file reader options}. +If given, this overrides any +parsing options provided in other arguments (e.g. \code{delim}, \code{quote}, etc.).} + +\item{convert_options}{see \link[=CsvReadOptions]{file reader options}} + +\item{read_options}{see \link[=CsvReadOptions]{file reader options}} + +\item{as_data_frame}{Should the function return a \code{data.frame} (default) or +an Arrow \link{Table}?} + +\item{timestamp_parsers}{User-defined timestamp parsers. If more than one +parser is specified, the CSV conversion logic will try parsing values +starting from the beginning of this vector. Possible values are: +\itemize{ +\item \code{NULL}: the default, which uses the ISO-8601 parser +\item a character vector of \link[base:strptime]{strptime} parse strings +\item a list of \link{TimestampParser} objects +}} +} +\value{ +A \code{data.frame}, or a Table if \code{as_data_frame = FALSE}. +} +\description{ +These functions use the Arrow C++ CSV reader to read into a \code{data.frame}. +Arrow C++ options have been mapped to argument names that follow those of +\code{readr::read_delim()}, and \code{col_select} was inspired by \code{vroom::vroom()}. +} +\details{ +\code{read_csv_arrow()} and \code{read_tsv_arrow()} are wrappers around +\code{read_delim_arrow()} that specify a delimiter. + +Note that not all \code{readr} options are currently implemented here. Please file +an issue if you encounter one that \code{arrow} should support. + +If you need to control Arrow-specific reader parameters that don't have an +equivalent in \code{readr::read_csv()}, you can either provide them in the +\code{parse_options}, \code{convert_options}, or \code{read_options} arguments, or you can +use \link{CsvTableReader} directly for lower-level access. 
+} +\section{Specifying column types and names}{ + + +By default, the CSV reader will infer the column names and data types from the file, but there +are a few ways you can specify them directly. + +One way is to provide an Arrow \link{Schema} in the \code{schema} argument, +which is an ordered map of column name to type. +When provided, it satisfies both the \code{col_names} and \code{col_types} arguments. +This is good if you know all of this information up front. + +You can also pass a \code{Schema} to the \code{col_types} argument. If you do this, +column names will still be inferred from the file unless you also specify +\code{col_names}. In either case, the column names in the \code{Schema} must match the +data's column names, whether they are explicitly provided or inferred. That +said, this \code{Schema} does not have to reference all columns: those omitted +will have their types inferred. + +Alternatively, you can declare column types by providing the compact string representation +that \code{readr} uses to the \code{col_types} argument. This means you provide a +single string, one character per column, where the characters map to Arrow +types analogously to the \code{readr} type mapping: +\itemize{ +\item "c": \code{utf8()} +\item "i": \code{int32()} +\item "n": \code{float64()} +\item "d": \code{float64()} +\item "l": \code{bool()} +\item "f": \code{dictionary()} +\item "D": \code{date32()} +\item "T": \code{timestamp()} +\item "t": \code{time32()} +\item "_": \code{null()} +\item "-": \code{null()} +\item "?": infer the type from the data +} + +If you use the compact string representation for \code{col_types}, you must also +specify \code{col_names}. + +Regardless of how types are specified, all columns with a \code{null()} type will +be dropped. 
+ +Note that if you are specifying column names, whether by \code{schema} or +\code{col_names}, and the CSV file has a header row that would otherwise be used +to identify column names, you'll need to add \code{skip = 1} to skip that row. +} + +\examples{ +\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +tf <- tempfile() +on.exit(unlink(tf)) +write.csv(mtcars, file = tf) +df <- read_csv_arrow(tf) +dim(df) +# Can select columns +df <- read_csv_arrow(tf, col_select = starts_with("d")) +\dontshow{\}) # examplesIf} +} |