summaryrefslogtreecommitdiffstats
path: root/src/arrow/r/man/FileFormat.Rd
blob: cabacc937554891be4e61f74e60fe9983fb5dbb9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dataset-format.R
\name{FileFormat}
\alias{FileFormat}
\alias{ParquetFileFormat}
\alias{IpcFileFormat}
\alias{CsvFileFormat}
\title{Dataset file formats}
\description{
A \code{FileFormat} holds information about how to read and parse the files
included in a \code{Dataset}. There are subclasses corresponding to the supported
file formats (\code{ParquetFileFormat}, \code{IpcFileFormat}, and \code{CsvFileFormat}).
}
\section{Factory}{

\code{FileFormat$create()} takes the following arguments:
\itemize{
\item \code{format}: A string identifier of the file format. Currently supported values:
\itemize{
\item "parquet"
\item "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
only version 2 files are supported
\item "csv"/"text", aliases for the same thing (because comma is the default
delimiter for text files)
\item "tsv", equivalent to passing \verb{format = "text", delimiter = "\\t"}
}
\item \code{...}: Additional format-specific options

\code{format = "parquet"}:
\itemize{
\item \code{dict_columns}: Names of columns which should be read as dictionaries.
\item Any Parquet options from \link{FragmentScanOptions}.
}

\code{format = "text"}: see \link{CsvParseOptions}. Note that you can specify them either
with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
\code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.).
Not all \code{readr} options are currently supported; please file an issue if
you encounter one that \code{arrow} should support. Also, the following options are
supported. From \link{CsvReadOptions}:
\itemize{
\item \code{skip_rows}
\item \code{column_names}
\item \code{autogenerate_column_names}
}

From \link{CsvFragmentScanOptions} (these values can be overridden at scan time):
\itemize{
\item \code{convert_options}: a \link{CsvConvertOptions}
\item \code{block_size}
}
}

It returns the appropriate subclass of \code{FileFormat} (e.g. \code{ParquetFileFormat}).
}

\examples{
\dontshow{if (arrow_with_dataset() && tolower(Sys.info()[["sysname"]]) != "windows") (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
## Semi-colon delimited files
# Create a scratch directory to hold the example data
tf <- tempfile()
dir.create(tf)
# tf is a directory, so unlink() needs recursive = TRUE to remove it and
# the file written below; without it the directory is left behind
on.exit(unlink(tf, recursive = TRUE))
# Write mtcars into that directory as a semicolon-separated text file
write.table(mtcars, file.path(tf, "file1.txt"), sep = ";", row.names = FALSE)

# Create FileFormat object
format <- FileFormat$create(format = "text", delimiter = ";")

# Open the directory as a dataset, parsing its files with the format above
open_dataset(tf, format = format)
\dontshow{\}) # examplesIf}
}