% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/record-batch-reader.R
\docType{class}
\name{RecordBatchReader}
\alias{RecordBatchReader}
\alias{RecordBatchStreamReader}
\alias{RecordBatchFileReader}
\title{RecordBatchReader classes}
\description{
Apache Arrow defines two formats for \href{https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc}{serializing data for interprocess communication (IPC)}:
a "stream" format and a "file" format, known as Feather.
\code{RecordBatchStreamReader} and \code{RecordBatchFileReader} are
interfaces for accessing record batches from input sources in those formats,
respectively.

For guidance on how to use these classes, see the examples section.
}
\section{Factory}{


The \code{RecordBatchFileReader$create()} and \code{RecordBatchStreamReader$create()}
factory methods instantiate the object and
take a single argument, named according to the class:
\itemize{
\item \code{file} A character file name, raw vector, or Arrow file connection object
(e.g. \link{RandomAccessFile}).
\item \code{stream} A raw vector, \link{Buffer}, or \link{InputStream}.
}
}

\section{Methods}{

\itemize{
\item \verb{$read_next_batch()}: Returns a \code{RecordBatch}, iterating through the
Reader. If there are no further batches in the Reader, it returns \code{NULL}.
\item \verb{$schema}: Returns a \link{Schema} (active binding)
\item \verb{$batches()}: Returns a list of \code{RecordBatch}es
\item \verb{$read_table()}: Collects the reader's \code{RecordBatch}es into a \link{Table}
\item \verb{$get_batch(i)}: For \code{RecordBatchFileReader}, return a particular batch
by an integer index.
\item \verb{$num_record_batches}: For \code{RecordBatchFileReader}, the number of
record batches in the file (active binding)
}
}

\examples{
\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
# Round trip: write a RecordBatch to a temporary Arrow IPC file, read it back,
# and compare the result with the original data frame.
tf <- tempfile()
on.exit(unlink(tf))

# Build the RecordBatch that will be written, from the chickwts data frame
batch <- record_batch(chickwts)

# This opens a connection to the file in Arrow
file_obj <- FileOutputStream$create(tf)
# Pass that to a RecordBatchWriter to write data conforming to a schema
writer <- RecordBatchFileWriter$create(file_obj, batch$schema)
writer$write(batch)
# You may write additional batches to the stream, provided that they have
# the same schema.
# Call "close" on the writer to indicate end-of-file/stream
writer$close()
# Then, close the connection--closing the writer does not close the file
file_obj$close()

# Now, we have a file we can read from. Same pattern: open file connection,
# then pass it to a RecordBatchReader
read_file_obj <- ReadableFile$create(tf)
reader <- RecordBatchFileReader$create(read_file_obj)
# RecordBatchFileReader knows how many batches it has (StreamReader does not)
reader$num_record_batches
# We could consume the Reader by calling $read_next_batch() until all are
# consumed, or we can call $read_table() to pull them all into a Table
tab <- reader$read_table()
# Call as.data.frame to turn that Table into an R data.frame
df <- as.data.frame(tab)
# This should be the same data we sent
all.equal(df, chickwts, check.attributes = FALSE)
# Unlike the Writers, we don't have to close RecordBatchReaders,
# but we do still need to close the file connection
read_file_obj$close()
\dontshow{\}) # examplesIf}
}
\seealso{
\code{\link[=read_ipc_stream]{read_ipc_stream()}} and \code{\link[=read_feather]{read_feather()}} provide a much simpler interface
for reading data from these formats and are sufficient for many use cases.
}