% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/record-batch-reader.R
\docType{class}
\name{RecordBatchReader}
\alias{RecordBatchReader}
\alias{RecordBatchStreamReader}
\alias{RecordBatchFileReader}
\title{RecordBatchReader classes}
\description{
Apache Arrow defines two formats for \href{https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc}{serializing data for interprocess communication (IPC)}:
a "stream" format and a "file" format, known as Feather.
\code{RecordBatchStreamReader} and \code{RecordBatchFileReader} are
interfaces for accessing record batches from input sources in those formats,
respectively.

For guidance on how to use these classes, see the examples section.
}
\section{Factory}{


The \code{RecordBatchFileReader$create()} and \code{RecordBatchStreamReader$create()}
factory methods instantiate the object and
take a single argument, named according to the class:
\itemize{
\item \code{file} A character file name, raw vector, or Arrow file connection object
(e.g. \link{RandomAccessFile}).
\item \code{stream} A raw vector, \link{Buffer}, or \link{InputStream}.
}
}

\section{Methods}{

\itemize{
\item \verb{$read_next_batch()}: Returns a \code{RecordBatch}, iterating through the
Reader. If there are no further batches in the Reader, it returns \code{NULL}.
\item \verb{$schema}: Returns a \link{Schema} (active binding)
\item \verb{$batches()}: Returns a list of \code{RecordBatch}es
\item \verb{$read_table()}: Collects the reader's \code{RecordBatch}es into a \link{Table}
\item \verb{$get_batch(i)}: For \code{RecordBatchFileReader}, return a particular batch
by an integer index.
\item \verb{$num_record_batches}: For \code{RecordBatchFileReader}, see how many batches
are in the file (active binding)
}
}

\examples{
\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
tf <- tempfile()
on.exit(unlink(tf))

batch <- record_batch(chickwts)

# This opens a connection to the file in Arrow
file_obj <- FileOutputStream$create(tf)
# Pass that to a RecordBatchWriter to write data conforming to a schema
writer <- RecordBatchFileWriter$create(file_obj, batch$schema)
writer$write(batch)
# You may write additional batches to the stream, provided that they have
# the same schema.
# Call "close" on the writer to indicate end-of-file/stream
writer$close()
# Then, close the connection--closing the IPC message does not close the file
file_obj$close()

# Now, we have a file we can read from. Same pattern: open file connection,
# then pass it to a RecordBatchReader
read_file_obj <- ReadableFile$create(tf)
reader <- RecordBatchFileReader$create(read_file_obj)
# RecordBatchFileReader knows how many batches it has (StreamReader does not)
reader$num_record_batches
# We could consume the Reader by calling $read_next_batch() until all are
# consumed, or we can call $read_table() to pull them all into a Table
tab <- reader$read_table()
# Call as.data.frame to turn that Table into an R data.frame
df <- as.data.frame(tab)
# This should be the same data we sent
all.equal(df, chickwts, check.attributes = FALSE)
# Unlike the Writers, we don't have to close RecordBatchReaders,
# but we do still need to close the file connection
read_file_obj$close()
\dontshow{\}) # examplesIf}
}
\seealso{
\code{\link[=read_ipc_stream]{read_ipc_stream()}} and \code{\link[=read_feather]{read_feather()}}
provide a much simpler interface for reading data from these formats and are
sufficient for many use cases.
}