Diffstat (limited to '')
-rw-r--r-- src/arrow/r/.Rbuildignore | 29
-rw-r--r-- src/arrow/r/.gitignore | 28
-rw-r--r-- src/arrow/r/.lintr | 31
-rw-r--r-- src/arrow/r/.styler_excludes.R | 18
-rw-r--r-- src/arrow/r/DESCRIPTION | 122
-rw-r--r-- src/arrow/r/Makefile | 70
-rw-r--r-- src/arrow/r/NAMESPACE | 382
-rw-r--r-- src/arrow/r/NEWS.md | 472
-rw-r--r-- src/arrow/r/R/array-data.R | 53
-rw-r--r-- src/arrow/r/R/array.R | 329
-rw-r--r-- src/arrow/r/R/arrow-datum.R | 266
-rw-r--r-- src/arrow/r/R/arrow-package.R | 351
-rw-r--r-- src/arrow/r/R/arrow-tabular.R | 272
-rw-r--r-- src/arrow/r/R/arrowExports.R | 1801
-rw-r--r-- src/arrow/r/R/buffer.R | 78
-rw-r--r-- src/arrow/r/R/chunked-array.R | 153
-rw-r--r-- src/arrow/r/R/compression.R | 124
-rw-r--r-- src/arrow/r/R/compute.R | 309
-rw-r--r-- src/arrow/r/R/config.R | 44
-rw-r--r-- src/arrow/r/R/csv.R | 644
-rw-r--r-- src/arrow/r/R/dataset-factory.R | 170
-rw-r--r-- src/arrow/r/R/dataset-format.R | 353
-rw-r--r-- src/arrow/r/R/dataset-partition.R | 132
-rw-r--r-- src/arrow/r/R/dataset-scan.R | 262
-rw-r--r-- src/arrow/r/R/dataset-write.R | 144
-rw-r--r-- src/arrow/r/R/dataset.R | 367
-rw-r--r-- src/arrow/r/R/deprecated.R | 40
-rw-r--r-- src/arrow/r/R/dictionary.R | 69
-rw-r--r-- src/arrow/r/R/dplyr-arrange.R | 98
-rw-r--r-- src/arrow/r/R/dplyr-collect.R | 121
-rw-r--r-- src/arrow/r/R/dplyr-count.R | 60
-rw-r--r-- src/arrow/r/R/dplyr-distinct.R | 46
-rw-r--r-- src/arrow/r/R/dplyr-eval.R | 123
-rw-r--r-- src/arrow/r/R/dplyr-filter.R | 91
-rw-r--r-- src/arrow/r/R/dplyr-functions.R | 1087
-rw-r--r-- src/arrow/r/R/dplyr-group-by.R | 86
-rw-r--r-- src/arrow/r/R/dplyr-join.R | 126
-rw-r--r-- src/arrow/r/R/dplyr-mutate.R | 140
-rw-r--r-- src/arrow/r/R/dplyr-select.R | 125
-rw-r--r-- src/arrow/r/R/dplyr-summarize.R | 289
-rw-r--r-- src/arrow/r/R/dplyr.R | 259
-rw-r--r-- src/arrow/r/R/duckdb.R | 165
-rw-r--r-- src/arrow/r/R/enums.R | 178
-rw-r--r-- src/arrow/r/R/expression.R | 240
-rw-r--r-- src/arrow/r/R/feather.R | 219
-rw-r--r-- src/arrow/r/R/field.R | 84
-rw-r--r-- src/arrow/r/R/filesystem.R | 505
-rw-r--r-- src/arrow/r/R/flight.R | 124
-rw-r--r-- src/arrow/r/R/install-arrow.R | 239
-rw-r--r-- src/arrow/r/R/io.R | 295
-rw-r--r-- src/arrow/r/R/ipc_stream.R | 123
-rw-r--r-- src/arrow/r/R/json.R | 102
-rw-r--r-- src/arrow/r/R/memory-pool.R | 61
-rw-r--r-- src/arrow/r/R/message.R | 97
-rw-r--r-- src/arrow/r/R/metadata.R | 210
-rw-r--r-- src/arrow/r/R/parquet.R | 585
-rw-r--r-- src/arrow/r/R/python.R | 225
-rw-r--r-- src/arrow/r/R/query-engine.R | 298
-rw-r--r-- src/arrow/r/R/record-batch-reader.R | 164
-rw-r--r-- src/arrow/r/R/record-batch-writer.R | 194
-rw-r--r-- src/arrow/r/R/record-batch.R | 193
-rw-r--r-- src/arrow/r/R/reexports-bit64.R | 22
-rw-r--r-- src/arrow/r/R/reexports-tidyselect.R | 46
-rw-r--r-- src/arrow/r/R/scalar.R | 101
-rw-r--r-- src/arrow/r/R/schema.R | 330
-rw-r--r-- src/arrow/r/R/table.R | 170
-rw-r--r-- src/arrow/r/R/type.R | 541
-rw-r--r-- src/arrow/r/R/util.R | 195
-rw-r--r-- src/arrow/r/README.md | 335
-rw-r--r-- src/arrow/r/STYLE.md | 38
-rw-r--r-- src/arrow/r/_pkgdown.yml | 185
-rw-r--r-- src/arrow/r/arrow.Rproj | 21
-rwxr-xr-x src/arrow/r/cleanup | 21
-rwxr-xr-x src/arrow/r/configure | 307
-rw-r--r-- src/arrow/r/configure.win | 73
-rw-r--r-- src/arrow/r/cran-comments.md | 10
-rw-r--r-- src/arrow/r/data-raw/codegen.R | 258
-rw-r--r-- src/arrow/r/extra-tests/helpers.R | 36
-rw-r--r-- src/arrow/r/extra-tests/test-read-files.R | 199
-rw-r--r-- src/arrow/r/extra-tests/write-files.R | 41
-rw-r--r-- src/arrow/r/inst/NOTICE.txt | 84
-rwxr-xr-x src/arrow/r/inst/build_arrow_static.sh | 86
-rw-r--r-- src/arrow/r/inst/demo_flight_server.py | 120
-rw-r--r-- src/arrow/r/inst/include/cpp11.hpp | 26
-rw-r--r-- src/arrow/r/inst/include/cpp11/R.hpp | 46
-rw-r--r-- src/arrow/r/inst/include/cpp11/altrep.hpp | 44
-rw-r--r-- src/arrow/r/inst/include/cpp11/as.hpp | 337
-rw-r--r-- src/arrow/r/inst/include/cpp11/attribute_proxy.hpp | 50
-rw-r--r-- src/arrow/r/inst/include/cpp11/data_frame.hpp | 102
-rw-r--r-- src/arrow/r/inst/include/cpp11/declarations.hpp | 54
-rw-r--r-- src/arrow/r/inst/include/cpp11/doubles.hpp | 145
-rw-r--r-- src/arrow/r/inst/include/cpp11/environment.hpp | 75
-rw-r--r-- src/arrow/r/inst/include/cpp11/external_pointer.hpp | 166
-rw-r--r-- src/arrow/r/inst/include/cpp11/function.hpp | 78
-rw-r--r-- src/arrow/r/inst/include/cpp11/integers.hpp | 146
-rw-r--r-- src/arrow/r/inst/include/cpp11/list.hpp | 138
-rw-r--r-- src/arrow/r/inst/include/cpp11/list_of.hpp | 73
-rw-r--r-- src/arrow/r/inst/include/cpp11/logicals.hpp | 143
-rw-r--r-- src/arrow/r/inst/include/cpp11/matrix.hpp | 112
-rw-r--r-- src/arrow/r/inst/include/cpp11/named_arg.hpp | 50
-rw-r--r-- src/arrow/r/inst/include/cpp11/protect.hpp | 372
-rw-r--r-- src/arrow/r/inst/include/cpp11/r_bool.hpp | 76
-rw-r--r-- src/arrow/r/inst/include/cpp11/r_string.hpp | 98
-rw-r--r-- src/arrow/r/inst/include/cpp11/r_vector.hpp | 1009
-rw-r--r-- src/arrow/r/inst/include/cpp11/raws.hpp | 148
-rw-r--r-- src/arrow/r/inst/include/cpp11/sexp.hpp | 85
-rw-r--r-- src/arrow/r/inst/include/cpp11/strings.hpp | 187
-rw-r--r-- src/arrow/r/inst/v0.7.1.parquet | bin 0 -> 4372 bytes
-rwxr-xr-x src/arrow/r/lint.sh | 45
-rw-r--r-- src/arrow/r/man/ArrayData.Rd | 27
-rw-r--r-- src/arrow/r/man/ChunkedArray.Rd | 80
-rw-r--r-- src/arrow/r/man/Codec.Rd | 24
-rw-r--r-- src/arrow/r/man/CsvReadOptions.Rd | 107
-rw-r--r-- src/arrow/r/man/CsvTableReader.Rd | 32
-rw-r--r-- src/arrow/r/man/DataType.Rd | 15
-rw-r--r-- src/arrow/r/man/Dataset.Rd | 81
-rw-r--r-- src/arrow/r/man/DictionaryType.Rd | 15
-rw-r--r-- src/arrow/r/man/Expression.Rd | 18
-rw-r--r-- src/arrow/r/man/FeatherReader.Rd | 33
-rw-r--r-- src/arrow/r/man/Field.Rd | 37
-rw-r--r-- src/arrow/r/man/FileFormat.Rd | 68
-rw-r--r-- src/arrow/r/man/FileInfo.Rd | 28
-rw-r--r-- src/arrow/r/man/FileSelector.Rd | 27
-rw-r--r-- src/arrow/r/man/FileSystem.Rd | 99
-rw-r--r-- src/arrow/r/man/FileWriteOptions.Rd | 8
-rw-r--r-- src/arrow/r/man/FixedWidthType.Rd | 15
-rw-r--r-- src/arrow/r/man/FragmentScanOptions.Rd | 40
-rw-r--r-- src/arrow/r/man/InputStream.Rd | 45
-rw-r--r-- src/arrow/r/man/MemoryPool.Rd | 24
-rw-r--r-- src/arrow/r/man/Message.Rd | 15
-rw-r--r-- src/arrow/r/man/MessageReader.Rd | 15
-rw-r--r-- src/arrow/r/man/OutputStream.Rd | 38
-rw-r--r-- src/arrow/r/man/ParquetArrowReaderProperties.Rd | 29
-rw-r--r-- src/arrow/r/man/ParquetFileReader.Rd | 59
-rw-r--r-- src/arrow/r/man/ParquetFileWriter.Rd | 31
-rw-r--r-- src/arrow/r/man/ParquetWriterProperties.Rd | 49
-rw-r--r-- src/arrow/r/man/Partitioning.Rd | 51
-rw-r--r-- src/arrow/r/man/RecordBatch.Rd | 92
-rw-r--r-- src/arrow/r/man/RecordBatchReader.Rd | 86
-rw-r--r-- src/arrow/r/man/RecordBatchWriter.Rd | 89
-rw-r--r-- src/arrow/r/man/Scalar.Rd | 38
-rw-r--r-- src/arrow/r/man/Scanner.Rd | 51
-rw-r--r-- src/arrow/r/man/Schema.Rd | 86
-rw-r--r-- src/arrow/r/man/Table.Rd | 92
-rw-r--r-- src/arrow/r/man/array.Rd | 107
-rw-r--r-- src/arrow/r/man/arrow-package.Rd | 45
-rw-r--r-- src/arrow/r/man/arrow_available.Rd | 47
-rw-r--r-- src/arrow/r/man/arrow_info.Rd | 17
-rw-r--r-- src/arrow/r/man/buffer.Rd | 44
-rw-r--r-- src/arrow/r/man/call_function.Rd | 51
-rw-r--r-- src/arrow/r/man/cast_options.Rd | 22
-rw-r--r-- src/arrow/r/man/codec_is_available.Rd | 25
-rw-r--r-- src/arrow/r/man/compression.Rd | 31
-rw-r--r-- src/arrow/r/man/contains_regex.Rd | 18
-rw-r--r-- src/arrow/r/man/copy_files.Rd | 35
-rw-r--r-- src/arrow/r/man/cpu_count.Rd | 17
-rw-r--r-- src/arrow/r/man/create_package_with_all_dependencies.Rd | 70
-rw-r--r-- src/arrow/r/man/data-type.Rd | 163
-rw-r--r-- src/arrow/r/man/dataset_factory.Rd | 76
-rw-r--r-- src/arrow/r/man/default_memory_pool.Rd | 15
-rw-r--r-- src/arrow/r/man/dictionary.Rd | 24
-rw-r--r-- src/arrow/r/man/enums.Rd | 88
-rw-r--r-- src/arrow/r/man/flight_connect.Rd | 21
-rw-r--r-- src/arrow/r/man/flight_get.Rd | 19
-rw-r--r-- src/arrow/r/man/flight_put.Rd | 25
-rw-r--r-- src/arrow/r/man/get_stringr_pattern_options.Rd | 22
-rw-r--r-- src/arrow/r/man/hive_partition.Rd | 35
-rw-r--r-- src/arrow/r/man/install_arrow.Rd | 61
-rw-r--r-- src/arrow/r/man/install_pyarrow.Rd | 22
-rw-r--r-- src/arrow/r/man/io_thread_count.Rd | 17
-rw-r--r-- src/arrow/r/man/list_compute_functions.Rd | 45
-rw-r--r-- src/arrow/r/man/list_flights.Rd | 23
-rw-r--r-- src/arrow/r/man/load_flight_server.Rd | 22
-rw-r--r-- src/arrow/r/man/make_readable_file.Rd | 29
-rw-r--r-- src/arrow/r/man/map_batches.Rd | 30
-rw-r--r-- src/arrow/r/man/match_arrow.Rd | 53
-rw-r--r-- src/arrow/r/man/mmap_create.Rd | 19
-rw-r--r-- src/arrow/r/man/mmap_open.Rd | 16
-rw-r--r-- src/arrow/r/man/open_dataset.Rd | 146
-rw-r--r-- src/arrow/r/man/read_delim_arrow.Rd | 218
-rw-r--r-- src/arrow/r/man/read_feather.Rd | 50
-rw-r--r-- src/arrow/r/man/read_ipc_stream.Rd | 42
-rw-r--r-- src/arrow/r/man/read_json_arrow.Rd | 52
-rw-r--r-- src/arrow/r/man/read_message.Rd | 14
-rw-r--r-- src/arrow/r/man/read_parquet.Rd | 50
-rw-r--r-- src/arrow/r/man/read_schema.Rd | 19
-rw-r--r-- src/arrow/r/man/recycle_scalars.Rd | 18
-rw-r--r-- src/arrow/r/man/reexports.Rd | 29
-rw-r--r-- src/arrow/r/man/repeat_value_as_array.Rd | 20
-rw-r--r-- src/arrow/r/man/s3_bucket.Rd | 28
-rw-r--r-- src/arrow/r/man/to_arrow.Rd | 33
-rw-r--r-- src/arrow/r/man/to_duckdb.Rd | 56
-rw-r--r-- src/arrow/r/man/type.Rd | 27
-rw-r--r-- src/arrow/r/man/unify_schemas.Rd | 27
-rw-r--r-- src/arrow/r/man/value_counts.Rd | 24
-rw-r--r-- src/arrow/r/man/write_csv_arrow.Rd | 32
-rw-r--r-- src/arrow/r/man/write_dataset.Rd | 115
-rw-r--r-- src/arrow/r/man/write_feather.Rd | 61
-rw-r--r-- src/arrow/r/man/write_ipc_stream.Rd | 45
-rw-r--r-- src/arrow/r/man/write_parquet.Rd | 108
-rw-r--r-- src/arrow/r/man/write_to_raw.Rd | 28
-rw-r--r-- src/arrow/r/pkgdown/extra.js | 65
-rw-r--r-- src/arrow/r/src/.clang-format | 20
-rw-r--r-- src/arrow/r/src/.gitignore | 3
-rw-r--r-- src/arrow/r/src/Makevars.in | 29
-rw-r--r-- src/arrow/r/src/Makevars.ucrt | 19
-rw-r--r-- src/arrow/r/src/RTasks.cpp | 74
-rw-r--r-- src/arrow/r/src/altrep.cpp | 690
-rw-r--r-- src/arrow/r/src/array.cpp | 286
-rw-r--r-- src/arrow/r/src/array_to_vector.cpp | 1317
-rw-r--r-- src/arrow/r/src/arraydata.cpp | 49
-rw-r--r-- src/arrow/r/src/arrowExports.cpp | 7636
-rw-r--r-- src/arrow/r/src/arrow_cpp11.h | 382
-rw-r--r-- src/arrow/r/src/arrow_types.h | 274
-rw-r--r-- src/arrow/r/src/arrow_vctrs.h | 22
-rw-r--r-- src/arrow/r/src/buffer.cpp | 71
-rw-r--r-- src/arrow/r/src/chunkedarray.cpp | 139
-rw-r--r-- src/arrow/r/src/compression.cpp | 56
-rw-r--r-- src/arrow/r/src/compute-exec.cpp | 281
-rw-r--r-- src/arrow/r/src/compute.cpp | 576
-rw-r--r-- src/arrow/r/src/config.cpp | 37
-rw-r--r-- src/arrow/r/src/csv.cpp | 205
-rw-r--r-- src/arrow/r/src/dataset.cpp | 543
-rw-r--r-- src/arrow/r/src/datatype.cpp | 426
-rw-r--r-- src/arrow/r/src/expression.cpp | 102
-rw-r--r-- src/arrow/r/src/feather.cpp | 87
-rw-r--r-- src/arrow/r/src/field.cpp | 56
-rw-r--r-- src/arrow/r/src/filesystem.cpp | 329
-rw-r--r-- src/arrow/r/src/imports.cpp | 43
-rw-r--r-- src/arrow/r/src/io.cpp | 181
-rw-r--r-- src/arrow/r/src/json.cpp | 67
-rw-r--r-- src/arrow/r/src/memorypool.cpp | 92
-rw-r--r-- src/arrow/r/src/message.cpp | 105
-rw-r--r-- src/arrow/r/src/nameof.h | 93
-rw-r--r-- src/arrow/r/src/parquet.cpp | 326
-rw-r--r-- src/arrow/r/src/py-to-r.cpp | 117
-rw-r--r-- src/arrow/r/src/r_task_group.h | 55
-rw-r--r-- src/arrow/r/src/r_to_arrow.cpp | 1439
-rw-r--r-- src/arrow/r/src/recordbatch.cpp | 309
-rw-r--r-- src/arrow/r/src/recordbatchreader.cpp | 122
-rw-r--r-- src/arrow/r/src/recordbatchwriter.cpp | 67
-rw-r--r-- src/arrow/r/src/scalar.cpp | 97
-rw-r--r-- src/arrow/r/src/schema.cpp | 146
-rw-r--r-- src/arrow/r/src/symbols.cpp | 90
-rw-r--r-- src/arrow/r/src/table.cpp | 286
-rw-r--r-- src/arrow/r/src/threadpool.cpp | 59
-rw-r--r-- src/arrow/r/src/type_infer.cpp | 202
-rw-r--r-- src/arrow/r/tests/testthat.R | 27
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet | bin 0 -> 7862 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_lz4.feather | bin 0 -> 1650 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_uncompressed.feather | bin 0 -> 1354 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_zstd.feather | bin 0 -> 1626 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1.parquet | bin 0 -> 3603 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_lz4.feather | bin 0 -> 2858 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_uncompressed.feather | bin 0 -> 2626 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_zstd.feather | bin 0 -> 2842 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0.parquet | bin 0 -> 3965 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_lz4.feather | bin 0 -> 3162 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_uncompressed.feather | bin 0 -> 2930 bytes
-rw-r--r-- src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_zstd.feather | bin 0 -> 3146 bytes
-rw-r--r-- src/arrow/r/tests/testthat/helper-arrow.R | 69
-rw-r--r-- src/arrow/r/tests/testthat/helper-data.R | 191
-rw-r--r-- src/arrow/r/tests/testthat/helper-expectation.R | 320
-rw-r--r-- src/arrow/r/tests/testthat/helper-parquet.R | 29
-rw-r--r-- src/arrow/r/tests/testthat/helper-roundtrip.R | 44
-rw-r--r-- src/arrow/r/tests/testthat/helper-skip.R | 81
-rw-r--r-- src/arrow/r/tests/testthat/latin1.R | 76
-rw-r--r-- src/arrow/r/tests/testthat/test-Array.R | 963
-rw-r--r-- src/arrow/r/tests/testthat/test-RecordBatch.R | 690
-rw-r--r-- src/arrow/r/tests/testthat/test-Table.R | 549
-rw-r--r-- src/arrow/r/tests/testthat/test-altrep.R | 243
-rw-r--r-- src/arrow/r/tests/testthat/test-array-data.R | 33
-rw-r--r-- src/arrow/r/tests/testthat/test-arrow-info.R | 23
-rw-r--r-- src/arrow/r/tests/testthat/test-arrow.R | 78
-rw-r--r-- src/arrow/r/tests/testthat/test-backwards-compatibility.R | 121
-rw-r--r-- src/arrow/r/tests/testthat/test-buffer-reader.R | 38
-rw-r--r-- src/arrow/r/tests/testthat/test-buffer.R | 97
-rw-r--r-- src/arrow/r/tests/testthat/test-chunked-array.R | 468
-rw-r--r-- src/arrow/r/tests/testthat/test-chunked-array.txt | 103
-rw-r--r-- src/arrow/r/tests/testthat/test-compressed.R | 73
-rw-r--r-- src/arrow/r/tests/testthat/test-compute-aggregate.R | 434
-rw-r--r-- src/arrow/r/tests/testthat/test-compute-arith.R | 129
-rw-r--r-- src/arrow/r/tests/testthat/test-compute-no-bindings.R | 201
-rw-r--r-- src/arrow/r/tests/testthat/test-compute-sort.R | 155
-rw-r--r-- src/arrow/r/tests/testthat/test-compute-vector.R | 133
-rw-r--r-- src/arrow/r/tests/testthat/test-csv.R | 357
-rw-r--r-- src/arrow/r/tests/testthat/test-data-type.R | 429
-rw-r--r-- src/arrow/r/tests/testthat/test-dataset-csv.R | 290
-rw-r--r-- src/arrow/r/tests/testthat/test-dataset-dplyr.R | 340
-rw-r--r-- src/arrow/r/tests/testthat/test-dataset-uri.R | 123
-rw-r--r-- src/arrow/r/tests/testthat/test-dataset-write.R | 454
-rw-r--r-- src/arrow/r/tests/testthat/test-dataset.R | 696
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-arrange.R | 205
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-collapse.R | 235
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-count.R | 92
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-distinct.R | 104
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-filter.R | 412
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-funcs-conditional.R | 409
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-funcs-datetime.R | 304
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-funcs-math.R | 309
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-funcs-string.R | 1399
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-funcs-type.R | 627
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-group-by.R | 158
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-join.R | 175
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-mutate.R | 522
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-query.R | 296
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-select.R | 146
-rw-r--r-- src/arrow/r/tests/testthat/test-dplyr-summarize.R | 881
-rw-r--r-- src/arrow/r/tests/testthat/test-duckdb.R | 217
-rw-r--r-- src/arrow/r/tests/testthat/test-expression.R | 128
-rw-r--r-- src/arrow/r/tests/testthat/test-feather.R | 256
-rw-r--r-- src/arrow/r/tests/testthat/test-field.R | 67
-rw-r--r-- src/arrow/r/tests/testthat/test-filesystem.R | 178
-rw-r--r-- src/arrow/r/tests/testthat/test-install-arrow.R | 37
-rw-r--r-- src/arrow/r/tests/testthat/test-json.R | 255
-rw-r--r-- src/arrow/r/tests/testthat/test-memory-pool.R | 26
-rw-r--r-- src/arrow/r/tests/testthat/test-message-reader.R | 85
-rw-r--r-- src/arrow/r/tests/testthat/test-message.R | 63
-rw-r--r-- src/arrow/r/tests/testthat/test-metadata.R | 369
-rw-r--r-- src/arrow/r/tests/testthat/test-na-omit.R | 94
-rw-r--r-- src/arrow/r/tests/testthat/test-parquet.R | 274
-rw-r--r-- src/arrow/r/tests/testthat/test-python-flight.R | 62
-rw-r--r-- src/arrow/r/tests/testthat/test-python.R | 145
-rw-r--r-- src/arrow/r/tests/testthat/test-read-record-batch.R | 78
-rw-r--r-- src/arrow/r/tests/testthat/test-read-write.R | 125
-rw-r--r-- src/arrow/r/tests/testthat/test-record-batch-reader.R | 141
-rw-r--r-- src/arrow/r/tests/testthat/test-s3-minio.R | 228
-rw-r--r-- src/arrow/r/tests/testthat/test-s3.R | 55
-rw-r--r-- src/arrow/r/tests/testthat/test-scalar.R | 112
-rw-r--r-- src/arrow/r/tests/testthat/test-schema.R | 220
-rw-r--r-- src/arrow/r/tests/testthat/test-thread-pool.R | 33
-rw-r--r-- src/arrow/r/tests/testthat/test-type.R | 211
-rw-r--r-- src/arrow/r/tests/testthat/test-utf.R | 24
-rwxr-xr-x src/arrow/r/tools/autobrew | 66
-rw-r--r-- src/arrow/r/tools/nixlibs.R | 601
-rw-r--r-- src/arrow/r/tools/ubsan.supp | 18
-rw-r--r-- src/arrow/r/tools/winlibs.R | 65
-rw-r--r-- src/arrow/r/vignettes/arrow.Rmd | 225
-rw-r--r-- src/arrow/r/vignettes/dataset.Rmd | 421
-rw-r--r-- src/arrow/r/vignettes/developing.Rmd | 605
-rw-r--r-- src/arrow/r/vignettes/flight.Rmd | 87
-rw-r--r-- src/arrow/r/vignettes/fs.Rmd | 130
-rw-r--r-- src/arrow/r/vignettes/install.Rmd | 448
-rw-r--r-- src/arrow/r/vignettes/python.Rmd | 131
-rw-r--r-- src/arrow/ruby/Gemfile | 22
-rw-r--r-- src/arrow/ruby/README.md | 36
-rw-r--r-- src/arrow/ruby/Rakefile | 56
-rw-r--r-- src/arrow/ruby/red-arrow-cuda/.gitignore | 19
-rw-r--r-- src/arrow/ruby/red-arrow-cuda/Gemfile | 24
-rw-r--r-- src/arrow/ruby/red-arrow-cuda/LICENSE.txt | 202
-rw-r--r-- src/arrow/ruby/red-arrow-cuda/NOTICE.txt | 2
-rw-r--r-- src/arrow/ruby/red-arrow-cuda/README.md | 60
-rw-r--r-- src/arrow/ruby/red-arrow-cuda/Rakefile | 41
-rw-r--r-- src/arrow/ruby/red-arrow-cuda/dependency-check/Rakefile | 47
-rw-r--r-- src/arrow/ruby/red-arrow-cuda/lib/arrow-cuda.rb | 29
-rw-r--r-- src/arrow/ruby/red-arrow-cuda/lib/arrow-cuda/device-manager.rb | 25
-rw-r--r-- src/arrow/ruby/red-arrow-cuda/lib/arrow-cuda/loader.rb | 35
-rw-r--r-- src/arrow/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb | 26
-rw-r--r-- src/arrow/ruby/red-arrow-cuda/red-arrow-cuda.gemspec | 51
-rw-r--r-- src/arrow/ruby/red-arrow-cuda/test/helper.rb | 20
-rwxr-xr-x src/arrow/ruby/red-arrow-cuda/test/run-test.rb | 50
-rw-r--r-- src/arrow/ruby/red-arrow-cuda/test/test-cuda.rb | 38
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/.gitignore | 19
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/Gemfile | 24
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/LICENSE.txt | 202
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/NOTICE.txt | 2
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/README.md | 50
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/Rakefile | 41
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/dependency-check/Rakefile | 47
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset.rb | 29
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-loadable.rb | 61
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-savable.rb | 69
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/dataset.rb | 29
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-format.rb | 59
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-system-dataset-factory.rb | 39
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb | 39
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb | 26
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/red-arrow-dataset.gemspec | 51
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/test/helper.rb | 22
-rwxr-xr-x src/arrow/ruby/red-arrow-dataset/test/run-test.rb | 50
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/test/test-arrow-table.rb | 80
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/test/test-file-system-dataset.rb | 38
-rw-r--r-- src/arrow/ruby/red-arrow-flight/.gitignore | 18
-rw-r--r-- src/arrow/ruby/red-arrow-flight/Gemfile | 24
-rw-r--r-- src/arrow/ruby/red-arrow-flight/LICENSE.txt | 202
-rw-r--r-- src/arrow/ruby/red-arrow-flight/NOTICE.txt | 2
-rw-r--r-- src/arrow/ruby/red-arrow-flight/README.md | 50
-rw-r--r-- src/arrow/ruby/red-arrow-flight/Rakefile | 41
-rw-r--r-- src/arrow/ruby/red-arrow-flight/dependency-check/Rakefile | 47
-rw-r--r-- src/arrow/ruby/red-arrow-flight/lib/arrow-flight.rb | 29
-rw-r--r-- src/arrow/ruby/red-arrow-flight/lib/arrow-flight/call-options.rb | 35
-rw-r--r-- src/arrow/ruby/red-arrow-flight/lib/arrow-flight/client-options.rb | 35
-rw-r--r-- src/arrow/ruby/red-arrow-flight/lib/arrow-flight/loader.rb | 44
-rw-r--r-- src/arrow/ruby/red-arrow-flight/lib/arrow-flight/location.rb | 31
-rw-r--r-- src/arrow/ruby/red-arrow-flight/lib/arrow-flight/server-options.rb | 41
-rw-r--r-- src/arrow/ruby/red-arrow-flight/lib/arrow-flight/ticket.rb | 32
-rw-r--r-- src/arrow/ruby/red-arrow-flight/lib/arrow-flight/version.rb | 26
-rw-r--r-- src/arrow/ruby/red-arrow-flight/red-arrow-flight.gemspec | 52
-rw-r--r-- src/arrow/ruby/red-arrow-flight/test/helper.rb | 22
-rw-r--r-- src/arrow/ruby/red-arrow-flight/test/helper/info-generator.rb | 57
-rw-r--r-- src/arrow/ruby/red-arrow-flight/test/helper/server.rb | 39
-rwxr-xr-x src/arrow/ruby/red-arrow-flight/test/run-test.rb | 50
-rw-r--r-- src/arrow/ruby/red-arrow-flight/test/test-client.rb | 46
-rw-r--r-- src/arrow/ruby/red-arrow-flight/test/test-location.rb | 26
-rw-r--r-- src/arrow/ruby/red-arrow-flight/test/test-ticket.rb | 26
-rw-r--r-- src/arrow/ruby/red-arrow/.gitignore | 23
-rw-r--r-- src/arrow/ruby/red-arrow/.yardopts | 6
-rw-r--r-- src/arrow/ruby/red-arrow/Gemfile | 22
-rw-r--r-- src/arrow/ruby/red-arrow/LICENSE.txt | 202
-rw-r--r-- src/arrow/ruby/red-arrow/NOTICE.txt | 2
-rw-r--r-- src/arrow/ruby/red-arrow/README.md | 75
-rw-r--r-- src/arrow/ruby/red-arrow/Rakefile | 100
-rw-r--r-- src/arrow/ruby/red-arrow/benchmark/raw-records/boolean.yml | 65
-rw-r--r-- src/arrow/ruby/red-arrow/benchmark/raw-records/decimal128.yml | 68
-rw-r--r-- src/arrow/ruby/red-arrow/benchmark/raw-records/dictionary.yml | 75
-rw-r--r-- src/arrow/ruby/red-arrow/benchmark/raw-records/int64.yml | 67
-rw-r--r-- src/arrow/ruby/red-arrow/benchmark/raw-records/list.yml | 70
-rw-r--r-- src/arrow/ruby/red-arrow/benchmark/raw-records/string.yml | 65
-rw-r--r-- src/arrow/ruby/red-arrow/benchmark/raw-records/timestamp.yml | 75
-rw-r--r-- src/arrow/ruby/red-arrow/benchmark/values/boolean.yml | 37
-rw-r--r-- src/arrow/ruby/red-arrow/benchmark/values/decimal128.yml | 38
-rw-r--r-- src/arrow/ruby/red-arrow/benchmark/values/dictionary.yml | 46
-rw-r--r-- src/arrow/ruby/red-arrow/benchmark/values/int64.yml | 37
-rw-r--r-- src/arrow/ruby/red-arrow/benchmark/values/list.yml | 44
-rw-r--r-- src/arrow/ruby/red-arrow/benchmark/values/string.yml | 38
-rw-r--r-- src/arrow/ruby/red-arrow/benchmark/values/timestamp.yml | 49
-rw-r--r-- src/arrow/ruby/red-arrow/doc/text/development.md | 34
-rwxr-xr-x src/arrow/ruby/red-arrow/example/read-file.rb | 36
-rwxr-xr-x src/arrow/ruby/red-arrow/example/read-stream.rb | 36
-rwxr-xr-x src/arrow/ruby/red-arrow/example/write-file.rb | 63
-rwxr-xr-x src/arrow/ruby/red-arrow/example/write-stream.rb | 63
-rw-r--r-- src/arrow/ruby/red-arrow/ext/arrow/arrow.cpp | 84
-rw-r--r-- src/arrow/ruby/red-arrow/ext/arrow/converters.cpp | 47
-rw-r--r-- src/arrow/ruby/red-arrow/ext/arrow/converters.hpp | 795
-rw-r--r-- src/arrow/ruby/red-arrow/ext/arrow/extconf.rb | 76
-rw-r--r-- src/arrow/ruby/red-arrow/ext/arrow/memory-view.cpp | 311
-rw-r--r-- src/arrow/ruby/red-arrow/ext/arrow/memory-view.hpp | 26
-rw-r--r-- src/arrow/ruby/red-arrow/ext/arrow/raw-records.cpp | 184
-rw-r--r-- src/arrow/ruby/red-arrow/ext/arrow/red-arrow.hpp | 95
-rw-r--r-- src/arrow/ruby/red-arrow/ext/arrow/values.cpp | 157
-rw-r--r-- src/arrow/ruby/red-arrow/image/red-arrow.png | bin 0 -> 7165 bytes
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow.rb | 30
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/aggregate-node-options.rb | 35
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/aggregation.rb | 46
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/array-builder.rb | 214
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/array.rb | 234
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/bigdecimal-extension.rb | 28
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/binary-dictionary-array-builder.rb | 27
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/block-closable.rb | 35
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/buffer.rb | 32
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/chunked-array.rb | 91
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/column-containable.rb | 147
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/column.rb | 76
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/compression-type.rb | 37
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/constructor-arguments-gc-guardable.rb | 25
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/csv-loader.rb | 384
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/csv-read-options.rb | 43
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/data-type.rb | 198
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/date32-array-builder.rb | 32
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/date32-array.rb | 30
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/date64-array-builder.rb | 33
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/date64-array.rb | 29
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/datum.rb | 100
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/decimal128-array-builder.rb | 58
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/decimal128-array.rb | 24
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/decimal128-data-type.rb | 71
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/decimal128.rb | 60
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/decimal256-array-builder.rb | 61
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/decimal256-array.rb | 25
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/decimal256-data-type.rb | 73
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/decimal256.rb | 60
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/dense-union-data-type.rb | 90
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/dictionary-array.rb | 24
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/dictionary-data-type.rb | 117
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/equal-options.rb | 38
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/expression.rb | 48
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/field-containable.rb | 38
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/field.rb | 118
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/file-output-stream.rb | 34
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/file-system.rb | 34
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/fixed-size-binary-array-builder.rb | 38
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/fixed-size-binary-array.rb | 26
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/generic-filterable.rb | 43
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/generic-takeable.rb | 38
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/group.rb | 164
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/list-array-builder.rb | 96
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/list-data-type.rb | 118
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/loader.rb | 216
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/map-array-builder.rb | 109
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/map-array.rb | 26
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/map-data-type.rb | 89
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/null-array-builder.rb | 26
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/null-array.rb | 24
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/path-extension.rb | 45
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/raw-table-converter.rb | 47
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/record-batch-builder.rb | 114
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/record-batch-file-reader.rb | 28
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/record-batch-iterator.rb | 22
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/record-batch-reader.rb | 41
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/record-batch-stream-reader.rb | 30
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/record-batch.rb | 75
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/record-containable.rb | 38
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/record.rb | 60
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/rolling-window.rb | 48
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/scalar.rb | 32
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/schema.rb | 100
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/slicer.rb | 355
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/sort-key.rb | 193
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/sort-options.rb | 109
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/source-node-options.rb | 32
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/sparse-union-data-type.rb | 90
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/string-dictionary-array-builder.rb | 27
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/struct-array-builder.rb | 146
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/struct-array.rb | 68
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/struct-data-type.rb | 128
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/symbol-values-appendable.rb | 34
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/table-concatenate-options.rb | 36
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/table-formatter.rb | 190
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/table-list-formatter.rb | 41
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/table-loader.rb | 225
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/table-saver.rb | 195
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/table-table-formatter.rb | 49
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/table.rb | 519
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/tensor.rb | 24
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/time.rb | 159
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/time32-array-builder.rb | 49
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/time32-array.rb | 28
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/time32-data-type.rb | 61
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/time64-array-builder.rb | 49
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/time64-array.rb | 28
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/time64-data-type.rb | 61
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/timestamp-array-builder.rb | 65
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/timestamp-array.rb | 42
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/timestamp-data-type.rb | 57
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/version.rb | 26
-rw-r--r-- src/arrow/ruby/red-arrow/lib/arrow/writable.rb | 22
-rw-r--r-- src/arrow/ruby/red-arrow/red-arrow.gemspec | 67
-rw-r--r-- src/arrow/ruby/red-arrow/test/fixture/TestOrcFile.test1.orc | bin 0 -> 1711 bytes
-rw-r--r-- src/arrow/ruby/red-arrow/test/fixture/float-integer.csv | 20
-rw-r--r-- src/arrow/ruby/red-arrow/test/fixture/integer-float.csv | 20
-rw-r--r-- src/arrow/ruby/red-arrow/test/fixture/null-with-double-quote.csv | 20
-rw-r--r-- src/arrow/ruby/red-arrow/test/fixture/null-without-double-quote.csv | 20
-rw-r--r-- src/arrow/ruby/red-arrow/test/fixture/with-header-float.csv | 20
-rw-r--r-- src/arrow/ruby/red-arrow/test/fixture/with-header.csv | 20
-rw-r--r-- src/arrow/ruby/red-arrow/test/fixture/without-header-float.csv | 19
-rw-r--r-- src/arrow/ruby/red-arrow/test/fixture/without-header.csv | 19
-rw-r--r-- src/arrow/ruby/red-arrow/test/helper.rb | 28
-rw-r--r-- src/arrow/ruby/red-arrow/test/helper/fixture.rb | 28
-rw-r--r-- src/arrow/ruby/red-arrow/test/helper/omittable.rb | 36
-rw-r--r-- src/arrow/ruby/red-arrow/test/raw-records/test-basic-arrays.rb | 365
-rw-r--r-- src/arrow/ruby/red-arrow/test/raw-records/test-dense-union-array.rb | 494
-rw-r--r-- src/arrow/ruby/red-arrow/test/raw-records/test-list-array.rb | 571
-rw-r--r-- src/arrow/ruby/red-arrow/test/raw-records/test-map-array.rb | 441
-rw-r--r-- src/arrow/ruby/red-arrow/test/raw-records/test-multiple-columns.rb | 65
-rw-r--r-- src/arrow/ruby/red-arrow/test/raw-records/test-sparse-union-array.rb | 484
-rw-r--r-- src/arrow/ruby/red-arrow/test/raw-records/test-struct-array.rb | 485
-rw-r--r-- src/arrow/ruby/red-arrow/test/raw-records/test-table.rb | 47
-rwxr-xr-x src/arrow/ruby/red-arrow/test/run-test.rb | 71
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-array-builder.rb | 136
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-array.rb | 325
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-bigdecimal.rb | 40
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-binary-dictionary-array-builder.rb | 103
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-boolean-scalar.rb | 26
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-buffer.rb | 49
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-chunked-array.rb | 183
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-column.rb | 92
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-csv-loader.rb | 250
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-data-type.rb | 83
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-date32-array.rb | 24
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-date64-array.rb | 25
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-decimal128-array-builder.rb | 112
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-decimal128-array.rb | 38
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-decimal128-data-type.rb | 31
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-decimal128.rb | 102
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-decimal256-array-builder.rb | 112
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-decimal256-array.rb | 38
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-decimal256-data-type.rb | 31
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-decimal256.rb | 102
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-dense-union-data-type.rb | 41
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-dictionary-array.rb | 41
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-dictionary-data-type.rb | 40
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-expression.rb | 40
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-feather.rb | 49
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-field.rb | 91
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-file-output-stream.rb | 54
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-fixed-size-binary-array-builder.rb | 92
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-fixed-size-binary-array.rb | 36
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-float-scalar.rb | 46
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-function.rb | 176
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-group.rb | 180
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-list-array-builder.rb | 79
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-list-array.rb | 32
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-list-data-type.rb | 69
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-map-array-builder.rb | 110
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-map-array.rb | 33
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-map-data-type.rb | 36
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-memory-view.rb | 434
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-null-array.rb | 23
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-orc.rb | 173
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-record-batch-builder.rb | 125
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-record-batch-file-reader.rb | 115
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-record-batch-iterator.rb | 37
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-record-batch-reader.rb | 46
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-record-batch.rb | 182
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-rolling-window.rb | 40
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-schema.rb | 134
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-slicer.rb | 487
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-sort-indices.rb | 40
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-sort-key.rb | 81
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-sort-options.rb | 58
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-sparse-union-data-type.rb | 41
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-string-dictionary-array-builder.rb | 103
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-struct-array-builder.rb | 184
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-struct-array.rb | 94
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-struct-data-type.rb | 112
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-table.rb | 925
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-tensor.rb | 56
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-time.rb | 288
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-time32-array.rb | 81
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-time32-data-type.rb | 42
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-time64-array.rb | 81
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-time64-data-type.rb | 42
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-timestamp-array.rb | 45
-rw-r--r-- src/arrow/ruby/red-arrow/test/test-timestamp-data-type.rb | 42
-rw-r--r-- src/arrow/ruby/red-arrow/test/values/test-basic-arrays.rb | 295
-rw-r--r-- src/arrow/ruby/red-arrow/test/values/test-dense-union-array.rb | 482
-rw-r--r-- src/arrow/ruby/red-arrow/test/values/test-list-array.rb | 532
-rw-r--r-- src/arrow/ruby/red-arrow/test/values/test-map-array.rb | 433
-rw-r--r-- src/arrow/ruby/red-arrow/test/values/test-sparse-union-array.rb | 473
-rw-r--r-- src/arrow/ruby/red-arrow/test/values/test-struct-array.rb | 482
-rw-r--r-- src/arrow/ruby/red-gandiva/.gitignore | 19
-rw-r--r-- src/arrow/ruby/red-gandiva/Gemfile | 24
-rw-r--r-- src/arrow/ruby/red-gandiva/LICENSE.txt | 202
-rw-r--r-- src/arrow/ruby/red-gandiva/NOTICE.txt | 2
-rw-r--r-- src/arrow/ruby/red-gandiva/README.md | 68
-rw-r--r-- src/arrow/ruby/red-gandiva/Rakefile | 41
-rw-r--r-- src/arrow/ruby/red-gandiva/dependency-check/Rakefile | 47
-rw-r--r-- src/arrow/ruby/red-gandiva/lib/gandiva.rb | 29
-rw-r--r-- src/arrow/ruby/red-gandiva/lib/gandiva/arrow-schema.rb | 25
-rw-r--r-- src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder.rb | 45
-rw-r--r-- src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/add.rb | 40
-rw-r--r-- src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/binary-operation.rb | 38
-rw-r--r-- src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/context.rb | 26
-rw-r--r-- src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/divide.rb | 34
-rw-r--r-- src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/elsif.rb | 36
-rw-r--r-- src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/equal.rb | 33
-rw-r--r-- src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/field.rb | 32
-rw-r--r-- src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/greater-than.rb | 33
-rw-r--r-- src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/if.rb | 75
-rw-r--r-- src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/less-than.rb | 33
-rw-r--r-- src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/literal.rb | 65
-rw-r--r-- src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/multiply.rb | 34
-rw-r--r-- src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/record.rb | 45
-rw-r--r-- src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/subtract.rb | 34
-rw-r--r-- src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/value.rb | 55
-rw-r--r-- src/arrow/ruby/red-gandiva/lib/gandiva/loader.rb | 49
-rw-r--r-- src/arrow/ruby/red-gandiva/lib/gandiva/version.rb | 26
-rw-r--r-- src/arrow/ruby/red-gandiva/red-gandiva.gemspec | 49
-rw-r--r-- src/arrow/ruby/red-gandiva/test/expression-builder/test-add.rb | 54
-rw-r--r-- src/arrow/ruby/red-gandiva/test/expression-builder/test-record.rb | 45
-rw-r--r-- src/arrow/ruby/red-gandiva/test/helper.rb | 20
-rwxr-xr-x src/arrow/ruby/red-gandiva/test/run-test.rb | 50
-rw-r--r-- src/arrow/ruby/red-gandiva/test/test-boolean-literal-node.rb | 24
-rw-r--r-- src/arrow/ruby/red-gandiva/test/test-projector.rb | 49
-rw-r--r-- src/arrow/ruby/red-parquet/.gitignore | 19
-rw-r--r-- src/arrow/ruby/red-parquet/Gemfile | 24
-rw-r--r-- src/arrow/ruby/red-parquet/LICENSE.txt | 202
-rw-r--r-- src/arrow/ruby/red-parquet/NOTICE.txt | 2
-rw-r--r-- src/arrow/ruby/red-parquet/README.md | 52
-rw-r--r-- src/arrow/ruby/red-parquet/Rakefile | 41
-rw-r--r-- src/arrow/ruby/red-parquet/dependency-check/Rakefile | 47
-rw-r--r-- src/arrow/ruby/red-parquet/lib/parquet.rb | 29
-rw-r--r-- src/arrow/ruby/red-parquet/lib/parquet/arrow-table-loadable.rb | 36
-rw-r--r-- src/arrow/ruby/red-parquet/lib/parquet/arrow-table-savable.rb | 52
-rw-r--r-- src/arrow/ruby/red-parquet/lib/parquet/loader.rb | 46
-rw-r--r-- src/arrow/ruby/red-parquet/lib/parquet/version.rb | 26
-rw-r--r-- src/arrow/ruby/red-parquet/lib/parquet/writer-properties.rb | 28
-rw-r--r-- src/arrow/ruby/red-parquet/red-parquet.gemspec | 49
-rw-r--r-- src/arrow/ruby/red-parquet/test/helper.rb | 22
-rwxr-xr-x src/arrow/ruby/red-parquet/test/run-test.rb | 50
-rw-r--r-- src/arrow/ruby/red-parquet/test/test-arrow-table.rb | 99
-rw-r--r-- src/arrow/ruby/red-plasma/.gitignore | 19
-rw-r--r-- src/arrow/ruby/red-plasma/Gemfile | 24
-rw-r--r-- src/arrow/ruby/red-plasma/LICENSE.txt | 202
-rw-r--r-- src/arrow/ruby/red-plasma/NOTICE.txt | 2
-rw-r--r-- src/arrow/ruby/red-plasma/README.md | 58
-rw-r--r-- src/arrow/ruby/red-plasma/Rakefile | 41
-rw-r--r-- src/arrow/ruby/red-plasma/dependency-check/Rakefile | 47
-rw-r--r-- src/arrow/ruby/red-plasma/lib/plasma.rb | 29
-rw-r--r-- src/arrow/ruby/red-plasma/lib/plasma/client.rb | 35
-rw-r--r-- src/arrow/ruby/red-plasma/lib/plasma/loader.rb | 35
-rw-r--r-- src/arrow/ruby/red-plasma/lib/plasma/version.rb | 26
-rw-r--r-- src/arrow/ruby/red-plasma/red-plasma.gemspec | 49
-rw-r--r-- src/arrow/ruby/red-plasma/test/helper.rb | 25
-rw-r--r-- src/arrow/ruby/red-plasma/test/helper/omittable.rb | 36
-rw-r--r-- src/arrow/ruby/red-plasma/test/helper/plasma-store.rb | 57
-rwxr-xr-x src/arrow/ruby/red-plasma/test/run-test.rb | 50
-rw-r--r-- src/arrow/ruby/red-plasma/test/test-plasma-client.rb | 53
698 files changed, 93624 insertions, 0 deletions
diff --git a/src/arrow/r/.Rbuildignore b/src/arrow/r/.Rbuildignore
new file mode 100644
index 000000000..4bead75ea
--- /dev/null
+++ b/src/arrow/r/.Rbuildignore
@@ -0,0 +1,29 @@
+^.*\.Rproj$
+^\.Rproj\.user$
+^README\.Rmd$
+src/.clang-format
+LICENSE.md
+^data-raw$
+lint.sh
+Dockerfile
+.*\.tar\.gz
+^windows
+^libarrow
+^revdep
+clang_format.sh
+^cran-comments\.md$
+^arrow_.*.tar.gz$
+^arrow_.*.tgz$
+^_pkgdown\.yml$
+^docs$
+^pkgdown$
+^Makefile$
+^.*\.orig$
+^.*\.cmd$
+^autobrew$
+^apache-arrow.rb$
+^.*\.Rhistory$
+^extra-tests
+STYLE.md
+^.lintr
+^.styler_excludes.R
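
R CMD build matches each of the lines above as a Perl-style regular expression against file paths relative to the package root, excluding matches from the built tarball. A quick sanity check from R (a sketch using two patterns from this file):

    # TRUE means R CMD build would exclude the path from the tarball
    grepl("^cran-comments\\.md$", "cran-comments.md", perl = TRUE)
    grepl("^windows", "windows/libarrow.zip", perl = TRUE)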
diff --git a/src/arrow/r/.gitignore b/src/arrow/r/.gitignore
new file mode 100644
index 000000000..e15c6f4b7
--- /dev/null
+++ b/src/arrow/r/.gitignore
@@ -0,0 +1,28 @@
+Meta
+docs/
+inst/doc
+*.o
+*.o-*
+*.d
+*.so
+*.dll
+.RData
+.Rproj.user
+.Rhistory
+src/Makevars
+src/Makevars.win
+windows/
+libarrow/
+revdep/
+vignettes/nyc-taxi/
+arrow_*.tar.gz
+arrow_*.tgz
+extra-tests/files
+
+# C++ sources for an offline build. They're copied from the ../cpp directory, so ignore them here.
+/tools/cpp/
+# cmake expects dotenv, NOTICE.txt, and LICENSE.txt to be available one level up
+# from cpp/, but again, they're just copies
+/tools/dotenv
+/tools/LICENSE.txt
+/tools/NOTICE.txt
diff --git a/src/arrow/r/.lintr b/src/arrow/r/.lintr
new file mode 100644
index 000000000..fb9ca8f87
--- /dev/null
+++ b/src/arrow/r/.lintr
@@ -0,0 +1,31 @@
+license: # Licensed to the Apache Software Foundation (ASF) under one
+ # or more contributor license agreements. See the NOTICE file
+ # distributed with this work for additional information
+ # regarding copyright ownership. The ASF licenses this file
+ # to you under the Apache License, Version 2.0 (the
+ # "License"); you may not use this file except in compliance
+ # with the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the License is distributed on an
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ # KIND, either express or implied. See the License for the
+ # specific language governing permissions and limitations
+ # under the License.
+linters: with_defaults(
+ line_length_linter = line_length_linter(120),
+ object_name_linter = NULL,
+ # Even with a liberal definition of name styles, some of our names cause issues due to `.`s for S3 classes or NA in the name
+ # TODO: figure out if we can contribute to lintr to make these work
+ # object_name_linter = object_name_linter(styles = c("snake_case", "camelCase", "CamelCase", "symbols", "dotted.case", "UPPERCASE", "SNAKE_CASE")),
+ object_length_linter = object_length_linter(40),
+ object_usage_linter = NULL, # R6 methods are flagged,
+ cyclocomp_linter = cyclocomp_linter(26) # TODO: reduce to default of 15
+ )
+exclusions: list(
+ "tests/testthat/latin1.R",
+ "R/arrowExports.R",
+ "data-raw/codegen.R"
+ )
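
lintr picks this file up automatically when invoked from the package root (lint.sh in the diffstat above wraps it for CI). A minimal sketch of running it by hand, assuming the lintr package is installed:

    # lint the whole package with the linters and exclusions configured above
    lintr::lint_package(".")
    # or lint a single file with the same config
    lintr::lint("R/dataset.R")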
diff --git a/src/arrow/r/.styler_excludes.R b/src/arrow/r/.styler_excludes.R
new file mode 100644
index 000000000..19cd1ffa5
--- /dev/null
+++ b/src/arrow/r/.styler_excludes.R
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+c("tests/testthat/latin1.R", "data-raw/codegen.R") \ No newline at end of file
diff --git a/src/arrow/r/DESCRIPTION b/src/arrow/r/DESCRIPTION
new file mode 100644
index 000000000..f42c8165d
--- /dev/null
+++ b/src/arrow/r/DESCRIPTION
@@ -0,0 +1,122 @@
+Package: arrow
+Title: Integration to 'Apache' 'Arrow'
+Version: 6.0.1
+Authors@R: c(
+ person("Neal", "Richardson", email = "neal@ursalabs.org", role = c("aut", "cre")),
+ person("Ian", "Cook", email = "ianmcook@gmail.com", role = c("aut")),
+ person("Nic", "Crane", email = "thisisnic@gmail.com", role = c("aut")),
+ person("Jonathan", "Keane", email = "jkeane@gmail.com", role = c("aut")),
+ person("Romain", "Fran\u00e7ois", email = "romain@rstudio.com", role = c("aut"), comment = c(ORCID = "0000-0002-2444-4226")),
+ person("Jeroen", "Ooms", email = "jeroen@berkeley.edu", role = c("aut")),
+ person("Javier", "Luraschi", email = "javier@rstudio.com", role = c("ctb")),
+ person("Karl", "Dunkle Werner", email = "karldw@users.noreply.github.com", role = c("ctb"), comment = c(ORCID = "0000-0003-0523-7309")),
+ person("Jeffrey", "Wong", email = "jeffreyw@netflix.com", role = c("ctb")),
+ person("Apache Arrow", email = "dev@arrow.apache.org", role = c("aut", "cph"))
+ )
+Description: 'Apache' 'Arrow' <https://arrow.apache.org/> is a cross-language
+ development platform for in-memory data. It specifies a standardized
+ language-independent columnar memory format for flat and hierarchical data,
+ organized for efficient analytic operations on modern hardware. This
+ package provides an interface to the 'Arrow C++' library.
+Depends: R (>= 3.3)
+License: Apache License (>= 2.0)
+URL: https://github.com/apache/arrow/, https://arrow.apache.org/docs/r/
+BugReports: https://issues.apache.org/jira/projects/ARROW/issues
+Encoding: UTF-8
+Language: en-US
+SystemRequirements: C++11; for AWS S3 support on Linux, libcurl and openssl (optional)
+Biarch: true
+Imports:
+ assertthat,
+ bit64 (>= 0.9-7),
+ methods,
+ purrr,
+ R6,
+ rlang,
+ stats,
+ tidyselect,
+ utils,
+ vctrs
+Roxygen: list(markdown = TRUE, r6 = FALSE, load = "source")
+RoxygenNote: 7.1.2
+Config/testthat/edition: 3
+VignetteBuilder: knitr
+Suggests:
+ DBI,
+ dbplyr,
+ decor,
+ distro,
+ dplyr,
+ duckdb (>= 0.2.8),
+ hms,
+ knitr,
+ lubridate,
+ pkgload,
+ reticulate,
+ rmarkdown,
+ stringi,
+ stringr,
+ testthat,
+ tibble,
+ withr
+Collate:
+ 'arrowExports.R'
+ 'enums.R'
+ 'arrow-package.R'
+ 'type.R'
+ 'array-data.R'
+ 'arrow-datum.R'
+ 'array.R'
+ 'arrow-tabular.R'
+ 'buffer.R'
+ 'chunked-array.R'
+ 'io.R'
+ 'compression.R'
+ 'scalar.R'
+ 'compute.R'
+ 'config.R'
+ 'csv.R'
+ 'dataset.R'
+ 'dataset-factory.R'
+ 'dataset-format.R'
+ 'dataset-partition.R'
+ 'dataset-scan.R'
+ 'dataset-write.R'
+ 'deprecated.R'
+ 'dictionary.R'
+ 'dplyr-arrange.R'
+ 'dplyr-collect.R'
+ 'dplyr-count.R'
+ 'dplyr-distinct.R'
+ 'dplyr-eval.R'
+ 'dplyr-filter.R'
+ 'expression.R'
+ 'dplyr-functions.R'
+ 'dplyr-group-by.R'
+ 'dplyr-join.R'
+ 'dplyr-mutate.R'
+ 'dplyr-select.R'
+ 'dplyr-summarize.R'
+ 'record-batch.R'
+ 'table.R'
+ 'dplyr.R'
+ 'duckdb.R'
+ 'feather.R'
+ 'field.R'
+ 'filesystem.R'
+ 'flight.R'
+ 'install-arrow.R'
+ 'ipc_stream.R'
+ 'json.R'
+ 'memory-pool.R'
+ 'message.R'
+ 'metadata.R'
+ 'parquet.R'
+ 'python.R'
+ 'query-engine.R'
+ 'record-batch-reader.R'
+ 'record-batch-writer.R'
+ 'reexports-bit64.R'
+ 'reexports-tidyselect.R'
+ 'schema.R'
+ 'util.R'
diff --git a/src/arrow/r/Makefile b/src/arrow/r/Makefile
new file mode 100644
index 000000000..05cca5f11
--- /dev/null
+++ b/src/arrow/r/Makefile
@@ -0,0 +1,70 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+VERSION=$(shell grep ^Version DESCRIPTION | sed s/Version:\ //)
+ARROW_R_DEV="TRUE"
+ARROW_LARGE_MEMORY_TESTS=$(ARROW_R_DEV)
+
+style:
+ R -s -e 'setwd(".."); if (requireNamespace("styler")) styler::style_file(setdiff(system("git diff --name-only | grep r/.*R$$", intern = TRUE), file.path("r", source("r/.styler_excludes.R")$$value)))'
+
+style-all:
+ R -s -e 'styler::style_file(setdiff(dir(pattern = "R$$", recursive = TRUE), source(".styler_excludes.R")$$value))'
+
+doc: style
+ R -s -e 'roxygen2::roxygenize()'
+ -git add --all man/*.Rd
+
+test:
+ export ARROW_R_DEV=$(ARROW_R_DEV) && R CMD INSTALL --install-tests --no-test-load --no-docs --no-help --no-byte-compile .
+ export NOT_CRAN=true && export ARROW_R_DEV=$(ARROW_R_DEV) && export AWS_EC2_METADATA_DISABLED=TRUE && export ARROW_LARGE_MEMORY_TESTS=$(ARROW_LARGE_MEMORY_TESTS) && R -s -e 'library(testthat); setwd(file.path(.libPaths()[1], "arrow", "tests")); system.time(test_check("arrow", filter="${file}", reporter=ifelse(nchar("${r}"), "${r}", "summary")))'
+
+deps:
+ R -s -e 'lib <- Sys.getenv("R_LIB", .libPaths()[1]); install.packages("devtools", repo="https://cloud.r-project.org", lib=lib); devtools::install_dev_deps(lib=lib)'
+
+# Note: files in tools are available at build time, but not at run time. The thirdparty
+# cmake expects .env, NOTICE.txt, and LICENSE.txt to be available one level up from cpp/,
+# so we must rename .env to dotenv and then replace references to it in cpp/CMakeLists.txt.
+sync-cpp:
+ cp ../NOTICE.txt inst/NOTICE.txt
+ rsync --archive --delete --exclude 'build' --exclude 'build-support/boost_*' --exclude 'submodules' ../cpp tools/
+ cp -p ../.env tools/dotenv
+ cp -p ../NOTICE.txt tools/
+ cp -p ../LICENSE.txt tools/
+ sed -i"" -e "s/\.env/dotenv/g" tools/cpp/CMakeLists.txt
+
+build: doc sync-cpp
+ R CMD build ${args} .
+
+check: build
+ -export _R_CHECK_CRAN_INCOMING_REMOTE_=FALSE && export ARROW_R_DEV=$(ARROW_R_DEV) && export _R_CHECK_TESTS_NLINES_=0 && R CMD check --as-cran --run-donttest arrow_$(VERSION).tar.gz
+ rm -rf arrow.Rcheck/
+
+release: build
+ -export _R_CHECK_TESTS_NLINES_=0 && R CMD check --as-cran --run-donttest arrow_$(VERSION).tar.gz
+ rm -rf arrow.Rcheck/
+
+clean:
+ -rm src/*.o
+ -rm src/*.so
+ -rm src/*.dll
+ -rm src/Makevars
+ -rm src/Makevars.win
+ -rm -rf arrow.Rcheck/
+ -rm -rf libarrow/
+ -rm -rf tools/cpp/ tools/.env tools/NOTICE.txt tools/LICENSE.txt
+ -find . -name "*.orig" -delete
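
Unpacked for readability, the R one-liner in the test target above runs the test suite installed next to the package (via --install-tests); ${file} and ${r} are make-time substitutions, so e.g. `make test file=dataset` filters to the dataset tests. Roughly:

    library(testthat)
    # tests were installed alongside the package by `R CMD INSTALL --install-tests`
    setwd(file.path(.libPaths()[1], "arrow", "tests"))
    # filter = "dataset" corresponds to `make test file=dataset`
    system.time(test_check("arrow", filter = "dataset", reporter = "summary"))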
diff --git a/src/arrow/r/NAMESPACE b/src/arrow/r/NAMESPACE
new file mode 100644
index 000000000..572e5e24c
--- /dev/null
+++ b/src/arrow/r/NAMESPACE
@@ -0,0 +1,382 @@
+# Generated by roxygen2: do not edit by hand
+
+S3method("!=",ArrowObject)
+S3method("$",ArrowTabular)
+S3method("$",Schema)
+S3method("$",StructArray)
+S3method("$",SubTreeFileSystem)
+S3method("$<-",ArrowTabular)
+S3method("$<-",Schema)
+S3method("==",ArrowObject)
+S3method("[",ArrowDatum)
+S3method("[",ArrowTabular)
+S3method("[",Dataset)
+S3method("[",Schema)
+S3method("[",arrow_dplyr_query)
+S3method("[[",ArrowTabular)
+S3method("[[",Schema)
+S3method("[[",StructArray)
+S3method("[[<-",ArrowTabular)
+S3method("[[<-",Schema)
+S3method("names<-",ArrowTabular)
+S3method(Ops,ArrowDatum)
+S3method(Ops,Expression)
+S3method(all,ArrowDatum)
+S3method(all,equal.ArrowObject)
+S3method(any,ArrowDatum)
+S3method(as.character,ArrowDatum)
+S3method(as.character,FileFormat)
+S3method(as.character,FragmentScanOptions)
+S3method(as.data.frame,ArrowTabular)
+S3method(as.data.frame,StructArray)
+S3method(as.data.frame,arrow_dplyr_query)
+S3method(as.double,ArrowDatum)
+S3method(as.integer,ArrowDatum)
+S3method(as.list,ArrowTabular)
+S3method(as.list,Schema)
+S3method(as.raw,Buffer)
+S3method(as.vector,ArrowDatum)
+S3method(c,Dataset)
+S3method(dim,ArrowTabular)
+S3method(dim,Dataset)
+S3method(dim,StructArray)
+S3method(dim,arrow_dplyr_query)
+S3method(dimnames,ArrowTabular)
+S3method(head,ArrowDatum)
+S3method(head,ArrowTabular)
+S3method(head,Dataset)
+S3method(head,RecordBatchReader)
+S3method(head,Scanner)
+S3method(head,arrow_dplyr_query)
+S3method(is.finite,ArrowDatum)
+S3method(is.infinite,ArrowDatum)
+S3method(is.na,ArrowDatum)
+S3method(is.na,Expression)
+S3method(is.nan,ArrowDatum)
+S3method(length,ArrowDatum)
+S3method(length,ArrowTabular)
+S3method(length,Scalar)
+S3method(length,Schema)
+S3method(max,ArrowDatum)
+S3method(mean,ArrowDatum)
+S3method(median,ArrowDatum)
+S3method(min,ArrowDatum)
+S3method(na.exclude,ArrowDatum)
+S3method(na.exclude,ArrowTabular)
+S3method(na.fail,ArrowDatum)
+S3method(na.fail,ArrowTabular)
+S3method(na.omit,ArrowDatum)
+S3method(na.omit,ArrowTabular)
+S3method(names,Dataset)
+S3method(names,FeatherReader)
+S3method(names,RecordBatch)
+S3method(names,Scanner)
+S3method(names,ScannerBuilder)
+S3method(names,Schema)
+S3method(names,StructArray)
+S3method(names,Table)
+S3method(names,arrow_dplyr_query)
+S3method(print,"arrow-enum")
+S3method(print,arrow_dplyr_query)
+S3method(print,arrow_info)
+S3method(print,arrow_r_metadata)
+S3method(quantile,ArrowDatum)
+S3method(read_message,InputStream)
+S3method(read_message,MessageReader)
+S3method(read_message,default)
+S3method(row.names,ArrowTabular)
+S3method(sort,ArrowDatum)
+S3method(sort,Scalar)
+S3method(sum,ArrowDatum)
+S3method(tail,ArrowDatum)
+S3method(tail,ArrowTabular)
+S3method(tail,Dataset)
+S3method(tail,RecordBatchReader)
+S3method(tail,Scanner)
+S3method(tail,arrow_dplyr_query)
+S3method(type,ArrowDatum)
+S3method(type,default)
+S3method(unique,ArrowDatum)
+S3method(vec_ptype_abbr,arrow_fixed_size_binary)
+S3method(vec_ptype_abbr,arrow_fixed_size_list)
+S3method(vec_ptype_abbr,arrow_large_list)
+S3method(vec_ptype_abbr,arrow_list)
+S3method(vec_ptype_full,arrow_fixed_size_binary)
+S3method(vec_ptype_full,arrow_fixed_size_list)
+S3method(vec_ptype_full,arrow_large_list)
+S3method(vec_ptype_full,arrow_list)
+export(Array)
+export(Buffer)
+export(BufferOutputStream)
+export(BufferReader)
+export(ChunkedArray)
+export(Codec)
+export(CompressedInputStream)
+export(CompressedOutputStream)
+export(CompressionType)
+export(CsvConvertOptions)
+export(CsvFileFormat)
+export(CsvFragmentScanOptions)
+export(CsvParseOptions)
+export(CsvReadOptions)
+export(CsvTableReader)
+export(CsvWriteOptions)
+export(Dataset)
+export(DatasetFactory)
+export(DateUnit)
+export(DictionaryArray)
+export(DirectoryPartitioning)
+export(DirectoryPartitioningFactory)
+export(Expression)
+export(FeatherReader)
+export(Field)
+export(FileFormat)
+export(FileInfo)
+export(FileMode)
+export(FileOutputStream)
+export(FileSelector)
+export(FileSystem)
+export(FileSystemDataset)
+export(FileSystemDatasetFactory)
+export(FileType)
+export(FixedSizeListArray)
+export(FixedSizeListType)
+export(FragmentScanOptions)
+export(HivePartitioning)
+export(HivePartitioningFactory)
+export(InMemoryDataset)
+export(IpcFileFormat)
+export(JoinType)
+export(JsonParseOptions)
+export(JsonReadOptions)
+export(JsonTableReader)
+export(LargeListArray)
+export(ListArray)
+export(LocalFileSystem)
+export(MemoryMappedFile)
+export(MessageReader)
+export(MessageType)
+export(MetadataVersion)
+export(NullEncodingBehavior)
+export(NullHandlingBehavior)
+export(ParquetArrowReaderProperties)
+export(ParquetFileFormat)
+export(ParquetFileReader)
+export(ParquetFileWriter)
+export(ParquetFragmentScanOptions)
+export(ParquetVersionType)
+export(ParquetWriterProperties)
+export(Partitioning)
+export(QuantileInterpolation)
+export(RandomAccessFile)
+export(ReadableFile)
+export(RecordBatch)
+export(RecordBatchFileReader)
+export(RecordBatchFileWriter)
+export(RecordBatchStreamReader)
+export(RecordBatchStreamWriter)
+export(RoundMode)
+export(S3FileSystem)
+export(Scalar)
+export(Scanner)
+export(ScannerBuilder)
+export(Schema)
+export(StatusCode)
+export(StructArray)
+export(StructScalar)
+export(SubTreeFileSystem)
+export(Table)
+export(TimeUnit)
+export(TimestampParser)
+export(Type)
+export(UnionDataset)
+export(all_of)
+export(arrow_available)
+export(arrow_info)
+export(arrow_table)
+export(arrow_with_dataset)
+export(arrow_with_json)
+export(arrow_with_parquet)
+export(arrow_with_s3)
+export(binary)
+export(bool)
+export(boolean)
+export(buffer)
+export(call_function)
+export(cast_options)
+export(chunked_array)
+export(codec_is_available)
+export(contains)
+export(copy_files)
+export(cpu_count)
+export(create_package_with_all_dependencies)
+export(dataset_factory)
+export(date32)
+export(date64)
+export(decimal)
+export(default_memory_pool)
+export(dictionary)
+export(ends_with)
+export(everything)
+export(field)
+export(fixed_size_binary)
+export(fixed_size_list_of)
+export(flight_connect)
+export(flight_get)
+export(flight_path_exists)
+export(flight_put)
+export(float)
+export(float16)
+export(float32)
+export(float64)
+export(halffloat)
+export(hive_partition)
+export(install_arrow)
+export(install_pyarrow)
+export(int16)
+export(int32)
+export(int64)
+export(int8)
+export(io_thread_count)
+export(is_in)
+export(large_binary)
+export(large_list_of)
+export(large_utf8)
+export(last_col)
+export(list_compute_functions)
+export(list_flights)
+export(list_of)
+export(load_flight_server)
+export(map_batches)
+export(match_arrow)
+export(matches)
+export(mmap_create)
+export(mmap_open)
+export(null)
+export(num_range)
+export(one_of)
+export(open_dataset)
+export(read_arrow)
+export(read_csv_arrow)
+export(read_delim_arrow)
+export(read_feather)
+export(read_ipc_stream)
+export(read_json_arrow)
+export(read_message)
+export(read_parquet)
+export(read_schema)
+export(read_tsv_arrow)
+export(record_batch)
+export(s3_bucket)
+export(schema)
+export(set_cpu_count)
+export(set_io_thread_count)
+export(starts_with)
+export(string)
+export(struct)
+export(time32)
+export(time64)
+export(timestamp)
+export(to_arrow)
+export(to_duckdb)
+export(type)
+export(uint16)
+export(uint32)
+export(uint64)
+export(uint8)
+export(unify_schemas)
+export(utf8)
+export(value_counts)
+export(write_arrow)
+export(write_csv_arrow)
+export(write_dataset)
+export(write_feather)
+export(write_ipc_stream)
+export(write_parquet)
+export(write_to_raw)
+importFrom(R6,R6Class)
+importFrom(assertthat,assert_that)
+importFrom(assertthat,is.string)
+importFrom(bit64,print.integer64)
+importFrom(bit64,str.integer64)
+importFrom(methods,as)
+importFrom(purrr,as_mapper)
+importFrom(purrr,imap)
+importFrom(purrr,imap_chr)
+importFrom(purrr,keep)
+importFrom(purrr,map)
+importFrom(purrr,map2)
+importFrom(purrr,map2_chr)
+importFrom(purrr,map_chr)
+importFrom(purrr,map_dfr)
+importFrom(purrr,map_int)
+importFrom(purrr,map_lgl)
+importFrom(rlang,"%||%")
+importFrom(rlang,":=")
+importFrom(rlang,.data)
+importFrom(rlang,abort)
+importFrom(rlang,as_quosure)
+importFrom(rlang,caller_env)
+importFrom(rlang,dots_n)
+importFrom(rlang,enexpr)
+importFrom(rlang,enexprs)
+importFrom(rlang,enquo)
+importFrom(rlang,enquos)
+importFrom(rlang,env)
+importFrom(rlang,env_bind)
+importFrom(rlang,eval_tidy)
+importFrom(rlang,exec)
+importFrom(rlang,expr)
+importFrom(rlang,is_bare_character)
+importFrom(rlang,is_character)
+importFrom(rlang,is_false)
+importFrom(rlang,is_integerish)
+importFrom(rlang,is_interactive)
+importFrom(rlang,is_quosure)
+importFrom(rlang,list2)
+importFrom(rlang,new_data_mask)
+importFrom(rlang,new_environment)
+importFrom(rlang,quo_get_env)
+importFrom(rlang,quo_get_expr)
+importFrom(rlang,quo_is_null)
+importFrom(rlang,quo_name)
+importFrom(rlang,quo_set_expr)
+importFrom(rlang,quos)
+importFrom(rlang,seq2)
+importFrom(rlang,set_names)
+importFrom(rlang,sym)
+importFrom(rlang,syms)
+importFrom(rlang,trace_back)
+importFrom(rlang,warn)
+importFrom(stats,median)
+importFrom(stats,na.exclude)
+importFrom(stats,na.fail)
+importFrom(stats,na.omit)
+importFrom(stats,na.pass)
+importFrom(stats,quantile)
+importFrom(tidyselect,all_of)
+importFrom(tidyselect,contains)
+importFrom(tidyselect,ends_with)
+importFrom(tidyselect,eval_select)
+importFrom(tidyselect,everything)
+importFrom(tidyselect,last_col)
+importFrom(tidyselect,matches)
+importFrom(tidyselect,num_range)
+importFrom(tidyselect,one_of)
+importFrom(tidyselect,starts_with)
+importFrom(tidyselect,vars_pull)
+importFrom(tidyselect,vars_rename)
+importFrom(tidyselect,vars_select)
+importFrom(utils,head)
+importFrom(utils,install.packages)
+importFrom(utils,modifyList)
+importFrom(utils,object.size)
+importFrom(utils,packageVersion)
+importFrom(utils,tail)
+importFrom(vctrs,s3_register)
+importFrom(vctrs,vec_cast)
+importFrom(vctrs,vec_ptype_abbr)
+importFrom(vctrs,vec_ptype_full)
+importFrom(vctrs,vec_size)
+importFrom(vctrs,vec_unique)
+useDynLib(arrow, .registration = TRUE)
diff --git a/src/arrow/r/NEWS.md b/src/arrow/r/NEWS.md
new file mode 100644
index 000000000..e38e9e2c7
--- /dev/null
+++ b/src/arrow/r/NEWS.md
@@ -0,0 +1,472 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# arrow 6.0.1
+
+There are now two ways to query Arrow data:
+
+## 1. Expanded Arrow-native queries: aggregation and joins
+
+`dplyr::summarize()`, both grouped and ungrouped, is now implemented for Arrow Datasets, Tables, and RecordBatches. Because data is scanned in chunks, you can aggregate over larger-than-memory datasets backed by many files. Supported aggregation functions include `n()`, `n_distinct()`, `min()`, `max()`, `sum()`, `mean()`, `var()`, `sd()`, `any()`, and `all()`. `median()` and `quantile()` with one probability are also supported and currently return approximate results using the t-digest algorithm.
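+
+A minimal sketch of a grouped aggregation over a Dataset (the path and column names here are hypothetical):
+
+```r
+library(arrow)
+library(dplyr)
+ds <- open_dataset("path/to/dataset") # hypothetical multi-file dataset
+ds %>%
+  group_by(year) %>%
+  summarize(n = n(), avg = mean(value)) %>%
+  collect()
+```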
+
+Along with `summarize()`, you can also call `count()`, `tally()`, and `distinct()`, which effectively wrap `summarize()`.
+
+This enhancement does change the behavior of `summarize()` and `collect()` in some cases: see "Breaking changes" below for details.
+
+In addition to `summarize()`, mutating and filtering equality joins (`inner_join()`, `left_join()`, `right_join()`, `full_join()`, `semi_join()`, and `anti_join()`) are also supported natively in Arrow.
+
+Grouped aggregation and (especially) joins should be considered somewhat experimental in this release. We expect them to work, but they may not be well optimized for all workloads. To help us focus our efforts on improving them in the next release, please let us know if you encounter unexpected behavior or poor performance.
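+
+A minimal sketch of a native equality join on toy tables:
+
+```r
+library(arrow)
+library(dplyr)
+t1 <- arrow_table(id = 1:3, x = c("a", "b", "c"))
+t2 <- arrow_table(id = 2:4, y = c(10, 20, 30))
+t1 %>%
+  inner_join(t2, by = "id") %>%
+  collect()
+```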
+
+New non-aggregating compute functions include string functions like `str_to_title()` and `strftime()`, as well as functions for extracting date parts (e.g. `year()`, `month()`) from dates. This is not a complete list; see `list_compute_functions()` for an exhaustive list of available compute functions.
+
+We've also worked to fill in support for all data types, such as `Decimal`, for functions added in previous releases. All type limitations mentioned in previous release notes should no longer apply, and if you find a function that is not implemented for a certain data type, please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues).
+
+## 2. DuckDB integration
+
+If you have the [duckdb package](https://CRAN.R-project.org/package=duckdb) installed, you can hand off an Arrow Dataset or query object to [DuckDB](https://duckdb.org/) for further querying using the `to_duckdb()` function. This allows you to use duckdb's `dbplyr` methods, as well as its SQL interface, to aggregate data. Filtering and column projection done before `to_duckdb()` are evaluated in Arrow, and duckdb can push down some predicates to Arrow as well. This handoff *does not* copy the data; instead it uses Arrow's C interface (just like passing Arrow data between R and Python), so no serialization or data-copying costs are incurred.
+
+You can also take a duckdb `tbl` and call `to_arrow()` to stream data to Arrow's query engine. This means that in a single dplyr pipeline, you could start with an Arrow Dataset, evaluate some steps in DuckDB, then evaluate the rest in Arrow.
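+
+A hedged sketch of that round trip (the dataset path and column names are hypothetical):
+
+```r
+library(arrow)
+library(dplyr)
+ds <- open_dataset("path/to/dataset") # hypothetical
+ds %>%
+  filter(x > 0) %>% # evaluated in Arrow
+  to_duckdb() %>% # handed off without copying
+  group_by(g) %>%
+  summarise(m = mean(x)) %>% # evaluated in DuckDB via dbplyr
+  to_arrow() %>% # streamed back into Arrow's engine
+  collect()
+```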
+
+## Breaking changes
+
+* Row order of data from a Dataset query is no longer deterministic. If you need a stable sort order, you should explicitly `arrange()` the query result. For calls to `summarize()`, you can set `options(arrow.summarise.sort = TRUE)` to match the current `dplyr` behavior of sorting on the grouping columns.
+* `dplyr::summarize()` on an in-memory Arrow Table or RecordBatch no longer eagerly evaluates. Call `compute()` or `collect()` to evaluate the query (see the sketch after this list).
+* `head()` and `tail()` also no longer eagerly evaluate, both for in-memory data and for Datasets. Also, because row order is no longer deterministic, they will effectively give you a random slice of data from somewhere in the dataset unless you `arrange()` to specify sorting.
+* Simple Feature (SF) columns no longer save all of their metadata when converting to Arrow tables (and thus when saving to Parquet or Feather). This also applies to any data frame column that has attributes on each element (in other words: row-level metadata). Our previous approach to saving this metadata was both computationally inefficient and unreliable with Arrow queries and datasets, and this change will most affect saving SF columns. For saving these columns, we recommend either converting the columns to well-known binary representations (using `sf::st_as_binary(col)`) or using the [sfarrow package](https://CRAN.R-project.org/package=sfarrow), which handles some of the intricacies of this conversion process. We plan to improve this and re-enable custom metadata like this in the future, when we can implement the saving in a safe and efficient way. If you need to preserve the pre-6.0.0 behavior of saving this metadata, you can set `options(arrow.preserve_row_level_metadata = TRUE)`; we will be removing this option in a coming release. We strongly recommend avoiding this workaround if possible, since the results will not be supported in the future and can lead to surprising and inaccurate results. If you run into a custom class besides sf columns that is impacted by this, please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues).
+* Datasets are officially no longer supported on 32-bit Windows on R < 4.0 (Rtools 3.5). 32-bit Windows users should upgrade to a newer version of R in order to use datasets.
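+
+A minimal sketch of the new lazy evaluation on an in-memory table:
+
+```r
+library(arrow)
+library(dplyr)
+tab <- arrow_table(x = c(1, 1, 2), y = c(10, 20, 30))
+q <- tab %>%
+  group_by(x) %>%
+  summarize(total = sum(y)) # builds a query; nothing is evaluated yet
+q %>% compute() # evaluates and returns an Arrow Table
+q %>% collect() # evaluates and returns an R data frame
+```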
+
+
+## Installation on Linux
+
+* Package installation now fails if the Arrow C++ library does not compile. In previous versions, if the C++ library failed to compile, you would get a successful R package installation that couldn't do anything useful.
+* You can disable all optional C++ components when building from source by setting the environment variable `LIBARROW_MINIMAL=true`. This retains the core Arrow/Feather components but excludes Parquet, Datasets, compression libraries, and other optional features.
+* Source packages now bundle the Arrow C++ source code, so it does not have to be downloaded in order to build the package. Because the source is included, it is now possible to build the package on an offline/airgapped system. By default, the offline build will be minimal because it cannot download third-party C++ dependencies required to support all features. To allow a fully featured offline build, the included `create_package_with_all_dependencies()` function (also available on GitHub without installing the arrow package) will download all third-party C++ dependencies and bundle them inside the R source package. Run this function on a system connected to the network to produce the "fat" source package, then copy that .tar.gz package to your offline machine and install (see the sketch after this list). Special thanks to @karldw for the huge amount of work on this.
+* Source builds can make use of system dependencies (such as `libz`) by setting `ARROW_DEPENDENCY_SOURCE=AUTO`. This is not the default in this release (`BUNDLED`, i.e. download and build all dependencies) but may become the default in the future.
+* The JSON library components (`read_json_arrow()`) are now optional; they remain enabled by default, and you can set `ARROW_JSON=OFF` before building to disable them.
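+
+A hedged sketch of the offline-build workflow (the file name is our choice; see the function's help page for its exact arguments):
+
+```r
+# On a machine with internet access:
+library(arrow)
+create_package_with_all_dependencies("arrow_with_deps.tar.gz")
+
+# Copy the .tar.gz to the offline machine, then run there:
+install.packages("arrow_with_deps.tar.gz", repos = NULL, type = "source")
+```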
+
+## Other enhancements and fixes
+
+* More Arrow data types use ALTREP when converting to and from R. This speeds up some workflows significantly, while for others it merely delays conversion from Arrow to R. ALTREP is used by default, but to disable it, set `options(arrow.use_altrep = FALSE)`
+* `Field` objects can now be created as non-nullable, and `schema()` now optionally accepts a list of `Field`s
+* Numeric division by zero now matches R's behavior and no longer raises an error
+* `write_parquet()` no longer errors when used with a grouped data.frame
+* `case_when()` now errors cleanly if an expression is not supported in Arrow
+* `open_dataset()` now works on CSVs without header rows
+* Fixed a minor issue where the short readr-style types `T` and `t` were reversed in `read_csv_arrow()`
+* Bindings for `log(..., base = b)` where `b` is something other than 2, e, or 10
+* A number of updates and expansions to our vignettes
+* Fix segfaults in converting length-0 ChunkedArrays to R vectors
+* `Table$create()` now has alias `arrow_table()`
+
+## Internals
+
+* We now use testthat 3rd edition as our default
+* A number of large test reorganizations
+* Style changes to conform with the tidyverse style guide + using lintr
+
+# arrow 5.0.0.2
+
+This patch version contains fixes for some sanitizer and compiler warnings.
+
+# arrow 5.0.0
+
+## More dplyr
+
+* There are now more than 250 compute functions available for use in `dplyr::filter()`, `mutate()`, etc. Additions in this release include:
+
+ * String operations: `strsplit()` and `str_split()`; `strptime()`; `paste()`, `paste0()`, and `str_c()`; `substr()` and `str_sub()`; `str_like()`; `str_pad()`; `stri_reverse()`
+ * Date/time operations: `lubridate` methods such as `year()`, `month()`, `wday()`, and so on
+ * Math: logarithms (`log()` et al.); trigonometry (`sin()`, `cos()`, et al.); `abs()`; `sign()`; `pmin()` and `pmax()`; `ceiling()`, `floor()`, and `trunc()`
+ * Conditional functions, with some limitations on input type in this release: `ifelse()` and `if_else()` for all but `Decimal` types; `case_when()` for logical, numeric, and temporal types only; `coalesce()` for all but lists/structs. Note also that in this release, factors/dictionaries are converted to strings in these functions.
+ * `is.*` functions are supported and can be used inside `relocate()`
+
+* The print method for `arrow_dplyr_query` now includes the expression and the resulting type of columns derived by `mutate()`.
+* `transmute()` now errors if passed arguments `.keep`, `.before`, or `.after`, for consistency with the behavior of `dplyr` on `data.frame`s.
+
+## CSV writing
+
+* `write_csv_arrow()` to use Arrow to write a data.frame to a single CSV file
+* `write_dataset(format = "csv", ...)` to write a Dataset to CSVs, including with partitioning
+
+## C interface
+
+* Added bindings for the remainder of the C data interface: Type, Field, and RecordBatchReader (from the experimental C stream interface). These also have `reticulate::py_to_r()` and `r_to_py()` methods. Along with the addition of the `Scanner$ToRecordBatchReader()` method, you can now build up a Dataset query in R and pass the resulting stream of batches to another tool in process (see the sketch after this list).
+* C interface methods are exposed on Arrow objects (e.g. `Array$export_to_c()`, `RecordBatch$import_from_c()`), similar to how they are in `pyarrow`. This facilitates their use in other packages. See the `py_to_r()` and `r_to_py()` methods for usage examples.
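+
+A minimal sketch of streaming a Dataset query as record batches (the dataset path is hypothetical):
+
+```r
+library(arrow)
+ds <- open_dataset("path/to/dataset") # hypothetical
+scanner <- Scanner$create(ds)
+reader <- scanner$ToRecordBatchReader()
+batch <- reader$read_next_batch() # pull batches one at a time
+```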
+
+## Other enhancements
+
+* Converting an R `data.frame` to an Arrow `Table` uses multithreading across columns
+* Some Arrow array types now use ALTREP when converting to R. To disable this, set `options(arrow.use_altrep = FALSE)`
+* `is.na()` now evaluates to `TRUE` on `NaN` values in floating point number fields, for consistency with base R.
+* `is.nan()` now evaluates to `FALSE` on `NA` values in floating point number fields and `FALSE` on all values in non-floating point fields, for consistency with base R.
+* Additional methods for `Array`, `ChunkedArray`, `RecordBatch`, and `Table`: `na.omit()` and friends, `any()`/`all()`
+* Scalar inputs to `RecordBatch$create()` and `Table$create()` are recycled
+* `arrow_info()` includes details on the C++ build, such as compiler version
+* `match_arrow()` now converts `x` into an `Array` if it is not a `Scalar`, `Array` or `ChunkedArray`, and no longer dispatches to `base::match()`.
+* Row-level metadata is now restricted to reading and writing single Parquet or Feather files. If a dataset contains row-level metadata, that metadata is ignored (with a warning) when reading, and it is likewise dropped (with a warning) when writing a dataset. We are working on a more robust implementation to support row-level metadata (and other complex types); stay tuned. For working with {sf} objects, [{sfarrow}](https://CRAN.R-project.org/package=sfarrow) is helpful for serializing sf columns and sharing them with geopandas.
+
+# arrow 4.0.1
+
+* Resolved a few bugs in new string compute kernels (ARROW-12774, ARROW-12670)
+
+# arrow 4.0.0.1
+
+ * The mimalloc memory allocator is the default memory allocator when using a static source build of the package on Linux. This is because it has better behavior under valgrind than jemalloc does. A full-featured build (installed with `LIBARROW_MINIMAL=false`) includes both jemalloc and mimalloc, and it still uses jemalloc as the default, though this is configurable at runtime with the `ARROW_DEFAULT_MEMORY_POOL` environment variable.
+ * Environment variables `LIBARROW_MINIMAL`, `LIBARROW_DOWNLOAD`, and `NOT_CRAN` are now case-insensitive in the Linux build script.
+ * A build configuration issue in the macOS binary package has been resolved.
+
+# arrow 4.0.0
+
+## dplyr methods
+
+Many more `dplyr` verbs are supported on Arrow objects:
+
+* `dplyr::mutate()` is now supported in Arrow for many applications. For queries on `Table` and `RecordBatch` that are not yet supported in Arrow, the implementation falls back to pulling data into an in-memory R `data.frame` first, as in the previous release. For queries on `Dataset` (which can be larger than memory), it raises an error if the function is not implemented. The main `mutate()` features that cannot yet be called on Arrow objects are (1) `mutate()` after `group_by()` (which is typically used in combination with aggregation) and (2) queries that use `dplyr::across()`.
+* `dplyr::transmute()` (which calls `mutate()`)
+* `dplyr::group_by()` now preserves the `.drop` argument and supports on-the-fly definition of columns
+* `dplyr::relocate()` to reorder columns
+* `dplyr::arrange()` to sort rows
+* `dplyr::compute()` to evaluate the lazy expressions and return an Arrow Table. This is equivalent to `dplyr::collect(as_data_frame = FALSE)`, which was added in 2.0.0.
+
+Over 100 functions can now be called on Arrow objects inside a `dplyr` verb:
+
+* String functions `nchar()`, `tolower()`, and `toupper()`, along with their `stringr` spellings `str_length()`, `str_to_lower()`, and `str_to_upper()`, are supported in Arrow `dplyr` calls. `str_trim()` is also supported.
+* Regular expression functions `sub()`, `gsub()`, and `grepl()`, along with `str_replace()`, `str_replace_all()`, and `str_detect()`, are supported.
+* `cast(x, type)` and `dictionary_encode()` allow changing the type of columns in Arrow objects; `as.numeric()`, `as.character()`, etc. are exposed as similar type-altering conveniences
+* `dplyr::between()`; the Arrow version also allows the `left` and `right` arguments to be columns in the data and not just scalars
+* Additionally, any Arrow C++ compute function can be called inside a `dplyr` verb. This enables you to access Arrow functions that don't have a direct R mapping. See `list_compute_functions()` for all available functions, which are available in `dplyr` prefixed by `arrow_` (see the sketch after this list).
+* Arrow C++ compute functions now do more systematic type promotion when called on data with different types (e.g. int32 and float64). Previously, Scalars in an expression were always cast to match the type of the corresponding Array, so this new type promotion enables, among other things, operations on two columns (Arrays) in a dataset. As a side effect, some comparisons that worked in prior versions are no longer supported: for example, `dplyr::filter(arrow_dataset, string_column == 3)` will error with a message about the type mismatch between the numeric `3` and the string type of `string_column`.
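+
+A hedged sketch of calling a C++ compute function by its `arrow_`-prefixed name; `ascii_upper` is one such function, but check `list_compute_functions()` for what your build provides:
+
+```r
+library(arrow)
+library(dplyr)
+tab <- Table$create(x = c("a", "b"))
+tab %>%
+  mutate(upper = arrow_ascii_upper(x)) %>%
+  collect()
+```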
+
+## Datasets
+
+* `open_dataset()` now accepts a vector of file paths (or even a single file path). Among other things, this enables you to open a single very large file and use `write_dataset()` to partition it without having to read the whole file into memory (see the sketch after this list).
+* Datasets can now detect and read a directory of compressed CSVs
+* `write_dataset()` now defaults to `format = "parquet"` and better validates the `format` argument
+* Invalid input for `schema` in `open_dataset()` is now correctly handled
+* Collecting 0 columns from a Dataset no longer returns all of the columns
+* The `Scanner$Scan()` method has been removed; use `Scanner$ScanBatches()`
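+
+A hedged sketch of repartitioning a single large file (the file name and partition column are hypothetical):
+
+```r
+library(arrow)
+ds <- open_dataset("big_file.parquet") # a single file path now works
+write_dataset(ds, "partitioned_dir", partitioning = "year") # hypothetical column
+```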
+
+## Other improvements
+
+* `value_counts()` to tabulate values in an `Array` or `ChunkedArray`, similar to `base::table()`.
+* `StructArray` objects gain data.frame-like methods, including `names()`, `$`, `[[`, and `dim()`.
+* RecordBatch columns can now be added, replaced, or removed by assigning (`<-`) with either `$` or `[[`
+* Similarly, a `Schema` can now be edited by assigning in new types. This enables using the CSV reader to detect the schema of a file, modify the `Schema` object for any columns that you want to read in as a different type, and then use that `Schema` to read the data (see the sketch after this list).
+* Better validation when creating a `Table` with a schema, with columns of different lengths, and with scalar value recycling
+* Reading Parquet files in Japanese or other multi-byte locales on Windows no longer hangs (workaround for a [bug in libstdc++](https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98723); thanks @yutannihilation for the persistence in discovering this!)
+* If you attempt to read string data that has embedded nul (`\0`) characters, the error message now informs you that you can set `options(arrow.skip_nul = TRUE)` to strip them out. It is not recommended to set this option by default since this code path is significantly slower, and most string data does not contain nuls.
+* `read_json_arrow()` now accepts a schema: `read_json_arrow("file.json", schema = schema(col_a = float64(), col_b = string()))`
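+
+A hedged sketch of the schema-editing workflow, assuming a CSV with a column `x` that we want read as float64 (the file name is hypothetical):
+
+```r
+library(arrow)
+tab <- read_csv_arrow("data.csv", as_data_frame = FALSE)
+sch <- tab$schema
+sch$x <- float64() # assign a new type for column x
+tab2 <- read_csv_arrow("data.csv", col_types = sch)
+```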
+
+## Installation and configuration
+
+* The R package can now support working with an Arrow C++ library that has additional features (such as dataset, parquet, string libraries) disabled, and the bundled build script enables setting environment variables to disable them. See `vignette("install", package = "arrow")` for details. This allows a faster, smaller package build in cases where that is useful, and it enables a minimal, functioning R package build on Solaris.
+* On macOS, it is now possible to use the same bundled C++ build that is used by default on Linux, along with all of its customization parameters, by setting the environment variable `FORCE_BUNDLED_BUILD=true`.
+* `arrow` now uses the `mimalloc` memory allocator by default on macOS, if available (as it is in CRAN binaries), instead of `jemalloc`. There are [configuration issues](https://issues.apache.org/jira/browse/ARROW-6994) with `jemalloc` on macOS, and [benchmark analysis](https://ursalabs.org/blog/2021-r-benchmarks-part-1/) shows that this has negative effects on performance, especially on memory-intensive workflows. `jemalloc` remains the default on Linux; `mimalloc` is default on Windows.
+* Setting the `ARROW_DEFAULT_MEMORY_POOL` environment variable to switch memory allocators now works correctly when the Arrow C++ library has been statically linked (as is usually the case when installing from CRAN).
+* The `arrow_info()` function now reports on the additional optional features, as well as the detected SIMD level. If key features or compression libraries are not enabled in the build, `arrow_info()` will refer to the installation vignette for guidance on how to install a more complete build, if desired.
+* If you attempt to read a file that was compressed with a codec that your Arrow build does not contain support for, the error message now will tell you how to reinstall Arrow with that feature enabled.
+* A new vignette about developer environment setup: `vignette("developing", package = "arrow")`.
+* When building from source, you can use the environment variable `ARROW_HOME` to point to a specific directory where the Arrow libraries are. This is similar to passing `INCLUDE_DIR` and `LIB_DIR`.
+
+# arrow 3.0.0
+
+## Python and Flight
+
+* Flight methods `flight_get()` and `flight_put()` (renamed from `push_data()` in this release) can handle both Tables and RecordBatches
+* `flight_put()` gains an `overwrite` argument to optionally check for the existence of a resource with the same name
+* `list_flights()` and `flight_path_exists()` enable you to see available resources on a Flight server
+* `Schema` objects now have `r_to_py` and `py_to_r` methods
+* Schema metadata is correctly preserved when converting Tables to/from Python
+
+## Enhancements
+
+* Arithmetic operations (`+`, `*`, etc.) are supported on Arrays and ChunkedArrays and can be used in filter expressions in Arrow `dplyr` pipelines
+* Table columns can now be added, replaced, or removed by assigning (`<-`) with either `$` or `[[` (see the sketch after this list)
+* Column names of Tables and RecordBatches can be renamed by assigning `names()`
+* Large string types can now be written to Parquet files
+* The [pronouns `.data` and `.env`](https://rlang.r-lib.org/reference/tidyeval-data.html) are now fully supported in Arrow `dplyr` pipelines.
+* Option `arrow.skip_nul` (default `FALSE`, as in `base::scan()`) allows conversion of Arrow string (`utf8()`) type data containing embedded nul `\0` characters to R. If set to `TRUE`, nuls will be stripped and a warning is emitted if any are found.
+* `arrow_info()` for an overview of various run-time and build-time Arrow configurations, useful for debugging
+* Set environment variable `ARROW_DEFAULT_MEMORY_POOL` before loading the Arrow package to change memory allocators. Windows packages are built with `mimalloc`; most others are built with both `jemalloc` (used by default) and `mimalloc`. These alternative memory allocators are generally much faster than the system memory allocator, so they are used by default when available, but sometimes it is useful to turn them off for debugging purposes. To disable them, set `ARROW_DEFAULT_MEMORY_POOL=system`.
+* List columns that have attributes on each element are now also included with the metadata that is saved when creating Arrow tables. This allows `sf` tibbles to be faithfully preserved and round-tripped (ARROW-10386).
+* R metadata that exceeds 100Kb is now compressed before being written to a table; see `schema()` for more details.
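+
+A minimal sketch of column assignment and renaming on a Table (the values are arbitrary):
+
+```r
+library(arrow)
+tab <- Table$create(x = 1:3)
+tab$y <- Array$create(c("a", "b", "c")) # add a column
+tab[["x"]] <- NULL # remove a column
+names(tab) <- "letters" # rename the remaining column
+```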
+
+## Bug fixes
+
+* Fixed a performance regression in converting Arrow string types to R that was present in the 2.0.0 release
+* C++ functions now trigger garbage collection when needed
+* `write_parquet()` can now write RecordBatches
+* Reading a Table from a RecordBatchStreamReader containing 0 batches no longer crashes
+* `readr`'s `problems` attribute is removed when converting to an Arrow RecordBatch or Table, to prevent large amounts of metadata from accumulating inadvertently (ARROW-10624)
+* Fixed reading of compressed Feather files written with Arrow 0.17 (ARROW-10850)
+* `SubTreeFileSystem` gains a useful print method and no longer errors when printing
+
+## Packaging and installation
+
+* Nightly development versions of the conda `r-arrow` package are available with `conda install -c arrow-nightlies -c conda-forge --strict-channel-priority r-arrow`
+* Linux installation now safely supports older `cmake` versions
+* Compiler version checking for enabling S3 support correctly identifies the active compiler
+* Updated guidance and troubleshooting in `vignette("install", package = "arrow")`, especially for known CentOS issues
+* Operating system detection on Linux uses the [`distro`](https://enpiar.com/distro/) package. If your OS isn't correctly identified, please report an issue there.
+
+# arrow 2.0.0
+
+## Datasets
+
+* `write_dataset()` to Feather or Parquet files with partitioning. See the end of `vignette("dataset", package = "arrow")` for discussion and examples.
+* Datasets now have `head()`, `tail()`, and take (`[`) methods. `head()` is optimized but the others may not be performant.
+* `collect()` gains an `as_data_frame` argument, default `TRUE` but when `FALSE` allows you to evaluate the accumulated `select` and `filter` query but keep the result in Arrow, not an R `data.frame`
+* `read_csv_arrow()` supports specifying column types, both with a `Schema` and with the compact string representation for types used in the `readr` package. It has also gained a `timestamp_parsers` argument that lets you express a set of `strptime` parse strings that will be tried to convert columns designated as `Timestamp` type (see the sketch below).
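+
+A hedged sketch of the compact readr-style type specification (the file and column names are hypothetical; when using the compact string, column names are supplied explicitly and the header row is skipped):
+
+```r
+library(arrow)
+df <- read_csv_arrow(
+  "data.csv",
+  col_types = "idc", # integer, double, character
+  col_names = c("id", "value", "label"),
+  skip = 1 # skip the header row, since names are supplied above
+)
+```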
+
+## AWS S3 support
+
+* S3 support is now enabled in binary macOS and Windows (Rtools40 only, i.e. R >= 4.0) packages. To enable it on Linux, you need the additional system dependencies `libcurl` and `openssl`, as well as a sufficiently modern compiler. See `vignette("install", package = "arrow")` for details.
+* File readers and writers (`read_parquet()`, `write_feather()`, et al.), as well as `open_dataset()` and `write_dataset()`, allow you to access resources on S3 (or on file systems that emulate S3) either by providing an `s3://` URI or by providing a `FileSystem$path()` (see the sketch after this list). See `vignette("fs", package = "arrow")` for examples.
+* `copy_files()` allows you to recursively copy directories of files from one file system to another, such as from S3 to your local machine.
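+
+A minimal sketch of both access styles (the bucket and paths are hypothetical):
+
+```r
+library(arrow)
+# URI style
+df <- read_parquet("s3://my-bucket/data/file.parquet")
+
+# FileSystem$path() style
+bucket <- s3_bucket("my-bucket")
+ds <- open_dataset(bucket$path("data"))
+```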
+
+## Flight RPC
+
+[Flight](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/)
+is a general-purpose client-server framework for high performance
+transport of large datasets over network interfaces.
+The `arrow` R package now provides methods for connecting to Flight RPC servers
+to send and receive data. See `vignette("flight", package = "arrow")` for an overview.
+
+## Computation
+
+* Comparison (`==`, `>`, etc.) and boolean (`&`, `|`, `!`) operations, along with `is.na`, `%in%` and `match` (called `match_arrow()`), on Arrow Arrays and ChunkedArrays are now implemented in the C++ library (see the sketch after this list).
+* Aggregation methods `min()`, `max()`, and `unique()` are implemented for Arrays and ChunkedArrays.
+* `dplyr` filter expressions on Arrow Tables and RecordBatches are now evaluated in the C++ library, rather than by pulling data into R and evaluating. This yields significant performance improvements.
+* `dim()` (`nrow`) for dplyr queries on Table/RecordBatch is now supported
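+
+A minimal sketch of these C++-backed operations on a toy Array:
+
+```r
+library(arrow)
+a <- Array$create(c(1, 2, NA))
+a > 1 # element-wise comparison, computed in C++
+is.na(a)
+match_arrow(a, c(2, 3))
+```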
+
+## Packaging and installation
+
+* `arrow` now depends on [`cpp11`](https://cpp11.r-lib.org/), which brings more robust UTF-8 handling and faster compilation
+* The Linux build script now succeeds on older versions of R
+* MacOS binary packages now ship with zstandard compression enabled
+
+## Bug fixes and other enhancements
+
+* Automatic conversion of Arrow `Int64` type when all values fit with an R 32-bit integer now correctly inspects all chunks in a ChunkedArray, and this conversion can be disabled (so that `Int64` always yields a `bit64::integer64` vector) by setting `options(arrow.int64_downcast = FALSE)`.
+* In addition to the data.frame column metadata that has been preserved in round trips since 1.0.0, attributes of the data.frame itself are now also preserved in Arrow schema metadata.
+* File writers now respect the system umask setting
+* `ParquetFileReader` has additional methods for accessing individual columns or row groups from the file
+* Various segfaults fixed: invalid input in `ParquetFileWriter`; invalid `ArrowObject` pointer from a saved R object; converting deeply nested structs from Arrow to R
+* The `properties` and `arrow_properties` arguments to `write_parquet()` are deprecated
+
+# arrow 1.0.1
+
+## Bug fixes
+
+* Filtering a Dataset that has multiple partition keys using an `%in%` expression now faithfully returns all relevant rows
+* Datasets can now have path segments in the root directory that start with `.` or `_`; files and subdirectories starting with those prefixes are still ignored
+* `open_dataset("~/path")` now correctly expands the path
+* The `version` option to `write_parquet()` is now correctly implemented
+* An UBSAN failure in the `parquet-cpp` library has been fixed
+* For bundled Linux builds, the logic for finding `cmake` is more robust, and you can now specify a `/path/to/cmake` by setting the `CMAKE` environment variable
+
+# arrow 1.0.0
+
+## Arrow format conversion
+
+* `vignette("arrow", package = "arrow")` includes tables that explain how R types are converted to Arrow types and vice versa.
+* Support added for converting to/from more Arrow types: `uint64`, `binary`, `fixed_size_binary`, `large_binary`, `large_utf8`, `large_list`, `list` of `structs`.
+* `character` vectors that exceed 2GB are converted to Arrow `large_utf8` type
+* `POSIXlt` objects can now be converted to Arrow (`struct`)
+* R `attributes()` are preserved in Arrow metadata when converting to an Arrow RecordBatch or Table, and are restored when converting from Arrow. This means that custom subclasses, such as `haven::labelled`, are preserved in round trip through Arrow.
+* Schema metadata is now exposed as a named list, and it can be modified by assignment like `batch$metadata$new_key <- "new value"`
+* Arrow types `int64`, `uint32`, and `uint64` now are converted to R `integer` if all values fit in bounds
+* Arrow `date32` is now converted to R `Date` with `double` underlying storage. Even though the data values themselves are integers, using `double` provides stricter round-trip fidelity
+* When converting to R `factor`, `dictionary` ChunkedArrays that do not have identical dictionaries are properly unified
+* In the 1.0 release, the Arrow IPC metadata version is increased from V4 to V5. By default, `RecordBatch{File,Stream}Writer` will write V5, but you can specify an alternate `metadata_version`. For convenience, if you know the consumer you're writing to cannot read V5, you can set the environment variable `ARROW_PRE_1_0_METADATA_VERSION=1` to write V4 without changing any other code.
+
+## Datasets
+
+* CSV and other text-delimited datasets are now supported
+* With a custom C++ build, it is possible to read datasets directly on S3 by passing a URL like `ds <- open_dataset("s3://...")`. Note that this currently requires a special C++ library build with additional dependencies--this is not yet available in CRAN releases or in nightly packages.
+* When reading individual CSV and JSON files, compression is automatically detected from the file extension
+
+## Other enhancements
+
+* Initial support for C++ aggregation methods: `sum()` and `mean()` are implemented for `Array` and `ChunkedArray`
+* Tables and RecordBatches have additional data.frame-like methods, including `dimnames()` and `as.list()`
+* Tables and ChunkedArrays can now be moved to/from Python via `reticulate`
+
+## Bug fixes and deprecations
+
+* Non-UTF-8 strings (common on Windows) are correctly coerced to UTF-8 when passing to Arrow memory and appropriately re-localized when converting to R
+* The `coerce_timestamps` option to `write_parquet()` is now correctly implemented.
+* Creating a Dictionary array respects the `type` definition if provided by the user
+* `read_arrow` and `write_arrow` are now deprecated; use the `read/write_feather()` and `read/write_ipc_stream()` functions depending on whether you're working with the Arrow IPC file or stream format, respectively.
+* Previously deprecated `FileStats`, `read_record_batch`, and `read_table` have been removed.
+
+## Installation and packaging
+
+* For improved performance in memory allocation, macOS and Linux binaries now have `jemalloc` included, and Windows packages use `mimalloc`
+* Linux installation: some tweaks to OS detection for binaries, some updates to known installation issues in the vignette
+* The bundled libarrow is built with the same `CC` and `CXX` values that R uses
+* Failure to build the bundled libarrow yields a clear message
+* Various streamlining efforts to reduce library size and compile time
+
+# arrow 0.17.1
+
+* Updates for compatibility with `dplyr` 1.0
+* `reticulate::r_to_py()` conversion now correctly works automatically, without having to call the method yourself
+* Assorted bug fixes in the C++ library around Parquet reading
+
+# arrow 0.17.0
+
+## Feather v2
+
+This release includes support for version 2 of the Feather file format.
+Feather v2 features full support for all Arrow data types,
+fixes the 2GB per-column limitation for large amounts of string data,
+and it allows files to be compressed using either `lz4` or `zstd`.
+`write_feather()` can write either version 2 or
+[version 1](https://github.com/wesm/feather) Feather files, and `read_feather()`
+automatically detects which file version it is reading.
+
+Related to this change, several functions around reading and writing data
+have been reworked. `read_ipc_stream()` and `write_ipc_stream()` have been
+added to facilitate writing data to the Arrow IPC stream format, which is
+slightly different from the IPC file format (Feather v2 *is* the IPC file format).
+
+Behavior has been standardized: all `read_<format>()` functions return an R `data.frame`
+(by default) or a `Table` if the argument `as_data_frame = FALSE`;
+all `write_<format>()` functions return the data object, invisibly.
+To facilitate some workflows, a special `write_to_raw()` function is added
+to wrap `write_ipc_stream()` and return the `raw` vector containing the buffer
+that was written.
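+
+A hedged sketch of `write_to_raw()`, assuming `read_ipc_stream()` accepts a raw vector as the other readers do:
+
+```r
+library(arrow)
+buf <- write_to_raw(data.frame(x = 1:3)) # raw vector holding the IPC stream
+df <- read_ipc_stream(buf)
+```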
+
+To achieve this standardization, `read_table()`, `read_record_batch()`,
+`read_arrow()`, and `write_arrow()` have been deprecated.
+
+## Python interoperability
+
+The 0.17 Apache Arrow release includes a C data interface that allows
+exchanging Arrow data in-process at the C level without copying
+and without libraries having a build or runtime dependency on each other. This enables
+us to use `reticulate` to share data between R and Python (`pyarrow`) efficiently.
+
+See `vignette("python", package = "arrow")` for details.
+
+## Datasets
+
+* Dataset reading benefits from many speedups and fixes in the C++ library
+* Datasets have a `dim()` method, which sums rows across all files (#6635, @boshek)
+* Combine multiple datasets into a single queryable `UnionDataset` with the `c()` method
+* Dataset filtering now treats `NA` as `FALSE`, consistent with `dplyr::filter()`
+* Dataset filtering is now correctly supported for all Arrow date/time/timestamp column types
+* `vignette("dataset", package = "arrow")` now has correct, executable code
+
+## Installation
+
+* Installation on Linux now builds the C++ library from source by default, with some compression libraries disabled. For a faster, richer build, set the environment variable `NOT_CRAN=true`. See `vignette("install", package = "arrow")` for details and more options.
+* Source installation is faster and more reliable on more Linux distributions.
+
+## Other bug fixes and enhancements
+
+* `unify_schemas()` to create a `Schema` containing the union of fields in multiple schemas
+* Timezones are faithfully preserved in roundtrip between R and Arrow
+* `read_feather()` and other reader functions close any file connections they open
+* Arrow R6 objects no longer have namespace collisions when the `R.oo` package is also loaded
+* `FileStats` is renamed to `FileInfo`, and the original spelling has been deprecated
+
+# arrow 0.16.0.2
+
+* `install_arrow()` now installs the latest release of `arrow`, including Linux dependencies, either for CRAN releases or for development builds (if `nightly = TRUE`)
+* Package installation on Linux no longer downloads C++ dependencies unless the `LIBARROW_DOWNLOAD` or `NOT_CRAN` environment variable is set
+* `write_feather()`, `write_arrow()` and `write_parquet()` now return their input,
+similar to the `write_*` functions in the `readr` package (#6387, @boshek)
+* Can now infer the type of an R `list` and create a ListArray when all list elements are the same type (#6275, @michaelchirico)
+
+# arrow 0.16.0
+
+## Multi-file datasets
+
+This release includes a `dplyr` interface to Arrow Datasets,
+which let you work efficiently with large, multi-file datasets as a single entity.
+Explore a directory of data files with `open_dataset()` and then use `dplyr` methods to `select()`, `filter()`, etc. Work will be done where possible in Arrow memory. When necessary, data is pulled into R for further computation. `dplyr` methods are conditionally loaded if you have `dplyr` available; it is not a hard dependency.
+
+See `vignette("dataset", package = "arrow")` for details.
+
+## Linux installation
+
+A source package installation (as from CRAN) will now handle its C++ dependencies automatically.
+For common Linux distributions and versions, installation will retrieve a prebuilt static
+C++ library for inclusion in the package; where this binary is not available,
+the package executes a bundled script that should build the Arrow C++ library with
+no system dependencies beyond what R requires.
+
+See `vignette("install", package = "arrow")` for details.
+
+## Data exploration
+
+* `Table`s and `RecordBatch`es also have `dplyr` methods.
+* For exploration without `dplyr`, `[` methods for Tables, RecordBatches, Arrays, and ChunkedArrays now support natural row extraction operations. These use the C++ `Filter`, `Slice`, and `Take` methods for efficient access, depending on the type of selection vector.
+* An experimental, lazily evaluated `array_expression` class has also been added, enabling among other things the ability to filter a Table with some function of Arrays, such as `arrow_table[arrow_table$var1 > 5, ]` without having to pull everything into R first.
+
+## Compression
+
+* `write_parquet()` now supports compression
+* `codec_is_available()` returns `TRUE` or `FALSE` depending on whether the Arrow C++ library was built with support for a given compression library (e.g. gzip, lz4, snappy)
+* Windows builds now include support for zstd and lz4 compression (#5814, @gnguy)
+
+## Other fixes and improvements
+
+* Arrow null type is now supported
+* Factor types are now preserved in round trip through Parquet format (#6135, @yutannihilation)
+* Reading an Arrow dictionary type coerces dictionary values to `character` (as R `factor` levels are required to be) instead of raising an error
+* Many improvements to Parquet function documentation (@karldw, @khughitt)
+
+# arrow 0.15.1
+
+* This patch release includes bugfixes in the C++ library around dictionary types and Parquet reading.
+
+# arrow 0.15.0
+
+## Breaking changes
+
+* The R6 classes that wrap the C++ classes are now documented and exported and have been renamed to be more R-friendly. Users of the high-level R interface in this package are not affected. Those who want to interact with the Arrow C++ API more directly should work with these objects and methods. As part of this change, many functions that instantiated these R6 objects have been removed in favor of `Class$create()` methods. Notably, `arrow::array()` and `arrow::table()` have been removed in favor of `Array$create()` and `Table$create()`, eliminating the package startup message about masking `base` functions. For more information, see the new `vignette("arrow")`.
+* Due to a subtle change in the Arrow message format, data written by the 0.15 version libraries may not be readable by older versions. If you need to send data to a process that uses an older version of Arrow (for example, an Apache Spark server that hasn't yet updated to Arrow 0.15), you can set the environment variable `ARROW_PRE_0_15_IPC_FORMAT=1`.
+* The `as_tibble` argument in the `read_*()` functions has been renamed to `as_data_frame` (ARROW-6337, @jameslamb)
+* The `arrow::Column` class has been removed, as it was removed from the C++ library
+
+## New features
+
+* `Table` and `RecordBatch` objects have S3 methods that enable you to work with them more like `data.frame`s. Extract columns, subset, and so on. See `?Table` and `?RecordBatch` for examples.
+* Initial implementation of bindings for the C++ File System API. (ARROW-6348)
+* Compressed streams are now supported on Windows (ARROW-6360), and you can also specify a compression level (ARROW-6533)
+
+## Other upgrades
+
+* Parquet file reading is much, much faster, thanks to improvements in the Arrow C++ library.
+* `read_csv_arrow()` supports more parsing options, including `col_names`, `na`, `quoted_na`, and `skip`
+* `read_parquet()` and `read_feather()` can ingest data from a `raw` vector (ARROW-6278)
+* File readers now properly handle paths that need expanding, such as `~/file.parquet` (ARROW-6323)
+* Improved support for creating types in a schema: the types' printed names (e.g. "double") are guaranteed to be valid to use in instantiating a schema (e.g. `double()`), and time types can be created with human-friendly resolution strings ("ms", "s", etc.). (ARROW-6338, ARROW-6364)
+
+
+# arrow 0.14.1
+
+Initial CRAN release of the `arrow` package. Key features include:
+
+* Read and write support for various file formats, including Parquet, Feather/Arrow, CSV, and JSON.
+* API bindings to the C++ library for Arrow data types and objects, as well as mapping between Arrow types and R data types.
+* Tools for helping with C++ library configuration and installation.
diff --git a/src/arrow/r/R/array-data.R b/src/arrow/r/R/array-data.R
new file mode 100644
index 000000000..99c24fdcf
--- /dev/null
+++ b/src/arrow/r/R/array-data.R
@@ -0,0 +1,53 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @title ArrayData class
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @description The `ArrayData` class allows you to get and inspect the data
+#' inside an `arrow::Array`.
+#'
+#' @section Usage:
+#'
+#' ```
+#' data <- Array$create(x)$data()
+#'
+#' data$type
+#' data$length
+#' data$null_count
+#' data$offset
+#' data$buffers
+#' ```
+#'
+#' @section Methods:
+#'
+#' ...
+#'
+#' @rdname ArrayData
+#' @name ArrayData
+#' @include type.R
+ArrayData <- R6Class("ArrayData",
+ inherit = ArrowObject,
+ active = list(
+ type = function() ArrayData__get_type(self),
+ length = function() ArrayData__get_length(self),
+ null_count = function() ArrayData__get_null_count(self),
+ offset = function() ArrayData__get_offset(self),
+ buffers = function() ArrayData__buffers(self)
+ )
+)
diff --git a/src/arrow/r/R/array.R b/src/arrow/r/R/array.R
new file mode 100644
index 000000000..46acc14ff
--- /dev/null
+++ b/src/arrow/r/R/array.R
@@ -0,0 +1,329 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-datum.R
+
+#' @title Arrow Arrays
+#' @description An `Array` is an immutable data array with some logical type
+#' and some length. Most logical types are contained in the base
+#' `Array` class; there are also subclasses for `DictionaryArray`, `ListArray`,
+#' and `StructArray`.
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Factory:
+#' The `Array$create()` factory method instantiates an `Array` and
+#' takes the following arguments:
+#' * `x`: an R vector, list, or `data.frame`
+#' * `type`: an optional [data type][data-type] for `x`. If omitted, the type
+#' will be inferred from the data.
+#'
+#' `Array$create()` will return the appropriate subclass of `Array`, such as
+#' `DictionaryArray` when given an R factor.
+#'
+#' To compose a `DictionaryArray` directly, call `DictionaryArray$create()`,
+#' which takes two arguments:
+#' * `x`: an R vector or `Array` of integers for the dictionary indices
+#' * `dict`: an R vector or `Array` of dictionary values (like R factor levels
+#' but not limited to strings only)
+#' @section Usage:
+#'
+#' ```
+#' a <- Array$create(x)
+#' length(a)
+#'
+#' print(a)
+#' a == a
+#' ```
+#'
+#' @section Methods:
+#'
+#' - `$IsNull(i)`: Return true if value at index is null. Does not boundscheck
+#' - `$IsValid(i)`: Return true if value at index is valid. Does not boundscheck
+#' - `$length()`: Size in the number of elements this array contains
+#' - `$offset`: A relative position into another array's data, to enable zero-copy slicing
+#' - `$null_count`: The number of null entries in the array
+#' - `$type`: logical type of data
+#' - `$type_id()`: type id
+#' - `$Equals(other)` : is this array equal to `other`
+#' - `$ApproxEquals(other)` : is this array approximately equal to `other`, allowing for small floating-point differences
+#' - `$Diff(other)` : return a string expressing the difference between two arrays
+#' - `$data()`: return the underlying [ArrayData][ArrayData]
+#' - `$as_vector()`: convert to an R vector
+#' - `$ToString()`: string representation of the array
+#' - `$Slice(offset, length = NULL)`: Construct a zero-copy slice of the array
+#' with the indicated offset and length. If length is `NULL`, the slice goes
+#' until the end of the array.
+#' - `$Take(i)`: return an `Array` with values at positions given by integers
+#' (R vector or Arrow Array) `i`.
+#' - `$Filter(i, keep_na = TRUE)`: return an `Array` with values at positions where logical
+#' vector (or Arrow boolean Array) `i` is `TRUE`.
+#' - `$SortIndices(descending = FALSE)`: return an `Array` of integer positions that can be
+#' used to rearrange the `Array` in ascending or descending order
+#' - `$RangeEquals(other, start_idx, end_idx, other_start_idx)` : do the values in the range `[start_idx, end_idx)` equal the values of `other` starting at `other_start_idx`
+#' - `$cast(target_type, safe = TRUE, options = cast_options(safe))`: Alter the
+#' data in the array to change its type.
+#' - `$View(type)`: Construct a zero-copy view of this array with the given type.
+#' - `$Validate()` : Perform any validation checks to determine obvious inconsistencies
+#' within the array's internal data. This can be an expensive check, potentially `O(length)`
+#'
+#' @rdname array
+#' @name array
+#' @examplesIf arrow_available()
+#' my_array <- Array$create(1:10)
+#' my_array$type
+#' my_array$cast(int8())
+#'
+#' # Check if value is null; zero-indexed
+#' na_array <- Array$create(c(1:5, NA))
+#' na_array$IsNull(0)
+#' na_array$IsNull(5)
+#' na_array$IsValid(5)
+#' na_array$null_count
+#'
+#' # zero-copy slicing; the offset of the new Array will be the same as the index passed to $Slice
+#' new_array <- na_array$Slice(5)
+#' new_array$offset
+#'
+#' # Compare 2 arrays
+#' na_array2 <- na_array
+#' na_array2 == na_array # element-wise comparison
+#' na_array2$Equals(na_array) # overall comparison
+#' @export
+Array <- R6Class("Array",
+ inherit = ArrowDatum,
+ public = list(
+ IsNull = function(i) Array__IsNull(self, i),
+ IsValid = function(i) Array__IsValid(self, i),
+ length = function() Array__length(self),
+ type_id = function() Array__type_id(self),
+ Equals = function(other, ...) {
+ inherits(other, "Array") && Array__Equals(self, other)
+ },
+ ApproxEquals = function(other) {
+ inherits(other, "Array") && Array__ApproxEquals(self, other)
+ },
+ Diff = function(other) {
+ if (!inherits(other, "Array")) {
+ other <- Array$create(other)
+ }
+ Array__Diff(self, other)
+ },
+ data = function() Array__data(self),
+ as_vector = function() Array__as_vector(self),
+ ToString = function() {
+ typ <- paste0("<", self$type$ToString(), ">")
+ paste(typ, Array__ToString(self), sep = "\n")
+ },
+ Slice = function(offset, length = NULL) {
+ if (is.null(length)) {
+ Array__Slice1(self, offset)
+ } else {
+ Array__Slice2(self, offset, length)
+ }
+ },
+ Take = function(i) {
+ if (is.numeric(i)) {
+ i <- as.integer(i)
+ }
+ if (is.integer(i)) {
+ i <- Array$create(i)
+ }
+ call_function("take", self, i)
+ },
+ Filter = function(i, keep_na = TRUE) {
+ if (is.logical(i)) {
+ i <- Array$create(i)
+ }
+ assert_is(i, "Array")
+ call_function("filter", self, i, options = list(keep_na = keep_na))
+ },
+ SortIndices = function(descending = FALSE) {
+ assert_that(is.logical(descending))
+ assert_that(length(descending) == 1L)
+ assert_that(!is.na(descending))
+ call_function("array_sort_indices", self, options = list(order = descending))
+ },
+ RangeEquals = function(other, start_idx, end_idx, other_start_idx = 0L) {
+ assert_is(other, "Array")
+ Array__RangeEquals(self, other, start_idx, end_idx, other_start_idx)
+ },
+ View = function(type) {
+ Array$create(Array__View(self, as_type(type)))
+ },
+ Validate = function() Array__Validate(self),
+ export_to_c = function(array_ptr, schema_ptr) ExportArray(self, array_ptr, schema_ptr)
+ ),
+ active = list(
+ null_count = function() Array__null_count(self),
+ offset = function() Array__offset(self),
+ type = function() Array__type(self)
+ )
+)
+Array$create <- function(x, type = NULL) {
+ if (!is.null(type)) {
+ type <- as_type(type)
+ }
+ if (inherits(x, "Scalar")) {
+ out <- x$as_array()
+ if (!is.null(type)) {
+ out <- out$cast(type)
+ }
+ return(out)
+ }
+ vec_to_arrow(x, type)
+}
+#' @include arrowExports.R
+Array$import_from_c <- ImportArray
+
+#' @rdname array
+#' @usage NULL
+#' @format NULL
+#' @export
+DictionaryArray <- R6Class("DictionaryArray",
+ inherit = Array,
+ public = list(
+ indices = function() DictionaryArray__indices(self),
+ dictionary = function() DictionaryArray__dictionary(self)
+ ),
+ active = list(
+ ordered = function() self$type$ordered
+ )
+)
+DictionaryArray$create <- function(x, dict = NULL) {
+ if (is.factor(x)) {
+ # The simple case: converting a factor.
+ # Ignoring `dict`; should probably error if dict is not NULL
+ return(Array$create(x))
+ }
+
+ assert_that(!is.null(dict))
+ if (!is.Array(x)) {
+ x <- Array$create(x)
+ }
+ if (!is.Array(dict)) {
+ dict <- Array$create(dict)
+ }
+ type <- DictionaryType$create(x$type, dict$type)
+ DictionaryArray__FromArrays(type, x, dict)
+}
+
+#' @rdname array
+#' @usage NULL
+#' @format NULL
+#' @export
+StructArray <- R6Class("StructArray",
+ inherit = Array,
+ public = list(
+ field = function(i) StructArray__field(self, i),
+ GetFieldByName = function(name) StructArray__GetFieldByName(self, name),
+ Flatten = function() StructArray__Flatten(self)
+ )
+)
+
+
+#' @export
+`[[.StructArray` <- function(x, i, ...) {
+ if (is.character(i)) {
+ x$GetFieldByName(i)
+ } else if (is.numeric(i)) {
+ x$field(i - 1)
+ } else {
+ stop("'i' must be character or numeric, not ", class(i), call. = FALSE)
+ }
+}
+
+#' @export
+`$.StructArray` <- function(x, name, ...) {
+ assert_that(is.string(name))
+ if (name %in% ls(x)) {
+ get(name, x)
+ } else {
+ x$GetFieldByName(name)
+ }
+}
+
+#' @export
+names.StructArray <- function(x, ...) StructType__field_names(x$type)
+
+#' @export
+dim.StructArray <- function(x, ...) c(length(x), x$type$num_fields)
+
+#' @export
+as.data.frame.StructArray <- function(x, row.names = NULL, optional = FALSE, ...) {
+ as.vector(x)
+}
+
+#' @rdname array
+#' @usage NULL
+#' @format NULL
+#' @export
+ListArray <- R6Class("ListArray",
+ inherit = Array,
+ public = list(
+ values = function() ListArray__values(self),
+ value_length = function(i) ListArray__value_length(self, i),
+ value_offset = function(i) ListArray__value_offset(self, i),
+ raw_value_offsets = function() ListArray__raw_value_offsets(self)
+ ),
+ active = list(
+ value_type = function() ListArray__value_type(self)
+ )
+)
+
+#' @rdname array
+#' @usage NULL
+#' @format NULL
+#' @export
+LargeListArray <- R6Class("LargeListArray",
+ inherit = Array,
+ public = list(
+ values = function() LargeListArray__values(self),
+ value_length = function(i) LargeListArray__value_length(self, i),
+ value_offset = function(i) LargeListArray__value_offset(self, i),
+ raw_value_offsets = function() LargeListArray__raw_value_offsets(self)
+ ),
+ active = list(
+ value_type = function() LargeListArray__value_type(self)
+ )
+)
+
+#' @rdname array
+#' @usage NULL
+#' @format NULL
+#' @export
+FixedSizeListArray <- R6Class("FixedSizeListArray",
+ inherit = Array,
+ public = list(
+ values = function() FixedSizeListArray__values(self),
+ value_length = function(i) FixedSizeListArray__value_length(self, i),
+ value_offset = function(i) FixedSizeListArray__value_offset(self, i)
+ ),
+ active = list(
+ value_type = function() FixedSizeListArray__value_type(self),
+ list_size = function() self$type$list_size
+ )
+)
+
+is.Array <- function(x, type = NULL) { # nolint
+ is_it <- inherits(x, c("Array", "ChunkedArray"))
+ if (is_it && !is.null(type)) {
+ is_it <- x$type$ToString() %in% type
+ }
+ is_it
+}
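
is.Array() optionally matches the type's ToString() against one or more type strings; for example:

    a <- Array$create(1:3)
    is.Array(a)           # TRUE (a ChunkedArray also passes)
    is.Array(a, "int32")  # TRUE
    is.Array(a, "utf8")   # FALSE
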
diff --git a/src/arrow/r/R/arrow-datum.R b/src/arrow/r/R/arrow-datum.R
new file mode 100644
index 000000000..557321f68
--- /dev/null
+++ b/src/arrow/r/R/arrow-datum.R
@@ -0,0 +1,266 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-package.R
+
+# Base class for Array, ChunkedArray, and Scalar, for S3 method dispatch only.
+# Does not exist in the C++ class hierarchy
+ArrowDatum <- R6Class("ArrowDatum",
+ inherit = ArrowObject,
+ public = list(
+ cast = function(target_type, safe = TRUE, ...) {
+ opts <- cast_options(safe, ...)
+ opts$to_type <- as_type(target_type)
+ call_function("cast", self, options = opts)
+ }
+ )
+)
+
+#' @export
+length.ArrowDatum <- function(x) x$length()
+
+#' @export
+is.finite.ArrowDatum <- function(x) {
+ is_fin <- call_function("is_finite", x)
+ # for compatibility with base::is.finite(), return FALSE for NA_real_
+ is_fin & !is.na(is_fin)
+}
+
+#' @export
+is.infinite.ArrowDatum <- function(x) {
+ is_inf <- call_function("is_inf", x)
+ # for compatibility with base::is.infinite(), return FALSE for NA_real_
+ is_inf & !is.na(is_inf)
+}
+
+#' @export
+is.na.ArrowDatum <- function(x) {
+ call_function("is_null", x, options = list(nan_is_null = TRUE))
+}
+
+#' @export
+is.nan.ArrowDatum <- function(x) {
+ if (x$type_id() %in% TYPES_WITH_NAN) {
+ # TODO: if an option is added to the is_nan kernel to treat NA as NaN,
+ # use that to simplify the code here (ARROW-13366)
+ call_function("is_nan", x) & call_function("is_valid", x)
+ } else {
+ Scalar$create(FALSE)$as_array(length(x))
+ }
+}
+
+#' @export
+as.vector.ArrowDatum <- function(x, mode) {
+ x$as_vector()
+}
+
+#' @export
+Ops.ArrowDatum <- function(e1, e2) {
+ if (.Generic == "!") {
+ eval_array_expression(.Generic, e1)
+ } else if (.Generic %in% names(.array_function_map)) {
+ eval_array_expression(.Generic, e1, e2)
+ } else {
+ stop(paste0("Unsupported operation on `", class(e1)[1L], "` : "), .Generic, call. = FALSE)
+ }
+}
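
The group generic routes base operators to Arrow compute kernels; an illustrative sketch:

    a <- Array$create(c(1, 2, NA))
    as.vector(a + 1)    # c(2, 3, NA), computed element-wise in Arrow
    as.vector(a > 1.5)  # c(FALSE, TRUE, NA)
    as.vector(!Array$create(c(TRUE, FALSE)))  # c(FALSE, TRUE); "!" is special-cased above
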
+
+# Wrapper around call_function that:
+# (1) maps R function names to their Arrow C++ compute equivalents ("/" --> "divide_checked")
+# (2) wraps bare R input args as Array or Scalar
+eval_array_expression <- function(FUN,
+ ...,
+ args = list(...),
+ options = empty_named_list()) {
+ if (FUN == "-" && length(args) == 1L) {
+ if (inherits(args[[1]], "ArrowObject")) {
+ return(eval_array_expression("negate_checked", args[[1]]))
+ } else {
+ return(-args[[1]])
+ }
+ }
+ args <- lapply(args, .wrap_arrow, FUN)
+
+ # In Arrow, "divide" is one function, which does integer division on
+ # integer inputs and floating-point division on floats
+ if (FUN == "/") {
+ # TODO: don't assume these types; casting all inputs to float64 is lossy
+ # for some types (e.g. large integers)
+ args <- map(args, ~ .$cast(float64()))
+ } else if (FUN == "%/%") {
+ # In R, integer division works like floor(float division); truncating the
+ # cast alone would round toward zero, so take the floor first
+ out <- eval_array_expression("/", args = args, options = options)
+ out <- eval_array_expression("floor", out)
+ return(out$cast(int32(), allow_float_truncate = TRUE))
+ } else if (FUN == "%%") {
+ # We can't simply do {e1 - e2 * ( e1 %/% e2 )} since Ops.ArrowDatum evaluates
+ # eagerly, but we can build the same expression up step by step
+ quotient <- eval_array_expression("%/%", args = args)
+ base <- eval_array_expression("*", quotient, args[[2]])
+ # cast to ensure that the result has the same type as e1
+ # (autocasting only applies to scalars)
+ base <- base$cast(args[[1]]$type)
+ return(eval_array_expression("-", args[[1]], base))
+ }
+
+ call_function(
+ .array_function_map[[FUN]] %||% FUN,
+ args = args,
+ options = options
+ )
+}
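
A sketch of the division special cases (positive values shown; both follow R's semantics):

    x <- Array$create(c(7L, 9L))
    as.vector(x %/% 2L)  # c(3L, 4L): floor of float division, cast back to integer
    as.vector(x %% 2L)   # c(1L, 1L): built up as x - 2 * (x %/% 2)
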
+
+.wrap_arrow <- function(arg, fun) {
+ if (!inherits(arg, "ArrowObject")) {
+ # TODO: Array$create if lengths are equal?
+ if (fun == "%in%") {
+ arg <- Array$create(arg)
+ } else {
+ arg <- Scalar$create(arg)
+ }
+ }
+ arg
+}
+
+#' @export
+na.omit.ArrowDatum <- function(object, ...) {
+ object$Filter(!is.na(object))
+}
+
+#' @export
+na.exclude.ArrowDatum <- na.omit.ArrowDatum
+
+#' @export
+na.fail.ArrowDatum <- function(object, ...) {
+ if (object$null_count > 0) {
+ stop("missing values in object", call. = FALSE)
+ }
+ object
+}
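
The NA handlers mirror their base R counterparts, e.g.:

    a <- Array$create(c(1, NA, 3))
    as.vector(na.omit(a))  # c(1, 3)
    na.fail(a)             # error: missing values in object
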
+
+filter_rows <- function(x, i, keep_na = TRUE, ...) {
+ # General-purpose function for `[` row subsetting with R semantics
+ # Based on the input for `i`, calls x$Filter, x$Slice, or x$Take
+ nrows <- x$num_rows %||% x$length() # Depends on whether Array or Table-like
+ if (is.logical(i)) {
+ if (isTRUE(i)) {
+ # Shortcut without doing any work
+ x
+ } else {
+ i <- rep_len(i, nrows) # For R recycling behavior; consider vctrs::vec_recycle()
+ x$Filter(i, keep_na)
+ }
+ } else if (is.numeric(i)) {
+ if (all(i < 0)) {
+ # in R, negative i means "everything but i"
+ i <- setdiff(seq_len(nrows), -1 * i)
+ }
+ if (is.sliceable(i)) {
+ x$Slice(i[1] - 1, length(i))
+ } else if (all(i > 0)) {
+ x$Take(i - 1)
+ } else {
+ stop("Cannot mix positive and negative indices", call. = FALSE)
+ }
+ } else if (is.Array(i, INTEGER_TYPES)) {
+ # NOTE: Arrow Take expects 0-based indices, so no -1 offset is applied here
+ x$Take(i)
+ } else if (is.Array(i, "bool")) {
+ x$Filter(i, keep_na)
+ } else {
+ # Unsupported cases
+ if (is.Array(i)) {
+ stop("Cannot extract rows with an Array of type ", i$type$ToString(), call. = FALSE)
+ }
+ stop("Cannot extract rows with an object of class ", class(i), call. = FALSE)
+ }
+}
+
+#' @export
+`[.ArrowDatum` <- filter_rows
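
filter_rows() picks the cheapest kernel for each shape of `i`; an illustrative sketch:

    a <- Array$create(10:15)
    as.vector(a[2:4])      # contiguous range -> zero-copy Slice: 11:13
    as.vector(a[c(5, 1)])  # arbitrary positions -> Take: c(14L, 10L)
    as.vector(a[-1])       # negative indices mean "all but": 11:15
    as.vector(a[a > 12])   # boolean Array -> Filter: 13:15
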
+
+#' @importFrom utils head
+#' @export
+head.ArrowDatum <- function(x, n = 6L, ...) {
+ assert_is(n, c("numeric", "integer"))
+ assert_that(length(n) == 1)
+ len <- NROW(x)
+ if (n < 0) {
+ # head(x, negative) means all but the last n rows
+ n <- max(len + n, 0)
+ } else {
+ n <- min(len, n)
+ }
+ if (n == len) {
+ return(x)
+ }
+ x$Slice(0, n)
+}
+
+#' @importFrom utils tail
+#' @export
+tail.ArrowDatum <- function(x, n = 6L, ...) {
+ assert_is(n, c("numeric", "integer"))
+ assert_that(length(n) == 1)
+ len <- NROW(x)
+ if (n < 0) {
+ # tail(x, negative) means all but the first n rows
+ n <- min(-n, len)
+ } else {
+ n <- max(len - n, 0)
+ }
+ if (n == 0) {
+ return(x)
+ }
+ x$Slice(n)
+}
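
head() and tail() translate a negative n the way base R does, then defer to Slice:

    a <- Array$create(1:10)
    as.vector(head(a, 3))   # 1:3
    as.vector(head(a, -8))  # all but the last 8: 1:2
    as.vector(tail(a, -8))  # all but the first 8: 9:10
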
+
+is.sliceable <- function(i) {
+ # Determine whether `i` can be expressed as a $Slice() command
+ is.numeric(i) &&
+ length(i) > 0 &&
+ all(i > 0) &&
+ i[1] <= i[length(i)] &&
+ identical(as.integer(i), i[1]:i[length(i)])
+}
+
+#' @export
+as.double.ArrowDatum <- function(x, ...) as.double(as.vector(x), ...)
+
+#' @export
+as.integer.ArrowDatum <- function(x, ...) as.integer(as.vector(x), ...)
+
+#' @export
+as.character.ArrowDatum <- function(x, ...) as.character(as.vector(x), ...)
+
+#' @export
+sort.ArrowDatum <- function(x, decreasing = FALSE, na.last = NA, ...) {
+ # Arrow always sorts nulls at the end of the array. This corresponds to
+ # sort(na.last = TRUE). For the other two cases (na.last = NA and
+ # na.last = FALSE) we need to use workarounds.
+ # TODO: Implement this more cleanly after ARROW-12063
+ if (is.na(na.last)) {
+ # Filter out NAs before sorting
+ x <- x$Filter(!is.na(x))
+ x$Take(x$SortIndices(descending = decreasing))
+ } else if (na.last) {
+ x$Take(x$SortIndices(descending = decreasing))
+ } else {
+ # Create a new array that encodes missing values as 1 and non-missing values
+ # as 0. Sort descending by that array first to get the NAs at the beginning
+ tbl <- Table$create(x = x, `is_na` = as.integer(is.na(x)))
+ tbl$x$Take(tbl$SortIndices(names = c("is_na", "x"), descending = c(TRUE, decreasing)))
+ }
+}
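
A sketch of the three na.last branches:

    a <- Array$create(c(2, NA, 1))
    as.vector(sort(a))                   # c(1, 2): na.last = NA drops nulls
    as.vector(sort(a, na.last = TRUE))   # c(1, 2, NA)
    as.vector(sort(a, na.last = FALSE))  # c(NA, 1, 2), via the Table workaround
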
diff --git a/src/arrow/r/R/arrow-package.R b/src/arrow/r/R/arrow-package.R
new file mode 100644
index 000000000..edc2652b6
--- /dev/null
+++ b/src/arrow/r/R/arrow-package.R
@@ -0,0 +1,351 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @importFrom stats quantile median na.omit na.exclude na.pass na.fail
+#' @importFrom R6 R6Class
+#' @importFrom purrr as_mapper map map2 map_chr map2_chr map_dfr map_int map_lgl keep imap imap_chr
+#' @importFrom assertthat assert_that is.string
+#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos
+#' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec
+#' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive
+#' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure
+#' @importFrom tidyselect vars_pull vars_rename vars_select eval_select
+#' @useDynLib arrow, .registration = TRUE
+#' @keywords internal
+"_PACKAGE"
+
+#' @importFrom vctrs s3_register vec_size vec_cast vec_unique
+.onLoad <- function(...) {
+ dplyr_methods <- paste0(
+ "dplyr::",
+ c(
+ "select", "filter", "collect", "summarise", "group_by", "groups",
+ "group_vars", "group_by_drop_default", "ungroup", "mutate", "transmute",
+ "arrange", "rename", "pull", "relocate", "compute", "collapse",
+ "distinct", "left_join", "right_join", "inner_join", "full_join",
+ "semi_join", "anti_join", "count", "tally"
+ )
+ )
+ for (cl in c("Dataset", "ArrowTabular", "arrow_dplyr_query")) {
+ for (m in dplyr_methods) {
+ s3_register(m, cl)
+ }
+ }
+ s3_register("dplyr::tbl_vars", "arrow_dplyr_query")
+
+ for (cl in c(
+ "Array", "RecordBatch", "ChunkedArray", "Table", "Schema",
+ "Field", "DataType", "RecordBatchReader"
+ )) {
+ s3_register("reticulate::py_to_r", paste0("pyarrow.lib.", cl))
+ s3_register("reticulate::r_to_py", cl)
+ }
+
+ # Create these once, at package load time
+ if (arrow_available()) {
+ # Also include all available Arrow Compute functions,
+ # each exposed with an arrow_ prefix.
+ # We can't do this at install time because list_compute_functions() may error
+ all_arrow_funs <- list_compute_functions()
+ arrow_funcs <- set_names(
+ lapply(all_arrow_funs, function(fun) {
+ force(fun)
+ function(...) build_expr(fun, ...)
+ }),
+ paste0("arrow_", all_arrow_funs)
+ )
+ .cache$functions <- c(nse_funcs, arrow_funcs)
+ }
+
+ if (tolower(Sys.info()[["sysname"]]) == "windows") {
+ # Disable multithreading on Windows
+ # See https://issues.apache.org/jira/browse/ARROW-8379
+ options(arrow.use_threads = FALSE)
+ }
+
+ invisible()
+}
+
+.onAttach <- function(libname, pkgname) {
+ if (!arrow_available()) {
+ msg <- paste(
+ "The Arrow C++ library is not available. To retry installation with debug output, run:",
+ " install_arrow(verbose = TRUE)",
+ "See https://arrow.apache.org/docs/r/articles/install.html for more guidance and troubleshooting.",
+ sep = "\n"
+ )
+ packageStartupMessage(msg)
+ } else {
+ # Just to be extra safe, let's wrap this in a try();
+ # we don't want a failed startup message to prevent the package from loading
+ try({
+ features <- arrow_info()$capabilities
+ # That has all of the #ifdef features, plus the compression libs and the
+ # string libraries (but not the memory allocators; those are added elsewhere)
+ #
+ # Let's print a message if some are off
+ if (some_features_are_off(features)) {
+ packageStartupMessage("See arrow_info() for available features")
+ }
+ })
+ }
+}
+
+#' Is the C++ Arrow library available?
+#'
+#' You won't generally need to call these functions, but they're made available
+#' for diagnostic purposes.
+#' @return `TRUE` or `FALSE` depending on whether the package was installed
+#' with:
+#' * The Arrow C++ library (check with `arrow_available()`)
+#' * Arrow Dataset support enabled (check with `arrow_with_dataset()`)
+#' * Parquet support enabled (check with `arrow_with_parquet()`)
+#' * JSON support enabled (check with `arrow_with_json()`)
+#' * Amazon S3 support enabled (check with `arrow_with_s3()`)
+#' @export
+#' @examples
+#' arrow_available()
+#' arrow_with_dataset()
+#' arrow_with_parquet()
+#' arrow_with_json()
+#' arrow_with_s3()
+#' @seealso If any of these are `FALSE`, see
+#' `vignette("install", package = "arrow")` for guidance on reinstalling the
+#' package.
+arrow_available <- function() {
+ tryCatch(.Call(`_arrow_available`), error = function(e) {
+ return(FALSE)
+ })
+}
+
+#' @rdname arrow_available
+#' @export
+arrow_with_dataset <- function() {
+ is_32bit <- .Machine$sizeof.pointer < 8
+ is_old_r <- getRversion() < "4.0.0"
+ is_windows <- tolower(Sys.info()[["sysname"]]) == "windows"
+ if (is_32bit && is_old_r && is_windows) {
+ # 32-bit rtools 3.5 does not properly implement the std::thread
+ # expectations, but we can't just disable ARROW_DATASET in that build,
+ # so we report it as "off" here.
+ return(FALSE)
+ }
+ tryCatch(.Call(`_dataset_available`), error = function(e) {
+ return(FALSE)
+ })
+}
+
+#' @rdname arrow_available
+#' @export
+arrow_with_parquet <- function() {
+ tryCatch(.Call(`_parquet_available`), error = function(e) {
+ return(FALSE)
+ })
+}
+
+#' @rdname arrow_available
+#' @export
+arrow_with_s3 <- function() {
+ tryCatch(.Call(`_s3_available`), error = function(e) {
+ return(FALSE)
+ })
+}
+
+#' @rdname arrow_available
+#' @export
+arrow_with_json <- function() {
+ tryCatch(.Call(`_json_available`), error = function(e) {
+ return(FALSE)
+ })
+}
+
+option_use_threads <- function() {
+ !is_false(getOption("arrow.use_threads"))
+}
+
+#' Report information on the package's capabilities
+#'
+#' This function summarizes a number of build-time configurations and run-time
+#' settings for the Arrow package. It may be useful for diagnostics.
+#' @return A list including version information, boolean "capabilities",
+#' statistics from Arrow's memory allocator, and Arrow's run-time
+#' information.
+#' @export
+#' @importFrom utils packageVersion
+arrow_info <- function() {
+ opts <- options()
+ out <- list(
+ version = packageVersion("arrow"),
+ libarrow = arrow_available(),
+ options = opts[grep("^arrow\\.", names(opts))]
+ )
+ if (out$libarrow) {
+ pool <- default_memory_pool()
+ runtimeinfo <- runtime_info()
+ buildinfo <- build_info()
+ compute_funcs <- list_compute_functions()
+ out <- c(out, list(
+ capabilities = c(
+ dataset = arrow_with_dataset(),
+ parquet = arrow_with_parquet(),
+ json = arrow_with_json(),
+ s3 = arrow_with_s3(),
+ utf8proc = "utf8_upper" %in% compute_funcs,
+ re2 = "replace_substring_regex" %in% compute_funcs,
+ vapply(tolower(names(CompressionType)[-1]), codec_is_available, logical(1))
+ ),
+ memory_pool = list(
+ backend_name = pool$backend_name,
+ bytes_allocated = pool$bytes_allocated,
+ max_memory = pool$max_memory,
+ available_backends = supported_memory_backends()
+ ),
+ runtime_info = list(
+ simd_level = runtimeinfo[1],
+ detected_simd_level = runtimeinfo[2]
+ ),
+ build_info = list(
+ cpp_version = buildinfo[1],
+ cpp_compiler = buildinfo[2],
+ cpp_compiler_version = buildinfo[3],
+ cpp_compiler_flags = buildinfo[4],
+ # git_id is "" if not built from a git checkout
+ # convert that to NULL
+ git_id = if (nzchar(buildinfo[5])) buildinfo[5]
+ )
+ ))
+ }
+ structure(out, class = "arrow_info")
+}
+
+some_features_are_off <- function(features) {
+ # `features` is a named logical vector (as in arrow_info()$capabilities)
+ # Let's exclude some less relevant ones
+ blocklist <- c("lzo", "bz2", "brotli")
+ # Return TRUE if any of the other features are FALSE
+ !all(features[setdiff(names(features), blocklist)])
+}
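
For instance, given the blocklist above:

    some_features_are_off(c(dataset = TRUE, parquet = TRUE, lzo = FALSE))  # FALSE
    some_features_are_off(c(dataset = FALSE, parquet = TRUE))              # TRUE
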
+
+#' @export
+print.arrow_info <- function(x, ...) {
+ print_key_values <- function(title, vals, ...) {
+ # Make a key-value table for printing, no column names
+ df <- data.frame(vals, stringsAsFactors = FALSE, ...)
+ names(df) <- ""
+
+ cat(title, ":\n", sep = "")
+ print(df)
+ cat("\n")
+ }
+ cat("Arrow package version: ", format(x$version), "\n\n", sep = "")
+ if (x$libarrow) {
+ print_key_values("Capabilities", c(
+ x$capabilities,
+ jemalloc = "jemalloc" %in% x$memory_pool$available_backends,
+ mimalloc = "mimalloc" %in% x$memory_pool$available_backends
+ ))
+ if (some_features_are_off(x$capabilities) && identical(tolower(Sys.info()[["sysname"]]), "linux")) {
+ # Only on Linux because we disable certain features on purpose on other
+ # platforms (e.g. rtools35 and Solaris)
+ cat(
+ "To reinstall with more optional capabilities enabled, see\n",
+ " https://arrow.apache.org/docs/r/articles/install.html\n\n"
+ )
+ }
+
+ if (length(x$options)) {
+ print_key_values("Arrow options()", map_chr(x$options, format))
+ }
+
+ format_bytes <- function(b, units = "auto", digits = 2L, ...) {
+ format(structure(b, class = "object_size"), units = units, digits = digits, ...)
+ }
+ print_key_values("Memory", c(
+ Allocator = x$memory_pool$backend_name,
+ # utils:::format.object_size is not properly vectorized
+ Current = format_bytes(x$memory_pool$bytes_allocated, ...),
+ Max = format_bytes(x$memory_pool$max_memory, ...)
+ ))
+ print_key_values("Runtime", c(
+ `SIMD Level` = x$runtime_info$simd_level,
+ `Detected SIMD Level` = x$runtime_info$detected_simd_level
+ ))
+ print_key_values("Build", c(
+ `C++ Library Version` = x$build_info$cpp_version,
+ `C++ Compiler` = x$build_info$cpp_compiler,
+ `C++ Compiler Version` = x$build_info$cpp_compiler_version,
+ `Git ID` = x$build_info$git_id
+ ))
+ } else {
+ cat(
+ "Arrow C++ library not available. See https://arrow.apache.org/docs/r/articles/install.html ",
+ "for troubleshooting.\n"
+ )
+ }
+ invisible(x)
+}
+
+option_compress_metadata <- function() {
+ !is_false(getOption("arrow.compress_metadata"))
+}
+
+#' @include enums.R
+ArrowObject <- R6Class("ArrowObject",
+ public = list(
+ initialize = function(xp) self$set_pointer(xp),
+ pointer = function() get(".:xp:.", envir = self),
+ `.:xp:.` = NULL,
+ set_pointer = function(xp) {
+ if (!inherits(xp, "externalptr")) {
+ stop(
+ class(self)[1], "$new() requires a pointer as input: ",
+ "did you mean $create() instead?",
+ call. = FALSE
+ )
+ }
+ assign(".:xp:.", xp, envir = self)
+ },
+ print = function(...) {
+ if (!is.null(self$.class_title)) {
+ # Allow subclasses to override what is printed as the first line
+ # (by default, the class name)
+ class_title <- self$.class_title()
+ } else {
+ class_title <- class(self)[[1]]
+ }
+ cat(class_title, "\n", sep = "")
+ if (!is.null(self$ToString)) {
+ cat(self$ToString(), "\n", sep = "")
+ }
+ invisible(self)
+ },
+ invalidate = function() {
+ assign(".:xp:.", NULL, envir = self)
+ }
+ )
+)
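
ArrowObject wraps an external pointer handed over from the C++ layer, so user code goes through each subclass's create() method rather than $new(); a quick sketch (the error message comes from set_pointer above):

    a <- Array$create(1:3)
    class(a$pointer())  # "externalptr"
    Array$new(1:3)      # error: requires a pointer; did you mean $create()?
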
+
+#' @export
+`!=.ArrowObject` <- function(lhs, rhs) !(lhs == rhs) # nolint
+
+#' @export
+`==.ArrowObject` <- function(x, y) { # nolint
+ x$Equals(y)
+}
+
+#' @export
+all.equal.ArrowObject <- function(target, current, ..., check.attributes = TRUE) {
+ target$Equals(current, check_metadata = check.attributes)
+}
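
These methods defer to the underlying Equals(); for example, on DataType objects:

    int32() == int32()  # TRUE
    int32() != int64()  # TRUE
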
diff --git a/src/arrow/r/R/arrow-tabular.R b/src/arrow/r/R/arrow-tabular.R
new file mode 100644
index 000000000..43110ccf2
--- /dev/null
+++ b/src/arrow/r/R/arrow-tabular.R
@@ -0,0 +1,272 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-datum.R
+
+# Base class for RecordBatch and Table, for S3 method dispatch only.
+# Does not exist in the C++ class hierarchy
+ArrowTabular <- R6Class("ArrowTabular",
+ inherit = ArrowObject,
+ public = list(
+ ToString = function() {
+ sch <- unlist(strsplit(self$schema$ToString(), "\n"))
+ sch <- sub("(.*): (.*)", "$\\1 <\\2>", sch)
+ dims <- sprintf("%s rows x %s columns", self$num_rows, self$num_columns)
+ paste(c(dims, sch), collapse = "\n")
+ },
+ Take = function(i) {
+ if (is.numeric(i)) {
+ i <- as.integer(i)
+ }
+ if (is.integer(i)) {
+ i <- Array$create(i)
+ }
+ assert_that(is.Array(i))
+ call_function("take", self, i)
+ },
+ Filter = function(i, keep_na = TRUE) {
+ if (is.logical(i)) {
+ i <- Array$create(i)
+ }
+ assert_that(is.Array(i, "bool"))
+ call_function("filter", self, i, options = list(keep_na = keep_na))
+ },
+ SortIndices = function(names, descending = FALSE) {
+ assert_that(is.character(names))
+ assert_that(length(names) > 0)
+ assert_that(!any(is.na(names)))
+ if (length(descending) == 1L) {
+ descending <- rep_len(descending, length(names))
+ }
+ assert_that(is.logical(descending))
+ assert_that(identical(length(names), length(descending)))
+ assert_that(!any(is.na(descending)))
+ call_function(
+ "sort_indices",
+ self,
+ # cpp11 does not support logical vectors, so convert to integer
+ options = list(names = names, orders = as.integer(descending))
+ )
+ }
+ ),
+ active = list(
+ metadata = function(new) {
+ if (missing(new)) {
+ # Get the metadata (from the schema)
+ self$schema$metadata
+ } else {
+ # Set the metadata
+ new <- prepare_key_value_metadata(new)
+ out <- self$ReplaceSchemaMetadata(new)
+ # ReplaceSchemaMetadata returns a new object, but we're modifying in place,
+ # so swap the new C++ object's pointer into our R6 object
+ self$set_pointer(out$pointer())
+ self
+ }
+ },
+ r_metadata = function(new) {
+ # Helper for the R metadata that handles the serialization
+ # See also method on Schema
+ if (missing(new)) {
+ out <- self$metadata$r
+ if (!is.null(out)) {
+ # Can't unserialize NULL
+ out <- .unserialize_arrow_r_metadata(out)
+ }
+ # Returns either NULL or a named list
+ out
+ } else {
+ # Set the R metadata
+ self$metadata$r <- .serialize_arrow_r_metadata(new)
+ self
+ }
+ }
+ )
+)
+
+#' @export
+as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) {
+ df <- x$to_data_frame()
+
+ if (!is.null(r_metadata <- x$metadata$r)) {
+ df <- apply_arrow_r_metadata(df, .unserialize_arrow_r_metadata(r_metadata))
+ }
+ df
+}
+
+#' @export
+`names<-.ArrowTabular` <- function(x, value) x$RenameColumns(value)
+
+#' @importFrom methods as
+#' @export
+`[.ArrowTabular` <- function(x, i, j, ..., drop = FALSE) {
+ if (nargs() == 2L) {
+ # List-like column extraction (x[i])
+ return(x[, i])
+ }
+ if (!missing(j)) {
+ # Selecting columns is cheaper than filtering rows, so do it first.
+ # That way, if we're filtering too, we have fewer arrays to filter/slice/take
+ if (is.character(j)) {
+ j_new <- match(j, names(x))
+ if (any(is.na(j_new))) {
+ stop("Column not found: ", oxford_paste(j[is.na(j_new)]), call. = FALSE)
+ }
+ j <- j_new
+ }
+ if (is_integerish(j)) {
+ if (any(is.na(j))) {
+ stop("Column indices cannot be NA", call. = FALSE)
+ }
+ if (length(j) && all(j < 0)) {
+ # in R, negative j means "everything but j"
+ j <- setdiff(seq_len(x$num_columns), -1 * j)
+ }
+ x <- x$SelectColumns(as.integer(j) - 1L)
+ }
+
+ if (drop && ncol(x) == 1L) {
+ x <- x$column(0)
+ }
+ }
+ if (!missing(i)) {
+ x <- filter_rows(x, i, ...)
+ }
+ x
+}
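
The two-argument form selects columns before filtering rows; a usage sketch:

    tbl <- Table$create(x = 1:4, y = letters[1:4])
    tbl[1:2, "y"]            # rows 1-2 of column y, still a Table
    tbl[, "x", drop = TRUE]  # a single column drops to a ChunkedArray
    tbl["x"]                 # one-argument form: list-like column selection
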
+
+#' @export
+`[[.ArrowTabular` <- function(x, i, ...) {
+ if (is.character(i)) {
+ x$GetColumnByName(i)
+ } else if (is.numeric(i)) {
+ x$column(i - 1)
+ } else {
+ stop("'i' must be character or numeric, not ", class(i), call. = FALSE)
+ }
+}
+
+#' @export
+`$.ArrowTabular` <- function(x, name, ...) {
+ assert_that(is.string(name))
+ if (name %in% ls(x)) {
+ get(name, x)
+ } else {
+ x$GetColumnByName(name)
+ }
+}
+
+#' @export
+`[[<-.ArrowTabular` <- function(x, i, value) {
+ if (!is.character(i) & !is.numeric(i)) {
+ stop("'i' must be character or numeric, not ", class(i), call. = FALSE)
+ }
+ assert_that(length(i) == 1, !is.na(i))
+
+ if (is.null(value)) {
+ if (is.character(i)) {
+ i <- match(i, names(x))
+ }
+ x <- x$RemoveColumn(i - 1L)
+ } else {
+ if (!is.character(i)) {
+ # get or create the column name
+ if (i <= x$num_columns) {
+ i <- names(x)[i]
+ } else {
+ i <- as.character(i)
+ }
+ }
+
+ # auto-magic recycling on non-ArrowObjects
+ if (!inherits(value, "ArrowObject")) {
+ value <- vctrs::vec_recycle(value, x$num_rows)
+ }
+
+ # construct the field
+ if (inherits(x, "RecordBatch") && !inherits(value, "Array")) {
+ value <- Array$create(value)
+ } else if (inherits(x, "Table") && !inherits(value, "ChunkedArray")) {
+ value <- ChunkedArray$create(value)
+ }
+ new_field <- field(i, value$type)
+
+ if (i %in% names(x)) {
+ i <- match(i, names(x)) - 1L
+ x <- x$SetColumn(i, new_field, value)
+ } else {
+ i <- x$num_columns
+ x <- x$AddColumn(i, new_field, value)
+ }
+ }
+ x
+}
+
+#' @export
+`$<-.ArrowTabular` <- function(x, i, value) {
+ assert_that(is.string(i))
+ # We need to check if `i` is in names in case it is an active binding
+ # (e.g. `metadata`); if so, we use assign to change the active binding
+ # instead of the column in the table
+ if (i %in% ls(x)) {
+ assign(i, value, x)
+ } else {
+ x[[i]] <- value
+ }
+ x
+}
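
Assignment follows the same name-or-position rules; a sketch of adding, replacing, and removing columns:

    tbl <- Table$create(x = 1:3)
    tbl$y <- c("a", "b", "c")  # recycled, converted to ChunkedArray, AddColumn
    tbl[["x"]] <- 4:6          # existing name -> SetColumn
    tbl[["x"]] <- NULL         # NULL removes the column
    names(tbl)                 # "y"
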
+
+#' @export
+dim.ArrowTabular <- function(x) c(x$num_rows, x$num_columns)
+
+#' @export
+length.ArrowTabular <- function(x) x$num_columns
+
+#' @export
+as.list.ArrowTabular <- function(x, ...) as.list(as.data.frame(x, ...))
+
+#' @export
+row.names.ArrowTabular <- function(x) as.character(seq_len(nrow(x)))
+
+#' @export
+dimnames.ArrowTabular <- function(x) list(row.names(x), names(x))
+
+#' @export
+head.ArrowTabular <- head.ArrowDatum
+
+#' @export
+tail.ArrowTabular <- tail.ArrowDatum
+
+#' @export
+na.fail.ArrowTabular <- function(object, ...) {
+ for (col in seq_len(object$num_columns)) {
+ if (object$column(col - 1L)$null_count > 0) {
+ stop("missing values in object", call. = FALSE)
+ }
+ }
+ object
+}
+
+#' @export
+na.omit.ArrowTabular <- function(object, ...) {
+ not_na <- map(object$columns, ~ call_function("is_valid", .x))
+ not_na_agg <- Reduce("&", not_na)
+ object$Filter(not_na_agg)
+}
+
+#' @export
+na.exclude.ArrowTabular <- na.omit.ArrowTabular
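
na.omit() keeps only the rows that are valid in every column, e.g.:

    tbl <- Table$create(x = c(1, NA), y = c("a", "b"))
    as.data.frame(na.omit(tbl))  # one row: x = 1, y = "a"
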
diff --git a/src/arrow/r/R/arrowExports.R b/src/arrow/r/R/arrowExports.R
new file mode 100644
index 000000000..014b1641f
--- /dev/null
+++ b/src/arrow/r/R/arrowExports.R
@@ -0,0 +1,1801 @@
+# Generated using data-raw/codegen.R -> do not edit by hand
+
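
Each generated binding is a thin .Call() shim over a registered C++ entry point; the R6 methods defined elsewhere in the package call them with 0-based indices. An illustrative pair:

    a <- Array$create(1:10)
    Array__length(a)     # 10, the value a$length() ultimately returns
    Array__IsNull(a, 0)  # FALSE; note the 0-based index
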
+test_SET_STRING_ELT <- function(s) {
+ invisible(.Call(`_arrow_test_SET_STRING_ELT`, s))
+}
+
+test_same_Array <- function(x, y) {
+ .Call(`_arrow_test_same_Array`, x, y)
+}
+
+is_arrow_altrep <- function(x) {
+ .Call(`_arrow_is_arrow_altrep`, x)
+}
+
+Array__Slice1 <- function(array, offset) {
+ .Call(`_arrow_Array__Slice1`, array, offset)
+}
+
+Array__Slice2 <- function(array, offset, length) {
+ .Call(`_arrow_Array__Slice2`, array, offset, length)
+}
+
+Array__IsNull <- function(x, i) {
+ .Call(`_arrow_Array__IsNull`, x, i)
+}
+
+Array__IsValid <- function(x, i) {
+ .Call(`_arrow_Array__IsValid`, x, i)
+}
+
+Array__length <- function(x) {
+ .Call(`_arrow_Array__length`, x)
+}
+
+Array__offset <- function(x) {
+ .Call(`_arrow_Array__offset`, x)
+}
+
+Array__null_count <- function(x) {
+ .Call(`_arrow_Array__null_count`, x)
+}
+
+Array__type <- function(x) {
+ .Call(`_arrow_Array__type`, x)
+}
+
+Array__ToString <- function(x) {
+ .Call(`_arrow_Array__ToString`, x)
+}
+
+Array__type_id <- function(x) {
+ .Call(`_arrow_Array__type_id`, x)
+}
+
+Array__Equals <- function(lhs, rhs) {
+ .Call(`_arrow_Array__Equals`, lhs, rhs)
+}
+
+Array__ApproxEquals <- function(lhs, rhs) {
+ .Call(`_arrow_Array__ApproxEquals`, lhs, rhs)
+}
+
+Array__Diff <- function(lhs, rhs) {
+ .Call(`_arrow_Array__Diff`, lhs, rhs)
+}
+
+Array__data <- function(array) {
+ .Call(`_arrow_Array__data`, array)
+}
+
+Array__RangeEquals <- function(self, other, start_idx, end_idx, other_start_idx) {
+ .Call(`_arrow_Array__RangeEquals`, self, other, start_idx, end_idx, other_start_idx)
+}
+
+Array__View <- function(array, type) {
+ .Call(`_arrow_Array__View`, array, type)
+}
+
+Array__Validate <- function(array) {
+ invisible(.Call(`_arrow_Array__Validate`, array))
+}
+
+DictionaryArray__indices <- function(array) {
+ .Call(`_arrow_DictionaryArray__indices`, array)
+}
+
+DictionaryArray__dictionary <- function(array) {
+ .Call(`_arrow_DictionaryArray__dictionary`, array)
+}
+
+StructArray__field <- function(array, i) {
+ .Call(`_arrow_StructArray__field`, array, i)
+}
+
+StructArray__GetFieldByName <- function(array, name) {
+ .Call(`_arrow_StructArray__GetFieldByName`, array, name)
+}
+
+StructArray__Flatten <- function(array) {
+ .Call(`_arrow_StructArray__Flatten`, array)
+}
+
+ListArray__value_type <- function(array) {
+ .Call(`_arrow_ListArray__value_type`, array)
+}
+
+LargeListArray__value_type <- function(array) {
+ .Call(`_arrow_LargeListArray__value_type`, array)
+}
+
+ListArray__values <- function(array) {
+ .Call(`_arrow_ListArray__values`, array)
+}
+
+LargeListArray__values <- function(array) {
+ .Call(`_arrow_LargeListArray__values`, array)
+}
+
+ListArray__value_length <- function(array, i) {
+ .Call(`_arrow_ListArray__value_length`, array, i)
+}
+
+LargeListArray__value_length <- function(array, i) {
+ .Call(`_arrow_LargeListArray__value_length`, array, i)
+}
+
+FixedSizeListArray__value_length <- function(array, i) {
+ .Call(`_arrow_FixedSizeListArray__value_length`, array, i)
+}
+
+ListArray__value_offset <- function(array, i) {
+ .Call(`_arrow_ListArray__value_offset`, array, i)
+}
+
+LargeListArray__value_offset <- function(array, i) {
+ .Call(`_arrow_LargeListArray__value_offset`, array, i)
+}
+
+FixedSizeListArray__value_offset <- function(array, i) {
+ .Call(`_arrow_FixedSizeListArray__value_offset`, array, i)
+}
+
+ListArray__raw_value_offsets <- function(array) {
+ .Call(`_arrow_ListArray__raw_value_offsets`, array)
+}
+
+LargeListArray__raw_value_offsets <- function(array) {
+ .Call(`_arrow_LargeListArray__raw_value_offsets`, array)
+}
+
+Array__as_vector <- function(array) {
+ .Call(`_arrow_Array__as_vector`, array)
+}
+
+ChunkedArray__as_vector <- function(chunked_array, use_threads) {
+ .Call(`_arrow_ChunkedArray__as_vector`, chunked_array, use_threads)
+}
+
+RecordBatch__to_dataframe <- function(batch, use_threads) {
+ .Call(`_arrow_RecordBatch__to_dataframe`, batch, use_threads)
+}
+
+Table__to_dataframe <- function(table, use_threads) {
+ .Call(`_arrow_Table__to_dataframe`, table, use_threads)
+}
+
+ArrayData__get_type <- function(x) {
+ .Call(`_arrow_ArrayData__get_type`, x)
+}
+
+ArrayData__get_length <- function(x) {
+ .Call(`_arrow_ArrayData__get_length`, x)
+}
+
+ArrayData__get_null_count <- function(x) {
+ .Call(`_arrow_ArrayData__get_null_count`, x)
+}
+
+ArrayData__get_offset <- function(x) {
+ .Call(`_arrow_ArrayData__get_offset`, x)
+}
+
+ArrayData__buffers <- function(x) {
+ .Call(`_arrow_ArrayData__buffers`, x)
+}
+
+Buffer__is_mutable <- function(buffer) {
+ .Call(`_arrow_Buffer__is_mutable`, buffer)
+}
+
+Buffer__ZeroPadding <- function(buffer) {
+ invisible(.Call(`_arrow_Buffer__ZeroPadding`, buffer))
+}
+
+Buffer__capacity <- function(buffer) {
+ .Call(`_arrow_Buffer__capacity`, buffer)
+}
+
+Buffer__size <- function(buffer) {
+ .Call(`_arrow_Buffer__size`, buffer)
+}
+
+r___RBuffer__initialize <- function(x) {
+ .Call(`_arrow_r___RBuffer__initialize`, x)
+}
+
+Buffer__data <- function(buffer) {
+ .Call(`_arrow_Buffer__data`, buffer)
+}
+
+Buffer__Equals <- function(x, y) {
+ .Call(`_arrow_Buffer__Equals`, x, y)
+}
+
+ChunkedArray__length <- function(chunked_array) {
+ .Call(`_arrow_ChunkedArray__length`, chunked_array)
+}
+
+ChunkedArray__null_count <- function(chunked_array) {
+ .Call(`_arrow_ChunkedArray__null_count`, chunked_array)
+}
+
+ChunkedArray__num_chunks <- function(chunked_array) {
+ .Call(`_arrow_ChunkedArray__num_chunks`, chunked_array)
+}
+
+ChunkedArray__chunk <- function(chunked_array, i) {
+ .Call(`_arrow_ChunkedArray__chunk`, chunked_array, i)
+}
+
+ChunkedArray__chunks <- function(chunked_array) {
+ .Call(`_arrow_ChunkedArray__chunks`, chunked_array)
+}
+
+ChunkedArray__type <- function(chunked_array) {
+ .Call(`_arrow_ChunkedArray__type`, chunked_array)
+}
+
+ChunkedArray__Slice1 <- function(chunked_array, offset) {
+ .Call(`_arrow_ChunkedArray__Slice1`, chunked_array, offset)
+}
+
+ChunkedArray__Slice2 <- function(chunked_array, offset, length) {
+ .Call(`_arrow_ChunkedArray__Slice2`, chunked_array, offset, length)
+}
+
+ChunkedArray__View <- function(array, type) {
+ .Call(`_arrow_ChunkedArray__View`, array, type)
+}
+
+ChunkedArray__Validate <- function(chunked_array) {
+ invisible(.Call(`_arrow_ChunkedArray__Validate`, chunked_array))
+}
+
+ChunkedArray__Equals <- function(x, y) {
+ .Call(`_arrow_ChunkedArray__Equals`, x, y)
+}
+
+ChunkedArray__ToString <- function(x) {
+ .Call(`_arrow_ChunkedArray__ToString`, x)
+}
+
+ChunkedArray__from_list <- function(chunks, s_type) {
+ .Call(`_arrow_ChunkedArray__from_list`, chunks, s_type)
+}
+
+util___Codec__Create <- function(codec, compression_level) {
+ .Call(`_arrow_util___Codec__Create`, codec, compression_level)
+}
+
+util___Codec__name <- function(codec) {
+ .Call(`_arrow_util___Codec__name`, codec)
+}
+
+util___Codec__IsAvailable <- function(codec) {
+ .Call(`_arrow_util___Codec__IsAvailable`, codec)
+}
+
+io___CompressedOutputStream__Make <- function(codec, raw) {
+ .Call(`_arrow_io___CompressedOutputStream__Make`, codec, raw)
+}
+
+io___CompressedInputStream__Make <- function(codec, raw) {
+ .Call(`_arrow_io___CompressedInputStream__Make`, codec, raw)
+}
+
+ExecPlan_create <- function(use_threads) {
+ .Call(`_arrow_ExecPlan_create`, use_threads)
+}
+
+ExecPlan_run <- function(plan, final_node, sort_options, head) {
+ .Call(`_arrow_ExecPlan_run`, plan, final_node, sort_options, head)
+}
+
+ExecPlan_StopProducing <- function(plan) {
+ invisible(.Call(`_arrow_ExecPlan_StopProducing`, plan))
+}
+
+ExecNode_output_schema <- function(node) {
+ .Call(`_arrow_ExecNode_output_schema`, node)
+}
+
+ExecNode_Scan <- function(plan, dataset, filter, materialized_field_names) {
+ .Call(`_arrow_ExecNode_Scan`, plan, dataset, filter, materialized_field_names)
+}
+
+ExecNode_Filter <- function(input, filter) {
+ .Call(`_arrow_ExecNode_Filter`, input, filter)
+}
+
+ExecNode_Project <- function(input, exprs, names) {
+ .Call(`_arrow_ExecNode_Project`, input, exprs, names)
+}
+
+ExecNode_Aggregate <- function(input, options, target_names, out_field_names, key_names) {
+ .Call(`_arrow_ExecNode_Aggregate`, input, options, target_names, out_field_names, key_names)
+}
+
+ExecNode_Join <- function(input, type, right_data, left_keys, right_keys, left_output, right_output) {
+ .Call(`_arrow_ExecNode_Join`, input, type, right_data, left_keys, right_keys, left_output, right_output)
+}
+
+ExecNode_ReadFromRecordBatchReader <- function(plan, reader) {
+ .Call(`_arrow_ExecNode_ReadFromRecordBatchReader`, plan, reader)
+}
+
+RecordBatch__cast <- function(batch, schema, options) {
+ .Call(`_arrow_RecordBatch__cast`, batch, schema, options)
+}
+
+Table__cast <- function(table, schema, options) {
+ .Call(`_arrow_Table__cast`, table, schema, options)
+}
+
+compute__CallFunction <- function(func_name, args, options) {
+ .Call(`_arrow_compute__CallFunction`, func_name, args, options)
+}
+
+compute__GetFunctionNames <- function() {
+ .Call(`_arrow_compute__GetFunctionNames`)
+}
+
+build_info <- function() {
+ .Call(`_arrow_build_info`)
+}
+
+runtime_info <- function() {
+ .Call(`_arrow_runtime_info`)
+}
+
+csv___WriteOptions__initialize <- function(options) {
+ .Call(`_arrow_csv___WriteOptions__initialize`, options)
+}
+
+csv___ReadOptions__initialize <- function(options) {
+ .Call(`_arrow_csv___ReadOptions__initialize`, options)
+}
+
+csv___ParseOptions__initialize <- function(options) {
+ .Call(`_arrow_csv___ParseOptions__initialize`, options)
+}
+
+csv___ReadOptions__column_names <- function(options) {
+ .Call(`_arrow_csv___ReadOptions__column_names`, options)
+}
+
+csv___ConvertOptions__initialize <- function(options) {
+ .Call(`_arrow_csv___ConvertOptions__initialize`, options)
+}
+
+csv___TableReader__Make <- function(input, read_options, parse_options, convert_options) {
+ .Call(`_arrow_csv___TableReader__Make`, input, read_options, parse_options, convert_options)
+}
+
+csv___TableReader__Read <- function(table_reader) {
+ .Call(`_arrow_csv___TableReader__Read`, table_reader)
+}
+
+TimestampParser__kind <- function(parser) {
+ .Call(`_arrow_TimestampParser__kind`, parser)
+}
+
+TimestampParser__format <- function(parser) {
+ .Call(`_arrow_TimestampParser__format`, parser)
+}
+
+TimestampParser__MakeStrptime <- function(format) {
+ .Call(`_arrow_TimestampParser__MakeStrptime`, format)
+}
+
+TimestampParser__MakeISO8601 <- function() {
+ .Call(`_arrow_TimestampParser__MakeISO8601`)
+}
+
+csv___WriteCSV__Table <- function(table, write_options, stream) {
+ invisible(.Call(`_arrow_csv___WriteCSV__Table`, table, write_options, stream))
+}
+
+csv___WriteCSV__RecordBatch <- function(record_batch, write_options, stream) {
+ invisible(.Call(`_arrow_csv___WriteCSV__RecordBatch`, record_batch, write_options, stream))
+}
+
+dataset___Dataset__NewScan <- function(ds) {
+ .Call(`_arrow_dataset___Dataset__NewScan`, ds)
+}
+
+dataset___Dataset__schema <- function(dataset) {
+ .Call(`_arrow_dataset___Dataset__schema`, dataset)
+}
+
+dataset___Dataset__type_name <- function(dataset) {
+ .Call(`_arrow_dataset___Dataset__type_name`, dataset)
+}
+
+dataset___Dataset__ReplaceSchema <- function(dataset, schm) {
+ .Call(`_arrow_dataset___Dataset__ReplaceSchema`, dataset, schm)
+}
+
+dataset___UnionDataset__create <- function(datasets, schm) {
+ .Call(`_arrow_dataset___UnionDataset__create`, datasets, schm)
+}
+
+dataset___InMemoryDataset__create <- function(table) {
+ .Call(`_arrow_dataset___InMemoryDataset__create`, table)
+}
+
+dataset___UnionDataset__children <- function(ds) {
+ .Call(`_arrow_dataset___UnionDataset__children`, ds)
+}
+
+dataset___FileSystemDataset__format <- function(dataset) {
+ .Call(`_arrow_dataset___FileSystemDataset__format`, dataset)
+}
+
+dataset___FileSystemDataset__filesystem <- function(dataset) {
+ .Call(`_arrow_dataset___FileSystemDataset__filesystem`, dataset)
+}
+
+dataset___FileSystemDataset__files <- function(dataset) {
+ .Call(`_arrow_dataset___FileSystemDataset__files`, dataset)
+}
+
+dataset___DatasetFactory__Finish1 <- function(factory, unify_schemas) {
+ .Call(`_arrow_dataset___DatasetFactory__Finish1`, factory, unify_schemas)
+}
+
+dataset___DatasetFactory__Finish2 <- function(factory, schema) {
+ .Call(`_arrow_dataset___DatasetFactory__Finish2`, factory, schema)
+}
+
+dataset___DatasetFactory__Inspect <- function(factory, unify_schemas) {
+ .Call(`_arrow_dataset___DatasetFactory__Inspect`, factory, unify_schemas)
+}
+
+dataset___UnionDatasetFactory__Make <- function(children) {
+ .Call(`_arrow_dataset___UnionDatasetFactory__Make`, children)
+}
+
+dataset___FileSystemDatasetFactory__Make0 <- function(fs, paths, format) {
+ .Call(`_arrow_dataset___FileSystemDatasetFactory__Make0`, fs, paths, format)
+}
+
+dataset___FileSystemDatasetFactory__Make2 <- function(fs, selector, format, partitioning) {
+ .Call(`_arrow_dataset___FileSystemDatasetFactory__Make2`, fs, selector, format, partitioning)
+}
+
+dataset___FileSystemDatasetFactory__Make1 <- function(fs, selector, format) {
+ .Call(`_arrow_dataset___FileSystemDatasetFactory__Make1`, fs, selector, format)
+}
+
+dataset___FileSystemDatasetFactory__Make3 <- function(fs, selector, format, factory) {
+ .Call(`_arrow_dataset___FileSystemDatasetFactory__Make3`, fs, selector, format, factory)
+}
+
+dataset___FileFormat__type_name <- function(format) {
+ .Call(`_arrow_dataset___FileFormat__type_name`, format)
+}
+
+dataset___FileFormat__DefaultWriteOptions <- function(fmt) {
+ .Call(`_arrow_dataset___FileFormat__DefaultWriteOptions`, fmt)
+}
+
+dataset___ParquetFileFormat__Make <- function(options, dict_columns) {
+ .Call(`_arrow_dataset___ParquetFileFormat__Make`, options, dict_columns)
+}
+
+dataset___FileWriteOptions__type_name <- function(options) {
+ .Call(`_arrow_dataset___FileWriteOptions__type_name`, options)
+}
+
+dataset___ParquetFileWriteOptions__update <- function(options, writer_props, arrow_writer_props) {
+ invisible(.Call(`_arrow_dataset___ParquetFileWriteOptions__update`, options, writer_props, arrow_writer_props))
+}
+
+dataset___IpcFileWriteOptions__update2 <- function(ipc_options, use_legacy_format, codec, metadata_version) {
+ invisible(.Call(`_arrow_dataset___IpcFileWriteOptions__update2`, ipc_options, use_legacy_format, codec, metadata_version))
+}
+
+dataset___IpcFileWriteOptions__update1 <- function(ipc_options, use_legacy_format, metadata_version) {
+ invisible(.Call(`_arrow_dataset___IpcFileWriteOptions__update1`, ipc_options, use_legacy_format, metadata_version))
+}
+
+dataset___CsvFileWriteOptions__update <- function(csv_options, write_options) {
+ invisible(.Call(`_arrow_dataset___CsvFileWriteOptions__update`, csv_options, write_options))
+}
+
+dataset___IpcFileFormat__Make <- function() {
+ .Call(`_arrow_dataset___IpcFileFormat__Make`)
+}
+
+dataset___CsvFileFormat__Make <- function(parse_options, convert_options, read_options) {
+ .Call(`_arrow_dataset___CsvFileFormat__Make`, parse_options, convert_options, read_options)
+}
+
+dataset___FragmentScanOptions__type_name <- function(fragment_scan_options) {
+ .Call(`_arrow_dataset___FragmentScanOptions__type_name`, fragment_scan_options)
+}
+
+dataset___CsvFragmentScanOptions__Make <- function(convert_options, read_options) {
+ .Call(`_arrow_dataset___CsvFragmentScanOptions__Make`, convert_options, read_options)
+}
+
+dataset___ParquetFragmentScanOptions__Make <- function(use_buffered_stream, buffer_size, pre_buffer) {
+ .Call(`_arrow_dataset___ParquetFragmentScanOptions__Make`, use_buffered_stream, buffer_size, pre_buffer)
+}
+
+dataset___DirectoryPartitioning <- function(schm, segment_encoding) {
+ .Call(`_arrow_dataset___DirectoryPartitioning`, schm, segment_encoding)
+}
+
+dataset___DirectoryPartitioning__MakeFactory <- function(field_names, segment_encoding) {
+ .Call(`_arrow_dataset___DirectoryPartitioning__MakeFactory`, field_names, segment_encoding)
+}
+
+dataset___HivePartitioning <- function(schm, null_fallback, segment_encoding) {
+ .Call(`_arrow_dataset___HivePartitioning`, schm, null_fallback, segment_encoding)
+}
+
+dataset___HivePartitioning__MakeFactory <- function(null_fallback, segment_encoding) {
+ .Call(`_arrow_dataset___HivePartitioning__MakeFactory`, null_fallback, segment_encoding)
+}
+
+dataset___ScannerBuilder__ProjectNames <- function(sb, cols) {
+ invisible(.Call(`_arrow_dataset___ScannerBuilder__ProjectNames`, sb, cols))
+}
+
+dataset___ScannerBuilder__ProjectExprs <- function(sb, exprs, names) {
+ invisible(.Call(`_arrow_dataset___ScannerBuilder__ProjectExprs`, sb, exprs, names))
+}
+
+dataset___ScannerBuilder__Filter <- function(sb, expr) {
+ invisible(.Call(`_arrow_dataset___ScannerBuilder__Filter`, sb, expr))
+}
+
+dataset___ScannerBuilder__UseThreads <- function(sb, threads) {
+ invisible(.Call(`_arrow_dataset___ScannerBuilder__UseThreads`, sb, threads))
+}
+
+dataset___ScannerBuilder__UseAsync <- function(sb, use_async) {
+ invisible(.Call(`_arrow_dataset___ScannerBuilder__UseAsync`, sb, use_async))
+}
+
+dataset___ScannerBuilder__BatchSize <- function(sb, batch_size) {
+ invisible(.Call(`_arrow_dataset___ScannerBuilder__BatchSize`, sb, batch_size))
+}
+
+dataset___ScannerBuilder__FragmentScanOptions <- function(sb, options) {
+ invisible(.Call(`_arrow_dataset___ScannerBuilder__FragmentScanOptions`, sb, options))
+}
+
+dataset___ScannerBuilder__schema <- function(sb) {
+ .Call(`_arrow_dataset___ScannerBuilder__schema`, sb)
+}
+
+dataset___ScannerBuilder__Finish <- function(sb) {
+ .Call(`_arrow_dataset___ScannerBuilder__Finish`, sb)
+}
+
+dataset___ScannerBuilder__FromRecordBatchReader <- function(reader) {
+ .Call(`_arrow_dataset___ScannerBuilder__FromRecordBatchReader`, reader)
+}
+
+dataset___Scanner__ToTable <- function(scanner) {
+ .Call(`_arrow_dataset___Scanner__ToTable`, scanner)
+}
+
+dataset___Scanner__ScanBatches <- function(scanner) {
+ .Call(`_arrow_dataset___Scanner__ScanBatches`, scanner)
+}
+
+dataset___Scanner__ToRecordBatchReader <- function(scanner) {
+ .Call(`_arrow_dataset___Scanner__ToRecordBatchReader`, scanner)
+}
+
+dataset___Scanner__head <- function(scanner, n) {
+ .Call(`_arrow_dataset___Scanner__head`, scanner, n)
+}
+
+dataset___Scanner__schema <- function(sc) {
+ .Call(`_arrow_dataset___Scanner__schema`, sc)
+}
+
+dataset___ScanTask__get_batches <- function(scan_task) {
+ .Call(`_arrow_dataset___ScanTask__get_batches`, scan_task)
+}
+
+dataset___Dataset__Write <- function(file_write_options, filesystem, base_dir, partitioning, basename_template, scanner, existing_data_behavior) {
+ invisible(.Call(`_arrow_dataset___Dataset__Write`, file_write_options, filesystem, base_dir, partitioning, basename_template, scanner, existing_data_behavior))
+}
+
+dataset___Scanner__TakeRows <- function(scanner, indices) {
+ .Call(`_arrow_dataset___Scanner__TakeRows`, scanner, indices)
+}
+
+dataset___Scanner__CountRows <- function(scanner) {
+ .Call(`_arrow_dataset___Scanner__CountRows`, scanner)
+}
+
+Int8__initialize <- function() {
+ .Call(`_arrow_Int8__initialize`)
+}
+
+Int16__initialize <- function() {
+ .Call(`_arrow_Int16__initialize`)
+}
+
+Int32__initialize <- function() {
+ .Call(`_arrow_Int32__initialize`)
+}
+
+Int64__initialize <- function() {
+ .Call(`_arrow_Int64__initialize`)
+}
+
+UInt8__initialize <- function() {
+ .Call(`_arrow_UInt8__initialize`)
+}
+
+UInt16__initialize <- function() {
+ .Call(`_arrow_UInt16__initialize`)
+}
+
+UInt32__initialize <- function() {
+ .Call(`_arrow_UInt32__initialize`)
+}
+
+UInt64__initialize <- function() {
+ .Call(`_arrow_UInt64__initialize`)
+}
+
+Float16__initialize <- function() {
+ .Call(`_arrow_Float16__initialize`)
+}
+
+Float32__initialize <- function() {
+ .Call(`_arrow_Float32__initialize`)
+}
+
+Float64__initialize <- function() {
+ .Call(`_arrow_Float64__initialize`)
+}
+
+Boolean__initialize <- function() {
+ .Call(`_arrow_Boolean__initialize`)
+}
+
+Utf8__initialize <- function() {
+ .Call(`_arrow_Utf8__initialize`)
+}
+
+LargeUtf8__initialize <- function() {
+ .Call(`_arrow_LargeUtf8__initialize`)
+}
+
+Binary__initialize <- function() {
+ .Call(`_arrow_Binary__initialize`)
+}
+
+LargeBinary__initialize <- function() {
+ .Call(`_arrow_LargeBinary__initialize`)
+}
+
+Date32__initialize <- function() {
+ .Call(`_arrow_Date32__initialize`)
+}
+
+Date64__initialize <- function() {
+ .Call(`_arrow_Date64__initialize`)
+}
+
+Null__initialize <- function() {
+ .Call(`_arrow_Null__initialize`)
+}
+
+Decimal128Type__initialize <- function(precision, scale) {
+ .Call(`_arrow_Decimal128Type__initialize`, precision, scale)
+}
+
+FixedSizeBinary__initialize <- function(byte_width) {
+ .Call(`_arrow_FixedSizeBinary__initialize`, byte_width)
+}
+
+Timestamp__initialize <- function(unit, timezone) {
+ .Call(`_arrow_Timestamp__initialize`, unit, timezone)
+}
+
+Time32__initialize <- function(unit) {
+ .Call(`_arrow_Time32__initialize`, unit)
+}
+
+Time64__initialize <- function(unit) {
+ .Call(`_arrow_Time64__initialize`, unit)
+}
+
+list__ <- function(x) {
+ .Call(`_arrow_list__`, x)
+}
+
+large_list__ <- function(x) {
+ .Call(`_arrow_large_list__`, x)
+}
+
+fixed_size_list__ <- function(x, list_size) {
+ .Call(`_arrow_fixed_size_list__`, x, list_size)
+}
+
+struct__ <- function(fields) {
+ .Call(`_arrow_struct__`, fields)
+}
+
+DataType__ToString <- function(type) {
+ .Call(`_arrow_DataType__ToString`, type)
+}
+
+DataType__name <- function(type) {
+ .Call(`_arrow_DataType__name`, type)
+}
+
+DataType__Equals <- function(lhs, rhs) {
+ .Call(`_arrow_DataType__Equals`, lhs, rhs)
+}
+
+DataType__num_fields <- function(type) {
+ .Call(`_arrow_DataType__num_fields`, type)
+}
+
+DataType__fields <- function(type) {
+ .Call(`_arrow_DataType__fields`, type)
+}
+
+DataType__id <- function(type) {
+ .Call(`_arrow_DataType__id`, type)
+}
+
+ListType__ToString <- function(type) {
+ .Call(`_arrow_ListType__ToString`, type)
+}
+
+FixedWidthType__bit_width <- function(type) {
+ .Call(`_arrow_FixedWidthType__bit_width`, type)
+}
+
+DateType__unit <- function(type) {
+ .Call(`_arrow_DateType__unit`, type)
+}
+
+TimeType__unit <- function(type) {
+ .Call(`_arrow_TimeType__unit`, type)
+}
+
+DecimalType__precision <- function(type) {
+ .Call(`_arrow_DecimalType__precision`, type)
+}
+
+DecimalType__scale <- function(type) {
+ .Call(`_arrow_DecimalType__scale`, type)
+}
+
+TimestampType__timezone <- function(type) {
+ .Call(`_arrow_TimestampType__timezone`, type)
+}
+
+TimestampType__unit <- function(type) {
+ .Call(`_arrow_TimestampType__unit`, type)
+}
+
+DictionaryType__initialize <- function(index_type, value_type, ordered) {
+ .Call(`_arrow_DictionaryType__initialize`, index_type, value_type, ordered)
+}
+
+DictionaryType__index_type <- function(type) {
+ .Call(`_arrow_DictionaryType__index_type`, type)
+}
+
+DictionaryType__value_type <- function(type) {
+ .Call(`_arrow_DictionaryType__value_type`, type)
+}
+
+DictionaryType__name <- function(type) {
+ .Call(`_arrow_DictionaryType__name`, type)
+}
+
+DictionaryType__ordered <- function(type) {
+ .Call(`_arrow_DictionaryType__ordered`, type)
+}
+
+StructType__GetFieldByName <- function(type, name) {
+ .Call(`_arrow_StructType__GetFieldByName`, type, name)
+}
+
+StructType__GetFieldIndex <- function(type, name) {
+ .Call(`_arrow_StructType__GetFieldIndex`, type, name)
+}
+
+StructType__field_names <- function(type) {
+ .Call(`_arrow_StructType__field_names`, type)
+}
+
+ListType__value_field <- function(type) {
+ .Call(`_arrow_ListType__value_field`, type)
+}
+
+ListType__value_type <- function(type) {
+ .Call(`_arrow_ListType__value_type`, type)
+}
+
+LargeListType__value_field <- function(type) {
+ .Call(`_arrow_LargeListType__value_field`, type)
+}
+
+LargeListType__value_type <- function(type) {
+ .Call(`_arrow_LargeListType__value_type`, type)
+}
+
+FixedSizeListType__value_field <- function(type) {
+ .Call(`_arrow_FixedSizeListType__value_field`, type)
+}
+
+FixedSizeListType__value_type <- function(type) {
+ .Call(`_arrow_FixedSizeListType__value_type`, type)
+}
+
+FixedSizeListType__list_size <- function(type) {
+ .Call(`_arrow_FixedSizeListType__list_size`, type)
+}
+
+compute___expr__equals <- function(lhs, rhs) {
+ .Call(`_arrow_compute___expr__equals`, lhs, rhs)
+}
+
+compute___expr__call <- function(func_name, argument_list, options) {
+ .Call(`_arrow_compute___expr__call`, func_name, argument_list, options)
+}
+
+field_names_in_expression <- function(x) {
+ .Call(`_arrow_field_names_in_expression`, x)
+}
+
+compute___expr__get_field_ref_name <- function(x) {
+ .Call(`_arrow_compute___expr__get_field_ref_name`, x)
+}
+
+compute___expr__field_ref <- function(name) {
+ .Call(`_arrow_compute___expr__field_ref`, name)
+}
+
+compute___expr__scalar <- function(x) {
+ .Call(`_arrow_compute___expr__scalar`, x)
+}
+
+compute___expr__ToString <- function(x) {
+ .Call(`_arrow_compute___expr__ToString`, x)
+}
+
+compute___expr__type <- function(x, schema) {
+ .Call(`_arrow_compute___expr__type`, x, schema)
+}
+
+compute___expr__type_id <- function(x, schema) {
+ .Call(`_arrow_compute___expr__type_id`, x, schema)
+}
+
+ipc___WriteFeather__Table <- function(stream, table, version, chunk_size, compression, compression_level) {
+ invisible(.Call(`_arrow_ipc___WriteFeather__Table`, stream, table, version, chunk_size, compression, compression_level))
+}
+
+ipc___feather___Reader__version <- function(reader) {
+ .Call(`_arrow_ipc___feather___Reader__version`, reader)
+}
+
+ipc___feather___Reader__Read <- function(reader, columns) {
+ .Call(`_arrow_ipc___feather___Reader__Read`, reader, columns)
+}
+
+ipc___feather___Reader__Open <- function(stream) {
+ .Call(`_arrow_ipc___feather___Reader__Open`, stream)
+}
+
+ipc___feather___Reader__schema <- function(reader) {
+ .Call(`_arrow_ipc___feather___Reader__schema`, reader)
+}
+
+Field__initialize <- function(name, field, nullable) {
+ .Call(`_arrow_Field__initialize`, name, field, nullable)
+}
+
+Field__ToString <- function(field) {
+ .Call(`_arrow_Field__ToString`, field)
+}
+
+Field__name <- function(field) {
+ .Call(`_arrow_Field__name`, field)
+}
+
+Field__Equals <- function(field, other) {
+ .Call(`_arrow_Field__Equals`, field, other)
+}
+
+Field__nullable <- function(field) {
+ .Call(`_arrow_Field__nullable`, field)
+}
+
+Field__type <- function(field) {
+ .Call(`_arrow_Field__type`, field)
+}
+
+fs___FileInfo__type <- function(x) {
+ .Call(`_arrow_fs___FileInfo__type`, x)
+}
+
+fs___FileInfo__set_type <- function(x, type) {
+ invisible(.Call(`_arrow_fs___FileInfo__set_type`, x, type))
+}
+
+fs___FileInfo__path <- function(x) {
+ .Call(`_arrow_fs___FileInfo__path`, x)
+}
+
+fs___FileInfo__set_path <- function(x, path) {
+ invisible(.Call(`_arrow_fs___FileInfo__set_path`, x, path))
+}
+
+fs___FileInfo__size <- function(x) {
+ .Call(`_arrow_fs___FileInfo__size`, x)
+}
+
+fs___FileInfo__set_size <- function(x, size) {
+ invisible(.Call(`_arrow_fs___FileInfo__set_size`, x, size))
+}
+
+fs___FileInfo__base_name <- function(x) {
+ .Call(`_arrow_fs___FileInfo__base_name`, x)
+}
+
+fs___FileInfo__extension <- function(x) {
+ .Call(`_arrow_fs___FileInfo__extension`, x)
+}
+
+fs___FileInfo__mtime <- function(x) {
+ .Call(`_arrow_fs___FileInfo__mtime`, x)
+}
+
+fs___FileInfo__set_mtime <- function(x, time) {
+ invisible(.Call(`_arrow_fs___FileInfo__set_mtime`, x, time))
+}
+
+fs___FileSelector__base_dir <- function(selector) {
+ .Call(`_arrow_fs___FileSelector__base_dir`, selector)
+}
+
+fs___FileSelector__allow_not_found <- function(selector) {
+ .Call(`_arrow_fs___FileSelector__allow_not_found`, selector)
+}
+
+fs___FileSelector__recursive <- function(selector) {
+ .Call(`_arrow_fs___FileSelector__recursive`, selector)
+}
+
+fs___FileSelector__create <- function(base_dir, allow_not_found, recursive) {
+ .Call(`_arrow_fs___FileSelector__create`, base_dir, allow_not_found, recursive)
+}
+
+fs___FileSystem__GetTargetInfos_Paths <- function(file_system, paths) {
+ .Call(`_arrow_fs___FileSystem__GetTargetInfos_Paths`, file_system, paths)
+}
+
+fs___FileSystem__GetTargetInfos_FileSelector <- function(file_system, selector) {
+ .Call(`_arrow_fs___FileSystem__GetTargetInfos_FileSelector`, file_system, selector)
+}
+
+fs___FileSystem__CreateDir <- function(file_system, path, recursive) {
+ invisible(.Call(`_arrow_fs___FileSystem__CreateDir`, file_system, path, recursive))
+}
+
+fs___FileSystem__DeleteDir <- function(file_system, path) {
+ invisible(.Call(`_arrow_fs___FileSystem__DeleteDir`, file_system, path))
+}
+
+fs___FileSystem__DeleteDirContents <- function(file_system, path) {
+ invisible(.Call(`_arrow_fs___FileSystem__DeleteDirContents`, file_system, path))
+}
+
+fs___FileSystem__DeleteFile <- function(file_system, path) {
+ invisible(.Call(`_arrow_fs___FileSystem__DeleteFile`, file_system, path))
+}
+
+fs___FileSystem__DeleteFiles <- function(file_system, paths) {
+ invisible(.Call(`_arrow_fs___FileSystem__DeleteFiles`, file_system, paths))
+}
+
+fs___FileSystem__Move <- function(file_system, src, dest) {
+ invisible(.Call(`_arrow_fs___FileSystem__Move`, file_system, src, dest))
+}
+
+fs___FileSystem__CopyFile <- function(file_system, src, dest) {
+ invisible(.Call(`_arrow_fs___FileSystem__CopyFile`, file_system, src, dest))
+}
+
+fs___FileSystem__OpenInputStream <- function(file_system, path) {
+ .Call(`_arrow_fs___FileSystem__OpenInputStream`, file_system, path)
+}
+
+fs___FileSystem__OpenInputFile <- function(file_system, path) {
+ .Call(`_arrow_fs___FileSystem__OpenInputFile`, file_system, path)
+}
+
+fs___FileSystem__OpenOutputStream <- function(file_system, path) {
+ .Call(`_arrow_fs___FileSystem__OpenOutputStream`, file_system, path)
+}
+
+fs___FileSystem__OpenAppendStream <- function(file_system, path) {
+ .Call(`_arrow_fs___FileSystem__OpenAppendStream`, file_system, path)
+}
+
+fs___FileSystem__type_name <- function(file_system) {
+ .Call(`_arrow_fs___FileSystem__type_name`, file_system)
+}
+
+fs___LocalFileSystem__create <- function() {
+ .Call(`_arrow_fs___LocalFileSystem__create`)
+}
+
+fs___SubTreeFileSystem__create <- function(base_path, base_fs) {
+ .Call(`_arrow_fs___SubTreeFileSystem__create`, base_path, base_fs)
+}
+
+fs___SubTreeFileSystem__base_fs <- function(file_system) {
+ .Call(`_arrow_fs___SubTreeFileSystem__base_fs`, file_system)
+}
+
+fs___SubTreeFileSystem__base_path <- function(file_system) {
+ .Call(`_arrow_fs___SubTreeFileSystem__base_path`, file_system)
+}
+
+fs___FileSystemFromUri <- function(path) {
+ .Call(`_arrow_fs___FileSystemFromUri`, path)
+}
+
+fs___CopyFiles <- function(source_fs, source_sel, destination_fs, destination_base_dir, chunk_size, use_threads) {
+ invisible(.Call(`_arrow_fs___CopyFiles`, source_fs, source_sel, destination_fs, destination_base_dir, chunk_size, use_threads))
+}
+
+fs___S3FileSystem__create <- function(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, background_writes) {
+ .Call(`_arrow_fs___S3FileSystem__create`, anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, background_writes)
+}
+
+fs___S3FileSystem__region <- function(fs) {
+ .Call(`_arrow_fs___S3FileSystem__region`, fs)
+}
+
+io___Readable__Read <- function(x, nbytes) {
+ .Call(`_arrow_io___Readable__Read`, x, nbytes)
+}
+
+io___InputStream__Close <- function(x) {
+ invisible(.Call(`_arrow_io___InputStream__Close`, x))
+}
+
+io___OutputStream__Close <- function(x) {
+ invisible(.Call(`_arrow_io___OutputStream__Close`, x))
+}
+
+io___RandomAccessFile__GetSize <- function(x) {
+ .Call(`_arrow_io___RandomAccessFile__GetSize`, x)
+}
+
+io___RandomAccessFile__supports_zero_copy <- function(x) {
+ .Call(`_arrow_io___RandomAccessFile__supports_zero_copy`, x)
+}
+
+io___RandomAccessFile__Seek <- function(x, position) {
+ invisible(.Call(`_arrow_io___RandomAccessFile__Seek`, x, position))
+}
+
+io___RandomAccessFile__Tell <- function(x) {
+ .Call(`_arrow_io___RandomAccessFile__Tell`, x)
+}
+
+io___RandomAccessFile__Read0 <- function(x) {
+ .Call(`_arrow_io___RandomAccessFile__Read0`, x)
+}
+
+io___RandomAccessFile__ReadAt <- function(x, position, nbytes) {
+ .Call(`_arrow_io___RandomAccessFile__ReadAt`, x, position, nbytes)
+}
+
+io___MemoryMappedFile__Create <- function(path, size) {
+ .Call(`_arrow_io___MemoryMappedFile__Create`, path, size)
+}
+
+io___MemoryMappedFile__Open <- function(path, mode) {
+ .Call(`_arrow_io___MemoryMappedFile__Open`, path, mode)
+}
+
+io___MemoryMappedFile__Resize <- function(x, size) {
+ invisible(.Call(`_arrow_io___MemoryMappedFile__Resize`, x, size))
+}
+
+io___ReadableFile__Open <- function(path) {
+ .Call(`_arrow_io___ReadableFile__Open`, path)
+}
+
+io___BufferReader__initialize <- function(buffer) {
+ .Call(`_arrow_io___BufferReader__initialize`, buffer)
+}
+
+io___Writable__write <- function(stream, buf) {
+ invisible(.Call(`_arrow_io___Writable__write`, stream, buf))
+}
+
+io___OutputStream__Tell <- function(stream) {
+ .Call(`_arrow_io___OutputStream__Tell`, stream)
+}
+
+io___FileOutputStream__Open <- function(path) {
+ .Call(`_arrow_io___FileOutputStream__Open`, path)
+}
+
+io___BufferOutputStream__Create <- function(initial_capacity) {
+ .Call(`_arrow_io___BufferOutputStream__Create`, initial_capacity)
+}
+
+io___BufferOutputStream__capacity <- function(stream) {
+ .Call(`_arrow_io___BufferOutputStream__capacity`, stream)
+}
+
+io___BufferOutputStream__Finish <- function(stream) {
+ .Call(`_arrow_io___BufferOutputStream__Finish`, stream)
+}
+
+io___BufferOutputStream__Tell <- function(stream) {
+ .Call(`_arrow_io___BufferOutputStream__Tell`, stream)
+}
+
+io___BufferOutputStream__Write <- function(stream, bytes) {
+ invisible(.Call(`_arrow_io___BufferOutputStream__Write`, stream, bytes))
+}
+
+json___ReadOptions__initialize <- function(use_threads, block_size) {
+ .Call(`_arrow_json___ReadOptions__initialize`, use_threads, block_size)
+}
+
+json___ParseOptions__initialize1 <- function(newlines_in_values) {
+ .Call(`_arrow_json___ParseOptions__initialize1`, newlines_in_values)
+}
+
+json___ParseOptions__initialize2 <- function(newlines_in_values, explicit_schema) {
+ .Call(`_arrow_json___ParseOptions__initialize2`, newlines_in_values, explicit_schema)
+}
+
+json___TableReader__Make <- function(input, read_options, parse_options) {
+ .Call(`_arrow_json___TableReader__Make`, input, read_options, parse_options)
+}
+
+json___TableReader__Read <- function(table_reader) {
+ .Call(`_arrow_json___TableReader__Read`, table_reader)
+}
+
+MemoryPool__default <- function() {
+ .Call(`_arrow_MemoryPool__default`)
+}
+
+MemoryPool__bytes_allocated <- function(pool) {
+ .Call(`_arrow_MemoryPool__bytes_allocated`, pool)
+}
+
+MemoryPool__max_memory <- function(pool) {
+ .Call(`_arrow_MemoryPool__max_memory`, pool)
+}
+
+MemoryPool__backend_name <- function(pool) {
+ .Call(`_arrow_MemoryPool__backend_name`, pool)
+}
+
+supported_memory_backends <- function() {
+ .Call(`_arrow_supported_memory_backends`)
+}
+
+ipc___Message__body_length <- function(message) {
+ .Call(`_arrow_ipc___Message__body_length`, message)
+}
+
+ipc___Message__metadata <- function(message) {
+ .Call(`_arrow_ipc___Message__metadata`, message)
+}
+
+ipc___Message__body <- function(message) {
+ .Call(`_arrow_ipc___Message__body`, message)
+}
+
+ipc___Message__Verify <- function(message) {
+ .Call(`_arrow_ipc___Message__Verify`, message)
+}
+
+ipc___Message__type <- function(message) {
+ .Call(`_arrow_ipc___Message__type`, message)
+}
+
+ipc___Message__Equals <- function(x, y) {
+ .Call(`_arrow_ipc___Message__Equals`, x, y)
+}
+
+ipc___ReadRecordBatch__Message__Schema <- function(message, schema) {
+ .Call(`_arrow_ipc___ReadRecordBatch__Message__Schema`, message, schema)
+}
+
+ipc___ReadSchema_InputStream <- function(stream) {
+ .Call(`_arrow_ipc___ReadSchema_InputStream`, stream)
+}
+
+ipc___ReadSchema_Message <- function(message) {
+ .Call(`_arrow_ipc___ReadSchema_Message`, message)
+}
+
+ipc___MessageReader__Open <- function(stream) {
+ .Call(`_arrow_ipc___MessageReader__Open`, stream)
+}
+
+ipc___MessageReader__ReadNextMessage <- function(reader) {
+ .Call(`_arrow_ipc___MessageReader__ReadNextMessage`, reader)
+}
+
+ipc___ReadMessage <- function(stream) {
+ .Call(`_arrow_ipc___ReadMessage`, stream)
+}
+
+parquet___arrow___ArrowReaderProperties__Make <- function(use_threads) {
+ .Call(`_arrow_parquet___arrow___ArrowReaderProperties__Make`, use_threads)
+}
+
+parquet___arrow___ArrowReaderProperties__set_use_threads <- function(properties, use_threads) {
+ invisible(.Call(`_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads`, properties, use_threads))
+}
+
+parquet___arrow___ArrowReaderProperties__get_use_threads <- function(properties, use_threads) {
+ .Call(`_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads`, properties, use_threads)
+}
+
+parquet___arrow___ArrowReaderProperties__get_read_dictionary <- function(properties, column_index) {
+ .Call(`_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary`, properties, column_index)
+}
+
+parquet___arrow___ArrowReaderProperties__set_read_dictionary <- function(properties, column_index, read_dict) {
+ invisible(.Call(`_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary`, properties, column_index, read_dict))
+}
+
+parquet___arrow___FileReader__OpenFile <- function(file, props) {
+ .Call(`_arrow_parquet___arrow___FileReader__OpenFile`, file, props)
+}
+
+parquet___arrow___FileReader__ReadTable1 <- function(reader) {
+ .Call(`_arrow_parquet___arrow___FileReader__ReadTable1`, reader)
+}
+
+parquet___arrow___FileReader__ReadTable2 <- function(reader, column_indices) {
+ .Call(`_arrow_parquet___arrow___FileReader__ReadTable2`, reader, column_indices)
+}
+
+parquet___arrow___FileReader__ReadRowGroup1 <- function(reader, i) {
+ .Call(`_arrow_parquet___arrow___FileReader__ReadRowGroup1`, reader, i)
+}
+
+parquet___arrow___FileReader__ReadRowGroup2 <- function(reader, i, column_indices) {
+ .Call(`_arrow_parquet___arrow___FileReader__ReadRowGroup2`, reader, i, column_indices)
+}
+
+parquet___arrow___FileReader__ReadRowGroups1 <- function(reader, row_groups) {
+ .Call(`_arrow_parquet___arrow___FileReader__ReadRowGroups1`, reader, row_groups)
+}
+
+parquet___arrow___FileReader__ReadRowGroups2 <- function(reader, row_groups, column_indices) {
+ .Call(`_arrow_parquet___arrow___FileReader__ReadRowGroups2`, reader, row_groups, column_indices)
+}
+
+parquet___arrow___FileReader__num_rows <- function(reader) {
+ .Call(`_arrow_parquet___arrow___FileReader__num_rows`, reader)
+}
+
+parquet___arrow___FileReader__num_columns <- function(reader) {
+ .Call(`_arrow_parquet___arrow___FileReader__num_columns`, reader)
+}
+
+parquet___arrow___FileReader__num_row_groups <- function(reader) {
+ .Call(`_arrow_parquet___arrow___FileReader__num_row_groups`, reader)
+}
+
+parquet___arrow___FileReader__ReadColumn <- function(reader, i) {
+ .Call(`_arrow_parquet___arrow___FileReader__ReadColumn`, reader, i)
+}
+
+parquet___ArrowWriterProperties___create <- function(allow_truncated_timestamps, use_deprecated_int96_timestamps, timestamp_unit) {
+ .Call(`_arrow_parquet___ArrowWriterProperties___create`, allow_truncated_timestamps, use_deprecated_int96_timestamps, timestamp_unit)
+}
+
+parquet___WriterProperties___Builder__create <- function() {
+ .Call(`_arrow_parquet___WriterProperties___Builder__create`)
+}
+
+parquet___WriterProperties___Builder__version <- function(builder, version) {
+ invisible(.Call(`_arrow_parquet___WriterProperties___Builder__version`, builder, version))
+}
+
+parquet___ArrowWriterProperties___Builder__set_compressions <- function(builder, paths, types) {
+ invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_compressions`, builder, paths, types))
+}
+
+parquet___ArrowWriterProperties___Builder__set_compression_levels <- function(builder, paths, levels) {
+ invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels`, builder, paths, levels))
+}
+
+parquet___ArrowWriterProperties___Builder__set_use_dictionary <- function(builder, paths, use_dictionary) {
+ invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary`, builder, paths, use_dictionary))
+}
+
+parquet___ArrowWriterProperties___Builder__set_write_statistics <- function(builder, paths, write_statistics) {
+ invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics`, builder, paths, write_statistics))
+}
+
+parquet___ArrowWriterProperties___Builder__data_page_size <- function(builder, data_page_size) {
+ invisible(.Call(`_arrow_parquet___ArrowWriterProperties___Builder__data_page_size`, builder, data_page_size))
+}
+
+parquet___WriterProperties___Builder__build <- function(builder) {
+ .Call(`_arrow_parquet___WriterProperties___Builder__build`, builder)
+}
+
+parquet___arrow___ParquetFileWriter__Open <- function(schema, sink, properties, arrow_properties) {
+ .Call(`_arrow_parquet___arrow___ParquetFileWriter__Open`, schema, sink, properties, arrow_properties)
+}
+
+parquet___arrow___FileWriter__WriteTable <- function(writer, table, chunk_size) {
+ invisible(.Call(`_arrow_parquet___arrow___FileWriter__WriteTable`, writer, table, chunk_size))
+}
+
+parquet___arrow___FileWriter__Close <- function(writer) {
+ invisible(.Call(`_arrow_parquet___arrow___FileWriter__Close`, writer))
+}
+
+parquet___arrow___WriteTable <- function(table, sink, properties, arrow_properties) {
+ invisible(.Call(`_arrow_parquet___arrow___WriteTable`, table, sink, properties, arrow_properties))
+}
+
+parquet___arrow___FileReader__GetSchema <- function(reader) {
+ .Call(`_arrow_parquet___arrow___FileReader__GetSchema`, reader)
+}
+
+allocate_arrow_schema <- function() {
+ .Call(`_arrow_allocate_arrow_schema`)
+}
+
+delete_arrow_schema <- function(ptr) {
+ invisible(.Call(`_arrow_delete_arrow_schema`, ptr))
+}
+
+allocate_arrow_array <- function() {
+ .Call(`_arrow_allocate_arrow_array`)
+}
+
+delete_arrow_array <- function(ptr) {
+ invisible(.Call(`_arrow_delete_arrow_array`, ptr))
+}
+
+allocate_arrow_array_stream <- function() {
+ .Call(`_arrow_allocate_arrow_array_stream`)
+}
+
+delete_arrow_array_stream <- function(ptr) {
+ invisible(.Call(`_arrow_delete_arrow_array_stream`, ptr))
+}
+
+ImportArray <- function(array, schema) {
+ .Call(`_arrow_ImportArray`, array, schema)
+}
+
+ImportRecordBatch <- function(array, schema) {
+ .Call(`_arrow_ImportRecordBatch`, array, schema)
+}
+
+ImportSchema <- function(schema) {
+ .Call(`_arrow_ImportSchema`, schema)
+}
+
+ImportField <- function(field) {
+ .Call(`_arrow_ImportField`, field)
+}
+
+ImportType <- function(type) {
+ .Call(`_arrow_ImportType`, type)
+}
+
+ImportRecordBatchReader <- function(stream) {
+ .Call(`_arrow_ImportRecordBatchReader`, stream)
+}
+
+ExportType <- function(type, ptr) {
+ invisible(.Call(`_arrow_ExportType`, type, ptr))
+}
+
+ExportField <- function(field, ptr) {
+ invisible(.Call(`_arrow_ExportField`, field, ptr))
+}
+
+ExportSchema <- function(schema, ptr) {
+ invisible(.Call(`_arrow_ExportSchema`, schema, ptr))
+}
+
+ExportArray <- function(array, array_ptr, schema_ptr) {
+ invisible(.Call(`_arrow_ExportArray`, array, array_ptr, schema_ptr))
+}
+
+ExportRecordBatch <- function(batch, array_ptr, schema_ptr) {
+ invisible(.Call(`_arrow_ExportRecordBatch`, batch, array_ptr, schema_ptr))
+}
+
+ExportRecordBatchReader <- function(reader, stream_ptr) {
+ invisible(.Call(`_arrow_ExportRecordBatchReader`, reader, stream_ptr))
+}
+
+Table__from_dots <- function(lst, schema_sxp, use_threads) {
+ .Call(`_arrow_Table__from_dots`, lst, schema_sxp, use_threads)
+}
+
+vec_to_arrow <- function(x, s_type) {
+ .Call(`_arrow_vec_to_arrow`, x, s_type)
+}
+
+DictionaryArray__FromArrays <- function(type, indices, dict) {
+ .Call(`_arrow_DictionaryArray__FromArrays`, type, indices, dict)
+}
+
+RecordBatch__num_columns <- function(x) {
+ .Call(`_arrow_RecordBatch__num_columns`, x)
+}
+
+RecordBatch__num_rows <- function(x) {
+ .Call(`_arrow_RecordBatch__num_rows`, x)
+}
+
+RecordBatch__schema <- function(x) {
+ .Call(`_arrow_RecordBatch__schema`, x)
+}
+
+RecordBatch__RenameColumns <- function(batch, names) {
+ .Call(`_arrow_RecordBatch__RenameColumns`, batch, names)
+}
+
+RecordBatch__ReplaceSchemaMetadata <- function(x, metadata) {
+ .Call(`_arrow_RecordBatch__ReplaceSchemaMetadata`, x, metadata)
+}
+
+RecordBatch__columns <- function(batch) {
+ .Call(`_arrow_RecordBatch__columns`, batch)
+}
+
+RecordBatch__column <- function(batch, i) {
+ .Call(`_arrow_RecordBatch__column`, batch, i)
+}
+
+RecordBatch__GetColumnByName <- function(batch, name) {
+ .Call(`_arrow_RecordBatch__GetColumnByName`, batch, name)
+}
+
+RecordBatch__SelectColumns <- function(batch, indices) {
+ .Call(`_arrow_RecordBatch__SelectColumns`, batch, indices)
+}
+
+RecordBatch__Equals <- function(self, other, check_metadata) {
+ .Call(`_arrow_RecordBatch__Equals`, self, other, check_metadata)
+}
+
+RecordBatch__AddColumn <- function(batch, i, field, column) {
+ .Call(`_arrow_RecordBatch__AddColumn`, batch, i, field, column)
+}
+
+RecordBatch__SetColumn <- function(batch, i, field, column) {
+ .Call(`_arrow_RecordBatch__SetColumn`, batch, i, field, column)
+}
+
+RecordBatch__RemoveColumn <- function(batch, i) {
+ .Call(`_arrow_RecordBatch__RemoveColumn`, batch, i)
+}
+
+RecordBatch__column_name <- function(batch, i) {
+ .Call(`_arrow_RecordBatch__column_name`, batch, i)
+}
+
+RecordBatch__names <- function(batch) {
+ .Call(`_arrow_RecordBatch__names`, batch)
+}
+
+RecordBatch__Slice1 <- function(self, offset) {
+ .Call(`_arrow_RecordBatch__Slice1`, self, offset)
+}
+
+RecordBatch__Slice2 <- function(self, offset, length) {
+ .Call(`_arrow_RecordBatch__Slice2`, self, offset, length)
+}
+
+ipc___SerializeRecordBatch__Raw <- function(batch) {
+ .Call(`_arrow_ipc___SerializeRecordBatch__Raw`, batch)
+}
+
+ipc___ReadRecordBatch__InputStream__Schema <- function(stream, schema) {
+ .Call(`_arrow_ipc___ReadRecordBatch__InputStream__Schema`, stream, schema)
+}
+
+RecordBatch__from_arrays <- function(schema_sxp, lst) {
+ .Call(`_arrow_RecordBatch__from_arrays`, schema_sxp, lst)
+}
+
+RecordBatchReader__schema <- function(reader) {
+ .Call(`_arrow_RecordBatchReader__schema`, reader)
+}
+
+RecordBatchReader__ReadNext <- function(reader) {
+ .Call(`_arrow_RecordBatchReader__ReadNext`, reader)
+}
+
+RecordBatchReader__batches <- function(reader) {
+ .Call(`_arrow_RecordBatchReader__batches`, reader)
+}
+
+Table__from_RecordBatchReader <- function(reader) {
+ .Call(`_arrow_Table__from_RecordBatchReader`, reader)
+}
+
+ipc___RecordBatchStreamReader__Open <- function(stream) {
+ .Call(`_arrow_ipc___RecordBatchStreamReader__Open`, stream)
+}
+
+ipc___RecordBatchFileReader__schema <- function(reader) {
+ .Call(`_arrow_ipc___RecordBatchFileReader__schema`, reader)
+}
+
+ipc___RecordBatchFileReader__num_record_batches <- function(reader) {
+ .Call(`_arrow_ipc___RecordBatchFileReader__num_record_batches`, reader)
+}
+
+ipc___RecordBatchFileReader__ReadRecordBatch <- function(reader, i) {
+ .Call(`_arrow_ipc___RecordBatchFileReader__ReadRecordBatch`, reader, i)
+}
+
+ipc___RecordBatchFileReader__Open <- function(file) {
+ .Call(`_arrow_ipc___RecordBatchFileReader__Open`, file)
+}
+
+Table__from_RecordBatchFileReader <- function(reader) {
+ .Call(`_arrow_Table__from_RecordBatchFileReader`, reader)
+}
+
+ipc___RecordBatchFileReader__batches <- function(reader) {
+ .Call(`_arrow_ipc___RecordBatchFileReader__batches`, reader)
+}
+
+ipc___RecordBatchWriter__WriteRecordBatch <- function(batch_writer, batch) {
+ invisible(.Call(`_arrow_ipc___RecordBatchWriter__WriteRecordBatch`, batch_writer, batch))
+}
+
+ipc___RecordBatchWriter__WriteTable <- function(batch_writer, table) {
+ invisible(.Call(`_arrow_ipc___RecordBatchWriter__WriteTable`, batch_writer, table))
+}
+
+ipc___RecordBatchWriter__Close <- function(batch_writer) {
+ invisible(.Call(`_arrow_ipc___RecordBatchWriter__Close`, batch_writer))
+}
+
+ipc___RecordBatchFileWriter__Open <- function(stream, schema, use_legacy_format, metadata_version) {
+ .Call(`_arrow_ipc___RecordBatchFileWriter__Open`, stream, schema, use_legacy_format, metadata_version)
+}
+
+ipc___RecordBatchStreamWriter__Open <- function(stream, schema, use_legacy_format, metadata_version) {
+ .Call(`_arrow_ipc___RecordBatchStreamWriter__Open`, stream, schema, use_legacy_format, metadata_version)
+}
+
+Array__GetScalar <- function(x, i) {
+ .Call(`_arrow_Array__GetScalar`, x, i)
+}
+
+Scalar__ToString <- function(s) {
+ .Call(`_arrow_Scalar__ToString`, s)
+}
+
+StructScalar__field <- function(s, i) {
+ .Call(`_arrow_StructScalar__field`, s, i)
+}
+
+StructScalar__GetFieldByName <- function(s, name) {
+ .Call(`_arrow_StructScalar__GetFieldByName`, s, name)
+}
+
+Scalar__as_vector <- function(scalar) {
+ .Call(`_arrow_Scalar__as_vector`, scalar)
+}
+
+MakeArrayFromScalar <- function(scalar, n) {
+ .Call(`_arrow_MakeArrayFromScalar`, scalar, n)
+}
+
+Scalar__is_valid <- function(s) {
+ .Call(`_arrow_Scalar__is_valid`, s)
+}
+
+Scalar__type <- function(s) {
+ .Call(`_arrow_Scalar__type`, s)
+}
+
+Scalar__Equals <- function(lhs, rhs) {
+ .Call(`_arrow_Scalar__Equals`, lhs, rhs)
+}
+
+Scalar__ApproxEquals <- function(lhs, rhs) {
+ .Call(`_arrow_Scalar__ApproxEquals`, lhs, rhs)
+}
+
+schema_ <- function(fields) {
+ .Call(`_arrow_schema_`, fields)
+}
+
+Schema__ToString <- function(s) {
+ .Call(`_arrow_Schema__ToString`, s)
+}
+
+Schema__num_fields <- function(s) {
+ .Call(`_arrow_Schema__num_fields`, s)
+}
+
+Schema__field <- function(s, i) {
+ .Call(`_arrow_Schema__field`, s, i)
+}
+
+Schema__AddField <- function(s, i, field) {
+ .Call(`_arrow_Schema__AddField`, s, i, field)
+}
+
+Schema__SetField <- function(s, i, field) {
+ .Call(`_arrow_Schema__SetField`, s, i, field)
+}
+
+Schema__RemoveField <- function(s, i) {
+ .Call(`_arrow_Schema__RemoveField`, s, i)
+}
+
+Schema__GetFieldByName <- function(s, x) {
+ .Call(`_arrow_Schema__GetFieldByName`, s, x)
+}
+
+Schema__fields <- function(schema) {
+ .Call(`_arrow_Schema__fields`, schema)
+}
+
+Schema__field_names <- function(schema) {
+ .Call(`_arrow_Schema__field_names`, schema)
+}
+
+Schema__HasMetadata <- function(schema) {
+ .Call(`_arrow_Schema__HasMetadata`, schema)
+}
+
+Schema__metadata <- function(schema) {
+ .Call(`_arrow_Schema__metadata`, schema)
+}
+
+Schema__WithMetadata <- function(schema, metadata) {
+ .Call(`_arrow_Schema__WithMetadata`, schema, metadata)
+}
+
+Schema__serialize <- function(schema) {
+ .Call(`_arrow_Schema__serialize`, schema)
+}
+
+Schema__Equals <- function(schema, other, check_metadata) {
+ .Call(`_arrow_Schema__Equals`, schema, other, check_metadata)
+}
+
+arrow__UnifySchemas <- function(schemas) {
+ .Call(`_arrow_arrow__UnifySchemas`, schemas)
+}
+
+Table__num_columns <- function(x) {
+ .Call(`_arrow_Table__num_columns`, x)
+}
+
+Table__num_rows <- function(x) {
+ .Call(`_arrow_Table__num_rows`, x)
+}
+
+Table__schema <- function(x) {
+ .Call(`_arrow_Table__schema`, x)
+}
+
+Table__ReplaceSchemaMetadata <- function(x, metadata) {
+ .Call(`_arrow_Table__ReplaceSchemaMetadata`, x, metadata)
+}
+
+Table__column <- function(table, i) {
+ .Call(`_arrow_Table__column`, table, i)
+}
+
+Table__field <- function(table, i) {
+ .Call(`_arrow_Table__field`, table, i)
+}
+
+Table__columns <- function(table) {
+ .Call(`_arrow_Table__columns`, table)
+}
+
+Table__ColumnNames <- function(table) {
+ .Call(`_arrow_Table__ColumnNames`, table)
+}
+
+Table__RenameColumns <- function(table, names) {
+ .Call(`_arrow_Table__RenameColumns`, table, names)
+}
+
+Table__Slice1 <- function(table, offset) {
+ .Call(`_arrow_Table__Slice1`, table, offset)
+}
+
+Table__Slice2 <- function(table, offset, length) {
+ .Call(`_arrow_Table__Slice2`, table, offset, length)
+}
+
+Table__Equals <- function(lhs, rhs, check_metadata) {
+ .Call(`_arrow_Table__Equals`, lhs, rhs, check_metadata)
+}
+
+Table__Validate <- function(table) {
+ .Call(`_arrow_Table__Validate`, table)
+}
+
+Table__ValidateFull <- function(table) {
+ .Call(`_arrow_Table__ValidateFull`, table)
+}
+
+Table__GetColumnByName <- function(table, name) {
+ .Call(`_arrow_Table__GetColumnByName`, table, name)
+}
+
+Table__RemoveColumn <- function(table, i) {
+ .Call(`_arrow_Table__RemoveColumn`, table, i)
+}
+
+Table__AddColumn <- function(table, i, field, column) {
+ .Call(`_arrow_Table__AddColumn`, table, i, field, column)
+}
+
+Table__SetColumn <- function(table, i, field, column) {
+ .Call(`_arrow_Table__SetColumn`, table, i, field, column)
+}
+
+Table__SelectColumns <- function(table, indices) {
+ .Call(`_arrow_Table__SelectColumns`, table, indices)
+}
+
+all_record_batches <- function(lst) {
+ .Call(`_arrow_all_record_batches`, lst)
+}
+
+Table__from_record_batches <- function(batches, schema_sxp) {
+ .Call(`_arrow_Table__from_record_batches`, batches, schema_sxp)
+}
+
+GetCpuThreadPoolCapacity <- function() {
+ .Call(`_arrow_GetCpuThreadPoolCapacity`)
+}
+
+SetCpuThreadPoolCapacity <- function(threads) {
+ invisible(.Call(`_arrow_SetCpuThreadPoolCapacity`, threads))
+}
+
+GetIOThreadPoolCapacity <- function() {
+ .Call(`_arrow_GetIOThreadPoolCapacity`)
+}
+
+SetIOThreadPoolCapacity <- function(threads) {
+ invisible(.Call(`_arrow_SetIOThreadPoolCapacity`, threads))
+}
+
+Array__infer_type <- function(x) {
+ .Call(`_arrow_Array__infer_type`, x)
+}
diff --git a/src/arrow/r/R/buffer.R b/src/arrow/r/R/buffer.R
new file mode 100644
index 000000000..a9424fd0d
--- /dev/null
+++ b/src/arrow/r/R/buffer.R
@@ -0,0 +1,78 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @title Buffer class
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @description A Buffer is an object containing a pointer to a piece of
+#' contiguous memory with a particular size.
+#' @section Factory:
+#' `buffer()` lets you create an `arrow::Buffer` from an R object
+#' @section Methods:
+#'
+#' - `$is_mutable`: is this buffer mutable?
+#' - `$ZeroPadding()`: zero out the bytes in the buffer's padding, i.e. the bytes between size and capacity
+#' - `$size`: size in memory, in bytes
+#' - `$capacity`: possible capacity, in bytes
+#'
+#' @rdname buffer
+#' @name buffer
+#' @examplesIf arrow_available()
+#' my_buffer <- buffer(c(1, 2, 3, 4))
+#' my_buffer$is_mutable
+#' my_buffer$ZeroPadding()
+#' my_buffer$size
+#' my_buffer$capacity
+#' @export
+#' @include arrow-package.R
+#' @include enums.R
+Buffer <- R6Class("Buffer",
+ inherit = ArrowObject,
+ public = list(
+ ZeroPadding = function() Buffer__ZeroPadding(self),
+ data = function() Buffer__data(self),
+ Equals = function(other, ...) {
+ inherits(other, "Buffer") && Buffer__Equals(self, other)
+ }
+ ),
+ active = list(
+ is_mutable = function() Buffer__is_mutable(self),
+ size = function() Buffer__size(self),
+ capacity = function() Buffer__capacity(self)
+ )
+)
+
+Buffer$create <- function(x) {
+ if (inherits(x, "Buffer")) {
+ x
+ } else if (inherits(x, c("raw", "numeric", "integer", "complex"))) {
+ r___RBuffer__initialize(x)
+ } else if (inherits(x, "BufferOutputStream")) {
+ x$finish()
+ } else {
+ stop("Cannot convert object of class ", class(x), " to arrow::Buffer")
+ }
+}
+
+#' @param x R object. Only raw, numeric, integer, and complex vectors are currently supported
+#' @return an instance of `Buffer` that borrows memory from `x`
+#' @export
+buffer <- Buffer$create
+
+#' @export
+as.raw.Buffer <- function(x) x$data()
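+
+# A minimal usage sketch (kept as comments so nothing runs at load time),
+# assuming a raw vector input: `buffer()` wraps the vector's memory without
+# copying, and `as.raw()` copies the bytes back out.
+#   buf <- buffer(as.raw(1:4))
+#   buf$size                              # 4
+#   identical(as.raw(buf), as.raw(1:4))   # TRUE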
diff --git a/src/arrow/r/R/chunked-array.R b/src/arrow/r/R/chunked-array.R
new file mode 100644
index 000000000..597180ea7
--- /dev/null
+++ b/src/arrow/r/R/chunked-array.R
@@ -0,0 +1,153 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-datum.R
+
+#' @title ChunkedArray class
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @description A `ChunkedArray` is a data structure managing a list of
+#' primitive Arrow [Arrays][Array] logically as one large array. Chunked arrays
+#' may be grouped together in a [Table].
+#' @section Factory:
+#' The `ChunkedArray$create()` factory method instantiates the object from
+#' various Arrays or R vectors. `chunked_array()` is an alias for it.
+#'
+#' @section Methods:
+#'
+#' - `$length()`: the number of elements this array contains
+#' - `$chunk(i)`: Extract an `Array` chunk by integer position
+#' - `$as_vector()`: convert to an R vector
+#' - `$Slice(offset, length = NULL)`: Construct a zero-copy slice of the array
+#' with the indicated offset and length. If length is `NULL`, the slice goes
+#' until the end of the array.
+#' - `$Take(i)`: return a `ChunkedArray` with values at positions given by
+#' integers `i`. If `i` is an Arrow `Array` or `ChunkedArray`, it will be
+#' coerced to an R vector before taking.
+#' - `$Filter(i, keep_na = TRUE)`: return a `ChunkedArray` with values at positions where
+#' logical vector or Arrow boolean-type `(Chunked)Array` `i` is `TRUE`.
+#' - `$SortIndices(descending = FALSE)`: return an `Array` of integer positions that can be
+#' used to rearrange the `ChunkedArray` in ascending or descending order
+#' - `$cast(target_type, safe = TRUE, options = cast_options(safe))`: Alter the
+#' data in the array to change its type.
+#' - `$null_count`: The number of null entries in the array
+#' - `$chunks`: return a list of `Array`s
+#' - `$num_chunks`: integer number of chunks in the `ChunkedArray`
+#' - `$type`: logical type of data
+#' - `$View(type)`: Construct a zero-copy view of this `ChunkedArray` with the
+#' given type.
+#' - `$Validate()`: Perform any validation checks to determine obvious inconsistencies
+#' within the array's internal data. This can be an expensive check, potentially `O(length)`
+#'
+#' @rdname ChunkedArray
+#' @name ChunkedArray
+#' @seealso [Array]
+#' @examplesIf arrow_available()
+#' # Pass items into chunked_array as separate objects to create chunks
+#' class_scores <- chunked_array(c(87, 88, 89), c(94, 93, 92), c(71, 72, 73))
+#' class_scores$num_chunks
+#'
+#' # When taking a Slice from a chunked_array, chunks are preserved
+#' class_scores$Slice(2, length = 5)
+#'
+#' # You can combine Take and SortIndices to return a ChunkedArray with 1 chunk
+#' # containing all values, ordered.
+#' class_scores$Take(class_scores$SortIndices(descending = TRUE))
+#'
+#' # If you pass a list into chunked_array, you get a list of length 1
+#' list_scores <- chunked_array(list(c(9.9, 9.6, 9.5), c(8.2, 8.3, 8.4), c(10.0, 9.9, 9.8)))
+#' list_scores$num_chunks
+#'
+#' # When constructing a ChunkedArray, the first chunk is used to infer type.
+#' doubles <- chunked_array(c(1, 2, 3), c(5L, 6L, 7L))
+#' doubles$type
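+#'
+#' # Filter() keeps the values at positions where a logical vector (or Arrow
+#' # boolean-type array) is TRUE; a short sketch using class_scores from above:
+#' class_scores$Filter(class_scores$as_vector() > 90)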
+#' @export
+ChunkedArray <- R6Class("ChunkedArray",
+ inherit = ArrowDatum,
+ public = list(
+ length = function() ChunkedArray__length(self),
+ type_id = function() ChunkedArray__type(self)$id,
+ chunk = function(i) Array$create(ChunkedArray__chunk(self, i)),
+ as_vector = function() ChunkedArray__as_vector(self, option_use_threads()),
+ Slice = function(offset, length = NULL) {
+ if (is.null(length)) {
+ ChunkedArray__Slice1(self, offset)
+ } else {
+ ChunkedArray__Slice2(self, offset, length)
+ }
+ },
+ Take = function(i) {
+ if (is.numeric(i)) {
+ i <- as.integer(i)
+ }
+ if (is.integer(i)) {
+ i <- Array$create(i)
+ }
+ call_function("take", self, i)
+ },
+ Filter = function(i, keep_na = TRUE) {
+ if (is.logical(i)) {
+ i <- Array$create(i)
+ }
+ call_function("filter", self, i, options = list(keep_na = keep_na))
+ },
+ SortIndices = function(descending = FALSE) {
+ assert_that(is.logical(descending))
+ assert_that(length(descending) == 1L)
+ assert_that(!is.na(descending))
+ # TODO: after ARROW-12042 is closed, review whether this and the
+ # Array$SortIndices definition can be consolidated
+ call_function(
+ "sort_indices",
+ self,
+ options = list(names = "", orders = as.integer(descending))
+ )
+ },
+ View = function(type) {
+ ChunkedArray__View(self, as_type(type))
+ },
+ Validate = function() {
+ ChunkedArray__Validate(self)
+ },
+ ToString = function() {
+ ChunkedArray__ToString(self)
+ },
+ Equals = function(other, ...) {
+ inherits(other, "ChunkedArray") && ChunkedArray__Equals(self, other)
+ }
+ ),
+ active = list(
+ null_count = function() ChunkedArray__null_count(self),
+ num_chunks = function() ChunkedArray__num_chunks(self),
+ chunks = function() map(ChunkedArray__chunks(self), Array$create),
+ type = function() ChunkedArray__type(self)
+ )
+)
+
+ChunkedArray$create <- function(..., type = NULL) {
+ if (!is.null(type)) {
+ type <- as_type(type)
+ }
+ ChunkedArray__from_list(list2(...), type)
+}
+
+#' @param \dots Vectors to coerce
+#' @param type An optional Arrow data type. If omitted, the type is inferred from the data.
+#' @rdname ChunkedArray
+#' @export
+chunked_array <- ChunkedArray$create
diff --git a/src/arrow/r/R/compression.R b/src/arrow/r/R/compression.R
new file mode 100644
index 000000000..7107012d0
--- /dev/null
+++ b/src/arrow/r/R/compression.R
@@ -0,0 +1,124 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include enums.R
+#' @include arrow-package.R
+#' @include io.R
+
+#' @title Compression Codec class
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @description Codecs allow you to create [compressed input and output
+#' streams][compression].
+#' @section Factory:
+#' The `Codec$create()` factory method takes the following arguments:
+#' * `type`: string name of the compression method. Possible values are
+#' "uncompressed", "snappy", "gzip", "brotli", "zstd", "lz4", "lzo", or
+#' "bz2". `type` may be upper- or lower-cased. Not all methods may be
+#' available; support depends on build-time flags for the C++ library.
+#' See [codec_is_available()]. Most builds support at least "snappy" and
+#' "gzip". All support "uncompressed".
+#' * `compression_level`: compression level; the default value (`NA`) uses the
+#' default compression level for the selected compression `type`.
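+#' @examplesIf arrow_available()
+#' # A short sketch (assuming this build has gzip support, which most do):
+#' # create a codec and inspect its name
+#' cdc <- Codec$create("gzip")
+#' cdc$name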
+#' @rdname Codec
+#' @name Codec
+#' @export
+Codec <- R6Class("Codec",
+ inherit = ArrowObject,
+ active = list(
+ name = function() util___Codec__name(self),
+ level = function() abort("Codec$level() not yet implemented")
+ )
+)
+Codec$create <- function(type = "gzip", compression_level = NA) {
+ if (is.string(type)) {
+ type <- util___Codec__Create(
+ compression_from_name(type), compression_level
+ )
+ }
+ assert_is(type, "Codec")
+ type
+}
+
+#' Check whether a compression codec is available
+#'
+#' Support for compression libraries depends on the build-time settings of
+#' the Arrow C++ library. This function lets you know which are available for
+#' use.
+#' @param type A string, one of "uncompressed", "snappy", "gzip", "brotli",
+#' "zstd", "lz4", "lzo", or "bz2", case insensitive.
+#' @return Logical: is `type` available?
+#' @export
+#' @examplesIf arrow_available()
+#' codec_is_available("gzip")
+codec_is_available <- function(type) {
+ util___Codec__IsAvailable(compression_from_name(type))
+}
+
+compression_from_name <- function(name) {
+ map_int(name, ~ CompressionType[[match.arg(toupper(.x), names(CompressionType))]])
+}
+
+#' @title Compressed stream classes
+#' @rdname compression
+#' @name compression
+#' @aliases CompressedInputStream CompressedOutputStream
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description `CompressedInputStream` and `CompressedOutputStream`
+#' allow you to apply a compression [Codec] to an
+#' input or output stream.
+#'
+#' @section Factory:
+#'
+#' The `CompressedInputStream$create()` and `CompressedOutputStream$create()`
+#' factory methods instantiate the object and take the following arguments:
+#'
+#' - `stream` An [InputStream] or [OutputStream], respectively
+#' - `codec` A `Codec`, either a [Codec][Codec] instance or a string
+#' - `compression_level` compression level for when the `codec` argument is given as a string
+#'
+#' @section Methods:
+#'
+#' Methods are inherited from [InputStream] and [OutputStream], respectively
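+#'
+#' @examplesIf arrow_available()
+#' # A minimal sketch (assuming gzip support in this build): wrap a file sink
+#' # in a compressing stream, then close and clean up
+#' tf <- tempfile(fileext = ".gz")
+#' stream <- CompressedOutputStream$create(tf)
+#' stream$close()
+#' unlink(tf)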
+#' @export
+#' @include arrow-package.R
+CompressedOutputStream <- R6Class("CompressedOutputStream", inherit = OutputStream)
+CompressedOutputStream$create <- function(stream, codec = "gzip", compression_level = NA) {
+ codec <- Codec$create(codec, compression_level = compression_level)
+ if (is.string(stream)) {
+ stream <- FileOutputStream$create(stream)
+ }
+ assert_is(stream, "OutputStream")
+ io___CompressedOutputStream__Make(codec, stream)
+}
+
+#' @rdname compression
+#' @usage NULL
+#' @format NULL
+#' @export
+CompressedInputStream <- R6Class("CompressedInputStream", inherit = InputStream)
+CompressedInputStream$create <- function(stream, codec = "gzip", compression_level = NA) {
+ codec <- Codec$create(codec, compression_level = compression_level)
+ if (is.string(stream)) {
+ stream <- ReadableFile$create(stream)
+ }
+ assert_is(stream, "InputStream")
+ io___CompressedInputStream__Make(codec, stream)
+}
diff --git a/src/arrow/r/R/compute.R b/src/arrow/r/R/compute.R
new file mode 100644
index 000000000..0a7d77a09
--- /dev/null
+++ b/src/arrow/r/R/compute.R
@@ -0,0 +1,309 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Call an Arrow compute function
+#'
+#' This function provides a lower-level API for calling Arrow functions by their
+#' string function name. You won't use it directly for most applications.
+#' Many Arrow compute functions are mapped to R methods,
+#' and in a `dplyr` evaluation context, [all Arrow functions][list_compute_functions()]
+#' are callable with an `arrow_` prefix.
+#' @param function_name string Arrow compute function name
+#' @param ... Function arguments, which may include `Array`, `ChunkedArray`, `Scalar`,
+#' `RecordBatch`, or `Table`.
+#' @param args list arguments as an alternative to specifying in `...`
+#' @param options named list of C++ function options.
+#' @details When passing indices in `...`, `args`, or `options`, express them as
+#' 0-based integers (consistent with C++).
+#' @return An `Array`, `ChunkedArray`, `Scalar`, `RecordBatch`, or `Table`, whatever the compute function results in.
+#' @seealso [Arrow C++ documentation](https://arrow.apache.org/docs/cpp/compute.html) for
+#' the functions and their respective options.
+#' @examplesIf arrow_available()
+#' a <- Array$create(c(1L, 2L, 3L, NA, 5L))
+#' s <- Scalar$create(4L)
+#' call_function("coalesce", a, s)
+#'
+#' a <- Array$create(rnorm(10000))
+#' call_function("quantile", a, options = list(q = seq(0, 1, 0.25)))
+#' @export
+#' @include array.R
+#' @include chunked-array.R
+#' @include scalar.R
+call_function <- function(function_name, ..., args = list(...), options = empty_named_list()) {
+ assert_that(is.string(function_name))
+ assert_that(is.list(options), !is.null(names(options)))
+
+ datum_classes <- c("Array", "ChunkedArray", "RecordBatch", "Table", "Scalar")
+ valid_args <- map_lgl(args, ~ inherits(., datum_classes))
+ if (!all(valid_args)) {
+ # Lame, just pick one to report
+ first_bad <- min(which(!valid_args))
+ stop(
+ "Argument ", first_bad, " is of class ", head(class(args[[first_bad]]), 1),
+ " but it must be one of ", oxford_paste(datum_classes, "or"),
+ call. = FALSE
+ )
+ }
+
+ compute__CallFunction(function_name, args, options)
+}
+
+#' List available Arrow C++ compute functions
+#'
+#' This function lists the names of all available Arrow C++ library compute functions.
+#' These can be called by passing to [call_function()], or they can be
+#' called by name with an `arrow_` prefix inside a `dplyr` verb.
+#'
+#' The resulting list describes the capabilities of your `arrow` build.
+#' Some functions, such as string and regular expression functions,
+#' require optional build-time C++ dependencies. If your `arrow` package
+#' was not compiled with those features enabled, those functions will
+#' not appear in this list.
+#'
+#' Some functions take options that need to be passed when calling them
+#' (in a list called `options`). These options require custom handling
+#' in C++; many functions already have that handling set up but not all do.
+#' If you encounter one that needs special handling for options, please
+#' report an issue.
+#'
+#' Note that this list does *not* enumerate all of the R bindings for these functions.
+#' The package includes Arrow methods for many base R functions that can
+#' be called directly on Arrow objects, as well as some tidyverse-flavored versions
+#' available inside `dplyr` verbs.
+#'
+#' @param pattern Optional regular expression to filter the function list
+#' @param ... Additional parameters passed to `grep()`
+#' @return A character vector of available Arrow C++ function names
+#' @examplesIf arrow_available()
+#' available_funcs <- list_compute_functions()
+#' utf8_funcs <- list_compute_functions(pattern = "^UTF8", ignore.case = TRUE)
+#' @export
+list_compute_functions <- function(pattern = NULL, ...) {
+ funcs <- compute__GetFunctionNames()
+ if (!is.null(pattern)) {
+ funcs <- grep(pattern, funcs, value = TRUE, ...)
+ }
+ # TODO: Filtering of hash funcs will already happen in C++ with ARROW-13943
+ funcs <- grep(
+ "^hash_",
+ funcs,
+ value = TRUE,
+ invert = TRUE
+ )
+ funcs
+}
+
+#' @export
+sum.ArrowDatum <- function(..., na.rm = FALSE) {
+ scalar_aggregate("sum", ..., na.rm = na.rm)
+}
+
+#' @export
+mean.ArrowDatum <- function(..., na.rm = FALSE) {
+ scalar_aggregate("mean", ..., na.rm = na.rm)
+}
+
+#' @export
+min.ArrowDatum <- function(..., na.rm = FALSE) {
+ scalar_aggregate("min_max", ..., na.rm = na.rm)$GetFieldByName("min")
+}
+
+#' @export
+max.ArrowDatum <- function(..., na.rm = FALSE) {
+ scalar_aggregate("min_max", ..., na.rm = na.rm)$GetFieldByName("max")
+}
+
+scalar_aggregate <- function(FUN, ..., na.rm = FALSE, min_count = 0L) {
+ a <- collect_arrays_from_dots(list(...))
+ if (FUN == "min_max" && na.rm && a$null_count == length(a)) {
+ Array$create(data.frame(min = Inf, max = -Inf))
+ # If na.rm == TRUE and all values in array are NA, R returns
+ # Inf/-Inf, which are type double. Since Arrow is type-stable
+ # and does not do that, we handle this special case here.
+ } else {
+ call_function(FUN, a, options = list(skip_nulls = na.rm, min_count = min_count))
+ }
+}
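+
+# For example, min(Array$create(NA_real_), na.rm = TRUE) goes through the
+# special case above and yields Inf (a double), mirroring base R's
+# min(NA_real_, na.rm = TRUE) instead of returning an Arrow null.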
+
+collect_arrays_from_dots <- function(dots) {
+ # Given a list that may contain both Arrays and ChunkedArrays,
+ # return a single ChunkedArray containing all of those chunks.
+ # If there is only one element in dots, it is returned as-is
+ # (so a single Array stays an Array and a single scalar stays a scalar).
+ if (length(dots) == 1) {
+ return(dots[[1]])
+ }
+
+ assert_that(all(map_lgl(dots, is.Array)))
+ arrays <- unlist(lapply(dots, function(x) {
+ if (inherits(x, "ChunkedArray")) {
+ x$chunks
+ } else {
+ x
+ }
+ }))
+ ChunkedArray$create(!!!arrays)
+}
+
+#' @export
+quantile.ArrowDatum <- function(x,
+ probs = seq(0, 1, 0.25),
+ na.rm = FALSE,
+ type = 7,
+ interpolation = c("linear", "lower", "higher", "nearest", "midpoint"),
+ ...) {
+ if (inherits(x, "Scalar")) x <- Array$create(x)
+ assert_is(probs, c("numeric", "integer"))
+ assert_that(length(probs) > 0)
+ assert_that(all(probs >= 0 & probs <= 1))
+ if (!na.rm && x$null_count > 0) {
+ stop("Missing values not allowed if 'na.rm' is FALSE", call. = FALSE)
+ }
+ if (type != 7) {
+ stop(
+ "Argument `type` not supported in Arrow. To control the quantile ",
+ "interpolation algorithm, set argument `interpolation` to one of: ",
+ "\"linear\" (the default), \"lower\", \"higher\", \"nearest\", or ",
+ "\"midpoint\".",
+ call. = FALSE
+ )
+ }
+ interpolation <- QuantileInterpolation[[toupper(match.arg(interpolation))]]
+ out <- call_function("quantile", x, options = list(q = probs, interpolation = interpolation))
+ if (length(out) == 0) {
+ # When there are no non-missing values in the data, the Arrow quantile
+ # function returns an empty Array, but for consistency with the R quantile
+ # function, we want an Array of NA_real_ with the same length as probs
+ out <- Array$create(rep(NA_real_, length(probs)))
+ }
+ out
+}
+
+#' @export
+median.ArrowDatum <- function(x, na.rm = FALSE, ...) {
+ if (!na.rm && x$null_count > 0) {
+ Scalar$create(NA_real_)
+ } else {
+ Scalar$create(quantile(x, probs = 0.5, na.rm = TRUE, ...))
+ }
+}
+
+#' @export
+unique.ArrowDatum <- function(x, incomparables = FALSE, ...) {
+ call_function("unique", x)
+}
+
+#' @export
+any.ArrowDatum <- function(..., na.rm = FALSE) {
+ scalar_aggregate("any", ..., na.rm = na.rm)
+}
+
+#' @export
+all.ArrowDatum <- function(..., na.rm = FALSE) {
+ scalar_aggregate("all", ..., na.rm = na.rm)
+}
+
+#' `match` and `%in%` for Arrow objects
+#'
+#' `base::match()` is not a generic, so we can't just define Arrow methods for
+#' it. These functions expose the analogous functions in the Arrow C++ library.
+#'
+#' @param x `Scalar`, `Array` or `ChunkedArray`
+#' @param table `Scalar`, `Array`, `ChunkedArray`, or R vector lookup table.
+#' @param ... additional arguments, ignored
+#' @return `match_arrow()` returns an `int32`-type Arrow object of the same length
+#' and container class as `x` with the (0-based) indexes into `table`. `is_in()` returns a
+#' `boolean`-type Arrow object of the same length and container class as `x`, indicating
+#' for each element of `x` whether it is present in `table`.
+#' @examplesIf arrow_available()
+#' # note that the returned value is 0-indexed
+#' cars_tbl <- arrow_table(name = rownames(mtcars), mtcars)
+#' match_arrow(Scalar$create("Mazda RX4 Wag"), cars_tbl$name)
+#'
+#' is_in(Array$create("Mazda RX4 Wag"), cars_tbl$name)
+#'
+#' # Although there are multiple matches, you are returned the index of the first
+#' # match, as with the base R equivalent
+#' match(4, mtcars$cyl) # 1-indexed
+#' match_arrow(Scalar$create(4), cars_tbl$cyl) # 0-indexed
+#'
+#' # If `x` contains multiple values, you are returned the indices of the first
+#' # match for each value.
+#' match(c(4, 6, 8), mtcars$cyl)
+#' match_arrow(Array$create(c(4, 6, 8)), cars_tbl$cyl)
+#'
+#' # Return type matches type of `x`
+#' is_in(c(4, 6, 8), mtcars$cyl) # returns vector
+#' is_in(Scalar$create(4), mtcars$cyl) # returns Scalar
+#' is_in(Array$create(c(4, 6, 8)), cars_tbl$cyl) # returns Array
+#' is_in(ChunkedArray$create(c(4, 6), 8), cars_tbl$cyl) # returns ChunkedArray
+#' @export
+match_arrow <- function(x, table, ...) {
+ if (!inherits(x, "ArrowDatum")) {
+ x <- Array$create(x)
+ }
+
+ if (!inherits(table, c("Array", "ChunkedArray"))) {
+ table <- Array$create(table)
+ }
+ call_function("index_in_meta_binary", x, table)
+}
+
+#' @rdname match_arrow
+#' @export
+is_in <- function(x, table, ...) {
+ if (!inherits(x, "ArrowDatum")) {
+ x <- Array$create(x)
+ }
+
+ if (!inherits(table, c("Array", "DictionaryArray", "ChunkedArray"))) {
+ table <- Array$create(table)
+ }
+ call_function("is_in_meta_binary", x, table)
+}
+
+#' `table` for Arrow objects
+#'
+#' This function tabulates the values in the array and returns a table of counts.
+#' @param x `Array` or `ChunkedArray`
+#' @return A `StructArray` containing "values" (same type as `x`) and
+#' "counts" (`Int64`).
+#' @examplesIf arrow_available()
+#' cyl_vals <- Array$create(mtcars$cyl)
+#' counts <- value_counts(cyl_vals)
+#' @export
+value_counts <- function(x) {
+ call_function("value_counts", x)
+}
+
+#' Cast options
+#'
+#' @param safe logical: enforce safe conversion? Default `TRUE`
+#' @param ... additional cast options, such as `allow_int_overflow`,
+#' `allow_time_truncate`, and `allow_float_truncate`, which are set to `!safe`
+#' by default
+#' @return A list
+#' @export
+#' @keywords internal
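+#' @examplesIf arrow_available()
+#' # Unsafe casting flips on the overflow/truncation allowances:
+#' cast_options(safe = FALSE)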
+cast_options <- function(safe = TRUE, ...) {
+ opts <- list(
+ allow_int_overflow = !safe,
+ allow_time_truncate = !safe,
+ allow_float_truncate = !safe
+ )
+ modifyList(opts, list(...))
+}
diff --git a/src/arrow/r/R/config.R b/src/arrow/r/R/config.R
new file mode 100644
index 000000000..af07ad9a9
--- /dev/null
+++ b/src/arrow/r/R/config.R
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Manage the global CPU thread pool in libarrow
+#'
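+#' @examplesIf arrow_available()
+#' # A no-op round trip: read the current capacity and set it back
+#' n <- cpu_count()
+#' set_cpu_count(n)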
+#' @export
+cpu_count <- function() {
+ GetCpuThreadPoolCapacity()
+}
+
+#' @rdname cpu_count
+#' @param num_threads integer: New number of threads for thread pool
+#' @export
+set_cpu_count <- function(num_threads) {
+ SetCpuThreadPoolCapacity(as.integer(num_threads))
+}
+
+#' Manage the global I/O thread pool in libarrow
+#'
+#' @export
+io_thread_count <- function() {
+ GetIOThreadPoolCapacity()
+}
+
+#' @rdname io_thread_count
+#' @param num_threads integer: New number of threads for thread pool
+#' @export
+set_io_thread_count <- function(num_threads) {
+ SetIOThreadPoolCapacity(as.integer(num_threads))
+}
diff --git a/src/arrow/r/R/csv.R b/src/arrow/r/R/csv.R
new file mode 100644
index 000000000..ee890578f
--- /dev/null
+++ b/src/arrow/r/R/csv.R
@@ -0,0 +1,644 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Read a CSV or other delimited file with Arrow
+#'
+#' These functions use the Arrow C++ CSV reader to read into a `data.frame`.
+#' Arrow C++ options have been mapped to argument names that follow those of
+#' `readr::read_delim()`, and `col_select` was inspired by `vroom::vroom()`.
+#'
+#' `read_csv_arrow()` and `read_tsv_arrow()` are wrappers around
+#' `read_delim_arrow()` that specify a delimiter.
+#'
+#' Note that not all `readr` options are currently implemented here. Please file
+#' an issue if you encounter one that `arrow` should support.
+#'
+#' If you need to control Arrow-specific reader parameters that don't have an
+#' equivalent in `readr::read_csv()`, you can either provide them in the
+#' `parse_options`, `convert_options`, or `read_options` arguments, or you can
+#' use [CsvTableReader] directly for lower-level access.
+#'
+#' @section Specifying column types and names:
+#'
+#' By default, the CSV reader will infer the column names and data types from the file, but there
+#' are a few ways you can specify them directly.
+#'
+#' One way is to provide an Arrow [Schema] in the `schema` argument,
+#' which is an ordered map of column name to type.
+#' When provided, it satisfies both the `col_names` and `col_types` arguments.
+#' This is good if you know all of this information up front.
+#'
+#' You can also pass a `Schema` to the `col_types` argument. If you do this,
+#' column names will still be inferred from the file unless you also specify
+#' `col_names`. In either case, the column names in the `Schema` must match the
+#' data's column names, whether they are explicitly provided or inferred. That
+#' said, this `Schema` does not have to reference all columns: those omitted
+#' will have their types inferred.
+#'
+#' Alternatively, you can declare column types by providing the compact string representation
+#' that `readr` uses to the `col_types` argument. This means you provide a
+#' single string, one character per column, where the characters map to Arrow
+#' types analogously to the `readr` type mapping:
+#'
+#' * "c": `utf8()`
+#' * "i": `int32()`
+#' * "n": `float64()`
+#' * "d": `float64()`
+#' * "l": `bool()`
+#' * "f": `dictionary()`
+#' * "D": `date32()`
+#' * "T": `timestamp()`
+#' * "t": `time32()`
+#' * "_": `null()`
+#' * "-": `null()`
+#' * "?": infer the type from the data
+#'
+#' If you use the compact string representation for `col_types`, you must also
+#' specify `col_names`.
+#'
+#' Regardless of how types are specified, all columns with a `null()` type will
+#' be dropped.
+#'
+#' Note that if you are specifying column names, whether by `schema` or
+#' `col_names`, and the CSV file has a header row that would otherwise be used
+#' to identify column names, you'll need to add `skip = 1` to skip that row
+#' (see the final example below).
+#'
+#' @param file A character file name or URI, `raw` vector, an Arrow input stream,
+#' or a `FileSystem` with path (`SubTreeFileSystem`).
+#' If a file name, a memory-mapped Arrow [InputStream] will be opened and
+#' closed when finished; compression will be detected from the file extension
+#' and handled automatically. If an input stream is provided, it will be left
+#' open.
+#' @param delim Single character used to separate fields within a record.
+#' @param quote Single character used to quote strings.
+#' @param escape_double Does the file escape quotes by doubling them?
+#' i.e. If this option is `TRUE`, the value `""""` represents
+#' a single quote, `\"`.
+#' @param escape_backslash Does the file use backslashes to escape special
+#' characters? This is more general than `escape_double` as backslashes
+#' can be used to escape the delimiter character, the quote character, or
+#' to add special characters like `\\n`.
+#' @param schema [Schema] that describes the table. If provided, it will be
+#' used to satisfy both `col_names` and `col_types`.
+#' @param col_names If `TRUE`, the first row of the input will be used as the
+#' column names and will not be included in the data frame. If `FALSE`, column
+#' names will be generated by Arrow, starting with "f0", "f1", ..., "fN".
+#' Alternatively, you can specify a character vector of column names.
+#' @param col_types A compact string representation of the column types, or
+#' `NULL` (the default) to infer types from the data.
+#' @param col_select A character vector of column names to keep, as in the
+#' "select" argument to `data.table::fread()`, or a
+#' [tidy selection specification][tidyselect::vars_select()]
+#' of columns, as used in `dplyr::select()`.
+#' @param na A character vector of strings to interpret as missing values.
+#' @param quoted_na Should missing values inside quotes be treated as missing
+#' values (the default) or as strings? (Note that this is different from
+#' the Arrow C++ default for the corresponding convert option,
+#' `strings_can_be_null`.)
+#' @param skip_empty_rows Should blank rows be ignored altogether? If
+#' `TRUE`, blank rows will not be represented at all. If `FALSE`, they will be
+#' filled with missings.
+#' @param skip Number of lines to skip before reading data.
+#' @param timestamp_parsers User-defined timestamp parsers. If more than one
+#' parser is specified, the CSV conversion logic will try parsing values
+#' starting from the beginning of this vector. Possible values are:
+#' - `NULL`: the default, which uses the ISO-8601 parser
+#' - a character vector of [strptime][base::strptime()] parse strings
+#' - a list of [TimestampParser] objects
+#' @param parse_options see [file reader options][CsvReadOptions].
+#' If given, this overrides any
+#' parsing options provided in other arguments (e.g. `delim`, `quote`, etc.).
+#' @param convert_options see [file reader options][CsvReadOptions]
+#' @param read_options see [file reader options][CsvReadOptions]
+#' @param as_data_frame Should the function return a `data.frame` (default) or
+#' an Arrow [Table]?
+#'
+#' @return A `data.frame`, or a Table if `as_data_frame = FALSE`.
+#' @export
+#' @examplesIf arrow_available()
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write.csv(mtcars, file = tf)
+#' df <- read_csv_arrow(tf)
+#' dim(df)
+#' # Can select columns
+#' df <- read_csv_arrow(tf, col_select = starts_with("d"))
+read_delim_arrow <- function(file,
+ delim = ",",
+ quote = '"',
+ escape_double = TRUE,
+ escape_backslash = FALSE,
+ schema = NULL,
+ col_names = TRUE,
+ col_types = NULL,
+ col_select = NULL,
+ na = c("", "NA"),
+ quoted_na = TRUE,
+ skip_empty_rows = TRUE,
+ skip = 0L,
+ parse_options = NULL,
+ convert_options = NULL,
+ read_options = NULL,
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL) {
+ if (inherits(schema, "Schema")) {
+ col_names <- names(schema)
+ col_types <- schema
+ }
+ if (is.null(parse_options)) {
+ parse_options <- readr_to_csv_parse_options(
+ delim,
+ quote,
+ escape_double,
+ escape_backslash,
+ skip_empty_rows
+ )
+ }
+ if (is.null(read_options)) {
+ read_options <- readr_to_csv_read_options(skip, col_names)
+ }
+ if (is.null(convert_options)) {
+ convert_options <- readr_to_csv_convert_options(
+ na,
+ quoted_na,
+ col_types = col_types,
+ col_names = read_options$column_names,
+ timestamp_parsers = timestamp_parsers
+ )
+ }
+
+ if (!inherits(file, "InputStream")) {
+ file <- make_readable_file(file)
+ on.exit(file$close())
+ }
+ reader <- CsvTableReader$create(
+ file,
+ read_options = read_options,
+ parse_options = parse_options,
+ convert_options = convert_options
+ )
+
+ tab <- reader$Read()
+
+ # TODO: move this into convert_options using include_columns
+ col_select <- enquo(col_select)
+ if (!quo_is_null(col_select)) {
+ tab <- tab[vars_select(names(tab), !!col_select)]
+ }
+
+ if (isTRUE(as_data_frame)) {
+ tab <- as.data.frame(tab)
+ }
+
+ tab
+}
+
+#' @rdname read_delim_arrow
+#' @export
+read_csv_arrow <- function(file,
+ quote = '"',
+ escape_double = TRUE,
+ escape_backslash = FALSE,
+ schema = NULL,
+ col_names = TRUE,
+ col_types = NULL,
+ col_select = NULL,
+ na = c("", "NA"),
+ quoted_na = TRUE,
+ skip_empty_rows = TRUE,
+ skip = 0L,
+ parse_options = NULL,
+ convert_options = NULL,
+ read_options = NULL,
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL) {
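+  # Reuse read_delim_arrow(): rewrite the captured call with the delimiter
+  # fixed to "," and re-evaluate it in the caller's frame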
+ mc <- match.call()
+ mc$delim <- ","
+ mc[[1]] <- get("read_delim_arrow", envir = asNamespace("arrow"))
+ eval.parent(mc)
+}
+
+#' @rdname read_delim_arrow
+#' @export
+read_tsv_arrow <- function(file,
+ quote = '"',
+ escape_double = TRUE,
+ escape_backslash = FALSE,
+ schema = NULL,
+ col_names = TRUE,
+ col_types = NULL,
+ col_select = NULL,
+ na = c("", "NA"),
+ quoted_na = TRUE,
+ skip_empty_rows = TRUE,
+ skip = 0L,
+ parse_options = NULL,
+ convert_options = NULL,
+ read_options = NULL,
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL) {
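+  # Reuse read_delim_arrow(): rewrite the captured call with the delimiter
+  # fixed to a tab and re-evaluate it in the caller's frame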
+ mc <- match.call()
+ mc$delim <- "\t"
+ mc[[1]] <- get("read_delim_arrow", envir = asNamespace("arrow"))
+ eval.parent(mc)
+}
+
+#' @title Arrow CSV and JSON table reader classes
+#' @rdname CsvTableReader
+#' @name CsvTableReader
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description `CsvTableReader` and `JsonTableReader` wrap the Arrow C++ CSV
+#' and JSON table readers. See their usage in [read_csv_arrow()] and
+#' [read_json_arrow()], respectively.
+#'
+#' @section Factory:
+#'
+#' The `CsvTableReader$create()` and `JsonTableReader$create()` factory methods
+#' take the following arguments:
+#'
+#' - `file` An Arrow [InputStream]
+#' - `convert_options` (CSV only), `parse_options`, `read_options`: see
+#' [CsvReadOptions]
+#' - `...` additional parameters.
+#'
+#' @section Methods:
+#'
+#' - `$Read()`: returns an Arrow Table.
+#'
+#' @include arrow-package.R
+#' @export
+CsvTableReader <- R6Class("CsvTableReader",
+ inherit = ArrowObject,
+ public = list(
+ Read = function() csv___TableReader__Read(self)
+ )
+)
+CsvTableReader$create <- function(file,
+ read_options = CsvReadOptions$create(),
+ parse_options = CsvParseOptions$create(),
+ convert_options = CsvConvertOptions$create(),
+ ...) {
+ assert_is(file, "InputStream")
+ csv___TableReader__Make(file, read_options, parse_options, convert_options)
+}
+
+#' @title File reader options
+#' @rdname CsvReadOptions
+#' @name CsvReadOptions
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description `CsvReadOptions`, `CsvParseOptions`, `CsvConvertOptions`,
+#' `JsonReadOptions`, `JsonParseOptions`, and `TimestampParser` are containers for various
+#' file reading options. See their usage in [read_csv_arrow()] and
+#' [read_json_arrow()].
+#'
+#' @section Factory:
+#'
+#' The `CsvReadOptions$create()` and `JsonReadOptions$create()` factory methods
+#' take the following arguments:
+#'
+#' - `use_threads` Whether to use the global CPU thread pool
+#' - `block_size` Block size we request from the IO layer; also determines
+#' the size of chunks when `use_threads` is `TRUE`. NB: if `use_threads` is
+#' `FALSE`, JSON input must end with an empty line.
+#'
+#' `CsvReadOptions$create()` further accepts these additional arguments:
+#'
+#' - `skip_rows` Number of lines to skip before reading data (default 0)
+#' - `column_names` Character vector to supply column names. If length-0
+#' (the default), the first non-skipped row will be parsed to generate column
+#' names, unless `autogenerate_column_names` is `TRUE`.
+#' - `autogenerate_column_names` Logical: generate column names instead of
+#' using the first non-skipped row (the default)? If `TRUE`, column names will
+#' be "f0", "f1", ..., "fN".
+#'
+#' `CsvParseOptions$create()` takes the following arguments:
+#'
+#' - `delimiter` Field delimiting character (default `","`)
+#' - `quoting` Logical: are strings quoted? (default `TRUE`)
+#' - `quote_char` Quoting character, if `quoting` is `TRUE`
+#' - `double_quote` Logical: are quotes inside values double-quoted? (default `TRUE`)
+#' - `escaping` Logical: whether escaping is used (default `FALSE`)
+#' - `escape_char` Escaping character, if `escaping` is `TRUE`
+#' - `newlines_in_values` Logical: are values allowed to contain CR (`0x0d`)
+#' and LF (`0x0a`) characters? (default `FALSE`)
+#' - `ignore_empty_lines` Logical: should empty lines be ignored (default) or
+#' generate a row of missing values (if `FALSE`)?
+#'
+#' `JsonParseOptions$create()` accepts only the `newlines_in_values` argument.
+#'
+#' `CsvConvertOptions$create()` takes the following arguments:
+#'
+#' - `check_utf8` Logical: check UTF8 validity of string columns? (default `TRUE`)
+#' - `null_values` character vector of recognized spellings for null values.
+#' Analogous to the `na.strings` argument to
+#' [`read.csv()`][utils::read.csv()] or `na` in `readr::read_csv()`.
+#' - `strings_can_be_null` Logical: can string / binary columns have
+#' null values? Similar to the `quoted_na` argument to `readr::read_csv()`.
+#' (default `FALSE`)
+#' - `true_values` character vector of recognized spellings for `TRUE` values
+#' - `false_values` character vector of recognized spellings for `FALSE` values
+#' - `col_types` A `Schema` or `NULL` to infer types
+#' - `auto_dict_encode` Logical: Whether to try to automatically
+#' dictionary-encode string / binary data (think `stringsAsFactors`). Default `FALSE`.
+#' This setting is ignored for non-inferred columns (those in `col_types`).
+#' - `auto_dict_max_cardinality` If `auto_dict_encode`, string/binary columns
+#' are dictionary-encoded up to this number of unique values (default 50),
+#' after which it switches to regular encoding.
+#' - `include_columns` If non-empty, indicates the names of columns from the
+#' CSV file that should actually be read and converted (in the vector's order).
+#' - `include_missing_columns` Logical: if `include_columns` is provided, should
+#' columns named in it but not found in the data be included as a column of
+#' type `null()`? The default (`FALSE`) means that the reader will instead
+#' raise an error.
+#' - `timestamp_parsers` User-defined timestamp parsers. If more than one
+#' parser is specified, the CSV conversion logic will try parsing values
+#' starting from the beginning of this vector. Possible values are
+#' (a) `NULL`, the default, which uses the ISO-8601 parser;
+#' (b) a character vector of [strptime][base::strptime()] parse strings; or
+#' (c) a list of [TimestampParser] objects.
+#'
+#' `TimestampParser$create()` takes an optional `format` string argument.
+#' See [`strptime()`][base::strptime()] for example syntax.
+#' The default is to use an ISO-8601 format parser.
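+#'
+#' As a sketch of direct construction (`tf` is assumed to be the path to a
+#' semicolon-delimited file whose first row should be skipped), these objects
+#' can be passed to [read_delim_arrow()] and friends, where they override the
+#' readr-style arguments:
+#'
+#' ```r
+#' read_csv_arrow(
+#'   tf,
+#'   read_options = CsvReadOptions$create(skip_rows = 1L),
+#'   parse_options = CsvParseOptions$create(delimiter = ";"),
+#'   convert_options = CsvConvertOptions$create(null_values = c("", "NA", "-"))
+#' )
+#' ```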
+#'
+#' The `CsvWriteOptions$create()` factory method takes the following arguments:
+#' - `include_header` Whether to write an initial header line with column names
+#' - `batch_size` Maximum number of rows processed at a time. Default is 1024.
+#'
+#' @section Active bindings:
+#'
+#' - `column_names`: from `CsvReadOptions`
+#'
+#' @export
+CsvReadOptions <- R6Class("CsvReadOptions",
+ inherit = ArrowObject,
+ active = list(
+ column_names = function() csv___ReadOptions__column_names(self)
+ )
+)
+CsvReadOptions$create <- function(use_threads = option_use_threads(),
+ block_size = 1048576L,
+ skip_rows = 0L,
+ column_names = character(0),
+ autogenerate_column_names = FALSE) {
+ csv___ReadOptions__initialize(
+ list(
+ use_threads = use_threads,
+ block_size = block_size,
+ skip_rows = skip_rows,
+ column_names = column_names,
+ autogenerate_column_names = autogenerate_column_names
+ )
+ )
+}
+
+#' @rdname CsvReadOptions
+#' @export
+CsvWriteOptions <- R6Class("CsvWriteOptions", inherit = ArrowObject)
+CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L) {
+ assert_that(is_integerish(batch_size, n = 1, finite = TRUE), batch_size > 0)
+ csv___WriteOptions__initialize(
+ list(
+ include_header = include_header,
+ batch_size = as.integer(batch_size)
+ )
+ )
+}
+
+readr_to_csv_read_options <- function(skip, col_names) {
+ if (isTRUE(col_names)) {
+    # The C++ default (parse column names from the file) is a length-0 string array
+ col_names <- character(0)
+ }
+ if (identical(col_names, FALSE)) {
+ CsvReadOptions$create(skip_rows = skip, autogenerate_column_names = TRUE)
+ } else {
+ CsvReadOptions$create(skip_rows = skip, column_names = col_names)
+ }
+}
+
+#' @rdname CsvReadOptions
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @export
+CsvParseOptions <- R6Class("CsvParseOptions", inherit = ArrowObject)
+CsvParseOptions$create <- function(delimiter = ",",
+ quoting = TRUE,
+ quote_char = '"',
+ double_quote = TRUE,
+ escaping = FALSE,
+ escape_char = "\\",
+ newlines_in_values = FALSE,
+ ignore_empty_lines = TRUE) {
+ csv___ParseOptions__initialize(
+ list(
+ delimiter = delimiter,
+ quoting = quoting,
+ quote_char = quote_char,
+ double_quote = double_quote,
+ escaping = escaping,
+ escape_char = escape_char,
+ newlines_in_values = newlines_in_values,
+ ignore_empty_lines = ignore_empty_lines
+ )
+ )
+}
+
+readr_to_csv_parse_options <- function(delim = ",",
+ quote = '"',
+ escape_double = TRUE,
+ escape_backslash = FALSE,
+ skip_empty_rows = TRUE) {
+ # This function translates from the readr argument list to the arrow arg names
+ # TODO: validate inputs
+ CsvParseOptions$create(
+ delimiter = delim,
+ quoting = nzchar(quote),
+ quote_char = quote,
+ double_quote = escape_double,
+ escaping = escape_backslash,
+ escape_char = "\\",
+ newlines_in_values = escape_backslash,
+ ignore_empty_lines = skip_empty_rows
+ )
+}
+
+#' @rdname CsvReadOptions
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @export
+TimestampParser <- R6Class("TimestampParser",
+ inherit = ArrowObject,
+ public = list(
+ kind = function() TimestampParser__kind(self),
+ format = function() TimestampParser__format(self)
+ )
+)
+TimestampParser$create <- function(format = NULL) {
+ if (is.null(format)) {
+ TimestampParser__MakeISO8601()
+ } else {
+ TimestampParser__MakeStrptime(format)
+ }
+}
+
+#' @rdname CsvReadOptions
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @export
+CsvConvertOptions <- R6Class("CsvConvertOptions", inherit = ArrowObject)
+CsvConvertOptions$create <- function(check_utf8 = TRUE,
+ null_values = c("", "NA"),
+ true_values = c("T", "true", "TRUE"),
+ false_values = c("F", "false", "FALSE"),
+ strings_can_be_null = FALSE,
+ col_types = NULL,
+ auto_dict_encode = FALSE,
+ auto_dict_max_cardinality = 50L,
+ include_columns = character(),
+ include_missing_columns = FALSE,
+ timestamp_parsers = NULL) {
+ if (!is.null(col_types) && !inherits(col_types, "Schema")) {
+ abort(c(
+ "Unsupported `col_types` specification.",
+ i = "`col_types` must be NULL, or a <Schema>."
+ ))
+ }
+
+ csv___ConvertOptions__initialize(
+ list(
+ check_utf8 = check_utf8,
+ null_values = null_values,
+ strings_can_be_null = strings_can_be_null,
+ col_types = col_types,
+ true_values = true_values,
+ false_values = false_values,
+ auto_dict_encode = auto_dict_encode,
+ auto_dict_max_cardinality = auto_dict_max_cardinality,
+ include_columns = include_columns,
+ include_missing_columns = include_missing_columns,
+ timestamp_parsers = timestamp_parsers
+ )
+ )
+}
+
+readr_to_csv_convert_options <- function(na,
+ quoted_na,
+ col_types = NULL,
+ col_names = NULL,
+ timestamp_parsers = NULL) {
+ include_columns <- character()
+
+ if (is.character(col_types)) {
+ if (length(col_types) != 1L) {
+ abort("`col_types` is a character vector that is not of size 1")
+ }
+ n <- nchar(col_types)
+ specs <- substring(col_types, seq_len(n), seq_len(n))
+ if (!is_bare_character(col_names, n)) {
+ abort("Compact specification for `col_types` requires `col_names`")
+ }
+
+ col_types <- set_names(nm = col_names, map2(specs, col_names, ~ {
+ switch(.x,
+ "c" = utf8(),
+ "i" = int32(),
+ "n" = float64(),
+ "d" = float64(),
+ "l" = bool(),
+ "f" = dictionary(),
+ "D" = date32(),
+ "T" = timestamp(),
+ "t" = time32(),
+ "_" = null(),
+ "-" = null(),
+ "?" = NULL,
+      abort(paste0(
+        "Unsupported compact specification: '", .x, "' for column '", .y, "'"
+      ))
+ )
+ }))
+ # To "guess" types, omit them from col_types
+ col_types <- keep(col_types, ~ !is.null(.x))
+ col_types <- schema(!!!col_types)
+ }
+
+ if (!is.null(col_types)) {
+ assert_is(col_types, "Schema")
+ # If any columns are null(), drop them
+ # (by specifying the other columns in include_columns)
+ nulls <- map_lgl(col_types$fields, ~ .$type$Equals(null()))
+ if (any(nulls)) {
+ include_columns <- setdiff(col_names, names(col_types)[nulls])
+ }
+ }
+ CsvConvertOptions$create(
+ null_values = na,
+ strings_can_be_null = quoted_na,
+ col_types = col_types,
+ timestamp_parsers = timestamp_parsers,
+ include_columns = include_columns
+ )
+}
+
+#' Write CSV file to disk
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param include_header Whether to write an initial header line with column names
+#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @examplesIf arrow_available()
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_csv_arrow(mtcars, tf)
+#' @include arrow-package.R
+write_csv_arrow <- function(x,
+ sink,
+ include_header = TRUE,
+ batch_size = 1024L) {
+ write_options <- CsvWriteOptions$create(include_header, batch_size)
+
+ x_out <- x
+ if (is.data.frame(x)) {
+ x <- Table$create(x)
+ }
+
+ assert_that(is_writable_table(x))
+
+ if (!inherits(sink, "OutputStream")) {
+ sink <- make_output_stream(sink)
+ on.exit(sink$close())
+ }
+
+ if (inherits(x, "RecordBatch")) {
+ csv___WriteCSV__RecordBatch(x, write_options, sink)
+ } else if (inherits(x, "Table")) {
+ csv___WriteCSV__Table(x, write_options, sink)
+ }
+
+ invisible(x_out)
+}
diff --git a/src/arrow/r/R/dataset-factory.R b/src/arrow/r/R/dataset-factory.R
new file mode 100644
index 000000000..c56a6b181
--- /dev/null
+++ b/src/arrow/r/R/dataset-factory.R
@@ -0,0 +1,170 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include dataset.R
+
+#' @usage NULL
+#' @format NULL
+#' @rdname Dataset
+#' @export
+DatasetFactory <- R6Class("DatasetFactory",
+ inherit = ArrowObject,
+ public = list(
+ Finish = function(schema = NULL, unify_schemas = FALSE) {
+ if (is.null(schema)) {
+ dataset___DatasetFactory__Finish1(self, unify_schemas)
+ } else {
+ assert_is(schema, "Schema")
+ dataset___DatasetFactory__Finish2(self, schema)
+ }
+ },
+ Inspect = function(unify_schemas = FALSE) {
+ dataset___DatasetFactory__Inspect(self, unify_schemas)
+ }
+ )
+)
+DatasetFactory$create <- function(x,
+ filesystem = NULL,
+ format = c("parquet", "arrow", "ipc", "feather", "csv", "tsv", "text"),
+ partitioning = NULL,
+ ...) {
+ if (is_list_of(x, "DatasetFactory")) {
+ return(dataset___UnionDatasetFactory__Make(x))
+ }
+
+ if (is.character(format)) {
+ format <- FileFormat$create(match.arg(format), ...)
+ } else {
+ assert_is(format, "FileFormat")
+ }
+
+ path_and_fs <- get_paths_and_filesystem(x, filesystem)
+ info <- path_and_fs$fs$GetFileInfo(path_and_fs$path)
+
+ if (length(info) > 1 || info[[1]]$type == FileType$File) {
+ # x looks like a vector of one or more file paths (not a directory path)
+ return(FileSystemDatasetFactory$create(path_and_fs$fs, NULL, path_and_fs$path, format))
+ }
+
+ if (!is.null(partitioning)) {
+ if (inherits(partitioning, "Schema")) {
+ partitioning <- DirectoryPartitioning$create(partitioning)
+ } else if (is.character(partitioning)) {
+ # These are the column/field names, and we should autodetect their types
+ partitioning <- DirectoryPartitioningFactory$create(partitioning)
+ }
+ }
+
+ selector <- FileSelector$create(path_and_fs$path, allow_not_found = FALSE, recursive = TRUE)
+
+ FileSystemDatasetFactory$create(path_and_fs$fs, selector, NULL, format, partitioning)
+}
+
+#' Create a DatasetFactory
+#'
+#' A [Dataset] can be constructed from one or more [DatasetFactory]s.
+#' This function helps you construct a `DatasetFactory` that you can pass to
+#' [open_dataset()].
+#'
+#' If you only need a single `DatasetFactory` (for example, you have a
+#' single directory containing Parquet files), you can call `open_dataset()`
+#' directly. Use `dataset_factory()` when you
+#' want to combine different directories, file systems, or file formats.
+#'
+#' @param x A string path to a directory containing data files, a vector of
+#' one or more string paths to data files, or a list of `DatasetFactory` objects
+#' whose datasets should be combined. If a list of `DatasetFactory` objects is
+#' provided, it will be used to construct a `UnionDatasetFactory` and the other
+#' arguments will be ignored.
+#' @param filesystem A [FileSystem] object; if omitted, the `FileSystem` will
+#' be detected from `x`
+#' @param format A [FileFormat] object, or a string identifier of the format of
+#' the files in `x`. Currently supported values:
+#' * "parquet"
+#' * "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
+#' only version 2 files are supported
+#' * "csv"/"text", aliases for the same thing (because comma is the default
+#' delimiter for text files)
+#' * "tsv", equivalent to passing `format = "text", delimiter = "\t"`
+#'
+#' Default is "parquet", unless a `delimiter` is also specified, in which case
+#' it is assumed to be "text".
+#' @param partitioning One of
+#' * A `Schema`, in which case the file paths relative to `sources` will be
+#' parsed, and path segments will be matched with the schema fields. For
+#' example, `schema(year = int16(), month = int8())` would create partitions
+#' for file paths like "2019/01/file.parquet", "2019/02/file.parquet", etc.
+#' * A character vector that defines the field names corresponding to those
+#' path segments (that is, you're providing the names that would correspond
+#' to a `Schema` but the types will be autodetected)
+#' * A `HivePartitioning` or `HivePartitioningFactory`, as returned
+#' by [hive_partition()] which parses explicit or autodetected fields from
+#' Hive-style path segments
+#' * `NULL` for no partitioning
+#' @param ... Additional format-specific options, passed to
+#' `FileFormat$create()`. For CSV options, note that you can specify them either
+#' with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
+#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.).
+#' Not all `readr` options are currently supported; please file an issue if you
+#' encounter one that `arrow` should support.
+#' @return A `DatasetFactory` object. Pass this to [open_dataset()],
+#' in a list potentially with other `DatasetFactory` objects, to create
+#' a `Dataset`.
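+#' @examplesIf arrow_with_dataset() && arrow_with_parquet()
+#' # A minimal sketch: build a factory from a directory of Parquet files,
+#' # then pass it (in a list) to open_dataset()
+#' tf <- tempfile()
+#' dir.create(tf)
+#' write_parquet(mtcars, file.path(tf, "part-0.parquet"))
+#' ds <- open_dataset(list(dataset_factory(tf, format = "parquet")))
+#' unlink(tf, recursive = TRUE)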
+#' @export
+dataset_factory <- DatasetFactory$create
+
+#' @usage NULL
+#' @format NULL
+#' @rdname Dataset
+#' @export
+FileSystemDatasetFactory <- R6Class("FileSystemDatasetFactory",
+ inherit = DatasetFactory
+)
+FileSystemDatasetFactory$create <- function(filesystem,
+ selector = NULL,
+ paths = NULL,
+ format,
+ partitioning = NULL) {
+ assert_is(filesystem, "FileSystem")
+ is.null(selector) || assert_is(selector, "FileSelector")
+ is.null(paths) || assert_is(paths, "character")
+ assert_that(
+ xor(is.null(selector), is.null(paths)),
+ msg = "Either selector or paths must be specified"
+ )
+ assert_is(format, "FileFormat")
+ if (!is.null(paths)) {
+ assert_that(is.null(partitioning), msg = "Partitioning not supported with paths")
+ }
+
+ if (!is.null(paths)) {
+ ptr <- dataset___FileSystemDatasetFactory__Make0(filesystem, paths, format)
+ } else if (is.null(partitioning)) {
+ ptr <- dataset___FileSystemDatasetFactory__Make1(filesystem, selector, format)
+ } else if (inherits(partitioning, "PartitioningFactory")) {
+ ptr <- dataset___FileSystemDatasetFactory__Make3(filesystem, selector, format, partitioning)
+ } else if (inherits(partitioning, "Partitioning")) {
+ ptr <- dataset___FileSystemDatasetFactory__Make2(filesystem, selector, format, partitioning)
+ } else {
+ stop(
+ "Expected 'partitioning' to be NULL, PartitioningFactory or Partitioning",
+ call. = FALSE
+ )
+ }
+
+ ptr
+}
diff --git a/src/arrow/r/R/dataset-format.R b/src/arrow/r/R/dataset-format.R
new file mode 100644
index 000000000..b0b93219e
--- /dev/null
+++ b/src/arrow/r/R/dataset-format.R
@@ -0,0 +1,353 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Dataset file formats
+#'
+#' @description
+#' A `FileFormat` holds information about how to read and parse the files
+#' included in a `Dataset`. There are subclasses corresponding to the supported
+#' file formats (`ParquetFileFormat` and `IpcFileFormat`).
+#'
+#' @section Factory:
+#' `FileFormat$create()` takes the following arguments:
+#' * `format`: A string identifier of the file format. Currently supported values:
+#' * "parquet"
+#' * "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
+#' only version 2 files are supported
+#' * "csv"/"text", aliases for the same thing (because comma is the default
+#' delimiter for text files)
+#' * "tsv", equivalent to passing `format = "text", delimiter = "\t"`
+#' * `...`: Additional format-specific options
+#'
+#' `format = "parquet"`:
+#' * `dict_columns`: Names of columns which should be read as dictionaries.
+#' * Any Parquet options from [FragmentScanOptions].
+#'
+#' `format = "text"`: see [CsvParseOptions]. Note that you can specify them either
+#' with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
+#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.).
+#' Not all `readr` options are currently supported; please file an issue if
+#' you encounter one that `arrow` should support. Also, the following options are
+#' supported. From [CsvReadOptions]:
+#' * `skip_rows`
+#' * `column_names`
+#' * `autogenerate_column_names`
+#' From [CsvFragmentScanOptions] (these values can be overridden at scan time):
+#' * `convert_options`: a [CsvConvertOptions]
+#' * `block_size`
+#'
+#' It returns the appropriate subclass of `FileFormat` (e.g. `ParquetFileFormat`).
+#' @rdname FileFormat
+#' @name FileFormat
+#' @examplesIf arrow_with_dataset() && tolower(Sys.info()[["sysname"]]) != "windows"
+#' ## Semi-colon delimited files
+#' # Set up directory for examples
+#' tf <- tempfile()
+#' dir.create(tf)
+#' on.exit(unlink(tf))
+#' write.table(mtcars, file.path(tf, "file1.txt"), sep = ";", row.names = FALSE)
+#'
+#' # Create FileFormat object
+#' format <- FileFormat$create(format = "text", delimiter = ";")
+#'
+#' open_dataset(tf, format = format)
+#' @export
+FileFormat <- R6Class("FileFormat",
+ inherit = ArrowObject,
+ active = list(
+ # @description
+ # Return the `FileFormat`'s type
+ type = function() dataset___FileFormat__type_name(self)
+ )
+)
+FileFormat$create <- function(format, schema = NULL, ...) {
+ opt_names <- names(list(...))
+ if (format %in% c("csv", "text") || any(opt_names %in% c("delim", "delimiter"))) {
+ CsvFileFormat$create(schema = schema, ...)
+  } else if (format == "tsv") {
+ CsvFileFormat$create(delimiter = "\t", schema = schema, ...)
+ } else if (format == "parquet") {
+ ParquetFileFormat$create(...)
+ } else if (format %in% c("ipc", "arrow", "feather")) { # These are aliases for the same thing
+ dataset___IpcFileFormat__Make()
+ } else {
+ stop("Unsupported file format: ", format, call. = FALSE)
+ }
+}
+
+#' @export
+as.character.FileFormat <- function(x, ...) {
+ out <- x$type
+ # Slight hack: special case IPC -> feather, otherwise is just the type_name
+ ifelse(out == "ipc", "feather", out)
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FileFormat
+#' @export
+ParquetFileFormat <- R6Class("ParquetFileFormat", inherit = FileFormat)
+ParquetFileFormat$create <- function(...,
+ dict_columns = character(0)) {
+ options <- ParquetFragmentScanOptions$create(...)
+ dataset___ParquetFileFormat__Make(options, dict_columns)
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FileFormat
+#' @export
+IpcFileFormat <- R6Class("IpcFileFormat", inherit = FileFormat)
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FileFormat
+#' @export
+CsvFileFormat <- R6Class("CsvFileFormat", inherit = FileFormat)
+CsvFileFormat$create <- function(...,
+ opts = csv_file_format_parse_options(...),
+ convert_options = csv_file_format_convert_opts(...),
+ read_options = csv_file_format_read_opts(...)) {
+ dataset___CsvFileFormat__Make(opts, convert_options, read_options)
+}
+
+# Support both readr-style option names and Arrow C++ option names
+csv_file_format_parse_options <- function(...) {
+ opts <- list(...)
+ # Filter out arguments meant for CsvConvertOptions/CsvReadOptions
+ convert_opts <- names(formals(CsvConvertOptions$create))
+ read_opts <- names(formals(CsvReadOptions$create))
+ opts[convert_opts] <- NULL
+ opts[read_opts] <- NULL
+ opts[["schema"]] <- NULL
+ opt_names <- names(opts)
+ # Catch any readr-style options specified with full option names that are
+ # supported by read_delim_arrow() (and its wrappers) but are not yet
+ # supported here
+ unsup_readr_opts <- setdiff(
+ names(formals(read_delim_arrow)),
+ names(formals(readr_to_csv_parse_options))
+ )
+ is_unsup_opt <- opt_names %in% unsup_readr_opts
+ unsup_opts <- opt_names[is_unsup_opt]
+ if (length(unsup_opts)) {
+ stop(
+ "The following ",
+ ngettext(length(unsup_opts), "option is ", "options are "),
+ "supported in \"read_delim_arrow\" functions ",
+ "but not yet supported here: ",
+ oxford_paste(unsup_opts),
+ call. = FALSE
+ )
+ }
+ # Catch any options with full or partial names that do not match any of the
+ # recognized Arrow C++ option names or readr-style option names
+ arrow_opts <- names(formals(CsvParseOptions$create))
+ readr_opts <- names(formals(readr_to_csv_parse_options))
+ is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts))
+ is_readr_opt <- !is.na(pmatch(opt_names, readr_opts))
+ unrec_opts <- opt_names[!is_arrow_opt & !is_readr_opt]
+ if (length(unrec_opts)) {
+ stop(
+ "Unrecognized ",
+ ngettext(length(unrec_opts), "option", "options"),
+ ": ",
+ oxford_paste(unrec_opts),
+ call. = FALSE
+ )
+ }
+ # Catch options with ambiguous partial names (such as "del") that make it
+ # unclear whether the user is specifying Arrow C++ options ("delimiter") or
+ # readr-style options ("delim")
+ is_ambig_opt <- is.na(pmatch(opt_names, c(arrow_opts, readr_opts)))
+ ambig_opts <- opt_names[is_ambig_opt]
+ if (length(ambig_opts)) {
+ stop("Ambiguous ",
+ ngettext(length(ambig_opts), "option", "options"),
+ ": ",
+ oxford_paste(ambig_opts),
+ ". Use full argument names",
+ call. = FALSE
+ )
+ }
+ if (any(is_readr_opt)) {
+ # Catch cases when the user specifies a mix of Arrow C++ options and
+ # readr-style options
+ if (!all(is_readr_opt)) {
+ stop("Use either Arrow parse options or readr parse options, not both",
+ call. = FALSE
+ )
+ }
+ do.call(readr_to_csv_parse_options, opts) # all options have readr-style names
+ } else {
+ do.call(CsvParseOptions$create, opts) # all options have Arrow C++ names
+ }
+}
+
+csv_file_format_convert_opts <- function(...) {
+ opts <- list(...)
+ # Filter out arguments meant for CsvParseOptions/CsvReadOptions
+ arrow_opts <- names(formals(CsvParseOptions$create))
+ readr_opts <- names(formals(readr_to_csv_parse_options))
+ read_opts <- names(formals(CsvReadOptions$create))
+ opts[arrow_opts] <- NULL
+ opts[readr_opts] <- NULL
+ opts[read_opts] <- NULL
+ opts[["schema"]] <- NULL
+ do.call(CsvConvertOptions$create, opts)
+}
+
+csv_file_format_read_opts <- function(schema = NULL, ...) {
+ opts <- list(...)
+ # Filter out arguments meant for CsvParseOptions/CsvConvertOptions
+ arrow_opts <- names(formals(CsvParseOptions$create))
+ readr_opts <- names(formals(readr_to_csv_parse_options))
+ convert_opts <- names(formals(CsvConvertOptions$create))
+ opts[arrow_opts] <- NULL
+ opts[readr_opts] <- NULL
+ opts[convert_opts] <- NULL
+ if (!is.null(schema)) {
+ opts[["column_names"]] <- names(schema)
+ }
+ do.call(CsvReadOptions$create, opts)
+}
+
+#' Format-specific scan options
+#'
+#' @description
+#' A `FragmentScanOptions` holds options specific to a `FileFormat` and a scan
+#' operation.
+#'
+#' @section Factory:
+#' `FragmentScanOptions$create()` takes the following arguments:
+#' * `format`: A string identifier of the file format. Currently supported values:
+#' * "parquet"
+#' * "csv"/"text", aliases for the same format.
+#' * `...`: Additional format-specific options
+#'
+#' `format = "parquet"`:
+#' * `use_buffered_stream`: Read files through buffered input streams rather than
+#' loading entire row groups at once. This may be enabled
+#' to reduce memory overhead. Disabled by default.
+#' * `buffer_size`: Size of buffered stream, if enabled. Default is 8KB.
+#' * `pre_buffer`: Pre-buffer the raw Parquet data. This can improve performance
+#' on high-latency filesystems. Disabled by default.
+#'
+#' `format = "text"`: see [CsvConvertOptions]. Note that options can only be
+#' specified with the Arrow C++ library naming. Also, "block_size" from
+#' [CsvReadOptions] may be given.
+#'
+#' It returns the appropriate subclass of `FragmentScanOptions`
+#' (e.g. `CsvFragmentScanOptions`).
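+#'
+#' For example, a sketch of enabling buffered-stream reads for Parquet scans
+#' (the resulting object can be passed to `Scanner$create()` via its
+#' `fragment_scan_options` argument):
+#'
+#' ```r
+#' fso <- FragmentScanOptions$create("parquet", use_buffered_stream = TRUE)
+#' ```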
+#' @rdname FragmentScanOptions
+#' @name FragmentScanOptions
+#' @export
+FragmentScanOptions <- R6Class("FragmentScanOptions",
+ inherit = ArrowObject,
+ active = list(
+ # @description
+ # Return the `FragmentScanOptions`'s type
+ type = function() dataset___FragmentScanOptions__type_name(self)
+ )
+)
+FragmentScanOptions$create <- function(format, ...) {
+ if (format %in% c("csv", "text", "tsv")) {
+ CsvFragmentScanOptions$create(...)
+ } else if (format == "parquet") {
+ ParquetFragmentScanOptions$create(...)
+ } else {
+ stop("Unsupported file format: ", format, call. = FALSE)
+ }
+}
+
+#' @export
+as.character.FragmentScanOptions <- function(x, ...) {
+ x$type
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FragmentScanOptions
+#' @export
+CsvFragmentScanOptions <- R6Class("CsvFragmentScanOptions", inherit = FragmentScanOptions)
+CsvFragmentScanOptions$create <- function(...,
+ convert_opts = csv_file_format_convert_opts(...),
+ read_opts = csv_file_format_read_opts(...)) {
+ dataset___CsvFragmentScanOptions__Make(convert_opts, read_opts)
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FragmentScanOptions
+#' @export
+ParquetFragmentScanOptions <- R6Class("ParquetFragmentScanOptions", inherit = FragmentScanOptions)
+ParquetFragmentScanOptions$create <- function(use_buffered_stream = FALSE,
+                                              buffer_size = 8192,
+ pre_buffer = TRUE) {
+ dataset___ParquetFragmentScanOptions__Make(use_buffered_stream, buffer_size, pre_buffer)
+}
+
+#' Format-specific write options
+#'
+#' @description
+#' A `FileWriteOptions` holds write options specific to a `FileFormat`.
+FileWriteOptions <- R6Class("FileWriteOptions",
+ inherit = ArrowObject,
+ public = list(
+ update = function(table, ...) {
+ if (self$type == "parquet") {
+ dataset___ParquetFileWriteOptions__update(
+ self,
+ ParquetWriterProperties$create(table, ...),
+ ParquetArrowWriterProperties$create(...)
+ )
+ } else if (self$type == "ipc") {
+ args <- list(...)
+ if (is.null(args$codec)) {
+ dataset___IpcFileWriteOptions__update1(
+ self,
+ get_ipc_use_legacy_format(args$use_legacy_format),
+ get_ipc_metadata_version(args$metadata_version)
+ )
+ } else {
+ dataset___IpcFileWriteOptions__update2(
+ self,
+ get_ipc_use_legacy_format(args$use_legacy_format),
+ args$codec,
+ get_ipc_metadata_version(args$metadata_version)
+ )
+ }
+ } else if (self$type == "csv") {
+ dataset___CsvFileWriteOptions__update(
+ self,
+ CsvWriteOptions$create(...)
+ )
+ }
+ invisible(self)
+ }
+ ),
+ active = list(
+ type = function() dataset___FileWriteOptions__type_name(self)
+ )
+)
+FileWriteOptions$create <- function(format, ...) {
+ if (!inherits(format, "FileFormat")) {
+ format <- FileFormat$create(format)
+ }
+ options <- dataset___FileFormat__DefaultWriteOptions(format)
+ options$update(...)
+}
diff --git a/src/arrow/r/R/dataset-partition.R b/src/arrow/r/R/dataset-partition.R
new file mode 100644
index 000000000..35d5bc00c
--- /dev/null
+++ b/src/arrow/r/R/dataset-partition.R
@@ -0,0 +1,132 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Define Partitioning for a Dataset
+#'
+#' @description
+#' Pass a `Partitioning` object to a [FileSystemDatasetFactory]'s `$create()`
+#' method to indicate how the file's paths should be interpreted to define
+#' partitioning.
+#'
+#' `DirectoryPartitioning` describes how to interpret raw path segments, in
+#' order. For example, `schema(year = int16(), month = int8())` would define
+#' partitions for file paths like "2019/01/file.parquet",
+#' "2019/02/file.parquet", etc. In this scheme `NULL` values will be skipped. In
+#' "2019/02/file.parquet", etc. In this scheme, `NULL` values will be skipped:
+#' in the previous example, if the month was `NA` (or `NULL`) when writing a
+#' dataset, the files would be placed in "2019/file.parquet". When reading, the
+#' error will be raised if an outer directory is `NULL` and an inner directory
+#' is not.
+#'
+#' `HivePartitioning` is for Hive-style partitioning, which embeds field
+#' names and values in path segments, such as
+#' "/year=2019/month=2/data.parquet". Because fields are named in the path
+#' segments, order does not matter. This partitioning scheme allows `NULL`
+#' values. They will be replaced by a configurable `null_fallback` which
+#' defaults to the string `"__HIVE_DEFAULT_PARTITION__"` when writing. When
+#' reading, the `null_fallback` string will be replaced with `NA`s as
+#' appropriate.
+#'
+#' `PartitioningFactory` subclasses instruct the `DatasetFactory` to detect
+#' partition features from the file paths.
+#' @section Factory:
+#' Both `DirectoryPartitioning$create()` and `HivePartitioning$create()`
+#' methods take a [Schema] as a single input argument. The helper
+#' function [`hive_partition(...)`][hive_partition] is shorthand for
+#' `HivePartitioning$create(schema(...))`.
+#'
+#' With `DirectoryPartitioningFactory$create()`, you can provide just the
+#' names of the path segments (in our example, `c("year", "month")`), and
+#' the `DatasetFactory` will infer the data types for those partition variables.
+#' `HivePartitioningFactory$create()` takes no arguments: both variable names
+#' and their types can be inferred from the file paths. `hive_partition()` with
+#' no arguments returns a `HivePartitioningFactory`.
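+#'
+#' As a sketch, the two styles for paths like "2019/01/file.parquet"
+#' (directory partitioning) and "year=2019/month=01/file.parquet" (Hive-style):
+#'
+#' ```r
+#' DirectoryPartitioning$create(schema(year = int16(), month = int8()))
+#' hive_partition(year = int16(), month = int8())
+#' ```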
+#' @name Partitioning
+#' @rdname Partitioning
+#' @export
+Partitioning <- R6Class("Partitioning", inherit = ArrowObject)
+#' @usage NULL
+#' @format NULL
+#' @rdname Partitioning
+#' @export
+DirectoryPartitioning <- R6Class("DirectoryPartitioning", inherit = Partitioning)
+DirectoryPartitioning$create <- function(schm, segment_encoding = "uri") {
+ dataset___DirectoryPartitioning(schm, segment_encoding = segment_encoding)
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname Partitioning
+#' @export
+HivePartitioning <- R6Class("HivePartitioning", inherit = Partitioning)
+HivePartitioning$create <- function(schm, null_fallback = NULL, segment_encoding = "uri") {
+ dataset___HivePartitioning(schm,
+ null_fallback = null_fallback_or_default(null_fallback),
+ segment_encoding = segment_encoding
+ )
+}
+
+#' Construct Hive partitioning
+#'
+#' Hive partitioning embeds field names and values in path segments, such as
+#' "/year=2019/month=2/data.parquet".
+#'
+#' Because fields are named in the path segments, order of fields passed to
+#' `hive_partition()` does not matter.
+#' @param ... named list of [data types][data-type], passed to [schema()]
+#' @param null_fallback character to be used in place of missing values (`NA` or `NULL`)
+#' in partition columns. Default is `"__HIVE_DEFAULT_PARTITION__"`,
+#' which is what Hive uses.
+#' @param segment_encoding Decode partition segments after splitting paths.
+#' Default is `"uri"` (URI-decode segments). May also be `"none"` (leave as-is).
+#' @return A [HivePartitioning][Partitioning], or a `HivePartitioningFactory` if
+#' calling `hive_partition()` with no arguments.
+#' @examplesIf arrow_with_dataset()
+#' hive_partition(year = int16(), month = int8())
+#' @export
+hive_partition <- function(..., null_fallback = NULL, segment_encoding = "uri") {
+ schm <- schema(...)
+ if (length(schm) == 0) {
+ HivePartitioningFactory$create(null_fallback, segment_encoding)
+ } else {
+ HivePartitioning$create(schm, null_fallback, segment_encoding)
+ }
+}
+
+PartitioningFactory <- R6Class("PartitioningFactory", inherit = ArrowObject)
+
+#' @usage NULL
+#' @format NULL
+#' @rdname Partitioning
+#' @export
+DirectoryPartitioningFactory <- R6Class("DirectoryPartitioningFactory", inherit = PartitioningFactory)
+DirectoryPartitioningFactory$create <- function(field_names, segment_encoding = "uri") {
+ dataset___DirectoryPartitioning__MakeFactory(field_names, segment_encoding)
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname Partitioning
+#' @export
+HivePartitioningFactory <- R6Class("HivePartitioningFactory", inherit = PartitioningFactory)
+HivePartitioningFactory$create <- function(null_fallback = NULL, segment_encoding = "uri") {
+ dataset___HivePartitioning__MakeFactory(null_fallback_or_default(null_fallback), segment_encoding)
+}
+
+null_fallback_or_default <- function(null_fallback) {
+ null_fallback %||% "__HIVE_DEFAULT_PARTITION__"
+}
diff --git a/src/arrow/r/R/dataset-scan.R b/src/arrow/r/R/dataset-scan.R
new file mode 100644
index 000000000..03c926fb4
--- /dev/null
+++ b/src/arrow/r/R/dataset-scan.R
@@ -0,0 +1,262 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Scan the contents of a dataset
+#'
+#' @description
+#' A `Scanner` iterates over a [Dataset]'s fragments and returns data
+#' according to given row filtering and column projection. A `ScannerBuilder`
+#' can help create one.
+#'
+#' @section Factory:
+#' `Scanner$create()` wraps the `ScannerBuilder` interface to make a `Scanner`.
+#' It takes the following arguments:
+#'
+#' * `dataset`: A `Dataset` or `arrow_dplyr_query` object, as returned by the
+#' `dplyr` methods on `Dataset`.
+#' * `projection`: A character vector of column names to select columns or a
+#' named list of expressions
+#' * `filter`: An `Expression` to filter the scanned rows by, or `TRUE` (default)
+#' to keep all rows.
+#' * `use_threads`: logical: should scanning use multithreading? Default `TRUE`
+#' * `use_async`: logical: should the async scanner (performs better on
+#' high-latency/highly parallel filesystems like S3) be used? Default `FALSE`
+#' * `...`: Additional arguments, currently ignored
+#' @section Methods:
+#' `ScannerBuilder` has the following methods:
+#'
+#' - `$Project(cols)`: Indicate that the scan should only return columns given
+#' by `cols`, a character vector of column names
+#' - `$Filter(expr)`: Filter rows by an [Expression].
+#' - `$UseThreads(threads)`: logical: should the scan use multithreading?
+#' The method's default input is `TRUE`, but you must call the method to enable
+#' multithreading because the scanner default is `FALSE`.
+#' - `$UseAsync(use_async)`: logical: should the async scanner be used?
+#' - `$BatchSize(batch_size)`: integer: Maximum row count of scanned record
+#' batches, default is 32K. If scanned record batches are overflowing memory
+#' then this method can be called to reduce their size.
+#' - `$schema`: Active binding, returns the [Schema] of the Dataset
+#' - `$Finish()`: Returns a `Scanner`
+#'
+#' `Scanner` currently has a single method, `$ToTable()`, which evaluates the
+#' query and returns an Arrow [Table].
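+#'
+#' A minimal sketch, scanning two columns of an in-memory table (in-memory
+#' data is wrapped in a Dataset, so the dataset feature must be enabled):
+#'
+#' ```r
+#' tab <- Table$create(mtcars)
+#' scanner <- Scanner$create(tab, projection = c("mpg", "cyl"))
+#' head(as.data.frame(scanner$ToTable()))
+#' ```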
+#' @rdname Scanner
+#' @name Scanner
+#' @export
+Scanner <- R6Class("Scanner",
+ inherit = ArrowObject,
+ public = list(
+ ToTable = function() dataset___Scanner__ToTable(self),
+ ScanBatches = function() dataset___Scanner__ScanBatches(self),
+ ToRecordBatchReader = function() dataset___Scanner__ToRecordBatchReader(self),
+ CountRows = function() dataset___Scanner__CountRows(self)
+ ),
+ active = list(
+ schema = function() dataset___Scanner__schema(self)
+ )
+)
+Scanner$create <- function(dataset,
+ projection = NULL,
+ filter = TRUE,
+ use_threads = option_use_threads(),
+ use_async = getOption("arrow.use_async", FALSE),
+ batch_size = NULL,
+ fragment_scan_options = NULL,
+ ...) {
+ if (inherits(dataset, "arrow_dplyr_query")) {
+ if (is_collapsed(dataset)) {
+ # TODO: Is there a way to get a RecordBatchReader rather than evaluating?
+ dataset$.data <- as_adq(dplyr::compute(dataset$.data))$.data
+ }
+
+ proj <- c(dataset$selected_columns, dataset$temp_columns)
+
+ if (!is.null(projection)) {
+ if (is.character(projection)) {
+ stopifnot("attempting to project with unknown columns" = all(projection %in% names(proj)))
+ proj <- proj[projection]
+ } else {
+ # TODO: ARROW-13802 accepting lists of Expressions as a projection
+ warning(
+ "Scanner$create(projection = ...) must be a character vector, ",
+ "ignoring the projection argument."
+ )
+ }
+ }
+
+ if (!isTRUE(filter)) {
+ dataset <- set_filters(dataset, filter)
+ }
+
+ return(Scanner$create(
+ dataset$.data,
+ proj,
+ dataset$filtered_rows,
+ use_threads,
+ use_async,
+ batch_size,
+ fragment_scan_options,
+ ...
+ ))
+ }
+
+ scanner_builder <- ScannerBuilder$create(dataset)
+ if (use_threads) {
+ scanner_builder$UseThreads()
+ }
+ if (use_async) {
+ scanner_builder$UseAsync()
+ }
+ if (!is.null(projection)) {
+ scanner_builder$Project(projection)
+ }
+ if (!isTRUE(filter)) {
+ scanner_builder$Filter(filter)
+ }
+ if (is_integerish(batch_size)) {
+ scanner_builder$BatchSize(batch_size)
+ }
+ if (!is.null(fragment_scan_options)) {
+ scanner_builder$FragmentScanOptions(fragment_scan_options)
+ }
+ scanner_builder$Finish()
+}
+
+#' @export
+names.Scanner <- function(x) names(x$schema)
+
+#' @export
+head.Scanner <- function(x, n = 6L, ...) {
+ assert_that(n > 0) # For now
+ dataset___Scanner__head(x, n)
+}
+
+#' @export
+tail.Scanner <- function(x, n = 6L, ...) {
+ assert_that(n > 0) # For now
+ result <- list()
+ batch_num <- 0
+ for (batch in rev(dataset___Scanner__ScanBatches(x))) {
+ batch_num <- batch_num + 1
+ result[[batch_num]] <- tail(batch, n)
+ n <- n - nrow(batch)
+ if (n <= 0) break
+ }
+ Table$create(!!!rev(result))
+}
+
+ScanTask <- R6Class("ScanTask",
+ inherit = ArrowObject,
+ public = list(
+ Execute = function() dataset___ScanTask__get_batches(self)
+ )
+)
+
+#' Apply a function to a stream of RecordBatches
+#'
+#' As an alternative to calling `collect()` on a `Dataset` query, you can
+#' use this function to access the stream of `RecordBatch`es in the `Dataset`.
+#' This lets you aggregate on each chunk and pull the intermediate results into
+#' a `data.frame` for further aggregation, even if you couldn't fit the whole
+#' `Dataset` result in memory.
+#'
+#' This is experimental and not recommended for production use.
+#'
+#' @param X A `Dataset` or `arrow_dplyr_query` object, as returned by the
+#' `dplyr` methods on `Dataset`.
+#' @param FUN A function or `purrr`-style lambda expression to apply to each
+#' batch
+#' @param ... Additional arguments passed to `FUN`
+#' @param .data.frame logical: collect the resulting chunks into a single
+#' `data.frame`? Default `TRUE`
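+#' @examplesIf arrow_with_dataset()
+#' # A sketch: count the rows in each scanned batch of an in-memory dataset
+#' ds <- InMemoryDataset$create(mtcars)
+#' map_batches(ds, ~ data.frame(n_rows = nrow(.x)))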
+#' @export
+map_batches <- function(X, FUN, ..., .data.frame = TRUE) {
+ if (.data.frame) {
+ lapply <- map_dfr
+ }
+ scanner <- Scanner$create(ensure_group_vars(X))
+ FUN <- as_mapper(FUN)
+ lapply(scanner$ScanBatches(), function(batch) {
+ # TODO: wrap batch in arrow_dplyr_query with X$selected_columns,
+ # X$temp_columns, and X$group_by_vars
+ # if X is arrow_dplyr_query, if some other arg (.dplyr?) == TRUE
+ FUN(batch, ...)
+ })
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname Scanner
+#' @export
+ScannerBuilder <- R6Class("ScannerBuilder",
+ inherit = ArrowObject,
+ public = list(
+ Project = function(cols) {
+ # cols is either a character vector or a named list of Expressions
+ if (is.character(cols)) {
+ dataset___ScannerBuilder__ProjectNames(self, cols)
+ } else if (length(cols) == 0) {
+ # Empty projection
+ dataset___ScannerBuilder__ProjectNames(self, character(0))
+ } else {
+ # List of Expressions
+ dataset___ScannerBuilder__ProjectExprs(self, cols, names(cols))
+ }
+ self
+ },
+ Filter = function(expr) {
+ assert_is(expr, "Expression")
+ dataset___ScannerBuilder__Filter(self, expr)
+ self
+ },
+ UseThreads = function(threads = option_use_threads()) {
+ dataset___ScannerBuilder__UseThreads(self, threads)
+ self
+ },
+ UseAsync = function(use_async = TRUE) {
+ dataset___ScannerBuilder__UseAsync(self, use_async)
+ self
+ },
+ BatchSize = function(batch_size) {
+ dataset___ScannerBuilder__BatchSize(self, batch_size)
+ self
+ },
+ FragmentScanOptions = function(options) {
+ dataset___ScannerBuilder__FragmentScanOptions(self, options)
+ self
+ },
+ Finish = function() dataset___ScannerBuilder__Finish(self)
+ ),
+ active = list(
+ schema = function() dataset___ScannerBuilder__schema(self)
+ )
+)
+ScannerBuilder$create <- function(dataset) {
+ if (inherits(dataset, "RecordBatchReader")) {
+ return(dataset___ScannerBuilder__FromRecordBatchReader(dataset))
+ }
+
+ if (inherits(dataset, c("data.frame", "ArrowTabular"))) {
+ dataset <- InMemoryDataset$create(dataset)
+ }
+ assert_is(dataset, "Dataset")
+
+ dataset$NewScan()
+}
+
+#' @export
+names.ScannerBuilder <- function(x) names(x$schema)
diff --git a/src/arrow/r/R/dataset-write.R b/src/arrow/r/R/dataset-write.R
new file mode 100644
index 000000000..3a98357b0
--- /dev/null
+++ b/src/arrow/r/R/dataset-write.R
@@ -0,0 +1,144 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Write a dataset
+#'
+#' This function allows you to write a dataset. By writing to more efficient
+#' binary storage formats, and by specifying relevant partitioning, you can
+#' make reading and querying the dataset much faster.
+#'
+#' @param dataset [Dataset], [RecordBatch], [Table], `arrow_dplyr_query`, or
+#' `data.frame`. If an `arrow_dplyr_query`, the query will be evaluated and
+#' the result will be written. This means that you can `select()`, `filter()`, `mutate()`,
+#' etc. to transform the data before it is written if you need to.
+#' @param path string path, URI, or `SubTreeFileSystem` referencing a directory
+#' to write to (directory will be created if it does not exist)
+#' @param format a string identifier of the file format. Default is to use
+#' "parquet" (see [FileFormat])
+#' @param partitioning `Partitioning` or a character vector of columns to
+#' use as partition keys (to be written as path segments). Default is to
+#' use the current `group_by()` columns.
+#' @param basename_template string template for the names of files to be written.
+#' Must contain `"{i}"`, which will be replaced with an autoincremented
+#' integer to generate basenames of datafiles. For example, `"part-{i}.feather"`
+#' will yield `"part-0.feather", ...`.
+#' @param hive_style logical: write partition segments as Hive-style
+#' (`key1=value1/key2=value2/file.ext`) or as just bare values. Default is `TRUE`.
+#' @param existing_data_behavior The behavior to use when there is already data
+#' in the destination directory. Must be one of "overwrite", "error", or
+#' "delete_matching".
+#' - "overwrite" (the default): any new files created will overwrite
+#' existing files
+#' - "error": the operation will fail if the destination directory is not
+#' empty
+#' - "delete_matching": the writer will delete any existing partitions
+#' if data is going to be written to those partitions, and will leave alone
+#' partitions to which no data is written.
+#' @param ... additional format-specific arguments. For available Parquet
+#' options, see [write_parquet()]. The available Feather options are
+#' - `use_legacy_format` logical: write data formatted so that Arrow libraries
+#' versions 0.14 and lower can read it. Default is `FALSE`. You can also
+#' enable this by setting the environment variable `ARROW_PRE_0_15_IPC_FORMAT=1`.
+#' - `metadata_version`: A string like "V5" or the equivalent integer indicating
+#' the Arrow IPC MetadataVersion. Default (NULL) will use the latest version,
+#' unless the environment variable `ARROW_PRE_1_0_METADATA_VERSION=1`, in
+#' which case it will be V4.
+#' - `codec`: A [Codec] which will be used to compress body buffers of written
+#' files. Default (NULL) will not compress body buffers.
+#' - `null_fallback`: character to be used in place of missing values (`NA` or
+#' `NULL`) when using Hive-style partitioning. See [hive_partition()].
+#' @return The input `dataset`, invisibly
+#' @examplesIf arrow_with_dataset() & arrow_with_parquet() & requireNamespace("dplyr", quietly = TRUE)
+#' # You can write datasets partitioned by the values in a column (here: "cyl").
+#' # This creates a structure of the form cyl=X/part-Z.parquet.
+#' one_level_tree <- tempfile()
+#' write_dataset(mtcars, one_level_tree, partitioning = "cyl")
+#' list.files(one_level_tree, recursive = TRUE)
+#'
+#' # You can also partition by the values in multiple columns
+#' # (here: "cyl" and "gear").
+#' # This creates a structure of the form cyl=X/gear=Y/part-Z.parquet.
+#' two_levels_tree <- tempfile()
+#' write_dataset(mtcars, two_levels_tree, partitioning = c("cyl", "gear"))
+#' list.files(two_levels_tree, recursive = TRUE)
+#'
+#' # In the two previous examples we would have:
+#' # X = {4,6,8}, the number of cylinders.
+#' # Y = {3,4,5}, the number of forward gears.
+#' # Z = {0,1,2}, the number of saved parts, starting from 0.
+#'
+#' # You can obtain the same result as the previous examples using arrow with
+#' # a dplyr pipeline. This will be the same as two_levels_tree above, but the
+#' # output directory will be different.
+#' library(dplyr)
+#' two_levels_tree_2 <- tempfile()
+#' mtcars %>%
+#' group_by(cyl, gear) %>%
+#' write_dataset(two_levels_tree_2)
+#' list.files(two_levels_tree_2, recursive = TRUE)
+#'
+#' # And you can also turn off the Hive-style directory naming where the column
+#' # name is included with the values by using `hive_style = FALSE`.
+#'
+#' # Write a structure X/Y/part-Z.parquet.
+#' two_levels_tree_no_hive <- tempfile()
+#' mtcars %>%
+#' group_by(cyl, gear) %>%
+#' write_dataset(two_levels_tree_no_hive, hive_style = FALSE)
+#' list.files(two_levels_tree_no_hive, recursive = TRUE)
+#' @export
+write_dataset <- function(dataset,
+ path,
+ format = c("parquet", "feather", "arrow", "ipc", "csv"),
+ partitioning = dplyr::group_vars(dataset),
+ basename_template = paste0("part-{i}.", as.character(format)),
+ hive_style = TRUE,
+ existing_data_behavior = c("overwrite", "error", "delete_matching"),
+ ...) {
+ format <- match.arg(format)
+ if (inherits(dataset, "arrow_dplyr_query")) {
+ # partitioning vars need to be in the `select` schema
+ dataset <- ensure_group_vars(dataset)
+ } else if (inherits(dataset, "grouped_df")) {
+ force(partitioning)
+ # Drop the grouping metadata before writing; we've already consumed it
+ # to construct `partitioning` and don't want it in the metadata$r
+ dataset <- dplyr::ungroup(dataset)
+ }
+
+ scanner <- Scanner$create(dataset, use_async = TRUE)
+ if (!inherits(partitioning, "Partitioning")) {
+ partition_schema <- scanner$schema[partitioning]
+ if (isTRUE(hive_style)) {
+ partitioning <- HivePartitioning$create(partition_schema, null_fallback = list(...)$null_fallback)
+ } else {
+ partitioning <- DirectoryPartitioning$create(partition_schema)
+ }
+ }
+
+ path_and_fs <- get_path_and_filesystem(path)
+ options <- FileWriteOptions$create(format, table = scanner, ...)
+
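+ # The C++ writer takes existing_data_behavior as a 0-based enum value in the
+ # order below (delete_matching = 0, overwrite = 1, error = 2), hence the
+ # match() - 1L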
+ existing_data_behavior_opts <- c("delete_matching", "overwrite", "error")
+ existing_data_behavior <- match(match.arg(existing_data_behavior), existing_data_behavior_opts) - 1L
+
+ dataset___Dataset__Write(
+ options, path_and_fs$fs, path_and_fs$path,
+ partitioning, basename_template, scanner,
+ existing_data_behavior
+ )
+}
diff --git a/src/arrow/r/R/dataset.R b/src/arrow/r/R/dataset.R
new file mode 100644
index 000000000..7207a5543
--- /dev/null
+++ b/src/arrow/r/R/dataset.R
@@ -0,0 +1,367 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Open a multi-file dataset
+#'
+#' Arrow Datasets allow you to query against data that has been split across
+#' multiple files. This sharding of data may indicate partitioning, which
+#' can accelerate queries that only touch some partitions (files). Call
+#' `open_dataset()` to point to a directory of data files and return a
+#' `Dataset`, then use `dplyr` methods to query it.
+#'
+#' @param sources One of:
+#' * a string path or URI to a directory containing data files
+#' * a string path or URI to a single file
+#' * a character vector of paths or URIs to individual data files
+#' * a list of `Dataset` objects as created by this function
+#' * a list of `DatasetFactory` objects as created by [dataset_factory()].
+#'
+#' When `sources` is a vector of file URIs, they must all use the same protocol
+#' and point to files located in the same file system and having the same
+#' format.
+#' @param schema [Schema] for the `Dataset`. If `NULL` (the default), the schema
+#' will be inferred from the data sources.
+#' @param partitioning When `sources` is a directory path/URI, one of:
+#' * a `Schema`, in which case the file paths relative to `sources` will be
+#' parsed, and path segments will be matched with the schema fields. For
+#' example, `schema(year = int16(), month = int8())` would create partitions
+#' for file paths like `"2019/01/file.parquet"`, `"2019/02/file.parquet"`,
+#' etc.
+#' * a character vector that defines the field names corresponding to those
+#' path segments (that is, you're providing the names that would correspond
+#' to a `Schema` but the types will be autodetected)
+#' * a `HivePartitioning` or `HivePartitioningFactory`, as returned
+#' by [hive_partition()] which parses explicit or autodetected fields from
+#' Hive-style path segments
+#' * `NULL` for no partitioning
+#'
+#' The default is to autodetect Hive-style partitions. When `sources` is not a
+#' directory path/URI, `partitioning` is ignored.
+#' @param unify_schemas logical: should all data fragments (files, `Dataset`s)
+#' be scanned in order to create a unified schema from them? If `FALSE`, only
+#' the first fragment will be inspected for its schema. Use this fast path
+#' when you know and trust that all fragments have an identical schema.
+#' The default is `FALSE` when creating a dataset from a directory path/URI or
+#' vector of file paths/URIs (because there may be many files and scanning may
+#' be slow) but `TRUE` when `sources` is a list of `Dataset`s (because there
+#' should be few `Dataset`s in the list and their `Schema`s are already in
+#' memory).
+#' @param format A [FileFormat] object, or a string identifier of the format of
+#' the files in `sources`. This argument is ignored when `sources` is a list of `Dataset` objects.
+#' Currently supported values:
+#' * "parquet"
+#' * "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
+#' only version 2 files are supported
+#' * "csv"/"text", aliases for the same thing (because comma is the default
+#' delimiter for text files
+#' * "tsv", equivalent to passing `format = "text", delimiter = "\t"`
+#'
+#' Default is "parquet", unless a `delimiter` is also specified, in which case
+#' it is assumed to be "text".
+#' @param ... additional arguments passed to `dataset_factory()` when `sources`
+#' is a directory path/URI or vector of file paths/URIs, otherwise ignored.
+#' These may include `format` to indicate the file format, or other
+#' format-specific options.
+#' @return A [Dataset] R6 object. Use `dplyr` methods on it to query the data,
+#' or call [`$NewScan()`][Scanner] to construct a query directly.
+#' @export
+#' @seealso `vignette("dataset", package = "arrow")`
+#' @include arrow-package.R
+#' @examplesIf arrow_with_dataset() & arrow_with_parquet()
+#' # Set up directory for examples
+#' tf <- tempfile()
+#' dir.create(tf)
+#' on.exit(unlink(tf))
+#'
+#' data <- dplyr::group_by(mtcars, cyl)
+#' write_dataset(data, tf)
+#'
+#' # You can specify a directory containing the files for your dataset and
+#' # open_dataset will scan all files in your directory.
+#' open_dataset(tf)
+#'
+#' # You can also supply a vector of paths
+#' open_dataset(c(file.path(tf, "cyl=4/part-0.parquet"), file.path(tf, "cyl=8/part-0.parquet")))
+#'
+#' ## You must specify the file format if using a format other than parquet.
+#' tf2 <- tempfile()
+#' dir.create(tf2)
+#' on.exit(unlink(tf2))
+#' write_dataset(data, tf2, format = "ipc")
+#' # This line will result in errors when you try to work with the data
+#' \dontrun{
+#' open_dataset(tf2)
+#' }
+#' # This line will work
+#' open_dataset(tf2, format = "ipc")
+#'
+#' ## You can specify file partitioning to include it as a field in your dataset
+#' # Create a temporary directory and write example dataset
+#' tf3 <- tempfile()
+#' dir.create(tf3)
+#' on.exit(unlink(tf3))
+#' write_dataset(airquality, tf3, partitioning = c("Month", "Day"), hive_style = FALSE)
+#'
+#' # View files - you can see that the partitioning means files have been written
+#' # to folders based on Month/Day values
+#' tf3_files <- list.files(tf3, recursive = TRUE)
+#'
+#' # With no partitioning specified, dataset contains all files but doesn't include
+#' # directory names as field names
+#' open_dataset(tf3)
+#'
+#' # Now that partitioning has been specified, your dataset contains columns for Month and Day
+#' open_dataset(tf3, partitioning = c("Month", "Day"))
+#'
+#' # If you want to specify the data types for your fields, you can pass in a Schema
+#' open_dataset(tf3, partitioning = schema(Month = int8(), Day = int8()))
+open_dataset <- function(sources,
+ schema = NULL,
+ partitioning = hive_partition(),
+ unify_schemas = NULL,
+ format = c("parquet", "arrow", "ipc", "feather", "csv", "tsv", "text"),
+ ...) {
+ if (!arrow_with_dataset()) {
+ stop("This build of the arrow package does not support Datasets", call. = FALSE)
+ }
+ if (is_list_of(sources, "Dataset")) {
+ if (is.null(schema)) {
+ if (is.null(unify_schemas) || isTRUE(unify_schemas)) {
+ # Default is to unify schemas here
+ schema <- unify_schemas(schemas = map(sources, ~ .$schema))
+ } else {
+ # Take the first one.
+ schema <- sources[[1]]$schema
+ }
+ }
+ # Enforce that all datasets have the same schema
+ assert_is(schema, "Schema")
+ sources <- lapply(sources, function(x) {
+ x$schema <- schema
+ x
+ })
+ return(dataset___UnionDataset__create(sources, schema))
+ }
+
+ factory <- DatasetFactory$create(sources, partitioning = partitioning, format = format, schema = schema, ...)
+ tryCatch(
+ # Default is _not_ to inspect/unify schemas
+ factory$Finish(schema, isTRUE(unify_schemas)),
+ error = function(e) {
+ handle_parquet_io_error(e, format)
+ }
+ )
+}
+
+#' Multi-file datasets
+#'
+#' @description
+#' Arrow Datasets allow you to query against data that has been split across
+#' multiple files. This sharding of data may indicate partitioning, which
+#' can accelerate queries that only touch some partitions (files).
+#'
+#' A `Dataset` contains one or more `Fragments`, such as files, of potentially
+#' differing type and partitioning.
+#'
+#' For `Dataset$create()`, see [open_dataset()], which is an alias for it.
+#'
+#' `DatasetFactory` is used to provide finer control over the creation of `Dataset`s.
+#'
+#' @section Factory:
+#' `DatasetFactory` is used to create a `Dataset`, inspect the [Schema] of the
+#' fragments contained in it, and declare a partitioning.
+#' `FileSystemDatasetFactory` is a subclass of `DatasetFactory` for
+#' discovering files in the local file system, the only currently supported
+#' file system.
+#'
+#' For the `DatasetFactory$create()` factory method, see [dataset_factory()], an
+#' alias for it. A `DatasetFactory` has:
+#'
+#' - `$Inspect(unify_schemas)`: If `unify_schemas` is `TRUE`, all fragments
+#' will be scanned and a unified [Schema] will be created from them; if `FALSE`
+#' (default), only the first fragment will be inspected for its schema. Use this
+#' fast path when you know and trust that all fragments have an identical schema.
+#' - `$Finish(schema, unify_schemas)`: Returns a `Dataset`. If `schema` is provided,
+#' it will be used for the `Dataset`; if omitted, a `Schema` will be created from
+#' inspecting the fragments (files) in the dataset, following `unify_schemas`
+#' as described above.
+#'
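+#' A minimal sketch of this flow (here `path` is a hypothetical directory of
+#' Parquet files):
+#'
+#' ```
+#' factory <- dataset_factory(path, format = "parquet")
+#' unified_schema <- factory$Inspect(unify_schemas = TRUE)
+#' ds <- factory$Finish(unified_schema)
+#' ```
+#'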
+#' `FileSystemDatasetFactory$create()` is a lower-level factory method and
+#' takes the following arguments:
+#' * `filesystem`: A [FileSystem]
+#' * `selector`: Either a [FileSelector] or `NULL`
+#' * `paths`: Either a character vector of file paths or `NULL`
+#' * `format`: A [FileFormat]
+#' * `partitioning`: Either `Partitioning`, `PartitioningFactory`, or `NULL`
+#' @section Methods:
+#'
+#' A `Dataset` has the following methods:
+#' - `$NewScan()`: Returns a [ScannerBuilder] for building a query
+#' - `$schema`: Active binding that returns the [Schema] of the Dataset; you
+#' may also replace the dataset's schema by using `ds$schema <- new_schema`.
+#' This method currently supports only adding, removing, or reordering
+#' fields in the schema: you cannot alter or cast the field types.
+#'
+#' `FileSystemDataset` has the following methods:
+#' - `$files`: Active binding, returns the files of the `FileSystemDataset`
+#' - `$format`: Active binding, returns the [FileFormat] of the `FileSystemDataset`
+#'
+#' `UnionDataset` has the following methods:
+#' - `$children`: Active binding, returns all child `Dataset`s.
+#'
+#' @export
+#' @seealso [open_dataset()] for a simple interface to creating a `Dataset`
+Dataset <- R6Class("Dataset",
+ inherit = ArrowObject,
+ public = list(
+ # @description
+ # Start a new scan of the data
+ # @return A [ScannerBuilder]
+ NewScan = function() dataset___Dataset__NewScan(self),
+ ToString = function() self$schema$ToString()
+ ),
+ active = list(
+ schema = function(schema) {
+ if (missing(schema)) {
+ dataset___Dataset__schema(self)
+ } else {
+ assert_is(schema, "Schema")
+ invisible(dataset___Dataset__ReplaceSchema(self, schema))
+ }
+ },
+ metadata = function() self$schema$metadata,
+ num_rows = function() self$NewScan()$Finish()$CountRows(),
+ num_cols = function() length(self$schema),
+ # @description
+ # Return the Dataset's type.
+ type = function() dataset___Dataset__type_name(self)
+ )
+)
+Dataset$create <- open_dataset
+
+#' @name FileSystemDataset
+#' @rdname Dataset
+#' @export
+FileSystemDataset <- R6Class("FileSystemDataset",
+ inherit = Dataset,
+ public = list(
+ .class_title = function() {
+ nfiles <- length(self$files)
+ file_type <- self$format$type
+ pretty_file_type <- list(
+ parquet = "Parquet",
+ ipc = "Feather"
+ )[[file_type]]
+
+ paste(
+ class(self)[[1]],
+ "with",
+ nfiles,
+ pretty_file_type %||% file_type,
+ ifelse(nfiles == 1, "file", "files")
+ )
+ }
+ ),
+ active = list(
+ # @description
+ # Return the files contained in this `FileSystemDataset`
+ files = function() dataset___FileSystemDataset__files(self),
+ # @description
+ # Return the format of files in this `Dataset`
+ format = function() {
+ dataset___FileSystemDataset__format(self)
+ },
+ # @description
+ # Return the filesystem of files in this `Dataset`
+ filesystem = function() {
+ dataset___FileSystemDataset__filesystem(self)
+ }
+ )
+)
+
+#' @name UnionDataset
+#' @rdname Dataset
+#' @export
+UnionDataset <- R6Class("UnionDataset",
+ inherit = Dataset,
+ active = list(
+ # @description
+ # Return the UnionDataset's child `Dataset`s
+ children = function() {
+ dataset___UnionDataset__children(self)
+ }
+ )
+)
+
+#' @name InMemoryDataset
+#' @rdname Dataset
+#' @export
+InMemoryDataset <- R6Class("InMemoryDataset", inherit = Dataset)
+InMemoryDataset$create <- function(x) {
+ if (!arrow_with_dataset()) {
+ stop("This build of the arrow package does not support Datasets", call. = FALSE)
+ }
+ if (!inherits(x, "Table")) {
+ x <- Table$create(x)
+ }
+ dataset___InMemoryDataset__create(x)
+}
+
+
+#' @export
+names.Dataset <- function(x) names(x$schema)
+
+#' @export
+dim.Dataset <- function(x) c(x$num_rows, x$num_cols)
+
+#' @export
+c.Dataset <- function(...) Dataset$create(list(...))
+
+#' @export
+head.Dataset <- function(x, n = 6L, ...) {
+ head(Scanner$create(x), n)
+}
+
+#' @export
+tail.Dataset <- function(x, n = 6L, ...) {
+ tail(Scanner$create(x), n)
+}
+
+#' @export
+`[.Dataset` <- function(x, i, j, ..., drop = FALSE) {
+ if (nargs() == 2L) {
+ # List-like column extraction (x[i])
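+ # e.g. ds["col"] is shorthand for ds[, "col"]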
+ return(x[, i])
+ }
+ if (!missing(j)) {
+ x <- select.Dataset(x, all_of(j))
+ }
+
+ if (!missing(i)) {
+ x <- take_dataset_rows(x, i)
+ }
+ x
+}
+
+take_dataset_rows <- function(x, i) {
+ if (!is.numeric(i) || any(i < 0)) {
+ stop("Only slicing with positive indices is supported", call. = FALSE)
+ }
+ scanner <- Scanner$create(x)
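+ # Arrow's TakeRows is 0-based, so shift R's 1-based indices down by one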
+ i <- Array$create(i - 1)
+ dataset___Scanner__TakeRows(scanner, i)
+}
diff --git a/src/arrow/r/R/deprecated.R b/src/arrow/r/R/deprecated.R
new file mode 100644
index 000000000..e8848c4aa
--- /dev/null
+++ b/src/arrow/r/R/deprecated.R
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @rdname read_ipc_stream
+#' @export
+read_arrow <- function(file, ...) {
+ .Deprecated(msg = "Use 'read_ipc_stream' or 'read_feather' instead.")
+ if (inherits(file, "raw")) {
+ read_ipc_stream(file, ...)
+ } else {
+ read_feather(file, ...)
+ }
+}
+
+#' @rdname write_ipc_stream
+#' @export
+write_arrow <- function(x, sink, ...) {
+ .Deprecated(msg = "Use 'write_ipc_stream' or 'write_feather' instead.")
+ if (inherits(sink, "raw")) {
+ # HACK for sparklyr
+ # Note that this returns a new R raw vector, not the one passed as `sink`
+ write_to_raw(x)
+ } else {
+ write_feather(x, sink, ...)
+ }
+}
diff --git a/src/arrow/r/R/dictionary.R b/src/arrow/r/R/dictionary.R
new file mode 100644
index 000000000..b701768d6
--- /dev/null
+++ b/src/arrow/r/R/dictionary.R
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include type.R
+
+#' @title class DictionaryType
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname DictionaryType
+#' @name DictionaryType
+DictionaryType <- R6Class("DictionaryType",
+ inherit = FixedWidthType,
+ public = list(
+ ToString = function() {
+ prettier_dictionary_type(DataType__ToString(self))
+ }
+ ),
+ active = list(
+ index_type = function() DictionaryType__index_type(self),
+ value_type = function() DictionaryType__value_type(self),
+ name = function() DictionaryType__name(self),
+ ordered = function() DictionaryType__ordered(self)
+ )
+)
+DictionaryType$create <- function(index_type = int32(),
+ value_type = utf8(),
+ ordered = FALSE) {
+ assert_is(index_type, "DataType")
+ assert_is(value_type, "DataType")
+ DictionaryType__initialize(index_type, value_type, ordered)
+}
+
+#' Create a dictionary type
+#'
+#' @param index_type A DataType for the indices (default [int32()])
+#' @param value_type A DataType for the values (default [utf8()])
+#' @param ordered Is this an ordered dictionary (default `FALSE`)?
+#'
+#' @return A [DictionaryType]
+#' @seealso [Other Arrow data types][data-type]
+#' @export
+dictionary <- DictionaryType$create
+
+prettier_dictionary_type <- function(x) {
+ # Prettier format the "ordered" attribute
+ x <- sub(", ordered=0", "", x)
+ sub("ordered=1", "ordered", x)
+}
diff --git a/src/arrow/r/R/dplyr-arrange.R b/src/arrow/r/R/dplyr-arrange.R
new file mode 100644
index 000000000..4c8c687a3
--- /dev/null
+++ b/src/arrow/r/R/dplyr-arrange.R
@@ -0,0 +1,98 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# The following S3 methods are registered on load if dplyr is present
+
+arrange.arrow_dplyr_query <- function(.data, ..., .by_group = FALSE) {
+ call <- match.call()
+ exprs <- quos(...)
+ if (.by_group) {
+ # when the data is grouped and .by_group is TRUE, order the result by
+ # the grouping columns first
+ exprs <- c(quos(!!!dplyr::groups(.data)), exprs)
+ }
+ if (length(exprs) == 0) {
+ # Nothing to do
+ return(.data)
+ }
+ .data <- as_adq(.data)
+ # find and remove any dplyr::desc() and tidy-eval
+ # the arrange expressions inside an Arrow data_mask
+ sorts <- vector("list", length(exprs))
+ descs <- logical(0)
+ mask <- arrow_mask(.data)
+ for (i in seq_along(exprs)) {
+ x <- find_and_remove_desc(exprs[[i]])
+ exprs[[i]] <- x[["quos"]]
+ sorts[[i]] <- arrow_eval(exprs[[i]], mask)
+ names(sorts)[i] <- format_expr(exprs[[i]])
+ if (inherits(sorts[[i]], "try-error")) {
+ msg <- paste("Expression", names(sorts)[i], "not supported in Arrow")
+ return(abandon_ship(call, .data, msg))
+ }
+ descs[i] <- x[["desc"]]
+ }
+ .data$arrange_vars <- c(sorts, .data$arrange_vars)
+ .data$arrange_desc <- c(descs, .data$arrange_desc)
+ .data
+}
+arrange.Dataset <- arrange.ArrowTabular <- arrange.arrow_dplyr_query
+
+# Helper to handle desc() in arrange()
+# * Takes a quosure as input
+# * Returns a list with two elements:
+# 1. The quosure with any wrapping parentheses and desc() removed
+# 2. A logical value indicating whether desc() was found
+# * Performs some other validation
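+# e.g. find_and_remove_desc(quo(desc(x))) returns list(quos = quo(x), desc = TRUE)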
+find_and_remove_desc <- function(quosure) {
+ expr <- quo_get_expr(quosure)
+ descending <- FALSE
+ if (length(all.vars(expr)) < 1L) {
+ stop(
+ "Expression in arrange() does not contain any field names: ",
+ deparse(expr),
+ call. = FALSE
+ )
+ }
+ # Use a while loop to remove any number of nested pairs of enclosing
+ # parentheses and any number of nested desc() calls. In the case of multiple
+ # nested desc() calls, each one toggles the sort order.
+ while (identical(typeof(expr), "language") && is.call(expr)) {
+ if (identical(expr[[1]], quote(`(`))) {
+ # remove enclosing parentheses
+ expr <- expr[[2]]
+ } else if (identical(expr[[1]], quote(desc))) {
+ # ensure desc() has only one argument (when an R expression is a function
+ # call, length == 2 means it has exactly one argument)
+ if (length(expr) > 2) {
+ stop("desc() expects only one argument", call. = FALSE)
+ }
+ # remove desc() and toggle descending
+ expr <- expr[[2]]
+ descending <- !descending
+ } else {
+ break
+ }
+ }
+ return(
+ list(
+ quos = quo_set_expr(quosure, expr),
+ desc = descending
+ )
+ )
+}
diff --git a/src/arrow/r/R/dplyr-collect.R b/src/arrow/r/R/dplyr-collect.R
new file mode 100644
index 000000000..13e68f3f4
--- /dev/null
+++ b/src/arrow/r/R/dplyr-collect.R
@@ -0,0 +1,121 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# The following S3 methods are registered on load if dplyr is present
+
+collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) {
+ # head and tail are not ExecNodes; at best we can handle them via a sink
+ # node, so if there are any steps done after head/tail, we need to evaluate
+ # the query up to that point and then run a new query for the rest
+ if (is_collapsed(x) && has_head_tail(x$.data)) {
+ x$.data <- as_adq(dplyr::compute(x$.data))$.data
+ }
+
+ # See query-engine.R for ExecPlan/Nodes
+ tab <- do_exec_plan(x)
+ if (as_data_frame) {
+ df <- as.data.frame(tab)
+ tab$invalidate()
+ restore_dplyr_features(df, x)
+ } else {
+ restore_dplyr_features(tab, x)
+ }
+}
+collect.ArrowTabular <- function(x, as_data_frame = TRUE, ...) {
+ if (as_data_frame) {
+ as.data.frame(x, ...)
+ } else {
+ x
+ }
+}
+collect.Dataset <- function(x, ...) dplyr::collect(as_adq(x), ...)
+
+compute.arrow_dplyr_query <- function(x, ...) dplyr::collect(x, as_data_frame = FALSE)
+compute.ArrowTabular <- function(x, ...) x
+compute.Dataset <- compute.arrow_dplyr_query
+
+pull.arrow_dplyr_query <- function(.data, var = -1) {
+ .data <- as_adq(.data)
+ var <- vars_pull(names(.data), !!enquo(var))
+ .data$selected_columns <- set_names(.data$selected_columns[var], var)
+ dplyr::collect(.data)[[1]]
+}
+pull.Dataset <- pull.ArrowTabular <- pull.arrow_dplyr_query
+
+restore_dplyr_features <- function(df, query) {
+ # An arrow_dplyr_query holds some attributes that Arrow doesn't know about
+ # After calling collect(), make sure these features are carried over
+
+ if (length(query$group_by_vars) > 0) {
+ # Preserve groupings, if present
+ if (is.data.frame(df)) {
+ df <- dplyr::grouped_df(
+ df,
+ dplyr::group_vars(query),
+ drop = dplyr::group_by_drop_default(query)
+ )
+ } else {
+ # This is a Table, via compute() or collect(as_data_frame = FALSE)
+ df <- as_adq(df)
+ df$group_by_vars <- query$group_by_vars
+ df$drop_empty_groups <- query$drop_empty_groups
+ }
+ }
+ df
+}
+
+collapse.arrow_dplyr_query <- function(x, ...) {
+ # Figure out what schema will result from the query
+ x$schema <- implicit_schema(x)
+ # Nest inside a new arrow_dplyr_query (and keep groups)
+ restore_dplyr_features(arrow_dplyr_query(x), x)
+}
+collapse.Dataset <- collapse.ArrowTabular <- function(x, ...) {
+ arrow_dplyr_query(x)
+}
+
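+# Compute the Schema that a query would yield once evaluated, without actually
+# executing it (see collapse.arrow_dplyr_query() above for one caller)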
+implicit_schema <- function(.data) {
+ .data <- ensure_group_vars(.data)
+ old_schm <- .data$.data$schema
+
+ if (is.null(.data$aggregations)) {
+ new_fields <- map(.data$selected_columns, ~ .$type(old_schm))
+ if (!is.null(.data$join) && !(.data$join$type %in% JoinType[1:4])) {
+ # Add cols from right side, except for semi/anti joins
+ right_cols <- .data$join$right_data$selected_columns
+ new_fields <- c(new_fields, map(
+ right_cols[setdiff(names(right_cols), .data$join$by)],
+ ~ .$type(.data$join$right_data$.data$schema)
+ ))
+ }
+ } else {
+ new_fields <- map(summarize_projection(.data), ~ .$type(old_schm))
+ # * Put group_by_vars first (this can't be done by summarize,
+ # they have to be last per the aggregate node signature,
+ # and they get projected to this order after aggregation)
+ # * Infer the output types from the aggregations
+ group_fields <- new_fields[.data$group_by_vars]
+ hash <- length(.data$group_by_vars) > 0
+ agg_fields <- imap(
+ new_fields[setdiff(names(new_fields), .data$group_by_vars)],
+ ~ output_type(.data$aggregations[[.y]][["fun"]], .x, hash)
+ )
+ new_fields <- c(group_fields, agg_fields)
+ }
+ schema(!!!new_fields)
+}
diff --git a/src/arrow/r/R/dplyr-count.R b/src/arrow/r/R/dplyr-count.R
new file mode 100644
index 000000000..c567c285f
--- /dev/null
+++ b/src/arrow/r/R/dplyr-count.R
@@ -0,0 +1,60 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# The following S3 methods are registered on load if dplyr is present
+
+count.arrow_dplyr_query <- function(x, ..., wt = NULL, sort = FALSE, name = NULL) {
+ if (!missing(...)) {
+ out <- dplyr::group_by(x, ..., .add = TRUE)
+ } else {
+ out <- x
+ }
+ out <- dplyr::tally(out, wt = {{ wt }}, sort = sort, name = name)
+
+ # Restore original group vars
+ gv <- dplyr::group_vars(x)
+ if (length(gv)) {
+ out$group_by_vars <- gv
+ }
+
+ out
+}
+
+count.Dataset <- count.ArrowTabular <- count.arrow_dplyr_query
+
+#' @importFrom rlang sym :=
+tally.arrow_dplyr_query <- function(x, wt = NULL, sort = FALSE, name = NULL) {
+ check_name <- utils::getFromNamespace("check_name", "dplyr")
+ name <- check_name(name, dplyr::group_vars(x))
+
+ if (quo_is_null(enquo(wt))) {
+ out <- dplyr::summarize(x, !!name := n())
+ } else {
+ out <- dplyr::summarize(x, !!name := sum({{ wt }}, na.rm = TRUE))
+ }
+
+ if (sort) {
+ dplyr::arrange(out, desc(!!sym(name)))
+ } else {
+ out
+ }
+}
+
+tally.Dataset <- tally.ArrowTabular <- tally.arrow_dplyr_query
+
+# we don't want to depend on dplyr, but we reference these above
+utils::globalVariables(c("n", "desc"))
diff --git a/src/arrow/r/R/dplyr-distinct.R b/src/arrow/r/R/dplyr-distinct.R
new file mode 100644
index 000000000..5dfcb641f
--- /dev/null
+++ b/src/arrow/r/R/dplyr-distinct.R
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# The following S3 methods are registered on load if dplyr is present
+
+distinct.arrow_dplyr_query <- function(.data, ..., .keep_all = FALSE) {
+ if (.keep_all == TRUE) {
+ # After ARROW-13993 is merged, we can implement this (ARROW-14045)
+ arrow_not_supported("`distinct()` with `.keep_all = TRUE`")
+ }
+
+ original_gv <- dplyr::group_vars(.data)
+ if (length(quos(...))) {
+ # group_by() calls mutate() if there are any expressions in ...
+ .data <- dplyr::group_by(.data, ..., .add = TRUE)
+ # `data %>% group_by() %>% summarise()` returns cols in order supplied
+ # but distinct() returns cols in dataset order, so sort group vars
+ .data$group_by_vars <- names(.data)[names(.data) %in% .data$group_by_vars]
+ } else {
+ # distinct() with no vars specified means distinct across all cols
+ .data <- dplyr::group_by(.data, !!!syms(names(.data)))
+ }
+
+ out <- dplyr::summarize(.data, .groups = "drop")
+ # distinct() doesn't modify group by vars, so restore the original ones
+ if (length(original_gv)) {
+ out$group_by_vars <- original_gv
+ }
+ out
+}
+
+distinct.Dataset <- distinct.ArrowTabular <- distinct.arrow_dplyr_query
diff --git a/src/arrow/r/R/dplyr-eval.R b/src/arrow/r/R/dplyr-eval.R
new file mode 100644
index 000000000..9d944ab80
--- /dev/null
+++ b/src/arrow/r/R/dplyr-eval.R
@@ -0,0 +1,123 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+arrow_eval <- function(expr, mask) {
+ # filter(), mutate(), etc. work by evaluating the quoted `exprs` to generate Expressions
+ # with references to Arrays (if .data is Table/RecordBatch) or Fields (if
+ # .data is a Dataset).
+
+ # This yields an Expression as long as the `exprs` are implemented in Arrow.
+ # Otherwise, it returns a try-error
+ tryCatch(eval_tidy(expr, mask), error = function(e) {
+ # Look for the cases where bad input was given, i.e. this would fail
+ # in regular dplyr anyway, and let those raise those as errors;
+ # else, for things not supported by Arrow return a "try-error",
+ # which we'll handle differently
+ msg <- conditionMessage(e)
+ if (getOption("arrow.debug", FALSE)) print(msg)
+ patterns <- .cache$i18ized_error_pattern
+ if (is.null(patterns)) {
+ patterns <- i18ize_error_messages()
+ # Memoize it
+ .cache$i18ized_error_pattern <- patterns
+ }
+ if (grepl(patterns, msg)) {
+ stop(e)
+ }
+
+ out <- structure(msg, class = "try-error", condition = e)
+ if (grepl("not supported.*Arrow", msg) || getOption("arrow.debug", FALSE)) {
+ # One of ours. Mark it so that consumers can handle it differently
+ class(out) <- c("arrow-try-error", class(out))
+ }
+ invisible(out)
+ })
+}
+
+handle_arrow_not_supported <- function(err, lab) {
+ # Look for informative message from the Arrow function version (see above)
+ if (inherits(err, "arrow-try-error")) {
+ # Include it if found
+ paste0("In ", lab, ", ", as.character(err))
+ } else {
+ # Otherwise be opaque (the original error is probably not useful)
+ paste("Expression", lab, "not supported in Arrow")
+ }
+}
+
+i18ize_error_messages <- function() {
+ # Figure out what the error messages will be with this LANGUAGE
+ # so that we can look for them
+ out <- list(
+ obj = tryCatch(eval(parse(text = "X_____X")), error = function(e) conditionMessage(e)),
+ fun = tryCatch(eval(parse(text = "X_____X()")), error = function(e) conditionMessage(e))
+ )
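+  # Collapse into a single regex alternation; in an English locale this looks
+  # something like: "object '.*' not found|could not find function \".*\""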
+ paste(map(out, ~ sub("X_____X", ".*", .)), collapse = "|")
+}
+
+# Helper to raise a common error
+arrow_not_supported <- function(msg) {
+ # TODO: raise a classed error?
+ stop(paste(msg, "not supported by Arrow"), call. = FALSE)
+}
+
+# Create a data mask for evaluating a dplyr expression
+arrow_mask <- function(.data, aggregation = FALSE) {
+ f_env <- new_environment(.cache$functions)
+
+ # Add functions that need to error hard and clear.
+ # Some R functions will still try to evaluate on an Expression
+ # and return NA with a warning
+ fail <- function(...) stop("Not implemented")
+ for (f in c("mean", "sd")) {
+ f_env[[f]] <- fail
+ }
+
+ if (aggregation) {
+ # This should probably be done with an environment inside an environment
+ # but a first attempt at that had scoping problems (ARROW-13499)
+ for (f in names(agg_funcs)) {
+ f_env[[f]] <- agg_funcs[[f]]
+ }
+ }
+
+ # Assign the schema to the expressions
+ map(.data$selected_columns, ~ (.$schema <- .data$.data$schema))
+
+ # Add the column references and make the mask
+ out <- new_data_mask(
+ new_environment(.data$selected_columns, parent = f_env),
+ f_env
+ )
+ # Then insert the data pronoun
+ # TODO: figure out what rlang::as_data_pronoun does/why we should use it
+ # (because if we do we get `Error: Can't modify the data pronoun` in mutate())
+ out$.data <- .data$selected_columns
+ out
+}
+
+format_expr <- function(x) {
+ if (is_quosure(x)) {
+ x <- quo_get_expr(x)
+ }
+ out <- deparse(x)
+ if (length(out) > 1) {
+ # Add ellipses because we are going to truncate
+ out[1] <- paste0(out[1], "...")
+ }
+ head(out, 1)
+}
diff --git a/src/arrow/r/R/dplyr-filter.R b/src/arrow/r/R/dplyr-filter.R
new file mode 100644
index 000000000..3c8c08ea5
--- /dev/null
+++ b/src/arrow/r/R/dplyr-filter.R
@@ -0,0 +1,91 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# The following S3 methods are registered on load if dplyr is present
+
+filter.arrow_dplyr_query <- function(.data, ..., .preserve = FALSE) {
+ # TODO something with the .preserve argument
+ filts <- quos(...)
+ if (length(filts) == 0) {
+ # Nothing to do
+ return(.data)
+ }
+
+ .data <- as_adq(.data)
+ # tidy-eval the filter expressions inside an Arrow data_mask
+ filters <- lapply(filts, arrow_eval, arrow_mask(.data))
+ bad_filters <- map_lgl(filters, ~ inherits(., "try-error"))
+ if (any(bad_filters)) {
+ # This is similar to abandon_ship() except that the filter eval is
+ # vectorized, and we apply filters that _did_ work before abandoning ship
+ # with the rest
+ expr_labs <- map_chr(filts[bad_filters], format_expr)
+ if (query_on_dataset(.data)) {
+ # Abort. We don't want to auto-collect if this is a Dataset because that
+ # could blow up, too big.
+ stop(
+ "Filter expression not supported for Arrow Datasets: ",
+ oxford_paste(expr_labs, quote = FALSE),
+ "\nCall collect() first to pull data into R.",
+ call. = FALSE
+ )
+ } else {
+ arrow_errors <- map2_chr(
+ filters[bad_filters], expr_labs,
+ handle_arrow_not_supported
+ )
+ if (length(arrow_errors) == 1) {
+ msg <- paste0(arrow_errors, "; ")
+ } else {
+ msg <- paste0("* ", arrow_errors, "\n", collapse = "")
+ }
+ warning(
+ msg, "pulling data into R",
+ immediate. = TRUE,
+ call. = FALSE
+ )
+ # Set any valid filters first, then collect and then apply the invalid ones in R
+ .data <- set_filters(.data, filters[!bad_filters])
+ return(dplyr::filter(dplyr::collect(.data), !!!filts[bad_filters]))
+ }
+ }
+
+ set_filters(.data, filters)
+}
+filter.Dataset <- filter.ArrowTabular <- filter.arrow_dplyr_query
+
+set_filters <- function(.data, expressions) {
+ if (length(expressions)) {
+ if (is_list_of(expressions, "Expression")) {
+ # expressions is a list of Expressions. AND them together and set them on .data
+ new_filter <- Reduce("&", expressions)
+ } else if (inherits(expressions, "Expression")) {
+ new_filter <- expressions
+ } else {
+ stop("filter expressions must be either an expression or a list of expressions", call. = FALSE)
+ }
+
+ if (isTRUE(.data$filtered_rows)) {
+ # TRUE is default (i.e. no filter yet), so we don't need to & with it
+ .data$filtered_rows <- new_filter
+ } else {
+ .data$filtered_rows <- .data$filtered_rows & new_filter
+ }
+ }
+ .data
+}
diff --git a/src/arrow/r/R/dplyr-functions.R b/src/arrow/r/R/dplyr-functions.R
new file mode 100644
index 000000000..717cdae96
--- /dev/null
+++ b/src/arrow/r/R/dplyr-functions.R
@@ -0,0 +1,1087 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+#' @include expression.R
+NULL
+
+# This environment is an internal cache for things including data mask functions
+# We'll populate it at package load time.
+.cache <- NULL
+init_env <- function() {
+ .cache <<- new.env(hash = TRUE)
+}
+init_env()
+
+# nse_funcs is a list of functions that operate on (and return) Expressions
+# These will be the basis for a data_mask inside dplyr methods
+# and will be added to .cache at package load time
+
+# Start with mappings from R function name spellings
+nse_funcs <- lapply(set_names(names(.array_function_map)), function(operator) {
+ force(operator)
+ function(...) build_expr(operator, ...)
+})
+
+# Now add functions to that list where the mapping from R to Arrow isn't 1:1
+# Each of these functions should have the same signature as the R function
+# they're replacing.
+#
+# When to use `build_expr()` vs. `Expression$create()`?
+#
+# Use `build_expr()` if you need to
+# (1) map R function names to Arrow C++ functions
+# (2) wrap R inputs (vectors) as Array/Scalar
+#
+# `Expression$create()` is lower level. Most of the functions below use it
+# because they manage the preparation of the user-provided inputs
+# and don't need to wrap scalars
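+#
+# An illustrative contrast (assuming `x` is already an Expression):
+#   build_expr("==", x, 1L)   # maps "==" to "equal" and wraps 1L as a Scalar
+#   Expression$create("equal", x, Expression$scalar(1L))   # no mapping or wrapping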
+
+nse_funcs$cast <- function(x, target_type, safe = TRUE, ...) {
+ opts <- cast_options(safe, ...)
+ opts$to_type <- as_type(target_type)
+ Expression$create("cast", x, options = opts)
+}
+
+nse_funcs$coalesce <- function(...) {
+ args <- list2(...)
+ if (length(args) < 1) {
+ abort("At least one argument must be supplied to coalesce()")
+ }
+
+ # Treat NaN like NA for consistency with dplyr::coalesce(), but if *all*
+ # the values are NaN, we should return NaN, not NA, so don't replace
+ # NaN with NA in the final (or only) argument
+ # TODO: if an option is added to the coalesce kernel to treat NaN as NA,
+ # use that to simplify the code here (ARROW-13389)
+ attr(args[[length(args)]], "last") <- TRUE
+ args <- lapply(args, function(arg) {
+ last_arg <- is.null(attr(arg, "last"))
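+    # NB: TRUE for every argument *except* the one marked "last" above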
+ attr(arg, "last") <- NULL
+
+ if (!inherits(arg, "Expression")) {
+ arg <- Expression$scalar(arg)
+ }
+
+ # coalesce doesn't yet support factors/dictionaries
+ # TODO: remove this after ARROW-14167 is merged
+ if (nse_funcs$is.factor(arg)) {
+ warning("Dictionaries (in R: factors) are currently converted to strings (characters) in coalesce", call. = FALSE)
+ }
+
+ if (last_arg && arg$type_id() %in% TYPES_WITH_NAN) {
+      # store the NA_real_ in the same type as arg to avoid casting
+ # smaller float types to larger float types
+ NA_expr <- Expression$scalar(Scalar$create(NA_real_, type = arg$type()))
+ Expression$create("if_else", Expression$create("is_nan", arg), NA_expr, arg)
+ } else {
+ arg
+ }
+ })
+ Expression$create("coalesce", args = args)
+}
+
+nse_funcs$is.na <- function(x) {
+ build_expr("is_null", x, options = list(nan_is_null = TRUE))
+}
+
+nse_funcs$is.nan <- function(x) {
+ if (is.double(x) || (inherits(x, "Expression") &&
+ x$type_id() %in% TYPES_WITH_NAN)) {
+ # TODO: if an option is added to the is_nan kernel to treat NA as NaN,
+ # use that to simplify the code here (ARROW-13366)
+ build_expr("is_nan", x) & build_expr("is_valid", x)
+ } else {
+ Expression$scalar(FALSE)
+ }
+}
+
+nse_funcs$is <- function(object, class2) {
+ if (is.string(class2)) {
+ switch(class2,
+ # for R data types, pass off to is.*() functions
+ character = nse_funcs$is.character(object),
+ numeric = nse_funcs$is.numeric(object),
+ integer = nse_funcs$is.integer(object),
+ integer64 = nse_funcs$is.integer64(object),
+ logical = nse_funcs$is.logical(object),
+ factor = nse_funcs$is.factor(object),
+ list = nse_funcs$is.list(object),
+ # for Arrow data types, compare class2 with object$type()$ToString(),
+ # but first strip off any parameters to only compare the top-level data
+ # type, and canonicalize class2
+ sub("^([^([<]+).*$", "\\1", object$type()$ToString()) ==
+ canonical_type_str(class2)
+ )
+ } else if (inherits(class2, "DataType")) {
+ object$type() == as_type(class2)
+ } else {
+ stop("Second argument to is() is not a string or DataType", call. = FALSE)
+ }
+}
+
+nse_funcs$dictionary_encode <- function(x,
+ null_encoding_behavior = c("mask", "encode")) {
+ behavior <- toupper(match.arg(null_encoding_behavior))
+ null_encoding_behavior <- NullEncodingBehavior[[behavior]]
+ Expression$create(
+ "dictionary_encode",
+ x,
+ options = list(null_encoding_behavior = null_encoding_behavior)
+ )
+}
+
+nse_funcs$between <- function(x, left, right) {
+ x >= left & x <= right
+}
+
+nse_funcs$is.finite <- function(x) {
+ is_fin <- Expression$create("is_finite", x)
+ # for compatibility with base::is.finite(), return FALSE for NA_real_
+ is_fin & !nse_funcs$is.na(is_fin)
+}
+
+nse_funcs$is.infinite <- function(x) {
+ is_inf <- Expression$create("is_inf", x)
+ # for compatibility with base::is.infinite(), return FALSE for NA_real_
+ is_inf & !nse_funcs$is.na(is_inf)
+}
+
+# as.* type casting functions
+# as.factor() is mapped in expression.R
+nse_funcs$as.character <- function(x) {
+ Expression$create("cast", x, options = cast_options(to_type = string()))
+}
+nse_funcs$as.double <- function(x) {
+ Expression$create("cast", x, options = cast_options(to_type = float64()))
+}
+nse_funcs$as.integer <- function(x) {
+ Expression$create(
+ "cast",
+ x,
+ options = cast_options(
+ to_type = int32(),
+ allow_float_truncate = TRUE,
+ allow_decimal_truncate = TRUE
+ )
+ )
+}
+nse_funcs$as.integer64 <- function(x) {
+ Expression$create(
+ "cast",
+ x,
+ options = cast_options(
+ to_type = int64(),
+ allow_float_truncate = TRUE,
+ allow_decimal_truncate = TRUE
+ )
+ )
+}
+nse_funcs$as.logical <- function(x) {
+ Expression$create("cast", x, options = cast_options(to_type = boolean()))
+}
+nse_funcs$as.numeric <- function(x) {
+ Expression$create("cast", x, options = cast_options(to_type = float64()))
+}
+
+# is.* type functions
+nse_funcs$is.character <- function(x) {
+ is.character(x) || (inherits(x, "Expression") &&
+ x$type_id() %in% Type[c("STRING", "LARGE_STRING")])
+}
+nse_funcs$is.numeric <- function(x) {
+ is.numeric(x) || (inherits(x, "Expression") && x$type_id() %in% Type[c(
+ "UINT8", "INT8", "UINT16", "INT16", "UINT32", "INT32",
+ "UINT64", "INT64", "HALF_FLOAT", "FLOAT", "DOUBLE",
+ "DECIMAL", "DECIMAL256"
+ )])
+}
+nse_funcs$is.double <- function(x) {
+ is.double(x) || (inherits(x, "Expression") && x$type_id() == Type["DOUBLE"])
+}
+nse_funcs$is.integer <- function(x) {
+ is.integer(x) || (inherits(x, "Expression") && x$type_id() %in% Type[c(
+ "UINT8", "INT8", "UINT16", "INT16", "UINT32", "INT32",
+ "UINT64", "INT64"
+ )])
+}
+nse_funcs$is.integer64 <- function(x) {
+ is.integer64(x) || (inherits(x, "Expression") && x$type_id() == Type["INT64"])
+}
+nse_funcs$is.logical <- function(x) {
+ is.logical(x) || (inherits(x, "Expression") && x$type_id() == Type["BOOL"])
+}
+nse_funcs$is.factor <- function(x) {
+ is.factor(x) || (inherits(x, "Expression") && x$type_id() == Type["DICTIONARY"])
+}
+nse_funcs$is.list <- function(x) {
+ is.list(x) || (inherits(x, "Expression") && x$type_id() %in% Type[c(
+ "LIST", "FIXED_SIZE_LIST", "LARGE_LIST"
+ )])
+}
+
+# rlang::is_* type functions
+nse_funcs$is_character <- function(x, n = NULL) {
+ assert_that(is.null(n))
+ nse_funcs$is.character(x)
+}
+nse_funcs$is_double <- function(x, n = NULL, finite = NULL) {
+ assert_that(is.null(n) && is.null(finite))
+ nse_funcs$is.double(x)
+}
+nse_funcs$is_integer <- function(x, n = NULL) {
+ assert_that(is.null(n))
+ nse_funcs$is.integer(x)
+}
+nse_funcs$is_list <- function(x, n = NULL) {
+ assert_that(is.null(n))
+ nse_funcs$is.list(x)
+}
+nse_funcs$is_logical <- function(x, n = NULL) {
+ assert_that(is.null(n))
+ nse_funcs$is.logical(x)
+}
+nse_funcs$is_timestamp <- function(x, n = NULL) {
+ assert_that(is.null(n))
+ inherits(x, "POSIXt") || (inherits(x, "Expression") && x$type_id() %in% Type[c("TIMESTAMP")])
+}
+
+# String functions
+nse_funcs$nchar <- function(x, type = "chars", allowNA = FALSE, keepNA = NA) {
+ if (allowNA) {
+ arrow_not_supported("allowNA = TRUE")
+ }
+ if (is.na(keepNA)) {
+ keepNA <- !identical(type, "width")
+ }
+ if (!keepNA) {
+ # TODO: I think there is a fill_null kernel we could use, set null to 2
+ arrow_not_supported("keepNA = TRUE")
+ }
+ if (identical(type, "bytes")) {
+ Expression$create("binary_length", x)
+ } else {
+ Expression$create("utf8_length", x)
+ }
+}
+
+nse_funcs$paste <- function(..., sep = " ", collapse = NULL, recycle0 = FALSE) {
+ assert_that(
+ is.null(collapse),
+ msg = "paste() with the collapse argument is not yet supported in Arrow"
+ )
+ if (!inherits(sep, "Expression")) {
+ assert_that(!is.na(sep), msg = "Invalid separator")
+ }
+ arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")(..., sep)
+}
+
+nse_funcs$paste0 <- function(..., collapse = NULL, recycle0 = FALSE) {
+ assert_that(
+ is.null(collapse),
+ msg = "paste0() with the collapse argument is not yet supported in Arrow"
+ )
+ arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")(..., "")
+}
+
+nse_funcs$str_c <- function(..., sep = "", collapse = NULL) {
+ assert_that(
+ is.null(collapse),
+ msg = "str_c() with the collapse argument is not yet supported in Arrow"
+ )
+ arrow_string_join_function(NullHandlingBehavior$EMIT_NULL)(..., sep)
+}
+
+arrow_string_join_function <- function(null_handling, null_replacement = NULL) {
+ # the `binary_join_element_wise` Arrow C++ compute kernel takes the separator
+ # as the last argument, so pass `sep` as the last dots arg to this function
+ function(...) {
+ args <- lapply(list(...), function(arg) {
+ # handle scalar literal args, and cast all args to string for
+ # consistency with base::paste(), base::paste0(), and stringr::str_c()
+ if (!inherits(arg, "Expression")) {
+ assert_that(
+ length(arg) == 1,
+ msg = "Literal vectors of length != 1 not supported in string concatenation"
+ )
+ Expression$scalar(as.character(arg))
+ } else {
+ nse_funcs$as.character(arg)
+ }
+ })
+ Expression$create(
+ "binary_join_element_wise",
+ args = args,
+ options = list(
+ null_handling = null_handling,
+ null_replacement = null_replacement
+ )
+ )
+ }
+}
+
+# Currently, Arrow does not support a locale option for string case conversion
+# functions, in contrast to stringr's API, so the 'locale' argument is only valid
+# for stringr's default value ("en"). The following are string functions that
+# take a 'locale' option as its second argument:
+# str_to_lower
+# str_to_upper
+# str_to_title
+#
+# Arrow locale will be supported with ARROW-14126
+stop_if_locale_provided <- function(locale) {
+ if (!identical(locale, "en")) {
+ stop("Providing a value for 'locale' other than the default ('en') is not supported by Arrow. ",
+ "To change locale, use 'Sys.setlocale()'",
+ call. = FALSE
+ )
+ }
+}
+
+nse_funcs$str_to_lower <- function(string, locale = "en") {
+ stop_if_locale_provided(locale)
+ Expression$create("utf8_lower", string)
+}
+
+nse_funcs$str_to_upper <- function(string, locale = "en") {
+ stop_if_locale_provided(locale)
+ Expression$create("utf8_upper", string)
+}
+
+nse_funcs$str_to_title <- function(string, locale = "en") {
+ stop_if_locale_provided(locale)
+ Expression$create("utf8_title", string)
+}
+
+nse_funcs$str_trim <- function(string, side = c("both", "left", "right")) {
+ side <- match.arg(side)
+ trim_fun <- switch(side,
+ left = "utf8_ltrim_whitespace",
+ right = "utf8_rtrim_whitespace",
+ both = "utf8_trim_whitespace"
+ )
+ Expression$create(trim_fun, string)
+}
+
+nse_funcs$substr <- function(x, start, stop) {
+ assert_that(
+ length(start) == 1,
+ msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+ assert_that(
+ length(stop) == 1,
+ msg = "`stop` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+  # substr treats values as if they're on a continuous number line, so values
+  # <= 0 are effectively blank characters - set `start` to 1 here so Arrow mimics
+ # this behavior
+ if (start <= 0) {
+ start <- 1
+ }
+
+ # if `stop` is lower than `start`, this is invalid, so set `stop` to
+ # 0 so that an empty string will be returned (consistent with base::substr())
+ if (stop < start) {
+ stop <- 0
+ }
+
+ Expression$create(
+ "utf8_slice_codeunits",
+ x,
+ # we don't need to subtract 1 from `stop` as C++ counts exclusively
+ # which effectively cancels out the difference in indexing between R & C++
+ options = list(start = start - 1L, stop = stop)
+ )
+}
+
+nse_funcs$substring <- function(text, first, last) {
+ nse_funcs$substr(x = text, start = first, stop = last)
+}
+
+nse_funcs$str_sub <- function(string, start = 1L, end = -1L) {
+ assert_that(
+ length(start) == 1,
+ msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+ assert_that(
+ length(end) == 1,
+ msg = "`end` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ # In stringr::str_sub, an `end` value of -1 means the end of the string, so
+ # set it to the maximum integer to match this behavior
+ if (end == -1) {
+ end <- .Machine$integer.max
+ }
+
+ # An end value lower than a start value returns an empty string in
+ # stringr::str_sub so set end to 0 here to match this behavior
+ if (end < start) {
+ end <- 0
+ }
+
+ # subtract 1 from `start` because C++ is 0-based and R is 1-based
+ # str_sub treats a `start` value of 0 or 1 as the same thing so don't subtract 1 when `start` == 0
+ # when `start` < 0, both str_sub and utf8_slice_codeunits count backwards from the end
+ if (start > 0) {
+ start <- start - 1L
+ }
+
+ Expression$create(
+ "utf8_slice_codeunits",
+ string,
+ options = list(start = start, stop = end)
+ )
+}
+
+nse_funcs$grepl <- function(pattern, x, ignore.case = FALSE, fixed = FALSE) {
+ arrow_fun <- ifelse(fixed, "match_substring", "match_substring_regex")
+ Expression$create(
+ arrow_fun,
+ x,
+ options = list(pattern = pattern, ignore_case = ignore.case)
+ )
+}
+
+nse_funcs$str_detect <- function(string, pattern, negate = FALSE) {
+ opts <- get_stringr_pattern_options(enexpr(pattern))
+ out <- nse_funcs$grepl(
+ pattern = opts$pattern,
+ x = string,
+ ignore.case = opts$ignore_case,
+ fixed = opts$fixed
+ )
+ if (negate) {
+ out <- !out
+ }
+ out
+}
+
+nse_funcs$str_like <- function(string, pattern, ignore_case = TRUE) {
+ Expression$create(
+ "match_like",
+ string,
+ options = list(pattern = pattern, ignore_case = ignore_case)
+ )
+}
+
+# Encapsulate some common logic for sub/gsub/str_replace/str_replace_all
+arrow_r_string_replace_function <- function(max_replacements) {
+ function(pattern, replacement, x, ignore.case = FALSE, fixed = FALSE) {
+ Expression$create(
+ ifelse(fixed && !ignore.case, "replace_substring", "replace_substring_regex"),
+ x,
+ options = list(
+ pattern = format_string_pattern(pattern, ignore.case, fixed),
+ replacement = format_string_replacement(replacement, ignore.case, fixed),
+ max_replacements = max_replacements
+ )
+ )
+ }
+}
+
+arrow_stringr_string_replace_function <- function(max_replacements) {
+ function(string, pattern, replacement) {
+ opts <- get_stringr_pattern_options(enexpr(pattern))
+ arrow_r_string_replace_function(max_replacements)(
+ pattern = opts$pattern,
+ replacement = replacement,
+ x = string,
+ ignore.case = opts$ignore_case,
+ fixed = opts$fixed
+ )
+ }
+}
+
+nse_funcs$sub <- arrow_r_string_replace_function(1L)
+nse_funcs$gsub <- arrow_r_string_replace_function(-1L)
+nse_funcs$str_replace <- arrow_stringr_string_replace_function(1L)
+nse_funcs$str_replace_all <- arrow_stringr_string_replace_function(-1L)
+
+nse_funcs$strsplit <- function(x,
+ split,
+ fixed = FALSE,
+ perl = FALSE,
+ useBytes = FALSE) {
+ assert_that(is.string(split))
+
+ arrow_fun <- ifelse(fixed, "split_pattern", "split_pattern_regex")
+ # warn when the user specifies both fixed = TRUE and perl = TRUE, for
+ # consistency with the behavior of base::strsplit()
+ if (fixed && perl) {
+ warning("Argument 'perl = TRUE' will be ignored", call. = FALSE)
+ }
+ # since split is not a regex, proceed without any warnings or errors regardless
+ # of the value of perl, for consistency with the behavior of base::strsplit()
+ Expression$create(
+ arrow_fun,
+ x,
+ options = list(pattern = split, reverse = FALSE, max_splits = -1L)
+ )
+}
+
+nse_funcs$str_split <- function(string, pattern, n = Inf, simplify = FALSE) {
+ opts <- get_stringr_pattern_options(enexpr(pattern))
+ arrow_fun <- ifelse(opts$fixed, "split_pattern", "split_pattern_regex")
+ if (opts$ignore_case) {
+ arrow_not_supported("Case-insensitive string splitting")
+ }
+ if (n == 0) {
+ arrow_not_supported("Splitting strings into zero parts")
+ }
+ if (identical(n, Inf)) {
+ n <- 0L
+ }
+ if (simplify) {
+ warning("Argument 'simplify = TRUE' will be ignored", call. = FALSE)
+ }
+ # The max_splits option in the Arrow C++ library controls the maximum number
+ # of places at which the string is split, whereas the argument n to
+ # str_split() controls the maximum number of pieces to return. So we must
+ # subtract 1 from n to get max_splits.
+ Expression$create(
+ arrow_fun,
+ string,
+ options = list(
+ pattern = opts$pattern,
+ reverse = FALSE,
+ max_splits = n - 1L
+ )
+ )
+}
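+
+# Illustrative sketch (not run) of the n -> max_splits translation above:
+#   str_split(x, ",", n = 3)  -> max_splits = 2 (at most 3 pieces)
+#   str_split(x, ",")         -> n = Inf -> 0L -> max_splits = -1 (unlimited)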
+
+nse_funcs$pmin <- function(..., na.rm = FALSE) {
+ build_expr(
+ "min_element_wise",
+ ...,
+ options = list(skip_nulls = na.rm)
+ )
+}
+
+nse_funcs$pmax <- function(..., na.rm = FALSE) {
+ build_expr(
+ "max_element_wise",
+ ...,
+ options = list(skip_nulls = na.rm)
+ )
+}
+
+nse_funcs$str_pad <- function(string, width, side = c("left", "right", "both"), pad = " ") {
+ assert_that(is_integerish(width))
+ side <- match.arg(side)
+ assert_that(is.string(pad))
+
+ if (side == "left") {
+ pad_func <- "utf8_lpad"
+ } else if (side == "right") {
+ pad_func <- "utf8_rpad"
+ } else if (side == "both") {
+ pad_func <- "utf8_center"
+ }
+
+ Expression$create(
+ pad_func,
+ string,
+ options = list(width = width, padding = pad)
+ )
+}
+
+nse_funcs$startsWith <- function(x, prefix) {
+ Expression$create(
+ "starts_with",
+ x,
+ options = list(pattern = prefix)
+ )
+}
+
+nse_funcs$endsWith <- function(x, suffix) {
+ Expression$create(
+ "ends_with",
+ x,
+ options = list(pattern = suffix)
+ )
+}
+
+nse_funcs$str_starts <- function(string, pattern, negate = FALSE) {
+ opts <- get_stringr_pattern_options(enexpr(pattern))
+ if (opts$fixed) {
+ out <- nse_funcs$startsWith(x = string, prefix = opts$pattern)
+ } else {
+ out <- nse_funcs$grepl(pattern = paste0("^", opts$pattern), x = string, fixed = FALSE)
+ }
+
+ if (negate) {
+ out <- !out
+ }
+ out
+}
+
+nse_funcs$str_ends <- function(string, pattern, negate = FALSE) {
+ opts <- get_stringr_pattern_options(enexpr(pattern))
+ if (opts$fixed) {
+ out <- nse_funcs$endsWith(x = string, suffix = opts$pattern)
+ } else {
+ out <- nse_funcs$grepl(pattern = paste0(opts$pattern, "$"), x = string, fixed = FALSE)
+ }
+
+ if (negate) {
+ out <- !out
+ }
+ out
+}
+
+nse_funcs$str_count <- function(string, pattern) {
+ opts <- get_stringr_pattern_options(enexpr(pattern))
+ if (!is.string(pattern)) {
+ arrow_not_supported("`pattern` must be a length 1 character vector; other values")
+ }
+ arrow_fun <- ifelse(opts$fixed, "count_substring", "count_substring_regex")
+ Expression$create(
+ arrow_fun,
+ string,
+ options = list(pattern = opts$pattern, ignore_case = opts$ignore_case)
+ )
+}
+
+# String function helpers
+
+# format `pattern` as needed for case insensitivity and literal matching by RE2
+format_string_pattern <- function(pattern, ignore.case, fixed) {
+ # Arrow lacks native support for case-insensitive literal string matching and
+ # replacement, so we use the regular expression engine (RE2) to do this.
+ # https://github.com/google/re2/wiki/Syntax
+ if (ignore.case) {
+ if (fixed) {
+ # Everything between "\Q" and "\E" is treated as literal text.
+ # If the search text contains any literal "\E" strings, make them
+ # lowercase so they won't signal the end of the literal text:
+ pattern <- gsub("\\E", "\\e", pattern, fixed = TRUE)
+ pattern <- paste0("\\Q", pattern, "\\E")
+ }
+ # Prepend "(?i)" for case-insensitive matching
+ pattern <- paste0("(?i)", pattern)
+ }
+ pattern
+}
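+
+# Illustrative sketch (not run):
+#   format_string_pattern("a.b", ignore.case = TRUE, fixed = TRUE)
+# returns the regex (?i)\Qa.b\E, which matches the dot literally and
+# case-insensitively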
+
+# format `replacement` as needed for literal replacement by RE2
+format_string_replacement <- function(replacement, ignore.case, fixed) {
+ # Arrow lacks native support for case-insensitive literal string
+ # replacement, so we use the regular expression engine (RE2) to do this.
+ # https://github.com/google/re2/wiki/Syntax
+ if (ignore.case && fixed) {
+ # Escape single backslashes in the regex replacement text so they are
+ # interpreted as literal backslashes:
+ replacement <- gsub("\\", "\\\\", replacement, fixed = TRUE)
+ }
+ replacement
+}
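+
+# Illustrative sketch (not run): a replacement containing one literal
+# backslash, e.g. "a\\b", comes back as "a\\\\b" so that RE2 emits the
+# backslash literally rather than treating it as an escape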
+
+#' Get `stringr` pattern options
+#'
+#' This function assigns definitions for the `stringr` pattern modifier
+#' functions (`fixed()`, `regex()`, etc.) inside itself, and uses them to
+#' evaluate the quoted expression `pattern`, returning a list that is used
+#' to control pattern matching behavior in internal `arrow` functions.
+#'
+#' @param pattern Unevaluated expression containing a call to a `stringr`
+#' pattern modifier function
+#'
+#' @return List containing elements `pattern`, `fixed`, and `ignore_case`
+#' @keywords internal
+get_stringr_pattern_options <- function(pattern) {
+ fixed <- function(pattern, ignore_case = FALSE, ...) {
+ check_dots(...)
+ list(pattern = pattern, fixed = TRUE, ignore_case = ignore_case)
+ }
+ regex <- function(pattern, ignore_case = FALSE, ...) {
+ check_dots(...)
+ list(pattern = pattern, fixed = FALSE, ignore_case = ignore_case)
+ }
+ coll <- function(...) {
+ arrow_not_supported("Pattern modifier `coll()`")
+ }
+ boundary <- function(...) {
+ arrow_not_supported("Pattern modifier `boundary()`")
+ }
+ check_dots <- function(...) {
+ dots <- list(...)
+ if (length(dots)) {
+ warning(
+ "Ignoring pattern modifier ",
+ ngettext(length(dots), "argument ", "arguments "),
+ "not supported in Arrow: ",
+ oxford_paste(names(dots)),
+ call. = FALSE
+ )
+ }
+ }
+ ensure_opts <- function(opts) {
+ if (is.character(opts)) {
+ opts <- list(pattern = opts, fixed = FALSE, ignore_case = FALSE)
+ }
+ opts
+ }
+ ensure_opts(eval(pattern))
+}
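+
+# Illustrative sketch (not run):
+#   get_stringr_pattern_options(quote(fixed("a.b")))
+# returns list(pattern = "a.b", fixed = TRUE, ignore_case = FALSE); a bare
+# string pattern falls through ensure_opts() with fixed = FALSE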
+
+#' Does this string contain regex metacharacters?
+#'
+#' @param string String to be tested
+#' @keywords internal
+#' @return Logical: does `string` contain regex metacharacters?
+contains_regex <- function(string) {
+ grepl("[.\\|()[{^$*+?]", string)
+}
+
+nse_funcs$strptime <- function(x, format = "%Y-%m-%d %H:%M:%S", tz = NULL, unit = "ms") {
+ # Arrow requires a unit for time parsing; strptime() does not.
+ # Arrow has no default options for strptime (format, unit), so we supply
+ # defaults of format = "%Y-%m-%d %H:%M:%S" and unit = MILLI/1L/"ms"
+ # (ARROW-12809)
+
+ # ParseTimestampStrptime currently ignores the timezone information (ARROW-12820).
+ # Stop if tz is provided.
+ if (is.character(tz)) {
+ arrow_not_supported("Time zone argument")
+ }
+
+ unit <- make_valid_time_unit(unit, c(valid_time64_units, valid_time32_units))
+
+ Expression$create("strptime", x, options = list(format = format, unit = unit))
+}
+
+nse_funcs$strftime <- function(x, format = "", tz = "", usetz = FALSE) {
+ if (usetz) {
+ format <- paste(format, "%Z")
+ }
+ if (tz == "") {
+ tz <- Sys.timezone()
+ }
+ # Arrow's strftime prints in timezone of the timestamp. To match R's strftime behavior we first
+ # cast the timestamp to desired timezone. This is a metadata only change.
+ if (nse_funcs$is_timestamp(x)) {
+ ts <- Expression$create("cast", x, options = list(to_type = timestamp(x$type()$unit(), tz)))
+ } else {
+ ts <- x
+ }
+ Expression$create("strftime", ts, options = list(format = format, locale = Sys.getlocale("LC_TIME")))
+}
+
+nse_funcs$format_ISO8601 <- function(x, usetz = FALSE, precision = NULL, ...) {
+ ISO8601_precision_map <-
+ list(
+ y = "%Y",
+ ym = "%Y-%m",
+ ymd = "%Y-%m-%d",
+ ymdh = "%Y-%m-%dT%H",
+ ymdhm = "%Y-%m-%dT%H:%M",
+ ymdhms = "%Y-%m-%dT%H:%M:%S"
+ )
+
+ if (is.null(precision)) {
+ precision <- "ymdhms"
+ }
+ if (!precision %in% names(ISO8601_precision_map)) {
+ abort(
+ paste0(
+ "`precision` must be one of the following values: ",
+ paste(names(ISO8601_precision_map), collapse = ", "),
+ "\nValue supplied was: ",
+ precision
+ )
+ )
+ }
+ format <- ISO8601_precision_map[[precision]]
+ if (usetz) {
+ format <- paste0(format, "%z")
+ }
+ Expression$create("strftime", x, options = list(format = format, locale = "C"))
+}
+
+nse_funcs$second <- function(x) {
+ Expression$create("add", Expression$create("second", x), Expression$create("subsecond", x))
+}
+
+nse_funcs$trunc <- function(x, ...) {
+ # accepts and ignores ... for consistency with base::trunc()
+ build_expr("trunc", x)
+}
+
+nse_funcs$round <- function(x, digits = 0) {
+ build_expr(
+ "round",
+ x,
+ options = list(ndigits = digits, round_mode = RoundMode$HALF_TO_EVEN)
+ )
+}
+
+nse_funcs$wday <- function(x,
+ label = FALSE,
+ abbr = TRUE,
+ week_start = getOption("lubridate.week.start", 7),
+ locale = Sys.getlocale("LC_TIME")) {
+ if (label) {
+ if (abbr) {
+ format <- "%a"
+ } else {
+ format <- "%A"
+ }
+ return(Expression$create("strftime", x, options = list(format = format, locale = locale)))
+ }
+
+ Expression$create("day_of_week", x, options = list(count_from_zero = FALSE, week_start = week_start))
+}
+
+nse_funcs$log <- nse_funcs$logb <- function(x, base = exp(1)) {
+ # like other binary functions, either `x` or `base` can be Expression or double(1)
+ if (is.numeric(x) && length(x) == 1) {
+ x <- Expression$scalar(x)
+ } else if (!inherits(x, "Expression")) {
+ arrow_not_supported("x must be a column or a length-1 numeric; other values")
+ }
+
+ # handle `base` differently because we use the simpler ln, log2, and log10
+ # functions for specific scalar base values
+ if (inherits(base, "Expression")) {
+ return(Expression$create("logb_checked", x, base))
+ }
+
+ if (!is.numeric(base) || length(base) != 1) {
+ arrow_not_supported("base must be a column or a length-1 numeric; other values")
+ }
+
+ if (base == exp(1)) {
+ return(Expression$create("ln_checked", x))
+ }
+
+ if (base == 2) {
+ return(Expression$create("log2_checked", x))
+ }
+
+ if (base == 10) {
+ return(Expression$create("log10_checked", x))
+ }
+
+ Expression$create("logb_checked", x, Expression$scalar(base))
+}
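+
+# Illustrative sketch (not run) of the base dispatch above:
+#   log(x)            -> ln_checked(x)
+#   log(x, base = 2)  -> log2_checked(x); base = 10 -> log10_checked(x)
+#   log(x, base = 3)  -> logb_checked(x, 3)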
+
+nse_funcs$if_else <- function(condition, true, false, missing = NULL) {
+ if (!is.null(missing)) {
+ return(nse_funcs$if_else(
+ nse_funcs$is.na(condition),
+ missing,
+ nse_funcs$if_else(condition, true, false)
+ ))
+ }
+
+ # if_else doesn't yet support factors/dictionaries
+ # TODO: remove this after ARROW-13358 is merged
+ warn_types <- nse_funcs$is.factor(true) | nse_funcs$is.factor(false)
+ if (warn_types) {
+ warning(
+ "Dictionaries (in R: factors) are currently converted to strings (characters) ",
+ "in if_else and ifelse",
+ call. = FALSE
+ )
+ }
+
+ build_expr("if_else", condition, true, false)
+}
+
+# Although base R ifelse allows `yes` and `no` to be different classes,
+# this binding simply delegates to the if_else() binding above
+nse_funcs$ifelse <- function(test, yes, no) {
+ nse_funcs$if_else(condition = test, true = yes, false = no)
+}
+
+nse_funcs$case_when <- function(...) {
+ formulas <- list2(...)
+ n <- length(formulas)
+ if (n == 0) {
+ abort("No cases provided in case_when()")
+ }
+ query <- vector("list", n)
+ value <- vector("list", n)
+ mask <- caller_env()
+ for (i in seq_len(n)) {
+ f <- formulas[[i]]
+ if (!inherits(f, "formula")) {
+ abort("Each argument to case_when() must be a two-sided formula")
+ }
+ query[[i]] <- arrow_eval(f[[2]], mask)
+ value[[i]] <- arrow_eval(f[[3]], mask)
+ if (!nse_funcs$is.logical(query[[i]])) {
+ abort("Left side of each formula in case_when() must be a logical expression")
+ }
+ if (inherits(value[[i]], "try-error")) {
+ abort(handle_arrow_not_supported(value[[i]], format_expr(f[[3]])))
+ }
+ }
+ build_expr(
+ "case_when",
+ args = c(
+ build_expr(
+ "make_struct",
+ args = query,
+ options = list(field_names = as.character(seq_along(query)))
+ ),
+ value
+ )
+ )
+}
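+
+# Illustrative sketch (not run): case_when(x > 1 ~ "a", TRUE ~ "b") is built
+# roughly as case_when(make_struct(x > 1, TRUE, field_names = c("1", "2")),
+# "a", "b"): the conditions are packed into a struct, followed by the values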
+
+# Aggregation functions
+# These all return a list of:
+# @param fun string function name
+# @param data Expression (these are all currently a single field)
+# @param options list of function options, as passed to call_function
+# For group-by aggregation, `hash_` gets prepended to the function name.
+# So to see a list of available hash aggregation functions,
+# you can use list_compute_functions("^hash_")
+agg_funcs <- list()
+agg_funcs$sum <- function(..., na.rm = FALSE) {
+ list(
+ fun = "sum",
+ data = ensure_one_arg(list2(...), "sum"),
+ options = list(skip_nulls = na.rm, min_count = 0L)
+ )
+}
+agg_funcs$any <- function(..., na.rm = FALSE) {
+ list(
+ fun = "any",
+ data = ensure_one_arg(list2(...), "any"),
+ options = list(skip_nulls = na.rm, min_count = 0L)
+ )
+}
+agg_funcs$all <- function(..., na.rm = FALSE) {
+ list(
+ fun = "all",
+ data = ensure_one_arg(list2(...), "all"),
+ options = list(skip_nulls = na.rm, min_count = 0L)
+ )
+}
+agg_funcs$mean <- function(x, na.rm = FALSE) {
+ list(
+ fun = "mean",
+ data = x,
+ options = list(skip_nulls = na.rm, min_count = 0L)
+ )
+}
+agg_funcs$sd <- function(x, na.rm = FALSE, ddof = 1) {
+ list(
+ fun = "stddev",
+ data = x,
+ options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof)
+ )
+}
+agg_funcs$var <- function(x, na.rm = FALSE, ddof = 1) {
+ list(
+ fun = "variance",
+ data = x,
+ options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof)
+ )
+}
+agg_funcs$quantile <- function(x, probs, na.rm = FALSE) {
+ if (length(probs) != 1) {
+ arrow_not_supported("quantile() with length(probs) != 1")
+ }
+ # TODO: Bind to the Arrow function that returns an exact quantile and remove
+ # this warning (ARROW-14021)
+ warn(
+ "quantile() currently returns an approximate quantile in Arrow",
+ .frequency = ifelse(is_interactive(), "once", "always"),
+ .frequency_id = "arrow.quantile.approximate"
+ )
+ list(
+ fun = "tdigest",
+ data = x,
+ options = list(skip_nulls = na.rm, q = probs)
+ )
+}
+agg_funcs$median <- function(x, na.rm = FALSE) {
+ # TODO: Bind to the Arrow function that returns an exact median and remove
+ # this warning (ARROW-14021)
+ warn(
+ "median() currently returns an approximate median in Arrow",
+ .frequency = ifelse(is_interactive(), "once", "always"),
+ .frequency_id = "arrow.median.approximate"
+ )
+ list(
+ fun = "approximate_median",
+ data = x,
+ options = list(skip_nulls = na.rm)
+ )
+}
+agg_funcs$n_distinct <- function(..., na.rm = FALSE) {
+ list(
+ fun = "count_distinct",
+ data = ensure_one_arg(list2(...), "n_distinct"),
+ options = list(na.rm = na.rm)
+ )
+}
+agg_funcs$n <- function() {
+ list(
+ fun = "sum",
+ data = Expression$scalar(1L),
+ options = list()
+ )
+}
+agg_funcs$min <- function(..., na.rm = FALSE) {
+ list(
+ fun = "min",
+ data = ensure_one_arg(list2(...), "min"),
+ options = list(skip_nulls = na.rm, min_count = 0L)
+ )
+}
+agg_funcs$max <- function(..., na.rm = FALSE) {
+ list(
+ fun = "max",
+ data = ensure_one_arg(list2(...), "max"),
+ options = list(skip_nulls = na.rm, min_count = 0L)
+ )
+}
+
+ensure_one_arg <- function(args, fun) {
+ if (length(args) == 0) {
+ arrow_not_supported(paste0(fun, "() with 0 arguments"))
+ } else if (length(args) > 1) {
+ arrow_not_supported(paste0("Multiple arguments to ", fun, "()"))
+ }
+ args[[1]]
+}
+
+output_type <- function(fun, input_type, hash) {
+ # These are quick and dirty heuristics.
+ if (fun %in% c("any", "all")) {
+ bool()
+ } else if (fun %in% "sum") {
+ # It may upcast to a bigger type but this is close enough
+ input_type
+ } else if (fun %in% c("mean", "stddev", "variance", "approximate_median")) {
+ float64()
+ } else if (fun %in% "tdigest") {
+ if (hash) {
+ fixed_size_list_of(float64(), 1L)
+ } else {
+ float64()
+ }
+ } else {
+ # Just so things don't error, assume the resulting type is the same
+ input_type
+ }
+}
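+
+# Illustrative sketch (not run) of the heuristics above:
+#   output_type("mean", int32(), hash = FALSE)    -> float64()
+#   output_type("tdigest", int32(), hash = TRUE)  -> fixed_size_list_of(float64(), 1L)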
diff --git a/src/arrow/r/R/dplyr-group-by.R b/src/arrow/r/R/dplyr-group-by.R
new file mode 100644
index 000000000..66b867210
--- /dev/null
+++ b/src/arrow/r/R/dplyr-group-by.R
@@ -0,0 +1,86 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# The following S3 methods are registered on load if dplyr is present
+
+group_by.arrow_dplyr_query <- function(.data,
+ ...,
+ .add = FALSE,
+ add = .add,
+ .drop = dplyr::group_by_drop_default(.data)) {
+ .data <- as_adq(.data)
+ new_groups <- enquos(...)
+ # ... can contain expressions (i.e. can add (or rename?) columns) and so we
+ # need to identify those and add them on to the query with mutate. Specifically,
+ # we want to mark as new:
+ # * expressions (named or otherwise)
+ # * variables that have new names
+ # All others (i.e. simple references to variables) should not be (re)-added
+
+ # Identify any groups whose deparsed expression isn't already a column name in .data
+ new_group_ind <- map_lgl(new_groups, ~ !(quo_name(.x) %in% names(.data)))
+ # Identify any groups which have been given explicit names
+ named_group_ind <- map_lgl(names(new_groups), nzchar)
+ # Retain any new groups identified above
+ new_groups <- new_groups[new_group_ind | named_group_ind]
+ if (length(new_groups)) {
+ # now either use the name that was given in ... or if that is "" then use the expr
+ names(new_groups) <- imap_chr(new_groups, ~ ifelse(.y == "", quo_name(.x), .y))
+
+ # Add them to the data
+ .data <- dplyr::mutate(.data, !!!new_groups)
+ }
+ if (".add" %in% names(formals(dplyr::group_by))) {
+ # For compatibility with dplyr >= 1.0
+ gv <- dplyr::group_by_prepare(.data, ..., .add = .add)$group_names
+ } else {
+ gv <- dplyr::group_by_prepare(.data, ..., add = add)$group_names
+ }
+ .data$group_by_vars <- gv
+ .data$drop_empty_groups <- ifelse(length(gv), .drop, dplyr::group_by_drop_default(.data))
+ .data
+}
+group_by.Dataset <- group_by.ArrowTabular <- group_by.arrow_dplyr_query
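+
+# Illustrative sketch (not run): group_by(ds, x) only records "x" as a group
+# var, while group_by(ds, y2 = y + 1) first mutate()s a y2 column and then
+# groups by it, matching dplyr's handling of expressions in group_by()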
+
+groups.arrow_dplyr_query <- function(x) syms(dplyr::group_vars(x))
+groups.Dataset <- groups.ArrowTabular <- function(x) NULL
+
+group_vars.arrow_dplyr_query <- function(x) x$group_by_vars
+group_vars.Dataset <- function(x) NULL
+group_vars.RecordBatchReader <- function(x) NULL
+group_vars.ArrowTabular <- function(x) {
+ x$r_metadata$attributes$.group_vars
+}
+
+# the logical literal in the two functions below controls the default value of
+# the .drop argument to group_by()
+group_by_drop_default.arrow_dplyr_query <-
+ function(.tbl) .tbl$drop_empty_groups %||% TRUE
+group_by_drop_default.Dataset <- group_by_drop_default.ArrowTabular <-
+ function(.tbl) TRUE
+
+ungroup.arrow_dplyr_query <- function(x, ...) {
+ x$group_by_vars <- character()
+ x$drop_empty_groups <- NULL
+ x
+}
+ungroup.Dataset <- force
+ungroup.ArrowTabular <- function(x) {
+ x$r_metadata$attributes$.group_vars <- NULL
+ x
+}
diff --git a/src/arrow/r/R/dplyr-join.R b/src/arrow/r/R/dplyr-join.R
new file mode 100644
index 000000000..c14b1a8f3
--- /dev/null
+++ b/src/arrow/r/R/dplyr-join.R
@@ -0,0 +1,126 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# The following S3 methods are registered on load if dplyr is present
+
+do_join <- function(x,
+ y,
+ by = NULL,
+ copy = FALSE,
+ suffix = c(".x", ".y"),
+ ...,
+ keep = FALSE,
+ na_matches,
+ join_type) {
+ # TODO: handle `copy` arg: ignore?
+ # TODO: handle `suffix` arg: Arrow does prefix
+ # TODO: handle `keep` arg: "Should the join keys from both ‘x’ and ‘y’ be preserved in the output?"
+ # TODO: handle `na_matches` arg
+ x <- as_adq(x)
+ y <- as_adq(y)
+ by <- handle_join_by(by, x, y)
+
+ x$join <- list(
+ type = JoinType[[join_type]],
+ right_data = y,
+ by = by
+ )
+ collapse.arrow_dplyr_query(x)
+}
+
+left_join.arrow_dplyr_query <- function(x,
+ y,
+ by = NULL,
+ copy = FALSE,
+ suffix = c(".x", ".y"),
+ ...,
+ keep = FALSE) {
+ do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "LEFT_OUTER")
+}
+left_join.Dataset <- left_join.ArrowTabular <- left_join.arrow_dplyr_query
+
+right_join.arrow_dplyr_query <- function(x,
+ y,
+ by = NULL,
+ copy = FALSE,
+ suffix = c(".x", ".y"),
+ ...,
+ keep = FALSE) {
+ do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "RIGHT_OUTER")
+}
+right_join.Dataset <- right_join.ArrowTabular <- right_join.arrow_dplyr_query
+
+inner_join.arrow_dplyr_query <- function(x,
+ y,
+ by = NULL,
+ copy = FALSE,
+ suffix = c(".x", ".y"),
+ ...,
+ keep = FALSE) {
+ do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "INNER")
+}
+inner_join.Dataset <- inner_join.ArrowTabular <- inner_join.arrow_dplyr_query
+
+full_join.arrow_dplyr_query <- function(x,
+ y,
+ by = NULL,
+ copy = FALSE,
+ suffix = c(".x", ".y"),
+ ...,
+ keep = FALSE) {
+ do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "FULL_OUTER")
+}
+full_join.Dataset <- full_join.ArrowTabular <- full_join.arrow_dplyr_query
+
+semi_join.arrow_dplyr_query <- function(x,
+ y,
+ by = NULL,
+ copy = FALSE,
+ suffix = c(".x", ".y"),
+ ...,
+ keep = FALSE) {
+ do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "LEFT_SEMI")
+}
+semi_join.Dataset <- semi_join.ArrowTabular <- semi_join.arrow_dplyr_query
+
+anti_join.arrow_dplyr_query <- function(x,
+ y,
+ by = NULL,
+ copy = FALSE,
+ suffix = c(".x", ".y"),
+ ...,
+ keep = FALSE) {
+ do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "LEFT_ANTI")
+}
+anti_join.Dataset <- anti_join.ArrowTabular <- anti_join.arrow_dplyr_query
+
+handle_join_by <- function(by, x, y) {
+ if (is.null(by)) {
+ return(set_names(intersect(names(x), names(y))))
+ }
+ stopifnot(is.character(by))
+ if (is.null(names(by))) {
+ by <- set_names(by)
+ }
+ # TODO: nicer messages?
+ stopifnot(
+ all(names(by) %in% names(x)),
+ all(by %in% names(y))
+ )
+ by
+}
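+
+# Illustrative sketch (not run):
+#   handle_join_by(NULL, x, y)          -> natural join on intersect(names(x), names(y))
+#   handle_join_by(c(id = "key"), x, y) -> "id" must be a column of x, "key" of y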
diff --git a/src/arrow/r/R/dplyr-mutate.R b/src/arrow/r/R/dplyr-mutate.R
new file mode 100644
index 000000000..2e5239484
--- /dev/null
+++ b/src/arrow/r/R/dplyr-mutate.R
@@ -0,0 +1,140 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# The following S3 methods are registered on load if dplyr is present
+
+mutate.arrow_dplyr_query <- function(.data,
+ ...,
+ .keep = c("all", "used", "unused", "none"),
+ .before = NULL,
+ .after = NULL) {
+ call <- match.call()
+ exprs <- ensure_named_exprs(quos(...))
+
+ .keep <- match.arg(.keep)
+ .before <- enquo(.before)
+ .after <- enquo(.after)
+
+ if (.keep %in% c("all", "unused") && length(exprs) == 0) {
+ # Nothing to do
+ return(.data)
+ }
+
+ .data <- as_adq(.data)
+
+ # Restrict the cases we support for now
+ has_aggregations <- any(unlist(lapply(exprs, all_funs)) %in% names(agg_funcs))
+ if (has_aggregations) {
+ # ARROW-13926
+ # mutate() on a grouped dataset does calculations within groups
+ # This doesn't matter on scalar ops (arithmetic etc.) but it does
+ # for things with aggregations (e.g. subtracting the mean)
+ return(abandon_ship(call, .data, "window functions not currently supported in Arrow"))
+ }
+
+ mask <- arrow_mask(.data)
+ results <- list()
+ for (i in seq_along(exprs)) {
+ # Iterate over the indices and not the names because names may be repeated
+ # (which overwrites the previous name)
+ new_var <- names(exprs)[i]
+ results[[new_var]] <- arrow_eval(exprs[[i]], mask)
+ if (inherits(results[[new_var]], "try-error")) {
+ msg <- handle_arrow_not_supported(
+ results[[new_var]],
+ format_expr(exprs[[i]])
+ )
+ return(abandon_ship(call, .data, msg))
+ } else if (!inherits(results[[new_var]], "Expression") &&
+ !is.null(results[[new_var]])) {
+ # We need some wrapping to handle literal values
+ if (length(results[[new_var]]) != 1) {
+ msg <- paste0("In ", new_var, " = ", format_expr(exprs[[i]]), ", only values of size one are recycled")
+ return(abandon_ship(call, .data, msg))
+ }
+ results[[new_var]] <- Expression$scalar(results[[new_var]])
+ }
+ # Put it in the data mask too
+ mask[[new_var]] <- mask$.data[[new_var]] <- results[[new_var]]
+ }
+
+ old_vars <- names(.data$selected_columns)
+ # Note that this is names(exprs) not names(results):
+ # if results$new_var is NULL, that means we are supposed to remove it
+ new_vars <- names(exprs)
+
+ # Assign the new columns into the .data$selected_columns
+ for (new_var in new_vars) {
+ .data$selected_columns[[new_var]] <- results[[new_var]]
+ }
+
+ # Deduplicate new_vars and remove NULL columns from new_vars
+ new_vars <- intersect(new_vars, names(.data$selected_columns))
+
+ # Respect .before and .after
+ if (!quo_is_null(.before) || !quo_is_null(.after)) {
+ new <- setdiff(new_vars, old_vars)
+ .data <- dplyr::relocate(.data, all_of(new), .before = !!.before, .after = !!.after)
+ }
+
+ # Respect .keep
+ if (.keep == "none") {
+ .data$selected_columns <- .data$selected_columns[new_vars]
+ } else if (.keep != "all") {
+ # "used" or "unused"
+ used_vars <- unlist(lapply(exprs, all.vars), use.names = FALSE)
+ if (.keep == "used") {
+ .data$selected_columns[setdiff(old_vars, used_vars)] <- NULL
+ } else {
+ # "unused"
+ .data$selected_columns[intersect(old_vars, used_vars)] <- NULL
+ }
+ }
+ # Even if "none", we still keep group vars
+ ensure_group_vars(.data)
+}
+mutate.Dataset <- mutate.ArrowTabular <- mutate.arrow_dplyr_query
+
+transmute.arrow_dplyr_query <- function(.data, ...) {
+ dots <- check_transmute_args(...)
+ dplyr::mutate(.data, !!!dots, .keep = "none")
+}
+transmute.Dataset <- transmute.ArrowTabular <- transmute.arrow_dplyr_query
+
+# This function is a copy of dplyr:::check_transmute_args at
+# https://github.com/tidyverse/dplyr/blob/master/R/mutate.R
+check_transmute_args <- function(..., .keep, .before, .after) {
+ if (!missing(.keep)) {
+ abort("`transmute()` does not support the `.keep` argument")
+ }
+ if (!missing(.before)) {
+ abort("`transmute()` does not support the `.before` argument")
+ }
+ if (!missing(.after)) {
+ abort("`transmute()` does not support the `.after` argument")
+ }
+ enquos(...)
+}
+
+ensure_named_exprs <- function(exprs) {
+ # Check for unnamed expressions and fix if any
+ unnamed <- !nzchar(names(exprs))
+ # Deparse and take the first element in case they're long expressions
+ names(exprs)[unnamed] <- map_chr(exprs[unnamed], format_expr)
+ exprs
+}
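+
+# Illustrative sketch (not run): in mutate(.data, x + 1) the unnamed
+# expression gets the deparsed name "x + 1", so the result column is
+# still addressable by name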
diff --git a/src/arrow/r/R/dplyr-select.R b/src/arrow/r/R/dplyr-select.R
new file mode 100644
index 000000000..9a867ced9
--- /dev/null
+++ b/src/arrow/r/R/dplyr-select.R
@@ -0,0 +1,125 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# The following S3 methods are registered on load if dplyr is present
+
+tbl_vars.arrow_dplyr_query <- function(x) names(x$selected_columns)
+
+select.arrow_dplyr_query <- function(.data, ...) {
+ check_select_helpers(enexprs(...))
+ column_select(as_adq(.data), !!!enquos(...))
+}
+select.Dataset <- select.ArrowTabular <- select.arrow_dplyr_query
+
+rename.arrow_dplyr_query <- function(.data, ...) {
+ check_select_helpers(enexprs(...))
+ column_select(as_adq(.data), !!!enquos(...), .FUN = vars_rename)
+}
+rename.Dataset <- rename.ArrowTabular <- rename.arrow_dplyr_query
+
+column_select <- function(.data, ..., .FUN = vars_select) {
+ # .FUN is either tidyselect::vars_select or tidyselect::vars_rename
+ # It operates on the names() of selected_columns, i.e. the column names
+ # factoring in any renaming that may already have happened
+ out <- .FUN(names(.data), !!!enquos(...))
+ # Make sure that the resulting selected columns map back to the original data,
+ # as in when there are multiple renaming steps
+ .data$selected_columns <- set_names(.data$selected_columns[out], names(out))
+
+ # If we've renamed columns, we need to project that renaming into other
+ # query parameters we've collected
+ renamed <- out[names(out) != out]
+ if (length(renamed)) {
+ # Massage group_by
+ gbv <- .data$group_by_vars
+ renamed_groups <- gbv %in% renamed
+ gbv[renamed_groups] <- names(renamed)[match(gbv[renamed_groups], renamed)]
+ .data$group_by_vars <- gbv
+ # No need to massage filters because those contain references to Arrow objects
+ }
+ .data
+}
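+
+# Illustrative sketch (not run): select(ds, b = a) keeps the expression for
+# field "a" in selected_columns under the new name "b", and a group var "a"
+# would be renamed to "b" in group_by_vars by the massaging above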
+
+relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL) {
+ # The code in this function is adapted from the code in dplyr::relocate.data.frame
+ # at https://github.com/tidyverse/dplyr/blob/master/R/relocate.R
+ # TODO: revisit this after https://github.com/tidyverse/dplyr/issues/5829
+
+ .data <- as_adq(.data)
+
+ # Assign the schema to the expressions
+ map(.data$selected_columns, ~ (.$schema <- .data$.data$schema))
+
+ # Create a mask for evaluating expressions in tidyselect helpers
+ mask <- new_environment(.cache$functions, parent = caller_env())
+
+ to_move <- eval_select(substitute(c(...)), .data$selected_columns, mask)
+
+ .before <- enquo(.before)
+ .after <- enquo(.after)
+ has_before <- !quo_is_null(.before)
+ has_after <- !quo_is_null(.after)
+
+ if (has_before && has_after) {
+ abort("Must supply only one of `.before` and `.after`.")
+ } else if (has_before) {
+ where <- min(unname(eval_select(quo_get_expr(.before), .data$selected_columns, mask)))
+ if (!where %in% to_move) {
+ to_move <- c(to_move, where)
+ }
+ } else if (has_after) {
+ where <- max(unname(eval_select(quo_get_expr(.after), .data$selected_columns, mask)))
+ if (!where %in% to_move) {
+ to_move <- c(where, to_move)
+ }
+ } else {
+ where <- 1L
+ if (!where %in% to_move) {
+ to_move <- c(to_move, where)
+ }
+ }
+
+ lhs <- setdiff(seq2(1, where - 1), to_move)
+ rhs <- setdiff(seq2(where + 1, length(.data$selected_columns)), to_move)
+
+ pos <- vec_unique(c(lhs, to_move, rhs))
+ new_names <- names(pos)
+ .data$selected_columns <- .data$selected_columns[pos]
+
+ if (!is.null(new_names)) {
+ names(.data$selected_columns)[new_names != ""] <- new_names[new_names != ""]
+ }
+ .data
+}
+relocate.Dataset <- relocate.ArrowTabular <- relocate.arrow_dplyr_query
+
+check_select_helpers <- function(exprs) {
+ # Throw an error if unsupported tidyselect selection helpers in `exprs`
+ exprs <- lapply(exprs, function(x) if (is_quosure(x)) quo_get_expr(x) else x)
+ unsup_select_helpers <- "where"
+ funs_in_exprs <- unlist(lapply(exprs, all_funs))
+ unsup_funs <- funs_in_exprs[funs_in_exprs %in% unsup_select_helpers]
+ if (length(unsup_funs)) {
+ stop(
+ "Unsupported selection ",
+ ngettext(length(unsup_funs), "helper: ", "helpers: "),
+ oxford_paste(paste0(unsup_funs, "()"), quote = FALSE),
+ call. = FALSE
+ )
+ }
+}
diff --git a/src/arrow/r/R/dplyr-summarize.R b/src/arrow/r/R/dplyr-summarize.R
new file mode 100644
index 000000000..a6b7a3592
--- /dev/null
+++ b/src/arrow/r/R/dplyr-summarize.R
@@ -0,0 +1,289 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# The following S3 methods are registered on load if dplyr is present
+
+summarise.arrow_dplyr_query <- function(.data, ...) {
+ call <- match.call()
+ .data <- as_adq(.data)
+ exprs <- quos(...)
+ # Only retain the columns we need to do our aggregations
+ vars_to_keep <- unique(c(
+ unlist(lapply(exprs, all.vars)), # vars referenced in summarise
+ dplyr::group_vars(.data) # vars needed for grouping
+ ))
+ # If exprs rely on the results of previous exprs
+ # (total = sum(x), mean = total / n())
+ # then not all vars will correspond to columns in the data,
+ # so don't try to select() them (use intersect() to exclude them)
+ # Note that this select() isn't useful for the Arrow summarize implementation
+ # because it will effectively project to keep what it needs anyway,
+ # but the data.frame fallback version does benefit from select here
+ .data <- dplyr::select(.data, intersect(vars_to_keep, names(.data)))
+
+ # Try the Arrow summarize implementation; if it fails, fall back to
+ # pulling the data into R via abandon_ship()
+ out <- try(do_arrow_summarize(.data, ...), silent = TRUE)
+ if (inherits(out, "try-error")) {
+ return(abandon_ship(call, .data, format(out)))
+ } else {
+ return(out)
+ }
+}
+summarise.Dataset <- summarise.ArrowTabular <- summarise.arrow_dplyr_query
+
+# This is the Arrow summarize implementation
+do_arrow_summarize <- function(.data, ..., .groups = NULL) {
+ exprs <- ensure_named_exprs(quos(...))
+
+ # Create a stateful environment for recording our evaluated expressions
+ # It's more complex than other places because a single summarize() expr
+ # may result in multiple query nodes (Aggregate, Project),
+ # and we have to walk through the expressions to disentangle them.
+ ctx <- env(
+ mask = arrow_mask(.data, aggregation = TRUE),
+ aggregations = empty_named_list(),
+ post_mutate = empty_named_list()
+ )
+ for (i in seq_along(exprs)) {
+ # Iterate over the indices and not the names because names may be repeated
+ # (which overwrites the previous name)
+ summarize_eval(
+ names(exprs)[i],
+ exprs[[i]],
+ ctx,
+ length(.data$group_by_vars) > 0
+ )
+ }
+
+ # Apply the results to the .data object.
+ # First, the aggregations
+ .data$aggregations <- ctx$aggregations
+ # Then collapse the query so that the resulting query object can have
+ # additional operations applied to it
+ out <- collapse.arrow_dplyr_query(.data)
+ # The expressions may have been translated into
+ # "first, aggregate, then transform the result further"
+ # nolint start
+ # For example,
+ # summarize(mean = sum(x) / n())
+ # is effectively implemented as
+ # summarize(..temp0 = sum(x), ..temp1 = n()) %>%
+ # mutate(mean = ..temp0 / ..temp1) %>%
+ # select(-starts_with("..temp"))
+ # If this is the case, there will be expressions in post_mutate
+ # nolint end
+ if (length(ctx$post_mutate)) {
+ # Append post_mutate, and make sure order is correct
+ # according to input exprs (also dropping ..temp columns)
+ out$selected_columns <- c(
+ out$selected_columns,
+ ctx$post_mutate
+ )[c(.data$group_by_vars, names(exprs))]
+ }
+
+ # If the object has .drop = FALSE and any group vars are dictionaries,
+ # we can't (currently) preserve the empty rows that dplyr does,
+ # so give a warning about that.
+ if (!dplyr::group_by_drop_default(.data)) {
+ group_by_exprs <- .data$selected_columns[.data$group_by_vars]
+ if (any(map_lgl(group_by_exprs, ~ inherits(.$type(), "DictionaryType")))) {
+ warning(
+ ".drop = FALSE currently not supported in Arrow aggregation",
+ call. = FALSE
+ )
+ }
+ }
+
+ # Handle .groups argument
+ if (length(.data$group_by_vars)) {
+ if (is.null(.groups)) {
+ # dplyr docs say:
+ # When ‘.groups’ is not specified, it is chosen based on the
+ # number of rows of the results:
+ # • If all the results have 1 row, you get "drop_last".
+ # • If the number of rows varies, you get "keep".
+ #
+ # But we don't support anything that returns multiple rows now
+ .groups <- "drop_last"
+ } else {
+ assert_that(is.string(.groups))
+ }
+ if (.groups == "drop_last") {
+ out$group_by_vars <- head(.data$group_by_vars, -1)
+ } else if (.groups == "keep") {
+ out$group_by_vars <- .data$group_by_vars
+ } else if (.groups == "rowwise") {
+ stop(arrow_not_supported('.groups = "rowwise"'))
+ } else if (.groups == "drop") {
+ # collapse() preserves groups so remove them
+ out <- dplyr::ungroup(out)
+ } else {
+ stop(paste("Invalid .groups argument:", .groups))
+ }
+ # TODO: shouldn't we be doing something with `drop_empty_groups` in summarize? (ARROW-14044)
+ out$drop_empty_groups <- .data$drop_empty_groups
+ }
+ out
+}
+
+arrow_eval_or_stop <- function(expr, mask) {
+ # TODO: change arrow_eval error handling behavior?
+ out <- arrow_eval(expr, mask)
+ if (inherits(out, "try-error")) {
+ msg <- handle_arrow_not_supported(out, format_expr(expr))
+ stop(msg, call. = FALSE)
+ }
+ out
+}
+
+summarize_projection <- function(.data) {
+ c(
+ map(.data$aggregations, ~ .$data),
+ .data$selected_columns[.data$group_by_vars]
+ )
+}
+
+format_aggregation <- function(x) {
+ paste0(x$fun, "(", x$data$ToString(), ")")
+}
+
+# This function handles each summarize expression and turns it into the
+# appropriate combination of (1) aggregations (possibly temporary) and
+# (2) post-aggregation transformations (mutate)
+# The function returns nothing: it assigns into the `ctx` environment
+summarize_eval <- function(name, quosure, ctx, hash, recurse = FALSE) {
+ expr <- quo_get_expr(quosure)
+ ctx$quo_env <- quo_get_env(quosure)
+
+ funs_in_expr <- all_funs(expr)
+ if (length(funs_in_expr) == 0) {
+ # If it is a scalar or field ref, no special handling required
+ ctx$aggregations[[name]] <- arrow_eval_or_stop(quosure, ctx$mask)
+ return()
+ }
+
+ # For the quantile() binding in the hash aggregation case, we need to mutate
+ # the list output from the Arrow hash_tdigest kernel to flatten it into a
+ # column of type float64. We do that by modifying the unevaluated expression
+ # to replace quantile(...) with arrow_list_element(quantile(...), 0L)
+ if (hash && "quantile" %in% funs_in_expr) {
+ expr <- wrap_hash_quantile(expr)
+ funs_in_expr <- all_funs(expr)
+ }
+
+ # Start inspecting the expr to see what aggregations it involves
+ agg_funs <- names(agg_funcs)
+ outer_agg <- funs_in_expr[1] %in% agg_funs
+ inner_agg <- funs_in_expr[-1] %in% agg_funs
+
+ # First, pull out any aggregations wrapped in other function calls
+ if (any(inner_agg)) {
+ expr <- extract_aggregations(expr, ctx)
+ }
+
+ # By this point, there are no more aggregation functions in expr
+ # except for possibly the outer function call:
+ # they've all been pulled out to ctx$aggregations, and in their place in expr
+ # there are variable names, which will correspond to field refs in the
+ # query object after aggregation and collapse().
+ # So if we want to know if there are any aggregations inside expr,
+ # we have to look for them by their new var names
+ inner_agg_exprs <- all_vars(expr) %in% names(ctx$aggregations)
+
+ if (outer_agg) {
+ # This is something like agg(fun(x, y))
+ # It just works by normal arrow_eval, unless there's a mix of aggs and
+ # columns in the original data like agg(fun(x, agg(x)))
+ # (but that will have been caught in extract_aggregations())
+ ctx$aggregations[[name]] <- arrow_eval_or_stop(
+ as_quosure(expr, ctx$quo_env),
+ ctx$mask
+ )
+ return()
+ } else if (all(inner_agg_exprs)) {
+ # Something like: fun(agg(x), agg(y))
+ # So based on the aggregations that have been extracted, mutate after
+ mutate_mask <- arrow_mask(
+ list(selected_columns = make_field_refs(names(ctx$aggregations)))
+ )
+ ctx$post_mutate[[name]] <- arrow_eval_or_stop(
+ as_quosure(expr, ctx$quo_env),
+ mutate_mask
+ )
+ return()
+ }
+
+ # Backstop for any other odd cases, like fun(x, y) (i.e. no aggregation),
+ # or aggregation functions that aren't supported in Arrow (not in agg_funcs)
+ stop(
+ handle_arrow_not_supported(quo_get_expr(quosure), format_expr(quosure)),
+ call. = FALSE
+ )
+}
+
+# This function recurses through expr, pulls out any aggregation expressions,
+# and inserts a variable name (field ref) in place of the aggregation
+extract_aggregations <- function(expr, ctx) {
+ # Keep the input in case we need to raise an error message with it
+ original_expr <- expr
+ funs <- all_funs(expr)
+ if (length(funs) == 0) {
+ return(expr)
+ } else if (length(funs) > 1) {
+ # Recurse more
+ expr[-1] <- lapply(expr[-1], extract_aggregations, ctx)
+ }
+ if (funs[1] %in% names(agg_funcs)) {
+ inner_agg_exprs <- all_vars(expr) %in% names(ctx$aggregations)
+ if (any(inner_agg_exprs) && !all(inner_agg_exprs)) {
+ # We can't aggregate over a combination of dataset columns and other
+ # aggregations (e.g. sum(x - mean(x)))
+ # TODO: support in ARROW-13926
+ # TODO: Add "because" arg to explain _why_ it's not supported?
+ # TODO: this message could also say "not supported in summarize()"
+ # since some of these expressions may be legal elsewhere
+ stop(
+ handle_arrow_not_supported(original_expr, format_expr(original_expr)),
+ call. = FALSE
+ )
+ }
+
+ # We have an aggregation expression with no other aggregations inside it,
+ # so arrow_eval the expression on the data and give it a ..temp name prefix,
+ # then insert that name (symbol) back into the expression so that we can
+ # mutate() on the result of the aggregation and reference this field.
+ tmpname <- paste0("..temp", length(ctx$aggregations))
+ ctx$aggregations[[tmpname]] <- arrow_eval_or_stop(as_quosure(expr, ctx$quo_env), ctx$mask)
+ expr <- as.symbol(tmpname)
+ }
+ expr
+}
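+
+# Illustrative sketch (not run): given sum(x) / n(), the recursion replaces
+# sum(x) with ..temp0 and n() with ..temp1 (recording both in
+# ctx$aggregations), leaving ..temp0 / ..temp1 to be handled in post_mutate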
+
+# This function recurses through expr and wraps each call to quantile() with a
+# call to arrow_list_element()
+wrap_hash_quantile <- function(expr) {
+ if (length(expr) == 1) {
+ return(expr)
+ } else {
+ if (is.call(expr) && expr[[1]] == quote(quantile)) {
+ return(str2lang(paste0("arrow_list_element(", deparse1(expr), ", 0L)")))
+ } else {
+ return(as.call(lapply(expr, wrap_hash_quantile)))
+ }
+ }
+}
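+
+# Illustrative sketch (not run):
+#   wrap_hash_quantile(quote(quantile(x, 0.5)))
+# returns the call arrow_list_element(quantile(x, 0.5), 0L)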
diff --git a/src/arrow/r/R/dplyr.R b/src/arrow/r/R/dplyr.R
new file mode 100644
index 000000000..e6f67c066
--- /dev/null
+++ b/src/arrow/r/R/dplyr.R
@@ -0,0 +1,259 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include expression.R
+#' @include record-batch.R
+#' @include table.R
+
+arrow_dplyr_query <- function(.data) {
+ # An arrow_dplyr_query is a container for an Arrow data object (Table,
+ # RecordBatch, or Dataset) and the state of the user's dplyr query--things
+ # like selected columns, filters, and group vars.
+ # An arrow_dplyr_query can contain another arrow_dplyr_query in .data
+ gv <- dplyr::group_vars(.data) %||% character()
+
+ if (!inherits(.data, c("Dataset", "arrow_dplyr_query", "RecordBatchReader"))) {
+ .data <- InMemoryDataset$create(.data)
+ }
+ # Evaluating expressions on a dataset with duplicated fieldnames will error
+ dupes <- duplicated(names(.data))
+ if (any(dupes)) {
+ abort(c(
+ "Duplicated field names",
+ x = paste0(
+ "The following field names were found more than once in the data: ",
+ oxford_paste(names(.data)[dupes])
+ )
+ ))
+ }
+ structure(
+ list(
+ .data = .data,
+ # selected_columns is a named list:
+ # * contents are references/expressions pointing to the data
+ # * names are the names they should be in the end (i.e. this
+ # records any renaming)
+ selected_columns = make_field_refs(names(.data$schema)),
+ # filtered_rows will be an Expression
+ filtered_rows = TRUE,
+ # group_by_vars is a character vector of columns (as renamed)
+ # in the data. They will be kept when data is pulled into R.
+ group_by_vars = gv,
+ # drop_empty_groups is a logical value indicating whether to drop
+ # groups formed by factor levels that don't appear in the data. It
+ # should be non-null only when the data is grouped.
+ drop_empty_groups = NULL,
+ # arrange_vars will be a list of expressions named by their associated
+ # column names
+ arrange_vars = list(),
+ # arrange_desc will be a logical vector indicating the sort order for each
+ # expression in arrange_vars (FALSE for ascending, TRUE for descending)
+ arrange_desc = logical()
+ ),
+ class = "arrow_dplyr_query"
+ )
+}
+
+# The only difference between `arrow_dplyr_query()` and `as_adq()` is that if
+# `.data` is already an `arrow_dplyr_query`, `as_adq()` will return it as is, but
+# `arrow_dplyr_query()` will nest it inside a new `arrow_dplyr_query`. The only
+# place where `arrow_dplyr_query()` should be called directly is inside
+# `collapse()` methods; everywhere else, call `as_adq()`.
+as_adq <- function(.data) {
+ # For most dplyr methods,
+ # method.Table == method.RecordBatch == method.Dataset == method.arrow_dplyr_query
+ # This works because the functions all pass .data through as_adq()
+ if (inherits(.data, "arrow_dplyr_query")) {
+ return(.data)
+ }
+ arrow_dplyr_query(.data)
+}
+
+make_field_refs <- function(field_names) {
+ set_names(lapply(field_names, Expression$field_ref), field_names)
+}
+
+#' @export
+print.arrow_dplyr_query <- function(x, ...) {
+ schm <- x$.data$schema
+ types <- map_chr(x$selected_columns, function(expr) {
+ name <- expr$field_name
+ if (nzchar(name)) {
+ # Just a field_ref, so look up in the schema
+ schm$GetFieldByName(name)$type$ToString()
+ } else {
+ # Expression, so get its type and append the expression
+ paste0(
+ expr$type(schm)$ToString(),
+ " (", expr$ToString(), ")"
+ )
+ }
+ })
+ fields <- paste(names(types), types, sep = ": ", collapse = "\n")
+ cat(class(source_data(x))[1], " (query)\n", sep = "")
+ cat(fields, "\n", sep = "")
+ cat("\n")
+ if (length(x$aggregations)) {
+ cat("* Aggregations:\n")
+ aggs <- paste0(names(x$aggregations), ": ", map_chr(x$aggregations, format_aggregation), collapse = "\n")
+ cat(aggs, "\n", sep = "")
+ }
+ if (!isTRUE(x$filtered_rows)) {
+ filter_string <- x$filtered_rows$ToString()
+ cat("* Filter: ", filter_string, "\n", sep = "")
+ }
+ if (length(x$group_by_vars)) {
+ cat("* Grouped by ", paste(x$group_by_vars, collapse = ", "), "\n", sep = "")
+ }
+ if (length(x$arrange_vars)) {
+ arrange_strings <- map_chr(x$arrange_vars, function(x) x$ToString())
+ cat(
+ "* Sorted by ",
+ paste(
+ paste0(
+ arrange_strings,
+ " [", ifelse(x$arrange_desc, "desc", "asc"), "]"
+ ),
+ collapse = ", "
+ ),
+ "\n",
+ sep = ""
+ )
+ }
+ cat("See $.data for the source Arrow object\n")
+ invisible(x)
+}
+
+# These are the names reflecting all select/rename, not what is in Arrow
+#' @export
+names.arrow_dplyr_query <- function(x) names(x$selected_columns)
+
+#' @export
+dim.arrow_dplyr_query <- function(x) {
+ cols <- length(names(x))
+
+ if (is_collapsed(x)) {
+ # Don't evaluate just for nrow
+ rows <- NA_integer_
+ } else if (isTRUE(x$filtered_rows)) {
+ rows <- x$.data$num_rows
+ } else {
+ rows <- Scanner$create(x)$CountRows()
+ }
+ c(rows, cols)
+}
+
+#' @export
+as.data.frame.arrow_dplyr_query <- function(x, row.names = NULL, optional = FALSE, ...) {
+ collect.arrow_dplyr_query(x, as_data_frame = TRUE, ...)
+}
+
+#' @export
+head.arrow_dplyr_query <- function(x, n = 6L, ...) {
+ x$head <- n
+ collapse.arrow_dplyr_query(x)
+}
+
+#' @export
+tail.arrow_dplyr_query <- function(x, n = 6L, ...) {
+ x$tail <- n
+ collapse.arrow_dplyr_query(x)
+}
+
+#' @export
+`[.arrow_dplyr_query` <- function(x, i, j, ..., drop = FALSE) {
+ x <- ensure_group_vars(x)
+ if (nargs() == 2L) {
+ # List-like column extraction (x[i])
+ return(x[, i])
+ }
+ if (!missing(j)) {
+ x <- select.arrow_dplyr_query(x, all_of(j))
+ }
+
+ if (!missing(i)) {
+ out <- take_dataset_rows(x, i)
+ x <- restore_dplyr_features(out, x)
+ }
+ x
+}
+
+ensure_group_vars <- function(x) {
+ if (inherits(x, "arrow_dplyr_query")) {
+ # Before pulling data from Arrow, make sure all group vars are in the projection
+ gv <- set_names(setdiff(dplyr::group_vars(x), names(x)))
+ if (length(gv)) {
+ # Add them back
+ x$selected_columns <- c(
+ x$selected_columns,
+ make_field_refs(gv)
+ )
+ }
+ }
+ x
+}
+
+ensure_arrange_vars <- function(x) {
+ # The arrange() operation is not performed until later, because:
+ # - It must be performed after mutate(), to enable sorting by new columns.
+ # - It should be performed after filter() and select(), for efficiency.
+ # However, we need users to be able to arrange() by columns and expressions
+ # that are *not* returned in the query result. To enable this, we must
+ # *temporarily* include these columns and expressions in the projection. We
+ # use x$temp_columns to store these. Later, after the arrange() operation has
+ # been performed, these are omitted from the result. This differs from the
+ # columns in x$group_by_vars which *are* returned in the result.
+ x$temp_columns <- x$arrange_vars[!names(x$arrange_vars) %in% names(x$selected_columns)]
+ x
+}
+
+# Helper to handle unsupported dplyr features
+# * For Table/RecordBatch, we collect() and then call the dplyr method in R
+# * For Dataset, we just error
+abandon_ship <- function(call, .data, msg) {
+ msg <- trimws(msg)
+ dplyr_fun_name <- sub("^(.*?)\\..*", "\\1", as.character(call[[1]]))
+ if (query_on_dataset(.data)) {
+ stop(msg, "\nCall collect() first to pull data into R.", call. = FALSE)
+ }
+ # else, collect and call dplyr method
+ warning(msg, "; pulling data into R", immediate. = TRUE, call. = FALSE)
+ call$.data <- dplyr::collect(.data)
+ call[[1]] <- get(dplyr_fun_name, envir = asNamespace("dplyr"))
+ eval.parent(call, 2)
+}
+
+query_on_dataset <- function(x) !inherits(source_data(x), "InMemoryDataset")
+
+source_data <- function(x) {
+ if (is_collapsed(x)) {
+ source_data(x$.data)
+ } else {
+ x$.data
+ }
+}
+
+is_collapsed <- function(x) inherits(x$.data, "arrow_dplyr_query")
+
+has_aggregation <- function(x) {
+ # TODO: update with joins (check right side data too)
+ !is.null(x$aggregations) || (is_collapsed(x) && has_aggregation(x$.data))
+}
+
+has_head_tail <- function(x) {
+ !is.null(x$head) || !is.null(x$tail) || (is_collapsed(x) && has_head_tail(x$.data))
+}
diff --git a/src/arrow/r/R/duckdb.R b/src/arrow/r/R/duckdb.R
new file mode 100644
index 000000000..c772d4fbd
--- /dev/null
+++ b/src/arrow/r/R/duckdb.R
@@ -0,0 +1,165 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Create a (virtual) DuckDB table from an Arrow object
+#'
+#' This will do the necessary configuration to create a (virtual) table in DuckDB
+#' that is backed by the Arrow object given. No data is copied or modified until
+#' `collect()` or `compute()` are called or a query is run against the table.
+#'
+#' The result is a dbplyr-compatible object that can be used in d(b)plyr pipelines.
+#'
+#' If `auto_disconnect = TRUE`, the DuckDB table that is created will be configured
+#' to be unregistered when the `tbl` object is garbage collected. This is helpful
+#' if you don't want to have extra table objects in DuckDB after you've finished
+#' using them. Currently, however, this cleanup can sometimes lead to hangs if
+#' tables are created and deleted in quick succession, hence the default value
+#' of `FALSE`.
+#'
+#' @param .data the Arrow object (e.g. Dataset, Table) to use for the DuckDB table
+#' @param con a DuckDB connection to use (default will create one and store it
+#' in `options("arrow_duck_con")`)
+#' @param table_name a name to use in DuckDB for this object. The default is a
+#' unique string `"arrow_"` followed by numbers.
+#' @param auto_disconnect should the table be automatically cleaned up when the
+#' resulting object is removed (and garbage collected)? Default: `FALSE`
+#'
+#' @return A `tbl` of the new table in DuckDB
+#'
+#' @name to_duckdb
+#' @export
+#' @examplesIf getFromNamespace("run_duckdb_examples", "arrow")()
+#' library(dplyr)
+#'
+#' ds <- InMemoryDataset$create(mtcars)
+#'
+#' ds %>%
+#' filter(mpg < 30) %>%
+#' to_duckdb() %>%
+#' group_by(cyl) %>%
+#' summarize(mean_mpg = mean(mpg, na.rm = TRUE))
+to_duckdb <- function(.data,
+ con = arrow_duck_connection(),
+ table_name = unique_arrow_tablename(),
+ auto_disconnect = FALSE) {
+ .data <- as_adq(.data)
+ duckdb::duckdb_register_arrow(con, table_name, .data)
+
+ tbl <- tbl(con, table_name)
+ groups <- dplyr::groups(.data)
+ if (length(groups)) {
+ # groups is a list of symbols, so splice it into group_by()
+ tbl <- dplyr::group_by(tbl, !!!groups)
+ }
+
+ if (auto_disconnect) {
+ # this will disconnect/unregister the table when the tbl is garbage collected.
+ # we should probably confirm that this use of src$disco is kosher.
+ tbl$src$disco <- duckdb_disconnector(con, table_name)
+ }
+
+ tbl
+}
+
+arrow_duck_connection <- function() {
+ con <- getOption("arrow_duck_con")
+ if (is.null(con) || !DBI::dbIsValid(con)) {
+ con <- DBI::dbConnect(duckdb::duckdb())
+ # Use the same CPU count that the arrow library is set to
+ DBI::dbExecute(con, paste0("PRAGMA threads=", cpu_count()))
+ options(arrow_duck_con = con)
+ }
+ con
+}
+
+# helper function to determine if duckdb examples should run
+# see: https://github.com/r-lib/roxygen2/issues/1242
+run_duckdb_examples <- function() {
+ arrow_with_dataset() &&
+ requireNamespace("duckdb", quietly = TRUE) &&
+ packageVersion("duckdb") > "0.2.7" &&
+ requireNamespace("dplyr", quietly = TRUE) &&
+ requireNamespace("dbplyr", quietly = TRUE)
+}
+
+# Adapted from dbplyr
+unique_arrow_tablename <- function() {
+ i <- getOption("arrow_table_name", 0) + 1
+ options(arrow_table_name = i)
+ sprintf("arrow_%03i", i)
+}
+
+# Creates an environment that disconnects the database when it's GC'd
+duckdb_disconnector <- function(con, tbl_name) {
+ reg.finalizer(environment(), function(...) {
+ # remove the table we ephemerally created (though only if the connection is
+ # still valid)
+ if (DBI::dbIsValid(con)) {
+ duckdb::duckdb_unregister_arrow(con, tbl_name)
+ }
+
+ # and if there are no more tables left, we can safely shut down the connection
+ if (length(DBI::dbListTables(con)) == 0) {
+ DBI::dbDisconnect(con, shutdown = TRUE)
+ }
+ })
+ environment()
+}
+
+#' Create an Arrow object from others
+#'
+#' This can be used in pipelines that pass data back and forth between Arrow and
+#' other processes (like DuckDB).
+#'
+#' @param .data the object to be converted
+#'
+#' @return an `arrow_dplyr_query` object, to be used in dplyr pipelines.
+#' @export
+#'
+#' @examplesIf getFromNamespace("run_duckdb_examples", "arrow")()
+#' library(dplyr)
+#'
+#' ds <- InMemoryDataset$create(mtcars)
+#'
+#' ds %>%
+#' filter(mpg < 30) %>%
+#' to_duckdb() %>%
+#' group_by(cyl) %>%
+#' summarize(mean_mpg = mean(mpg, na.rm = TRUE)) %>%
+#' to_arrow() %>%
+#' collect()
+to_arrow <- function(.data) {
+ # If this is an Arrow object already, return quickly since we're already Arrow
+ if (inherits(.data, c("arrow_dplyr_query", "ArrowObject"))) {
+ return(.data)
+ }
+
+ # For now, we only handle .data from duckdb, so check that it is that if we've
+ # gotten this far
+ if (!inherits(dbplyr::remote_con(.data), "duckdb_connection")) {
+ stop(
+ "to_arrow() currently only supports Arrow tables, Arrow datasets, ",
+ "Arrow queries, or dbplyr tbls from duckdb connections",
+ call. = FALSE
+ )
+ }
+
+ # Run the query
+ res <- DBI::dbSendQuery(dbplyr::remote_con(.data), dbplyr::remote_query(.data), arrow = TRUE)
+
+ # TODO: we shouldn't need $read_table(), but we get segfaults without it.
+ arrow_dplyr_query(duckdb::duckdb_fetch_record_batch(res)$read_table())
+}
diff --git a/src/arrow/r/R/enums.R b/src/arrow/r/R/enums.R
new file mode 100644
index 000000000..4e69b7a19
--- /dev/null
+++ b/src/arrow/r/R/enums.R
@@ -0,0 +1,178 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @export
+`print.arrow-enum` <- function(x, ...) {
+ NextMethod()
+}
+
+enum <- function(class, ..., .list = list(...)) {
+ structure(
+ .list,
+ class = c(class, "arrow-enum")
+ )
+}
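+
+# Enum values are plain named integer lists, so (illustrative sketch) members
+# can be read with `$` or `[[`:
+#   TimeUnit$MICRO # 2L
+#   Type[["INT32"]] # 7L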
+
+#' Arrow enums
+#' @name enums
+#' @export
+#' @keywords internal
+TimeUnit <- enum("TimeUnit::type",
+ SECOND = 0L, MILLI = 1L, MICRO = 2L, NANO = 3L
+)
+
+#' @rdname enums
+#' @export
+DateUnit <- enum("DateUnit", DAY = 0L, MILLI = 1L)
+
+#' @rdname enums
+#' @export
+Type <- enum("Type::type",
+ "NA" = 0L,
+ BOOL = 1L,
+ UINT8 = 2L,
+ INT8 = 3L,
+ UINT16 = 4L,
+ INT16 = 5L,
+ UINT32 = 6L,
+ INT32 = 7L,
+ UINT64 = 8L,
+ INT64 = 9L,
+ HALF_FLOAT = 10L,
+ FLOAT = 11L,
+ DOUBLE = 12L,
+ STRING = 13L,
+ BINARY = 14L,
+ FIXED_SIZE_BINARY = 15L,
+ DATE32 = 16L,
+ DATE64 = 17L,
+ TIMESTAMP = 18L,
+ TIME32 = 19L,
+ TIME64 = 20L,
+ INTERVAL_MONTHS = 21L,
+ INTERVAL_DAY_TIME = 22L,
+ DECIMAL = 23L,
+ DECIMAL256 = 24L,
+ LIST = 25L,
+ STRUCT = 26L,
+ SPARSE_UNION = 27L,
+ DENSE_UNION = 28L,
+ DICTIONARY = 29L,
+ MAP = 30L,
+ EXTENSION = 31L,
+ FIXED_SIZE_LIST = 32L,
+ DURATION = 33L,
+ LARGE_STRING = 34L,
+ LARGE_BINARY = 35L,
+ LARGE_LIST = 36L
+)
+
+TYPES_WITH_NAN <- Type[c("HALF_FLOAT", "FLOAT", "DOUBLE")]
+
+#' @rdname enums
+#' @export
+StatusCode <- enum("StatusCode",
+ OK = 0L, OutOfMemory = 1L, KeyError = 2L, TypeError = 3L,
+ Invalid = 4L, IOError = 5L, CapacityError = 6L, IndexError = 7L,
+ UnknownError = 9L, NotImplemented = 10L, SerializationError = 11L,
+ PythonError = 12L, RError = 13L,
+ PlasmaObjectExists = 20L, PlasmaObjectNotFound = 21L,
+ PlasmaStoreFull = 22L, PlasmaObjectAlreadySealed = 23L
+)
+
+#' @rdname enums
+#' @export
+FileMode <- enum("FileMode",
+ READ = 0L, WRITE = 1L, READWRITE = 2L
+)
+
+#' @rdname enums
+#' @export
+MessageType <- enum("MessageType",
+ NONE = 0L, SCHEMA = 1L, DICTIONARY_BATCH = 2L, RECORD_BATCH = 3L, TENSOR = 4L
+)
+
+#' @rdname enums
+#' @export
+CompressionType <- enum("Compression::type",
+ UNCOMPRESSED = 0L, SNAPPY = 1L, GZIP = 2L, BROTLI = 3L, ZSTD = 4L, LZ4 = 5L,
+ LZ4_FRAME = 6L, LZO = 7L, BZ2 = 8L
+)
+
+#' @export
+#' @rdname enums
+FileType <- enum("FileType",
+ NotFound = 0L, Unknown = 1L, File = 2L, Directory = 3L
+)
+
+#' @export
+#' @rdname enums
+ParquetVersionType <- enum("ParquetVersionType",
+ PARQUET_1_0 = 0L, PARQUET_2_0 = 1L
+)
+
+#' @export
+#' @rdname enums
+MetadataVersion <- enum("MetadataVersion",
+ V1 = 0L, V2 = 1L, V3 = 2L, V4 = 3L, V5 = 4L
+)
+
+#' @export
+#' @rdname enums
+QuantileInterpolation <- enum("QuantileInterpolation",
+ LINEAR = 0L, LOWER = 1L, HIGHER = 2L, NEAREST = 3L, MIDPOINT = 4L
+)
+
+#' @export
+#' @rdname enums
+NullEncodingBehavior <- enum("NullEncodingBehavior",
+ ENCODE = 0L, MASK = 1L
+)
+
+#' @export
+#' @rdname enums
+NullHandlingBehavior <- enum("NullHandlingBehavior",
+ EMIT_NULL = 0L, SKIP = 1L, REPLACE = 2L
+)
+
+#' @export
+#' @rdname enums
+RoundMode <- enum("RoundMode",
+ DOWN = 0L,
+ UP = 1L,
+ TOWARDS_ZERO = 2L,
+ TOWARDS_INFINITY = 3L,
+ HALF_DOWN = 4L,
+ HALF_UP = 5L,
+ HALF_TOWARDS_ZERO = 6L,
+ HALF_TOWARDS_INFINITY = 7L,
+ HALF_TO_EVEN = 8L,
+ HALF_TO_ODD = 9L
+)
+
+#' @export
+#' @rdname enums
+JoinType <- enum("JoinType",
+ LEFT_SEMI = 0L,
+ RIGHT_SEMI = 1L,
+ LEFT_ANTI = 2L,
+ RIGHT_ANTI = 3L,
+ INNER = 4L,
+ LEFT_OUTER = 5L,
+ RIGHT_OUTER = 6L,
+ FULL_OUTER = 7L
+)
diff --git a/src/arrow/r/R/expression.R b/src/arrow/r/R/expression.R
new file mode 100644
index 000000000..b1b6635f5
--- /dev/null
+++ b/src/arrow/r/R/expression.R
@@ -0,0 +1,240 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrowExports.R
+
+.unary_function_map <- list(
+ # NOTE: Each of the R functions mapped here takes exactly *one* argument, maps
+ # *directly* to an Arrow C++ compute kernel, and does not require any
+ # non-default options to be specified. More complex R function mappings are
+ # defined in dplyr-functions.R.
+
+ # functions are arranged alphabetically by name within categories
+
+ # arithmetic functions
+ "abs" = "abs_checked",
+ "ceiling" = "ceil",
+ "floor" = "floor",
+ "log10" = "log10_checked",
+ "log1p" = "log1p_checked",
+ "log2" = "log2_checked",
+ "sign" = "sign",
+ # trunc is defined in dplyr-functions.R
+
+ # trigonometric functions
+ "acos" = "acos_checked",
+ "asin" = "asin_checked",
+ "cos" = "cos_checked",
+ "sin" = "sin_checked",
+ "tan" = "tan_checked",
+
+ # logical functions
+ "!" = "invert",
+
+ # string functions
+ # nchar is defined in dplyr-functions.R
+ "str_length" = "utf8_length",
+ # str_pad is defined in dplyr-functions.R
+ # str_sub is defined in dplyr-functions.R
+ # str_to_lower is defined in dplyr-functions.R
+ # str_to_title is defined in dplyr-functions.R
+ # str_to_upper is defined in dplyr-functions.R
+ # str_trim is defined in dplyr-functions.R
+ "stri_reverse" = "utf8_reverse",
+ # substr is defined in dplyr-functions.R
+ # substring is defined in dplyr-functions.R
+ "tolower" = "utf8_lower",
+ "toupper" = "utf8_upper",
+
+ # date and time functions
+ "day" = "day",
+ "hour" = "hour",
+ "isoweek" = "iso_week",
+ "epiweek" = "us_week",
+ "isoyear" = "iso_year",
+ "minute" = "minute",
+ "month" = "month",
+ "quarter" = "quarter",
+ # second is defined in dplyr-functions.R
+ # wday is defined in dplyr-functions.R
+ "yday" = "day_of_year",
+ "year" = "year",
+
+ # type conversion functions
+ "as.factor" = "dictionary_encode"
+)
+
+.binary_function_map <- list(
+ # NOTE: Each of the R functions/operators mapped here takes exactly *two*
+ # arguments. Most map *directly* to an Arrow C++ compute kernel and require no
+ # non-default options, but some are modified by build_expr(). More complex R
+ # function/operator mappings are defined in dplyr-functions.R.
+ "==" = "equal",
+ "!=" = "not_equal",
+ ">" = "greater",
+ ">=" = "greater_equal",
+ "<" = "less",
+ "<=" = "less_equal",
+ "&" = "and_kleene",
+ "|" = "or_kleene",
+ "+" = "add_checked",
+ "-" = "subtract_checked",
+ "*" = "multiply_checked",
+ "/" = "divide",
+ "%/%" = "divide_checked",
+ # `%%` never actually dispatches to divide_checked: build_expr() rewrites it
+ # in terms of `%/%` (mapped above)
+ "%%" = "divide_checked",
+ "^" = "power_checked",
+ "%in%" = "is_in_meta_binary"
+)
+
+.array_function_map <- c(.unary_function_map, .binary_function_map)
+
+#' Arrow expressions
+#'
+#' @description
+#' `Expression`s are used to define filter logic for passing to a [Dataset]
+#' [Scanner].
+#'
+#' `Expression$scalar(x)` constructs an `Expression` which always evaluates to
+#' the provided scalar (length-1) R value.
+#'
+#' `Expression$field_ref(name)` is used to construct an `Expression` which
+#' evaluates to the named column in the `Dataset` against which it is evaluated.
+#'
+#' `Expression$create(function_name, ..., options)` builds a function-call
+#' `Expression` containing one or more `Expression`s.
+#' @name Expression
+#' @rdname Expression
+#' @export
+Expression <- R6Class("Expression",
+ inherit = ArrowObject,
+ public = list(
+ ToString = function() compute___expr__ToString(self),
+ Equals = function(other, ...) {
+ inherits(other, "Expression") && compute___expr__equals(self, other)
+ },
+ # TODO: Implement type determination without storing
+ # schemas in Expression objects (ARROW-13186)
+ schema = NULL,
+ type = function(schema = self$schema) {
+ assert_that(!is.null(schema))
+ compute___expr__type(self, schema)
+ },
+ type_id = function(schema = self$schema) {
+ assert_that(!is.null(schema))
+ compute___expr__type_id(self, schema)
+ },
+ cast = function(to_type, safe = TRUE, ...) {
+ opts <- list(
+ to_type = to_type,
+ allow_int_overflow = !safe,
+ allow_time_truncate = !safe,
+ allow_float_truncate = !safe
+ )
+ Expression$create("cast", self, options = modifyList(opts, list(...)))
+ }
+ ),
+ active = list(
+ field_name = function() compute___expr__get_field_ref_name(self)
+ )
+)
+Expression$create <- function(function_name,
+ ...,
+ args = list(...),
+ options = empty_named_list()) {
+ assert_that(is.string(function_name))
+ assert_that(is_list_of(args, "Expression"), msg = "Expression arguments must be Expression objects")
+ expr <- compute___expr__call(function_name, args, options)
+ expr$schema <- unify_schemas(schemas = lapply(args, function(x) x$schema))
+ expr
+}
+
+Expression$field_ref <- function(name) {
+ assert_that(is.string(name))
+ compute___expr__field_ref(name)
+}
+Expression$scalar <- function(x) {
+ expr <- compute___expr__scalar(Scalar$create(x))
+ expr$schema <- schema()
+ expr
+}
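+
+# Composition sketch (field name and value are hypothetical): comparison and
+# arithmetic operators on Expressions dispatch through Ops.Expression (defined
+# below) and build function-call Expressions:
+#   x <- Expression$field_ref("mpg")
+#   e <- x > 20 # a "greater" call wrapping a field_ref and a scalar
+#   e$ToString()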
+
+# Wrapper around Expression$create that:
+# (1) maps R function names to Arrow C++ compute ("/" --> "divide_checked")
+# (2) wraps R input args as Array or Scalar
+build_expr <- function(FUN,
+ ...,
+ args = list(...),
+ options = empty_named_list()) {
+ if (FUN == "-" && length(args) == 1L) {
+ if (inherits(args[[1]], c("ArrowObject", "Expression"))) {
+ return(build_expr("negate_checked", args[[1]]))
+ } else {
+ return(-args[[1]])
+ }
+ }
+ if (FUN == "%in%") {
+ # Special-case %in%, which is different from the Array function name
+ expr <- Expression$create("is_in", args[[1]],
+ options = list(
+ # If args[[2]] is already an Arrow object (like a scalar),
+ # this wouldn't work
+ value_set = Array$create(args[[2]]),
+ skip_nulls = TRUE
+ )
+ )
+ } else {
+ args <- lapply(args, function(x) {
+ if (!inherits(x, "Expression")) {
+ x <- Expression$scalar(x)
+ }
+ x
+ })
+
+ # In Arrow, "divide" is one function, which does integer division on
+ # integer inputs and floating-point division on floats
+ if (FUN == "/") {
+ # TODO: omg so many ways it's wrong to assume these types
+ args <- lapply(args, function(x) x$cast(float64()))
+ } else if (FUN == "%/%") {
+ # In R, integer division works like floor(float division)
+ out <- build_expr("/", args = args)
+ return(out$cast(int32(), allow_float_truncate = TRUE))
+ } else if (FUN == "%%") {
+ return(args[[1]] - args[[2]] * (args[[1]] %/% args[[2]]))
+ }
+
+ expr <- Expression$create(.array_function_map[[FUN]] %||% FUN, args = args, options = options)
+ }
+ expr
+}
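+
+# Worked sketch of the rewrites above (arguments are illustrative): `%/%` is
+# float division cast to int32 with allow_float_truncate, and `%%` expands to
+# a - b * (a %/% b), matching R's modulo semantics:
+#   build_expr("%%", Expression$field_ref("x"), Expression$scalar(7L))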
+
+#' @export
+Ops.Expression <- function(e1, e2) {
+ if (.Generic == "!") {
+ build_expr(.Generic, e1)
+ } else {
+ build_expr(.Generic, e1, e2)
+ }
+}
+
+#' @export
+is.na.Expression <- function(x) {
+ Expression$create("is_null", x, options = list(nan_is_null = TRUE))
+}
diff --git a/src/arrow/r/R/feather.R b/src/arrow/r/R/feather.R
new file mode 100644
index 000000000..70a270bbe
--- /dev/null
+++ b/src/arrow/r/R/feather.R
@@ -0,0 +1,219 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Write data in the Feather format
+#'
+#' Feather provides binary columnar serialization for data frames.
+#' It is designed to make reading and writing data frames efficient,
+#' and to make sharing data across data analysis languages easy.
+#' This function writes both the original, limited specification of the format
+#' and the version 2 specification, which is the Apache Arrow IPC file format.
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param version integer Feather file version. Version 2 is the current.
+#' Version 1 is the more limited legacy format.
+#' @param chunk_size For V2 files, the number of rows that each chunk of data
+#' should have in the file. Use a smaller `chunk_size` when you need faster
+#' random row access. Default is 64K. This option is not supported for V1.
+#' @param compression Name of compression codec to use, if any. Default is
+#' "lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise
+#' "uncompressed". "zstd" is the other available codec and generally has better
+#' compression ratios in exchange for slower read and write performance.
+#' See [codec_is_available()]. This option is not supported for V1.
+#' @param compression_level If `compression` is "zstd", you may
+#' specify an integer compression level. If omitted, the compression codec's
+#' default compression level is used.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @seealso [RecordBatchWriter] for lower-level access to writing Arrow IPC data.
+#' @seealso [Schema] for information about schemas and metadata handling.
+#' @examplesIf arrow_available()
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_feather(mtcars, tf)
+#' @include arrow-package.R
+write_feather <- function(x,
+ sink,
+ version = 2,
+ chunk_size = 65536L,
+ compression = c("default", "lz4", "uncompressed", "zstd"),
+ compression_level = NULL) {
+ # Handle and validate options before touching data
+ version <- as.integer(version)
+ assert_that(version %in% 1:2)
+ compression <- match.arg(compression)
+ chunk_size <- as.integer(chunk_size)
+ assert_that(chunk_size > 0)
+ if (compression == "default") {
+ if (version == 2 && codec_is_available("lz4")) {
+ compression <- "lz4"
+ } else {
+ compression <- "uncompressed"
+ }
+ }
+ if (is.null(compression_level)) {
+ # Use -1 as sentinel for "default"
+ compression_level <- -1L
+ }
+ compression_level <- as.integer(compression_level)
+ # Now make sure that options make sense together
+ if (version == 1) {
+ if (chunk_size != 65536L) {
+ stop("Feather version 1 does not support the 'chunk_size' option", call. = FALSE)
+ }
+ if (compression != "uncompressed") {
+ stop("Feather version 1 does not support the 'compression' option", call. = FALSE)
+ }
+ if (compression_level != -1L) {
+ stop("Feather version 1 does not support the 'compression_level' option", call. = FALSE)
+ }
+ }
+ if (compression != "zstd" && compression_level != -1L) {
+ stop("Can only specify a 'compression_level' when 'compression' is 'zstd'", call. = FALSE)
+ }
+ # Finally, add 1 to version because 2 means V1 and 3 means V2 :shrug:
+ version <- version + 1L
+
+ # "lz4" is the convenience name for the "lz4_frame" codec
+ if (compression == "lz4") {
+ compression <- "lz4_frame"
+ }
+
+ compression <- compression_from_name(compression)
+
+ x_out <- x
+ if (is.data.frame(x) || inherits(x, "RecordBatch")) {
+ x <- Table$create(x)
+ }
+
+ assert_that(is_writable_table(x))
+
+ if (!inherits(sink, "OutputStream")) {
+ sink <- make_output_stream(sink)
+ on.exit(sink$close())
+ }
+ ipc___WriteFeather__Table(sink, x, version, chunk_size, compression, compression_level)
+ invisible(x_out)
+}
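+
+# Illustrative calls (tf is a hypothetical file path; "zstd" support depends
+# on the C++ build, see codec_is_available()):
+#   write_feather(mtcars, tf, compression = "zstd", compression_level = 3)
+#   write_feather(mtcars, tf, version = 1) # legacy V1, always uncompressed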
+
+#' Read a Feather file
+#'
+#' Feather provides binary columnar serialization for data frames.
+#' It is designed to make reading and writing data frames efficient,
+#' and to make sharing data across data analysis languages easy.
+#' This function reads both the original, limited specification of the format
+#' and the version 2 specification, which is the Apache Arrow IPC file format.
+#'
+#' @inheritParams read_ipc_stream
+#' @inheritParams read_delim_arrow
+#' @param ... additional parameters, passed to [make_readable_file()].
+#'
+#' @return A `data.frame` if `as_data_frame` is `TRUE` (the default), or an
+#' Arrow [Table] otherwise
+#'
+#' @export
+#' @seealso [FeatherReader] and [RecordBatchReader] for lower-level access to reading Arrow IPC data.
+#' @examplesIf arrow_available()
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_feather(mtcars, tf)
+#' df <- read_feather(tf)
+#' dim(df)
+#' # Can select columns
+#' df <- read_feather(tf, col_select = starts_with("d"))
+read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) {
+ if (!inherits(file, "RandomAccessFile")) {
+ file <- make_readable_file(file, ...)
+ on.exit(file$close())
+ }
+ reader <- FeatherReader$create(file)
+
+ col_select <- enquo(col_select)
+ columns <- if (!quo_is_null(col_select)) {
+ vars_select(names(reader), !!col_select)
+ }
+
+ out <- tryCatch(
+ reader$Read(columns),
+ error = read_compressed_error
+ )
+
+ if (isTRUE(as_data_frame)) {
+ out <- as.data.frame(out)
+ }
+ out
+}
+
+#' @title FeatherReader class
+#' @rdname FeatherReader
+#' @name FeatherReader
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description This class enables you to interact with Feather files. Create
+#' one to connect to a file or other InputStream, and call `Read()` on it to
+#' make an `arrow::Table`. See its usage in [`read_feather()`].
+#'
+#' @section Factory:
+#'
+#' The `FeatherReader$create()` factory method instantiates the object and
+#' takes the following argument:
+#'
+#' - `file` an Arrow file connection object inheriting from `RandomAccessFile`.
+#'
+#' @section Methods:
+#'
+#' - `$Read(columns)`: Returns a `Table` of the selected columns; `columns` is
+#' a vector of integer column indices
+#' - `$column_names`: Active binding, returns the column names in the Feather file
+#' - `$schema`: Active binding, returns the schema of the Feather file
+#' - `$version`: Active binding, returns `1` or `2`, according to the Feather
+#' file version
+#'
+#' @export
+#' @include arrow-package.R
+FeatherReader <- R6Class("FeatherReader",
+ inherit = ArrowObject,
+ public = list(
+ Read = function(columns) {
+ ipc___feather___Reader__Read(self, columns)
+ },
+ print = function(...) {
+ cat("FeatherReader:\n")
+ print(self$schema)
+ invisible(self)
+ }
+ ),
+ active = list(
+ # versions are officially 2 for V1 and 3 for V2 :shrug:
+ version = function() ipc___feather___Reader__version(self) - 1L,
+ column_names = function() names(self$schema),
+ schema = function() ipc___feather___Reader__schema(self)
+ )
+)
+
+#' @export
+names.FeatherReader <- function(x) x$column_names
+
+FeatherReader$create <- function(file) {
+ assert_is(file, "RandomAccessFile")
+ ipc___feather___Reader__Open(file)
+}
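+
+# Low-level reading sketch (file path hypothetical): make_readable_file() is
+# the same helper read_feather() uses to open a RandomAccessFile:
+#   f <- make_readable_file("data.feather")
+#   reader <- FeatherReader$create(f)
+#   reader$version # 1 or 2
+#   tab <- reader$Read(NULL) # NULL selects all columns, as in read_feather()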
diff --git a/src/arrow/r/R/field.R b/src/arrow/r/R/field.R
new file mode 100644
index 000000000..d10ee7818
--- /dev/null
+++ b/src/arrow/r/R/field.R
@@ -0,0 +1,84 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-package.R
+#' @title Field class
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @description `field()` lets you create an `arrow::Field` that maps a
+#' [DataType][data-type] to a column name. Fields are contained in
+#' [Schemas][Schema].
+#' @section Methods:
+#'
+#' - `f$ToString()`: convert to a string
+#' - `f$Equals(other)`: test for equality. More naturally called as `f == other`
+#'
+#' @rdname Field
+#' @name Field
+#' @export
+Field <- R6Class("Field",
+ inherit = ArrowObject,
+ public = list(
+ ToString = function() {
+ prettier_dictionary_type(Field__ToString(self))
+ },
+ Equals = function(other, ...) {
+ inherits(other, "Field") && Field__Equals(self, other)
+ },
+ export_to_c = function(ptr) ExportField(self, ptr)
+ ),
+ active = list(
+ name = function() {
+ Field__name(self)
+ },
+ nullable = function() {
+ Field__nullable(self)
+ },
+ type = function() {
+ Field__type(self)
+ }
+ )
+)
+Field$create <- function(name, type, metadata, nullable = TRUE) {
+ assert_that(inherits(name, "character"), length(name) == 1L)
+ type <- as_type(type, name)
+ assert_that(missing(metadata), msg = "metadata= is currently ignored")
+ Field__initialize(enc2utf8(name), type, nullable)
+}
+#' @include arrowExports.R
+Field$import_from_c <- ImportField
+
+#' @param name field name
+#' @param type logical type, instance of [DataType]
+#' @param metadata currently ignored
+#' @param nullable TRUE if field is nullable
+#'
+#' @examplesIf arrow_available()
+#' field("x", int32())
+#' @rdname Field
+#' @export
+field <- Field$create
+
+.fields <- function(.list, nullable = TRUE) {
+ if (length(.list)) {
+ assert_that(!is.null(nms <- names(.list)))
+ map2(nms, .list, field)
+ } else {
+ list()
+ }
+}
diff --git a/src/arrow/r/R/filesystem.R b/src/arrow/r/R/filesystem.R
new file mode 100644
index 000000000..a09d0a51d
--- /dev/null
+++ b/src/arrow/r/R/filesystem.R
@@ -0,0 +1,505 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-package.R
+#' @title FileSystem entry info
+#' @usage NULL
+#' @format NULL
+#'
+#' @section Methods:
+#'
+#' - `base_name()` : The file base name (component after the last directory
+#' separator).
+#' - `extension()` : The file extension
+#'
+#' @section Active bindings:
+#'
+#' - `$type`: The file type
+#' - `$path`: The full file path in the filesystem
+#' - `$size`: The size in bytes, if available. Only regular files are
+#' guaranteed to have a size.
+#' - `$mtime`: The time of last modification, if available.
+#'
+#' @rdname FileInfo
+#' @export
+FileInfo <- R6Class("FileInfo",
+ inherit = ArrowObject,
+ public = list(
+ base_name = function() fs___FileInfo__base_name(self),
+ extension = function() fs___FileInfo__extension(self)
+ ),
+ active = list(
+ type = function(type) {
+ if (missing(type)) {
+ fs___FileInfo__type(self)
+ } else {
+ fs___FileInfo__set_type(self, type)
+ }
+ },
+ path = function(path) {
+ if (missing(path)) {
+ fs___FileInfo__path(self)
+ } else {
+ invisible(fs___FileInfo__set_path(self))
+ }
+ },
+ size = function(size) {
+ if (missing(size)) {
+ fs___FileInfo__size(self)
+ } else {
+ invisible(fs___FileInfo__set_size(self, size))
+ }
+ },
+ mtime = function(time) {
+ if (missing(time)) {
+ fs___FileInfo__mtime(self)
+ } else {
+ if (!inherits(time, "POSIXct") || length(time) != 1L) {
+ abort("'time' must be a POSIXct vector of length 1")
+ }
+ invisible(fs___FileInfo__set_mtime(self, time))
+ }
+ }
+ )
+)
+
+#' @title file selector
+#' @format NULL
+#'
+#' @section Factory:
+#'
+#' The `$create()` factory method instantiates a `FileSelector` given the 3 fields
+#' described below.
+#'
+#' @section Fields:
+#'
+#' - `base_dir`: The directory in which to select files. If the path exists but
+#' doesn't point to a directory, this should be an error.
+#' - `allow_not_found`: The behavior if `base_dir` doesn't exist in the
+#' filesystem. If `FALSE`, an error is returned. If `TRUE`, an empty
+#' selection is returned
+#' - `recursive`: Whether to recurse into subdirectories.
+#'
+#' @rdname FileSelector
+#' @export
+FileSelector <- R6Class("FileSelector",
+ inherit = ArrowObject,
+ active = list(
+ base_dir = function() fs___FileSelector__base_dir(self),
+ allow_not_found = function() fs___FileSelector__allow_not_found(self),
+ recursive = function() fs___FileSelector__recursive(self)
+ )
+)
+
+FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = FALSE) {
+ fs___FileSelector__create(clean_path_rel(base_dir), allow_not_found, recursive)
+}
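+
+# Usage sketch (directory path hypothetical): list everything under a tree by
+# pairing a recursive selector with a filesystem's GetFileInfo():
+#   fs <- LocalFileSystem$create()
+#   sel <- FileSelector$create("/tmp/data", recursive = TRUE)
+#   infos <- fs$GetFileInfo(sel) # list of FileInfo objects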
+
+#' @title FileSystem classes
+#' @description `FileSystem` is an abstract file system API,
+#' `LocalFileSystem` is an implementation accessing files
+#' on the local machine. `SubTreeFileSystem` is an implementation that delegates
+#' to another implementation after prepending a fixed base path
+#'
+#' @section Factory:
+#'
+#' `LocalFileSystem$create()` returns the object and takes no arguments.
+#'
+#' `SubTreeFileSystem$create()` takes the following arguments:
+#'
+#' - `base_path`, a string path
+#' - `base_fs`, a `FileSystem` object
+#'
+#' `S3FileSystem$create()` optionally takes arguments:
+#'
+#' - `anonymous`: logical, default `FALSE`. If `TRUE`, will not attempt to look up
+#' credentials using standard AWS configuration methods.
+#' - `access_key`, `secret_key`: authentication credentials. If one is provided,
+#' the other must be as well. If both are provided, they will override any
+#' AWS configuration set at the environment level.
+#' - `session_token`: optional string for authentication along with
+#' `access_key` and `secret_key`
+#' - `role_arn`: string AWS ARN of an AccessRole. If provided instead of `access_key` and
+#' `secret_key`, temporary credentials will be fetched by assuming this role.
+#' - `session_name`: optional string identifier for the assumed role session.
+#' - `external_id`: optional unique string identifier that might be required
+#' when you assume a role in another account.
+#' - `load_frequency`: integer, frequency (in seconds) with which temporary
+#' credentials from an assumed role session will be refreshed. Default is
+#' 900 (i.e. 15 minutes)
+#' - `region`: AWS region to connect to. If omitted, the AWS library will
+#' provide a sensible default based on client configuration, falling back
+#' to "us-east-1" if no other alternatives are found.
+#' - `endpoint_override`: If non-empty, override region with a connect string
+#' such as "localhost:9000". This is useful for connecting to file systems
+#' that emulate S3.
+#' - `scheme`: S3 connection transport (default "https")
+#' - `background_writes`: logical, whether `OutputStream` writes will be issued
+#' in the background, without blocking (default `TRUE`)
+#'
+#' @section Methods:
+#'
+#' - `$GetFileInfo(x)`: `x` may be a [FileSelector][FileSelector] or a character
+#' vector of paths. Returns a list of [FileInfo][FileInfo]
+#' - `$CreateDir(path, recursive = TRUE)`: Create a directory and subdirectories.
+#' - `$DeleteDir(path)`: Delete a directory and its contents, recursively.
+#' - `$DeleteDirContents(path)`: Delete a directory's contents, recursively.
+#' Like `$DeleteDir()`,
+#' but doesn't delete the directory itself. Passing an empty path (`""`) will
+#' wipe the entire filesystem tree.
+#' - `$DeleteFile(path)` : Delete a file.
+#' - `$DeleteFiles(paths)` : Delete many files. The default implementation
+#' issues individual delete operations in sequence.
+#' - `$Move(src, dest)`: Move / rename a file or directory. If the destination
+#'   exists:
+#'     - if it is a non-empty directory, an error is returned
+#'     - otherwise, if it has the same type as the source, it is replaced
+#'     - otherwise, behavior is unspecified (implementation-dependent).
+#' - `$CopyFile(src, dest)`: Copy a file. If the destination exists and is a
+#' directory, an error is returned. Otherwise, it is replaced.
+#' - `$OpenInputStream(path)`: Open an [input stream][InputStream] for
+#' sequential reading.
+#' - `$OpenInputFile(path)`: Open an [input file][RandomAccessFile] for random
+#' access reading.
+#' - `$OpenOutputStream(path)`: Open an [output stream][OutputStream] for
+#' sequential writing.
+#' - `$OpenAppendStream(path)`: Open an [output stream][OutputStream] for
+#' appending.
+#'
+#' @section Active bindings:
+#'
+#' - `$type_name`: string filesystem type name, such as "local", "s3", etc.
+#' - `$region`: string AWS region, for `S3FileSystem` and `SubTreeFileSystem`
+#' containing a `S3FileSystem`
+#' - `$base_fs`: for `SubTreeFileSystem`, the `FileSystem` it contains
+#' - `$base_path`: for `SubTreeFileSystem`, the path in `$base_fs` which is considered
+#' root in this `SubTreeFileSystem`.
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @rdname FileSystem
+#' @name FileSystem
+#' @export
+FileSystem <- R6Class("FileSystem",
+ inherit = ArrowObject,
+ public = list(
+ GetFileInfo = function(x) {
+ if (inherits(x, "FileSelector")) {
+ fs___FileSystem__GetTargetInfos_FileSelector(self, x)
+ } else if (is.character(x)) {
+ fs___FileSystem__GetTargetInfos_Paths(self, clean_path_rel(x))
+ } else {
+ abort("incompatible type for FileSystem$GetFileInfo()")
+ }
+ },
+ CreateDir = function(path, recursive = TRUE) {
+ fs___FileSystem__CreateDir(self, clean_path_rel(path), isTRUE(recursive))
+ },
+ DeleteDir = function(path) {
+ fs___FileSystem__DeleteDir(self, clean_path_rel(path))
+ },
+ DeleteDirContents = function(path) {
+ fs___FileSystem__DeleteDirContents(self, clean_path_rel(path))
+ },
+ DeleteFile = function(path) {
+ fs___FileSystem__DeleteFile(self, clean_path_rel(path))
+ },
+ DeleteFiles = function(paths) {
+ fs___FileSystem__DeleteFiles(self, clean_path_rel(paths))
+ },
+ Move = function(src, dest) {
+ fs___FileSystem__Move(self, clean_path_rel(src), clean_path_rel(dest))
+ },
+ CopyFile = function(src, dest) {
+ fs___FileSystem__CopyFile(self, clean_path_rel(src), clean_path_rel(dest))
+ },
+ OpenInputStream = function(path) {
+ fs___FileSystem__OpenInputStream(self, clean_path_rel(path))
+ },
+ OpenInputFile = function(path) {
+ fs___FileSystem__OpenInputFile(self, clean_path_rel(path))
+ },
+ OpenOutputStream = function(path) {
+ fs___FileSystem__OpenOutputStream(self, clean_path_rel(path))
+ },
+ OpenAppendStream = function(path) {
+ fs___FileSystem__OpenAppendStream(self, clean_path_rel(path))
+ },
+
+ # Friendlier R user interface
+ path = function(x) SubTreeFileSystem$create(x, self),
+ cd = function(x) SubTreeFileSystem$create(x, self),
+ ls = function(path = "", ...) {
+ selector <- FileSelector$create(path, ...) # ... for recursive = TRUE
+ infos <- self$GetFileInfo(selector)
+ map_chr(infos, ~ .$path)
+ # TODO: add full.names argument like base::dir() (default right now is TRUE)
+ # TODO: see fs package for glob/regexp filtering
+ # TODO: verbose method that shows other attributes as df
+ # TODO: print methods for FileInfo, SubTreeFileSystem, S3FileSystem
+ }
+ ),
+ active = list(
+ type_name = function() fs___FileSystem__type_name(self)
+ )
+)
+FileSystem$from_uri <- function(uri) {
+ assert_that(is.string(uri))
+ fs___FileSystemFromUri(uri)
+}
+
+get_paths_and_filesystem <- function(x, filesystem = NULL) {
+ # Wrapper around FileSystem$from_uri that handles local paths
+ # and an optional explicit filesystem
+ if (inherits(x, "SubTreeFileSystem")) {
+ return(list(fs = x$base_fs, path = x$base_path))
+ }
+ assert_that(is.character(x))
+ are_urls <- are_urls(x)
+ if (any(are_urls)) {
+ if (!all(are_urls)) {
+ stop("Vectors of mixed paths and URIs are not supported", call. = FALSE)
+ }
+ if (!is.null(filesystem)) {
+ # TODO: a URI already determines a filesystem, so an explicit
+ # filesystem argument is ambiguous here; should this be an error?
+ }
+ x <- lapply(x, FileSystem$from_uri)
+ if (length(unique(map(x, ~ class(.$fs)))) > 1) {
+ stop(
+ "Vectors of URIs for different file systems are not supported",
+ call. = FALSE
+ )
+ }
+ fs <- x[[1]]$fs
+ path <- map_chr(x, ~ .$path) # singular name "path" used for compatibility
+ } else {
+ fs <- filesystem %||% LocalFileSystem$create()
+ if (inherits(fs, "LocalFileSystem")) {
+ path <- clean_path_abs(x)
+ } else {
+ path <- clean_path_rel(x)
+ }
+ }
+ list(
+ fs = fs,
+ path = path
+ )
+}
+
+# variant of the above function that asserts that x is either a scalar string
+# or a SubTreeFileSystem
+get_path_and_filesystem <- function(x, filesystem = NULL) {
+ assert_that(is.string(x) || inherits(x, "SubTreeFileSystem"))
+ get_paths_and_filesystem(x, filesystem)
+}
+
+is_url <- function(x) is.string(x) && grepl("://", x)
+are_urls <- function(x) if (!is.character(x)) FALSE else grepl("://", x)
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FileSystem
+#' @export
+LocalFileSystem <- R6Class("LocalFileSystem", inherit = FileSystem)
+LocalFileSystem$create <- function() {
+ fs___LocalFileSystem__create()
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FileSystem
+#' @importFrom utils modifyList
+#' @export
+S3FileSystem <- R6Class("S3FileSystem",
+ inherit = FileSystem,
+ active = list(
+ region = function() fs___S3FileSystem__region(self)
+ )
+)
+S3FileSystem$create <- function(anonymous = FALSE, ...) {
+ args <- list2(...)
+ if (anonymous) {
+ invalid_args <- intersect(
+ c(
+ "access_key", "secret_key", "session_token", "role_arn", "session_name",
+ "external_id", "load_frequency"
+ ),
+ names(args)
+ )
+ if (length(invalid_args)) {
+ stop("Cannot specify ", oxford_paste(invalid_args), " when anonymous = TRUE", call. = FALSE)
+ }
+ } else {
+ keys_present <- length(intersect(c("access_key", "secret_key"), names(args)))
+ if (keys_present == 1) {
+ stop("Key authentication requires both access_key and secret_key", call. = FALSE)
+ }
+ if ("session_token" %in% names(args) && keys_present != 2) {
+ stop(
+ "In order to initialize a session with temporary credentials, ",
+ "both secret_key and access_key must be provided ",
+ "in addition to session_token.",
+ call. = FALSE
+ )
+ }
+ arn <- "role_arn" %in% names(args)
+ if (keys_present == 2 && arn) {
+ stop("Cannot provide both key authentication and role_arn", call. = FALSE)
+ }
+ arn_extras <- intersect(c("session_name", "external_id", "load_frequency"), names(args))
+ if (length(arn_extras) > 0 && !arn) {
+ stop("Cannot specify ", oxford_paste(arn_extras), " without providing a role_arn string", call. = FALSE)
+ }
+ }
+ args <- c(modifyList(default_s3_options, args), anonymous = anonymous)
+ exec(fs___S3FileSystem__create, !!!args)
+}
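+
+# Illustrative credential combinations (all values are placeholders):
+#   S3FileSystem$create(anonymous = TRUE) # no credential lookup
+#   S3FileSystem$create(access_key = "KEY", secret_key = "SECRET")
+#   S3FileSystem$create(role_arn = "arn:aws:iam::000000000000:role/demo")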
+
+default_s3_options <- list(
+ access_key = "",
+ secret_key = "",
+ session_token = "",
+ role_arn = "",
+ session_name = "",
+ external_id = "",
+ load_frequency = 900L,
+ region = "",
+ endpoint_override = "",
+ scheme = "",
+ background_writes = TRUE
+)
+
+#' Connect to an AWS S3 bucket
+#'
+#' `s3_bucket()` is a convenience function to create an `S3FileSystem` object
+#' that automatically detects the bucket's AWS region and holds onto its
+#' relative path.
+#'
+#' @param bucket string S3 bucket name or path
+#' @param ... Additional connection options, passed to `S3FileSystem$create()`
+#' @return A `SubTreeFileSystem` containing an `S3FileSystem` and the bucket's
+#' relative path. Note that this function's success does not guarantee that you
+#' are authorized to access the bucket's contents.
+#' @examplesIf arrow_with_s3()
+#' bucket <- s3_bucket("ursa-labs-taxi-data")
+#' @export
+s3_bucket <- function(bucket, ...) {
+ assert_that(is.string(bucket))
+ args <- list2(...)
+
+ # Use FileSystemFromUri to detect the bucket's region
+ if (!is_url(bucket)) {
+ bucket <- paste0("s3://", bucket)
+ }
+ fs_and_path <- FileSystem$from_uri(bucket)
+ fs <- fs_and_path$fs
+ # If there are no additional S3Options, we can use that filesystem
+ # Otherwise, take the region that was detected and make a new fs with the args
+ if (length(args)) {
+ args$region <- fs$region
+ fs <- exec(S3FileSystem$create, !!!args)
+ }
+ # Return a subtree pointing at that bucket path
+ SubTreeFileSystem$create(fs_and_path$path, fs)
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname FileSystem
+#' @export
+SubTreeFileSystem <- R6Class("SubTreeFileSystem",
+ inherit = FileSystem,
+ public = list(
+ print = function(...) {
+ if (inherits(self$base_fs, "LocalFileSystem")) {
+ cat("SubTreeFileSystem: ", "file://", self$base_path, "\n", sep = "")
+ } else if (inherits(self$base_fs, "S3FileSystem")) {
+ cat("SubTreeFileSystem: ", "s3://", self$base_path, "\n", sep = "")
+ } else {
+ cat("SubTreeFileSystem", "\n", sep = "")
+ }
+ invisible(self)
+ }
+ ),
+ active = list(
+ base_fs = function() {
+ fs___SubTreeFileSystem__base_fs(self)
+ },
+ base_path = function() fs___SubTreeFileSystem__base_path(self)
+ )
+)
+SubTreeFileSystem$create <- function(base_path, base_fs = NULL) {
+ fs_and_path <- get_path_and_filesystem(base_path, base_fs)
+ fs___SubTreeFileSystem__create(fs_and_path$path, fs_and_path$fs)
+}
+
+#' @export
+`$.SubTreeFileSystem` <- function(x, name, ...) {
+ # This is to allow delegating methods/properties to the base_fs
+ assert_that(is.string(name))
+ if (name %in% ls(envir = x)) {
+ get(name, x)
+ } else if (name %in% ls(envir = x$base_fs)) {
+ get(name, x$base_fs)
+ } else {
+ NULL
+ }
+}
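+
+# Delegation sketch (path hypothetical): names not found on the subtree fall
+# through to its base filesystem via `$.SubTreeFileSystem`:
+#   sub <- SubTreeFileSystem$create("/tmp/data", LocalFileSystem$create())
+#   sub$ls() # inherited FileSystem method, found on the subtree object
+#   sub$base_fs$type_name # "local": the wrapped filesystem is reachable too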
+
+#' Copy files between FileSystems
+#'
+#' @param from A string path to a local directory or file, a URI, or a
+#' `SubTreeFileSystem`. Files will be copied recursively from this path.
+#' @param to A string path to a local directory or file, a URI, or a
+#' `SubTreeFileSystem`. Directories will be created as necessary
+#' @param chunk_size The maximum size of block to read before flushing
+#' to the destination file. A larger chunk_size will use more memory while
+#' copying but may help accommodate high latency FileSystems.
+#' @return Nothing: called for side effects in the file system
+#' @export
+#' @examplesIf FALSE
+#' # Copy an S3 bucket's files to a local directory:
+#' copy_files("s3://your-bucket-name", "local-directory")
+#' # Using a FileSystem object
+#' copy_files(s3_bucket("your-bucket-name"), "local-directory")
+#' # Or go the other way, from local to S3
+#' copy_files("local-directory", s3_bucket("your-bucket-name"))
+copy_files <- function(from, to, chunk_size = 1024L * 1024L) {
+ from <- get_path_and_filesystem(from)
+ to <- get_path_and_filesystem(to)
+ invisible(fs___CopyFiles(
+ from$fs,
+ FileSelector$create(from$path, recursive = TRUE),
+ to$fs,
+ to$path,
+ chunk_size,
+ option_use_threads()
+ ))
+}
+
+clean_path_abs <- function(path) {
+ # Make sure we have a valid, absolute, forward-slashed path for passing to Arrow
+ normalizePath(path, winslash = "/", mustWork = FALSE)
+}
+
+clean_path_rel <- function(path) {
+ # Make sure all path separators are "/", not "\" as on Windows
+ path_sep <- ifelse(tolower(Sys.info()[["sysname"]]) == "windows", "\\\\", "/")
+ gsub(path_sep, "/", path)
+}
diff --git a/src/arrow/r/R/flight.R b/src/arrow/r/R/flight.R
new file mode 100644
index 000000000..cde297853
--- /dev/null
+++ b/src/arrow/r/R/flight.R
@@ -0,0 +1,124 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Load a Python Flight server
+#'
+#' @param name string Python module name
+#' @param path file system path where the Python module is found. Default is
+#' to look in the `inst/` directory for included modules.
+#' @export
+#' @examplesIf FALSE
+#' load_flight_server("demo_flight_server")
+load_flight_server <- function(name, path = system.file(package = "arrow")) {
+ reticulate::import_from_path(name, path)
+}
+
+#' Connect to a Flight server
+#'
+#' @param host string hostname to connect to
+#' @param port integer port to connect on
+#' @param scheme URL scheme, default is "grpc+tcp"
+#' @return A `pyarrow.flight.FlightClient`.
+#' @export
+flight_connect <- function(host = "localhost", port, scheme = "grpc+tcp") {
+ pa <- reticulate::import("pyarrow")
+ location <- paste0(scheme, "://", host, ":", port)
+ pa$flight$FlightClient(location)
+}
+
+#' Send data to a Flight server
+#'
+#' @param client `pyarrow.flight.FlightClient`, as returned by [flight_connect()]
+#' @param data `data.frame`, [RecordBatch], or [Table] to upload
+#' @param path string identifier to store the data under
+#' @param overwrite logical: if `path` exists on `client` already, should we
+#' replace it with the contents of `data`? Default is `TRUE`; if `FALSE` and
+#' `path` exists, the function will error.
+#' @return `client`, invisibly.
+#' @export
+flight_put <- function(client, data, path, overwrite = TRUE) {
+ if (!overwrite && flight_path_exists(client, path)) {
+ stop(path, " exists.", call. = FALSE)
+ }
+ if (is.data.frame(data)) {
+ data <- Table$create(data)
+ }
+ py_data <- reticulate::r_to_py(data)
+ writer <- client$do_put(descriptor_for_path(path), py_data$schema)[[1]]
+ if (inherits(data, "RecordBatch")) {
+ writer$write_batch(py_data)
+ } else {
+ writer$write_table(py_data)
+ }
+ writer$close()
+ invisible(client)
+}
+
+#' Get data from a Flight server
+#'
+#' @param client `pyarrow.flight.FlightClient`, as returned by [flight_connect()]
+#' @param path string identifier under which data is stored
+#' @return A [Table]
+#' @export
+flight_get <- function(client, path) {
+ reader <- flight_reader(client, path)
+ reader$read_all()
+}
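+
+# End-to-end sketch (port and path are hypothetical; requires a running
+# Flight server plus reticulate and pyarrow):
+#   client <- flight_connect(port = 8089)
+#   flight_put(client, mtcars, path = "demo/mtcars")
+#   flight_get(client, "demo/mtcars") # returns a Table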
+
+# TODO: could use this as a RecordBatch iterator, call $read_chunk() on this
+flight_reader <- function(client, path) {
+ info <- client$get_flight_info(descriptor_for_path(path))
+ # Hack: assume a single ticket, on the same server as client is already connected
+ ticket <- info$endpoints[[1]]$ticket
+ client$do_get(ticket)
+}
+
+descriptor_for_path <- function(path) {
+ pa <- reticulate::import("pyarrow")
+ pa$flight$FlightDescriptor$for_path(path)
+}
+
+#' See available resources on a Flight server
+#'
+#' @inheritParams flight_get
+#' @return `list_flights()` returns a character vector of paths.
+#' `flight_path_exists()` returns a logical value, the equivalent of `path %in% list_flights()`.
+#' @export
+list_flights <- function(client) {
+ generator <- client$list_flights()
+ out <- reticulate::iterate(generator, function(x) as.character(x$descriptor$path[[1]]))
+ out
+}
+
+#' @rdname list_flights
+#' @export
+flight_path_exists <- function(client, path) {
+ tryCatch(
+ expr = {
+ client$get_flight_info(descriptor_for_path(path))
+ TRUE
+ },
+ error = function(e) {
+ msg <- conditionMessage(e)
+ if (!any(grepl("ArrowKeyError", msg))) {
+ # Raise an error if this fails for any reason other than not found
+ stop(e)
+ }
+ FALSE
+ }
+ )
+}
diff --git a/src/arrow/r/R/install-arrow.R b/src/arrow/r/R/install-arrow.R
new file mode 100644
index 000000000..3e295c543
--- /dev/null
+++ b/src/arrow/r/R/install-arrow.R
@@ -0,0 +1,239 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Install or upgrade the Arrow library
+#'
+#' Use this function to install the latest release of `arrow`, to switch to or
+#' from a nightly development version, or on Linux to try reinstalling with
+#' all necessary C++ dependencies.
+#'
+#' Note that, unlike packages like `tensorflow`, `blogdown`, and others that
+#' require external dependencies, you do not need to run `install_arrow()`
+#' after a successful `arrow` installation.
+#'
+#' @param nightly logical: Should we install a development version of the
+#' package, or should we install from CRAN (the default).
+#' @param binary On Linux, value to set for the environment variable
+#' `LIBARROW_BINARY`, which governs how C++ binaries are used, if at all.
+#' The default value, `TRUE`, tells the installation script to detect the
+#' Linux distribution and version and find an appropriate C++ library. `FALSE`
+#' would tell the script not to retrieve a binary and instead build Arrow C++
+#' from source. Other valid values are strings corresponding to a Linux
+#' distribution-version, to override the value that would be detected.
+#' See `vignette("install", package = "arrow")` for further details.
+#' @param use_system logical: Should we use `pkg-config` to look for Arrow
+#' system packages? Default is `FALSE`. If `TRUE`, source installation may be
+#' faster, but there is a risk of version mismatch. This sets the
+#' `ARROW_USE_PKG_CONFIG` environment variable.
+#' @param minimal logical: If building from source, should we build without
+#' optional dependencies (compression libraries, for example)? Default is
+#' `FALSE`. This sets the `LIBARROW_MINIMAL` environment variable.
+#' @param verbose logical: Print more debugging output when installing? Default
+#' is `FALSE`. This sets the `ARROW_R_DEV` environment variable.
+#' @param repos character vector of base URLs of the repositories to install
+#' from (passed to `install.packages()`)
+#' @param ... Additional arguments passed to `install.packages()`
+#' @export
+#' @importFrom utils install.packages
+#' @seealso [arrow_available()] to see if the package was configured with
+#' necessary C++ dependencies. `vignette("install", package = "arrow")` for
+#' more ways to tune installation on Linux.
+install_arrow <- function(nightly = FALSE,
+ binary = Sys.getenv("LIBARROW_BINARY", TRUE),
+ use_system = Sys.getenv("ARROW_USE_PKG_CONFIG", FALSE),
+ minimal = Sys.getenv("LIBARROW_MINIMAL", FALSE),
+ verbose = Sys.getenv("ARROW_R_DEV", FALSE),
+ repos = getOption("repos"),
+ ...) {
+ sysname <- tolower(Sys.info()[["sysname"]])
+ conda <- isTRUE(grepl("conda", R.Version()$platform))
+
+ if (conda) {
+ if (nightly) {
+ system("conda install -y -c arrow-nightlies -c conda-forge --strict-channel-priority r-arrow")
+ } else {
+ system("conda install -y -c conda-forge --strict-channel-priority r-arrow")
+ }
+ } else {
+ Sys.setenv(
+ LIBARROW_BINARY = binary,
+ LIBARROW_MINIMAL = minimal,
+ ARROW_R_DEV = verbose,
+ ARROW_USE_PKG_CONFIG = use_system
+ )
+ # On the M1, we can't use the usual autobrew, which pulls Intel dependencies
+ apple_m1 <- grepl("arm-apple|aarch64.*darwin", R.Version()$platform)
+ # On Rosetta, we have to build without JEMALLOC, so we also can't autobrew
+ rosetta <- identical(sysname, "darwin") && identical(system("sysctl -n sysctl.proc_translated", intern = TRUE), "1")
+ if (rosetta) {
+ Sys.setenv(ARROW_JEMALLOC = "OFF")
+ }
+ if (apple_m1 || rosetta) {
+ Sys.setenv(FORCE_BUNDLED_BUILD = "true")
+ }
+
+ opts <- list()
+ if (apple_m1 || rosetta) {
+ # Skip binaries (esp. for rosetta)
+ opts$pkgType <- "source"
+ } else if (isTRUE(binary)) {
+ # Unless otherwise directed, don't consider newer source packages when
+ # options(pkgType) == "both" (default on win/mac)
+ opts$install.packages.check.source <- "no"
+ opts$install.packages.compile.from.source <- "never"
+ }
+ if (length(opts)) {
+ old <- options(opts)
+ on.exit(options(old))
+ }
+ install.packages("arrow", repos = arrow_repos(repos, nightly), ...)
+ }
+ if ("arrow" %in% loadedNamespaces()) {
+ # If you've just sourced this file, "arrow" won't be (re)loaded
+ reload_arrow()
+ }
+}
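+
+# Illustrative invocations (interactive use only):
+#   install_arrow() # latest release from CRAN
+#   install_arrow(nightly = TRUE) # development build from the nightly repo
+#   install_arrow(binary = FALSE, verbose = TRUE) # build the C++ library from source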
+
+arrow_repos <- function(repos = getOption("repos"), nightly = FALSE) {
+ if (length(repos) == 0 || identical(repos, c(CRAN = "@CRAN@"))) {
+ # Set the default/CDN
+ repos <- "https://cloud.r-project.org/"
+ }
+ dev_repo <- getOption("arrow.dev_repo", "https://arrow-r-nightly.s3.amazonaws.com")
+ # Remove it if it's there (so nightly=FALSE won't accidentally pull from it)
+ repos <- setdiff(repos, dev_repo)
+ if (nightly) {
+ # Add it first
+ repos <- c(dev_repo, repos)
+ }
+ repos
+}
+
+reload_arrow <- function() {
+ if (requireNamespace("pkgload", quietly = TRUE)) {
+ is_attached <- "package:arrow" %in% search()
+ pkgload::unload("arrow")
+ if (is_attached) {
+ require("arrow", character.only = TRUE, quietly = TRUE)
+ } else {
+ requireNamespace("arrow", quietly = TRUE)
+ }
+ } else {
+ message("Please restart R to use the 'arrow' package.")
+ }
+}
+
+
+#' Create a source bundle that includes all thirdparty dependencies
+#'
+#' @param dest_file File path for the new tar.gz package. Defaults to
+#' `arrow_V.V.V_with_deps.tar.gz` in the current directory (`V.V.V` is the version)
+#' @param source_file File path for the input tar.gz package. Defaults to
+#' downloading the package from CRAN (or whatever you have set as the first in
+#' `getOption("repos")`)
+#' @return The full path to `dest_file`, invisibly
+#'
+#' This function is used for setting up an offline build. If it's possible to
+#' download at build time, don't use this function. Instead, let `cmake`
+#' download the required dependencies for you.
+#' These downloaded dependencies are only used in the build if
+#' `ARROW_DEPENDENCY_SOURCE` is unset, `BUNDLED`, or `AUTO`.
+#' https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds
+#'
+#' If you're using binary packages you shouldn't need to use this function. You
+#' should download the appropriate binary from your package repository, transfer
+#' that to the offline computer, and install that. Any OS can create the source
+#' bundle, but it cannot be installed on Windows. (Instead, use a standard
+#' Windows binary package.)
+#'
+#' Note if you're using RStudio Package Manager on Linux: If you still want to
+#' make a source bundle with this function, make sure to set the first repo in
+#' `options("repos")` to be a mirror that contains source packages (that is:
+#' something other than the RSPM binary mirror URLs).
+#'
+#' ## Steps for an offline install with optional dependencies:
+#'
+#' ### Using a computer with internet access, pre-download the dependencies:
+#' * Install the `arrow` package _or_ run
+#' `source("https://raw.githubusercontent.com/apache/arrow/master/r/R/install-arrow.R")`
+#' * Run `create_package_with_all_dependencies("my_arrow_pkg.tar.gz")`
+#' * Copy the newly created `my_arrow_pkg.tar.gz` to the computer without internet access
+#'
+#' ### On the computer without internet access, install the prepared package:
+#' * Install the `arrow` package from the copied file
+#' * `install.packages("my_arrow_pkg.tar.gz", dependencies = c("Depends", "Imports", "LinkingTo"))`
+#' * This installation will build from source, so `cmake` must be available
+#' * Run [arrow_info()] to check installed capabilities
+#'
+#'
+#' @examples
+#' \dontrun{
+#' new_pkg <- create_package_with_all_dependencies()
+#' # Note: this works when run in the same R session, but it's meant to be
+#' # copied to a different computer.
+#' install.packages(new_pkg, dependencies = c("Depends", "Imports", "LinkingTo"))
+#' }
+#' @export
+create_package_with_all_dependencies <- function(dest_file = NULL, source_file = NULL) {
+ if (is.null(source_file)) {
+ pkg_download_dir <- tempfile()
+ dir.create(pkg_download_dir)
+ on.exit(unlink(pkg_download_dir, recursive = TRUE), add = TRUE)
+ message("Downloading Arrow source file")
+ downloaded <- utils::download.packages("arrow", destdir = pkg_download_dir, type = "source")
+ source_file <- downloaded[1, 2, drop = TRUE]
+ }
+ if (!file.exists(source_file) || !endsWith(source_file, "tar.gz")) {
+ stop("Arrow package .tar.gz file not found")
+ }
+ if (is.null(dest_file)) {
+ # e.g. convert /path/to/arrow_5.0.0.tar.gz to ./arrow_5.0.0_with_deps.tar.gz
+ # (add 'with_deps' for clarity if the file was downloaded locally)
+ dest_file <- paste0(gsub(".tar.gz$", "", basename(source_file)), "_with_deps.tar.gz")
+ }
+ untar_dir <- tempfile()
+ on.exit(unlink(untar_dir, recursive = TRUE), add = TRUE)
+ utils::untar(source_file, exdir = untar_dir)
+ tools_dir <- file.path(untar_dir, "arrow/tools")
+ download_dependencies_sh <- file.path(tools_dir, "cpp/thirdparty/download_dependencies.sh")
+ # If you change this path, also need to edit nixlibs.R
+ download_dir <- file.path(tools_dir, "thirdparty_dependencies")
+ dir.create(download_dir)
+
+ message("Downloading files to ", download_dir)
+ download_successful <- system2(download_dependencies_sh, download_dir, stdout = FALSE) == 0
+ if (!download_successful) {
+ stop("Failed to download thirdparty dependencies")
+ }
+ # Need to change directory to untar_dir so tar() will use relative paths. That
+ # means we'll need a full, non-relative path for dest_file. (extra_flags="-C"
+ # doesn't work with R's internal tar)
+ orig_wd <- getwd()
+ on.exit(setwd(orig_wd), add = TRUE)
+ # normalizePath() may return the input unchanged if dest_file doesn't exist,
+ # so create it first.
+ file.create(dest_file)
+ dest_file <- normalizePath(dest_file, mustWork = TRUE)
+ setwd(untar_dir)
+
+ message("Repacking tar.gz file to ", dest_file)
+ tar_successful <- utils::tar(dest_file, compression = "gz") == 0
+ if (!tar_successful) {
+ stop("Failed to create new tar.gz file")
+ }
+ invisible(dest_file)
+}
diff --git a/src/arrow/r/R/io.R b/src/arrow/r/R/io.R
new file mode 100644
index 000000000..898b306a3
--- /dev/null
+++ b/src/arrow/r/R/io.R
@@ -0,0 +1,295 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-package.R
+#' @include enums.R
+#' @include buffer.R
+
+# OutputStream ------------------------------------------------------------
+
+Writable <- R6Class("Writable",
+ inherit = ArrowObject,
+ public = list(
+ write = function(x) io___Writable__write(self, buffer(x))
+ )
+)
+
+#' @title OutputStream classes
+#' @description `FileOutputStream` is for writing to a file;
+#' `BufferOutputStream` writes to a buffer.
+#' You can create one and pass it to any of the table writers, for example.
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @section Factory:
+#'
+#' The `$create()` factory methods instantiate the `OutputStream` object and
+#' take the following arguments, depending on the subclass:
+#'
+#' - `path` For `FileOutputStream`, a character file name
+#' - `initial_capacity` For `BufferOutputStream`, the size in bytes of the
+#' buffer.
+#'
+#' @section Methods:
+#'
+#' - `$tell()`: return the position in the stream
+#' - `$close()`: close the stream
+#' - `$write(x)`: send `x` to the stream
+#' - `$capacity()`: for `BufferOutputStream`, report the current buffer capacity
+#' - `$finish()`: for `BufferOutputStream`, close the stream and return the
+#'   [Buffer] it has accumulated
+#' - `$GetExtentBytesWritten()`: for `MockOutputStream`, report how many bytes
+#' were sent.
+#'
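+#' @examplesIf arrow_available()
+#' # A minimal sketch: stream a data frame into a buffer, then pull out
+#' # the accumulated bytes
+#' sink <- BufferOutputStream$create()
+#' write_ipc_stream(mtcars, sink)
+#' buf <- sink$finish()
+#' buf$size
+#'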
+#' @rdname OutputStream
+#' @name OutputStream
+OutputStream <- R6Class("OutputStream",
+ inherit = Writable,
+ public = list(
+ close = function() io___OutputStream__Close(self),
+ tell = function() io___OutputStream__Tell(self)
+ )
+)
+
+#' @usage NULL
+#' @format NULL
+#' @rdname OutputStream
+#' @export
+FileOutputStream <- R6Class("FileOutputStream", inherit = OutputStream)
+FileOutputStream$create <- function(path) {
+ io___FileOutputStream__Open(clean_path_abs(path))
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname OutputStream
+#' @export
+BufferOutputStream <- R6Class("BufferOutputStream",
+ inherit = OutputStream,
+ public = list(
+ capacity = function() io___BufferOutputStream__capacity(self),
+ finish = function() io___BufferOutputStream__Finish(self),
+ write = function(bytes) io___BufferOutputStream__Write(self, bytes),
+ tell = function() io___BufferOutputStream__Tell(self)
+ )
+)
+BufferOutputStream$create <- function(initial_capacity = 0L) {
+ io___BufferOutputStream__Create(initial_capacity)
+}
+
+# InputStream -------------------------------------------------------------
+
+
+Readable <- R6Class("Readable",
+ inherit = ArrowObject,
+ public = list(
+ Read = function(nbytes) io___Readable__Read(self, nbytes)
+ )
+)
+
+#' @title InputStream classes
+#' @description `RandomAccessFile` inherits from `InputStream` and is a base
+#' class for: `ReadableFile` for reading from a file; `MemoryMappedFile` for
+#' the same but with memory mapping; and `BufferReader` for reading from a
+#' buffer. Use these with the various table readers.
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @section Factory:
+#'
+#' The `$create()` factory methods instantiate the `InputStream` object and
+#' take the following arguments, depending on the subclass:
+#'
+#' - `path` For `ReadableFile`, a character file name
+#' - `x` For `BufferReader`, a [Buffer] or an object that can be
+#' made into a buffer via `buffer()`.
+#'
+#' To instantiate a `MemoryMappedFile`, call [mmap_open()].
+#'
+#' @section Methods:
+#'
+#' - `$GetSize()`: return the size of the file, in bytes
+#' - `$supports_zero_copy()`: Logical: whether zero-copy reads are supported
+#' - `$seek(position)`: go to that position in the stream
+#' - `$tell()`: return the position in the stream
+#' - `$close()`: close the stream
+#' - `$Read(nbytes)`: read data from the stream, either a specified `nbytes`
+#' or all of it if `nbytes` is not provided
+#' - `$ReadAt(position, nbytes)`: similar to `$seek(position)$Read(nbytes)`
+#' - `$Resize(size)`: for a `MemoryMappedFile` that is writeable
+#'
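+#' @examplesIf arrow_available()
+#' # A small sketch: open a file and use the RandomAccessFile methods
+#' tf <- tempfile()
+#' writeBin(as.raw(1:8), tf)
+#' f <- ReadableFile$create(tf)
+#' f$GetSize()
+#' f$close()
+#' unlink(tf)
+#'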
+#' @rdname InputStream
+#' @name InputStream
+InputStream <- R6Class("InputStream",
+ inherit = Readable,
+ public = list(
+ close = function() io___InputStream__Close(self)
+ )
+)
+
+#' @usage NULL
+#' @format NULL
+#' @rdname InputStream
+#' @export
+RandomAccessFile <- R6Class("RandomAccessFile",
+ inherit = InputStream,
+ public = list(
+ GetSize = function() io___RandomAccessFile__GetSize(self),
+ supports_zero_copy = function() io___RandomAccessFile__supports_zero_copy(self),
+ seek = function(position) io___RandomAccessFile__Seek(self, position),
+ tell = function() io___RandomAccessFile__Tell(self),
+ Read = function(nbytes = NULL) {
+ if (is.null(nbytes)) {
+ io___RandomAccessFile__Read0(self)
+ } else {
+ io___Readable__Read(self, nbytes)
+ }
+ },
+ ReadAt = function(position, nbytes = NULL) {
+ if (is.null(nbytes)) {
+ nbytes <- self$GetSize() - position
+ }
+ io___RandomAccessFile__ReadAt(self, position, nbytes)
+ }
+ )
+)
+
+#' @usage NULL
+#' @format NULL
+#' @rdname InputStream
+#' @export
+MemoryMappedFile <- R6Class("MemoryMappedFile",
+ inherit = RandomAccessFile,
+ public = list(
+ Resize = function(size) io___MemoryMappedFile__Resize(self, size)
+ )
+)
+
+#' @usage NULL
+#' @format NULL
+#' @rdname InputStream
+#' @export
+ReadableFile <- R6Class("ReadableFile", inherit = RandomAccessFile)
+ReadableFile$create <- function(path) {
+ io___ReadableFile__Open(clean_path_abs(path))
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname InputStream
+#' @export
+BufferReader <- R6Class("BufferReader", inherit = RandomAccessFile)
+BufferReader$create <- function(x) {
+ x <- buffer(x)
+ io___BufferReader__initialize(x)
+}
+
+#' Create a new read/write memory mapped file of a given size
+#'
+#' @param path file path
+#' @param size size in bytes
+#'
+#' @return a [arrow::io::MemoryMappedFile][MemoryMappedFile]
+#'
+#' @export
+mmap_create <- function(path, size) {
+ path <- clean_path_abs(path)
+ io___MemoryMappedFile__Create(path, size)
+}
+
+#' Open a memory mapped file
+#'
+#' @param path file path
+#' @param mode file mode (read/write/readwrite)
+#'
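+#' @return a [arrow::io::MemoryMappedFile][MemoryMappedFile]
+#'
+#' @examplesIf arrow_available()
+#' # A short sketch: create a small read/write memory-mapped file, then
+#' # reopen it (read-only by default)
+#' tf <- tempfile()
+#' mm <- mmap_create(tf, 1024)
+#' mm$close()
+#' mm <- mmap_open(tf)
+#' mm$close()
+#' unlink(tf)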
+#' @export
+mmap_open <- function(path, mode = c("read", "write", "readwrite")) {
+ mode <- match(match.arg(mode), c("read", "write", "readwrite")) - 1L
+ path <- clean_path_abs(path)
+ io___MemoryMappedFile__Open(path, mode)
+}
+
+#' Handle a range of possible input sources
+#' @param file A character file name, `raw` vector, or an Arrow input stream
+#' @param mmap Logical: whether to memory-map the file (default `TRUE`)
+#' @param compression If the file is compressed, create a [CompressedInputStream]
+#' with this compression codec, either a [Codec] or the string name of one.
+#' If `NULL` (default) and `file` is a string file name, the function will try
+#' to infer compression from the file extension.
+#' @param filesystem If not `NULL`, `file` will be opened via the
+#' `filesystem$OpenInputFile()` filesystem method, rather than the `io` module's
+#' `MemoryMappedFile` or `ReadableFile` constructors.
+#' @return An `InputStream` or a subclass of one.
+#' @keywords internal
+make_readable_file <- function(file, mmap = TRUE, compression = NULL, filesystem = NULL) {
+ if (inherits(file, "SubTreeFileSystem")) {
+ filesystem <- file$base_fs
+ file <- file$base_path
+ }
+ if (is.string(file)) {
+ if (is_url(file)) {
+ fs_and_path <- FileSystem$from_uri(file)
+ filesystem <- fs_and_path$fs
+ file <- fs_and_path$path
+ }
+ if (is.null(compression)) {
+ # Infer compression from the file path
+ compression <- detect_compression(file)
+ }
+ if (!is.null(filesystem)) {
+ file <- filesystem$OpenInputFile(file)
+ } else if (isTRUE(mmap)) {
+ file <- mmap_open(file)
+ } else {
+ file <- ReadableFile$create(file)
+ }
+ if (!identical(compression, "uncompressed")) {
+ file <- CompressedInputStream$create(file, compression)
+ }
+ } else if (inherits(file, c("raw", "Buffer"))) {
+ file <- BufferReader$create(file)
+ }
+ assert_is(file, "InputStream")
+ file
+}
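+
+# A sketch of the dispatch above, assuming "data.csv.gz" exists locally:
+#   make_readable_file("data.csv.gz")
+#   # -> a CompressedInputStream wrapping a MemoryMappedFile (mmap = TRUE)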
+
+make_output_stream <- function(x, filesystem = NULL) {
+ if (inherits(x, "SubTreeFileSystem")) {
+ filesystem <- x$base_fs
+ x <- x$base_path
+ } else if (is_url(x)) {
+ fs_and_path <- FileSystem$from_uri(x)
+ filesystem <- fs_and_path$fs
+ x <- fs_and_path$path
+ }
+ assert_that(is.string(x))
+ if (is.null(filesystem)) {
+ FileOutputStream$create(x)
+ } else {
+ filesystem$OpenOutputStream(x)
+ }
+}
+
+detect_compression <- function(path) {
+ assert_that(is.string(path))
+ switch(tools::file_ext(path),
+ bz2 = "bz2",
+ gz = "gzip",
+ lz4 = "lz4",
+ zst = "zstd",
+ "uncompressed"
+ )
+}
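+
+# For example:
+#   detect_compression("data.csv.gz")  # "gzip"
+#   detect_compression("data.parquet") # "uncompressed"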
diff --git a/src/arrow/r/R/ipc_stream.R b/src/arrow/r/R/ipc_stream.R
new file mode 100644
index 000000000..c45d1de6e
--- /dev/null
+++ b/src/arrow/r/R/ipc_stream.R
@@ -0,0 +1,123 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Write Arrow IPC stream format
+#'
+#' Apache Arrow defines two formats for [serializing data for interprocess
+#' communication
+#' (IPC)](https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc):
+#' a "stream" format and a "file" format, known as Feather. `write_ipc_stream()`
+#' and [write_feather()] write those formats, respectively.
+#'
+#' `write_arrow()`, a wrapper around `write_ipc_stream()` and `write_feather()`
+#' with some nonstandard behavior, is deprecated. You should explicitly choose
+#' the function that will write the desired IPC format (stream or file) since
+#' either can be written to a file or `OutputStream`.
+#'
+#' @inheritParams write_feather
+#' @param ... extra parameters passed to `write_feather()`.
+#'
+#' @return `x`, invisibly.
+#' @seealso [write_feather()] for writing IPC files. [write_to_raw()] to
+#' serialize data to a buffer.
+#' [RecordBatchWriter] for a lower-level interface.
+#' @export
+#' @examplesIf arrow_available()
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_ipc_stream(mtcars, tf)
+write_ipc_stream <- function(x, sink, ...) {
+ x_out <- x # So we can return the data we got
+ if (is.data.frame(x)) {
+ x <- Table$create(x)
+ }
+ assert_that(is_writable_table(x))
+ if (!inherits(sink, "OutputStream")) {
+ sink <- make_output_stream(sink)
+ on.exit(sink$close())
+ }
+
+ writer <- RecordBatchStreamWriter$create(sink, x$schema)
+ writer$write(x)
+ writer$close()
+ invisible(x_out)
+}
+
+#' Write Arrow data to a raw vector
+#'
+#' [write_ipc_stream()] and [write_feather()] write data to a sink and return
+#' the data (`data.frame`, `RecordBatch`, or `Table`) they were given.
+#' This function wraps those so that you can serialize data to a buffer and
+#' access that buffer as a `raw` vector in R.
+#' @inheritParams write_feather
+#' @param format one of `c("stream", "file")`, indicating the IPC format to use
+#' @return A `raw` vector containing the bytes of the IPC serialized data.
+#' @examplesIf arrow_available()
+#' # The default format is "stream"
+#' mtcars_raw <- write_to_raw(mtcars)
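+#' # The raw vector can be read back with read_ipc_stream()
+#' head(read_ipc_stream(mtcars_raw))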
+#' @export
+write_to_raw <- function(x, format = c("stream", "file")) {
+ sink <- BufferOutputStream$create()
+ if (match.arg(format) == "stream") {
+ write_ipc_stream(x, sink)
+ } else {
+ write_feather(x, sink)
+ }
+ as.raw(buffer(sink))
+}
+
+#' Read Arrow IPC stream format
+#'
+#' Apache Arrow defines two formats for [serializing data for interprocess
+#' communication
+#' (IPC)](https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc):
+#' a "stream" format and a "file" format, known as Feather. `read_ipc_stream()`
+#' and [read_feather()] read those formats, respectively.
+#'
+#' `read_arrow()`, a wrapper around `read_ipc_stream()` and `read_feather()`,
+#' is deprecated. You should explicitly choose
+#' the function that will read the desired IPC format (stream or file) since
+#' a file or `InputStream` may contain either.
+#'
+#' @param file A character file name or URI, `raw` vector, an Arrow input stream,
+#' or a `FileSystem` with path (`SubTreeFileSystem`).
+#' If a file name or URI, an Arrow [InputStream] will be opened and
+#' closed when finished. If an input stream is provided, it will be left
+#' open.
+#' @param as_data_frame Should the function return a `data.frame` (default) or
+#' an Arrow [Table]?
+#' @param ... extra parameters passed to `read_feather()`.
+#'
+#' @return A `data.frame` if `as_data_frame` is `TRUE` (the default), or an
+#' Arrow [Table] otherwise
+#' @seealso [read_feather()] for reading IPC files. [RecordBatchReader] for a
+#' lower-level interface.
+#' @export
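+#' @examplesIf arrow_available()
+#' # A round-trip sketch: write a stream to a file, then read it back
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_ipc_stream(mtcars, tf)
+#' head(read_ipc_stream(tf))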
+read_ipc_stream <- function(file, as_data_frame = TRUE, ...) {
+ if (!inherits(file, "InputStream")) {
+ file <- make_readable_file(file)
+ on.exit(file$close())
+ }
+
+ # TODO: this could take col_select, like the other readers
+ # https://issues.apache.org/jira/browse/ARROW-6830
+ out <- RecordBatchStreamReader$create(file)$read_table()
+ if (as_data_frame) {
+ out <- as.data.frame(out)
+ }
+ out
+}
diff --git a/src/arrow/r/R/json.R b/src/arrow/r/R/json.R
new file mode 100644
index 000000000..0d54c8a8a
--- /dev/null
+++ b/src/arrow/r/R/json.R
@@ -0,0 +1,102 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Read a JSON file
+#'
+#' Using [JsonTableReader]
+#'
+#' @inheritParams read_delim_arrow
+#' @param schema [Schema] that describes the table.
+#' @param ... Additional options passed to `JsonTableReader$create()`
+#'
+#' @return A `data.frame`, or a Table if `as_data_frame = FALSE`.
+#' @export
+#' @examplesIf arrow_with_json()
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' writeLines('
+#' { "hello": 3.5, "world": false, "yo": "thing" }
+#' { "hello": 3.25, "world": null }
+#' { "hello": 0.0, "world": true, "yo": null }
+#' ', tf, useBytes = TRUE)
+#' df <- read_json_arrow(tf)
+read_json_arrow <- function(file,
+ col_select = NULL,
+ as_data_frame = TRUE,
+ schema = NULL,
+ ...) {
+ if (!inherits(file, "InputStream")) {
+ file <- make_readable_file(file)
+ on.exit(file$close())
+ }
+ tab <- JsonTableReader$create(file, schema = schema, ...)$Read()
+
+ col_select <- enquo(col_select)
+ if (!quo_is_null(col_select)) {
+ tab <- tab[vars_select(names(tab), !!col_select)]
+ }
+
+ if (isTRUE(as_data_frame)) {
+ tab <- as.data.frame(tab)
+ }
+ tab
+}
+
+#' @include arrow-package.R
+#' @rdname CsvTableReader
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @export
+JsonTableReader <- R6Class("JsonTableReader",
+ inherit = ArrowObject,
+ public = list(
+ Read = function() json___TableReader__Read(self)
+ )
+)
+JsonTableReader$create <- function(file,
+ read_options = JsonReadOptions$create(),
+ parse_options = JsonParseOptions$create(schema = schema),
+ schema = NULL,
+ ...) {
+ assert_is(file, "InputStream")
+ json___TableReader__Make(file, read_options, parse_options)
+}
+
+#' @rdname CsvReadOptions
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @export
+JsonReadOptions <- R6Class("JsonReadOptions", inherit = ArrowObject)
+JsonReadOptions$create <- function(use_threads = option_use_threads(), block_size = 1048576L) {
+ json___ReadOptions__initialize(use_threads, block_size)
+}
+
+#' @rdname CsvReadOptions
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @export
+JsonParseOptions <- R6Class("JsonParseOptions", inherit = ArrowObject)
+JsonParseOptions$create <- function(newlines_in_values = FALSE, schema = NULL) {
+ if (is.null(schema)) {
+ json___ParseOptions__initialize1(newlines_in_values)
+ } else {
+ json___ParseOptions__initialize2(newlines_in_values, schema)
+ }
+}
diff --git a/src/arrow/r/R/memory-pool.R b/src/arrow/r/R/memory-pool.R
new file mode 100644
index 000000000..2207ed6be
--- /dev/null
+++ b/src/arrow/r/R/memory-pool.R
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-package.R
+#'
+#' @title class arrow::MemoryPool
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' - `backend_name`: one of "jemalloc", "mimalloc", or "system". Alternative
+#' memory allocators are optionally enabled at build time. Windows builds
+#' generally have `mimalloc`, and most others have both `jemalloc` (used by
+#' default) and `mimalloc`. To change memory allocators at runtime, set the
+#' environment variable `ARROW_DEFAULT_MEMORY_POOL` to one of those strings
+#' prior to loading the `arrow` library.
+#' - `bytes_allocated`
+#' - `max_memory`
+#'
+#' @rdname MemoryPool
+#' @name MemoryPool
+#' @keywords internal
+MemoryPool <- R6Class("MemoryPool",
+ inherit = ArrowObject,
+ public = list(
+ # TODO: Allocate
+ # TODO: Reallocate
+ # TODO: Free
+ ),
+ active = list(
+ backend_name = function() MemoryPool__backend_name(self),
+ bytes_allocated = function() MemoryPool__bytes_allocated(self),
+ max_memory = function() MemoryPool__max_memory(self)
+ )
+)
+
+#' Arrow's default [MemoryPool]
+#'
+#' @return the default [MemoryPool]
+#' @export
+#' @keywords internal
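+#' @examplesIf arrow_available()
+#' # A quick sketch: inspect the active allocator
+#' pool <- default_memory_pool()
+#' pool$backend_name
+#' pool$bytes_allocated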
+default_memory_pool <- function() {
+ MemoryPool__default()
+}
diff --git a/src/arrow/r/R/message.R b/src/arrow/r/R/message.R
new file mode 100644
index 000000000..ef33f1623
--- /dev/null
+++ b/src/arrow/r/R/message.R
@@ -0,0 +1,97 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-package.R
+
+#' @title class arrow::Message
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname Message
+#' @name Message
+Message <- R6Class("Message",
+ inherit = ArrowObject,
+ public = list(
+ Equals = function(other, ...) {
+ inherits(other, "Message") && ipc___Message__Equals(self, other)
+ },
+ body_length = function() ipc___Message__body_length(self),
+ Verify = function() ipc___Message__Verify(self)
+ ),
+ active = list(
+ type = function() ipc___Message__type(self),
+ metadata = function() ipc___Message__metadata(self),
+ body = function() ipc___Message__body(self)
+ )
+)
+
+#' @title class arrow::MessageReader
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname MessageReader
+#' @name MessageReader
+#' @export
+MessageReader <- R6Class("MessageReader",
+ inherit = ArrowObject,
+ public = list(
+ ReadNextMessage = function() ipc___MessageReader__ReadNextMessage(self)
+ )
+)
+
+MessageReader$create <- function(stream) {
+ if (!inherits(stream, "InputStream")) {
+ stream <- BufferReader$create(stream)
+ }
+ ipc___MessageReader__Open(stream)
+}
+
+#' Read a Message from a stream
+#'
+#' @param stream an InputStream
+#'
+#' @export
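+#' @examplesIf arrow_available()
+#' # A sketch: the first message in an IPC stream is the schema message
+#' batch_raw <- write_to_raw(data.frame(x = 1:3))
+#' msg <- read_message(batch_raw)
+#' msg$type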
+read_message <- function(stream) {
+ UseMethod("read_message")
+}
+
+#' @export
+read_message.default <- function(stream) {
+ read_message(BufferReader$create(stream))
+}
+
+#' @export
+read_message.InputStream <- function(stream) {
+ ipc___ReadMessage(stream)
+}
+
+#' @export
+read_message.MessageReader <- function(stream) {
+ stream$ReadNextMessage()
+}
diff --git a/src/arrow/r/R/metadata.R b/src/arrow/r/R/metadata.R
new file mode 100644
index 000000000..768abeda7
--- /dev/null
+++ b/src/arrow/r/R/metadata.R
@@ -0,0 +1,210 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @importFrom utils object.size
+.serialize_arrow_r_metadata <- function(x) {
+ assert_is(x, "list")
+
+ # drop problems attributes (most likely from readr)
+ x[["attributes"]][["problems"]] <- NULL
+
+ out <- serialize(x, NULL, ascii = TRUE)
+
+ # if the metadata is over 100 kB, compress
+ if (option_compress_metadata() && object.size(out) > 100000) {
+ out_comp <- serialize(memCompress(out, type = "gzip"), NULL, ascii = TRUE)
+
+ # but ensure that the compression+serialization is effective.
+ if (object.size(out) > object.size(out_comp)) out <- out_comp
+ }
+
+ rawToChar(out)
+}
+
+.unserialize_arrow_r_metadata <- function(x) {
+ tryCatch(
+ expr = {
+ out <- unserialize(charToRaw(x))
+
+ # if this is still raw, try decompressing
+ if (is.raw(out)) {
+ out <- unserialize(memDecompress(out, type = "gzip"))
+ }
+ out
+ },
+ error = function(e) {
+ warning("Invalid metadata$r", call. = FALSE)
+ NULL
+ }
+ )
+}
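+
+# A round-trip sketch of the two helpers above:
+#   meta <- .serialize_arrow_r_metadata(list(attributes = list(foo = 1)))
+#   .unserialize_arrow_r_metadata(meta)$attributes$foo # 1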
+
+#' @importFrom rlang trace_back
+apply_arrow_r_metadata <- function(x, r_metadata) {
+ tryCatch(
+ expr = {
+ columns_metadata <- r_metadata$columns
+ if (is.data.frame(x)) {
+ if (length(names(x)) && !is.null(columns_metadata)) {
+ for (name in intersect(names(columns_metadata), names(x))) {
+ x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]])
+ }
+ }
+ } else if (is.list(x) && !inherits(x, "POSIXlt") && !is.null(columns_metadata)) {
+ # If we have a list and "columns_metadata" this applies row-level metadata
+ # inside of a column in a dataframe.
+
+ # However, if we are inside of a dplyr collection (including all datasets),
+ # we cannot apply this row-level metadata, since the order of the rows is
+ # not guaranteed to be the same, so don't even try, but warn what's going on
+ trace <- trace_back()
+ # TODO: remove `trace$calls %||% trace$call` once rlang > 0.4.11 is released
+ in_dplyr_collect <- any(map_lgl(trace$calls %||% trace$call, function(x) {
+ grepl("collect.arrow_dplyr_query", x, fixed = TRUE)[[1]]
+ }))
+ if (in_dplyr_collect) {
+ warning(
+ "Row-level metadata is not compatible with this operation and has ",
+ "been ignored",
+ call. = FALSE
+ )
+ } else {
+ x <- map2(x, columns_metadata, function(.x, .y) {
+ apply_arrow_r_metadata(.x, .y)
+ })
+ }
+ x
+ }
+
+ if (!is.null(r_metadata$attributes)) {
+ attributes(x)[names(r_metadata$attributes)] <- r_metadata$attributes
+ if (inherits(x, "POSIXlt")) {
+ # We store POSIXlt as a StructArray, which is translated back to R
+ # as a data.frame, but while data frames have a row.names = c(NA, nrow(x))
+ # attribute, POSIXlt does not, so since this is now no longer an object
+ # of class data.frame, remove the extraneous attribute
+ attr(x, "row.names") <- NULL
+ }
+ if (!is.null(attr(x, ".group_vars")) && requireNamespace("dplyr", quietly = TRUE)) {
+ x <- dplyr::group_by(x, !!!syms(attr(x, ".group_vars")))
+ attr(x, ".group_vars") <- NULL
+ }
+ }
+ },
+ error = function(e) {
+ warning("Invalid metadata$r", call. = FALSE)
+ }
+ )
+ x
+}
+
+remove_attributes <- function(x) {
+ removed_attributes <- character()
+ if (identical(class(x), c("tbl_df", "tbl", "data.frame"))) {
+ removed_attributes <- c("class", "row.names", "names")
+ } else if (inherits(x, "data.frame")) {
+ removed_attributes <- c("row.names", "names")
+ } else if (inherits(x, "factor")) {
+ removed_attributes <- c("class", "levels")
+ } else if (inherits(x, c("integer64", "Date", "arrow_binary", "arrow_large_binary"))) {
+ removed_attributes <- c("class")
+ } else if (inherits(x, "arrow_fixed_size_binary")) {
+ removed_attributes <- c("class", "byte_width")
+ } else if (inherits(x, "POSIXct")) {
+ removed_attributes <- c("class", "tzone")
+ } else if (inherits(x, "hms") || inherits(x, "difftime")) {
+ removed_attributes <- c("class", "units")
+ }
+ removed_attributes
+}
+
+arrow_attributes <- function(x, only_top_level = FALSE) {
+ if (inherits(x, "grouped_df")) {
+ # Keep only the group var names, not the rest of the cached data that dplyr
+ # uses, which may be large
+ if (requireNamespace("dplyr", quietly = TRUE)) {
+ gv <- dplyr::group_vars(x)
+ x <- dplyr::ungroup(x)
+ # ungroup() first, then set the attribute, because ungroup() would erase it
+ attr(x, ".group_vars") <- gv
+ } else {
+ # Regardless, we shouldn't keep groups around
+ attr(x, "groups") <- NULL
+ }
+ }
+ att <- attributes(x)
+
+ removed_attributes <- remove_attributes(x)
+
+ att <- att[setdiff(names(att), removed_attributes)]
+ if (isTRUE(only_top_level)) {
+ return(att)
+ }
+
+ if (is.data.frame(x)) {
+ columns <- map(x, arrow_attributes)
+ out <- if (length(att) || !all(map_lgl(columns, is.null))) {
+ list(attributes = att, columns = columns)
+ }
+ return(out)
+ }
+
+ columns <- NULL
+ attempt_to_save_row_level <- getOption("arrow.preserve_row_level_metadata", FALSE) &&
+ is.list(x) && !inherits(x, "POSIXlt")
+ if (attempt_to_save_row_level) {
+ # However, if we are inside a dataset write, we cannot save this
+ # row-level metadata, since the order of the rows is not guaranteed to
+ # be the same, so don't even try, but warn about what's going on
+ trace <- trace_back()
+ # TODO: remove `trace$calls %||% trace$call` once rlang > 0.4.11 is released
+ in_dataset_write <- any(map_lgl(trace$calls %||% trace$call, function(x) {
+ grepl("write_dataset", x, fixed = TRUE)[[1]]
+ }))
+ if (in_dataset_write) {
+ warning(
+ "Row-level metadata is not compatible with datasets and will be discarded",
+ call. = FALSE
+ )
+ } else {
+ # for list columns, we also keep attributes of each
+ # element in columns
+ columns <- map(x, arrow_attributes)
+ }
+ if (all(map_lgl(columns, is.null))) {
+ columns <- NULL
+ }
+ } else if (inherits(x, c("sfc", "sf"))) {
+ # Check if there are any columns that look like sf columns, warn that we will
+ # not be saving this data for now (but only if arrow.preserve_row_level_metadata
+ # is set to FALSE)
+ warning(
+ "One of the columns given appears to be an `sfc` SF column. Due to their unique ",
+ "nature, these columns do not convert to Arrow well. We are working on ",
+ "better ways to do this, but in the interim we recommend converting any `sfc` ",
+ "columns to WKB (well-known binary) columns before using them with Arrow ",
+ "(for example, with `sf::st_as_binary(col)`).",
+ call. = FALSE
+ )
+ }
+
+ if (length(att) || !is.null(columns)) {
+ list(attributes = att, columns = columns)
+ } else {
+ NULL
+ }
+}
diff --git a/src/arrow/r/R/parquet.R b/src/arrow/r/R/parquet.R
new file mode 100644
index 000000000..ee2ed57de
--- /dev/null
+++ b/src/arrow/r/R/parquet.R
@@ -0,0 +1,585 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Read a Parquet file
+#'
+#' '[Parquet](https://parquet.apache.org/)' is a columnar storage file format.
+#' This function enables you to read Parquet files into R.
+#'
+#' @inheritParams read_feather
+#' @param props [ParquetArrowReaderProperties]
+#' @param ... Additional arguments passed to `ParquetFileReader$create()`
+#'
+#' @return A [arrow::Table][Table], or a `data.frame` if `as_data_frame` is
+#' `TRUE` (the default).
+#' @examplesIf arrow_with_parquet()
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_parquet(mtcars, tf)
+#' df <- read_parquet(tf, col_select = starts_with("d"))
+#' head(df)
+#' @export
+read_parquet <- function(file,
+ col_select = NULL,
+ as_data_frame = TRUE,
+ props = ParquetArrowReaderProperties$create(),
+ ...) {
+ if (is.string(file)) {
+ file <- make_readable_file(file)
+ on.exit(file$close())
+ }
+ reader <- ParquetFileReader$create(file, props = props, ...)
+
+ col_select <- enquo(col_select)
+ if (!quo_is_null(col_select)) {
+ # infer which columns to keep from schema
+ schema <- reader$GetSchema()
+ names <- names(schema)
+ indices <- match(vars_select(names, !!col_select), names) - 1L
+ tab <- tryCatch(
+ reader$ReadTable(indices),
+ error = read_compressed_error
+ )
+ } else {
+ # read all columns
+ tab <- tryCatch(
+ reader$ReadTable(),
+ error = read_compressed_error
+ )
+ }
+
+ if (as_data_frame) {
+ tab <- as.data.frame(tab)
+ }
+ tab
+}
+
+#' Write Parquet file to disk
+#'
+#' [Parquet](https://parquet.apache.org/) is a columnar storage file format.
+#' This function enables you to write Parquet files from R.
+#'
+#' Due to features of the format, Parquet files cannot be appended to.
+#' If you want to use the Parquet format but also want the ability to extend
+#' your dataset, you can write to additional Parquet files and then treat
+#' the whole directory of files as a [Dataset] you can query.
+#' See `vignette("dataset", package = "arrow")` for examples of this.
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param chunk_size chunk size in number of rows. If NULL, the total number of rows is used.
+#' @param version Parquet version, "1.0" or "2.0". Default "1.0". Numeric values
+#' are coerced to character.
+#' @param compression compression algorithm. Default "snappy". See details.
+#' @param compression_level compression level. Meaning depends on compression algorithm
+#' @param use_dictionary Specify if we should use dictionary encoding. Default `TRUE`
+#' @param write_statistics Specify if we should write statistics. Default `TRUE`
+#' @param data_page_size Set a target threshold for the approximate encoded
+#' size of data pages within a column chunk (in bytes). Default 1 MiB.
+#' @param use_deprecated_int96_timestamps Write timestamps to INT96 Parquet format. Default `FALSE`.
+#' @param coerce_timestamps Cast timestamps to a particular resolution. Can be
+#' `NULL`, "ms" or "us". Default `NULL` (no casting)
+#' @param allow_truncated_timestamps Allow loss of data when coercing timestamps to a
+#' particular resolution. E.g. if microsecond or nanosecond data is lost when coercing
+#' to "ms", do not raise an exception
+#' @param properties A `ParquetWriterProperties` object, used instead of the options
+#' enumerated in this function's signature. Providing `properties` as an argument
+#' is deprecated; if you need to assemble `ParquetWriterProperties` outside
+#' of `write_parquet()`, use `ParquetFileWriter` instead.
+#' @param arrow_properties A `ParquetArrowWriterProperties` object. Like
+#' `properties`, this argument is deprecated.
+#'
+#' @details The parameters `compression`, `compression_level`, `use_dictionary` and
+#' `write_statistics` support various patterns:
+#'
+#' - The default `NULL` leaves the parameter unspecified, and the C++ library
+#' uses an appropriate default for each column (defaults listed above)
+#' - A single, unnamed, value (e.g. a single string for `compression`) applies to all columns
+#' - An unnamed vector, of the same size as the number of columns, to specify a
+#' value for each column, in positional order
+#' - A named vector, to specify the value for the named columns; the default
+#' value for the setting is used for columns not named
+#'
+#' The `compression` argument can be any of the following (case insensitive):
+#' "uncompressed", "snappy", "gzip", "brotli", "zstd", "lz4", "lzo" or "bz2".
+#' Only "uncompressed" is guaranteed to be available, but "snappy" and "gzip"
+#' are almost always included. See [codec_is_available()].
+#' The default "snappy" is used if available, otherwise "uncompressed". To
+#' disable compression, set `compression = "uncompressed"`.
+#' Note that "uncompressed" columns may still have dictionary encoding.
+#'
+#' @return the input `x` invisibly.
+#'
+#' @examplesIf arrow_with_parquet()
+#' tf1 <- tempfile(fileext = ".parquet")
+#' write_parquet(data.frame(x = 1:5), tf1)
+#'
+#' # using compression
+#' if (codec_is_available("gzip")) {
+#' tf2 <- tempfile(fileext = ".gz.parquet")
+#' write_parquet(data.frame(x = 1:5), tf2, compression = "gzip", compression_level = 5)
+#' }
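+#'
+#' # Named vectors set an option per column; a sketch assuming columns x and y
+#' tf3 <- tempfile(fileext = ".parquet")
+#' write_parquet(
+#'   data.frame(x = 1:5, y = letters[1:5]), tf3,
+#'   use_dictionary = c(y = TRUE)
+#' )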
+#' @export
+write_parquet <- function(x,
+ sink,
+ chunk_size = NULL,
+ # writer properties
+ version = NULL,
+ compression = default_parquet_compression(),
+ compression_level = NULL,
+ use_dictionary = NULL,
+ write_statistics = NULL,
+ data_page_size = NULL,
+ # arrow writer properties
+ use_deprecated_int96_timestamps = FALSE,
+ coerce_timestamps = NULL,
+ allow_truncated_timestamps = FALSE,
+ properties = NULL,
+ arrow_properties = NULL) {
+ x_out <- x
+
+ if (is.data.frame(x) || inherits(x, "RecordBatch")) {
+ x <- Table$create(x)
+ }
+
+ assert_that(is_writable_table(x))
+
+ if (!inherits(sink, "OutputStream")) {
+ sink <- make_output_stream(sink)
+ on.exit(sink$close())
+ }
+
+ # Deprecation warnings
+ if (!is.null(properties)) {
+ warning(
+ "Providing 'properties' is deprecated. If you need to assemble properties outside ",
+ "this function, use ParquetFileWriter instead."
+ )
+ }
+ if (!is.null(arrow_properties)) {
+ warning(
+ "Providing 'arrow_properties' is deprecated. If you need to assemble arrow_properties ",
+ "outside this function, use ParquetFileWriter instead."
+ )
+ }
+
+ writer <- ParquetFileWriter$create(
+ x$schema,
+ sink,
+ properties = properties %||% ParquetWriterProperties$create(
+ x,
+ version = version,
+ compression = compression,
+ compression_level = compression_level,
+ use_dictionary = use_dictionary,
+ write_statistics = write_statistics,
+ data_page_size = data_page_size
+ ),
+ arrow_properties = arrow_properties %||% ParquetArrowWriterProperties$create(
+ use_deprecated_int96_timestamps = use_deprecated_int96_timestamps,
+ coerce_timestamps = coerce_timestamps,
+ allow_truncated_timestamps = allow_truncated_timestamps
+ )
+ )
+ writer$WriteTable(x, chunk_size = chunk_size %||% x$num_rows)
+ writer$Close()
+
+ invisible(x_out)
+}
+
+default_parquet_compression <- function() {
+ # Match the pyarrow default (overriding the C++ default)
+ if (codec_is_available("snappy")) {
+ "snappy"
+ } else {
+ NULL
+ }
+}
+
+ParquetArrowWriterProperties <- R6Class("ParquetArrowWriterProperties", inherit = ArrowObject)
+ParquetArrowWriterProperties$create <- function(use_deprecated_int96_timestamps = FALSE,
+ coerce_timestamps = NULL,
+ allow_truncated_timestamps = FALSE,
+ ...) {
+ if (is.null(coerce_timestamps)) {
+ timestamp_unit <- -1L # null sentinel value
+ } else {
+ timestamp_unit <- make_valid_time_unit(
+ coerce_timestamps,
+ c("ms" = TimeUnit$MILLI, "us" = TimeUnit$MICRO)
+ )
+ }
+ parquet___ArrowWriterProperties___create(
+ use_deprecated_int96_timestamps = isTRUE(use_deprecated_int96_timestamps),
+ timestamp_unit = timestamp_unit,
+ allow_truncated_timestamps = isTRUE(allow_truncated_timestamps)
+ )
+}
+
+valid_parquet_version <- c(
+ "1.0" = ParquetVersionType$PARQUET_1_0,
+ "2.0" = ParquetVersionType$PARQUET_2_0
+)
+
+make_valid_version <- function(version, valid_versions = valid_parquet_version) {
+ if (is_integerish(version)) {
+ version <- as.character(version)
+ }
+ tryCatch(
+ valid_versions[[match.arg(version, choices = names(valid_versions))]],
+ error = function(cond) {
+ stop('"version" should be one of ', oxford_paste(names(valid_versions), "or"), call. = FALSE)
+ }
+ )
+}
+
+#' @title ParquetWriterProperties class
+#' @rdname ParquetWriterProperties
+#' @name ParquetWriterProperties
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description This class holds settings to control how a Parquet file is
+#' written by [ParquetFileWriter].
+#'
+#' @section Factory:
+#'
+#' The `ParquetWriterProperties$create()` factory method instantiates the object
+#' and takes the following arguments:
+#'
+#' - `table`: table to write (required)
+#' - `version`: Parquet version, "1.0" or "2.0". Default "1.0"
+#' - `compression`: Compression algorithm. Default `"uncompressed"`
+#' - `compression_level`: Compression level; meaning depends on compression algorithm
+#' - `use_dictionary`: Specify if we should use dictionary encoding. Default `TRUE`
+#' - `write_statistics`: Specify if we should write statistics. Default `TRUE`
+#' - `data_page_size`: Set a target threshold for the approximate encoded
+#' size of data pages within a column chunk (in bytes). Default 1 MiB.
+#'
+#' @details The parameters `compression`, `compression_level`, `use_dictionary`,
+#' and `write_statistics` support various patterns:
+#'
+#' - The default `NULL` leaves the parameter unspecified, and the C++ library
+#' uses an appropriate default for each column (defaults listed above)
+#' - A single, unnamed, value (e.g. a single string for `compression`) applies to all columns
+#' - An unnamed vector, of the same size as the number of columns, to specify a
+#' value for each column, in positional order
+#' - A named vector, to specify the value for the named columns; the default
+#' value for the setting is used for columns not named
+#'
+#' Unlike the high-level [write_parquet], `ParquetWriterProperties` arguments
+#' use the C++ defaults. Currently this means "uncompressed" rather than
+#' "snappy" for the `compression` argument.
+#'
+#' @seealso [write_parquet]
+#' @seealso [Schema] for information about schemas and metadata handling.
+#'
+#' @export
+ParquetWriterProperties <- R6Class("ParquetWriterProperties", inherit = ArrowObject)
+ParquetWriterPropertiesBuilder <- R6Class("ParquetWriterPropertiesBuilder",
+ inherit = ArrowObject,
+ public = list(
+ set_version = function(version) {
+ parquet___WriterProperties___Builder__version(self, make_valid_version(version))
+ },
+ set_compression = function(table, compression) {
+ compression <- compression_from_name(compression)
+ assert_that(is.integer(compression))
+ private$.set(
+ table, compression,
+ parquet___ArrowWriterProperties___Builder__set_compressions
+ )
+ },
+ set_compression_level = function(table, compression_level) {
+ # cast to integer but keep names
+ compression_level <- set_names(as.integer(compression_level), names(compression_level))
+ private$.set(
+ table, compression_level,
+ parquet___ArrowWriterProperties___Builder__set_compression_levels
+ )
+ },
+ set_dictionary = function(table, use_dictionary) {
+ assert_that(is.logical(use_dictionary))
+ private$.set(
+ table, use_dictionary,
+ parquet___ArrowWriterProperties___Builder__set_use_dictionary
+ )
+ },
+ set_write_statistics = function(table, write_statistics) {
+ assert_that(is.logical(write_statistics))
+ private$.set(
+ table, write_statistics,
+ parquet___ArrowWriterProperties___Builder__set_write_statistics
+ )
+ },
+ set_data_page_size = function(data_page_size) {
+ parquet___ArrowWriterProperties___Builder__data_page_size(self, data_page_size)
+ }
+ ),
+ private = list(
+ .set = function(table, value, FUN) {
+ msg <- paste0("unsupported ", substitute(value), "= specification")
+ column_names <- names(table)
+ given_names <- names(value)
+ if (is.null(given_names)) {
+ if (length(value) %in% c(1L, length(column_names))) {
+ # If there's a single, unnamed value, FUN will set it globally
+ # If there are values for all columns, send them along with the names
+ FUN(self, column_names, value)
+ } else {
+ abort(msg)
+ }
+ } else if (all(given_names %in% column_names)) {
+ # Use the given names
+ FUN(self, given_names, value)
+ } else {
+ abort(msg)
+ }
+ }
+ )
+)
+
+ParquetWriterProperties$create <- function(table,
+ version = NULL,
+ compression = default_parquet_compression(),
+ compression_level = NULL,
+ use_dictionary = NULL,
+ write_statistics = NULL,
+ data_page_size = NULL,
+ ...) {
+ builder <- parquet___WriterProperties___Builder__create()
+ if (!is.null(version)) {
+ builder$set_version(version)
+ }
+ if (!is.null(compression)) {
+ builder$set_compression(table, compression = compression)
+ }
+ if (!is.null(compression_level)) {
+ builder$set_compression_level(table, compression_level = compression_level)
+ }
+ if (!is.null(use_dictionary)) {
+ builder$set_dictionary(table, use_dictionary)
+ }
+ if (!is.null(write_statistics)) {
+ builder$set_write_statistics(table, write_statistics)
+ }
+ if (!is.null(data_page_size)) {
+ builder$set_data_page_size(data_page_size)
+ }
+ parquet___WriterProperties___Builder__build(builder)
+}
+
+#' @title ParquetFileWriter class
+#' @rdname ParquetFileWriter
+#' @name ParquetFileWriter
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description This class enables you to interact with Parquet files.
+#'
+#' @section Factory:
+#'
+#' The `ParquetFileWriter$create()` factory method instantiates the object and
+#' takes the following arguments:
+#'
+#' - `schema` A [Schema]
+#' - `sink` An [arrow::io::OutputStream][OutputStream]
+#' - `properties` An instance of [ParquetWriterProperties]
+#' - `arrow_properties` An instance of `ParquetArrowWriterProperties`
+#'
+#' @section Methods:
+#'
+#' - `WriteTable` Write a [Table] to `sink`
+#' - `Close` Close the writer. Note: does not close the `sink`.
+#' [arrow::io::OutputStream][OutputStream] has its own `close()` method.
+#'
+#' @export
+#' @include arrow-package.R
+ParquetFileWriter <- R6Class("ParquetFileWriter",
+ inherit = ArrowObject,
+ public = list(
+ WriteTable = function(table, chunk_size) {
+ parquet___arrow___FileWriter__WriteTable(self, table, chunk_size)
+ },
+ Close = function() parquet___arrow___FileWriter__Close(self)
+ )
+)
+ParquetFileWriter$create <- function(schema,
+ sink,
+ properties = ParquetWriterProperties$create(),
+ arrow_properties = ParquetArrowWriterProperties$create()) {
+ assert_is(sink, "OutputStream")
+ parquet___arrow___ParquetFileWriter__Open(schema, sink, properties, arrow_properties)
+}
+
+
+#' @title ParquetFileReader class
+#' @rdname ParquetFileReader
+#' @name ParquetFileReader
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description This class enables you to interact with Parquet files.
+#'
+#' @section Factory:
+#'
+#' The `ParquetFileReader$create()` factory method instantiates the object and
+#' takes the following arguments:
+#'
+#' - `file` A character file name, raw vector, or Arrow file connection object
+#' (e.g. `RandomAccessFile`).
+#' - `props` Optional [ParquetArrowReaderProperties]
+#' - `mmap` Logical: whether to memory-map the file (default `TRUE`)
+#' - `...` Additional arguments, currently ignored
+#'
+#' @section Methods:
+#'
+#' - `$ReadTable(column_indices)`: get an `arrow::Table` from the file. The optional
+#' `column_indices=` argument is a 0-based integer vector indicating which columns to retain.
+#' - `$ReadRowGroup(i, column_indices)`: get an `arrow::Table` by reading the `i`th row group (0-based).
+#' The optional `column_indices=` argument is a 0-based integer vector indicating which columns to retain.
+#' - `$ReadRowGroups(row_groups, column_indices)`: get an `arrow::Table` by reading several row
+#' groups (0-based integers).
+#' The optional `column_indices=` argument is a 0-based integer vector indicating which columns to retain.
+#' - `$GetSchema()`: get the `arrow::Schema` of the data in the file
+#' - `$ReadColumn(i)`: read the `i`th column (0-based) as a [ChunkedArray].
+#'
+#' @section Active bindings:
+#'
+#' - `$num_rows`: number of rows.
+#' - `$num_columns`: number of columns.
+#' - `$num_row_groups`: number of row groups.
+#'
+#' @export
+#' @examplesIf arrow_with_parquet()
+#' f <- system.file("v0.7.1.parquet", package = "arrow")
+#' pq <- ParquetFileReader$create(f)
+#' pq$GetSchema()
+#' if (codec_is_available("snappy")) {
+#' # This file has compressed data columns
+#' tab <- pq$ReadTable()
+#' tab$schema
+#' }
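+#' # File-level metadata is available without reading any data, e.g.:
+#' pq$num_rows
+#' pq$num_row_groups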
+#' @include arrow-package.R
+ParquetFileReader <- R6Class("ParquetFileReader",
+ inherit = ArrowObject,
+ active = list(
+ num_rows = function() {
+ as.integer(parquet___arrow___FileReader__num_rows(self))
+ },
+ num_columns = function() {
+ parquet___arrow___FileReader__num_columns(self)
+ },
+ num_row_groups = function() {
+ parquet___arrow___FileReader__num_row_groups(self)
+ }
+ ),
+ public = list(
+ ReadTable = function(column_indices = NULL) {
+ if (is.null(column_indices)) {
+ parquet___arrow___FileReader__ReadTable1(self)
+ } else {
+ column_indices <- vec_cast(column_indices, integer())
+ parquet___arrow___FileReader__ReadTable2(self, column_indices)
+ }
+ },
+ ReadRowGroup = function(i, column_indices = NULL) {
+ i <- vec_cast(i, integer())
+ if (is.null(column_indices)) {
+ parquet___arrow___FileReader__ReadRowGroup1(self, i)
+ } else {
+ column_indices <- vec_cast(column_indices, integer())
+ parquet___arrow___FileReader__ReadRowGroup2(self, i, column_indices)
+ }
+ },
+ ReadRowGroups = function(row_groups, column_indices = NULL) {
+ row_groups <- vec_cast(row_groups, integer())
+ if (is.null(column_indices)) {
+ parquet___arrow___FileReader__ReadRowGroups1(self, row_groups)
+ } else {
+ column_indices <- vec_cast(column_indices, integer())
+ parquet___arrow___FileReader__ReadRowGroups2(self, row_groups, column_indices)
+ }
+ },
+ ReadColumn = function(i) {
+ i <- vec_cast(i, integer())
+ parquet___arrow___FileReader__ReadColumn(self, i)
+ },
+ GetSchema = function() {
+ parquet___arrow___FileReader__GetSchema(self)
+ }
+ )
+)
+
+ParquetFileReader$create <- function(file,
+ props = ParquetArrowReaderProperties$create(),
+ mmap = TRUE,
+ ...) {
+ file <- make_readable_file(file, mmap)
+ assert_is(props, "ParquetArrowReaderProperties")
+
+ parquet___arrow___FileReader__OpenFile(file, props)
+}
+
+#' @title ParquetArrowReaderProperties class
+#' @rdname ParquetArrowReaderProperties
+#' @name ParquetArrowReaderProperties
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description This class holds settings to control how a Parquet file is read
+#' by [ParquetFileReader].
+#'
+#' @section Factory:
+#'
+#' The `ParquetArrowReaderProperties$create()` factory method instantiates the object
+#' and takes the following arguments:
+#'
+#' - `use_threads` Logical: whether to use multithreading (default `TRUE`)
+#'
+#' @section Methods:
+#'
+#' - `$read_dictionary(column_index)`
+#' - `$set_read_dictionary(column_index, read_dict)`
+#' - `$use_threads(use_threads)`
+#'
+#' @export
+ParquetArrowReaderProperties <- R6Class("ParquetArrowReaderProperties",
+ inherit = ArrowObject,
+ public = list(
+ read_dictionary = function(column_index) {
+ parquet___arrow___ArrowReaderProperties__get_read_dictionary(self, column_index)
+ },
+ set_read_dictionary = function(column_index, read_dict) {
+ parquet___arrow___ArrowReaderProperties__set_read_dictionary(self, column_index, read_dict)
+ }
+ ),
+ active = list(
+ use_threads = function(use_threads) {
+ if (missing(use_threads)) {
+ parquet___arrow___ArrowReaderProperties__get_use_threads(self)
+ } else {
+ parquet___arrow___ArrowReaderProperties__set_use_threads(self, use_threads)
+ }
+ }
+ )
+)
+
+ParquetArrowReaderProperties$create <- function(use_threads = option_use_threads()) {
+ parquet___arrow___ArrowReaderProperties__Make(isTRUE(use_threads))
+}
diff --git a/src/arrow/r/R/python.R b/src/arrow/r/R/python.R
new file mode 100644
index 000000000..07cd4456b
--- /dev/null
+++ b/src/arrow/r/R/python.R
@@ -0,0 +1,225 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+py_to_r.pyarrow.lib.Array <- function(x, ...) {
+ schema_ptr <- allocate_arrow_schema()
+ array_ptr <- allocate_arrow_array()
+ on.exit({
+ delete_arrow_schema(schema_ptr)
+ delete_arrow_array(array_ptr)
+ })
+
+ x$`_export_to_c`(array_ptr, schema_ptr)
+ Array$import_from_c(array_ptr, schema_ptr)
+}
+
+r_to_py.Array <- function(x, convert = FALSE) {
+ schema_ptr <- allocate_arrow_schema()
+ array_ptr <- allocate_arrow_array()
+ on.exit({
+ delete_arrow_schema(schema_ptr)
+ delete_arrow_array(array_ptr)
+ })
+
+ # Import with convert = FALSE so that `_import_from_c` returns a Python object
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ x$export_to_c(array_ptr, schema_ptr)
+ out <- pa$Array$`_import_from_c`(array_ptr, schema_ptr)
+ # But set the convert attribute on the return object to the requested value
+ assign("convert", convert, out)
+ out
+}
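+
+# With these S3 methods registered with reticulate, conversion works in both
+# directions. A sketch, assuming pyarrow is installed in the active Python:
+#   pa <- reticulate::import("pyarrow")
+#   py_arr <- reticulate::r_to_py(Array$create(1:3)) # R Array -> pyarrow.Array
+#   reticulate::py_to_r(py_arr)                      # and back again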
+
+py_to_r.pyarrow.lib.RecordBatch <- function(x, ...) {
+ schema_ptr <- allocate_arrow_schema()
+ array_ptr <- allocate_arrow_array()
+ on.exit({
+ delete_arrow_schema(schema_ptr)
+ delete_arrow_array(array_ptr)
+ })
+
+ x$`_export_to_c`(array_ptr, schema_ptr)
+
+ RecordBatch$import_from_c(array_ptr, schema_ptr)
+}
+
+r_to_py.RecordBatch <- function(x, convert = FALSE) {
+ schema_ptr <- allocate_arrow_schema()
+ array_ptr <- allocate_arrow_array()
+ on.exit({
+ delete_arrow_schema(schema_ptr)
+ delete_arrow_array(array_ptr)
+ })
+
+ # Import with convert = FALSE so that `_import_from_c` returns a Python object
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ x$export_to_c(array_ptr, schema_ptr)
+ out <- pa$RecordBatch$`_import_from_c`(array_ptr, schema_ptr)
+ # But set the convert attribute on the return object to the requested value
+ assign("convert", convert, out)
+ out
+}
+
+r_to_py.ChunkedArray <- function(x, convert = FALSE) {
+ # Import with convert = FALSE so that `_import_from_c` returns a Python object
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ out <- pa$chunked_array(x$chunks)
+ # But set the convert attribute on the return object to the requested value
+ assign("convert", convert, out)
+ out
+}
+
+py_to_r.pyarrow.lib.ChunkedArray <- function(x, ...) {
+ ChunkedArray$create(!!!maybe_py_to_r(x$chunks))
+}
+
+r_to_py.Table <- function(x, convert = FALSE) {
+ # Import with convert = FALSE so that `_import_from_c` returns a Python object
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ out <- pa$Table$from_arrays(x$columns, schema = x$schema)
+ # But set the convert attribute on the return object to the requested value
+ assign("convert", convert, out)
+ out
+}
+
+py_to_r.pyarrow.lib.Table <- function(x, ...) {
+ colnames <- maybe_py_to_r(x$column_names)
+ r_cols <- maybe_py_to_r(x$columns)
+ names(r_cols) <- colnames
+ Table$create(!!!r_cols, schema = maybe_py_to_r(x$schema))
+}
+
+py_to_r.pyarrow.lib.Schema <- function(x, ...) {
+ schema_ptr <- allocate_arrow_schema()
+ on.exit(delete_arrow_schema(schema_ptr))
+
+ x$`_export_to_c`(schema_ptr)
+ Schema$import_from_c(schema_ptr)
+}
+
+r_to_py.Schema <- function(x, convert = FALSE) {
+ schema_ptr <- allocate_arrow_schema()
+ on.exit(delete_arrow_schema(schema_ptr))
+
+ # Import with convert = FALSE so that `_import_from_c` returns a Python object
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ x$export_to_c(schema_ptr)
+ out <- pa$Schema$`_import_from_c`(schema_ptr)
+ # But set the convert attribute on the return object to the requested value
+ assign("convert", convert, out)
+ out
+}
+
+py_to_r.pyarrow.lib.Field <- function(x, ...) {
+ schema_ptr <- allocate_arrow_schema()
+ on.exit(delete_arrow_schema(schema_ptr))
+
+ x$`_export_to_c`(schema_ptr)
+ Field$import_from_c(schema_ptr)
+}
+
+r_to_py.Field <- function(x, convert = FALSE) {
+ schema_ptr <- allocate_arrow_schema()
+ on.exit(delete_arrow_schema(schema_ptr))
+
+ # Import with convert = FALSE so that `_import_from_c` returns a Python object
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ x$export_to_c(schema_ptr)
+ out <- pa$Field$`_import_from_c`(schema_ptr)
+ # But set the convert attribute on the return object to the requested value
+ assign("convert", convert, out)
+ out
+}
+
+py_to_r.pyarrow.lib.DataType <- function(x, ...) {
+ schema_ptr <- allocate_arrow_schema()
+ on.exit(delete_arrow_schema(schema_ptr))
+
+ x$`_export_to_c`(schema_ptr)
+ DataType$import_from_c(schema_ptr)
+}
+
+r_to_py.DataType <- function(x, convert = FALSE) {
+ schema_ptr <- allocate_arrow_schema()
+ on.exit(delete_arrow_schema(schema_ptr))
+
+ # Import with convert = FALSE so that `_import_from_c` returns a Python object
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ x$export_to_c(schema_ptr)
+ out <- pa$DataType$`_import_from_c`(schema_ptr)
+ # But set the convert attribute on the return object to the requested value
+ assign("convert", convert, out)
+ out
+}
+
+py_to_r.pyarrow.lib.RecordBatchReader <- function(x, ...) {
+ stream_ptr <- allocate_arrow_array_stream()
+ on.exit(delete_arrow_array_stream(stream_ptr))
+
+ x$`_export_to_c`(stream_ptr)
+ RecordBatchReader$import_from_c(stream_ptr)
+}
+
+r_to_py.RecordBatchReader <- function(x, convert = FALSE) {
+ stream_ptr <- allocate_arrow_array_stream()
+ on.exit(delete_arrow_array_stream(stream_ptr))
+
+ # Import with convert = FALSE so that `_import_from_c` returns a Python object
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ x$export_to_c(stream_ptr)
+ # TODO: handle subclasses of RecordBatchReader?
+ out <- pa$lib$RecordBatchReader$`_import_from_c`(stream_ptr)
+ # But set the convert attribute on the return object to the requested value
+ assign("convert", convert, out)
+ out
+}
+
+
+maybe_py_to_r <- function(x) {
+ if (inherits(x, "python.builtin.object")) {
+ # Depending on some auto-convert behavior, x may already be converted
+ # or it may still be a Python object
+ x <- reticulate::py_to_r(x)
+ }
+ x
+}
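+
+# Illustrative round trip (a sketch; assumes reticulate and pyarrow are
+# installed and importable). reticulate dispatches to the r_to_py()/py_to_r()
+# methods defined above automatically:
+#   tab <- arrow_table(x = 1:3)
+#   py_tab <- reticulate::r_to_py(tab)   # dispatches to r_to_py.Table
+#   tab2 <- reticulate::py_to_r(py_tab)  # dispatches to py_to_r.pyarrow.lib.Table
+#   tab$Equals(tab2)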
+
+#' Install pyarrow for use with reticulate
+#'
+#' `pyarrow` is the Python package for Apache Arrow. This function helps with
+#' installing it for use with `reticulate`.
+#'
+#' @param envname The name or full path of the Python environment to install
+#' into. This can be a virtualenv or conda environment created by `reticulate`.
+#' See `reticulate::py_install()`.
+#' @param nightly logical: Should we install a development version of the
+#' package? Default is to use the official release version.
+#' @param ... additional arguments passed to `reticulate::py_install()`.
+#' @export
+install_pyarrow <- function(envname = NULL, nightly = FALSE, ...) {
+ if (nightly) {
+ reticulate::py_install("pyarrow",
+ envname = envname, ...,
+ # Nightly for pip
+ pip_options = "--extra-index-url https://repo.fury.io/arrow-nightlies/ --pre --upgrade",
+ # Nightly for conda
+ channel = "arrow-nightlies"
+ )
+ } else {
+ reticulate::py_install("pyarrow", envname = envname, ...)
+ }
+}
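+
+# Illustrative usage (a sketch; requires a working reticulate Python setup):
+#   install_pyarrow("r-reticulate")                  # official release version
+#   install_pyarrow("r-reticulate", nightly = TRUE)  # development build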
diff --git a/src/arrow/r/R/query-engine.R b/src/arrow/r/R/query-engine.R
new file mode 100644
index 000000000..234aaf569
--- /dev/null
+++ b/src/arrow/r/R/query-engine.R
@@ -0,0 +1,298 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+do_exec_plan <- function(.data) {
+ plan <- ExecPlan$create()
+ final_node <- plan$Build(.data)
+ tab <- plan$Run(final_node)
+
+ # TODO (ARROW-14289): make the head/tail methods return RBR not Table
+ if (inherits(tab, "RecordBatchReader")) {
+ tab <- tab$read_table()
+ }
+
+ # If arrange() created $temp_columns, make sure to omit them from the result
+ # We can't currently handle this in the ExecPlan itself because sorting
+  # happens at the end (in the SinkNode), so nothing comes after it.
+ if (length(final_node$sort$temp_columns) > 0) {
+ tab <- tab[, setdiff(names(tab), final_node$sort$temp_columns), drop = FALSE]
+ }
+
+ if (ncol(tab)) {
+ # Apply any column metadata from the original schema, where appropriate
+ original_schema <- source_data(.data)$schema
+ # TODO: do we care about other (non-R) metadata preservation?
+ # How would we know if it were meaningful?
+ r_meta <- original_schema$r_metadata
+ if (!is.null(r_meta)) {
+ # Filter r_metadata$columns on columns with name _and_ type match
+ new_schema <- tab$schema
+ common_names <- intersect(names(r_meta$columns), names(tab))
+ keep <- common_names[
+ map_lgl(common_names, ~ original_schema[[.]] == new_schema[[.]])
+ ]
+ r_meta$columns <- r_meta$columns[keep]
+ if (has_aggregation(.data)) {
+ # dplyr drops top-level attributes if you do summarize
+ r_meta$attributes <- NULL
+ }
+ tab$r_metadata <- r_meta
+ }
+ }
+
+ tab
+}
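+
+# Illustrative flow (a sketch): dplyr verbs on Arrow objects lazily build an
+# arrow_dplyr_query; collect() ultimately funnels it through do_exec_plan(),
+# roughly:
+#   ds <- InMemoryDataset$create(arrow_table(x = 1:10))
+#   q <- dplyr::filter(ds, x > 5)  # an arrow_dplyr_query; nothing evaluated yet
+#   tab <- do_exec_plan(q)         # Build() the node graph, then Run() it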
+
+ExecPlan <- R6Class("ExecPlan",
+ inherit = ArrowObject,
+ public = list(
+ Scan = function(dataset) {
+ # Handle arrow_dplyr_query
+ if (inherits(dataset, "arrow_dplyr_query")) {
+ if (inherits(dataset$.data, "RecordBatchReader")) {
+ return(ExecNode_ReadFromRecordBatchReader(self, dataset$.data))
+ }
+
+ filter <- dataset$filtered_rows
+ if (isTRUE(filter)) {
+ filter <- Expression$scalar(TRUE)
+ }
+      # Use FieldsInExpression to find all fields referenced in dataset$selected_columns
+ colnames <- unique(unlist(map(
+ dataset$selected_columns,
+ field_names_in_expression
+ )))
+ dataset <- dataset$.data
+ assert_is(dataset, "Dataset")
+ } else {
+ if (inherits(dataset, "ArrowTabular")) {
+ dataset <- InMemoryDataset$create(dataset)
+ }
+ assert_is(dataset, "Dataset")
+ # Set some defaults
+ filter <- Expression$scalar(TRUE)
+ colnames <- names(dataset)
+ }
+ # ScanNode needs the filter to do predicate pushdown and skip partitions,
+ # and it needs to know which fields to materialize (and which are unnecessary)
+ ExecNode_Scan(self, dataset, filter, colnames %||% character(0))
+ },
+ Build = function(.data) {
+ # This method takes an arrow_dplyr_query and chains together the
+ # ExecNodes that they produce. It does not evaluate them--that is Run().
+ group_vars <- dplyr::group_vars(.data)
+ grouped <- length(group_vars) > 0
+
+ # Collect the target names first because we have to add back the group vars
+ target_names <- names(.data)
+ .data <- ensure_group_vars(.data)
+ .data <- ensure_arrange_vars(.data) # this sets .data$temp_columns
+
+ if (inherits(.data$.data, "arrow_dplyr_query")) {
+ # We have a nested query. Recurse.
+ node <- self$Build(.data$.data)
+ } else {
+ node <- self$Scan(.data)
+ }
+
+ # ARROW-13498: Even though Scan takes the filter, apparently we have to do it again
+ if (inherits(.data$filtered_rows, "Expression")) {
+ node <- node$Filter(.data$filtered_rows)
+ }
+
+ if (!is.null(.data$aggregations)) {
+ # Project to include just the data required for each aggregation,
+ # plus group_by_vars (last)
+ # TODO: validate that none of names(aggregations) are the same as names(group_by_vars)
+ # dplyr does not error on this but the result it gives isn't great
+ node <- node$Project(summarize_projection(.data))
+
+ if (grouped) {
+ # We need to prefix all of the aggregation function names with "hash_"
+ .data$aggregations <- lapply(.data$aggregations, function(x) {
+ x[["fun"]] <- paste0("hash_", x[["fun"]])
+ x
+ })
+ }
+
+ node <- node$Aggregate(
+ options = map(.data$aggregations, ~ .[c("fun", "options")]),
+ target_names = names(.data$aggregations),
+ out_field_names = names(.data$aggregations),
+ key_names = group_vars
+ )
+
+ if (grouped) {
+ # The result will have result columns first then the grouping cols.
+ # dplyr orders group cols first, so adapt the result to meet that expectation.
+ node <- node$Project(
+ make_field_refs(c(group_vars, names(.data$aggregations)))
+ )
+ if (getOption("arrow.summarise.sort", FALSE)) {
+ # Add sorting instructions for the rows too to match dplyr
+ # (see below about why sorting isn't itself a Node)
+ node$sort <- list(
+ names = group_vars,
+ orders = rep(0L, length(group_vars))
+ )
+ }
+ }
+ } else {
+ # If any columns are derived, reordered, or renamed we need to Project
+ # If there are aggregations, the projection was already handled above
+ # We have to project at least once to eliminate some junk columns
+ # that the ExecPlan adds:
+ # __fragment_index, __batch_index, __last_in_fragment
+ # Presumably extraneous repeated projection of the same thing
+ # (as when we've done collapse() and not projected after) is cheap/no-op
+ projection <- c(.data$selected_columns, .data$temp_columns)
+ node <- node$Project(projection)
+
+ if (!is.null(.data$join)) {
+ node <- node$Join(
+ type = .data$join$type,
+ right_node = self$Build(.data$join$right_data),
+ by = .data$join$by,
+ left_output = names(.data),
+ right_output = setdiff(names(.data$join$right_data), .data$join$by)
+ )
+ }
+ }
+
+ # Apply sorting: this is currently not an ExecNode itself, it is a
+ # sink node option.
+ # TODO: handle some cases:
+ # (1) arrange > summarize > arrange
+ # (2) ARROW-13779: arrange then operation where order matters (e.g. cumsum)
+ if (length(.data$arrange_vars)) {
+ node$sort <- list(
+ names = names(.data$arrange_vars),
+ orders = .data$arrange_desc,
+ temp_columns = names(.data$temp_columns)
+ )
+ }
+
+ # This is only safe because we are going to evaluate queries that end
+ # with head/tail first, then evaluate any subsequent query as a new query
+ if (!is.null(.data$head)) {
+ node$head <- .data$head
+ }
+ if (!is.null(.data$tail)) {
+ node$tail <- .data$tail
+ }
+
+ node
+ },
+ Run = function(node) {
+ assert_is(node, "ExecNode")
+
+ # Sorting and head/tail (if sorted) are handled in the SinkNode,
+ # created in ExecPlan_run
+ sorting <- node$sort %||% list()
+ select_k <- node$head %||% -1L
+ has_sorting <- length(sorting) > 0
+ if (has_sorting) {
+ if (!is.null(node$tail)) {
+ # Reverse the sort order and take the top K, then after we'll reverse
+ # the resulting rows so that it is ordered as expected
+ sorting$orders <- !sorting$orders
+ select_k <- node$tail
+ }
+ sorting$orders <- as.integer(sorting$orders)
+ }
+
+ out <- ExecPlan_run(self, node, sorting, select_k)
+
+ if (!has_sorting) {
+ # Since ExecPlans don't scan in deterministic order, head/tail are both
+ # essentially taking a random slice from somewhere in the dataset.
+ # And since the head() implementation is way more efficient than tail(),
+ # just use it to take the random slice
+ slice_size <- node$head %||% node$tail
+ if (!is.null(slice_size)) {
+ # TODO (ARROW-14289): make the head methods return RBR not Table
+ out <- head(out, slice_size)
+ }
+ # Can we now tell `self$Stop()` to StopProducing? We already have
+ # everything we need for the head (but it seems to segfault: ARROW-14329)
+ } else if (!is.null(node$tail)) {
+ # Reverse the row order to get back what we expect
+ # TODO: don't return Table, return RecordBatchReader
+ out <- out$read_table()
+ out <- out[rev(seq_len(nrow(out))), , drop = FALSE]
+ }
+
+ out
+ },
+ Stop = function() ExecPlan_StopProducing(self)
+ )
+)
+ExecPlan$create <- function(use_threads = option_use_threads()) {
+ ExecPlan_create(use_threads)
+}
+
+ExecNode <- R6Class("ExecNode",
+ inherit = ArrowObject,
+ public = list(
+ # `sort` is a slight hack to be able to keep around arrange() params,
+ # which don't currently yield their own ExecNode but rather are consumed
+ # in the SinkNode (in ExecPlan$run())
+ sort = NULL,
+ # Similar hacks for head and tail
+ head = NULL,
+ tail = NULL,
+ preserve_sort = function(new_node) {
+ new_node$sort <- self$sort
+ new_node$head <- self$head
+ new_node$tail <- self$tail
+ new_node
+ },
+ Project = function(cols) {
+ if (length(cols)) {
+ assert_is_list_of(cols, "Expression")
+ self$preserve_sort(ExecNode_Project(self, cols, names(cols)))
+ } else {
+ self$preserve_sort(ExecNode_Project(self, character(0), character(0)))
+ }
+ },
+ Filter = function(expr) {
+ assert_is(expr, "Expression")
+ self$preserve_sort(ExecNode_Filter(self, expr))
+ },
+ Aggregate = function(options, target_names, out_field_names, key_names) {
+ self$preserve_sort(
+ ExecNode_Aggregate(self, options, target_names, out_field_names, key_names)
+ )
+ },
+ Join = function(type, right_node, by, left_output, right_output) {
+ self$preserve_sort(
+ ExecNode_Join(
+ self,
+ type,
+ right_node,
+ left_keys = names(by),
+ right_keys = by,
+ left_output = left_output,
+ right_output = right_output
+ )
+ )
+ }
+ ),
+ active = list(
+ schema = function() ExecNode_output_schema(self)
+ )
+)
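+
+# Illustrative chaining (a sketch): Build() composes nodes like this, starting
+# from a Scan and wrapping each step with preserve_sort():
+#   plan <- ExecPlan$create()
+#   node <- plan$Scan(ds)                         # ds: a Dataset
+#   node <- node$Filter(Expression$scalar(TRUE))  # a no-op filter
+#   out <- plan$Run(node)  # may be a Table or RecordBatchReader; see do_exec_plan()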
diff --git a/src/arrow/r/R/record-batch-reader.R b/src/arrow/r/R/record-batch-reader.R
new file mode 100644
index 000000000..1542e3649
--- /dev/null
+++ b/src/arrow/r/R/record-batch-reader.R
@@ -0,0 +1,164 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+#' @title RecordBatchReader classes
+#' @description Apache Arrow defines two formats for [serializing data for interprocess
+#' communication
+#' (IPC)](https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc):
+#' a "stream" format and a "file" format, known as Feather.
+#' `RecordBatchStreamReader` and `RecordBatchFileReader` are
+#' interfaces for accessing record batches from input sources in those formats,
+#' respectively.
+#'
+#' For guidance on how to use these classes, see the examples section.
+#'
+#' @seealso [read_ipc_stream()] and [read_feather()] provide a much simpler interface
+#' for reading data from these formats and are sufficient for many use cases.
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @section Factory:
+#'
+#' The `RecordBatchFileReader$create()` and `RecordBatchStreamReader$create()`
+#' factory methods instantiate the object and
+#' take a single argument, named according to the class:
+#'
+#' - `file` A character file name, raw vector, or Arrow file connection object
+#' (e.g. [RandomAccessFile]).
+#' - `stream` A raw vector, [Buffer], or [InputStream].
+#'
+#' @section Methods:
+#'
+#' - `$read_next_batch()`: Returns a `RecordBatch`, iterating through the
+#' Reader. If there are no further batches in the Reader, it returns `NULL`.
+#' - `$schema`: Returns a [Schema] (active binding)
+#' - `$batches()`: Returns a list of `RecordBatch`es
+#' - `$read_table()`: Collects the reader's `RecordBatch`es into a [Table]
+#' - `$get_batch(i)`: For `RecordBatchFileReader`, return a particular batch
+#' by an integer index.
+#' - `$num_record_batches`: For `RecordBatchFileReader`, an active binding for
+#'   how many batches are in the file.
+#'
+#' @rdname RecordBatchReader
+#' @name RecordBatchReader
+#' @include arrow-package.R
+#' @examplesIf arrow_available()
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#'
+#' batch <- record_batch(chickwts)
+#'
+#' # This opens a connection to the file in Arrow
+#' file_obj <- FileOutputStream$create(tf)
+#' # Pass that to a RecordBatchWriter to write data conforming to a schema
+#' writer <- RecordBatchFileWriter$create(file_obj, batch$schema)
+#' writer$write(batch)
+#' # You may write additional batches to the stream, provided that they have
+#' # the same schema.
+#' # Call "close" on the writer to indicate end-of-file/stream
+#' writer$close()
+#' # Then, close the connection--closing the IPC message does not close the file
+#' file_obj$close()
+#'
+#' # Now, we have a file we can read from. Same pattern: open file connection,
+#' # then pass it to a RecordBatchReader
+#' read_file_obj <- ReadableFile$create(tf)
+#' reader <- RecordBatchFileReader$create(read_file_obj)
+#' # RecordBatchFileReader knows how many batches it has (StreamReader does not)
+#' reader$num_record_batches
+#' # We could consume the Reader by calling $read_next_batch() until all are
+#' # consumed, or we can call $read_table() to pull them all into a Table
+#' tab <- reader$read_table()
+#' # Call as.data.frame to turn that Table into an R data.frame
+#' df <- as.data.frame(tab)
+#' # This should be the same data we sent
+#' all.equal(df, chickwts, check.attributes = FALSE)
+#' # Unlike the Writers, we don't have to close RecordBatchReaders,
+#' # but we do still need to close the file connection
+#' read_file_obj$close()
+RecordBatchReader <- R6Class("RecordBatchReader",
+ inherit = ArrowObject,
+ public = list(
+ read_next_batch = function() RecordBatchReader__ReadNext(self),
+ batches = function() RecordBatchReader__batches(self),
+ read_table = function() Table__from_RecordBatchReader(self),
+ export_to_c = function(stream_ptr) ExportRecordBatchReader(self, stream_ptr)
+ ),
+ active = list(
+ schema = function() RecordBatchReader__schema(self)
+ )
+)
+
+#' @export
+head.RecordBatchReader <- function(x, n = 6L, ...) {
+ head(Scanner$create(x), n)
+}
+
+#' @export
+tail.RecordBatchReader <- function(x, n = 6L, ...) {
+ tail(Scanner$create(x), n)
+}
+
+#' @rdname RecordBatchReader
+#' @usage NULL
+#' @format NULL
+#' @export
+RecordBatchStreamReader <- R6Class("RecordBatchStreamReader", inherit = RecordBatchReader)
+RecordBatchStreamReader$create <- function(stream) {
+ if (inherits(stream, c("raw", "Buffer"))) {
+ # TODO: deprecate this because it doesn't close the connection to the Buffer
+ # (that's a problem, right?)
+ stream <- BufferReader$create(stream)
+ }
+ assert_is(stream, "InputStream")
+ ipc___RecordBatchStreamReader__Open(stream)
+}
+#' @include arrowExports.R
+RecordBatchReader$import_from_c <- RecordBatchStreamReader$import_from_c <- ImportRecordBatchReader
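+
+# Illustrative usage (a sketch): read back a stream-format buffer produced by
+# write_to_raw(), which writes the IPC stream format by default:
+#   buf <- write_to_raw(data.frame(x = 1:3))
+#   reader <- RecordBatchStreamReader$create(buf)
+#   batch <- reader$read_next_batch()  # returns NULL once the stream is exhausted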
+
+#' @rdname RecordBatchReader
+#' @usage NULL
+#' @format NULL
+#' @export
+RecordBatchFileReader <- R6Class("RecordBatchFileReader",
+ inherit = ArrowObject,
+ # Why doesn't this inherit from RecordBatchReader in C++?
+ # Origin: https://github.com/apache/arrow/pull/679
+ public = list(
+ get_batch = function(i) {
+ ipc___RecordBatchFileReader__ReadRecordBatch(self, i)
+ },
+ batches = function() {
+ ipc___RecordBatchFileReader__batches(self)
+ },
+ read_table = function() Table__from_RecordBatchFileReader(self)
+ ),
+ active = list(
+ num_record_batches = function() ipc___RecordBatchFileReader__num_record_batches(self),
+ schema = function() ipc___RecordBatchFileReader__schema(self)
+ )
+)
+RecordBatchFileReader$create <- function(file) {
+ if (inherits(file, c("raw", "Buffer"))) {
+ # TODO: deprecate this because it doesn't close the connection to the Buffer
+ # (that's a problem, right?)
+ file <- BufferReader$create(file)
+ }
+ assert_is(file, "InputStream")
+ ipc___RecordBatchFileReader__Open(file)
+}
diff --git a/src/arrow/r/R/record-batch-writer.R b/src/arrow/r/R/record-batch-writer.R
new file mode 100644
index 000000000..8675e785a
--- /dev/null
+++ b/src/arrow/r/R/record-batch-writer.R
@@ -0,0 +1,194 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+#' @title RecordBatchWriter classes
+#' @description Apache Arrow defines two formats for [serializing data for interprocess
+#' communication
+#' (IPC)](https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc):
+#' a "stream" format and a "file" format, known as Feather.
+#' `RecordBatchStreamWriter` and `RecordBatchFileWriter` are
+#' interfaces for writing record batches to those formats, respectively.
+#'
+#' For guidance on how to use these classes, see the examples section.
+#'
+#' @seealso [write_ipc_stream()] and [write_feather()] provide a much simpler
+#' interface for writing data to these formats and are sufficient for many use
+#' cases. [write_to_raw()] is a version that serializes data to a buffer.
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @section Factory:
+#'
+#' The `RecordBatchFileWriter$create()` and `RecordBatchStreamWriter$create()`
+#' factory methods instantiate the object and take the following arguments:
+#'
+#' - `sink` An `OutputStream`
+#' - `schema` A [Schema] for the data to be written
+#' - `use_legacy_format` logical: write data formatted so that Arrow libraries
+#' versions 0.14 and lower can read it. Default is `FALSE`. You can also
+#' enable this by setting the environment variable `ARROW_PRE_0_15_IPC_FORMAT=1`.
+#' - `metadata_version`: A string like "V5" or the equivalent integer indicating
+#' the Arrow IPC MetadataVersion. Default (NULL) will use the latest version,
+#'   unless the environment variable `ARROW_PRE_1_0_METADATA_VERSION` is set
+#'   to `1`, in which case it will be V4.
+#'
+#' @section Methods:
+#'
+#' - `$write(x)`: Write a [RecordBatch], [Table], or `data.frame`, dispatching
+#' to the methods below appropriately
+#' - `$write_batch(batch)`: Write a `RecordBatch` to stream
+#' - `$write_table(table)`: Write a `Table` to stream
+#' - `$close()`: close stream. Note that this indicates end-of-file or
+#' end-of-stream--it does not close the connection to the `sink`. That needs
+#' to be closed separately.
+#'
+#' @rdname RecordBatchWriter
+#' @name RecordBatchWriter
+#' @include arrow-package.R
+#' @examplesIf arrow_available()
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#'
+#' batch <- record_batch(chickwts)
+#'
+#' # This opens a connection to the file in Arrow
+#' file_obj <- FileOutputStream$create(tf)
+#' # Pass that to a RecordBatchWriter to write data conforming to a schema
+#' writer <- RecordBatchFileWriter$create(file_obj, batch$schema)
+#' writer$write(batch)
+#' # You may write additional batches to the stream, provided that they have
+#' # the same schema.
+#' # Call "close" on the writer to indicate end-of-file/stream
+#' writer$close()
+#' # Then, close the connection--closing the IPC message does not close the file
+#' file_obj$close()
+#'
+#' # Now, we have a file we can read from. Same pattern: open file connection,
+#' # then pass it to a RecordBatchReader
+#' read_file_obj <- ReadableFile$create(tf)
+#' reader <- RecordBatchFileReader$create(read_file_obj)
+#' # RecordBatchFileReader knows how many batches it has (StreamReader does not)
+#' reader$num_record_batches
+#' # We could consume the Reader by calling $read_next_batch() until all are
+#' # consumed, or we can call $read_table() to pull them all into a Table
+#' tab <- reader$read_table()
+#' # Call as.data.frame to turn that Table into an R data.frame
+#' df <- as.data.frame(tab)
+#' # This should be the same data we sent
+#' all.equal(df, chickwts, check.attributes = FALSE)
+#' # Unlike the Writers, we don't have to close RecordBatchReaders,
+#' # but we do still need to close the file connection
+#' read_file_obj$close()
+RecordBatchWriter <- R6Class("RecordBatchWriter",
+ inherit = ArrowObject,
+ public = list(
+ write_batch = function(batch) ipc___RecordBatchWriter__WriteRecordBatch(self, batch),
+ write_table = function(table) ipc___RecordBatchWriter__WriteTable(self, table),
+ write = function(x) {
+ if (inherits(x, "RecordBatch")) {
+ self$write_batch(x)
+ } else if (inherits(x, "Table")) {
+ self$write_table(x)
+ } else {
+ self$write_table(Table$create(x))
+ }
+ },
+ close = function() ipc___RecordBatchWriter__Close(self)
+ )
+)
+
+#' @usage NULL
+#' @format NULL
+#' @rdname RecordBatchWriter
+#' @export
+RecordBatchStreamWriter <- R6Class("RecordBatchStreamWriter", inherit = RecordBatchWriter)
+RecordBatchStreamWriter$create <- function(sink,
+ schema,
+ use_legacy_format = NULL,
+ metadata_version = NULL) {
+ if (is.string(sink)) {
+ stop(
+ "RecordBatchStreamWriter$create() requires an Arrow InputStream. ",
+ "Try providing FileOutputStream$create(", substitute(sink), ")",
+ call. = FALSE
+ )
+ }
+ assert_is(sink, "OutputStream")
+ assert_is(schema, "Schema")
+
+ ipc___RecordBatchStreamWriter__Open(
+ sink,
+ schema,
+ get_ipc_use_legacy_format(use_legacy_format),
+ get_ipc_metadata_version(metadata_version)
+ )
+}
+
+#' @usage NULL
+#' @format NULL
+#' @rdname RecordBatchWriter
+#' @export
+RecordBatchFileWriter <- R6Class("RecordBatchFileWriter", inherit = RecordBatchStreamWriter)
+RecordBatchFileWriter$create <- function(sink,
+ schema,
+ use_legacy_format = NULL,
+ metadata_version = NULL) {
+ if (is.string(sink)) {
+ stop(
+ "RecordBatchFileWriter$create() requires an Arrow InputStream. ",
+ "Try providing FileOutputStream$create(", substitute(sink), ")",
+ call. = FALSE
+ )
+ }
+ assert_is(sink, "OutputStream")
+ assert_is(schema, "Schema")
+
+ ipc___RecordBatchFileWriter__Open(
+ sink,
+ schema,
+ get_ipc_use_legacy_format(use_legacy_format),
+ get_ipc_metadata_version(metadata_version)
+ )
+}
+
+get_ipc_metadata_version <- function(x) {
+ input <- x
+ if (is_integerish(x)) {
+    # e.g. 4 means "V4" (whose underlying enum value is actually 3L)
+ x <- paste0("V", x)
+ } else if (is.null(x)) {
+ if (identical(Sys.getenv("ARROW_PRE_1_0_METADATA_VERSION"), "1") ||
+ identical(Sys.getenv("ARROW_PRE_0_15_IPC_FORMAT"), "1")) {
+      # ARROW_PRE_1_0_METADATA_VERSION targets this setting specifically;
+      # setting ARROW_PRE_0_15_IPC_FORMAT implies it as well
+ x <- "V4"
+ } else {
+ # Take the latest
+ x <- length(MetadataVersion)
+ }
+ }
+ out <- MetadataVersion[[x]]
+ if (is.null(out)) {
+ stop(deparse(input), " is not a valid IPC MetadataVersion", call. = FALSE)
+ }
+ out
+}
+
+get_ipc_use_legacy_format <- function(x) {
+ isTRUE(x %||% identical(Sys.getenv("ARROW_PRE_0_15_IPC_FORMAT"), "1"))
+}
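+
+# Illustrative behavior (a sketch, given the MetadataVersion enum):
+#   get_ipc_metadata_version("V4")   # 3L
+#   get_ipc_metadata_version(4)      # coerced to "V4", also 3L
+#   get_ipc_metadata_version(NULL)   # latest version, unless the environment
+#                                    # variables above request V4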
diff --git a/src/arrow/r/R/record-batch.R b/src/arrow/r/R/record-batch.R
new file mode 100644
index 000000000..c66ff7fb0
--- /dev/null
+++ b/src/arrow/r/R/record-batch.R
@@ -0,0 +1,193 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-package.R
+#' @include array.R
+#' @title RecordBatch class
+#' @description A record batch is a collection of equal-length arrays matching
+#' a particular [Schema]. It is a table-like data structure that is semantically
+#' a sequence of [fields][Field], each a contiguous Arrow [Array].
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section S3 Methods and Usage:
+#' Record batches are data-frame-like, and many methods you expect to work on
+#' a `data.frame` are implemented for `RecordBatch`. This includes `[`, `[[`,
+#' `$`, `names`, `dim`, `nrow`, `ncol`, `head`, and `tail`. You can also pull
+#' the data from an Arrow record batch into R with `as.data.frame()`. See the
+#' examples.
+#'
+#' A caveat about the `$` method: because `RecordBatch` is an `R6` object,
+#' `$` is also used to access the object's methods (see below). Methods take
+#' precedence over the table's columns. So, `batch$Slice` would return the
+#' "Slice" method function even if there were a column in the table called
+#' "Slice".
+#'
+#' @section R6 Methods:
+#' In addition to the more R-friendly S3 methods, a `RecordBatch` object has
+#' the following R6 methods that map onto the underlying C++ methods:
+#'
+#' - `$Equals(other)`: Returns `TRUE` if the `other` record batch is equal
+#' - `$column(i)`: Extract an `Array` by integer position from the batch
+#' - `$column_name(i)`: Get a column's name by integer position
+#' - `$names()`: Get all column names (called by `names(batch)`)
+#' - `$RenameColumns(value)`: Set all column names (called by `names(batch) <- value`)
+#' - `$GetColumnByName(name)`: Extract an `Array` by string name
+#' - `$RemoveColumn(i)`: Drops a column from the batch by integer position
+#' - `$SelectColumns(indices)`: Return a new record batch with a selection of columns, expressed as 0-based integers.
+#' - `$Slice(offset, length = NULL)`: Create a zero-copy view starting at the
+#' indicated integer offset and going for the given length, or to the end
+#' of the table if `NULL`, the default.
+#' - `$Take(i)`: return a `RecordBatch` with rows at positions given by
+#'    integers (an R vector or Arrow `Array`) `i`.
+#' - `$Filter(i, keep_na = TRUE)`: return a `RecordBatch` with rows at positions where the logical
+#'    vector (or Arrow boolean `Array`) `i` is `TRUE`.
+#' - `$SortIndices(names, descending = FALSE)`: return an `Array` of integer row
+#' positions that can be used to rearrange the `RecordBatch` in ascending or
+#' descending order by the first named column, breaking ties with further named
+#' columns. `descending` can be a logical vector of length one or of the same
+#' length as `names`.
+#' - `$serialize()`: Returns a raw vector suitable for interprocess communication
+#' - `$cast(target_schema, safe = TRUE, options = cast_options(safe))`: Alter
+#' the schema of the record batch.
+#'
+#' There are also some active bindings
+#' - `$num_columns`
+#' - `$num_rows`
+#' - `$schema`
+#' - `$metadata`: Returns the key-value metadata of the `Schema` as a named list.
+#' Modify or replace by assigning in (`batch$metadata <- new_metadata`).
+#' All list elements are coerced to string. See `schema()` for more information.
+#' - `$columns`: Returns a list of `Array`s
+#' @rdname RecordBatch
+#' @name RecordBatch
+#' @export
+RecordBatch <- R6Class("RecordBatch",
+ inherit = ArrowTabular,
+ public = list(
+ column = function(i) RecordBatch__column(self, i),
+ column_name = function(i) RecordBatch__column_name(self, i),
+ names = function() RecordBatch__names(self),
+ RenameColumns = function(value) RecordBatch__RenameColumns(self, value),
+ Equals = function(other, check_metadata = FALSE, ...) {
+ inherits(other, "RecordBatch") && RecordBatch__Equals(self, other, isTRUE(check_metadata))
+ },
+ GetColumnByName = function(name) {
+ assert_that(is.string(name))
+ RecordBatch__GetColumnByName(self, name)
+ },
+ SelectColumns = function(indices) RecordBatch__SelectColumns(self, indices),
+ AddColumn = function(i, new_field, value) {
+ RecordBatch__AddColumn(self, i, new_field, value)
+ },
+ SetColumn = function(i, new_field, value) {
+ RecordBatch__SetColumn(self, i, new_field, value)
+ },
+ RemoveColumn = function(i) RecordBatch__RemoveColumn(self, i),
+ ReplaceSchemaMetadata = function(new) {
+ RecordBatch__ReplaceSchemaMetadata(self, new)
+ },
+ Slice = function(offset, length = NULL) {
+ if (is.null(length)) {
+ RecordBatch__Slice1(self, offset)
+ } else {
+ RecordBatch__Slice2(self, offset, length)
+ }
+ },
+ # Take, Filter, and SortIndices are methods on ArrowTabular
+ serialize = function() ipc___SerializeRecordBatch__Raw(self),
+ to_data_frame = function() {
+ RecordBatch__to_dataframe(self, use_threads = option_use_threads())
+ },
+ cast = function(target_schema, safe = TRUE, ..., options = cast_options(safe, ...)) {
+ assert_is(target_schema, "Schema")
+ assert_that(identical(self$schema$names, target_schema$names), msg = "incompatible schemas")
+ RecordBatch__cast(self, target_schema, options)
+ },
+ invalidate = function() {
+ .Call(`_arrow_RecordBatch__Reset`, self)
+ super$invalidate()
+ },
+ export_to_c = function(array_ptr, schema_ptr) {
+ ExportRecordBatch(self, array_ptr, schema_ptr)
+ }
+ ),
+ active = list(
+ num_columns = function() RecordBatch__num_columns(self),
+ num_rows = function() RecordBatch__num_rows(self),
+ schema = function() RecordBatch__schema(self),
+ columns = function() RecordBatch__columns(self)
+ )
+)
+
+RecordBatch$create <- function(..., schema = NULL) {
+ arrays <- list2(...)
+ if (length(arrays) == 1 && inherits(arrays[[1]], c("raw", "Buffer", "InputStream", "Message"))) {
+ return(RecordBatch$from_message(arrays[[1]], schema))
+ }
+
+ # Else, a list of arrays or data.frames
+ # making sure there are always names
+ if (is.null(names(arrays))) {
+ names(arrays) <- rep_len("", length(arrays))
+ }
+ stopifnot(length(arrays) > 0)
+
+ # If any arrays are length 1, recycle them
+ arrays <- recycle_scalars(arrays)
+
+ # TODO: should this also assert that they're all Arrays?
+ RecordBatch__from_arrays(schema, arrays)
+}
+
+RecordBatch$from_message <- function(obj, schema) {
+ # Message/Buffer readers, previously in read_record_batch()
+ assert_is(schema, "Schema")
+ if (inherits(obj, c("raw", "Buffer"))) {
+ obj <- BufferReader$create(obj)
+ on.exit(obj$close())
+ }
+ if (inherits(obj, "InputStream")) {
+ ipc___ReadRecordBatch__InputStream__Schema(obj, schema)
+ } else {
+ ipc___ReadRecordBatch__Message__Schema(obj, schema)
+ }
+}
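+
+# Illustrative round trip (a sketch): serialize a batch to a raw vector and
+# read it back, supplying the schema explicitly:
+#   b <- record_batch(x = 1:3)
+#   bytes <- b$serialize()
+#   b2 <- RecordBatch$from_message(bytes, b$schema)
+#   b$Equals(b2)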
+#' @include arrowExports.R
+RecordBatch$import_from_c <- ImportRecordBatch
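+
+# Illustrative C-interface round trip (a sketch), mirroring the pattern used in
+# python.R: export to C struct pointers, then import (which moves the data):
+#   array_ptr <- allocate_arrow_array()
+#   schema_ptr <- allocate_arrow_schema()
+#   batch$export_to_c(array_ptr, schema_ptr)
+#   batch2 <- RecordBatch$import_from_c(array_ptr, schema_ptr)
+#   delete_arrow_array(array_ptr); delete_arrow_schema(schema_ptr)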
+
+#' @param ... A `data.frame` or a named set of Arrays or vectors. If given a
+#' mixture of data.frames and vectors, the inputs will be autospliced together
+#' (see examples). Alternatively, you can provide a single Arrow IPC
+#' `InputStream`, `Message`, `Buffer`, or R `raw` object containing a `Buffer`.
+#' @param schema a [Schema], or `NULL` (the default) to infer the schema from
+#' the data in `...`. When providing an Arrow IPC buffer, `schema` is required.
+#' @rdname RecordBatch
+#' @examplesIf arrow_available()
+#' batch <- record_batch(name = rownames(mtcars), mtcars)
+#' dim(batch)
+#' dim(head(batch))
+#' names(batch)
+#' batch$mpg
+#' batch[["cyl"]]
+#' as.data.frame(batch[4:8, c("gear", "hp", "wt")])
+#' @export
+record_batch <- RecordBatch$create
+
+#' @export
+names.RecordBatch <- function(x) x$names()
diff --git a/src/arrow/r/R/reexports-bit64.R b/src/arrow/r/R/reexports-bit64.R
new file mode 100644
index 000000000..c89d2b150
--- /dev/null
+++ b/src/arrow/r/R/reexports-bit64.R
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @importFrom bit64 print.integer64
+bit64::print.integer64
+
+#' @importFrom bit64 str.integer64
+bit64::str.integer64
diff --git a/src/arrow/r/R/reexports-tidyselect.R b/src/arrow/r/R/reexports-tidyselect.R
new file mode 100644
index 000000000..cd0de2849
--- /dev/null
+++ b/src/arrow/r/R/reexports-tidyselect.R
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Alias required for help links in downstream packages
+#' @aliases select_helpers
+#' @importFrom tidyselect contains
+#' @export
+tidyselect::contains
+#' @importFrom tidyselect ends_with
+#' @export
+tidyselect::ends_with
+#' @importFrom tidyselect everything
+#' @export
+tidyselect::everything
+#' @importFrom tidyselect matches
+#' @export
+tidyselect::matches
+#' @importFrom tidyselect num_range
+#' @export
+tidyselect::num_range
+#' @importFrom tidyselect one_of
+#' @export
+tidyselect::one_of
+#' @importFrom tidyselect starts_with
+#' @export
+tidyselect::starts_with
+#' @importFrom tidyselect last_col
+#' @export
+tidyselect::last_col
+#' @importFrom tidyselect all_of
+#' @export
+tidyselect::all_of
diff --git a/src/arrow/r/R/scalar.R b/src/arrow/r/R/scalar.R
new file mode 100644
index 000000000..4dedc6c12
--- /dev/null
+++ b/src/arrow/r/R/scalar.R
@@ -0,0 +1,101 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-datum.R
+
+#' @title Arrow scalars
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @description A `Scalar` holds a single value of an Arrow type.
+#'
+#' @section Methods:
+#' - `$ToString()`: convert to a string
+#' - `$as_vector()`: convert to an R vector
+#' - `$as_array()`: convert to an Arrow `Array`
+#' - `$Equals(other)`: is this Scalar equal to `other`?
+#' - `$ApproxEquals(other)`: is this Scalar approximately equal to `other`?
+#' - `$is_valid`: is this Scalar valid?
+#' - `$null_count`: number of invalid values (1 or 0)
+#' - `$type`: Scalar type
+#'
+#' @name Scalar
+#' @rdname Scalar
+#' @examplesIf arrow_available()
+#' Scalar$create(pi)
+#' Scalar$create(404)
+#' # If you pass a vector into Scalar$create, you get a list containing your items
+#' Scalar$create(c(1, 2, 3))
+#'
+#' # Comparisons
+#' my_scalar <- Scalar$create(99)
+#' my_scalar$ApproxEquals(Scalar$create(99.00001)) # FALSE
+#' my_scalar$ApproxEquals(Scalar$create(99.000009)) # TRUE
+#' my_scalar$Equals(Scalar$create(99.000009)) # FALSE
+#' my_scalar$Equals(Scalar$create(99L)) # FALSE (types don't match)
+#'
+#' my_scalar$ToString()
+#' @export
+Scalar <- R6Class("Scalar",
+ inherit = ArrowDatum,
+ # TODO: document the methods
+ public = list(
+ ToString = function() Scalar__ToString(self),
+ type_id = function() Scalar__type(self)$id,
+ as_vector = function() Scalar__as_vector(self),
+ as_array = function(length = 1L) MakeArrayFromScalar(self, as.integer(length)),
+ Equals = function(other, ...) {
+ inherits(other, "Scalar") && Scalar__Equals(self, other)
+ },
+ ApproxEquals = function(other, ...) {
+ inherits(other, "Scalar") && Scalar__ApproxEquals(self, other)
+ }
+ ),
+ active = list(
+ is_valid = function() Scalar__is_valid(self),
+ null_count = function() sum(!self$is_valid),
+ type = function() Scalar__type(self)
+ )
+)
+Scalar$create <- function(x, type = NULL) {
+ if (is.null(x)) {
+ x <- vctrs::unspecified(1)
+ } else if (length(x) != 1 && !is.data.frame(x)) {
+ # Wrap in a list type
+ x <- list(x)
+ }
+ Array__GetScalar(Array$create(x, type = type), 0)
+}
+
+#' @rdname array
+#' @usage NULL
+#' @format NULL
+#' @export
+StructScalar <- R6Class("StructScalar",
+ inherit = Scalar,
+ public = list(
+ field = function(i) StructScalar__field(self, i),
+ GetFieldByName = function(name) StructScalar__GetFieldByName(self, name)
+ )
+)
+
+#' @export
+length.Scalar <- function(x) 1L
+
+#' @export
+sort.Scalar <- function(x, decreasing = FALSE, ...) x
diff --git a/src/arrow/r/R/schema.R b/src/arrow/r/R/schema.R
new file mode 100644
index 000000000..c3dfee5f9
--- /dev/null
+++ b/src/arrow/r/R/schema.R
@@ -0,0 +1,330 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-package.R
+#' @title Schema class
+#'
+#' @description A `Schema` is a list of [Field]s, which map names to
+#' Arrow [data types][data-type]. Create a `Schema` when you
+#' want to convert an R `data.frame` to Arrow but don't want to rely on the
+#' default mapping of R types to Arrow types, such as when you want to choose a
+#' specific numeric precision, or when creating a [Dataset] and you want to
+#' ensure a specific schema rather than inferring it from the various files.
+#'
+#' Many Arrow objects, including [Table] and [Dataset], have a `$schema` method
+#' (active binding) that lets you access their schema.
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @section Methods:
+#'
+#' - `$ToString()`: convert to a string
+#' - `$field(i)`: returns the field at index `i` (0-based)
+#' - `$GetFieldByName(x)`: returns the field with name `x`
+#' - `$WithMetadata(metadata)`: returns a new `Schema` with the key-value
+#' `metadata` set. Note that all list elements in `metadata` will be coerced
+#' to `character`.
+#'
+#' @section Active bindings:
+#'
+#' - `$names`: returns the field names (called in `names(Schema)`)
+#' - `$num_fields`: returns the number of fields (called in `length(Schema)`)
+#' - `$fields`: returns the list of `Field`s in the `Schema`, suitable for
+#' iterating over
+#' - `$HasMetadata`: logical: does this `Schema` have extra metadata?
+#' - `$metadata`: returns the key-value metadata as a named list.
+#' Modify or replace by assigning in (`sch$metadata <- new_metadata`).
+#' All list elements are coerced to string.
+#'
+#' @section R Metadata:
+#'
+#' When converting a data.frame to an Arrow Table or RecordBatch, attributes
+#' from the `data.frame` are saved alongside tables so that the object can be
+#' reconstructed faithfully in R (e.g. with `as.data.frame()`). This metadata
+#' can exist at the top level of the `data.frame` (e.g. `attributes(df)`), at
+#' the column level (e.g. `attributes(df$col_a)`), or, for list columns only,
+#' at the element level (e.g. `attributes(df[1, "col_a"])`). For example, this allows
+#' for storing `haven` columns in a table and being able to faithfully
+#' re-create them when pulled back into R. This metadata is separate from the
+#' schema (column names and types) which is compatible with other Arrow
+#' clients. The R metadata is only read by R and is ignored by other clients
+#' (e.g. Pandas has its own custom metadata). This metadata is stored in
+#' `$metadata$r`.
+#'
+#' Since Schema metadata keys and values must be strings, this metadata is
+#' saved by serializing R's attribute list structure to a string. If the
+#' serialized metadata exceeds 100Kb in size, it is compressed by default
+#' starting in version 3.0.0. To disable this compression (e.g. so that the
+#' metadata remains readable by Arrow versions before 3.0.0), set the option
+#' `arrow.compress_metadata` to `FALSE`.
+#' Files with compressed metadata are readable by older versions of arrow, but
+#' the metadata is dropped.
+#'
+#' @rdname Schema
+#' @name Schema
+#' @examplesIf arrow_available()
+#' df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5))
+#' tab1 <- arrow_table(df)
+#' tab1$schema
+#' tab2 <- arrow_table(df, schema = schema(col1 = int8(), col2 = float32()))
+#' tab2$schema
+#' @export
+Schema <- R6Class("Schema",
+ inherit = ArrowObject,
+ public = list(
+ ToString = function() {
+ fields <- print_schema_fields(self)
+ if (self$HasMetadata) {
+ fields <- paste0(fields, "\n\nSee $metadata for additional Schema metadata")
+ }
+ fields
+ },
+ field = function(i) Schema__field(self, i),
+ GetFieldByName = function(x) Schema__GetFieldByName(self, x),
+ AddField = function(i, field) {
+ assert_is(field, "Field")
+ Schema__AddField(self, i, field)
+ },
+ SetField = function(i, field) {
+ assert_is(field, "Field")
+ Schema__SetField(self, i, field)
+ },
+ RemoveField = function(i) Schema__RemoveField(self, i),
+ serialize = function() Schema__serialize(self),
+ WithMetadata = function(metadata = NULL) {
+ metadata <- prepare_key_value_metadata(metadata)
+ Schema__WithMetadata(self, metadata)
+ },
+ Equals = function(other, check_metadata = FALSE, ...) {
+ inherits(other, "Schema") && Schema__Equals(self, other, isTRUE(check_metadata))
+ },
+ export_to_c = function(ptr) ExportSchema(self, ptr)
+ ),
+ active = list(
+ names = function() {
+ Schema__field_names(self)
+ },
+ num_fields = function() Schema__num_fields(self),
+ fields = function() Schema__fields(self),
+ HasMetadata = function() Schema__HasMetadata(self),
+ metadata = function(new_metadata) {
+ if (missing(new_metadata)) {
+ Schema__metadata(self)
+ } else {
+ # Set the metadata
+ out <- self$WithMetadata(new_metadata)
+ # $WithMetadata returns a new object but we're modifying in place,
+ # so swap in that new C++ object pointer into our R6 object
+ self$set_pointer(out$pointer())
+ self
+ }
+ },
+ r_metadata = function(new) {
+ # Helper for the R metadata that handles the serialization
+ # See also method on ArrowTabular
+ if (missing(new)) {
+ out <- self$metadata$r
+ if (!is.null(out)) {
+ # Can't unserialize NULL
+ out <- .unserialize_arrow_r_metadata(out)
+ }
+ # Returns either NULL or a named list
+ out
+ } else {
+ # Set the R metadata
+ self$metadata$r <- .serialize_arrow_r_metadata(new)
+ self
+ }
+ }
+ )
+)
+Schema$create <- function(...) {
+ .list <- list2(...)
+ if (all(map_lgl(.list, ~ inherits(., "Field")))) {
+ schema_(.list)
+ } else {
+ schema_(.fields(.list))
+ }
+}
+#' @include arrowExports.R
+Schema$import_from_c <- ImportSchema
+
+prepare_key_value_metadata <- function(metadata) {
+ # key-value-metadata must be a named character vector;
+ # this function validates and coerces
+ if (is.null(metadata)) {
+ # NULL to remove metadata, so equivalent to setting an empty list
+ metadata <- empty_named_list()
+ }
+ if (is.null(names(metadata))) {
+ stop(
+ "Key-value metadata must be a named list or character vector",
+ call. = FALSE
+ )
+ }
+ map_chr(metadata, as.character)
+}
+
+print_schema_fields <- function(s) {
+ # Alternative to Schema__ToString that doesn't print metadata
+ paste(map_chr(s$fields, ~ .$ToString()), collapse = "\n")
+}
+
+#' @param ... named list containing [data types][data-type] or
+#' a list of [fields][field] containing the fields for the schema
+#' @export
+#' @rdname Schema
+schema <- Schema$create
+
+#' @export
+names.Schema <- function(x) x$names
+
+#' @export
+length.Schema <- function(x) x$num_fields
+
+#' @export
+`[[.Schema` <- function(x, i, ...) {
+ if (is.character(i)) {
+ x$GetFieldByName(i)
+ } else if (is.numeric(i)) {
+ x$field(i - 1)
+ } else {
+ stop("'i' must be character or numeric, not ", class(i), call. = FALSE)
+ }
+}
+
+#' @export
+`[[<-.Schema` <- function(x, i, value) {
+ assert_that(length(i) == 1)
+ if (is.character(i)) {
+ field_names <- names(x)
+ if (anyDuplicated(field_names)) {
+ stop("Cannot update field by name with duplicates", call. = FALSE)
+ }
+
+ # If i is character, it's the field name
+ if (!is.null(value) && !inherits(value, "Field")) {
+ value <- field(i, as_type(value, "value"))
+ }
+
+ # No match means we're adding to the end
+ i <- match(i, field_names, nomatch = length(field_names) + 1L)
+ } else {
+ assert_that(is.numeric(i), !is.na(i), i > 0)
+ # If i is numeric and we have a type,
+ # we need to grab the existing field name for the new one
+ if (!is.null(value) && !inherits(value, "Field")) {
+ value <- field(names(x)[i], as_type(value, "value"))
+ }
+ }
+
+ i <- as.integer(i - 1L)
+ if (i >= length(x)) {
+ if (!is.null(value)) {
+ x <- x$AddField(i, value)
+ }
+ } else if (is.null(value)) {
+ x <- x$RemoveField(i)
+ } else {
+ x <- x$SetField(i, value)
+ }
+ x
+}
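+
+# Illustrative usage (a sketch):
+#   s <- schema(a = int32(), b = utf8())
+#   s[["a"]] <- float64()  # replace a field's type by name
+#   s[["c"]] <- bool()     # no name match: appended as a new field
+#   s[["b"]] <- NULL       # remove a field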
+
+#' @export
+`$<-.Schema` <- `$<-.ArrowTabular`
+
+#' @export
+`[.Schema` <- function(x, i, ...) {
+ if (is.logical(i)) {
+ i <- rep_len(i, length(x)) # For R recycling behavior
+ i <- which(i)
+ }
+ if (is.numeric(i)) {
+ if (all(i < 0)) {
+ # in R, negative i means "everything but i"
+ i <- setdiff(seq_len(length(x)), -1 * i)
+ }
+ }
+ fields <- map(i, ~ x[[.]])
+ invalid <- map_lgl(fields, is.null)
+ if (any(invalid)) {
+ stop(
+ "Invalid field name", ifelse(sum(invalid) > 1, "s: ", ": "),
+ oxford_paste(i[invalid]),
+ call. = FALSE
+ )
+ }
+ schema_(fields)
+}
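+
+# Illustrative usage (a sketch):
+#   s <- schema(a = int32(), b = utf8(), c = bool())
+#   s[c("a", "c")]           # select fields by name
+#   s[-2]                    # drop the second field
+#   s[c(TRUE, FALSE, TRUE)]  # logical selection, recycled like base R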
+
+#' @export
+`$.Schema` <- function(x, name, ...) {
+ assert_that(is.string(name))
+ if (name %in% ls(x)) {
+ get(name, x)
+ } else {
+ x$GetFieldByName(name)
+ }
+}
+
+#' @export
+as.list.Schema <- function(x, ...) x$fields
+
+#' read a Schema from a stream
+#'
+#' @param stream a `Message`, `InputStream`, or `Buffer`
+#' @param ... currently ignored
+#' @return A [Schema]
+#' @export
+read_schema <- function(stream, ...) {
+ if (inherits(stream, "Message")) {
+ return(ipc___ReadSchema_Message(stream))
+ } else {
+ if (!inherits(stream, "InputStream")) {
+ stream <- BufferReader$create(stream)
+ on.exit(stream$close())
+ }
+ return(ipc___ReadSchema_InputStream(stream))
+ }
+}
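+
+# Illustrative usage (a sketch): round-trip a schema through its serialized
+# IPC buffer:
+#   s <- schema(x = int32(), y = utf8())
+#   buf <- s$serialize()
+#   read_schema(buf)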
+
+#' Combine and harmonize schemas
+#'
+#' @param ... [Schema]s to unify
+#' @param schemas Alternatively, a list of schemas
+#' @return A `Schema` with the union of fields contained in the inputs, or
+#' `NULL` if any of `schemas` is `NULL`
+#' @export
+#' @examplesIf arrow_available()
+#' a <- schema(b = double(), c = bool())
+#' z <- schema(b = double(), k = utf8())
+#' unify_schemas(a, z)
+unify_schemas <- function(..., schemas = list(...)) {
+ if (any(vapply(schemas, is.null, TRUE))) {
+ return(NULL)
+ }
+ arrow__UnifySchemas(schemas)
+}
+
+#' @export
+print.arrow_r_metadata <- function(x, ...) {
+ utils::str(x)
+ utils::str(.unserialize_arrow_r_metadata(x))
+ invisible(x)
+}
diff --git a/src/arrow/r/R/table.R b/src/arrow/r/R/table.R
new file mode 100644
index 000000000..5ae87f7e3
--- /dev/null
+++ b/src/arrow/r/R/table.R
@@ -0,0 +1,170 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include record-batch.R
+#' @title Table class
+#' @description A Table is a sequence of [chunked arrays][ChunkedArray]. They
+#' have a similar interface to [record batches][RecordBatch], but they can be
+#' composed from multiple record batches or chunked arrays.
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section S3 Methods and Usage:
+#' Tables are data-frame-like, and many methods you expect to work on
+#' a `data.frame` are implemented for `Table`. This includes `[`, `[[`,
+#' `$`, `names`, `dim`, `nrow`, `ncol`, `head`, and `tail`. You can also pull
+#' the data from an Arrow table into R with `as.data.frame()`. See the
+#' examples.
+#'
+#' A caveat about the `$` method: because `Table` is an `R6` object,
+#' `$` is also used to access the object's methods (see below). Methods take
+#' precedence over the table's columns. So, `tab$Slice` would return the
+#' "Slice" method function even if there were a column in the table called
+#' "Slice".
+#'
+#' @section R6 Methods:
+#' In addition to the more R-friendly S3 methods, a `Table` object has
+#' the following R6 methods that map onto the underlying C++ methods:
+#'
+#' - `$column(i)`: Extract a `ChunkedArray` by integer position from the table
+#' - `$ColumnNames()`: Get all column names (called by `names(tab)`)
+#' - `$RenameColumns(value)`: Set all column names (called by `names(tab) <- value`)
+#' - `$GetColumnByName(name)`: Extract a `ChunkedArray` by string name
+#' - `$field(i)`: Extract a `Field` from the table schema by integer position
+#' - `$SelectColumns(indices)`: Return new `Table` with specified columns, expressed as 0-based integers.
+#' - `$Slice(offset, length = NULL)`: Create a zero-copy view starting at the
+#' indicated integer offset and going for the given length, or to the end
+#' of the table if `NULL`, the default.
+#' - `$Take(i)`: return a `Table` with rows at positions given by
+#'    integers `i`. If `i` is an Arrow `Array` or `ChunkedArray`, it will be
+#'    coerced to an R vector before taking.
+#' - `$Filter(i, keep_na = TRUE)`: return a `Table` with rows at positions where the logical
+#'    vector or Arrow boolean-type `(Chunked)Array` `i` is `TRUE`.
+#' - `$SortIndices(names, descending = FALSE)`: return an `Array` of integer row
+#' positions that can be used to rearrange the `Table` in ascending or descending
+#' order by the first named column, breaking ties with further named columns.
+#' `descending` can be a logical vector of length one or of the same length as
+#' `names`.
+#' - `$serialize(output_stream, ...)`: Write the table to the given
+#' [OutputStream]
+#' - `$cast(target_schema, safe = TRUE, options = cast_options(safe))`: Alter
+#' the schema of the record batch.
+#'
+#' There are also some active bindings:
+#' - `$num_columns`
+#' - `$num_rows`
+#' - `$schema`
+#' - `$metadata`: Returns the key-value metadata of the `Schema` as a named list.
+#' Modify or replace by assigning in (`tab$metadata <- new_metadata`).
+#' All list elements are coerced to string. See `schema()` for more information.
+#' - `$columns`: Returns a list of `ChunkedArray`s
+#' @rdname Table
+#' @name Table
+#' @export
+Table <- R6Class("Table",
+ inherit = ArrowTabular,
+ public = list(
+ column = function(i) Table__column(self, i),
+ ColumnNames = function() Table__ColumnNames(self),
+ RenameColumns = function(value) Table__RenameColumns(self, value),
+ GetColumnByName = function(name) {
+ assert_is(name, "character")
+ assert_that(length(name) == 1)
+ Table__GetColumnByName(self, name)
+ },
+ RemoveColumn = function(i) Table__RemoveColumn(self, i),
+ AddColumn = function(i, new_field, value) Table__AddColumn(self, i, new_field, value),
+ SetColumn = function(i, new_field, value) Table__SetColumn(self, i, new_field, value),
+ ReplaceSchemaMetadata = function(new) {
+ Table__ReplaceSchemaMetadata(self, new)
+ },
+ field = function(i) Table__field(self, i),
+ serialize = function(output_stream, ...) write_table(self, output_stream, ...),
+ to_data_frame = function() {
+ Table__to_dataframe(self, use_threads = option_use_threads())
+ },
+ cast = function(target_schema, safe = TRUE, ..., options = cast_options(safe, ...)) {
+ assert_is(target_schema, "Schema")
+ assert_that(identical(self$schema$names, target_schema$names), msg = "incompatible schemas")
+ Table__cast(self, target_schema, options)
+ },
+ SelectColumns = function(indices) Table__SelectColumns(self, indices),
+ Slice = function(offset, length = NULL) {
+ if (is.null(length)) {
+ Table__Slice1(self, offset)
+ } else {
+ Table__Slice2(self, offset, length)
+ }
+ },
+ # Take, Filter, and SortIndices are methods on ArrowTabular
+ Equals = function(other, check_metadata = FALSE, ...) {
+ inherits(other, "Table") && Table__Equals(self, other, isTRUE(check_metadata))
+ },
+ Validate = function() Table__Validate(self),
+ ValidateFull = function() Table__ValidateFull(self),
+ invalidate = function() {
+ .Call(`_arrow_Table__Reset`, self)
+ super$invalidate()
+ }
+ ),
+ active = list(
+ num_columns = function() Table__num_columns(self),
+ num_rows = function() Table__num_rows(self),
+ schema = function() Table__schema(self),
+ columns = function() Table__columns(self)
+ )
+)
+
+Table$create <- function(..., schema = NULL) {
+ dots <- list2(...)
+ # making sure there are always names
+ if (is.null(names(dots))) {
+ names(dots) <- rep_len("", length(dots))
+ }
+ stopifnot(length(dots) > 0)
+
+ if (all_record_batches(dots)) {
+ return(Table__from_record_batches(dots, schema))
+ }
+
+ # If any arrays are length 1, recycle them
+ dots <- recycle_scalars(dots)
+
+ Table__from_dots(dots, schema, option_use_threads())
+}
+
+#' @export
+names.Table <- function(x) x$ColumnNames()
+
+#' @param ... A `data.frame` or a named set of Arrays or vectors. If given a
+#' mixture of data.frames and named vectors, the inputs will be autospliced together
+#' (see examples). Alternatively, you can provide a single Arrow IPC
+#' `InputStream`, `Message`, `Buffer`, or R `raw` object containing a `Buffer`.
+#' @param schema a [Schema], or `NULL` (the default) to infer the schema from
+#' the data in `...`. When providing an Arrow IPC buffer, `schema` is required.
+#' @rdname Table
+#' @examplesIf arrow_available()
+#' tbl <- arrow_table(name = rownames(mtcars), mtcars)
+#' dim(tbl)
+#' dim(head(tbl))
+#' names(tbl)
+#' tbl$mpg
+#' tbl[["cyl"]]
+#' as.data.frame(tbl[4:8, c("gear", "hp", "wt")])
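+#' # R6 methods can be called directly; a small sketch with $Slice(), which
+#' # takes a 0-based offset and a length and returns a zero-copy view:
+#' as.data.frame(tbl$Slice(1, 2))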
+#' @export
+arrow_table <- Table$create
diff --git a/src/arrow/r/R/type.R b/src/arrow/r/R/type.R
new file mode 100644
index 000000000..4ef7cefb5
--- /dev/null
+++ b/src/arrow/r/R/type.R
@@ -0,0 +1,541 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-package.R
+#' @title class arrow::DataType
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname DataType
+#' @name DataType
+DataType <- R6Class("DataType",
+ inherit = ArrowObject,
+ public = list(
+ ToString = function() {
+ DataType__ToString(self)
+ },
+ Equals = function(other, ...) {
+ inherits(other, "DataType") && DataType__Equals(self, other)
+ },
+ fields = function() {
+ DataType__fields(self)
+ },
+ export_to_c = function(ptr) ExportType(self, ptr)
+ ),
+ active = list(
+ id = function() DataType__id(self),
+ name = function() DataType__name(self),
+ num_fields = function() DataType__num_fields(self)
+ )
+)
+
+#' @include arrowExports.R
+DataType$import_from_c <- ImportType
+
+INTEGER_TYPES <- as.character(outer(c("uint", "int"), c(8, 16, 32, 64), paste0))
+FLOAT_TYPES <- c("float16", "float32", "float64", "halffloat", "float", "double")
+
+#' Infer the arrow Array type from an R vector
+#'
+#' @param x an R vector
+#'
+#' @return an arrow data type ([DataType])
+#' @examplesIf arrow_available()
+#' type(1:10)
+#' type(c(TRUE, FALSE))
+#' type(c(1, 1.5, 2))
+#' type(c("A", "B", "C"))
+#' type(mtcars)
+#' type(Sys.Date())
+#' @export
+type <- function(x) UseMethod("type")
+
+#' @export
+type.default <- function(x) Array__infer_type(x)
+
+#' @export
+type.ArrowDatum <- function(x) x$type
+
+#----- metadata
+
+#' @title class arrow::FixedWidthType
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname FixedWidthType
+#' @name FixedWidthType
+FixedWidthType <- R6Class("FixedWidthType",
+ inherit = DataType,
+ active = list(
+ bit_width = function() FixedWidthType__bit_width(self)
+ )
+)
+
+Int8 <- R6Class("Int8", inherit = FixedWidthType)
+Int16 <- R6Class("Int16", inherit = FixedWidthType)
+Int32 <- R6Class("Int32", inherit = FixedWidthType)
+Int64 <- R6Class("Int64", inherit = FixedWidthType)
+UInt8 <- R6Class("UInt8", inherit = FixedWidthType)
+UInt16 <- R6Class("UInt16", inherit = FixedWidthType)
+UInt32 <- R6Class("UInt32", inherit = FixedWidthType)
+UInt64 <- R6Class("UInt64", inherit = FixedWidthType)
+Float16 <- R6Class("Float16", inherit = FixedWidthType)
+Float32 <- R6Class("Float32", inherit = FixedWidthType)
+Float64 <- R6Class("Float64", inherit = FixedWidthType)
+Boolean <- R6Class("Boolean", inherit = FixedWidthType)
+Utf8 <- R6Class("Utf8", inherit = DataType)
+LargeUtf8 <- R6Class("LargeUtf8", inherit = DataType)
+Binary <- R6Class("Binary", inherit = DataType)
+FixedSizeBinary <- R6Class("FixedSizeBinary", inherit = FixedWidthType)
+LargeBinary <- R6Class("LargeBinary", inherit = DataType)
+
+DateType <- R6Class("DateType",
+ inherit = FixedWidthType,
+ public = list(
+ unit = function() DateType__unit(self)
+ )
+)
+Date32 <- R6Class("Date32", inherit = DateType)
+Date64 <- R6Class("Date64", inherit = DateType)
+
+TimeType <- R6Class("TimeType",
+ inherit = FixedWidthType,
+ public = list(
+ unit = function() TimeType__unit(self)
+ )
+)
+Time32 <- R6Class("Time32", inherit = TimeType)
+Time64 <- R6Class("Time64", inherit = TimeType)
+
+Null <- R6Class("Null", inherit = DataType)
+
+Timestamp <- R6Class("Timestamp",
+ inherit = FixedWidthType,
+ public = list(
+ timezone = function() TimestampType__timezone(self),
+ unit = function() TimestampType__unit(self)
+ )
+)
+
+DecimalType <- R6Class("DecimalType",
+ inherit = FixedWidthType,
+ public = list(
+ precision = function() DecimalType__precision(self),
+ scale = function() DecimalType__scale(self)
+ )
+)
+Decimal128Type <- R6Class("Decimal128Type", inherit = DecimalType)
+
+NestedType <- R6Class("NestedType", inherit = DataType)
+
+#' Apache Arrow data types
+#'
+#' These functions create type objects corresponding to Arrow types. Use them
+#' when defining a [schema()] or as inputs to other types, like `struct`. Most
+#' of these functions don't take arguments, but a few do.
+#'
+#' A few functions have aliases:
+#'
+#' * `utf8()` and `string()`
+#' * `float16()` and `halffloat()`
+#' * `float32()` and `float()`
+#' * `bool()` and `boolean()`
+#' * When called inside an `arrow` function, such as `schema()` or `cast()`,
+#' `double()` also is supported as a way of creating a `float64()`
+#'
+#' `date32()` creates a date type with a "day" unit, like the R `Date`
+#' class. `date64()` has a "ms" unit.
+#'
+#' `uint32` (32-bit unsigned integer), `uint64` (64-bit unsigned integer), and
+#' `int64` (64-bit signed integer) types may contain values that exceed the
+#' range of R's `integer` type (32-bit signed integer). When these arrow objects
+#' are translated to R objects, `uint32` and `uint64` are converted to `double`
+#' ("numeric") and `int64` is converted to `bit64::integer64`. For `int64`
+#' types, this conversion can be disabled (so that `int64` always yields a
+#' `bit64::integer64` object) by setting `options(arrow.int64_downcast =
+#' FALSE)`.
+#'
+#' @param unit For time/timestamp types, the time unit. `time32()` can take
+#' either "s" or "ms", while `time64()` can be "us" or "ns". `timestamp()` can
+#' take any of those four values.
+#' @param timezone For `timestamp()`, an optional time zone string.
+#' @param byte_width byte width for `FixedSizeBinary` type.
+#' @param list_size list size for `FixedSizeList` type.
+#' @param precision For `decimal()`, precision
+#' @param scale For `decimal()`, scale
+#' @param type For `list_of()`, a data type to make a list-of-type
+#' @param ... For `struct()`, a named list of types to define the struct columns
+#'
+#' @name data-type
+#' @return An Arrow type object inheriting from DataType.
+#' @export
+#' @seealso [dictionary()] for creating a dictionary (factor-like) type.
+#' @examplesIf arrow_available()
+#' bool()
+#' struct(a = int32(), b = double())
+#' timestamp("ms", timezone = "CEST")
+#' time64("ns")
+int8 <- function() Int8__initialize()
+
+#' @rdname data-type
+#' @export
+int16 <- function() Int16__initialize()
+
+#' @rdname data-type
+#' @export
+int32 <- function() Int32__initialize()
+
+#' @rdname data-type
+#' @export
+int64 <- function() Int64__initialize()
+
+#' @rdname data-type
+#' @export
+uint8 <- function() UInt8__initialize()
+
+#' @rdname data-type
+#' @export
+uint16 <- function() UInt16__initialize()
+
+#' @rdname data-type
+#' @export
+uint32 <- function() UInt32__initialize()
+
+#' @rdname data-type
+#' @export
+uint64 <- function() UInt64__initialize()
+
+#' @rdname data-type
+#' @export
+float16 <- function() Float16__initialize()
+
+#' @rdname data-type
+#' @export
+halffloat <- float16
+
+#' @rdname data-type
+#' @export
+float32 <- function() Float32__initialize()
+
+#' @rdname data-type
+#' @export
+float <- float32
+
+#' @rdname data-type
+#' @export
+float64 <- function() Float64__initialize()
+
+#' @rdname data-type
+#' @export
+boolean <- function() Boolean__initialize()
+
+#' @rdname data-type
+#' @export
+bool <- boolean
+
+#' @rdname data-type
+#' @export
+utf8 <- function() Utf8__initialize()
+
+#' @rdname data-type
+#' @export
+large_utf8 <- function() LargeUtf8__initialize()
+
+#' @rdname data-type
+#' @export
+binary <- function() Binary__initialize()
+
+#' @rdname data-type
+#' @export
+large_binary <- function() LargeBinary__initialize()
+
+#' @rdname data-type
+#' @export
+fixed_size_binary <- function(byte_width) FixedSizeBinary__initialize(byte_width)
+
+#' @rdname data-type
+#' @export
+string <- utf8
+
+#' @rdname data-type
+#' @export
+date32 <- function() Date32__initialize()
+
+#' @rdname data-type
+#' @export
+date64 <- function() Date64__initialize()
+
+#' @rdname data-type
+#' @export
+time32 <- function(unit = c("ms", "s")) {
+ if (is.character(unit)) {
+ unit <- match.arg(unit)
+ }
+ unit <- make_valid_time_unit(unit, valid_time32_units)
+ Time32__initialize(unit)
+}
+
+valid_time32_units <- c(
+ "ms" = TimeUnit$MILLI,
+ "s" = TimeUnit$SECOND
+)
+
+valid_time64_units <- c(
+ "ns" = TimeUnit$NANO,
+ "us" = TimeUnit$MICRO
+)
+
+make_valid_time_unit <- function(unit, valid_units) {
+ if (is.character(unit)) {
+ unit <- valid_units[match.arg(unit, choices = names(valid_units))]
+ }
+ if (is.numeric(unit)) {
+ # Allow non-integer input for convenience
+ unit <- as.integer(unit)
+ } else {
+ stop('"unit" should be one of ', oxford_paste(names(valid_units), "or"), call. = FALSE)
+ }
+ if (!(unit %in% valid_units)) {
+ stop('"unit" should be one of ', oxford_paste(valid_units, "or"), call. = FALSE)
+ }
+ unit
+}
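+
+# For example: make_valid_time_unit("ms", valid_time32_units) returns
+# TimeUnit$MILLI as a plain integer; bare numeric input (e.g. 1) is accepted
+# so long as it matches one of the values in valid_units.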
+
+#' @rdname data-type
+#' @export
+time64 <- function(unit = c("ns", "us")) {
+ if (is.character(unit)) {
+ unit <- match.arg(unit)
+ }
+ unit <- make_valid_time_unit(unit, valid_time64_units)
+ Time64__initialize(unit)
+}
+
+#' @rdname data-type
+#' @export
+null <- function() Null__initialize()
+
+#' @rdname data-type
+#' @export
+timestamp <- function(unit = c("s", "ms", "us", "ns"), timezone = "") {
+ if (is.character(unit)) {
+ unit <- match.arg(unit)
+ }
+ unit <- make_valid_time_unit(unit, c(valid_time64_units, valid_time32_units))
+ assert_that(is.string(timezone))
+ Timestamp__initialize(unit, timezone)
+}
+
+#' @rdname data-type
+#' @export
+decimal <- function(precision, scale) {
+ if (is.numeric(precision)) {
+ precision <- as.integer(precision)
+ } else {
+ stop('"precision" must be an integer', call. = FALSE)
+ }
+ if (is.numeric(scale)) {
+ scale <- as.integer(scale)
+ } else {
+ stop('"scale" must be an integer', call. = FALSE)
+ }
+ Decimal128Type__initialize(precision, scale)
+}
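+
+# For example: decimal(5, 2) creates a decimal128 type for numbers with 5
+# significant digits, 2 of which fall after the decimal point.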
+
+StructType <- R6Class("StructType",
+ inherit = NestedType,
+ public = list(
+ GetFieldByName = function(name) StructType__GetFieldByName(self, name),
+ GetFieldIndex = function(name) StructType__GetFieldIndex(self, name)
+ )
+)
+StructType$create <- function(...) struct__(.fields(list(...)))
+
+#' @rdname data-type
+#' @export
+struct <- StructType$create
+
+ListType <- R6Class("ListType",
+ inherit = NestedType,
+ active = list(
+ value_field = function() ListType__value_field(self),
+ value_type = function() ListType__value_type(self)
+ )
+)
+
+#' @rdname data-type
+#' @export
+list_of <- function(type) list__(type)
+
+LargeListType <- R6Class("LargeListType",
+ inherit = NestedType,
+ active = list(
+ value_field = function() LargeListType__value_field(self),
+ value_type = function() LargeListType__value_type(self)
+ )
+)
+
+#' @rdname data-type
+#' @export
+large_list_of <- function(type) large_list__(type)
+
+#' @rdname data-type
+#' @export
+FixedSizeListType <- R6Class("FixedSizeListType",
+ inherit = NestedType,
+ active = list(
+ value_field = function() FixedSizeListType__value_field(self),
+ value_type = function() FixedSizeListType__value_type(self),
+ list_size = function() FixedSizeListType__list_size(self)
+ )
+)
+
+#' @rdname data-type
+#' @export
+fixed_size_list_of <- function(type, list_size) fixed_size_list__(type, list_size)
+
+as_type <- function(type, name = "type") {
+ # magic so we don't have to mask base::double()
+ if (identical(type, double())) {
+ type <- float64()
+ }
+ if (!inherits(type, "DataType")) {
+ stop(name, " must be a DataType, not ", class(type), call. = FALSE)
+ }
+ type
+}
+
+canonical_type_str <- function(type_str) {
+ # canonicalizes data type strings, converting data type function names and
+ # aliases to match the strings returned by DataType$ToString()
+ assert_that(is.string(type_str))
+ if (grepl("[([<]", type_str)) {
+ stop("Cannot interpret string representations of data types that have parameters", call. = FALSE)
+ }
+ switch(type_str,
+ int8 = "int8",
+ int16 = "int16",
+ int32 = "int32",
+ int64 = "int64",
+ uint8 = "uint8",
+ uint16 = "uint16",
+ uint32 = "uint32",
+ uint64 = "uint64",
+ float16 = "halffloat",
+ halffloat = "halffloat",
+ float32 = "float",
+ float = "float",
+ float64 = "double",
+ double = "double",
+ boolean = "bool",
+ bool = "bool",
+ utf8 = "string",
+ large_utf8 = "large_string",
+ large_string = "large_string",
+ binary = "binary",
+ large_binary = "large_binary",
+ fixed_size_binary = "fixed_size_binary",
+ string = "string",
+ date32 = "date32",
+ date64 = "date64",
+ time32 = "time32",
+ time64 = "time64",
+ null = "null",
+ timestamp = "timestamp",
+ decimal = "decimal128",
+ struct = "struct",
+ list_of = "list",
+ list = "list",
+ large_list_of = "large_list",
+ large_list = "large_list",
+ fixed_size_list_of = "fixed_size_list",
+ fixed_size_list = "fixed_size_list",
+ stop("Unrecognized string representation of data type", call. = FALSE)
+ )
+}
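+
+# For example: canonical_type_str("float64") returns "double" and
+# canonical_type_str("utf8") returns "string"; parameterized strings such as
+# "decimal(5, 2)" are rejected by the grepl() guard above.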
+
+# vctrs support -----------------------------------------------------------
+str_dup <- function(x, times) {
+ paste0(rep(x, times = times), collapse = "")
+}
+
+indent <- function(x, n) {
+ pad <- str_dup(" ", n)
+ sapply(x, gsub, pattern = "(\n+)", replacement = paste0("\\1", pad))
+}
+
+#' @importFrom vctrs vec_ptype_full vec_ptype_abbr
+#' @export
+vec_ptype_full.arrow_fixed_size_binary <- function(x, ...) {
+ paste0("fixed_size_binary<", attr(x, "byte_width"), ">")
+}
+
+#' @export
+vec_ptype_full.arrow_list <- function(x, ...) {
+ param <- vec_ptype_full(attr(x, "ptype"))
+ if (grepl("\n", param)) {
+ param <- paste0(indent(paste0("\n", param), 2), "\n")
+ }
+ paste0("list<", param, ">")
+}
+
+#' @export
+vec_ptype_full.arrow_large_list <- function(x, ...) {
+ param <- vec_ptype_full(attr(x, "ptype"))
+ if (grepl("\n", param)) {
+ param <- paste0(indent(paste0("\n", param), 2), "\n")
+ }
+ paste0("large_list<", param, ">")
+}
+
+#' @export
+vec_ptype_full.arrow_fixed_size_list <- function(x, ...) {
+ param <- vec_ptype_full(attr(x, "ptype"))
+ if (grepl("\n", param)) {
+ param <- paste0(indent(paste0("\n", param), 2), "\n")
+ }
+ paste0("fixed_size_list<", param, ", ", attr(x, "list_size"), ">")
+}
+
+#' @export
+vec_ptype_abbr.arrow_fixed_size_binary <- function(x, ...) {
+ vec_ptype_full(x, ...)
+}
+#' @export
+vec_ptype_abbr.arrow_list <- function(x, ...) {
+ vec_ptype_full(x, ...)
+}
+#' @export
+vec_ptype_abbr.arrow_large_list <- function(x, ...) {
+ vec_ptype_full(x, ...)
+}
+#' @export
+vec_ptype_abbr.arrow_fixed_size_list <- function(x, ...) {
+ vec_ptype_full(x, ...)
+}
diff --git a/src/arrow/r/R/util.R b/src/arrow/r/R/util.R
new file mode 100644
index 000000000..9e3ade6a9
--- /dev/null
+++ b/src/arrow/r/R/util.R
@@ -0,0 +1,195 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# for compatibility with R versions earlier than 4.0.0
+if (!exists("deparse1")) {
+ deparse1 <- function(expr, collapse = " ", width.cutoff = 500L, ...) {
+ paste(deparse(expr, width.cutoff, ...), collapse = collapse)
+ }
+}
+
+# for compatibility with R versions earlier than 3.6.0
+if (!exists("str2lang")) {
+ str2lang <- function(s) {
+ parse(text = s, keep.source = FALSE)[[1]]
+ }
+}
+
+oxford_paste <- function(x, conjunction = "and", quote = TRUE) {
+ if (quote && is.character(x)) {
+ x <- paste0('"', x, '"')
+ }
+ if (length(x) < 2) {
+ return(x)
+ }
+ x[length(x)] <- paste(conjunction, x[length(x)])
+ if (length(x) > 2) {
+ return(paste(x, collapse = ", "))
+ } else {
+ return(paste(x, collapse = " "))
+ }
+}
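+
+# For example: oxford_paste(c("a", "b", "c")) returns '"a", "b", and "c"',
+# while oxford_paste(c("ms", "s"), "or") returns '"ms" or "s"'.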
+
+assert_is <- function(object, class) {
+ msg <- paste(substitute(object), "must be a", oxford_paste(class, "or"))
+ assert_that(inherits(object, class), msg = msg)
+}
+
+assert_is_list_of <- function(object, class) {
+ msg <- paste(substitute(object), "must be a list of", oxford_paste(class, "or"))
+ assert_that(is_list_of(object, class), msg = msg)
+}
+
+is_list_of <- function(object, class) {
+ is.list(object) && all(map_lgl(object, ~ inherits(., class)))
+}
+
+empty_named_list <- function() structure(list(), .Names = character(0))
+
+r_symbolic_constants <- c(
+ "pi", "TRUE", "FALSE", "NULL", "Inf", "NA", "NaN",
+ "NA_integer_", "NA_real_", "NA_complex_", "NA_character_"
+)
+
+is_function <- function(expr, name) {
+ # We could have a quosure here if we have an expression like `sum({{ var }})`
+ if (is_quosure(expr)) {
+ expr <- quo_get_expr(expr)
+ }
+ if (!is.call(expr)) {
+ return(FALSE)
+ } else {
+ if (deparse(expr[[1]]) == name) {
+ return(TRUE)
+ }
+ out <- lapply(expr, is_function, name)
+ }
+ any(map_lgl(out, isTRUE))
+}
+
+all_funs <- function(expr) {
+ # It is not sufficient to simply do: setdiff(all.names, all.vars)
+ # here because that would fail to return the names of functions that
+ # share names with variables.
+ # To preserve duplicates, call `all.names()` not `all_names()` here.
+ if (is_quosure(expr)) {
+ expr <- quo_get_expr(expr)
+ }
+ names <- all.names(expr)
+ names[map_lgl(names, ~ is_function(expr, .))]
+}
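+
+# For example: all_funs(quote(sum(x + mean(y)))) returns c("sum", "+", "mean");
+# the variable names x and y are dropped by the is_function() filter.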
+
+all_vars <- function(expr) {
+ setdiff(all.vars(expr), r_symbolic_constants)
+}
+
+all_names <- function(expr) {
+ setdiff(all.names(expr), r_symbolic_constants)
+}
+
+is_constant <- function(expr) {
+ length(all_vars(expr)) == 0
+}
+
+read_compressed_error <- function(e) {
+ msg <- conditionMessage(e)
+ if (grepl(" codec ", msg)) {
+ compression <- sub(".*Support for codec '(.*)'.*", "\\1", msg)
+ e$message <- paste0(
+ msg,
+ "\nIn order to read this file, you will need to reinstall arrow with additional features enabled.",
+ "\nSet one of these environment variables before installing:",
+ sprintf("\n\n * LIBARROW_MINIMAL=false (for all optional features, including '%s')", compression),
+ sprintf("\n * ARROW_WITH_%s=ON (for just '%s')", toupper(compression), compression),
+ "\n\nSee https://arrow.apache.org/docs/r/articles/install.html for details"
+ )
+ }
+ stop(e)
+}
+
+handle_parquet_io_error <- function(e, format) {
+ msg <- conditionMessage(e)
+ if (grepl("Parquet magic bytes not found in footer", msg) && length(format) > 1 && is_character(format)) {
+ # If length(format) > 1, that means it is (almost certainly) the default/not specified value
+ # so let the user know that they should specify the actual (not parquet) format
+ abort(c(
+ msg,
+ i = "Did you mean to specify a 'format' other than the default (parquet)?"
+ ))
+ }
+ stop(e)
+}
+
+is_writable_table <- function(x) {
+ inherits(x, c("data.frame", "ArrowTabular"))
+}
+
+# This attribute is used when is_writable is passed into assert_that, and allows
+# the call to form part of the error message when is_writable is FALSE
+attr(is_writable_table, "fail") <- function(call, env) {
+ paste0(
+ deparse(call$x),
+ " must be an object of class 'data.frame', 'RecordBatch', or 'Table', not '",
+ class(env[[deparse(call$x)]])[[1]],
+ "'."
+ )
+}
+
+#' Recycle scalar values in a list of arrays
+#'
+#' @param arrays List of arrays
+#' @return List of arrays with any vector/Scalar/Array/ChunkedArray values of length 1 recycled
+#' @keywords internal
+recycle_scalars <- function(arrays) {
+ # Get lengths of items in arrays
+ arr_lens <- map_int(arrays, NROW)
+
+ is_scalar <- arr_lens == 1
+
+ if (length(arrays) > 1 && any(is_scalar) && !all(is_scalar)) {
+
+ # Recycling not supported for tibbles and data.frames
+ if (all(map_lgl(arrays, ~ inherits(.x, "data.frame")))) {
+ abort(c(
+ "All input tibbles or data.frames must have the same number of rows",
+ x = paste(
+ "Number of rows in longest and shortest inputs:",
+ oxford_paste(c(max(arr_lens), min(arr_lens)))
+ )
+ ))
+ }
+
+ max_array_len <- max(arr_lens)
+ arrays[is_scalar] <- lapply(arrays[is_scalar], repeat_value_as_array, max_array_len)
+ }
+ arrays
+}
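+
+# For example: recycle_scalars(list(a = 1, b = 1:3)) expands `a` via
+# repeat_value_as_array() into an Arrow Array of length 3 (i.e. 1, 1, 1).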
+
+#' Take an object of length 1 and repeat it.
+#'
+#' @param object Object of length 1 to be repeated - vector, `Scalar`, `Array`, or `ChunkedArray`
+#' @param n Number of repetitions
+#'
+#' @return `Array` of length `n`
+#'
+#' @keywords internal
+repeat_value_as_array <- function(object, n) {
+ if (inherits(object, "ChunkedArray")) {
+ return(Scalar$create(object$chunks[[1]])$as_array(n))
+ }
+ return(Scalar$create(object)$as_array(n))
+}
diff --git a/src/arrow/r/README.md b/src/arrow/r/README.md
new file mode 100644
index 000000000..dcd529dae
--- /dev/null
+++ b/src/arrow/r/README.md
@@ -0,0 +1,335 @@
+# arrow
+
+[![cran](https://www.r-pkg.org/badges/version-last-release/arrow)](https://cran.r-project.org/package=arrow)
+[![CI](https://github.com/apache/arrow/workflows/R/badge.svg?event=push)](https://github.com/apache/arrow/actions?query=workflow%3AR+branch%3Amaster+event%3Apush)
+[![conda-forge](https://img.shields.io/conda/vn/conda-forge/r-arrow.svg)](https://anaconda.org/conda-forge/r-arrow)
+
+**[Apache Arrow](https://arrow.apache.org/) is a cross-language
+development platform for in-memory data.** It specifies a standardized
+language-independent columnar memory format for flat and hierarchical
+data, organized for efficient analytic operations on modern hardware. It
+also provides computational libraries and zero-copy streaming messaging
+and interprocess communication.
+
+**The `arrow` package exposes an interface to the Arrow C++ library,
+enabling access to many of its features in R.** It provides low-level
+access to the Arrow C++ library API and higher-level access through a
+`dplyr` backend and familiar R functions.
+
+## What can the `arrow` package do?
+
+- Read and write **Parquet files** (`read_parquet()`,
+ `write_parquet()`), an efficient and widely used columnar format
+- Read and write **Feather files** (`read_feather()`,
+ `write_feather()`), a format optimized for speed and
+ interoperability
+- Analyze, process, and write **multi-file, larger-than-memory
+ datasets** (`open_dataset()`, `write_dataset()`)
+- Read **large CSV and JSON files** with excellent **speed and
+ efficiency** (`read_csv_arrow()`, `read_json_arrow()`)
+- Write CSV files (`write_csv_arrow()`)
+- Manipulate and analyze Arrow data with **`dplyr` verbs**
+- Read and write files in **Amazon S3** buckets with no additional
+ function calls
+- Exercise **fine control over column types** for seamless
+ interoperability with databases and data warehouse systems
+- Use **compression codecs** including Snappy, gzip, Brotli,
+ Zstandard, LZ4, LZO, and bzip2 for reading and writing data
+- Enable **zero-copy data sharing** between **R and Python**
+- Connect to **Arrow Flight** RPC servers to send and receive large
+ datasets over networks
+- Access and manipulate Arrow objects through **low-level bindings**
+ to the C++ library
+- Provide a **toolkit for building connectors** to other applications
+ and services that use Arrow
+
+## Installation
+
+### Installing the latest release version
+
+Install the latest release of `arrow` from CRAN with
+
+``` r
+install.packages("arrow")
+```
+
+Conda users can install `arrow` from conda-forge with
+
+``` shell
+conda install -c conda-forge --strict-channel-priority r-arrow
+```
+
+Installing a released version of the `arrow` package requires no
+additional system dependencies. For macOS and Windows, CRAN hosts binary
+packages that contain the Arrow C++ library. On Linux, source package
+installation will also build necessary C++ dependencies. For a faster,
+more complete installation, set the environment variable
+`NOT_CRAN=true`. See `vignette("install", package = "arrow")` for
+details.
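+
+For example, a minimal sketch of such an installation (setting the variable
+just for the current R session) is:
+
+``` r
+# enable a fuller-featured build when installing from source
+Sys.setenv(NOT_CRAN = "true")
+install.packages("arrow")
+```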
+
+For Windows users of R 3.6 and earlier, note that support for AWS S3 is not
+available, and the 32-bit version does not support Arrow Datasets.
+These features are only supported by the `rtools40` toolchain on Windows
+and thus are only available in R >= 4.0.
+
+### Installing a development version
+
+Development versions of the package (binary and source) are built
+nightly and hosted at <https://arrow-r-nightly.s3.amazonaws.com>. To
+install from there:
+
+``` r
+install.packages("arrow", repos = "https://arrow-r-nightly.s3.amazonaws.com")
+```
+
+Conda users can install `arrow` nightly builds with
+
+``` shell
+conda install -c arrow-nightlies -c conda-forge --strict-channel-priority r-arrow
+```
+
+If you already have a version of `arrow` installed, you can switch to
+the latest nightly development version with
+
+``` r
+arrow::install_arrow(nightly = TRUE)
+```
+
+These nightly package builds are not official Apache releases and are
+not recommended for production use. They may be useful for testing bug
+fixes and new features under active development.
+
+## Usage
+
+Among the many applications of the `arrow` package, two of the most accessible are:
+
+- High-performance reading and writing of data files with multiple
+ file formats and compression codecs, including built-in support for
+ cloud storage
+- Analyzing and manipulating bigger-than-memory data with `dplyr`
+ verbs
+
+The sections below describe these two uses and illustrate them with
+basic examples. The sections below mention two Arrow data structures:
+
+- `Table`: a tabular, column-oriented data structure capable of
+ storing and processing large amounts of data more efficiently than
+ R’s built-in `data.frame` and with SQL-like column data types that
+ afford better interoperability with databases and data warehouse
+ systems
+- `Dataset`: a data structure functionally similar to `Table` but with
+ the capability to work on larger-than-memory data partitioned across
+ multiple files
+
+### Reading and writing data files with `arrow`
+
+The `arrow` package provides functions for reading single data files in
+several common formats. By default, calling any of these functions
+returns an R `data.frame`. To return an Arrow `Table`, set argument
+`as_data_frame = FALSE`.
+
+- `read_parquet()`: read a file in Parquet format
+- `read_feather()`: read a file in Feather format (the Apache Arrow
+ IPC format)
+- `read_delim_arrow()`: read a delimited text file (default delimiter
+ is comma)
+- `read_csv_arrow()`: read a comma-separated values (CSV) file
+- `read_tsv_arrow()`: read a tab-separated values (TSV) file
+- `read_json_arrow()`: read a JSON data file
+
+For writing data to single files, the `arrow` package provides the
+functions `write_parquet()`, `write_feather()`, and `write_csv_arrow()`.
+These can be used with R `data.frame` and Arrow `Table` objects.
+
+For example, let’s write the Star Wars characters data that’s included
+in `dplyr` to a Parquet file, then read it back in. Parquet is a popular
+choice for storing analytic data; it is optimized for reduced file sizes
+and fast read performance, especially for column-based access patterns.
+Parquet is widely supported by many tools and platforms.
+
+First load the `arrow` and `dplyr` packages:
+
+``` r
+library(arrow, warn.conflicts = FALSE)
+library(dplyr, warn.conflicts = FALSE)
+```
+
+Then write the `data.frame` named `starwars` to a Parquet file at
+`file_path`:
+
+``` r
+file_path <- tempfile()
+write_parquet(starwars, file_path)
+```
+
+Then read the Parquet file into an R `data.frame` named `sw`:
+
+``` r
+sw <- read_parquet(file_path)
+```
+
+R object attributes are preserved when writing data to Parquet or
+Feather files and when reading those files back into R. This enables
+round-trip writing and reading of `sf::sf` objects, R `data.frame`s with
+`haven::labelled` columns, and `data.frame`s with other custom
+attributes.
+
+For reading and writing larger files or sets of multiple files, `arrow`
+defines `Dataset` objects and provides the functions `open_dataset()`
+and `write_dataset()`, which enable analysis and processing of
+bigger-than-memory data, including the ability to partition data into
+smaller chunks without loading the full data into memory. For examples
+of these functions, see `vignette("dataset", package = "arrow")`.
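+
+As a minimal sketch (the "mtcars_ds" path is a placeholder):
+
+``` r
+# write a data frame to a directory of Parquet files partitioned by cyl
+write_dataset(mtcars, "mtcars_ds", partitioning = "cyl")
+
+# open it lazily; rows are only read when a query is collect()ed
+ds <- open_dataset("mtcars_ds")
+```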
+
+All these functions can read and write files in the local filesystem or
+in Amazon S3 (by passing S3 URIs beginning with `s3://`). For more
+details, see `vignette("fs", package = "arrow")`.
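+
+For example, with a hypothetical bucket:
+
+``` r
+df <- read_parquet("s3://my-bucket/path/to/data.parquet")
+```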
+
+### Using `dplyr` with `arrow`
+
+The `arrow` package provides a `dplyr` backend enabling manipulation of
+Arrow tabular data with `dplyr` verbs. To use it, first load both
+packages `arrow` and `dplyr`. Then load data into an Arrow `Table` or
+`Dataset` object. For example, read the Parquet file written in the
+previous example into an Arrow `Table` named `sw`:
+
+``` r
+sw <- read_parquet(file_path, as_data_frame = FALSE)
+```
+
+Next, pipe on `dplyr` verbs:
+
+``` r
+result <- sw %>%
+ filter(homeworld == "Tatooine") %>%
+ rename(height_cm = height, mass_kg = mass) %>%
+ mutate(height_in = height_cm / 2.54, mass_lbs = mass_kg * 2.2046) %>%
+ arrange(desc(birth_year)) %>%
+ select(name, height_in, mass_lbs)
+```
+
+The `arrow` package uses lazy evaluation to delay computation until the
+result is required. This speeds up processing by enabling the Arrow C++
+library to perform multiple computations in one operation. `result` is
+an object with class `arrow_dplyr_query` which represents all the
+computations to be performed:
+
+``` r
+result
+#> Table (query)
+#> name: string
+#> height_in: expr
+#> mass_lbs: expr
+#>
+#> * Filter: equal(homeworld, "Tatooine")
+#> * Sorted by birth_year [desc]
+#> See $.data for the source Arrow object
+```
+
+To perform these computations and materialize the result, call
+`compute()` or `collect()`. `compute()` returns an Arrow `Table`,
+suitable for passing to other `arrow` or `dplyr` functions:
+
+``` r
+result %>% compute()
+#> Table
+#> 10 rows x 3 columns
+#> $name <string>
+#> $height_in <double>
+#> $mass_lbs <double>
+```
+
+`collect()` returns an R `data.frame`, suitable for viewing or passing
+to other R functions for analysis or visualization:
+
+``` r
+result %>% collect()
+#> # A tibble: 10 x 3
+#> name height_in mass_lbs
+#> <chr> <dbl> <dbl>
+#> 1 C-3PO 65.7 165.
+#> 2 Cliegg Lars 72.0 NA
+#> 3 Shmi Skywalker 64.2 NA
+#> 4 Owen Lars 70.1 265.
+#> 5 Beru Whitesun lars 65.0 165.
+#> 6 Darth Vader 79.5 300.
+#> 7 Anakin Skywalker 74.0 185.
+#> 8 Biggs Darklighter 72.0 185.
+#> 9 Luke Skywalker 67.7 170.
+#> 10 R5-D4 38.2 70.5
+```
+
+The `arrow` package works with most single-table `dplyr` verbs, including those
+that compute aggregates.
+
+```r
+sw %>%
+ group_by(species) %>%
+ summarise(mean_height = mean(height, na.rm = TRUE)) %>%
+ collect()
+```
+
+Additionally, equality joins (e.g. `left_join()`, `inner_join()`) are supported
+for joining multiple tables.
+
+```r
+jedi <- data.frame(
+ name = c("C-3PO", "Luke Skywalker", "Obi-Wan Kenobi"),
+ jedi = c(FALSE, TRUE, TRUE)
+)
+
+sw %>%
+ select(1:11) %>%
+ right_join(jedi) %>%
+ collect()
+```
+
+Window functions (e.g. `ntile()`) are not yet
+supported. Inside `dplyr` verbs, Arrow offers support for many functions and
+operators, with common functions mapped to their base R and tidyverse
+equivalents. The [changelog](https://arrow.apache.org/docs/r/news/index.html)
+lists many of them. If there are additional functions you would like to see
+implemented, please file an issue as described in the [Getting
+help](#getting-help) section below.
+
+For `dplyr` queries on `Table` objects, if the `arrow` package detects
+an unimplemented function within a `dplyr` verb, it automatically calls
+`collect()` to return the data as an R `data.frame` before processing
+that `dplyr` verb. For queries on `Dataset` objects (which can be larger
+than memory), it raises an error if the function is unimplemented;
+you need to explicitly tell it to `collect()`.
+
+### Additional features
+
+Other applications of `arrow` are described in the following vignettes:
+
+- `vignette("python", package = "arrow")`: use `arrow` and
+ `reticulate` to pass data between R and Python
+- `vignette("flight", package = "arrow")`: connect to Arrow Flight RPC
+ servers to send and receive data
+- `vignette("arrow", package = "arrow")`: access and manipulate Arrow
+ objects through low-level bindings to the C++ library
+
+## Getting help
+
+If you encounter a bug, please file an issue with a minimal reproducible
+example on the [Apache Jira issue
+tracker](https://issues.apache.org/jira/projects/ARROW/issues). Create
+an account or log in, then click **Create** to file an issue. Select the
+project **Apache Arrow (ARROW)**, select the component **R**, and begin
+the issue summary with **`[R]`** followed by a space. For more
+information, see the **Report bugs and propose features** section of the
+[Contributing to Apache
+Arrow](https://arrow.apache.org/docs/developers/contributing.html) page
+in the Arrow developer documentation.
+
+We welcome questions, discussion, and contributions from users of the
+`arrow` package. For information about mailing lists and other venues
+for engaging with the Arrow developer and user communities, please see
+the [Apache Arrow Community](https://arrow.apache.org/community/) page.
+
+------------------------------------------------------------------------
+
+All participation in the Apache Arrow project is governed by the Apache
+Software Foundation’s [code of
+conduct](https://www.apache.org/foundation/policies/conduct.html).
diff --git a/src/arrow/r/STYLE.md b/src/arrow/r/STYLE.md
new file mode 100644
index 000000000..760084936
--- /dev/null
+++ b/src/arrow/r/STYLE.md
@@ -0,0 +1,38 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Style
+
+This is a style guide for writing code and documentation for arrow.
+
+## Coding style
+
+Please use the [tidyverse coding style](https://style.tidyverse.org/).
+
+## Referring to external packages
+
+When referring to external packages, include a link to the package at the first mention, and subsequently refer to it in plain text, e.g.
+
+* "The arrow R package provides a [dplyr](https://dplyr.tidyverse.org/) interface to Arrow Datasets. This vignette introduces Datasets and shows how to use dplyr to analyze them."
+
+## Data frames
+
+When referring to the concept, use the phrase "data frame", whereas when referring to an object of that class or when the class is important, write `data.frame`, e.g.
+
+* "You can call `write_dataset()` on tabular data objects such as Arrow Tables or RecordBatches, or R data frames. If working with data frames you might want to use a `tibble` instead of a `data.frame` to take advantage of the default behaviour of partitioning data based on grouped variables."
diff --git a/src/arrow/r/_pkgdown.yml b/src/arrow/r/_pkgdown.yml
new file mode 100644
index 000000000..c6a19119e
--- /dev/null
+++ b/src/arrow/r/_pkgdown.yml
@@ -0,0 +1,185 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# NPR: uncomment this to build docs for release
+# destination: ../../arrow-site/asf-site/docs/r/
+url: https://arrow.apache.org/docs/r/
+title: Arrow R Package
+template:
+ params:
+ bootswatch: cosmo
+ ganalytics: UA-107500873-1
+navbar:
+ structure:
+ left:
+ - home
+ - intro
+ - reference
+ - articles
+ - news
+ - project
+ right: github
+ components:
+ home:
+      text: ❯❯❯
+ href: https://arrow.apache.org/
+ reference:
+ text: Reference
+ href: reference/index.html
+ project:
+ text: Project docs
+ menu:
+ - text: Specification
+ href: https://arrow.apache.org/docs/format/README.html
+ - text: C GLib
+ href: https://arrow.apache.org/docs/c_glib
+ - text: C++
+ href: https://arrow.apache.org/docs/cpp
+ - text: Java
+ href: https://arrow.apache.org/docs/java
+ - text: JavaScript
+ href: https://arrow.apache.org/docs/js
+ - text: Python
+ href: https://arrow.apache.org/docs/python
+ - text: R
+ href: index.html
+ articles:
+ text: Articles
+ menu:
+ - text: Installing the Arrow Package on Linux
+ href: articles/install.html
+ - text: Working with Arrow Datasets and dplyr
+ href: articles/dataset.html
+ - text: Working with Cloud Storage (S3)
+ href: articles/fs.html
+ - text: Apache Arrow in Python and R with reticulate
+ href: articles/python.html
+ - text: Connecting to Flight RPC Servers
+ href: articles/flight.html
+ - text: Arrow R Developer Guide
+ href: articles/developing.html
+reference:
+ - title: Multi-file datasets
+ contents:
+ - open_dataset
+ - write_dataset
+ - dataset_factory
+ - hive_partition
+ - Dataset
+ - Partitioning
+ - Expression
+ - Scanner
+ - FileFormat
+ - FileWriteOptions
+ - FragmentScanOptions
+ - map_batches
+ - title: Reading and writing files
+ contents:
+ - read_feather
+ - read_ipc_stream
+ - read_parquet
+ - read_delim_arrow
+ - read_json_arrow
+ - write_feather
+ - write_ipc_stream
+ - write_to_raw
+ - write_parquet
+ - write_csv_arrow
+ - title: C++ reader/writer interface
+ contents:
+ - ParquetFileReader
+ - ParquetArrowReaderProperties
+ - ParquetFileWriter
+ - ParquetWriterProperties
+ - FeatherReader
+ - CsvTableReader
+ - RecordBatchReader
+ - RecordBatchWriter
+ - CsvReadOptions
+ - CsvWriteOptions
+ - title: Arrow data containers
+ contents:
+ - array
+ - ChunkedArray
+ - Scalar
+ - RecordBatch
+ - Table
+ - ArrayData
+ - buffer
+ - read_message
+ - title: Arrow data types and schema
+ contents:
+ - Schema
+ - unify_schemas
+ - type
+ - dictionary
+ - Field
+ - read_schema
+ - data-type
+ - DataType
+ - DictionaryType
+ - FixedWidthType
+ - title: Flight
+ contents:
+ - load_flight_server
+ - flight_connect
+ - flight_get
+ - flight_put
+ - list_flights
+ - title: File systems
+ contents:
+ - s3_bucket
+ - FileSystem
+ - FileInfo
+ - FileSelector
+ - copy_files
+ - title: Input/Output
+ contents:
+ - InputStream
+ - mmap_open
+ - mmap_create
+ - OutputStream
+ - Message
+ - MessageReader
+ - compression
+ - Codec
+ - codec_is_available
+ - title: Computation
+ contents:
+ - call_function
+ - match_arrow
+ - value_counts
+ - list_compute_functions
+ - title: Connections to other systems
+ contents:
+ - to_arrow
+ - to_duckdb
+ - title: Configuration
+ contents:
+ - arrow_info
+ - cpu_count
+ - io_thread_count
+ - arrow_available
+ - install_arrow
+ - install_pyarrow
+ - create_package_with_all_dependencies
+
+repo:
+ jira_projects: [ARROW]
+ url:
+ source: https://github.com/apache/arrow/blob/master/r/
+ issue: https://issues.apache.org/jira/browse/
diff --git a/src/arrow/r/arrow.Rproj b/src/arrow/r/arrow.Rproj
new file mode 100644
index 000000000..cba1b6b7a
--- /dev/null
+++ b/src/arrow/r/arrow.Rproj
@@ -0,0 +1,21 @@
+Version: 1.0
+
+RestoreWorkspace: No
+SaveWorkspace: No
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
+PackageRoxygenize: rd,collate,namespace
diff --git a/src/arrow/r/cleanup b/src/arrow/r/cleanup
new file mode 100755
index 000000000..7605d50de
--- /dev/null
+++ b/src/arrow/r/cleanup
@@ -0,0 +1,21 @@
+#!/usr/bin/env sh
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+rm -f src/Makevars
+
diff --git a/src/arrow/r/configure b/src/arrow/r/configure
new file mode 100755
index 000000000..cd2314949
--- /dev/null
+++ b/src/arrow/r/configure
@@ -0,0 +1,307 @@
+#!/usr/bin/env sh
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Anticonf (tm) script by Jeroen Ooms, Jim Hester (2017)
+# License: MIT
+#
+# This script will query 'pkg-config' for the required cflags and ldflags.
+# If pkg-config is unavailable or does not find the library, try setting
+# INCLUDE_DIR and LIB_DIR manually via e.g:
+# R CMD INSTALL --configure-vars='INCLUDE_DIR=/.../include LIB_DIR=/.../lib'
+
+# Library settings
+PKG_CONFIG_NAME="arrow"
+PKG_DEB_NAME="(unsupported)"
+PKG_RPM_NAME="(unsupported)"
+PKG_BREW_NAME="apache-arrow"
+PKG_TEST_HEADER="<arrow/api.h>"
+PKG_LIBS="-larrow"
+
+# Make some env vars case-insensitive
+ARROW_R_DEV=`echo $ARROW_R_DEV | tr '[:upper:]' '[:lower:]'`
+FORCE_AUTOBREW=`echo $FORCE_AUTOBREW | tr '[:upper:]' '[:lower:]'`
+FORCE_BUNDLED_BUILD=`echo $FORCE_BUNDLED_BUILD | tr '[:upper:]' '[:lower:]'`
+ARROW_USE_PKG_CONFIG=`echo $ARROW_USE_PKG_CONFIG | tr '[:upper:]' '[:lower:]'`
+LIBARROW_MINIMAL=`echo $LIBARROW_MINIMAL | tr '[:upper:]' '[:lower:]'`
+TEST_OFFLINE_BUILD=`echo $TEST_OFFLINE_BUILD | tr '[:upper:]' '[:lower:]'`
+NOT_CRAN=`echo $NOT_CRAN | tr '[:upper:]' '[:lower:]'`
+
+VERSION=`grep '^Version' DESCRIPTION | sed s/Version:\ //`
+UNAME=`uname -s`
+
+# generate code
+if [ "$ARROW_R_DEV" = "true" ] && [ -f "data-raw/codegen.R" ]; then
+ echo "*** Generating code with data-raw/codegen.R"
+ ${R_HOME}/bin/Rscript data-raw/codegen.R
+fi
+
+if [ -f "tools/apache-arrow.rb" ]; then
+ # If you want to use a local apache-arrow.rb formula, do
+ # $ cp ../dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb tools/apache-arrow.rb
+ # before R CMD build or INSTALL (assuming a local checkout of the apache/arrow repository)
+ cp tools/autobrew .
+ if [ "$FORCE_AUTOBREW" != "false" ]; then
+ # It is possible to turn off forced autobrew if the formula is included,
+ # but most likely you shouldn't because the included formula will reference
+ # the C++ library at the version that matches the R package.
+ FORCE_AUTOBREW="true"
+ fi
+fi
+
+if [ "$FORCE_AUTOBREW" = "true" ] || [ "$FORCE_BUNDLED_BUILD" = "true" ]; then
+ ARROW_USE_PKG_CONFIG="false"
+fi
+
+# Note that cflags may be empty in case of success
+if [ "$ARROW_HOME" ] && [ "$FORCE_BUNDLED_BUILD" != "true" ]; then
+ echo "*** Using ARROW_HOME as the source of libarrow"
+ PKG_CFLAGS="-I$ARROW_HOME/include $PKG_CFLAGS"
+ PKG_DIRS="-L$ARROW_HOME/lib"
+elif [ "$INCLUDE_DIR" ] && [ "$LIB_DIR" ]; then
+ echo "*** Using INCLUDE_DIR/LIB_DIR as the source of libarrow"
+ PKG_CFLAGS="-I$INCLUDE_DIR $PKG_CFLAGS"
+ PKG_DIRS="-L$LIB_DIR"
+else
+ # Use pkg-config to find libarrow if available and allowed
+ pkg-config --version >/dev/null 2>&1
+ if [ $? -eq 0 ] && [ "$ARROW_USE_PKG_CONFIG" != "false" ]; then
+ # Set the search paths and compile flags
+ PKGCONFIG_CFLAGS=`pkg-config --cflags --silence-errors ${PKG_CONFIG_NAME}`
+ PKGCONFIG_LIBS=`pkg-config --libs-only-l --libs-only-other --silence-errors ${PKG_CONFIG_NAME}`
+ PKGCONFIG_DIRS=`pkg-config --libs-only-L --silence-errors ${PKG_CONFIG_NAME}`
+ fi
+
+ if [ "$PKGCONFIG_CFLAGS" ] && [ "$PKGCONFIG_LIBS" ]; then
+    FOUND_LIB_DIR=`echo $PKGCONFIG_DIRS | sed -e 's/^-L//'`
+ echo "*** Arrow C++ libraries found via pkg-config at $FOUND_LIB_DIR"
+ PKG_CFLAGS="$PKGCONFIG_CFLAGS"
+ PKG_LIBS=${PKGCONFIG_LIBS}
+ PKG_DIRS=${PKGCONFIG_DIRS}
+
+ # Check for version mismatch
+ PC_LIB_VERSION=`pkg-config --modversion arrow`
+ echo $PC_LIB_VERSION | grep -e 'SNAPSHOT$' >/dev/null 2>&1
+ # If on a release (i.e. not SNAPSHOT) and version != R package version, warn
+ if [ $? -eq 1 ] && [ "$PC_LIB_VERSION" != "$VERSION" ]; then
+ echo "**** Warning: library version mismatch"
+ echo "**** C++ is $PC_LIB_VERSION but R is $VERSION"
+ echo "**** If installation fails, upgrade the C++ library to match"
+ echo "**** or retry with ARROW_USE_PKG_CONFIG=false"
+ fi
+ else
+ if [ "$UNAME" = "Darwin" ] && [ "$FORCE_BUNDLED_BUILD" != "true" ]; then
+ if [ "$FORCE_AUTOBREW" != "true" ] && [ "`command -v brew`" ] && [ "`brew ls --versions ${PKG_BREW_NAME}`" != "" ]; then
+ echo "*** Using Homebrew ${PKG_BREW_NAME}"
+ BREWDIR=`brew --prefix`
+ PKG_LIBS="$PKG_LIBS -larrow_bundled_dependencies"
+ PKG_DIRS="-L$BREWDIR/opt/$PKG_BREW_NAME/lib $PKG_DIRS"
+ PKG_CFLAGS="-I$BREWDIR/opt/$PKG_BREW_NAME/include"
+ else
+ echo "*** Downloading ${PKG_BREW_NAME}"
+ if [ -f "autobrew" ]; then
+ echo "**** Using local manifest for ${PKG_BREW_NAME}"
+ else
+ curl -sfL "https://autobrew.github.io/scripts/$PKG_BREW_NAME" > autobrew
+ if [ $? -ne 0 ]; then
+ echo "Failed to download manifest for ${PKG_BREW_NAME}"
+ fi
+ fi
+ . autobrew
+ if [ $? -ne 0 ]; then
+ echo "Failed to retrieve binary for ${PKG_BREW_NAME}"
+ fi
+ # autobrew sets `PKG_LIBS`, `PKG_DIRS`, and `PKG_CFLAGS`
+ fi
+ else
+ if [ "${NOT_CRAN}" = "true" ]; then
+ # Set some default values
+ if [ "${LIBARROW_BINARY}" = "" ]; then
+ LIBARROW_BINARY=true; export LIBARROW_BINARY
+ fi
+ if [ "${LIBARROW_MINIMAL}" = "" ]; then
+ LIBARROW_MINIMAL=false; export LIBARROW_MINIMAL
+ fi
+ fi
+
+      # find openssl on macOS. macOS ships with libressl. openssl is installable
+      # with brew, but it is generally not linked. We can override this and find
+      # openssl by setting OPENSSL_ROOT_DIR (which cmake will pick up later in
+      # the installation process). FWIW, arrow's cmake build uses this same
+      # approach to find openssl, but doing it now allows us to catch it in
+      # nixlibs.R and throw a nicer error.
+ if [ "$UNAME" = "Darwin" ] && [ "${OPENSSL_ROOT_DIR}" = "" ]; then
+ brew --prefix openssl >/dev/null 2>&1
+ if [ $? -eq 0 ]; then
+ OPENSSL_ROOT_DIR="`brew --prefix openssl`"; export OPENSSL_ROOT_DIR
+ fi
+ fi
+
+ if [ "${ARROW_DEPENDENCY_SOURCE}" = "" ]; then
+ # TODO: BUNDLED is still default for now, but we plan to change it to AUTO
+ ARROW_DEPENDENCY_SOURCE=BUNDLED; export ARROW_DEPENDENCY_SOURCE
+ fi
+ if [ "${ARROW_DEPENDENCY_SOURCE}" = "AUTO" ]; then
+ pkg-config --version >/dev/null 2>&1
+ if [ $? -ne 0 ]; then
+ export ARROW_DEPENDENCY_SOURCE=BUNDLED
+ echo "**** Warning: ARROW_DEPENDENCY_SOURCE set to 'AUTO' but pkg-config not installed"
+ echo "**** ARROW_DEPENDENCY_SOURCE has been set to 'BUNDLED'"
+ fi
+ fi
+
+ ${R_HOME}/bin/Rscript tools/nixlibs.R $VERSION
+ PKG_CFLAGS="-I`pwd`/libarrow/arrow-${VERSION}/include $PKG_CFLAGS"
+
+ LIB_DIR="libarrow/arrow-${VERSION}/lib"
+ if [ -d "$LIB_DIR" ]; then
+ # Enumerate the static libs, put their -l flags in BUNDLED_LIBS,
+ # and put their -L location in PKG_DIRS
+ #
+ # If tools/nixlibs.R fails to produce libs, this dir won't exist
+ # so don't try (the error message from `ls` would be misleading)
+ # Assume nixlibs.R has handled and messaged about its failure already
+ #
+ # TODO: what about non-bundled deps?
+ BUNDLED_LIBS=`cd $LIB_DIR && ls *.a`
+ BUNDLED_LIBS=`echo "$BUNDLED_LIBS" | sed -e "s/\\.a lib/ -l/g" | sed -e "s/\\.a$//" | sed -e "s/^lib/-l/" | tr '\n' ' ' | sed -e "s/ $//"`
+ PKG_DIRS="-L`pwd`/$LIB_DIR"
+
+ # Use pkg-config to do static linking of libarrow's dependencies
+ if [ "$ARROW_DEPENDENCY_SOURCE" = "AUTO" ] || [ "$ARROW_DEPENDENCY_SOURCE" = "SYSTEM" ]; then
+ PKG_LIBS="$PKG_LIBS `PKG_CONFIG_PATH=${LIB_DIR}/pkgconfig pkg-config --libs-only-l --libs-only-other --static --silence-errors ${PKG_CONFIG_NAME}`"
+ fi
+
+ # When using brew's openssl it is not bundled and it is not on the system
+ # search path and so we must add the lib path to BUNDLED_LIBS if we are
+ # using it. Note the order is important, this must be after the arrow
+ # lib path + the pkg and bundled libs above so this is why we're
+ # appending to BUNDLED_LIBS and not PKG_DIRS
+ if [ "$OPENSSL_ROOT_DIR" != "" ]; then
+ BUNDLED_LIBS="$BUNDLED_LIBS -L$OPENSSL_ROOT_DIR/lib"
+ fi
+ fi
+ fi
+ fi
+fi
+
+# If on Raspberry Pi, need to manually link against latomic
+# See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81358 for a similar example
+if grep raspbian /etc/os-release >/dev/null 2>&1; then
+ PKG_CFLAGS="$PKG_CFLAGS -DARROW_CXXFLAGS=-latomic"
+fi
+
+# If libarrow uses the old GLIBCXX ABI, we have to use it too
+if [ "$ARROW_USE_OLD_CXXABI" ]; then
+ PKG_CFLAGS="$PKG_CFLAGS -D_GLIBCXX_USE_CXX11_ABI=0"
+fi
+
+# Set any user-defined CXXFLAGS
+if [ "$ARROW_R_CXXFLAGS" ]; then
+ PKG_CFLAGS="$PKG_CFLAGS $ARROW_R_CXXFLAGS"
+fi
+
+# Test that we can find libarrow
+CXXCPP="`${R_HOME}/bin/R CMD config CXX11` -E"
+if [ $? -eq 0 ]; then
+ # Great, CXX11 exists for this version of R (current);
+ # now let's set the other two variables
+ CXX11FLAGS=`"${R_HOME}"/bin/R CMD config CXX11FLAGS`
+ CXX11STD=`"${R_HOME}"/bin/R CMD config CXX11STD`
+else
+ # For compatibility with R < 3.4, when these were called CXX1X
+ CXXCPP="`${R_HOME}/bin/R CMD config CXX1X` -E"
+ CXX11FLAGS=`"${R_HOME}"/bin/R CMD config CXX1XFLAGS`
+ CXX11STD=`"${R_HOME}"/bin/R CMD config CXX1XSTD`
+fi
+CPPFLAGS=`"${R_HOME}"/bin/R CMD config CPPFLAGS`
+TEST_CMD="${CXXCPP} ${CPPFLAGS} ${PKG_CFLAGS} ${CXX11FLAGS} ${CXX11STD} -xc++ -"
+echo "#include $PKG_TEST_HEADER" | ${TEST_CMD} >/dev/null 2>&1
+
+if [ $? -eq 0 ]; then
+ PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_ARROW"
+ # Check for features
+ LIB_DIR=`echo $PKG_DIRS | sed -e 's/^-L//'`
+ ARROW_OPTS_CMAKE="$LIB_DIR/cmake/arrow/ArrowOptions.cmake"
+ # Check for Parquet
+ grep 'set(ARROW_PARQUET "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1
+ if [ $? -eq 0 ]; then
+ PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_PARQUET"
+ PKG_LIBS="-lparquet $PKG_LIBS"
+ # NOTE: parquet is assumed to have the same -L flag as arrow
+ # so there is no need to add its location to PKG_DIRS
+ fi
+ # Check for Arrow Dataset subcomponent
+ grep 'set(ARROW_DATASET "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1
+ if [ $? -eq 0 ]; then
+ PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_DATASET"
+ PKG_LIBS="-larrow_dataset $PKG_LIBS"
+ # NOTE: arrow-dataset is assumed to have the same -L flag as arrow
+ # so there is no need to add its location to PKG_DIRS
+ fi
+ # Check for S3
+ grep 'set(ARROW_S3 "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1
+ if [ $? -eq 0 ]; then
+ PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_S3"
+ if [ "$BUNDLED_LIBS" != "" ]; then
+ # We're depending on openssl/curl from the system, so they're not in the bundled deps
+ BUNDLED_LIBS="$BUNDLED_LIBS -lssl -lcrypto -lcurl"
+ fi
+ fi
+ # Check for JSON
+ grep 'set(ARROW_JSON "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1
+ if [ $? -eq 0 ]; then
+ PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_JSON"
+ fi
+ # prepend PKG_DIRS and append BUNDLED_LIBS to PKG_LIBS
+ PKG_LIBS="$PKG_DIRS $PKG_LIBS $BUNDLED_LIBS"
+ echo "PKG_CFLAGS=$PKG_CFLAGS"
+ echo "PKG_LIBS=$PKG_LIBS"
+else
+ echo "------------------------- NOTE ---------------------------"
+ echo "There was an issue preparing the Arrow C++ libraries."
+ echo "See https://arrow.apache.org/docs/r/articles/install.html"
+ echo "---------------------------------------------------------"
+ PKG_LIBS=""
+ PKG_CFLAGS=""
+ if [ "$UNAME" != "SunOS" ] && [ "$TEST_R_WITHOUT_LIBARROW" != "TRUE" ]; then
+ # We should build fine on Solaris, but because we don't have CI for it,
+ # allow the build to proceed with most R package features disabled.
+    # But on every other platform stop here if libarrow was not found.
+ # (also check an env var so that we can test this build configuration)
+ exit 1
+ fi
+fi
+
+# Write to Makevars
+sed -e "s|@cflags@|$PKG_CFLAGS|" -e "s|@libs@|$PKG_LIBS|" src/Makevars.in > src/Makevars
+
+# This block is disabled because a (possibly spurious) CRAN check fails when arrow.so is stripped
+# # Add stripping
+# if [ "$R_STRIP_SHARED_LIB" != "" ]; then
+# # R_STRIP_SHARED_LIB is set in the global Renviron and should be available here
+# echo "
+# strip: \$(SHLIB)
+# $R_STRIP_SHARED_LIB \$(SHLIB) >/dev/null 2>&1 || true
+#
+# .phony: strip
+# " >> src/Makevars
+# fi
+
+# Success
+exit 0
diff --git a/src/arrow/r/configure.win b/src/arrow/r/configure.win
new file mode 100644
index 000000000..6d731bb09
--- /dev/null
+++ b/src/arrow/r/configure.win
@@ -0,0 +1,73 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# generate code
+if [ "$ARROW_R_DEV" == "TRUE" ]; then
+ echo "*** Generating code with data-raw/codegen.R"
+ "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" data-raw/codegen.R
+fi
+
+VERSION=$(grep ^Version DESCRIPTION | sed s/Version:\ //)
+# Try to find/download a C++ Arrow binary,
+# including possibly a local .zip file if RWINLIB_LOCAL is set
+"${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" "tools/winlibs.R" $VERSION $RWINLIB_LOCAL
+# If binary not found, script exits nonzero
+if [ $? -ne 0 ]; then
+ echo "Arrow C++ library was not found"
+fi
+
+# Set the right flags to point to and enable arrow/parquet
+if [ -d "windows/arrow-$VERSION" ]; then
+ RWINLIB="../windows/arrow-$VERSION"
+else
+ # It's possible that the libarrow binary version is not identical to the
+ # R package version, e.g. if the R package is a patch release, so find
+ # what the directory is actually called. If more than one version is
+ # present, use the highest:
+ RWINLIB="../windows/$(ls windows/ | grep ^arrow- | tail -n 1)"
+fi
+OPENSSL_LIBS="-lcrypto -lcrypt32"
+MIMALLOC_LIBS="-lbcrypt -lpsapi"
+AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-management -laws-cpp-sdk-cognito-identity -laws-cpp-sdk-sts -laws-cpp-sdk-s3 -laws-cpp-sdk-core -laws-c-event-stream -laws-checksums -laws-c-common -lUserenv -lversion -lws2_32 -lBcrypt -lWininet -lwinhttp"
+
+# NOTE: If you make changes to the libraries below, you should also change
+# ci/scripts/r_windows_build.sh and ci/scripts/PKGBUILD
+PKG_CFLAGS="-I${RWINLIB}/include -DARROW_STATIC -DPARQUET_STATIC -DARROW_DS_STATIC -DARROW_R_WITH_ARROW -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET -DARROW_R_WITH_JSON"
+PKG_LIBS="-L${RWINLIB}/lib"'$(subst gcc,,$(COMPILED_BY))$(R_ARCH) '"-L${RWINLIB}/lib"'$(R_ARCH)$(CRT) '"-lparquet -larrow_dataset -larrow -larrow_bundled_dependencies -lutf8proc -lthrift -lsnappy -lz -lzstd -llz4 -lole32 ${MIMALLOC_LIBS} ${OPENSSL_LIBS}"
+
+# S3 and re2 support only for Rtools40 (i.e. R >= 4.0)
+"${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e 'R.version$major >= 4' | grep TRUE >/dev/null 2>&1
+if [ $? -eq 0 ]; then
+ PKG_CFLAGS="${PKG_CFLAGS} -DARROW_R_WITH_S3"
+ PKG_LIBS="${PKG_LIBS} -lre2 ${AWS_LIBS}"
+else
+ # Link order seems to matter here
+ PKG_LIBS="${PKG_LIBS} -lws2_32"
+fi
+
+# Set any user-defined CXXFLAGS
+if [ "$ARROW_R_CXXFLAGS" ]; then
+ PKG_CFLAGS="$PKG_CFLAGS $ARROW_R_CXXFLAGS"
+fi
+
+echo "*** Writing Makevars.win"
+sed -e "s|@cflags@|$PKG_CFLAGS|" -e "s|@libs@|$PKG_LIBS|" src/Makevars.in > src/Makevars.win
+# Success
+exit 0
diff --git a/src/arrow/r/cran-comments.md b/src/arrow/r/cran-comments.md
new file mode 100644
index 000000000..8fb63b84d
--- /dev/null
+++ b/src/arrow/r/cran-comments.md
@@ -0,0 +1,10 @@
+## Test environments
+* Debian Linux, GCC, R-devel/R-patched/R-release
+* Fedora Linux, GCC/clang, R-devel
+* Ubuntu Linux 16.04 LTS, R-release, GCC
+* win-builder (R-devel and R-release)
+* macOS 10.14, R-oldrel
+
+## R CMD check results
+
+There were no ERRORs or WARNINGs. On some platforms, there is a NOTE about the installed package size.
diff --git a/src/arrow/r/data-raw/codegen.R b/src/arrow/r/data-raw/codegen.R
new file mode 100644
index 000000000..46b02fd64
--- /dev/null
+++ b/src/arrow/r/data-raw/codegen.R
@@ -0,0 +1,258 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file is used to generate code in the files
+# src/arrowExports.cpp and R/arrowExports.R
+#
+# This is similar to what compileAttributes() would do,
+# with some arrow-specific changes.
+#
+# Functions are decorated with [[arrow::export]]
+# and the generated code adds a layer of protection so that
+# the arrow package can be installed even when libarrow is not available
+#
+# All the C++ code should be guarded by
+#
+# #if defined(ARROW_R_WITH_ARROW)
+# // [[arrow::export]]
+# std::shared_ptr<arrow::Array> some_function_using_arrow_api(){
+# ...
+# }
+# #endif
+
+
+# Different decoration tags can be used to export different features.
+# [[feature::export]]
+# maps to
+# #if defined(ARROW_R_WITH_FEATURE)
+# and each feature is written to its own set of export files.
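+#
+# For example, a function decorated with [[dataset::export]] is guarded by
+# #if defined(ARROW_R_WITH_DATASET) in the generated C++.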
+
+# Ensure that all machines are sorting the same way
+invisible(Sys.setlocale("LC_COLLATE", "C"))
+
+features <- c("arrow", "dataset", "parquet", "s3", "json")
+
+suppressPackageStartupMessages({
+ library(decor)
+ library(dplyr)
+ library(purrr)
+ library(glue)
+ library(vctrs)
+})
+
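+# Scan the C++ sources for [[<tag>::export]] decorations and return one row
+# per exported function, with its parsed signature appended as columns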
+get_exported_functions <- function(decorations, export_tag) {
+ out <- decorations %>%
+ filter(decoration %in% paste0(export_tag, "::export")) %>%
+ mutate(functions = map(context, decor:::parse_cpp_function)) %>%
+ { vec_cbind(., vec_rbind(!!!pull(., functions))) } %>%
+ select(-functions) %>%
+ mutate(decoration = sub("::export", "", decoration))
+ message(glue("*** > {n} functions decorated with [[{tags}::export]]", n = nrow(out), tags = paste0(export_tag, collapse = "|")))
+ out
+}
+
+glue_collapse_data <- function(data, ..., sep = ", ", last = "") {
+ res <- glue_collapse(glue_data(data, ...), sep = sep, last = last)
+ if (length(res) == 0) res <- ""
+ res
+}
+
+wrap_call <- function(name, return_type, args) {
+ call <- glue::glue('{name}({list_params})', list_params = glue_collapse_data(args, "{name}"))
+ if (return_type == "void") {
+ glue::glue("\t{call};\n\treturn R_NilValue;", .trim = FALSE)
+ } else {
+ glue::glue("\treturn cpp11::as_sexp({call});")
+ }
+}
+
+feature_available <- function(feat) {
+ glue::glue(
+'extern "C" SEXP _{feat}_available() {{
+return Rf_ScalarLogical(
+#if defined(ARROW_R_WITH_{toupper(feat)})
+ TRUE
+#else
+ FALSE
+#endif
+);
+}}
+')
+}
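+# For example, feature_available("s3") emits a C function _s3_available()
+# that R can .Call() to check whether this build was compiled with S3 support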
+
+write_if_modified <- function(code, file) {
+ old <- try(readLines(file), silent=TRUE)
+ new <- unclass(unlist(strsplit(code, "\n")))
+ # We don't care about changes in empty lines
+ if (!identical(old[nzchar(old)], new[nzchar(new)])) {
+ writeLines(con = file, code)
+ # To debug why they're different if you think they shouldn't be:
+ # print(waldo::compare(old[nzchar(old)], new[nzchar(new)]))
+ message(glue::glue("*** > generated file `{file}`"))
+ } else {
+ message(glue::glue("*** > `{file}` not modified"))
+ }
+}
+
+all_decorations <- cpp_decorations()
+arrow_exports <- get_exported_functions(all_decorations, features)
+
+arrow_classes <- c(
+ "Table" = "arrow::Table",
+ "RecordBatch" = "arrow::RecordBatch"
+)
+
+# This takes a cpp11 C wrapper and conditionally makes it available based on
+# a feature decoration
+ifdef_wrap <- function(cpp11_wrapped, name, sexp_signature, decoration) {
+ # if (identical(decoration, "arrow")) {
+ # # Arrow is now required
+ # return(cpp11_wrapped)
+ # }
+ glue('
+ #if defined(ARROW_R_WITH_{toupper(decoration)})
+ {cpp11_wrapped}
+ #else
+ extern "C" SEXP {sexp_signature}{{
+ \tRf_error("Cannot call {name}(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+ }}
+ #endif
+ \n')
+}
+
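+# For each exported function, emit its C++ declaration plus an extern "C"
+# wrapper that converts the SEXP arguments via cpp11, guarded by the feature
+# #ifdef from ifdef_wrap()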
+cpp_functions_definitions <- arrow_exports %>%
+ select(name, return_type, args, file, line, decoration) %>%
+ pmap_chr(function(name, return_type, args, file, line, decoration) {
+ sexp_params <- glue_collapse_data(args, "SEXP {name}_sexp")
+ sexp_signature <- glue('_arrow_{name}({sexp_params})')
+ cpp11_wrapped <- glue('
+ {return_type} {name}({real_params});
+ extern "C" SEXP {sexp_signature}{{
+ BEGIN_CPP11
+ {input_params}{return_line}{wrap_call(name, return_type, args)}
+ END_CPP11
+ }}',
+ sep = "\n",
+ real_params = glue_collapse_data(args, "{type} {name}"),
+ input_params = glue_collapse_data(args, "\tarrow::r::Input<{type}>::type {name}({name}_sexp);", sep = "\n"),
+ return_line = if (nrow(args)) "\n" else "")
+
+ glue::glue('
+ // {basename(file)}
+ {ifdef_wrap(cpp11_wrapped, name, sexp_signature, decoration)}
+ ',
+ sep = "\n",
+ )
+ }) %>%
+ glue_collapse(sep = "\n")
+
+cpp_functions_registration <- arrow_exports %>%
+ select(name, return_type, args) %>%
+ pmap_chr(function(name, return_type, args) {
+ glue('\t\t{{ "_arrow_{name}", (DL_FUNC) &_arrow_{name}, {nrow(args)}}}, ')
+ }) %>%
+ glue_collapse(sep = "\n")
+
+cpp_classes_finalizers <- map2(names(arrow_classes), arrow_classes, function(name, class) {
+ sexp_signature <- glue('_arrow_{name}__Reset(SEXP r6)')
+ cpp11_wrapped <- glue('
+ extern "C" SEXP {sexp_signature} {{
+ BEGIN_CPP11
+ arrow::r::r6_reset_pointer<{class}>(r6);
+ END_CPP11
+ return R_NilValue;
+ }}')
+ ifdef_wrap(cpp11_wrapped, name, sexp_signature, "arrow")
+}) %>%
+ glue_collapse(sep = "\n")
+
+classes_finalizers_registration <- glue('\t\t{{ "_arrow_{names(arrow_classes)}__Reset", (DL_FUNC) &_arrow_{names(arrow_classes)}__Reset, 1}}, ') %>%
+ glue_collapse(sep = "\n")
+
+cpp_file_header <- '// Generated by using data-raw/codegen.R -> do not edit by hand
+#include <cpp11.hpp>
+#include <cpp11/declarations.hpp>
+
+#include "./arrow_types.h"
+'
+
+arrow_exports_cpp <- paste0(
+glue::glue('
+{cpp_file_header}
+{cpp_functions_definitions}
+{cpp_classes_finalizers}
+\n'),
+glue::glue_collapse(glue::glue('
+{feature_available({features})}
+'), sep = '\n'),
+'
+static const R_CallMethodDef CallEntries[] = {
+',
+glue::glue_collapse(glue::glue('
+\t\t{{ "_{features}_available", (DL_FUNC)& _{features}_available, 0 }},
+'), sep = '\n'),
+glue::glue('\n
+{cpp_functions_registration}
+{classes_finalizers_registration}
+\t\t{{NULL, NULL, 0}}
+}};
+\n'),
+'extern "C" void R_init_arrow(DllInfo* dll){
+ R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
+ R_useDynamicSymbols(dll, FALSE);
+
+ #if defined(ARROW_R_WITH_ARROW) && defined(HAS_ALTREP)
+ arrow::r::altrep::Init_Altrep_classes(dll);
+ #endif
+
+}
+\n')
+
+write_if_modified(arrow_exports_cpp, "src/arrowExports.cpp")
+
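+# Generate the matching R-level wrappers: each exported function becomes an
+# R function that forwards its arguments to .Call()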
+r_functions <- arrow_exports %>%
+ select(name, return_type, args) %>%
+ pmap_chr(function(name, return_type, args) {
+ params <- if (nrow(args)) {
+ paste0(", ", glue_collapse_data(args, "{name}"))
+ } else {
+ ""
+ }
+ call <- glue::glue('.Call(`_arrow_{name}`{params})')
+ if (return_type == "void") {
+ call <- glue::glue('invisible({call})')
+ }
+
+ glue::glue('
+ {name} <- function({list_params}) {{
+ {call}
+ }}
+
+ ',
+ list_params = glue_collapse_data(args, "{name}"),
+ sep = "\n",
+ )
+ }) %>%
+ glue_collapse(sep = "\n")
+
+arrow_exports_r <- glue::glue('
+# Generated by using data-raw/codegen.R -> do not edit by hand
+
+{r_functions}
+')
+
+write_if_modified(arrow_exports_r, "R/arrowExports.R")
diff --git a/src/arrow/r/extra-tests/helpers.R b/src/arrow/r/extra-tests/helpers.R
new file mode 100644
index 000000000..3fb450ee3
--- /dev/null
+++ b/src/arrow/r/extra-tests/helpers.R
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+if_version <- function(version, op = `==`) {
+ op(packageVersion("arrow"), version)
+}
+
+if_version_less_than <- function(version) {
+ if_version(version, op = `<`)
+}
+
+skip_if_version_less_than <- function(version, msg) {
+ if (if_version(version, `<`)) {
+ skip(msg)
+ }
+}
+
+skip_if_version_equals <- function(version, msg) {
+ if (if_version(version, `==`)) {
+ skip(msg)
+ }
+}
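+
+# Example usage (as in extra-tests/test-read-files.R):
+#   skip_if_version_less_than("2.0.0", "Version 1.0.1 can't read new version metadata.")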
diff --git a/src/arrow/r/extra-tests/test-read-files.R b/src/arrow/r/extra-tests/test-read-files.R
new file mode 100644
index 000000000..a2453e251
--- /dev/null
+++ b/src/arrow/r/extra-tests/test-read-files.R
@@ -0,0 +1,199 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+library(arrow)
+library(testthat)
+
+pq_file <- "files/ex_data.parquet"
+
+test_that("Can read the file (parquet)", {
+ # We can read with no error; we assert on the metadata below
+ expect_error(
+ df <- read_parquet(pq_file),
+ NA
+ )
+})
+
+### Parquet
+test_that("Can see the metadata (parquet)", {
+ skip_if_version_less_than("2.0.0", "Version 1.0.1 can't read new version metadata.")
+
+ df <- read_parquet(pq_file)
+ expect_s3_class(df, "tbl")
+
+ # expect_mapequal() instead of expect_equal() because the position of
+ # `class` in the attribute list changed in version 3.0.0.
+ expect_mapequal(
+ attributes(df),
+ list(
+ names = letters[1:4],
+ row.names = 1L,
+ top_level = list(
+ field_one = 12,
+ field_two = "more stuff"
+ ),
+ class = c("tbl_df", "tbl", "data.frame")
+ )
+ )
+
+ # column-level attributes
+ expect_equal(attributes(df$a), list(class = "special_string"))
+ expect_equal(
+ attributes(df$c),
+ list(
+ row.names = 1L,
+ names = c("c1", "c2", "c3"),
+ class = c("tbl_df", "tbl", "data.frame")
+ )
+ )
+})
+
+### Feather
+for (comp in c("lz4", "uncompressed", "zstd")) {
+ feather_file <- paste0("files/ex_data_", comp, ".feather")
+
+ test_that(paste0("Can read the file (feather ", comp, ")"), {
+ # We can read with no error; we assert on the metadata below
+ expect_error(
+ df <- read_feather(feather_file),
+ NA
+ )
+ })
+
+ test_that(paste0("Can see the metadata (feather ", comp, ")"), {
+ skip_if_version_less_than("2.0.0", "Version 1.0.1 can't read new version metadata.")
+
+ df <- read_feather(feather_file)
+ expect_s3_class(df, "tbl")
+
+ expect_mapequal(
+ attributes(df),
+ list(
+ names = letters[1:4],
+ row.names = 1L,
+ top_level = list(
+ field_one = 12,
+ field_two = "more stuff"
+ ),
+ class = c("tbl_df", "tbl", "data.frame")
+ )
+ )
+
+ # column-level attributes
+ expect_equal(attributes(df$a), list(class = "special_string"))
+ expect_equal(
+ attributes(df$c),
+ list(
+ row.names = 1L,
+ names = c("c1", "c2", "c3"),
+ class = c("tbl_df", "tbl", "data.frame")
+ )
+ )
+ })
+}
+
+test_that("Can read feather version 1", {
+ feather_v1_file <- "files/ex_data_v1.feather"
+
+ df <- read_feather(feather_v1_file)
+ expect_s3_class(df, "tbl")
+
+ expect_equal(
+ attributes(df),
+ list(
+ names = c("a", "b", "d"),
+ class = c("tbl_df", "tbl", "data.frame"),
+ row.names = 1L
+ )
+ )
+})
+
+### IPC Stream
+stream_file <- "files/ex_data.stream"
+
+test_that("Can read the file (parquet)", {
+ # We can read with no error, we assert metadata below
+ expect_error(
+ df <- read_ipc_stream(stream_file),
+ NA
+ )
+})
+
+test_that("Can see the metadata (stream)", {
+ skip_if_version_less_than("2.0.0", "Version 1.0.1 can't read new version metadata.")
+ df <- read_ipc_stream(stream_file)
+
+ expect_s3_class(df, "tbl")
+
+ expect_mapequal(
+ attributes(df),
+ list(
+ names = letters[1:4],
+ row.names = 1L,
+ top_level = list(
+ field_one = 12,
+ field_two = "more stuff"
+ ),
+ class = c("tbl_df", "tbl", "data.frame")
+ )
+ )
+
+ # column-level attributes
+ expect_equal(attributes(df$a), list(class = "special_string"))
+ expect_equal(
+ attributes(df$c),
+ list(
+ row.names = 1L,
+ names = c("c1", "c2", "c3"),
+ class = c("tbl_df", "tbl", "data.frame")
+ )
+ )
+})
+
+test_that("Can see the extra metadata (parquet)", {
+ pq_file <- "files/ex_data_extra_metadata.parquet"
+
+ if (if_version_less_than("3.0.0")) {
+ expect_warning(
+ df <- read_parquet(pq_file),
+ "Invalid metadata$r",
+ fixed = TRUE
+ )
+ expect_s3_class(df, "tbl")
+ } else {
+ # version 3.0.0 and greater
+ df <- read_parquet(pq_file)
+ expect_s3_class(df, "tbl")
+
+ expect_equal(
+ attributes(df),
+ list(
+ names = letters[1:4],
+ row.names = 1L,
+ class = c("tbl_df", "tbl", "data.frame"),
+ top_level = list(
+ field_one = 12,
+ field_two = "more stuff"
+ )
+ )
+ )
+
+ # column-level attributes for the large column.
+ expect_named(attributes(df$b), "lots")
+ expect_length(attributes(df$b)$lots, 100)
+ }
+})
diff --git a/src/arrow/r/extra-tests/write-files.R b/src/arrow/r/extra-tests/write-files.R
new file mode 100644
index 000000000..4495507f3
--- /dev/null
+++ b/src/arrow/r/extra-tests/write-files.R
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+library(arrow)
+
+if (!dir.exists("extra-tests/files")) {
+ dir.create("extra-tests/files")
+}
+
+source("tests/testthat/helper-data.R")
+
+write_parquet(example_with_metadata, "extra-tests/files/ex_data.parquet")
+
+for (comp in c("lz4", "uncompressed", "zstd")) {
+ # Skip codecs that aren't available in this build (don't stop the loop:
+ # "uncompressed" and later codecs may still work)
+ if (!codec_is_available(comp)) next
+
+ name <- paste0("extra-tests/files/ex_data_", comp, ".feather")
+ write_feather(example_with_metadata, name, compression = comp)
+}
+
+example_with_metadata_v1 <- example_with_metadata
+example_with_metadata_v1$c <- NULL
+write_feather(example_with_metadata_v1, "extra-tests/files/ex_data_v1.feather", version = 1)
+
+write_ipc_stream(example_with_metadata, "extra-tests/files/ex_data.stream")
+
+write_parquet(example_with_extra_metadata, "extra-tests/files/ex_data_extra_metadata.parquet")
diff --git a/src/arrow/r/inst/NOTICE.txt b/src/arrow/r/inst/NOTICE.txt
new file mode 100644
index 000000000..a60979137
--- /dev/null
+++ b/src/arrow/r/inst/NOTICE.txt
@@ -0,0 +1,84 @@
+Apache Arrow
+Copyright 2016-2019 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).
+
+This product includes software from the SFrame project (BSD, 3-clause).
+* Copyright (C) 2015 Dato, Inc.
+* Copyright (c) 2009 Carnegie Mellon University.
+
+This product includes software from the Feather project (Apache 2.0)
+https://github.com/wesm/feather
+
+This product includes software from the DyND project (BSD 2-clause)
+https://github.com/libdynd
+
+This product includes software from the LLVM project
+ * distributed under the University of Illinois Open Source License
+
+This product includes software from the google-lint project
+ * Copyright (c) 2009 Google Inc. All rights reserved.
+
+This product includes software from the mman-win32 project
+ * Copyright https://code.google.com/p/mman-win32/
+ * Licensed under the MIT License;
+
+This product includes software from the LevelDB project
+ * Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file
+ * Moved from Kudu http://github.com/cloudera/kudu
+
+This product includes software from the CMake project
+ * Copyright 2001-2009 Kitware, Inc.
+ * Copyright 2012-2014 Continuum Analytics, Inc.
+ * All rights reserved.
+
+This product includes software from https://github.com/matthew-brett/multibuild (BSD 2-clause)
+ * Copyright (c) 2013-2016, Matt Terry and Matthew Brett; all rights reserved.
+
+This product includes software from the Ibis project (Apache 2.0)
+ * Copyright (c) 2015 Cloudera, Inc.
+ * https://github.com/cloudera/ibis
+
+This product includes software from Dremio (Apache 2.0)
+ * Copyright (C) 2017-2018 Dremio Corporation
+ * https://github.com/dremio/dremio-oss
+
+This product includes software from Google Guava (Apache 2.0)
+ * Copyright (C) 2007 The Guava Authors
+ * https://github.com/google/guava
+
+This product includes software from CMake (BSD 3-Clause)
+ * CMake - Cross Platform Makefile Generator
+ * Copyright 2000-2019 Kitware, Inc. and Contributors
+
+The web site includes files generated by Jekyll.
+
+--------------------------------------------------------------------------------
+
+This product includes code from Apache Kudu, which includes the following in
+its NOTICE file:
+
+ Apache Kudu
+ Copyright 2016 The Apache Software Foundation
+
+ This product includes software developed at
+ The Apache Software Foundation (http://www.apache.org/).
+
+ Portions of this software were developed at
+ Cloudera, Inc (http://www.cloudera.com/).
+
+--------------------------------------------------------------------------------
+
+This product includes code from Apache ORC, which includes the following in
+its NOTICE file:
+
+ Apache ORC
+ Copyright 2013-2019 The Apache Software Foundation
+
+ This product includes software developed by The Apache Software
+ Foundation (http://www.apache.org/).
+
+ This product includes software developed by Hewlett-Packard:
+ (c) Copyright [2014-2015] Hewlett-Packard Development Company, L.P
diff --git a/src/arrow/r/inst/build_arrow_static.sh b/src/arrow/r/inst/build_arrow_static.sh
new file mode 100755
index 000000000..c424646e3
--- /dev/null
+++ b/src/arrow/r/inst/build_arrow_static.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Quit on failure
+set -e
+
+# Print commands for debugging
+set -x
+
+# By default, this script assumes it's in the top-level dir of the apache/arrow
+# git repository. Set any of the following env vars to customize where to read
+# and write from
+: ${ARROW_HOME:="$(pwd)"} # Only used in default SOURCE/BUILD dirs
+: ${SOURCE_DIR:="${ARROW_HOME}/cpp"} # Where the C++ source is
+: ${BUILD_DIR:="${ARROW_HOME}/r/libarrow/dist"} # Where cmake should build
+: ${DEST_DIR:="$BUILD_DIR"} # Where the resulting /lib and /include should be
+: ${CMAKE:="$(which cmake)"}
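+# (The ": ${VAR:=default}" idiom assigns the default only when VAR is unset
+# or empty, so values exported by the caller always win)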
+
+# Make sure SOURCE and DEST dirs are absolute and exist
+SOURCE_DIR="$(cd "${SOURCE_DIR}" && pwd)"
+DEST_DIR="$(mkdir -p "${DEST_DIR}" && cd "${DEST_DIR}" && pwd)"
+
+# Make some env vars case-insensitive
+if [ "$LIBARROW_MINIMAL" != "" ]; then
+ LIBARROW_MINIMAL=`echo $LIBARROW_MINIMAL | tr '[:upper:]' '[:lower:]'`
+fi
+
+if [ "$LIBARROW_MINIMAL" = "false" ]; then
+ ARROW_DEFAULT_PARAM="ON"
+else
+ ARROW_DEFAULT_PARAM="OFF"
+fi
+
+mkdir -p "${BUILD_DIR}"
+pushd "${BUILD_DIR}"
+${CMAKE} -DARROW_BOOST_USE_SHARED=OFF \
+ -DARROW_BUILD_TESTS=OFF \
+ -DARROW_BUILD_SHARED=OFF \
+ -DARROW_BUILD_STATIC=ON \
+ -DARROW_COMPUTE=ON \
+ -DARROW_CSV=ON \
+ -DARROW_DATASET=${ARROW_DATASET:-ON} \
+ -DARROW_DEPENDENCY_SOURCE=${ARROW_DEPENDENCY_SOURCE:-BUNDLED} \
+ -DAWSSDK_SOURCE=${AWSSDK_SOURCE:-} \
+ -DARROW_FILESYSTEM=ON \
+ -DARROW_JEMALLOC=${ARROW_JEMALLOC:-$ARROW_DEFAULT_PARAM} \
+ -DARROW_MIMALLOC=${ARROW_MIMALLOC:-ON} \
+ -DARROW_JSON=${ARROW_JSON:-ON} \
+ -DARROW_PARQUET=${ARROW_PARQUET:-ON} \
+ -DARROW_S3=${ARROW_S3:-$ARROW_DEFAULT_PARAM} \
+ -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI:-$ARROW_DEFAULT_PARAM} \
+ -DARROW_WITH_BZ2=${ARROW_WITH_BZ2:-$ARROW_DEFAULT_PARAM} \
+ -DARROW_WITH_LZ4=${ARROW_WITH_LZ4:-$ARROW_DEFAULT_PARAM} \
+ -DARROW_WITH_RE2=${ARROW_WITH_RE2:-ON} \
+ -DARROW_WITH_SNAPPY=${ARROW_WITH_SNAPPY:-$ARROW_DEFAULT_PARAM} \
+ -DARROW_WITH_UTF8PROC=${ARROW_WITH_UTF8PROC:-ON} \
+ -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB:-$ARROW_DEFAULT_PARAM} \
+ -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD:-$ARROW_DEFAULT_PARAM} \
+ -DARROW_VERBOSE_THIRDPARTY_BUILD=${ARROW_VERBOSE_THIRDPARTY_BUILD:-OFF} \
+ -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
+ -DCMAKE_INSTALL_LIBDIR=lib \
+ -DCMAKE_INSTALL_PREFIX=${DEST_DIR} \
+ -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON \
+ -DCMAKE_FIND_PACKAGE_NO_PACKAGE_REGISTRY=ON \
+ -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD:-ON} \
+ ${EXTRA_CMAKE_FLAGS} \
+ -G ${CMAKE_GENERATOR:-"Unix Makefiles"} \
+ ${SOURCE_DIR}
+${CMAKE} --build . --target install
+popd
diff --git a/src/arrow/r/inst/demo_flight_server.py b/src/arrow/r/inst/demo_flight_server.py
new file mode 100644
index 000000000..0c81aa912
--- /dev/null
+++ b/src/arrow/r/inst/demo_flight_server.py
@@ -0,0 +1,120 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+ An example Flight Python server.
+ See https://github.com/apache/arrow/blob/master/python/examples/flight/server.py
+"""
+
+import ast
+import threading
+import time
+
+import pyarrow
+import pyarrow.flight
+
+
+class DemoFlightServer(pyarrow.flight.FlightServerBase):
+ def __init__(self, host="localhost", port=5005):
+ if isinstance(port, float):
+ # Because R is looser with integer vs. float
+ port = int(port)
+ location = "grpc+tcp://{}:{}".format(host, port)
+ super(DemoFlightServer, self).__init__(location)
+ self.flights = {}
+ self.host = host
+
+ @classmethod
+ def descriptor_to_key(cls, descriptor):
+ return (descriptor.descriptor_type.value, descriptor.command,
+ tuple(descriptor.path or tuple()))
+
+ def _make_flight_info(self, key, descriptor, table):
+ location = pyarrow.flight.Location.for_grpc_tcp(self.host, self.port)
+ endpoints = [pyarrow.flight.FlightEndpoint(repr(key), [location]), ]
+
+ mock_sink = pyarrow.MockOutputStream()
+ stream_writer = pyarrow.RecordBatchStreamWriter(
+ mock_sink, table.schema)
+ stream_writer.write_table(table)
+ stream_writer.close()
+ data_size = mock_sink.size()
+
+ return pyarrow.flight.FlightInfo(table.schema,
+ descriptor, endpoints,
+ table.num_rows, data_size)
+
+ def list_flights(self, context, criteria):
+ print("list_flights")
+ for key, table in self.flights.items():
+ if key[1] is not None:
+ descriptor = \
+ pyarrow.flight.FlightDescriptor.for_command(key[1])
+ else:
+ descriptor = pyarrow.flight.FlightDescriptor.for_path(*key[2])
+
+ yield self._make_flight_info(key, descriptor, table)
+
+ def get_flight_info(self, context, descriptor):
+ print("get_flight_info")
+ key = DemoFlightServer.descriptor_to_key(descriptor)
+ if key in self.flights:
+ table = self.flights[key]
+ return self._make_flight_info(key, descriptor, table)
+ raise KeyError('Flight not found.')
+
+ def do_put(self, context, descriptor, reader, writer):
+ print("do_put")
+ key = DemoFlightServer.descriptor_to_key(descriptor)
+ print(key)
+ self.flights[key] = reader.read_all()
+ print(self.flights[key])
+
+ def do_get(self, context, ticket):
+ print("do_get")
+ key = ast.literal_eval(ticket.ticket.decode())
+ if key not in self.flights:
+ return None
+ return pyarrow.flight.RecordBatchStream(self.flights[key])
+
+ def list_actions(self, context):
+ print("list_actions")
+ return [
+ ("clear", "Clear the stored flights."),
+ ("shutdown", "Shut down this server."),
+ ]
+
+ def do_action(self, context, action):
+ print("do_action")
+ if action.type == "clear":
+ raise NotImplementedError(
+ "{} is not implemented.".format(action.type))
+ elif action.type == "healthcheck":
+ pass
+ elif action.type == "shutdown":
+ yield pyarrow.flight.Result(pyarrow.py_buffer(b'Shutdown!'))
+ # Shut down on background thread to avoid blocking current
+ # request
+ threading.Thread(target=self._shutdown).start()
+ else:
+ raise KeyError("Unknown action {!r}".format(action.type))
+
+ def _shutdown(self):
+ """Shut down after a delay."""
+ print("Server is shutting down...")
+ time.sleep(2)
+ self.shutdown()
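+
+# A minimal sketch of running this server standalone; serve() blocks until
+# shutdown() is called:
+#
+#   if __name__ == "__main__":
+#       DemoFlightServer().serve()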
diff --git a/src/arrow/r/inst/include/cpp11.hpp b/src/arrow/r/inst/include/cpp11.hpp
new file mode 100644
index 000000000..737fbb80b
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11.hpp
@@ -0,0 +1,26 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include "cpp11/R.hpp"
+#include "cpp11/altrep.hpp"
+#include "cpp11/as.hpp"
+#include "cpp11/attribute_proxy.hpp"
+#include "cpp11/data_frame.hpp"
+#include "cpp11/doubles.hpp"
+#include "cpp11/environment.hpp"
+#include "cpp11/external_pointer.hpp"
+#include "cpp11/function.hpp"
+#include "cpp11/integers.hpp"
+#include "cpp11/list.hpp"
+#include "cpp11/list_of.hpp"
+#include "cpp11/logicals.hpp"
+#include "cpp11/matrix.hpp"
+#include "cpp11/named_arg.hpp"
+#include "cpp11/protect.hpp"
+#include "cpp11/r_bool.hpp"
+#include "cpp11/r_string.hpp"
+#include "cpp11/r_vector.hpp"
+#include "cpp11/raws.hpp"
+#include "cpp11/sexp.hpp"
+#include "cpp11/strings.hpp"
diff --git a/src/arrow/r/inst/include/cpp11/R.hpp b/src/arrow/r/inst/include/cpp11/R.hpp
new file mode 100644
index 000000000..f32dcd0b8
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/R.hpp
@@ -0,0 +1,46 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#ifdef R_INTERNALS_H_
+#if !(defined(R_NO_REMAP) && defined(STRICT_R_HEADERS))
+#error R headers were included before cpp11 headers \
+ and at least one of R_NO_REMAP or STRICT_R_HEADERS \
+ was not defined.
+#endif
+#endif
+
+#define R_NO_REMAP
+#define STRICT_R_HEADERS
+#include "Rinternals.h"
+
+// clang-format off
+#ifdef __clang__
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wattributes"
+#endif
+
+#ifdef __GNUC__
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wattributes"
+#endif
+// clang-format on
+
+#include "cpp11/altrep.hpp"
+
+namespace cpp11 {
+namespace literals {
+
+constexpr R_xlen_t operator"" _xl(unsigned long long int value) { return value; }
+
+} // namespace literals
+
+template <typename T>
+inline T na();
+
+template <typename T>
+inline bool is_na(const T& value) {
+ return value == na<T>();
+}
+
+} // namespace cpp11
diff --git a/src/arrow/r/inst/include/cpp11/altrep.hpp b/src/arrow/r/inst/include/cpp11/altrep.hpp
new file mode 100644
index 000000000..3d6e1172e
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/altrep.hpp
@@ -0,0 +1,44 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include "Rversion.h"
+
+#if defined(R_VERSION) && R_VERSION >= R_Version(3, 5, 0)
+#define HAS_ALTREP
+#endif
+
+#ifndef HAS_ALTREP
+
+#define ALTREP(x) false
+
+#define REAL_ELT(x, i) REAL(x)[i]
+#define INTEGER_ELT(x, i) INTEGER(x)[i]
+#define LOGICAL_ELT(x, i) LOGICAL(x)[i]
+#define RAW_ELT(x, i) RAW(x)[i]
+
+#define SET_REAL_ELT(x, i, val) REAL(x)[i] = val
+#define SET_INTEGER_ELT(x, i, val) INTEGER(x)[i] = val
+#define SET_LOGICAL_ELT(x, i, val) LOGICAL(x)[i] = val
+#define SET_RAW_ELT(x, i, val) RAW(x)[i] = val
+
+#define REAL_GET_REGION(...) \
+ do { \
+ } while (false)
+
+#define INTEGER_GET_REGION(...) \
+ do { \
+ } while (false)
+#endif
+
+#if !defined HAS_ALTREP || (defined(R_VERSION) && R_VERSION < R_Version(3, 6, 0))
+
+#define LOGICAL_GET_REGION(...) \
+ do { \
+ } while (false)
+
+#define RAW_GET_REGION(...) \
+ do { \
+ } while (false)
+
+#endif
diff --git a/src/arrow/r/inst/include/cpp11/as.hpp b/src/arrow/r/inst/include/cpp11/as.hpp
new file mode 100644
index 000000000..dd9641a16
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/as.hpp
@@ -0,0 +1,337 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <cmath> // for modf
+#include <initializer_list> // for initializer_list
+#include <memory> // for std::shared_ptr, std::weak_ptr, std::unique_ptr
+#include <string> // for string, basic_string
+#include <type_traits> // for decay, enable_if, is_same, is_convertible
+
+#include "cpp11/R.hpp" // for SEXP, SEXPREC, Rf_xlength, R_xlen_t
+#include "cpp11/protect.hpp" // for stop, protect, safe, protect::function
+
+namespace cpp11 {
+
+template <bool C, typename R = void>
+using enable_if_t = typename std::enable_if<C, R>::type;
+
+template <typename T>
+using decay_t = typename std::decay<T>::type;
+
+template <typename T>
+struct is_smart_ptr : std::false_type {};
+
+template <typename T>
+struct is_smart_ptr<std::shared_ptr<T>> : std::true_type {};
+
+template <typename T>
+struct is_smart_ptr<std::unique_ptr<T>> : std::true_type {};
+
+template <typename T>
+struct is_smart_ptr<std::weak_ptr<T>> : std::true_type {};
+
+template <typename T, typename R = void>
+using enable_if_constructible_from_sexp =
+ enable_if_t<!is_smart_ptr<T>::value && // workaround for gcc 4.8
+ std::is_class<T>::value && std::is_constructible<T, SEXP>::value,
+ R>;
+
+template <typename T, typename R = void>
+using enable_if_is_sexp = enable_if_t<std::is_same<T, SEXP>::value, R>;
+
+template <typename T, typename R = void>
+using enable_if_convertible_to_sexp = enable_if_t<std::is_convertible<T, SEXP>::value, R>;
+
+template <typename T, typename R = void>
+using disable_if_convertible_to_sexp =
+ enable_if_t<!std::is_convertible<T, SEXP>::value, R>;
+
+template <typename T, typename R = void>
+using enable_if_integral =
+ enable_if_t<std::is_integral<T>::value && !std::is_same<T, bool>::value &&
+ !std::is_same<T, char>::value,
+ R>;
+
+template <typename T, typename R = void>
+using enable_if_floating_point =
+ typename std::enable_if<std::is_floating_point<T>::value, R>::type;
+
+template <typename E, typename R = void>
+using enable_if_enum = enable_if_t<std::is_enum<E>::value, R>;
+
+template <typename T, typename R = void>
+using enable_if_bool = enable_if_t<std::is_same<T, bool>::value, R>;
+
+template <typename T, typename R = void>
+using enable_if_char = enable_if_t<std::is_same<T, char>::value, R>;
+
+template <typename T, typename R = void>
+using enable_if_std_string = enable_if_t<std::is_same<T, std::string>::value, R>;
+
+template <typename T, typename R = void>
+using enable_if_c_string = enable_if_t<std::is_same<T, const char*>::value, R>;
+
+// https://stackoverflow.com/a/1521682/2055486
+//
+inline bool is_convertable_without_loss_to_integer(double value) {
+ double int_part;
+ return std::modf(value, &int_part) == 0.0;
+}
+
+template <typename T>
+enable_if_constructible_from_sexp<T, T> as_cpp(SEXP from) {
+ return T(from);
+}
+
+template <typename T>
+enable_if_is_sexp<T, T> as_cpp(SEXP from) {
+ return from;
+}
+
+template <typename T>
+enable_if_integral<T, T> as_cpp(SEXP from) {
+ if (Rf_isInteger(from)) {
+ if (Rf_xlength(from) == 1) {
+ return INTEGER_ELT(from, 0);
+ }
+ } else if (Rf_isReal(from)) {
+ if (Rf_xlength(from) == 1) {
+ if (ISNA(REAL_ELT(from, 0))) {
+ return NA_INTEGER;
+ }
+ double value = REAL_ELT(from, 0);
+ if (is_convertable_without_loss_to_integer(value)) {
+ return value;
+ }
+ }
+ } else if (Rf_isLogical(from)) {
+ if (Rf_xlength(from) == 1) {
+ if (LOGICAL_ELT(from, 0) == NA_LOGICAL) {
+ return NA_INTEGER;
+ }
+ }
+ }
+
+ stop("Expected single integer value");
+}
+
+template <typename E>
+enable_if_enum<E, E> as_cpp(SEXP from) {
+ if (Rf_isInteger(from)) {
+ using underlying_type = typename std::underlying_type<E>::type;
+ using int_type = typename std::conditional<std::is_same<char, underlying_type>::value,
+ int, // as_cpp<char> would trigger
+ // undesired string conversions
+ underlying_type>::type;
+ return static_cast<E>(as_cpp<int_type>(from));
+ }
+
+ stop("Expected single integer value");
+}
+
+template <typename T>
+enable_if_bool<T, T> as_cpp(SEXP from) {
+ if (Rf_isLogical(from)) {
+ if (Rf_xlength(from) == 1) {
+ return LOGICAL_ELT(from, 0) == 1;
+ }
+ }
+
+ stop("Expected single logical value");
+}
+
+template <typename T>
+enable_if_floating_point<T, T> as_cpp(SEXP from) {
+ if (Rf_isReal(from)) {
+ if (Rf_xlength(from) == 1) {
+ return REAL_ELT(from, 0);
+ }
+ }
+ // All 32 bit integers can be coerced to doubles, so we just convert them.
+ if (Rf_isInteger(from)) {
+ if (Rf_xlength(from) == 1) {
+ if (INTEGER_ELT(from, 0) == NA_INTEGER) {
+ return NA_REAL;
+ }
+ return INTEGER_ELT(from, 0);
+ }
+ }
+
+ // Also allow NA values
+ if (Rf_isLogical(from)) {
+ if (Rf_xlength(from) == 1) {
+ if (LOGICAL_ELT(from, 0) == NA_LOGICAL) {
+ return NA_REAL;
+ }
+ }
+ }
+
+ stop("Expected single double value");
+}
+
+template <typename T>
+enable_if_char<T, T> as_cpp(SEXP from) {
+ if (Rf_isString(from)) {
+ if (Rf_xlength(from) == 1) {
+ return unwind_protect([&] { return Rf_translateCharUTF8(STRING_ELT(from, 0))[0]; });
+ }
+ }
+
+ stop("Expected string vector of length 1");
+}
+
+template <typename T>
+enable_if_c_string<T, T> as_cpp(SEXP from) {
+ if (Rf_isString(from)) {
+ if (Rf_xlength(from) == 1) {
+ // TODO: use vmaxget / vmaxset here?
+ return {unwind_protect([&] { return Rf_translateCharUTF8(STRING_ELT(from, 0)); })};
+ }
+ }
+
+ stop("Expected string vector of length 1");
+}
+
+template <typename T>
+enable_if_std_string<T, T> as_cpp(SEXP from) {
+ return {as_cpp<const char*>(from)};
+}
+
+/// Temporary workaround for compatibility with cpp11 0.1.0
+template <typename T>
+enable_if_t<!std::is_same<decay_t<T>, T>::value, decay_t<T>> as_cpp(SEXP from) {
+ return as_cpp<decay_t<T>>(from);
+}
+
+template <typename T>
+enable_if_integral<T, SEXP> as_sexp(T from) {
+ return safe[Rf_ScalarInteger](from);
+}
+
+template <typename T>
+enable_if_floating_point<T, SEXP> as_sexp(T from) {
+ return safe[Rf_ScalarReal](from);
+}
+
+template <typename T>
+enable_if_bool<T, SEXP> as_sexp(T from) {
+ return safe[Rf_ScalarLogical](from);
+}
+
+template <typename T>
+enable_if_c_string<T, SEXP> as_sexp(T from) {
+ return unwind_protect([&] { return Rf_ScalarString(Rf_mkCharCE(from, CE_UTF8)); });
+}
+
+template <typename T>
+enable_if_std_string<T, SEXP> as_sexp(const T& from) {
+ return as_sexp(from.c_str());
+}
+
+template <typename Container, typename T = typename Container::value_type,
+ typename = disable_if_convertible_to_sexp<Container>>
+enable_if_integral<T, SEXP> as_sexp(const Container& from) {
+ R_xlen_t size = from.size();
+ SEXP data = safe[Rf_allocVector](INTSXP, size);
+
+ auto it = from.begin();
+ int* data_p = INTEGER(data);
+ for (R_xlen_t i = 0; i < size; ++i, ++it) {
+ data_p[i] = *it;
+ }
+ return data;
+}
+
+inline SEXP as_sexp(std::initializer_list<int> from) {
+ return as_sexp<std::initializer_list<int>>(from);
+}
+
+template <typename Container, typename T = typename Container::value_type,
+ typename = disable_if_convertible_to_sexp<Container>>
+enable_if_floating_point<T, SEXP> as_sexp(const Container& from) {
+ R_xlen_t size = from.size();
+ SEXP data = safe[Rf_allocVector](REALSXP, size);
+
+ auto it = from.begin();
+ double* data_p = REAL(data);
+ for (R_xlen_t i = 0; i < size; ++i, ++it) {
+ data_p[i] = *it;
+ }
+ return data;
+}
+
+inline SEXP as_sexp(std::initializer_list<double> from) {
+ return as_sexp<std::initializer_list<double>>(from);
+}
+
+template <typename Container, typename T = typename Container::value_type,
+ typename = disable_if_convertible_to_sexp<Container>>
+enable_if_bool<T, SEXP> as_sexp(const Container& from) {
+ R_xlen_t size = from.size();
+ SEXP data = safe[Rf_allocVector](LGLSXP, size);
+
+ auto it = from.begin();
+ int* data_p = LOGICAL(data);
+ for (R_xlen_t i = 0; i < size; ++i, ++it) {
+ data_p[i] = *it;
+ }
+ return data;
+}
+
+inline SEXP as_sexp(std::initializer_list<bool> from) {
+ return as_sexp<std::initializer_list<bool>>(from);
+}
+
+namespace detail {
+template <typename Container, typename AsCstring>
+SEXP as_sexp_strings(const Container& from, AsCstring&& c_str) {
+ R_xlen_t size = from.size();
+
+ SEXP data;
+ try {
+ data = PROTECT(safe[Rf_allocVector](STRSXP, size));
+
+ auto it = from.begin();
+ for (R_xlen_t i = 0; i < size; ++i, ++it) {
+ SET_STRING_ELT(data, i, safe[Rf_mkCharCE](c_str(*it), CE_UTF8));
+ }
+ } catch (const unwind_exception& e) {
+ UNPROTECT(1);
+ throw e;
+ }
+
+ UNPROTECT(1);
+ return data;
+}
+} // namespace detail
+
+class r_string;
+
+template <typename T, typename R = void>
+using disable_if_r_string = enable_if_t<!std::is_same<T, cpp11::r_string>::value, R>;
+
+template <typename Container, typename T = typename Container::value_type,
+ typename = disable_if_r_string<T>>
+enable_if_t<std::is_convertible<T, std::string>::value &&
+ !std::is_convertible<T, const char*>::value,
+ SEXP>
+as_sexp(const Container& from) {
+ return detail::as_sexp_strings(from, [](const std::string& s) { return s.c_str(); });
+}
+
+template <typename Container, typename T = typename Container::value_type>
+enable_if_c_string<T, SEXP> as_sexp(const Container& from) {
+ return detail::as_sexp_strings(from, [](const char* s) { return s; });
+}
+
+inline SEXP as_sexp(std::initializer_list<const char*> from) {
+ return as_sexp<std::initializer_list<const char*>>(from);
+}
+
+template <typename T, typename = disable_if_r_string<T>>
+enable_if_convertible_to_sexp<T, SEXP> as_sexp(const T& from) {
+ return from;
+}
+
+} // namespace cpp11
diff --git a/src/arrow/r/inst/include/cpp11/attribute_proxy.hpp b/src/arrow/r/inst/include/cpp11/attribute_proxy.hpp
new file mode 100644
index 000000000..7301919c7
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/attribute_proxy.hpp
@@ -0,0 +1,50 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <initializer_list> // for initializer_list
+#include <string> // for string, basic_string
+
+#include "cpp11/R.hpp" // for SEXP, SEXPREC, Rf_install, PROTECT, Rf_...
+#include "cpp11/as.hpp" // for as_sexp
+#include "cpp11/protect.hpp" // for protect, safe, protect::function
+
+namespace cpp11 {
+
+class sexp;
+
+template <typename T>
+class attribute_proxy {
+ private:
+ const T& parent_;
+ SEXP symbol_;
+
+ public:
+ attribute_proxy(const T& parent, const char* index)
+ : parent_(parent), symbol_(safe[Rf_install](index)) {}
+
+ attribute_proxy(const T& parent, const std::string& index)
+ : parent_(parent), symbol_(safe[Rf_install](index.c_str())) {}
+
+ attribute_proxy(const T& parent, SEXP index) : parent_(parent), symbol_(index) {}
+
+ template <typename C>
+ attribute_proxy& operator=(C rhs) {
+ SEXP value = PROTECT(as_sexp(rhs));
+ Rf_setAttrib(parent_.data(), symbol_, value);
+ UNPROTECT(1);
+ return *this;
+ }
+
+ template <typename C>
+ attribute_proxy& operator=(std::initializer_list<C> rhs) {
+ SEXP value = PROTECT(as_sexp(rhs));
+ Rf_setAttrib(parent_.data(), symbol_, value);
+ UNPROTECT(1);
+ return *this;
+ }
+
+ operator SEXP() const { return safe[Rf_getAttrib](parent_.data(), symbol_); }
+};
+
+} // namespace cpp11
diff --git a/src/arrow/r/inst/include/cpp11/data_frame.hpp b/src/arrow/r/inst/include/cpp11/data_frame.hpp
new file mode 100644
index 000000000..9abbc0f33
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/data_frame.hpp
@@ -0,0 +1,102 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <cstdlib> // for abs
+#include <cstdlib>
+#include <initializer_list> // for initializer_list
+#include <string> // for string, basic_string
+#include <utility> // for move
+
+#include "R_ext/Arith.h" // for NA_INTEGER
+#include "cpp11/R.hpp" // for Rf_xlength, SEXP, SEXPREC, INTEGER
+#include "cpp11/attribute_proxy.hpp" // for attribute_proxy
+#include "cpp11/list.hpp" // for list, r_vector<>::r_vector, r_v...
+#include "cpp11/r_vector.hpp" // for r_vector
+
+namespace cpp11 {
+
+class named_arg;
+namespace writable {
+class data_frame;
+} // namespace writable
+
+class data_frame : public list {
+ using list::list;
+
+ friend class writable::data_frame;
+
+ /* we cannot use Rf_getAttrib because it has a special case for c(NA, -n) and creates
+ * the full vector */
+ static SEXP get_attrib0(SEXP x, SEXP sym) {
+ for (SEXP attr = ATTRIB(x); attr != R_NilValue; attr = CDR(attr)) {
+ if (TAG(attr) == sym) {
+ return CAR(attr);
+ }
+ }
+
+ return R_NilValue;
+ }
+
+ static int calc_nrow(SEXP x) {
+ auto nms = get_attrib0(x, R_RowNamesSymbol);
+ bool has_short_rownames =
+ (Rf_isInteger(nms) && Rf_xlength(nms) == 2 && INTEGER(nms)[0] == NA_INTEGER);
+ if (has_short_rownames) {
+ return abs(INTEGER(nms)[1]);
+ }
+
+ if (!Rf_isNull(nms)) {
+ return Rf_xlength(nms);
+ }
+
+ if (Rf_xlength(x) == 0) {
+ return 0;
+ }
+
+ return Rf_xlength(VECTOR_ELT(x, 0));
+ }
+
+ public:
+ /* Adapted from
+ * https://github.com/wch/r-source/blob/f2a0dfab3e26fb42b8b296fcba40cbdbdbec767d/src/main/attrib.c#L198-L207
+ */
+ R_xlen_t nrow() const { return calc_nrow(*this); }
+ R_xlen_t ncol() const { return size(); }
+};
+
+namespace writable {
+class data_frame : public cpp11::data_frame {
+ private:
+ writable::list set_data_frame_attributes(writable::list&& x) {
+ x.attr(R_RowNamesSymbol) = {NA_INTEGER, -static_cast<int>(calc_nrow(x))};
+ x.attr(R_ClassSymbol) = "data.frame";
+ return std::move(x);
+ }
+
+ public:
+ data_frame(const SEXP data) : cpp11::data_frame(set_data_frame_attributes(data)) {}
+ data_frame(const SEXP data, bool is_altrep)
+ : cpp11::data_frame(set_data_frame_attributes(data), is_altrep) {}
+ data_frame(std::initializer_list<list> il)
+ : cpp11::data_frame(set_data_frame_attributes(writable::list(il))) {}
+ data_frame(std::initializer_list<named_arg> il)
+ : cpp11::data_frame(set_data_frame_attributes(writable::list(il))) {}
+
+ using cpp11::data_frame::ncol;
+ using cpp11::data_frame::nrow;
+
+ attribute_proxy<data_frame> attr(const char* name) const { return {*this, name}; }
+
+ attribute_proxy<data_frame> attr(const std::string& name) const {
+ return {*this, name.c_str()};
+ }
+
+ attribute_proxy<data_frame> attr(SEXP name) const { return {*this, name}; }
+
+ attribute_proxy<data_frame> names() const { return {*this, R_NamesSymbol}; }
+};
+
+} // namespace writable
+
+} // namespace cpp11
diff --git a/src/arrow/r/inst/include/cpp11/declarations.hpp b/src/arrow/r/inst/include/cpp11/declarations.hpp
new file mode 100644
index 000000000..c67c9db1b
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/declarations.hpp
@@ -0,0 +1,54 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <cstring>
+#include <string>
+#include <vector>
+
+#ifndef CPP11_PARTIAL
+#include "cpp11.hpp"
+using namespace cpp11;
+namespace writable = cpp11::writable;
+#endif
+
+#include <R_ext/Rdynload.h>
+
+namespace cpp11 {
+template <class T>
+T& unmove(T&& t) {
+ return t;
+}
+} // namespace cpp11
+
+#ifdef HAS_UNWIND_PROTECT
+#define CPP11_UNWIND R_ContinueUnwind(err);
+#else
+#define CPP11_UNWIND \
+ do { \
+ } while (false);
+#endif
+
+#define CPP11_ERROR_BUFSIZE 8192
+
+#define BEGIN_CPP11 \
+ SEXP err = R_NilValue; \
+ char buf[CPP11_ERROR_BUFSIZE] = ""; \
+ try {
+#define END_CPP11 \
+ } \
+ catch (cpp11::unwind_exception & e) { \
+ err = e.token; \
+ } \
+ catch (std::exception & e) { \
+ strncpy(buf, e.what(), sizeof(buf) - 1); \
+ } \
+ catch (...) { \
+ strncpy(buf, "C++ error (unknown cause)", sizeof(buf) - 1); \
+ } \
+ if (buf[0] != '\0') { \
+ Rf_errorcall(R_NilValue, "%s", buf); \
+ } else if (err != R_NilValue) { \
+ CPP11_UNWIND \
+ } \
+ return R_NilValue;
diff --git a/src/arrow/r/inst/include/cpp11/doubles.hpp b/src/arrow/r/inst/include/cpp11/doubles.hpp
new file mode 100644
index 000000000..a12f7c7c0
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/doubles.hpp
@@ -0,0 +1,145 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <algorithm> // for min
+#include <array> // for array
+#include <initializer_list> // for initializer_list
+
+#include "R_ext/Arith.h" // for ISNA
+#include "cpp11/R.hpp" // for SEXP, SEXPREC, Rf_allocVector, REAL
+#include "cpp11/as.hpp" // for as_sexp
+#include "cpp11/named_arg.hpp" // for named_arg
+#include "cpp11/protect.hpp" // for SEXP, SEXPREC, REAL_ELT, R_Preserve...
+#include "cpp11/r_vector.hpp" // for vector, vector<>::proxy, vector<>::...
+#include "cpp11/sexp.hpp" // for sexp
+
+// Specializations for doubles
+
+namespace cpp11 {
+
+template <>
+inline SEXP r_vector<double>::valid_type(SEXP data) {
+ if (TYPEOF(data) != REALSXP) {
+ throw type_error(REALSXP, TYPEOF(data));
+ }
+ return data;
+}
+
+template <>
+inline double r_vector<double>::operator[](const R_xlen_t pos) const {
+ // NOPROTECT: likely too costly to unwind protect every elt
+ return is_altrep_ ? REAL_ELT(data_, pos) : data_p_[pos];
+}
+
+template <>
+inline double* r_vector<double>::get_p(bool is_altrep, SEXP data) {
+ if (is_altrep) {
+ return nullptr;
+ } else {
+ return REAL(data);
+ }
+}
+
+template <>
+inline void r_vector<double>::const_iterator::fill_buf(R_xlen_t pos) {
+ length_ = std::min(64_xl, data_->size() - pos);
+ REAL_GET_REGION(data_->data_, pos, length_, buf_.data());
+ block_start_ = pos;
+}
+
+typedef r_vector<double> doubles;
+
+namespace writable {
+
+template <>
+inline typename r_vector<double>::proxy& r_vector<double>::proxy::operator=(
+ const double& rhs) {
+ if (is_altrep_) {
+ // NOPROTECT: likely too costly to unwind protect every set elt
+ SET_REAL_ELT(data_, index_, rhs);
+ } else {
+ *p_ = rhs;
+ }
+ return *this;
+}
+
+template <>
+inline r_vector<double>::proxy::operator double() const {
+ if (p_ == nullptr) {
+ // NOPROTECT: likely too costly to unwind protect every elt
+ return REAL_ELT(data_, index_);
+ } else {
+ return *p_;
+ }
+}
+
+template <>
+inline r_vector<double>::r_vector(std::initializer_list<double> il)
+ : cpp11::r_vector<double>(as_sexp(il)), capacity_(il.size()) {}
+
+template <>
+inline r_vector<double>::r_vector(std::initializer_list<named_arg> il)
+ : cpp11::r_vector<double>(safe[Rf_allocVector](REALSXP, il.size())),
+ capacity_(il.size()) {
+ protect_ = preserved.insert(data_);
+ int n_protected = 0;
+
+ try {
+ unwind_protect([&] {
+ Rf_setAttrib(data_, R_NamesSymbol, Rf_allocVector(STRSXP, capacity_));
+ SEXP names = PROTECT(Rf_getAttrib(data_, R_NamesSymbol));
+ ++n_protected;
+ auto it = il.begin();
+ for (R_xlen_t i = 0; i < capacity_; ++i, ++it) {
+ data_p_[i] = REAL_ELT(it->value(), 0);
+ SET_STRING_ELT(names, i, Rf_mkCharCE(it->name(), CE_UTF8));
+ }
+ UNPROTECT(n_protected);
+ });
+ } catch (const unwind_exception& e) {
+ preserved.release(protect_);
+ UNPROTECT(n_protected);
+ throw e;
+ }
+}
+
+template <>
+inline void r_vector<double>::reserve(R_xlen_t new_capacity) {
+ data_ = data_ == R_NilValue ? safe[Rf_allocVector](REALSXP, new_capacity)
+ : safe[Rf_xlengthgets](data_, new_capacity);
+ SEXP old_protect = protect_;
+ protect_ = preserved.insert(data_);
+ preserved.release(old_protect);
+
+ data_p_ = REAL(data_);
+ capacity_ = new_capacity;
+}
+
+template <>
+inline void r_vector<double>::push_back(double value) {
+ while (length_ >= capacity_) {
+ reserve(capacity_ == 0 ? 1 : capacity_ *= 2);
+ }
+ if (is_altrep_) {
+ SET_REAL_ELT(data_, length_, value);
+ } else {
+ data_p_[length_] = value;
+ }
+ ++length_;
+}
+
+typedef r_vector<double> doubles;
+
+} // namespace writable
+
+template <>
+inline double na() {
+ return NA_REAL;
+}
+
+template <>
+inline bool is_na(const double& x) {
+ return ISNA(x);
+}
+} // namespace cpp11
diff --git a/src/arrow/r/inst/include/cpp11/environment.hpp b/src/arrow/r/inst/include/cpp11/environment.hpp
new file mode 100644
index 000000000..038fb60a8
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/environment.hpp
@@ -0,0 +1,75 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <string> // for string, basic_string
+
+#include "Rversion.h" // for R_VERSION, R_Version
+#include "cpp11/R.hpp" // for SEXP, SEXPREC, Rf_install, Rf_findVarIn...
+#include "cpp11/as.hpp" // for as_sexp
+#include "cpp11/protect.hpp" // for protect, protect::function, safe, unwin...
+#include "cpp11/sexp.hpp" // for sexp
+
+#if R_VERSION >= R_Version(4, 0, 0)
+#define HAS_REMOVE_VAR_FROM_FRAME
+#endif
+
+#ifndef HAS_REMOVE_VAR_FROM_FRAME
+#include "cpp11/function.hpp"
+#endif
+
+namespace cpp11 {
+
+class environment {
+ private:
+ sexp env_;
+
+ class proxy {
+ SEXP parent_;
+ SEXP name_;
+
+ public:
+ proxy(SEXP parent, SEXP name) : parent_(parent), name_(name) {}
+
+ template <typename T>
+ proxy& operator=(T value) {
+ safe[Rf_defineVar](name_, as_sexp(value), parent_);
+ return *this;
+ }
+ operator SEXP() const { return safe[Rf_findVarInFrame3](parent_, name_, TRUE); };
+ operator sexp() const { return SEXP(); };
+ };
+
+ public:
+ environment(SEXP env) : env_(env) {}
+ proxy operator[](SEXP name) const { return {env_, name}; }
+ proxy operator[](const char* name) const { return operator[](safe[Rf_install](name)); }
+ proxy operator[](const std::string& name) const { return operator[](name.c_str()); }
+
+ bool exists(SEXP name) const {
+ SEXP res = safe[Rf_findVarInFrame3](env_, name, FALSE);
+ return res != R_UnboundValue;
+ }
+ bool exists(const char* name) const { return exists(safe[Rf_install](name)); }
+
+ bool exists(const std::string& name) const { return exists(name.c_str()); }
+
+ void remove(SEXP name) {
+ PROTECT(name);
+#ifdef HAS_REMOVE_VAR_FROM_FRAME
+ R_removeVarFromFrame(name, env_);
+#else
+ auto remove = package("base")["remove"];
+ remove(name, "envir"_nm = env_);
+#endif
+ UNPROTECT(1);
+ }
+
+ void remove(const char* name) { remove(safe[Rf_install](name)); }
+
+ R_xlen_t size() const { return Rf_xlength(env_); }
+
+ operator SEXP() const { return env_; }
+};
+
+} // namespace cpp11
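
The `proxy` returned by `environment::operator[]` makes a binding both assignable and readable in one expression. A minimal sketch, not part of the vendored diff (`env_example` is hypothetical):

```cpp
#include "cpp11/environment.hpp"

// Hypothetical example: write, read, and remove a binding through the proxy.
void env_example(cpp11::environment env) {
  env["x"] = 42;          // Rf_defineVar via proxy::operator=
  SEXP value = env["x"];  // Rf_findVarInFrame3 via operator SEXP
  if (env.exists("x")) {
    env.remove("x");      // R_removeVarFromFrame on R >= 4.0.0
  }
  (void)value;
}
```
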
diff --git a/src/arrow/r/inst/include/cpp11/external_pointer.hpp b/src/arrow/r/inst/include/cpp11/external_pointer.hpp
new file mode 100644
index 000000000..059a1aa55
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/external_pointer.hpp
@@ -0,0 +1,166 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <cstddef> // for nullptr_t, NULL
+#include <memory> // for bad_weak_ptr
+#include <type_traits> // for add_lvalue_reference
+
+#include "cpp11/R.hpp" // for SEXP, SEXPREC, TYPEOF, R_NilValue, R_C...
+#include "cpp11/protect.hpp" // for protect, safe, protect::function
+#include "cpp11/r_bool.hpp" // for r_bool
+#include "cpp11/r_vector.hpp" // for type_error
+#include "cpp11/sexp.hpp" // for sexp
+
+namespace cpp11 {
+
+template <typename T>
+void default_deleter(T* obj) {
+ delete obj;
+}
+
+template <typename T, void Deleter(T*) = default_deleter<T>>
+class external_pointer {
+ private:
+ sexp data_ = R_NilValue;
+
+ static SEXP valid_type(SEXP data) {
+ if (TYPEOF(data) != EXTPTRSXP) {
+ throw type_error(EXTPTRSXP, TYPEOF(data));
+ }
+
+ return data;
+ }
+
+ static void r_deleter(SEXP p) {
+ if (TYPEOF(p) != EXTPTRSXP) return;
+
+ T* ptr = static_cast<T*>(R_ExternalPtrAddr(p));
+
+ if (ptr == NULL) {
+ return;
+ }
+
+ R_ClearExternalPtr(p);
+
+ Deleter(ptr);
+ }
+
+ public:
+ using pointer = T*;
+
+ external_pointer() noexcept {}
+ external_pointer(std::nullptr_t) noexcept {}
+
+ external_pointer(SEXP data) : data_(valid_type(data)) {}
+
+ external_pointer(pointer p, bool use_deleter = true, bool finalize_on_exit = true)
+ : data_(safe[R_MakeExternalPtr]((void*)p, R_NilValue, R_NilValue)) {
+ if (use_deleter) {
+ R_RegisterCFinalizerEx(data_, r_deleter, static_cast<r_bool>(finalize_on_exit));
+ }
+ }
+
+ external_pointer(const external_pointer& rhs) {
+ data_ = safe[Rf_shallow_duplicate](rhs.data_);
+ }
+
+ external_pointer(external_pointer&& rhs) { reset(rhs.release()); }
+
+ external_pointer& operator=(external_pointer&& rhs) noexcept {
+ reset(rhs.release());
+ return *this;
+ }
+
+ external_pointer& operator=(std::nullptr_t) noexcept {
+ reset();
+ return *this;
+ }
+
+ operator SEXP() const noexcept { return data_; }
+
+ pointer get() const noexcept {
+ pointer addr = static_cast<T*>(R_ExternalPtrAddr(data_));
+ if (addr == nullptr) {
+ return nullptr;
+ }
+ return addr;
+ }
+
+ typename std::add_lvalue_reference<T>::type operator*() {
+ pointer addr = get();
+ if (addr == nullptr) {
+ throw std::bad_weak_ptr();
+ }
+ return *addr;
+ }
+
+ pointer operator->() const {
+ pointer addr = get();
+ if (addr == nullptr) {
+ throw std::bad_weak_ptr();
+ }
+ return addr;
+ }
+
+ pointer release() noexcept {
+ if (get() == nullptr) {
+ return nullptr;
+ }
+ pointer ptr = get();
+ R_ClearExternalPtr(data_);
+
+ return ptr;
+ }
+
+ void reset(pointer ptr = pointer()) {
+ SEXP old_data = data_;
+ data_ = safe[R_MakeExternalPtr]((void*)ptr, R_NilValue, R_NilValue);
+ r_deleter(old_data);
+ }
+
+ void swap(external_pointer& other) noexcept {
+ SEXP tmp = other.data_;
+ other.data_ = data_;
+ data_ = tmp;
+ }
+
+ operator bool() noexcept { return data_ != R_NilValue; }
+};
+
+template <class T, void Deleter(T*)>
+void swap(external_pointer<T, Deleter>& lhs, external_pointer<T, Deleter>& rhs) noexcept {
+ lhs.swap(rhs);
+}
+
+template <class T, void Deleter(T*)>
+bool operator==(const external_pointer<T, Deleter>& x,
+ const external_pointer<T, Deleter>& y) {
+ // Compare through the public SEXP conversion; data_ is private to the class.
+ return static_cast<SEXP>(x) == static_cast<SEXP>(y);
+}
+
+template <class T, void Deleter(T*)>
+bool operator!=(const external_pointer<T, Deleter>& x,
+ const external_pointer<T, Deleter>& y) {
+ return static_cast<SEXP>(x) != static_cast<SEXP>(y);
+}
+
+template <class T, void Deleter(T*)>
+bool operator<(const external_pointer<T, Deleter>& x,
+ const external_pointer<T, Deleter>& y) {
+ return static_cast<SEXP>(x) < static_cast<SEXP>(y);
+}
+
+template <class T, void Deleter(T*)>
+bool operator<=(const external_pointer<T, Deleter>& x,
+ const external_pointer<T, Deleter>& y) {
+ return static_cast<SEXP>(x) <= static_cast<SEXP>(y);
+}
+
+template <class T, void Deleter(T*)>
+bool operator>(const external_pointer<T, Deleter>& x,
+ const external_pointer<T, Deleter>& y) {
+ return static_cast<SEXP>(x) > static_cast<SEXP>(y);
+}
+
+template <class T, void Deleter(T*)>
+bool operator>=(const external_pointer<T, Deleter>& x,
+ const external_pointer<T, Deleter>& y) {
+ return static_cast<SEXP>(x) >= static_cast<SEXP>(y);
+}
+
+} // namespace cpp11
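
`external_pointer` ties a heap-allocated C++ object's lifetime to an R `EXTPTRSXP`: `r_deleter` runs from the registered finalizer once R garbage-collects the handle. A minimal ownership sketch, not part of the vendored diff (`counter` and `make_counter` are hypothetical):

```cpp
#include "cpp11/external_pointer.hpp"

// Hypothetical example: R's garbage collector ends up owning the C++ object.
struct counter {
  int n = 0;
};

SEXP make_counter() {
  // default_deleter<counter> will `delete` the object at finalization
  cpp11::external_pointer<counter> ptr(new counter());
  ptr->n = 1;  // operator-> throws std::bad_weak_ptr on a cleared pointer
  return ptr;  // hand the EXTPTRSXP (and its finalizer) to R
}
```
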
diff --git a/src/arrow/r/inst/include/cpp11/function.hpp b/src/arrow/r/inst/include/cpp11/function.hpp
new file mode 100644
index 000000000..06e602ac6
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/function.hpp
@@ -0,0 +1,78 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <string.h> // for strcmp
+
+#include <string> // for string, basic_string
+#include <utility> // for forward
+
+#include "cpp11/R.hpp" // for SEXP, SEXPREC, CDR, Rf_install, SETCAR
+#include "cpp11/as.hpp" // for as_sexp
+#include "cpp11/named_arg.hpp" // for named_arg
+#include "cpp11/protect.hpp" // for protect, protect::function, safe
+#include "cpp11/sexp.hpp" // for sexp
+
+namespace cpp11 {
+
+class function {
+ public:
+ function(SEXP data) : data_(data) {}
+
+ template <typename... Args>
+ sexp operator()(Args&&... args) const {
+ // Size of the arguments plus one for the function name itself
+ R_xlen_t num_args = sizeof...(args) + 1;
+
+ sexp call(safe[Rf_allocVector](LANGSXP, num_args));
+
+ construct_call(call, data_, std::forward<Args>(args)...);
+
+ return safe[Rf_eval](call, R_GlobalEnv);
+ }
+
+ private:
+ SEXP data_;
+
+ template <typename... Args>
+ SEXP construct_call(SEXP val, const named_arg& arg, Args&&... args) const {
+ SETCAR(val, arg.value());
+ SET_TAG(val, safe[Rf_install](arg.name()));
+ val = CDR(val);
+ return construct_call(val, std::forward<Args>(args)...);
+ }
+
+ // Construct the call recursively; each iteration consumes one argument from
+ // the pack, appends it to the pairlist, and recurses on the rest.
+ template <typename T, typename... Args>
+ SEXP construct_call(SEXP val, const T& arg, Args&&... args) const {
+ SETCAR(val, as_sexp(arg));
+ val = CDR(val);
+ return construct_call(val, std::forward<Args>(args)...);
+ }
+
+ // Base case, just return
+ SEXP construct_call(SEXP val) const { return val; }
+};
+
+class package {
+ public:
+ package(const char* name) : data_(get_namespace(name)) {}
+ package(const std::string& name) : data_(get_namespace(name.c_str())) {}
+ function operator[](const char* name) {
+ return safe[Rf_findFun](safe[Rf_install](name), data_);
+ }
+ function operator[](const std::string& name) { return operator[](name.c_str()); }
+
+ private:
+ static SEXP get_namespace(const char* name) {
+ if (strcmp(name, "base") == 0) {
+ return R_BaseEnv;
+ }
+ sexp name_sexp = safe[Rf_install](name);
+ return safe[Rf_findVarInFrame](R_NamespaceRegistry, name_sexp);
+ }
+
+ SEXP data_;
+};
+} // namespace cpp11
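
`function::operator()` builds a `LANGSXP` call pairlist via `construct_call` and evaluates it with `Rf_eval`; positional arguments get `SETCAR`, named ones additionally get `SET_TAG`. A minimal call sketch, not part of the vendored diff (`call_mean` is hypothetical):

```cpp
#include "cpp11/doubles.hpp"
#include "cpp11/function.hpp"

using namespace cpp11::literals;  // for ""_nm

// Hypothetical example: look up base::mean and call it with a named argument.
double call_mean(cpp11::doubles x) {
  cpp11::function mean = cpp11::package("base")["mean"];
  cpp11::doubles result(mean(x, "na.rm"_nm = true));
  return result[0];
}
```
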
diff --git a/src/arrow/r/inst/include/cpp11/integers.hpp b/src/arrow/r/inst/include/cpp11/integers.hpp
new file mode 100644
index 000000000..19f85c060
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/integers.hpp
@@ -0,0 +1,146 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <algorithm> // for min
+#include <array> // for array
+#include <initializer_list> // for initializer_list
+
+#include "R_ext/Arith.h" // for NA_INTEGER
+#include "cpp11/R.hpp" // for SEXP, SEXPREC, Rf_allocVector
+#include "cpp11/as.hpp" // for as_sexp
+#include "cpp11/attribute_proxy.hpp" // for attribute_proxy
+#include "cpp11/named_arg.hpp" // for named_arg
+#include "cpp11/protect.hpp" // for preserved
+#include "cpp11/r_vector.hpp" // for r_vector, r_vector<>::proxy
+#include "cpp11/sexp.hpp" // for sexp
+
+// Specializations for integers
+
+namespace cpp11 {
+
+template <>
+inline SEXP r_vector<int>::valid_type(SEXP data) {
+ if (TYPEOF(data) != INTSXP) {
+ throw type_error(INTSXP, TYPEOF(data));
+ }
+ return data;
+}
+
+template <>
+inline int r_vector<int>::operator[](const R_xlen_t pos) const {
+ // NOPROTECT: likely too costly to unwind protect every elt
+ return is_altrep_ ? INTEGER_ELT(data_, pos) : data_p_[pos];
+}
+
+template <>
+inline int* r_vector<int>::get_p(bool is_altrep, SEXP data) {
+ if (is_altrep) {
+ return nullptr;
+ } else {
+ return INTEGER(data);
+ }
+}
+
+template <>
+inline void r_vector<int>::const_iterator::fill_buf(R_xlen_t pos) {
+ length_ = std::min(64_xl, data_->size() - pos);
+ INTEGER_GET_REGION(data_->data_, pos, length_, buf_.data());
+ block_start_ = pos;
+}
+
+typedef r_vector<int> integers;
+
+namespace writable {
+
+template <>
+inline typename r_vector<int>::proxy& r_vector<int>::proxy::operator=(const int& rhs) {
+ if (is_altrep_) {
+ // NOPROTECT: likely too costly to unwind protect every set elt
+ SET_INTEGER_ELT(data_, index_, rhs);
+ } else {
+ *p_ = rhs;
+ }
+ return *this;
+}
+
+template <>
+inline r_vector<int>::proxy::operator int() const {
+ if (p_ == nullptr) {
+ // NOPROTECT: likely too costly to unwind protect every elt
+ return INTEGER_ELT(data_, index_);
+ } else {
+ return *p_;
+ }
+}
+
+template <>
+inline r_vector<int>::r_vector(std::initializer_list<int> il)
+ : cpp11::r_vector<int>(as_sexp(il)), capacity_(il.size()) {}
+
+template <>
+inline void r_vector<int>::reserve(R_xlen_t new_capacity) {
+ data_ = data_ == R_NilValue ? safe[Rf_allocVector](INTSXP, new_capacity)
+ : safe[Rf_xlengthgets](data_, new_capacity);
+ SEXP old_protect = protect_;
+
+ // Protect the new data
+ protect_ = preserved.insert(data_);
+
+ // Release the old protection;
+ preserved.release(old_protect);
+
+ data_p_ = INTEGER(data_);
+ capacity_ = new_capacity;
+}
+
+template <>
+inline r_vector<int>::r_vector(std::initializer_list<named_arg> il)
+ : cpp11::r_vector<int>(safe[Rf_allocVector](INTSXP, il.size())),
+ capacity_(il.size()) {
+ protect_ = preserved.insert(data_);
+ int n_protected = 0;
+
+ try {
+ unwind_protect([&] {
+ Rf_setAttrib(data_, R_NamesSymbol, Rf_allocVector(STRSXP, capacity_));
+ SEXP names = PROTECT(Rf_getAttrib(data_, R_NamesSymbol));
+ ++n_protected;
+ auto it = il.begin();
+ for (R_xlen_t i = 0; i < capacity_; ++i, ++it) {
+ data_p_[i] = INTEGER_ELT(it->value(), 0);
+ SET_STRING_ELT(names, i, Rf_mkCharCE(it->name(), CE_UTF8));
+ }
+ UNPROTECT(n_protected);
+ });
+ } catch (const unwind_exception& e) {
+ preserved.release(protect_);
+ UNPROTECT(n_protected);
+ throw e;
+ }
+}
+
+template <>
+inline void r_vector<int>::push_back(int value) {
+ while (length_ >= capacity_) {
+ reserve(capacity_ == 0 ? 1 : capacity_ *= 2);
+ }
+ if (is_altrep_) {
+ // NOPROTECT: likely too costly to unwind protect every elt
+ SET_INTEGER_ELT(data_, length_, value);
+ } else {
+ data_p_[length_] = value;
+ }
+ ++length_;
+}
+
+typedef r_vector<int> integers;
+
+} // namespace writable
+
+template <>
+inline int na() {
+ return NA_INTEGER;
+}
+
+} // namespace cpp11
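
For ALTREP input the `const_iterator` above never materializes the whole vector: `fill_buf()` pulls 64-element blocks through `INTEGER_GET_REGION`. A minimal traversal sketch, not part of the vendored diff (`sum_ints` is hypothetical):

```cpp
#include "cpp11/integers.hpp"

// Hypothetical example: range-for uses the buffered const_iterator, so a
// compact ALTREP sequence like 1:1e6 is read in 64-element blocks.
double sum_ints(cpp11::integers x) {
  double total = 0;
  for (int v : x) {
    total += v;
  }
  return total;
}
```
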
diff --git a/src/arrow/r/inst/include/cpp11/list.hpp b/src/arrow/r/inst/include/cpp11/list.hpp
new file mode 100644
index 000000000..28140fe2f
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/list.hpp
@@ -0,0 +1,138 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <initializer_list> // for initializer_list
+
+#include "cpp11/R.hpp" // for SEXP, SEXPREC, SET_VECTOR_ELT
+#include "cpp11/attribute_proxy.hpp" // for attribute_proxy
+#include "cpp11/named_arg.hpp" // for named_arg
+#include "cpp11/protect.hpp" // for preserved
+#include "cpp11/r_string.hpp" // for r_string
+#include "cpp11/r_vector.hpp" // for r_vector, r_vector<>::proxy
+#include "cpp11/sexp.hpp" // for sexp
+
+// Specializations for list
+
+namespace cpp11 {
+
+template <>
+inline SEXP r_vector<SEXP>::valid_type(SEXP data) {
+ if (TYPEOF(data) != VECSXP) {
+ throw type_error(VECSXP, TYPEOF(data));
+ }
+ return data;
+}
+
+template <>
+inline SEXP r_vector<SEXP>::operator[](const R_xlen_t pos) const {
+ return VECTOR_ELT(data_, pos);
+}
+
+template <>
+inline SEXP r_vector<SEXP>::operator[](const r_string& name) const {
+ SEXP names = this->names();
+ R_xlen_t size = Rf_xlength(names);
+
+ for (R_xlen_t pos = 0; pos < size; ++pos) {
+ auto cur = Rf_translateCharUTF8(STRING_ELT(names, pos));
+ if (name == cur) {
+ return operator[](pos);
+ }
+ }
+ return R_NilValue;
+}
+
+template <>
+inline SEXP* r_vector<SEXP>::get_p(bool, SEXP) {
+ return nullptr;
+}
+
+template <>
+inline void r_vector<SEXP>::const_iterator::fill_buf(R_xlen_t) {
+ return;
+}
+
+template <>
+inline SEXP r_vector<SEXP>::const_iterator::operator*() const {
+ return VECTOR_ELT(data_->data(), pos_);
+}
+
+typedef r_vector<SEXP> list;
+
+namespace writable {
+
+template <>
+inline typename r_vector<SEXP>::proxy& r_vector<SEXP>::proxy::operator=(const SEXP& rhs) {
+ SET_VECTOR_ELT(data_, index_, rhs);
+ return *this;
+}
+
+template <>
+inline r_vector<SEXP>::proxy::operator SEXP() const {
+ return VECTOR_ELT(data_, index_);
+}
+
+template <>
+inline r_vector<SEXP>::r_vector(std::initializer_list<SEXP> il)
+ : cpp11::r_vector<SEXP>(safe[Rf_allocVector](VECSXP, il.size())),
+ capacity_(il.size()) {
+ protect_ = preserved.insert(data_);
+ auto it = il.begin();
+ for (R_xlen_t i = 0; i < capacity_; ++i, ++it) {
+ SET_VECTOR_ELT(data_, i, *it);
+ }
+}
+
+template <>
+inline r_vector<SEXP>::r_vector(std::initializer_list<named_arg> il)
+ : cpp11::r_vector<SEXP>(safe[Rf_allocVector](VECSXP, il.size())),
+ capacity_(il.size()) {
+ protect_ = preserved.insert(data_);
+ int n_protected = 0;
+
+ try {
+ unwind_protect([&] {
+ Rf_setAttrib(data_, R_NamesSymbol, Rf_allocVector(STRSXP, capacity_));
+ SEXP names = PROTECT(Rf_getAttrib(data_, R_NamesSymbol));
+ ++n_protected;
+ auto it = il.begin();
+ for (R_xlen_t i = 0; i < capacity_; ++i, ++it) {
+ SET_VECTOR_ELT(data_, i, it->value());
+ SET_STRING_ELT(names, i, Rf_mkCharCE(it->name(), CE_UTF8));
+ }
+ UNPROTECT(n_protected);
+ });
+ } catch (const unwind_exception& e) {
+ preserved.release(protect_);
+ UNPROTECT(n_protected);
+ throw e;
+ }
+}
+
+template <>
+inline void r_vector<SEXP>::reserve(R_xlen_t new_capacity) {
+ data_ = data_ == R_NilValue ? safe[Rf_allocVector](VECSXP, new_capacity)
+ : safe[Rf_xlengthgets](data_, new_capacity);
+
+ SEXP old_protect = protect_;
+ protect_ = preserved.insert(data_);
+ preserved.release(old_protect);
+
+ capacity_ = new_capacity;
+}
+
+template <>
+inline void r_vector<SEXP>::push_back(SEXP value) {
+ while (length_ >= capacity_) {
+ reserve(capacity_ == 0 ? 1 : capacity_ *= 2);
+ }
+ SET_VECTOR_ELT(data_, length_, value);
+ ++length_;
+}
+
+typedef r_vector<SEXP> list;
+
+} // namespace writable
+
+} // namespace cpp11
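
Because `SET_VECTOR_ELT` re-parents each element into the list, a `writable::list` keeps its contents alive without extra protect calls. A minimal construction sketch, not part of the vendored diff (`make_record` is hypothetical):

```cpp
#include "cpp11/doubles.hpp"
#include "cpp11/list.hpp"

using namespace cpp11::literals;

// Hypothetical example: build a named list from heterogeneous elements.
cpp11::list make_record() {
  cpp11::writable::list out({
      "x"_nm = cpp11::writable::doubles({1.0, 2.0}),
      "n"_nm = 2,
  });
  return out;
}
```
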
diff --git a/src/arrow/r/inst/include/cpp11/list_of.hpp b/src/arrow/r/inst/include/cpp11/list_of.hpp
new file mode 100644
index 000000000..d9b8f8020
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/list_of.hpp
@@ -0,0 +1,73 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <string> // for string, basic_string
+
+#include "cpp11/R.hpp" // for R_xlen_t, SEXP, SEXPREC, LONG_VECTOR_SUPPORT
+#include "cpp11/list.hpp" // for list
+
+namespace cpp11 {
+
+template <typename T>
+class list_of : public list {
+ public:
+ list_of(const list& data) : list(data) {}
+
+#ifdef LONG_VECTOR_SUPPORT
+ T operator[](int pos) { return operator[](static_cast<R_xlen_t>(pos)); }
+#endif
+
+ T operator[](R_xlen_t pos) { return list::operator[](pos); }
+
+ T operator[](const char* pos) { return list::operator[](pos); }
+
+ T operator[](const std::string& pos) { return list::operator[](pos.c_str()); }
+};
+
+namespace writable {
+template <typename T>
+class list_of : public writable::list {
+ public:
+ list_of(const list& data) : writable::list(data) {}
+ list_of(R_xlen_t n) : writable::list(n) {}
+
+ class proxy {
+ private:
+ writable::list::proxy data_;
+
+ public:
+ proxy(const writable::list::proxy& data) : data_(data) {}
+
+ operator T() const { return static_cast<SEXP>(*this); }
+ operator SEXP() const { return static_cast<SEXP>(data_); }
+#ifdef LONG_VECTOR_SUPPORT
+ typename T::proxy operator[](int pos) { return static_cast<T>(data_)[pos]; }
+#endif
+ typename T::proxy operator[](R_xlen_t pos) { return static_cast<T>(data_)[pos]; }
+ proxy operator[](const char* pos) { return static_cast<T>(data_)[pos]; }
+ proxy operator[](const std::string& pos) { return static_cast<T>(data_)[pos]; }
+ proxy& operator=(const T& rhs) {
+ data_ = rhs;
+
+ return *this;
+ }
+ };
+
+#ifdef LONG_VECTOR_SUPPORT
+ proxy operator[](int pos) {
+ return {writable::list::operator[](static_cast<R_xlen_t>(pos))};
+ }
+#endif
+
+ proxy operator[](R_xlen_t pos) { return writable::list::operator[](pos); }
+
+ proxy operator[](const char* pos) { return {writable::list::operator[](pos)}; }
+
+ proxy operator[](const std::string& pos) {
+ return writable::list::operator[](pos.c_str());
+ }
+};
+} // namespace writable
+
+} // namespace cpp11
diff --git a/src/arrow/r/inst/include/cpp11/logicals.hpp b/src/arrow/r/inst/include/cpp11/logicals.hpp
new file mode 100644
index 000000000..5f96b3eab
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/logicals.hpp
@@ -0,0 +1,143 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <algorithm> // for min
+#include <array> // for array
+#include <initializer_list> // for initializer_list
+
+#include "cpp11/R.hpp" // for SEXP, SEXPREC, Rf_all...
+#include "cpp11/attribute_proxy.hpp" // for attribute_proxy
+#include "cpp11/named_arg.hpp" // for named_arg
+#include "cpp11/protect.hpp" // for preserved
+#include "cpp11/r_bool.hpp" // for r_bool
+#include "cpp11/r_vector.hpp" // for r_vector, r_vector<>::proxy
+#include "cpp11/sexp.hpp" // for sexp
+
+// Specializations for logicals
+
+namespace cpp11 {
+
+template <>
+inline SEXP r_vector<r_bool>::valid_type(SEXP data) {
+ if (TYPEOF(data) != LGLSXP) {
+ throw type_error(LGLSXP, TYPEOF(data));
+ }
+ return data;
+}
+
+template <>
+inline r_bool r_vector<r_bool>::operator[](const R_xlen_t pos) const {
+ return is_altrep_ ? static_cast<r_bool>(LOGICAL_ELT(data_, pos)) : data_p_[pos];
+}
+
+template <>
+inline r_bool* r_vector<r_bool>::get_p(bool is_altrep, SEXP data) {
+ if (is_altrep) {
+ return nullptr;
+ } else {
+ return reinterpret_cast<r_bool*>(LOGICAL(data));
+ }
+}
+
+template <>
+inline void r_vector<r_bool>::const_iterator::fill_buf(R_xlen_t pos) {
+ length_ = std::min(64_xl, data_->size() - pos);
+ LOGICAL_GET_REGION(data_->data_, pos, length_, reinterpret_cast<int*>(buf_.data()));
+ block_start_ = pos;
+}
+
+typedef r_vector<r_bool> logicals;
+
+namespace writable {
+
+template <>
+inline typename r_vector<r_bool>::proxy& r_vector<r_bool>::proxy::operator=(
+ const r_bool& rhs) {
+ if (is_altrep_) {
+ SET_LOGICAL_ELT(data_, index_, rhs);
+ } else {
+ *p_ = rhs;
+ }
+ return *this;
+}
+
+template <>
+inline r_vector<r_bool>::proxy::operator r_bool() const {
+ if (p_ == nullptr) {
+ return static_cast<r_bool>(LOGICAL_ELT(data_, index_));
+ } else {
+ return *p_;
+ }
+}
+
+inline bool operator==(const r_vector<r_bool>::proxy& lhs, r_bool rhs) {
+ return static_cast<r_bool>(lhs).operator==(rhs);
+}
+
+template <>
+inline r_vector<r_bool>::r_vector(std::initializer_list<r_bool> il)
+ : cpp11::r_vector<r_bool>(safe[Rf_allocVector](LGLSXP, il.size())), capacity_(il.size()) {
+ protect_ = preserved.insert(data_);
+ auto it = il.begin();
+ for (R_xlen_t i = 0; i < capacity_; ++i, ++it) {
+ SET_LOGICAL_ELT(data_, i, *it);
+ }
+}
+
+template <>
+inline r_vector<r_bool>::r_vector(std::initializer_list<named_arg> il)
+ : cpp11::r_vector<r_bool>(safe[Rf_allocVector](LGLSXP, il.size())),
+ capacity_(il.size()) {
+ protect_ = preserved.insert(data_);
+ int n_protected = 0;
+
+ try {
+ unwind_protect([&] {
+ Rf_setAttrib(data_, R_NamesSymbol, Rf_allocVector(STRSXP, capacity_));
+ SEXP names = PROTECT(Rf_getAttrib(data_, R_NamesSymbol));
+ ++n_protected;
+ auto it = il.begin();
+ for (R_xlen_t i = 0; i < capacity_; ++i, ++it) {
+ data_p_[i] = static_cast<r_bool>(LOGICAL_ELT(it->value(), 0));
+ SET_STRING_ELT(names, i, Rf_mkCharCE(it->name(), CE_UTF8));
+ }
+ UNPROTECT(n_protected);
+ });
+ } catch (const unwind_exception& e) {
+ preserved.release(protect_);
+ UNPROTECT(n_protected);
+ throw e;
+ }
+}
+
+template <>
+inline void r_vector<r_bool>::reserve(R_xlen_t new_capacity) {
+ data_ = data_ == R_NilValue ? safe[Rf_allocVector](LGLSXP, new_capacity)
+ : safe[Rf_xlengthgets](data_, new_capacity);
+ SEXP old_protect = protect_;
+ protect_ = preserved.insert(data_);
+
+ preserved.release(old_protect);
+
+ data_p_ = reinterpret_cast<r_bool*>(LOGICAL(data_));
+ capacity_ = new_capacity;
+}
+
+template <>
+inline void r_vector<r_bool>::push_back(r_bool value) {
+ while (length_ >= capacity_) {
+ reserve(capacity_ == 0 ? 1 : capacity_ *= 2);
+ }
+ if (is_altrep_) {
+ SET_LOGICAL_ELT(data_, length_, value);
+ } else {
+ data_p_[length_] = value;
+ }
+ ++length_;
+}
+
+typedef r_vector<r_bool> logicals;
+
+} // namespace writable
+} // namespace cpp11
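
`r_bool` (defined in r_bool.hpp, further down) is tri-state, so code iterating a `logicals` vector must treat NA explicitly: it compares equal to neither `TRUE` nor `FALSE`. A minimal sketch, not part of the vendored diff (`count_true` is hypothetical):

```cpp
#include "cpp11/logicals.hpp"

// Hypothetical example: NA elements fail the == TRUE test and are not counted.
int count_true(cpp11::logicals x) {
  int n = 0;
  for (cpp11::r_bool v : x) {
    if (v == TRUE) ++n;
  }
  return n;
}
```
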
diff --git a/src/arrow/r/inst/include/cpp11/matrix.hpp b/src/arrow/r/inst/include/cpp11/matrix.hpp
new file mode 100644
index 000000000..30698c65a
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/matrix.hpp
@@ -0,0 +1,112 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <string> // for string
+
+#include "cpp11/R.hpp" // for SEXP, SEXPREC, R_xlen_t, INT...
+#include "cpp11/r_bool.hpp" // for r_bool
+#include "cpp11/r_string.hpp" // for r_string
+#include "cpp11/r_vector.hpp" // for r_vector
+#include "cpp11/sexp.hpp" // for sexp
+
+namespace cpp11 {
+template <typename V, typename T>
+class matrix {
+ private:
+ V vector_;
+ int nrow_;
+
+ public:
+ class row {
+ private:
+ matrix& parent_;
+ int row_;
+
+ public:
+ row(matrix& parent, R_xlen_t row) : parent_(parent), row_(row) {}
+ T operator[](const int pos) { return parent_.vector_[row_ + (pos * parent_.nrow_)]; }
+
+ class iterator {
+ private:
+ row& row_;
+ int pos_;
+
+ public:
+ iterator(row& row, R_xlen_t pos) : row_(row), pos_(pos) {}
+ iterator begin() const { return iterator(row_, 0); }
+ iterator end() const { return iterator(row_, row_.size()); }
+ inline iterator& operator++() {
+ ++pos_;
+ return *this;
+ }
+ bool operator!=(const iterator& rhs) {
+ return !(pos_ == rhs.pos_ && row_.row_ == rhs.row_.row_);
+ }
+ T operator*() const { return row_[pos_]; };
+ };
+
+ iterator begin() { return iterator(*this, 0); }
+ iterator end() { return iterator(*this, size()); }
+ R_xlen_t size() const { return parent_.vector_.size() / parent_.nrow_; }
+ bool operator!=(const row& rhs) { return row_ != rhs.row_; }
+ row& operator++() {
+ ++row_;
+ return *this;
+ }
+ row& operator*() { return *this; }
+ };
+ friend row;
+
+ public:
+ matrix(SEXP data) : vector_(data), nrow_(INTEGER_ELT(vector_.attr("dim"), 0)) {}
+
+ template <typename V2, typename T2>
+ matrix(const cpp11::matrix<V2, T2>& rhs) : vector_(rhs), nrow_(rhs.nrow()) {}
+
+ matrix(int nrow, int ncol) : vector_(R_xlen_t(nrow * ncol)), nrow_(nrow) {
+ vector_.attr("dim") = {nrow, ncol};
+ }
+
+ int nrow() const { return nrow_; }
+
+ int ncol() const { return size() / nrow_; }
+
+ SEXP data() const { return vector_.data(); }
+
+ R_xlen_t size() const { return vector_.size(); }
+
+ operator SEXP() const { return SEXP(vector_); }
+
+ // operator sexp() { return sexp(vector_); }
+
+ sexp attr(const char* name) const { return SEXP(vector_.attr(name)); }
+
+ sexp attr(const std::string& name) const { return SEXP(vector_.attr(name)); }
+
+ sexp attr(SEXP name) const { return SEXP(vector_.attr(name)); }
+
+ r_vector<r_string> names() const { return SEXP(vector_.names()); }
+
+ row operator[](const int pos) { return {*this, pos}; }
+
+ T operator()(int row, int col) { return vector_[row + (col * nrow_)]; }
+
+ row begin() { return {*this, 0}; }
+ row end() { return {*this, nrow_}; }
+};
+
+using doubles_matrix = matrix<r_vector<double>, double>;
+using integers_matrix = matrix<r_vector<int>, int>;
+using logicals_matrix = matrix<r_vector<r_bool>, r_bool>;
+using strings_matrix = matrix<r_vector<r_string>, r_string>;
+
+namespace writable {
+using doubles_matrix = matrix<r_vector<double>, r_vector<double>::proxy>;
+using integers_matrix = matrix<r_vector<int>, r_vector<int>::proxy>;
+using logicals_matrix = matrix<r_vector<r_bool>, r_vector<r_bool>::proxy>;
+using strings_matrix = matrix<r_vector<r_string>, r_vector<r_string>::proxy>;
+} // namespace writable
+
+// TODO: Add tests for Matrix class
+} // namespace cpp11
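
The matrix class is a thin view over a column-major vector carrying a `dim` attribute, so `operator()(row, col)` indexes `row + col * nrow`. A minimal sketch, not part of the vendored diff (`trace_example` is hypothetical):

```cpp
#include "cpp11/doubles.hpp"
#include "cpp11/matrix.hpp"

// Hypothetical example: sum the diagonal using (row, col) indexing.
double trace_example(cpp11::doubles_matrix m) {
  int k = m.nrow() < m.ncol() ? m.nrow() : m.ncol();
  double tr = 0.0;
  for (int i = 0; i < k; ++i) {
    tr += m(i, i);  // element (i, i) lives at i + i * nrow in the vector
  }
  return tr;
}
```
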
diff --git a/src/arrow/r/inst/include/cpp11/named_arg.hpp b/src/arrow/r/inst/include/cpp11/named_arg.hpp
new file mode 100644
index 000000000..762c8a79d
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/named_arg.hpp
@@ -0,0 +1,50 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <stddef.h> // for size_t
+
+#include <initializer_list> // for initializer_list
+
+#include "cpp11/R.hpp" // for SEXP, SEXPREC, literals
+#include "cpp11/as.hpp" // for as_sexp
+#include "cpp11/sexp.hpp" // for sexp
+
+namespace cpp11 {
+class named_arg {
+ public:
+ explicit named_arg(const char* name) : name_(name), value_(R_NilValue) {}
+ named_arg& operator=(std::initializer_list<int> il) {
+ value_ = as_sexp(il);
+ return *this;
+ }
+
+ template <typename T>
+ named_arg& operator=(T rhs) {
+ value_ = as_sexp(rhs);
+ return *this;
+ }
+
+ template <typename T>
+ named_arg& operator=(std::initializer_list<T> rhs) {
+ value_ = as_sexp(rhs);
+ return *this;
+ }
+
+ const char* name() const { return name_; }
+ SEXP value() const { return value_; }
+
+ private:
+ const char* name_;
+ sexp value_;
+};
+
+namespace literals {
+
+inline named_arg operator"" _nm(const char* name, std::size_t) { return named_arg(name); }
+
+} // namespace literals
+
+using namespace literals;
+
+} // namespace cpp11
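
The `_nm` literal creates a `named_arg` whose `operator=` stores any value convertible through `as_sexp()`; `function::operator()` then attaches the name as the pairlist tag. A minimal sketch, not part of the vendored diff (`warn_quietly` is hypothetical):

```cpp
#include "cpp11/function.hpp"  // pulls in named_arg and the ""_nm literal

using namespace cpp11::literals;

// Hypothetical example: pass a named logical argument to base::warning.
void warn_quietly(const char* msg) {
  cpp11::function warning = cpp11::package("base")["warning"];
  warning(msg, "call."_nm = false);
}
```
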
diff --git a/src/arrow/r/inst/include/cpp11/protect.hpp b/src/arrow/r/inst/include/cpp11/protect.hpp
new file mode 100644
index 000000000..1d1b48bb5
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/protect.hpp
@@ -0,0 +1,372 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <csetjmp> // for longjmp, setjmp, jmp_buf
+#include <exception> // for exception
+#include <stdexcept> // for std::runtime_error
+#include <string> // for string, basic_string
+#include <tuple> // for tuple, make_tuple
+
+// NB: cpp11/R.hpp must precede R_ext/Error.h to ensure R_NO_REMAP is defined
+#include "cpp11/R.hpp" // for SEXP, SEXPREC, CDR, R_NilValue, CAR, R_Pres...
+
+#include "R_ext/Boolean.h" // for Rboolean
+#include "R_ext/Error.h" // for Rf_error, Rf_warning
+#include "R_ext/Print.h" // for REprintf
+#include "R_ext/Utils.h" // for R_CheckUserInterrupt
+#include "Rversion.h" // for R_VERSION, R_Version
+
+#if defined(R_VERSION) && R_VERSION >= R_Version(3, 5, 0)
+#define HAS_UNWIND_PROTECT
+#endif
+
+namespace cpp11 {
+class unwind_exception : public std::exception {
+ public:
+ SEXP token;
+ unwind_exception(SEXP token_) : token(token_) {}
+};
+
+#ifdef HAS_UNWIND_PROTECT
+
+/// Unwind Protection from C longjmp's, like those used in R error handling
+///
+/// @param code The code that needs to be protected, as a nullary callable
+template <typename Fun, typename = typename std::enable_if<std::is_same<
+ decltype(std::declval<Fun&&>()()), SEXP>::value>::type>
+SEXP unwind_protect(Fun&& code) {
+ static SEXP token = [] {
+ SEXP res = R_MakeUnwindCont();
+ R_PreserveObject(res);
+ return res;
+ }();
+
+ std::jmp_buf jmpbuf;
+ if (setjmp(jmpbuf)) {
+ throw unwind_exception(token);
+ }
+
+ SEXP res = R_UnwindProtect(
+ [](void* data) -> SEXP {
+ auto callback = static_cast<decltype(&code)>(data);
+ return static_cast<Fun&&>(*callback)();
+ },
+ &code,
+ [](void* jmpbuf, Rboolean jump) {
+ if (jump == TRUE) {
+ // We need to first jump back into the C++ stacks because you can't safely throw
+ // exceptions from C stack frames.
+ longjmp(*static_cast<std::jmp_buf*>(jmpbuf), 1);
+ }
+ },
+ &jmpbuf, token);
+
+ // R_UnwindProtect adds the result to the CAR of the continuation token,
+// which implicitly protects the result. However, if there is no error and
+// R_UnwindProtect does a normal exit the memory shouldn't be protected, so we
+ // unset it here before returning the value ourselves.
+ SETCAR(token, R_NilValue);
+
+ return res;
+}
+
+template <typename Fun, typename = typename std::enable_if<std::is_same<
+ decltype(std::declval<Fun&&>()()), void>::value>::type>
+void unwind_protect(Fun&& code) {
+ (void)unwind_protect([&] {
+ std::forward<Fun>(code)();
+ return R_NilValue;
+ });
+}
+
+template <typename Fun, typename R = decltype(std::declval<Fun&&>()())>
+typename std::enable_if<!std::is_same<R, SEXP>::value && !std::is_same<R, void>::value,
+ R>::type
+unwind_protect(Fun&& code) {
+ R out;
+ (void)unwind_protect([&] {
+ out = std::forward<Fun>(code)();
+ return R_NilValue;
+ });
+ return out;
+}
+
+#else
+// Don't do anything if we don't have unwind protect. This will leak C++ resources,
+// including those held by cpp11 objects, but the other alternatives are also not great.
+template <typename Fun>
+decltype(std::declval<Fun&&>()()) unwind_protect(Fun&& code) {
+ return std::forward<Fun>(code)();
+}
+#endif
+
+namespace detail {
+
+template <size_t...>
+struct index_sequence {
+ using type = index_sequence;
+};
+
+template <typename, size_t>
+struct appended_sequence;
+
+template <std::size_t... I, std::size_t J>
+struct appended_sequence<index_sequence<I...>, J> : index_sequence<I..., J> {};
+
+template <size_t N>
+struct make_index_sequence
+ : appended_sequence<typename make_index_sequence<N - 1>::type, N - 1> {};
+
+template <>
+struct make_index_sequence<0> : index_sequence<> {};
+
+template <typename F, typename... Aref, size_t... I>
+decltype(std::declval<F&&>()(std::declval<Aref>()...)) apply(
+ F&& f, std::tuple<Aref...>&& a, const index_sequence<I...>&) {
+ return std::forward<F>(f)(std::get<I>(std::move(a))...);
+}
+
+template <typename F, typename... Aref>
+decltype(std::declval<F&&>()(std::declval<Aref>()...)) apply(F&& f,
+ std::tuple<Aref...>&& a) {
+ return apply(std::forward<F>(f), std::move(a), make_index_sequence<sizeof...(Aref)>{});
+}
+
+// overload to silence a compiler warning that the (empty) tuple parameter is set but
+// unused
+template <typename F>
+decltype(std::declval<F&&>()()) apply(F&& f, std::tuple<>&&) {
+ return std::forward<F>(f)();
+}
+
+template <typename F, typename... Aref>
+struct closure {
+ decltype(std::declval<F*>()(std::declval<Aref>()...)) operator()() && {
+ return apply(ptr_, std::move(arefs_));
+ }
+ F* ptr_;
+ std::tuple<Aref...> arefs_;
+};
+
+} // namespace detail
+
+struct protect {
+ template <typename F>
+ struct function {
+ template <typename... A>
+ decltype(std::declval<F*>()(std::declval<A&&>()...)) operator()(A&&... a) const {
+ // workaround to support gcc4.8, which can't capture a parameter pack
+ return unwind_protect(
+ detail::closure<F, A&&...>{ptr_, std::forward_as_tuple(std::forward<A>(a)...)});
+ }
+
+ F* ptr_;
+ };
+
+ /// May not be applied to a function bearing attributes, which interfere with linkage on
+ /// some compilers; use an appropriately attributed alternative. (For example, Rf_error
+ /// bears the [[noreturn]] attribute and must be protected with safe.noreturn rather
+ /// than safe.operator[]).
+ template <typename F>
+ constexpr function<F> operator[](F* raw) const {
+ return {raw};
+ }
+
+ template <typename F>
+ struct noreturn_function {
+ template <typename... A>
+ void operator() [[noreturn]] (A&&... a) const {
+ // workaround to support gcc4.8, which can't capture a parameter pack
+ unwind_protect(
+ detail::closure<F, A&&...>{ptr_, std::forward_as_tuple(std::forward<A>(a)...)});
+ // Compiler hint to allow [[noreturn]] attribute; this is never executed since
+ // the above call will not return.
+ throw std::runtime_error("[[noreturn]]");
+ }
+ F* ptr_;
+ };
+
+ template <typename F>
+ constexpr noreturn_function<F> noreturn(F* raw) const {
+ return {raw};
+ }
+};
+constexpr struct protect safe = {};
+
+inline void check_user_interrupt() { safe[R_CheckUserInterrupt](); }
+
+template <typename... Args>
+void stop [[noreturn]] (const char* fmt, Args... args) {
+ safe.noreturn(Rf_errorcall)(R_NilValue, fmt, args...);
+}
+
+template <typename... Args>
+void stop [[noreturn]] (const std::string& fmt, Args... args) {
+ safe.noreturn(Rf_errorcall)(R_NilValue, fmt.c_str(), args...);
+}
+
+template <typename... Args>
+void warning(const char* fmt, Args... args) {
+ safe[Rf_warningcall](R_NilValue, fmt, args...);
+}
+
+template <typename... Args>
+void warning(const std::string& fmt, Args... args) {
+ safe[Rf_warningcall](R_NilValue, fmt.c_str(), args...);
+}
+
+/// A doubly-linked list of preserved objects, allowing O(1) insertion/release of
+/// objects compared to O(N preserved) with R_PreserveObject.
+static struct {
+ SEXP insert(SEXP obj) {
+ if (obj == R_NilValue) {
+ return R_NilValue;
+ }
+
+#ifdef CPP11_USE_PRESERVE_OBJECT
+ PROTECT(obj);
+ R_PreserveObject(obj);
+ UNPROTECT(1);
+ return obj;
+#endif
+
+ PROTECT(obj);
+
+ SEXP list_ = get_preserve_list();
+
+ // Add a new cell that points to the previous end.
+ SEXP cell = PROTECT(Rf_cons(list_, CDR(list_)));
+
+ SET_TAG(cell, obj);
+
+ SETCDR(list_, cell);
+
+ if (CDR(cell) != R_NilValue) {
+ SETCAR(CDR(cell), cell);
+ }
+
+ UNPROTECT(2);
+
+ return cell;
+ }
+
+ void print() {
+ for (SEXP head = get_preserve_list(); head != R_NilValue; head = CDR(head)) {
+ REprintf("%x CAR: %x CDR: %x TAG: %x\n", head, CAR(head), CDR(head), TAG(head));
+ }
+ REprintf("---\n");
+ }
+
+ // This is currently unused, but client packages could use it to free leaked resources
+ // in older R versions if needed
+ void release_all() {
+#if !defined(CPP11_USE_PRESERVE_OBJECT)
+ SEXP list_ = get_preserve_list();
+ SEXP first = CDR(list_);
+ if (first != R_NilValue) {
+ SETCAR(first, R_NilValue);
+ SETCDR(list_, R_NilValue);
+ }
+#endif
+ }
+
+ void release(SEXP token) {
+ if (token == R_NilValue) {
+ return;
+ }
+
+#ifdef CPP11_USE_PRESERVE_OBJECT
+ R_ReleaseObject(token);
+ return;
+#endif
+
+ SEXP before = CAR(token);
+
+ SEXP after = CDR(token);
+
+ if (before == R_NilValue && after == R_NilValue) {
+ Rf_error("should never happen");
+ }
+
+ SETCDR(before, after);
+
+ if (after != R_NilValue) {
+ SETCAR(after, before);
+ }
+ }
+
+ private:
+ // We deliberately avoid using safe[] in the below code, as this code runs
+ // when the shared library is loaded and will not be wrapped by
+ // `CPP11_UNWIND`, so if an error occurs we will not catch the C++ exception
+ // that safe emits.
+ static void set_option(SEXP name, SEXP value) {
+ static SEXP opt = SYMVALUE(Rf_install(".Options"));
+ SEXP t = opt;
+ while (CDR(t) != R_NilValue) {
+ if (TAG(CDR(t)) == name) {
+ opt = CDR(t);
+ SET_TAG(opt, name);
+ SETCAR(opt, value);
+ return;
+ }
+ t = CDR(t);
+ }
+ SETCDR(t, Rf_allocList(1));
+ opt = CDR(t);
+ SET_TAG(opt, name);
+ SETCAR(opt, value);
+ }
+
+ // The preserved list singleton is stored in a XPtr within an R global option.
+ //
+ // It is not constructed as a static variable directly since many
+ // translation units may be compiled, resulting in unrelated instances of each
+ // static variable.
+ //
+ // We cannot store it in the cpp11 namespace, as cpp11 likely will not be loaded by
+ // packages.
+ // We cannot store it in R's global environment, as that is against CRAN
+ // policies.
+ // We instead store it as an XPtr in the global options, which avoids issues
+ // with both copying and serialization.
+ static SEXP get_preserve_xptr_addr() {
+ static SEXP preserve_xptr_sym = Rf_install("cpp11_preserve_xptr");
+ SEXP preserve_xptr = Rf_GetOption1(preserve_xptr_sym);
+
+ if (TYPEOF(preserve_xptr) != EXTPTRSXP) {
+ return R_NilValue;
+ }
+ auto addr = R_ExternalPtrAddr(preserve_xptr);
+ if (addr == nullptr) {
+ return R_NilValue;
+ }
+ return static_cast<SEXP>(addr);
+ }
+
+ static void set_preserve_xptr(SEXP value) {
+ static SEXP preserve_xptr_sym = Rf_install("cpp11_preserve_xptr");
+
+ SEXP xptr = PROTECT(R_MakeExternalPtr(value, R_NilValue, R_NilValue));
+ set_option(preserve_xptr_sym, xptr);
+ UNPROTECT(1);
+ }
+
+ static SEXP get_preserve_list() {
+ static SEXP preserve_list = R_NilValue;
+
+ if (TYPEOF(preserve_list) != LISTSXP) {
+ preserve_list = get_preserve_xptr_addr();
+ if (TYPEOF(preserve_list) != LISTSXP) {
+ preserve_list = Rf_cons(R_NilValue, R_NilValue);
+ R_PreserveObject(preserve_list);
+ set_preserve_xptr(preserve_list);
+ }
+ }
+
+ return preserve_list;
+ }
+} preserved;
+} // namespace cpp11
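
Two pieces above do most of cpp11's memory-safety work: `safe[]` routes an R API call through `unwind_protect()` so an R error longjmp becomes a catchable C++ exception, and `preserved` is the O(1) protect list. A minimal sketch combining them, not part of the vendored diff (`alloc_filled` is hypothetical):

```cpp
#include "cpp11/protect.hpp"

// Hypothetical example: allocate, protect, fill, and release in O(1).
SEXP alloc_filled(R_xlen_t n, double fill) {
  SEXP out = cpp11::safe[Rf_allocVector](REALSXP, n);  // R error -> unwind_exception
  SEXP token = cpp11::preserved.insert(out);           // O(1) linked-list insert
  double* p = REAL(out);
  for (R_xlen_t i = 0; i < n; ++i) p[i] = fill;
  cpp11::preserved.release(token);                     // O(1) release of the cell
  return out;
}
```
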
diff --git a/src/arrow/r/inst/include/cpp11/r_bool.hpp b/src/arrow/r/inst/include/cpp11/r_bool.hpp
new file mode 100644
index 000000000..e5c8592d7
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/r_bool.hpp
@@ -0,0 +1,76 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <limits> // for numeric_limits
+#include <ostream>
+#include <type_traits> // for is_convertible, enable_if
+
+#include "R_ext/Boolean.h" // for Rboolean
+#include "cpp11/R.hpp" // for SEXP, SEXPREC, ...
+#include "cpp11/as.hpp" // for as_sexp
+#include "cpp11/protect.hpp" // for unwind_protect, preserved
+#include "cpp11/r_vector.hpp"
+#include "cpp11/sexp.hpp" // for sexp
+
+namespace cpp11 {
+
+class r_bool {
+ public:
+ r_bool() = default;
+
+ r_bool(SEXP data) {
+ if (Rf_isLogical(data)) {
+ if (Rf_xlength(data) == 1) {
+ value_ = static_cast<Rboolean>(LOGICAL_ELT(data, 0));
+ return;
+ }
+ }
+ stop("Invalid r_bool value");
+ }
+
+ r_bool(bool value) : value_(value ? TRUE : FALSE) {}
+ r_bool(Rboolean value) : value_(value) {}
+ r_bool(int value) : value_(from_int(value)) {}
+
+ operator bool() const { return value_ == TRUE; }
+ operator int() const { return value_; }
+ operator Rboolean() const { return value_ ? TRUE : FALSE; }
+
+ bool operator==(r_bool rhs) const { return value_ == rhs.value_; }
+ bool operator==(bool rhs) const { return operator==(r_bool(rhs)); }
+ bool operator==(Rboolean rhs) const { return operator==(r_bool(rhs)); }
+ bool operator==(int rhs) const { return operator==(r_bool(rhs)); }
+
+ private:
+ static constexpr int na = std::numeric_limits<int>::min();
+
+ static int from_int(int value) {
+ if (value == static_cast<int>(FALSE)) return FALSE;
+ if (value == static_cast<int>(na)) return na;
+ return TRUE;
+ }
+
+ int value_ = na;
+};
+
+inline std::ostream& operator<<(std::ostream& os, r_bool const& value) {
+ os << ((value == TRUE) ? "TRUE" : "FALSE");
+ return os;
+}
+
+template <typename T, typename R = void>
+using enable_if_r_bool = enable_if_t<std::is_same<T, r_bool>::value, R>;
+
+template <typename T>
+enable_if_r_bool<T, SEXP> as_sexp(T from) {
+ sexp res = Rf_allocVector(LGLSXP, 1);
+ unwind_protect([&] { SET_LOGICAL_ELT(res.data(), 0, from); });
+ return res;
+}
+
+template <>
+inline r_bool na() {
+ return NA_LOGICAL;
+}
+
+} // namespace cpp11
diff --git a/src/arrow/r/inst/include/cpp11/r_string.hpp b/src/arrow/r/inst/include/cpp11/r_string.hpp
new file mode 100644
index 000000000..d62f7270f
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/r_string.hpp
@@ -0,0 +1,98 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <string> // for string, basic_string, operator==
+#include <type_traits> // for is_convertible, enable_if
+
+#include "R_ext/Memory.h" // for vmaxget, vmaxset
+#include "cpp11/R.hpp" // for SEXP, SEXPREC, Rf_mkCharCE, Rf_translat...
+#include "cpp11/as.hpp" // for as_sexp
+#include "cpp11/protect.hpp" // for unwind_protect, protect, protect::function
+#include "cpp11/sexp.hpp" // for sexp
+
+namespace cpp11 {
+
+class r_string {
+ public:
+ r_string() = default;
+ r_string(SEXP data) : data_(data) {}
+ r_string(const char* data) : data_(safe[Rf_mkCharCE](data, CE_UTF8)) {}
+ r_string(const std::string& data)
+ : data_(safe[Rf_mkCharLenCE](data.c_str(), data.size(), CE_UTF8)) {}
+
+ operator SEXP() const { return data_; }
+ operator sexp() const { return data_; }
+ operator std::string() const {
+ std::string res;
+ res.reserve(size());
+
+ void* vmax = vmaxget();
+ unwind_protect([&] { res.assign(Rf_translateCharUTF8(data_)); });
+ vmaxset(vmax);
+
+ return res;
+ }
+
+ bool operator==(const r_string& rhs) const { return data_.data() == rhs.data_.data(); }
+
+ bool operator==(const SEXP rhs) const { return data_.data() == rhs; }
+
+ bool operator==(const char* rhs) const {
+ return static_cast<std::string>(*this) == rhs;
+ }
+
+ bool operator==(const std::string& rhs) const {
+ return static_cast<std::string>(*this) == rhs;
+ }
+
+ R_xlen_t size() const { return Rf_xlength(data_); }
+
+ private:
+ sexp data_ = R_NilValue;
+};
+
+inline SEXP as_sexp(std::initializer_list<r_string> il) {
+ R_xlen_t size = il.size();
+
+ sexp data;
+ unwind_protect([&] {
+ data = Rf_allocVector(STRSXP, size);
+ auto it = il.begin();
+ for (R_xlen_t i = 0; i < size; ++i, ++it) {
+ if (*it == NA_STRING) {
+ SET_STRING_ELT(data, i, *it);
+ } else {
+ SET_STRING_ELT(data, i, Rf_mkCharCE(Rf_translateCharUTF8(*it), CE_UTF8));
+ }
+ }
+ });
+ return data;
+}
+
+template <typename T, typename R = void>
+using enable_if_r_string = enable_if_t<std::is_same<T, cpp11::r_string>::value, R>;
+
+template <typename T>
+enable_if_r_string<T, SEXP> as_sexp(T from) {
+ r_string str(from);
+ sexp res;
+ unwind_protect([&] {
+ res = Rf_allocVector(STRSXP, 1);
+
+ if (str == NA_STRING) {
+ SET_STRING_ELT(res, 0, str);
+ } else {
+ SET_STRING_ELT(res, 0, Rf_mkCharCE(Rf_translateCharUTF8(str), CE_UTF8));
+ }
+ });
+
+ return res;
+}
+
+template <>
+inline r_string na() {
+ return NA_STRING;
+}
+
+} // namespace cpp11
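
`r_string` wraps a single `CHARSXP`; NA is detected by pointer identity against `NA_STRING`, and conversion to `std::string` re-encodes through `Rf_translateCharUTF8`. A minimal sketch, not part of the vendored diff (`first_char` is hypothetical):

```cpp
#include <string>

#include "cpp11/r_string.hpp"

// Hypothetical example: NA check by identity, then UTF-8 conversion.
char first_char(cpp11::r_string s) {
  if (s == NA_STRING || s.size() == 0) {
    return '\0';
  }
  std::string utf8 = s;  // copies out of R memory via Rf_translateCharUTF8
  return utf8[0];
}
```
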
diff --git a/src/arrow/r/inst/include/cpp11/r_vector.hpp b/src/arrow/r/inst/include/cpp11/r_vector.hpp
new file mode 100644
index 000000000..3a3d53b36
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/r_vector.hpp
@@ -0,0 +1,1009 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <stddef.h> // for ptrdiff_t, size_t
+
+#include <algorithm> // for max
+#include <array> // for array
+#include <cstdio> // for snprintf
+#include <exception> // for exception
+#include <initializer_list> // for initializer_list
+#include <iterator> // for forward_iterator_tag, random_ac...
+#include <stdexcept> // for out_of_range
+#include <string> // for string, basic_string
+#include <type_traits> // for decay, is_same, enable_if, is_c...
+#include <utility> // for declval
+
+#include "cpp11/R.hpp" // for R_xlen_t, SEXP, SEXPREC, Rf_xle...
+#include "cpp11/attribute_proxy.hpp" // for attribute_proxy
+#include "cpp11/protect.hpp" // for preserved
+#include "cpp11/r_string.hpp" // for r_string
+#include "cpp11/sexp.hpp" // for sexp
+
+namespace cpp11 {
+
+using namespace cpp11::literals;
+
+class type_error : public std::exception {
+ public:
+ type_error(int expected, int actual) : expected_(expected), actual_(actual) {}
+ virtual const char* what() const noexcept {
+ snprintf(str_, 64, "Invalid input type, expected '%s' actual '%s'",
+ Rf_type2char(expected_), Rf_type2char(actual_));
+ return str_;
+ }
+
+ private:
+ int expected_;
+ int actual_;
+ mutable char str_[64];
+};
+
+// Forward Declarations
+class named_arg;
+
+namespace writable {
+template <typename T>
+class r_vector;
+} // namespace writable
+
+// Declarations
+template <typename T>
+class r_vector {
+ public:
+ typedef ptrdiff_t difference_type;
+ typedef size_t size_type;
+ typedef T value_type;
+ typedef T* pointer;
+ typedef T& reference;
+
+ r_vector() = default;
+
+ r_vector(SEXP data);
+
+ r_vector(SEXP data, bool is_altrep);
+
+#ifdef LONG_VECTOR_SUPPORT
+ T operator[](const int pos) const;
+ T at(const int pos) const;
+#endif
+ T operator[](const R_xlen_t pos) const;
+ T operator[](const size_type pos) const;
+ T operator[](const r_string& name) const;
+
+ T at(const R_xlen_t pos) const;
+ T at(const size_type pos) const;
+ T at(const r_string& name) const;
+
+ bool contains(const r_string& name) const;
+
+ r_vector& operator=(const r_vector& rhs) {
+ SEXP old_protect = protect_;
+
+ data_ = rhs.data_;
+ protect_ = preserved.insert(data_);
+ is_altrep_ = rhs.is_altrep_;
+ data_p_ = rhs.data_p_;
+ length_ = rhs.length_;
+
+ preserved.release(old_protect);
+
+ return *this;
+ };
+
+ r_vector(const r_vector& rhs) {
+ SEXP old_protect = protect_;
+
+ data_ = rhs.data_;
+ protect_ = preserved.insert(data_);
+ is_altrep_ = rhs.is_altrep_;
+ data_p_ = rhs.data_p_;
+ length_ = rhs.length_;
+
+ preserved.release(old_protect);
+ };
+
+ r_vector(const writable::r_vector<T>& rhs) : r_vector(static_cast<SEXP>(rhs)) {}
+
+ bool is_altrep() const;
+
+ bool named() const;
+
+ R_xlen_t size() const;
+
+ operator SEXP() const;
+
+ operator sexp() const;
+
+ bool empty() const;
+
+ /// Provide access to the underlying data, mainly for interface
+ /// compatibility with std::vector
+ SEXP data() const;
+
+ sexp attr(const char* name) const {
+ return SEXP(attribute_proxy<r_vector<T>>(*this, name));
+ }
+
+ sexp attr(const std::string& name) const {
+ return SEXP(attribute_proxy<r_vector<T>>(*this, name.c_str()));
+ }
+
+ sexp attr(SEXP name) const { return SEXP(attribute_proxy<r_vector<T>>(*this, name)); }
+
+ r_vector<r_string> names() const {
+ SEXP nms = SEXP(attribute_proxy<r_vector<T>>(*this, R_NamesSymbol));
+ if (nms == R_NilValue) {
+ return r_vector<r_string>();
+ }
+
+ return nms;
+ }
+
+ class const_iterator {
+ public:
+ using difference_type = ptrdiff_t;
+ using value_type = T;
+ using pointer = T*;
+ using reference = T&;
+ using iterator_category = std::random_access_iterator_tag;
+
+ const_iterator(const r_vector* data, R_xlen_t pos);
+
+ inline const_iterator& operator+(R_xlen_t pos);
+ inline ptrdiff_t operator-(const const_iterator& other) const;
+
+ inline const_iterator& operator++();
+ inline const_iterator& operator--();
+
+ inline const_iterator& operator+=(R_xlen_t pos);
+ inline const_iterator& operator-=(R_xlen_t pos);
+
+ inline bool operator!=(const const_iterator& other) const;
+ inline bool operator==(const const_iterator& other) const;
+
+ inline T operator*() const;
+
+ friend class writable::r_vector<T>::iterator;
+
+ private:
+ const r_vector* data_;
+ void fill_buf(R_xlen_t pos);
+
+ R_xlen_t pos_;
+ std::array<T, 64 * 64> buf_;
+ R_xlen_t block_start_ = 0;
+ R_xlen_t length_ = 0;
+ };
+
+ public:
+ const_iterator begin() const;
+ const_iterator end() const;
+
+ const_iterator cbegin() const;
+ const_iterator cend() const;
+
+ const_iterator find(const r_string& name) const;
+
+ ~r_vector() { preserved.release(protect_); }
+
+ private:
+ SEXP data_ = R_NilValue;
+ SEXP protect_ = R_NilValue;
+ bool is_altrep_ = false;
+ T* data_p_ = nullptr;
+ R_xlen_t length_ = 0;
+
+ static T* get_p(bool is_altrep, SEXP data);
+
+ static SEXP valid_type(SEXP data);
+
+ friend class writable::r_vector<T>;
+};
+
+namespace writable {
+
+template <typename T>
+using has_begin_fun = std::decay<decltype(*begin(std::declval<T>()))>;
+
+/// Read/write access to new or copied r_vectors
+template <typename T>
+class r_vector : public cpp11::r_vector<T> {
+ private:
+ SEXP protect_ = R_NilValue;
+
+ // These are necessary because type names are not directly accessible in
+ // template inheritance
+ using cpp11::r_vector<T>::data_;
+ using cpp11::r_vector<T>::data_p_;
+ using cpp11::r_vector<T>::is_altrep_;
+ using cpp11::r_vector<T>::length_;
+
+ R_xlen_t capacity_ = 0;
+
+ public:
+ class proxy {
+ private:
+ const SEXP data_;
+ const R_xlen_t index_;
+ T* const p_;
+ bool is_altrep_;
+
+ public:
+ proxy(SEXP data, const R_xlen_t index, T* const p, bool is_altrep);
+
+ proxy& operator=(const T& rhs);
+ proxy& operator+=(const T& rhs);
+ proxy& operator-=(const T& rhs);
+ proxy& operator*=(const T& rhs);
+ proxy& operator/=(const T& rhs);
+ proxy& operator++(int);
+ proxy& operator--(int);
+
+ void operator++();
+ void operator--();
+
+ operator T() const;
+ };
+
+ typedef ptrdiff_t difference_type;
+ typedef size_t size_type;
+ typedef proxy value_type;
+ typedef proxy* pointer;
+ typedef proxy& reference;
+
+ class iterator : public cpp11::r_vector<T>::const_iterator {
+ private:
+ const r_vector& data_;
+ using cpp11::r_vector<T>::const_iterator::block_start_;
+ using cpp11::r_vector<T>::const_iterator::pos_;
+ using cpp11::r_vector<T>::const_iterator::buf_;
+ using cpp11::r_vector<T>::const_iterator::length_;
+ using cpp11::r_vector<T>::const_iterator::fill_buf;
+
+ public:
+ using difference_type = ptrdiff_t;
+ using value_type = proxy;
+ using pointer = proxy*;
+ using reference = proxy&;
+ using iterator_category = std::forward_iterator_tag;
+
+ iterator(const r_vector& data, R_xlen_t pos);
+
+ inline iterator& operator++();
+
+ inline proxy operator*() const;
+
+ using cpp11::r_vector<T>::const_iterator::operator!=;
+
+ inline iterator& operator+(R_xlen_t rhs);
+ };
+
+ r_vector() = default;
+ r_vector(const SEXP& data);
+ r_vector(SEXP&& data);
+ r_vector(const SEXP& data, bool is_altrep);
+ r_vector(SEXP&& data, bool is_altrep);
+ r_vector(std::initializer_list<T> il);
+ r_vector(std::initializer_list<named_arg> il);
+ r_vector(std::initializer_list<const char*> il);
+ r_vector(std::initializer_list<std::string> il);
+
+ template <typename Iter>
+ r_vector(Iter first, Iter last);
+
+ template <typename V, typename W = has_begin_fun<V>>
+ r_vector(const V& obj);
+
+ r_vector(const R_xlen_t size);
+
+ ~r_vector();
+
+ r_vector(const r_vector& rhs);
+ r_vector(r_vector&& rhs);
+
+ r_vector(const cpp11::r_vector<T>& rhs);
+
+ r_vector& operator=(const r_vector& rhs);
+ r_vector& operator=(r_vector&& rhs);
+
+#ifdef LONG_VECTOR_SUPPORT
+ proxy operator[](const int pos) const;
+ proxy at(const int pos) const;
+#endif
+ proxy operator[](const R_xlen_t pos) const;
+ proxy operator[](const size_type pos) const;
+ proxy operator[](const r_string& name) const;
+
+ proxy at(const R_xlen_t pos) const;
+ proxy at(const size_type pos) const;
+ proxy at(const r_string& name) const;
+
+ void push_back(T value);
+ void push_back(const named_arg& value);
+ void pop_back();
+
+ void resize(R_xlen_t count);
+
+ void reserve(R_xlen_t new_capacity);
+
+ iterator insert(R_xlen_t pos, T value);
+ iterator erase(R_xlen_t pos);
+
+ void clear();
+
+ iterator begin() const;
+ iterator end() const;
+
+ using cpp11::r_vector<T>::cbegin;
+ using cpp11::r_vector<T>::cend;
+ using cpp11::r_vector<T>::size;
+
+ iterator find(const r_string& name) const;
+
+ attribute_proxy<r_vector<T>> attr(const char* name) const {
+ return attribute_proxy<r_vector<T>>(*this, name);
+ }
+
+ attribute_proxy<r_vector<T>> attr(const std::string& name) const {
+ return attribute_proxy<r_vector<T>>(*this, name.c_str());
+ }
+
+ attribute_proxy<r_vector<T>> attr(SEXP name) const {
+ return attribute_proxy<r_vector<T>>(*this, name);
+ }
+
+ attribute_proxy<r_vector<T>> names() const {
+ return attribute_proxy<r_vector<T>>(*this, R_NamesSymbol);
+ }
+
+ operator SEXP() const;
+};
+} // namespace writable
+
+// Implementations below
+
+template <typename T>
+inline r_vector<T>::r_vector(const SEXP data)
+ : data_(valid_type(data)),
+ protect_(preserved.insert(data)),
+ is_altrep_(ALTREP(data)),
+ data_p_(get_p(ALTREP(data), data)),
+ length_(Rf_xlength(data)) {}
+
+template <typename T>
+inline r_vector<T>::r_vector(const SEXP data, bool is_altrep)
+ : data_(valid_type(data)),
+ protect_(preserved.insert(data)),
+ is_altrep_(is_altrep),
+ data_p_(get_p(is_altrep, data)),
+ length_(Rf_xlength(data)) {}
+
+template <typename T>
+inline bool r_vector<T>::is_altrep() const {
+ return is_altrep_;
+}
+
+template <typename T>
+inline bool r_vector<T>::named() const {
+ return ((this->names()) != R_NilValue);
+}
+
+template <typename T>
+inline R_xlen_t r_vector<T>::size() const {
+ return length_;
+}
+
+template <typename T>
+inline r_vector<T>::operator SEXP() const {
+ return data_;
+}
+
+template <typename T>
+inline bool r_vector<T>::empty() const {
+ return (!(this->size() > 0));
+}
+
+template <typename T>
+inline r_vector<T>::operator sexp() const {
+ return data_;
+}
+
+/// Provide access to the underlying data, mainly for interface
+/// compatibility with std::vector
+template <typename T>
+inline SEXP r_vector<T>::data() const {
+ return data_;
+}
+
+template <typename T>
+inline typename r_vector<T>::const_iterator r_vector<T>::begin() const {
+ return const_iterator(this, 0);
+}
+
+template <typename T>
+inline typename r_vector<T>::const_iterator r_vector<T>::end() const {
+ return const_iterator(this, length_);
+}
+
+template <typename T>
+inline typename r_vector<T>::const_iterator r_vector<T>::cbegin() const {
+ return const_iterator(this, 0);
+}
+
+template <typename T>
+inline typename r_vector<T>::const_iterator r_vector<T>::cend() const {
+ return const_iterator(this, length_);
+}
+
+template <typename T>
+r_vector<T>::const_iterator::const_iterator(const r_vector* data, R_xlen_t pos)
+ : data_(data), pos_(pos), buf_() {
+ if (data_->is_altrep()) {
+ fill_buf(pos);
+ }
+}
+
+template <typename T>
+inline typename r_vector<T>::const_iterator& r_vector<T>::const_iterator::operator++() {
+ ++pos_;
+ if (data_->is_altrep() && pos_ >= block_start_ + length_) {
+ fill_buf(pos_);
+ }
+ return *this;
+}
+
+template <typename T>
+inline typename r_vector<T>::const_iterator& r_vector<T>::const_iterator::operator--() {
+ --pos_;
+ if (data_->is_altrep() && pos_ > 0 && pos_ < block_start_) {
+ fill_buf(std::max(0_xl, pos_ - 64));
+ }
+ return *this;
+}
+
+template <typename T>
+inline typename r_vector<T>::const_iterator& r_vector<T>::const_iterator::operator+=(
+ R_xlen_t i) {
+ pos_ += i;
+ if (data_->is_altrep() && pos_ >= block_start_ + length_) {
+ fill_buf(pos_);
+ }
+ return *this;
+}
+
+template <typename T>
+inline typename r_vector<T>::const_iterator& r_vector<T>::const_iterator::operator-=(
+ R_xlen_t i) {
+ pos_ -= i;
+ if (data_->is_altrep() && pos_ >= block_start_ + length_) {
+ fill_buf(std::max(0_xl, pos_ - 64));
+ }
+ return *this;
+}
+
+template <typename T>
+inline bool r_vector<T>::const_iterator::operator!=(
+ const r_vector<T>::const_iterator& other) const {
+ return pos_ != other.pos_;
+}
+
+template <typename T>
+inline bool r_vector<T>::const_iterator::operator==(
+ const r_vector<T>::const_iterator& other) const {
+ return pos_ == other.pos_;
+}
+
+template <typename T>
+inline ptrdiff_t r_vector<T>::const_iterator::operator-(
+ const r_vector<T>::const_iterator& other) const {
+ return pos_ - other.pos_;
+}
+
+template <typename T>
+inline typename r_vector<T>::const_iterator& r_vector<T>::const_iterator::operator+(
+ R_xlen_t rhs) {
+ pos_ += rhs;
+ if (data_->is_altrep() && pos_ >= block_start_ + length_) {
+ fill_buf(pos_);
+ }
+ return *this;
+}
+
+template <typename T>
+inline T cpp11::r_vector<T>::at(R_xlen_t pos) const {
+ if (pos < 0 || pos >= length_) {
+ throw std::out_of_range("r_vector");
+ }
+
+ return operator[](pos);
+}
+
+template <typename T>
+inline T cpp11::r_vector<T>::at(size_type pos) const {
+ return at(static_cast<R_xlen_t>(pos));
+}
+
+template <typename T>
+inline T cpp11::r_vector<T>::operator[](const r_string& name) const {
+ SEXP names = this->names();
+ R_xlen_t size = Rf_xlength(names);
+
+ for (R_xlen_t pos = 0; pos < size; ++pos) {
+ auto cur = Rf_translateCharUTF8(STRING_ELT(names, pos));
+ if (name == cur) {
+ return operator[](pos);
+ }
+ }
+
+ throw std::out_of_range("r_vector");
+}
+
+template <typename T>
+inline bool cpp11::r_vector<T>::contains(const r_string& name) const {
+ SEXP names = this->names();
+ R_xlen_t size = Rf_xlength(names);
+
+ for (R_xlen_t pos = 0; pos < size; ++pos) {
+ auto cur = Rf_translateCharUTF8(STRING_ELT(names, pos));
+ if (name == cur) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+template <typename T>
+inline typename cpp11::r_vector<T>::const_iterator cpp11::r_vector<T>::find(
+ const r_string& name) const {
+ SEXP names = this->names();
+ R_xlen_t size = Rf_xlength(names);
+
+ for (R_xlen_t pos = 0; pos < size; ++pos) {
+ auto cur = Rf_translateCharUTF8(STRING_ELT(names, pos));
+ if (name == cur) {
+ return begin() + pos;
+ }
+ }
+
+ return end();
+}
+
+template <typename T>
+inline T r_vector<T>::const_iterator::operator*() const {
+ if (data_->is_altrep()) {
+ return buf_[pos_ - block_start_];
+ } else {
+ return data_->data_p_[pos_];
+ }
+}
+
+#ifdef LONG_VECTOR_SUPPORT
+template <typename T>
+inline T r_vector<T>::operator[](const int pos) const {
+ return operator[](static_cast<R_xlen_t>(pos));
+}
+
+template <typename T>
+inline T r_vector<T>::at(const int pos) const {
+ return at(static_cast<R_xlen_t>(pos));
+}
+#endif
+
+template <typename T>
+inline T r_vector<T>::operator[](size_type pos) const {
+ return operator[](static_cast<R_xlen_t>(pos));
+}
+
+namespace writable {
+
+template <typename T>
+r_vector<T>::proxy::proxy(SEXP data, const R_xlen_t index, T* const p, bool is_altrep)
+ : data_(data), index_(index), p_(p), is_altrep_(is_altrep) {}
+
+template <typename T>
+inline typename r_vector<T>::proxy r_vector<T>::iterator::operator*() const {
+ if (data_.is_altrep()) {
+ return proxy(data_.data(), pos_, const_cast<T*>(&buf_[pos_ - block_start_]), true);
+ } else {
+ return proxy(data_.data(), pos_,
+ data_.data_p_ != nullptr ? &data_.data_p_[pos_] : nullptr, false);
+ }
+}
+
+template <typename T>
+r_vector<T>::iterator::iterator(const r_vector& data, R_xlen_t pos)
+ : r_vector<T>::const_iterator(&data, pos), data_(data) {}
+
+template <typename T>
+inline typename r_vector<T>::iterator& r_vector<T>::iterator::operator++() {
+ ++pos_;
+ if (data_.is_altrep() && pos_ >= block_start_ + length_) {
+ fill_buf(pos_);
+ }
+ return *this;
+}
+
+template <typename T>
+inline typename r_vector<T>::iterator& r_vector<T>::iterator::operator+(R_xlen_t rhs) {
+ pos_ += rhs;
+ if (data_.is_altrep() && pos_ >= block_start_ + length_) {
+ fill_buf(pos_);
+ }
+ return *this;
+}
+
+template <typename T>
+inline typename r_vector<T>::iterator r_vector<T>::begin() const {
+ return iterator(*this, 0);
+}
+
+template <typename T>
+inline typename r_vector<T>::iterator r_vector<T>::end() const {
+ return iterator(*this, length_);
+}
+
+template <typename T>
+inline r_vector<T>::r_vector(const SEXP& data)
+ : cpp11::r_vector<T>(safe[Rf_shallow_duplicate](data)),
+ protect_(preserved.insert(data_)),
+ capacity_(length_) {}
+
+template <typename T>
+inline r_vector<T>::r_vector(const SEXP& data, bool is_altrep)
+ : cpp11::r_vector<T>(safe[Rf_shallow_duplicate](data), is_altrep),
+ protect_(preserved.insert(data_)),
+ capacity_(length_) {}
+
+template <typename T>
+inline r_vector<T>::r_vector(SEXP&& data)
+ : cpp11::r_vector<T>(data), protect_(preserved.insert(data_)), capacity_(length_) {}
+
+template <typename T>
+inline r_vector<T>::r_vector(SEXP&& data, bool is_altrep)
+ : cpp11::r_vector<T>(data, is_altrep),
+ protect_(preserved.insert(data_)),
+ capacity_(length_) {}
+
+template <typename T>
+template <typename Iter>
+inline r_vector<T>::r_vector(Iter first, Iter last) : r_vector() {
+ reserve(last - first);
+ while (first != last) {
+ push_back(*first);
+ ++first;
+ }
+}
+
+template <typename T>
+template <typename V, typename W>
+inline r_vector<T>::r_vector(const V& obj) : r_vector() {
+ auto first = obj.begin();
+ auto last = obj.end();
+ reserve(last - first);
+ while (first != last) {
+ push_back(*first);
+ ++first;
+ }
+}
+
+template <typename T>
+inline r_vector<T>::r_vector(R_xlen_t size) : r_vector() {
+ resize(size);
+}
+
+template <typename T>
+inline r_vector<T>::~r_vector() {
+ preserved.release(protect_);
+}
+
+#ifdef LONG_VECTOR_SUPPORT
+template <typename T>
+inline typename r_vector<T>::proxy r_vector<T>::operator[](const int pos) const {
+ return operator[](static_cast<R_xlen_t>(pos));
+}
+
+template <typename T>
+inline typename r_vector<T>::proxy r_vector<T>::at(const int pos) const {
+ return at(static_cast<R_xlen_t>(pos));
+}
+#endif
+
+template <typename T>
+inline typename r_vector<T>::proxy r_vector<T>::operator[](const R_xlen_t pos) const {
+ if (is_altrep_) {
+ return {data_, pos, nullptr, true};
+ }
+ return {data_, pos, data_p_ != nullptr ? &data_p_[pos] : nullptr, false};
+}
+
+template <typename T>
+inline typename r_vector<T>::proxy r_vector<T>::operator[](size_type pos) const {
+ return operator[](static_cast<R_xlen_t>(pos));
+}
+
+template <typename T>
+inline typename r_vector<T>::proxy r_vector<T>::at(const R_xlen_t pos) const {
+ if (pos < 0 || pos >= length_) {
+ throw std::out_of_range("r_vector");
+ }
+ return operator[](static_cast<R_xlen_t>(pos));
+}
+
+template <typename T>
+inline typename r_vector<T>::proxy r_vector<T>::at(size_type pos) const {
+ return at(static_cast<R_xlen_t>(pos));
+}
+
+template <typename T>
+inline typename r_vector<T>::proxy r_vector<T>::operator[](const r_string& name) const {
+ SEXP names = PROTECT(this->names());
+ R_xlen_t size = Rf_xlength(names);
+
+ for (R_xlen_t pos = 0; pos < size; ++pos) {
+ auto cur = Rf_translateCharUTF8(STRING_ELT(names, pos));
+ if (name == cur) {
+ UNPROTECT(1);
+ return operator[](pos);
+ }
+ }
+
+ UNPROTECT(1);
+ throw std::out_of_range("r_vector");
+}
+
+template <typename T>
+inline typename r_vector<T>::proxy r_vector<T>::at(const r_string& name) const {
+ return operator[](name);
+}
+
+template <typename T>
+inline typename r_vector<T>::iterator r_vector<T>::find(const r_string& name) const {
+ SEXP names = PROTECT(this->names());
+ R_xlen_t size = Rf_xlength(names);
+
+ for (R_xlen_t pos = 0; pos < size; ++pos) {
+ auto cur = Rf_translateCharUTF8(STRING_ELT(names, pos));
+ if (name == cur) {
+ UNPROTECT(1);
+ return begin() + pos;
+ }
+ }
+
+ UNPROTECT(1);
+ return end();
+}
+
+template <typename T>
+inline r_vector<T>::r_vector(const r_vector<T>& rhs)
+ : cpp11::r_vector<T>(safe[Rf_shallow_duplicate](rhs)),
+ protect_(preserved.insert(data_)),
+ capacity_(rhs.capacity_) {}
+
+template <typename T>
+inline r_vector<T>::r_vector(r_vector<T>&& rhs)
+ : cpp11::r_vector<T>(rhs), protect_(rhs.protect_), capacity_(rhs.capacity_) {
+ rhs.data_ = R_NilValue;
+ rhs.protect_ = R_NilValue;
+}
+
+template <typename T>
+inline r_vector<T>::r_vector(const cpp11::r_vector<T>& rhs)
+ : cpp11::r_vector<T>(safe[Rf_shallow_duplicate](rhs)),
+ protect_(preserved.insert(data_)),
+ capacity_(rhs.length_) {}
+
+// We don't release the old object until the end in case we throw an exception
+// during the duplicate.
+template <typename T>
+inline r_vector<T>& r_vector<T>::operator=(const r_vector<T>& rhs) {
+ if (data_ == rhs.data_) {
+ return *this;
+ }
+
+ cpp11::r_vector<T>::operator=(rhs);
+
+ auto old_protect = protect_;
+
+ data_ = safe[Rf_shallow_duplicate](rhs.data_);
+ protect_ = preserved.insert(data_);
+
+ preserved.release(old_protect);
+
+ capacity_ = rhs.capacity_;
+
+ return *this;
+}
+
+template <typename T>
+inline r_vector<T>& r_vector<T>::operator=(r_vector<T>&& rhs) {
+ if (data_ == rhs.data_) {
+ return *this;
+ }
+
+ cpp11::r_vector<T>::operator=(rhs);
+
+ SEXP old_protect = protect_;
+
+ data_ = rhs.data_;
+ protect_ = preserved.insert(data_);
+
+ preserved.release(old_protect);
+
+ capacity_ = rhs.capacity_;
+
+ rhs.data_ = R_NilValue;
+ rhs.protect_ = R_NilValue;
+
+ return *this;
+}
+
+template <typename T>
+inline void r_vector<T>::pop_back() {
+ --length_;
+}
+
+template <typename T>
+inline void r_vector<T>::resize(R_xlen_t count) {
+ reserve(count);
+ length_ = count;
+}
+
+template <typename T>
+inline typename r_vector<T>::iterator r_vector<T>::insert(R_xlen_t pos, T value) {
+ push_back(value);
+
+ R_xlen_t i = length_ - 1;
+ while (i > pos) {
+ operator[](i) = (T) operator[](i - 1);
+ --i;
+ };
+ operator[](pos) = value;
+
+ return begin() + pos;
+}
+
+template <typename T>
+inline typename r_vector<T>::iterator r_vector<T>::erase(R_xlen_t pos) {
+ R_xlen_t i = pos;
+ while (i < length_ - 1) {
+ operator[](i) = (T) operator[](i + 1);
+ ++i;
+ }
+ pop_back();
+
+ return begin() + pos;
+}
+
+template <typename T>
+inline void r_vector<T>::clear() {
+ length_ = 0;
+}
+
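+// Conversion back to SEXP: if fewer elements were written than were reserved,
+// expose only `length_` elements -- on R >= 3.4 by shrinking in place and
+// setting the growable bit, on older R by copying to the exact length.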
+template <typename T>
+inline r_vector<T>::operator SEXP() const {
+ if (length_ < capacity_) {
+#if R_VERSION >= R_Version(3, 4, 0)
+ SETLENGTH(data_, length_);
+ SET_TRUELENGTH(data_, capacity_);
+ SET_GROWABLE_BIT(data_);
+#else
+ auto* p = const_cast<r_vector<T>*>(this);
+ p->data_ = safe[Rf_lengthgets](data_, length_);
+#endif
+ }
+ return data_;
+}
+
+template <typename T>
+inline typename r_vector<T>::proxy& r_vector<T>::proxy::operator+=(const T& rhs) {
+ operator=(static_cast<T>(*this) + rhs);
+ return *this;
+}
+
+template <typename T>
+inline typename r_vector<T>::proxy& r_vector<T>::proxy::operator-=(const T& rhs) {
+ operator=(static_cast<T>(*this) - rhs);
+ return *this;
+}
+
+template <typename T>
+inline typename r_vector<T>::proxy& r_vector<T>::proxy::operator*=(const T& rhs) {
+ operator=(static_cast<T>(*this) * rhs);
+ return *this;
+}
+
+template <typename T>
+inline typename r_vector<T>::proxy& r_vector<T>::proxy::operator/=(const T& rhs) {
+ operator=(static_cast<T>(*this) / rhs);
+ return *this;
+}
+
+template <typename T>
+inline typename r_vector<T>::proxy& r_vector<T>::proxy::operator++(int) {
+ operator=(static_cast<T>(*this) + 1);
+ return *this;
+}
+
+template <typename T>
+inline typename r_vector<T>::proxy& r_vector<T>::proxy::operator--(int) {
+ operator=(static_cast<T>(*this) - 1);
+ return *this;
+}
+
+template <typename T>
+inline void r_vector<T>::proxy::operator--() {
+ operator=(static_cast<T>(*this) - 1);
+}
+
+template <typename T>
+inline void r_vector<T>::proxy::operator++() {
+ operator=(static_cast<T>(*this) + 1);
+}
+
+} // namespace writable
+
+// TODO: is there a better condition we could use, e.g. assert something true
+// rather than three things false?
+template <typename C, typename T>
+using is_container_but_not_sexp_or_string = typename std::enable_if<
+ !std::is_constructible<C, SEXP>::value &&
+ !std::is_same<typename std::decay<C>::type, std::string>::value &&
+ !std::is_same<typename std::decay<T>::type, std::string>::value,
+ typename std::decay<C>::type>::type;
+
+template <typename C, typename T = typename std::decay<C>::type::value_type>
+// typename T = typename C::value_type>
+is_container_but_not_sexp_or_string<C, T> as_cpp(SEXP from) {
+ auto obj = cpp11::r_vector<T>(from);
+ return {obj.begin(), obj.end()};
+}
+
+// TODO: could we make this generalize outside of std::string?
+template <typename C, typename T = C>
+using is_vector_of_strings = typename std::enable_if<
+ std::is_same<typename std::decay<T>::type, std::string>::value,
+ typename std::decay<C>::type>::type;
+
+template <typename C, typename T = typename std::decay<C>::type::value_type>
+// typename T = typename C::value_type>
+is_vector_of_strings<C, T> as_cpp(SEXP from) {
+ auto obj = cpp11::r_vector<cpp11::r_string>(from);
+ typename std::decay<C>::type res;
+ auto it = obj.begin();
+ while (it != obj.end()) {
+ r_string s = *it;
+ res.emplace_back(static_cast<std::string>(s));
+ ++it;
+ }
+ return res;
+}
+
+template <typename T>
+bool operator==(const r_vector<T>& lhs, const r_vector<T>& rhs) {
+ if (lhs.size() != rhs.size()) {
+ return false;
+ }
+
+ auto lhs_it = lhs.begin();
+ auto rhs_it = rhs.begin();
+
+ auto end = lhs.end();
+ while (lhs_it != end) {
+ if (!(*lhs_it == *rhs_it)) {
+ return false;
+ }
+ ++lhs_it;
+ ++rhs_it;
+ }
+ return true;
+}
+
+template <typename T>
+bool operator!=(const r_vector<T>& lhs, const r_vector<T>& rhs) {
+ return !(lhs == rhs);
+}
+
+} // namespace cpp11
diff --git a/src/arrow/r/inst/include/cpp11/raws.hpp b/src/arrow/r/inst/include/cpp11/raws.hpp
new file mode 100644
index 000000000..ef1ab304d
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/raws.hpp
@@ -0,0 +1,148 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <algorithm> // for min
+#include <array> // for array
+#include <cstdint> // for uint8_t
+#include <initializer_list> // for initializer_list
+
+#include "cpp11/R.hpp" // for RAW, SEXP, SEXPREC, Rf_allocVector
+#include "cpp11/attribute_proxy.hpp" // for attribute_proxy
+#include "cpp11/named_arg.hpp" // for named_arg
+#include "cpp11/protect.hpp" // for preserved
+#include "cpp11/r_vector.hpp" // for r_vector, r_vector<>::proxy
+#include "cpp11/sexp.hpp" // for sexp
+
+// Specializations for raws
+
+namespace cpp11 {
+
+template <>
+inline SEXP r_vector<uint8_t>::valid_type(SEXP data) {
+ if (TYPEOF(data) != RAWSXP) {
+ throw type_error(RAWSXP, TYPEOF(data));
+ }
+ return data;
+}
+
+template <>
+inline uint8_t r_vector<uint8_t>::operator[](const R_xlen_t pos) const {
+ // NOPROTECT: likely too costly to unwind protect every elt
+ return is_altrep_ ? RAW_ELT(data_, pos) : data_p_[pos];
+}
+
+template <>
+inline uint8_t* r_vector<uint8_t>::get_p(bool is_altrep, SEXP data) {
+ if (is_altrep) {
+ return nullptr;
+ } else {
+ return reinterpret_cast<uint8_t*>(RAW(data));
+ }
+}
+
+template <>
+inline void r_vector<uint8_t>::const_iterator::fill_buf(R_xlen_t pos) {
+ using namespace cpp11::literals;
+ length_ = std::min(64_xl, data_->size() - pos);
+ unwind_protect(
+ [&] { RAW_GET_REGION(data_->data_, pos, length_, (uint8_t*)buf_.data()); });
+ block_start_ = pos;
+}
+
+typedef r_vector<uint8_t> raws;
+
+namespace writable {
+
+template <>
+inline typename r_vector<uint8_t>::proxy& r_vector<uint8_t>::proxy::operator=(
+ const uint8_t& rhs) {
+ if (is_altrep_) {
+ // NOPROTECT: likely too costly to unwind protect every set elt
+ RAW(data_)[index_] = rhs;
+ } else {
+ *p_ = rhs;
+ }
+ return *this;
+}
+
+template <>
+inline r_vector<uint8_t>::proxy::operator uint8_t() const {
+ if (p_ == nullptr) {
+ // NOPROTECT: likely too costly to unwind protect every elt
+ return RAW(data_)[index_];
+ } else {
+ return *p_;
+ }
+}
+
+template <>
+inline r_vector<uint8_t>::r_vector(std::initializer_list<uint8_t> il)
+ : cpp11::r_vector<uint8_t>(safe[Rf_allocVector](RAWSXP, il.size())),
+ capacity_(il.size()) {
+ protect_ = preserved.insert(data_);
+ auto it = il.begin();
+ for (R_xlen_t i = 0; i < capacity_; ++i, ++it) {
+ data_p_[i] = *it;
+ }
+}
+
+template <>
+inline r_vector<uint8_t>::r_vector(std::initializer_list<named_arg> il)
+ : cpp11::r_vector<uint8_t>(safe[Rf_allocVector](RAWSXP, il.size())),
+ capacity_(il.size()) {
+ protect_ = preserved.insert(data_);
+ int n_protected = 0;
+
+ try {
+ unwind_protect([&] {
+ Rf_setAttrib(data_, R_NamesSymbol, Rf_allocVector(STRSXP, capacity_));
+ SEXP names = PROTECT(Rf_getAttrib(data_, R_NamesSymbol));
+ ++n_protected;
+
+ auto it = il.begin();
+ for (R_xlen_t i = 0; i < capacity_; ++i, ++it) {
+ data_p_[i] = RAW_ELT(it->value(), 0);
+ SET_STRING_ELT(names, i, Rf_mkCharCE(it->name(), CE_UTF8));
+ }
+ UNPROTECT(n_protected);
+ });
+ } catch (const unwind_exception& e) {
+ preserved.release(protect_);
+ UNPROTECT(n_protected);
+ throw e;
+ }
+}
+
+template <>
+inline void r_vector<uint8_t>::reserve(R_xlen_t new_capacity) {
+ data_ = data_ == R_NilValue ? safe[Rf_allocVector](RAWSXP, new_capacity)
+ : safe[Rf_xlengthgets](data_, new_capacity);
+
+ SEXP old_protect = protect_;
+ protect_ = preserved.insert(data_);
+ preserved.release(old_protect);
+
+ data_p_ = reinterpret_cast<uint8_t*>(RAW(data_));
+ capacity_ = new_capacity;
+}
+
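+// Amortized O(1) append: the backing RAWSXP doubles in capacity whenever it
+// fills, so n calls to push_back trigger only O(log n) reallocations.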
+template <>
+inline void r_vector<uint8_t>::push_back(uint8_t value) {
+ while (length_ >= capacity_) {
+ reserve(capacity_ == 0 ? 1 : capacity_ *= 2);
+ }
+ if (is_altrep_) {
+ // NOPROTECT: likely too costly to unwind protect every elt
+ RAW(data_)[length_] = value;
+ } else {
+ data_p_[length_] = value;
+ }
+ ++length_;
+}
+
+typedef r_vector<uint8_t> raws;
+
+} // namespace writable
+
+} // namespace cpp11
diff --git a/src/arrow/r/inst/include/cpp11/sexp.hpp b/src/arrow/r/inst/include/cpp11/sexp.hpp
new file mode 100644
index 000000000..0a5edccb4
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/sexp.hpp
@@ -0,0 +1,85 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <stddef.h> // for size_t
+
+#include <string> // for string, basic_string
+
+#include "cpp11/R.hpp" // for SEXP, SEXPREC, REAL_ELT, R_NilV...
+#include "cpp11/attribute_proxy.hpp" // for attribute_proxy
+#include "cpp11/protect.hpp" // for preserved
+
+namespace cpp11 {
+
+/// A thin wrapper around a SEXP that preserves it from garbage collection
+class sexp {
+ private:
+ SEXP data_ = R_NilValue;
+ SEXP preserve_token_ = R_NilValue;
+
+ public:
+ sexp() = default;
+
+ sexp(SEXP data) : data_(data), preserve_token_(preserved.insert(data_)) {
+ // REprintf("created %x %x : %i\n", data_, preserve_token_, protect_head_size());
+ }
+
+ sexp(const sexp& rhs) {
+ data_ = rhs.data_;
+ preserve_token_ = preserved.insert(data_);
+ // REprintf("copied %x new protect %x : %i\n", rhs.data_, preserve_token_,
+ // protect_head_size());
+ }
+
+ sexp(sexp&& rhs) {
+ data_ = rhs.data_;
+ preserve_token_ = rhs.preserve_token_;
+
+ rhs.data_ = R_NilValue;
+ rhs.preserve_token_ = R_NilValue;
+
+ // REprintf("moved %x : %i\n", rhs.data_, protect_head_size());
+ }
+
+ sexp& operator=(const sexp& rhs) {
+ preserved.release(preserve_token_);
+
+ data_ = rhs.data_;
+ preserve_token_ = preserved.insert(data_);
+ // REprintf("assigned %x : %i\n", rhs.data_, protect_head_size());
+ return *this;
+ }
+
+ // void swap(sexp& rhs) {
+ // sexp tmp(rhs);
+ // rhs = *this;
+ //*this = tmp;
+ //}
+
+ ~sexp() { preserved.release(preserve_token_); }
+
+ attribute_proxy<sexp> attr(const char* name) const {
+ return attribute_proxy<sexp>(*this, name);
+ }
+
+ attribute_proxy<sexp> attr(const std::string& name) const {
+ return attribute_proxy<sexp>(*this, name.c_str());
+ }
+
+ attribute_proxy<sexp> attr(SEXP name) const {
+ return attribute_proxy<sexp>(*this, name);
+ }
+
+ attribute_proxy<sexp> names() const {
+ return attribute_proxy<sexp>(*this, R_NamesSymbol);
+ }
+
+ operator SEXP() const { return data_; }
+ operator double() const { return REAL_ELT(data_, 0); }
+ operator size_t() const { return REAL_ELT(data_, 0); }
+ operator bool() const { return LOGICAL_ELT(data_, 0); }
+ SEXP data() const { return data_; }
+};
+
+} // namespace cpp11
diff --git a/src/arrow/r/inst/include/cpp11/strings.hpp b/src/arrow/r/inst/include/cpp11/strings.hpp
new file mode 100644
index 000000000..adca2a174
--- /dev/null
+++ b/src/arrow/r/inst/include/cpp11/strings.hpp
@@ -0,0 +1,187 @@
+// cpp11 version: 0.3.1.1
+// vendored on: 2021-08-11
+#pragma once
+
+#include <initializer_list> // for initializer_list
+#include <string> // for string, basic_string
+
+#include "cpp11/R.hpp" // for SEXP, TYPEOF, SEXPREC, SET_STRI...
+#include "cpp11/as.hpp" // for as_sexp
+#include "cpp11/attribute_proxy.hpp" // for attribute_proxy
+#include "cpp11/named_arg.hpp" // for named_arg
+#include "cpp11/protect.hpp" // for preserved
+#include "cpp11/r_string.hpp" // for r_string
+#include "cpp11/r_vector.hpp" // for r_vector, r_vector<>::proxy
+#include "cpp11/sexp.hpp" // for sexp
+
+// Specializations for strings
+
+namespace cpp11 {
+
+template <>
+inline SEXP r_vector<r_string>::valid_type(SEXP data) {
+ if (TYPEOF(data) != STRSXP) {
+ throw type_error(STRSXP, TYPEOF(data));
+ }
+ return data;
+}
+
+template <>
+inline r_string r_vector<r_string>::operator[](const R_xlen_t pos) const {
+ // NOPROTECT: likely too costly to unwind protect every elt
+ return STRING_ELT(data_, pos);
+}
+
+template <>
+inline r_string* r_vector<r_string>::get_p(bool, SEXP) {
+ return nullptr;
+}
+
+template <>
+inline void r_vector<r_string>::const_iterator::fill_buf(R_xlen_t) {
+ return;
+}
+
+template <>
+inline r_string r_vector<r_string>::const_iterator::operator*() const {
+ return STRING_ELT(data_->data(), pos_);
+}
+
+typedef r_vector<r_string> strings;
+
+namespace writable {
+
+template <>
+inline typename r_vector<r_string>::proxy& r_vector<r_string>::proxy::operator=(
+ const r_string& rhs) {
+ unwind_protect([&] { SET_STRING_ELT(data_, index_, rhs); });
+ return *this;
+}
+
+template <>
+inline r_vector<r_string>::proxy::operator r_string() const {
+ // NOPROTECT: likely too costly to unwind protect every elt
+ return STRING_ELT(data_, index_);
+}
+
+inline bool operator==(const r_vector<r_string>::proxy& lhs, r_string rhs) {
+ return static_cast<r_string>(lhs).operator==(static_cast<std::string>(rhs).c_str());
+}
+
+inline SEXP alloc_or_copy(const SEXP data) {
+ switch (TYPEOF(data)) {
+ case CHARSXP:
+ return cpp11::r_vector<r_string>(safe[Rf_allocVector](STRSXP, 1));
+ case STRSXP:
+ return safe[Rf_shallow_duplicate](data);
+ default:
+ throw type_error(STRSXP, TYPEOF(data));
+ }
+}
+
+inline SEXP alloc_if_charsxp(const SEXP data) {
+ switch (TYPEOF(data)) {
+ case CHARSXP:
+ return cpp11::r_vector<r_string>(safe[Rf_allocVector](STRSXP, 1));
+ case STRSXP:
+ return data;
+ default:
+ throw type_error(STRSXP, TYPEOF(data));
+ }
+}
+
+template <>
+inline r_vector<r_string>::r_vector(const SEXP& data)
+ : cpp11::r_vector<r_string>(alloc_or_copy(data)),
+ protect_(preserved.insert(data_)),
+ capacity_(length_) {
+ if (TYPEOF(data) == CHARSXP) {
+ SET_STRING_ELT(data_, 0, data);
+ }
+}
+
+template <>
+inline r_vector<r_string>::r_vector(SEXP&& data)
+ : cpp11::r_vector<r_string>(alloc_if_charsxp(data)),
+ protect_(preserved.insert(data_)),
+ capacity_(length_) {
+ if (TYPEOF(data) == CHARSXP) {
+ SET_STRING_ELT(data_, 0, data);
+ }
+}
+
+template <>
+inline r_vector<r_string>::r_vector(std::initializer_list<r_string> il)
+ : cpp11::r_vector<r_string>(as_sexp(il)), capacity_(il.size()) {}
+
+template <>
+inline r_vector<r_string>::r_vector(std::initializer_list<const char*> il)
+ : cpp11::r_vector<r_string>(as_sexp(il)), capacity_(il.size()) {}
+
+template <>
+inline r_vector<r_string>::r_vector(std::initializer_list<std::string> il)
+ : cpp11::r_vector<r_string>(as_sexp(il)), capacity_(il.size()) {}
+
+template <>
+inline r_vector<r_string>::r_vector(std::initializer_list<named_arg> il)
+ : cpp11::r_vector<r_string>(safe[Rf_allocVector](STRSXP, il.size())),
+ capacity_(il.size()) {
+ protect_ = preserved.insert(data_);
+ int n_protected = 0;
+
+ try {
+ unwind_protect([&] {
+ Rf_setAttrib(data_, R_NamesSymbol, Rf_allocVector(STRSXP, capacity_));
+ SEXP names = PROTECT(Rf_getAttrib(data_, R_NamesSymbol));
+ ++n_protected;
+ auto it = il.begin();
+ for (R_xlen_t i = 0; i < capacity_; ++i, ++it) {
+ SET_STRING_ELT(data_, i, STRING_ELT(it->value(), 0));
+ SET_STRING_ELT(names, i, Rf_mkCharCE(it->name(), CE_UTF8));
+ }
+ UNPROTECT(n_protected);
+ });
+ } catch (const unwind_exception& e) {
+ preserved.release(protect_);
+ UNPROTECT(n_protected);
+ throw e;
+ }
+}
+
+template <>
+inline void r_vector<r_string>::reserve(R_xlen_t new_capacity) {
+ data_ = data_ == R_NilValue ? safe[Rf_allocVector](STRSXP, new_capacity)
+ : safe[Rf_xlengthgets](data_, new_capacity);
+
+ SEXP old_protect = protect_;
+ protect_ = preserved.insert(data_);
+ preserved.release(old_protect);
+
+ capacity_ = new_capacity;
+}
+
+template <>
+inline void r_vector<r_string>::push_back(r_string value) {
+ while (length_ >= capacity_) {
+ reserve(capacity_ == 0 ? 1 : capacity_ *= 2);
+ }
+ unwind_protect([&] { SET_STRING_ELT(data_, length_, value); });
+ ++length_;
+}
+
+typedef r_vector<r_string> strings;
+
+template <typename T>
+inline void r_vector<T>::push_back(const named_arg& value) {
+ push_back(value.value());
+ if (Rf_xlength(names()) == 0) {
+ cpp11::writable::strings new_nms(size());
+ names() = new_nms;
+ }
+ cpp11::writable::strings nms(names());
+ nms[size() - 1] = value.name();
+}
+
+} // namespace writable
+
+} // namespace cpp11
diff --git a/src/arrow/r/inst/v0.7.1.parquet b/src/arrow/r/inst/v0.7.1.parquet
new file mode 100644
index 000000000..44670bcd1
--- /dev/null
+++ b/src/arrow/r/inst/v0.7.1.parquet
Binary files differ
diff --git a/src/arrow/r/lint.sh b/src/arrow/r/lint.sh
new file mode 100755
index 000000000..629879e04
--- /dev/null
+++ b/src/arrow/r/lint.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This script requires Python 3 and clang-format, which should already be
+# on your system. See r/README.md for further guidance
+
+set -e
+
+SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+CPP_BUILD_SUPPORT=$SOURCE_DIR/../cpp/build-support
+
+# Run clang-format
+: ${CLANG_FORMAT:=$(. "${SOURCE_DIR}/../.env" && echo clang-format-${CLANG_TOOLS})}
+$CPP_BUILD_SUPPORT/run_clang_format.py \
+ --clang_format_binary=$CLANG_FORMAT \
+ --exclude_glob=$CPP_BUILD_SUPPORT/lint_exclusions.txt \
+ --source_dir=$SOURCE_DIR/src --quiet $1
+
+
+# Run cpplint
+CPPLINT=$CPP_BUILD_SUPPORT/cpplint.py
+$CPP_BUILD_SUPPORT/run_cpplint.py \
+ --cpplint_binary=$CPPLINT \
+ --exclude_glob=$CPP_BUILD_SUPPORT/lint_exclusions.txt \
+ --source_dir=$SOURCE_DIR/src --quiet
+
+# Run lintr
+R -e "if(!requireNamespace('lintr', quietly=TRUE)){stop('lintr is not installed, please install it with R -e \"install.packages(\'lintr\')\"')}"
+NOT_CRAN=true R -e "lintr::lint_package('${SOURCE_DIR}', path_prefix = 'r')"
diff --git a/src/arrow/r/man/ArrayData.Rd b/src/arrow/r/man/ArrayData.Rd
new file mode 100644
index 000000000..383ab317d
--- /dev/null
+++ b/src/arrow/r/man/ArrayData.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/array-data.R
+\docType{class}
+\name{ArrayData}
+\alias{ArrayData}
+\title{ArrayData class}
+\description{
+The \code{ArrayData} class allows you to get and inspect the data
+inside an \code{arrow::Array}.
+}
+\section{Usage}{
+\preformatted{data <- Array$create(x)$data()
+
+data$type
+data$length
+data$null_count
+data$offset
+data$buffers
+}
+}
+
+\section{Methods}{
+
+
+...
+}
+
diff --git a/src/arrow/r/man/ChunkedArray.Rd b/src/arrow/r/man/ChunkedArray.Rd
new file mode 100644
index 000000000..3a504f014
--- /dev/null
+++ b/src/arrow/r/man/ChunkedArray.Rd
@@ -0,0 +1,80 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/chunked-array.R
+\docType{class}
+\name{ChunkedArray}
+\alias{ChunkedArray}
+\alias{chunked_array}
+\title{ChunkedArray class}
+\usage{
+chunked_array(..., type = NULL)
+}
+\arguments{
+\item{\dots}{Vectors to coerce}
+
+\item{type}{currently ignored}
+}
+\description{
+A \code{ChunkedArray} is a data structure managing a list of
+primitive Arrow \link[=Array]{Arrays} logically as one large array. Chunked arrays
+may be grouped together in a \link{Table}.
+}
+\section{Factory}{
+
+The \code{ChunkedArray$create()} factory method instantiates the object from
+various Arrays or R vectors. \code{chunked_array()} is an alias for it.
+}
+
+\section{Methods}{
+
+\itemize{
+\item \verb{$length()}: Size in the number of elements this array contains
+\item \verb{$chunk(i)}: Extract an \code{Array} chunk by integer position
+\item \verb{$as_vector()}: convert to an R vector
+\item \verb{$Slice(offset, length = NULL)}: Construct a zero-copy slice of the array
+with the indicated offset and length. If length is \code{NULL}, the slice goes
+until the end of the array.
+\item \verb{$Take(i)}: return a \code{ChunkedArray} with values at positions given by
+integers \code{i}. If \code{i} is an Arrow \code{Array} or \code{ChunkedArray}, it will be
+coerced to an R vector before taking.
+\item \verb{$Filter(i, keep_na = TRUE)}: return a \code{ChunkedArray} with values at positions where
+logical vector or Arrow boolean-type \verb{(Chunked)Array} \code{i} is \code{TRUE}.
+\item \verb{$SortIndices(descending = FALSE)}: return an \code{Array} of integer positions that can be
+used to rearrange the \code{ChunkedArray} in ascending or descending order
+\item \verb{$cast(target_type, safe = TRUE, options = cast_options(safe))}: Alter the
+data in the array to change its type.
+\item \verb{$null_count}: The number of null entries in the array
+\item \verb{$chunks}: return a list of \code{Array}s
+\item \verb{$num_chunks}: integer number of chunks in the \code{ChunkedArray}
+\item \verb{$type}: logical type of data
+\item \verb{$View(type)}: Construct a zero-copy view of this \code{ChunkedArray} with the
+given type.
+\item \verb{$Validate()}: Perform any validation checks to determine obvious inconsistencies
+within the array's internal data. This can be an expensive check, potentially \code{O(length)}
+}
+}
+
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# Pass items into chunked_array as separate objects to create chunks
+class_scores <- chunked_array(c(87, 88, 89), c(94, 93, 92), c(71, 72, 73))
+class_scores$num_chunks
+
+# When taking a Slice from a chunked_array, chunks are preserved
+class_scores$Slice(2, length = 5)
+
+# You can combine Take and SortIndices to return a ChunkedArray with 1 chunk
+# containing all values, ordered.
+class_scores$Take(class_scores$SortIndices(descending = TRUE))
+
+# If you pass a list into chunked_array, you get a list of length 1
+list_scores <- chunked_array(list(c(9.9, 9.6, 9.5), c(8.2, 8.3, 8.4), c(10.0, 9.9, 9.8)))
+list_scores$num_chunks
+
+# When constructing a ChunkedArray, the first chunk is used to infer type.
+doubles <- chunked_array(c(1, 2, 3), c(5L, 6L, 7L))
+doubles$type
+\dontshow{\}) # examplesIf}
+}
+\seealso{
+\link{Array}
+}
diff --git a/src/arrow/r/man/Codec.Rd b/src/arrow/r/man/Codec.Rd
new file mode 100644
index 000000000..86723aed5
--- /dev/null
+++ b/src/arrow/r/man/Codec.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compression.R
+\docType{class}
+\name{Codec}
+\alias{Codec}
+\title{Compression Codec class}
+\description{
+Codecs allow you to create \link[=compression]{compressed input and output streams}.
+}
+\section{Factory}{
+
+The \code{Codec$create()} factory method takes the following arguments:
+\itemize{
+\item \code{type}: string name of the compression method. Possible values are
+"uncompressed", "snappy", "gzip", "brotli", "zstd", "lz4", "lzo", or
+"bz2". \code{type} may be upper- or lower-cased. Not all methods may be
+available; support depends on build-time flags for the C++ library.
+See \code{\link[=codec_is_available]{codec_is_available()}}. Most builds support at least "snappy" and
+"gzip". All support "uncompressed".
+\item \code{compression_level}: compression level, the default value (\code{NA}) uses the
+default compression level for the selected compression \code{type}.
+}
+}
+
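+% An illustrative usage sketch (values are arbitrary; gzip support depends on
+% build-time flags, hence the guard).
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+if (codec_is_available("gzip")) {
+  codec <- Codec$create("gzip", compression_level = 5)
+}
+\dontshow{\}) # examplesIf}
+}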
diff --git a/src/arrow/r/man/CsvReadOptions.Rd b/src/arrow/r/man/CsvReadOptions.Rd
new file mode 100644
index 000000000..d08869270
--- /dev/null
+++ b/src/arrow/r/man/CsvReadOptions.Rd
@@ -0,0 +1,107 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/csv.R, R/json.R
+\docType{class}
+\name{CsvReadOptions}
+\alias{CsvReadOptions}
+\alias{CsvWriteOptions}
+\alias{CsvParseOptions}
+\alias{TimestampParser}
+\alias{CsvConvertOptions}
+\alias{JsonReadOptions}
+\alias{JsonParseOptions}
+\title{File reader options}
+\description{
+\code{CsvReadOptions}, \code{CsvParseOptions}, \code{CsvConvertOptions},
+\code{JsonReadOptions}, \code{JsonParseOptions}, and \code{TimestampParser} are containers for various
+file reading options. See their usage in \code{\link[=read_csv_arrow]{read_csv_arrow()}} and
+\code{\link[=read_json_arrow]{read_json_arrow()}}.
+}
+\section{Factory}{
+
+
+The \code{CsvReadOptions$create()} and \code{JsonReadOptions$create()} factory methods
+take the following arguments:
+\itemize{
+\item \code{use_threads} Whether to use the global CPU thread pool
+\item \code{block_size} Block size we request from the IO layer; also determines
+the size of chunks when use_threads is \code{TRUE}. NB: if \code{FALSE}, JSON input
+must end with an empty line.
+}
+
+\code{CsvReadOptions$create()} further accepts these additional arguments:
+\itemize{
+\item \code{skip_rows} Number of lines to skip before reading data (default 0)
+\item \code{column_names} Character vector to supply column names. If length-0
+(the default), the first non-skipped row will be parsed to generate column
+names, unless \code{autogenerate_column_names} is \code{TRUE}.
+\item \code{autogenerate_column_names} Logical: generate column names instead of
+using the first non-skipped row (the default)? If \code{TRUE}, column names will
+be "f0", "f1", ..., "fN".
+}
+
+\code{CsvParseOptions$create()} takes the following arguments:
+\itemize{
+\item \code{delimiter} Field delimiting character (default \code{","})
+\item \code{quoting} Logical: are strings quoted? (default \code{TRUE})
+\item \code{quote_char} Quoting character, if \code{quoting} is \code{TRUE}
+\item \code{double_quote} Logical: are quotes inside values double-quoted? (default \code{TRUE})
+\item \code{escaping} Logical: whether escaping is used (default \code{FALSE})
+\item \code{escape_char} Escaping character, if \code{escaping} is \code{TRUE}
+\item \code{newlines_in_values} Logical: are values allowed to contain CR (\code{0x0d})
+and LF (\code{0x0a}) characters? (default \code{FALSE})
+\item \code{ignore_empty_lines} Logical: should empty lines be ignored (default) or
+generate a row of missing values (if \code{FALSE})?
+}
+
+\code{JsonParseOptions$create()} accepts only the \code{newlines_in_values} argument.
+
+\code{CsvConvertOptions$create()} takes the following arguments:
+\itemize{
+\item \code{check_utf8} Logical: check UTF8 validity of string columns? (default \code{TRUE})
+\item \code{null_values} character vector of recognized spellings for null values.
+Analogous to the \code{na.strings} argument to
+\code{\link[utils:read.table]{read.csv()}} or \code{na} in \code{readr::read_csv()}.
+\item \code{strings_can_be_null} Logical: can string / binary columns have
+null values? Similar to the \code{quoted_na} argument to \code{readr::read_csv()}.
+(default \code{FALSE})
+\item \code{true_values} character vector of recognized spellings for \code{TRUE} values
+\item \code{false_values} character vector of recognized spellings for \code{FALSE} values
+\item \code{col_types} A \code{Schema} or \code{NULL} to infer types
+\item \code{auto_dict_encode} Logical: Whether to try to automatically
+dictionary-encode string / binary data (think \code{stringsAsFactors}). Default \code{FALSE}.
+This setting is ignored for non-inferred columns (those in \code{col_types}).
+\item \code{auto_dict_max_cardinality} If \code{auto_dict_encode}, string/binary columns
+are dictionary-encoded up to this number of unique values (default 50),
+after which it switches to regular encoding.
+\item \code{include_columns} If non-empty, indicates the names of columns from the
+CSV file that should be actually read and converted (in the vector's order).
+\item \code{include_missing_columns} Logical: if \code{include_columns} is provided, should
+columns named in it but not found in the data be included as a column of
+type \code{null()}? The default (\code{FALSE}) means that the reader will instead
+raise an error.
+\item \code{timestamp_parsers} User-defined timestamp parsers. If more than one
+parser is specified, the CSV conversion logic will try parsing values
+starting from the beginning of this vector. Possible values are
+(a) \code{NULL}, the default, which uses the ISO-8601 parser;
+(b) a character vector of \link[base:strptime]{strptime} parse strings; or
+(c) a list of \link{TimestampParser} objects.
+}
+
+\code{TimestampParser$create()} takes an optional \code{format} string argument.
+See \code{\link[base:strptime]{strptime()}} for example syntax.
+The default is to use an ISO-8601 format parser.
+
+The \code{CsvWriteOptions$create()} factory method takes the following arguments:
+\itemize{
+\item \code{include_header} Whether to write an initial header line with column names
+\item \code{batch_size} Maximum number of rows processed at a time. Default is 1024.
+}
+}
+
+\section{Active bindings}{
+
+\itemize{
+\item \code{column_names}: from \code{CsvReadOptions}
+}
+}
+
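+% An illustrative sketch of constructing the options objects directly; the
+% values shown are arbitrary.
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# Options for a semicolon-delimited file with one junk line before the header
+read_opts <- CsvReadOptions$create(skip_rows = 1)
+parse_opts <- CsvParseOptions$create(delimiter = ";")
+convert_opts <- CsvConvertOptions$create(null_values = c("", "NA", "n/a"))
+\dontshow{\}) # examplesIf}
+}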
diff --git a/src/arrow/r/man/CsvTableReader.Rd b/src/arrow/r/man/CsvTableReader.Rd
new file mode 100644
index 000000000..1afa9d020
--- /dev/null
+++ b/src/arrow/r/man/CsvTableReader.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/csv.R, R/json.R
+\docType{class}
+\name{CsvTableReader}
+\alias{CsvTableReader}
+\alias{JsonTableReader}
+\title{Arrow CSV and JSON table reader classes}
+\description{
+\code{CsvTableReader} and \code{JsonTableReader} wrap the Arrow C++ CSV
+and JSON table readers. See their usage in \code{\link[=read_csv_arrow]{read_csv_arrow()}} and
+\code{\link[=read_json_arrow]{read_json_arrow()}}, respectively.
+}
+\section{Factory}{
+
+
+The \code{CsvTableReader$create()} and \code{JsonTableReader$create()} factory methods
+take the following arguments:
+\itemize{
+\item \code{file} An Arrow \link{InputStream}
+\item \code{convert_options} (CSV only), \code{parse_options}, \code{read_options}: see
+\link{CsvReadOptions}
+\item \code{...} additional parameters.
+}
+}
+
+\section{Methods}{
+
+\itemize{
+\item \verb{$Read()}: returns an Arrow Table.
+}
+}
+
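+% An illustrative sketch: round-trip a data frame through a temporary CSV
+% using the low-level reader.
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf <- tempfile(fileext = ".csv")
+write.csv(mtcars, tf, row.names = FALSE)
+
+reader <- CsvTableReader$create(ReadableFile$create(tf))
+tab <- reader$Read()
+
+unlink(tf)
+\dontshow{\}) # examplesIf}
+}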
diff --git a/src/arrow/r/man/DataType.Rd b/src/arrow/r/man/DataType.Rd
new file mode 100644
index 000000000..8c96141be
--- /dev/null
+++ b/src/arrow/r/man/DataType.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/type.R
+\docType{class}
+\name{DataType}
+\alias{DataType}
+\title{class arrow::DataType}
+\description{
+class arrow::DataType
+}
+\section{Methods}{
+
+
+TODO
+}
+
diff --git a/src/arrow/r/man/Dataset.Rd b/src/arrow/r/man/Dataset.Rd
new file mode 100644
index 000000000..c19a0df6c
--- /dev/null
+++ b/src/arrow/r/man/Dataset.Rd
@@ -0,0 +1,81 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataset.R, R/dataset-factory.R
+\name{Dataset}
+\alias{Dataset}
+\alias{FileSystemDataset}
+\alias{UnionDataset}
+\alias{InMemoryDataset}
+\alias{DatasetFactory}
+\alias{FileSystemDatasetFactory}
+\title{Multi-file datasets}
+\description{
+Arrow Datasets allow you to query against data that has been split across
+multiple files. This sharding of data may indicate partitioning, which
+can accelerate queries that only touch some partitions (files).
+
+A \code{Dataset} contains one or more \code{Fragments}, such as files, of potentially
+differing type and partitioning.
+
+For \code{Dataset$create()}, see \code{\link[=open_dataset]{open_dataset()}}, which is an alias for it.
+
+\code{DatasetFactory} is used to provide finer control over the creation of \code{Dataset}s.
+}
+\section{Factory}{
+
+\code{DatasetFactory} is used to create a \code{Dataset}, inspect the \link{Schema} of the
+fragments contained in it, and declare a partitioning.
+\code{FileSystemDatasetFactory} is a subclass of \code{DatasetFactory} for
+discovering files in the local file system, the only currently supported
+file system.
+
+For the \code{DatasetFactory$create()} factory method, see \code{\link[=dataset_factory]{dataset_factory()}}, an
+alias for it. A \code{DatasetFactory} has:
+\itemize{
+\item \verb{$Inspect(unify_schemas)}: If \code{unify_schemas} is \code{TRUE}, all fragments
+will be scanned and a unified \link{Schema} will be created from them; if \code{FALSE}
+(default), only the first fragment will be inspected for its schema. Use this
+fast path when you know and trust that all fragments have an identical schema.
+\item \verb{$Finish(schema, unify_schemas)}: Returns a \code{Dataset}. If \code{schema} is provided,
+it will be used for the \code{Dataset}; if omitted, a \code{Schema} will be created from
+inspecting the fragments (files) in the dataset, following \code{unify_schemas}
+as described above.
+}
+
+\code{FileSystemDatasetFactory$create()} is a lower-level factory method and
+takes the following arguments:
+\itemize{
+\item \code{filesystem}: A \link{FileSystem}
+\item \code{selector}: Either a \link{FileSelector} or \code{NULL}
+\item \code{paths}: Either a character vector of file paths or \code{NULL}
+\item \code{format}: A \link{FileFormat}
+\item \code{partitioning}: Either \code{Partitioning}, \code{PartitioningFactory}, or \code{NULL}
+}
+}
+
+\section{Methods}{
+
+
+A \code{Dataset} has the following methods:
+\itemize{
+\item \verb{$NewScan()}: Returns a \link{ScannerBuilder} for building a query
+\item \verb{$schema}: Active binding that returns the \link{Schema} of the Dataset; you
+may also replace the dataset's schema by using \code{ds$schema <- new_schema}.
+This method currently supports only adding, removing, or reordering
+fields in the schema: you cannot alter or cast the field types.
+}
+
+\code{FileSystemDataset} has the following methods:
+\itemize{
+\item \verb{$files}: Active binding, returns the files of the \code{FileSystemDataset}
+\item \verb{$format}: Active binding, returns the \link{FileFormat} of the \code{FileSystemDataset}
+}
+
+\code{UnionDataset} has the following methods:
+\itemize{
+\item \verb{$children}: Active binding, returns all child \code{Dataset}s.
+}
+}
+
+\seealso{
+\code{\link[=open_dataset]{open_dataset()}} for a simple interface to creating a \code{Dataset}
+}
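+% An illustrative sketch; assumes a build with dataset and Parquet support.
+\examples{
+\dontshow{if (arrow_with_dataset()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf <- tempfile()
+dir.create(tf)
+write_dataset(mtcars, tf, partitioning = "cyl")
+
+ds <- open_dataset(tf)
+ds$schema
+
+unlink(tf, recursive = TRUE)
+\dontshow{\}) # examplesIf}
+}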
diff --git a/src/arrow/r/man/DictionaryType.Rd b/src/arrow/r/man/DictionaryType.Rd
new file mode 100644
index 000000000..8c9087f1a
--- /dev/null
+++ b/src/arrow/r/man/DictionaryType.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dictionary.R
+\docType{class}
+\name{DictionaryType}
+\alias{DictionaryType}
+\title{class DictionaryType}
+\description{
+class DictionaryType
+}
+\section{Methods}{
+
+
+TODO
+}
+
diff --git a/src/arrow/r/man/Expression.Rd b/src/arrow/r/man/Expression.Rd
new file mode 100644
index 000000000..58a6a44c0
--- /dev/null
+++ b/src/arrow/r/man/Expression.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/expression.R
+\name{Expression}
+\alias{Expression}
+\title{Arrow expressions}
+\description{
+\code{Expression}s are used to define filter logic for passing to a \link{Dataset}
+\link{Scanner}.
+
+\code{Expression$scalar(x)} constructs an \code{Expression} which always evaluates to
+the provided scalar (length-1) R value.
+
+\code{Expression$field_ref(name)} is used to construct an \code{Expression} which
+evaluates to the named column in the \code{Dataset} against which it is evaluated.
+
+\code{Expression$create(function_name, ..., options)} builds a function-call
+\code{Expression} containing one or more \code{Expression}s.
+}
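+% An illustrative sketch of composing a filter expression by hand; "greater"
+% is the Arrow compute function underlying \code{>}.
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+expr <- Expression$create(
+  "greater",
+  Expression$field_ref("cyl"),
+  Expression$scalar(6L)
+)
+\dontshow{\}) # examplesIf}
+}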
diff --git a/src/arrow/r/man/FeatherReader.Rd b/src/arrow/r/man/FeatherReader.Rd
new file mode 100644
index 000000000..64a307fcf
--- /dev/null
+++ b/src/arrow/r/man/FeatherReader.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feather.R
+\docType{class}
+\name{FeatherReader}
+\alias{FeatherReader}
+\title{FeatherReader class}
+\description{
+This class enables you to interact with Feather files. Create
+one to connect to a file or other InputStream, and call \code{Read()} on it to
+make an \code{arrow::Table}. See its usage in \code{\link[=read_feather]{read_feather()}}.
+}
+\section{Factory}{
+
+
+The \code{FeatherReader$create()} factory method instantiates the object and
+takes the following argument:
+\itemize{
+\item \code{file} an Arrow file connection object inheriting from \code{RandomAccessFile}.
+}
+}
+
+\section{Methods}{
+
+\itemize{
+\item \verb{$Read(columns)}: Returns a \code{Table} of the selected columns;
+\code{columns} is a vector of integer indices
+\item \verb{$column_names}: Active binding, returns the column names in the Feather file
+\item \verb{$schema}: Active binding, returns the schema of the Feather file
+\item \verb{$version}: Active binding, returns \code{1} or \code{2}, according to the Feather
+file version
+}
+}
+
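+% An illustrative sketch: round-trip a data frame through a temporary Feather
+% file (the column indices passed to \code{Read()} are arbitrary).
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf <- tempfile(fileext = ".feather")
+write_feather(mtcars, tf)
+
+reader <- FeatherReader$create(ReadableFile$create(tf))
+reader$version
+tab <- reader$Read(1:3)
+
+unlink(tf)
+\dontshow{\}) # examplesIf}
+}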
diff --git a/src/arrow/r/man/Field.Rd b/src/arrow/r/man/Field.Rd
new file mode 100644
index 000000000..3b709e879
--- /dev/null
+++ b/src/arrow/r/man/Field.Rd
@@ -0,0 +1,37 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/field.R
+\docType{class}
+\name{Field}
+\alias{Field}
+\alias{field}
+\title{Field class}
+\usage{
+field(name, type, metadata, nullable = TRUE)
+}
+\arguments{
+\item{name}{field name}
+
+\item{type}{logical type, instance of \link{DataType}}
+
+\item{metadata}{currently ignored}
+
+\item{nullable}{TRUE if field is nullable}
+}
+\description{
+\code{field()} lets you create an \code{arrow::Field} that maps a
+\link[=data-type]{DataType} to a column name. Fields are contained in
+\link[=Schema]{Schemas}.
+}
+\section{Methods}{
+
+\itemize{
+\item \code{f$ToString()}: convert to a string
+\item \code{f$Equals(other)}: test for equality. More naturally called as \code{f == other}
+}
+}
+
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+field("x", int32())
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/FileFormat.Rd b/src/arrow/r/man/FileFormat.Rd
new file mode 100644
index 000000000..cabacc937
--- /dev/null
+++ b/src/arrow/r/man/FileFormat.Rd
@@ -0,0 +1,68 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataset-format.R
+\name{FileFormat}
+\alias{FileFormat}
+\alias{ParquetFileFormat}
+\alias{IpcFileFormat}
+\alias{CsvFileFormat}
+\title{Dataset file formats}
+\description{
+A \code{FileFormat} holds information about how to read and parse the files
+included in a \code{Dataset}. There are subclasses corresponding to the supported
+file formats (\code{ParquetFileFormat} and \code{IpcFileFormat}).
+}
+\section{Factory}{
+
+\code{FileFormat$create()} takes the following arguments:
+\itemize{
+\item \code{format}: A string identifier of the file format. Currently supported values:
+\itemize{
+\item "parquet"
+\item "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
+only version 2 files are supported
+\item "csv"/"text", aliases for the same thing (because comma is the default
+delimiter for text files
+\item "tsv", equivalent to passing \verb{format = "text", delimiter = "\\t"}
+}
+\item \code{...}: Additional format-specific options
+
+\code{format = "parquet"}:
+\itemize{
+\item \code{dict_columns}: Names of columns which should be read as dictionaries.
+\item Any Parquet options from \link{FragmentScanOptions}.
+}
+
+\code{format = "text"}: see \link{CsvParseOptions}. Note that you can specify them either
+with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
+\code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.).
+Not all \code{readr} options are currently supported; please file an issue if
+you encounter one that \code{arrow} should support. Also, the following options are
+supported. From \link{CsvReadOptions}:
+\itemize{
+\item \code{skip_rows}
+\item \code{column_names}
+\item \code{autogenerate_column_names}
+}
+
+From \link{CsvFragmentScanOptions} (these values can be overridden at scan time):
+\itemize{
+\item \code{convert_options}: a \link{CsvConvertOptions}
+\item \code{block_size}
+}
+}
+
+It returns the appropriate subclass of \code{FileFormat} (e.g. \code{ParquetFileFormat}).
+}
+
+\examples{
+\dontshow{if (arrow_with_dataset() && tolower(Sys.info()[["sysname"]]) != "windows") (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+## Semi-colon delimited files
+# Set up directory for examples
+tf <- tempfile()
+dir.create(tf)
+on.exit(unlink(tf))
+write.table(mtcars, file.path(tf, "file1.txt"), sep = ";", row.names = FALSE)
+
+# Create FileFormat object
+format <- FileFormat$create(format = "text", delimiter = ";")
+
+open_dataset(tf, format = format)
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/FileInfo.Rd b/src/arrow/r/man/FileInfo.Rd
new file mode 100644
index 000000000..ef6182e4e
--- /dev/null
+++ b/src/arrow/r/man/FileInfo.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/filesystem.R
+\name{FileInfo}
+\alias{FileInfo}
+\title{FileSystem entry info}
+\description{
+FileSystem entry info
+}
+\section{Methods}{
+
+\itemize{
+\item \code{base_name()} : The file base name (component after the last directory
+separator).
+\item \code{extension()} : The file extension
+}
+}
+
+\section{Active bindings}{
+
+\itemize{
+\item \verb{$type}: The file type
+\item \verb{$path}: The full file path in the filesystem
+\item \verb{$size}: The size in bytes, if available. Only regular files are
+guaranteed to have a size.
+\item \verb{$mtime}: The time of last modification, if available.
+}
+}
+
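+% An illustrative sketch: inspect a temporary file through the local filesystem.
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+fs <- LocalFileSystem$create()
+tf <- tempfile(fileext = ".txt")
+writeLines("hello", tf)
+
+info <- fs$GetFileInfo(tf)[[1]]
+info$base_name()
+info$size
+
+unlink(tf)
+\dontshow{\}) # examplesIf}
+}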
diff --git a/src/arrow/r/man/FileSelector.Rd b/src/arrow/r/man/FileSelector.Rd
new file mode 100644
index 000000000..a3c6deefc
--- /dev/null
+++ b/src/arrow/r/man/FileSelector.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/filesystem.R
+\name{FileSelector}
+\alias{FileSelector}
+\title{file selector}
+\description{
+file selector
+}
+\section{Factory}{
+
+
+The \verb{$create()} factory method instantiates a \code{FileSelector} given the 3 fields
+described below.
+}
+
+\section{Fields}{
+
+\itemize{
+\item \code{base_dir}: The directory in which to select files. If the path exists but
+doesn't point to a directory, this should be an error.
+\item \code{allow_not_found}: The behavior if \code{base_dir} doesn't exist in the
+filesystem. If \code{FALSE}, an error is returned. If \code{TRUE}, an empty
+selection is returned
+\item \code{recursive}: Whether to recurse into subdirectories.
+}
+}
+
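+% An illustrative sketch; the directory and settings are arbitrary.
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+selector <- FileSelector$create(tempdir(), recursive = TRUE)
+selector$base_dir
+\dontshow{\}) # examplesIf}
+}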
diff --git a/src/arrow/r/man/FileSystem.Rd b/src/arrow/r/man/FileSystem.Rd
new file mode 100644
index 000000000..2f3dcff67
--- /dev/null
+++ b/src/arrow/r/man/FileSystem.Rd
@@ -0,0 +1,99 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/filesystem.R
+\docType{class}
+\name{FileSystem}
+\alias{FileSystem}
+\alias{LocalFileSystem}
+\alias{S3FileSystem}
+\alias{SubTreeFileSystem}
+\title{FileSystem classes}
+\description{
+\code{FileSystem} is an abstract file system API,
+\code{LocalFileSystem} is an implementation accessing files
+on the local machine. \code{SubTreeFileSystem} is an implementation that delegates
+to another implementation after prepending a fixed base path.
+}
+\section{Factory}{
+
+
+\code{LocalFileSystem$create()} returns the object and takes no arguments.
+
+\code{SubTreeFileSystem$create()} takes the following arguments:
+\itemize{
+\item \code{base_path}, a string path
+\item \code{base_fs}, a \code{FileSystem} object
+}
+
+\code{S3FileSystem$create()} optionally takes arguments:
+\itemize{
+\item \code{anonymous}: logical, default \code{FALSE}. If \code{TRUE}, will not attempt to look up
+credentials using standard AWS configuration methods.
+\item \code{access_key}, \code{secret_key}: authentication credentials. If one is provided,
+the other must be as well. If both are provided, they will override any
+AWS configuration set at the environment level.
+\item \code{session_token}: optional string for authentication along with
+\code{access_key} and \code{secret_key}
+\item \code{role_arn}: string AWS ARN of an AccessRole. If provided instead of \code{access_key} and
+\code{secret_key}, temporary credentials will be fetched by assuming this role.
+\item \code{session_name}: optional string identifier for the assumed role session.
+\item \code{external_id}: optional unique string identifier that might be required
+when you assume a role in another account.
+\item \code{load_frequency}: integer, frequency (in seconds) with which temporary
+credentials from an assumed role session will be refreshed. Default is
+900 (i.e. 15 minutes)
+\item \code{region}: AWS region to connect to. If omitted, the AWS library will
+provide a sensible default based on client configuration, falling back
+to "us-east-1" if no other alternatives are found.
+\item \code{endpoint_override}: If non-empty, override region with a connect string
+such as "localhost:9000". This is useful for connecting to file systems
+that emulate S3.
+\item \code{scheme}: S3 connection transport (default "https")
+\item \code{background_writes}: logical, whether \code{OutputStream} writes will be issued
+in the background, without blocking (default \code{TRUE})
+}
+}
+
+\section{Methods}{
+
+\itemize{
+\item \verb{$GetFileInfo(x)}: \code{x} may be a \link{FileSelector} or a character
+vector of paths. Returns a list of \link{FileInfo}
+\item \verb{$CreateDir(path, recursive = TRUE)}: Create a directory and subdirectories.
+\item \verb{$DeleteDir(path)}: Delete a directory and its contents, recursively.
+\item \verb{$DeleteDirContents(path)}: Delete a directory's contents, recursively.
+Like \verb{$DeleteDir()},
+but doesn't delete the directory itself. Passing an empty path (\code{""}) will
+wipe the entire filesystem tree.
+\item \verb{$DeleteFile(path)} : Delete a file.
+\item \verb{$DeleteFiles(paths)} : Delete many files. The default implementation
+issues individual delete operations in sequence.
+\item \verb{$Move(src, dest)}: Move / rename a file or directory. If the destination
+exists:
+\itemize{
+\item if it is a non-empty directory, an error is returned
+\item otherwise, if it has the same type as the source, it is replaced
+\item otherwise, behavior is unspecified (implementation-dependent)
+}
+\item \verb{$CopyFile(src, dest)}: Copy a file. If the destination exists and is a
+directory, an error is returned. Otherwise, it is replaced.
+\item \verb{$OpenInputStream(path)}: Open an \link[=InputStream]{input stream} for
+sequential reading.
+\item \verb{$OpenInputFile(path)}: Open an \link[=RandomAccessFile]{input file} for random
+access reading.
+\item \verb{$OpenOutputStream(path)}: Open an \link[=OutputStream]{output stream} for
+sequential writing.
+\item \verb{$OpenAppendStream(path)}: Open an \link[=OutputStream]{output stream} for
+appending.
+}
+}
+
+\section{Active bindings}{
+
+\itemize{
+\item \verb{$type_name}: string filesystem type name, such as "local", "s3", etc.
+\item \verb{$region}: string AWS region, for \code{S3FileSystem} and \code{SubTreeFileSystem}
+containing a \code{S3FileSystem}
+\item \verb{$base_fs}: for \code{SubTreeFileSystem}, the \code{FileSystem} it contains
+\item \verb{$base_path}: for \code{SubTreeFileSystem}, the path in \verb{$base_fs} which is considered
+root in this \code{SubTreeFileSystem}.
+}
+}
+
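+% An illustrative sketch using the local filesystem; paths are temporary.
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+fs <- LocalFileSystem$create()
+tf <- tempfile()
+fs$CreateDir(tf)
+
+# Wrap the new directory so that paths are resolved relative to it
+subtree <- SubTreeFileSystem$create(tf, fs)
+subtree$base_path
+
+fs$DeleteDir(tf)
+\dontshow{\}) # examplesIf}
+}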
diff --git a/src/arrow/r/man/FileWriteOptions.Rd b/src/arrow/r/man/FileWriteOptions.Rd
new file mode 100644
index 000000000..661393c8e
--- /dev/null
+++ b/src/arrow/r/man/FileWriteOptions.Rd
@@ -0,0 +1,8 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataset-format.R
+\name{FileWriteOptions}
+\alias{FileWriteOptions}
+\title{Format-specific write options}
+\description{
+A \code{FileWriteOptions} holds write options specific to a \code{FileFormat}.
+}
diff --git a/src/arrow/r/man/FixedWidthType.Rd b/src/arrow/r/man/FixedWidthType.Rd
new file mode 100644
index 000000000..28578268d
--- /dev/null
+++ b/src/arrow/r/man/FixedWidthType.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/type.R
+\docType{class}
+\name{FixedWidthType}
+\alias{FixedWidthType}
+\title{class arrow::FixedWidthType}
+\description{
+class arrow::FixedWidthType
+}
+\section{Methods}{
+
+
+TODO
+}
+
diff --git a/src/arrow/r/man/FragmentScanOptions.Rd b/src/arrow/r/man/FragmentScanOptions.Rd
new file mode 100644
index 000000000..103d05895
--- /dev/null
+++ b/src/arrow/r/man/FragmentScanOptions.Rd
@@ -0,0 +1,40 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataset-format.R
+\name{FragmentScanOptions}
+\alias{FragmentScanOptions}
+\alias{CsvFragmentScanOptions}
+\alias{ParquetFragmentScanOptions}
+\title{Format-specific scan options}
+\description{
+A \code{FragmentScanOptions} holds options specific to a \code{FileFormat} and a scan
+operation.
+}
+\section{Factory}{
+
+\code{FragmentScanOptions$create()} takes the following arguments:
+\itemize{
+\item \code{format}: A string identifier of the file format. Currently supported values:
+\itemize{
+\item "parquet"
+\item "csv"/"text", aliases for the same format.
+}
+\item \code{...}: Additional format-specific options
+
+\code{format = "parquet"}:
+\itemize{
+\item \code{use_buffered_stream}: Read files through buffered input streams rather than
+loading entire row groups at once. This may be enabled
+to reduce memory overhead. Disabled by default.
+\item \code{buffer_size}: Size of buffered stream, if enabled. Default is 8KB.
+\item \code{pre_buffer}: Pre-buffer the raw Parquet data. This can improve performance
+on high-latency filesystems. Disabled by default.
+}
+
+\code{format = "text"}: see \link{CsvConvertOptions}. Note that options can only be
+specified with the Arrow C++ library naming. Also, "block_size" from
+\link{CsvReadOptions} may be given.
+}
+
+It returns the appropriate subclass of \code{FragmentScanOptions}
+(e.g. \code{CsvFragmentScanOptions}).
+}
+
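+% An illustrative sketch; assumes a build with dataset and Parquet support.
+\examples{
+\dontshow{if (arrow_with_dataset()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+opts <- FragmentScanOptions$create("parquet", pre_buffer = TRUE)
+\dontshow{\}) # examplesIf}
+}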
diff --git a/src/arrow/r/man/InputStream.Rd b/src/arrow/r/man/InputStream.Rd
new file mode 100644
index 000000000..b909a77a1
--- /dev/null
+++ b/src/arrow/r/man/InputStream.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/io.R
+\docType{class}
+\name{InputStream}
+\alias{InputStream}
+\alias{RandomAccessFile}
+\alias{MemoryMappedFile}
+\alias{ReadableFile}
+\alias{BufferReader}
+\title{InputStream classes}
+\description{
+\code{RandomAccessFile} inherits from \code{InputStream} and is a base
+class for: \code{ReadableFile} for reading from a file; \code{MemoryMappedFile} for
+the same but with memory mapping; and \code{BufferReader} for reading from a
+buffer. Use these with the various table readers.
+}
+\section{Factory}{
+
+
+The \verb{$create()} factory methods instantiate the \code{InputStream} object and
+take the following arguments, depending on the subclass:
+\itemize{
+\item \code{path} For \code{ReadableFile}, a character file name
+\item \code{x} For \code{BufferReader}, a \link{Buffer} or an object that can be
+made into a buffer via \code{buffer()}.
+}
+
+To instantiate a \code{MemoryMappedFile}, call \code{\link[=mmap_open]{mmap_open()}}.
+}
+
+\section{Methods}{
+
+\itemize{
+\item \verb{$GetSize()}: return the size of the stream, in bytes
+\item \verb{$supports_zero_copy()}: Logical
+\item \verb{$seek(position)}: go to that position in the stream
+\item \verb{$tell()}: return the position in the stream
+\item \verb{$close()}: close the stream
+\item \verb{$Read(nbytes)}: read data from the stream, either a specified \code{nbytes} or
+all, if \code{nbytes} is not provided
+\item \verb{$ReadAt(position, nbytes)}: similar to \verb{$seek(position)$Read(nbytes)}
+\item \verb{$Resize(size)}: for a \code{MemoryMappedFile} that is writeable
+}
+}
+
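+\examples{
+\dontrun{
+# A minimal sketch: write a few raw bytes to a temp file, then read them
+# back through a ReadableFile, using the methods documented above
+tf <- tempfile()
+writeBin(as.raw(1:16), tf)
+f <- ReadableFile$create(tf)
+f$GetSize()
+buf <- f$Read(4) # a Buffer holding the first 4 bytes
+f$tell()
+f$close()
+}
+}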
diff --git a/src/arrow/r/man/MemoryPool.Rd b/src/arrow/r/man/MemoryPool.Rd
new file mode 100644
index 000000000..75f1882d2
--- /dev/null
+++ b/src/arrow/r/man/MemoryPool.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/memory-pool.R
+\docType{class}
+\name{MemoryPool}
+\alias{MemoryPool}
+\title{class arrow::MemoryPool}
+\description{
+class arrow::MemoryPool
+}
+\section{Methods}{
+
+\itemize{
+\item \code{backend_name}: one of "jemalloc", "mimalloc", or "system". Alternative
+memory allocators are optionally enabled at build time. Windows builds
+generally have \code{mimalloc}, and most others have both \code{jemalloc} (used by
+default) and \code{mimalloc}. To change memory allocators at runtime, set the
+environment variable \code{ARROW_DEFAULT_MEMORY_POOL} to one of those strings
+prior to loading the \code{arrow} library.
+\item \code{bytes_allocated}
+\item \code{max_memory}
+}
+}
+
+\keyword{internal}
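+\examples{
+\dontrun{
+# A minimal sketch: inspect the default memory pool via default_memory_pool()
+pool <- default_memory_pool()
+pool$backend_name
+pool$bytes_allocated
+}
+}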
diff --git a/src/arrow/r/man/Message.Rd b/src/arrow/r/man/Message.Rd
new file mode 100644
index 000000000..84dd90a64
--- /dev/null
+++ b/src/arrow/r/man/Message.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/message.R
+\docType{class}
+\name{Message}
+\alias{Message}
+\title{class arrow::Message}
+\description{
+class arrow::Message
+}
+\section{Methods}{
+
+
+TODO
+}
+
diff --git a/src/arrow/r/man/MessageReader.Rd b/src/arrow/r/man/MessageReader.Rd
new file mode 100644
index 000000000..d198c185e
--- /dev/null
+++ b/src/arrow/r/man/MessageReader.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/message.R
+\docType{class}
+\name{MessageReader}
+\alias{MessageReader}
+\title{class arrow::MessageReader}
+\description{
+class arrow::MessageReader
+}
+\section{Methods}{
+
+
+TODO
+}
+
diff --git a/src/arrow/r/man/OutputStream.Rd b/src/arrow/r/man/OutputStream.Rd
new file mode 100644
index 000000000..f7c71b192
--- /dev/null
+++ b/src/arrow/r/man/OutputStream.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/io.R
+\docType{class}
+\name{OutputStream}
+\alias{OutputStream}
+\alias{FileOutputStream}
+\alias{BufferOutputStream}
+\title{OutputStream classes}
+\description{
+\code{FileOutputStream} is for writing to a file;
+\code{BufferOutputStream} writes to a buffer.
+You can create either one and pass it to any of the table writers, for example.
+}
+\section{Factory}{
+
+
+The \verb{$create()} factory methods instantiate the \code{OutputStream} object and
+take the following arguments, depending on the subclass:
+\itemize{
+\item \code{path} For \code{FileOutputStream}, a character file name
+\item \code{initial_capacity} For \code{BufferOutputStream}, the size in bytes of the
+buffer.
+}
+}
+
+\section{Methods}{
+
+\itemize{
+\item \verb{$tell()}: return the position in the stream
+\item \verb{$close()}: close the stream
+\item \verb{$write(x)}: send \code{x} to the stream
+\item \verb{$capacity()}: for \code{BufferOutputStream}
+\item \verb{$finish()}: for \code{BufferOutputStream}
+\item \verb{$GetExtentBytesWritten()}: for \code{MockOutputStream}, report how many bytes
+were sent.
+}
+}
+
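+\examples{
+\dontrun{
+# A minimal sketch: stream a data frame to a file via a FileOutputStream
+tf <- tempfile()
+sink <- FileOutputStream$create(tf)
+write_feather(data.frame(x = 1:3), sink)
+sink$close()
+
+# Or accumulate bytes in memory with a BufferOutputStream
+out <- BufferOutputStream$create()
+write_ipc_stream(data.frame(x = 1:3), out)
+buf <- out$finish() # a Buffer holding the stream's bytes
+}
+}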
diff --git a/src/arrow/r/man/ParquetArrowReaderProperties.Rd b/src/arrow/r/man/ParquetArrowReaderProperties.Rd
new file mode 100644
index 000000000..33a50f712
--- /dev/null
+++ b/src/arrow/r/man/ParquetArrowReaderProperties.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/parquet.R
+\docType{class}
+\name{ParquetArrowReaderProperties}
+\alias{ParquetArrowReaderProperties}
+\title{ParquetArrowReaderProperties class}
+\description{
+This class holds settings to control how a Parquet file is read
+by \link{ParquetFileReader}.
+}
+\section{Factory}{
+
+
+The \code{ParquetArrowReaderProperties$create()} factory method instantiates the object
+and takes the following arguments:
+\itemize{
+\item \code{use_threads} Logical: whether to use multithreading (default \code{TRUE})
+}
+}
+
+\section{Methods}{
+
+\itemize{
+\item \verb{$read_dictionary(column_index)}
+\item \verb{$set_read_dictionary(column_index, read_dict)}
+\item \verb{$use_threads(use_threads)}
+}
+}
+
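+\examples{
+\dontrun{
+# A minimal sketch: disable multithreading for a ParquetFileReader,
+# reusing the example file that ships with the package
+props <- ParquetArrowReaderProperties$create(use_threads = FALSE)
+f <- system.file("v0.7.1.parquet", package = "arrow")
+reader <- ParquetFileReader$create(f, props = props)
+}
+}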
diff --git a/src/arrow/r/man/ParquetFileReader.Rd b/src/arrow/r/man/ParquetFileReader.Rd
new file mode 100644
index 000000000..30d0725a4
--- /dev/null
+++ b/src/arrow/r/man/ParquetFileReader.Rd
@@ -0,0 +1,59 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/parquet.R
+\docType{class}
+\name{ParquetFileReader}
+\alias{ParquetFileReader}
+\title{ParquetFileReader class}
+\description{
+This class enables you to interact with Parquet files.
+}
+\section{Factory}{
+
+
+The \code{ParquetFileReader$create()} factory method instantiates the object and
+takes the following arguments:
+\itemize{
+\item \code{file} A character file name, raw vector, or Arrow file connection object
+(e.g. \code{RandomAccessFile}).
+\item \code{props} Optional \link{ParquetArrowReaderProperties}
+\item \code{mmap} Logical: whether to memory-map the file (default \code{TRUE})
+\item \code{...} Additional arguments, currently ignored
+}
+}
+
+\section{Methods}{
+
+\itemize{
+\item \verb{$ReadTable(column_indices)}: get an \code{arrow::Table} from the file. The optional
+\verb{column_indices=} argument is a 0-based integer vector indicating which columns to retain.
+\item \verb{$ReadRowGroup(i, column_indices)}: get an \code{arrow::Table} by reading the \code{i}th row group (0-based).
+The optional \verb{column_indices=} argument is a 0-based integer vector indicating which columns to retain.
+\item \verb{$ReadRowGroups(row_groups, column_indices)}: get an \code{arrow::Table} by reading several row
+groups (0-based integers).
+The optional \verb{column_indices=} argument is a 0-based integer vector indicating which columns to retain.
+\item \verb{$GetSchema()}: get the \code{arrow::Schema} of the data in the file
+\item \verb{$ReadColumn(i)}: read the \code{i}th column (0-based) as a \link{ChunkedArray}.
+}
+}
+
+\section{Active bindings}{
+
+\itemize{
+\item \verb{$num_rows}: number of rows.
+\item \verb{$num_columns}: number of columns.
+\item \verb{$num_row_groups}: number of row groups.
+}
+}
+
+\examples{
+\dontshow{if (arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+f <- system.file("v0.7.1.parquet", package = "arrow")
+pq <- ParquetFileReader$create(f)
+pq$GetSchema()
+if (codec_is_available("snappy")) {
+ # This file has compressed data columns
+ tab <- pq$ReadTable()
+ tab$schema
+}
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/ParquetFileWriter.Rd b/src/arrow/r/man/ParquetFileWriter.Rd
new file mode 100644
index 000000000..f36e85ab6
--- /dev/null
+++ b/src/arrow/r/man/ParquetFileWriter.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/parquet.R
+\docType{class}
+\name{ParquetFileWriter}
+\alias{ParquetFileWriter}
+\title{ParquetFileWriter class}
+\description{
+This class enables you to interact with Parquet files.
+}
+\section{Factory}{
+
+
+The \code{ParquetFileWriter$create()} factory method instantiates the object and
+takes the following arguments:
+\itemize{
+\item \code{schema} A \link{Schema}
+\item \code{sink} An \link[=OutputStream]{arrow::io::OutputStream}
+\item \code{properties} An instance of \link{ParquetWriterProperties}
+\item \code{arrow_properties} An instance of \code{ParquetArrowWriterProperties}
+}
+}
+
+\section{Methods}{
+
+\itemize{
+\item \code{WriteTable} Write a \link{Table} to \code{sink}
+\item \code{Close} Close the writer. Note: does not close the \code{sink}.
+\link[=OutputStream]{arrow::io::OutputStream} has its own \code{close()} method.
+}
+}
+
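+\examples{
+\dontrun{
+# A minimal sketch of the low-level writer; the chunk_size argument to
+# WriteTable is an assumption of this example, not documented above
+tab <- arrow_table(x = 1:5)
+tf <- tempfile(fileext = ".parquet")
+sink <- FileOutputStream$create(tf)
+writer <- ParquetFileWriter$create(
+  tab$schema, sink,
+  properties = ParquetWriterProperties$create(tab)
+)
+writer$WriteTable(tab, chunk_size = 5)
+writer$Close()
+sink$close()
+}
+}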
diff --git a/src/arrow/r/man/ParquetWriterProperties.Rd b/src/arrow/r/man/ParquetWriterProperties.Rd
new file mode 100644
index 000000000..7beb8a82a
--- /dev/null
+++ b/src/arrow/r/man/ParquetWriterProperties.Rd
@@ -0,0 +1,49 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/parquet.R
+\docType{class}
+\name{ParquetWriterProperties}
+\alias{ParquetWriterProperties}
+\title{ParquetWriterProperties class}
+\description{
+This class holds settings to control how a Parquet file is written
+by \link{ParquetFileWriter}.
+}
+\details{
+The parameters \code{compression}, \code{compression_level}, \code{use_dictionary}
+and \code{write_statistics} support various patterns:
+\itemize{
+\item The default \code{NULL} leaves the parameter unspecified, and the C++ library
+uses an appropriate default for each column (defaults listed in the Factory section below)
+\item A single, unnamed, value (e.g. a single string for \code{compression}) applies to all columns
+\item An unnamed vector, of the same size as the number of columns, to specify a
+value for each column, in positional order
+\item A named vector, to specify the value for the named columns, the default
+value for the setting is used when not supplied
+}
+
+Unlike the high-level \link{write_parquet}, \code{ParquetWriterProperties} arguments
+use the C++ defaults. Currently this means "uncompressed" rather than
+"snappy" for the \code{compression} argument.
+}
+\section{Factory}{
+
+
+The \code{ParquetWriterProperties$create()} factory method instantiates the object
+and takes the following arguments:
+\itemize{
+\item \code{table}: table to write (required)
+\item \code{version}: Parquet version, "1.0" or "2.0". Default "1.0"
+\item \code{compression}: Compression algorithm. Default \code{"uncompressed"}
+\item \code{compression_level}: Compression level; meaning depends on compression algorithm
+\item \code{use_dictionary}: Specify if we should use dictionary encoding. Default \code{TRUE}
+\item \code{write_statistics}: Specify if we should write statistics. Default \code{TRUE}
+\item \code{data_page_size}: Set a target threshold for the approximate encoded
+size of data pages within a column chunk (in bytes). Default 1 MiB.
+}
+}
+
+\seealso{
+\link{write_parquet}
+
+\link{Schema} for information about schemas and metadata handling.
+}
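+\examples{
+\dontrun{
+# A minimal sketch: per-column compression via a named vector,
+# following the patterns described in the Details section
+tab <- arrow_table(x = 1:5, y = letters[1:5])
+props <- ParquetWriterProperties$create(
+  tab,
+  compression = c(x = "snappy", y = "uncompressed")
+)
+}
+}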
diff --git a/src/arrow/r/man/Partitioning.Rd b/src/arrow/r/man/Partitioning.Rd
new file mode 100644
index 000000000..cfe374155
--- /dev/null
+++ b/src/arrow/r/man/Partitioning.Rd
@@ -0,0 +1,51 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataset-partition.R
+\name{Partitioning}
+\alias{Partitioning}
+\alias{DirectoryPartitioning}
+\alias{HivePartitioning}
+\alias{DirectoryPartitioningFactory}
+\alias{HivePartitioningFactory}
+\title{Define Partitioning for a Dataset}
+\description{
+Pass a \code{Partitioning} object to a \link{FileSystemDatasetFactory}'s \verb{$create()}
+method to indicate how the files' paths should be interpreted to define
+partitioning.
+
+\code{DirectoryPartitioning} describes how to interpret raw path segments, in
+order. For example, \code{schema(year = int16(), month = int8())} would define
+partitions for file paths like "2019/01/file.parquet",
+"2019/02/file.parquet", etc. In this scheme \code{NULL} values will be skipped. In
+the previous example: when writing a dataset if the month was \code{NA} (or
+\code{NULL}), the files would be placed in "2019/file.parquet". When reading, the
+rows in "2019/file.parquet" would return an \code{NA} for the month column. An
+error will be raised if an outer directory is \code{NULL} and an inner directory
+is not.
+
+\code{HivePartitioning} is for Hive-style partitioning, which embeds field
+names and values in path segments, such as
+"/year=2019/month=2/data.parquet". Because fields are named in the path
+segments, order does not matter. This partitioning scheme allows \code{NULL}
+values. They will be replaced by a configurable \code{null_fallback} which
+defaults to the string \code{"__HIVE_DEFAULT_PARTITION__"} when writing. When
+reading, the \code{null_fallback} string will be replaced with \code{NA}s as
+appropriate.
+
+\code{PartitioningFactory} subclasses instruct the \code{DatasetFactory} to detect
+partition features from the file paths.
+}
+\section{Factory}{
+
+Both \code{DirectoryPartitioning$create()} and \code{HivePartitioning$create()}
+methods take a \link{Schema} as a single input argument. The helper
+function \code{\link[=hive_partition]{hive_partition(...)}} is shorthand for
+\code{HivePartitioning$create(schema(...))}.
+
+With \code{DirectoryPartitioningFactory$create()}, you can provide just the
+names of the path segments (in our example, \code{c("year", "month")}), and
+the \code{DatasetFactory} will infer the data types for those partition variables.
+\code{HivePartitioningFactory$create()} takes no arguments: both variable names
+and their types can be inferred from the file paths. \code{hive_partition()} with
+no arguments returns a \code{HivePartitioningFactory}.
+}
+
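+\examples{
+\dontrun{
+# A minimal sketch: equivalent ways to declare year/month partitioning
+part <- DirectoryPartitioning$create(schema(year = int16(), month = int8()))
+hive <- HivePartitioning$create(schema(year = int16(), month = int8()))
+# hive_partition() is shorthand for HivePartitioning$create(schema(...))
+hive2 <- hive_partition(year = int16(), month = int8())
+}
+}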
diff --git a/src/arrow/r/man/RecordBatch.Rd b/src/arrow/r/man/RecordBatch.Rd
new file mode 100644
index 000000000..ff08c2158
--- /dev/null
+++ b/src/arrow/r/man/RecordBatch.Rd
@@ -0,0 +1,92 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/record-batch.R
+\docType{class}
+\name{RecordBatch}
+\alias{RecordBatch}
+\alias{record_batch}
+\title{RecordBatch class}
+\usage{
+record_batch(..., schema = NULL)
+}
+\arguments{
+\item{...}{A \code{data.frame} or a named set of Arrays or vectors. If given a
+mixture of data.frames and vectors, the inputs will be autospliced together
+(see examples). Alternatively, you can provide a single Arrow IPC
+\code{InputStream}, \code{Message}, \code{Buffer}, or R \code{raw} object containing a \code{Buffer}.}
+
+\item{schema}{a \link{Schema}, or \code{NULL} (the default) to infer the schema from
+the data in \code{...}. When providing an Arrow IPC buffer, \code{schema} is required.}
+}
+\description{
+A record batch is a collection of equal-length arrays matching
+a particular \link{Schema}. It is a table-like data structure that is semantically
+a sequence of \link[=Field]{fields}, each a contiguous Arrow \link{Array}.
+}
+\section{S3 Methods and Usage}{
+
+Record batches are data-frame-like, and many methods you expect to work on
+a \code{data.frame} are implemented for \code{RecordBatch}. This includes \code{[}, \code{[[},
+\code{$}, \code{names}, \code{dim}, \code{nrow}, \code{ncol}, \code{head}, and \code{tail}. You can also pull
+the data from an Arrow record batch into R with \code{as.data.frame()}. See the
+examples.
+
+A caveat about the \code{$} method: because \code{RecordBatch} is an \code{R6} object,
+\code{$} is also used to access the object's methods (see below). Methods take
+precedence over the table's columns. So, \code{batch$Slice} would return the
+"Slice" method function even if there were a column in the table called
+"Slice".
+}
+
+\section{R6 Methods}{
+
+In addition to the more R-friendly S3 methods, a \code{RecordBatch} object has
+the following R6 methods that map onto the underlying C++ methods:
+\itemize{
+\item \verb{$Equals(other)}: Returns \code{TRUE} if the \code{other} record batch is equal
+\item \verb{$column(i)}: Extract an \code{Array} by integer position from the batch
+\item \verb{$column_name(i)}: Get a column's name by integer position
+\item \verb{$names()}: Get all column names (called by \code{names(batch)})
+\item \verb{$RenameColumns(value)}: Set all column names (called by \code{names(batch) <- value})
+\item \verb{$GetColumnByName(name)}: Extract an \code{Array} by string name
+\item \verb{$RemoveColumn(i)}: Drops a column from the batch by integer position
+\item \verb{$SelectColumns(indices)}: Return a new record batch with a selection of columns, expressed as 0-based integers.
+\item \verb{$Slice(offset, length = NULL)}: Create a zero-copy view starting at the
+indicated integer offset and going for the given length, or to the end
+of the table if \code{NULL}, the default.
+\item \verb{$Take(i)}: return a \code{RecordBatch} with rows at positions given by
+integers (R vector or Arrow Array) \code{i}.
+\item \verb{$Filter(i, keep_na = TRUE)}: return a \code{RecordBatch} with rows at positions where logical
+vector (or Arrow boolean Array) \code{i} is \code{TRUE}.
+\item \verb{$SortIndices(names, descending = FALSE)}: return an \code{Array} of integer row
+positions that can be used to rearrange the \code{RecordBatch} in ascending or
+descending order by the first named column, breaking ties with further named
+columns. \code{descending} can be a logical vector of length one or of the same
+length as \code{names}.
+\item \verb{$serialize()}: Returns a raw vector suitable for interprocess communication
+\item \verb{$cast(target_schema, safe = TRUE, options = cast_options(safe))}: Alter
+the schema of the record batch.
+}
+
+There are also some active bindings
+\itemize{
+\item \verb{$num_columns}
+\item \verb{$num_rows}
+\item \verb{$schema}
+\item \verb{$metadata}: Returns the key-value metadata of the \code{Schema} as a named list.
+Modify or replace by assigning in (\code{batch$metadata <- new_metadata}).
+All list elements are coerced to string. See \code{schema()} for more information.
+\item \verb{$columns}: Returns a list of \code{Array}s
+}
+}
+
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+batch <- record_batch(name = rownames(mtcars), mtcars)
+dim(batch)
+dim(head(batch))
+names(batch)
+batch$mpg
+batch[["cyl"]]
+as.data.frame(batch[4:8, c("gear", "hp", "wt")])
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/RecordBatchReader.Rd b/src/arrow/r/man/RecordBatchReader.Rd
new file mode 100644
index 000000000..90c796a66
--- /dev/null
+++ b/src/arrow/r/man/RecordBatchReader.Rd
@@ -0,0 +1,86 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/record-batch-reader.R
+\docType{class}
+\name{RecordBatchReader}
+\alias{RecordBatchReader}
+\alias{RecordBatchStreamReader}
+\alias{RecordBatchFileReader}
+\title{RecordBatchReader classes}
+\description{
+Apache Arrow defines two formats for \href{https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc}{serializing data for interprocess communication (IPC)}:
+a "stream" format and a "file" format, known as Feather.
+\code{RecordBatchStreamReader} and \code{RecordBatchFileReader} are
+interfaces for accessing record batches from input sources in those formats,
+respectively.
+
+For guidance on how to use these classes, see the examples section.
+}
+\section{Factory}{
+
+
+The \code{RecordBatchFileReader$create()} and \code{RecordBatchStreamReader$create()}
+factory methods instantiate the object and
+take a single argument, named according to the class:
+\itemize{
+\item \code{file} A character file name, raw vector, or Arrow file connection object
+(e.g. \link{RandomAccessFile}).
+\item \code{stream} A raw vector, \link{Buffer}, or \link{InputStream}.
+}
+}
+
+\section{Methods}{
+
+\itemize{
+\item \verb{$read_next_batch()}: Returns a \code{RecordBatch}, iterating through the
+Reader. If there are no further batches in the Reader, it returns \code{NULL}.
+\item \verb{$schema}: Returns a \link{Schema} (active binding)
+\item \verb{$batches()}: Returns a list of \code{RecordBatch}es
+\item \verb{$read_table()}: Collects the reader's \code{RecordBatch}es into a \link{Table}
+\item \verb{$get_batch(i)}: For \code{RecordBatchFileReader}, return a particular batch
+by an integer index.
+\item \verb{$num_record_batches()}: For \code{RecordBatchFileReader}, see how many batches
+are in the file.
+}
+}
+
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf <- tempfile()
+on.exit(unlink(tf))
+
+batch <- record_batch(chickwts)
+
+# This opens a connection to the file in Arrow
+file_obj <- FileOutputStream$create(tf)
+# Pass that to a RecordBatchWriter to write data conforming to a schema
+writer <- RecordBatchFileWriter$create(file_obj, batch$schema)
+writer$write(batch)
+# You may write additional batches to the stream, provided that they have
+# the same schema.
+# Call "close" on the writer to indicate end-of-file/stream
+writer$close()
+# Then, close the connection--closing the IPC message does not close the file
+file_obj$close()
+
+# Now, we have a file we can read from. Same pattern: open file connection,
+# then pass it to a RecordBatchReader
+read_file_obj <- ReadableFile$create(tf)
+reader <- RecordBatchFileReader$create(read_file_obj)
+# RecordBatchFileReader knows how many batches it has (StreamReader does not)
+reader$num_record_batches
+# We could consume the Reader by calling $read_next_batch() until all are
+# consumed, or we can call $read_table() to pull them all into a Table
+tab <- reader$read_table()
+# Call as.data.frame to turn that Table into an R data.frame
+df <- as.data.frame(tab)
+# This should be the same data we sent
+all.equal(df, chickwts, check.attributes = FALSE)
+# Unlike the Writers, we don't have to close RecordBatchReaders,
+# but we do still need to close the file connection
+read_file_obj$close()
+\dontshow{\}) # examplesIf}
+}
+\seealso{
+\code{\link[=read_ipc_stream]{read_ipc_stream()}} and \code{\link[=read_feather]{read_feather()}} provide a much simpler interface
+for reading data from these formats and are sufficient for many use cases.
+}
diff --git a/src/arrow/r/man/RecordBatchWriter.Rd b/src/arrow/r/man/RecordBatchWriter.Rd
new file mode 100644
index 000000000..219c150e6
--- /dev/null
+++ b/src/arrow/r/man/RecordBatchWriter.Rd
@@ -0,0 +1,89 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/record-batch-writer.R
+\docType{class}
+\name{RecordBatchWriter}
+\alias{RecordBatchWriter}
+\alias{RecordBatchStreamWriter}
+\alias{RecordBatchFileWriter}
+\title{RecordBatchWriter classes}
+\description{
+Apache Arrow defines two formats for \href{https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc}{serializing data for interprocess communication (IPC)}:
+a "stream" format and a "file" format, known as Feather.
+\code{RecordBatchStreamWriter} and \code{RecordBatchFileWriter} are
+interfaces for writing record batches to those formats, respectively.
+
+For guidance on how to use these classes, see the examples section.
+}
+\section{Factory}{
+
+
+The \code{RecordBatchFileWriter$create()} and \code{RecordBatchStreamWriter$create()}
+factory methods instantiate the object and take the following arguments:
+\itemize{
+\item \code{sink} An \code{OutputStream}
+\item \code{schema} A \link{Schema} for the data to be written
+\item \code{use_legacy_format} logical: write data formatted so that Arrow library
+versions 0.14 and lower can read it. Default is \code{FALSE}. You can also
+enable this by setting the environment variable \code{ARROW_PRE_0_15_IPC_FORMAT=1}.
+\item \code{metadata_version}: A string like "V5" or the equivalent integer indicating
+the Arrow IPC MetadataVersion. Default (NULL) will use the latest version,
+unless the environment variable \code{ARROW_PRE_1_0_METADATA_VERSION=1}, in
+which case it will be V4.
+}
+}
+
+\section{Methods}{
+
+\itemize{
+\item \verb{$write(x)}: Write a \link{RecordBatch}, \link{Table}, or \code{data.frame}, dispatching
+to the methods below appropriately
+\item \verb{$write_batch(batch)}: Write a \code{RecordBatch} to stream
+\item \verb{$write_table(table)}: Write a \code{Table} to stream
+\item \verb{$close()}: close stream. Note that this indicates end-of-file or
+end-of-stream--it does not close the connection to the \code{sink}. That needs
+to be closed separately.
+}
+}
+
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf <- tempfile()
+on.exit(unlink(tf))
+
+batch <- record_batch(chickwts)
+
+# This opens a connection to the file in Arrow
+file_obj <- FileOutputStream$create(tf)
+# Pass that to a RecordBatchWriter to write data conforming to a schema
+writer <- RecordBatchFileWriter$create(file_obj, batch$schema)
+writer$write(batch)
+# You may write additional batches to the stream, provided that they have
+# the same schema.
+# Call "close" on the writer to indicate end-of-file/stream
+writer$close()
+# Then, close the connection--closing the IPC message does not close the file
+file_obj$close()
+
+# Now, we have a file we can read from. Same pattern: open file connection,
+# then pass it to a RecordBatchReader
+read_file_obj <- ReadableFile$create(tf)
+reader <- RecordBatchFileReader$create(read_file_obj)
+# RecordBatchFileReader knows how many batches it has (StreamReader does not)
+reader$num_record_batches
+# We could consume the Reader by calling $read_next_batch() until all are
+# consumed, or we can call $read_table() to pull them all into a Table
+tab <- reader$read_table()
+# Call as.data.frame to turn that Table into an R data.frame
+df <- as.data.frame(tab)
+# This should be the same data we sent
+all.equal(df, chickwts, check.attributes = FALSE)
+# Unlike the Writers, we don't have to close RecordBatchReaders,
+# but we do still need to close the file connection
+read_file_obj$close()
+\dontshow{\}) # examplesIf}
+}
+\seealso{
+\code{\link[=write_ipc_stream]{write_ipc_stream()}} and \code{\link[=write_feather]{write_feather()}} provide a much simpler
+interface for writing data to these formats and are sufficient for many use
+cases. \code{\link[=write_to_raw]{write_to_raw()}} is a version that serializes data to a buffer.
+}
diff --git a/src/arrow/r/man/Scalar.Rd b/src/arrow/r/man/Scalar.Rd
new file mode 100644
index 000000000..21e04c12e
--- /dev/null
+++ b/src/arrow/r/man/Scalar.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/scalar.R
+\docType{class}
+\name{Scalar}
+\alias{Scalar}
+\title{Arrow scalars}
+\description{
+A \code{Scalar} holds a single value of an Arrow type.
+}
+\section{Methods}{
+
+\itemize{
+\item \verb{$ToString()}: convert to a string
+\item \verb{$as_vector()}: convert to an R vector
+\item \verb{$as_array()}: convert to an Arrow \code{Array}
+\item \verb{$Equals(other)}: is this Scalar equal to \code{other}
+\item \verb{$ApproxEquals(other)}: is this Scalar approximately equal to \code{other}
+\item \verb{$is_valid}: is this Scalar valid
+\item \verb{$null_count}: number of invalid values, either 1 or 0
+\item \verb{$type}: Scalar type
+}
+}
+
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+Scalar$create(pi)
+Scalar$create(404)
+# If you pass a vector into Scalar$create, you get a list-type Scalar containing your items
+Scalar$create(c(1, 2, 3))
+
+# Comparisons
+my_scalar <- Scalar$create(99)
+my_scalar$ApproxEquals(Scalar$create(99.00001)) # FALSE
+my_scalar$ApproxEquals(Scalar$create(99.000009)) # TRUE
+my_scalar$Equals(Scalar$create(99.000009)) # FALSE
+my_scalar$Equals(Scalar$create(99L)) # FALSE (types don't match)
+
+my_scalar$ToString()
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/Scanner.Rd b/src/arrow/r/man/Scanner.Rd
new file mode 100644
index 000000000..db6488f50
--- /dev/null
+++ b/src/arrow/r/man/Scanner.Rd
@@ -0,0 +1,51 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataset-scan.R
+\name{Scanner}
+\alias{Scanner}
+\alias{ScannerBuilder}
+\title{Scan the contents of a dataset}
+\description{
+A \code{Scanner} iterates over a \link{Dataset}'s fragments and returns data
+according to given row filtering and column projection. A \code{ScannerBuilder}
+can help create one.
+}
+\section{Factory}{
+
+\code{Scanner$create()} wraps the \code{ScannerBuilder} interface to make a \code{Scanner}.
+It takes the following arguments:
+\itemize{
+\item \code{dataset}: A \code{Dataset} or \code{arrow_dplyr_query} object, as returned by the
+\code{dplyr} methods on \code{Dataset}.
+\item \code{projection}: A character vector of column names to select columns or a
+named list of expressions
+\item \code{filter}: An \code{Expression} to filter the scanned rows by, or \code{TRUE} (default)
+to keep all rows.
+\item \code{use_threads}: logical: should scanning use multithreading? Default \code{TRUE}
+\item \code{use_async}: logical: should the async scanner (performs better on
+high-latency/highly parallel filesystems like S3) be used? Default \code{FALSE}
+\item \code{...}: Additional arguments, currently ignored
+}
+}
+
+\section{Methods}{
+
+\code{ScannerBuilder} has the following methods:
+\itemize{
+\item \verb{$Project(cols)}: Indicate that the scan should only return columns given
+by \code{cols}, a character vector of column names
+\item \verb{$Filter(expr)}: Filter rows by an \link{Expression}.
+\item \verb{$UseThreads(threads)}: logical: should the scan use multithreading?
+The method's argument defaults to \code{TRUE}, but the scanner itself defaults to
+single-threaded, so you must call this method to enable multithreading.
+\item \verb{$UseAsync(use_async)}: logical: should the async scanner be used?
+\item \verb{$BatchSize(batch_size)}: integer: Maximum row count of scanned record
+batches, default is 32K. If scanned record batches are overflowing memory
+then this method can be called to reduce their size.
+\item \verb{$schema}: Active binding, returns the \link{Schema} of the Dataset
+\item \verb{$Finish()}: Returns a \code{Scanner}
+}
+
+\code{Scanner} currently has a single method, \verb{$ToTable()}, which evaluates the
+query and returns an Arrow \link{Table}.
+}
+
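+\examples{
+\dontrun{
+# A minimal sketch: write a small dataset to a temp directory, then scan
+# two of its columns into a Table
+tf <- tempfile()
+write_dataset(mtcars, tf, format = "parquet")
+ds <- open_dataset(tf)
+scan <- Scanner$create(ds, projection = c("mpg", "cyl"))
+as.data.frame(scan$ToTable())
+}
+}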
diff --git a/src/arrow/r/man/Schema.Rd b/src/arrow/r/man/Schema.Rd
new file mode 100644
index 000000000..7322c70f2
--- /dev/null
+++ b/src/arrow/r/man/Schema.Rd
@@ -0,0 +1,86 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/schema.R
+\docType{class}
+\name{Schema}
+\alias{Schema}
+\alias{schema}
+\title{Schema class}
+\usage{
+schema(...)
+}
+\arguments{
+\item{...}{named list containing \link[=data-type]{data types} or
+a list of \link[=field]{fields} containing the fields for the schema}
+}
+\description{
+A \code{Schema} is a list of \link{Field}s, which map names to
+Arrow \link[=data-type]{data types}. Create a \code{Schema} when you
+want to convert an R \code{data.frame} to Arrow but don't want to rely on the
+default mapping of R types to Arrow types, such as when you want to choose a
+specific numeric precision, or when creating a \link{Dataset} and you want to
+ensure a specific schema rather than inferring it from the various files.
+
+Many Arrow objects, including \link{Table} and \link{Dataset}, have a \verb{$schema} method
+(active binding) that lets you access their schema.
+}
+\section{Methods}{
+
+\itemize{
+\item \verb{$ToString()}: convert to a string
+\item \verb{$field(i)}: returns the field at index \code{i} (0-based)
+\item \verb{$GetFieldByName(x)}: returns the field with name \code{x}
+\item \verb{$WithMetadata(metadata)}: returns a new \code{Schema} with the key-value
+\code{metadata} set. Note that all list elements in \code{metadata} will be coerced
+to \code{character}.
+}
+}
+
+\section{Active bindings}{
+
+\itemize{
+\item \verb{$names}: returns the field names (called in \code{names(Schema)})
+\item \verb{$num_fields}: returns the number of fields (called in \code{length(Schema)})
+\item \verb{$fields}: returns the list of \code{Field}s in the \code{Schema}, suitable for
+iterating over
+\item \verb{$HasMetadata}: logical: does this \code{Schema} have extra metadata?
+\item \verb{$metadata}: returns the key-value metadata as a named list.
+Modify or replace by assigning in (\code{sch$metadata <- new_metadata}).
+All list elements are coerced to string.
+}
+}
+
+\section{R Metadata}{
+
+
+When converting a data.frame to an Arrow Table or RecordBatch, attributes
+from the \code{data.frame} are saved alongside tables so that the object can be
+reconstructed faithfully in R (e.g. with \code{as.data.frame()}). This metadata
+can be at the top level of the \code{data.frame} (e.g. \code{attributes(df)}), at the
+column level (e.g. \code{attributes(df$col_a)}), or, for list columns only, at the
+element level (e.g. \code{attributes(df[1, "col_a"])}). For example, this allows
+for storing \code{haven} columns in a table and being able to faithfully
+re-create them when pulled back into R. This metadata is separate from the
+schema (column names and types) which is compatible with other Arrow
+clients. The R metadata is only read by R and is ignored by other clients
+(e.g. Pandas has its own custom metadata). This metadata is stored in
+\verb{$metadata$r}.
+
+Since Schema metadata keys and values must be strings, this metadata is
+saved by serializing R's attribute list structure to a string. If the
+serialized metadata exceeds 100Kb in size, by default it is compressed
+starting in version 3.0.0. To disable this compression (e.g. for tables
+that are compatible with Arrow versions before 3.0.0 and include large
+amounts of metadata), set the option \code{arrow.compress_metadata} to \code{FALSE}.
+Files with compressed metadata are readable by older versions of arrow, but
+the metadata is dropped.
+}
+
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5))
+tab1 <- arrow_table(df)
+tab1$schema
+tab2 <- arrow_table(df, schema = schema(col1 = int8(), col2 = float32()))
+tab2$schema
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/Table.Rd b/src/arrow/r/man/Table.Rd
new file mode 100644
index 000000000..d5654bf93
--- /dev/null
+++ b/src/arrow/r/man/Table.Rd
@@ -0,0 +1,92 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/table.R
+\docType{class}
+\name{Table}
+\alias{Table}
+\alias{arrow_table}
+\title{Table class}
+\usage{
+arrow_table(..., schema = NULL)
+}
+\arguments{
+\item{...}{A \code{data.frame} or a named set of Arrays or vectors. If given a
+mixture of data.frames and named vectors, the inputs will be autospliced together
+(see examples). Alternatively, you can provide a single Arrow IPC
+\code{InputStream}, \code{Message}, \code{Buffer}, or R \code{raw} object containing a \code{Buffer}.}
+
+\item{schema}{a \link{Schema}, or \code{NULL} (the default) to infer the schema from
+the data in \code{...}. When providing an Arrow IPC buffer, \code{schema} is required.}
+}
+\description{
+A Table is a sequence of \link[=ChunkedArray]{chunked arrays}. It has
+a similar interface to \link[=RecordBatch]{record batches}, but it can be
+composed from multiple record batches or chunked arrays.
+}
+\section{S3 Methods and Usage}{
+
+Tables are data-frame-like, and many methods you expect to work on
+a \code{data.frame} are implemented for \code{Table}. This includes \code{[}, \code{[[},
+\code{$}, \code{names}, \code{dim}, \code{nrow}, \code{ncol}, \code{head}, and \code{tail}. You can also pull
+the data from an Arrow table into R with \code{as.data.frame()}. See the
+examples.
+
+A caveat about the \code{$} method: because \code{Table} is an \code{R6} object,
+\code{$} is also used to access the object's methods (see below). Methods take
+precedence over the table's columns. So, \code{tab$Slice} would return the
+"Slice" method function even if there were a column in the table called
+"Slice".
+}
+
+\section{R6 Methods}{
+
+In addition to the more R-friendly S3 methods, a \code{Table} object has
+the following R6 methods that map onto the underlying C++ methods:
+\itemize{
+\item \verb{$column(i)}: Extract a \code{ChunkedArray} by integer position from the table
+\item \verb{$ColumnNames()}: Get all column names (called by \code{names(tab)})
+\item \verb{$RenameColumns(value)}: Set all column names (called by \code{names(tab) <- value})
+\item \verb{$GetColumnByName(name)}: Extract a \code{ChunkedArray} by string name
+\item \verb{$field(i)}: Extract a \code{Field} from the table schema by integer position
+\item \verb{$SelectColumns(indices)}: Return new \code{Table} with specified columns, expressed as 0-based integers.
+\item \verb{$Slice(offset, length = NULL)}: Create a zero-copy view starting at the
+indicated integer offset and going for the given length, or to the end
+of the table if \code{NULL}, the default.
+\item \verb{$Take(i)}: return a \code{Table} with rows at positions given by
+integers \code{i}. If \code{i} is an Arrow \code{Array} or \code{ChunkedArray}, it will be
+coerced to an R vector before taking.
+\item \verb{$Filter(i, keep_na = TRUE)}: return a \code{Table} with rows at positions where logical
+vector or Arrow boolean-type \verb{(Chunked)Array} \code{i} is \code{TRUE}.
+\item \verb{$SortIndices(names, descending = FALSE)}: return an \code{Array} of integer row
+positions that can be used to rearrange the \code{Table} in ascending or descending
+order by the first named column, breaking ties with further named columns.
+\code{descending} can be a logical vector of length one or of the same length as
+\code{names}.
+\item \verb{$serialize(output_stream, ...)}: Write the table to the given
+\link{OutputStream}
+\item \verb{$cast(target_schema, safe = TRUE, options = cast_options(safe))}: Alter
+the schema of the table.
+}
+
+There are also some active bindings:
+\itemize{
+\item \verb{$num_columns}
+\item \verb{$num_rows}
+\item \verb{$schema}
+\item \verb{$metadata}: Returns the key-value metadata of the \code{Schema} as a named list.
+Modify or replace by assigning in (\code{tab$metadata <- new_metadata}).
+All list elements are coerced to string. See \code{schema()} for more information.
+\item \verb{$columns}: Returns a list of \code{ChunkedArray}s
+}
+}
+
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tbl <- arrow_table(name = rownames(mtcars), mtcars)
+dim(tbl)
+dim(head(tbl))
+names(tbl)
+tbl$mpg
+tbl[["cyl"]]
+as.data.frame(tbl[4:8, c("gear", "hp", "wt")])
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/array.Rd b/src/arrow/r/man/array.Rd
new file mode 100644
index 000000000..78d3eaff6
--- /dev/null
+++ b/src/arrow/r/man/array.Rd
@@ -0,0 +1,107 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/array.R, R/scalar.R
+\docType{class}
+\name{array}
+\alias{array}
+\alias{Array}
+\alias{DictionaryArray}
+\alias{StructArray}
+\alias{ListArray}
+\alias{LargeListArray}
+\alias{FixedSizeListArray}
+\alias{StructScalar}
+\title{Arrow Arrays}
+\description{
+An \code{Array} is an immutable data array with some logical type
+and some length. Most logical types are contained in the base
+\code{Array} class; there are also subclasses for \code{DictionaryArray}, \code{ListArray},
+and \code{StructArray}.
+}
+\section{Factory}{
+
+The \code{Array$create()} factory method instantiates an \code{Array} and
+takes the following arguments:
+\itemize{
+\item \code{x}: an R vector, list, or \code{data.frame}
+\item \code{type}: an optional \link[=data-type]{data type} for \code{x}. If omitted, the type
+will be inferred from the data.
+}
+
+\code{Array$create()} will return the appropriate subclass of \code{Array}, such as
+\code{DictionaryArray} when given an R factor.
+
+To compose a \code{DictionaryArray} directly, call \code{DictionaryArray$create()},
+which takes two arguments:
+\itemize{
+\item \code{x}: an R vector or \code{Array} of integers for the dictionary indices
+\item \code{dict}: an R vector or \code{Array} of dictionary values (like R factor levels
+but not limited to strings only)
+}
+}
+
+\section{Usage}{
+\preformatted{a <- Array$create(x)
+length(a)
+
+print(a)
+a == a
+}
+}
+
+\section{Methods}{
+
+\itemize{
+\item \verb{$IsNull(i)}: Return true if value at index is null. Does not boundscheck
+\item \verb{$IsValid(i)}: Return true if value at index is valid. Does not boundscheck
+\item \verb{$length()}: Size in the number of elements this array contains
+\item \verb{$offset}: A relative position into another array's data, to enable zero-copy slicing
+\item \verb{$null_count}: The number of null entries in the array
+\item \verb{$type}: logical type of data
+\item \verb{$type_id()}: type id
+\item \verb{$Equals(other)}: is this array equal to \code{other}
+\item \verb{$ApproxEquals(other)}: is this array approximately equal to \code{other}
+\item \verb{$Diff(other)}: return a string expressing the difference between two arrays
+\item \verb{$data()}: return the underlying \link{ArrayData}
+\item \verb{$as_vector()}: convert to an R vector
+\item \verb{$ToString()}: string representation of the array
+\item \verb{$Slice(offset, length = NULL)}: Construct a zero-copy slice of the array
+with the indicated offset and length. If length is \code{NULL}, the slice goes
+until the end of the array.
+\item \verb{$Take(i)}: return an \code{Array} with values at positions given by integers
+(R vector or Arrow Array) \code{i}.
+\item \verb{$Filter(i, keep_na = TRUE)}: return an \code{Array} with values at positions where logical
+vector (or Arrow boolean Array) \code{i} is \code{TRUE}.
+\item \verb{$SortIndices(descending = FALSE)}: return an \code{Array} of integer positions that can be
+used to rearrange the \code{Array} in ascending or descending order
+\item \verb{$RangeEquals(other, start_idx, end_idx, other_start_idx)}: do this
+array's values in the range \verb{[start_idx, end_idx)} equal those of \code{other},
+starting at \code{other_start_idx}?
+\item \verb{$cast(target_type, safe = TRUE, options = cast_options(safe))}: Alter the
+data in the array to change its type.
+\item \verb{$View(type)}: Construct a zero-copy view of this array with the given type.
+\item \verb{$Validate()} : Perform any validation checks to determine obvious inconsistencies
+within the array's internal data. This can be an expensive check, potentially \code{O(length)}
+}
+}
+
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+my_array <- Array$create(1:10)
+my_array$type
+my_array$cast(int8())
+
+# Check if value is null; zero-indexed
+na_array <- Array$create(c(1:5, NA))
+na_array$IsNull(0)
+na_array$IsNull(5)
+na_array$IsValid(5)
+na_array$null_count
+
+# zero-copy slicing; the offset of the new Array will be the same as the index passed to $Slice
+new_array <- na_array$Slice(5)
+new_array$offset
+
+# Compare 2 arrays
+na_array2 <- na_array
+na_array2 == na_array # element-wise comparison
+na_array2$Equals(na_array) # overall comparison
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/arrow-package.Rd b/src/arrow/r/man/arrow-package.Rd
new file mode 100644
index 000000000..021762162
--- /dev/null
+++ b/src/arrow/r/man/arrow-package.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/arrow-package.R
+\docType{package}
+\name{arrow-package}
+\alias{arrow}
+\alias{arrow-package}
+\title{arrow: Integration to 'Apache' 'Arrow'}
+\description{
+'Apache' 'Arrow' <https://arrow.apache.org/> is a cross-language
+ development platform for in-memory data. It specifies a standardized
+ language-independent columnar memory format for flat and hierarchical data,
+ organized for efficient analytic operations on modern hardware. This
+ package provides an interface to the 'Arrow C++' library.
+}
+\seealso{
+Useful links:
+\itemize{
+ \item \url{https://github.com/apache/arrow/}
+ \item \url{https://arrow.apache.org/docs/r/}
+ \item Report bugs at \url{https://issues.apache.org/jira/projects/ARROW/issues}
+}
+
+}
+\author{
+\strong{Maintainer}: Neal Richardson \email{neal@ursalabs.org}
+
+Authors:
+\itemize{
+ \item Ian Cook \email{ianmcook@gmail.com}
+ \item Nic Crane \email{thisisnic@gmail.com}
+ \item Jonathan Keane \email{jkeane@gmail.com}
+ \item Romain François \email{romain@rstudio.com} (\href{https://orcid.org/0000-0002-2444-4226}{ORCID})
+ \item Jeroen Ooms \email{jeroen@berkeley.edu}
+ \item Apache Arrow \email{dev@arrow.apache.org} [copyright holder]
+}
+
+Other contributors:
+\itemize{
+ \item Javier Luraschi \email{javier@rstudio.com} [contributor]
+ \item Karl Dunkle Werner \email{karldw@users.noreply.github.com} (\href{https://orcid.org/0000-0003-0523-7309}{ORCID}) [contributor]
+ \item Jeffrey Wong \email{jeffreyw@netflix.com} [contributor]
+}
+
+}
+\keyword{internal}
diff --git a/src/arrow/r/man/arrow_available.Rd b/src/arrow/r/man/arrow_available.Rd
new file mode 100644
index 000000000..3061d10dc
--- /dev/null
+++ b/src/arrow/r/man/arrow_available.Rd
@@ -0,0 +1,47 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/arrow-package.R
+\name{arrow_available}
+\alias{arrow_available}
+\alias{arrow_with_dataset}
+\alias{arrow_with_parquet}
+\alias{arrow_with_s3}
+\alias{arrow_with_json}
+\title{Is the C++ Arrow library available?}
+\usage{
+arrow_available()
+
+arrow_with_dataset()
+
+arrow_with_parquet()
+
+arrow_with_s3()
+
+arrow_with_json()
+}
+\value{
+\code{TRUE} or \code{FALSE} depending on whether the package was installed
+with:
+\itemize{
+\item The Arrow C++ library (check with \code{arrow_available()})
+\item Arrow Dataset support enabled (check with \code{arrow_with_dataset()})
+\item Parquet support enabled (check with \code{arrow_with_parquet()})
+\item JSON support enabled (check with \code{arrow_with_json()})
+\item Amazon S3 support enabled (check with \code{arrow_with_s3()})
+}
+}
+\description{
+You won't generally need to call these functions, but they're made available
+for diagnostic purposes.
+}
+\examples{
+arrow_available()
+arrow_with_dataset()
+arrow_with_parquet()
+arrow_with_json()
+arrow_with_s3()
+}
+\seealso{
+If any of these are \code{FALSE}, see
+\code{vignette("install", package = "arrow")} for guidance on reinstalling the
+package.
+}
diff --git a/src/arrow/r/man/arrow_info.Rd b/src/arrow/r/man/arrow_info.Rd
new file mode 100644
index 000000000..95444a8bb
--- /dev/null
+++ b/src/arrow/r/man/arrow_info.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/arrow-package.R
+\name{arrow_info}
+\alias{arrow_info}
+\title{Report information on the package's capabilities}
+\usage{
+arrow_info()
+}
+\value{
+A list including version information, boolean "capabilities", statistics
+from Arrow's memory allocator, and Arrow's run-time information.
+}
+\description{
+This function summarizes a number of build-time configurations and run-time
+settings for the Arrow package. It may be useful for diagnostics.
+}
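+\examples{
+# Print versions, capabilities, and allocator statistics
+arrow_info()
+}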
diff --git a/src/arrow/r/man/buffer.Rd b/src/arrow/r/man/buffer.Rd
new file mode 100644
index 000000000..a3ca1fc2f
--- /dev/null
+++ b/src/arrow/r/man/buffer.Rd
@@ -0,0 +1,44 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/buffer.R
+\docType{class}
+\name{buffer}
+\alias{buffer}
+\alias{Buffer}
+\title{Buffer class}
+\usage{
+buffer(x)
+}
+\arguments{
+\item{x}{R object. Only raw, numeric and integer vectors are currently supported}
+}
+\value{
+an instance of \code{Buffer} that borrows memory from \code{x}
+}
+\description{
+A Buffer is an object containing a pointer to a piece of
+contiguous memory with a particular size.
+}
+\section{Factory}{
+
+\code{buffer()} lets you create an \code{arrow::Buffer} from an R object
+}
+
+\section{Methods}{
+
+\itemize{
+\item \verb{$is_mutable} : is this buffer mutable?
+\item \verb{$ZeroPadding()} : zero bytes in padding, i.e. bytes between size and capacity
+\item \verb{$size} : size in memory, in bytes
+\item \verb{$capacity}: possible capacity, in bytes
+}
+}
+
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+my_buffer <- buffer(c(1, 2, 3, 4))
+my_buffer$is_mutable
+my_buffer$ZeroPadding()
+my_buffer$size
+my_buffer$capacity
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/call_function.Rd b/src/arrow/r/man/call_function.Rd
new file mode 100644
index 000000000..c216af06f
--- /dev/null
+++ b/src/arrow/r/man/call_function.Rd
@@ -0,0 +1,51 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compute.R
+\name{call_function}
+\alias{call_function}
+\title{Call an Arrow compute function}
+\usage{
+call_function(
+ function_name,
+ ...,
+ args = list(...),
+ options = empty_named_list()
+)
+}
+\arguments{
+\item{function_name}{string Arrow compute function name}
+
+\item{...}{Function arguments, which may include \code{Array}, \code{ChunkedArray}, \code{Scalar},
+\code{RecordBatch}, or \code{Table}.}
+
+\item{args}{list arguments as an alternative to specifying in \code{...}}
+
+\item{options}{named list of C++ function options.}
+}
+\value{
+An \code{Array}, \code{ChunkedArray}, \code{Scalar}, \code{RecordBatch}, or \code{Table}, whatever the compute function results in.
+}
+\description{
+This function provides a lower-level API for calling Arrow functions by their
+string function name. You won't use it directly for most applications.
+Many Arrow compute functions are mapped to R methods,
+and in a \code{dplyr} evaluation context, \link[=list_compute_functions]{all Arrow functions}
+are callable with an \code{arrow_} prefix.
+}
+\details{
+When passing indices in \code{...}, \code{args}, or \code{options}, express them as
+0-based integers (consistent with C++).
+}
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+a <- Array$create(c(1L, 2L, 3L, NA, 5L))
+s <- Scalar$create(4L)
+call_function("coalesce", a, s)
+
+a <- Array$create(rnorm(10000))
+call_function("quantile", a, options = list(q = seq(0, 1, 0.25)))
+\dontshow{\}) # examplesIf}
+}
+\seealso{
+\href{https://arrow.apache.org/docs/cpp/compute.html}{Arrow C++ documentation} for
+the functions and their respective options.
+}
diff --git a/src/arrow/r/man/cast_options.Rd b/src/arrow/r/man/cast_options.Rd
new file mode 100644
index 000000000..40d78052c
--- /dev/null
+++ b/src/arrow/r/man/cast_options.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compute.R
+\name{cast_options}
+\alias{cast_options}
+\title{Cast options}
+\usage{
+cast_options(safe = TRUE, ...)
+}
+\arguments{
+\item{safe}{logical: enforce safe conversion? Default \code{TRUE}}
+
+\item{...}{additional cast options, such as \code{allow_int_overflow},
+\code{allow_time_truncate}, and \code{allow_float_truncate}, which are set to \code{!safe}
+by default}
+}
+\value{
+A list
+}
+\description{
+Cast options
+}
+\keyword{internal}
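+\examples{
+\dontrun{
+# A minimal sketch: allow lossy casts by disabling safe conversion
+opts <- cast_options(safe = FALSE)
+opts$allow_int_overflow
+}
+}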
diff --git a/src/arrow/r/man/codec_is_available.Rd b/src/arrow/r/man/codec_is_available.Rd
new file mode 100644
index 000000000..b3238ff1d
--- /dev/null
+++ b/src/arrow/r/man/codec_is_available.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compression.R
+\name{codec_is_available}
+\alias{codec_is_available}
+\title{Check whether a compression codec is available}
+\usage{
+codec_is_available(type)
+}
+\arguments{
+\item{type}{A string, one of "uncompressed", "snappy", "gzip", "brotli",
+"zstd", "lz4", "lzo", or "bz2", case insensitive.}
+}
+\value{
+Logical: is \code{type} available?
+}
+\description{
+Support for compression libraries depends on the build-time settings of
+the Arrow C++ library. This function lets you know which are available for
+use.
+}
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+codec_is_available("gzip")
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/compression.Rd b/src/arrow/r/man/compression.Rd
new file mode 100644
index 000000000..7cdb320d6
--- /dev/null
+++ b/src/arrow/r/man/compression.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compression.R
+\docType{class}
+\name{compression}
+\alias{compression}
+\alias{CompressedOutputStream}
+\alias{CompressedInputStream}
+\title{Compressed stream classes}
+\description{
+\code{CompressedInputStream} and \code{CompressedOutputStream}
+allow you to apply a compression \link{Codec} to an
+input or output stream.
+}
+\section{Factory}{
+
+
+The \code{CompressedInputStream$create()} and \code{CompressedOutputStream$create()}
+factory methods instantiate the object and take the following arguments:
+\itemize{
+\item \code{stream} An \link{InputStream} or \link{OutputStream}, respectively
+\item \code{codec} A \code{Codec}, either a \link{Codec} instance or a string
+\item \code{compression_level} compression level for when the \code{codec} argument is given as a string
+}
+}
+
+\section{Methods}{
+
+
+Methods are inherited from \link{InputStream} and \link{OutputStream}, respectively
+}
+
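+\examples{
+\dontrun{
+# A minimal sketch, assuming gzip support is enabled: gzip-compress an
+# IPC stream written to a file
+tf <- tempfile(fileext = ".gz")
+sink <- CompressedOutputStream$create(FileOutputStream$create(tf), codec = "gzip")
+write_ipc_stream(data.frame(x = 1:3), sink)
+}
+}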
diff --git a/src/arrow/r/man/contains_regex.Rd b/src/arrow/r/man/contains_regex.Rd
new file mode 100644
index 000000000..f05f11d02
--- /dev/null
+++ b/src/arrow/r/man/contains_regex.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dplyr-functions.R
+\name{contains_regex}
+\alias{contains_regex}
+\title{Does this string contain regex metacharacters?}
+\usage{
+contains_regex(string)
+}
+\arguments{
+\item{string}{String to be tested}
+}
+\value{
+Logical: does \code{string} contain regex metacharacters?
+}
+\description{
+Does this string contain regex metacharacters?
+}
+\keyword{internal}
diff --git a/src/arrow/r/man/copy_files.Rd b/src/arrow/r/man/copy_files.Rd
new file mode 100644
index 000000000..1b83703f1
--- /dev/null
+++ b/src/arrow/r/man/copy_files.Rd
@@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/filesystem.R
+\name{copy_files}
+\alias{copy_files}
+\title{Copy files between FileSystems}
+\usage{
+copy_files(from, to, chunk_size = 1024L * 1024L)
+}
+\arguments{
+\item{from}{A string path to a local directory or file, a URI, or a
+\code{SubTreeFileSystem}. Files will be copied recursively from this path.}
+
+\item{to}{A string path to a local directory or file, a URI, or a
+\code{SubTreeFileSystem}. Directories will be created as necessary}
+
+\item{chunk_size}{The maximum size of block to read before flushing
+to the destination file. A larger chunk_size will use more memory while
+copying but may help accommodate high latency FileSystems.}
+}
+\value{
+Nothing: called for side effects in the file system
+}
+\description{
+Copy files between FileSystems
+}
+\examples{
+\dontshow{if (FALSE) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# Copy an S3 bucket's files to a local directory:
+copy_files("s3://your-bucket-name", "local-directory")
+# Using a FileSystem object
+copy_files(s3_bucket("your-bucket-name"), "local-directory")
+# Or go the other way, from local to S3
+copy_files("local-directory", s3_bucket("your-bucket-name"))
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/cpu_count.Rd b/src/arrow/r/man/cpu_count.Rd
new file mode 100644
index 000000000..f2abfc197
--- /dev/null
+++ b/src/arrow/r/man/cpu_count.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/config.R
+\name{cpu_count}
+\alias{cpu_count}
+\alias{set_cpu_count}
+\title{Manage the global CPU thread pool in libarrow}
+\usage{
+cpu_count()
+
+set_cpu_count(num_threads)
+}
+\arguments{
+\item{num_threads}{integer: New number of threads for thread pool}
+}
+\description{
+Manage the global CPU thread pool in libarrow
+}
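+\examples{
+cpu_count()
+\dontrun{
+# Reduce the thread pool to two threads
+set_cpu_count(2)
+}
+}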
diff --git a/src/arrow/r/man/create_package_with_all_dependencies.Rd b/src/arrow/r/man/create_package_with_all_dependencies.Rd
new file mode 100644
index 000000000..b2da8c249
--- /dev/null
+++ b/src/arrow/r/man/create_package_with_all_dependencies.Rd
@@ -0,0 +1,70 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/install-arrow.R
+\name{create_package_with_all_dependencies}
+\alias{create_package_with_all_dependencies}
+\title{Create a source bundle that includes all thirdparty dependencies}
+\usage{
+create_package_with_all_dependencies(dest_file = NULL, source_file = NULL)
+}
+\arguments{
+\item{dest_file}{File path for the new tar.gz package. Defaults to
+\code{arrow_V.V.V_with_deps.tar.gz} in the current directory (\code{V.V.V} is the version)}
+
+\item{source_file}{File path for the input tar.gz package. Defaults to
+downloading the package from CRAN (or whatever repository you have set
+first in \code{getOption("repos")})}
+}
+\value{
+The full path to \code{dest_file}, invisibly
+
+This function is used for setting up an offline build. If it's possible to
+download at build time, don't use this function. Instead, let \code{cmake}
+download the required dependencies for you.
+These downloaded dependencies are only used in the build if
+\code{ARROW_DEPENDENCY_SOURCE} is unset, \code{BUNDLED}, or \code{AUTO}.
+https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds
+
+If you're using binary packages you shouldn't need to use this function. You
+should download the appropriate binary from your package repository, transfer
+that to the offline computer, and install that. Any OS can create the source
+bundle, but it cannot be installed on Windows. (Instead, use a standard
+Windows binary package.)
+
+Note if you're using RStudio Package Manager on Linux: If you still want to
+make a source bundle with this function, make sure to set the first repo in
+\code{options("repos")} to be a mirror that contains source packages (that is:
+something other than the RSPM binary mirror URLs).
+\subsection{Steps for an offline install with optional dependencies:}{
+\subsection{Using a computer with internet access, pre-download the dependencies:}{
+\itemize{
+\item Install the \code{arrow} package \emph{or} run
+\code{source("https://raw.githubusercontent.com/apache/arrow/master/r/R/install-arrow.R")}
+\item Run \code{create_package_with_all_dependencies("my_arrow_pkg.tar.gz")}
+\item Copy the newly created \code{my_arrow_pkg.tar.gz} to the computer without internet access
+}
+}
+
+\subsection{On the computer without internet access, install the prepared package:}{
+\itemize{
+\item Install the \code{arrow} package from the copied file
+\itemize{
+\item \code{install.packages("my_arrow_pkg.tar.gz", dependencies = c("Depends", "Imports", "LinkingTo"))}
+\item This installation will build from source, so \code{cmake} must be available
+}
+\item Run \code{\link[=arrow_info]{arrow_info()}} to check installed capabilities
+}
+}
+
+}
+}
+\description{
+Create a source bundle that includes all thirdparty dependencies
+}
+\examples{
+\dontrun{
+new_pkg <- create_package_with_all_dependencies()
+# Note: this works when run in the same R session, but it's meant to be
+# copied to a different computer.
+install.packages(new_pkg, dependencies = c("Depends", "Imports", "LinkingTo"))
+}
+}
diff --git a/src/arrow/r/man/data-type.Rd b/src/arrow/r/man/data-type.Rd
new file mode 100644
index 000000000..a06318975
--- /dev/null
+++ b/src/arrow/r/man/data-type.Rd
@@ -0,0 +1,163 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/type.R
+\name{data-type}
+\alias{data-type}
+\alias{int8}
+\alias{int16}
+\alias{int32}
+\alias{int64}
+\alias{uint8}
+\alias{uint16}
+\alias{uint32}
+\alias{uint64}
+\alias{float16}
+\alias{halffloat}
+\alias{float32}
+\alias{float}
+\alias{float64}
+\alias{boolean}
+\alias{bool}
+\alias{utf8}
+\alias{large_utf8}
+\alias{binary}
+\alias{large_binary}
+\alias{fixed_size_binary}
+\alias{string}
+\alias{date32}
+\alias{date64}
+\alias{time32}
+\alias{time64}
+\alias{null}
+\alias{timestamp}
+\alias{decimal}
+\alias{struct}
+\alias{list_of}
+\alias{large_list_of}
+\alias{FixedSizeListType}
+\alias{fixed_size_list_of}
+\title{Apache Arrow data types}
+\usage{
+int8()
+
+int16()
+
+int32()
+
+int64()
+
+uint8()
+
+uint16()
+
+uint32()
+
+uint64()
+
+float16()
+
+halffloat()
+
+float32()
+
+float()
+
+float64()
+
+boolean()
+
+bool()
+
+utf8()
+
+large_utf8()
+
+binary()
+
+large_binary()
+
+fixed_size_binary(byte_width)
+
+string()
+
+date32()
+
+date64()
+
+time32(unit = c("ms", "s"))
+
+time64(unit = c("ns", "us"))
+
+null()
+
+timestamp(unit = c("s", "ms", "us", "ns"), timezone = "")
+
+decimal(precision, scale)
+
+struct(...)
+
+list_of(type)
+
+large_list_of(type)
+
+fixed_size_list_of(type, list_size)
+}
+\arguments{
+\item{byte_width}{byte width for \code{FixedSizeBinary} type.}
+
+\item{unit}{For time/timestamp types, the time unit. \code{time32()} can take
+either "s" or "ms", while \code{time64()} can be "us" or "ns". \code{timestamp()} can
+take any of those four values.}
+
+\item{timezone}{For \code{timestamp()}, an optional time zone string.}
+
+\item{precision}{For \code{decimal()}, precision}
+
+\item{scale}{For \code{decimal()}, scale}
+
+\item{...}{For \code{struct()}, a named list of types to define the struct columns}
+
+\item{type}{For \code{list_of()}, a data type to make a list-of-type}
+
+\item{list_size}{list size for \code{FixedSizeList} type.}
+}
+\value{
+An Arrow type object inheriting from DataType.
+}
+\description{
+These functions create type objects corresponding to Arrow types. Use them
+when defining a \code{\link[=schema]{schema()}} or as inputs to other types, like \code{struct}. Most
+of these functions don't take arguments, but a few do.
+}
+\details{
+A few functions have aliases:
+\itemize{
+\item \code{utf8()} and \code{string()}
+\item \code{float16()} and \code{halffloat()}
+\item \code{float32()} and \code{float()}
+\item \code{bool()} and \code{boolean()}
+\item When called inside an \code{arrow} function, such as \code{schema()} or \code{cast()},
+\code{double()} is also supported as a way of creating a \code{float64()}
+}
+
+\code{date32()} creates a date type with a "day" unit, like the R \code{Date}
+class. \code{date64()} has a "ms" unit.
+
+\code{uint32} (32 bit unsigned integer), \code{uint64} (64 bit unsigned integer), and
+\code{int64} (64-bit signed integer) types may contain values that exceed the
+range of R's \code{integer} type (32-bit signed integer). When these arrow objects
+are translated to R objects, \code{uint32} and \code{uint64} are converted to \code{double}
+("numeric") and \code{int64} is converted to \code{bit64::integer64}. For \code{int64}
+types, this conversion can be disabled (so that \code{int64} always yields a
+\code{bit64::integer64} object) by setting \code{options(arrow.int64_downcast = FALSE)}.
+}
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+bool()
+struct(a = int32(), b = double())
+timestamp("ms", timezone = "CEST")
+time64("ns")
+\dontshow{\}) # examplesIf}
+}
+\seealso{
+\code{\link[=dictionary]{dictionary()}} for creating a dictionary (factor-like) type.
+}
diff --git a/src/arrow/r/man/dataset_factory.Rd b/src/arrow/r/man/dataset_factory.Rd
new file mode 100644
index 000000000..d119c150b
--- /dev/null
+++ b/src/arrow/r/man/dataset_factory.Rd
@@ -0,0 +1,76 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataset-factory.R
+\name{dataset_factory}
+\alias{dataset_factory}
+\title{Create a DatasetFactory}
+\usage{
+dataset_factory(
+ x,
+ filesystem = NULL,
+ format = c("parquet", "arrow", "ipc", "feather", "csv", "tsv", "text"),
+ partitioning = NULL,
+ ...
+)
+}
+\arguments{
+\item{x}{A string path to a directory containing data files, a vector of
+one or more string paths to data files, or a list of \code{DatasetFactory} objects
+whose datasets should be combined. If this argument is specified, it will be
+used to construct a \code{UnionDatasetFactory} and other arguments will be
+ignored.}
+
+\item{filesystem}{A \link{FileSystem} object; if omitted, the \code{FileSystem} will
+be detected from \code{x}}
+
+\item{format}{A \link{FileFormat} object, or a string identifier of the format of
+the files in \code{x}. Currently supported values:
+\itemize{
+\item "parquet"
+\item "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
+only version 2 files are supported
+\item "csv"/"text", aliases for the same thing (because comma is the default
+delimiter for text files)
+\item "tsv", equivalent to passing \verb{format = "text", delimiter = "\\t"}
+}
+
+Default is "parquet", unless a \code{delimiter} is also specified, in which case
+it is assumed to be "text".}
+
+\item{partitioning}{One of
+\itemize{
+\item A \code{Schema}, in which case the file paths relative to \code{sources} will be
+parsed, and path segments will be matched with the schema fields. For
+example, \code{schema(year = int16(), month = int8())} would create partitions
+for file paths like "2019/01/file.parquet", "2019/02/file.parquet", etc.
+\item A character vector that defines the field names corresponding to those
+path segments (that is, you're providing the names that would correspond
+to a \code{Schema} but the types will be autodetected)
+\item A \code{HivePartitioning} or \code{HivePartitioningFactory}, as returned
+by \code{\link[=hive_partition]{hive_partition()}} which parses explicit or autodetected fields from
+Hive-style path segments
+\item \code{NULL} for no partitioning
+}}
+
+\item{...}{Additional format-specific options, passed to
+\code{FileFormat$create()}. For CSV options, note that you can specify them either
+with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
+\code{readr}-style naming used in \code{\link[=read_csv_arrow]{read_csv_arrow()}} ("delim", "quote", etc.).
+Not all \code{readr} options are currently supported; please file an issue if you
+encounter one that \code{arrow} should support.}
+}
+\value{
+A \code{DatasetFactory} object. Pass this to \code{\link[=open_dataset]{open_dataset()}},
+in a list potentially with other \code{DatasetFactory} objects, to create
+a \code{Dataset}.
+}
+\description{
+A \link{Dataset} can be constructed using one or more \link{DatasetFactory}s.
+This function helps you construct a \code{DatasetFactory} that you can pass to
+\code{\link[=open_dataset]{open_dataset()}}.
+}
+\details{
+If you have only a single \code{DatasetFactory} (for example, a single
+directory containing Parquet files), you can call \code{open_dataset()}
+directly. Use \code{dataset_factory()} when you want to combine different
+directories, file systems, or file formats.
+}
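A hedged sketch of the combining case described above; the directory layout is hypothetical and assumes both parts share a compatible schema:

    library(arrow)
    # One logical dataset stored in two formats
    parquet_part <- dataset_factory("data/parquet-part", format = "parquet")
    csv_part <- dataset_factory("data/csv-part", format = "csv")
    # Pass both factories to open_dataset() to query them as a single Dataset
    ds <- open_dataset(list(parquet_part, csv_part))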
diff --git a/src/arrow/r/man/default_memory_pool.Rd b/src/arrow/r/man/default_memory_pool.Rd
new file mode 100644
index 000000000..232a89e6a
--- /dev/null
+++ b/src/arrow/r/man/default_memory_pool.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/memory-pool.R
+\name{default_memory_pool}
+\alias{default_memory_pool}
+\title{Arrow's default \link{MemoryPool}}
+\usage{
+default_memory_pool()
+}
+\value{
+the default \link{MemoryPool}
+}
+\description{
+Arrow's default \link{MemoryPool}
+}
+\keyword{internal}
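A small sketch, assuming the MemoryPool fields exposed by this version of the package:

    library(arrow)
    pool <- default_memory_pool()
    pool$backend_name     # e.g. "jemalloc", "mimalloc", or "system"
    pool$bytes_allocated  # bytes currently allocated by Arrow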
diff --git a/src/arrow/r/man/dictionary.Rd b/src/arrow/r/man/dictionary.Rd
new file mode 100644
index 000000000..d4b934954
--- /dev/null
+++ b/src/arrow/r/man/dictionary.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dictionary.R
+\name{dictionary}
+\alias{dictionary}
+\title{Create a dictionary type}
+\usage{
+dictionary(index_type = int32(), value_type = utf8(), ordered = FALSE)
+}
+\arguments{
+\item{index_type}{A DataType for the indices (default \code{\link[=int32]{int32()}})}
+
+\item{value_type}{A DataType for the values (default \code{\link[=utf8]{utf8()}})}
+
+\item{ordered}{Is this an ordered dictionary (default \code{FALSE})?}
+}
+\value{
+A \link{DictionaryType}
+}
+\description{
+Create a dictionary type
+}
+\seealso{
+\link[=data-type]{Other Arrow data types}
+}
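A quick sketch of defining and using the type:

    library(arrow)
    # Factor-like type: int8 indices into utf8 values
    dict_type <- dictionary(index_type = int8(), value_type = utf8())
    # Usable anywhere a DataType is expected, e.g. in a schema
    schema(group = dict_type)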
diff --git a/src/arrow/r/man/enums.Rd b/src/arrow/r/man/enums.Rd
new file mode 100644
index 000000000..7ec126a01
--- /dev/null
+++ b/src/arrow/r/man/enums.Rd
@@ -0,0 +1,88 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/enums.R
+\docType{data}
+\name{enums}
+\alias{enums}
+\alias{TimeUnit}
+\alias{DateUnit}
+\alias{Type}
+\alias{StatusCode}
+\alias{FileMode}
+\alias{MessageType}
+\alias{CompressionType}
+\alias{FileType}
+\alias{ParquetVersionType}
+\alias{MetadataVersion}
+\alias{QuantileInterpolation}
+\alias{NullEncodingBehavior}
+\alias{NullHandlingBehavior}
+\alias{RoundMode}
+\alias{JoinType}
+\title{Arrow enums}
+\format{
+An object of class \code{TimeUnit::type} (inherits from \code{arrow-enum}) of length 4.
+
+An object of class \code{DateUnit} (inherits from \code{arrow-enum}) of length 2.
+
+An object of class \code{Type::type} (inherits from \code{arrow-enum}) of length 37.
+
+An object of class \code{StatusCode} (inherits from \code{arrow-enum}) of length 17.
+
+An object of class \code{FileMode} (inherits from \code{arrow-enum}) of length 3.
+
+An object of class \code{MessageType} (inherits from \code{arrow-enum}) of length 5.
+
+An object of class \code{Compression::type} (inherits from \code{arrow-enum}) of length 9.
+
+An object of class \code{FileType} (inherits from \code{arrow-enum}) of length 4.
+
+An object of class \code{ParquetVersionType} (inherits from \code{arrow-enum}) of length 2.
+
+An object of class \code{MetadataVersion} (inherits from \code{arrow-enum}) of length 5.
+
+An object of class \code{QuantileInterpolation} (inherits from \code{arrow-enum}) of length 5.
+
+An object of class \code{NullEncodingBehavior} (inherits from \code{arrow-enum}) of length 2.
+
+An object of class \code{NullHandlingBehavior} (inherits from \code{arrow-enum}) of length 3.
+
+An object of class \code{RoundMode} (inherits from \code{arrow-enum}) of length 10.
+
+An object of class \code{JoinType} (inherits from \code{arrow-enum}) of length 8.
+}
+\usage{
+TimeUnit
+
+DateUnit
+
+Type
+
+StatusCode
+
+FileMode
+
+MessageType
+
+CompressionType
+
+FileType
+
+ParquetVersionType
+
+MetadataVersion
+
+QuantileInterpolation
+
+NullEncodingBehavior
+
+NullHandlingBehavior
+
+RoundMode
+
+JoinType
+}
+\description{
+Arrow enums
+}
+\keyword{datasets}
+\keyword{internal}
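For orientation, each enum is a named integer vector used when passing options down to the C++ library; for example:

    library(arrow)
    TimeUnit$MILLI        # integer code for the millisecond time unit
    CompressionType$GZIP  # integer code for gzip compression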
diff --git a/src/arrow/r/man/flight_connect.Rd b/src/arrow/r/man/flight_connect.Rd
new file mode 100644
index 000000000..9da7fad75
--- /dev/null
+++ b/src/arrow/r/man/flight_connect.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/flight.R
+\name{flight_connect}
+\alias{flight_connect}
+\title{Connect to a Flight server}
+\usage{
+flight_connect(host = "localhost", port, scheme = "grpc+tcp")
+}
+\arguments{
+\item{host}{string hostname to connect to}
+
+\item{port}{integer port to connect on}
+
+\item{scheme}{URL scheme, default is "grpc+tcp"}
+}
+\value{
+A \code{pyarrow.flight.FlightClient}.
+}
+\description{
+Connect to a Flight server
+}
diff --git a/src/arrow/r/man/flight_get.Rd b/src/arrow/r/man/flight_get.Rd
new file mode 100644
index 000000000..a79c4d727
--- /dev/null
+++ b/src/arrow/r/man/flight_get.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/flight.R
+\name{flight_get}
+\alias{flight_get}
+\title{Get data from a Flight server}
+\usage{
+flight_get(client, path)
+}
+\arguments{
+\item{client}{\code{pyarrow.flight.FlightClient}, as returned by \code{\link[=flight_connect]{flight_connect()}}}
+
+\item{path}{string identifier under which data is stored}
+}
+\value{
+A \link{Table}
+}
+\description{
+Get data from a Flight server
+}
diff --git a/src/arrow/r/man/flight_put.Rd b/src/arrow/r/man/flight_put.Rd
new file mode 100644
index 000000000..13a8da16f
--- /dev/null
+++ b/src/arrow/r/man/flight_put.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/flight.R
+\name{flight_put}
+\alias{flight_put}
+\title{Send data to a Flight server}
+\usage{
+flight_put(client, data, path, overwrite = TRUE)
+}
+\arguments{
+\item{client}{\code{pyarrow.flight.FlightClient}, as returned by \code{\link[=flight_connect]{flight_connect()}}}
+
+\item{data}{\code{data.frame}, \link{RecordBatch}, or \link{Table} to upload}
+
+\item{path}{string identifier to store the data under}
+
+\item{overwrite}{logical: if \code{path} exists on \code{client} already, should we
+replace it with the contents of \code{data}? Default is \code{TRUE}; if \code{FALSE} and
+\code{path} exists, the function will error.}
+}
+\value{
+\code{client}, invisibly.
+}
+\description{
+Send data to a Flight server
+}
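A hedged round-trip sketch, assuming a Flight server is already listening locally on port 8089:

    library(arrow)
    client <- flight_connect(port = 8089)
    flight_put(client, mtcars, path = "demo/mtcars")  # upload
    tab <- flight_get(client, "demo/mtcars")          # download as a Table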
diff --git a/src/arrow/r/man/get_stringr_pattern_options.Rd b/src/arrow/r/man/get_stringr_pattern_options.Rd
new file mode 100644
index 000000000..7107b9060
--- /dev/null
+++ b/src/arrow/r/man/get_stringr_pattern_options.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dplyr-functions.R
+\name{get_stringr_pattern_options}
+\alias{get_stringr_pattern_options}
+\title{Get \code{stringr} pattern options}
+\usage{
+get_stringr_pattern_options(pattern)
+}
+\arguments{
+\item{pattern}{Unevaluated expression containing a call to a \code{stringr}
+pattern modifier function}
+}
+\value{
+List containing elements \code{pattern}, \code{fixed}, and \code{ignore_case}
+}
+\description{
+This function assigns definitions for the \code{stringr} pattern modifier
+functions (\code{fixed()}, \code{regex()}, etc.) inside itself, and uses them to
+evaluate the quoted expression \code{pattern}, returning a list that is used
+to control pattern matching behavior in internal \code{arrow} functions.
+}
+\keyword{internal}
diff --git a/src/arrow/r/man/hive_partition.Rd b/src/arrow/r/man/hive_partition.Rd
new file mode 100644
index 000000000..eef9f9157
--- /dev/null
+++ b/src/arrow/r/man/hive_partition.Rd
@@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataset-partition.R
+\name{hive_partition}
+\alias{hive_partition}
+\title{Construct Hive partitioning}
+\usage{
+hive_partition(..., null_fallback = NULL, segment_encoding = "uri")
+}
+\arguments{
+\item{...}{named list of \link[=data-type]{data types}, passed to \code{\link[=schema]{schema()}}}
+
+\item{null_fallback}{character to be used in place of missing values (\code{NA} or \code{NULL})
+in partition columns. Default is \code{"__HIVE_DEFAULT_PARTITION__"},
+which is what Hive uses.}
+
+\item{segment_encoding}{Decode partition segments after splitting paths.
+Default is \code{"uri"} (URI-decode segments). May also be \code{"none"} (leave as-is).}
+}
+\value{
+A \link[=Partitioning]{HivePartitioning}, or a \code{HivePartitioningFactory} if
+calling \code{hive_partition()} with no arguments.
+}
+\description{
+Hive partitioning embeds field names and values in path segments, such as
+"/year=2019/month=2/data.parquet".
+}
+\details{
+Because fields are named in the path segments, order of fields passed to
+\code{hive_partition()} does not matter.
+}
+\examples{
+\dontshow{if (arrow_with_dataset()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+hive_partition(year = int16(), month = int8())
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/install_arrow.Rd b/src/arrow/r/man/install_arrow.Rd
new file mode 100644
index 000000000..bf94650b3
--- /dev/null
+++ b/src/arrow/r/man/install_arrow.Rd
@@ -0,0 +1,61 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/install-arrow.R
+\name{install_arrow}
+\alias{install_arrow}
+\title{Install or upgrade the Arrow library}
+\usage{
+install_arrow(
+ nightly = FALSE,
+ binary = Sys.getenv("LIBARROW_BINARY", TRUE),
+ use_system = Sys.getenv("ARROW_USE_PKG_CONFIG", FALSE),
+ minimal = Sys.getenv("LIBARROW_MINIMAL", FALSE),
+ verbose = Sys.getenv("ARROW_R_DEV", FALSE),
+ repos = getOption("repos"),
+ ...
+)
+}
+\arguments{
+\item{nightly}{logical: Should we install a development version of the
+package, or should we install from CRAN (the default)?}
+
+\item{binary}{On Linux, value to set for the environment variable
+\code{LIBARROW_BINARY}, which governs how C++ binaries are used, if at all.
+The default value, \code{TRUE}, tells the installation script to detect the
+Linux distribution and version and find an appropriate C++ library. \code{FALSE}
+would tell the script not to retrieve a binary and instead build Arrow C++
+from source. Other valid values are strings corresponding to a Linux
+distribution-version, to override the value that would be detected.
+See \code{vignette("install", package = "arrow")} for further details.}
+
+\item{use_system}{logical: Should we use \code{pkg-config} to look for Arrow
+system packages? Default is \code{FALSE}. If \code{TRUE}, source installation may be
+faster, but there is a risk of version mismatch. This sets the
+\code{ARROW_USE_PKG_CONFIG} environment variable.}
+
+\item{minimal}{logical: If building from source, should we build without
+optional dependencies (compression libraries, for example)? Default is
+\code{FALSE}. This sets the \code{LIBARROW_MINIMAL} environment variable.}
+
+\item{verbose}{logical: Print more debugging output when installing? Default
+is \code{FALSE}. This sets the \code{ARROW_R_DEV} environment variable.}
+
+\item{repos}{character vector of base URLs of the repositories to install
+from (passed to \code{install.packages()})}
+
+\item{...}{Additional arguments passed to \code{install.packages()}}
+}
+\description{
+Use this function to install the latest release of \code{arrow}, to switch to or
+from a nightly development version, or on Linux to try reinstalling with
+all necessary C++ dependencies.
+}
+\details{
+Note that, unlike packages like \code{tensorflow}, \code{blogdown}, and others that
+require external dependencies, you do not need to run \code{install_arrow()}
+after a successful \code{arrow} installation.
+}
+\seealso{
+\code{\link[=arrow_available]{arrow_available()}} to see if the package was configured with
+necessary C++ dependencies. \code{vignette("install", package = "arrow")} for
+more ways to tune installation on Linux.
+}
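Typical calls, per the arguments above:

    # Reinstall the latest CRAN release
    install_arrow()
    # Or switch to a nightly development build
    install_arrow(nightly = TRUE)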
diff --git a/src/arrow/r/man/install_pyarrow.Rd b/src/arrow/r/man/install_pyarrow.Rd
new file mode 100644
index 000000000..223a26754
--- /dev/null
+++ b/src/arrow/r/man/install_pyarrow.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/python.R
+\name{install_pyarrow}
+\alias{install_pyarrow}
+\title{Install pyarrow for use with reticulate}
+\usage{
+install_pyarrow(envname = NULL, nightly = FALSE, ...)
+}
+\arguments{
+\item{envname}{The name or full path of the Python environment to install
+into. This can be a virtualenv or conda environment created by \code{reticulate}.
+See \code{reticulate::py_install()}.}
+
+\item{nightly}{logical: Should we install a development version of the
+package? Default is to use the official release version.}
+
+\item{...}{additional arguments passed to \code{reticulate::py_install()}.}
+}
+\description{
+\code{pyarrow} is the Python package for Apache Arrow. This function helps with
+installing it for use with \code{reticulate}.
+}
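A short sketch; the environment name is hypothetical:

    # Install pyarrow into a reticulate-managed environment
    install_pyarrow("arrow-env")
    # Or the nightly development build
    install_pyarrow("arrow-env", nightly = TRUE)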
diff --git a/src/arrow/r/man/io_thread_count.Rd b/src/arrow/r/man/io_thread_count.Rd
new file mode 100644
index 000000000..b1dfa0ba7
--- /dev/null
+++ b/src/arrow/r/man/io_thread_count.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/config.R
+\name{io_thread_count}
+\alias{io_thread_count}
+\alias{set_io_thread_count}
+\title{Manage the global I/O thread pool in libarrow}
+\usage{
+io_thread_count()
+
+set_io_thread_count(num_threads)
+}
+\arguments{
+\item{num_threads}{integer: New number of threads for thread pool}
+}
+\description{
+Manage the global I/O thread pool in libarrow
+}
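A short usage sketch (the thread count shown is arbitrary):

    library(arrow)
    io_thread_count()        # inspect the current I/O thread pool size
    set_io_thread_count(16)  # e.g. raise it when reading from high-latency storage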
diff --git a/src/arrow/r/man/list_compute_functions.Rd b/src/arrow/r/man/list_compute_functions.Rd
new file mode 100644
index 000000000..45e033836
--- /dev/null
+++ b/src/arrow/r/man/list_compute_functions.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compute.R
+\name{list_compute_functions}
+\alias{list_compute_functions}
+\title{List available Arrow C++ compute functions}
+\usage{
+list_compute_functions(pattern = NULL, ...)
+}
+\arguments{
+\item{pattern}{Optional regular expression to filter the function list}
+
+\item{...}{Additional parameters passed to \code{grep()}}
+}
+\value{
+A character vector of available Arrow C++ function names
+}
+\description{
+This function lists the names of all available Arrow C++ library compute functions.
+These can be called by passing a function name to \code{\link[=call_function]{call_function()}}, or they can be
+called by name with an \code{arrow_} prefix inside a \code{dplyr} verb.
+}
+\details{
+The resulting list describes the capabilities of your \code{arrow} build.
+Some functions, such as string and regular expression functions,
+require optional build-time C++ dependencies. If your \code{arrow} package
+was not compiled with those features enabled, those functions will
+not appear in this list.
+
+Some functions take options that need to be passed when calling them
+(in a list called \code{options}). These options require custom handling
+in C++; many functions already have that handling set up but not all do.
+If you encounter one that needs special handling for options, please
+report an issue.
+
+Note that this list does \emph{not} enumerate all of the R bindings for these functions.
+The package includes Arrow methods for many base R functions that can
+be called directly on Arrow objects, as well as some tidyverse-flavored versions
+available inside \code{dplyr} verbs.
+}
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+available_funcs <- list_compute_functions()
+utf8_funcs <- list_compute_functions(pattern = "^UTF8", ignore.case = TRUE)
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/list_flights.Rd b/src/arrow/r/man/list_flights.Rd
new file mode 100644
index 000000000..d8ebb0d02
--- /dev/null
+++ b/src/arrow/r/man/list_flights.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/flight.R
+\name{list_flights}
+\alias{list_flights}
+\alias{flight_path_exists}
+\title{See available resources on a Flight server}
+\usage{
+list_flights(client)
+
+flight_path_exists(client, path)
+}
+\arguments{
+\item{client}{\code{pyarrow.flight.FlightClient}, as returned by \code{\link[=flight_connect]{flight_connect()}}}
+
+\item{path}{string identifier under which data is stored}
+}
+\value{
+\code{list_flights()} returns a character vector of paths.
+\code{flight_path_exists()} returns a logical value, the equivalent of \code{path \%in\% list_flights()}.
+}
+\description{
+See available resources on a Flight server
+}
diff --git a/src/arrow/r/man/load_flight_server.Rd b/src/arrow/r/man/load_flight_server.Rd
new file mode 100644
index 000000000..66d30f391
--- /dev/null
+++ b/src/arrow/r/man/load_flight_server.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/flight.R
+\name{load_flight_server}
+\alias{load_flight_server}
+\title{Load a Python Flight server}
+\usage{
+load_flight_server(name, path = system.file(package = "arrow"))
+}
+\arguments{
+\item{name}{string Python module name}
+
+\item{path}{file system path where the Python module is found. Default is
+to look in the \verb{inst/} directory for included modules.}
+}
+\description{
+Load a Python Flight server
+}
+\examples{
+\dontshow{if (FALSE) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+load_flight_server("demo_flight_server")
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/make_readable_file.Rd b/src/arrow/r/man/make_readable_file.Rd
new file mode 100644
index 000000000..fe2e29826
--- /dev/null
+++ b/src/arrow/r/man/make_readable_file.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/io.R
+\name{make_readable_file}
+\alias{make_readable_file}
+\title{Handle a range of possible input sources}
+\usage{
+make_readable_file(file, mmap = TRUE, compression = NULL, filesystem = NULL)
+}
+\arguments{
+\item{file}{A character file name, \code{raw} vector, or an Arrow input stream}
+
+\item{mmap}{Logical: whether to memory-map the file (default \code{TRUE})}
+
+\item{compression}{If the file is compressed, create a \link{CompressedInputStream}
+with this compression codec, either a \link{Codec} or the string name of one.
+If \code{NULL} (default) and \code{file} is a string file name, the function will try
+to infer compression from the file extension.}
+
+\item{filesystem}{If not \code{NULL}, \code{file} will be opened via the
+\code{filesystem$OpenInputFile()} filesystem method, rather than the \code{io} module's
+\code{MemoryMappedFile} or \code{ReadableFile} constructors.}
+}
+\value{
+An \code{InputStream} or a subclass of one.
+}
+\description{
+Handle a range of possible input sources
+}
+\keyword{internal}
diff --git a/src/arrow/r/man/map_batches.Rd b/src/arrow/r/man/map_batches.Rd
new file mode 100644
index 000000000..08e7b86c0
--- /dev/null
+++ b/src/arrow/r/man/map_batches.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataset-scan.R
+\name{map_batches}
+\alias{map_batches}
+\title{Apply a function to a stream of RecordBatches}
+\usage{
+map_batches(X, FUN, ..., .data.frame = TRUE)
+}
+\arguments{
+\item{X}{A \code{Dataset} or \code{arrow_dplyr_query} object, as returned by the
+\code{dplyr} methods on \code{Dataset}.}
+
+\item{FUN}{A function or \code{purrr}-style lambda expression to apply to each
+batch}
+
+\item{...}{Additional arguments passed to \code{FUN}}
+
+\item{.data.frame}{logical: collect the resulting chunks into a single
+\code{data.frame}? Default \code{TRUE}}
+}
+\description{
+As an alternative to calling \code{collect()} on a \code{Dataset} query, you can
+use this function to access the stream of \code{RecordBatch}es in the \code{Dataset}.
+This lets you aggregate on each chunk and pull the intermediate results into
+a \code{data.frame} for further aggregation, even if you couldn't fit the whole
+\code{Dataset} result in memory.
+}
+\details{
+This is experimental and not recommended for production use.
+}
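A minimal sketch, assuming a dataset directory at a hypothetical path:

    library(arrow)
    ds <- open_dataset("data/parquet-dir")  # hypothetical dataset
    # Reduce each RecordBatch to one summary row; with .data.frame = TRUE the
    # per-batch results are bound into a single data.frame
    map_batches(ds, function(batch) {
      data.frame(rows = batch$num_rows)
    })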
diff --git a/src/arrow/r/man/match_arrow.Rd b/src/arrow/r/man/match_arrow.Rd
new file mode 100644
index 000000000..877a41926
--- /dev/null
+++ b/src/arrow/r/man/match_arrow.Rd
@@ -0,0 +1,53 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compute.R
+\name{match_arrow}
+\alias{match_arrow}
+\alias{is_in}
+\title{\code{match} and \code{\%in\%} for Arrow objects}
+\usage{
+match_arrow(x, table, ...)
+
+is_in(x, table, ...)
+}
+\arguments{
+\item{x}{\code{Scalar}, \code{Array} or \code{ChunkedArray}}
+
+\item{table}{\code{Scalar}, \code{Array}, \code{ChunkedArray}, or R vector lookup table.}
+
+\item{...}{additional arguments, ignored}
+}
+\value{
+\code{match_arrow()} returns an \code{int32}-type Arrow object of the same length
+and type as \code{x} with the (0-based) indexes into \code{table}. \code{is_in()} returns a
+\code{boolean}-type Arrow object of the same length and type as \code{x}, indicating
+for each element of \code{x} whether it is present in \code{table}.
+}
+\description{
+\code{base::match()} is not a generic, so we can't just define Arrow methods for
+it. This function exposes the analogous functions in the Arrow C++ library.
+}
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# note that the returned value is 0-indexed
+cars_tbl <- arrow_table(name = rownames(mtcars), mtcars)
+match_arrow(Scalar$create("Mazda RX4 Wag"), cars_tbl$name)
+
+is_in(Array$create("Mazda RX4 Wag"), cars_tbl$name)
+
+# Although there are multiple matches, you are returned the index of the first
+# match, as with the base R equivalent
+match(4, mtcars$cyl) # 1-indexed
+match_arrow(Scalar$create(4), cars_tbl$cyl) # 0-indexed
+
+# If `x` contains multiple values, you are returned the indices of the first
+# match for each value.
+match(c(4, 6, 8), mtcars$cyl)
+match_arrow(Array$create(c(4, 6, 8)), cars_tbl$cyl)
+
+# Return type matches type of `x`
+is_in(c(4, 6, 8), mtcars$cyl) # returns vector
+is_in(Scalar$create(4), mtcars$cyl) # returns Scalar
+is_in(Array$create(c(4, 6, 8)), cars_tbl$cyl) # returns Array
+is_in(ChunkedArray$create(c(4, 6), 8), cars_tbl$cyl) # returns ChunkedArray
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/mmap_create.Rd b/src/arrow/r/man/mmap_create.Rd
new file mode 100644
index 000000000..b85519348
--- /dev/null
+++ b/src/arrow/r/man/mmap_create.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/io.R
+\name{mmap_create}
+\alias{mmap_create}
+\title{Create a new read/write memory mapped file of a given size}
+\usage{
+mmap_create(path, size)
+}
+\arguments{
+\item{path}{file path}
+
+\item{size}{size in bytes}
+}
+\value{
+a \link[=MemoryMappedFile]{arrow::io::MemoryMappedFile}
+}
+\description{
+Create a new read/write memory mapped file of a given size
+}
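A small sketch creating and reopening a mapped file:

    library(arrow)
    tf <- tempfile()
    mm <- mmap_create(tf, size = 1024)  # 1 KiB read/write mapping
    mm$close()
    mm <- mmap_open(tf, mode = "read")  # reopen read-only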
diff --git a/src/arrow/r/man/mmap_open.Rd b/src/arrow/r/man/mmap_open.Rd
new file mode 100644
index 000000000..d0047a72c
--- /dev/null
+++ b/src/arrow/r/man/mmap_open.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/io.R
+\name{mmap_open}
+\alias{mmap_open}
+\title{Open a memory mapped file}
+\usage{
+mmap_open(path, mode = c("read", "write", "readwrite"))
+}
+\arguments{
+\item{path}{file path}
+
+\item{mode}{file mode (read/write/readwrite)}
+}
+\description{
+Open a memory mapped file
+}
diff --git a/src/arrow/r/man/open_dataset.Rd b/src/arrow/r/man/open_dataset.Rd
new file mode 100644
index 000000000..4d6b492e3
--- /dev/null
+++ b/src/arrow/r/man/open_dataset.Rd
@@ -0,0 +1,146 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataset.R
+\name{open_dataset}
+\alias{open_dataset}
+\title{Open a multi-file dataset}
+\usage{
+open_dataset(
+ sources,
+ schema = NULL,
+ partitioning = hive_partition(),
+ unify_schemas = NULL,
+ format = c("parquet", "arrow", "ipc", "feather", "csv", "tsv", "text"),
+ ...
+)
+}
+\arguments{
+\item{sources}{One of:
+\itemize{
+\item a string path or URI to a directory containing data files
+\item a string path or URI to a single file
+\item a character vector of paths or URIs to individual data files
+\item a list of \code{Dataset} objects as created by this function
+\item a list of \code{DatasetFactory} objects as created by \code{\link[=dataset_factory]{dataset_factory()}}.
+}
+
+When \code{sources} is a vector of file URIs, they must all use the same protocol
+and point to files located in the same file system and having the same
+format.}
+
+\item{schema}{\link{Schema} for the \code{Dataset}. If \code{NULL} (the default), the schema
+will be inferred from the data sources.}
+
+\item{partitioning}{When \code{sources} is a directory path/URI, one of:
+\itemize{
+\item a \code{Schema}, in which case the file paths relative to \code{sources} will be
+parsed, and path segments will be matched with the schema fields. For
+example, \code{schema(year = int16(), month = int8())} would create partitions
+for file paths like \code{"2019/01/file.parquet"}, \code{"2019/02/file.parquet"},
+etc.
+\item a character vector that defines the field names corresponding to those
+path segments (that is, you're providing the names that would correspond
+to a \code{Schema} but the types will be autodetected)
+\item a \code{HivePartitioning} or \code{HivePartitioningFactory}, as returned
+by \code{\link[=hive_partition]{hive_partition()}} which parses explicit or autodetected fields from
+Hive-style path segments
+\item \code{NULL} for no partitioning
+}
+
+The default is to autodetect Hive-style partitions. When \code{sources} is not a
+directory path/URI, \code{partitioning} is ignored.}
+
+\item{unify_schemas}{logical: should all data fragments (files, \code{Dataset}s)
+be scanned in order to create a unified schema from them? If \code{FALSE}, only
+the first fragment will be inspected for its schema. Use this fast path
+when you know and trust that all fragments have an identical schema.
+The default is \code{FALSE} when creating a dataset from a directory path/URI or
+vector of file paths/URIs (because there may be many files and scanning may
+be slow) but \code{TRUE} when \code{sources} is a list of \code{Dataset}s (because there
+should be few \code{Dataset}s in the list and their \code{Schema}s are already in
+memory).}
+
+\item{format}{A \link{FileFormat} object, or a string identifier of the format of
+the files in \code{x}. This argument is ignored when \code{sources} is a list of \code{Dataset} objects.
+Currently supported values:
+\itemize{
+\item "parquet"
+\item "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
+only version 2 files are supported
+\item "csv"/"text", aliases for the same thing (because comma is the default
+delimiter for text files)
+\item "tsv", equivalent to passing \verb{format = "text", delimiter = "\\t"}
+}
+
+Default is "parquet", unless a \code{delimiter} is also specified, in which case
+it is assumed to be "text".}
+
+\item{...}{additional arguments passed to \code{dataset_factory()} when \code{sources}
+is a directory path/URI or vector of file paths/URIs, otherwise ignored.
+These may include \code{format} to indicate the file format, or other
+format-specific options.}
+}
+\value{
+A \link{Dataset} R6 object. Use \code{dplyr} methods on it to query the data,
+or call \code{\link[=Scanner]{$NewScan()}} to construct a query directly.
+}
+\description{
+Arrow Datasets allow you to query against data that has been split across
+multiple files. This sharding of data may indicate partitioning, which
+can accelerate queries that only touch some partitions (files). Call
+\code{open_dataset()} to point to a directory of data files and return a
+\code{Dataset}, then use \code{dplyr} methods to query it.
+}
+\examples{
+\dontshow{if (arrow_with_dataset() & arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# Set up directory for examples
+tf <- tempfile()
+dir.create(tf)
+on.exit(unlink(tf))
+
+data <- dplyr::group_by(mtcars, cyl)
+write_dataset(data, tf)
+
+# You can specify a directory containing the files for your dataset and
+# open_dataset will scan all files in your directory.
+open_dataset(tf)
+
+# You can also supply a vector of paths
+open_dataset(c(file.path(tf, "cyl=4/part-0.parquet"), file.path(tf, "cyl=8/part-0.parquet")))
+
+## You must specify the file format if using a format other than parquet.
+tf2 <- tempfile()
+dir.create(tf2)
+on.exit(unlink(tf2))
+write_dataset(data, tf2, format = "ipc")
+# This line will result in an error when you try to work with the data
+\dontrun{
+open_dataset(tf2)
+}
+# This line will work
+open_dataset(tf2, format = "ipc")
+
+## You can specify file partitioning to include it as a field in your dataset
+# Create a temporary directory and write example dataset
+tf3 <- tempfile()
+dir.create(tf3)
+on.exit(unlink(tf3))
+write_dataset(airquality, tf3, partitioning = c("Month", "Day"), hive_style = FALSE)
+
+# View files - you can see the partitioning means that files have been written
+# to folders based on Month/Day values
+tf3_files <- list.files(tf3, recursive = TRUE)
+
+# With no partitioning specified, dataset contains all files but doesn't include
+# directory names as field names
+open_dataset(tf3)
+
+# Now that partitioning has been specified, your dataset contains columns for Month and Day
+open_dataset(tf3, partitioning = c("Month", "Day"))
+
+# If you want to specify the data types for your fields, you can pass in a Schema
+open_dataset(tf3, partitioning = schema(Month = int8(), Day = int8()))
+\dontshow{\}) # examplesIf}
+}
+\seealso{
+\code{vignette("dataset", package = "arrow")}
+}
diff --git a/src/arrow/r/man/read_delim_arrow.Rd b/src/arrow/r/man/read_delim_arrow.Rd
new file mode 100644
index 000000000..7bfda29b8
--- /dev/null
+++ b/src/arrow/r/man/read_delim_arrow.Rd
@@ -0,0 +1,218 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/csv.R
+\name{read_delim_arrow}
+\alias{read_delim_arrow}
+\alias{read_csv_arrow}
+\alias{read_tsv_arrow}
+\title{Read a CSV or other delimited file with Arrow}
+\usage{
+read_delim_arrow(
+ file,
+ delim = ",",
+ quote = "\\"",
+ escape_double = TRUE,
+ escape_backslash = FALSE,
+ schema = NULL,
+ col_names = TRUE,
+ col_types = NULL,
+ col_select = NULL,
+ na = c("", "NA"),
+ quoted_na = TRUE,
+ skip_empty_rows = TRUE,
+ skip = 0L,
+ parse_options = NULL,
+ convert_options = NULL,
+ read_options = NULL,
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL
+)
+
+read_csv_arrow(
+ file,
+ quote = "\\"",
+ escape_double = TRUE,
+ escape_backslash = FALSE,
+ schema = NULL,
+ col_names = TRUE,
+ col_types = NULL,
+ col_select = NULL,
+ na = c("", "NA"),
+ quoted_na = TRUE,
+ skip_empty_rows = TRUE,
+ skip = 0L,
+ parse_options = NULL,
+ convert_options = NULL,
+ read_options = NULL,
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL
+)
+
+read_tsv_arrow(
+ file,
+ quote = "\\"",
+ escape_double = TRUE,
+ escape_backslash = FALSE,
+ schema = NULL,
+ col_names = TRUE,
+ col_types = NULL,
+ col_select = NULL,
+ na = c("", "NA"),
+ quoted_na = TRUE,
+ skip_empty_rows = TRUE,
+ skip = 0L,
+ parse_options = NULL,
+ convert_options = NULL,
+ read_options = NULL,
+ as_data_frame = TRUE,
+ timestamp_parsers = NULL
+)
+}
+\arguments{
+\item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream,
+or a \code{FileSystem} with path (\code{SubTreeFileSystem}).
+If a file name, a memory-mapped Arrow \link{InputStream} will be opened and
+closed when finished; compression will be detected from the file extension
+and handled automatically. If an input stream is provided, it will be left
+open.}
+
+\item{delim}{Single character used to separate fields within a record.}
+
+\item{quote}{Single character used to quote strings.}
+
+\item{escape_double}{Does the file escape quotes by doubling them?
+i.e. If this option is \code{TRUE}, the value \verb{""""} represents
+a single quote, \verb{\\"}.}
+
+\item{escape_backslash}{Does the file use backslashes to escape special
+characters? This is more general than \code{escape_double} as backslashes
+can be used to escape the delimiter character, the quote character, or
+to add special characters like \verb{\\\\n}.}
+
+\item{schema}{\link{Schema} that describes the table. If provided, it will be
+used to satisfy both \code{col_names} and \code{col_types}.}
+
+\item{col_names}{If \code{TRUE}, the first row of the input will be used as the
+column names and will not be included in the data frame. If \code{FALSE}, column
+names will be generated by Arrow, starting with "f0", "f1", ..., "fN".
+Alternatively, you can specify a character vector of column names.}
+
+\item{col_types}{A compact string representation of the column types, or
+\code{NULL} (the default) to infer types from the data.}
+
+\item{col_select}{A character vector of column names to keep, as in the
+"select" argument to \code{data.table::fread()}, or a
+\link[tidyselect:vars_select]{tidy selection specification}
+of columns, as used in \code{dplyr::select()}.}
+
+\item{na}{A character vector of strings to interpret as missing values.}
+
+\item{quoted_na}{Should missing values inside quotes be treated as missing
+values (the default) or strings? (Note that this is different from the
+Arrow C++ default for the corresponding convert option,
+\code{strings_can_be_null}.)}
+
+\item{skip_empty_rows}{Should blank rows be ignored altogether? If
+\code{TRUE}, blank rows will not be represented at all. If \code{FALSE}, they will be
+filled with missings.}
+
+\item{skip}{Number of lines to skip before reading data.}
+
+\item{parse_options}{see \link[=CsvReadOptions]{file reader options}.
+If given, this overrides any
+parsing options provided in other arguments (e.g. \code{delim}, \code{quote}, etc.).}
+
+\item{convert_options}{see \link[=CsvReadOptions]{file reader options}}
+
+\item{read_options}{see \link[=CsvReadOptions]{file reader options}}
+
+\item{as_data_frame}{Should the function return a \code{data.frame} (default) or
+an Arrow \link{Table}?}
+
+\item{timestamp_parsers}{User-defined timestamp parsers. If more than one
+parser is specified, the CSV conversion logic will try parsing values
+starting from the beginning of this vector. Possible values are:
+\itemize{
+\item \code{NULL}: the default, which uses the ISO-8601 parser
+\item a character vector of \link[base:strptime]{strptime} parse strings
+\item a list of \link{TimestampParser} objects
+}}
+}
+\value{
+A \code{data.frame}, or a Table if \code{as_data_frame = FALSE}.
+}
+\description{
+These functions use the Arrow C++ CSV reader to read a delimited file into a \code{data.frame}.
+Arrow C++ options have been mapped to argument names that follow those of
+\code{readr::read_delim()}, and \code{col_select} was inspired by \code{vroom::vroom()}.
+}
+\details{
+\code{read_csv_arrow()} and \code{read_tsv_arrow()} are wrappers around
+\code{read_delim_arrow()} that specify a delimiter.
+
+Note that not all \code{readr} options are currently implemented here. Please file
+an issue if you encounter one that \code{arrow} should support.
+
+If you need to control Arrow-specific reader parameters that don't have an
+equivalent in \code{readr::read_csv()}, you can either provide them in the
+\code{parse_options}, \code{convert_options}, or \code{read_options} arguments, or you can
+use \link{CsvTableReader} directly for lower-level access.
+}
+\section{Specifying column types and names}{
+
+
+By default, the CSV reader will infer the column names and data types from the file, but there
+are a few ways you can specify them directly.
+
+One way is to provide an Arrow \link{Schema} in the \code{schema} argument,
+which is an ordered map of column name to type.
+When provided, it satisfies both the \code{col_names} and \code{col_types} arguments.
+This is good if you know all of this information up front.
+
+You can also pass a \code{Schema} to the \code{col_types} argument. If you do this,
+column names will still be inferred from the file unless you also specify
+\code{col_names}. In either case, the column names in the \code{Schema} must match the
+data's column names, whether they are explicitly provided or inferred. That
+said, this \code{Schema} does not have to reference all columns: those omitted
+will have their types inferred.
+
+Alternatively, you can declare column types by providing the compact string representation
+that \code{readr} uses to the \code{col_types} argument. This means you provide a
+single string, one character per column, where the characters map to Arrow
+types analogously to the \code{readr} type mapping:
+\itemize{
+\item "c": \code{utf8()}
+\item "i": \code{int32()}
+\item "n": \code{float64()}
+\item "d": \code{float64()}
+\item "l": \code{bool()}
+\item "f": \code{dictionary()}
+\item "D": \code{date32()}
+\item "T": \code{timestamp()}
+\item "t": \code{time32()}
+\item "_": \code{null()}
+\item "-": \code{null()}
+\item "?": infer the type from the data
+}
+
+If you use the compact string representation for \code{col_types}, you must also
+specify \code{col_names}.
+
+Regardless of how types are specified, all columns with a \code{null()} type will
+be dropped.
+
+Note that if you are specifying column names, whether by \code{schema} or
+\code{col_names}, and the CSV file has a header row that would otherwise be used
+to identify column names, you'll need to add \code{skip = 1} to skip that row.
+}
+
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf <- tempfile()
+on.exit(unlink(tf))
+write.csv(mtcars, file = tf)
+df <- read_csv_arrow(tf)
+dim(df)
+# Can select columns
+df <- read_csv_arrow(tf, col_select = starts_with("d"))
+\dontshow{\}) # examplesIf}
+}
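To round out the section above, a sketch of the compact col_types form, which requires col_names, plus skip = 1 because the file written here has a header row:

    tf2 <- tempfile()
    write.csv(mtcars, tf2, row.names = FALSE)
    df2 <- read_csv_arrow(
      tf2,
      col_names = names(mtcars),              # required with a compact string
      col_types = strrep("d", ncol(mtcars)),  # all 11 columns as float64
      skip = 1                                # skip the file's own header row
    )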
diff --git a/src/arrow/r/man/read_feather.Rd b/src/arrow/r/man/read_feather.Rd
new file mode 100644
index 000000000..95f4d1d12
--- /dev/null
+++ b/src/arrow/r/man/read_feather.Rd
@@ -0,0 +1,50 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feather.R
+\name{read_feather}
+\alias{read_feather}
+\title{Read a Feather file}
+\usage{
+read_feather(file, col_select = NULL, as_data_frame = TRUE, ...)
+}
+\arguments{
+\item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream,
+or a \code{FileSystem} with path (\code{SubTreeFileSystem}).
+If a file name or URI, an Arrow \link{InputStream} will be opened and
+closed when finished. If an input stream is provided, it will be left
+open.}
+
+\item{col_select}{A character vector of column names to keep, as in the
+"select" argument to \code{data.table::fread()}, or a
+\link[tidyselect:vars_select]{tidy selection specification}
+of columns, as used in \code{dplyr::select()}.}
+
+\item{as_data_frame}{Should the function return a \code{data.frame} (default) or
+an Arrow \link{Table}?}
+
+\item{...}{additional parameters, passed to \code{\link[=make_readable_file]{make_readable_file()}}.}
+}
+\value{
+A \code{data.frame} if \code{as_data_frame} is \code{TRUE} (the default), or an
+Arrow \link{Table} otherwise
+}
+\description{
+Feather provides binary columnar serialization for data frames.
+It is designed to make reading and writing data frames efficient,
+and to make sharing data across data analysis languages easy.
+This function reads both the original, limited specification of the format
+and the version 2 specification, which is the Apache Arrow IPC file format.
+}
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf <- tempfile()
+on.exit(unlink(tf))
+write_feather(mtcars, tf)
+df <- read_feather(tf)
+dim(df)
+# Can select columns
+df <- read_feather(tf, col_select = starts_with("d"))
+\dontshow{\}) # examplesIf}
+}
+\seealso{
+\link{FeatherReader} and \link{RecordBatchReader} for lower-level access to reading Arrow IPC data.
+}
diff --git a/src/arrow/r/man/read_ipc_stream.Rd b/src/arrow/r/man/read_ipc_stream.Rd
new file mode 100644
index 000000000..d4dd78314
--- /dev/null
+++ b/src/arrow/r/man/read_ipc_stream.Rd
@@ -0,0 +1,42 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/deprecated.R, R/ipc_stream.R
+\name{read_arrow}
+\alias{read_arrow}
+\alias{read_ipc_stream}
+\title{Read Arrow IPC stream format}
+\usage{
+read_arrow(file, ...)
+
+read_ipc_stream(file, as_data_frame = TRUE, ...)
+}
+\arguments{
+\item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream,
+or a \code{FileSystem} with path (\code{SubTreeFileSystem}).
+If a file name or URI, an Arrow \link{InputStream} will be opened and
+closed when finished. If an input stream is provided, it will be left
+open.}
+
+\item{...}{extra parameters passed to \code{read_feather()}.}
+
+\item{as_data_frame}{Should the function return a \code{data.frame} (default) or
+an Arrow \link{Table}?}
+}
+\value{
+A \code{data.frame} if \code{as_data_frame} is \code{TRUE} (the default), or an
+Arrow \link{Table} otherwise
+}
+\description{
+Apache Arrow defines two formats for \href{https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc}{serializing data for interprocess communication (IPC)}:
+a "stream" format and a "file" format, known as Feather. \code{read_ipc_stream()}
+and \code{\link[=read_feather]{read_feather()}} read those formats, respectively.
+}
+\details{
+\code{read_arrow()}, a wrapper around \code{read_ipc_stream()} and \code{read_feather()},
+is deprecated. You should explicitly choose
+the function that will read the desired IPC format (stream or file) since
+a file or \code{InputStream} may contain either.
+}
+\seealso{
+\code{\link[=read_feather]{read_feather()}} for reading IPC files. \link{RecordBatchReader} for a
+lower-level interface.
+}
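A round-trip sketch using the stream-format writer:

    library(arrow)
    tf <- tempfile()
    write_ipc_stream(mtcars, tf)  # write the IPC stream format
    df <- read_ipc_stream(tf)     # read it back as a data.frame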
diff --git a/src/arrow/r/man/read_json_arrow.Rd b/src/arrow/r/man/read_json_arrow.Rd
new file mode 100644
index 000000000..610867ca4
--- /dev/null
+++ b/src/arrow/r/man/read_json_arrow.Rd
@@ -0,0 +1,52 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/json.R
+\name{read_json_arrow}
+\alias{read_json_arrow}
+\title{Read a JSON file}
+\usage{
+read_json_arrow(
+ file,
+ col_select = NULL,
+ as_data_frame = TRUE,
+ schema = NULL,
+ ...
+)
+}
+\arguments{
+\item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream,
+or a \code{FileSystem} with path (\code{SubTreeFileSystem}).
+If a file name, a memory-mapped Arrow \link{InputStream} will be opened and
+closed when finished; compression will be detected from the file extension
+and handled automatically. If an input stream is provided, it will be left
+open.}
+
+\item{col_select}{A character vector of column names to keep, as in the
+"select" argument to \code{data.table::fread()}, or a
+\link[tidyselect:vars_select]{tidy selection specification}
+of columns, as used in \code{dplyr::select()}.}
+
+\item{as_data_frame}{Should the function return a \code{data.frame} (default) or
+an Arrow \link{Table}?}
+
+\item{schema}{\link{Schema} that describes the table.}
+
+\item{...}{Additional options passed to \code{JsonTableReader$create()}}
+}
+\value{
+A \code{data.frame}, or a Table if \code{as_data_frame = FALSE}.
+}
+\description{
+Read a JSON file using \link{JsonTableReader}.
+}
+\examples{
+\dontshow{if (arrow_with_json()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf <- tempfile()
+on.exit(unlink(tf))
+writeLines('
+ { "hello": 3.5, "world": false, "yo": "thing" }
+ { "hello": 3.25, "world": null }
+ { "hello": 0.0, "world": true, "yo": null }
+ ', tf, useBytes = TRUE)
+df <- read_json_arrow(tf)
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/read_message.Rd b/src/arrow/r/man/read_message.Rd
new file mode 100644
index 000000000..444c76c86
--- /dev/null
+++ b/src/arrow/r/man/read_message.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/message.R
+\name{read_message}
+\alias{read_message}
+\title{Read a Message from a stream}
+\usage{
+read_message(stream)
+}
+\arguments{
+\item{stream}{an InputStream}
+}
+\description{
+Read a Message from a stream
+}
diff --git a/src/arrow/r/man/read_parquet.Rd b/src/arrow/r/man/read_parquet.Rd
new file mode 100644
index 000000000..056e86447
--- /dev/null
+++ b/src/arrow/r/man/read_parquet.Rd
@@ -0,0 +1,50 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/parquet.R
+\name{read_parquet}
+\alias{read_parquet}
+\title{Read a Parquet file}
+\usage{
+read_parquet(
+ file,
+ col_select = NULL,
+ as_data_frame = TRUE,
+ props = ParquetArrowReaderProperties$create(),
+ ...
+)
+}
+\arguments{
+\item{file}{A character file name or URI, \code{raw} vector, an Arrow input stream,
+or a \code{FileSystem} with path (\code{SubTreeFileSystem}).
+If a file name or URI, an Arrow \link{InputStream} will be opened and
+closed when finished. If an input stream is provided, it will be left
+open.}
+
+\item{col_select}{A character vector of column names to keep, as in the
+"select" argument to \code{data.table::fread()}, or a
+\link[tidyselect:vars_select]{tidy selection specification}
+of columns, as used in \code{dplyr::select()}.}
+
+\item{as_data_frame}{Should the function return a \code{data.frame} (default) or
+an Arrow \link{Table}?}
+
+\item{props}{\link{ParquetArrowReaderProperties}}
+
+\item{...}{Additional arguments passed to \code{ParquetFileReader$create()}}
+}
+\value{
+A \link[=Table]{arrow::Table}, or a \code{data.frame} if \code{as_data_frame} is
+\code{TRUE} (the default).
+}
+\description{
+'\href{https://parquet.apache.org/}{Parquet}' is a columnar storage file format.
+This function enables you to read Parquet files into R.
+}
+\examples{
+\dontshow{if (arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf <- tempfile()
+on.exit(unlink(tf))
+write_parquet(mtcars, tf)
+df <- read_parquet(tf, col_select = starts_with("d"))
+head(df)
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/read_schema.Rd b/src/arrow/r/man/read_schema.Rd
new file mode 100644
index 000000000..8738b8aeb
--- /dev/null
+++ b/src/arrow/r/man/read_schema.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/schema.R
+\name{read_schema}
+\alias{read_schema}
+\title{Read a Schema from a stream}
+\usage{
+read_schema(stream, ...)
+}
+\arguments{
+\item{stream}{a \code{Message}, \code{InputStream}, or \code{Buffer}}
+
+\item{...}{currently ignored}
+}
+\value{
+A \link{Schema}
+}
+\description{
+Read a Schema from a stream
+}
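+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# A minimal sketch, assuming Schema$serialize() is available in this build:
+# round-trip a schema through its serialized buffer
+sch <- schema(a = int32(), b = utf8())
+read_schema(sch$serialize())
+\dontshow{\}) # examplesIf}
+}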
diff --git a/src/arrow/r/man/recycle_scalars.Rd b/src/arrow/r/man/recycle_scalars.Rd
new file mode 100644
index 000000000..3d97ecfd7
--- /dev/null
+++ b/src/arrow/r/man/recycle_scalars.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/util.R
+\name{recycle_scalars}
+\alias{recycle_scalars}
+\title{Recycle scalar values in a list of arrays}
+\usage{
+recycle_scalars(arrays)
+}
+\arguments{
+\item{arrays}{List of arrays}
+}
+\value{
+List of arrays with any vector/Scalar/Array/ChunkedArray values of length 1 recycled
+}
+\description{
+Recycle scalar values in a list of arrays
+}
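+\examples{
+# Internal helper; a minimal sketch using ::: for illustration only.
+# The length-1 array is recycled to match the length-4 array.
+arrays <- list(Array$create(1:4), Array$create(42))
+arrow:::recycle_scalars(arrays)
+}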
+\keyword{internal}
diff --git a/src/arrow/r/man/reexports.Rd b/src/arrow/r/man/reexports.Rd
new file mode 100644
index 000000000..591158c72
--- /dev/null
+++ b/src/arrow/r/man/reexports.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/reexports-bit64.R, R/reexports-tidyselect.R
+\docType{import}
+\name{reexports}
+\alias{reexports}
+\alias{print.integer64}
+\alias{str.integer64}
+\alias{contains}
+\alias{select_helpers}
+\alias{ends_with}
+\alias{everything}
+\alias{matches}
+\alias{num_range}
+\alias{one_of}
+\alias{starts_with}
+\alias{last_col}
+\alias{all_of}
+\title{Objects exported from other packages}
+\keyword{internal}
+\description{
+These objects are imported from other packages. Follow the links
+below to see their documentation.
+
+\describe{
+ \item{bit64}{\code{\link[bit64:bit64-package]{print.integer64}}, \code{\link[bit64:bit64-package]{str.integer64}}}
+
+ \item{tidyselect}{\code{\link[tidyselect]{all_of}}, \code{\link[tidyselect:starts_with]{contains}}, \code{\link[tidyselect:starts_with]{ends_with}}, \code{\link[tidyselect]{everything}}, \code{\link[tidyselect:everything]{last_col}}, \code{\link[tidyselect:starts_with]{matches}}, \code{\link[tidyselect:starts_with]{num_range}}, \code{\link[tidyselect]{one_of}}, \code{\link[tidyselect]{starts_with}}}
+}}
+
diff --git a/src/arrow/r/man/repeat_value_as_array.Rd b/src/arrow/r/man/repeat_value_as_array.Rd
new file mode 100644
index 000000000..a4937326e
--- /dev/null
+++ b/src/arrow/r/man/repeat_value_as_array.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/util.R
+\name{repeat_value_as_array}
+\alias{repeat_value_as_array}
+\title{Take an object of length 1 and repeat it.}
+\usage{
+repeat_value_as_array(object, n)
+}
+\arguments{
+\item{object}{Object of length 1 to be repeated - vector, \code{Scalar}, \code{Array}, or \code{ChunkedArray}}
+
+\item{n}{Number of repetitions}
+}
+\value{
+\code{Array} of length \code{n}
+}
+\description{
+Take an object of length 1 and repeat it.
+}
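+\examples{
+# Internal helper; a minimal sketch using ::: for illustration only:
+# repeat a length-1 value into an Array of length 3
+arrow:::repeat_value_as_array(42, 3)
+}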
+\keyword{internal}
diff --git a/src/arrow/r/man/s3_bucket.Rd b/src/arrow/r/man/s3_bucket.Rd
new file mode 100644
index 000000000..95a086dea
--- /dev/null
+++ b/src/arrow/r/man/s3_bucket.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/filesystem.R
+\name{s3_bucket}
+\alias{s3_bucket}
+\title{Connect to an AWS S3 bucket}
+\usage{
+s3_bucket(bucket, ...)
+}
+\arguments{
+\item{bucket}{string S3 bucket name or path}
+
+\item{...}{Additional connection options, passed to \code{S3FileSystem$create()}}
+}
+\value{
+A \code{SubTreeFileSystem} containing an \code{S3FileSystem} and the bucket's
+relative path. Note that this function's success does not guarantee that you
+are authorized to access the bucket's contents.
+}
+\description{
+\code{s3_bucket()} is a convenience function to create an \code{S3FileSystem} object
+that automatically detects the bucket's AWS region and holds onto its
+relative path.
+}
+\examples{
+\dontshow{if (arrow_with_s3()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+bucket <- s3_bucket("ursa-labs-taxi-data")
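+# Listing the bucket's contents requires network access (and credentials
+# for private buckets), so it is only sketched here:
+# bucket$ls()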
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/to_arrow.Rd b/src/arrow/r/man/to_arrow.Rd
new file mode 100644
index 000000000..e0c31b8dc
--- /dev/null
+++ b/src/arrow/r/man/to_arrow.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/duckdb.R
+\name{to_arrow}
+\alias{to_arrow}
+\title{Create an Arrow object from others}
+\usage{
+to_arrow(.data)
+}
+\arguments{
+\item{.data}{the object to be converted}
+}
+\value{
+an \code{arrow_dplyr_query} object, to be used in dplyr pipelines.
+}
+\description{
+This can be used in pipelines that pass data back and forth between Arrow and
+other processes (like DuckDB).
+}
+\examples{
+\dontshow{if (getFromNamespace("run_duckdb_examples", "arrow")()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+library(dplyr)
+
+ds <- InMemoryDataset$create(mtcars)
+
+ds \%>\%
+ filter(mpg < 30) \%>\%
+ to_duckdb() \%>\%
+ group_by(cyl) \%>\%
+ summarize(mean_mpg = mean(mpg, na.rm = TRUE)) \%>\%
+ to_arrow() \%>\%
+ collect()
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/to_duckdb.Rd b/src/arrow/r/man/to_duckdb.Rd
new file mode 100644
index 000000000..12186d432
--- /dev/null
+++ b/src/arrow/r/man/to_duckdb.Rd
@@ -0,0 +1,56 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/duckdb.R
+\name{to_duckdb}
+\alias{to_duckdb}
+\title{Create a (virtual) DuckDB table from an Arrow object}
+\usage{
+to_duckdb(
+ .data,
+ con = arrow_duck_connection(),
+ table_name = unique_arrow_tablename(),
+ auto_disconnect = FALSE
+)
+}
+\arguments{
+\item{.data}{the Arrow object (e.g. Dataset, Table) to use for the DuckDB table}
+
+\item{con}{a DuckDB connection to use (default will create one and store it
+in \code{options("arrow_duck_con")})}
+
+\item{table_name}{a name to use in DuckDB for this object. The default is a
+unique string \code{"arrow_"} followed by numbers.}
+
+\item{auto_disconnect}{should the table be automatically cleaned up when the
+resulting object is removed (and garbage collected)? Default: \code{FALSE}}
+}
+\value{
+A \code{tbl} of the new table in DuckDB
+}
+\description{
+This will do the necessary configuration to create a (virtual) table in DuckDB
+that is backed by the given Arrow object. No data is copied or modified until
+\code{collect()} or \code{compute()} is called, or a query is run against the table.
+}
+\details{
+The result is a dbplyr-compatible object that can be used in d(b)plyr pipelines.
+
+If \code{auto_disconnect = TRUE}, the DuckDB table that is created will be configured
+to be unregistered when the \code{tbl} object is garbage collected. This is helpful
+if you don't want to have extra table objects in DuckDB after you've finished
+using them. Currently, however, this cleanup can sometimes lead to hangs if
+tables are created and deleted in quick succession, hence the default value
+of \code{FALSE}.
+}
+\examples{
+\dontshow{if (getFromNamespace("run_duckdb_examples", "arrow")()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+library(dplyr)
+
+ds <- InMemoryDataset$create(mtcars)
+
+ds \%>\%
+ filter(mpg < 30) \%>\%
+ to_duckdb() \%>\%
+ group_by(cyl) \%>\%
+ summarize(mean_mpg = mean(mpg, na.rm = TRUE))
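+
+# Sketch: pass auto_disconnect = TRUE so the virtual table is cleaned up
+# when the tbl is garbage collected (see Details for caveats)
+# to_duckdb(ds, auto_disconnect = TRUE)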
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/type.Rd b/src/arrow/r/man/type.Rd
new file mode 100644
index 000000000..d55bbe24b
--- /dev/null
+++ b/src/arrow/r/man/type.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/type.R
+\name{type}
+\alias{type}
+\title{Infer the arrow Array type from an R vector}
+\usage{
+type(x)
+}
+\arguments{
+\item{x}{an R vector}
+}
+\value{
+An arrow logical type
+}
+\description{
+Infer the arrow Array type from an R vector
+}
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+type(1:10)
+type(1L:10L)
+type(c(1, 1.5, 2))
+type(c("A", "B", "C"))
+type(mtcars)
+type(Sys.Date())
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/unify_schemas.Rd b/src/arrow/r/man/unify_schemas.Rd
new file mode 100644
index 000000000..50c80c2dd
--- /dev/null
+++ b/src/arrow/r/man/unify_schemas.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/schema.R
+\name{unify_schemas}
+\alias{unify_schemas}
+\title{Combine and harmonize schemas}
+\usage{
+unify_schemas(..., schemas = list(...))
+}
+\arguments{
+\item{...}{\link{Schema}s to unify}
+
+\item{schemas}{Alternatively, a list of schemas}
+}
+\value{
+A \code{Schema} with the union of fields contained in the inputs, or
+\code{NULL} if any of \code{schemas} is \code{NULL}
+}
+\description{
+Combine and harmonize schemas
+}
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+a <- schema(b = double(), c = bool())
+z <- schema(b = double(), k = utf8())
+unify_schemas(a, z)
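+
+# Equivalently, pass a list of schemas:
+unify_schemas(schemas = list(a, z))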
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/value_counts.Rd b/src/arrow/r/man/value_counts.Rd
new file mode 100644
index 000000000..7e64d1550
--- /dev/null
+++ b/src/arrow/r/man/value_counts.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compute.R
+\name{value_counts}
+\alias{value_counts}
+\title{\code{table} for Arrow objects}
+\usage{
+value_counts(x)
+}
+\arguments{
+\item{x}{\code{Array} or \code{ChunkedArray}}
+}
+\value{
+A \code{StructArray} containing "values" (same type as \code{x}) and
+"counts" (\code{Int64}).
+}
+\description{
+This function tabulates the values in the array and returns a table of counts.
+}
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+cyl_vals <- Array$create(mtcars$cyl)
+counts <- value_counts(cyl_vals)
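+# The result is a StructArray with fields "values" and "counts"
+counts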
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/write_csv_arrow.Rd b/src/arrow/r/man/write_csv_arrow.Rd
new file mode 100644
index 000000000..55a239ca9
--- /dev/null
+++ b/src/arrow/r/man/write_csv_arrow.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/csv.R
+\name{write_csv_arrow}
+\alias{write_csv_arrow}
+\title{Write CSV file to disk}
+\usage{
+write_csv_arrow(x, sink, include_header = TRUE, batch_size = 1024L)
+}
+\arguments{
+\item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}}
+
+\item{sink}{A string file path, URI, or \link{OutputStream}, or path in a file
+system (\code{SubTreeFileSystem})}
+
+\item{include_header}{Whether to write an initial header line with column names}
+
+\item{batch_size}{Maximum number of rows processed at a time. Default is 1024.}
+}
+\value{
+The input \code{x}, invisibly. Note that if \code{sink} is an \link{OutputStream},
+the stream will be left open.
+}
+\description{
+Write CSV file to disk
+}
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf <- tempfile()
+on.exit(unlink(tf))
+write_csv_arrow(mtcars, tf)
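+
+# Read the file back with arrow's CSV reader
+df <- read_csv_arrow(tf)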
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/write_dataset.Rd b/src/arrow/r/man/write_dataset.Rd
new file mode 100644
index 000000000..76bbaf7c7
--- /dev/null
+++ b/src/arrow/r/man/write_dataset.Rd
@@ -0,0 +1,115 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataset-write.R
+\name{write_dataset}
+\alias{write_dataset}
+\title{Write a dataset}
+\usage{
+write_dataset(
+ dataset,
+ path,
+ format = c("parquet", "feather", "arrow", "ipc", "csv"),
+ partitioning = dplyr::group_vars(dataset),
+ basename_template = paste0("part-{i}.", as.character(format)),
+ hive_style = TRUE,
+ existing_data_behavior = c("overwrite", "error", "delete_matching"),
+ ...
+)
+}
+\arguments{
+\item{dataset}{\link{Dataset}, \link{RecordBatch}, \link{Table}, \code{arrow_dplyr_query}, or
+\code{data.frame}. If an \code{arrow_dplyr_query}, the query will be evaluated and
+the result will be written. This means that you can use \code{select()},
+\code{filter()}, \code{mutate()}, etc. to transform the data before it is written,
+if you need to.}
+
+\item{path}{string path, URI, or \code{SubTreeFileSystem} referencing a directory
+to write to (directory will be created if it does not exist)}
+
+\item{format}{a string identifier of the file format. Default is to use
+"parquet" (see \link{FileFormat})}
+
+\item{partitioning}{\code{Partitioning} or a character vector of columns to
+use as partition keys (to be written as path segments). Default is to
+use the current \code{group_by()} columns.}
+
+\item{basename_template}{string template for the names of files to be written.
+Must contain \code{"{i}"}, which will be replaced with an autoincremented
+integer to generate basenames of datafiles. For example, \code{"part-{i}.feather"}
+will yield \verb{"part-0.feather", ...}.}
+
+\item{hive_style}{logical: write partition segments as Hive-style
+(\code{key1=value1/key2=value2/file.ext}) or as just bare values. Default is \code{TRUE}.}
+
+\item{existing_data_behavior}{The behavior to use when there is already data
+in the destination directory. Must be one of "overwrite", "error", or
+"delete_matching". When this is set to "overwrite" (the default), any
+new files created will overwrite existing files. When this is set to
+"error", the operation will fail if the destination directory is not
+empty. When this is set to "delete_matching", the writer will delete
+any existing partitions if data is going to be written to those partitions
+and will leave alone partitions to which no data is written.}
+
+\item{...}{additional format-specific arguments. For available Parquet
+options, see \code{\link[=write_parquet]{write_parquet()}}. The available Feather options are
+\itemize{
+\item \code{use_legacy_format} logical: write data formatted so that Arrow libraries
+versions 0.14 and lower can read it. Default is \code{FALSE}. You can also
+enable this by setting the environment variable \code{ARROW_PRE_0_15_IPC_FORMAT=1}.
+\item \code{metadata_version}: A string like "V5" or the equivalent integer indicating
+the Arrow IPC MetadataVersion. Default (NULL) will use the latest version,
+unless the environment variable \code{ARROW_PRE_1_0_METADATA_VERSION=1}, in
+which case it will be V4.
+\item \code{codec}: A \link{Codec} which will be used to compress body buffers of written
+files. Default (NULL) will not compress body buffers.
+\item \code{null_fallback}: character to be used in place of missing values (\code{NA} or
+\code{NULL}) when using Hive-style partitioning. See \code{\link[=hive_partition]{hive_partition()}}.
+}}
+}
+\value{
+The input \code{dataset}, invisibly
+}
+\description{
+This function allows you to write a dataset. By writing to more efficient
+binary storage formats, and by specifying relevant partitioning, you can
+make the dataset much faster to read and query.
+}
+\examples{
+\dontshow{if (arrow_with_dataset() & arrow_with_parquet() & requireNamespace("dplyr", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# You can write datasets partitioned by the values in a column (here: "cyl").
+# This creates a structure of the form cyl=X/part-Z.parquet.
+one_level_tree <- tempfile()
+write_dataset(mtcars, one_level_tree, partitioning = "cyl")
+list.files(one_level_tree, recursive = TRUE)
+
+# You can also partition by the values in multiple columns
+# (here: "cyl" and "gear").
+# This creates a structure of the form cyl=X/gear=Y/part-Z.parquet.
+two_levels_tree <- tempfile()
+write_dataset(mtcars, two_levels_tree, partitioning = c("cyl", "gear"))
+list.files(two_levels_tree, recursive = TRUE)
+
+# In the two previous examples we would have:
+# X = {4,6,8}, the number of cylinders.
+# Y = {3,4,5}, the number of forward gears.
+# Z = {0,1,2}, the number of saved parts, starting from 0.
+
+# You can obtain the same result as the previous examples using arrow with
+# a dplyr pipeline. This will be the same as two_levels_tree above, but the
+# output directory will be different.
+library(dplyr)
+two_levels_tree_2 <- tempfile()
+mtcars \%>\%
+ group_by(cyl, gear) \%>\%
+ write_dataset(two_levels_tree_2)
+list.files(two_levels_tree_2, recursive = TRUE)
+
+# And you can also turn off the Hive-style directory naming where the column
+# name is included with the values by using `hive_style = FALSE`.
+
+# Write a structure X/Y/part-Z.parquet.
+two_levels_tree_no_hive <- tempfile()
+mtcars \%>\%
+ group_by(cyl, gear) \%>\%
+ write_dataset(two_levels_tree_no_hive, hive_style = FALSE)
+list.files(two_levels_tree_no_hive, recursive = TRUE)
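+
+# Sketch: `existing_data_behavior` controls what happens when the target
+# directory already contains data, e.g. fail instead of overwriting:
+# write_dataset(mtcars, one_level_tree, existing_data_behavior = "error")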
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/write_feather.Rd b/src/arrow/r/man/write_feather.Rd
new file mode 100644
index 000000000..c6273b61b
--- /dev/null
+++ b/src/arrow/r/man/write_feather.Rd
@@ -0,0 +1,61 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feather.R
+\name{write_feather}
+\alias{write_feather}
+\title{Write data in the Feather format}
+\usage{
+write_feather(
+ x,
+ sink,
+ version = 2,
+ chunk_size = 65536L,
+ compression = c("default", "lz4", "uncompressed", "zstd"),
+ compression_level = NULL
+)
+}
+\arguments{
+\item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}}
+
+\item{sink}{A string file path, URI, or \link{OutputStream}, or path in a file
+system (\code{SubTreeFileSystem})}
+
+\item{version}{integer Feather file version. Version 2 is the current version;
+Version 1 is the more limited legacy format.}
+
+\item{chunk_size}{For V2 files, the number of rows that each chunk of data
+should have in the file. Use a smaller \code{chunk_size} when you need faster
+random row access. Default is 64K. This option is not supported for V1.}
+
+\item{compression}{Name of compression codec to use, if any. Default is
+"lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise
+"uncompressed". "zstd" is the other available codec and generally has better
+compression ratios in exchange for slower read and write performance.
+See \code{\link[=codec_is_available]{codec_is_available()}}. This option is not supported for V1.}
+
+\item{compression_level}{If \code{compression} is "zstd", you may
+specify an integer compression level. If omitted, the compression codec's
+default compression level is used.}
+}
+\value{
+The input \code{x}, invisibly. Note that if \code{sink} is an \link{OutputStream},
+the stream will be left open.
+}
+\description{
+Feather provides binary columnar serialization for data frames.
+It is designed to make reading and writing data frames efficient,
+and to make sharing data across data analysis languages easy.
+This function can write both the original, limited specification of the format
+and the version 2 specification, which is the Apache Arrow IPC file format.
+}
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf <- tempfile()
+on.exit(unlink(tf))
+write_feather(mtcars, tf)
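+
+# Sketch: use zstd compression when that codec is available in this build
+if (codec_is_available("zstd")) {
+  write_feather(mtcars, tf, compression = "zstd")
+}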
+\dontshow{\}) # examplesIf}
+}
+\seealso{
+\link{RecordBatchWriter} for lower-level access to writing Arrow IPC data.
+
+\link{Schema} for information about schemas and metadata handling.
+}
diff --git a/src/arrow/r/man/write_ipc_stream.Rd b/src/arrow/r/man/write_ipc_stream.Rd
new file mode 100644
index 000000000..2f215f25f
--- /dev/null
+++ b/src/arrow/r/man/write_ipc_stream.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/deprecated.R, R/ipc_stream.R
+\name{write_arrow}
+\alias{write_arrow}
+\alias{write_ipc_stream}
+\title{Write Arrow IPC stream format}
+\usage{
+write_arrow(x, sink, ...)
+
+write_ipc_stream(x, sink, ...)
+}
+\arguments{
+\item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}}
+
+\item{sink}{A string file path, URI, or \link{OutputStream}, or path in a file
+system (\code{SubTreeFileSystem})}
+
+\item{...}{extra parameters passed to \code{write_feather()}.}
+}
+\value{
+\code{x}, invisibly.
+}
+\description{
+Apache Arrow defines two formats for \href{https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc}{serializing data for interprocess communication (IPC)}:
+a "stream" format and a "file" format, known as Feather. \code{write_ipc_stream()}
+and \code{\link[=write_feather]{write_feather()}} write those formats, respectively.
+}
+\details{
+\code{write_arrow()}, a wrapper around \code{write_ipc_stream()} and \code{write_feather()}
+with some nonstandard behavior, is deprecated. You should explicitly choose
+the function that will write the desired IPC format (stream or file) since
+either can be written to a file or \code{OutputStream}.
+}
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf <- tempfile()
+on.exit(unlink(tf))
+write_ipc_stream(mtcars, tf)
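+
+# Read the stream back into a data.frame
+df <- read_ipc_stream(tf)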
+\dontshow{\}) # examplesIf}
+}
+\seealso{
+\code{\link[=write_feather]{write_feather()}} for writing IPC files. \code{\link[=write_to_raw]{write_to_raw()}} to
+serialize data to a buffer.
+\link{RecordBatchWriter} for a lower-level interface.
+}
diff --git a/src/arrow/r/man/write_parquet.Rd b/src/arrow/r/man/write_parquet.Rd
new file mode 100644
index 000000000..d7147f7e8
--- /dev/null
+++ b/src/arrow/r/man/write_parquet.Rd
@@ -0,0 +1,108 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/parquet.R
+\name{write_parquet}
+\alias{write_parquet}
+\title{Write Parquet file to disk}
+\usage{
+write_parquet(
+ x,
+ sink,
+ chunk_size = NULL,
+ version = NULL,
+ compression = default_parquet_compression(),
+ compression_level = NULL,
+ use_dictionary = NULL,
+ write_statistics = NULL,
+ data_page_size = NULL,
+ use_deprecated_int96_timestamps = FALSE,
+ coerce_timestamps = NULL,
+ allow_truncated_timestamps = FALSE,
+ properties = NULL,
+ arrow_properties = NULL
+)
+}
+\arguments{
+\item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}}
+
+\item{sink}{A string file path, URI, or \link{OutputStream}, or path in a file
+system (\code{SubTreeFileSystem})}
+
+\item{chunk_size}{chunk size in number of rows. If NULL, the total number of rows is used.}
+
+\item{version}{parquet version, "1.0" or "2.0". Default "1.0". Numeric values
+are coerced to character.}
+
+\item{compression}{compression algorithm. Default "snappy". See details.}
+
+\item{compression_level}{compression level. Meaning depends on compression algorithm}
+
+\item{use_dictionary}{Specify if we should use dictionary encoding. Default \code{TRUE}}
+
+\item{write_statistics}{Specify if we should write statistics. Default \code{TRUE}}
+
+\item{data_page_size}{Set a target threshold for the approximate encoded
+size of data pages within a column chunk (in bytes). Default 1 MiB.}
+
+\item{use_deprecated_int96_timestamps}{Write timestamps to INT96 Parquet format. Default \code{FALSE}.}
+
+\item{coerce_timestamps}{Cast timestamps to a particular resolution. Can be
+\code{NULL}, "ms" or "us". Default \code{NULL} (no casting).}
+
+\item{allow_truncated_timestamps}{Allow loss of data when coercing timestamps to a
+particular resolution. For example, if microsecond or nanosecond data is lost when
+coercing to "ms", do not raise an exception.}
+
+\item{properties}{A \code{ParquetWriterProperties} object, used instead of the options
+enumerated in this function's signature. Providing \code{properties} as an argument
+is deprecated; if you need to assemble \code{ParquetWriterProperties} outside
+of \code{write_parquet()}, use \code{ParquetFileWriter} instead.}
+
+\item{arrow_properties}{A \code{ParquetArrowWriterProperties} object. Like
+\code{properties}, this argument is deprecated.}
+}
+\value{
+The input \code{x}, invisibly.
+}
+\description{
+\href{https://parquet.apache.org/}{Parquet} is a columnar storage file format.
+This function enables you to write Parquet files from R.
+}
+\details{
+Due to features of the format, Parquet files cannot be appended to.
+If you want to use the Parquet format but also want the ability to extend
+your dataset, you can write to additional Parquet files and then treat
+the whole directory of files as a \link{Dataset} you can query.
+See \code{vignette("dataset", package = "arrow")} for examples of this.
+
+The parameters \code{compression}, \code{compression_level}, \code{use_dictionary} and
+\code{write_statistics} support various patterns:
+\itemize{
+\item The default \code{NULL} leaves the parameter unspecified, and the C++ library
+uses an appropriate default for each column (defaults listed above)
+\item A single, unnamed value (e.g. a single string for \code{compression}) applies to all columns
+\item An unnamed vector, of the same size as the number of columns, to specify a
+value for each column, in positional order
+\item A named vector, to specify the value for the named columns; the default
+value for the setting is used for columns that are not named
+}
+
+The \code{compression} argument can be any of the following (case insensitive):
+"uncompressed", "snappy", "gzip", "brotli", "zstd", "lz4", "lzo" or "bz2".
+Only "uncompressed" is guaranteed to be available, but "snappy" and "gzip"
+are almost always included. See \code{\link[=codec_is_available]{codec_is_available()}}.
+The default "snappy" is used if available, otherwise "uncompressed". To
+disable compression, set \code{compression = "uncompressed"}.
+Note that "uncompressed" columns may still have dictionary encoding.
+}
+\examples{
+\dontshow{if (arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf1 <- tempfile(fileext = ".parquet")
+write_parquet(data.frame(x = 1:5), tf1)
+
+# using compression
+if (codec_is_available("gzip")) {
+ tf2 <- tempfile(fileext = ".gz.parquet")
+ write_parquet(data.frame(x = 1:5), tf2, compression = "gzip", compression_level = 5)
+}
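+
+# Sketch: per-column settings via a named vector (see Details); columns
+# not named fall back to the default
+# write_parquet(data.frame(x = 1:5, y = letters[1:5]), tf1,
+#   compression = c(x = "gzip"))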
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/man/write_to_raw.Rd b/src/arrow/r/man/write_to_raw.Rd
new file mode 100644
index 000000000..a3c6e324b
--- /dev/null
+++ b/src/arrow/r/man/write_to_raw.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ipc_stream.R
+\name{write_to_raw}
+\alias{write_to_raw}
+\title{Write Arrow data to a raw vector}
+\usage{
+write_to_raw(x, format = c("stream", "file"))
+}
+\arguments{
+\item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}}
+
+\item{format}{one of \code{c("stream", "file")}, indicating the IPC format to use}
+}
+\value{
+A \code{raw} vector containing the bytes of the IPC serialized data.
+}
+\description{
+\code{\link[=write_ipc_stream]{write_ipc_stream()}} and \code{\link[=write_feather]{write_feather()}} write data to a sink and return
+the data (\code{data.frame}, \code{RecordBatch}, or \code{Table}) they were given.
+This function wraps those so that you can serialize data to a buffer and
+access that buffer as a \code{raw} vector in R.
+}
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# The default format is "stream"
+mtcars_raw <- write_to_raw(mtcars)
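+
+# The raw vector can be read back with the IPC stream reader
+read_ipc_stream(mtcars_raw)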
+\dontshow{\}) # examplesIf}
+}
diff --git a/src/arrow/r/pkgdown/extra.js b/src/arrow/r/pkgdown/extra.js
new file mode 100644
index 000000000..aca15c566
--- /dev/null
+++ b/src/arrow/r/pkgdown/extra.js
@@ -0,0 +1,65 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+(function () {
+ // Load the rmarkdown tabset script
+ var script = document.createElement("script");
+ script.type = "text/javascript";
+ script.async = true;
+ script.src =
+ "https://cdn.jsdelivr.net/gh/rstudio/rmarkdown@47d837d3d9cd5e8e212b05767454f058db7d2789/inst/rmd/h/navigation-1.1/tabsets.js";
+ script.integrity = "sha256-Rs54TE1FCN1uLM4f7VQEMiRTl1Ia7TiQLkMruItwV+Q=";
+ script.crossOrigin = "anonymous";
+
+ // Run the processing as the onload callback
+ script.onload = () => {
+ // Monkey patch the .html method to use the .text method
+ $(document).ready(function () {
+ (function ($) {
+ $.fn.html = function (content) {
+ return this.text();
+ };
+ })(jQuery);
+
+ window.buildTabsets("toc");
+ });
+
+ $(document).ready(function () {
+ $(".tabset-dropdown > .nav-tabs > li").click(function () {
+ $(this).parent().toggleClass("nav-tabs-open");
+ });
+ });
+
+ $(document).ready(function () {
+ /**
+ * The tabset creation above sometimes relies on empty headers to stop the
+ * tabbing. Though they shouldn't be included in the TOC in the first place,
+ * this will remove empty headers from the TOC after it's created.
+ */
+
+ // find all the empty <a> elements and remove them (and their parents)
+ var empty_a = $("#toc").find("a").filter(":empty");
+ empty_a.parent().remove();
+
+ // now find any empty <ul>s and remove them too
+ var empty_ul = $("#toc").find("ul").filter(":empty");
+ empty_ul.remove();
+ });
+ };
+
+ document.head.appendChild(script);
+})();
diff --git a/src/arrow/r/src/.clang-format b/src/arrow/r/src/.clang-format
new file mode 100644
index 000000000..06453dfbb
--- /dev/null
+++ b/src/arrow/r/src/.clang-format
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+---
+BasedOnStyle: Google
+DerivePointerAlignment: false
+ColumnLimit: 90
diff --git a/src/arrow/r/src/.gitignore b/src/arrow/r/src/.gitignore
new file mode 100644
index 000000000..22034c461
--- /dev/null
+++ b/src/arrow/r/src/.gitignore
@@ -0,0 +1,3 @@
+*.o
+*.so
+*.dll
diff --git a/src/arrow/r/src/Makevars.in b/src/arrow/r/src/Makevars.in
new file mode 100644
index 000000000..5c7764d9c
--- /dev/null
+++ b/src/arrow/r/src/Makevars.in
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# when cpp11 is vendored:
+PKG_CPPFLAGS=@cflags@ -I../inst/include/
+
+# when it is not:
+# PKG_CPPFLAGS=@cflags@
+
+# `-fvisibility=hidden` does not play well with UBSAN:
+# https://bugs.llvm.org/show_bug.cgi?id=39191
+# https://www.mail-archive.com/gcc-bugs@gcc.gnu.org/msg534862.html
+# PKG_CXXFLAGS=$(CXX_VISIBILITY)
+CXX_STD=CXX11
+PKG_LIBS=@libs@
diff --git a/src/arrow/r/src/Makevars.ucrt b/src/arrow/r/src/Makevars.ucrt
new file mode 100644
index 000000000..52488eb2b
--- /dev/null
+++ b/src/arrow/r/src/Makevars.ucrt
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+CRT=-ucrt
+include Makevars.win
diff --git a/src/arrow/r/src/RTasks.cpp b/src/arrow/r/src/RTasks.cpp
new file mode 100644
index 000000000..25bd944cc
--- /dev/null
+++ b/src/arrow/r/src/RTasks.cpp
@@ -0,0 +1,74 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./r_task_group.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+
+namespace arrow {
+namespace r {
+
+RTasks::RTasks(bool use_threads)
+ : use_threads_(use_threads),
+ stop_source_(),
+ parallel_tasks_(use_threads
+ ? arrow::internal::TaskGroup::MakeThreaded(
+ arrow::internal::GetCpuThreadPool(), stop_source_.token())
+ : nullptr) {}
+
+Status RTasks::Finish() {
+ Status status = Status::OK();
+
+ // run the delayed tasks now
+ for (auto& task : delayed_serial_tasks_) {
+ status &= std::move(task)();
+ if (!status.ok()) {
+ stop_source_.RequestStop();
+ break;
+ }
+ }
+
+ // then wait for the parallel tasks to finish
+ if (use_threads_) {
+ status &= parallel_tasks_->Finish();
+ }
+
+ return status;
+}
+
+void RTasks::Append(bool parallel, RTasks::Task&& task) {
+ if (parallel && use_threads_) {
+ parallel_tasks_->Append(std::move(task));
+ } else {
+ delayed_serial_tasks_.push_back(std::move(task));
+ }
+}
+
+void RTasks::Reset() {
+ delayed_serial_tasks_.clear();
+
+ stop_source_.Reset();
+ if (use_threads_) {
+ parallel_tasks_ = arrow::internal::TaskGroup::MakeThreaded(
+ arrow::internal::GetCpuThreadPool(), stop_source_.token());
+ }
+}
+
+} // namespace r
+} // namespace arrow
+
+#endif
diff --git a/src/arrow/r/src/altrep.cpp b/src/arrow/r/src/altrep.cpp
new file mode 100644
index 000000000..81407be3e
--- /dev/null
+++ b/src/arrow/r/src/altrep.cpp
@@ -0,0 +1,690 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+
+#include <arrow/array.h>
+#include <arrow/chunked_array.h>
+#include <arrow/compute/api.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include <cpp11/altrep.hpp>
+#include <cpp11/declarations.hpp>
+#if defined(HAS_ALTREP)
+
+#if R_VERSION < R_Version(3, 6, 0)
+
+// workaround because R's <R_ext/Altrep.h> not so conveniently uses `class`
+// as a variable name, and C++ is not happy about that
+//
+// SEXP R_new_altrep(R_altrep_class_t class, SEXP data1, SEXP data2);
+//
+#define class klass
+
+// Because functions declared in <R_ext/Altrep.h> have C linkage
+extern "C" {
+#include <R_ext/Altrep.h>
+}
+
+// undo the workaround
+#undef class
+
+#else
+#include <R_ext/Altrep.h>
+#endif
+
+#include "./r_task_group.h"
+
+namespace arrow {
+namespace r {
+namespace altrep {
+
+namespace {
+template <typename c_type>
+R_xlen_t Standard_Get_region(SEXP data2, R_xlen_t i, R_xlen_t n, c_type* buf);
+
+template <>
+R_xlen_t Standard_Get_region<double>(SEXP data2, R_xlen_t i, R_xlen_t n, double* buf) {
+ return REAL_GET_REGION(data2, i, n, buf);
+}
+
+template <>
+R_xlen_t Standard_Get_region<int>(SEXP data2, R_xlen_t i, R_xlen_t n, int* buf) {
+ return INTEGER_GET_REGION(data2, i, n, buf);
+}
+
+void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
+using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
+
+// the Array that is being wrapped by the altrep object
+static const std::shared_ptr<Array>& GetArray(SEXP alt) {
+ return *Pointer(R_altrep_data1(alt));
+}
+
+// base class for all altrep vectors
+//
+// data1: the Array as an external pointer.
+// data2: starts as NULL, and becomes a standard R vector with the same
+//        data when materialization is needed, e.g. when we need to access
+//        the data pointer with DATAPTR().
+template <typename Impl>
+struct AltrepVectorBase {
+ // store the Array as an external pointer in data1, mark as immutable
+ static SEXP Make(const std::shared_ptr<Array>& array) {
+ SEXP alt = R_new_altrep(Impl::class_t, Pointer(new std::shared_ptr<Array>(array)),
+ R_NilValue);
+ MARK_NOT_MUTABLE(alt);
+
+ return alt;
+ }
+
+ // Is the vector materialized, i.e. does the data2 slot contain a
+ // standard R vector with the same data as the array.
+ static bool IsMaterialized(SEXP alt) { return !Rf_isNull(R_altrep_data2(alt)); }
+
+ static R_xlen_t Length(SEXP alt) { return GetArray(alt)->length(); }
+
+ static int No_NA(SEXP alt) { return GetArray(alt)->null_count() == 0; }
+
+ static int Is_sorted(SEXP alt) { return UNKNOWN_SORTEDNESS; }
+
+ // What gets printed on .Internal(inspect(<the altrep object>))
+ static Rboolean Inspect(SEXP alt, int pre, int deep, int pvec,
+ void (*inspect_subtree)(SEXP, int, int, int)) {
+ const auto& array = GetArray(alt);
+    // note: length() and null_count() return int64_t, which does not match
+    // the %d format, so cast to int for printing
+    Rprintf("arrow::Array<%s, %d nulls> len=%d, Array=<%p>\n",
+            array->type()->ToString().c_str(), static_cast<int>(array->null_count()),
+            static_cast<int>(array->length()), array.get());
+ return TRUE;
+ }
+
+  // Duplication is done by first materializing the vector and
+  // then making a lazy duplicate of data2
+ static SEXP Duplicate(SEXP alt, Rboolean /* deep */) {
+ return Rf_lazy_duplicate(Impl::Materialize(alt));
+ }
+
+ static SEXP Coerce(SEXP alt, int type) {
+ return Rf_coerceVector(Impl::Materialize(alt), type);
+ }
+
+ static SEXP Serialized_state(SEXP alt) { return Impl::Materialize(alt); }
+
+ static SEXP Unserialize(SEXP /* class_ */, SEXP state) { return state; }
+};
+
+// altrep R vector shadowing a primitive (int or double) Array.
+//
+// This tries as much as possible to directly use the data
+// from the Array and minimize data copies.
+template <int sexp_type>
+struct AltrepVectorPrimitive : public AltrepVectorBase<AltrepVectorPrimitive<sexp_type>> {
+ using Base = AltrepVectorBase<AltrepVectorPrimitive<sexp_type>>;
+
+ // singleton altrep class description
+ static R_altrep_class_t class_t;
+
+ using c_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+
+ // Force materialization. After calling this, the data2 slot of the altrep
+ // object contains a standard R vector with the same data, with
+ // R sentinels where the Array has nulls.
+ //
+ // The Array remains available so that it can be used by Length(), Min(), etc ...
+ static SEXP Materialize(SEXP alt) {
+ if (!Base::IsMaterialized(alt)) {
+ auto size = Base::Length(alt);
+
+ // create a standard R vector
+ SEXP copy = PROTECT(Rf_allocVector(sexp_type, size));
+
+ // copy the data from the array, through Get_region
+ Get_region(alt, 0, size, reinterpret_cast<c_type*>(DATAPTR(copy)));
+
+ // store as data2, this is now considered materialized
+ R_set_altrep_data2(alt, copy);
+ MARK_NOT_MUTABLE(copy);
+
+ UNPROTECT(1);
+ }
+ return R_altrep_data2(alt);
+ }
+
+ // R calls this to get a pointer to the start of the vector data
+ // but only if this is possible without allocating (in the R sense).
+ static const void* Dataptr_or_null(SEXP alt) {
+ // data2 has been created, and so the R sentinels are in place where the array has
+ // nulls
+ if (Base::IsMaterialized(alt)) {
+ return DATAPTR_RO(R_altrep_data2(alt));
+ }
+
+ // the Array has no nulls, we can directly return the start of its data
+ const auto& array = GetArray(alt);
+ if (array->null_count() == 0) {
+ return reinterpret_cast<const void*>(array->data()->template GetValues<c_type>(1));
+ }
+
+ // Otherwise: if the array has nulls and data2 has not been generated: give up
+ return nullptr;
+ }
+
+ // R calls this to get a pointer to the start of the data, R allocations are allowed.
+ static void* Dataptr(SEXP alt, Rboolean writeable) {
+ // If the object hasn't been materialized, and the array has no
+ // nulls we can directly point to the array data.
+ if (!Base::IsMaterialized(alt)) {
+ const auto& array = GetArray(alt);
+
+ if (array->null_count() == 0) {
+ return reinterpret_cast<void*>(
+ const_cast<c_type*>(array->data()->template GetValues<c_type>(1)));
+ }
+ }
+
+ // Otherwise we have to materialize and hand the pointer to data2
+ //
+ // NOTE: this returns the DATAPTR() of data2 even in the case writeable = TRUE
+ //
+    // which is risky because C(++) clients of this object might
+    // modify data2, and therefore make it diverge from the data of the Array,
+    // but the object was marked as immutable on creation, so doing this
+    // disregards the R API.
+    //
+    // Simply calling stop() when `writeable = TRUE` would be too strong, e.g. it
+    // fails identical(), which calls DATAPTR() even though DATAPTR_RO() would
+    // be enough
+ return DATAPTR(Materialize(alt));
+ }
+
+ // The value at position i
+ static c_type Elt(SEXP alt, R_xlen_t i) {
+ const auto& array = GetArray(alt);
+ return array->IsNull(i) ? cpp11::na<c_type>()
+ : array->data()->template GetValues<c_type>(1)[i];
+ }
+
+  // R calls this when it wants data from position `i` to `i + n` copied into `buf`.
+  // The returned value is the number of values that were actually copied
+  // (this can be lower than n)
+ static R_xlen_t Get_region(SEXP alt, R_xlen_t i, R_xlen_t n, c_type* buf) {
+ // If we have data2, we can just copy the region into buf
+ // using the standard Get_region for this R type
+ if (Base::IsMaterialized(alt)) {
+ return Standard_Get_region<c_type>(R_altrep_data2(alt), i, n, buf);
+ }
+
+ // The vector was not materialized, aka we don't have data2
+ //
+ // In that case, we copy the data from the Array, and then
+ // do a second pass to force the R sentinels for where the
+ // array has nulls
+ //
+    // This only materializes the region into buf, not the entire vector.
+ auto slice = GetArray(alt)->Slice(i, n);
+ R_xlen_t ncopy = slice->length();
+
+ // first copy the data buffer
+ memcpy(buf, slice->data()->template GetValues<c_type>(1), ncopy * sizeof(c_type));
+
+ // then set the R NA sentinels if needed
+ if (slice->null_count() > 0) {
+ internal::BitmapReader bitmap_reader(slice->null_bitmap()->data(), slice->offset(),
+ ncopy);
+
+ for (R_xlen_t j = 0; j < ncopy; j++, bitmap_reader.Next()) {
+ if (bitmap_reader.IsNotSet()) {
+ buf[j] = cpp11::na<c_type>();
+ }
+ }
+ }
+
+ return ncopy;
+ }
+
+ static std::shared_ptr<arrow::compute::ScalarAggregateOptions> NaRmOptions(
+ const std::shared_ptr<Array>& array, bool na_rm) {
+ auto options = std::make_shared<arrow::compute::ScalarAggregateOptions>(
+ arrow::compute::ScalarAggregateOptions::Defaults());
+ options->min_count = 0;
+ options->skip_nulls = na_rm;
+ return options;
+ }
+
+ template <bool Min>
+ static SEXP MinMax(SEXP alt, Rboolean narm) {
+ using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+ using scalar_type =
+ typename std::conditional<sexp_type == INTSXP, Int32Scalar, DoubleScalar>::type;
+
+ const auto& array = GetArray(alt);
+ bool na_rm = narm == TRUE;
+ auto n = array->length();
+ auto null_count = array->null_count();
+ if ((na_rm || n == 0) && null_count == n) {
+ return Rf_ScalarReal(Min ? R_PosInf : R_NegInf);
+ }
+ if (!na_rm && null_count > 0) {
+ return cpp11::as_sexp(cpp11::na<data_type>());
+ }
+
+ auto options = NaRmOptions(array, na_rm);
+
+ const auto& minmax =
+ ValueOrStop(arrow::compute::CallFunction("min_max", {array}, options.get()));
+ const auto& minmax_scalar =
+ internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+ const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+ *ValueOrStop(minmax_scalar.field(Min ? "min" : "max")));
+ return cpp11::as_sexp(result_scalar.value);
+ }
+
+ static SEXP Min(SEXP alt, Rboolean narm) { return MinMax<true>(alt, narm); }
+
+ static SEXP Max(SEXP alt, Rboolean narm) { return MinMax<false>(alt, narm); }
+
+ static SEXP Sum(SEXP alt, Rboolean narm) {
+ using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+
+ const auto& array = GetArray(alt);
+ bool na_rm = narm == TRUE;
+ auto null_count = array->null_count();
+
+ if (!na_rm && null_count > 0) {
+ return cpp11::as_sexp(cpp11::na<data_type>());
+ }
+ auto options = NaRmOptions(array, na_rm);
+
+ const auto& sum =
+ ValueOrStop(arrow::compute::CallFunction("sum", {array}, options.get()));
+
+ if (sexp_type == INTSXP) {
+      // When calling the "sum" function on an int32 array, we get an Int64 scalar;
+      // in case of overflow, make it a double, like R does
+ int64_t value = internal::checked_cast<const Int64Scalar&>(*sum.scalar()).value;
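+      // note: INT32_MIN is R's NA_INTEGER sentinel, so it must also become double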
+ if (value <= INT32_MIN || value > INT32_MAX) {
+ return Rf_ScalarReal(static_cast<double>(value));
+ } else {
+ return Rf_ScalarInteger(static_cast<int>(value));
+ }
+ } else {
+ return Rf_ScalarReal(
+ internal::checked_cast<const DoubleScalar&>(*sum.scalar()).value);
+ }
+ }
+};
+template <int sexp_type>
+R_altrep_class_t AltrepVectorPrimitive<sexp_type>::class_t;
+
+// Implementation for string arrays
+template <typename Type>
+struct AltrepVectorString : public AltrepVectorBase<AltrepVectorString<Type>> {
+ using Base = AltrepVectorBase<AltrepVectorString<Type>>;
+
+ static R_altrep_class_t class_t;
+ using StringArrayType = typename TypeTraits<Type>::ArrayType;
+
+ // Helper class to convert to R strings
+ struct RStringViewer {
+ explicit RStringViewer(const std::shared_ptr<Array>& array)
+ : array_(array),
+ string_array_(internal::checked_cast<const StringArrayType*>(array.get())),
+ strip_out_nuls_(GetBoolOption("arrow.skip_nul", false)),
+ nul_was_stripped_(false) {}
+
+ // convert the i'th string of the Array to an R string (CHARSXP)
+ SEXP Convert(size_t i) {
+ if (array_->IsNull(i)) {
+ return NA_STRING;
+ }
+
+ view_ = string_array_->GetView(i);
+ bool no_nul = std::find(view_.begin(), view_.end(), '\0') == view_.end();
+
+ if (no_nul) {
+ return Rf_mkCharLenCE(view_.data(), view_.size(), CE_UTF8);
+ } else if (strip_out_nuls_) {
+ return ConvertStripNul();
+ } else {
+ Error();
+
+ // not reached
+ return R_NilValue;
+ }
+ }
+
+ // strip the nuls and then convert to R string
+ SEXP ConvertStripNul() {
+ const char* old_string = view_.data();
+
+ size_t stripped_len = 0, nul_count = 0;
+
+ for (size_t i = 0; i < view_.size(); i++) {
+ if (old_string[i] == '\0') {
+ ++nul_count;
+
+ if (nul_count == 1) {
+ // first nul spotted: allocate stripped string storage
+ stripped_string_.assign(view_.begin(), view_.end());
+ stripped_len = i;
+ }
+
+ // don't copy old_string[i] (which is \0) into stripped_string
+ continue;
+ }
+
+ if (nul_count > 0) {
+ stripped_string_[stripped_len++] = old_string[i];
+ }
+ }
+
+ nul_was_stripped_ = true;
+ return Rf_mkCharLenCE(stripped_string_.data(), stripped_len, CE_UTF8);
+ }
+
+ bool nul_was_stripped() const { return nul_was_stripped_; }
+
+ // throw R error about embedded nul
+ void Error() {
+ stripped_string_ = "embedded nul in string: '";
+ for (char c : view_) {
+ if (c) {
+ stripped_string_ += c;
+ } else {
+ stripped_string_ += "\\0";
+ }
+ }
+
+ stripped_string_ +=
+ "'; to strip nuls when converting from Arrow to R, set options(arrow.skip_nul "
+ "= TRUE)";
+
+      // use "%s" so that '%' characters in the message are not interpreted
+      // as format specifiers
+      Rf_error("%s", stripped_string_.c_str());
+ }
+
+ const std::shared_ptr<Array>& array_;
+ const StringArrayType* string_array_;
+ std::string stripped_string_;
+ const bool strip_out_nuls_;
+ bool nul_was_stripped_;
+ util::string_view view_;
+ };
+
+ // Get a single string, as a CHARSXP SEXP,
+ // either from data2 or directly from the Array
+ static SEXP Elt(SEXP alt, R_xlen_t i) {
+ if (Base::IsMaterialized(alt)) {
+ return STRING_ELT(R_altrep_data2(alt), i);
+ }
+
+ BEGIN_CPP11
+
+ const auto& array = GetArray(alt);
+ RStringViewer r_string_viewer(array);
+
+ // r_string_viewer.Convert(i) might jump so it's wrapped
+ // in cpp11::unwind_protect() so that string_viewer
+ // can be properly destructed before the unwinding continues
+ SEXP s = NA_STRING;
+ cpp11::unwind_protect([&]() {
+ s = r_string_viewer.Convert(i);
+ if (r_string_viewer.nul_was_stripped()) {
+ cpp11::warning("Stripping '\\0' (nul) from character vector");
+ }
+ });
+ return s;
+
+ END_CPP11
+ }
+
+ static void* Dataptr(SEXP alt, Rboolean writeable) { return DATAPTR(Materialize(alt)); }
+
+ static SEXP Materialize(SEXP alt) {
+ if (Base::IsMaterialized(alt)) {
+ return R_altrep_data2(alt);
+ }
+
+ BEGIN_CPP11
+
+ const auto& array = GetArray(alt);
+ R_xlen_t n = array->length();
+ SEXP data2 = PROTECT(Rf_allocVector(STRSXP, n));
+ MARK_NOT_MUTABLE(data2);
+
+ RStringViewer r_string_viewer(array);
+
+ // r_string_viewer.Convert(i) might jump so we have to
+ // wrap it in unwind_protect() to:
+ // - correctly destruct the C++ objects
+ // - resume the unwinding
+ cpp11::unwind_protect([&]() {
+ for (R_xlen_t i = 0; i < n; i++) {
+ SET_STRING_ELT(data2, i, r_string_viewer.Convert(i));
+ }
+
+ if (r_string_viewer.nul_was_stripped()) {
+ cpp11::warning("Stripping '\\0' (nul) from character vector");
+ }
+ });
+
+ // only set to data2 if all the values have been converted
+ R_set_altrep_data2(alt, data2);
+ UNPROTECT(1); // data2
+
+ return data2;
+
+ END_CPP11
+ }
+
+ static const void* Dataptr_or_null(SEXP alt) {
+ if (Base::IsMaterialized(alt)) return DATAPTR(R_altrep_data2(alt));
+
+ // otherwise give up
+ return nullptr;
+ }
+
+ static void Set_elt(SEXP alt, R_xlen_t i, SEXP v) {
+ Rf_error("ALTSTRING objects of type <arrow::array_string_vector> are immutable");
+ }
+};
+
+template <typename Type>
+R_altrep_class_t AltrepVectorString<Type>::class_t;
+
+// initialize altrep, altvec, altreal, and altinteger methods
+template <typename AltrepClass>
+void InitAltrepMethods(R_altrep_class_t class_t, DllInfo* dll) {
+ R_set_altrep_Length_method(class_t, AltrepClass::Length);
+ R_set_altrep_Inspect_method(class_t, AltrepClass::Inspect);
+ R_set_altrep_Duplicate_method(class_t, AltrepClass::Duplicate);
+ R_set_altrep_Serialized_state_method(class_t, AltrepClass::Serialized_state);
+ R_set_altrep_Unserialize_method(class_t, AltrepClass::Unserialize);
+ R_set_altrep_Coerce_method(class_t, AltrepClass::Coerce);
+}
+
+template <typename AltrepClass>
+void InitAltvecMethods(R_altrep_class_t class_t, DllInfo* dll) {
+ R_set_altvec_Dataptr_method(class_t, AltrepClass::Dataptr);
+ R_set_altvec_Dataptr_or_null_method(class_t, AltrepClass::Dataptr_or_null);
+}
+
+template <typename AltrepClass>
+void InitAltRealMethods(R_altrep_class_t class_t, DllInfo* dll) {
+ R_set_altreal_No_NA_method(class_t, AltrepClass::No_NA);
+ R_set_altreal_Is_sorted_method(class_t, AltrepClass::Is_sorted);
+
+ R_set_altreal_Sum_method(class_t, AltrepClass::Sum);
+ R_set_altreal_Min_method(class_t, AltrepClass::Min);
+ R_set_altreal_Max_method(class_t, AltrepClass::Max);
+
+ R_set_altreal_Elt_method(class_t, AltrepClass::Elt);
+ R_set_altreal_Get_region_method(class_t, AltrepClass::Get_region);
+}
+
+template <typename AltrepClass>
+void InitAltIntegerMethods(R_altrep_class_t class_t, DllInfo* dll) {
+ R_set_altinteger_No_NA_method(class_t, AltrepClass::No_NA);
+ R_set_altinteger_Is_sorted_method(class_t, AltrepClass::Is_sorted);
+
+ R_set_altinteger_Sum_method(class_t, AltrepClass::Sum);
+ R_set_altinteger_Min_method(class_t, AltrepClass::Min);
+ R_set_altinteger_Max_method(class_t, AltrepClass::Max);
+
+ R_set_altinteger_Elt_method(class_t, AltrepClass::Elt);
+ R_set_altinteger_Get_region_method(class_t, AltrepClass::Get_region);
+}
+
+template <typename AltrepClass>
+void InitAltRealClass(DllInfo* dll, const char* name) {
+ AltrepClass::class_t = R_make_altreal_class(name, "arrow", dll);
+ InitAltrepMethods<AltrepClass>(AltrepClass::class_t, dll);
+ InitAltvecMethods<AltrepClass>(AltrepClass::class_t, dll);
+ InitAltRealMethods<AltrepClass>(AltrepClass::class_t, dll);
+}
+
+template <typename AltrepClass>
+void InitAltIntegerClass(DllInfo* dll, const char* name) {
+ AltrepClass::class_t = R_make_altinteger_class(name, "arrow", dll);
+ InitAltrepMethods<AltrepClass>(AltrepClass::class_t, dll);
+ InitAltvecMethods<AltrepClass>(AltrepClass::class_t, dll);
+ InitAltIntegerMethods<AltrepClass>(AltrepClass::class_t, dll);
+}
+
+template <typename AltrepClass>
+void InitAltStringClass(DllInfo* dll, const char* name) {
+ AltrepClass::class_t = R_make_altstring_class(name, "arrow", dll);
+ R_set_altrep_Length_method(AltrepClass::class_t, AltrepClass::Length);
+ R_set_altrep_Inspect_method(AltrepClass::class_t, AltrepClass::Inspect);
+ R_set_altrep_Duplicate_method(AltrepClass::class_t, AltrepClass::Duplicate);
+ R_set_altrep_Serialized_state_method(AltrepClass::class_t,
+ AltrepClass::Serialized_state);
+ R_set_altrep_Unserialize_method(AltrepClass::class_t, AltrepClass::Unserialize);
+ R_set_altrep_Coerce_method(AltrepClass::class_t, AltrepClass::Coerce);
+
+ R_set_altvec_Dataptr_method(AltrepClass::class_t, AltrepClass::Dataptr);
+ R_set_altvec_Dataptr_or_null_method(AltrepClass::class_t, AltrepClass::Dataptr_or_null);
+
+ R_set_altstring_Elt_method(AltrepClass::class_t, AltrepClass::Elt);
+ R_set_altstring_Set_elt_method(AltrepClass::class_t, AltrepClass::Set_elt);
+ R_set_altstring_No_NA_method(AltrepClass::class_t, AltrepClass::No_NA);
+ R_set_altstring_Is_sorted_method(AltrepClass::class_t, AltrepClass::Is_sorted);
+}
+
+} // namespace
+
+// initialize the altrep classes
+void Init_Altrep_classes(DllInfo* dll) {
+ InitAltRealClass<AltrepVectorPrimitive<REALSXP>>(dll, "arrow::array_dbl_vector");
+ InitAltIntegerClass<AltrepVectorPrimitive<INTSXP>>(dll, "arrow::array_int_vector");
+
+ InitAltStringClass<AltrepVectorString<StringType>>(dll, "arrow::array_string_vector");
+ InitAltStringClass<AltrepVectorString<LargeStringType>>(
+ dll, "arrow::array_large_string_vector");
+}
+
+// return an altrep R vector that shadows the array if possible
+SEXP MakeAltrepVector(const std::shared_ptr<ChunkedArray>& chunked_array) {
+ // special case when there is only one array
+ if (chunked_array->num_chunks() == 1) {
+ const auto& array = chunked_array->chunk(0);
+ // using altrep if
+ // - the arrow.use_altrep is set to TRUE or unset (implicit TRUE)
+ // - the array has at least one element
+ if (arrow::r::GetBoolOption("arrow.use_altrep", true) && array->length() > 0) {
+ switch (array->type()->id()) {
+ case arrow::Type::DOUBLE:
+ return altrep::AltrepVectorPrimitive<REALSXP>::Make(array);
+
+ case arrow::Type::INT32:
+ return altrep::AltrepVectorPrimitive<INTSXP>::Make(array);
+
+ case arrow::Type::STRING:
+ return altrep::AltrepVectorString<StringType>::Make(array);
+
+ case arrow::Type::LARGE_STRING:
+ return altrep::AltrepVectorString<LargeStringType>::Make(array);
+
+ default:
+ break;
+ }
+ }
+ }
+ return R_NilValue;
+}
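+
+// A sketch of how the gate above looks from R (option name as used above):
+//
+//   options(arrow.use_altrep = FALSE)        # disable the altrep fast path
+//   v <- as.vector(Array$create(c(1, 2)))    # materialized via the converters
+//   options(arrow.use_altrep = NULL)         # unset counts as TRUE again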
+
+bool is_arrow_altrep(SEXP x) {
+ if (ALTREP(x)) {
+ SEXP info = ALTREP_CLASS_SERIALIZED_CLASS(ALTREP_CLASS(x));
+ SEXP pkg = ALTREP_SERIALIZED_CLASS_PKGSYM(info);
+
+ if (pkg == symbols::arrow) return true;
+ }
+
+ return false;
+}
+
+std::shared_ptr<Array> vec_to_arrow_altrep_bypass(SEXP x) {
+ if (is_arrow_altrep(x)) {
+ return GetArray(x);
+ }
+
+ return nullptr;
+}
+
+} // namespace altrep
+} // namespace r
+} // namespace arrow
+
+#else // HAS_ALTREP
+
+namespace arrow {
+namespace r {
+namespace altrep {
+
+// return an altrep R vector that shadows the array if possible
+SEXP MakeAltrepVector(const std::shared_ptr<ChunkedArray>& chunked_array) {
+ return R_NilValue;
+}
+
+bool is_arrow_altrep(SEXP) { return false; }
+
+std::shared_ptr<Array> vec_to_arrow_altrep_bypass(SEXP x) { return nullptr; }
+
+} // namespace altrep
+} // namespace r
+} // namespace arrow
+
+#endif
+
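+// Helpers exported for the package's unit tests: the first overwrites an
+// element of a character vector (tests use it to check that writing into an
+// arrow altrep string vector is rejected), the second compares the underlying
+// arrow::Array pointers of two wrapped arrays.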
+// [[arrow::export]]
+void test_SET_STRING_ELT(SEXP s) { SET_STRING_ELT(s, 0, Rf_mkChar("forbidden")); }
+
+// [[arrow::export]]
+bool test_same_Array(SEXP x, SEXP y) {
+ auto* p_x = reinterpret_cast<std::shared_ptr<arrow::Array>*>(x);
+ auto* p_y = reinterpret_cast<std::shared_ptr<arrow::Array>*>(y);
+
+ return p_x->get() == p_y->get();
+}
+
+// [[arrow::export]]
+bool is_arrow_altrep(SEXP x) { return arrow::r::altrep::is_arrow_altrep(x); }
+
+#endif
diff --git a/src/arrow/r/src/array.cpp b/src/arrow/r/src/array.cpp
new file mode 100644
index 000000000..9601ee43c
--- /dev/null
+++ b/src/arrow/r/src/array.cpp
@@ -0,0 +1,286 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+
+#include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+namespace cpp11 {
+
+const char* r6_class_name<arrow::Array>::get(const std::shared_ptr<arrow::Array>& array) {
+ auto type = array->type_id();
+ switch (type) {
+ case arrow::Type::DICTIONARY:
+ return "DictionaryArray";
+ case arrow::Type::STRUCT:
+ return "StructArray";
+ case arrow::Type::LIST:
+ return "ListArray";
+ case arrow::Type::LARGE_LIST:
+ return "LargeListArray";
+ case arrow::Type::FIXED_SIZE_LIST:
+ return "FixedSizeListArray";
+
+ default:
+ return "Array";
+ }
+}
+
+} // namespace cpp11
+
+void arrow::r::validate_slice_offset(R_xlen_t offset, int64_t len) {
+ if (offset == NA_INTEGER) {
+ cpp11::stop("Slice 'offset' cannot be NA");
+ }
+ if (offset < 0) {
+ cpp11::stop("Slice 'offset' cannot be negative");
+ }
+ if (offset > len) {
+ cpp11::stop("Slice 'offset' greater than array length");
+ }
+}
+
+void arrow::r::validate_slice_length(R_xlen_t length, int64_t available) {
+ if (length == NA_INTEGER) {
+ cpp11::stop("Slice 'length' cannot be NA");
+ }
+ if (length < 0) {
+ cpp11::stop("Slice 'length' cannot be negative");
+ }
+ if (length > available) {
+ cpp11::warning("Slice 'length' greater than available length");
+ }
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Array> Array__Slice1(const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t offset) {
+ arrow::r::validate_slice_offset(offset, array->length());
+ return array->Slice(offset);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Array> Array__Slice2(const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t offset, R_xlen_t length) {
+ arrow::r::validate_slice_offset(offset, array->length());
+ arrow::r::validate_slice_length(length, array->length() - offset);
+ return array->Slice(offset, length);
+}
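+
+// Both slices are zero-copy views. From R this backs Array$Slice(); a sketch,
+// assuming the R method forwards the 0-based offset unchanged:
+//
+//   a <- Array$create(1:10)
+//   a$Slice(2)      # elements 3..10
+//   a$Slice(2, 3)   # elements 3..5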
+
+void arrow::r::validate_index(int i, int len) {
+ if (i == NA_INTEGER) {
+ cpp11::stop("'i' cannot be NA");
+ }
+ if (i < 0 || i >= len) {
+ cpp11::stop("subscript out of bounds");
+ }
+}
+
+// [[arrow::export]]
+bool Array__IsNull(const std::shared_ptr<arrow::Array>& x, R_xlen_t i) {
+ arrow::r::validate_index(i, x->length());
+ return x->IsNull(i);
+}
+
+// [[arrow::export]]
+bool Array__IsValid(const std::shared_ptr<arrow::Array>& x, R_xlen_t i) {
+ arrow::r::validate_index(i, x->length());
+ return x->IsValid(i);
+}
+
+// [[arrow::export]]
+int Array__length(const std::shared_ptr<arrow::Array>& x) { return x->length(); }
+
+// [[arrow::export]]
+int Array__offset(const std::shared_ptr<arrow::Array>& x) { return x->offset(); }
+
+// [[arrow::export]]
+int Array__null_count(const std::shared_ptr<arrow::Array>& x) { return x->null_count(); }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Array__type(const std::shared_ptr<arrow::Array>& x) {
+ return x->type();
+}
+
+// [[arrow::export]]
+std::string Array__ToString(const std::shared_ptr<arrow::Array>& x) {
+ return x->ToString();
+}
+
+// [[arrow::export]]
+arrow::Type::type Array__type_id(const std::shared_ptr<arrow::Array>& x) {
+ return x->type_id();
+}
+
+// [[arrow::export]]
+bool Array__Equals(const std::shared_ptr<arrow::Array>& lhs,
+ const std::shared_ptr<arrow::Array>& rhs) {
+ return lhs->Equals(rhs);
+}
+
+// [[arrow::export]]
+bool Array__ApproxEquals(const std::shared_ptr<arrow::Array>& lhs,
+ const std::shared_ptr<arrow::Array>& rhs) {
+ return lhs->ApproxEquals(rhs);
+}
+
+// [[arrow::export]]
+std::string Array__Diff(const std::shared_ptr<arrow::Array>& lhs,
+ const std::shared_ptr<arrow::Array>& rhs) {
+ return lhs->Diff(*rhs);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::ArrayData> Array__data(
+ const std::shared_ptr<arrow::Array>& array) {
+ return array->data();
+}
+
+// [[arrow::export]]
+bool Array__RangeEquals(const std::shared_ptr<arrow::Array>& self,
+ const std::shared_ptr<arrow::Array>& other, R_xlen_t start_idx,
+ R_xlen_t end_idx, R_xlen_t other_start_idx) {
+ if (start_idx == NA_INTEGER) {
+ cpp11::stop("'start_idx' cannot be NA");
+ }
+ if (end_idx == NA_INTEGER) {
+ cpp11::stop("'end_idx' cannot be NA");
+ }
+ if (other_start_idx == NA_INTEGER) {
+ cpp11::stop("'other_start_idx' cannot be NA");
+ }
+ return self->RangeEquals(*other, start_idx, end_idx, other_start_idx);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Array> Array__View(const std::shared_ptr<arrow::Array>& array,
+ const std::shared_ptr<arrow::DataType>& type) {
+ return ValueOrStop(array->View(type));
+}
+
+// [[arrow::export]]
+void Array__Validate(const std::shared_ptr<arrow::Array>& array) {
+ StopIfNotOk(array->Validate());
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Array> DictionaryArray__indices(
+ const std::shared_ptr<arrow::DictionaryArray>& array) {
+ return array->indices();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Array> DictionaryArray__dictionary(
+ const std::shared_ptr<arrow::DictionaryArray>& array) {
+ return array->dictionary();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Array> StructArray__field(
+ const std::shared_ptr<arrow::StructArray>& array, int i) {
+ return array->field(i);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Array> StructArray__GetFieldByName(
+ const std::shared_ptr<arrow::StructArray>& array, const std::string& name) {
+ return array->GetFieldByName(name);
+}
+
+// [[arrow::export]]
+cpp11::list StructArray__Flatten(const std::shared_ptr<arrow::StructArray>& array) {
+ return arrow::r::to_r_list(ValueOrStop(array->Flatten()));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> ListArray__value_type(
+ const std::shared_ptr<arrow::ListArray>& array) {
+ return array->value_type();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> LargeListArray__value_type(
+ const std::shared_ptr<arrow::LargeListArray>& array) {
+ return array->value_type();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Array> ListArray__values(
+ const std::shared_ptr<arrow::ListArray>& array) {
+ return array->values();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Array> LargeListArray__values(
+ const std::shared_ptr<arrow::LargeListArray>& array) {
+ return array->values();
+}
+
+// [[arrow::export]]
+int32_t ListArray__value_length(const std::shared_ptr<arrow::ListArray>& array,
+ int64_t i) {
+ return array->value_length(i);
+}
+
+// [[arrow::export]]
+int64_t LargeListArray__value_length(const std::shared_ptr<arrow::LargeListArray>& array,
+ int64_t i) {
+ return array->value_length(i);
+}
+
+// [[arrow::export]]
+int64_t FixedSizeListArray__value_length(
+ const std::shared_ptr<arrow::FixedSizeListArray>& array, int64_t i) {
+ return array->value_length(i);
+}
+
+// [[arrow::export]]
+int32_t ListArray__value_offset(const std::shared_ptr<arrow::ListArray>& array,
+ int64_t i) {
+ return array->value_offset(i);
+}
+
+// [[arrow::export]]
+int64_t LargeListArray__value_offset(const std::shared_ptr<arrow::LargeListArray>& array,
+ int64_t i) {
+ return array->value_offset(i);
+}
+
+// [[arrow::export]]
+int64_t FixedSizeListArray__value_offset(
+ const std::shared_ptr<arrow::FixedSizeListArray>& array, int64_t i) {
+ return array->value_offset(i);
+}
+
+// [[arrow::export]]
+cpp11::writable::integers ListArray__raw_value_offsets(
+ const std::shared_ptr<arrow::ListArray>& array) {
+ auto offsets = array->raw_value_offsets();
+ return cpp11::writable::integers(offsets, offsets + array->length());
+}
+
+// [[arrow::export]]
+cpp11::writable::integers LargeListArray__raw_value_offsets(
+ const std::shared_ptr<arrow::LargeListArray>& array) {
+ auto offsets = array->raw_value_offsets();
+ return cpp11::writable::integers(offsets, offsets + array->length());
+}
+
+#endif
diff --git a/src/arrow/r/src/array_to_vector.cpp b/src/arrow/r/src/array_to_vector.cpp
new file mode 100644
index 000000000..480eb8629
--- /dev/null
+++ b/src/arrow/r/src/array_to_vector.cpp
@@ -0,0 +1,1317 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+#if defined(ARROW_R_WITH_ARROW)
+
+#include <arrow/array.h>
+#include <arrow/builder.h>
+#include <arrow/datum.h>
+#include <arrow/table.h>
+#include <arrow/util/bitmap_reader.h>
+#include <arrow/util/bitmap_writer.h>
+#include <arrow/util/int_util.h>
+
+#include <cpp11/altrep.hpp>
+#include <type_traits>
+
+#include "./r_task_group.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::IntegersCanFit;
+
+namespace r {
+
+class Converter {
+ public:
+ explicit Converter(const std::shared_ptr<ChunkedArray>& chunked_array)
+ : chunked_array_(std::move(chunked_array)) {}
+
+ virtual ~Converter() {}
+
+ // Allocate a vector of the right R type for this converter
+ virtual SEXP Allocate(R_xlen_t n) const = 0;
+
+ // data[ start:(start + n) ] = NA
+ virtual Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const = 0;
+
+ // ingest the values from the array into data[ start : (start + n)]
+ //
+ // chunk_index indicates which chunk is being ingested into data. This is
+ // ignored by most implementations and currently only used with Dictionary
+ // arrays.
+ virtual Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t start, R_xlen_t n,
+ size_t chunk_index) const = 0;
+
+ // Can this run in parallel?
+ virtual bool Parallel() const { return true; }
+
+ // the converter is passed in as `self` so that it outlives the scope of
+ // Converter::Convert()
+ SEXP ScheduleConvertTasks(RTasks& tasks, std::shared_ptr<Converter> self) {
+ // try altrep first
+ SEXP alt = altrep::MakeAltrepVector(chunked_array_);
+ if (!Rf_isNull(alt)) {
+ return alt;
+ }
+
+ // otherwise, use the Converter API:
+
+ // allocate the R vector upfront
+ SEXP out = PROTECT(Allocate(chunked_array_->length()));
+
+ // for each array, fill the relevant slice of `out`, potentially in parallel
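+ // (e.g. chunk lengths [3, 2] fill out[0..2] and out[3..4] as two tasks)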
+ R_xlen_t k = 0, i = 0;
+ for (const auto& array : chunked_array_->chunks()) {
+ auto n_chunk = array->length();
+
+ tasks.Append(Parallel(), [=] {
+ if (array->null_count() == n_chunk) {
+ return self->Ingest_all_nulls(out, k, n_chunk);
+ } else {
+ return self->Ingest_some_nulls(out, array, k, n_chunk, i);
+ }
+ });
+
+ k += n_chunk;
+ i++;
+ }
+
+ UNPROTECT(1);
+ return out;
+ }
+
+ // Converter factory
+ static std::shared_ptr<Converter> Make(
+ const std::shared_ptr<ChunkedArray>& chunked_array);
+
+ static SEXP LazyConvert(const std::shared_ptr<ChunkedArray>& chunked_array,
+ RTasks& tasks) {
+ auto converter = Make(chunked_array);
+ return converter->ScheduleConvertTasks(tasks, converter);
+ }
+
+ static SEXP Convert(const std::shared_ptr<ChunkedArray>& chunked_array,
+ bool use_threads) {
+ RTasks tasks(use_threads);
+ SEXP out = PROTECT(Converter::LazyConvert(chunked_array, tasks));
+ StopIfNotOk(tasks.Finish());
+
+ UNPROTECT(1);
+ return out;
+ }
+
+ static SEXP Convert(const std::shared_ptr<Array>& array) {
+ return Convert(std::make_shared<ChunkedArray>(array), false);
+ }
+
+ SEXP MaybeAltrep() { return altrep::MakeAltrepVector(chunked_array_); }
+
+ protected:
+ std::shared_ptr<ChunkedArray> chunked_array_;
+};
+
+template <typename SetNonNull, typename SetNull>
+Status IngestSome(const std::shared_ptr<arrow::Array>& array, R_xlen_t n,
+ SetNonNull&& set_non_null, SetNull&& set_null) {
+ if (array->null_count()) {
+ internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), array->offset(),
+ n);
+
+ for (R_xlen_t i = 0; i < n; i++, bitmap_reader.Next()) {
+ if (bitmap_reader.IsSet()) {
+ RETURN_NOT_OK(set_non_null(i));
+ } else {
+ RETURN_NOT_OK(set_null(i));
+ }
+ }
+
+ } else {
+ for (R_xlen_t i = 0; i < n; i++) {
+ RETURN_NOT_OK(set_non_null(i));
+ }
+ }
+
+ return Status::OK();
+}
+
+template <typename SetNonNull>
+Status IngestSome(const std::shared_ptr<arrow::Array>& array, R_xlen_t n,
+ SetNonNull&& set_non_null) {
+ auto nothing = [](R_xlen_t i) { return Status::OK(); };
+ return IngestSome(array, n, std::forward<SetNonNull>(set_non_null), nothing);
+}
+
+std::shared_ptr<Array> CreateEmptyArray(const std::shared_ptr<DataType>& array_type) {
+ std::unique_ptr<arrow::ArrayBuilder> builder;
+ StopIfNotOk(arrow::MakeBuilder(gc_memory_pool(), array_type, &builder));
+
+ std::shared_ptr<arrow::Array> array;
+ StopIfNotOk(builder->Finish(&array));
+ return array;
+}
+
+template <typename Type>
+class Converter_Int : public Converter {
+ using value_type = typename TypeTraits<Type>::ArrayType::value_type;
+
+ public:
+ explicit Converter_Int(const std::shared_ptr<ChunkedArray>& chunked_array)
+ : Converter(chunked_array) {}
+
+ SEXP Allocate(R_xlen_t n) const { return Rf_allocVector(INTSXP, n); }
+
+ Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
+ std::fill_n(INTEGER(data) + start, n, NA_INTEGER);
+ return Status::OK();
+ }
+
+ Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
+ auto p_values = array->data()->GetValues<value_type>(1);
+ if (!p_values) {
+ return Status::Invalid("Invalid data buffer");
+ }
+ auto p_data = INTEGER(data) + start;
+ auto ingest_one = [&](R_xlen_t i) {
+ p_data[i] = static_cast<int>(p_values[i]);
+ return Status::OK();
+ };
+ auto null_one = [&](R_xlen_t i) {
+ p_data[i] = NA_INTEGER;
+ return Status::OK();
+ };
+
+ return IngestSome(array, n, ingest_one, null_one);
+ }
+};
+
+template <typename Type>
+class Converter_Double : public Converter {
+ using value_type = typename TypeTraits<Type>::ArrayType::value_type;
+
+ public:
+ explicit Converter_Double(const std::shared_ptr<ChunkedArray>& chunked_array)
+ : Converter(chunked_array) {}
+
+ SEXP Allocate(R_xlen_t n) const { return Rf_allocVector(REALSXP, n); }
+
+ Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
+ std::fill_n(REAL(data) + start, n, NA_REAL);
+ return Status::OK();
+ }
+
+ Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
+ auto p_values = array->data()->GetValues<value_type>(1);
+ if (!p_values) {
+ return Status::Invalid("Invalid data buffer");
+ }
+ auto p_data = REAL(data) + start;
+ auto ingest_one = [&](R_xlen_t i) {
+ p_data[i] = static_cast<double>(p_values[i]);
+ return Status::OK();
+ };
+ auto null_one = [&](R_xlen_t i) {
+ p_data[i] = NA_REAL;
+ return Status::OK();
+ };
+
+ return IngestSome(array, n, ingest_one, null_one);
+ }
+};
+
+class Converter_Date32 : public Converter {
+ public:
+ explicit Converter_Date32(const std::shared_ptr<ChunkedArray>& chunked_array)
+ : Converter(chunked_array) {}
+
+ SEXP Allocate(R_xlen_t n) const {
+ SEXP data = PROTECT(Rf_allocVector(REALSXP, n));
+ Rf_classgets(data, Rf_mkString("Date"));
+ UNPROTECT(1);
+ return data;
+ }
+
+ Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
+ std::fill_n(REAL(data) + start, n, NA_REAL);
+ return Status::OK();
+ }
+
+ Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
+ auto p_values = array->data()->GetValues<int>(1);
+ if (!p_values) {
+ return Status::Invalid("Invalid data buffer");
+ }
+ auto p_data = REAL(data) + start;
+ auto ingest_one = [&](R_xlen_t i) {
+ p_data[i] = static_cast<double>(p_values[i]);
+ return Status::OK();
+ };
+ auto null_one = [&](R_xlen_t i) {
+ p_data[i] = NA_REAL;
+ return Status::OK();
+ };
+
+ return IngestSome(array, n, ingest_one, null_one);
+ }
+};
+
+template <typename StringArrayType>
+struct Converter_String : public Converter {
+ public:
+ explicit Converter_String(const std::shared_ptr<ChunkedArray>& chunked_array)
+ : Converter(chunked_array) {}
+
+ SEXP Allocate(R_xlen_t n) const { return Rf_allocVector(STRSXP, n); }
+
+ Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
+ for (R_xlen_t i = 0; i < n; i++) {
+ SET_STRING_ELT(data, i + start, NA_STRING);
+ }
+ return Status::OK();
+ }
+
+ Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
+ auto p_offset = array->data()->GetValues<int32_t>(1);
+ if (!p_offset) {
+ return Status::Invalid("Invalid offset buffer");
+ }
+ auto p_strings = array->data()->GetValues<char>(2, *p_offset);
+ if (!p_strings) {
+ // There is an offset buffer, but the data buffer is null.
+ // The array has at least one value and not all values are null, so every
+ // value is either an empty string or null: only the nulls need to be set.
+
+ if (array->null_count()) {
+ arrow::internal::BitmapReader null_reader(array->null_bitmap_data(),
+ array->offset(), n);
+ for (int i = 0; i < n; i++, null_reader.Next()) {
+ if (null_reader.IsNotSet()) {
+ SET_STRING_ELT(data, start + i, NA_STRING);
+ }
+ }
+ }
+ return Status::OK();
+ }
+
+ StringArrayType* string_array = static_cast<StringArrayType*>(array.get());
+
+ const bool all_valid = array->null_count() == 0;
+ const bool strip_out_nuls = GetBoolOption("arrow.skip_nul", false);
+
+ bool nul_was_stripped = false;
+
+ if (all_valid) {
+ // no need to watch for missing strings
+ cpp11::unwind_protect([&] {
+ if (strip_out_nuls) {
+ for (int i = 0; i < n; i++) {
+ SET_STRING_ELT(data, start + i,
+ r_string_from_view_strip_nul(string_array->GetView(i),
+ &nul_was_stripped));
+ }
+ return;
+ }
+
+ for (int i = 0; i < n; i++) {
+ SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i)));
+ }
+ });
+ } else {
+ cpp11::unwind_protect([&] {
+ arrow::internal::BitmapReader validity_reader(array->null_bitmap_data(),
+ array->offset(), n);
+
+ if (strip_out_nuls) {
+ for (int i = 0; i < n; i++, validity_reader.Next()) {
+ if (validity_reader.IsSet()) {
+ SET_STRING_ELT(data, start + i,
+ r_string_from_view_strip_nul(string_array->GetView(i),
+ &nul_was_stripped));
+ } else {
+ SET_STRING_ELT(data, start + i, NA_STRING);
+ }
+ }
+ return;
+ }
+
+ for (int i = 0; i < n; i++, validity_reader.Next()) {
+ if (validity_reader.IsSet()) {
+ SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i)));
+ } else {
+ SET_STRING_ELT(data, start + i, NA_STRING);
+ }
+ }
+ });
+ }
+
+ if (nul_was_stripped) {
+ cpp11::warning("Stripping '\\0' (nul) from character vector");
+ }
+
+ return Status::OK();
+ }
+
+ bool Parallel() const { return false; }
+
+ private:
+ static SEXP r_string_from_view(arrow::util::string_view view) {
+ return Rf_mkCharLenCE(view.data(), view.size(), CE_UTF8);
+ }
+
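+ // Worked example for the helper below: the view "ab\0cd" hits a nul at
+ // i == 2, copies the bytes into stripped_string, and finishes with "abcd"
+ // and *nul_was_stripped set to true.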
+ static SEXP r_string_from_view_strip_nul(arrow::util::string_view view,
+ bool* nul_was_stripped) {
+ const char* old_string = view.data();
+
+ std::string stripped_string;
+ size_t stripped_len = 0, nul_count = 0;
+
+ for (size_t i = 0; i < view.size(); i++) {
+ if (old_string[i] == '\0') {
+ ++nul_count;
+
+ if (nul_count == 1) {
+ // first nul spotted: allocate stripped string storage
+ stripped_string = view.to_string();
+ stripped_len = i;
+ }
+
+ // don't copy old_string[i] (which is \0) into stripped_string
+ continue;
+ }
+
+ if (nul_count > 0) {
+ stripped_string[stripped_len++] = old_string[i];
+ }
+ }
+
+ if (nul_count > 0) {
+ *nul_was_stripped = true;
+ stripped_string.resize(stripped_len);
+ return r_string_from_view(stripped_string);
+ }
+
+ return r_string_from_view(view);
+ }
+};
+
+class Converter_Boolean : public Converter {
+ public:
+ explicit Converter_Boolean(const std::shared_ptr<ChunkedArray>& chunked_array)
+ : Converter(chunked_array) {}
+
+ SEXP Allocate(R_xlen_t n) const { return Rf_allocVector(LGLSXP, n); }
+
+ Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
+ std::fill_n(LOGICAL(data) + start, n, NA_LOGICAL);
+ return Status::OK();
+ }
+
+ Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
+ auto p_data = LOGICAL(data) + start;
+ auto p_bools = array->data()->GetValues<uint8_t>(1, 0);
+ if (!p_bools) {
+ return Status::Invalid("Invalid data buffer");
+ }
+
+ arrow::internal::BitmapReader data_reader(p_bools, array->offset(), n);
+ auto ingest_one = [&](R_xlen_t i) {
+ p_data[i] = data_reader.IsSet();
+ data_reader.Next();
+ return Status::OK();
+ };
+
+ auto null_one = [&](R_xlen_t i) {
+ data_reader.Next();
+ p_data[i] = NA_LOGICAL;
+ return Status::OK();
+ };
+
+ return IngestSome(array, n, ingest_one, null_one);
+ }
+};
+
+template <typename ArrayType>
+class Converter_Binary : public Converter {
+ public:
+ using offset_type = typename ArrayType::offset_type;
+ explicit Converter_Binary(const std::shared_ptr<ChunkedArray>& chunked_array)
+ : Converter(chunked_array) {}
+
+ SEXP Allocate(R_xlen_t n) const {
+ SEXP res = PROTECT(Rf_allocVector(VECSXP, n));
+ if (std::is_same<ArrayType, BinaryArray>::value) {
+ Rf_classgets(res, data::classes_arrow_binary);
+ } else {
+ Rf_classgets(res, data::classes_arrow_large_binary);
+ }
+ UNPROTECT(1);
+ return res;
+ }
+
+ Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
+ return Status::OK();
+ }
+
+ Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
+ const ArrayType* binary_array = checked_cast<const ArrayType*>(array.get());
+
+ auto ingest_one = [&](R_xlen_t i) {
+ offset_type ni;
+ auto value = binary_array->GetValue(i, &ni);
+ if (ni > R_XLEN_T_MAX) {
+ return Status::RError("Array too big to be represented as a raw vector");
+ }
+ SEXP raw = PROTECT(Rf_allocVector(RAWSXP, ni));
+ std::copy(value, value + ni, RAW(raw));
+
+ SET_VECTOR_ELT(data, i + start, raw);
+ UNPROTECT(1);
+
+ return Status::OK();
+ };
+
+ return IngestSome(array, n, ingest_one);
+ }
+
+ virtual bool Parallel() const { return false; }
+};
+
+class Converter_FixedSizeBinary : public Converter {
+ public:
+ explicit Converter_FixedSizeBinary(const std::shared_ptr<ChunkedArray>& chunked_array,
+ int byte_width)
+ : Converter(chunked_array), byte_width_(byte_width) {}
+
+ SEXP Allocate(R_xlen_t n) const {
+ SEXP res = PROTECT(Rf_allocVector(VECSXP, n));
+ Rf_classgets(res, data::classes_arrow_fixed_size_binary);
+ Rf_setAttrib(res, symbols::byte_width, Rf_ScalarInteger(byte_width_));
+ UNPROTECT(1);
+ return res;
+ }
+
+ Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
+ return Status::OK();
+ }
+
+ Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
+ const FixedSizeBinaryArray* binary_array =
+ checked_cast<const FixedSizeBinaryArray*>(array.get());
+
+ int byte_width = binary_array->byte_width();
+ auto ingest_one = [&, byte_width](R_xlen_t i) {
+ auto value = binary_array->GetValue(i);
+ SEXP raw = PROTECT(Rf_allocVector(RAWSXP, byte_width));
+ std::copy(value, value + byte_width, RAW(raw));
+
+ SET_VECTOR_ELT(data, i + start, raw);
+ UNPROTECT(1);
+
+ return Status::OK();
+ };
+
+ return IngestSome(array, n, ingest_one);
+ }
+
+ virtual bool Parallel() const { return false; }
+
+ private:
+ int byte_width_;
+};
+
+class Converter_Dictionary : public Converter {
+ private:
+ bool need_unification_;
+ std::unique_ptr<arrow::DictionaryUnifier> unifier_;
+ std::vector<std::shared_ptr<Buffer>> arrays_transpose_;
+ std::shared_ptr<DataType> out_type_;
+ std::shared_ptr<Array> dictionary_;
+
+ public:
+ explicit Converter_Dictionary(const std::shared_ptr<ChunkedArray>& chunked_array)
+ : Converter(chunked_array), need_unification_(NeedUnification()) {
+ if (need_unification_) {
+ const auto& arr_type = checked_cast<const DictionaryType&>(*chunked_array->type());
+ unifier_ = ValueOrStop(DictionaryUnifier::Make(arr_type.value_type()));
+
+ size_t n_arrays = chunked_array->num_chunks();
+ arrays_transpose_.resize(n_arrays);
+
+ for (size_t i = 0; i < n_arrays; i++) {
+ const auto& dict_i =
+ *checked_cast<const DictionaryArray&>(*chunked_array->chunk(i)).dictionary();
+ StopIfNotOk(unifier_->Unify(dict_i, &arrays_transpose_[i]));
+ }
+
+ StopIfNotOk(unifier_->GetResult(&out_type_, &dictionary_));
+ } else {
+ const auto& dict_type = checked_cast<const DictionaryType&>(*chunked_array->type());
+
+ const auto& indices_type = *dict_type.index_type();
+ switch (indices_type.id()) {
+ case Type::UINT8:
+ case Type::INT8:
+ case Type::UINT16:
+ case Type::INT16:
+ case Type::INT32:
+ // TODO: also add int64, uint32, uint64 downcasts, if possible
+ break;
+ default:
+ cpp11::stop("Cannot convert Dictionary Array of type `%s` to R",
+ dict_type.ToString().c_str());
+ }
+
+ if (chunked_array->num_chunks() > 0) {
+ // NeedUnification() returned false so we can safely assume the
+ // dictionary of the first chunk applies everywhere
+ const auto& dict_array =
+ checked_cast<const DictionaryArray&>(*chunked_array->chunk(0));
+ dictionary_ = dict_array.dictionary();
+ } else {
+ dictionary_ = CreateEmptyArray(dict_type.value_type());
+ }
+ }
+ }
+
+ SEXP Allocate(R_xlen_t n) const {
+ cpp11::writable::integers data(n);
+ data.attr("levels") = GetLevels();
+ if (GetOrdered()) {
+ Rf_classgets(data, arrow::r::data::classes_ordered);
+ } else {
+ Rf_classgets(data, arrow::r::data::classes_factor);
+ }
+ return data;
+ }
+
+ virtual bool Parallel() const { return false; }
+
+ Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
+ std::fill_n(INTEGER(data) + start, n, NA_INTEGER);
+ return Status::OK();
+ }
+
+ Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
+ const DictionaryArray& dict_array =
+ checked_cast<const DictionaryArray&>(*array.get());
+ auto indices = dict_array.indices();
+ switch (indices->type_id()) {
+ case Type::UINT8:
+ return Ingest_some_nulls_Impl<arrow::UInt8Type>(data, array, start, n,
+ chunk_index);
+ case Type::INT8:
+ return Ingest_some_nulls_Impl<arrow::Int8Type>(data, array, start, n,
+ chunk_index);
+ case Type::UINT16:
+ return Ingest_some_nulls_Impl<arrow::UInt16Type>(data, array, start, n,
+ chunk_index);
+ case Type::INT16:
+ return Ingest_some_nulls_Impl<arrow::Int16Type>(data, array, start, n,
+ chunk_index);
+ case Type::INT32:
+ return Ingest_some_nulls_Impl<arrow::Int32Type>(data, array, start, n,
+ chunk_index);
+ default:
+ break;
+ }
+ return Status::OK();
+ }
+
+ private:
+ template <typename Type>
+ Status Ingest_some_nulls_Impl(SEXP data, const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
+ using index_type = typename arrow::TypeTraits<Type>::ArrayType::value_type;
+ auto indices = checked_cast<const DictionaryArray&>(*array).indices();
+ auto raw_indices = indices->data()->GetValues<index_type>(1);
+
+ auto p_data = INTEGER(data) + start;
+ auto null_one = [&](R_xlen_t i) {
+ p_data[i] = NA_INTEGER;
+ return Status::OK();
+ };
+
+ // convert the 0-based indices from the arrow Array
+ // to 1-based indices used in R factors
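+ // e.g. indices [0, 2, null] over dictionary ["a", "b", "c"] become the
+ // R factor codes [1, 3, NA] with levels c("a", "b", "c")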
+ if (need_unification_) {
+ // transpose the indices before converting
+ auto transposed =
+ reinterpret_cast<const int32_t*>(arrays_transpose_[chunk_index]->data());
+
+ auto ingest_one = [&](R_xlen_t i) {
+ p_data[i] = transposed[raw_indices[i]] + 1;
+ return Status::OK();
+ };
+
+ return IngestSome(array, n, ingest_one, null_one);
+ } else {
+ auto ingest_one = [&](R_xlen_t i) {
+ p_data[i] = static_cast<int>(raw_indices[i]) + 1;
+ return Status::OK();
+ };
+ return IngestSome(array, n, ingest_one, null_one);
+ }
+ }
+
+ bool NeedUnification() {
+ int n = chunked_array_->num_chunks();
+ if (n < 2) {
+ return false;
+ }
+ const auto& arr_first =
+ checked_cast<const DictionaryArray&>(*chunked_array_->chunk(0));
+ for (int i = 1; i < n; i++) {
+ const auto& arr = checked_cast<const DictionaryArray&>(*chunked_array_->chunk(i));
+ if (!(arr_first.dictionary()->Equals(arr.dictionary()))) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool GetOrdered() const {
+ return checked_cast<const DictionaryType&>(*chunked_array_->type()).ordered();
+ }
+
+ SEXP GetLevels() const {
+ // R factor levels must be type "character" so coerce `dictionary_` to STRSXP
+ // TODO (npr): this coercion should be optional, "dictionariesAsFactors" ;)
+ // Alternative: preserve the logical type of the dictionary values
+ // (e.g. if dict is timestamp, return a POSIXt R vector, not factor)
+ if (dictionary_->type_id() != Type::STRING) {
+ cpp11::warning("Coercing dictionary values to R character factor levels");
+ }
+
+ SEXP vec = PROTECT(Converter::Convert(dictionary_));
+ SEXP strings_vec = PROTECT(Rf_coerceVector(vec, STRSXP));
+ UNPROTECT(2);
+ return strings_vec;
+ }
+};
+
+class Converter_Struct : public Converter {
+ public:
+ explicit Converter_Struct(const std::shared_ptr<ChunkedArray>& chunked_array)
+ : Converter(chunked_array), converters() {
+ const auto& struct_type =
+ checked_cast<const arrow::StructType&>(*chunked_array->type());
+
+ int nf = struct_type.num_fields();
+
+ std::shared_ptr<arrow::Table> array_as_table =
+ ValueOrStop(arrow::Table::FromChunkedStructArray(chunked_array));
+ for (int i = 0; i < nf; i++) {
+ converters.push_back(Converter::Make(array_as_table->column(i)));
+ }
+ }
+
+ SEXP Allocate(R_xlen_t n) const {
+ // allocate a data frame column to host each array
+ auto type =
+ checked_cast<const arrow::StructType*>(this->chunked_array_->type().get());
+ auto out = arrow::r::to_r_list(
+ converters, [n, this](const std::shared_ptr<Converter>& converter) {
+ // when there is only one chunk, this field can
+ // perhaps be handled upfront with altrep
+ if (this->chunked_array_->num_chunks() == 1) {
+ SEXP alt = converter->MaybeAltrep();
+ if (!Rf_isNull(alt)) {
+ return alt;
+ }
+ }
+
+ return converter->Allocate(n);
+ });
+ auto colnames = arrow::r::to_r_strings(
+ type->fields(),
+ [](const std::shared_ptr<Field>& field) { return field->name(); });
+ out.attr(symbols::row_names) = arrow::r::short_row_names(n);
+ out.attr(R_NamesSymbol) = colnames;
+ out.attr(R_ClassSymbol) = arrow::r::data::classes_tbl_df;
+
+ return out;
+ }
+
+ Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
+ int nf = converters.size();
+ for (int i = 0; i < nf; i++) {
+ SEXP data_i = VECTOR_ELT(data, i);
+
+ // only ingest if the column is not altrep
+ if (!is_altrep(data_i)) {
+ StopIfNotOk(converters[i]->Ingest_all_nulls(data_i, start, n));
+ }
+ }
+ return Status::OK();
+ }
+
+ Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
+ auto struct_array = checked_cast<const arrow::StructArray*>(array.get());
+ int nf = converters.size();
+ // Flatten() deals with merging of nulls
+ auto arrays = ValueOrStop(struct_array->Flatten(gc_memory_pool()));
+ for (int i = 0; i < nf; i++) {
+ SEXP data_i = VECTOR_ELT(data, i);
+
+ // only ingest if the column is not altrep
+ if (!is_altrep(data_i)) {
+ StopIfNotOk(converters[i]->Ingest_some_nulls(VECTOR_ELT(data, i), arrays[i],
+ start, n, chunk_index));
+ }
+ }
+
+ return Status::OK();
+ }
+
+ virtual bool Parallel() const {
+ // this can only run in parallel if all the
+ // inner converters can
+ for (const auto& converter : converters) {
+ if (!converter->Parallel()) return false;
+ }
+ return true;
+ }
+
+ private:
+ std::vector<std::shared_ptr<Converter>> converters;
+
+ bool is_altrep(SEXP x) const { return ALTREP(x); }
+};
+
+double ms_to_seconds(int64_t ms) { return static_cast<double>(ms) / 1000; }
+
+class Converter_Date64 : public Converter {
+ public:
+ explicit Converter_Date64(const std::shared_ptr<ChunkedArray>& chunked_array)
+ : Converter(chunked_array) {}
+
+ SEXP Allocate(R_xlen_t n) const {
+ cpp11::writable::doubles data(n);
+ Rf_classgets(data, arrow::r::data::classes_POSIXct);
+ return data;
+ }
+
+ Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
+ std::fill_n(REAL(data) + start, n, NA_REAL);
+ return Status::OK();
+ }
+
+ Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
+ auto p_data = REAL(data) + start;
+ auto p_values = array->data()->GetValues<int64_t>(1);
+ auto ingest_one = [&](R_xlen_t i) {
+ p_data[i] = static_cast<double>(p_values[i] / 1000);
+ return Status::OK();
+ };
+ auto null_one = [&](R_xlen_t i) {
+ p_data[i] = NA_REAL;
+ return Status::OK();
+ };
+ return IngestSome(array, n, ingest_one, null_one);
+ }
+};
+
+template <typename value_type, typename unit_type = TimeType>
+class Converter_Time : public Converter {
+ public:
+ explicit Converter_Time(const std::shared_ptr<ChunkedArray>& chunked_array)
+ : Converter(chunked_array) {}
+
+ SEXP Allocate(R_xlen_t n) const {
+ cpp11::writable::doubles data(n);
+ data.attr("class") = cpp11::writable::strings({"hms", "difftime"});
+
+ // hms difftime is always stored as "seconds"
+ data.attr("units") = cpp11::writable::strings({"secs"});
+ return data;
+ }
+
+ Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
+ std::fill_n(REAL(data) + start, n, NA_REAL);
+ return Status::OK();
+ }
+
+ Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
+ int multiplier = TimeUnit_multiplier(array);
+
+ auto p_data = REAL(data) + start;
+ auto p_values = array->data()->GetValues<value_type>(1);
+ auto ingest_one = [&](R_xlen_t i) {
+ p_data[i] = static_cast<double>(p_values[i]) / multiplier;
+ return Status::OK();
+ };
+ auto null_one = [&](R_xlen_t i) {
+ p_data[i] = NA_REAL;
+ return Status::OK();
+ };
+ return IngestSome(array, n, ingest_one, null_one);
+ }
+
+ private:
+ int TimeUnit_multiplier(const std::shared_ptr<Array>& array) const {
+ // hms difftime is always "seconds", so scale by the Array's TimeUnit
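+ // e.g. a time32[MILLI] value of 1500 becomes 1500 / 1000 = 1.5 secs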
+ switch (static_cast<unit_type*>(array->type().get())->unit()) {
+ case TimeUnit::SECOND:
+ return 1;
+ case TimeUnit::MILLI:
+ return 1000;
+ case TimeUnit::MICRO:
+ return 1000000;
+ case TimeUnit::NANO:
+ return 1000000000;
+ default:
+ return 0;
+ }
+ }
+};
+
+template <typename value_type>
+class Converter_Timestamp : public Converter_Time<value_type, TimestampType> {
+ public:
+ explicit Converter_Timestamp(const std::shared_ptr<ChunkedArray>& chunked_array)
+ : Converter_Time<value_type, TimestampType>(chunked_array) {}
+
+ SEXP Allocate(R_xlen_t n) const {
+ cpp11::writable::doubles data(n);
+ Rf_classgets(data, arrow::r::data::classes_POSIXct);
+ auto array_type =
+ checked_cast<const TimestampType*>(this->chunked_array_->type().get());
+ std::string tzone = array_type->timezone();
+ if (tzone.size() > 0) {
+ data.attr("tzone") = tzone;
+ }
+ return data;
+ }
+};
+
+class Converter_Decimal : public Converter {
+ public:
+ explicit Converter_Decimal(const std::shared_ptr<ChunkedArray>& chunked_array)
+ : Converter(chunked_array) {}
+
+ SEXP Allocate(R_xlen_t n) const { return Rf_allocVector(REALSXP, n); }
+
+ Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
+ std::fill_n(REAL(data) + start, n, NA_REAL);
+ return Status::OK();
+ }
+
+ Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
+ auto p_data = REAL(data) + start;
+ const auto& decimals_arr = checked_cast<const arrow::Decimal128Array&>(*array);
+
+ auto ingest_one = [&](R_xlen_t i) {
+ p_data[i] = std::stod(decimals_arr.FormatValue(i).c_str());
+ return Status::OK();
+ };
+ auto null_one = [&](R_xlen_t i) {
+ p_data[i] = NA_REAL;
+ return Status::OK();
+ };
+
+ return IngestSome(array, n, ingest_one, null_one);
+ }
+};
+
+template <typename ListArrayType>
+class Converter_List : public Converter {
+ private:
+ std::shared_ptr<arrow::DataType> value_type_;
+
+ public:
+ explicit Converter_List(const std::shared_ptr<ChunkedArray>& chunked_array,
+ const std::shared_ptr<arrow::DataType>& value_type)
+ : Converter(chunked_array), value_type_(value_type) {}
+
+ SEXP Allocate(R_xlen_t n) const {
+ cpp11::writable::list res(n);
+ res.attr(R_ClassSymbol) = std::is_same<ListArrayType, ListArray>::value
+ ? arrow::r::data::classes_arrow_list
+ : arrow::r::data::classes_arrow_large_list;
+
+ std::shared_ptr<arrow::Array> array = CreateEmptyArray(value_type_);
+
+ // convert to an R object to store as the list's ptype
+ res.attr(arrow::r::symbols::ptype) = Converter::Convert(array);
+
+ return res;
+ }
+
+ Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
+ // nothing to do: lists contain NULL by default
+ return Status::OK();
+ }
+
+ Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
+ auto list_array = checked_cast<const ListArrayType*>(array.get());
+ auto values_array = list_array->values();
+
+ auto ingest_one = [&](R_xlen_t i) {
+ auto slice = list_array->value_slice(i);
+ SET_VECTOR_ELT(data, i + start, Converter::Convert(slice));
+ return Status::OK();
+ };
+
+ return IngestSome(array, n, ingest_one);
+ }
+
+ bool Parallel() const { return false; }
+};
+
+class Converter_FixedSizeList : public Converter {
+ private:
+ std::shared_ptr<arrow::DataType> value_type_;
+ int list_size_;
+
+ public:
+ explicit Converter_FixedSizeList(const std::shared_ptr<ChunkedArray>& chunked_array,
+ const std::shared_ptr<arrow::DataType>& value_type,
+ int list_size)
+ : Converter(chunked_array), value_type_(value_type), list_size_(list_size) {}
+
+ SEXP Allocate(R_xlen_t n) const {
+ cpp11::writable::list res(n);
+ Rf_classgets(res, arrow::r::data::classes_arrow_fixed_size_list);
+ res.attr(arrow::r::symbols::list_size) = Rf_ScalarInteger(list_size_);
+
+ std::shared_ptr<arrow::Array> array = CreateEmptyArray(value_type_);
+
+ // convert to an R object to store as the list's ptype
+ res.attr(arrow::r::symbols::ptype) = Converter::Convert(array);
+
+ return res;
+ }
+
+ Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
+ // nothing to do: lists contain NULL by default
+ return Status::OK();
+ }
+
+ Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
+ const auto& fixed_size_list_array = checked_cast<const FixedSizeListArray&>(*array);
+ auto values_array = fixed_size_list_array.values();
+
+ auto ingest_one = [&](R_xlen_t i) {
+ auto slice = fixed_size_list_array.value_slice(i);
+ SET_VECTOR_ELT(data, i + start, Converter::Convert(slice));
+ return Status::OK();
+ };
+ return IngestSome(array, n, ingest_one);
+ }
+
+ bool Parallel() const { return false; }
+};
+
+class Converter_Int64 : public Converter {
+ public:
+ explicit Converter_Int64(const std::shared_ptr<ChunkedArray>& chunked_array)
+ : Converter(chunked_array) {}
+
+ SEXP Allocate(R_xlen_t n) const {
+ cpp11::writable::doubles data(n);
+ data.attr("class") = "integer64";
+ return data;
+ }
+
+ Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
+ auto p_data = reinterpret_cast<int64_t*>(REAL(data)) + start;
+ std::fill_n(p_data, n, NA_INT64);
+ return Status::OK();
+ }
+
+ Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
+ auto p_values = array->data()->GetValues<int64_t>(1);
+ if (!p_values) {
+ return Status::Invalid("Invalid data buffer");
+ }
+
+ auto p_data = reinterpret_cast<int64_t*>(REAL(data)) + start;
+
+ if (array->null_count()) {
+ internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), array->offset(),
+ n);
+ for (R_xlen_t i = 0; i < n; i++, bitmap_reader.Next(), ++p_data) {
+ *p_data = bitmap_reader.IsSet() ? p_values[i] : NA_INT64;
+ }
+ } else {
+ std::copy_n(p_values, n, p_data);
+ }
+
+ return Status::OK();
+ }
+};
+
+class Converter_Null : public Converter {
+ public:
+ explicit Converter_Null(const std::shared_ptr<ChunkedArray>& chunked_array)
+ : Converter(chunked_array) {}
+
+ SEXP Allocate(R_xlen_t n) const {
+ SEXP data = PROTECT(Rf_allocVector(LGLSXP, n));
+ std::fill_n(LOGICAL(data), n, NA_LOGICAL);
+ Rf_classgets(data, Rf_mkString("vctrs_unspecified"));
+ UNPROTECT(1);
+ return data;
+ }
+
+ Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
+ return Status::OK();
+ }
+
+ Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
+ return Status::OK();
+ }
+};
+
+bool ArraysCanFitInteger(ArrayVector arrays) {
+ bool all_can_fit = true;
+ auto i32 = arrow::int32();
+ for (const auto& array : arrays) {
+ if (all_can_fit) {
+ all_can_fit = arrow::IntegersCanFit(arrow::Datum(array), *i32).ok();
+ }
+ }
+ return all_can_fit;
+}
+
+bool GetBoolOption(const std::string& name, bool default_) {
+ SEXP getOption = Rf_install("getOption");
+ cpp11::sexp call = Rf_lang2(getOption, Rf_mkString(name.c_str()));
+ cpp11::sexp res = Rf_eval(call, R_BaseEnv);
+ if (TYPEOF(res) == LGLSXP) {
+ return LOGICAL(res)[0] == TRUE;
+ } else {
+ return default_;
+ }
+}
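+
+// The C++ counterpart of consulting getOption(name) in R: a logical result is
+// used as-is (NA counts as FALSE), anything else, including NULL when the
+// option is unset, falls back to `default_`.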
+
+std::shared_ptr<Converter> Converter::Make(
+ const std::shared_ptr<ChunkedArray>& chunked_array) {
+ const auto& type = chunked_array->type();
+ switch (type->id()) {
+ // direct support
+ case Type::INT32:
+ return std::make_shared<arrow::r::Converter_Int<arrow::Int32Type>>(chunked_array);
+
+ case Type::DOUBLE:
+ return std::make_shared<arrow::r::Converter_Double<arrow::DoubleType>>(
+ chunked_array);
+
+ // need to handle 1-bit case
+ case Type::BOOL:
+ return std::make_shared<arrow::r::Converter_Boolean>(chunked_array);
+
+ case Type::BINARY:
+ return std::make_shared<arrow::r::Converter_Binary<arrow::BinaryArray>>(
+ chunked_array);
+
+ case Type::LARGE_BINARY:
+ return std::make_shared<arrow::r::Converter_Binary<arrow::LargeBinaryArray>>(
+ chunked_array);
+
+ case Type::FIXED_SIZE_BINARY:
+ return std::make_shared<arrow::r::Converter_FixedSizeBinary>(
+ chunked_array, checked_cast<const FixedSizeBinaryType&>(*type).byte_width());
+
+ // handle memory-dense strings
+ case Type::STRING:
+ return std::make_shared<arrow::r::Converter_String<arrow::StringArray>>(
+ chunked_array);
+
+ case Type::LARGE_STRING:
+ return std::make_shared<arrow::r::Converter_String<arrow::LargeStringArray>>(
+ chunked_array);
+
+ case Type::DICTIONARY:
+ return std::make_shared<arrow::r::Converter_Dictionary>(chunked_array);
+
+ case Type::DATE32:
+ return std::make_shared<arrow::r::Converter_Date32>(chunked_array);
+
+ case Type::DATE64:
+ return std::make_shared<arrow::r::Converter_Date64>(chunked_array);
+
+ // promotions to integer vector
+ case Type::INT8:
+ return std::make_shared<arrow::r::Converter_Int<arrow::Int8Type>>(chunked_array);
+
+ case Type::UINT8:
+ return std::make_shared<arrow::r::Converter_Int<arrow::UInt8Type>>(chunked_array);
+
+ case Type::INT16:
+ return std::make_shared<arrow::r::Converter_Int<arrow::Int16Type>>(chunked_array);
+
+ case Type::UINT16:
+ return std::make_shared<arrow::r::Converter_Int<arrow::UInt16Type>>(chunked_array);
+
+ // promotions to numeric vector, if they don't fit into int32
+ case Type::UINT32:
+ if (ArraysCanFitInteger(chunked_array->chunks())) {
+ return std::make_shared<arrow::r::Converter_Int<arrow::UInt32Type>>(
+ chunked_array);
+ } else {
+ return std::make_shared<arrow::r::Converter_Double<arrow::UInt32Type>>(
+ chunked_array);
+ }
+
+ case Type::UINT64:
+ if (ArraysCanFitInteger(chunked_array->chunks())) {
+ return std::make_shared<arrow::r::Converter_Int<arrow::UInt64Type>>(
+ chunked_array);
+ } else {
+ return std::make_shared<arrow::r::Converter_Double<arrow::UInt64Type>>(
+ chunked_array);
+ }
+
+ case Type::HALF_FLOAT:
+ return std::make_shared<arrow::r::Converter_Double<arrow::HalfFloatType>>(
+ chunked_array);
+
+ case Type::FLOAT:
+ return std::make_shared<arrow::r::Converter_Double<arrow::FloatType>>(
+ chunked_array);
+
+ // time32 and time64
+ case Type::TIME32:
+ return std::make_shared<arrow::r::Converter_Time<int32_t>>(chunked_array);
+
+ case Type::TIME64:
+ return std::make_shared<arrow::r::Converter_Time<int64_t>>(chunked_array);
+
+ case Type::TIMESTAMP:
+ return std::make_shared<arrow::r::Converter_Timestamp<int64_t>>(chunked_array);
+
+ case Type::INT64:
+ // Prefer integer if it fits, unless option arrow.int64_downcast is `false`
+ if (GetBoolOption("arrow.int64_downcast", true) &&
+ ArraysCanFitInteger(chunked_array->chunks())) {
+ return std::make_shared<arrow::r::Converter_Int<arrow::Int64Type>>(chunked_array);
+ } else {
+ return std::make_shared<arrow::r::Converter_Int64>(chunked_array);
+ }
+
+ case Type::DECIMAL:
+ return std::make_shared<arrow::r::Converter_Decimal>(chunked_array);
+
+ // nested
+ case Type::STRUCT:
+ return std::make_shared<arrow::r::Converter_Struct>(chunked_array);
+
+ case Type::LIST:
+ return std::make_shared<arrow::r::Converter_List<arrow::ListArray>>(
+ chunked_array, checked_cast<const arrow::ListType*>(type.get())->value_type());
+
+ case Type::LARGE_LIST:
+ return std::make_shared<arrow::r::Converter_List<arrow::LargeListArray>>(
+ chunked_array,
+ checked_cast<const arrow::LargeListType*>(type.get())->value_type());
+
+ case Type::FIXED_SIZE_LIST:
+ return std::make_shared<arrow::r::Converter_FixedSizeList>(
+ chunked_array,
+ checked_cast<const arrow::FixedSizeListType&>(*type).value_type(),
+ checked_cast<const arrow::FixedSizeListType&>(*type).list_size());
+
+ case Type::NA:
+ return std::make_shared<arrow::r::Converter_Null>(chunked_array);
+
+ default:
+ break;
+ }
+
+ cpp11::stop("cannot handle Array of type ", type->name().c_str());
+}
+
+std::shared_ptr<ChunkedArray> to_chunks(const std::shared_ptr<Array>& array) {
+ return std::make_shared<ChunkedArray>(array);
+}
+
+std::shared_ptr<ChunkedArray> to_chunks(
+ const std::shared_ptr<ChunkedArray>& chunked_array) {
+ return chunked_array;
+}
+
+template <typename Rectangle>
+cpp11::writable::list to_data_frame(const std::shared_ptr<Rectangle>& data,
+ bool use_threads) {
+ int64_t nc = data->num_columns();
+ int64_t nr = data->num_rows();
+ cpp11::writable::strings names(nc);
+
+ arrow::r::RTasks tasks(use_threads);
+
+ cpp11::writable::list tbl(nc);
+
+ for (int i = 0; i < nc; i++) {
+ names[i] = data->schema()->field(i)->name();
+ tbl[i] = Converter::LazyConvert(to_chunks(data->column(i)), tasks);
+ }
+
+ StopIfNotOk(tasks.Finish());
+
+ tbl.attr(R_NamesSymbol) = names;
+ tbl.attr(R_ClassSymbol) = arrow::r::data::classes_tbl_df;
+ tbl.attr(R_RowNamesSymbol) = arrow::r::short_row_names(nr);
+
+ return tbl;
+}
+
+} // namespace r
+} // namespace arrow
+
+// [[arrow::export]]
+SEXP Array__as_vector(const std::shared_ptr<arrow::Array>& array) {
+ return arrow::r::Converter::Convert(array);
+}
+
+// [[arrow::export]]
+SEXP ChunkedArray__as_vector(const std::shared_ptr<arrow::ChunkedArray>& chunked_array,
+ bool use_threads = false) {
+ return arrow::r::Converter::Convert(chunked_array, use_threads);
+}
+
+// [[arrow::export]]
+cpp11::writable::list RecordBatch__to_dataframe(
+ const std::shared_ptr<arrow::RecordBatch>& batch, bool use_threads) {
+ return arrow::r::to_data_frame(batch, use_threads);
+}
+
+// [[arrow::export]]
+cpp11::writable::list Table__to_dataframe(const std::shared_ptr<arrow::Table>& table,
+ bool use_threads) {
+ return arrow::r::to_data_frame(table, use_threads);
+}
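+
+// On the R side these exports back as.vector() for Array/ChunkedArray and
+// as.data.frame() for RecordBatch/Table; a usage sketch:
+//
+//   tab <- Table$create(x = 1:3, y = c("a", "b", "c"))
+//   df <- as.data.frame(tab)   # each column converted by a Converter above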
+
+#endif
diff --git a/src/arrow/r/src/arraydata.cpp b/src/arrow/r/src/arraydata.cpp
new file mode 100644
index 000000000..179532a64
--- /dev/null
+++ b/src/arrow/r/src/arraydata.cpp
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+#include <arrow/array/data.h>
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> ArrayData__get_type(
+ const std::shared_ptr<arrow::ArrayData>& x) {
+ return x->type;
+}
+
+// [[arrow::export]]
+int ArrayData__get_length(const std::shared_ptr<arrow::ArrayData>& x) {
+ return x->length;
+}
+
+// [[arrow::export]]
+int ArrayData__get_null_count(const std::shared_ptr<arrow::ArrayData>& x) {
+ return x->null_count;
+}
+
+// [[arrow::export]]
+int ArrayData__get_offset(const std::shared_ptr<arrow::ArrayData>& x) {
+ return x->offset;
+}
+
+// [[arrow::export]]
+cpp11::list ArrayData__buffers(const std::shared_ptr<arrow::ArrayData>& x) {
+ return arrow::r::to_r_list(x->buffers);
+}
+
+#endif
diff --git a/src/arrow/r/src/arrowExports.cpp b/src/arrow/r/src/arrowExports.cpp
new file mode 100644
index 000000000..5872aa4d2
--- /dev/null
+++ b/src/arrow/r/src/arrowExports.cpp
@@ -0,0 +1,7636 @@
+// Generated by using data-raw/codegen.R -> do not edit by hand
+#include <cpp11.hpp>
+#include <cpp11/declarations.hpp>
+
+#include "./arrow_types.h"
+
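+// Pattern for every export below: when Arrow C++ is available, an extern "C"
+// wrapper converts the SEXP arguments via arrow::r::Input<>, calls the C++
+// function inside BEGIN_CPP11/END_CPP11, and returns the result through
+// cpp11::as_sexp() (or R_NilValue for void); otherwise a stub raises an
+// error pointing at the installation instructions.
+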
+// altrep.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void test_SET_STRING_ELT(SEXP s);
+extern "C" SEXP _arrow_test_SET_STRING_ELT(SEXP s_sexp){
+BEGIN_CPP11
+ arrow::r::Input<SEXP>::type s(s_sexp);
+ test_SET_STRING_ELT(s);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_test_SET_STRING_ELT(SEXP s_sexp){
+ Rf_error("Cannot call test_SET_STRING_ELT(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// altrep.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool test_same_Array(SEXP x, SEXP y);
+extern "C" SEXP _arrow_test_same_Array(SEXP x_sexp, SEXP y_sexp){
+BEGIN_CPP11
+ arrow::r::Input<SEXP>::type x(x_sexp);
+ arrow::r::Input<SEXP>::type y(y_sexp);
+ return cpp11::as_sexp(test_same_Array(x, y));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_test_same_Array(SEXP x_sexp, SEXP y_sexp){
+ Rf_error("Cannot call test_same_Array(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// altrep.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool is_arrow_altrep(SEXP x);
+extern "C" SEXP _arrow_is_arrow_altrep(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<SEXP>::type x(x_sexp);
+ return cpp11::as_sexp(is_arrow_altrep(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_is_arrow_altrep(SEXP x_sexp){
+ Rf_error("Cannot call is_arrow_altrep(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Array> Array__Slice1(const std::shared_ptr<arrow::Array>& array, R_xlen_t offset);
+extern "C" SEXP _arrow_Array__Slice1(SEXP array_sexp, SEXP offset_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type array(array_sexp);
+ arrow::r::Input<R_xlen_t>::type offset(offset_sexp);
+ return cpp11::as_sexp(Array__Slice1(array, offset));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Array__Slice1(SEXP array_sexp, SEXP offset_sexp){
+ Rf_error("Cannot call Array__Slice1(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Array> Array__Slice2(const std::shared_ptr<arrow::Array>& array, R_xlen_t offset, R_xlen_t length);
+extern "C" SEXP _arrow_Array__Slice2(SEXP array_sexp, SEXP offset_sexp, SEXP length_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type array(array_sexp);
+ arrow::r::Input<R_xlen_t>::type offset(offset_sexp);
+ arrow::r::Input<R_xlen_t>::type length(length_sexp);
+ return cpp11::as_sexp(Array__Slice2(array, offset, length));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Array__Slice2(SEXP array_sexp, SEXP offset_sexp, SEXP length_sexp){
+ Rf_error("Cannot call Array__Slice2(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool Array__IsNull(const std::shared_ptr<arrow::Array>& x, R_xlen_t i);
+extern "C" SEXP _arrow_Array__IsNull(SEXP x_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type x(x_sexp);
+ arrow::r::Input<R_xlen_t>::type i(i_sexp);
+ return cpp11::as_sexp(Array__IsNull(x, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Array__IsNull(SEXP x_sexp, SEXP i_sexp){
+ Rf_error("Cannot call Array__IsNull(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool Array__IsValid(const std::shared_ptr<arrow::Array>& x, R_xlen_t i);
+extern "C" SEXP _arrow_Array__IsValid(SEXP x_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type x(x_sexp);
+ arrow::r::Input<R_xlen_t>::type i(i_sexp);
+ return cpp11::as_sexp(Array__IsValid(x, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Array__IsValid(SEXP x_sexp, SEXP i_sexp){
+ Rf_error("Cannot call Array__IsValid(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int Array__length(const std::shared_ptr<arrow::Array>& x);
+extern "C" SEXP _arrow_Array__length(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type x(x_sexp);
+ return cpp11::as_sexp(Array__length(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Array__length(SEXP x_sexp){
+ Rf_error("Cannot call Array__length(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int Array__offset(const std::shared_ptr<arrow::Array>& x);
+extern "C" SEXP _arrow_Array__offset(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type x(x_sexp);
+ return cpp11::as_sexp(Array__offset(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Array__offset(SEXP x_sexp){
+ Rf_error("Cannot call Array__offset(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int Array__null_count(const std::shared_ptr<arrow::Array>& x);
+extern "C" SEXP _arrow_Array__null_count(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type x(x_sexp);
+ return cpp11::as_sexp(Array__null_count(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Array__null_count(SEXP x_sexp){
+ Rf_error("Cannot call Array__null_count(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Array__type(const std::shared_ptr<arrow::Array>& x);
+extern "C" SEXP _arrow_Array__type(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type x(x_sexp);
+ return cpp11::as_sexp(Array__type(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Array__type(SEXP x_sexp){
+ Rf_error("Cannot call Array__type(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string Array__ToString(const std::shared_ptr<arrow::Array>& x);
+extern "C" SEXP _arrow_Array__ToString(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type x(x_sexp);
+ return cpp11::as_sexp(Array__ToString(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Array__ToString(SEXP x_sexp){
+ Rf_error("Cannot call Array__ToString(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+arrow::Type::type Array__type_id(const std::shared_ptr<arrow::Array>& x);
+extern "C" SEXP _arrow_Array__type_id(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type x(x_sexp);
+ return cpp11::as_sexp(Array__type_id(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Array__type_id(SEXP x_sexp){
+ Rf_error("Cannot call Array__type_id(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool Array__Equals(const std::shared_ptr<arrow::Array>& lhs, const std::shared_ptr<arrow::Array>& rhs);
+extern "C" SEXP _arrow_Array__Equals(SEXP lhs_sexp, SEXP rhs_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type lhs(lhs_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type rhs(rhs_sexp);
+ return cpp11::as_sexp(Array__Equals(lhs, rhs));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Array__Equals(SEXP lhs_sexp, SEXP rhs_sexp){
+ Rf_error("Cannot call Array__Equals(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool Array__ApproxEquals(const std::shared_ptr<arrow::Array>& lhs, const std::shared_ptr<arrow::Array>& rhs);
+extern "C" SEXP _arrow_Array__ApproxEquals(SEXP lhs_sexp, SEXP rhs_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type lhs(lhs_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type rhs(rhs_sexp);
+ return cpp11::as_sexp(Array__ApproxEquals(lhs, rhs));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Array__ApproxEquals(SEXP lhs_sexp, SEXP rhs_sexp){
+ Rf_error("Cannot call Array__ApproxEquals(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string Array__Diff(const std::shared_ptr<arrow::Array>& lhs, const std::shared_ptr<arrow::Array>& rhs);
+extern "C" SEXP _arrow_Array__Diff(SEXP lhs_sexp, SEXP rhs_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type lhs(lhs_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type rhs(rhs_sexp);
+ return cpp11::as_sexp(Array__Diff(lhs, rhs));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Array__Diff(SEXP lhs_sexp, SEXP rhs_sexp){
+ Rf_error("Cannot call Array__Diff(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::ArrayData> Array__data(const std::shared_ptr<arrow::Array>& array);
+extern "C" SEXP _arrow_Array__data(SEXP array_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type array(array_sexp);
+ return cpp11::as_sexp(Array__data(array));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Array__data(SEXP array_sexp){
+ Rf_error("Cannot call Array__data(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool Array__RangeEquals(const std::shared_ptr<arrow::Array>& self, const std::shared_ptr<arrow::Array>& other, R_xlen_t start_idx, R_xlen_t end_idx, R_xlen_t other_start_idx);
+extern "C" SEXP _arrow_Array__RangeEquals(SEXP self_sexp, SEXP other_sexp, SEXP start_idx_sexp, SEXP end_idx_sexp, SEXP other_start_idx_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type self(self_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type other(other_sexp);
+ arrow::r::Input<R_xlen_t>::type start_idx(start_idx_sexp);
+ arrow::r::Input<R_xlen_t>::type end_idx(end_idx_sexp);
+ arrow::r::Input<R_xlen_t>::type other_start_idx(other_start_idx_sexp);
+ return cpp11::as_sexp(Array__RangeEquals(self, other, start_idx, end_idx, other_start_idx));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Array__RangeEquals(SEXP self_sexp, SEXP other_sexp, SEXP start_idx_sexp, SEXP end_idx_sexp, SEXP other_start_idx_sexp){
+ Rf_error("Cannot call Array__RangeEquals(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Array> Array__View(const std::shared_ptr<arrow::Array>& array, const std::shared_ptr<arrow::DataType>& type);
+extern "C" SEXP _arrow_Array__View(SEXP array_sexp, SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type array(array_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::DataType>&>::type type(type_sexp);
+ return cpp11::as_sexp(Array__View(array, type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Array__View(SEXP array_sexp, SEXP type_sexp){
+ Rf_error("Cannot call Array__View(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void Array__Validate(const std::shared_ptr<arrow::Array>& array);
+extern "C" SEXP _arrow_Array__Validate(SEXP array_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type array(array_sexp);
+ Array__Validate(array);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Array__Validate(SEXP array_sexp){
+ Rf_error("Cannot call Array__Validate(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
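+// Note: void-returning exports such as Array__Validate() above have no value
+// to convert back to R, so the generated wrapper calls the function for its
+// side effect (here, raising on an invalid array) and returns R_NilValue.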
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Array> DictionaryArray__indices(const std::shared_ptr<arrow::DictionaryArray>& array);
+extern "C" SEXP _arrow_DictionaryArray__indices(SEXP array_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::DictionaryArray>&>::type array(array_sexp);
+ return cpp11::as_sexp(DictionaryArray__indices(array));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_DictionaryArray__indices(SEXP array_sexp){
+ Rf_error("Cannot call DictionaryArray__indices(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Array> DictionaryArray__dictionary(const std::shared_ptr<arrow::DictionaryArray>& array);
+extern "C" SEXP _arrow_DictionaryArray__dictionary(SEXP array_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::DictionaryArray>&>::type array(array_sexp);
+ return cpp11::as_sexp(DictionaryArray__dictionary(array));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_DictionaryArray__dictionary(SEXP array_sexp){
+ Rf_error("Cannot call DictionaryArray__dictionary(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Array> StructArray__field(const std::shared_ptr<arrow::StructArray>& array, int i);
+extern "C" SEXP _arrow_StructArray__field(SEXP array_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::StructArray>&>::type array(array_sexp);
+ arrow::r::Input<int>::type i(i_sexp);
+ return cpp11::as_sexp(StructArray__field(array, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_StructArray__field(SEXP array_sexp, SEXP i_sexp){
+ Rf_error("Cannot call StructArray__field(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Array> StructArray__GetFieldByName(const std::shared_ptr<arrow::StructArray>& array, const std::string& name);
+extern "C" SEXP _arrow_StructArray__GetFieldByName(SEXP array_sexp, SEXP name_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::StructArray>&>::type array(array_sexp);
+ arrow::r::Input<const std::string&>::type name(name_sexp);
+ return cpp11::as_sexp(StructArray__GetFieldByName(array, name));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_StructArray__GetFieldByName(SEXP array_sexp, SEXP name_sexp){
+ Rf_error("Cannot call StructArray__GetFieldByName(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::list StructArray__Flatten(const std::shared_ptr<arrow::StructArray>& array);
+extern "C" SEXP _arrow_StructArray__Flatten(SEXP array_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::StructArray>&>::type array(array_sexp);
+ return cpp11::as_sexp(StructArray__Flatten(array));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_StructArray__Flatten(SEXP array_sexp){
+ Rf_error("Cannot call StructArray__Flatten(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> ListArray__value_type(const std::shared_ptr<arrow::ListArray>& array);
+extern "C" SEXP _arrow_ListArray__value_type(SEXP array_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ListArray>&>::type array(array_sexp);
+ return cpp11::as_sexp(ListArray__value_type(array));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ListArray__value_type(SEXP array_sexp){
+ Rf_error("Cannot call ListArray__value_type(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> LargeListArray__value_type(const std::shared_ptr<arrow::LargeListArray>& array);
+extern "C" SEXP _arrow_LargeListArray__value_type(SEXP array_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::LargeListArray>&>::type array(array_sexp);
+ return cpp11::as_sexp(LargeListArray__value_type(array));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_LargeListArray__value_type(SEXP array_sexp){
+ Rf_error("Cannot call LargeListArray__value_type(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Array> ListArray__values(const std::shared_ptr<arrow::ListArray>& array);
+extern "C" SEXP _arrow_ListArray__values(SEXP array_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ListArray>&>::type array(array_sexp);
+ return cpp11::as_sexp(ListArray__values(array));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ListArray__values(SEXP array_sexp){
+ Rf_error("Cannot call ListArray__values(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Array> LargeListArray__values(const std::shared_ptr<arrow::LargeListArray>& array);
+extern "C" SEXP _arrow_LargeListArray__values(SEXP array_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::LargeListArray>&>::type array(array_sexp);
+ return cpp11::as_sexp(LargeListArray__values(array));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_LargeListArray__values(SEXP array_sexp){
+ Rf_error("Cannot call LargeListArray__values(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int32_t ListArray__value_length(const std::shared_ptr<arrow::ListArray>& array, int64_t i);
+extern "C" SEXP _arrow_ListArray__value_length(SEXP array_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ListArray>&>::type array(array_sexp);
+ arrow::r::Input<int64_t>::type i(i_sexp);
+ return cpp11::as_sexp(ListArray__value_length(array, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ListArray__value_length(SEXP array_sexp, SEXP i_sexp){
+ Rf_error("Cannot call ListArray__value_length(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int64_t LargeListArray__value_length(const std::shared_ptr<arrow::LargeListArray>& array, int64_t i);
+extern "C" SEXP _arrow_LargeListArray__value_length(SEXP array_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::LargeListArray>&>::type array(array_sexp);
+ arrow::r::Input<int64_t>::type i(i_sexp);
+ return cpp11::as_sexp(LargeListArray__value_length(array, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_LargeListArray__value_length(SEXP array_sexp, SEXP i_sexp){
+ Rf_error("Cannot call LargeListArray__value_length(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int64_t FixedSizeListArray__value_length(const std::shared_ptr<arrow::FixedSizeListArray>& array, int64_t i);
+extern "C" SEXP _arrow_FixedSizeListArray__value_length(SEXP array_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::FixedSizeListArray>&>::type array(array_sexp);
+ arrow::r::Input<int64_t>::type i(i_sexp);
+ return cpp11::as_sexp(FixedSizeListArray__value_length(array, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_FixedSizeListArray__value_length(SEXP array_sexp, SEXP i_sexp){
+ Rf_error("Cannot call FixedSizeListArray__value_length(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int32_t ListArray__value_offset(const std::shared_ptr<arrow::ListArray>& array, int64_t i);
+extern "C" SEXP _arrow_ListArray__value_offset(SEXP array_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ListArray>&>::type array(array_sexp);
+ arrow::r::Input<int64_t>::type i(i_sexp);
+ return cpp11::as_sexp(ListArray__value_offset(array, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ListArray__value_offset(SEXP array_sexp, SEXP i_sexp){
+ Rf_error("Cannot call ListArray__value_offset(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int64_t LargeListArray__value_offset(const std::shared_ptr<arrow::LargeListArray>& array, int64_t i);
+extern "C" SEXP _arrow_LargeListArray__value_offset(SEXP array_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::LargeListArray>&>::type array(array_sexp);
+ arrow::r::Input<int64_t>::type i(i_sexp);
+ return cpp11::as_sexp(LargeListArray__value_offset(array, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_LargeListArray__value_offset(SEXP array_sexp, SEXP i_sexp){
+ Rf_error("Cannot call LargeListArray__value_offset(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int64_t FixedSizeListArray__value_offset(const std::shared_ptr<arrow::FixedSizeListArray>& array, int64_t i);
+extern "C" SEXP _arrow_FixedSizeListArray__value_offset(SEXP array_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::FixedSizeListArray>&>::type array(array_sexp);
+ arrow::r::Input<int64_t>::type i(i_sexp);
+ return cpp11::as_sexp(FixedSizeListArray__value_offset(array, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_FixedSizeListArray__value_offset(SEXP array_sexp, SEXP i_sexp){
+ Rf_error("Cannot call FixedSizeListArray__value_offset(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::writable::integers ListArray__raw_value_offsets(const std::shared_ptr<arrow::ListArray>& array);
+extern "C" SEXP _arrow_ListArray__raw_value_offsets(SEXP array_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ListArray>&>::type array(array_sexp);
+ return cpp11::as_sexp(ListArray__raw_value_offsets(array));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ListArray__raw_value_offsets(SEXP array_sexp){
+ Rf_error("Cannot call ListArray__raw_value_offsets(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::writable::integers LargeListArray__raw_value_offsets(const std::shared_ptr<arrow::LargeListArray>& array);
+extern "C" SEXP _arrow_LargeListArray__raw_value_offsets(SEXP array_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::LargeListArray>&>::type array(array_sexp);
+ return cpp11::as_sexp(LargeListArray__raw_value_offsets(array));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_LargeListArray__raw_value_offsets(SEXP array_sexp){
+ Rf_error("Cannot call LargeListArray__raw_value_offsets(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array_to_vector.cpp
+#if defined(ARROW_R_WITH_ARROW)
+SEXP Array__as_vector(const std::shared_ptr<arrow::Array>& array);
+extern "C" SEXP _arrow_Array__as_vector(SEXP array_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type array(array_sexp);
+ return cpp11::as_sexp(Array__as_vector(array));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Array__as_vector(SEXP array_sexp){
+ Rf_error("Cannot call Array__as_vector(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
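+// The array_to_vector.cpp exports below materialize Arrow data as R vectors
+// and data frames. The use_threads flag is forwarded to the converter;
+// presumably it allows chunks/columns to be converted in parallel, matching
+// the arrow.use_threads option on the R side.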
+// array_to_vector.cpp
+#if defined(ARROW_R_WITH_ARROW)
+SEXP ChunkedArray__as_vector(const std::shared_ptr<arrow::ChunkedArray>& chunked_array, bool use_threads);
+extern "C" SEXP _arrow_ChunkedArray__as_vector(SEXP chunked_array_sexp, SEXP use_threads_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ChunkedArray>&>::type chunked_array(chunked_array_sexp);
+ arrow::r::Input<bool>::type use_threads(use_threads_sexp);
+ return cpp11::as_sexp(ChunkedArray__as_vector(chunked_array, use_threads));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ChunkedArray__as_vector(SEXP chunked_array_sexp, SEXP use_threads_sexp){
+ Rf_error("Cannot call ChunkedArray__as_vector(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array_to_vector.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::writable::list RecordBatch__to_dataframe(const std::shared_ptr<arrow::RecordBatch>& batch, bool use_threads);
+extern "C" SEXP _arrow_RecordBatch__to_dataframe(SEXP batch_sexp, SEXP use_threads_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type batch(batch_sexp);
+ arrow::r::Input<bool>::type use_threads(use_threads_sexp);
+ return cpp11::as_sexp(RecordBatch__to_dataframe(batch, use_threads));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__to_dataframe(SEXP batch_sexp, SEXP use_threads_sexp){
+ Rf_error("Cannot call RecordBatch__to_dataframe(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// array_to_vector.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::writable::list Table__to_dataframe(const std::shared_ptr<arrow::Table>& table, bool use_threads);
+extern "C" SEXP _arrow_Table__to_dataframe(SEXP table_sexp, SEXP use_threads_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ arrow::r::Input<bool>::type use_threads(use_threads_sexp);
+ return cpp11::as_sexp(Table__to_dataframe(table, use_threads));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__to_dataframe(SEXP table_sexp, SEXP use_threads_sexp){
+ Rf_error("Cannot call Table__to_dataframe(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// arraydata.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> ArrayData__get_type(const std::shared_ptr<arrow::ArrayData>& x);
+extern "C" SEXP _arrow_ArrayData__get_type(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ArrayData>&>::type x(x_sexp);
+ return cpp11::as_sexp(ArrayData__get_type(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ArrayData__get_type(SEXP x_sexp){
+ Rf_error("Cannot call ArrayData__get_type(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// arraydata.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int ArrayData__get_length(const std::shared_ptr<arrow::ArrayData>& x);
+extern "C" SEXP _arrow_ArrayData__get_length(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ArrayData>&>::type x(x_sexp);
+ return cpp11::as_sexp(ArrayData__get_length(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ArrayData__get_length(SEXP x_sexp){
+ Rf_error("Cannot call ArrayData__get_length(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// arraydata.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int ArrayData__get_null_count(const std::shared_ptr<arrow::ArrayData>& x);
+extern "C" SEXP _arrow_ArrayData__get_null_count(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ArrayData>&>::type x(x_sexp);
+ return cpp11::as_sexp(ArrayData__get_null_count(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ArrayData__get_null_count(SEXP x_sexp){
+ Rf_error("Cannot call ArrayData__get_null_count(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// arraydata.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int ArrayData__get_offset(const std::shared_ptr<arrow::ArrayData>& x);
+extern "C" SEXP _arrow_ArrayData__get_offset(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ArrayData>&>::type x(x_sexp);
+ return cpp11::as_sexp(ArrayData__get_offset(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ArrayData__get_offset(SEXP x_sexp){
+ Rf_error("Cannot call ArrayData__get_offset(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// arraydata.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::list ArrayData__buffers(const std::shared_ptr<arrow::ArrayData>& x);
+extern "C" SEXP _arrow_ArrayData__buffers(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ArrayData>&>::type x(x_sexp);
+ return cpp11::as_sexp(ArrayData__buffers(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ArrayData__buffers(SEXP x_sexp){
+ Rf_error("Cannot call ArrayData__buffers(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// buffer.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool Buffer__is_mutable(const std::shared_ptr<arrow::Buffer>& buffer);
+extern "C" SEXP _arrow_Buffer__is_mutable(SEXP buffer_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Buffer>&>::type buffer(buffer_sexp);
+ return cpp11::as_sexp(Buffer__is_mutable(buffer));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Buffer__is_mutable(SEXP buffer_sexp){
+ Rf_error("Cannot call Buffer__is_mutable(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// buffer.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void Buffer__ZeroPadding(const std::shared_ptr<arrow::Buffer>& buffer);
+extern "C" SEXP _arrow_Buffer__ZeroPadding(SEXP buffer_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Buffer>&>::type buffer(buffer_sexp);
+ Buffer__ZeroPadding(buffer);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Buffer__ZeroPadding(SEXP buffer_sexp){
+ Rf_error("Cannot call Buffer__ZeroPadding(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// buffer.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int64_t Buffer__capacity(const std::shared_ptr<arrow::Buffer>& buffer);
+extern "C" SEXP _arrow_Buffer__capacity(SEXP buffer_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Buffer>&>::type buffer(buffer_sexp);
+ return cpp11::as_sexp(Buffer__capacity(buffer));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Buffer__capacity(SEXP buffer_sexp){
+ Rf_error("Cannot call Buffer__capacity(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// buffer.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int64_t Buffer__size(const std::shared_ptr<arrow::Buffer>& buffer);
+extern "C" SEXP _arrow_Buffer__size(SEXP buffer_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Buffer>&>::type buffer(buffer_sexp);
+ return cpp11::as_sexp(Buffer__size(buffer));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Buffer__size(SEXP buffer_sexp){
+ Rf_error("Cannot call Buffer__size(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// buffer.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Buffer> r___RBuffer__initialize(SEXP x);
+extern "C" SEXP _arrow_r___RBuffer__initialize(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<SEXP>::type x(x_sexp);
+ return cpp11::as_sexp(r___RBuffer__initialize(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_r___RBuffer__initialize(SEXP x_sexp){
+ Rf_error("Cannot call r___RBuffer__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// buffer.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::writable::raws Buffer__data(const std::shared_ptr<arrow::Buffer>& buffer);
+extern "C" SEXP _arrow_Buffer__data(SEXP buffer_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Buffer>&>::type buffer(buffer_sexp);
+ return cpp11::as_sexp(Buffer__data(buffer));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Buffer__data(SEXP buffer_sexp){
+ Rf_error("Cannot call Buffer__data(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// buffer.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool Buffer__Equals(const std::shared_ptr<arrow::Buffer>& x, const std::shared_ptr<arrow::Buffer>& y);
+extern "C" SEXP _arrow_Buffer__Equals(SEXP x_sexp, SEXP y_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Buffer>&>::type x(x_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Buffer>&>::type y(y_sexp);
+ return cpp11::as_sexp(Buffer__Equals(x, y));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Buffer__Equals(SEXP x_sexp, SEXP y_sexp){
+ Rf_error("Cannot call Buffer__Equals(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// chunkedarray.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int ChunkedArray__length(const std::shared_ptr<arrow::ChunkedArray>& chunked_array);
+extern "C" SEXP _arrow_ChunkedArray__length(SEXP chunked_array_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ChunkedArray>&>::type chunked_array(chunked_array_sexp);
+ return cpp11::as_sexp(ChunkedArray__length(chunked_array));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ChunkedArray__length(SEXP chunked_array_sexp){
+ Rf_error("Cannot call ChunkedArray__length(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// chunkedarray.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int ChunkedArray__null_count(const std::shared_ptr<arrow::ChunkedArray>& chunked_array);
+extern "C" SEXP _arrow_ChunkedArray__null_count(SEXP chunked_array_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ChunkedArray>&>::type chunked_array(chunked_array_sexp);
+ return cpp11::as_sexp(ChunkedArray__null_count(chunked_array));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ChunkedArray__null_count(SEXP chunked_array_sexp){
+ Rf_error("Cannot call ChunkedArray__null_count(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// chunkedarray.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int ChunkedArray__num_chunks(const std::shared_ptr<arrow::ChunkedArray>& chunked_array);
+extern "C" SEXP _arrow_ChunkedArray__num_chunks(SEXP chunked_array_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ChunkedArray>&>::type chunked_array(chunked_array_sexp);
+ return cpp11::as_sexp(ChunkedArray__num_chunks(chunked_array));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ChunkedArray__num_chunks(SEXP chunked_array_sexp){
+ Rf_error("Cannot call ChunkedArray__num_chunks(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// chunkedarray.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Array> ChunkedArray__chunk(const std::shared_ptr<arrow::ChunkedArray>& chunked_array, int i);
+extern "C" SEXP _arrow_ChunkedArray__chunk(SEXP chunked_array_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ChunkedArray>&>::type chunked_array(chunked_array_sexp);
+ arrow::r::Input<int>::type i(i_sexp);
+ return cpp11::as_sexp(ChunkedArray__chunk(chunked_array, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ChunkedArray__chunk(SEXP chunked_array_sexp, SEXP i_sexp){
+ Rf_error("Cannot call ChunkedArray__chunk(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// chunkedarray.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::list ChunkedArray__chunks(const std::shared_ptr<arrow::ChunkedArray>& chunked_array);
+extern "C" SEXP _arrow_ChunkedArray__chunks(SEXP chunked_array_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ChunkedArray>&>::type chunked_array(chunked_array_sexp);
+ return cpp11::as_sexp(ChunkedArray__chunks(chunked_array));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ChunkedArray__chunks(SEXP chunked_array_sexp){
+ Rf_error("Cannot call ChunkedArray__chunks(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// chunkedarray.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> ChunkedArray__type(const std::shared_ptr<arrow::ChunkedArray>& chunked_array);
+extern "C" SEXP _arrow_ChunkedArray__type(SEXP chunked_array_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ChunkedArray>&>::type chunked_array(chunked_array_sexp);
+ return cpp11::as_sexp(ChunkedArray__type(chunked_array));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ChunkedArray__type(SEXP chunked_array_sexp){
+ Rf_error("Cannot call ChunkedArray__type(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// chunkedarray.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::ChunkedArray> ChunkedArray__Slice1(const std::shared_ptr<arrow::ChunkedArray>& chunked_array, R_xlen_t offset);
+extern "C" SEXP _arrow_ChunkedArray__Slice1(SEXP chunked_array_sexp, SEXP offset_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ChunkedArray>&>::type chunked_array(chunked_array_sexp);
+ arrow::r::Input<R_xlen_t>::type offset(offset_sexp);
+ return cpp11::as_sexp(ChunkedArray__Slice1(chunked_array, offset));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ChunkedArray__Slice1(SEXP chunked_array_sexp, SEXP offset_sexp){
+ Rf_error("Cannot call ChunkedArray__Slice1(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// chunkedarray.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::ChunkedArray> ChunkedArray__Slice2(const std::shared_ptr<arrow::ChunkedArray>& chunked_array, R_xlen_t offset, R_xlen_t length);
+extern "C" SEXP _arrow_ChunkedArray__Slice2(SEXP chunked_array_sexp, SEXP offset_sexp, SEXP length_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ChunkedArray>&>::type chunked_array(chunked_array_sexp);
+ arrow::r::Input<R_xlen_t>::type offset(offset_sexp);
+ arrow::r::Input<R_xlen_t>::type length(length_sexp);
+ return cpp11::as_sexp(ChunkedArray__Slice2(chunked_array, offset, length));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ChunkedArray__Slice2(SEXP chunked_array_sexp, SEXP offset_sexp, SEXP length_sexp){
+ Rf_error("Cannot call ChunkedArray__Slice2(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// chunkedarray.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::ChunkedArray> ChunkedArray__View(const std::shared_ptr<arrow::ChunkedArray>& array, const std::shared_ptr<arrow::DataType>& type);
+extern "C" SEXP _arrow_ChunkedArray__View(SEXP array_sexp, SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ChunkedArray>&>::type array(array_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::DataType>&>::type type(type_sexp);
+ return cpp11::as_sexp(ChunkedArray__View(array, type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ChunkedArray__View(SEXP array_sexp, SEXP type_sexp){
+ Rf_error("Cannot call ChunkedArray__View(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// chunkedarray.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void ChunkedArray__Validate(const std::shared_ptr<arrow::ChunkedArray>& chunked_array);
+extern "C" SEXP _arrow_ChunkedArray__Validate(SEXP chunked_array_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ChunkedArray>&>::type chunked_array(chunked_array_sexp);
+ ChunkedArray__Validate(chunked_array);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ChunkedArray__Validate(SEXP chunked_array_sexp){
+ Rf_error("Cannot call ChunkedArray__Validate(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// chunkedarray.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool ChunkedArray__Equals(const std::shared_ptr<arrow::ChunkedArray>& x, const std::shared_ptr<arrow::ChunkedArray>& y);
+extern "C" SEXP _arrow_ChunkedArray__Equals(SEXP x_sexp, SEXP y_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ChunkedArray>&>::type x(x_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::ChunkedArray>&>::type y(y_sexp);
+ return cpp11::as_sexp(ChunkedArray__Equals(x, y));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ChunkedArray__Equals(SEXP x_sexp, SEXP y_sexp){
+ Rf_error("Cannot call ChunkedArray__Equals(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// chunkedarray.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string ChunkedArray__ToString(const std::shared_ptr<arrow::ChunkedArray>& x);
+extern "C" SEXP _arrow_ChunkedArray__ToString(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ChunkedArray>&>::type x(x_sexp);
+ return cpp11::as_sexp(ChunkedArray__ToString(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ChunkedArray__ToString(SEXP x_sexp){
+ Rf_error("Cannot call ChunkedArray__ToString(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// chunkedarray.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::ChunkedArray> ChunkedArray__from_list(cpp11::list chunks, SEXP s_type);
+extern "C" SEXP _arrow_ChunkedArray__from_list(SEXP chunks_sexp, SEXP s_type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<cpp11::list>::type chunks(chunks_sexp);
+ arrow::r::Input<SEXP>::type s_type(s_type_sexp);
+ return cpp11::as_sexp(ChunkedArray__from_list(chunks, s_type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ChunkedArray__from_list(SEXP chunks_sexp, SEXP s_type_sexp){
+ Rf_error("Cannot call ChunkedArray__from_list(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// compression.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::util::Codec> util___Codec__Create(arrow::Compression::type codec, R_xlen_t compression_level);
+extern "C" SEXP _arrow_util___Codec__Create(SEXP codec_sexp, SEXP compression_level_sexp){
+BEGIN_CPP11
+ arrow::r::Input<arrow::Compression::type>::type codec(codec_sexp);
+ arrow::r::Input<R_xlen_t>::type compression_level(compression_level_sexp);
+ return cpp11::as_sexp(util___Codec__Create(codec, compression_level));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_util___Codec__Create(SEXP codec_sexp, SEXP compression_level_sexp){
+ Rf_error("Cannot call util___Codec__Create(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// compression.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string util___Codec__name(const std::shared_ptr<arrow::util::Codec>& codec);
+extern "C" SEXP _arrow_util___Codec__name(SEXP codec_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::util::Codec>&>::type codec(codec_sexp);
+ return cpp11::as_sexp(util___Codec__name(codec));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_util___Codec__name(SEXP codec_sexp){
+ Rf_error("Cannot call util___Codec__name(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// compression.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool util___Codec__IsAvailable(arrow::Compression::type codec);
+extern "C" SEXP _arrow_util___Codec__IsAvailable(SEXP codec_sexp){
+BEGIN_CPP11
+ arrow::r::Input<arrow::Compression::type>::type codec(codec_sexp);
+ return cpp11::as_sexp(util___Codec__IsAvailable(codec));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_util___Codec__IsAvailable(SEXP codec_sexp){
+ Rf_error("Cannot call util___Codec__IsAvailable(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// compression.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::io::CompressedOutputStream> io___CompressedOutputStream__Make(const std::shared_ptr<arrow::util::Codec>& codec, const std::shared_ptr<arrow::io::OutputStream>& raw);
+extern "C" SEXP _arrow_io___CompressedOutputStream__Make(SEXP codec_sexp, SEXP raw_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::util::Codec>&>::type codec(codec_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::io::OutputStream>&>::type raw(raw_sexp);
+ return cpp11::as_sexp(io___CompressedOutputStream__Make(codec, raw));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___CompressedOutputStream__Make(SEXP codec_sexp, SEXP raw_sexp){
+ Rf_error("Cannot call io___CompressedOutputStream__Make(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// compression.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::io::CompressedInputStream> io___CompressedInputStream__Make(const std::shared_ptr<arrow::util::Codec>& codec, const std::shared_ptr<arrow::io::InputStream>& raw);
+extern "C" SEXP _arrow_io___CompressedInputStream__Make(SEXP codec_sexp, SEXP raw_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::util::Codec>&>::type codec(codec_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::io::InputStream>&>::type raw(raw_sexp);
+ return cpp11::as_sexp(io___CompressedInputStream__Make(codec, raw));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___CompressedInputStream__Make(SEXP codec_sexp, SEXP raw_sexp){
+ Rf_error("Cannot call io___CompressedInputStream__Make(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// compute-exec.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<compute::ExecPlan> ExecPlan_create(bool use_threads);
+extern "C" SEXP _arrow_ExecPlan_create(SEXP use_threads_sexp){
+BEGIN_CPP11
+ arrow::r::Input<bool>::type use_threads(use_threads_sexp);
+ return cpp11::as_sexp(ExecPlan_create(use_threads));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExecPlan_create(SEXP use_threads_sexp){
+ Rf_error("Cannot call ExecPlan_create(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// compute-exec.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::RecordBatchReader> ExecPlan_run(const std::shared_ptr<compute::ExecPlan>& plan, const std::shared_ptr<compute::ExecNode>& final_node, cpp11::list sort_options, int64_t head);
+extern "C" SEXP _arrow_ExecPlan_run(SEXP plan_sexp, SEXP final_node_sexp, SEXP sort_options_sexp, SEXP head_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<compute::ExecPlan>&>::type plan(plan_sexp);
+ arrow::r::Input<const std::shared_ptr<compute::ExecNode>&>::type final_node(final_node_sexp);
+ arrow::r::Input<cpp11::list>::type sort_options(sort_options_sexp);
+ arrow::r::Input<int64_t>::type head(head_sexp);
+ return cpp11::as_sexp(ExecPlan_run(plan, final_node, sort_options, head));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExecPlan_run(SEXP plan_sexp, SEXP final_node_sexp, SEXP sort_options_sexp, SEXP head_sexp){
+ Rf_error("Cannot call ExecPlan_run(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
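+// ExecPlan_run() above returns a RecordBatchReader over the plan's sink node;
+// sort_options and head appear to configure that sink, i.e. an optional
+// ordering and row limit applied when the plan's output is collected.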
+// compute-exec.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void ExecPlan_StopProducing(const std::shared_ptr<compute::ExecPlan>& plan);
+extern "C" SEXP _arrow_ExecPlan_StopProducing(SEXP plan_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<compute::ExecPlan>&>::type plan(plan_sexp);
+ ExecPlan_StopProducing(plan);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExecPlan_StopProducing(SEXP plan_sexp){
+ Rf_error("Cannot call ExecPlan_StopProducing(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
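+// Several ExecNode wrappers that follow (_Scan, _Filter, _Project,
+// _Aggregate, _Join) are guarded by ARROW_R_WITH_DATASET rather than
+// ARROW_R_WITH_ARROW, presumably because they depend on Arrow's dataset
+// component (ExecNode_Scan takes an arrow::dataset::Dataset). Their stubs
+// are compiled whenever that component is missing, even if the core
+// libraries are present; the fallback error message is the same generic one.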
+// compute-exec.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<arrow::Schema> ExecNode_output_schema(const std::shared_ptr<compute::ExecNode>& node);
+extern "C" SEXP _arrow_ExecNode_output_schema(SEXP node_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<compute::ExecNode>&>::type node(node_sexp);
+ return cpp11::as_sexp(ExecNode_output_schema(node));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExecNode_output_schema(SEXP node_sexp){
+ Rf_error("Cannot call ExecNode_output_schema(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// compute-exec.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<compute::ExecNode> ExecNode_Scan(const std::shared_ptr<compute::ExecPlan>& plan, const std::shared_ptr<arrow::dataset::Dataset>& dataset, const std::shared_ptr<compute::Expression>& filter, std::vector<std::string> materialized_field_names);
+extern "C" SEXP _arrow_ExecNode_Scan(SEXP plan_sexp, SEXP dataset_sexp, SEXP filter_sexp, SEXP materialized_field_names_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<compute::ExecPlan>&>::type plan(plan_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::dataset::Dataset>&>::type dataset(dataset_sexp);
+ arrow::r::Input<const std::shared_ptr<compute::Expression>&>::type filter(filter_sexp);
+ arrow::r::Input<std::vector<std::string>>::type materialized_field_names(materialized_field_names_sexp);
+ return cpp11::as_sexp(ExecNode_Scan(plan, dataset, filter, materialized_field_names));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExecNode_Scan(SEXP plan_sexp, SEXP dataset_sexp, SEXP filter_sexp, SEXP materialized_field_names_sexp){
+ Rf_error("Cannot call ExecNode_Scan(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// compute-exec.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<compute::ExecNode> ExecNode_Filter(const std::shared_ptr<compute::ExecNode>& input, const std::shared_ptr<compute::Expression>& filter);
+extern "C" SEXP _arrow_ExecNode_Filter(SEXP input_sexp, SEXP filter_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<compute::ExecNode>&>::type input(input_sexp);
+ arrow::r::Input<const std::shared_ptr<compute::Expression>&>::type filter(filter_sexp);
+ return cpp11::as_sexp(ExecNode_Filter(input, filter));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExecNode_Filter(SEXP input_sexp, SEXP filter_sexp){
+ Rf_error("Cannot call ExecNode_Filter(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// compute-exec.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<compute::ExecNode> ExecNode_Project(const std::shared_ptr<compute::ExecNode>& input, const std::vector<std::shared_ptr<compute::Expression>>& exprs, std::vector<std::string> names);
+extern "C" SEXP _arrow_ExecNode_Project(SEXP input_sexp, SEXP exprs_sexp, SEXP names_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<compute::ExecNode>&>::type input(input_sexp);
+ arrow::r::Input<const std::vector<std::shared_ptr<compute::Expression>>&>::type exprs(exprs_sexp);
+ arrow::r::Input<std::vector<std::string>>::type names(names_sexp);
+ return cpp11::as_sexp(ExecNode_Project(input, exprs, names));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExecNode_Project(SEXP input_sexp, SEXP exprs_sexp, SEXP names_sexp){
+ Rf_error("Cannot call ExecNode_Project(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// compute-exec.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<compute::ExecNode> ExecNode_Aggregate(const std::shared_ptr<compute::ExecNode>& input, cpp11::list options, std::vector<std::string> target_names, std::vector<std::string> out_field_names, std::vector<std::string> key_names);
+extern "C" SEXP _arrow_ExecNode_Aggregate(SEXP input_sexp, SEXP options_sexp, SEXP target_names_sexp, SEXP out_field_names_sexp, SEXP key_names_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<compute::ExecNode>&>::type input(input_sexp);
+ arrow::r::Input<cpp11::list>::type options(options_sexp);
+ arrow::r::Input<std::vector<std::string>>::type target_names(target_names_sexp);
+ arrow::r::Input<std::vector<std::string>>::type out_field_names(out_field_names_sexp);
+ arrow::r::Input<std::vector<std::string>>::type key_names(key_names_sexp);
+ return cpp11::as_sexp(ExecNode_Aggregate(input, options, target_names, out_field_names, key_names));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExecNode_Aggregate(SEXP input_sexp, SEXP options_sexp, SEXP target_names_sexp, SEXP out_field_names_sexp, SEXP key_names_sexp){
+ Rf_error("Cannot call ExecNode_Aggregate(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// compute-exec.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<compute::ExecNode> ExecNode_Join(const std::shared_ptr<compute::ExecNode>& input, int type, const std::shared_ptr<compute::ExecNode>& right_data, std::vector<std::string> left_keys, std::vector<std::string> right_keys, std::vector<std::string> left_output, std::vector<std::string> right_output);
+extern "C" SEXP _arrow_ExecNode_Join(SEXP input_sexp, SEXP type_sexp, SEXP right_data_sexp, SEXP left_keys_sexp, SEXP right_keys_sexp, SEXP left_output_sexp, SEXP right_output_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<compute::ExecNode>&>::type input(input_sexp);
+ arrow::r::Input<int>::type type(type_sexp);
+ arrow::r::Input<const std::shared_ptr<compute::ExecNode>&>::type right_data(right_data_sexp);
+ arrow::r::Input<std::vector<std::string>>::type left_keys(left_keys_sexp);
+ arrow::r::Input<std::vector<std::string>>::type right_keys(right_keys_sexp);
+ arrow::r::Input<std::vector<std::string>>::type left_output(left_output_sexp);
+ arrow::r::Input<std::vector<std::string>>::type right_output(right_output_sexp);
+ return cpp11::as_sexp(ExecNode_Join(input, type, right_data, left_keys, right_keys, left_output, right_output));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExecNode_Join(SEXP input_sexp, SEXP type_sexp, SEXP right_data_sexp, SEXP left_keys_sexp, SEXP right_keys_sexp, SEXP left_output_sexp, SEXP right_output_sexp){
+ Rf_error("Cannot call ExecNode_Join(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// compute-exec.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<compute::ExecNode> ExecNode_ReadFromRecordBatchReader(const std::shared_ptr<compute::ExecPlan>& plan, const std::shared_ptr<arrow::RecordBatchReader>& reader);
+extern "C" SEXP _arrow_ExecNode_ReadFromRecordBatchReader(SEXP plan_sexp, SEXP reader_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<compute::ExecPlan>&>::type plan(plan_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatchReader>&>::type reader(reader_sexp);
+ return cpp11::as_sexp(ExecNode_ReadFromRecordBatchReader(plan, reader));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExecNode_ReadFromRecordBatchReader(SEXP plan_sexp, SEXP reader_sexp){
+ Rf_error("Cannot call ExecNode_ReadFromRecordBatchReader(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// compute.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::RecordBatch> RecordBatch__cast(const std::shared_ptr<arrow::RecordBatch>& batch, const std::shared_ptr<arrow::Schema>& schema, cpp11::list options);
+extern "C" SEXP _arrow_RecordBatch__cast(SEXP batch_sexp, SEXP schema_sexp, SEXP options_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type batch(batch_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schema(schema_sexp);
+ arrow::r::Input<cpp11::list>::type options(options_sexp);
+ return cpp11::as_sexp(RecordBatch__cast(batch, schema, options));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__cast(SEXP batch_sexp, SEXP schema_sexp, SEXP options_sexp){
+ Rf_error("Cannot call RecordBatch__cast(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// compute.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Table> Table__cast(const std::shared_ptr<arrow::Table>& table, const std::shared_ptr<arrow::Schema>& schema, cpp11::list options);
+extern "C" SEXP _arrow_Table__cast(SEXP table_sexp, SEXP schema_sexp, SEXP options_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schema(schema_sexp);
+ arrow::r::Input<cpp11::list>::type options(options_sexp);
+ return cpp11::as_sexp(Table__cast(table, schema, options));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__cast(SEXP table_sexp, SEXP schema_sexp, SEXP options_sexp){
+ Rf_error("Cannot call Table__cast(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// compute.cpp
+#if defined(ARROW_R_WITH_ARROW)
+SEXP compute__CallFunction(std::string func_name, cpp11::list args, cpp11::list options);
+extern "C" SEXP _arrow_compute__CallFunction(SEXP func_name_sexp, SEXP args_sexp, SEXP options_sexp){
+BEGIN_CPP11
+ arrow::r::Input<std::string>::type func_name(func_name_sexp);
+ arrow::r::Input<cpp11::list>::type args(args_sexp);
+ arrow::r::Input<cpp11::list>::type options(options_sexp);
+ return cpp11::as_sexp(compute__CallFunction(func_name, args, options));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_compute__CallFunction(SEXP func_name_sexp, SEXP args_sexp, SEXP options_sexp){
+ Rf_error("Cannot call compute__CallFunction(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
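+// Nullary exports such as compute__GetFunctionNames(), build_info() and
+// runtime_info() below take no SEXP arguments, so their wrappers skip the
+// Input<> conversion step entirely and only convert the return value.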
+// compute.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::vector<std::string> compute__GetFunctionNames();
+extern "C" SEXP _arrow_compute__GetFunctionNames(){
+BEGIN_CPP11
+ return cpp11::as_sexp(compute__GetFunctionNames());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_compute__GetFunctionNames(){
+ Rf_error("Cannot call compute__GetFunctionNames(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// config.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::vector<std::string> build_info();
+extern "C" SEXP _arrow_build_info(){
+BEGIN_CPP11
+ return cpp11::as_sexp(build_info());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_build_info(){
+ Rf_error("Cannot call build_info(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// config.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::vector<std::string> runtime_info();
+extern "C" SEXP _arrow_runtime_info(){
+BEGIN_CPP11
+ return cpp11::as_sexp(runtime_info());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_runtime_info(){
+ Rf_error("Cannot call runtime_info(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::csv::WriteOptions> csv___WriteOptions__initialize(cpp11::list options);
+extern "C" SEXP _arrow_csv___WriteOptions__initialize(SEXP options_sexp){
+BEGIN_CPP11
+ arrow::r::Input<cpp11::list>::type options(options_sexp);
+ return cpp11::as_sexp(csv___WriteOptions__initialize(options));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_csv___WriteOptions__initialize(SEXP options_sexp){
+ Rf_error("Cannot call csv___WriteOptions__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::csv::ReadOptions> csv___ReadOptions__initialize(cpp11::list options);
+extern "C" SEXP _arrow_csv___ReadOptions__initialize(SEXP options_sexp){
+BEGIN_CPP11
+ arrow::r::Input<cpp11::list>::type options(options_sexp);
+ return cpp11::as_sexp(csv___ReadOptions__initialize(options));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_csv___ReadOptions__initialize(SEXP options_sexp){
+ Rf_error("Cannot call csv___ReadOptions__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::csv::ParseOptions> csv___ParseOptions__initialize(cpp11::list options);
+extern "C" SEXP _arrow_csv___ParseOptions__initialize(SEXP options_sexp){
+BEGIN_CPP11
+ arrow::r::Input<cpp11::list>::type options(options_sexp);
+ return cpp11::as_sexp(csv___ParseOptions__initialize(options));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_csv___ParseOptions__initialize(SEXP options_sexp){
+ Rf_error("Cannot call csv___ParseOptions__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+SEXP csv___ReadOptions__column_names(const std::shared_ptr<arrow::csv::ReadOptions>& options);
+extern "C" SEXP _arrow_csv___ReadOptions__column_names(SEXP options_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::csv::ReadOptions>&>::type options(options_sexp);
+ return cpp11::as_sexp(csv___ReadOptions__column_names(options));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_csv___ReadOptions__column_names(SEXP options_sexp){
+ Rf_error("Cannot call csv___ReadOptions__column_names(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::csv::ConvertOptions> csv___ConvertOptions__initialize(cpp11::list options);
+extern "C" SEXP _arrow_csv___ConvertOptions__initialize(SEXP options_sexp){
+BEGIN_CPP11
+ arrow::r::Input<cpp11::list>::type options(options_sexp);
+ return cpp11::as_sexp(csv___ConvertOptions__initialize(options));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_csv___ConvertOptions__initialize(SEXP options_sexp){
+ Rf_error("Cannot call csv___ConvertOptions__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::csv::TableReader> csv___TableReader__Make(const std::shared_ptr<arrow::io::InputStream>& input, const std::shared_ptr<arrow::csv::ReadOptions>& read_options, const std::shared_ptr<arrow::csv::ParseOptions>& parse_options, const std::shared_ptr<arrow::csv::ConvertOptions>& convert_options);
+extern "C" SEXP _arrow_csv___TableReader__Make(SEXP input_sexp, SEXP read_options_sexp, SEXP parse_options_sexp, SEXP convert_options_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::InputStream>&>::type input(input_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::csv::ReadOptions>&>::type read_options(read_options_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::csv::ParseOptions>&>::type parse_options(parse_options_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::csv::ConvertOptions>&>::type convert_options(convert_options_sexp);
+ return cpp11::as_sexp(csv___TableReader__Make(input, read_options, parse_options, convert_options));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_csv___TableReader__Make(SEXP input_sexp, SEXP read_options_sexp, SEXP parse_options_sexp, SEXP convert_options_sexp){
+ Rf_error("Cannot call csv___TableReader__Make(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Table> csv___TableReader__Read(const std::shared_ptr<arrow::csv::TableReader>& table_reader);
+extern "C" SEXP _arrow_csv___TableReader__Read(SEXP table_reader_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::csv::TableReader>&>::type table_reader(table_reader_sexp);
+ return cpp11::as_sexp(csv___TableReader__Read(table_reader));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_csv___TableReader__Read(SEXP table_reader_sexp){
+ Rf_error("Cannot call csv___TableReader__Read(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string TimestampParser__kind(const std::shared_ptr<arrow::TimestampParser>& parser);
+extern "C" SEXP _arrow_TimestampParser__kind(SEXP parser_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::TimestampParser>&>::type parser(parser_sexp);
+ return cpp11::as_sexp(TimestampParser__kind(parser));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_TimestampParser__kind(SEXP parser_sexp){
+ Rf_error("Cannot call TimestampParser__kind(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string TimestampParser__format(const std::shared_ptr<arrow::TimestampParser>& parser);
+extern "C" SEXP _arrow_TimestampParser__format(SEXP parser_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::TimestampParser>&>::type parser(parser_sexp);
+ return cpp11::as_sexp(TimestampParser__format(parser));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_TimestampParser__format(SEXP parser_sexp){
+ Rf_error("Cannot call TimestampParser__format(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::TimestampParser> TimestampParser__MakeStrptime(std::string format);
+extern "C" SEXP _arrow_TimestampParser__MakeStrptime(SEXP format_sexp){
+BEGIN_CPP11
+ arrow::r::Input<std::string>::type format(format_sexp);
+ return cpp11::as_sexp(TimestampParser__MakeStrptime(format));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_TimestampParser__MakeStrptime(SEXP format_sexp){
+ Rf_error("Cannot call TimestampParser__MakeStrptime(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::TimestampParser> TimestampParser__MakeISO8601();
+extern "C" SEXP _arrow_TimestampParser__MakeISO8601(){
+BEGIN_CPP11
+ return cpp11::as_sexp(TimestampParser__MakeISO8601());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_TimestampParser__MakeISO8601(){
+ Rf_error("Cannot call TimestampParser__MakeISO8601(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void csv___WriteCSV__Table(const std::shared_ptr<arrow::Table>& table, const std::shared_ptr<arrow::csv::WriteOptions>& write_options, const std::shared_ptr<arrow::io::OutputStream>& stream);
+extern "C" SEXP _arrow_csv___WriteCSV__Table(SEXP table_sexp, SEXP write_options_sexp, SEXP stream_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::csv::WriteOptions>&>::type write_options(write_options_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::io::OutputStream>&>::type stream(stream_sexp);
+ csv___WriteCSV__Table(table, write_options, stream);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_csv___WriteCSV__Table(SEXP table_sexp, SEXP write_options_sexp, SEXP stream_sexp){
+ Rf_error("Cannot call csv___WriteCSV__Table(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// csv.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void csv___WriteCSV__RecordBatch(const std::shared_ptr<arrow::RecordBatch>& record_batch, const std::shared_ptr<arrow::csv::WriteOptions>& write_options, const std::shared_ptr<arrow::io::OutputStream>& stream);
+extern "C" SEXP _arrow_csv___WriteCSV__RecordBatch(SEXP record_batch_sexp, SEXP write_options_sexp, SEXP stream_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type record_batch(record_batch_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::csv::WriteOptions>&>::type write_options(write_options_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::io::OutputStream>&>::type stream(stream_sexp);
+ csv___WriteCSV__RecordBatch(record_batch, write_options, stream);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_csv___WriteCSV__RecordBatch(SEXP record_batch_sexp, SEXP write_options_sexp, SEXP stream_sexp){
+ Rf_error("Cannot call csv___WriteCSV__RecordBatch(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
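+// From here on, the Dataset bindings are guarded by ARROW_R_WITH_DATASET
+// rather than ARROW_R_WITH_ARROW, so the package still compiles when the
+// Arrow C++ dataset component is disabled; the stubs then raise the same
+// installation error at call time.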
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::ScannerBuilder> dataset___Dataset__NewScan(const std::shared_ptr<ds::Dataset>& ds);
+extern "C" SEXP _arrow_dataset___Dataset__NewScan(SEXP ds_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::Dataset>&>::type ds(ds_sexp);
+ return cpp11::as_sexp(dataset___Dataset__NewScan(ds));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___Dataset__NewScan(SEXP ds_sexp){
+ Rf_error("Cannot call dataset___Dataset__NewScan(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<arrow::Schema> dataset___Dataset__schema(const std::shared_ptr<ds::Dataset>& dataset);
+extern "C" SEXP _arrow_dataset___Dataset__schema(SEXP dataset_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::Dataset>&>::type dataset(dataset_sexp);
+ return cpp11::as_sexp(dataset___Dataset__schema(dataset));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___Dataset__schema(SEXP dataset_sexp){
+ Rf_error("Cannot call dataset___Dataset__schema(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::string dataset___Dataset__type_name(const std::shared_ptr<ds::Dataset>& dataset);
+extern "C" SEXP _arrow_dataset___Dataset__type_name(SEXP dataset_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::Dataset>&>::type dataset(dataset_sexp);
+ return cpp11::as_sexp(dataset___Dataset__type_name(dataset));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___Dataset__type_name(SEXP dataset_sexp){
+ Rf_error("Cannot call dataset___Dataset__type_name(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::Dataset> dataset___Dataset__ReplaceSchema(const std::shared_ptr<ds::Dataset>& dataset, const std::shared_ptr<arrow::Schema>& schm);
+extern "C" SEXP _arrow_dataset___Dataset__ReplaceSchema(SEXP dataset_sexp, SEXP schm_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::Dataset>&>::type dataset(dataset_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schm(schm_sexp);
+ return cpp11::as_sexp(dataset___Dataset__ReplaceSchema(dataset, schm));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___Dataset__ReplaceSchema(SEXP dataset_sexp, SEXP schm_sexp){
+ Rf_error("Cannot call dataset___Dataset__ReplaceSchema(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::Dataset> dataset___UnionDataset__create(const ds::DatasetVector& datasets, const std::shared_ptr<arrow::Schema>& schm);
+extern "C" SEXP _arrow_dataset___UnionDataset__create(SEXP datasets_sexp, SEXP schm_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const ds::DatasetVector&>::type datasets(datasets_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schm(schm_sexp);
+ return cpp11::as_sexp(dataset___UnionDataset__create(datasets, schm));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___UnionDataset__create(SEXP datasets_sexp, SEXP schm_sexp){
+ Rf_error("Cannot call dataset___UnionDataset__create(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::Dataset> dataset___InMemoryDataset__create(const std::shared_ptr<arrow::Table>& table);
+extern "C" SEXP _arrow_dataset___InMemoryDataset__create(SEXP table_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ return cpp11::as_sexp(dataset___InMemoryDataset__create(table));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___InMemoryDataset__create(SEXP table_sexp){
+ Rf_error("Cannot call dataset___InMemoryDataset__create(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+cpp11::list dataset___UnionDataset__children(const std::shared_ptr<ds::UnionDataset>& ds);
+extern "C" SEXP _arrow_dataset___UnionDataset__children(SEXP ds_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::UnionDataset>&>::type ds(ds_sexp);
+ return cpp11::as_sexp(dataset___UnionDataset__children(ds));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___UnionDataset__children(SEXP ds_sexp){
+ Rf_error("Cannot call dataset___UnionDataset__children(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::FileFormat> dataset___FileSystemDataset__format(const std::shared_ptr<ds::FileSystemDataset>& dataset);
+extern "C" SEXP _arrow_dataset___FileSystemDataset__format(SEXP dataset_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::FileSystemDataset>&>::type dataset(dataset_sexp);
+ return cpp11::as_sexp(dataset___FileSystemDataset__format(dataset));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___FileSystemDataset__format(SEXP dataset_sexp){
+ Rf_error("Cannot call dataset___FileSystemDataset__format(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<fs::FileSystem> dataset___FileSystemDataset__filesystem(const std::shared_ptr<ds::FileSystemDataset>& dataset);
+extern "C" SEXP _arrow_dataset___FileSystemDataset__filesystem(SEXP dataset_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::FileSystemDataset>&>::type dataset(dataset_sexp);
+ return cpp11::as_sexp(dataset___FileSystemDataset__filesystem(dataset));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___FileSystemDataset__filesystem(SEXP dataset_sexp){
+ Rf_error("Cannot call dataset___FileSystemDataset__filesystem(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::vector<std::string> dataset___FileSystemDataset__files(const std::shared_ptr<ds::FileSystemDataset>& dataset);
+extern "C" SEXP _arrow_dataset___FileSystemDataset__files(SEXP dataset_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::FileSystemDataset>&>::type dataset(dataset_sexp);
+ return cpp11::as_sexp(dataset___FileSystemDataset__files(dataset));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___FileSystemDataset__files(SEXP dataset_sexp){
+ Rf_error("Cannot call dataset___FileSystemDataset__files(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::Dataset> dataset___DatasetFactory__Finish1(const std::shared_ptr<ds::DatasetFactory>& factory, bool unify_schemas);
+extern "C" SEXP _arrow_dataset___DatasetFactory__Finish1(SEXP factory_sexp, SEXP unify_schemas_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::DatasetFactory>&>::type factory(factory_sexp);
+ arrow::r::Input<bool>::type unify_schemas(unify_schemas_sexp);
+ return cpp11::as_sexp(dataset___DatasetFactory__Finish1(factory, unify_schemas));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___DatasetFactory__Finish1(SEXP factory_sexp, SEXP unify_schemas_sexp){
+ Rf_error("Cannot call dataset___DatasetFactory__Finish1(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::Dataset> dataset___DatasetFactory__Finish2(const std::shared_ptr<ds::DatasetFactory>& factory, const std::shared_ptr<arrow::Schema>& schema);
+extern "C" SEXP _arrow_dataset___DatasetFactory__Finish2(SEXP factory_sexp, SEXP schema_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::DatasetFactory>&>::type factory(factory_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schema(schema_sexp);
+ return cpp11::as_sexp(dataset___DatasetFactory__Finish2(factory, schema));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___DatasetFactory__Finish2(SEXP factory_sexp, SEXP schema_sexp){
+ Rf_error("Cannot call dataset___DatasetFactory__Finish2(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<arrow::Schema> dataset___DatasetFactory__Inspect(const std::shared_ptr<ds::DatasetFactory>& factory, bool unify_schemas);
+extern "C" SEXP _arrow_dataset___DatasetFactory__Inspect(SEXP factory_sexp, SEXP unify_schemas_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::DatasetFactory>&>::type factory(factory_sexp);
+ arrow::r::Input<bool>::type unify_schemas(unify_schemas_sexp);
+ return cpp11::as_sexp(dataset___DatasetFactory__Inspect(factory, unify_schemas));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___DatasetFactory__Inspect(SEXP factory_sexp, SEXP unify_schemas_sexp){
+ Rf_error("Cannot call dataset___DatasetFactory__Inspect(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::DatasetFactory> dataset___UnionDatasetFactory__Make(const std::vector<std::shared_ptr<ds::DatasetFactory>>& children);
+extern "C" SEXP _arrow_dataset___UnionDatasetFactory__Make(SEXP children_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::vector<std::shared_ptr<ds::DatasetFactory>>&>::type children(children_sexp);
+ return cpp11::as_sexp(dataset___UnionDatasetFactory__Make(children));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___UnionDatasetFactory__Make(SEXP children_sexp){
+ Rf_error("Cannot call dataset___UnionDatasetFactory__Make(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::FileSystemDatasetFactory> dataset___FileSystemDatasetFactory__Make0(const std::shared_ptr<fs::FileSystem>& fs, const std::vector<std::string>& paths, const std::shared_ptr<ds::FileFormat>& format);
+extern "C" SEXP _arrow_dataset___FileSystemDatasetFactory__Make0(SEXP fs_sexp, SEXP paths_sexp, SEXP format_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type fs(fs_sexp);
+ arrow::r::Input<const std::vector<std::string>&>::type paths(paths_sexp);
+ arrow::r::Input<const std::shared_ptr<ds::FileFormat>&>::type format(format_sexp);
+ return cpp11::as_sexp(dataset___FileSystemDatasetFactory__Make0(fs, paths, format));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___FileSystemDatasetFactory__Make0(SEXP fs_sexp, SEXP paths_sexp, SEXP format_sexp){
+ Rf_error("Cannot call dataset___FileSystemDatasetFactory__Make0(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::FileSystemDatasetFactory> dataset___FileSystemDatasetFactory__Make2(const std::shared_ptr<fs::FileSystem>& fs, const std::shared_ptr<fs::FileSelector>& selector, const std::shared_ptr<ds::FileFormat>& format, const std::shared_ptr<ds::Partitioning>& partitioning);
+extern "C" SEXP _arrow_dataset___FileSystemDatasetFactory__Make2(SEXP fs_sexp, SEXP selector_sexp, SEXP format_sexp, SEXP partitioning_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type fs(fs_sexp);
+ arrow::r::Input<const std::shared_ptr<fs::FileSelector>&>::type selector(selector_sexp);
+ arrow::r::Input<const std::shared_ptr<ds::FileFormat>&>::type format(format_sexp);
+ arrow::r::Input<const std::shared_ptr<ds::Partitioning>&>::type partitioning(partitioning_sexp);
+ return cpp11::as_sexp(dataset___FileSystemDatasetFactory__Make2(fs, selector, format, partitioning));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___FileSystemDatasetFactory__Make2(SEXP fs_sexp, SEXP selector_sexp, SEXP format_sexp, SEXP partitioning_sexp){
+ Rf_error("Cannot call dataset___FileSystemDatasetFactory__Make2(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::FileSystemDatasetFactory> dataset___FileSystemDatasetFactory__Make1(const std::shared_ptr<fs::FileSystem>& fs, const std::shared_ptr<fs::FileSelector>& selector, const std::shared_ptr<ds::FileFormat>& format);
+extern "C" SEXP _arrow_dataset___FileSystemDatasetFactory__Make1(SEXP fs_sexp, SEXP selector_sexp, SEXP format_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type fs(fs_sexp);
+ arrow::r::Input<const std::shared_ptr<fs::FileSelector>&>::type selector(selector_sexp);
+ arrow::r::Input<const std::shared_ptr<ds::FileFormat>&>::type format(format_sexp);
+ return cpp11::as_sexp(dataset___FileSystemDatasetFactory__Make1(fs, selector, format));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___FileSystemDatasetFactory__Make1(SEXP fs_sexp, SEXP selector_sexp, SEXP format_sexp){
+ Rf_error("Cannot call dataset___FileSystemDatasetFactory__Make1(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::FileSystemDatasetFactory> dataset___FileSystemDatasetFactory__Make3(const std::shared_ptr<fs::FileSystem>& fs, const std::shared_ptr<fs::FileSelector>& selector, const std::shared_ptr<ds::FileFormat>& format, const std::shared_ptr<ds::PartitioningFactory>& factory);
+extern "C" SEXP _arrow_dataset___FileSystemDatasetFactory__Make3(SEXP fs_sexp, SEXP selector_sexp, SEXP format_sexp, SEXP factory_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type fs(fs_sexp);
+ arrow::r::Input<const std::shared_ptr<fs::FileSelector>&>::type selector(selector_sexp);
+ arrow::r::Input<const std::shared_ptr<ds::FileFormat>&>::type format(format_sexp);
+ arrow::r::Input<const std::shared_ptr<ds::PartitioningFactory>&>::type factory(factory_sexp);
+ return cpp11::as_sexp(dataset___FileSystemDatasetFactory__Make3(fs, selector, format, factory));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___FileSystemDatasetFactory__Make3(SEXP fs_sexp, SEXP selector_sexp, SEXP format_sexp, SEXP factory_sexp){
+ Rf_error("Cannot call dataset___FileSystemDatasetFactory__Make3(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::string dataset___FileFormat__type_name(const std::shared_ptr<ds::FileFormat>& format);
+extern "C" SEXP _arrow_dataset___FileFormat__type_name(SEXP format_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::FileFormat>&>::type format(format_sexp);
+ return cpp11::as_sexp(dataset___FileFormat__type_name(format));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___FileFormat__type_name(SEXP format_sexp){
+ Rf_error("Cannot call dataset___FileFormat__type_name(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::FileWriteOptions> dataset___FileFormat__DefaultWriteOptions(const std::shared_ptr<ds::FileFormat>& fmt);
+extern "C" SEXP _arrow_dataset___FileFormat__DefaultWriteOptions(SEXP fmt_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::FileFormat>&>::type fmt(fmt_sexp);
+ return cpp11::as_sexp(dataset___FileFormat__DefaultWriteOptions(fmt));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___FileFormat__DefaultWriteOptions(SEXP fmt_sexp){
+ Rf_error("Cannot call dataset___FileFormat__DefaultWriteOptions(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::ParquetFileFormat> dataset___ParquetFileFormat__Make(const std::shared_ptr<ds::ParquetFragmentScanOptions>& options, cpp11::strings dict_columns);
+extern "C" SEXP _arrow_dataset___ParquetFileFormat__Make(SEXP options_sexp, SEXP dict_columns_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::ParquetFragmentScanOptions>&>::type options(options_sexp);
+ arrow::r::Input<cpp11::strings>::type dict_columns(dict_columns_sexp);
+ return cpp11::as_sexp(dataset___ParquetFileFormat__Make(options, dict_columns));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___ParquetFileFormat__Make(SEXP options_sexp, SEXP dict_columns_sexp){
+ Rf_error("Cannot call dataset___ParquetFileFormat__Make(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::string dataset___FileWriteOptions__type_name(const std::shared_ptr<ds::FileWriteOptions>& options);
+extern "C" SEXP _arrow_dataset___FileWriteOptions__type_name(SEXP options_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::FileWriteOptions>&>::type options(options_sexp);
+ return cpp11::as_sexp(dataset___FileWriteOptions__type_name(options));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___FileWriteOptions__type_name(SEXP options_sexp){
+ Rf_error("Cannot call dataset___FileWriteOptions__type_name(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+void dataset___ParquetFileWriteOptions__update(const std::shared_ptr<ds::ParquetFileWriteOptions>& options, const std::shared_ptr<parquet::WriterProperties>& writer_props, const std::shared_ptr<parquet::ArrowWriterProperties>& arrow_writer_props);
+extern "C" SEXP _arrow_dataset___ParquetFileWriteOptions__update(SEXP options_sexp, SEXP writer_props_sexp, SEXP arrow_writer_props_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::ParquetFileWriteOptions>&>::type options(options_sexp);
+ arrow::r::Input<const std::shared_ptr<parquet::WriterProperties>&>::type writer_props(writer_props_sexp);
+ arrow::r::Input<const std::shared_ptr<parquet::ArrowWriterProperties>&>::type arrow_writer_props(arrow_writer_props_sexp);
+ dataset___ParquetFileWriteOptions__update(options, writer_props, arrow_writer_props);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___ParquetFileWriteOptions__update(SEXP options_sexp, SEXP writer_props_sexp, SEXP arrow_writer_props_sexp){
+ Rf_error("Cannot call dataset___ParquetFileWriteOptions__update(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+void dataset___IpcFileWriteOptions__update2(const std::shared_ptr<ds::IpcFileWriteOptions>& ipc_options, bool use_legacy_format, const std::shared_ptr<arrow::util::Codec>& codec, arrow::ipc::MetadataVersion metadata_version);
+extern "C" SEXP _arrow_dataset___IpcFileWriteOptions__update2(SEXP ipc_options_sexp, SEXP use_legacy_format_sexp, SEXP codec_sexp, SEXP metadata_version_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::IpcFileWriteOptions>&>::type ipc_options(ipc_options_sexp);
+ arrow::r::Input<bool>::type use_legacy_format(use_legacy_format_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::util::Codec>&>::type codec(codec_sexp);
+ arrow::r::Input<arrow::ipc::MetadataVersion>::type metadata_version(metadata_version_sexp);
+ dataset___IpcFileWriteOptions__update2(ipc_options, use_legacy_format, codec, metadata_version);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___IpcFileWriteOptions__update2(SEXP ipc_options_sexp, SEXP use_legacy_format_sexp, SEXP codec_sexp, SEXP metadata_version_sexp){
+ Rf_error("Cannot call dataset___IpcFileWriteOptions__update2(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+void dataset___IpcFileWriteOptions__update1(const std::shared_ptr<ds::IpcFileWriteOptions>& ipc_options, bool use_legacy_format, arrow::ipc::MetadataVersion metadata_version);
+extern "C" SEXP _arrow_dataset___IpcFileWriteOptions__update1(SEXP ipc_options_sexp, SEXP use_legacy_format_sexp, SEXP metadata_version_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::IpcFileWriteOptions>&>::type ipc_options(ipc_options_sexp);
+ arrow::r::Input<bool>::type use_legacy_format(use_legacy_format_sexp);
+ arrow::r::Input<arrow::ipc::MetadataVersion>::type metadata_version(metadata_version_sexp);
+ dataset___IpcFileWriteOptions__update1(ipc_options, use_legacy_format, metadata_version);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___IpcFileWriteOptions__update1(SEXP ipc_options_sexp, SEXP use_legacy_format_sexp, SEXP metadata_version_sexp){
+ Rf_error("Cannot call dataset___IpcFileWriteOptions__update1(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+void dataset___CsvFileWriteOptions__update(const std::shared_ptr<ds::CsvFileWriteOptions>& csv_options, const std::shared_ptr<arrow::csv::WriteOptions>& write_options);
+extern "C" SEXP _arrow_dataset___CsvFileWriteOptions__update(SEXP csv_options_sexp, SEXP write_options_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::CsvFileWriteOptions>&>::type csv_options(csv_options_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::csv::WriteOptions>&>::type write_options(write_options_sexp);
+ dataset___CsvFileWriteOptions__update(csv_options, write_options);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___CsvFileWriteOptions__update(SEXP csv_options_sexp, SEXP write_options_sexp){
+ Rf_error("Cannot call dataset___CsvFileWriteOptions__update(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::IpcFileFormat> dataset___IpcFileFormat__Make();
+extern "C" SEXP _arrow_dataset___IpcFileFormat__Make(){
+BEGIN_CPP11
+ return cpp11::as_sexp(dataset___IpcFileFormat__Make());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___IpcFileFormat__Make(){
+ Rf_error("Cannot call dataset___IpcFileFormat__Make(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::CsvFileFormat> dataset___CsvFileFormat__Make(const std::shared_ptr<arrow::csv::ParseOptions>& parse_options, const std::shared_ptr<arrow::csv::ConvertOptions>& convert_options, const std::shared_ptr<arrow::csv::ReadOptions>& read_options);
+extern "C" SEXP _arrow_dataset___CsvFileFormat__Make(SEXP parse_options_sexp, SEXP convert_options_sexp, SEXP read_options_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::csv::ParseOptions>&>::type parse_options(parse_options_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::csv::ConvertOptions>&>::type convert_options(convert_options_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::csv::ReadOptions>&>::type read_options(read_options_sexp);
+ return cpp11::as_sexp(dataset___CsvFileFormat__Make(parse_options, convert_options, read_options));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___CsvFileFormat__Make(SEXP parse_options_sexp, SEXP convert_options_sexp, SEXP read_options_sexp){
+ Rf_error("Cannot call dataset___CsvFileFormat__Make(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::string dataset___FragmentScanOptions__type_name(const std::shared_ptr<ds::FragmentScanOptions>& fragment_scan_options);
+extern "C" SEXP _arrow_dataset___FragmentScanOptions__type_name(SEXP fragment_scan_options_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::FragmentScanOptions>&>::type fragment_scan_options(fragment_scan_options_sexp);
+ return cpp11::as_sexp(dataset___FragmentScanOptions__type_name(fragment_scan_options));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___FragmentScanOptions__type_name(SEXP fragment_scan_options_sexp){
+ Rf_error("Cannot call dataset___FragmentScanOptions__type_name(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::CsvFragmentScanOptions> dataset___CsvFragmentScanOptions__Make(const std::shared_ptr<arrow::csv::ConvertOptions>& convert_options, const std::shared_ptr<arrow::csv::ReadOptions>& read_options);
+extern "C" SEXP _arrow_dataset___CsvFragmentScanOptions__Make(SEXP convert_options_sexp, SEXP read_options_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::csv::ConvertOptions>&>::type convert_options(convert_options_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::csv::ReadOptions>&>::type read_options(read_options_sexp);
+ return cpp11::as_sexp(dataset___CsvFragmentScanOptions__Make(convert_options, read_options));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___CsvFragmentScanOptions__Make(SEXP convert_options_sexp, SEXP read_options_sexp){
+ Rf_error("Cannot call dataset___CsvFragmentScanOptions__Make(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::ParquetFragmentScanOptions> dataset___ParquetFragmentScanOptions__Make(bool use_buffered_stream, int64_t buffer_size, bool pre_buffer);
+extern "C" SEXP _arrow_dataset___ParquetFragmentScanOptions__Make(SEXP use_buffered_stream_sexp, SEXP buffer_size_sexp, SEXP pre_buffer_sexp){
+BEGIN_CPP11
+ arrow::r::Input<bool>::type use_buffered_stream(use_buffered_stream_sexp);
+ arrow::r::Input<int64_t>::type buffer_size(buffer_size_sexp);
+ arrow::r::Input<bool>::type pre_buffer(pre_buffer_sexp);
+ return cpp11::as_sexp(dataset___ParquetFragmentScanOptions__Make(use_buffered_stream, buffer_size, pre_buffer));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___ParquetFragmentScanOptions__Make(SEXP use_buffered_stream_sexp, SEXP buffer_size_sexp, SEXP pre_buffer_sexp){
+ Rf_error("Cannot call dataset___ParquetFragmentScanOptions__Make(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::DirectoryPartitioning> dataset___DirectoryPartitioning(const std::shared_ptr<arrow::Schema>& schm, const std::string& segment_encoding);
+extern "C" SEXP _arrow_dataset___DirectoryPartitioning(SEXP schm_sexp, SEXP segment_encoding_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schm(schm_sexp);
+ arrow::r::Input<const std::string&>::type segment_encoding(segment_encoding_sexp);
+ return cpp11::as_sexp(dataset___DirectoryPartitioning(schm, segment_encoding));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___DirectoryPartitioning(SEXP schm_sexp, SEXP segment_encoding_sexp){
+ Rf_error("Cannot call dataset___DirectoryPartitioning(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::PartitioningFactory> dataset___DirectoryPartitioning__MakeFactory(const std::vector<std::string>& field_names, const std::string& segment_encoding);
+extern "C" SEXP _arrow_dataset___DirectoryPartitioning__MakeFactory(SEXP field_names_sexp, SEXP segment_encoding_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::vector<std::string>&>::type field_names(field_names_sexp);
+ arrow::r::Input<const std::string&>::type segment_encoding(segment_encoding_sexp);
+ return cpp11::as_sexp(dataset___DirectoryPartitioning__MakeFactory(field_names, segment_encoding));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___DirectoryPartitioning__MakeFactory(SEXP field_names_sexp, SEXP segment_encoding_sexp){
+ Rf_error("Cannot call dataset___DirectoryPartitioning__MakeFactory(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::HivePartitioning> dataset___HivePartitioning(const std::shared_ptr<arrow::Schema>& schm, const std::string& null_fallback, const std::string& segment_encoding);
+extern "C" SEXP _arrow_dataset___HivePartitioning(SEXP schm_sexp, SEXP null_fallback_sexp, SEXP segment_encoding_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schm(schm_sexp);
+ arrow::r::Input<const std::string&>::type null_fallback(null_fallback_sexp);
+ arrow::r::Input<const std::string&>::type segment_encoding(segment_encoding_sexp);
+ return cpp11::as_sexp(dataset___HivePartitioning(schm, null_fallback, segment_encoding));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___HivePartitioning(SEXP schm_sexp, SEXP null_fallback_sexp, SEXP segment_encoding_sexp){
+ Rf_error("Cannot call dataset___HivePartitioning(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::PartitioningFactory> dataset___HivePartitioning__MakeFactory(const std::string& null_fallback, const std::string& segment_encoding);
+extern "C" SEXP _arrow_dataset___HivePartitioning__MakeFactory(SEXP null_fallback_sexp, SEXP segment_encoding_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::string&>::type null_fallback(null_fallback_sexp);
+ arrow::r::Input<const std::string&>::type segment_encoding(segment_encoding_sexp);
+ return cpp11::as_sexp(dataset___HivePartitioning__MakeFactory(null_fallback, segment_encoding));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___HivePartitioning__MakeFactory(SEXP null_fallback_sexp, SEXP segment_encoding_sexp){
+ Rf_error("Cannot call dataset___HivePartitioning__MakeFactory(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+void dataset___ScannerBuilder__ProjectNames(const std::shared_ptr<ds::ScannerBuilder>& sb, const std::vector<std::string>& cols);
+extern "C" SEXP _arrow_dataset___ScannerBuilder__ProjectNames(SEXP sb_sexp, SEXP cols_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::ScannerBuilder>&>::type sb(sb_sexp);
+ arrow::r::Input<const std::vector<std::string>&>::type cols(cols_sexp);
+ dataset___ScannerBuilder__ProjectNames(sb, cols);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___ScannerBuilder__ProjectNames(SEXP sb_sexp, SEXP cols_sexp){
+ Rf_error("Cannot call dataset___ScannerBuilder__ProjectNames(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+void dataset___ScannerBuilder__ProjectExprs(const std::shared_ptr<ds::ScannerBuilder>& sb, const std::vector<std::shared_ptr<compute::Expression>>& exprs, const std::vector<std::string>& names);
+extern "C" SEXP _arrow_dataset___ScannerBuilder__ProjectExprs(SEXP sb_sexp, SEXP exprs_sexp, SEXP names_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::ScannerBuilder>&>::type sb(sb_sexp);
+ arrow::r::Input<const std::vector<std::shared_ptr<compute::Expression>>&>::type exprs(exprs_sexp);
+ arrow::r::Input<const std::vector<std::string>&>::type names(names_sexp);
+ dataset___ScannerBuilder__ProjectExprs(sb, exprs, names);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___ScannerBuilder__ProjectExprs(SEXP sb_sexp, SEXP exprs_sexp, SEXP names_sexp){
+ Rf_error("Cannot call dataset___ScannerBuilder__ProjectExprs(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+void dataset___ScannerBuilder__Filter(const std::shared_ptr<ds::ScannerBuilder>& sb, const std::shared_ptr<compute::Expression>& expr);
+extern "C" SEXP _arrow_dataset___ScannerBuilder__Filter(SEXP sb_sexp, SEXP expr_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::ScannerBuilder>&>::type sb(sb_sexp);
+ arrow::r::Input<const std::shared_ptr<compute::Expression>&>::type expr(expr_sexp);
+ dataset___ScannerBuilder__Filter(sb, expr);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___ScannerBuilder__Filter(SEXP sb_sexp, SEXP expr_sexp){
+ Rf_error("Cannot call dataset___ScannerBuilder__Filter(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+void dataset___ScannerBuilder__UseThreads(const std::shared_ptr<ds::ScannerBuilder>& sb, bool threads);
+extern "C" SEXP _arrow_dataset___ScannerBuilder__UseThreads(SEXP sb_sexp, SEXP threads_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::ScannerBuilder>&>::type sb(sb_sexp);
+ arrow::r::Input<bool>::type threads(threads_sexp);
+ dataset___ScannerBuilder__UseThreads(sb, threads);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___ScannerBuilder__UseThreads(SEXP sb_sexp, SEXP threads_sexp){
+ Rf_error("Cannot call dataset___ScannerBuilder__UseThreads(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+void dataset___ScannerBuilder__UseAsync(const std::shared_ptr<ds::ScannerBuilder>& sb, bool use_async);
+extern "C" SEXP _arrow_dataset___ScannerBuilder__UseAsync(SEXP sb_sexp, SEXP use_async_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::ScannerBuilder>&>::type sb(sb_sexp);
+ arrow::r::Input<bool>::type use_async(use_async_sexp);
+ dataset___ScannerBuilder__UseAsync(sb, use_async);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___ScannerBuilder__UseAsync(SEXP sb_sexp, SEXP use_async_sexp){
+ Rf_error("Cannot call dataset___ScannerBuilder__UseAsync(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+void dataset___ScannerBuilder__BatchSize(const std::shared_ptr<ds::ScannerBuilder>& sb, int64_t batch_size);
+extern "C" SEXP _arrow_dataset___ScannerBuilder__BatchSize(SEXP sb_sexp, SEXP batch_size_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::ScannerBuilder>&>::type sb(sb_sexp);
+ arrow::r::Input<int64_t>::type batch_size(batch_size_sexp);
+ dataset___ScannerBuilder__BatchSize(sb, batch_size);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___ScannerBuilder__BatchSize(SEXP sb_sexp, SEXP batch_size_sexp){
+ Rf_error("Cannot call dataset___ScannerBuilder__BatchSize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+void dataset___ScannerBuilder__FragmentScanOptions(const std::shared_ptr<ds::ScannerBuilder>& sb, const std::shared_ptr<ds::FragmentScanOptions>& options);
+extern "C" SEXP _arrow_dataset___ScannerBuilder__FragmentScanOptions(SEXP sb_sexp, SEXP options_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::ScannerBuilder>&>::type sb(sb_sexp);
+ arrow::r::Input<const std::shared_ptr<ds::FragmentScanOptions>&>::type options(options_sexp);
+ dataset___ScannerBuilder__FragmentScanOptions(sb, options);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___ScannerBuilder__FragmentScanOptions(SEXP sb_sexp, SEXP options_sexp){
+ Rf_error("Cannot call dataset___ScannerBuilder__FragmentScanOptions(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<arrow::Schema> dataset___ScannerBuilder__schema(const std::shared_ptr<ds::ScannerBuilder>& sb);
+extern "C" SEXP _arrow_dataset___ScannerBuilder__schema(SEXP sb_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::ScannerBuilder>&>::type sb(sb_sexp);
+ return cpp11::as_sexp(dataset___ScannerBuilder__schema(sb));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___ScannerBuilder__schema(SEXP sb_sexp){
+ Rf_error("Cannot call dataset___ScannerBuilder__schema(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::Scanner> dataset___ScannerBuilder__Finish(const std::shared_ptr<ds::ScannerBuilder>& sb);
+extern "C" SEXP _arrow_dataset___ScannerBuilder__Finish(SEXP sb_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::ScannerBuilder>&>::type sb(sb_sexp);
+ return cpp11::as_sexp(dataset___ScannerBuilder__Finish(sb));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___ScannerBuilder__Finish(SEXP sb_sexp){
+ Rf_error("Cannot call dataset___ScannerBuilder__Finish(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<ds::ScannerBuilder> dataset___ScannerBuilder__FromRecordBatchReader(const std::shared_ptr<arrow::RecordBatchReader>& reader);
+extern "C" SEXP _arrow_dataset___ScannerBuilder__FromRecordBatchReader(SEXP reader_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatchReader>&>::type reader(reader_sexp);
+ return cpp11::as_sexp(dataset___ScannerBuilder__FromRecordBatchReader(reader));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___ScannerBuilder__FromRecordBatchReader(SEXP reader_sexp){
+ Rf_error("Cannot call dataset___ScannerBuilder__FromRecordBatchReader(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<arrow::Table> dataset___Scanner__ToTable(const std::shared_ptr<ds::Scanner>& scanner);
+extern "C" SEXP _arrow_dataset___Scanner__ToTable(SEXP scanner_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::Scanner>&>::type scanner(scanner_sexp);
+ return cpp11::as_sexp(dataset___Scanner__ToTable(scanner));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___Scanner__ToTable(SEXP scanner_sexp){
+ Rf_error("Cannot call dataset___Scanner__ToTable(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+cpp11::list dataset___Scanner__ScanBatches(const std::shared_ptr<ds::Scanner>& scanner);
+extern "C" SEXP _arrow_dataset___Scanner__ScanBatches(SEXP scanner_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::Scanner>&>::type scanner(scanner_sexp);
+ return cpp11::as_sexp(dataset___Scanner__ScanBatches(scanner));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___Scanner__ScanBatches(SEXP scanner_sexp){
+ Rf_error("Cannot call dataset___Scanner__ScanBatches(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<arrow::RecordBatchReader> dataset___Scanner__ToRecordBatchReader(const std::shared_ptr<ds::Scanner>& scanner);
+extern "C" SEXP _arrow_dataset___Scanner__ToRecordBatchReader(SEXP scanner_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::Scanner>&>::type scanner(scanner_sexp);
+ return cpp11::as_sexp(dataset___Scanner__ToRecordBatchReader(scanner));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___Scanner__ToRecordBatchReader(SEXP scanner_sexp){
+ Rf_error("Cannot call dataset___Scanner__ToRecordBatchReader(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<arrow::Table> dataset___Scanner__head(const std::shared_ptr<ds::Scanner>& scanner, int n);
+extern "C" SEXP _arrow_dataset___Scanner__head(SEXP scanner_sexp, SEXP n_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::Scanner>&>::type scanner(scanner_sexp);
+ arrow::r::Input<int>::type n(n_sexp);
+ return cpp11::as_sexp(dataset___Scanner__head(scanner, n));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___Scanner__head(SEXP scanner_sexp, SEXP n_sexp){
+ Rf_error("Cannot call dataset___Scanner__head(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<arrow::Schema> dataset___Scanner__schema(const std::shared_ptr<ds::Scanner>& sc);
+extern "C" SEXP _arrow_dataset___Scanner__schema(SEXP sc_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::Scanner>&>::type sc(sc_sexp);
+ return cpp11::as_sexp(dataset___Scanner__schema(sc));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___Scanner__schema(SEXP sc_sexp){
+ Rf_error("Cannot call dataset___Scanner__schema(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+cpp11::list dataset___ScanTask__get_batches(const std::shared_ptr<ds::ScanTask>& scan_task);
+extern "C" SEXP _arrow_dataset___ScanTask__get_batches(SEXP scan_task_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::ScanTask>&>::type scan_task(scan_task_sexp);
+ return cpp11::as_sexp(dataset___ScanTask__get_batches(scan_task));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___ScanTask__get_batches(SEXP scan_task_sexp){
+ Rf_error("Cannot call dataset___ScanTask__get_batches(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+void dataset___Dataset__Write(const std::shared_ptr<ds::FileWriteOptions>& file_write_options, const std::shared_ptr<fs::FileSystem>& filesystem, std::string base_dir, const std::shared_ptr<ds::Partitioning>& partitioning, std::string basename_template, const std::shared_ptr<ds::Scanner>& scanner, arrow::dataset::ExistingDataBehavior existing_data_behavior);
+extern "C" SEXP _arrow_dataset___Dataset__Write(SEXP file_write_options_sexp, SEXP filesystem_sexp, SEXP base_dir_sexp, SEXP partitioning_sexp, SEXP basename_template_sexp, SEXP scanner_sexp, SEXP existing_data_behavior_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::FileWriteOptions>&>::type file_write_options(file_write_options_sexp);
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type filesystem(filesystem_sexp);
+ arrow::r::Input<std::string>::type base_dir(base_dir_sexp);
+ arrow::r::Input<const std::shared_ptr<ds::Partitioning>&>::type partitioning(partitioning_sexp);
+ arrow::r::Input<std::string>::type basename_template(basename_template_sexp);
+ arrow::r::Input<const std::shared_ptr<ds::Scanner>&>::type scanner(scanner_sexp);
+ arrow::r::Input<arrow::dataset::ExistingDataBehavior>::type existing_data_behavior(existing_data_behavior_sexp);
+ dataset___Dataset__Write(file_write_options, filesystem, base_dir, partitioning, basename_template, scanner, existing_data_behavior);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___Dataset__Write(SEXP file_write_options_sexp, SEXP filesystem_sexp, SEXP base_dir_sexp, SEXP partitioning_sexp, SEXP basename_template_sexp, SEXP scanner_sexp, SEXP existing_data_behavior_sexp){
+ Rf_error("Cannot call dataset___Dataset__Write(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+std::shared_ptr<arrow::Table> dataset___Scanner__TakeRows(const std::shared_ptr<ds::Scanner>& scanner, const std::shared_ptr<arrow::Array>& indices);
+extern "C" SEXP _arrow_dataset___Scanner__TakeRows(SEXP scanner_sexp, SEXP indices_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::Scanner>&>::type scanner(scanner_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type indices(indices_sexp);
+ return cpp11::as_sexp(dataset___Scanner__TakeRows(scanner, indices));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___Scanner__TakeRows(SEXP scanner_sexp, SEXP indices_sexp){
+ Rf_error("Cannot call dataset___Scanner__TakeRows(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// dataset.cpp
+#if defined(ARROW_R_WITH_DATASET)
+int64_t dataset___Scanner__CountRows(const std::shared_ptr<ds::Scanner>& scanner);
+extern "C" SEXP _arrow_dataset___Scanner__CountRows(SEXP scanner_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<ds::Scanner>&>::type scanner(scanner_sexp);
+ return cpp11::as_sexp(dataset___Scanner__CountRows(scanner));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_dataset___Scanner__CountRows(SEXP scanner_sexp){
+ Rf_error("Cannot call dataset___Scanner__CountRows(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
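+// The datatype bindings take no arguments: each initializer returns the
+// std::shared_ptr<arrow::DataType> for one fixed type (Int8 through
+// Float64, Boolean, Utf8, ...), mirroring the type constructor functions
+// exposed on the R side.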
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Int8__initialize();
+extern "C" SEXP _arrow_Int8__initialize(){
+BEGIN_CPP11
+ return cpp11::as_sexp(Int8__initialize());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Int8__initialize(){
+ Rf_error("Cannot call Int8__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Int16__initialize();
+extern "C" SEXP _arrow_Int16__initialize(){
+BEGIN_CPP11
+ return cpp11::as_sexp(Int16__initialize());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Int16__initialize(){
+ Rf_error("Cannot call Int16__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Int32__initialize();
+extern "C" SEXP _arrow_Int32__initialize(){
+BEGIN_CPP11
+ return cpp11::as_sexp(Int32__initialize());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Int32__initialize(){
+ Rf_error("Cannot call Int32__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Int64__initialize();
+extern "C" SEXP _arrow_Int64__initialize(){
+BEGIN_CPP11
+ return cpp11::as_sexp(Int64__initialize());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Int64__initialize(){
+ Rf_error("Cannot call Int64__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> UInt8__initialize();
+extern "C" SEXP _arrow_UInt8__initialize(){
+BEGIN_CPP11
+ return cpp11::as_sexp(UInt8__initialize());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_UInt8__initialize(){
+ Rf_error("Cannot call UInt8__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> UInt16__initialize();
+extern "C" SEXP _arrow_UInt16__initialize(){
+BEGIN_CPP11
+ return cpp11::as_sexp(UInt16__initialize());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_UInt16__initialize(){
+ Rf_error("Cannot call UInt16__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> UInt32__initialize();
+extern "C" SEXP _arrow_UInt32__initialize(){
+BEGIN_CPP11
+ return cpp11::as_sexp(UInt32__initialize());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_UInt32__initialize(){
+ Rf_error("Cannot call UInt32__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> UInt64__initialize();
+extern "C" SEXP _arrow_UInt64__initialize(){
+BEGIN_CPP11
+ return cpp11::as_sexp(UInt64__initialize());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_UInt64__initialize(){
+ Rf_error("Cannot call UInt64__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Float16__initialize();
+extern "C" SEXP _arrow_Float16__initialize(){
+BEGIN_CPP11
+ return cpp11::as_sexp(Float16__initialize());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Float16__initialize(){
+ Rf_error("Cannot call Float16__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Float32__initialize();
+extern "C" SEXP _arrow_Float32__initialize(){
+BEGIN_CPP11
+ return cpp11::as_sexp(Float32__initialize());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Float32__initialize(){
+ Rf_error("Cannot call Float32__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Float64__initialize();
+extern "C" SEXP _arrow_Float64__initialize(){
+BEGIN_CPP11
+ return cpp11::as_sexp(Float64__initialize());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Float64__initialize(){
+ Rf_error("Cannot call Float64__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Boolean__initialize();
+extern "C" SEXP _arrow_Boolean__initialize(){
+BEGIN_CPP11
+ return cpp11::as_sexp(Boolean__initialize());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Boolean__initialize(){
+ Rf_error("Cannot call Boolean__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Utf8__initialize();
+extern "C" SEXP _arrow_Utf8__initialize(){
+BEGIN_CPP11
+ return cpp11::as_sexp(Utf8__initialize());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Utf8__initialize(){
+ Rf_error("Cannot call Utf8__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> LargeUtf8__initialize();
+extern "C" SEXP _arrow_LargeUtf8__initialize(){
+BEGIN_CPP11
+ return cpp11::as_sexp(LargeUtf8__initialize());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_LargeUtf8__initialize(){
+ Rf_error("Cannot call LargeUtf8__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Binary__initialize();
+extern "C" SEXP _arrow_Binary__initialize(){
+BEGIN_CPP11
+ return cpp11::as_sexp(Binary__initialize());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Binary__initialize(){
+ Rf_error("Cannot call Binary__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> LargeBinary__initialize();
+extern "C" SEXP _arrow_LargeBinary__initialize(){
+BEGIN_CPP11
+ return cpp11::as_sexp(LargeBinary__initialize());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_LargeBinary__initialize(){
+ Rf_error("Cannot call LargeBinary__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Date32__initialize();
+extern "C" SEXP _arrow_Date32__initialize(){
+BEGIN_CPP11
+ return cpp11::as_sexp(Date32__initialize());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Date32__initialize(){
+ Rf_error("Cannot call Date32__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Date64__initialize();
+extern "C" SEXP _arrow_Date64__initialize(){
+BEGIN_CPP11
+ return cpp11::as_sexp(Date64__initialize());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Date64__initialize(){
+ Rf_error("Cannot call Date64__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Null__initialize();
+extern "C" SEXP _arrow_Null__initialize(){
+BEGIN_CPP11
+ return cpp11::as_sexp(Null__initialize());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Null__initialize(){
+ Rf_error("Cannot call Null__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Decimal128Type__initialize(int32_t precision, int32_t scale);
+extern "C" SEXP _arrow_Decimal128Type__initialize(SEXP precision_sexp, SEXP scale_sexp){
+BEGIN_CPP11
+ arrow::r::Input<int32_t>::type precision(precision_sexp);
+ arrow::r::Input<int32_t>::type scale(scale_sexp);
+ return cpp11::as_sexp(Decimal128Type__initialize(precision, scale));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Decimal128Type__initialize(SEXP precision_sexp, SEXP scale_sexp){
+ Rf_error("Cannot call Decimal128Type__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
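+// NOTE: in Decimal128Type__initialize above, precision is the total number
+// of significant digits and scale is the count of digits after the decimal
+// point; a value is stored as a 128-bit unscaled integer equal to
+// value * 10^scale. Worked example: decimal(precision = 5, scale = 2)
+// covers -999.99 .. 999.99, and 123.45 is stored as 12345.
+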
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> FixedSizeBinary__initialize(R_xlen_t byte_width);
+extern "C" SEXP _arrow_FixedSizeBinary__initialize(SEXP byte_width_sexp){
+BEGIN_CPP11
+ arrow::r::Input<R_xlen_t>::type byte_width(byte_width_sexp);
+ return cpp11::as_sexp(FixedSizeBinary__initialize(byte_width));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_FixedSizeBinary__initialize(SEXP byte_width_sexp){
+ Rf_error("Cannot call FixedSizeBinary__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Timestamp__initialize(arrow::TimeUnit::type unit, const std::string& timezone);
+extern "C" SEXP _arrow_Timestamp__initialize(SEXP unit_sexp, SEXP timezone_sexp){
+BEGIN_CPP11
+ arrow::r::Input<arrow::TimeUnit::type>::type unit(unit_sexp);
+ arrow::r::Input<const std::string&>::type timezone(timezone_sexp);
+ return cpp11::as_sexp(Timestamp__initialize(unit, timezone));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Timestamp__initialize(SEXP unit_sexp, SEXP timezone_sexp){
+ Rf_error("Cannot call Timestamp__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Time32__initialize(arrow::TimeUnit::type unit);
+extern "C" SEXP _arrow_Time32__initialize(SEXP unit_sexp){
+BEGIN_CPP11
+ arrow::r::Input<arrow::TimeUnit::type>::type unit(unit_sexp);
+ return cpp11::as_sexp(Time32__initialize(unit));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Time32__initialize(SEXP unit_sexp){
+ Rf_error("Cannot call Time32__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Time64__initialize(arrow::TimeUnit::type unit);
+extern "C" SEXP _arrow_Time64__initialize(SEXP unit_sexp){
+BEGIN_CPP11
+ arrow::r::Input<arrow::TimeUnit::type>::type unit(unit_sexp);
+ return cpp11::as_sexp(Time64__initialize(unit));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Time64__initialize(SEXP unit_sexp){
+ Rf_error("Cannot call Time64__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
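+// NOTE: the unit arguments above take the arrow::TimeUnit enum (SECOND,
+// MILLI, MICRO, NANO). Time32 stores values in 32 bits and supports only
+// second/millisecond units; Time64 uses 64 bits for micro/nanosecond
+// units. For Timestamp, an empty timezone string yields a timezone-naive
+// type.
+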
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> list__(SEXP x);
+extern "C" SEXP _arrow_list__(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<SEXP>::type x(x_sexp);
+ return cpp11::as_sexp(list__(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_list__(SEXP x_sexp){
+ Rf_error("Cannot call list__(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> large_list__(SEXP x);
+extern "C" SEXP _arrow_large_list__(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<SEXP>::type x(x_sexp);
+ return cpp11::as_sexp(large_list__(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_large_list__(SEXP x_sexp){
+ Rf_error("Cannot call large_list__(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> fixed_size_list__(SEXP x, int list_size);
+extern "C" SEXP _arrow_fixed_size_list__(SEXP x_sexp, SEXP list_size_sexp){
+BEGIN_CPP11
+ arrow::r::Input<SEXP>::type x(x_sexp);
+ arrow::r::Input<int>::type list_size(list_size_sexp);
+ return cpp11::as_sexp(fixed_size_list__(x, list_size));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fixed_size_list__(SEXP x_sexp, SEXP list_size_sexp){
+ Rf_error("Cannot call fixed_size_list__(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> struct__(const std::vector<std::shared_ptr<arrow::Field>>& fields);
+extern "C" SEXP _arrow_struct__(SEXP fields_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::vector<std::shared_ptr<arrow::Field>>&>::type fields(fields_sexp);
+ return cpp11::as_sexp(struct__(fields));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_struct__(SEXP fields_sexp){
+ Rf_error("Cannot call struct__(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
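+// NOTE: list__, large_list__, fixed_size_list__, and struct__ above are the
+// nested-type factories; the SEXP argument lets the R side pass either a
+// DataType or a Field for the child element. A hedged sketch of the
+// corresponding exported R calls (struct__ receives a list of Fields):
+//
+//   list_of(int32())                  # -> list__()
+//   struct(x = int32(), y = utf8())   # -> struct__()
+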
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string DataType__ToString(const std::shared_ptr<arrow::DataType>& type);
+extern "C" SEXP _arrow_DataType__ToString(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::DataType>&>::type type(type_sexp);
+ return cpp11::as_sexp(DataType__ToString(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_DataType__ToString(SEXP type_sexp){
+ Rf_error("Cannot call DataType__ToString(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string DataType__name(const std::shared_ptr<arrow::DataType>& type);
+extern "C" SEXP _arrow_DataType__name(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::DataType>&>::type type(type_sexp);
+ return cpp11::as_sexp(DataType__name(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_DataType__name(SEXP type_sexp){
+ Rf_error("Cannot call DataType__name(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool DataType__Equals(const std::shared_ptr<arrow::DataType>& lhs, const std::shared_ptr<arrow::DataType>& rhs);
+extern "C" SEXP _arrow_DataType__Equals(SEXP lhs_sexp, SEXP rhs_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::DataType>&>::type lhs(lhs_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::DataType>&>::type rhs(rhs_sexp);
+ return cpp11::as_sexp(DataType__Equals(lhs, rhs));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_DataType__Equals(SEXP lhs_sexp, SEXP rhs_sexp){
+ Rf_error("Cannot call DataType__Equals(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int DataType__num_fields(const std::shared_ptr<arrow::DataType>& type);
+extern "C" SEXP _arrow_DataType__num_fields(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::DataType>&>::type type(type_sexp);
+ return cpp11::as_sexp(DataType__num_fields(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_DataType__num_fields(SEXP type_sexp){
+ Rf_error("Cannot call DataType__num_fields(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::list DataType__fields(const std::shared_ptr<arrow::DataType>& type);
+extern "C" SEXP _arrow_DataType__fields(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::DataType>&>::type type(type_sexp);
+ return cpp11::as_sexp(DataType__fields(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_DataType__fields(SEXP type_sexp){
+ Rf_error("Cannot call DataType__fields(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+arrow::Type::type DataType__id(const std::shared_ptr<arrow::DataType>& type);
+extern "C" SEXP _arrow_DataType__id(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::DataType>&>::type type(type_sexp);
+ return cpp11::as_sexp(DataType__id(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_DataType__id(SEXP type_sexp){
+ Rf_error("Cannot call DataType__id(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
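+// NOTE: the DataType__* accessors above back the R6 DataType methods:
+// ToString/name drive printing, Equals implements ==, and
+// num_fields/fields introspect the children of nested types. DataType__id
+// returns the arrow::Type::type enum value, allowing the R side to
+// dispatch on an integer type id instead of a string name (assumed usage).
+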
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string ListType__ToString(const std::shared_ptr<arrow::ListType>& type);
+extern "C" SEXP _arrow_ListType__ToString(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ListType>&>::type type(type_sexp);
+ return cpp11::as_sexp(ListType__ToString(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ListType__ToString(SEXP type_sexp){
+ Rf_error("Cannot call ListType__ToString(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int FixedWidthType__bit_width(const std::shared_ptr<arrow::FixedWidthType>& type);
+extern "C" SEXP _arrow_FixedWidthType__bit_width(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::FixedWidthType>&>::type type(type_sexp);
+ return cpp11::as_sexp(FixedWidthType__bit_width(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_FixedWidthType__bit_width(SEXP type_sexp){
+ Rf_error("Cannot call FixedWidthType__bit_width(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+arrow::DateUnit DateType__unit(const std::shared_ptr<arrow::DateType>& type);
+extern "C" SEXP _arrow_DateType__unit(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::DateType>&>::type type(type_sexp);
+ return cpp11::as_sexp(DateType__unit(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_DateType__unit(SEXP type_sexp){
+ Rf_error("Cannot call DateType__unit(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+arrow::TimeUnit::type TimeType__unit(const std::shared_ptr<arrow::TimeType>& type);
+extern "C" SEXP _arrow_TimeType__unit(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::TimeType>&>::type type(type_sexp);
+ return cpp11::as_sexp(TimeType__unit(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_TimeType__unit(SEXP type_sexp){
+ Rf_error("Cannot call TimeType__unit(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int32_t DecimalType__precision(const std::shared_ptr<arrow::DecimalType>& type);
+extern "C" SEXP _arrow_DecimalType__precision(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::DecimalType>&>::type type(type_sexp);
+ return cpp11::as_sexp(DecimalType__precision(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_DecimalType__precision(SEXP type_sexp){
+ Rf_error("Cannot call DecimalType__precision(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int32_t DecimalType__scale(const std::shared_ptr<arrow::DecimalType>& type);
+extern "C" SEXP _arrow_DecimalType__scale(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::DecimalType>&>::type type(type_sexp);
+ return cpp11::as_sexp(DecimalType__scale(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_DecimalType__scale(SEXP type_sexp){
+ Rf_error("Cannot call DecimalType__scale(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string TimestampType__timezone(const std::shared_ptr<arrow::TimestampType>& type);
+extern "C" SEXP _arrow_TimestampType__timezone(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::TimestampType>&>::type type(type_sexp);
+ return cpp11::as_sexp(TimestampType__timezone(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_TimestampType__timezone(SEXP type_sexp){
+ Rf_error("Cannot call TimestampType__timezone(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+arrow::TimeUnit::type TimestampType__unit(const std::shared_ptr<arrow::TimestampType>& type);
+extern "C" SEXP _arrow_TimestampType__unit(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::TimestampType>&>::type type(type_sexp);
+ return cpp11::as_sexp(TimestampType__unit(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_TimestampType__unit(SEXP type_sexp){
+ Rf_error("Cannot call TimestampType__unit(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> DictionaryType__initialize(const std::shared_ptr<arrow::DataType>& index_type, const std::shared_ptr<arrow::DataType>& value_type, bool ordered);
+extern "C" SEXP _arrow_DictionaryType__initialize(SEXP index_type_sexp, SEXP value_type_sexp, SEXP ordered_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::DataType>&>::type index_type(index_type_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::DataType>&>::type value_type(value_type_sexp);
+ arrow::r::Input<bool>::type ordered(ordered_sexp);
+ return cpp11::as_sexp(DictionaryType__initialize(index_type, value_type, ordered));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_DictionaryType__initialize(SEXP index_type_sexp, SEXP value_type_sexp, SEXP ordered_sexp){
+ Rf_error("Cannot call DictionaryType__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> DictionaryType__index_type(const std::shared_ptr<arrow::DictionaryType>& type);
+extern "C" SEXP _arrow_DictionaryType__index_type(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::DictionaryType>&>::type type(type_sexp);
+ return cpp11::as_sexp(DictionaryType__index_type(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_DictionaryType__index_type(SEXP type_sexp){
+ Rf_error("Cannot call DictionaryType__index_type(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> DictionaryType__value_type(const std::shared_ptr<arrow::DictionaryType>& type);
+extern "C" SEXP _arrow_DictionaryType__value_type(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::DictionaryType>&>::type type(type_sexp);
+ return cpp11::as_sexp(DictionaryType__value_type(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_DictionaryType__value_type(SEXP type_sexp){
+ Rf_error("Cannot call DictionaryType__value_type(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string DictionaryType__name(const std::shared_ptr<arrow::DictionaryType>& type);
+extern "C" SEXP _arrow_DictionaryType__name(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::DictionaryType>&>::type type(type_sexp);
+ return cpp11::as_sexp(DictionaryType__name(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_DictionaryType__name(SEXP type_sexp){
+ Rf_error("Cannot call DictionaryType__name(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool DictionaryType__ordered(const std::shared_ptr<arrow::DictionaryType>& type);
+extern "C" SEXP _arrow_DictionaryType__ordered(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::DictionaryType>&>::type type(type_sexp);
+ return cpp11::as_sexp(DictionaryType__ordered(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_DictionaryType__ordered(SEXP type_sexp){
+ Rf_error("Cannot call DictionaryType__ordered(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
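+// NOTE: a dictionary type is Arrow's factor-like encoding: arrays hold
+// integer indices of index_type pointing into a dictionary of value_type
+// values, and ordered records whether dictionary order is meaningful
+// (like an ordered factor). R factors round-trip as dictionary arrays with
+// utf8 values.
+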
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Field> StructType__GetFieldByName(const std::shared_ptr<arrow::StructType>& type, const std::string& name);
+extern "C" SEXP _arrow_StructType__GetFieldByName(SEXP type_sexp, SEXP name_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::StructType>&>::type type(type_sexp);
+ arrow::r::Input<const std::string&>::type name(name_sexp);
+ return cpp11::as_sexp(StructType__GetFieldByName(type, name));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_StructType__GetFieldByName(SEXP type_sexp, SEXP name_sexp){
+ Rf_error("Cannot call StructType__GetFieldByName(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int StructType__GetFieldIndex(const std::shared_ptr<arrow::StructType>& type, const std::string& name);
+extern "C" SEXP _arrow_StructType__GetFieldIndex(SEXP type_sexp, SEXP name_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::StructType>&>::type type(type_sexp);
+ arrow::r::Input<const std::string&>::type name(name_sexp);
+ return cpp11::as_sexp(StructType__GetFieldIndex(type, name));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_StructType__GetFieldIndex(SEXP type_sexp, SEXP name_sexp){
+ Rf_error("Cannot call StructType__GetFieldIndex(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::vector<std::string> StructType__field_names(const std::shared_ptr<arrow::StructType>& type);
+extern "C" SEXP _arrow_StructType__field_names(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::StructType>&>::type type(type_sexp);
+ return cpp11::as_sexp(StructType__field_names(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_StructType__field_names(SEXP type_sexp){
+ Rf_error("Cannot call StructType__field_names(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Field> ListType__value_field(const std::shared_ptr<arrow::ListType>& type);
+extern "C" SEXP _arrow_ListType__value_field(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ListType>&>::type type(type_sexp);
+ return cpp11::as_sexp(ListType__value_field(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ListType__value_field(SEXP type_sexp){
+ Rf_error("Cannot call ListType__value_field(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> ListType__value_type(const std::shared_ptr<arrow::ListType>& type);
+extern "C" SEXP _arrow_ListType__value_type(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ListType>&>::type type(type_sexp);
+ return cpp11::as_sexp(ListType__value_type(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ListType__value_type(SEXP type_sexp){
+ Rf_error("Cannot call ListType__value_type(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Field> LargeListType__value_field(const std::shared_ptr<arrow::LargeListType>& type);
+extern "C" SEXP _arrow_LargeListType__value_field(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::LargeListType>&>::type type(type_sexp);
+ return cpp11::as_sexp(LargeListType__value_field(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_LargeListType__value_field(SEXP type_sexp){
+ Rf_error("Cannot call LargeListType__value_field(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> LargeListType__value_type(const std::shared_ptr<arrow::LargeListType>& type);
+extern "C" SEXP _arrow_LargeListType__value_type(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::LargeListType>&>::type type(type_sexp);
+ return cpp11::as_sexp(LargeListType__value_type(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_LargeListType__value_type(SEXP type_sexp){
+ Rf_error("Cannot call LargeListType__value_type(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Field> FixedSizeListType__value_field(const std::shared_ptr<arrow::FixedSizeListType>& type);
+extern "C" SEXP _arrow_FixedSizeListType__value_field(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::FixedSizeListType>&>::type type(type_sexp);
+ return cpp11::as_sexp(FixedSizeListType__value_field(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_FixedSizeListType__value_field(SEXP type_sexp){
+ Rf_error("Cannot call FixedSizeListType__value_field(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> FixedSizeListType__value_type(const std::shared_ptr<arrow::FixedSizeListType>& type);
+extern "C" SEXP _arrow_FixedSizeListType__value_type(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::FixedSizeListType>&>::type type(type_sexp);
+ return cpp11::as_sexp(FixedSizeListType__value_type(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_FixedSizeListType__value_type(SEXP type_sexp){
+ Rf_error("Cannot call FixedSizeListType__value_type(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int FixedSizeListType__list_size(const std::shared_ptr<arrow::FixedSizeListType>& type);
+extern "C" SEXP _arrow_FixedSizeListType__list_size(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::FixedSizeListType>&>::type type(type_sexp);
+ return cpp11::as_sexp(FixedSizeListType__list_size(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_FixedSizeListType__list_size(SEXP type_sexp){
+ Rf_error("Cannot call FixedSizeListType__list_size(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
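+// NOTE: the expression.cpp wrappers that follow expose the three node
+// kinds of compute::Expression trees consumed by the dplyr backend:
+// field_ref (a column reference), scalar (a literal), and call (a named
+// compute function over sub-expressions plus an options list). A hedged
+// sketch of assembling `x + 1` through the generated R bindings (the
+// arrow package itself goes through Expression$create()):
+//
+//   lhs  <- compute___expr__field_ref("x")
+//   rhs  <- compute___expr__scalar(Scalar$create(1L))
+//   expr <- compute___expr__call("add", list(lhs, rhs), list())
+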
+// expression.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool compute___expr__equals(const std::shared_ptr<compute::Expression>& lhs, const std::shared_ptr<compute::Expression>& rhs);
+extern "C" SEXP _arrow_compute___expr__equals(SEXP lhs_sexp, SEXP rhs_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<compute::Expression>&>::type lhs(lhs_sexp);
+ arrow::r::Input<const std::shared_ptr<compute::Expression>&>::type rhs(rhs_sexp);
+ return cpp11::as_sexp(compute___expr__equals(lhs, rhs));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_compute___expr__equals(SEXP lhs_sexp, SEXP rhs_sexp){
+ Rf_error("Cannot call compute___expr__equals(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// expression.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<compute::Expression> compute___expr__call(std::string func_name, cpp11::list argument_list, cpp11::list options);
+extern "C" SEXP _arrow_compute___expr__call(SEXP func_name_sexp, SEXP argument_list_sexp, SEXP options_sexp){
+BEGIN_CPP11
+ arrow::r::Input<std::string>::type func_name(func_name_sexp);
+ arrow::r::Input<cpp11::list>::type argument_list(argument_list_sexp);
+ arrow::r::Input<cpp11::list>::type options(options_sexp);
+ return cpp11::as_sexp(compute___expr__call(func_name, argument_list, options));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_compute___expr__call(SEXP func_name_sexp, SEXP argument_list_sexp, SEXP options_sexp){
+ Rf_error("Cannot call compute___expr__call(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// expression.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::vector<std::string> field_names_in_expression(const std::shared_ptr<compute::Expression>& x);
+extern "C" SEXP _arrow_field_names_in_expression(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<compute::Expression>&>::type x(x_sexp);
+ return cpp11::as_sexp(field_names_in_expression(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_field_names_in_expression(SEXP x_sexp){
+ Rf_error("Cannot call field_names_in_expression(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// expression.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string compute___expr__get_field_ref_name(const std::shared_ptr<compute::Expression>& x);
+extern "C" SEXP _arrow_compute___expr__get_field_ref_name(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<compute::Expression>&>::type x(x_sexp);
+ return cpp11::as_sexp(compute___expr__get_field_ref_name(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_compute___expr__get_field_ref_name(SEXP x_sexp){
+ Rf_error("Cannot call compute___expr__get_field_ref_name(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// expression.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<compute::Expression> compute___expr__field_ref(std::string name);
+extern "C" SEXP _arrow_compute___expr__field_ref(SEXP name_sexp){
+BEGIN_CPP11
+ arrow::r::Input<std::string>::type name(name_sexp);
+ return cpp11::as_sexp(compute___expr__field_ref(name));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_compute___expr__field_ref(SEXP name_sexp){
+ Rf_error("Cannot call compute___expr__field_ref(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// expression.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<compute::Expression> compute___expr__scalar(const std::shared_ptr<arrow::Scalar>& x);
+extern "C" SEXP _arrow_compute___expr__scalar(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Scalar>&>::type x(x_sexp);
+ return cpp11::as_sexp(compute___expr__scalar(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_compute___expr__scalar(SEXP x_sexp){
+ Rf_error("Cannot call compute___expr__scalar(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// expression.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string compute___expr__ToString(const std::shared_ptr<compute::Expression>& x);
+extern "C" SEXP _arrow_compute___expr__ToString(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<compute::Expression>&>::type x(x_sexp);
+ return cpp11::as_sexp(compute___expr__ToString(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_compute___expr__ToString(SEXP x_sexp){
+ Rf_error("Cannot call compute___expr__ToString(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// expression.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> compute___expr__type(const std::shared_ptr<compute::Expression>& x, const std::shared_ptr<arrow::Schema>& schema);
+extern "C" SEXP _arrow_compute___expr__type(SEXP x_sexp, SEXP schema_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<compute::Expression>&>::type x(x_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schema(schema_sexp);
+ return cpp11::as_sexp(compute___expr__type(x, schema));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_compute___expr__type(SEXP x_sexp, SEXP schema_sexp){
+ Rf_error("Cannot call compute___expr__type(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// expression.cpp
+#if defined(ARROW_R_WITH_ARROW)
+arrow::Type::type compute___expr__type_id(const std::shared_ptr<compute::Expression>& x, const std::shared_ptr<arrow::Schema>& schema);
+extern "C" SEXP _arrow_compute___expr__type_id(SEXP x_sexp, SEXP schema_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<compute::Expression>&>::type x(x_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schema(schema_sexp);
+ return cpp11::as_sexp(compute___expr__type_id(x, schema));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_compute___expr__type_id(SEXP x_sexp, SEXP schema_sexp){
+ Rf_error("Cannot call compute___expr__type_id(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
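+// NOTE: in ipc___WriteFeather__Table below, version selects the format:
+// 1 is the legacy Feather format, 2 is the Arrow IPC file format
+// ("Feather V2"), the variant that honors chunk_size and the lz4/zstd
+// codecs passed through arrow::Compression::type.
+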
+// feather.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void ipc___WriteFeather__Table(const std::shared_ptr<arrow::io::OutputStream>& stream, const std::shared_ptr<arrow::Table>& table, int version, int chunk_size, arrow::Compression::type compression, int compression_level);
+extern "C" SEXP _arrow_ipc___WriteFeather__Table(SEXP stream_sexp, SEXP table_sexp, SEXP version_sexp, SEXP chunk_size_sexp, SEXP compression_sexp, SEXP compression_level_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::OutputStream>&>::type stream(stream_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ arrow::r::Input<int>::type version(version_sexp);
+ arrow::r::Input<int>::type chunk_size(chunk_size_sexp);
+ arrow::r::Input<arrow::Compression::type>::type compression(compression_sexp);
+ arrow::r::Input<int>::type compression_level(compression_level_sexp);
+ ipc___WriteFeather__Table(stream, table, version, chunk_size, compression, compression_level);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___WriteFeather__Table(SEXP stream_sexp, SEXP table_sexp, SEXP version_sexp, SEXP chunk_size_sexp, SEXP compression_sexp, SEXP compression_level_sexp){
+ Rf_error("Cannot call ipc___WriteFeather__Table(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// feather.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int ipc___feather___Reader__version(const std::shared_ptr<arrow::ipc::feather::Reader>& reader);
+extern "C" SEXP _arrow_ipc___feather___Reader__version(SEXP reader_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ipc::feather::Reader>&>::type reader(reader_sexp);
+ return cpp11::as_sexp(ipc___feather___Reader__version(reader));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___feather___Reader__version(SEXP reader_sexp){
+ Rf_error("Cannot call ipc___feather___Reader__version(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// feather.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Table> ipc___feather___Reader__Read(const std::shared_ptr<arrow::ipc::feather::Reader>& reader, SEXP columns);
+extern "C" SEXP _arrow_ipc___feather___Reader__Read(SEXP reader_sexp, SEXP columns_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ipc::feather::Reader>&>::type reader(reader_sexp);
+ arrow::r::Input<SEXP>::type columns(columns_sexp);
+ return cpp11::as_sexp(ipc___feather___Reader__Read(reader, columns));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___feather___Reader__Read(SEXP reader_sexp, SEXP columns_sexp){
+ Rf_error("Cannot call ipc___feather___Reader__Read(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// feather.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::ipc::feather::Reader> ipc___feather___Reader__Open(const std::shared_ptr<arrow::io::RandomAccessFile>& stream);
+extern "C" SEXP _arrow_ipc___feather___Reader__Open(SEXP stream_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::RandomAccessFile>&>::type stream(stream_sexp);
+ return cpp11::as_sexp(ipc___feather___Reader__Open(stream));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___feather___Reader__Open(SEXP stream_sexp){
+ Rf_error("Cannot call ipc___feather___Reader__Open(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// feather.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Schema> ipc___feather___Reader__schema(const std::shared_ptr<arrow::ipc::feather::Reader>& reader);
+extern "C" SEXP _arrow_ipc___feather___Reader__schema(SEXP reader_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ipc::feather::Reader>&>::type reader(reader_sexp);
+ return cpp11::as_sexp(ipc___feather___Reader__schema(reader));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___feather___Reader__schema(SEXP reader_sexp){
+ Rf_error("Cannot call ipc___feather___Reader__schema(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
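+// NOTE: a Field is a (name, type, nullable) triple; the second parameter
+// of Field__initialize is the field's DataType even though it is named
+// `field`. R-level usage exported by the package:
+//
+//   field("x", int32())   # the nullable flag is passed by the R wrapper
+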
+// field.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Field> Field__initialize(const std::string& name, const std::shared_ptr<arrow::DataType>& field, bool nullable);
+extern "C" SEXP _arrow_Field__initialize(SEXP name_sexp, SEXP field_sexp, SEXP nullable_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::string&>::type name(name_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::DataType>&>::type field(field_sexp);
+ arrow::r::Input<bool>::type nullable(nullable_sexp);
+ return cpp11::as_sexp(Field__initialize(name, field, nullable));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Field__initialize(SEXP name_sexp, SEXP field_sexp, SEXP nullable_sexp){
+ Rf_error("Cannot call Field__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// field.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string Field__ToString(const std::shared_ptr<arrow::Field>& field);
+extern "C" SEXP _arrow_Field__ToString(SEXP field_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Field>&>::type field(field_sexp);
+ return cpp11::as_sexp(Field__ToString(field));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Field__ToString(SEXP field_sexp){
+ Rf_error("Cannot call Field__ToString(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// field.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string Field__name(const std::shared_ptr<arrow::Field>& field);
+extern "C" SEXP _arrow_Field__name(SEXP field_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Field>&>::type field(field_sexp);
+ return cpp11::as_sexp(Field__name(field));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Field__name(SEXP field_sexp){
+ Rf_error("Cannot call Field__name(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// field.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool Field__Equals(const std::shared_ptr<arrow::Field>& field, const std::shared_ptr<arrow::Field>& other);
+extern "C" SEXP _arrow_Field__Equals(SEXP field_sexp, SEXP other_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Field>&>::type field(field_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Field>&>::type other(other_sexp);
+ return cpp11::as_sexp(Field__Equals(field, other));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Field__Equals(SEXP field_sexp, SEXP other_sexp){
+ Rf_error("Cannot call Field__Equals(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// field.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool Field__nullable(const std::shared_ptr<arrow::Field>& field);
+extern "C" SEXP _arrow_Field__nullable(SEXP field_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Field>&>::type field(field_sexp);
+ return cpp11::as_sexp(Field__nullable(field));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Field__nullable(SEXP field_sexp){
+ Rf_error("Cannot call Field__nullable(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// field.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Field__type(const std::shared_ptr<arrow::Field>& field);
+extern "C" SEXP _arrow_Field__type(SEXP field_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Field>&>::type field(field_sexp);
+ return cpp11::as_sexp(Field__type(field));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Field__type(SEXP field_sexp){
+ Rf_error("Cannot call Field__type(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
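+// NOTE: the fs___* wrappers below expose the generic fs::FileSystem API:
+// FileInfo metadata (with setters), FileSelector for directory listing,
+// and the CreateDir/Delete*/Move/CopyFile verbs shared by the concrete
+// filesystems (local, S3, subtree). Void C++ functions return R_NilValue
+// because .Call() must always yield a SEXP; the R side calls them purely
+// for their side effects.
+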
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+fs::FileType fs___FileInfo__type(const std::shared_ptr<fs::FileInfo>& x);
+extern "C" SEXP _arrow_fs___FileInfo__type(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileInfo>&>::type x(x_sexp);
+ return cpp11::as_sexp(fs___FileInfo__type(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileInfo__type(SEXP x_sexp){
+ Rf_error("Cannot call fs___FileInfo__type(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void fs___FileInfo__set_type(const std::shared_ptr<fs::FileInfo>& x, fs::FileType type);
+extern "C" SEXP _arrow_fs___FileInfo__set_type(SEXP x_sexp, SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileInfo>&>::type x(x_sexp);
+ arrow::r::Input<fs::FileType>::type type(type_sexp);
+ fs___FileInfo__set_type(x, type);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileInfo__set_type(SEXP x_sexp, SEXP type_sexp){
+ Rf_error("Cannot call fs___FileInfo__set_type(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string fs___FileInfo__path(const std::shared_ptr<fs::FileInfo>& x);
+extern "C" SEXP _arrow_fs___FileInfo__path(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileInfo>&>::type x(x_sexp);
+ return cpp11::as_sexp(fs___FileInfo__path(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileInfo__path(SEXP x_sexp){
+ Rf_error("Cannot call fs___FileInfo__path(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void fs___FileInfo__set_path(const std::shared_ptr<fs::FileInfo>& x, const std::string& path);
+extern "C" SEXP _arrow_fs___FileInfo__set_path(SEXP x_sexp, SEXP path_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileInfo>&>::type x(x_sexp);
+ arrow::r::Input<const std::string&>::type path(path_sexp);
+ fs___FileInfo__set_path(x, path);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileInfo__set_path(SEXP x_sexp, SEXP path_sexp){
+ Rf_error("Cannot call fs___FileInfo__set_path(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int64_t fs___FileInfo__size(const std::shared_ptr<fs::FileInfo>& x);
+extern "C" SEXP _arrow_fs___FileInfo__size(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileInfo>&>::type x(x_sexp);
+ return cpp11::as_sexp(fs___FileInfo__size(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileInfo__size(SEXP x_sexp){
+ Rf_error("Cannot call fs___FileInfo__size(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void fs___FileInfo__set_size(const std::shared_ptr<fs::FileInfo>& x, int64_t size);
+extern "C" SEXP _arrow_fs___FileInfo__set_size(SEXP x_sexp, SEXP size_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileInfo>&>::type x(x_sexp);
+ arrow::r::Input<int64_t>::type size(size_sexp);
+ fs___FileInfo__set_size(x, size);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileInfo__set_size(SEXP x_sexp, SEXP size_sexp){
+ Rf_error("Cannot call fs___FileInfo__set_size(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string fs___FileInfo__base_name(const std::shared_ptr<fs::FileInfo>& x);
+extern "C" SEXP _arrow_fs___FileInfo__base_name(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileInfo>&>::type x(x_sexp);
+ return cpp11::as_sexp(fs___FileInfo__base_name(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileInfo__base_name(SEXP x_sexp){
+ Rf_error("Cannot call fs___FileInfo__base_name(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string fs___FileInfo__extension(const std::shared_ptr<fs::FileInfo>& x);
+extern "C" SEXP _arrow_fs___FileInfo__extension(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileInfo>&>::type x(x_sexp);
+ return cpp11::as_sexp(fs___FileInfo__extension(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileInfo__extension(SEXP x_sexp){
+ Rf_error("Cannot call fs___FileInfo__extension(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+SEXP fs___FileInfo__mtime(const std::shared_ptr<fs::FileInfo>& x);
+extern "C" SEXP _arrow_fs___FileInfo__mtime(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileInfo>&>::type x(x_sexp);
+ return cpp11::as_sexp(fs___FileInfo__mtime(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileInfo__mtime(SEXP x_sexp){
+ Rf_error("Cannot call fs___FileInfo__mtime(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void fs___FileInfo__set_mtime(const std::shared_ptr<fs::FileInfo>& x, SEXP time);
+extern "C" SEXP _arrow_fs___FileInfo__set_mtime(SEXP x_sexp, SEXP time_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileInfo>&>::type x(x_sexp);
+ arrow::r::Input<SEXP>::type time(time_sexp);
+ fs___FileInfo__set_mtime(x, time);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileInfo__set_mtime(SEXP x_sexp, SEXP time_sexp){
+ Rf_error("Cannot call fs___FileInfo__set_mtime(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string fs___FileSelector__base_dir(const std::shared_ptr<fs::FileSelector>& selector);
+extern "C" SEXP _arrow_fs___FileSelector__base_dir(SEXP selector_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSelector>&>::type selector(selector_sexp);
+ return cpp11::as_sexp(fs___FileSelector__base_dir(selector));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileSelector__base_dir(SEXP selector_sexp){
+ Rf_error("Cannot call fs___FileSelector__base_dir(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool fs___FileSelector__allow_not_found(const std::shared_ptr<fs::FileSelector>& selector);
+extern "C" SEXP _arrow_fs___FileSelector__allow_not_found(SEXP selector_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSelector>&>::type selector(selector_sexp);
+ return cpp11::as_sexp(fs___FileSelector__allow_not_found(selector));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileSelector__allow_not_found(SEXP selector_sexp){
+ Rf_error("Cannot call fs___FileSelector__allow_not_found(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool fs___FileSelector__recursive(const std::shared_ptr<fs::FileSelector>& selector);
+extern "C" SEXP _arrow_fs___FileSelector__recursive(SEXP selector_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSelector>&>::type selector(selector_sexp);
+ return cpp11::as_sexp(fs___FileSelector__recursive(selector));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileSelector__recursive(SEXP selector_sexp){
+ Rf_error("Cannot call fs___FileSelector__recursive(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<fs::FileSelector> fs___FileSelector__create(const std::string& base_dir, bool allow_not_found, bool recursive);
+extern "C" SEXP _arrow_fs___FileSelector__create(SEXP base_dir_sexp, SEXP allow_not_found_sexp, SEXP recursive_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::string&>::type base_dir(base_dir_sexp);
+ arrow::r::Input<bool>::type allow_not_found(allow_not_found_sexp);
+ arrow::r::Input<bool>::type recursive(recursive_sexp);
+ return cpp11::as_sexp(fs___FileSelector__create(base_dir, allow_not_found, recursive));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileSelector__create(SEXP base_dir_sexp, SEXP allow_not_found_sexp, SEXP recursive_sexp){
+ Rf_error("Cannot call fs___FileSelector__create(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::list fs___FileSystem__GetTargetInfos_Paths(const std::shared_ptr<fs::FileSystem>& file_system, const std::vector<std::string>& paths);
+extern "C" SEXP _arrow_fs___FileSystem__GetTargetInfos_Paths(SEXP file_system_sexp, SEXP paths_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type file_system(file_system_sexp);
+ arrow::r::Input<const std::vector<std::string>&>::type paths(paths_sexp);
+ return cpp11::as_sexp(fs___FileSystem__GetTargetInfos_Paths(file_system, paths));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileSystem__GetTargetInfos_Paths(SEXP file_system_sexp, SEXP paths_sexp){
+ Rf_error("Cannot call fs___FileSystem__GetTargetInfos_Paths(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::list fs___FileSystem__GetTargetInfos_FileSelector(const std::shared_ptr<fs::FileSystem>& file_system, const std::shared_ptr<fs::FileSelector>& selector);
+extern "C" SEXP _arrow_fs___FileSystem__GetTargetInfos_FileSelector(SEXP file_system_sexp, SEXP selector_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type file_system(file_system_sexp);
+ arrow::r::Input<const std::shared_ptr<fs::FileSelector>&>::type selector(selector_sexp);
+ return cpp11::as_sexp(fs___FileSystem__GetTargetInfos_FileSelector(file_system, selector));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileSystem__GetTargetInfos_FileSelector(SEXP file_system_sexp, SEXP selector_sexp){
+ Rf_error("Cannot call fs___FileSystem__GetTargetInfos_FileSelector(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void fs___FileSystem__CreateDir(const std::shared_ptr<fs::FileSystem>& file_system, const std::string& path, bool recursive);
+extern "C" SEXP _arrow_fs___FileSystem__CreateDir(SEXP file_system_sexp, SEXP path_sexp, SEXP recursive_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type file_system(file_system_sexp);
+ arrow::r::Input<const std::string&>::type path(path_sexp);
+ arrow::r::Input<bool>::type recursive(recursive_sexp);
+ fs___FileSystem__CreateDir(file_system, path, recursive);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileSystem__CreateDir(SEXP file_system_sexp, SEXP path_sexp, SEXP recursive_sexp){
+ Rf_error("Cannot call fs___FileSystem__CreateDir(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void fs___FileSystem__DeleteDir(const std::shared_ptr<fs::FileSystem>& file_system, const std::string& path);
+extern "C" SEXP _arrow_fs___FileSystem__DeleteDir(SEXP file_system_sexp, SEXP path_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type file_system(file_system_sexp);
+ arrow::r::Input<const std::string&>::type path(path_sexp);
+ fs___FileSystem__DeleteDir(file_system, path);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileSystem__DeleteDir(SEXP file_system_sexp, SEXP path_sexp){
+ Rf_error("Cannot call fs___FileSystem__DeleteDir(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void fs___FileSystem__DeleteDirContents(const std::shared_ptr<fs::FileSystem>& file_system, const std::string& path);
+extern "C" SEXP _arrow_fs___FileSystem__DeleteDirContents(SEXP file_system_sexp, SEXP path_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type file_system(file_system_sexp);
+ arrow::r::Input<const std::string&>::type path(path_sexp);
+ fs___FileSystem__DeleteDirContents(file_system, path);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileSystem__DeleteDirContents(SEXP file_system_sexp, SEXP path_sexp){
+ Rf_error("Cannot call fs___FileSystem__DeleteDirContents(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void fs___FileSystem__DeleteFile(const std::shared_ptr<fs::FileSystem>& file_system, const std::string& path);
+extern "C" SEXP _arrow_fs___FileSystem__DeleteFile(SEXP file_system_sexp, SEXP path_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type file_system(file_system_sexp);
+ arrow::r::Input<const std::string&>::type path(path_sexp);
+ fs___FileSystem__DeleteFile(file_system, path);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileSystem__DeleteFile(SEXP file_system_sexp, SEXP path_sexp){
+ Rf_error("Cannot call fs___FileSystem__DeleteFile(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void fs___FileSystem__DeleteFiles(const std::shared_ptr<fs::FileSystem>& file_system, const std::vector<std::string>& paths);
+extern "C" SEXP _arrow_fs___FileSystem__DeleteFiles(SEXP file_system_sexp, SEXP paths_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type file_system(file_system_sexp);
+ arrow::r::Input<const std::vector<std::string>&>::type paths(paths_sexp);
+ fs___FileSystem__DeleteFiles(file_system, paths);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileSystem__DeleteFiles(SEXP file_system_sexp, SEXP paths_sexp){
+ Rf_error("Cannot call fs___FileSystem__DeleteFiles(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void fs___FileSystem__Move(const std::shared_ptr<fs::FileSystem>& file_system, const std::string& src, const std::string& dest);
+extern "C" SEXP _arrow_fs___FileSystem__Move(SEXP file_system_sexp, SEXP src_sexp, SEXP dest_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type file_system(file_system_sexp);
+ arrow::r::Input<const std::string&>::type src(src_sexp);
+ arrow::r::Input<const std::string&>::type dest(dest_sexp);
+ fs___FileSystem__Move(file_system, src, dest);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileSystem__Move(SEXP file_system_sexp, SEXP src_sexp, SEXP dest_sexp){
+ Rf_error("Cannot call fs___FileSystem__Move(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void fs___FileSystem__CopyFile(const std::shared_ptr<fs::FileSystem>& file_system, const std::string& src, const std::string& dest);
+extern "C" SEXP _arrow_fs___FileSystem__CopyFile(SEXP file_system_sexp, SEXP src_sexp, SEXP dest_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type file_system(file_system_sexp);
+ arrow::r::Input<const std::string&>::type src(src_sexp);
+ arrow::r::Input<const std::string&>::type dest(dest_sexp);
+ fs___FileSystem__CopyFile(file_system, src, dest);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileSystem__CopyFile(SEXP file_system_sexp, SEXP src_sexp, SEXP dest_sexp){
+ Rf_error("Cannot call fs___FileSystem__CopyFile(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::io::InputStream> fs___FileSystem__OpenInputStream(const std::shared_ptr<fs::FileSystem>& file_system, const std::string& path);
+extern "C" SEXP _arrow_fs___FileSystem__OpenInputStream(SEXP file_system_sexp, SEXP path_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type file_system(file_system_sexp);
+ arrow::r::Input<const std::string&>::type path(path_sexp);
+ return cpp11::as_sexp(fs___FileSystem__OpenInputStream(file_system, path));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileSystem__OpenInputStream(SEXP file_system_sexp, SEXP path_sexp){
+ Rf_error("Cannot call fs___FileSystem__OpenInputStream(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::io::RandomAccessFile> fs___FileSystem__OpenInputFile(const std::shared_ptr<fs::FileSystem>& file_system, const std::string& path);
+extern "C" SEXP _arrow_fs___FileSystem__OpenInputFile(SEXP file_system_sexp, SEXP path_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type file_system(file_system_sexp);
+ arrow::r::Input<const std::string&>::type path(path_sexp);
+ return cpp11::as_sexp(fs___FileSystem__OpenInputFile(file_system, path));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileSystem__OpenInputFile(SEXP file_system_sexp, SEXP path_sexp){
+ Rf_error("Cannot call fs___FileSystem__OpenInputFile(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::io::OutputStream> fs___FileSystem__OpenOutputStream(const std::shared_ptr<fs::FileSystem>& file_system, const std::string& path);
+extern "C" SEXP _arrow_fs___FileSystem__OpenOutputStream(SEXP file_system_sexp, SEXP path_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type file_system(file_system_sexp);
+ arrow::r::Input<const std::string&>::type path(path_sexp);
+ return cpp11::as_sexp(fs___FileSystem__OpenOutputStream(file_system, path));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileSystem__OpenOutputStream(SEXP file_system_sexp, SEXP path_sexp){
+ Rf_error("Cannot call fs___FileSystem__OpenOutputStream(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::io::OutputStream> fs___FileSystem__OpenAppendStream(const std::shared_ptr<fs::FileSystem>& file_system, const std::string& path);
+extern "C" SEXP _arrow_fs___FileSystem__OpenAppendStream(SEXP file_system_sexp, SEXP path_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type file_system(file_system_sexp);
+ arrow::r::Input<const std::string&>::type path(path_sexp);
+ return cpp11::as_sexp(fs___FileSystem__OpenAppendStream(file_system, path));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileSystem__OpenAppendStream(SEXP file_system_sexp, SEXP path_sexp){
+ Rf_error("Cannot call fs___FileSystem__OpenAppendStream(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string fs___FileSystem__type_name(const std::shared_ptr<fs::FileSystem>& file_system);
+extern "C" SEXP _arrow_fs___FileSystem__type_name(SEXP file_system_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type file_system(file_system_sexp);
+ return cpp11::as_sexp(fs___FileSystem__type_name(file_system));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileSystem__type_name(SEXP file_system_sexp){
+ Rf_error("Cannot call fs___FileSystem__type_name(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<fs::LocalFileSystem> fs___LocalFileSystem__create();
+extern "C" SEXP _arrow_fs___LocalFileSystem__create(){
+BEGIN_CPP11
+ return cpp11::as_sexp(fs___LocalFileSystem__create());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___LocalFileSystem__create(){
+ Rf_error("Cannot call fs___LocalFileSystem__create(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<fs::SubTreeFileSystem> fs___SubTreeFileSystem__create(const std::string& base_path, const std::shared_ptr<fs::FileSystem>& base_fs);
+extern "C" SEXP _arrow_fs___SubTreeFileSystem__create(SEXP base_path_sexp, SEXP base_fs_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::string&>::type base_path(base_path_sexp);
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type base_fs(base_fs_sexp);
+ return cpp11::as_sexp(fs___SubTreeFileSystem__create(base_path, base_fs));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___SubTreeFileSystem__create(SEXP base_path_sexp, SEXP base_fs_sexp){
+ Rf_error("Cannot call fs___SubTreeFileSystem__create(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<fs::FileSystem> fs___SubTreeFileSystem__base_fs(const std::shared_ptr<fs::SubTreeFileSystem>& file_system);
+extern "C" SEXP _arrow_fs___SubTreeFileSystem__base_fs(SEXP file_system_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::SubTreeFileSystem>&>::type file_system(file_system_sexp);
+ return cpp11::as_sexp(fs___SubTreeFileSystem__base_fs(file_system));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___SubTreeFileSystem__base_fs(SEXP file_system_sexp){
+ Rf_error("Cannot call fs___SubTreeFileSystem__base_fs(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string fs___SubTreeFileSystem__base_path(const std::shared_ptr<fs::SubTreeFileSystem>& file_system);
+extern "C" SEXP _arrow_fs___SubTreeFileSystem__base_path(SEXP file_system_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::SubTreeFileSystem>&>::type file_system(file_system_sexp);
+ return cpp11::as_sexp(fs___SubTreeFileSystem__base_path(file_system));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___SubTreeFileSystem__base_path(SEXP file_system_sexp){
+ Rf_error("Cannot call fs___SubTreeFileSystem__base_path(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::writable::list fs___FileSystemFromUri(const std::string& path);
+extern "C" SEXP _arrow_fs___FileSystemFromUri(SEXP path_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::string&>::type path(path_sexp);
+ return cpp11::as_sexp(fs___FileSystemFromUri(path));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___FileSystemFromUri(SEXP path_sexp){
+ Rf_error("Cannot call fs___FileSystemFromUri(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void fs___CopyFiles(const std::shared_ptr<fs::FileSystem>& source_fs, const std::shared_ptr<fs::FileSelector>& source_sel, const std::shared_ptr<fs::FileSystem>& destination_fs, const std::string& destination_base_dir, int64_t chunk_size, bool use_threads);
+extern "C" SEXP _arrow_fs___CopyFiles(SEXP source_fs_sexp, SEXP source_sel_sexp, SEXP destination_fs_sexp, SEXP destination_base_dir_sexp, SEXP chunk_size_sexp, SEXP use_threads_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type source_fs(source_fs_sexp);
+ arrow::r::Input<const std::shared_ptr<fs::FileSelector>&>::type source_sel(source_sel_sexp);
+ arrow::r::Input<const std::shared_ptr<fs::FileSystem>&>::type destination_fs(destination_fs_sexp);
+ arrow::r::Input<const std::string&>::type destination_base_dir(destination_base_dir_sexp);
+ arrow::r::Input<int64_t>::type chunk_size(chunk_size_sexp);
+ arrow::r::Input<bool>::type use_threads(use_threads_sexp);
+ fs___CopyFiles(source_fs, source_sel, destination_fs, destination_base_dir, chunk_size, use_threads);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___CopyFiles(SEXP source_fs_sexp, SEXP source_sel_sexp, SEXP destination_fs_sexp, SEXP destination_base_dir_sexp, SEXP chunk_size_sexp, SEXP use_threads_sexp){
+ Rf_error("Cannot call fs___CopyFiles(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
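+// The S3 bindings below are guarded by ARROW_R_WITH_S3 rather than
+// ARROW_R_WITH_ARROW: S3 support is an optional component of the Arrow
+// C++ build, so these wrappers compile to error stubs unless the C++
+// libraries were built with it enabled.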
+// filesystem.cpp
+#if defined(ARROW_R_WITH_S3)
+std::shared_ptr<fs::S3FileSystem> fs___S3FileSystem__create(bool anonymous, std::string access_key, std::string secret_key, std::string session_token, std::string role_arn, std::string session_name, std::string external_id, int load_frequency, std::string region, std::string endpoint_override, std::string scheme, bool background_writes);
+extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP scheme_sexp, SEXP background_writes_sexp){
+BEGIN_CPP11
+ arrow::r::Input<bool>::type anonymous(anonymous_sexp);
+ arrow::r::Input<std::string>::type access_key(access_key_sexp);
+ arrow::r::Input<std::string>::type secret_key(secret_key_sexp);
+ arrow::r::Input<std::string>::type session_token(session_token_sexp);
+ arrow::r::Input<std::string>::type role_arn(role_arn_sexp);
+ arrow::r::Input<std::string>::type session_name(session_name_sexp);
+ arrow::r::Input<std::string>::type external_id(external_id_sexp);
+ arrow::r::Input<int>::type load_frequency(load_frequency_sexp);
+ arrow::r::Input<std::string>::type region(region_sexp);
+ arrow::r::Input<std::string>::type endpoint_override(endpoint_override_sexp);
+ arrow::r::Input<std::string>::type scheme(scheme_sexp);
+ arrow::r::Input<bool>::type background_writes(background_writes_sexp);
+ return cpp11::as_sexp(fs___S3FileSystem__create(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, background_writes));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP scheme_sexp, SEXP background_writes_sexp){
+ Rf_error("Cannot call fs___S3FileSystem__create(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// filesystem.cpp
+#if defined(ARROW_R_WITH_S3)
+std::string fs___S3FileSystem__region(const std::shared_ptr<fs::S3FileSystem>& fs);
+extern "C" SEXP _arrow_fs___S3FileSystem__region(SEXP fs_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<fs::S3FileSystem>&>::type fs(fs_sexp);
+ return cpp11::as_sexp(fs___S3FileSystem__region(fs));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_fs___S3FileSystem__region(SEXP fs_sexp){
+ Rf_error("Cannot call fs___S3FileSystem__region(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Buffer> io___Readable__Read(const std::shared_ptr<arrow::io::Readable>& x, int64_t nbytes);
+extern "C" SEXP _arrow_io___Readable__Read(SEXP x_sexp, SEXP nbytes_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::Readable>&>::type x(x_sexp);
+ arrow::r::Input<int64_t>::type nbytes(nbytes_sexp);
+ return cpp11::as_sexp(io___Readable__Read(x, nbytes));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___Readable__Read(SEXP x_sexp, SEXP nbytes_sexp){
+ Rf_error("Cannot call io___Readable__Read(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void io___InputStream__Close(const std::shared_ptr<arrow::io::InputStream>& x);
+extern "C" SEXP _arrow_io___InputStream__Close(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::InputStream>&>::type x(x_sexp);
+ io___InputStream__Close(x);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___InputStream__Close(SEXP x_sexp){
+ Rf_error("Cannot call io___InputStream__Close(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void io___OutputStream__Close(const std::shared_ptr<arrow::io::OutputStream>& x);
+extern "C" SEXP _arrow_io___OutputStream__Close(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::OutputStream>&>::type x(x_sexp);
+ io___OutputStream__Close(x);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___OutputStream__Close(SEXP x_sexp){
+ Rf_error("Cannot call io___OutputStream__Close(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int64_t io___RandomAccessFile__GetSize(const std::shared_ptr<arrow::io::RandomAccessFile>& x);
+extern "C" SEXP _arrow_io___RandomAccessFile__GetSize(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::RandomAccessFile>&>::type x(x_sexp);
+ return cpp11::as_sexp(io___RandomAccessFile__GetSize(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___RandomAccessFile__GetSize(SEXP x_sexp){
+ Rf_error("Cannot call io___RandomAccessFile__GetSize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool io___RandomAccessFile__supports_zero_copy(const std::shared_ptr<arrow::io::RandomAccessFile>& x);
+extern "C" SEXP _arrow_io___RandomAccessFile__supports_zero_copy(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::RandomAccessFile>&>::type x(x_sexp);
+ return cpp11::as_sexp(io___RandomAccessFile__supports_zero_copy(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___RandomAccessFile__supports_zero_copy(SEXP x_sexp){
+ Rf_error("Cannot call io___RandomAccessFile__supports_zero_copy(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void io___RandomAccessFile__Seek(const std::shared_ptr<arrow::io::RandomAccessFile>& x, int64_t position);
+extern "C" SEXP _arrow_io___RandomAccessFile__Seek(SEXP x_sexp, SEXP position_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::RandomAccessFile>&>::type x(x_sexp);
+ arrow::r::Input<int64_t>::type position(position_sexp);
+ io___RandomAccessFile__Seek(x, position);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___RandomAccessFile__Seek(SEXP x_sexp, SEXP position_sexp){
+ Rf_error("Cannot call io___RandomAccessFile__Seek(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int64_t io___RandomAccessFile__Tell(const std::shared_ptr<arrow::io::RandomAccessFile>& x);
+extern "C" SEXP _arrow_io___RandomAccessFile__Tell(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::RandomAccessFile>&>::type x(x_sexp);
+ return cpp11::as_sexp(io___RandomAccessFile__Tell(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___RandomAccessFile__Tell(SEXP x_sexp){
+ Rf_error("Cannot call io___RandomAccessFile__Tell(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Buffer> io___RandomAccessFile__Read0(const std::shared_ptr<arrow::io::RandomAccessFile>& x);
+extern "C" SEXP _arrow_io___RandomAccessFile__Read0(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::RandomAccessFile>&>::type x(x_sexp);
+ return cpp11::as_sexp(io___RandomAccessFile__Read0(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___RandomAccessFile__Read0(SEXP x_sexp){
+ Rf_error("Cannot call io___RandomAccessFile__Read0(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Buffer> io___RandomAccessFile__ReadAt(const std::shared_ptr<arrow::io::RandomAccessFile>& x, int64_t position, int64_t nbytes);
+extern "C" SEXP _arrow_io___RandomAccessFile__ReadAt(SEXP x_sexp, SEXP position_sexp, SEXP nbytes_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::RandomAccessFile>&>::type x(x_sexp);
+ arrow::r::Input<int64_t>::type position(position_sexp);
+ arrow::r::Input<int64_t>::type nbytes(nbytes_sexp);
+ return cpp11::as_sexp(io___RandomAccessFile__ReadAt(x, position, nbytes));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___RandomAccessFile__ReadAt(SEXP x_sexp, SEXP position_sexp, SEXP nbytes_sexp){
+ Rf_error("Cannot call io___RandomAccessFile__ReadAt(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::io::MemoryMappedFile> io___MemoryMappedFile__Create(const std::string& path, int64_t size);
+extern "C" SEXP _arrow_io___MemoryMappedFile__Create(SEXP path_sexp, SEXP size_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::string&>::type path(path_sexp);
+ arrow::r::Input<int64_t>::type size(size_sexp);
+ return cpp11::as_sexp(io___MemoryMappedFile__Create(path, size));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___MemoryMappedFile__Create(SEXP path_sexp, SEXP size_sexp){
+ Rf_error("Cannot call io___MemoryMappedFile__Create(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::io::MemoryMappedFile> io___MemoryMappedFile__Open(const std::string& path, arrow::io::FileMode::type mode);
+extern "C" SEXP _arrow_io___MemoryMappedFile__Open(SEXP path_sexp, SEXP mode_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::string&>::type path(path_sexp);
+ arrow::r::Input<arrow::io::FileMode::type>::type mode(mode_sexp);
+ return cpp11::as_sexp(io___MemoryMappedFile__Open(path, mode));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___MemoryMappedFile__Open(SEXP path_sexp, SEXP mode_sexp){
+ Rf_error("Cannot call io___MemoryMappedFile__Open(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void io___MemoryMappedFile__Resize(const std::shared_ptr<arrow::io::MemoryMappedFile>& x, int64_t size);
+extern "C" SEXP _arrow_io___MemoryMappedFile__Resize(SEXP x_sexp, SEXP size_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::MemoryMappedFile>&>::type x(x_sexp);
+ arrow::r::Input<int64_t>::type size(size_sexp);
+ io___MemoryMappedFile__Resize(x, size);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___MemoryMappedFile__Resize(SEXP x_sexp, SEXP size_sexp){
+ Rf_error("Cannot call io___MemoryMappedFile__Resize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::io::ReadableFile> io___ReadableFile__Open(const std::string& path);
+extern "C" SEXP _arrow_io___ReadableFile__Open(SEXP path_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::string&>::type path(path_sexp);
+ return cpp11::as_sexp(io___ReadableFile__Open(path));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___ReadableFile__Open(SEXP path_sexp){
+ Rf_error("Cannot call io___ReadableFile__Open(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::io::BufferReader> io___BufferReader__initialize(const std::shared_ptr<arrow::Buffer>& buffer);
+extern "C" SEXP _arrow_io___BufferReader__initialize(SEXP buffer_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Buffer>&>::type buffer(buffer_sexp);
+ return cpp11::as_sexp(io___BufferReader__initialize(buffer));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___BufferReader__initialize(SEXP buffer_sexp){
+ Rf_error("Cannot call io___BufferReader__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void io___Writable__write(const std::shared_ptr<arrow::io::Writable>& stream, const std::shared_ptr<arrow::Buffer>& buf);
+extern "C" SEXP _arrow_io___Writable__write(SEXP stream_sexp, SEXP buf_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::Writable>&>::type stream(stream_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Buffer>&>::type buf(buf_sexp);
+ io___Writable__write(stream, buf);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___Writable__write(SEXP stream_sexp, SEXP buf_sexp){
+ Rf_error("Cannot call io___Writable__write(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int64_t io___OutputStream__Tell(const std::shared_ptr<arrow::io::OutputStream>& stream);
+extern "C" SEXP _arrow_io___OutputStream__Tell(SEXP stream_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::OutputStream>&>::type stream(stream_sexp);
+ return cpp11::as_sexp(io___OutputStream__Tell(stream));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___OutputStream__Tell(SEXP stream_sexp){
+ Rf_error("Cannot call io___OutputStream__Tell(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::io::FileOutputStream> io___FileOutputStream__Open(const std::string& path);
+extern "C" SEXP _arrow_io___FileOutputStream__Open(SEXP path_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::string&>::type path(path_sexp);
+ return cpp11::as_sexp(io___FileOutputStream__Open(path));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___FileOutputStream__Open(SEXP path_sexp){
+ Rf_error("Cannot call io___FileOutputStream__Open(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::io::BufferOutputStream> io___BufferOutputStream__Create(int64_t initial_capacity);
+extern "C" SEXP _arrow_io___BufferOutputStream__Create(SEXP initial_capacity_sexp){
+BEGIN_CPP11
+ arrow::r::Input<int64_t>::type initial_capacity(initial_capacity_sexp);
+ return cpp11::as_sexp(io___BufferOutputStream__Create(initial_capacity));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___BufferOutputStream__Create(SEXP initial_capacity_sexp){
+ Rf_error("Cannot call io___BufferOutputStream__Create(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int64_t io___BufferOutputStream__capacity(const std::shared_ptr<arrow::io::BufferOutputStream>& stream);
+extern "C" SEXP _arrow_io___BufferOutputStream__capacity(SEXP stream_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::BufferOutputStream>&>::type stream(stream_sexp);
+ return cpp11::as_sexp(io___BufferOutputStream__capacity(stream));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___BufferOutputStream__capacity(SEXP stream_sexp){
+ Rf_error("Cannot call io___BufferOutputStream__capacity(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Buffer> io___BufferOutputStream__Finish(const std::shared_ptr<arrow::io::BufferOutputStream>& stream);
+extern "C" SEXP _arrow_io___BufferOutputStream__Finish(SEXP stream_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::BufferOutputStream>&>::type stream(stream_sexp);
+ return cpp11::as_sexp(io___BufferOutputStream__Finish(stream));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___BufferOutputStream__Finish(SEXP stream_sexp){
+ Rf_error("Cannot call io___BufferOutputStream__Finish(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int64_t io___BufferOutputStream__Tell(const std::shared_ptr<arrow::io::BufferOutputStream>& stream);
+extern "C" SEXP _arrow_io___BufferOutputStream__Tell(SEXP stream_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::BufferOutputStream>&>::type stream(stream_sexp);
+ return cpp11::as_sexp(io___BufferOutputStream__Tell(stream));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___BufferOutputStream__Tell(SEXP stream_sexp){
+ Rf_error("Cannot call io___BufferOutputStream__Tell(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// io.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void io___BufferOutputStream__Write(const std::shared_ptr<arrow::io::BufferOutputStream>& stream, cpp11::raws bytes);
+extern "C" SEXP _arrow_io___BufferOutputStream__Write(SEXP stream_sexp, SEXP bytes_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::BufferOutputStream>&>::type stream(stream_sexp);
+ arrow::r::Input<cpp11::raws>::type bytes(bytes_sexp);
+ io___BufferOutputStream__Write(stream, bytes);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_io___BufferOutputStream__Write(SEXP stream_sexp, SEXP bytes_sexp){
+ Rf_error("Cannot call io___BufferOutputStream__Write(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
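+// Likewise, the JSON reader bindings are guarded by ARROW_R_WITH_JSON,
+// which tracks whether the Arrow C++ build enabled its JSON component.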
+// json.cpp
+#if defined(ARROW_R_WITH_JSON)
+std::shared_ptr<arrow::json::ReadOptions> json___ReadOptions__initialize(bool use_threads, int block_size);
+extern "C" SEXP _arrow_json___ReadOptions__initialize(SEXP use_threads_sexp, SEXP block_size_sexp){
+BEGIN_CPP11
+ arrow::r::Input<bool>::type use_threads(use_threads_sexp);
+ arrow::r::Input<int>::type block_size(block_size_sexp);
+ return cpp11::as_sexp(json___ReadOptions__initialize(use_threads, block_size));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_json___ReadOptions__initialize(SEXP use_threads_sexp, SEXP block_size_sexp){
+ Rf_error("Cannot call json___ReadOptions__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// json.cpp
+#if defined(ARROW_R_WITH_JSON)
+std::shared_ptr<arrow::json::ParseOptions> json___ParseOptions__initialize1(bool newlines_in_values);
+extern "C" SEXP _arrow_json___ParseOptions__initialize1(SEXP newlines_in_values_sexp){
+BEGIN_CPP11
+ arrow::r::Input<bool>::type newlines_in_values(newlines_in_values_sexp);
+ return cpp11::as_sexp(json___ParseOptions__initialize1(newlines_in_values));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_json___ParseOptions__initialize1(SEXP newlines_in_values_sexp){
+ Rf_error("Cannot call json___ParseOptions__initialize1(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
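+// Numbered suffixes (initialize1/initialize2 here, ReadTable1/ReadTable2
+// and ReadRowGroup1/ReadRowGroup2 in the Parquet section below)
+// distinguish fixed-arity variants of the same operation; the calling R
+// code in R/arrowExports.R selects which variant to invoke.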
+// json.cpp
+#if defined(ARROW_R_WITH_JSON)
+std::shared_ptr<arrow::json::ParseOptions> json___ParseOptions__initialize2(bool newlines_in_values, const std::shared_ptr<arrow::Schema>& explicit_schema);
+extern "C" SEXP _arrow_json___ParseOptions__initialize2(SEXP newlines_in_values_sexp, SEXP explicit_schema_sexp){
+BEGIN_CPP11
+ arrow::r::Input<bool>::type newlines_in_values(newlines_in_values_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type explicit_schema(explicit_schema_sexp);
+ return cpp11::as_sexp(json___ParseOptions__initialize2(newlines_in_values, explicit_schema));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_json___ParseOptions__initialize2(SEXP newlines_in_values_sexp, SEXP explicit_schema_sexp){
+ Rf_error("Cannot call json___ParseOptions__initialize2(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// json.cpp
+#if defined(ARROW_R_WITH_JSON)
+std::shared_ptr<arrow::json::TableReader> json___TableReader__Make(const std::shared_ptr<arrow::io::InputStream>& input, const std::shared_ptr<arrow::json::ReadOptions>& read_options, const std::shared_ptr<arrow::json::ParseOptions>& parse_options);
+extern "C" SEXP _arrow_json___TableReader__Make(SEXP input_sexp, SEXP read_options_sexp, SEXP parse_options_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::InputStream>&>::type input(input_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::json::ReadOptions>&>::type read_options(read_options_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::json::ParseOptions>&>::type parse_options(parse_options_sexp);
+ return cpp11::as_sexp(json___TableReader__Make(input, read_options, parse_options));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_json___TableReader__Make(SEXP input_sexp, SEXP read_options_sexp, SEXP parse_options_sexp){
+ Rf_error("Cannot call json___TableReader__Make(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// json.cpp
+#if defined(ARROW_R_WITH_JSON)
+std::shared_ptr<arrow::Table> json___TableReader__Read(const std::shared_ptr<arrow::json::TableReader>& table_reader);
+extern "C" SEXP _arrow_json___TableReader__Read(SEXP table_reader_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::json::TableReader>&>::type table_reader(table_reader_sexp);
+ return cpp11::as_sexp(json___TableReader__Read(table_reader));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_json___TableReader__Read(SEXP table_reader_sexp){
+ Rf_error("Cannot call json___TableReader__Read(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// memorypool.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::MemoryPool> MemoryPool__default();
+extern "C" SEXP _arrow_MemoryPool__default(){
+BEGIN_CPP11
+ return cpp11::as_sexp(MemoryPool__default());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_MemoryPool__default(){
+ Rf_error("Cannot call MemoryPool__default(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// memorypool.cpp
+#if defined(ARROW_R_WITH_ARROW)
+double MemoryPool__bytes_allocated(const std::shared_ptr<arrow::MemoryPool>& pool);
+extern "C" SEXP _arrow_MemoryPool__bytes_allocated(SEXP pool_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::MemoryPool>&>::type pool(pool_sexp);
+ return cpp11::as_sexp(MemoryPool__bytes_allocated(pool));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_MemoryPool__bytes_allocated(SEXP pool_sexp){
+ Rf_error("Cannot call MemoryPool__bytes_allocated(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
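+// (bytes_allocated, and max_memory below, return double rather than
+// int64_t, presumably so the values map cleanly onto R's numeric type,
+// which has no native 64-bit integer.)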
+
+// memorypool.cpp
+#if defined(ARROW_R_WITH_ARROW)
+double MemoryPool__max_memory(const std::shared_ptr<arrow::MemoryPool>& pool);
+extern "C" SEXP _arrow_MemoryPool__max_memory(SEXP pool_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::MemoryPool>&>::type pool(pool_sexp);
+ return cpp11::as_sexp(MemoryPool__max_memory(pool));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_MemoryPool__max_memory(SEXP pool_sexp){
+ Rf_error("Cannot call MemoryPool__max_memory(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// memorypool.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string MemoryPool__backend_name(const std::shared_ptr<arrow::MemoryPool>& pool);
+extern "C" SEXP _arrow_MemoryPool__backend_name(SEXP pool_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::MemoryPool>&>::type pool(pool_sexp);
+ return cpp11::as_sexp(MemoryPool__backend_name(pool));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_MemoryPool__backend_name(SEXP pool_sexp){
+ Rf_error("Cannot call MemoryPool__backend_name(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// memorypool.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::vector<std::string> supported_memory_backends();
+extern "C" SEXP _arrow_supported_memory_backends(){
+BEGIN_CPP11
+ return cpp11::as_sexp(supported_memory_backends());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_supported_memory_backends(){
+ Rf_error("Cannot call supported_memory_backends(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// message.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int64_t ipc___Message__body_length(const std::unique_ptr<arrow::ipc::Message>& message);
+extern "C" SEXP _arrow_ipc___Message__body_length(SEXP message_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::unique_ptr<arrow::ipc::Message>&>::type message(message_sexp);
+ return cpp11::as_sexp(ipc___Message__body_length(message));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___Message__body_length(SEXP message_sexp){
+ Rf_error("Cannot call ipc___Message__body_length(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// message.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Buffer> ipc___Message__metadata(const std::unique_ptr<arrow::ipc::Message>& message);
+extern "C" SEXP _arrow_ipc___Message__metadata(SEXP message_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::unique_ptr<arrow::ipc::Message>&>::type message(message_sexp);
+ return cpp11::as_sexp(ipc___Message__metadata(message));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___Message__metadata(SEXP message_sexp){
+ Rf_error("Cannot call ipc___Message__metadata(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// message.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Buffer> ipc___Message__body(const std::unique_ptr<arrow::ipc::Message>& message);
+extern "C" SEXP _arrow_ipc___Message__body(SEXP message_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::unique_ptr<arrow::ipc::Message>&>::type message(message_sexp);
+ return cpp11::as_sexp(ipc___Message__body(message));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___Message__body(SEXP message_sexp){
+ Rf_error("Cannot call ipc___Message__body(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// message.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int64_t ipc___Message__Verify(const std::unique_ptr<arrow::ipc::Message>& message);
+extern "C" SEXP _arrow_ipc___Message__Verify(SEXP message_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::unique_ptr<arrow::ipc::Message>&>::type message(message_sexp);
+ return cpp11::as_sexp(ipc___Message__Verify(message));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___Message__Verify(SEXP message_sexp){
+ Rf_error("Cannot call ipc___Message__Verify(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// message.cpp
+#if defined(ARROW_R_WITH_ARROW)
+arrow::ipc::MessageType ipc___Message__type(const std::unique_ptr<arrow::ipc::Message>& message);
+extern "C" SEXP _arrow_ipc___Message__type(SEXP message_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::unique_ptr<arrow::ipc::Message>&>::type message(message_sexp);
+ return cpp11::as_sexp(ipc___Message__type(message));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___Message__type(SEXP message_sexp){
+ Rf_error("Cannot call ipc___Message__type(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// message.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool ipc___Message__Equals(const std::unique_ptr<arrow::ipc::Message>& x, const std::unique_ptr<arrow::ipc::Message>& y);
+extern "C" SEXP _arrow_ipc___Message__Equals(SEXP x_sexp, SEXP y_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::unique_ptr<arrow::ipc::Message>&>::type x(x_sexp);
+ arrow::r::Input<const std::unique_ptr<arrow::ipc::Message>&>::type y(y_sexp);
+ return cpp11::as_sexp(ipc___Message__Equals(x, y));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___Message__Equals(SEXP x_sexp, SEXP y_sexp){
+ Rf_error("Cannot call ipc___Message__Equals(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// message.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::RecordBatch> ipc___ReadRecordBatch__Message__Schema(const std::unique_ptr<arrow::ipc::Message>& message, const std::shared_ptr<arrow::Schema>& schema);
+extern "C" SEXP _arrow_ipc___ReadRecordBatch__Message__Schema(SEXP message_sexp, SEXP schema_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::unique_ptr<arrow::ipc::Message>&>::type message(message_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schema(schema_sexp);
+ return cpp11::as_sexp(ipc___ReadRecordBatch__Message__Schema(message, schema));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___ReadRecordBatch__Message__Schema(SEXP message_sexp, SEXP schema_sexp){
+ Rf_error("Cannot call ipc___ReadRecordBatch__Message__Schema(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// message.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Schema> ipc___ReadSchema_InputStream(const std::shared_ptr<arrow::io::InputStream>& stream);
+extern "C" SEXP _arrow_ipc___ReadSchema_InputStream(SEXP stream_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::InputStream>&>::type stream(stream_sexp);
+ return cpp11::as_sexp(ipc___ReadSchema_InputStream(stream));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___ReadSchema_InputStream(SEXP stream_sexp){
+ Rf_error("Cannot call ipc___ReadSchema_InputStream(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// message.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Schema> ipc___ReadSchema_Message(const std::unique_ptr<arrow::ipc::Message>& message);
+extern "C" SEXP _arrow_ipc___ReadSchema_Message(SEXP message_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::unique_ptr<arrow::ipc::Message>&>::type message(message_sexp);
+ return cpp11::as_sexp(ipc___ReadSchema_Message(message));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___ReadSchema_Message(SEXP message_sexp){
+ Rf_error("Cannot call ipc___ReadSchema_Message(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// message.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::ipc::MessageReader> ipc___MessageReader__Open(const std::shared_ptr<arrow::io::InputStream>& stream);
+extern "C" SEXP _arrow_ipc___MessageReader__Open(SEXP stream_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::InputStream>&>::type stream(stream_sexp);
+ return cpp11::as_sexp(ipc___MessageReader__Open(stream));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___MessageReader__Open(SEXP stream_sexp){
+ Rf_error("Cannot call ipc___MessageReader__Open(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// message.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::ipc::Message> ipc___MessageReader__ReadNextMessage(const std::unique_ptr<arrow::ipc::MessageReader>& reader);
+extern "C" SEXP _arrow_ipc___MessageReader__ReadNextMessage(SEXP reader_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::unique_ptr<arrow::ipc::MessageReader>&>::type reader(reader_sexp);
+ return cpp11::as_sexp(ipc___MessageReader__ReadNextMessage(reader));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___MessageReader__ReadNextMessage(SEXP reader_sexp){
+ Rf_error("Cannot call ipc___MessageReader__ReadNextMessage(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// message.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::ipc::Message> ipc___ReadMessage(const std::shared_ptr<arrow::io::InputStream>& stream);
+extern "C" SEXP _arrow_ipc___ReadMessage(SEXP stream_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::InputStream>&>::type stream(stream_sexp);
+ return cpp11::as_sexp(ipc___ReadMessage(stream));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___ReadMessage(SEXP stream_sexp){
+ Rf_error("Cannot call ipc___ReadMessage(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
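+// The Parquet bindings are guarded by ARROW_R_WITH_PARQUET; as with S3
+// and JSON, each wrapper degrades to an informative error stub when the
+// Arrow C++ libraries were built without Parquet support.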
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+std::shared_ptr<parquet::ArrowReaderProperties> parquet___arrow___ArrowReaderProperties__Make(bool use_threads);
+extern "C" SEXP _arrow_parquet___arrow___ArrowReaderProperties__Make(SEXP use_threads_sexp){
+BEGIN_CPP11
+ arrow::r::Input<bool>::type use_threads(use_threads_sexp);
+ return cpp11::as_sexp(parquet___arrow___ArrowReaderProperties__Make(use_threads));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___ArrowReaderProperties__Make(SEXP use_threads_sexp){
+ Rf_error("Cannot call parquet___arrow___ArrowReaderProperties__Make(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+void parquet___arrow___ArrowReaderProperties__set_use_threads(const std::shared_ptr<parquet::ArrowReaderProperties>& properties, bool use_threads);
+extern "C" SEXP _arrow_parquet___arrow___ArrowReaderProperties__set_use_threads(SEXP properties_sexp, SEXP use_threads_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::ArrowReaderProperties>&>::type properties(properties_sexp);
+ arrow::r::Input<bool>::type use_threads(use_threads_sexp);
+ parquet___arrow___ArrowReaderProperties__set_use_threads(properties, use_threads);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___ArrowReaderProperties__set_use_threads(SEXP properties_sexp, SEXP use_threads_sexp){
+ Rf_error("Cannot call parquet___arrow___ArrowReaderProperties__set_use_threads(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+bool parquet___arrow___ArrowReaderProperties__get_use_threads(const std::shared_ptr<parquet::ArrowReaderProperties>& properties, bool use_threads);
+extern "C" SEXP _arrow_parquet___arrow___ArrowReaderProperties__get_use_threads(SEXP properties_sexp, SEXP use_threads_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::ArrowReaderProperties>&>::type properties(properties_sexp);
+ arrow::r::Input<bool>::type use_threads(use_threads_sexp);
+ return cpp11::as_sexp(parquet___arrow___ArrowReaderProperties__get_use_threads(properties, use_threads));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___ArrowReaderProperties__get_use_threads(SEXP properties_sexp, SEXP use_threads_sexp){
+ Rf_error("Cannot call parquet___arrow___ArrowReaderProperties__get_use_threads(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+bool parquet___arrow___ArrowReaderProperties__get_read_dictionary(const std::shared_ptr<parquet::ArrowReaderProperties>& properties, int column_index);
+extern "C" SEXP _arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary(SEXP properties_sexp, SEXP column_index_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::ArrowReaderProperties>&>::type properties(properties_sexp);
+ arrow::r::Input<int>::type column_index(column_index_sexp);
+ return cpp11::as_sexp(parquet___arrow___ArrowReaderProperties__get_read_dictionary(properties, column_index));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary(SEXP properties_sexp, SEXP column_index_sexp){
+ Rf_error("Cannot call parquet___arrow___ArrowReaderProperties__get_read_dictionary(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+void parquet___arrow___ArrowReaderProperties__set_read_dictionary(const std::shared_ptr<parquet::ArrowReaderProperties>& properties, int column_index, bool read_dict);
+extern "C" SEXP _arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary(SEXP properties_sexp, SEXP column_index_sexp, SEXP read_dict_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::ArrowReaderProperties>&>::type properties(properties_sexp);
+ arrow::r::Input<int>::type column_index(column_index_sexp);
+ arrow::r::Input<bool>::type read_dict(read_dict_sexp);
+ parquet___arrow___ArrowReaderProperties__set_read_dictionary(properties, column_index, read_dict);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary(SEXP properties_sexp, SEXP column_index_sexp, SEXP read_dict_sexp){
+ Rf_error("Cannot call parquet___arrow___ArrowReaderProperties__set_read_dictionary(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+std::shared_ptr<parquet::arrow::FileReader> parquet___arrow___FileReader__OpenFile(const std::shared_ptr<arrow::io::RandomAccessFile>& file, const std::shared_ptr<parquet::ArrowReaderProperties>& props);
+extern "C" SEXP _arrow_parquet___arrow___FileReader__OpenFile(SEXP file_sexp, SEXP props_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::RandomAccessFile>&>::type file(file_sexp);
+ arrow::r::Input<const std::shared_ptr<parquet::ArrowReaderProperties>&>::type props(props_sexp);
+ return cpp11::as_sexp(parquet___arrow___FileReader__OpenFile(file, props));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___FileReader__OpenFile(SEXP file_sexp, SEXP props_sexp){
+ Rf_error("Cannot call parquet___arrow___FileReader__OpenFile(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+std::shared_ptr<arrow::Table> parquet___arrow___FileReader__ReadTable1(const std::shared_ptr<parquet::arrow::FileReader>& reader);
+extern "C" SEXP _arrow_parquet___arrow___FileReader__ReadTable1(SEXP reader_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::arrow::FileReader>&>::type reader(reader_sexp);
+ return cpp11::as_sexp(parquet___arrow___FileReader__ReadTable1(reader));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___FileReader__ReadTable1(SEXP reader_sexp){
+ Rf_error("Cannot call parquet___arrow___FileReader__ReadTable1(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+std::shared_ptr<arrow::Table> parquet___arrow___FileReader__ReadTable2(const std::shared_ptr<parquet::arrow::FileReader>& reader, const std::vector<int>& column_indices);
+extern "C" SEXP _arrow_parquet___arrow___FileReader__ReadTable2(SEXP reader_sexp, SEXP column_indices_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::arrow::FileReader>&>::type reader(reader_sexp);
+ arrow::r::Input<const std::vector<int>&>::type column_indices(column_indices_sexp);
+ return cpp11::as_sexp(parquet___arrow___FileReader__ReadTable2(reader, column_indices));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___FileReader__ReadTable2(SEXP reader_sexp, SEXP column_indices_sexp){
+ Rf_error("Cannot call parquet___arrow___FileReader__ReadTable2(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+std::shared_ptr<arrow::Table> parquet___arrow___FileReader__ReadRowGroup1(const std::shared_ptr<parquet::arrow::FileReader>& reader, int i);
+extern "C" SEXP _arrow_parquet___arrow___FileReader__ReadRowGroup1(SEXP reader_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::arrow::FileReader>&>::type reader(reader_sexp);
+ arrow::r::Input<int>::type i(i_sexp);
+ return cpp11::as_sexp(parquet___arrow___FileReader__ReadRowGroup1(reader, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___FileReader__ReadRowGroup1(SEXP reader_sexp, SEXP i_sexp){
+ Rf_error("Cannot call parquet___arrow___FileReader__ReadRowGroup1(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+std::shared_ptr<arrow::Table> parquet___arrow___FileReader__ReadRowGroup2(const std::shared_ptr<parquet::arrow::FileReader>& reader, int i, const std::vector<int>& column_indices);
+extern "C" SEXP _arrow_parquet___arrow___FileReader__ReadRowGroup2(SEXP reader_sexp, SEXP i_sexp, SEXP column_indices_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::arrow::FileReader>&>::type reader(reader_sexp);
+ arrow::r::Input<int>::type i(i_sexp);
+ arrow::r::Input<const std::vector<int>&>::type column_indices(column_indices_sexp);
+ return cpp11::as_sexp(parquet___arrow___FileReader__ReadRowGroup2(reader, i, column_indices));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___FileReader__ReadRowGroup2(SEXP reader_sexp, SEXP i_sexp, SEXP column_indices_sexp){
+ Rf_error("Cannot call parquet___arrow___FileReader__ReadRowGroup2(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+std::shared_ptr<arrow::Table> parquet___arrow___FileReader__ReadRowGroups1(const std::shared_ptr<parquet::arrow::FileReader>& reader, const std::vector<int>& row_groups);
+extern "C" SEXP _arrow_parquet___arrow___FileReader__ReadRowGroups1(SEXP reader_sexp, SEXP row_groups_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::arrow::FileReader>&>::type reader(reader_sexp);
+ arrow::r::Input<const std::vector<int>&>::type row_groups(row_groups_sexp);
+ return cpp11::as_sexp(parquet___arrow___FileReader__ReadRowGroups1(reader, row_groups));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___FileReader__ReadRowGroups1(SEXP reader_sexp, SEXP row_groups_sexp){
+ Rf_error("Cannot call parquet___arrow___FileReader__ReadRowGroups1(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+std::shared_ptr<arrow::Table> parquet___arrow___FileReader__ReadRowGroups2(const std::shared_ptr<parquet::arrow::FileReader>& reader, const std::vector<int>& row_groups, const std::vector<int>& column_indices);
+extern "C" SEXP _arrow_parquet___arrow___FileReader__ReadRowGroups2(SEXP reader_sexp, SEXP row_groups_sexp, SEXP column_indices_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::arrow::FileReader>&>::type reader(reader_sexp);
+ arrow::r::Input<const std::vector<int>&>::type row_groups(row_groups_sexp);
+ arrow::r::Input<const std::vector<int>&>::type column_indices(column_indices_sexp);
+ return cpp11::as_sexp(parquet___arrow___FileReader__ReadRowGroups2(reader, row_groups, column_indices));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___FileReader__ReadRowGroups2(SEXP reader_sexp, SEXP row_groups_sexp, SEXP column_indices_sexp){
+ Rf_error("Cannot call parquet___arrow___FileReader__ReadRowGroups2(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+int64_t parquet___arrow___FileReader__num_rows(const std::shared_ptr<parquet::arrow::FileReader>& reader);
+extern "C" SEXP _arrow_parquet___arrow___FileReader__num_rows(SEXP reader_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::arrow::FileReader>&>::type reader(reader_sexp);
+ return cpp11::as_sexp(parquet___arrow___FileReader__num_rows(reader));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___FileReader__num_rows(SEXP reader_sexp){
+ Rf_error("Cannot call parquet___arrow___FileReader__num_rows(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+int parquet___arrow___FileReader__num_columns(const std::shared_ptr<parquet::arrow::FileReader>& reader);
+extern "C" SEXP _arrow_parquet___arrow___FileReader__num_columns(SEXP reader_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::arrow::FileReader>&>::type reader(reader_sexp);
+ return cpp11::as_sexp(parquet___arrow___FileReader__num_columns(reader));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___FileReader__num_columns(SEXP reader_sexp){
+ Rf_error("Cannot call parquet___arrow___FileReader__num_columns(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+int parquet___arrow___FileReader__num_row_groups(const std::shared_ptr<parquet::arrow::FileReader>& reader);
+extern "C" SEXP _arrow_parquet___arrow___FileReader__num_row_groups(SEXP reader_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::arrow::FileReader>&>::type reader(reader_sexp);
+ return cpp11::as_sexp(parquet___arrow___FileReader__num_row_groups(reader));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___FileReader__num_row_groups(SEXP reader_sexp){
+ Rf_error("Cannot call parquet___arrow___FileReader__num_row_groups(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+std::shared_ptr<arrow::ChunkedArray> parquet___arrow___FileReader__ReadColumn(const std::shared_ptr<parquet::arrow::FileReader>& reader, int i);
+extern "C" SEXP _arrow_parquet___arrow___FileReader__ReadColumn(SEXP reader_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::arrow::FileReader>&>::type reader(reader_sexp);
+ arrow::r::Input<int>::type i(i_sexp);
+ return cpp11::as_sexp(parquet___arrow___FileReader__ReadColumn(reader, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___FileReader__ReadColumn(SEXP reader_sexp, SEXP i_sexp){
+ Rf_error("Cannot call parquet___arrow___FileReader__ReadColumn(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+std::shared_ptr<parquet::ArrowWriterProperties> parquet___ArrowWriterProperties___create(bool allow_truncated_timestamps, bool use_deprecated_int96_timestamps, int timestamp_unit);
+extern "C" SEXP _arrow_parquet___ArrowWriterProperties___create(SEXP allow_truncated_timestamps_sexp, SEXP use_deprecated_int96_timestamps_sexp, SEXP timestamp_unit_sexp){
+BEGIN_CPP11
+ arrow::r::Input<bool>::type allow_truncated_timestamps(allow_truncated_timestamps_sexp);
+ arrow::r::Input<bool>::type use_deprecated_int96_timestamps(use_deprecated_int96_timestamps_sexp);
+ arrow::r::Input<int>::type timestamp_unit(timestamp_unit_sexp);
+ return cpp11::as_sexp(parquet___ArrowWriterProperties___create(allow_truncated_timestamps, use_deprecated_int96_timestamps, timestamp_unit));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___ArrowWriterProperties___create(SEXP allow_truncated_timestamps_sexp, SEXP use_deprecated_int96_timestamps_sexp, SEXP timestamp_unit_sexp){
+ Rf_error("Cannot call parquet___ArrowWriterProperties___create(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+std::shared_ptr<parquet::WriterPropertiesBuilder> parquet___WriterProperties___Builder__create();
+extern "C" SEXP _arrow_parquet___WriterProperties___Builder__create(){
+BEGIN_CPP11
+ return cpp11::as_sexp(parquet___WriterProperties___Builder__create());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___WriterProperties___Builder__create(){
+ Rf_error("Cannot call parquet___WriterProperties___Builder__create(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+void parquet___WriterProperties___Builder__version(const std::shared_ptr<parquet::WriterPropertiesBuilder>& builder, const parquet::ParquetVersion::type& version);
+extern "C" SEXP _arrow_parquet___WriterProperties___Builder__version(SEXP builder_sexp, SEXP version_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::WriterPropertiesBuilder>&>::type builder(builder_sexp);
+ arrow::r::Input<const parquet::ParquetVersion::type&>::type version(version_sexp);
+ parquet___WriterProperties___Builder__version(builder, version);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___WriterProperties___Builder__version(SEXP builder_sexp, SEXP version_sexp){
+ Rf_error("Cannot call parquet___WriterProperties___Builder__version(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+void parquet___ArrowWriterProperties___Builder__set_compressions(const std::shared_ptr<parquet::WriterPropertiesBuilder>& builder, const std::vector<std::string>& paths, cpp11::integers types);
+extern "C" SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_compressions(SEXP builder_sexp, SEXP paths_sexp, SEXP types_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::WriterPropertiesBuilder>&>::type builder(builder_sexp);
+ arrow::r::Input<const std::vector<std::string>&>::type paths(paths_sexp);
+ arrow::r::Input<cpp11::integers>::type types(types_sexp);
+ parquet___ArrowWriterProperties___Builder__set_compressions(builder, paths, types);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_compressions(SEXP builder_sexp, SEXP paths_sexp, SEXP types_sexp){
+ Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__set_compressions(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+void parquet___ArrowWriterProperties___Builder__set_compression_levels(const std::shared_ptr<parquet::WriterPropertiesBuilder>& builder, const std::vector<std::string>& paths, cpp11::integers levels);
+extern "C" SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels(SEXP builder_sexp, SEXP paths_sexp, SEXP levels_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::WriterPropertiesBuilder>&>::type builder(builder_sexp);
+ arrow::r::Input<const std::vector<std::string>&>::type paths(paths_sexp);
+ arrow::r::Input<cpp11::integers>::type levels(levels_sexp);
+ parquet___ArrowWriterProperties___Builder__set_compression_levels(builder, paths, levels);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels(SEXP builder_sexp, SEXP paths_sexp, SEXP levels_sexp){
+ Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__set_compression_levels(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+void parquet___ArrowWriterProperties___Builder__set_use_dictionary(const std::shared_ptr<parquet::WriterPropertiesBuilder>& builder, const std::vector<std::string>& paths, cpp11::logicals use_dictionary);
+extern "C" SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary(SEXP builder_sexp, SEXP paths_sexp, SEXP use_dictionary_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::WriterPropertiesBuilder>&>::type builder(builder_sexp);
+ arrow::r::Input<const std::vector<std::string>&>::type paths(paths_sexp);
+ arrow::r::Input<cpp11::logicals>::type use_dictionary(use_dictionary_sexp);
+ parquet___ArrowWriterProperties___Builder__set_use_dictionary(builder, paths, use_dictionary);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary(SEXP builder_sexp, SEXP paths_sexp, SEXP use_dictionary_sexp){
+ Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__set_use_dictionary(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+void parquet___ArrowWriterProperties___Builder__set_write_statistics(const std::shared_ptr<parquet::WriterPropertiesBuilder>& builder, const std::vector<std::string>& paths, cpp11::logicals write_statistics);
+extern "C" SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics(SEXP builder_sexp, SEXP paths_sexp, SEXP write_statistics_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::WriterPropertiesBuilder>&>::type builder(builder_sexp);
+ arrow::r::Input<const std::vector<std::string>&>::type paths(paths_sexp);
+ arrow::r::Input<cpp11::logicals>::type write_statistics(write_statistics_sexp);
+ parquet___ArrowWriterProperties___Builder__set_write_statistics(builder, paths, write_statistics);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics(SEXP builder_sexp, SEXP paths_sexp, SEXP write_statistics_sexp){
+ Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__set_write_statistics(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+void parquet___ArrowWriterProperties___Builder__data_page_size(const std::shared_ptr<parquet::WriterPropertiesBuilder>& builder, int64_t data_page_size);
+extern "C" SEXP _arrow_parquet___ArrowWriterProperties___Builder__data_page_size(SEXP builder_sexp, SEXP data_page_size_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::WriterPropertiesBuilder>&>::type builder(builder_sexp);
+ arrow::r::Input<int64_t>::type data_page_size(data_page_size_sexp);
+ parquet___ArrowWriterProperties___Builder__data_page_size(builder, data_page_size);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___ArrowWriterProperties___Builder__data_page_size(SEXP builder_sexp, SEXP data_page_size_sexp){
+ Rf_error("Cannot call parquet___ArrowWriterProperties___Builder__data_page_size(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+std::shared_ptr<parquet::WriterProperties> parquet___WriterProperties___Builder__build(const std::shared_ptr<parquet::WriterPropertiesBuilder>& builder);
+extern "C" SEXP _arrow_parquet___WriterProperties___Builder__build(SEXP builder_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::WriterPropertiesBuilder>&>::type builder(builder_sexp);
+ return cpp11::as_sexp(parquet___WriterProperties___Builder__build(builder));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___WriterProperties___Builder__build(SEXP builder_sexp){
+ Rf_error("Cannot call parquet___WriterProperties___Builder__build(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+std::shared_ptr<parquet::arrow::FileWriter> parquet___arrow___ParquetFileWriter__Open(const std::shared_ptr<arrow::Schema>& schema, const std::shared_ptr<arrow::io::OutputStream>& sink, const std::shared_ptr<parquet::WriterProperties>& properties, const std::shared_ptr<parquet::ArrowWriterProperties>& arrow_properties);
+extern "C" SEXP _arrow_parquet___arrow___ParquetFileWriter__Open(SEXP schema_sexp, SEXP sink_sexp, SEXP properties_sexp, SEXP arrow_properties_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schema(schema_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::io::OutputStream>&>::type sink(sink_sexp);
+ arrow::r::Input<const std::shared_ptr<parquet::WriterProperties>&>::type properties(properties_sexp);
+ arrow::r::Input<const std::shared_ptr<parquet::ArrowWriterProperties>&>::type arrow_properties(arrow_properties_sexp);
+ return cpp11::as_sexp(parquet___arrow___ParquetFileWriter__Open(schema, sink, properties, arrow_properties));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___ParquetFileWriter__Open(SEXP schema_sexp, SEXP sink_sexp, SEXP properties_sexp, SEXP arrow_properties_sexp){
+ Rf_error("Cannot call parquet___arrow___ParquetFileWriter__Open(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+void parquet___arrow___FileWriter__WriteTable(const std::shared_ptr<parquet::arrow::FileWriter>& writer, const std::shared_ptr<arrow::Table>& table, int64_t chunk_size);
+extern "C" SEXP _arrow_parquet___arrow___FileWriter__WriteTable(SEXP writer_sexp, SEXP table_sexp, SEXP chunk_size_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::arrow::FileWriter>&>::type writer(writer_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ arrow::r::Input<int64_t>::type chunk_size(chunk_size_sexp);
+ parquet___arrow___FileWriter__WriteTable(writer, table, chunk_size);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___FileWriter__WriteTable(SEXP writer_sexp, SEXP table_sexp, SEXP chunk_size_sexp){
+ Rf_error("Cannot call parquet___arrow___FileWriter__WriteTable(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+void parquet___arrow___FileWriter__Close(const std::shared_ptr<parquet::arrow::FileWriter>& writer);
+extern "C" SEXP _arrow_parquet___arrow___FileWriter__Close(SEXP writer_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::arrow::FileWriter>&>::type writer(writer_sexp);
+ parquet___arrow___FileWriter__Close(writer);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___FileWriter__Close(SEXP writer_sexp){
+ Rf_error("Cannot call parquet___arrow___FileWriter__Close(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+void parquet___arrow___WriteTable(const std::shared_ptr<arrow::Table>& table, const std::shared_ptr<arrow::io::OutputStream>& sink, const std::shared_ptr<parquet::WriterProperties>& properties, const std::shared_ptr<parquet::ArrowWriterProperties>& arrow_properties);
+extern "C" SEXP _arrow_parquet___arrow___WriteTable(SEXP table_sexp, SEXP sink_sexp, SEXP properties_sexp, SEXP arrow_properties_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::io::OutputStream>&>::type sink(sink_sexp);
+ arrow::r::Input<const std::shared_ptr<parquet::WriterProperties>&>::type properties(properties_sexp);
+ arrow::r::Input<const std::shared_ptr<parquet::ArrowWriterProperties>&>::type arrow_properties(arrow_properties_sexp);
+ parquet___arrow___WriteTable(table, sink, properties, arrow_properties);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___WriteTable(SEXP table_sexp, SEXP sink_sexp, SEXP properties_sexp, SEXP arrow_properties_sexp){
+ Rf_error("Cannot call parquet___arrow___WriteTable(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
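+// The writer entry points above compose in a fixed order. A hedged sketch of
+// the sequence (the actual orchestration is presumably wired up in
+// R/parquet.R; argument values here are placeholders, not real defaults):
+//   builder <- parquet___WriterProperties___Builder__create()
+//   parquet___ArrowWriterProperties___Builder__data_page_size(builder, page_size)
+//   props <- parquet___WriterProperties___Builder__build(builder)
+//   arrow_props <- parquet___ArrowWriterProperties___create(allow_truncated, use_int96, unit)
+//   writer <- parquet___arrow___ParquetFileWriter__Open(schema, sink, props, arrow_props)
+//   parquet___arrow___FileWriter__WriteTable(writer, table, chunk_size)
+//   parquet___arrow___FileWriter__Close(writer)
+// parquet___arrow___WriteTable() above bundles open/write/close into a
+// single call for the common whole-table case.
+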
+// parquet.cpp
+#if defined(ARROW_R_WITH_PARQUET)
+std::shared_ptr<arrow::Schema> parquet___arrow___FileReader__GetSchema(const std::shared_ptr<parquet::arrow::FileReader>& reader);
+extern "C" SEXP _arrow_parquet___arrow___FileReader__GetSchema(SEXP reader_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<parquet::arrow::FileReader>&>::type reader(reader_sexp);
+ return cpp11::as_sexp(parquet___arrow___FileReader__GetSchema(reader));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_parquet___arrow___FileReader__GetSchema(SEXP reader_sexp){
+ Rf_error("Cannot call parquet___arrow___FileReader__GetSchema(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
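+// The py-to-r.cpp wrappers below are the Arrow C data interface glue:
+// allocating and freeing ArrowSchema / ArrowArray / ArrowArrayStream structs
+// and importing from or exporting to them. Note that the feature guard
+// switches from ARROW_R_WITH_PARQUET to the baseline ARROW_R_WITH_ARROW.
+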
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+arrow::r::Pointer<struct ArrowSchema> allocate_arrow_schema();
+extern "C" SEXP _arrow_allocate_arrow_schema(){
+BEGIN_CPP11
+ return cpp11::as_sexp(allocate_arrow_schema());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_allocate_arrow_schema(){
+ Rf_error("Cannot call allocate_arrow_schema(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void delete_arrow_schema(arrow::r::Pointer<struct ArrowSchema> ptr);
+extern "C" SEXP _arrow_delete_arrow_schema(SEXP ptr_sexp){
+BEGIN_CPP11
+ arrow::r::Input<arrow::r::Pointer<struct ArrowSchema>>::type ptr(ptr_sexp);
+ delete_arrow_schema(ptr);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_delete_arrow_schema(SEXP ptr_sexp){
+ Rf_error("Cannot call delete_arrow_schema(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+arrow::r::Pointer<struct ArrowArray> allocate_arrow_array();
+extern "C" SEXP _arrow_allocate_arrow_array(){
+BEGIN_CPP11
+ return cpp11::as_sexp(allocate_arrow_array());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_allocate_arrow_array(){
+ Rf_error("Cannot call allocate_arrow_array(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void delete_arrow_array(arrow::r::Pointer<struct ArrowArray> ptr);
+extern "C" SEXP _arrow_delete_arrow_array(SEXP ptr_sexp){
+BEGIN_CPP11
+ arrow::r::Input<arrow::r::Pointer<struct ArrowArray>>::type ptr(ptr_sexp);
+ delete_arrow_array(ptr);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_delete_arrow_array(SEXP ptr_sexp){
+ Rf_error("Cannot call delete_arrow_array(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+arrow::r::Pointer<struct ArrowArrayStream> allocate_arrow_array_stream();
+extern "C" SEXP _arrow_allocate_arrow_array_stream(){
+BEGIN_CPP11
+ return cpp11::as_sexp(allocate_arrow_array_stream());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_allocate_arrow_array_stream(){
+ Rf_error("Cannot call allocate_arrow_array_stream(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void delete_arrow_array_stream(arrow::r::Pointer<struct ArrowArrayStream> ptr);
+extern "C" SEXP _arrow_delete_arrow_array_stream(SEXP ptr_sexp){
+BEGIN_CPP11
+ arrow::r::Input<arrow::r::Pointer<struct ArrowArrayStream>>::type ptr(ptr_sexp);
+ delete_arrow_array_stream(ptr);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_delete_arrow_array_stream(SEXP ptr_sexp){
+ Rf_error("Cannot call delete_arrow_array_stream(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Array> ImportArray(arrow::r::Pointer<struct ArrowArray> array, arrow::r::Pointer<struct ArrowSchema> schema);
+extern "C" SEXP _arrow_ImportArray(SEXP array_sexp, SEXP schema_sexp){
+BEGIN_CPP11
+ arrow::r::Input<arrow::r::Pointer<struct ArrowArray>>::type array(array_sexp);
+ arrow::r::Input<arrow::r::Pointer<struct ArrowSchema>>::type schema(schema_sexp);
+ return cpp11::as_sexp(ImportArray(array, schema));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ImportArray(SEXP array_sexp, SEXP schema_sexp){
+ Rf_error("Cannot call ImportArray(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::RecordBatch> ImportRecordBatch(arrow::r::Pointer<struct ArrowArray> array, arrow::r::Pointer<struct ArrowSchema> schema);
+extern "C" SEXP _arrow_ImportRecordBatch(SEXP array_sexp, SEXP schema_sexp){
+BEGIN_CPP11
+ arrow::r::Input<arrow::r::Pointer<struct ArrowArray>>::type array(array_sexp);
+ arrow::r::Input<arrow::r::Pointer<struct ArrowSchema>>::type schema(schema_sexp);
+ return cpp11::as_sexp(ImportRecordBatch(array, schema));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ImportRecordBatch(SEXP array_sexp, SEXP schema_sexp){
+ Rf_error("Cannot call ImportRecordBatch(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Schema> ImportSchema(arrow::r::Pointer<struct ArrowSchema> schema);
+extern "C" SEXP _arrow_ImportSchema(SEXP schema_sexp){
+BEGIN_CPP11
+ arrow::r::Input<arrow::r::Pointer<struct ArrowSchema>>::type schema(schema_sexp);
+ return cpp11::as_sexp(ImportSchema(schema));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ImportSchema(SEXP schema_sexp){
+ Rf_error("Cannot call ImportSchema(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Field> ImportField(arrow::r::Pointer<struct ArrowSchema> field);
+extern "C" SEXP _arrow_ImportField(SEXP field_sexp){
+BEGIN_CPP11
+ arrow::r::Input<arrow::r::Pointer<struct ArrowSchema>>::type field(field_sexp);
+ return cpp11::as_sexp(ImportField(field));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ImportField(SEXP field_sexp){
+ Rf_error("Cannot call ImportField(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> ImportType(arrow::r::Pointer<struct ArrowSchema> type);
+extern "C" SEXP _arrow_ImportType(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<arrow::r::Pointer<struct ArrowSchema>>::type type(type_sexp);
+ return cpp11::as_sexp(ImportType(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ImportType(SEXP type_sexp){
+ Rf_error("Cannot call ImportType(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::RecordBatchReader> ImportRecordBatchReader(arrow::r::Pointer<struct ArrowArrayStream> stream);
+extern "C" SEXP _arrow_ImportRecordBatchReader(SEXP stream_sexp){
+BEGIN_CPP11
+ arrow::r::Input<arrow::r::Pointer<struct ArrowArrayStream>>::type stream(stream_sexp);
+ return cpp11::as_sexp(ImportRecordBatchReader(stream));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ImportRecordBatchReader(SEXP stream_sexp){
+ Rf_error("Cannot call ImportRecordBatchReader(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void ExportType(const std::shared_ptr<arrow::DataType>& type, arrow::r::Pointer<struct ArrowSchema> ptr);
+extern "C" SEXP _arrow_ExportType(SEXP type_sexp, SEXP ptr_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::DataType>&>::type type(type_sexp);
+ arrow::r::Input<arrow::r::Pointer<struct ArrowSchema>>::type ptr(ptr_sexp);
+ ExportType(type, ptr);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExportType(SEXP type_sexp, SEXP ptr_sexp){
+ Rf_error("Cannot call ExportType(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void ExportField(const std::shared_ptr<arrow::Field>& field, arrow::r::Pointer<struct ArrowSchema> ptr);
+extern "C" SEXP _arrow_ExportField(SEXP field_sexp, SEXP ptr_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Field>&>::type field(field_sexp);
+ arrow::r::Input<arrow::r::Pointer<struct ArrowSchema>>::type ptr(ptr_sexp);
+ ExportField(field, ptr);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExportField(SEXP field_sexp, SEXP ptr_sexp){
+ Rf_error("Cannot call ExportField(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void ExportSchema(const std::shared_ptr<arrow::Schema>& schema, arrow::r::Pointer<struct ArrowSchema> ptr);
+extern "C" SEXP _arrow_ExportSchema(SEXP schema_sexp, SEXP ptr_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schema(schema_sexp);
+ arrow::r::Input<arrow::r::Pointer<struct ArrowSchema>>::type ptr(ptr_sexp);
+ ExportSchema(schema, ptr);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExportSchema(SEXP schema_sexp, SEXP ptr_sexp){
+ Rf_error("Cannot call ExportSchema(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void ExportArray(const std::shared_ptr<arrow::Array>& array, arrow::r::Pointer<struct ArrowArray> array_ptr, arrow::r::Pointer<struct ArrowSchema> schema_ptr);
+extern "C" SEXP _arrow_ExportArray(SEXP array_sexp, SEXP array_ptr_sexp, SEXP schema_ptr_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type array(array_sexp);
+ arrow::r::Input<arrow::r::Pointer<struct ArrowArray>>::type array_ptr(array_ptr_sexp);
+ arrow::r::Input<arrow::r::Pointer<struct ArrowSchema>>::type schema_ptr(schema_ptr_sexp);
+ ExportArray(array, array_ptr, schema_ptr);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExportArray(SEXP array_sexp, SEXP array_ptr_sexp, SEXP schema_ptr_sexp){
+ Rf_error("Cannot call ExportArray(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void ExportRecordBatch(const std::shared_ptr<arrow::RecordBatch>& batch, arrow::r::Pointer<struct ArrowArray> array_ptr, arrow::r::Pointer<struct ArrowSchema> schema_ptr);
+extern "C" SEXP _arrow_ExportRecordBatch(SEXP batch_sexp, SEXP array_ptr_sexp, SEXP schema_ptr_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type batch(batch_sexp);
+ arrow::r::Input<arrow::r::Pointer<struct ArrowArray>>::type array_ptr(array_ptr_sexp);
+ arrow::r::Input<arrow::r::Pointer<struct ArrowSchema>>::type schema_ptr(schema_ptr_sexp);
+ ExportRecordBatch(batch, array_ptr, schema_ptr);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExportRecordBatch(SEXP batch_sexp, SEXP array_ptr_sexp, SEXP schema_ptr_sexp){
+ Rf_error("Cannot call ExportRecordBatch(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void ExportRecordBatchReader(const std::shared_ptr<arrow::RecordBatchReader>& reader, arrow::r::Pointer<struct ArrowArrayStream> stream_ptr);
+extern "C" SEXP _arrow_ExportRecordBatchReader(SEXP reader_sexp, SEXP stream_ptr_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatchReader>&>::type reader(reader_sexp);
+ arrow::r::Input<arrow::r::Pointer<struct ArrowArrayStream>>::type stream_ptr(stream_ptr_sexp);
+ ExportRecordBatchReader(reader, stream_ptr);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExportRecordBatchReader(SEXP reader_sexp, SEXP stream_ptr_sexp){
+ Rf_error("Cannot call ExportRecordBatchReader(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
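+// Import and export form matched pairs around a caller-owned struct. A
+// hedged sketch of the round trip, using the generated R stub names
+// (plausibly the pattern R/python.R relies on to exchange objects with
+// pyarrow):
+//   ptr <- allocate_arrow_schema()
+//   ExportSchema(schema, ptr)    # fill the C struct from the R-side object
+//   # ... hand ptr to the foreign consumer, which moves the data out ...
+//   delete_arrow_schema(ptr)     # free the struct allocation
+//
+// The r_to_arrow.cpp wrappers below build Arrow objects from plain R data.
+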
+// r_to_arrow.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Table> Table__from_dots(SEXP lst, SEXP schema_sxp, bool use_threads);
+extern "C" SEXP _arrow_Table__from_dots(SEXP lst_sexp, SEXP schema_sxp_sexp, SEXP use_threads_sexp){
+BEGIN_CPP11
+ arrow::r::Input<SEXP>::type lst(lst_sexp);
+ arrow::r::Input<SEXP>::type schema_sxp(schema_sxp_sexp);
+ arrow::r::Input<bool>::type use_threads(use_threads_sexp);
+ return cpp11::as_sexp(Table__from_dots(lst, schema_sxp, use_threads));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__from_dots(SEXP lst_sexp, SEXP schema_sxp_sexp, SEXP use_threads_sexp){
+ Rf_error("Cannot call Table__from_dots(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// r_to_arrow.cpp
+#if defined(ARROW_R_WITH_ARROW)
+SEXP vec_to_arrow(SEXP x, SEXP s_type);
+extern "C" SEXP _arrow_vec_to_arrow(SEXP x_sexp, SEXP s_type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<SEXP>::type x(x_sexp);
+ arrow::r::Input<SEXP>::type s_type(s_type_sexp);
+ return cpp11::as_sexp(vec_to_arrow(x, s_type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_vec_to_arrow(SEXP x_sexp, SEXP s_type_sexp){
+ Rf_error("Cannot call vec_to_arrow(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// r_to_arrow.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Array> DictionaryArray__FromArrays(const std::shared_ptr<arrow::DataType>& type, const std::shared_ptr<arrow::Array>& indices, const std::shared_ptr<arrow::Array>& dict);
+extern "C" SEXP _arrow_DictionaryArray__FromArrays(SEXP type_sexp, SEXP indices_sexp, SEXP dict_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::DataType>&>::type type(type_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type indices(indices_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type dict(dict_sexp);
+ return cpp11::as_sexp(DictionaryArray__FromArrays(type, indices, dict));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_DictionaryArray__FromArrays(SEXP type_sexp, SEXP indices_sexp, SEXP dict_sexp){
+ Rf_error("Cannot call DictionaryArray__FromArrays(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
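+// The recordbatch.cpp wrappers below expose arrow::RecordBatch accessors
+// (num_columns, num_rows, schema, names) alongside AddColumn / SetColumn /
+// RemoveColumn / Slice, which return new batches rather than mutating in
+// place, plus IPC (de)serialization of single batches.
+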
+// recordbatch.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int RecordBatch__num_columns(const std::shared_ptr<arrow::RecordBatch>& x);
+extern "C" SEXP _arrow_RecordBatch__num_columns(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type x(x_sexp);
+ return cpp11::as_sexp(RecordBatch__num_columns(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__num_columns(SEXP x_sexp){
+ Rf_error("Cannot call RecordBatch__num_columns(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatch.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int RecordBatch__num_rows(const std::shared_ptr<arrow::RecordBatch>& x);
+extern "C" SEXP _arrow_RecordBatch__num_rows(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type x(x_sexp);
+ return cpp11::as_sexp(RecordBatch__num_rows(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__num_rows(SEXP x_sexp){
+ Rf_error("Cannot call RecordBatch__num_rows(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatch.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Schema> RecordBatch__schema(const std::shared_ptr<arrow::RecordBatch>& x);
+extern "C" SEXP _arrow_RecordBatch__schema(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type x(x_sexp);
+ return cpp11::as_sexp(RecordBatch__schema(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__schema(SEXP x_sexp){
+ Rf_error("Cannot call RecordBatch__schema(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatch.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::RecordBatch> RecordBatch__RenameColumns(const std::shared_ptr<arrow::RecordBatch>& batch, const std::vector<std::string>& names);
+extern "C" SEXP _arrow_RecordBatch__RenameColumns(SEXP batch_sexp, SEXP names_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type batch(batch_sexp);
+ arrow::r::Input<const std::vector<std::string>&>::type names(names_sexp);
+ return cpp11::as_sexp(RecordBatch__RenameColumns(batch, names));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__RenameColumns(SEXP batch_sexp, SEXP names_sexp){
+ Rf_error("Cannot call RecordBatch__RenameColumns(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatch.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::RecordBatch> RecordBatch__ReplaceSchemaMetadata(const std::shared_ptr<arrow::RecordBatch>& x, cpp11::strings metadata);
+extern "C" SEXP _arrow_RecordBatch__ReplaceSchemaMetadata(SEXP x_sexp, SEXP metadata_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type x(x_sexp);
+ arrow::r::Input<cpp11::strings>::type metadata(metadata_sexp);
+ return cpp11::as_sexp(RecordBatch__ReplaceSchemaMetadata(x, metadata));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__ReplaceSchemaMetadata(SEXP x_sexp, SEXP metadata_sexp){
+ Rf_error("Cannot call RecordBatch__ReplaceSchemaMetadata(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatch.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::list RecordBatch__columns(const std::shared_ptr<arrow::RecordBatch>& batch);
+extern "C" SEXP _arrow_RecordBatch__columns(SEXP batch_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type batch(batch_sexp);
+ return cpp11::as_sexp(RecordBatch__columns(batch));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__columns(SEXP batch_sexp){
+ Rf_error("Cannot call RecordBatch__columns(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatch.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Array> RecordBatch__column(const std::shared_ptr<arrow::RecordBatch>& batch, R_xlen_t i);
+extern "C" SEXP _arrow_RecordBatch__column(SEXP batch_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type batch(batch_sexp);
+ arrow::r::Input<R_xlen_t>::type i(i_sexp);
+ return cpp11::as_sexp(RecordBatch__column(batch, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__column(SEXP batch_sexp, SEXP i_sexp){
+ Rf_error("Cannot call RecordBatch__column(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatch.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Array> RecordBatch__GetColumnByName(const std::shared_ptr<arrow::RecordBatch>& batch, const std::string& name);
+extern "C" SEXP _arrow_RecordBatch__GetColumnByName(SEXP batch_sexp, SEXP name_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type batch(batch_sexp);
+ arrow::r::Input<const std::string&>::type name(name_sexp);
+ return cpp11::as_sexp(RecordBatch__GetColumnByName(batch, name));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__GetColumnByName(SEXP batch_sexp, SEXP name_sexp){
+ Rf_error("Cannot call RecordBatch__GetColumnByName(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatch.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::RecordBatch> RecordBatch__SelectColumns(const std::shared_ptr<arrow::RecordBatch>& batch, const std::vector<int>& indices);
+extern "C" SEXP _arrow_RecordBatch__SelectColumns(SEXP batch_sexp, SEXP indices_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type batch(batch_sexp);
+ arrow::r::Input<const std::vector<int>&>::type indices(indices_sexp);
+ return cpp11::as_sexp(RecordBatch__SelectColumns(batch, indices));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__SelectColumns(SEXP batch_sexp, SEXP indices_sexp){
+ Rf_error("Cannot call RecordBatch__SelectColumns(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatch.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool RecordBatch__Equals(const std::shared_ptr<arrow::RecordBatch>& self, const std::shared_ptr<arrow::RecordBatch>& other, bool check_metadata);
+extern "C" SEXP _arrow_RecordBatch__Equals(SEXP self_sexp, SEXP other_sexp, SEXP check_metadata_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type self(self_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type other(other_sexp);
+ arrow::r::Input<bool>::type check_metadata(check_metadata_sexp);
+ return cpp11::as_sexp(RecordBatch__Equals(self, other, check_metadata));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__Equals(SEXP self_sexp, SEXP other_sexp, SEXP check_metadata_sexp){
+ Rf_error("Cannot call RecordBatch__Equals(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatch.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::RecordBatch> RecordBatch__AddColumn(const std::shared_ptr<arrow::RecordBatch>& batch, R_xlen_t i, const std::shared_ptr<arrow::Field>& field, const std::shared_ptr<arrow::Array>& column);
+extern "C" SEXP _arrow_RecordBatch__AddColumn(SEXP batch_sexp, SEXP i_sexp, SEXP field_sexp, SEXP column_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type batch(batch_sexp);
+ arrow::r::Input<R_xlen_t>::type i(i_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Field>&>::type field(field_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type column(column_sexp);
+ return cpp11::as_sexp(RecordBatch__AddColumn(batch, i, field, column));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__AddColumn(SEXP batch_sexp, SEXP i_sexp, SEXP field_sexp, SEXP column_sexp){
+ Rf_error("Cannot call RecordBatch__AddColumn(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatch.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::RecordBatch> RecordBatch__SetColumn(const std::shared_ptr<arrow::RecordBatch>& batch, R_xlen_t i, const std::shared_ptr<arrow::Field>& field, const std::shared_ptr<arrow::Array>& column);
+extern "C" SEXP _arrow_RecordBatch__SetColumn(SEXP batch_sexp, SEXP i_sexp, SEXP field_sexp, SEXP column_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type batch(batch_sexp);
+ arrow::r::Input<R_xlen_t>::type i(i_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Field>&>::type field(field_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type column(column_sexp);
+ return cpp11::as_sexp(RecordBatch__SetColumn(batch, i, field, column));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__SetColumn(SEXP batch_sexp, SEXP i_sexp, SEXP field_sexp, SEXP column_sexp){
+ Rf_error("Cannot call RecordBatch__SetColumn(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatch.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::RecordBatch> RecordBatch__RemoveColumn(const std::shared_ptr<arrow::RecordBatch>& batch, R_xlen_t i);
+extern "C" SEXP _arrow_RecordBatch__RemoveColumn(SEXP batch_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type batch(batch_sexp);
+ arrow::r::Input<R_xlen_t>::type i(i_sexp);
+ return cpp11::as_sexp(RecordBatch__RemoveColumn(batch, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__RemoveColumn(SEXP batch_sexp, SEXP i_sexp){
+ Rf_error("Cannot call RecordBatch__RemoveColumn(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatch.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string RecordBatch__column_name(const std::shared_ptr<arrow::RecordBatch>& batch, R_xlen_t i);
+extern "C" SEXP _arrow_RecordBatch__column_name(SEXP batch_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type batch(batch_sexp);
+ arrow::r::Input<R_xlen_t>::type i(i_sexp);
+ return cpp11::as_sexp(RecordBatch__column_name(batch, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__column_name(SEXP batch_sexp, SEXP i_sexp){
+ Rf_error("Cannot call RecordBatch__column_name(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatch.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::writable::strings RecordBatch__names(const std::shared_ptr<arrow::RecordBatch>& batch);
+extern "C" SEXP _arrow_RecordBatch__names(SEXP batch_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type batch(batch_sexp);
+ return cpp11::as_sexp(RecordBatch__names(batch));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__names(SEXP batch_sexp){
+ Rf_error("Cannot call RecordBatch__names(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatch.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::RecordBatch> RecordBatch__Slice1(const std::shared_ptr<arrow::RecordBatch>& self, R_xlen_t offset);
+extern "C" SEXP _arrow_RecordBatch__Slice1(SEXP self_sexp, SEXP offset_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type self(self_sexp);
+ arrow::r::Input<R_xlen_t>::type offset(offset_sexp);
+ return cpp11::as_sexp(RecordBatch__Slice1(self, offset));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__Slice1(SEXP self_sexp, SEXP offset_sexp){
+ Rf_error("Cannot call RecordBatch__Slice1(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatch.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::RecordBatch> RecordBatch__Slice2(const std::shared_ptr<arrow::RecordBatch>& self, R_xlen_t offset, R_xlen_t length);
+extern "C" SEXP _arrow_RecordBatch__Slice2(SEXP self_sexp, SEXP offset_sexp, SEXP length_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type self(self_sexp);
+ arrow::r::Input<R_xlen_t>::type offset(offset_sexp);
+ arrow::r::Input<R_xlen_t>::type length(length_sexp);
+ return cpp11::as_sexp(RecordBatch__Slice2(self, offset, length));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__Slice2(SEXP self_sexp, SEXP offset_sexp, SEXP length_sexp){
+ Rf_error("Cannot call RecordBatch__Slice2(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatch.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::raws ipc___SerializeRecordBatch__Raw(const std::shared_ptr<arrow::RecordBatch>& batch);
+extern "C" SEXP _arrow_ipc___SerializeRecordBatch__Raw(SEXP batch_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type batch(batch_sexp);
+ return cpp11::as_sexp(ipc___SerializeRecordBatch__Raw(batch));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___SerializeRecordBatch__Raw(SEXP batch_sexp){
+ Rf_error("Cannot call ipc___SerializeRecordBatch__Raw(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatch.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::RecordBatch> ipc___ReadRecordBatch__InputStream__Schema(const std::shared_ptr<arrow::io::InputStream>& stream, const std::shared_ptr<arrow::Schema>& schema);
+extern "C" SEXP _arrow_ipc___ReadRecordBatch__InputStream__Schema(SEXP stream_sexp, SEXP schema_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::InputStream>&>::type stream(stream_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schema(schema_sexp);
+ return cpp11::as_sexp(ipc___ReadRecordBatch__InputStream__Schema(stream, schema));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___ReadRecordBatch__InputStream__Schema(SEXP stream_sexp, SEXP schema_sexp){
+ Rf_error("Cannot call ipc___ReadRecordBatch__InputStream__Schema(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatch.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::RecordBatch> RecordBatch__from_arrays(SEXP schema_sxp, SEXP lst);
+extern "C" SEXP _arrow_RecordBatch__from_arrays(SEXP schema_sxp_sexp, SEXP lst_sexp){
+BEGIN_CPP11
+ arrow::r::Input<SEXP>::type schema_sxp(schema_sxp_sexp);
+ arrow::r::Input<SEXP>::type lst(lst_sexp);
+ return cpp11::as_sexp(RecordBatch__from_arrays(schema_sxp, lst));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__from_arrays(SEXP schema_sxp_sexp, SEXP lst_sexp){
+ Rf_error("Cannot call RecordBatch__from_arrays(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
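+// The recordbatchreader.cpp wrappers below cover both the streaming reader
+// (RecordBatchReader__ReadNext yields one batch per call and a null batch at
+// end of stream) and the random-access IPC file reader, which knows its
+// record-batch count up front.
+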
+// recordbatchreader.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Schema> RecordBatchReader__schema(const std::shared_ptr<arrow::RecordBatchReader>& reader);
+extern "C" SEXP _arrow_RecordBatchReader__schema(SEXP reader_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatchReader>&>::type reader(reader_sexp);
+ return cpp11::as_sexp(RecordBatchReader__schema(reader));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatchReader__schema(SEXP reader_sexp){
+ Rf_error("Cannot call RecordBatchReader__schema(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatchreader.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::RecordBatch> RecordBatchReader__ReadNext(const std::shared_ptr<arrow::RecordBatchReader>& reader);
+extern "C" SEXP _arrow_RecordBatchReader__ReadNext(SEXP reader_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatchReader>&>::type reader(reader_sexp);
+ return cpp11::as_sexp(RecordBatchReader__ReadNext(reader));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatchReader__ReadNext(SEXP reader_sexp){
+ Rf_error("Cannot call RecordBatchReader__ReadNext(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatchreader.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::list RecordBatchReader__batches(const std::shared_ptr<arrow::RecordBatchReader>& reader);
+extern "C" SEXP _arrow_RecordBatchReader__batches(SEXP reader_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatchReader>&>::type reader(reader_sexp);
+ return cpp11::as_sexp(RecordBatchReader__batches(reader));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_RecordBatchReader__batches(SEXP reader_sexp){
+ Rf_error("Cannot call RecordBatchReader__batches(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatchreader.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Table> Table__from_RecordBatchReader(const std::shared_ptr<arrow::RecordBatchReader>& reader);
+extern "C" SEXP _arrow_Table__from_RecordBatchReader(SEXP reader_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatchReader>&>::type reader(reader_sexp);
+ return cpp11::as_sexp(Table__from_RecordBatchReader(reader));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__from_RecordBatchReader(SEXP reader_sexp){
+ Rf_error("Cannot call Table__from_RecordBatchReader(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatchreader.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::ipc::RecordBatchStreamReader> ipc___RecordBatchStreamReader__Open(const std::shared_ptr<arrow::io::InputStream>& stream);
+extern "C" SEXP _arrow_ipc___RecordBatchStreamReader__Open(SEXP stream_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::InputStream>&>::type stream(stream_sexp);
+ return cpp11::as_sexp(ipc___RecordBatchStreamReader__Open(stream));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___RecordBatchStreamReader__Open(SEXP stream_sexp){
+ Rf_error("Cannot call ipc___RecordBatchStreamReader__Open(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatchreader.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Schema> ipc___RecordBatchFileReader__schema(const std::shared_ptr<arrow::ipc::RecordBatchFileReader>& reader);
+extern "C" SEXP _arrow_ipc___RecordBatchFileReader__schema(SEXP reader_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ipc::RecordBatchFileReader>&>::type reader(reader_sexp);
+ return cpp11::as_sexp(ipc___RecordBatchFileReader__schema(reader));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___RecordBatchFileReader__schema(SEXP reader_sexp){
+ Rf_error("Cannot call ipc___RecordBatchFileReader__schema(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatchreader.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int ipc___RecordBatchFileReader__num_record_batches(const std::shared_ptr<arrow::ipc::RecordBatchFileReader>& reader);
+extern "C" SEXP _arrow_ipc___RecordBatchFileReader__num_record_batches(SEXP reader_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ipc::RecordBatchFileReader>&>::type reader(reader_sexp);
+ return cpp11::as_sexp(ipc___RecordBatchFileReader__num_record_batches(reader));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___RecordBatchFileReader__num_record_batches(SEXP reader_sexp){
+ Rf_error("Cannot call ipc___RecordBatchFileReader__num_record_batches(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatchreader.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::RecordBatch> ipc___RecordBatchFileReader__ReadRecordBatch(const std::shared_ptr<arrow::ipc::RecordBatchFileReader>& reader, int i);
+extern "C" SEXP _arrow_ipc___RecordBatchFileReader__ReadRecordBatch(SEXP reader_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ipc::RecordBatchFileReader>&>::type reader(reader_sexp);
+ arrow::r::Input<int>::type i(i_sexp);
+ return cpp11::as_sexp(ipc___RecordBatchFileReader__ReadRecordBatch(reader, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___RecordBatchFileReader__ReadRecordBatch(SEXP reader_sexp, SEXP i_sexp){
+ Rf_error("Cannot call ipc___RecordBatchFileReader__ReadRecordBatch(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatchreader.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::ipc::RecordBatchFileReader> ipc___RecordBatchFileReader__Open(const std::shared_ptr<arrow::io::RandomAccessFile>& file);
+extern "C" SEXP _arrow_ipc___RecordBatchFileReader__Open(SEXP file_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::RandomAccessFile>&>::type file(file_sexp);
+ return cpp11::as_sexp(ipc___RecordBatchFileReader__Open(file));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___RecordBatchFileReader__Open(SEXP file_sexp){
+ Rf_error("Cannot call ipc___RecordBatchFileReader__Open(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatchreader.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Table> Table__from_RecordBatchFileReader(const std::shared_ptr<arrow::ipc::RecordBatchFileReader>& reader);
+extern "C" SEXP _arrow_Table__from_RecordBatchFileReader(SEXP reader_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ipc::RecordBatchFileReader>&>::type reader(reader_sexp);
+ return cpp11::as_sexp(Table__from_RecordBatchFileReader(reader));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__from_RecordBatchFileReader(SEXP reader_sexp){
+ Rf_error("Cannot call Table__from_RecordBatchFileReader(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatchreader.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::list ipc___RecordBatchFileReader__batches(const std::shared_ptr<arrow::ipc::RecordBatchFileReader>& reader);
+extern "C" SEXP _arrow_ipc___RecordBatchFileReader__batches(SEXP reader_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ipc::RecordBatchFileReader>&>::type reader(reader_sexp);
+ return cpp11::as_sexp(ipc___RecordBatchFileReader__batches(reader));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___RecordBatchFileReader__batches(SEXP reader_sexp){
+ Rf_error("Cannot call ipc___RecordBatchFileReader__batches(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
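+// The recordbatchwriter.cpp wrappers below drive one RecordBatchWriter
+// interface for both the IPC file and stream formats; each Open() variant
+// takes the legacy-format flag and metadata version, and Close() finalizes
+// the stream (for the file format, this is what writes the footer).
+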
+// recordbatchwriter.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void ipc___RecordBatchWriter__WriteRecordBatch(const std::shared_ptr<arrow::ipc::RecordBatchWriter>& batch_writer, const std::shared_ptr<arrow::RecordBatch>& batch);
+extern "C" SEXP _arrow_ipc___RecordBatchWriter__WriteRecordBatch(SEXP batch_writer_sexp, SEXP batch_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ipc::RecordBatchWriter>&>::type batch_writer(batch_writer_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::RecordBatch>&>::type batch(batch_sexp);
+ ipc___RecordBatchWriter__WriteRecordBatch(batch_writer, batch);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___RecordBatchWriter__WriteRecordBatch(SEXP batch_writer_sexp, SEXP batch_sexp){
+ Rf_error("Cannot call ipc___RecordBatchWriter__WriteRecordBatch(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatchwriter.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void ipc___RecordBatchWriter__WriteTable(const std::shared_ptr<arrow::ipc::RecordBatchWriter>& batch_writer, const std::shared_ptr<arrow::Table>& table);
+extern "C" SEXP _arrow_ipc___RecordBatchWriter__WriteTable(SEXP batch_writer_sexp, SEXP table_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ipc::RecordBatchWriter>&>::type batch_writer(batch_writer_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ ipc___RecordBatchWriter__WriteTable(batch_writer, table);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___RecordBatchWriter__WriteTable(SEXP batch_writer_sexp, SEXP table_sexp){
+ Rf_error("Cannot call ipc___RecordBatchWriter__WriteTable(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatchwriter.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void ipc___RecordBatchWriter__Close(const std::shared_ptr<arrow::ipc::RecordBatchWriter>& batch_writer);
+extern "C" SEXP _arrow_ipc___RecordBatchWriter__Close(SEXP batch_writer_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ipc::RecordBatchWriter>&>::type batch_writer(batch_writer_sexp);
+ ipc___RecordBatchWriter__Close(batch_writer);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___RecordBatchWriter__Close(SEXP batch_writer_sexp){
+ Rf_error("Cannot call ipc___RecordBatchWriter__Close(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatchwriter.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::ipc::RecordBatchWriter> ipc___RecordBatchFileWriter__Open(const std::shared_ptr<arrow::io::OutputStream>& stream, const std::shared_ptr<arrow::Schema>& schema, bool use_legacy_format, arrow::ipc::MetadataVersion metadata_version);
+extern "C" SEXP _arrow_ipc___RecordBatchFileWriter__Open(SEXP stream_sexp, SEXP schema_sexp, SEXP use_legacy_format_sexp, SEXP metadata_version_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::OutputStream>&>::type stream(stream_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schema(schema_sexp);
+ arrow::r::Input<bool>::type use_legacy_format(use_legacy_format_sexp);
+ arrow::r::Input<arrow::ipc::MetadataVersion>::type metadata_version(metadata_version_sexp);
+ return cpp11::as_sexp(ipc___RecordBatchFileWriter__Open(stream, schema, use_legacy_format, metadata_version));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___RecordBatchFileWriter__Open(SEXP stream_sexp, SEXP schema_sexp, SEXP use_legacy_format_sexp, SEXP metadata_version_sexp){
+ Rf_error("Cannot call ipc___RecordBatchFileWriter__Open(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// recordbatchwriter.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::ipc::RecordBatchWriter> ipc___RecordBatchStreamWriter__Open(const std::shared_ptr<arrow::io::OutputStream>& stream, const std::shared_ptr<arrow::Schema>& schema, bool use_legacy_format, arrow::ipc::MetadataVersion metadata_version);
+extern "C" SEXP _arrow_ipc___RecordBatchStreamWriter__Open(SEXP stream_sexp, SEXP schema_sexp, SEXP use_legacy_format_sexp, SEXP metadata_version_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::io::OutputStream>&>::type stream(stream_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schema(schema_sexp);
+ arrow::r::Input<bool>::type use_legacy_format(use_legacy_format_sexp);
+ arrow::r::Input<arrow::ipc::MetadataVersion>::type metadata_version(metadata_version_sexp);
+ return cpp11::as_sexp(ipc___RecordBatchStreamWriter__Open(stream, schema, use_legacy_format, metadata_version));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ipc___RecordBatchStreamWriter__Open(SEXP stream_sexp, SEXP schema_sexp, SEXP use_legacy_format_sexp, SEXP metadata_version_sexp){
+ Rf_error("Cannot call ipc___RecordBatchStreamWriter__Open(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// scalar.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Scalar> Array__GetScalar(const std::shared_ptr<arrow::Array>& x, int64_t i);
+extern "C" SEXP _arrow_Array__GetScalar(SEXP x_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Array>&>::type x(x_sexp);
+ arrow::r::Input<int64_t>::type i(i_sexp);
+ return cpp11::as_sexp(Array__GetScalar(x, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Array__GetScalar(SEXP x_sexp, SEXP i_sexp){
+ Rf_error("Cannot call Array__GetScalar(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// scalar.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string Scalar__ToString(const std::shared_ptr<arrow::Scalar>& s);
+extern "C" SEXP _arrow_Scalar__ToString(SEXP s_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Scalar>&>::type s(s_sexp);
+ return cpp11::as_sexp(Scalar__ToString(s));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Scalar__ToString(SEXP s_sexp){
+ Rf_error("Cannot call Scalar__ToString(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// scalar.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Scalar> StructScalar__field(const std::shared_ptr<arrow::StructScalar>& s, int i);
+extern "C" SEXP _arrow_StructScalar__field(SEXP s_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::StructScalar>&>::type s(s_sexp);
+ arrow::r::Input<int>::type i(i_sexp);
+ return cpp11::as_sexp(StructScalar__field(s, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_StructScalar__field(SEXP s_sexp, SEXP i_sexp){
+ Rf_error("Cannot call StructScalar__field(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// scalar.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Scalar> StructScalar__GetFieldByName(const std::shared_ptr<arrow::StructScalar>& s, const std::string& name);
+extern "C" SEXP _arrow_StructScalar__GetFieldByName(SEXP s_sexp, SEXP name_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::StructScalar>&>::type s(s_sexp);
+ arrow::r::Input<const std::string&>::type name(name_sexp);
+ return cpp11::as_sexp(StructScalar__GetFieldByName(s, name));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_StructScalar__GetFieldByName(SEXP s_sexp, SEXP name_sexp){
+ Rf_error("Cannot call StructScalar__GetFieldByName(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// scalar.cpp
+#if defined(ARROW_R_WITH_ARROW)
+SEXP Scalar__as_vector(const std::shared_ptr<arrow::Scalar>& scalar);
+extern "C" SEXP _arrow_Scalar__as_vector(SEXP scalar_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Scalar>&>::type scalar(scalar_sexp);
+ return cpp11::as_sexp(Scalar__as_vector(scalar));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Scalar__as_vector(SEXP scalar_sexp){
+ Rf_error("Cannot call Scalar__as_vector(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// scalar.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Array> MakeArrayFromScalar(const std::shared_ptr<arrow::Scalar>& scalar, int n);
+extern "C" SEXP _arrow_MakeArrayFromScalar(SEXP scalar_sexp, SEXP n_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Scalar>&>::type scalar(scalar_sexp);
+ arrow::r::Input<int>::type n(n_sexp);
+ return cpp11::as_sexp(MakeArrayFromScalar(scalar, n));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_MakeArrayFromScalar(SEXP scalar_sexp, SEXP n_sexp){
+ Rf_error("Cannot call MakeArrayFromScalar(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// scalar.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool Scalar__is_valid(const std::shared_ptr<arrow::Scalar>& s);
+extern "C" SEXP _arrow_Scalar__is_valid(SEXP s_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Scalar>&>::type s(s_sexp);
+ return cpp11::as_sexp(Scalar__is_valid(s));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Scalar__is_valid(SEXP s_sexp){
+ Rf_error("Cannot call Scalar__is_valid(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// scalar.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Scalar__type(const std::shared_ptr<arrow::Scalar>& s);
+extern "C" SEXP _arrow_Scalar__type(SEXP s_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Scalar>&>::type s(s_sexp);
+ return cpp11::as_sexp(Scalar__type(s));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Scalar__type(SEXP s_sexp){
+ Rf_error("Cannot call Scalar__type(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// scalar.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool Scalar__Equals(const std::shared_ptr<arrow::Scalar>& lhs, const std::shared_ptr<arrow::Scalar>& rhs);
+extern "C" SEXP _arrow_Scalar__Equals(SEXP lhs_sexp, SEXP rhs_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Scalar>&>::type lhs(lhs_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Scalar>&>::type rhs(rhs_sexp);
+ return cpp11::as_sexp(Scalar__Equals(lhs, rhs));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Scalar__Equals(SEXP lhs_sexp, SEXP rhs_sexp){
+ Rf_error("Cannot call Scalar__Equals(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// scalar.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool Scalar__ApproxEquals(const std::shared_ptr<arrow::Scalar>& lhs, const std::shared_ptr<arrow::Scalar>& rhs);
+extern "C" SEXP _arrow_Scalar__ApproxEquals(SEXP lhs_sexp, SEXP rhs_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Scalar>&>::type lhs(lhs_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Scalar>&>::type rhs(rhs_sexp);
+ return cpp11::as_sexp(Scalar__ApproxEquals(lhs, rhs));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Scalar__ApproxEquals(SEXP lhs_sexp, SEXP rhs_sexp){
+ Rf_error("Cannot call Scalar__ApproxEquals(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
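+// Return values flow back to R through cpp11::as_sexp(): std::string becomes
+// a length-one character vector, bool a length-one logical, and, presumably
+// via overloads provided in the package headers, std::shared_ptr<arrow::*>
+// becomes the external pointer that the corresponding R6 class wraps.
+// Illustrative mapping only:
+//
+//   std::string                    -> character(1)
+//   bool                           -> logical(1)
+//   std::shared_ptr<arrow::Scalar> -> external pointer (R6 Scalar)
+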
+// schema.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Schema> schema_(const std::vector<std::shared_ptr<arrow::Field>>& fields);
+extern "C" SEXP _arrow_schema_(SEXP fields_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::vector<std::shared_ptr<arrow::Field>>&>::type fields(fields_sexp);
+ return cpp11::as_sexp(schema_(fields));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_schema_(SEXP fields_sexp){
+ Rf_error("Cannot call schema_(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// schema.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string Schema__ToString(const std::shared_ptr<arrow::Schema>& s);
+extern "C" SEXP _arrow_Schema__ToString(SEXP s_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type s(s_sexp);
+ return cpp11::as_sexp(Schema__ToString(s));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Schema__ToString(SEXP s_sexp){
+ Rf_error("Cannot call Schema__ToString(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// schema.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int Schema__num_fields(const std::shared_ptr<arrow::Schema>& s);
+extern "C" SEXP _arrow_Schema__num_fields(SEXP s_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type s(s_sexp);
+ return cpp11::as_sexp(Schema__num_fields(s));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Schema__num_fields(SEXP s_sexp){
+ Rf_error("Cannot call Schema__num_fields(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// schema.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Field> Schema__field(const std::shared_ptr<arrow::Schema>& s, int i);
+extern "C" SEXP _arrow_Schema__field(SEXP s_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type s(s_sexp);
+ arrow::r::Input<int>::type i(i_sexp);
+ return cpp11::as_sexp(Schema__field(s, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Schema__field(SEXP s_sexp, SEXP i_sexp){
+ Rf_error("Cannot call Schema__field(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// schema.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Schema> Schema__AddField(const std::shared_ptr<arrow::Schema>& s, int i, const std::shared_ptr<arrow::Field>& field);
+extern "C" SEXP _arrow_Schema__AddField(SEXP s_sexp, SEXP i_sexp, SEXP field_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type s(s_sexp);
+ arrow::r::Input<int>::type i(i_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Field>&>::type field(field_sexp);
+ return cpp11::as_sexp(Schema__AddField(s, i, field));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Schema__AddField(SEXP s_sexp, SEXP i_sexp, SEXP field_sexp){
+ Rf_error("Cannot call Schema__AddField(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// schema.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Schema> Schema__SetField(const std::shared_ptr<arrow::Schema>& s, int i, const std::shared_ptr<arrow::Field>& field);
+extern "C" SEXP _arrow_Schema__SetField(SEXP s_sexp, SEXP i_sexp, SEXP field_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type s(s_sexp);
+ arrow::r::Input<int>::type i(i_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Field>&>::type field(field_sexp);
+ return cpp11::as_sexp(Schema__SetField(s, i, field));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Schema__SetField(SEXP s_sexp, SEXP i_sexp, SEXP field_sexp){
+ Rf_error("Cannot call Schema__SetField(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// schema.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Schema> Schema__RemoveField(const std::shared_ptr<arrow::Schema>& s, int i);
+extern "C" SEXP _arrow_Schema__RemoveField(SEXP s_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type s(s_sexp);
+ arrow::r::Input<int>::type i(i_sexp);
+ return cpp11::as_sexp(Schema__RemoveField(s, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Schema__RemoveField(SEXP s_sexp, SEXP i_sexp){
+ Rf_error("Cannot call Schema__RemoveField(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// schema.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Field> Schema__GetFieldByName(const std::shared_ptr<arrow::Schema>& s, std::string x);
+extern "C" SEXP _arrow_Schema__GetFieldByName(SEXP s_sexp, SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type s(s_sexp);
+ arrow::r::Input<std::string>::type x(x_sexp);
+ return cpp11::as_sexp(Schema__GetFieldByName(s, x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Schema__GetFieldByName(SEXP s_sexp, SEXP x_sexp){
+ Rf_error("Cannot call Schema__GetFieldByName(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// schema.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::list Schema__fields(const std::shared_ptr<arrow::Schema>& schema);
+extern "C" SEXP _arrow_Schema__fields(SEXP schema_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schema(schema_sexp);
+ return cpp11::as_sexp(Schema__fields(schema));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Schema__fields(SEXP schema_sexp){
+ Rf_error("Cannot call Schema__fields(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// schema.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::vector<std::string> Schema__field_names(const std::shared_ptr<arrow::Schema>& schema);
+extern "C" SEXP _arrow_Schema__field_names(SEXP schema_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schema(schema_sexp);
+ return cpp11::as_sexp(Schema__field_names(schema));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Schema__field_names(SEXP schema_sexp){
+ Rf_error("Cannot call Schema__field_names(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// schema.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool Schema__HasMetadata(const std::shared_ptr<arrow::Schema>& schema);
+extern "C" SEXP _arrow_Schema__HasMetadata(SEXP schema_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schema(schema_sexp);
+ return cpp11::as_sexp(Schema__HasMetadata(schema));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Schema__HasMetadata(SEXP schema_sexp){
+ Rf_error("Cannot call Schema__HasMetadata(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// schema.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::writable::list Schema__metadata(const std::shared_ptr<arrow::Schema>& schema);
+extern "C" SEXP _arrow_Schema__metadata(SEXP schema_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schema(schema_sexp);
+ return cpp11::as_sexp(Schema__metadata(schema));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Schema__metadata(SEXP schema_sexp){
+ Rf_error("Cannot call Schema__metadata(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// schema.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Schema> Schema__WithMetadata(const std::shared_ptr<arrow::Schema>& schema, cpp11::strings metadata);
+extern "C" SEXP _arrow_Schema__WithMetadata(SEXP schema_sexp, SEXP metadata_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schema(schema_sexp);
+ arrow::r::Input<cpp11::strings>::type metadata(metadata_sexp);
+ return cpp11::as_sexp(Schema__WithMetadata(schema, metadata));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Schema__WithMetadata(SEXP schema_sexp, SEXP metadata_sexp){
+ Rf_error("Cannot call Schema__WithMetadata(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// schema.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::writable::raws Schema__serialize(const std::shared_ptr<arrow::Schema>& schema);
+extern "C" SEXP _arrow_Schema__serialize(SEXP schema_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schema(schema_sexp);
+ return cpp11::as_sexp(Schema__serialize(schema));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Schema__serialize(SEXP schema_sexp){
+ Rf_error("Cannot call Schema__serialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// schema.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool Schema__Equals(const std::shared_ptr<arrow::Schema>& schema, const std::shared_ptr<arrow::Schema>& other, bool check_metadata);
+extern "C" SEXP _arrow_Schema__Equals(SEXP schema_sexp, SEXP other_sexp, SEXP check_metadata_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type schema(schema_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Schema>&>::type other(other_sexp);
+ arrow::r::Input<bool>::type check_metadata(check_metadata_sexp);
+ return cpp11::as_sexp(Schema__Equals(schema, other, check_metadata));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Schema__Equals(SEXP schema_sexp, SEXP other_sexp, SEXP check_metadata_sexp){
+ Rf_error("Cannot call Schema__Equals(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// schema.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Schema> arrow__UnifySchemas(const std::vector<std::shared_ptr<arrow::Schema>>& schemas);
+extern "C" SEXP _arrow_arrow__UnifySchemas(SEXP schemas_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::vector<std::shared_ptr<arrow::Schema>>&>::type schemas(schemas_sexp);
+ return cpp11::as_sexp(arrow__UnifySchemas(schemas));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_arrow__UnifySchemas(SEXP schemas_sexp){
+ Rf_error("Cannot call arrow__UnifySchemas(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
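+// Container arguments are converted elementwise through the same
+// arrow::r::Input<T>::type mechanism: an R list of Field R6 objects becomes
+// std::vector<std::shared_ptr<arrow::Field>> (schema_ above), a character
+// vector becomes std::vector<std::string> (Table__RenameColumns below), and
+// an integer vector becomes std::vector<int> (Table__SelectColumns below).
+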
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int Table__num_columns(const std::shared_ptr<arrow::Table>& x);
+extern "C" SEXP _arrow_Table__num_columns(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type x(x_sexp);
+ return cpp11::as_sexp(Table__num_columns(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__num_columns(SEXP x_sexp){
+ Rf_error("Cannot call Table__num_columns(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int Table__num_rows(const std::shared_ptr<arrow::Table>& x);
+extern "C" SEXP _arrow_Table__num_rows(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type x(x_sexp);
+ return cpp11::as_sexp(Table__num_rows(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__num_rows(SEXP x_sexp){
+ Rf_error("Cannot call Table__num_rows(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Schema> Table__schema(const std::shared_ptr<arrow::Table>& x);
+extern "C" SEXP _arrow_Table__schema(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type x(x_sexp);
+ return cpp11::as_sexp(Table__schema(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__schema(SEXP x_sexp){
+ Rf_error("Cannot call Table__schema(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Table> Table__ReplaceSchemaMetadata(const std::shared_ptr<arrow::Table>& x, cpp11::strings metadata);
+extern "C" SEXP _arrow_Table__ReplaceSchemaMetadata(SEXP x_sexp, SEXP metadata_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type x(x_sexp);
+ arrow::r::Input<cpp11::strings>::type metadata(metadata_sexp);
+ return cpp11::as_sexp(Table__ReplaceSchemaMetadata(x, metadata));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__ReplaceSchemaMetadata(SEXP x_sexp, SEXP metadata_sexp){
+ Rf_error("Cannot call Table__ReplaceSchemaMetadata(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::ChunkedArray> Table__column(const std::shared_ptr<arrow::Table>& table, R_xlen_t i);
+extern "C" SEXP _arrow_Table__column(SEXP table_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ arrow::r::Input<R_xlen_t>::type i(i_sexp);
+ return cpp11::as_sexp(Table__column(table, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__column(SEXP table_sexp, SEXP i_sexp){
+ Rf_error("Cannot call Table__column(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Field> Table__field(const std::shared_ptr<arrow::Table>& table, R_xlen_t i);
+extern "C" SEXP _arrow_Table__field(SEXP table_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ arrow::r::Input<R_xlen_t>::type i(i_sexp);
+ return cpp11::as_sexp(Table__field(table, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__field(SEXP table_sexp, SEXP i_sexp){
+ Rf_error("Cannot call Table__field(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::list Table__columns(const std::shared_ptr<arrow::Table>& table);
+extern "C" SEXP _arrow_Table__columns(SEXP table_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ return cpp11::as_sexp(Table__columns(table));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__columns(SEXP table_sexp){
+ Rf_error("Cannot call Table__columns(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::vector<std::string> Table__ColumnNames(const std::shared_ptr<arrow::Table>& table);
+extern "C" SEXP _arrow_Table__ColumnNames(SEXP table_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ return cpp11::as_sexp(Table__ColumnNames(table));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__ColumnNames(SEXP table_sexp){
+ Rf_error("Cannot call Table__ColumnNames(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Table> Table__RenameColumns(const std::shared_ptr<arrow::Table>& table, const std::vector<std::string>& names);
+extern "C" SEXP _arrow_Table__RenameColumns(SEXP table_sexp, SEXP names_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ arrow::r::Input<const std::vector<std::string>&>::type names(names_sexp);
+ return cpp11::as_sexp(Table__RenameColumns(table, names));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__RenameColumns(SEXP table_sexp, SEXP names_sexp){
+ Rf_error("Cannot call Table__RenameColumns(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Table> Table__Slice1(const std::shared_ptr<arrow::Table>& table, R_xlen_t offset);
+extern "C" SEXP _arrow_Table__Slice1(SEXP table_sexp, SEXP offset_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ arrow::r::Input<R_xlen_t>::type offset(offset_sexp);
+ return cpp11::as_sexp(Table__Slice1(table, offset));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__Slice1(SEXP table_sexp, SEXP offset_sexp){
+ Rf_error("Cannot call Table__Slice1(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Table> Table__Slice2(const std::shared_ptr<arrow::Table>& table, R_xlen_t offset, R_xlen_t length);
+extern "C" SEXP _arrow_Table__Slice2(SEXP table_sexp, SEXP offset_sexp, SEXP length_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ arrow::r::Input<R_xlen_t>::type offset(offset_sexp);
+ arrow::r::Input<R_xlen_t>::type length(length_sexp);
+ return cpp11::as_sexp(Table__Slice2(table, offset, length));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__Slice2(SEXP table_sexp, SEXP offset_sexp, SEXP length_sexp){
+ Rf_error("Cannot call Table__Slice2(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool Table__Equals(const std::shared_ptr<arrow::Table>& lhs, const std::shared_ptr<arrow::Table>& rhs, bool check_metadata);
+extern "C" SEXP _arrow_Table__Equals(SEXP lhs_sexp, SEXP rhs_sexp, SEXP check_metadata_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type lhs(lhs_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type rhs(rhs_sexp);
+ arrow::r::Input<bool>::type check_metadata(check_metadata_sexp);
+ return cpp11::as_sexp(Table__Equals(lhs, rhs, check_metadata));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__Equals(SEXP lhs_sexp, SEXP rhs_sexp, SEXP check_metadata_sexp){
+ Rf_error("Cannot call Table__Equals(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool Table__Validate(const std::shared_ptr<arrow::Table>& table);
+extern "C" SEXP _arrow_Table__Validate(SEXP table_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ return cpp11::as_sexp(Table__Validate(table));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__Validate(SEXP table_sexp){
+ Rf_error("Cannot call Table__Validate(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool Table__ValidateFull(const std::shared_ptr<arrow::Table>& table);
+extern "C" SEXP _arrow_Table__ValidateFull(SEXP table_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ return cpp11::as_sexp(Table__ValidateFull(table));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__ValidateFull(SEXP table_sexp){
+ Rf_error("Cannot call Table__ValidateFull(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::ChunkedArray> Table__GetColumnByName(const std::shared_ptr<arrow::Table>& table, const std::string& name);
+extern "C" SEXP _arrow_Table__GetColumnByName(SEXP table_sexp, SEXP name_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ arrow::r::Input<const std::string&>::type name(name_sexp);
+ return cpp11::as_sexp(Table__GetColumnByName(table, name));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__GetColumnByName(SEXP table_sexp, SEXP name_sexp){
+ Rf_error("Cannot call Table__GetColumnByName(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Table> Table__RemoveColumn(const std::shared_ptr<arrow::Table>& table, R_xlen_t i);
+extern "C" SEXP _arrow_Table__RemoveColumn(SEXP table_sexp, SEXP i_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ arrow::r::Input<R_xlen_t>::type i(i_sexp);
+ return cpp11::as_sexp(Table__RemoveColumn(table, i));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__RemoveColumn(SEXP table_sexp, SEXP i_sexp){
+ Rf_error("Cannot call Table__RemoveColumn(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Table> Table__AddColumn(const std::shared_ptr<arrow::Table>& table, R_xlen_t i, const std::shared_ptr<arrow::Field>& field, const std::shared_ptr<arrow::ChunkedArray>& column);
+extern "C" SEXP _arrow_Table__AddColumn(SEXP table_sexp, SEXP i_sexp, SEXP field_sexp, SEXP column_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ arrow::r::Input<R_xlen_t>::type i(i_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Field>&>::type field(field_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::ChunkedArray>&>::type column(column_sexp);
+ return cpp11::as_sexp(Table__AddColumn(table, i, field, column));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__AddColumn(SEXP table_sexp, SEXP i_sexp, SEXP field_sexp, SEXP column_sexp){
+ Rf_error("Cannot call Table__AddColumn(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Table> Table__SetColumn(const std::shared_ptr<arrow::Table>& table, R_xlen_t i, const std::shared_ptr<arrow::Field>& field, const std::shared_ptr<arrow::ChunkedArray>& column);
+extern "C" SEXP _arrow_Table__SetColumn(SEXP table_sexp, SEXP i_sexp, SEXP field_sexp, SEXP column_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ arrow::r::Input<R_xlen_t>::type i(i_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::Field>&>::type field(field_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::ChunkedArray>&>::type column(column_sexp);
+ return cpp11::as_sexp(Table__SetColumn(table, i, field, column));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__SetColumn(SEXP table_sexp, SEXP i_sexp, SEXP field_sexp, SEXP column_sexp){
+ Rf_error("Cannot call Table__SetColumn(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Table> Table__SelectColumns(const std::shared_ptr<arrow::Table>& table, const std::vector<int>& indices);
+extern "C" SEXP _arrow_Table__SelectColumns(SEXP table_sexp, SEXP indices_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ arrow::r::Input<const std::vector<int>&>::type indices(indices_sexp);
+ return cpp11::as_sexp(Table__SelectColumns(table, indices));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__SelectColumns(SEXP table_sexp, SEXP indices_sexp){
+ Rf_error("Cannot call Table__SelectColumns(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool all_record_batches(SEXP lst);
+extern "C" SEXP _arrow_all_record_batches(SEXP lst_sexp){
+BEGIN_CPP11
+ arrow::r::Input<SEXP>::type lst(lst_sexp);
+ return cpp11::as_sexp(all_record_batches(lst));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_all_record_batches(SEXP lst_sexp){
+ Rf_error("Cannot call all_record_batches(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Table> Table__from_record_batches(const std::vector<std::shared_ptr<arrow::RecordBatch>>& batches, SEXP schema_sxp);
+extern "C" SEXP _arrow_Table__from_record_batches(SEXP batches_sexp, SEXP schema_sxp_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::vector<std::shared_ptr<arrow::RecordBatch>>&>::type batches(batches_sexp);
+ arrow::r::Input<SEXP>::type schema_sxp(schema_sxp_sexp);
+ return cpp11::as_sexp(Table__from_record_batches(batches, schema_sxp));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Table__from_record_batches(SEXP batches_sexp, SEXP schema_sxp_sexp){
+ Rf_error("Cannot call Table__from_record_batches(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// threadpool.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int GetCpuThreadPoolCapacity();
+extern "C" SEXP _arrow_GetCpuThreadPoolCapacity(){
+BEGIN_CPP11
+ return cpp11::as_sexp(GetCpuThreadPoolCapacity());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_GetCpuThreadPoolCapacity(){
+ Rf_error("Cannot call GetCpuThreadPoolCapacity(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// threadpool.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void SetCpuThreadPoolCapacity(int threads);
+extern "C" SEXP _arrow_SetCpuThreadPoolCapacity(SEXP threads_sexp){
+BEGIN_CPP11
+ arrow::r::Input<int>::type threads(threads_sexp);
+ SetCpuThreadPoolCapacity(threads);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_SetCpuThreadPoolCapacity(SEXP threads_sexp){
+ Rf_error("Cannot call SetCpuThreadPoolCapacity(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// threadpool.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int GetIOThreadPoolCapacity();
+extern "C" SEXP _arrow_GetIOThreadPoolCapacity(){
+BEGIN_CPP11
+ return cpp11::as_sexp(GetIOThreadPoolCapacity());
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_GetIOThreadPoolCapacity(){
+ Rf_error("Cannot call GetIOThreadPoolCapacity(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// threadpool.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void SetIOThreadPoolCapacity(int threads);
+extern "C" SEXP _arrow_SetIOThreadPoolCapacity(SEXP threads_sexp){
+BEGIN_CPP11
+ arrow::r::Input<int>::type threads(threads_sexp);
+ SetIOThreadPoolCapacity(threads);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_SetIOThreadPoolCapacity(SEXP threads_sexp){
+ Rf_error("Cannot call SetIOThreadPoolCapacity(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
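+// The four thread-pool entry points above surface Arrow's global CPU and IO
+// pools to R (arrow::GetCpuThreadPoolCapacity() and friends in the C++
+// library). A plausible sketch of the wrapped helpers (illustrative, not the
+// verbatim threadpool.cpp source; StopIfNotOk is an assumed Status-to-R-error
+// helper):
+//
+//   int GetCpuThreadPoolCapacity() { return arrow::GetCpuThreadPoolCapacity(); }
+//
+//   void SetCpuThreadPoolCapacity(int threads) {
+//     // arrow::SetCpuThreadPoolCapacity returns a Status
+//     StopIfNotOk(arrow::SetCpuThreadPoolCapacity(threads));
+//   }
+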
+// type_infer.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> Array__infer_type(SEXP x);
+extern "C" SEXP _arrow_Array__infer_type(SEXP x_sexp){
+BEGIN_CPP11
+ arrow::r::Input<SEXP>::type x(x_sexp);
+ return cpp11::as_sexp(Array__infer_type(x));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_Array__infer_type(SEXP x_sexp){
+ Rf_error("Cannot call Array__infer_type(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
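+// The two *__Reset entry points below let the package drop a Table or
+// RecordBatch held by an R6 object without waiting for garbage collection,
+// e.g. once the data has been fully converted to a data.frame. The real
+// helper lives in the package headers; a sketch of the idea, with the field
+// name "pointer" purely hypothetical:
+//
+//   template <typename T>
+//   void r6_reset_pointer(SEXP r6) {
+//     // R6 objects are environments holding an external pointer to a
+//     // heap-allocated std::shared_ptr<T>
+//     SEXP xp = Rf_findVarInFrame(r6, Rf_install("pointer"));
+//     auto ptr = reinterpret_cast<std::shared_ptr<T>*>(R_ExternalPtrAddr(xp));
+//     ptr->reset();  // release the Arrow object now rather than at GC time
+//   }
+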
+#if defined(ARROW_R_WITH_ARROW)
+extern "C" SEXP _arrow_Table__Reset(SEXP r6) {
+BEGIN_CPP11
+arrow::r::r6_reset_pointer<arrow::Table>(r6);
+END_CPP11
+return R_NilValue;
+}
+#else
+extern "C" SEXP _arrow_Table__Reset(SEXP r6){
+ Rf_error("Cannot call Table(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+#if defined(ARROW_R_WITH_ARROW)
+extern "C" SEXP _arrow_RecordBatch__Reset(SEXP r6) {
+BEGIN_CPP11
+arrow::r::r6_reset_pointer<arrow::RecordBatch>(r6);
+END_CPP11
+return R_NilValue;
+}
+#else
+extern "C" SEXP _arrow_RecordBatch__Reset(SEXP r6){
+ Rf_error("Cannot call RecordBatch(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
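+// Build-time feature detection: each helper below folds an ARROW_R_WITH_*
+// compile-time definition into a logical scalar that the R side can query at
+// load time. A new optional component would follow the same shape (the macro
+// name here is hypothetical):
+//
+//   extern "C" SEXP _flight_available() {
+//   return Rf_ScalarLogical(
+//   #if defined(ARROW_R_WITH_FLIGHT)
+//     TRUE
+//   #else
+//     FALSE
+//   #endif
+//   );
+//   }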
+extern "C" SEXP _arrow_available() {
+return Rf_ScalarLogical(
+#if defined(ARROW_R_WITH_ARROW)
+ TRUE
+#else
+ FALSE
+#endif
+);
+}
+extern "C" SEXP _dataset_available() {
+return Rf_ScalarLogical(
+#if defined(ARROW_R_WITH_DATASET)
+ TRUE
+#else
+ FALSE
+#endif
+);
+}
+extern "C" SEXP _parquet_available() {
+return Rf_ScalarLogical(
+#if defined(ARROW_R_WITH_PARQUET)
+ TRUE
+#else
+ FALSE
+#endif
+);
+}
+extern "C" SEXP _s3_available() {
+return Rf_ScalarLogical(
+#if defined(ARROW_R_WITH_S3)
+ TRUE
+#else
+ FALSE
+#endif
+);
+}
+extern "C" SEXP _json_available() {
+return Rf_ScalarLogical(
+#if defined(ARROW_R_WITH_JSON)
+ TRUE
+#else
+ FALSE
+#endif
+);
+}
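+// CallEntries is the registration half of this file: one row per exported
+// symbol, giving R the native function pointer and its argument count for
+// .Call(). The table is handed to R_registerRoutines() in the package init
+// routine, which conventionally looks like this (sketch):
+//
+//   extern "C" void R_init_arrow(DllInfo* dll) {
+//     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
+//     R_useDynamicSymbols(dll, FALSE);
+//   }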
+static const R_CallMethodDef CallEntries[] = {
+ { "_arrow_available", (DL_FUNC)& _arrow_available, 0 },
+ { "_dataset_available", (DL_FUNC)& _dataset_available, 0 },
+ { "_parquet_available", (DL_FUNC)& _parquet_available, 0 },
+ { "_s3_available", (DL_FUNC)& _s3_available, 0 },
+ { "_json_available", (DL_FUNC)& _json_available, 0 },
+ { "_arrow_test_SET_STRING_ELT", (DL_FUNC) &_arrow_test_SET_STRING_ELT, 1},
+ { "_arrow_test_same_Array", (DL_FUNC) &_arrow_test_same_Array, 2},
+ { "_arrow_is_arrow_altrep", (DL_FUNC) &_arrow_is_arrow_altrep, 1},
+ { "_arrow_Array__Slice1", (DL_FUNC) &_arrow_Array__Slice1, 2},
+ { "_arrow_Array__Slice2", (DL_FUNC) &_arrow_Array__Slice2, 3},
+ { "_arrow_Array__IsNull", (DL_FUNC) &_arrow_Array__IsNull, 2},
+ { "_arrow_Array__IsValid", (DL_FUNC) &_arrow_Array__IsValid, 2},
+ { "_arrow_Array__length", (DL_FUNC) &_arrow_Array__length, 1},
+ { "_arrow_Array__offset", (DL_FUNC) &_arrow_Array__offset, 1},
+ { "_arrow_Array__null_count", (DL_FUNC) &_arrow_Array__null_count, 1},
+ { "_arrow_Array__type", (DL_FUNC) &_arrow_Array__type, 1},
+ { "_arrow_Array__ToString", (DL_FUNC) &_arrow_Array__ToString, 1},
+ { "_arrow_Array__type_id", (DL_FUNC) &_arrow_Array__type_id, 1},
+ { "_arrow_Array__Equals", (DL_FUNC) &_arrow_Array__Equals, 2},
+ { "_arrow_Array__ApproxEquals", (DL_FUNC) &_arrow_Array__ApproxEquals, 2},
+ { "_arrow_Array__Diff", (DL_FUNC) &_arrow_Array__Diff, 2},
+ { "_arrow_Array__data", (DL_FUNC) &_arrow_Array__data, 1},
+ { "_arrow_Array__RangeEquals", (DL_FUNC) &_arrow_Array__RangeEquals, 5},
+ { "_arrow_Array__View", (DL_FUNC) &_arrow_Array__View, 2},
+ { "_arrow_Array__Validate", (DL_FUNC) &_arrow_Array__Validate, 1},
+ { "_arrow_DictionaryArray__indices", (DL_FUNC) &_arrow_DictionaryArray__indices, 1},
+ { "_arrow_DictionaryArray__dictionary", (DL_FUNC) &_arrow_DictionaryArray__dictionary, 1},
+ { "_arrow_StructArray__field", (DL_FUNC) &_arrow_StructArray__field, 2},
+ { "_arrow_StructArray__GetFieldByName", (DL_FUNC) &_arrow_StructArray__GetFieldByName, 2},
+ { "_arrow_StructArray__Flatten", (DL_FUNC) &_arrow_StructArray__Flatten, 1},
+ { "_arrow_ListArray__value_type", (DL_FUNC) &_arrow_ListArray__value_type, 1},
+ { "_arrow_LargeListArray__value_type", (DL_FUNC) &_arrow_LargeListArray__value_type, 1},
+ { "_arrow_ListArray__values", (DL_FUNC) &_arrow_ListArray__values, 1},
+ { "_arrow_LargeListArray__values", (DL_FUNC) &_arrow_LargeListArray__values, 1},
+ { "_arrow_ListArray__value_length", (DL_FUNC) &_arrow_ListArray__value_length, 2},
+ { "_arrow_LargeListArray__value_length", (DL_FUNC) &_arrow_LargeListArray__value_length, 2},
+ { "_arrow_FixedSizeListArray__value_length", (DL_FUNC) &_arrow_FixedSizeListArray__value_length, 2},
+ { "_arrow_ListArray__value_offset", (DL_FUNC) &_arrow_ListArray__value_offset, 2},
+ { "_arrow_LargeListArray__value_offset", (DL_FUNC) &_arrow_LargeListArray__value_offset, 2},
+ { "_arrow_FixedSizeListArray__value_offset", (DL_FUNC) &_arrow_FixedSizeListArray__value_offset, 2},
+ { "_arrow_ListArray__raw_value_offsets", (DL_FUNC) &_arrow_ListArray__raw_value_offsets, 1},
+ { "_arrow_LargeListArray__raw_value_offsets", (DL_FUNC) &_arrow_LargeListArray__raw_value_offsets, 1},
+ { "_arrow_Array__as_vector", (DL_FUNC) &_arrow_Array__as_vector, 1},
+ { "_arrow_ChunkedArray__as_vector", (DL_FUNC) &_arrow_ChunkedArray__as_vector, 2},
+ { "_arrow_RecordBatch__to_dataframe", (DL_FUNC) &_arrow_RecordBatch__to_dataframe, 2},
+ { "_arrow_Table__to_dataframe", (DL_FUNC) &_arrow_Table__to_dataframe, 2},
+ { "_arrow_ArrayData__get_type", (DL_FUNC) &_arrow_ArrayData__get_type, 1},
+ { "_arrow_ArrayData__get_length", (DL_FUNC) &_arrow_ArrayData__get_length, 1},
+ { "_arrow_ArrayData__get_null_count", (DL_FUNC) &_arrow_ArrayData__get_null_count, 1},
+ { "_arrow_ArrayData__get_offset", (DL_FUNC) &_arrow_ArrayData__get_offset, 1},
+ { "_arrow_ArrayData__buffers", (DL_FUNC) &_arrow_ArrayData__buffers, 1},
+ { "_arrow_Buffer__is_mutable", (DL_FUNC) &_arrow_Buffer__is_mutable, 1},
+ { "_arrow_Buffer__ZeroPadding", (DL_FUNC) &_arrow_Buffer__ZeroPadding, 1},
+ { "_arrow_Buffer__capacity", (DL_FUNC) &_arrow_Buffer__capacity, 1},
+ { "_arrow_Buffer__size", (DL_FUNC) &_arrow_Buffer__size, 1},
+ { "_arrow_r___RBuffer__initialize", (DL_FUNC) &_arrow_r___RBuffer__initialize, 1},
+ { "_arrow_Buffer__data", (DL_FUNC) &_arrow_Buffer__data, 1},
+ { "_arrow_Buffer__Equals", (DL_FUNC) &_arrow_Buffer__Equals, 2},
+ { "_arrow_ChunkedArray__length", (DL_FUNC) &_arrow_ChunkedArray__length, 1},
+ { "_arrow_ChunkedArray__null_count", (DL_FUNC) &_arrow_ChunkedArray__null_count, 1},
+ { "_arrow_ChunkedArray__num_chunks", (DL_FUNC) &_arrow_ChunkedArray__num_chunks, 1},
+ { "_arrow_ChunkedArray__chunk", (DL_FUNC) &_arrow_ChunkedArray__chunk, 2},
+ { "_arrow_ChunkedArray__chunks", (DL_FUNC) &_arrow_ChunkedArray__chunks, 1},
+ { "_arrow_ChunkedArray__type", (DL_FUNC) &_arrow_ChunkedArray__type, 1},
+ { "_arrow_ChunkedArray__Slice1", (DL_FUNC) &_arrow_ChunkedArray__Slice1, 2},
+ { "_arrow_ChunkedArray__Slice2", (DL_FUNC) &_arrow_ChunkedArray__Slice2, 3},
+ { "_arrow_ChunkedArray__View", (DL_FUNC) &_arrow_ChunkedArray__View, 2},
+ { "_arrow_ChunkedArray__Validate", (DL_FUNC) &_arrow_ChunkedArray__Validate, 1},
+ { "_arrow_ChunkedArray__Equals", (DL_FUNC) &_arrow_ChunkedArray__Equals, 2},
+ { "_arrow_ChunkedArray__ToString", (DL_FUNC) &_arrow_ChunkedArray__ToString, 1},
+ { "_arrow_ChunkedArray__from_list", (DL_FUNC) &_arrow_ChunkedArray__from_list, 2},
+ { "_arrow_util___Codec__Create", (DL_FUNC) &_arrow_util___Codec__Create, 2},
+ { "_arrow_util___Codec__name", (DL_FUNC) &_arrow_util___Codec__name, 1},
+ { "_arrow_util___Codec__IsAvailable", (DL_FUNC) &_arrow_util___Codec__IsAvailable, 1},
+ { "_arrow_io___CompressedOutputStream__Make", (DL_FUNC) &_arrow_io___CompressedOutputStream__Make, 2},
+ { "_arrow_io___CompressedInputStream__Make", (DL_FUNC) &_arrow_io___CompressedInputStream__Make, 2},
+ { "_arrow_ExecPlan_create", (DL_FUNC) &_arrow_ExecPlan_create, 1},
+ { "_arrow_ExecPlan_run", (DL_FUNC) &_arrow_ExecPlan_run, 4},
+ { "_arrow_ExecPlan_StopProducing", (DL_FUNC) &_arrow_ExecPlan_StopProducing, 1},
+ { "_arrow_ExecNode_output_schema", (DL_FUNC) &_arrow_ExecNode_output_schema, 1},
+ { "_arrow_ExecNode_Scan", (DL_FUNC) &_arrow_ExecNode_Scan, 4},
+ { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2},
+ { "_arrow_ExecNode_Project", (DL_FUNC) &_arrow_ExecNode_Project, 3},
+ { "_arrow_ExecNode_Aggregate", (DL_FUNC) &_arrow_ExecNode_Aggregate, 5},
+ { "_arrow_ExecNode_Join", (DL_FUNC) &_arrow_ExecNode_Join, 7},
+ { "_arrow_ExecNode_ReadFromRecordBatchReader", (DL_FUNC) &_arrow_ExecNode_ReadFromRecordBatchReader, 2},
+ { "_arrow_RecordBatch__cast", (DL_FUNC) &_arrow_RecordBatch__cast, 3},
+ { "_arrow_Table__cast", (DL_FUNC) &_arrow_Table__cast, 3},
+ { "_arrow_compute__CallFunction", (DL_FUNC) &_arrow_compute__CallFunction, 3},
+ { "_arrow_compute__GetFunctionNames", (DL_FUNC) &_arrow_compute__GetFunctionNames, 0},
+ { "_arrow_build_info", (DL_FUNC) &_arrow_build_info, 0},
+ { "_arrow_runtime_info", (DL_FUNC) &_arrow_runtime_info, 0},
+ { "_arrow_csv___WriteOptions__initialize", (DL_FUNC) &_arrow_csv___WriteOptions__initialize, 1},
+ { "_arrow_csv___ReadOptions__initialize", (DL_FUNC) &_arrow_csv___ReadOptions__initialize, 1},
+ { "_arrow_csv___ParseOptions__initialize", (DL_FUNC) &_arrow_csv___ParseOptions__initialize, 1},
+ { "_arrow_csv___ReadOptions__column_names", (DL_FUNC) &_arrow_csv___ReadOptions__column_names, 1},
+ { "_arrow_csv___ConvertOptions__initialize", (DL_FUNC) &_arrow_csv___ConvertOptions__initialize, 1},
+ { "_arrow_csv___TableReader__Make", (DL_FUNC) &_arrow_csv___TableReader__Make, 4},
+ { "_arrow_csv___TableReader__Read", (DL_FUNC) &_arrow_csv___TableReader__Read, 1},
+ { "_arrow_TimestampParser__kind", (DL_FUNC) &_arrow_TimestampParser__kind, 1},
+ { "_arrow_TimestampParser__format", (DL_FUNC) &_arrow_TimestampParser__format, 1},
+ { "_arrow_TimestampParser__MakeStrptime", (DL_FUNC) &_arrow_TimestampParser__MakeStrptime, 1},
+ { "_arrow_TimestampParser__MakeISO8601", (DL_FUNC) &_arrow_TimestampParser__MakeISO8601, 0},
+ { "_arrow_csv___WriteCSV__Table", (DL_FUNC) &_arrow_csv___WriteCSV__Table, 3},
+ { "_arrow_csv___WriteCSV__RecordBatch", (DL_FUNC) &_arrow_csv___WriteCSV__RecordBatch, 3},
+ { "_arrow_dataset___Dataset__NewScan", (DL_FUNC) &_arrow_dataset___Dataset__NewScan, 1},
+ { "_arrow_dataset___Dataset__schema", (DL_FUNC) &_arrow_dataset___Dataset__schema, 1},
+ { "_arrow_dataset___Dataset__type_name", (DL_FUNC) &_arrow_dataset___Dataset__type_name, 1},
+ { "_arrow_dataset___Dataset__ReplaceSchema", (DL_FUNC) &_arrow_dataset___Dataset__ReplaceSchema, 2},
+ { "_arrow_dataset___UnionDataset__create", (DL_FUNC) &_arrow_dataset___UnionDataset__create, 2},
+ { "_arrow_dataset___InMemoryDataset__create", (DL_FUNC) &_arrow_dataset___InMemoryDataset__create, 1},
+ { "_arrow_dataset___UnionDataset__children", (DL_FUNC) &_arrow_dataset___UnionDataset__children, 1},
+ { "_arrow_dataset___FileSystemDataset__format", (DL_FUNC) &_arrow_dataset___FileSystemDataset__format, 1},
+ { "_arrow_dataset___FileSystemDataset__filesystem", (DL_FUNC) &_arrow_dataset___FileSystemDataset__filesystem, 1},
+ { "_arrow_dataset___FileSystemDataset__files", (DL_FUNC) &_arrow_dataset___FileSystemDataset__files, 1},
+ { "_arrow_dataset___DatasetFactory__Finish1", (DL_FUNC) &_arrow_dataset___DatasetFactory__Finish1, 2},
+ { "_arrow_dataset___DatasetFactory__Finish2", (DL_FUNC) &_arrow_dataset___DatasetFactory__Finish2, 2},
+ { "_arrow_dataset___DatasetFactory__Inspect", (DL_FUNC) &_arrow_dataset___DatasetFactory__Inspect, 2},
+ { "_arrow_dataset___UnionDatasetFactory__Make", (DL_FUNC) &_arrow_dataset___UnionDatasetFactory__Make, 1},
+ { "_arrow_dataset___FileSystemDatasetFactory__Make0", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make0, 3},
+ { "_arrow_dataset___FileSystemDatasetFactory__Make2", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make2, 4},
+ { "_arrow_dataset___FileSystemDatasetFactory__Make1", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make1, 3},
+ { "_arrow_dataset___FileSystemDatasetFactory__Make3", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make3, 4},
+ { "_arrow_dataset___FileFormat__type_name", (DL_FUNC) &_arrow_dataset___FileFormat__type_name, 1},
+ { "_arrow_dataset___FileFormat__DefaultWriteOptions", (DL_FUNC) &_arrow_dataset___FileFormat__DefaultWriteOptions, 1},
+ { "_arrow_dataset___ParquetFileFormat__Make", (DL_FUNC) &_arrow_dataset___ParquetFileFormat__Make, 2},
+ { "_arrow_dataset___FileWriteOptions__type_name", (DL_FUNC) &_arrow_dataset___FileWriteOptions__type_name, 1},
+ { "_arrow_dataset___ParquetFileWriteOptions__update", (DL_FUNC) &_arrow_dataset___ParquetFileWriteOptions__update, 3},
+ { "_arrow_dataset___IpcFileWriteOptions__update2", (DL_FUNC) &_arrow_dataset___IpcFileWriteOptions__update2, 4},
+ { "_arrow_dataset___IpcFileWriteOptions__update1", (DL_FUNC) &_arrow_dataset___IpcFileWriteOptions__update1, 3},
+ { "_arrow_dataset___CsvFileWriteOptions__update", (DL_FUNC) &_arrow_dataset___CsvFileWriteOptions__update, 2},
+ { "_arrow_dataset___IpcFileFormat__Make", (DL_FUNC) &_arrow_dataset___IpcFileFormat__Make, 0},
+ { "_arrow_dataset___CsvFileFormat__Make", (DL_FUNC) &_arrow_dataset___CsvFileFormat__Make, 3},
+ { "_arrow_dataset___FragmentScanOptions__type_name", (DL_FUNC) &_arrow_dataset___FragmentScanOptions__type_name, 1},
+ { "_arrow_dataset___CsvFragmentScanOptions__Make", (DL_FUNC) &_arrow_dataset___CsvFragmentScanOptions__Make, 2},
+ { "_arrow_dataset___ParquetFragmentScanOptions__Make", (DL_FUNC) &_arrow_dataset___ParquetFragmentScanOptions__Make, 3},
+ { "_arrow_dataset___DirectoryPartitioning", (DL_FUNC) &_arrow_dataset___DirectoryPartitioning, 2},
+ { "_arrow_dataset___DirectoryPartitioning__MakeFactory", (DL_FUNC) &_arrow_dataset___DirectoryPartitioning__MakeFactory, 2},
+ { "_arrow_dataset___HivePartitioning", (DL_FUNC) &_arrow_dataset___HivePartitioning, 3},
+ { "_arrow_dataset___HivePartitioning__MakeFactory", (DL_FUNC) &_arrow_dataset___HivePartitioning__MakeFactory, 2},
+ { "_arrow_dataset___ScannerBuilder__ProjectNames", (DL_FUNC) &_arrow_dataset___ScannerBuilder__ProjectNames, 2},
+ { "_arrow_dataset___ScannerBuilder__ProjectExprs", (DL_FUNC) &_arrow_dataset___ScannerBuilder__ProjectExprs, 3},
+ { "_arrow_dataset___ScannerBuilder__Filter", (DL_FUNC) &_arrow_dataset___ScannerBuilder__Filter, 2},
+ { "_arrow_dataset___ScannerBuilder__UseThreads", (DL_FUNC) &_arrow_dataset___ScannerBuilder__UseThreads, 2},
+ { "_arrow_dataset___ScannerBuilder__UseAsync", (DL_FUNC) &_arrow_dataset___ScannerBuilder__UseAsync, 2},
+ { "_arrow_dataset___ScannerBuilder__BatchSize", (DL_FUNC) &_arrow_dataset___ScannerBuilder__BatchSize, 2},
+ { "_arrow_dataset___ScannerBuilder__FragmentScanOptions", (DL_FUNC) &_arrow_dataset___ScannerBuilder__FragmentScanOptions, 2},
+ { "_arrow_dataset___ScannerBuilder__schema", (DL_FUNC) &_arrow_dataset___ScannerBuilder__schema, 1},
+ { "_arrow_dataset___ScannerBuilder__Finish", (DL_FUNC) &_arrow_dataset___ScannerBuilder__Finish, 1},
+ { "_arrow_dataset___ScannerBuilder__FromRecordBatchReader", (DL_FUNC) &_arrow_dataset___ScannerBuilder__FromRecordBatchReader, 1},
+ { "_arrow_dataset___Scanner__ToTable", (DL_FUNC) &_arrow_dataset___Scanner__ToTable, 1},
+ { "_arrow_dataset___Scanner__ScanBatches", (DL_FUNC) &_arrow_dataset___Scanner__ScanBatches, 1},
+ { "_arrow_dataset___Scanner__ToRecordBatchReader", (DL_FUNC) &_arrow_dataset___Scanner__ToRecordBatchReader, 1},
+ { "_arrow_dataset___Scanner__head", (DL_FUNC) &_arrow_dataset___Scanner__head, 2},
+ { "_arrow_dataset___Scanner__schema", (DL_FUNC) &_arrow_dataset___Scanner__schema, 1},
+ { "_arrow_dataset___ScanTask__get_batches", (DL_FUNC) &_arrow_dataset___ScanTask__get_batches, 1},
+ { "_arrow_dataset___Dataset__Write", (DL_FUNC) &_arrow_dataset___Dataset__Write, 7},
+ { "_arrow_dataset___Scanner__TakeRows", (DL_FUNC) &_arrow_dataset___Scanner__TakeRows, 2},
+ { "_arrow_dataset___Scanner__CountRows", (DL_FUNC) &_arrow_dataset___Scanner__CountRows, 1},
+ { "_arrow_Int8__initialize", (DL_FUNC) &_arrow_Int8__initialize, 0},
+ { "_arrow_Int16__initialize", (DL_FUNC) &_arrow_Int16__initialize, 0},
+ { "_arrow_Int32__initialize", (DL_FUNC) &_arrow_Int32__initialize, 0},
+ { "_arrow_Int64__initialize", (DL_FUNC) &_arrow_Int64__initialize, 0},
+ { "_arrow_UInt8__initialize", (DL_FUNC) &_arrow_UInt8__initialize, 0},
+ { "_arrow_UInt16__initialize", (DL_FUNC) &_arrow_UInt16__initialize, 0},
+ { "_arrow_UInt32__initialize", (DL_FUNC) &_arrow_UInt32__initialize, 0},
+ { "_arrow_UInt64__initialize", (DL_FUNC) &_arrow_UInt64__initialize, 0},
+ { "_arrow_Float16__initialize", (DL_FUNC) &_arrow_Float16__initialize, 0},
+ { "_arrow_Float32__initialize", (DL_FUNC) &_arrow_Float32__initialize, 0},
+ { "_arrow_Float64__initialize", (DL_FUNC) &_arrow_Float64__initialize, 0},
+ { "_arrow_Boolean__initialize", (DL_FUNC) &_arrow_Boolean__initialize, 0},
+ { "_arrow_Utf8__initialize", (DL_FUNC) &_arrow_Utf8__initialize, 0},
+ { "_arrow_LargeUtf8__initialize", (DL_FUNC) &_arrow_LargeUtf8__initialize, 0},
+ { "_arrow_Binary__initialize", (DL_FUNC) &_arrow_Binary__initialize, 0},
+ { "_arrow_LargeBinary__initialize", (DL_FUNC) &_arrow_LargeBinary__initialize, 0},
+ { "_arrow_Date32__initialize", (DL_FUNC) &_arrow_Date32__initialize, 0},
+ { "_arrow_Date64__initialize", (DL_FUNC) &_arrow_Date64__initialize, 0},
+ { "_arrow_Null__initialize", (DL_FUNC) &_arrow_Null__initialize, 0},
+ { "_arrow_Decimal128Type__initialize", (DL_FUNC) &_arrow_Decimal128Type__initialize, 2},
+ { "_arrow_FixedSizeBinary__initialize", (DL_FUNC) &_arrow_FixedSizeBinary__initialize, 1},
+ { "_arrow_Timestamp__initialize", (DL_FUNC) &_arrow_Timestamp__initialize, 2},
+ { "_arrow_Time32__initialize", (DL_FUNC) &_arrow_Time32__initialize, 1},
+ { "_arrow_Time64__initialize", (DL_FUNC) &_arrow_Time64__initialize, 1},
+ { "_arrow_list__", (DL_FUNC) &_arrow_list__, 1},
+ { "_arrow_large_list__", (DL_FUNC) &_arrow_large_list__, 1},
+ { "_arrow_fixed_size_list__", (DL_FUNC) &_arrow_fixed_size_list__, 2},
+ { "_arrow_struct__", (DL_FUNC) &_arrow_struct__, 1},
+ { "_arrow_DataType__ToString", (DL_FUNC) &_arrow_DataType__ToString, 1},
+ { "_arrow_DataType__name", (DL_FUNC) &_arrow_DataType__name, 1},
+ { "_arrow_DataType__Equals", (DL_FUNC) &_arrow_DataType__Equals, 2},
+ { "_arrow_DataType__num_fields", (DL_FUNC) &_arrow_DataType__num_fields, 1},
+ { "_arrow_DataType__fields", (DL_FUNC) &_arrow_DataType__fields, 1},
+ { "_arrow_DataType__id", (DL_FUNC) &_arrow_DataType__id, 1},
+ { "_arrow_ListType__ToString", (DL_FUNC) &_arrow_ListType__ToString, 1},
+ { "_arrow_FixedWidthType__bit_width", (DL_FUNC) &_arrow_FixedWidthType__bit_width, 1},
+ { "_arrow_DateType__unit", (DL_FUNC) &_arrow_DateType__unit, 1},
+ { "_arrow_TimeType__unit", (DL_FUNC) &_arrow_TimeType__unit, 1},
+ { "_arrow_DecimalType__precision", (DL_FUNC) &_arrow_DecimalType__precision, 1},
+ { "_arrow_DecimalType__scale", (DL_FUNC) &_arrow_DecimalType__scale, 1},
+ { "_arrow_TimestampType__timezone", (DL_FUNC) &_arrow_TimestampType__timezone, 1},
+ { "_arrow_TimestampType__unit", (DL_FUNC) &_arrow_TimestampType__unit, 1},
+ { "_arrow_DictionaryType__initialize", (DL_FUNC) &_arrow_DictionaryType__initialize, 3},
+ { "_arrow_DictionaryType__index_type", (DL_FUNC) &_arrow_DictionaryType__index_type, 1},
+ { "_arrow_DictionaryType__value_type", (DL_FUNC) &_arrow_DictionaryType__value_type, 1},
+ { "_arrow_DictionaryType__name", (DL_FUNC) &_arrow_DictionaryType__name, 1},
+ { "_arrow_DictionaryType__ordered", (DL_FUNC) &_arrow_DictionaryType__ordered, 1},
+ { "_arrow_StructType__GetFieldByName", (DL_FUNC) &_arrow_StructType__GetFieldByName, 2},
+ { "_arrow_StructType__GetFieldIndex", (DL_FUNC) &_arrow_StructType__GetFieldIndex, 2},
+ { "_arrow_StructType__field_names", (DL_FUNC) &_arrow_StructType__field_names, 1},
+ { "_arrow_ListType__value_field", (DL_FUNC) &_arrow_ListType__value_field, 1},
+ { "_arrow_ListType__value_type", (DL_FUNC) &_arrow_ListType__value_type, 1},
+ { "_arrow_LargeListType__value_field", (DL_FUNC) &_arrow_LargeListType__value_field, 1},
+ { "_arrow_LargeListType__value_type", (DL_FUNC) &_arrow_LargeListType__value_type, 1},
+ { "_arrow_FixedSizeListType__value_field", (DL_FUNC) &_arrow_FixedSizeListType__value_field, 1},
+ { "_arrow_FixedSizeListType__value_type", (DL_FUNC) &_arrow_FixedSizeListType__value_type, 1},
+ { "_arrow_FixedSizeListType__list_size", (DL_FUNC) &_arrow_FixedSizeListType__list_size, 1},
+ { "_arrow_compute___expr__equals", (DL_FUNC) &_arrow_compute___expr__equals, 2},
+ { "_arrow_compute___expr__call", (DL_FUNC) &_arrow_compute___expr__call, 3},
+ { "_arrow_field_names_in_expression", (DL_FUNC) &_arrow_field_names_in_expression, 1},
+ { "_arrow_compute___expr__get_field_ref_name", (DL_FUNC) &_arrow_compute___expr__get_field_ref_name, 1},
+ { "_arrow_compute___expr__field_ref", (DL_FUNC) &_arrow_compute___expr__field_ref, 1},
+ { "_arrow_compute___expr__scalar", (DL_FUNC) &_arrow_compute___expr__scalar, 1},
+ { "_arrow_compute___expr__ToString", (DL_FUNC) &_arrow_compute___expr__ToString, 1},
+ { "_arrow_compute___expr__type", (DL_FUNC) &_arrow_compute___expr__type, 2},
+ { "_arrow_compute___expr__type_id", (DL_FUNC) &_arrow_compute___expr__type_id, 2},
+ { "_arrow_ipc___WriteFeather__Table", (DL_FUNC) &_arrow_ipc___WriteFeather__Table, 6},
+ { "_arrow_ipc___feather___Reader__version", (DL_FUNC) &_arrow_ipc___feather___Reader__version, 1},
+ { "_arrow_ipc___feather___Reader__Read", (DL_FUNC) &_arrow_ipc___feather___Reader__Read, 2},
+ { "_arrow_ipc___feather___Reader__Open", (DL_FUNC) &_arrow_ipc___feather___Reader__Open, 1},
+ { "_arrow_ipc___feather___Reader__schema", (DL_FUNC) &_arrow_ipc___feather___Reader__schema, 1},
+ { "_arrow_Field__initialize", (DL_FUNC) &_arrow_Field__initialize, 3},
+ { "_arrow_Field__ToString", (DL_FUNC) &_arrow_Field__ToString, 1},
+ { "_arrow_Field__name", (DL_FUNC) &_arrow_Field__name, 1},
+ { "_arrow_Field__Equals", (DL_FUNC) &_arrow_Field__Equals, 2},
+ { "_arrow_Field__nullable", (DL_FUNC) &_arrow_Field__nullable, 1},
+ { "_arrow_Field__type", (DL_FUNC) &_arrow_Field__type, 1},
+ { "_arrow_fs___FileInfo__type", (DL_FUNC) &_arrow_fs___FileInfo__type, 1},
+ { "_arrow_fs___FileInfo__set_type", (DL_FUNC) &_arrow_fs___FileInfo__set_type, 2},
+ { "_arrow_fs___FileInfo__path", (DL_FUNC) &_arrow_fs___FileInfo__path, 1},
+ { "_arrow_fs___FileInfo__set_path", (DL_FUNC) &_arrow_fs___FileInfo__set_path, 2},
+ { "_arrow_fs___FileInfo__size", (DL_FUNC) &_arrow_fs___FileInfo__size, 1},
+ { "_arrow_fs___FileInfo__set_size", (DL_FUNC) &_arrow_fs___FileInfo__set_size, 2},
+ { "_arrow_fs___FileInfo__base_name", (DL_FUNC) &_arrow_fs___FileInfo__base_name, 1},
+ { "_arrow_fs___FileInfo__extension", (DL_FUNC) &_arrow_fs___FileInfo__extension, 1},
+ { "_arrow_fs___FileInfo__mtime", (DL_FUNC) &_arrow_fs___FileInfo__mtime, 1},
+ { "_arrow_fs___FileInfo__set_mtime", (DL_FUNC) &_arrow_fs___FileInfo__set_mtime, 2},
+ { "_arrow_fs___FileSelector__base_dir", (DL_FUNC) &_arrow_fs___FileSelector__base_dir, 1},
+ { "_arrow_fs___FileSelector__allow_not_found", (DL_FUNC) &_arrow_fs___FileSelector__allow_not_found, 1},
+ { "_arrow_fs___FileSelector__recursive", (DL_FUNC) &_arrow_fs___FileSelector__recursive, 1},
+ { "_arrow_fs___FileSelector__create", (DL_FUNC) &_arrow_fs___FileSelector__create, 3},
+ { "_arrow_fs___FileSystem__GetTargetInfos_Paths", (DL_FUNC) &_arrow_fs___FileSystem__GetTargetInfos_Paths, 2},
+ { "_arrow_fs___FileSystem__GetTargetInfos_FileSelector", (DL_FUNC) &_arrow_fs___FileSystem__GetTargetInfos_FileSelector, 2},
+ { "_arrow_fs___FileSystem__CreateDir", (DL_FUNC) &_arrow_fs___FileSystem__CreateDir, 3},
+ { "_arrow_fs___FileSystem__DeleteDir", (DL_FUNC) &_arrow_fs___FileSystem__DeleteDir, 2},
+ { "_arrow_fs___FileSystem__DeleteDirContents", (DL_FUNC) &_arrow_fs___FileSystem__DeleteDirContents, 2},
+ { "_arrow_fs___FileSystem__DeleteFile", (DL_FUNC) &_arrow_fs___FileSystem__DeleteFile, 2},
+ { "_arrow_fs___FileSystem__DeleteFiles", (DL_FUNC) &_arrow_fs___FileSystem__DeleteFiles, 2},
+ { "_arrow_fs___FileSystem__Move", (DL_FUNC) &_arrow_fs___FileSystem__Move, 3},
+ { "_arrow_fs___FileSystem__CopyFile", (DL_FUNC) &_arrow_fs___FileSystem__CopyFile, 3},
+ { "_arrow_fs___FileSystem__OpenInputStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenInputStream, 2},
+ { "_arrow_fs___FileSystem__OpenInputFile", (DL_FUNC) &_arrow_fs___FileSystem__OpenInputFile, 2},
+ { "_arrow_fs___FileSystem__OpenOutputStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenOutputStream, 2},
+ { "_arrow_fs___FileSystem__OpenAppendStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenAppendStream, 2},
+ { "_arrow_fs___FileSystem__type_name", (DL_FUNC) &_arrow_fs___FileSystem__type_name, 1},
+ { "_arrow_fs___LocalFileSystem__create", (DL_FUNC) &_arrow_fs___LocalFileSystem__create, 0},
+ { "_arrow_fs___SubTreeFileSystem__create", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__create, 2},
+ { "_arrow_fs___SubTreeFileSystem__base_fs", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__base_fs, 1},
+ { "_arrow_fs___SubTreeFileSystem__base_path", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__base_path, 1},
+ { "_arrow_fs___FileSystemFromUri", (DL_FUNC) &_arrow_fs___FileSystemFromUri, 1},
+ { "_arrow_fs___CopyFiles", (DL_FUNC) &_arrow_fs___CopyFiles, 6},
+ { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 12},
+ { "_arrow_fs___S3FileSystem__region", (DL_FUNC) &_arrow_fs___S3FileSystem__region, 1},
+ { "_arrow_io___Readable__Read", (DL_FUNC) &_arrow_io___Readable__Read, 2},
+ { "_arrow_io___InputStream__Close", (DL_FUNC) &_arrow_io___InputStream__Close, 1},
+ { "_arrow_io___OutputStream__Close", (DL_FUNC) &_arrow_io___OutputStream__Close, 1},
+ { "_arrow_io___RandomAccessFile__GetSize", (DL_FUNC) &_arrow_io___RandomAccessFile__GetSize, 1},
+ { "_arrow_io___RandomAccessFile__supports_zero_copy", (DL_FUNC) &_arrow_io___RandomAccessFile__supports_zero_copy, 1},
+ { "_arrow_io___RandomAccessFile__Seek", (DL_FUNC) &_arrow_io___RandomAccessFile__Seek, 2},
+ { "_arrow_io___RandomAccessFile__Tell", (DL_FUNC) &_arrow_io___RandomAccessFile__Tell, 1},
+ { "_arrow_io___RandomAccessFile__Read0", (DL_FUNC) &_arrow_io___RandomAccessFile__Read0, 1},
+ { "_arrow_io___RandomAccessFile__ReadAt", (DL_FUNC) &_arrow_io___RandomAccessFile__ReadAt, 3},
+ { "_arrow_io___MemoryMappedFile__Create", (DL_FUNC) &_arrow_io___MemoryMappedFile__Create, 2},
+ { "_arrow_io___MemoryMappedFile__Open", (DL_FUNC) &_arrow_io___MemoryMappedFile__Open, 2},
+ { "_arrow_io___MemoryMappedFile__Resize", (DL_FUNC) &_arrow_io___MemoryMappedFile__Resize, 2},
+ { "_arrow_io___ReadableFile__Open", (DL_FUNC) &_arrow_io___ReadableFile__Open, 1},
+ { "_arrow_io___BufferReader__initialize", (DL_FUNC) &_arrow_io___BufferReader__initialize, 1},
+ { "_arrow_io___Writable__write", (DL_FUNC) &_arrow_io___Writable__write, 2},
+ { "_arrow_io___OutputStream__Tell", (DL_FUNC) &_arrow_io___OutputStream__Tell, 1},
+ { "_arrow_io___FileOutputStream__Open", (DL_FUNC) &_arrow_io___FileOutputStream__Open, 1},
+ { "_arrow_io___BufferOutputStream__Create", (DL_FUNC) &_arrow_io___BufferOutputStream__Create, 1},
+ { "_arrow_io___BufferOutputStream__capacity", (DL_FUNC) &_arrow_io___BufferOutputStream__capacity, 1},
+ { "_arrow_io___BufferOutputStream__Finish", (DL_FUNC) &_arrow_io___BufferOutputStream__Finish, 1},
+ { "_arrow_io___BufferOutputStream__Tell", (DL_FUNC) &_arrow_io___BufferOutputStream__Tell, 1},
+ { "_arrow_io___BufferOutputStream__Write", (DL_FUNC) &_arrow_io___BufferOutputStream__Write, 2},
+ { "_arrow_json___ReadOptions__initialize", (DL_FUNC) &_arrow_json___ReadOptions__initialize, 2},
+ { "_arrow_json___ParseOptions__initialize1", (DL_FUNC) &_arrow_json___ParseOptions__initialize1, 1},
+ { "_arrow_json___ParseOptions__initialize2", (DL_FUNC) &_arrow_json___ParseOptions__initialize2, 2},
+ { "_arrow_json___TableReader__Make", (DL_FUNC) &_arrow_json___TableReader__Make, 3},
+ { "_arrow_json___TableReader__Read", (DL_FUNC) &_arrow_json___TableReader__Read, 1},
+ { "_arrow_MemoryPool__default", (DL_FUNC) &_arrow_MemoryPool__default, 0},
+ { "_arrow_MemoryPool__bytes_allocated", (DL_FUNC) &_arrow_MemoryPool__bytes_allocated, 1},
+ { "_arrow_MemoryPool__max_memory", (DL_FUNC) &_arrow_MemoryPool__max_memory, 1},
+ { "_arrow_MemoryPool__backend_name", (DL_FUNC) &_arrow_MemoryPool__backend_name, 1},
+ { "_arrow_supported_memory_backends", (DL_FUNC) &_arrow_supported_memory_backends, 0},
+ { "_arrow_ipc___Message__body_length", (DL_FUNC) &_arrow_ipc___Message__body_length, 1},
+ { "_arrow_ipc___Message__metadata", (DL_FUNC) &_arrow_ipc___Message__metadata, 1},
+ { "_arrow_ipc___Message__body", (DL_FUNC) &_arrow_ipc___Message__body, 1},
+ { "_arrow_ipc___Message__Verify", (DL_FUNC) &_arrow_ipc___Message__Verify, 1},
+ { "_arrow_ipc___Message__type", (DL_FUNC) &_arrow_ipc___Message__type, 1},
+ { "_arrow_ipc___Message__Equals", (DL_FUNC) &_arrow_ipc___Message__Equals, 2},
+ { "_arrow_ipc___ReadRecordBatch__Message__Schema", (DL_FUNC) &_arrow_ipc___ReadRecordBatch__Message__Schema, 2},
+ { "_arrow_ipc___ReadSchema_InputStream", (DL_FUNC) &_arrow_ipc___ReadSchema_InputStream, 1},
+ { "_arrow_ipc___ReadSchema_Message", (DL_FUNC) &_arrow_ipc___ReadSchema_Message, 1},
+ { "_arrow_ipc___MessageReader__Open", (DL_FUNC) &_arrow_ipc___MessageReader__Open, 1},
+ { "_arrow_ipc___MessageReader__ReadNextMessage", (DL_FUNC) &_arrow_ipc___MessageReader__ReadNextMessage, 1},
+ { "_arrow_ipc___ReadMessage", (DL_FUNC) &_arrow_ipc___ReadMessage, 1},
+ { "_arrow_parquet___arrow___ArrowReaderProperties__Make", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__Make, 1},
+ { "_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads, 2},
+ { "_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads, 2},
+ { "_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary, 2},
+ { "_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary, 3},
+ { "_arrow_parquet___arrow___FileReader__OpenFile", (DL_FUNC) &_arrow_parquet___arrow___FileReader__OpenFile, 2},
+ { "_arrow_parquet___arrow___FileReader__ReadTable1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable1, 1},
+ { "_arrow_parquet___arrow___FileReader__ReadTable2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable2, 2},
+ { "_arrow_parquet___arrow___FileReader__ReadRowGroup1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroup1, 2},
+ { "_arrow_parquet___arrow___FileReader__ReadRowGroup2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroup2, 3},
+ { "_arrow_parquet___arrow___FileReader__ReadRowGroups1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroups1, 2},
+ { "_arrow_parquet___arrow___FileReader__ReadRowGroups2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroups2, 3},
+ { "_arrow_parquet___arrow___FileReader__num_rows", (DL_FUNC) &_arrow_parquet___arrow___FileReader__num_rows, 1},
+ { "_arrow_parquet___arrow___FileReader__num_columns", (DL_FUNC) &_arrow_parquet___arrow___FileReader__num_columns, 1},
+ { "_arrow_parquet___arrow___FileReader__num_row_groups", (DL_FUNC) &_arrow_parquet___arrow___FileReader__num_row_groups, 1},
+ { "_arrow_parquet___arrow___FileReader__ReadColumn", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadColumn, 2},
+ { "_arrow_parquet___ArrowWriterProperties___create", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___create, 3},
+ { "_arrow_parquet___WriterProperties___Builder__create", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__create, 0},
+ { "_arrow_parquet___WriterProperties___Builder__version", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__version, 2},
+ { "_arrow_parquet___ArrowWriterProperties___Builder__set_compressions", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_compressions, 3},
+ { "_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels, 3},
+ { "_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary, 3},
+ { "_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics, 3},
+ { "_arrow_parquet___ArrowWriterProperties___Builder__data_page_size", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__data_page_size, 2},
+ { "_arrow_parquet___WriterProperties___Builder__build", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__build, 1},
+ { "_arrow_parquet___arrow___ParquetFileWriter__Open", (DL_FUNC) &_arrow_parquet___arrow___ParquetFileWriter__Open, 4},
+ { "_arrow_parquet___arrow___FileWriter__WriteTable", (DL_FUNC) &_arrow_parquet___arrow___FileWriter__WriteTable, 3},
+ { "_arrow_parquet___arrow___FileWriter__Close", (DL_FUNC) &_arrow_parquet___arrow___FileWriter__Close, 1},
+ { "_arrow_parquet___arrow___WriteTable", (DL_FUNC) &_arrow_parquet___arrow___WriteTable, 4},
+ { "_arrow_parquet___arrow___FileReader__GetSchema", (DL_FUNC) &_arrow_parquet___arrow___FileReader__GetSchema, 1},
+ { "_arrow_allocate_arrow_schema", (DL_FUNC) &_arrow_allocate_arrow_schema, 0},
+ { "_arrow_delete_arrow_schema", (DL_FUNC) &_arrow_delete_arrow_schema, 1},
+ { "_arrow_allocate_arrow_array", (DL_FUNC) &_arrow_allocate_arrow_array, 0},
+ { "_arrow_delete_arrow_array", (DL_FUNC) &_arrow_delete_arrow_array, 1},
+ { "_arrow_allocate_arrow_array_stream", (DL_FUNC) &_arrow_allocate_arrow_array_stream, 0},
+ { "_arrow_delete_arrow_array_stream", (DL_FUNC) &_arrow_delete_arrow_array_stream, 1},
+ { "_arrow_ImportArray", (DL_FUNC) &_arrow_ImportArray, 2},
+ { "_arrow_ImportRecordBatch", (DL_FUNC) &_arrow_ImportRecordBatch, 2},
+ { "_arrow_ImportSchema", (DL_FUNC) &_arrow_ImportSchema, 1},
+ { "_arrow_ImportField", (DL_FUNC) &_arrow_ImportField, 1},
+ { "_arrow_ImportType", (DL_FUNC) &_arrow_ImportType, 1},
+ { "_arrow_ImportRecordBatchReader", (DL_FUNC) &_arrow_ImportRecordBatchReader, 1},
+ { "_arrow_ExportType", (DL_FUNC) &_arrow_ExportType, 2},
+ { "_arrow_ExportField", (DL_FUNC) &_arrow_ExportField, 2},
+ { "_arrow_ExportSchema", (DL_FUNC) &_arrow_ExportSchema, 2},
+ { "_arrow_ExportArray", (DL_FUNC) &_arrow_ExportArray, 3},
+ { "_arrow_ExportRecordBatch", (DL_FUNC) &_arrow_ExportRecordBatch, 3},
+ { "_arrow_ExportRecordBatchReader", (DL_FUNC) &_arrow_ExportRecordBatchReader, 2},
+ { "_arrow_Table__from_dots", (DL_FUNC) &_arrow_Table__from_dots, 3},
+ { "_arrow_vec_to_arrow", (DL_FUNC) &_arrow_vec_to_arrow, 2},
+ { "_arrow_DictionaryArray__FromArrays", (DL_FUNC) &_arrow_DictionaryArray__FromArrays, 3},
+ { "_arrow_RecordBatch__num_columns", (DL_FUNC) &_arrow_RecordBatch__num_columns, 1},
+ { "_arrow_RecordBatch__num_rows", (DL_FUNC) &_arrow_RecordBatch__num_rows, 1},
+ { "_arrow_RecordBatch__schema", (DL_FUNC) &_arrow_RecordBatch__schema, 1},
+ { "_arrow_RecordBatch__RenameColumns", (DL_FUNC) &_arrow_RecordBatch__RenameColumns, 2},
+ { "_arrow_RecordBatch__ReplaceSchemaMetadata", (DL_FUNC) &_arrow_RecordBatch__ReplaceSchemaMetadata, 2},
+ { "_arrow_RecordBatch__columns", (DL_FUNC) &_arrow_RecordBatch__columns, 1},
+ { "_arrow_RecordBatch__column", (DL_FUNC) &_arrow_RecordBatch__column, 2},
+ { "_arrow_RecordBatch__GetColumnByName", (DL_FUNC) &_arrow_RecordBatch__GetColumnByName, 2},
+ { "_arrow_RecordBatch__SelectColumns", (DL_FUNC) &_arrow_RecordBatch__SelectColumns, 2},
+ { "_arrow_RecordBatch__Equals", (DL_FUNC) &_arrow_RecordBatch__Equals, 3},
+ { "_arrow_RecordBatch__AddColumn", (DL_FUNC) &_arrow_RecordBatch__AddColumn, 4},
+ { "_arrow_RecordBatch__SetColumn", (DL_FUNC) &_arrow_RecordBatch__SetColumn, 4},
+ { "_arrow_RecordBatch__RemoveColumn", (DL_FUNC) &_arrow_RecordBatch__RemoveColumn, 2},
+ { "_arrow_RecordBatch__column_name", (DL_FUNC) &_arrow_RecordBatch__column_name, 2},
+ { "_arrow_RecordBatch__names", (DL_FUNC) &_arrow_RecordBatch__names, 1},
+ { "_arrow_RecordBatch__Slice1", (DL_FUNC) &_arrow_RecordBatch__Slice1, 2},
+ { "_arrow_RecordBatch__Slice2", (DL_FUNC) &_arrow_RecordBatch__Slice2, 3},
+ { "_arrow_ipc___SerializeRecordBatch__Raw", (DL_FUNC) &_arrow_ipc___SerializeRecordBatch__Raw, 1},
+ { "_arrow_ipc___ReadRecordBatch__InputStream__Schema", (DL_FUNC) &_arrow_ipc___ReadRecordBatch__InputStream__Schema, 2},
+ { "_arrow_RecordBatch__from_arrays", (DL_FUNC) &_arrow_RecordBatch__from_arrays, 2},
+ { "_arrow_RecordBatchReader__schema", (DL_FUNC) &_arrow_RecordBatchReader__schema, 1},
+ { "_arrow_RecordBatchReader__ReadNext", (DL_FUNC) &_arrow_RecordBatchReader__ReadNext, 1},
+ { "_arrow_RecordBatchReader__batches", (DL_FUNC) &_arrow_RecordBatchReader__batches, 1},
+ { "_arrow_Table__from_RecordBatchReader", (DL_FUNC) &_arrow_Table__from_RecordBatchReader, 1},
+ { "_arrow_ipc___RecordBatchStreamReader__Open", (DL_FUNC) &_arrow_ipc___RecordBatchStreamReader__Open, 1},
+ { "_arrow_ipc___RecordBatchFileReader__schema", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__schema, 1},
+ { "_arrow_ipc___RecordBatchFileReader__num_record_batches", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__num_record_batches, 1},
+ { "_arrow_ipc___RecordBatchFileReader__ReadRecordBatch", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__ReadRecordBatch, 2},
+ { "_arrow_ipc___RecordBatchFileReader__Open", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__Open, 1},
+ { "_arrow_Table__from_RecordBatchFileReader", (DL_FUNC) &_arrow_Table__from_RecordBatchFileReader, 1},
+ { "_arrow_ipc___RecordBatchFileReader__batches", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__batches, 1},
+ { "_arrow_ipc___RecordBatchWriter__WriteRecordBatch", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__WriteRecordBatch, 2},
+ { "_arrow_ipc___RecordBatchWriter__WriteTable", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__WriteTable, 2},
+ { "_arrow_ipc___RecordBatchWriter__Close", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__Close, 1},
+ { "_arrow_ipc___RecordBatchFileWriter__Open", (DL_FUNC) &_arrow_ipc___RecordBatchFileWriter__Open, 4},
+ { "_arrow_ipc___RecordBatchStreamWriter__Open", (DL_FUNC) &_arrow_ipc___RecordBatchStreamWriter__Open, 4},
+ { "_arrow_Array__GetScalar", (DL_FUNC) &_arrow_Array__GetScalar, 2},
+ { "_arrow_Scalar__ToString", (DL_FUNC) &_arrow_Scalar__ToString, 1},
+ { "_arrow_StructScalar__field", (DL_FUNC) &_arrow_StructScalar__field, 2},
+ { "_arrow_StructScalar__GetFieldByName", (DL_FUNC) &_arrow_StructScalar__GetFieldByName, 2},
+ { "_arrow_Scalar__as_vector", (DL_FUNC) &_arrow_Scalar__as_vector, 1},
+ { "_arrow_MakeArrayFromScalar", (DL_FUNC) &_arrow_MakeArrayFromScalar, 2},
+ { "_arrow_Scalar__is_valid", (DL_FUNC) &_arrow_Scalar__is_valid, 1},
+ { "_arrow_Scalar__type", (DL_FUNC) &_arrow_Scalar__type, 1},
+ { "_arrow_Scalar__Equals", (DL_FUNC) &_arrow_Scalar__Equals, 2},
+ { "_arrow_Scalar__ApproxEquals", (DL_FUNC) &_arrow_Scalar__ApproxEquals, 2},
+ { "_arrow_schema_", (DL_FUNC) &_arrow_schema_, 1},
+ { "_arrow_Schema__ToString", (DL_FUNC) &_arrow_Schema__ToString, 1},
+ { "_arrow_Schema__num_fields", (DL_FUNC) &_arrow_Schema__num_fields, 1},
+ { "_arrow_Schema__field", (DL_FUNC) &_arrow_Schema__field, 2},
+ { "_arrow_Schema__AddField", (DL_FUNC) &_arrow_Schema__AddField, 3},
+ { "_arrow_Schema__SetField", (DL_FUNC) &_arrow_Schema__SetField, 3},
+ { "_arrow_Schema__RemoveField", (DL_FUNC) &_arrow_Schema__RemoveField, 2},
+ { "_arrow_Schema__GetFieldByName", (DL_FUNC) &_arrow_Schema__GetFieldByName, 2},
+ { "_arrow_Schema__fields", (DL_FUNC) &_arrow_Schema__fields, 1},
+ { "_arrow_Schema__field_names", (DL_FUNC) &_arrow_Schema__field_names, 1},
+ { "_arrow_Schema__HasMetadata", (DL_FUNC) &_arrow_Schema__HasMetadata, 1},
+ { "_arrow_Schema__metadata", (DL_FUNC) &_arrow_Schema__metadata, 1},
+ { "_arrow_Schema__WithMetadata", (DL_FUNC) &_arrow_Schema__WithMetadata, 2},
+ { "_arrow_Schema__serialize", (DL_FUNC) &_arrow_Schema__serialize, 1},
+ { "_arrow_Schema__Equals", (DL_FUNC) &_arrow_Schema__Equals, 3},
+ { "_arrow_arrow__UnifySchemas", (DL_FUNC) &_arrow_arrow__UnifySchemas, 1},
+ { "_arrow_Table__num_columns", (DL_FUNC) &_arrow_Table__num_columns, 1},
+ { "_arrow_Table__num_rows", (DL_FUNC) &_arrow_Table__num_rows, 1},
+ { "_arrow_Table__schema", (DL_FUNC) &_arrow_Table__schema, 1},
+ { "_arrow_Table__ReplaceSchemaMetadata", (DL_FUNC) &_arrow_Table__ReplaceSchemaMetadata, 2},
+ { "_arrow_Table__column", (DL_FUNC) &_arrow_Table__column, 2},
+ { "_arrow_Table__field", (DL_FUNC) &_arrow_Table__field, 2},
+ { "_arrow_Table__columns", (DL_FUNC) &_arrow_Table__columns, 1},
+ { "_arrow_Table__ColumnNames", (DL_FUNC) &_arrow_Table__ColumnNames, 1},
+ { "_arrow_Table__RenameColumns", (DL_FUNC) &_arrow_Table__RenameColumns, 2},
+ { "_arrow_Table__Slice1", (DL_FUNC) &_arrow_Table__Slice1, 2},
+ { "_arrow_Table__Slice2", (DL_FUNC) &_arrow_Table__Slice2, 3},
+ { "_arrow_Table__Equals", (DL_FUNC) &_arrow_Table__Equals, 3},
+ { "_arrow_Table__Validate", (DL_FUNC) &_arrow_Table__Validate, 1},
+ { "_arrow_Table__ValidateFull", (DL_FUNC) &_arrow_Table__ValidateFull, 1},
+ { "_arrow_Table__GetColumnByName", (DL_FUNC) &_arrow_Table__GetColumnByName, 2},
+ { "_arrow_Table__RemoveColumn", (DL_FUNC) &_arrow_Table__RemoveColumn, 2},
+ { "_arrow_Table__AddColumn", (DL_FUNC) &_arrow_Table__AddColumn, 4},
+ { "_arrow_Table__SetColumn", (DL_FUNC) &_arrow_Table__SetColumn, 4},
+ { "_arrow_Table__SelectColumns", (DL_FUNC) &_arrow_Table__SelectColumns, 2},
+ { "_arrow_all_record_batches", (DL_FUNC) &_arrow_all_record_batches, 1},
+ { "_arrow_Table__from_record_batches", (DL_FUNC) &_arrow_Table__from_record_batches, 2},
+ { "_arrow_GetCpuThreadPoolCapacity", (DL_FUNC) &_arrow_GetCpuThreadPoolCapacity, 0},
+ { "_arrow_SetCpuThreadPoolCapacity", (DL_FUNC) &_arrow_SetCpuThreadPoolCapacity, 1},
+ { "_arrow_GetIOThreadPoolCapacity", (DL_FUNC) &_arrow_GetIOThreadPoolCapacity, 0},
+ { "_arrow_SetIOThreadPoolCapacity", (DL_FUNC) &_arrow_SetIOThreadPoolCapacity, 1},
+ { "_arrow_Array__infer_type", (DL_FUNC) &_arrow_Array__infer_type, 1},
+ { "_arrow_Table__Reset", (DL_FUNC) &_arrow_Table__Reset, 1},
+ { "_arrow_RecordBatch__Reset", (DL_FUNC) &_arrow_RecordBatch__Reset, 1},
+ {NULL, NULL, 0}
+};
+extern "C" void R_init_arrow(DllInfo* dll){
+ R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
+ R_useDynamicSymbols(dll, FALSE);
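+ // Only the routines registered above are reachable from .Call(); disabling
+ // dynamic symbol lookup makes unregistered native calls fail fast.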
+
+ #if defined(ARROW_R_WITH_ARROW) && defined(HAS_ALTREP)
+ arrow::r::altrep::Init_Altrep_classes(dll);
+ #endif
+
+}
+
+
diff --git a/src/arrow/r/src/arrow_cpp11.h b/src/arrow/r/src/arrow_cpp11.h
new file mode 100644
index 000000000..c35948867
--- /dev/null
+++ b/src/arrow/r/src/arrow_cpp11.h
@@ -0,0 +1,382 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstring> // for strlen
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
+#undef Free
+
+#include <cpp11.hpp>
+#include <cpp11/altrep.hpp>
+
+#include "./nameof.h"
+
+// borrowed from the enc package
+// because R does not make these macros available (i.e. from Defn.h)
+#define UTF8_MASK (1 << 3)
+#define ASCII_MASK (1 << 6)
+
+#define IS_ASCII(x) (LEVELS(x) & ASCII_MASK)
+#define IS_UTF8(x) (LEVELS(x) & UTF8_MASK)
+
+// For context, see:
+// https://github.com/r-devel/r-svn/blob/6418faeb6f5d87d3d9b92b8978773bc3856b4b6f/src/main/altrep.c#L37
+#define ALTREP_CLASS_SERIALIZED_CLASS(x) ATTRIB(x)
+#define ALTREP_SERIALIZED_CLASS_PKGSYM(x) CADR(x)
+
+namespace arrow {
+namespace r {
+
+template <typename T>
+struct Pointer {
+ Pointer() : ptr_(new T()) {}
+ explicit Pointer(SEXP x)
+ : ptr_(reinterpret_cast<T*>(static_cast<uintptr_t>(REAL(x)[0]))) {}
+
+ inline operator SEXP() const {
+ return Rf_ScalarReal(static_cast<double>(reinterpret_cast<uintptr_t>(ptr_)));
+ }
+
+ inline operator T*() const { return ptr_; }
+
+ inline void finalize() { delete ptr_; }
+
+ T* ptr_;
+};
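+
+// Illustrative sketch (not part of the original source): Pointer<T>
+// round-trips a raw pointer through an R double, e.g.
+//
+//   arrow::r::Pointer<int32_t> p;                  // allocates a new int32_t
+//   SEXP s = p;                                    // encoded as a REALSXP
+//   int32_t* raw = arrow::r::Pointer<int32_t>(s);  // decoded on the other side
+//   arrow::r::Pointer<int32_t>(s).finalize();      // eventually deletes it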
+
+// a minimal complex vector wrapper, until cpp11 gains a similar class
+class complexs {
+ public:
+ using value_type = Rcomplex;
+
+ explicit complexs(SEXP x) : data_(x) {}
+
+ inline R_xlen_t size() const { return XLENGTH(data_); }
+
+ inline operator SEXP() const { return data_; }
+
+ private:
+ cpp11::sexp data_;
+};
+
+// functions that need to be called from an unwind_protect()
+namespace unsafe {
+
+inline const char* utf8_string(SEXP s) {
+ if (!IS_UTF8(s) && !IS_ASCII(s)) {
+ return Rf_translateCharUTF8(s);
+ } else {
+ return CHAR(s);
+ }
+}
+
+inline R_xlen_t r_string_size(SEXP s) {
+ if (s == NA_STRING) {
+ return 0;
+ } else if (IS_ASCII(s) || IS_UTF8(s)) {
+ return XLENGTH(s);
+ } else {
+ return strlen(Rf_translateCharUTF8(s));
+ }
+}
+
+} // namespace unsafe
+
+inline SEXP utf8_strings(SEXP x) {
+ return cpp11::unwind_protect([x] {
+ R_xlen_t n = XLENGTH(x);
+ for (R_xlen_t i = 0; i < n; i++) {
+ SEXP s = STRING_ELT(x, i);
+ if (s != NA_STRING && !IS_UTF8(s) && !IS_ASCII(s)) {
+ SET_STRING_ELT(x, i, Rf_mkCharCE(Rf_translateCharUTF8(s), CE_UTF8));
+ }
+ }
+ return x;
+ });
+}
+
+struct symbols {
+ static SEXP units;
+ static SEXP tzone;
+ static SEXP xp;
+ static SEXP dot_Internal;
+ static SEXP inspect;
+ static SEXP row_names;
+ static SEXP serialize_arrow_r_metadata;
+ static SEXP as_list;
+ static SEXP ptype;
+ static SEXP byte_width;
+ static SEXP list_size;
+ static SEXP arrow_attributes;
+ static SEXP new_;
+ static SEXP create;
+ static SEXP arrow;
+};
+
+struct data {
+ static SEXP classes_POSIXct;
+ static SEXP classes_metadata_r;
+ static SEXP classes_vctrs_list_of;
+ static SEXP classes_tbl_df;
+
+ static SEXP classes_arrow_binary;
+ static SEXP classes_arrow_large_binary;
+ static SEXP classes_arrow_fixed_size_binary;
+
+ static SEXP classes_arrow_list;
+ static SEXP classes_arrow_large_list;
+ static SEXP classes_arrow_fixed_size_list;
+
+ static SEXP classes_factor;
+ static SEXP classes_ordered;
+
+ static SEXP names_metadata;
+};
+
+struct ns {
+ static SEXP arrow;
+};
+
+template <typename Pointer>
+Pointer r6_to_pointer(SEXP self) {
+ if (!Rf_inherits(self, "ArrowObject")) {
+ std::string type_name = arrow::util::nameof<
+ cpp11::decay_t<typename std::remove_pointer<Pointer>::type>>();
+ cpp11::stop("Invalid R object for %s, must be an ArrowObject", type_name.c_str());
+ }
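+ // ArrowObject R6 instances keep their underlying C++ object in an external
+ // pointer bound to the symbol `xp` in the R6 object's environment.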
+ void* p = R_ExternalPtrAddr(Rf_findVarInFrame(self, arrow::r::symbols::xp));
+ if (p == nullptr) {
+ SEXP klass = Rf_getAttrib(self, R_ClassSymbol);
+ cpp11::stop("Invalid <%s>, external pointer to null", CHAR(STRING_ELT(klass, 0)));
+ }
+ return reinterpret_cast<Pointer>(p);
+}
+
+template <typename T>
+void r6_reset_pointer(SEXP r6) {
+ SEXP xp = Rf_findVarInFrame(r6, arrow::r::symbols::xp);
+ void* p = R_ExternalPtrAddr(xp);
+ if (p != nullptr) {
+ delete reinterpret_cast<const std::shared_ptr<T>*>(p);
+ R_SetExternalPtrAddr(xp, nullptr);
+ }
+}
+
+// T is either std::shared_ptr<U> or std::unique_ptr<U>
+// e.g. T = std::shared_ptr<arrow::Array>
+template <typename T>
+class ExternalPtrInput {
+ public:
+ explicit ExternalPtrInput(SEXP self) : ptr_(r6_to_pointer<const T*>(self)) {}
+
+ operator const T&() const { return *ptr_; }
+
+ private:
+ const T* ptr_;
+};
+
+template <typename T>
+class VectorExternalPtrInput {
+ public:
+ explicit VectorExternalPtrInput(SEXP self) : vec_(XLENGTH(self)) {
+ R_xlen_t i = 0;
+ for (auto& element : vec_) {
+ element = *r6_to_pointer<const T*>(VECTOR_ELT(self, i++));
+ }
+ }
+ operator const std::vector<T>&() const { return vec_; }
+
+ private:
+ std::vector<T> vec_;
+};
+
+template <typename T>
+class DefaultInput {
+ public:
+ explicit DefaultInput(SEXP from) : from_(from) {}
+
+ operator T() const { return cpp11::as_cpp<T>(from_); }
+
+ private:
+ SEXP from_;
+};
+
+template <typename T>
+class ConstReferenceInput {
+ public:
+ explicit ConstReferenceInput(SEXP from) : obj_(cpp11::as_cpp<T>(from)) {}
+
+ using const_reference = const T&;
+ operator const_reference() const { return obj_; }
+
+ private:
+ T obj_;
+};
+
+template <typename T>
+struct Input {
+ using type = DefaultInput<T>;
+};
+
+template <typename T>
+struct Input<const T&> {
+ using type = ConstReferenceInput<typename std::decay<T>::type>;
+};
+
+template <typename T>
+struct Input<const std::shared_ptr<T>&> {
+ using type = ExternalPtrInput<std::shared_ptr<T>>;
+};
+
+template <typename T>
+struct Input<const std::unique_ptr<T>&> {
+ using type = ExternalPtrInput<std::unique_ptr<T>>;
+};
+
+template <typename T>
+struct Input<const std::vector<std::shared_ptr<T>>&> {
+ using type = VectorExternalPtrInput<std::shared_ptr<T>>;
+};
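+
+// Sketch of how these Input traits are consumed (an assumption, based on the
+// generated arrowExports.cpp registered above): codegen wraps an exported
+// function such as
+//
+//   int64_t Buffer__size(const std::shared_ptr<arrow::Buffer>& buffer);
+//
+// in an extern "C" shim that converts each SEXP argument via Input<T>::type:
+//
+//   extern "C" SEXP _arrow_Buffer__size(SEXP buffer_sexp) {
+//     arrow::r::Input<const std::shared_ptr<arrow::Buffer>&>::type
+//         buffer(buffer_sexp);
+//     return cpp11::as_sexp(Buffer__size(buffer));
+//   }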
+
+template <typename Rvector, typename T, typename ToVectorElement>
+Rvector to_r_vector(const std::vector<std::shared_ptr<T>>& x,
+ ToVectorElement&& to_element) {
+ R_xlen_t n = x.size();
+ Rvector out(n);
+ for (R_xlen_t i = 0; i < n; i++) {
+ out[i] = to_element(x[i]);
+ }
+ return out;
+}
+
+template <typename T, typename ToString>
+cpp11::writable::strings to_r_strings(const std::vector<std::shared_ptr<T>>& x,
+ ToString&& to_string) {
+ return to_r_vector<cpp11::writable::strings>(x, std::forward<ToString>(to_string));
+}
+
+template <typename T, typename ToListElement>
+cpp11::writable::list to_r_list(const std::vector<std::shared_ptr<T>>& x,
+ ToListElement&& to_element) {
+ auto as_sexp = [&](const std::shared_ptr<T>& t) { return to_element(t); };
+ return to_r_vector<cpp11::writable::list>(x, as_sexp);
+}
+
+template <typename T>
+cpp11::writable::list to_r_list(const std::vector<std::shared_ptr<T>>& x);
+
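+// {NA_INTEGER, -n} is R's compact row.names encoding for a data frame with n
+// rows (the representation .set_row_names(n) produces).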
+inline cpp11::writable::integers short_row_names(int n) { return {NA_INTEGER, -n}; }
+
+template <typename T>
+std::vector<T> from_r_list(cpp11::list args) {
+ std::vector<T> vec;
+ R_xlen_t n = args.size();
+ for (R_xlen_t i = 0; i < n; i++) {
+ vec.push_back(cpp11::as_cpp<T>(args[i]));
+ }
+ return vec;
+}
+
+bool GetBoolOption(const std::string& name, bool default_);
+
+} // namespace r
+} // namespace arrow
+
+namespace cpp11 {
+
+template <typename T>
+SEXP to_r6(const std::shared_ptr<T>& ptr, const char* r6_class_name) {
+ if (ptr == nullptr) return R_NilValue;
+
+ cpp11::external_pointer<std::shared_ptr<T>> xp(new std::shared_ptr<T>(ptr));
+ SEXP r6_class = Rf_install(r6_class_name);
+
+ if (Rf_findVarInFrame3(arrow::r::ns::arrow, r6_class, FALSE) == R_UnboundValue) {
+ cpp11::stop("No arrow R6 class named '%s'", r6_class_name);
+ }
+
+ // make call: <symbol>$new(<x>)
+ SEXP call = PROTECT(Rf_lang3(R_DollarSymbol, r6_class, arrow::r::symbols::new_));
+ SEXP call2 = PROTECT(Rf_lang2(call, xp));
+
+ // and then eval in arrow::
+ SEXP r6 = PROTECT(Rf_eval(call2, arrow::r::ns::arrow));
+
+ UNPROTECT(3);
+ return r6;
+}
+
+/// This trait defines a single static function which returns the name of the R6 class
+/// which corresponds to T. By default, this is just the C++ class name with any
+/// namespaces stripped, for example the R6 class for arrow::ipc::RecordBatchStreamReader
+/// is simply named "RecordBatchStreamReader".
+///
+/// Some classes require specializations of this trait. For example the R6 classes which
+/// wrap arrow::csv::ReadOptions and arrow::json::ReadOptions would collide if both were
+/// named "ReadOptions", so they are named "CsvReadOptions" and "JsonReadOptions"
+/// respectively. Other classes such as arrow::Array are base classes and the proper R6
+/// class name must be derived by examining a discriminant like Array::type_id.
+///
+/// All specializations are located in arrow_types.h
+template <typename T>
+struct r6_class_name;
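+
+// For example, the R6_CLASS_NAME macro in arrow_types.h below expands to
+// specializations of the form:
+//
+//   template <>
+//   struct r6_class_name<arrow::csv::ReadOptions> {
+//     static const char* get(const std::shared_ptr<arrow::csv::ReadOptions>&) {
+//       return "CsvReadOptions";
+//     }
+//   };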
+
+template <typename T>
+SEXP to_r6(const std::shared_ptr<T>& x) {
+ if (x == nullptr) return R_NilValue;
+
+ return to_r6(x, cpp11::r6_class_name<T>::get(x));
+}
+
+} // namespace cpp11
+
+namespace arrow {
+namespace r {
+
+template <typename T>
+cpp11::writable::list to_r_list(const std::vector<std::shared_ptr<T>>& x) {
+ auto as_sexp = [&](const std::shared_ptr<T>& t) { return cpp11::to_r6<T>(t); };
+ return to_r_vector<cpp11::writable::list>(x, as_sexp);
+}
+
+} // namespace r
+} // namespace arrow
+
+namespace cpp11 {
+
+template <typename T>
+using enable_if_shared_ptr = typename std::enable_if<
+ std::is_same<std::shared_ptr<typename T::element_type>, T>::value, T>::type;
+
+template <typename T>
+enable_if_shared_ptr<T> as_cpp(SEXP from) {
+ return arrow::r::ExternalPtrInput<T>(from);
+}
+
+template <typename E>
+enable_if_enum<E, SEXP> as_sexp(E e) {
+ return as_sexp(static_cast<int>(e));
+}
+
+template <typename T>
+SEXP as_sexp(const std::shared_ptr<T>& ptr) {
+ return cpp11::to_r6<T>(ptr);
+}
+
+} // namespace cpp11
diff --git a/src/arrow/r/src/arrow_types.h b/src/arrow/r/src/arrow_types.h
new file mode 100644
index 000000000..bf620bb78
--- /dev/null
+++ b/src/arrow/r/src/arrow_types.h
@@ -0,0 +1,274 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cpp11/R.hpp>
+
+#include "./arrow_cpp11.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+
+#include <arrow/buffer.h> // for RBuffer definition below
+#include <arrow/result.h>
+#include <arrow/status.h>
+
+#include <limits>
+#include <memory>
+#include <utility>
+
+// forward declaration-only headers
+#include <arrow/c/abi.h>
+#include <arrow/compute/type_fwd.h>
+#include <arrow/csv/type_fwd.h>
+
+#if defined(ARROW_R_WITH_DATASET)
+#include <arrow/dataset/type_fwd.h>
+#endif
+
+#include <arrow/filesystem/type_fwd.h>
+#include <arrow/io/type_fwd.h>
+#include <arrow/ipc/type_fwd.h>
+
+#if defined(ARROW_R_WITH_JSON)
+#include <arrow/json/type_fwd.h>
+#endif
+
+#include <arrow/type_fwd.h>
+#include <arrow/util/type_fwd.h>
+
+namespace arrow {
+namespace compute {
+
+class ExecPlan;
+class ExecNode;
+
+} // namespace compute
+} // namespace arrow
+
+#if defined(ARROW_R_WITH_PARQUET)
+#include <parquet/type_fwd.h>
+#endif
+
+#if defined(ARROW_R_WITH_DATASET)
+namespace ds = ::arrow::dataset;
+#endif
+
+namespace compute = ::arrow::compute;
+namespace fs = ::arrow::fs;
+
+std::shared_ptr<arrow::RecordBatch> RecordBatch__from_arrays(SEXP, SEXP);
+arrow::MemoryPool* gc_memory_pool();
+arrow::compute::ExecContext* gc_context();
+
+#if (R_VERSION < R_Version(3, 5, 0))
+#define LOGICAL_RO(x) ((const int*)LOGICAL(x))
+#define INTEGER_RO(x) ((const int*)INTEGER(x))
+#define REAL_RO(x) ((const double*)REAL(x))
+#define COMPLEX_RO(x) ((const Rcomplex*)COMPLEX(x))
+#define STRING_PTR_RO(x) ((const SEXP*)STRING_PTR(x))
+#define RAW_RO(x) ((const Rbyte*)RAW(x))
+#define DATAPTR_RO(x) ((const void*)STRING_PTR(x))
+#define DATAPTR(x) (void*)STRING_PTR(x)
+#endif
+
+#define VECTOR_PTR_RO(x) ((const SEXP*)DATAPTR_RO(x))
+
+namespace arrow {
+
+static inline void StopIfNotOk(const Status& status) {
+ if (!status.ok()) {
+ // ARROW-13039: be careful not to interpret our error message as a %-format string
+ std::string s = status.ToString();
+ cpp11::stop("%s", s.c_str());
+ }
+}
+
+template <typename R>
+auto ValueOrStop(R&& result) -> decltype(std::forward<R>(result).ValueOrDie()) {
+ StopIfNotOk(result.status());
+ return std::forward<R>(result).ValueOrDie();
+}
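+
+// Usage pattern, as in compression.cpp below:
+//   return ValueOrStop(arrow::util::Codec::Create(codec, compression_level));
+// an arrow::Result<T> either yields its value or is surfaced as an R error.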
+
+namespace r {
+class RTasks;
+
+std::shared_ptr<arrow::DataType> InferArrowType(SEXP x);
+std::shared_ptr<arrow::Array> vec_to_arrow__reuse_memory(SEXP x);
+bool can_reuse_memory(SEXP x, const std::shared_ptr<arrow::DataType>& type);
+
+Status count_fields(SEXP lst, int* out);
+
+void inspect(SEXP obj);
+std::shared_ptr<arrow::Array> vec_to_arrow(SEXP x,
+ const std::shared_ptr<arrow::DataType>& type,
+ bool type_inferred);
+
+// the integer64 sentinel
+constexpr int64_t NA_INT64 = std::numeric_limits<int64_t>::min();
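+// (bit64::integer64 encodes NA as the minimum int64 value, hence this choice.)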
+
+template <typename RVector>
+class RBuffer : public MutableBuffer {
+ public:
+ explicit RBuffer(RVector vec)
+ : MutableBuffer(reinterpret_cast<uint8_t*>(DATAPTR(vec)),
+ vec.size() * sizeof(typename RVector::value_type),
+ arrow::CPUDevice::memory_manager(gc_memory_pool())),
+ vec_(vec) {}
+
+ private:
+ // vec_ holds the memory
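+ // (keeping the SEXP alive protects the zero-copy borrowed data from R's
+ // garbage collector for the lifetime of the buffer)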
+ RVector vec_;
+};
+
+std::shared_ptr<arrow::DataType> InferArrowTypeFromFactor(SEXP);
+
+void validate_slice_offset(R_xlen_t offset, int64_t len);
+
+void validate_slice_length(R_xlen_t length, int64_t available);
+
+void validate_index(int i, int len);
+
+template <typename Lambda>
+void TraverseDots(cpp11::list dots, int num_fields, Lambda lambda) {
+ cpp11::strings names(dots.attr(R_NamesSymbol));
+
+ for (R_xlen_t i = 0, j = 0; j < num_fields; i++) {
+ auto name_i = names[i];
+
+ if (name_i.size() == 0) {
+ cpp11::list x_i = dots[i];
+ cpp11::strings names_x_i(x_i.attr(R_NamesSymbol));
+ R_xlen_t n_i = x_i.size();
+ for (R_xlen_t k = 0; k < n_i; k++, j++) {
+ lambda(j, x_i[k], names_x_i[k]);
+ }
+ } else {
+ lambda(j, dots[i], name_i);
+ j++;
+ }
+ }
+}
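+
+// Illustrative behaviour (an assumption for clarity): with
+// dots = list(a = 1, list(b = 2, c = 3)) and num_fields = 3, the unnamed
+// inner list is spliced in place, so lambda sees
+// (0, 1, "a"), (1, 2, "b"), and (2, 3, "c").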
+
+inline cpp11::writable::list FlattenDots(cpp11::list dots, int num_fields) {
+ std::vector<SEXP> out(num_fields);
+ auto set = [&](int j, SEXP x, cpp11::r_string) { out[j] = x; };
+ TraverseDots(dots, num_fields, set);
+
+ return cpp11::writable::list(out.begin(), out.end());
+}
+
+arrow::Status InferSchemaFromDots(SEXP lst, SEXP schema_sxp, int num_fields,
+ std::shared_ptr<arrow::Schema>& schema);
+
+arrow::Status AddMetadataFromDots(SEXP lst, int num_fields,
+ std::shared_ptr<arrow::Schema>& schema);
+
+namespace altrep {
+
+#if defined(HAS_ALTREP)
+void Init_Altrep_classes(DllInfo* dll);
+#endif
+
+SEXP MakeAltrepVector(const std::shared_ptr<ChunkedArray>& chunked_array);
+
+} // namespace altrep
+
+} // namespace r
+} // namespace arrow
+
+namespace cpp11 {
+
+template <typename T>
+struct r6_class_name {
+ static const char* get(const std::shared_ptr<T>& ptr) {
+ static const std::string name = arrow::util::nameof<T>(/*strip_namespace=*/true);
+ return name.c_str();
+ }
+};
+
+// Overrides of default R6 class names:
+#define R6_CLASS_NAME(CLASS, NAME) \
+ template <> \
+ struct r6_class_name<CLASS> { \
+ static const char* get(const std::shared_ptr<CLASS>&) { return NAME; } \
+ }
+
+R6_CLASS_NAME(arrow::csv::ReadOptions, "CsvReadOptions");
+R6_CLASS_NAME(arrow::csv::ParseOptions, "CsvParseOptions");
+R6_CLASS_NAME(arrow::csv::ConvertOptions, "CsvConvertOptions");
+R6_CLASS_NAME(arrow::csv::TableReader, "CsvTableReader");
+R6_CLASS_NAME(arrow::csv::WriteOptions, "CsvWriteOptions");
+
+#if defined(ARROW_R_WITH_PARQUET)
+R6_CLASS_NAME(parquet::ArrowReaderProperties, "ParquetArrowReaderProperties");
+R6_CLASS_NAME(parquet::ArrowWriterProperties, "ParquetArrowWriterProperties");
+R6_CLASS_NAME(parquet::WriterProperties, "ParquetWriterProperties");
+R6_CLASS_NAME(parquet::arrow::FileReader, "ParquetFileReader");
+R6_CLASS_NAME(parquet::WriterPropertiesBuilder, "ParquetWriterPropertiesBuilder");
+R6_CLASS_NAME(parquet::arrow::FileWriter, "ParquetFileWriter");
+#endif
+
+R6_CLASS_NAME(arrow::ipc::feather::Reader, "FeatherReader");
+
+#if defined(ARROW_R_WITH_JSON)
+R6_CLASS_NAME(arrow::json::ReadOptions, "JsonReadOptions");
+R6_CLASS_NAME(arrow::json::ParseOptions, "JsonParseOptions");
+R6_CLASS_NAME(arrow::json::TableReader, "JsonTableReader");
+#endif
+
+#undef R6_CLASS_NAME
+
+// Declarations of discriminated base classes.
+// Definitions reside in corresponding .cpp files.
+template <>
+struct r6_class_name<fs::FileSystem> {
+ static const char* get(const std::shared_ptr<fs::FileSystem>&);
+};
+
+template <>
+struct r6_class_name<arrow::Array> {
+ static const char* get(const std::shared_ptr<arrow::Array>&);
+};
+
+template <>
+struct r6_class_name<arrow::Scalar> {
+ static const char* get(const std::shared_ptr<arrow::Scalar>&);
+};
+
+template <>
+struct r6_class_name<arrow::DataType> {
+ static const char* get(const std::shared_ptr<arrow::DataType>&);
+};
+
+#if defined(ARROW_R_WITH_DATASET)
+
+template <>
+struct r6_class_name<ds::Dataset> {
+ static const char* get(const std::shared_ptr<ds::Dataset>&);
+};
+
+template <>
+struct r6_class_name<ds::FileFormat> {
+ static const char* get(const std::shared_ptr<ds::FileFormat>&);
+};
+
+#endif
+
+} // namespace cpp11
+
+#endif
diff --git a/src/arrow/r/src/arrow_vctrs.h b/src/arrow/r/src/arrow_vctrs.h
new file mode 100644
index 000000000..b91c08199
--- /dev/null
+++ b/src/arrow/r/src/arrow_vctrs.h
@@ -0,0 +1,22 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace vctrs {
+R_len_t vec_size(SEXP);
+}
diff --git a/src/arrow/r/src/buffer.cpp b/src/arrow/r/src/buffer.cpp
new file mode 100644
index 000000000..281467734
--- /dev/null
+++ b/src/arrow/r/src/buffer.cpp
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+
+// [[arrow::export]]
+bool Buffer__is_mutable(const std::shared_ptr<arrow::Buffer>& buffer) {
+ return buffer->is_mutable();
+}
+
+// [[arrow::export]]
+void Buffer__ZeroPadding(const std::shared_ptr<arrow::Buffer>& buffer) {
+ buffer->ZeroPadding();
+}
+
+// [[arrow::export]]
+int64_t Buffer__capacity(const std::shared_ptr<arrow::Buffer>& buffer) {
+ return buffer->capacity();
+}
+
+// [[arrow::export]]
+int64_t Buffer__size(const std::shared_ptr<arrow::Buffer>& buffer) {
+ return buffer->size();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Buffer> r___RBuffer__initialize(SEXP x) {
+ switch (TYPEOF(x)) {
+ case RAWSXP:
+ return std::make_shared<arrow::r::RBuffer<cpp11::raws>>(x);
+ case REALSXP:
+ return std::make_shared<arrow::r::RBuffer<cpp11::doubles>>(x);
+ case INTSXP:
+ return std::make_shared<arrow::r::RBuffer<cpp11::integers>>(x);
+ case CPLXSXP:
+ return std::make_shared<arrow::r::RBuffer<arrow::r::complexs>>(
+ arrow::r::complexs(x));
+ default:
+ break;
+ }
+ cpp11::stop("R object of type <%s> not supported", Rf_type2char(TYPEOF(x)));
+}
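+
+// (Sketch: reached from R via buffer(), e.g. buffer(as.raw(c(1, 2, 3))) takes
+// the RAWSXP branch above.)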
+
+// [[arrow::export]]
+cpp11::writable::raws Buffer__data(const std::shared_ptr<arrow::Buffer>& buffer) {
+ return cpp11::writable::raws(buffer->data(), buffer->data() + buffer->size());
+}
+
+// [[arrow::export]]
+bool Buffer__Equals(const std::shared_ptr<arrow::Buffer>& x,
+ const std::shared_ptr<arrow::Buffer>& y) {
+ return x->Equals(*y.get());
+}
+
+#endif
diff --git a/src/arrow/r/src/chunkedarray.cpp b/src/arrow/r/src/chunkedarray.cpp
new file mode 100644
index 000000000..10c6e84b3
--- /dev/null
+++ b/src/arrow/r/src/chunkedarray.cpp
@@ -0,0 +1,139 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+
+#include <arrow/builder.h>
+#include <arrow/chunked_array.h>
+
+// [[arrow::export]]
+int ChunkedArray__length(const std::shared_ptr<arrow::ChunkedArray>& chunked_array) {
+ return chunked_array->length();
+}
+
+// [[arrow::export]]
+int ChunkedArray__null_count(const std::shared_ptr<arrow::ChunkedArray>& chunked_array) {
+ return chunked_array->null_count();
+}
+
+// [[arrow::export]]
+int ChunkedArray__num_chunks(const std::shared_ptr<arrow::ChunkedArray>& chunked_array) {
+ return chunked_array->num_chunks();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Array> ChunkedArray__chunk(
+ const std::shared_ptr<arrow::ChunkedArray>& chunked_array, int i) {
+ arrow::r::validate_index(i, chunked_array->num_chunks());
+ return chunked_array->chunk(i);
+}
+
+// [[arrow::export]]
+cpp11::list ChunkedArray__chunks(
+ const std::shared_ptr<arrow::ChunkedArray>& chunked_array) {
+ return arrow::r::to_r_list(chunked_array->chunks());
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> ChunkedArray__type(
+ const std::shared_ptr<arrow::ChunkedArray>& chunked_array) {
+ return chunked_array->type();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::ChunkedArray> ChunkedArray__Slice1(
+ const std::shared_ptr<arrow::ChunkedArray>& chunked_array, R_xlen_t offset) {
+ arrow::r::validate_slice_offset(offset, chunked_array->length());
+ return chunked_array->Slice(offset);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::ChunkedArray> ChunkedArray__Slice2(
+ const std::shared_ptr<arrow::ChunkedArray>& chunked_array, R_xlen_t offset,
+ R_xlen_t length) {
+ arrow::r::validate_slice_offset(offset, chunked_array->length());
+ arrow::r::validate_slice_length(length, chunked_array->length() - offset);
+ return chunked_array->Slice(offset, length);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::ChunkedArray> ChunkedArray__View(
+ const std::shared_ptr<arrow::ChunkedArray>& array,
+ const std::shared_ptr<arrow::DataType>& type) {
+ return ValueOrStop(array->View(type));
+}
+
+// [[arrow::export]]
+void ChunkedArray__Validate(const std::shared_ptr<arrow::ChunkedArray>& chunked_array) {
+ StopIfNotOk(chunked_array->Validate());
+}
+
+// [[arrow::export]]
+bool ChunkedArray__Equals(const std::shared_ptr<arrow::ChunkedArray>& x,
+ const std::shared_ptr<arrow::ChunkedArray>& y) {
+ return x->Equals(y);
+}
+
+// [[arrow::export]]
+std::string ChunkedArray__ToString(const std::shared_ptr<arrow::ChunkedArray>& x) {
+ return x->ToString();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::ChunkedArray> ChunkedArray__from_list(cpp11::list chunks,
+ SEXP s_type) {
+ std::vector<std::shared_ptr<arrow::Array>> vec;
+
+ // the type might be NULL, in which case we need to infer it from the data
+ // we keep track of whether it was inferred or supplied
+ bool type_inferred = Rf_isNull(s_type);
+ R_xlen_t n = XLENGTH(chunks);
+
+ std::shared_ptr<arrow::DataType> type;
+ if (type_inferred) {
+ if (n == 0) {
+ cpp11::stop("type must be specified for empty list");
+ }
+ type = arrow::r::InferArrowType(VECTOR_ELT(chunks, 0));
+ } else {
+ type = cpp11::as_cpp<std::shared_ptr<arrow::DataType>>(s_type);
+ }
+
+ if (n == 0) {
+ std::shared_ptr<arrow::Array> array;
+ std::unique_ptr<arrow::ArrayBuilder> type_builder;
+ StopIfNotOk(arrow::MakeBuilder(gc_memory_pool(), type, &type_builder));
+ StopIfNotOk(type_builder->Finish(&array));
+ vec.push_back(array);
+ } else {
+ // the first chunk is converted separately from the rest of the loop
+ // because the type might have been inferred from this first element
+ //
+ // this only really matters for dictionary arrays
+ vec.push_back(arrow::r::vec_to_arrow(chunks[0], type, type_inferred));
+
+ for (R_xlen_t i = 1; i < n; i++) {
+ vec.push_back(arrow::r::vec_to_arrow(chunks[i], type, false));
+ }
+ }
+
+ return std::make_shared<arrow::ChunkedArray>(std::move(vec));
+}
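+
+// (Sketch: from R, ChunkedArray$create(1:3, 4:6) lands here with
+// chunks = list(1:3, 4:6) and s_type = NULL, so an int32 type is inferred
+// from the first chunk and reused for the remaining ones.)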
+
+#endif
diff --git a/src/arrow/r/src/compression.cpp b/src/arrow/r/src/compression.cpp
new file mode 100644
index 000000000..18c63e4fd
--- /dev/null
+++ b/src/arrow/r/src/compression.cpp
@@ -0,0 +1,56 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+#include <arrow/io/compressed.h>
+#include <arrow/util/compression.h>
+
+// [[arrow::export]]
+std::shared_ptr<arrow::util::Codec> util___Codec__Create(arrow::Compression::type codec,
+ R_xlen_t compression_level) {
+ return ValueOrStop(arrow::util::Codec::Create(codec, compression_level));
+}
+
+// [[arrow::export]]
+std::string util___Codec__name(const std::shared_ptr<arrow::util::Codec>& codec) {
+ return codec->name();
+}
+
+// [[arrow::export]]
+bool util___Codec__IsAvailable(arrow::Compression::type codec) {
+ return arrow::util::Codec::IsAvailable(codec);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::io::CompressedOutputStream> io___CompressedOutputStream__Make(
+ const std::shared_ptr<arrow::util::Codec>& codec,
+ const std::shared_ptr<arrow::io::OutputStream>& raw) {
+ return ValueOrStop(
+ arrow::io::CompressedOutputStream::Make(codec.get(), raw, gc_memory_pool()));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::io::CompressedInputStream> io___CompressedInputStream__Make(
+ const std::shared_ptr<arrow::util::Codec>& codec,
+ const std::shared_ptr<arrow::io::InputStream>& raw) {
+ return ValueOrStop(
+ arrow::io::CompressedInputStream::Make(codec.get(), raw, gc_memory_pool()));
+}
+
+#endif
diff --git a/src/arrow/r/src/compute-exec.cpp b/src/arrow/r/src/compute-exec.cpp
new file mode 100644
index 000000000..7e0235bf9
--- /dev/null
+++ b/src/arrow/r/src/compute-exec.cpp
@@ -0,0 +1,281 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+
+#include <arrow/compute/api.h>
+#include <arrow/compute/exec/exec_plan.h>
+#include <arrow/compute/exec/expression.h>
+#include <arrow/compute/exec/options.h>
+#include <arrow/table.h>
+#include <arrow/util/async_generator.h>
+#include <arrow/util/future.h>
+#include <arrow/util/optional.h>
+#include <arrow/util/thread_pool.h>
+
+#include <iostream>
+
+namespace compute = ::arrow::compute;
+
+std::shared_ptr<compute::FunctionOptions> make_compute_options(std::string func_name,
+ cpp11::list options);
+
+// [[arrow::export]]
+std::shared_ptr<compute::ExecPlan> ExecPlan_create(bool use_threads) {
+ static compute::ExecContext threaded_context{gc_memory_pool(),
+ arrow::internal::GetCpuThreadPool()};
+ auto plan = ValueOrStop(
+ compute::ExecPlan::Make(use_threads ? &threaded_context : gc_context()));
+ return plan;
+}
+
+std::shared_ptr<compute::ExecNode> MakeExecNodeOrStop(
+ const std::string& factory_name, compute::ExecPlan* plan,
+ std::vector<compute::ExecNode*> inputs, const compute::ExecNodeOptions& options) {
+ return std::shared_ptr<compute::ExecNode>(
+ ValueOrStop(compute::MakeExecNode(factory_name, plan, std::move(inputs), options)),
+ [](...) {
+ // empty destructor: ExecNode lifetime is managed by an ExecPlan
+ });
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::RecordBatchReader> ExecPlan_run(
+ const std::shared_ptr<compute::ExecPlan>& plan,
+ const std::shared_ptr<compute::ExecNode>& final_node, cpp11::list sort_options,
+ int64_t head = -1) {
+ // For now, don't require R to construct SinkNodes.
+ // Instead, just pass the node we should collect as an argument.
+ arrow::AsyncGenerator<arrow::util::optional<compute::ExecBatch>> sink_gen;
+
+ // Sorting uses a different sink node; there is no general sort yet
+ if (sort_options.size() > 0) {
+ if (head >= 0) {
+ // Use the SelectK node to take only what we need
+ MakeExecNodeOrStop(
+ "select_k_sink", plan.get(), {final_node.get()},
+ compute::SelectKSinkNodeOptions{
+ arrow::compute::SelectKOptions(
+ head, std::dynamic_pointer_cast<compute::SortOptions>(
+ make_compute_options("sort_indices", sort_options))
+ ->sort_keys),
+ &sink_gen});
+ } else {
+ MakeExecNodeOrStop("order_by_sink", plan.get(), {final_node.get()},
+ compute::OrderBySinkNodeOptions{
+ *std::dynamic_pointer_cast<compute::SortOptions>(
+ make_compute_options("sort_indices", sort_options)),
+ &sink_gen});
+ }
+ } else {
+ MakeExecNodeOrStop("sink", plan.get(), {final_node.get()},
+ compute::SinkNodeOptions{&sink_gen});
+ }
+
+ StopIfNotOk(plan->Validate());
+ StopIfNotOk(plan->StartProducing());
+
+ // If the generator is destroyed before being completely drained, inform plan
+ std::shared_ptr<void> stop_producing{nullptr, [plan](...) {
+ bool not_finished_yet =
+ plan->finished().TryAddCallback([&plan] {
+ return [plan](const arrow::Status&) {};
+ });
+
+ if (not_finished_yet) {
+ plan->StopProducing();
+ }
+ }};
+
+ return compute::MakeGeneratorReader(
+ final_node->output_schema(),
+ [stop_producing, plan, sink_gen] { return sink_gen(); }, gc_memory_pool());
+}
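+
+// The returned RecordBatchReader pulls batches lazily from sink_gen; the
+// stop_producing token captured above aborts the plan if the reader is
+// dropped before being fully drained.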
+
+// [[arrow::export]]
+void ExecPlan_StopProducing(const std::shared_ptr<compute::ExecPlan>& plan) {
+ plan->StopProducing();
+}
+
+#if defined(ARROW_R_WITH_DATASET)
+
+#include <arrow/dataset/plan.h>
+#include <arrow/dataset/scanner.h>
+
+// [[dataset::export]]
+std::shared_ptr<arrow::Schema> ExecNode_output_schema(
+ const std::shared_ptr<compute::ExecNode>& node) {
+ return node->output_schema();
+}
+
+// [[dataset::export]]
+std::shared_ptr<compute::ExecNode> ExecNode_Scan(
+ const std::shared_ptr<compute::ExecPlan>& plan,
+ const std::shared_ptr<arrow::dataset::Dataset>& dataset,
+ const std::shared_ptr<compute::Expression>& filter,
+ std::vector<std::string> materialized_field_names) {
+ arrow::dataset::internal::Initialize();
+
+ // TODO: pass in FragmentScanOptions
+ auto options = std::make_shared<arrow::dataset::ScanOptions>();
+
+ options->use_async = true;
+ options->use_threads = arrow::r::GetBoolOption("arrow.use_threads", true);
+
+ options->dataset_schema = dataset->schema();
+
+ // ScanNode needs the filter to do predicate pushdown and skip partitions
+ options->filter = ValueOrStop(filter->Bind(*dataset->schema()));
+
+ // ScanNode needs to know which fields to materialize (and which are unnecessary)
+ std::vector<compute::Expression> exprs;
+ for (const auto& name : materialized_field_names) {
+ exprs.push_back(compute::field_ref(name));
+ }
+
+ options->projection =
+ ValueOrStop(call("make_struct", std::move(exprs),
+ compute::MakeStructOptions{std::move(materialized_field_names)})
+ .Bind(*dataset->schema()));
+
+ return MakeExecNodeOrStop("scan", plan.get(), {},
+ arrow::dataset::ScanNodeOptions{dataset, options});
+}
+
+#endif
+
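+// The remaining ExecNode factories only depend on the compute layer, so they
+// sit outside the dataset #if guard; the dataset::export attribute affects
+// the generated R wrappers, not compilation here.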
+// [[dataset::export]]
+std::shared_ptr<compute::ExecNode> ExecNode_Filter(
+ const std::shared_ptr<compute::ExecNode>& input,
+ const std::shared_ptr<compute::Expression>& filter) {
+ return MakeExecNodeOrStop("filter", input->plan(), {input.get()},
+ compute::FilterNodeOptions{*filter});
+}
+
+// [[dataset::export]]
+std::shared_ptr<compute::ExecNode> ExecNode_Project(
+ const std::shared_ptr<compute::ExecNode>& input,
+ const std::vector<std::shared_ptr<compute::Expression>>& exprs,
+ std::vector<std::string> names) {
+ // We have shared_ptrs of expressions but need the Expressions
+ std::vector<compute::Expression> expressions;
+ for (auto expr : exprs) {
+ expressions.push_back(*expr);
+ }
+ return MakeExecNodeOrStop(
+ "project", input->plan(), {input.get()},
+ compute::ProjectNodeOptions{std::move(expressions), std::move(names)});
+}
+
+// [[dataset::export]]
+std::shared_ptr<compute::ExecNode> ExecNode_Aggregate(
+ const std::shared_ptr<compute::ExecNode>& input, cpp11::list options,
+ std::vector<std::string> target_names, std::vector<std::string> out_field_names,
+ std::vector<std::string> key_names) {
+ std::vector<arrow::compute::internal::Aggregate> aggregates;
+ std::vector<std::shared_ptr<arrow::compute::FunctionOptions>> keep_alives;
+
+ for (cpp11::list name_opts : options) {
+ auto name = cpp11::as_cpp<std::string>(name_opts[0]);
+ auto opts = make_compute_options(name, name_opts[1]);
+
+ aggregates.push_back(
+ arrow::compute::internal::Aggregate{std::move(name), opts.get()});
+ keep_alives.push_back(std::move(opts));
+ }
+
+ std::vector<arrow::FieldRef> targets, keys;
+ for (auto&& name : target_names) {
+ targets.emplace_back(std::move(name));
+ }
+ for (auto&& name : key_names) {
+ keys.emplace_back(std::move(name));
+ }
+ return MakeExecNodeOrStop(
+ "aggregate", input->plan(), {input.get()},
+ compute::AggregateNodeOptions{std::move(aggregates), std::move(targets),
+ std::move(out_field_names), std::move(keys)});
+}
+
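+// `type` encodes the dplyr join type as an integer: 0 = LEFT_SEMI,
+// 1 = RIGHT_SEMI, 2 = LEFT_ANTI, 3 = RIGHT_ANTI, 4 = INNER, 5 = LEFT_OUTER,
+// 6 = RIGHT_OUTER, 7 = FULL_OUTER (see the mapping below).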
+// [[dataset::export]]
+std::shared_ptr<compute::ExecNode> ExecNode_Join(
+ const std::shared_ptr<compute::ExecNode>& input, int type,
+ const std::shared_ptr<compute::ExecNode>& right_data,
+ std::vector<std::string> left_keys, std::vector<std::string> right_keys,
+ std::vector<std::string> left_output, std::vector<std::string> right_output) {
+ std::vector<arrow::FieldRef> left_refs, right_refs, left_out_refs, right_out_refs;
+ for (auto&& name : left_keys) {
+ left_refs.emplace_back(std::move(name));
+ }
+ for (auto&& name : right_keys) {
+ right_refs.emplace_back(std::move(name));
+ }
+ for (auto&& name : left_output) {
+ left_out_refs.emplace_back(std::move(name));
+ }
+ if (type != 0 && type != 2) {
+    // Left semi and anti joins (types 0 and 2) emit no columns from the right side
+ for (auto&& name : right_output) {
+ right_out_refs.emplace_back(std::move(name));
+ }
+ }
+
+ // TODO: we should be able to use this enum directly
+ compute::JoinType join_type;
+ if (type == 0) {
+ join_type = compute::JoinType::LEFT_SEMI;
+ } else if (type == 1) {
+    // Not readily reachable from R because dplyr::semi_join maps to LEFT_SEMI
+ join_type = compute::JoinType::RIGHT_SEMI;
+ } else if (type == 2) {
+ join_type = compute::JoinType::LEFT_ANTI;
+ } else if (type == 3) {
+    // Not readily reachable from R because dplyr::anti_join maps to LEFT_ANTI
+ join_type = compute::JoinType::RIGHT_ANTI;
+ } else if (type == 4) {
+ join_type = compute::JoinType::INNER;
+ } else if (type == 5) {
+ join_type = compute::JoinType::LEFT_OUTER;
+ } else if (type == 6) {
+ join_type = compute::JoinType::RIGHT_OUTER;
+ } else if (type == 7) {
+ join_type = compute::JoinType::FULL_OUTER;
+ } else {
+    cpp11::stop("Invalid join type: %d", type);
+ }
+
+ return MakeExecNodeOrStop(
+ "hashjoin", input->plan(), {input.get(), right_data.get()},
+ compute::HashJoinNodeOptions{join_type, std::move(left_refs), std::move(right_refs),
+ std::move(left_out_refs), std::move(right_out_refs)});
+}
+
+// [[arrow::export]]
+std::shared_ptr<compute::ExecNode> ExecNode_ReadFromRecordBatchReader(
+ const std::shared_ptr<compute::ExecPlan>& plan,
+ const std::shared_ptr<arrow::RecordBatchReader>& reader) {
+ arrow::compute::SourceNodeOptions options{
+ /*output_schema=*/reader->schema(),
+ /*generator=*/ValueOrStop(
+ compute::MakeReaderGenerator(reader, arrow::internal::GetCpuThreadPool()))};
+
+ return MakeExecNodeOrStop("source", plan.get(), {}, options);
+}
+
+#endif
diff --git a/src/arrow/r/src/compute.cpp b/src/arrow/r/src/compute.cpp
new file mode 100644
index 000000000..0f0ef2f7d
--- /dev/null
+++ b/src/arrow/r/src/compute.cpp
@@ -0,0 +1,576 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+
+#include <arrow/compute/api.h>
+#include <arrow/record_batch.h>
+#include <arrow/table.h>
+
+std::shared_ptr<arrow::compute::CastOptions> make_cast_options(cpp11::list options);
+
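+// Lazily initialized ExecContext backed by the R-GC-aware memory pool,
+// used as the default context for the compute calls below.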
+arrow::compute::ExecContext* gc_context() {
+ static arrow::compute::ExecContext context(gc_memory_pool());
+ return &context;
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::RecordBatch> RecordBatch__cast(
+ const std::shared_ptr<arrow::RecordBatch>& batch,
+ const std::shared_ptr<arrow::Schema>& schema, cpp11::list options) {
+ auto opts = make_cast_options(options);
+ auto nc = batch->num_columns();
+
+ arrow::ArrayVector columns(nc);
+ for (int i = 0; i < nc; i++) {
+ columns[i] = ValueOrStop(
+ arrow::compute::Cast(*batch->column(i), schema->field(i)->type(), *opts));
+ }
+
+ return arrow::RecordBatch::Make(schema, batch->num_rows(), std::move(columns));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Table> Table__cast(const std::shared_ptr<arrow::Table>& table,
+ const std::shared_ptr<arrow::Schema>& schema,
+ cpp11::list options) {
+ auto opts = make_cast_options(options);
+ auto nc = table->num_columns();
+
+ using ColumnVector = std::vector<std::shared_ptr<arrow::ChunkedArray>>;
+ ColumnVector columns(nc);
+ for (int i = 0; i < nc; i++) {
+ arrow::Datum value(table->column(i));
+ arrow::Datum out =
+ ValueOrStop(arrow::compute::Cast(value, schema->field(i)->type(), *opts));
+ columns[i] = out.chunked_array();
+ }
+ return arrow::Table::Make(schema, std::move(columns), table->num_rows());
+}
+
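+// If `x` is an R6 ArrowObject of class `class_name`, unwrap the underlying
+// shared_ptr; otherwise return nullptr.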
+template <typename T>
+std::shared_ptr<T> MaybeUnbox(const char* class_name, SEXP x) {
+ if (Rf_inherits(x, "ArrowObject") && Rf_inherits(x, class_name)) {
+ return cpp11::as_cpp<std::shared_ptr<T>>(x);
+ }
+ return nullptr;
+}
+
+namespace cpp11 {
+
+template <>
+arrow::Datum as_cpp<arrow::Datum>(SEXP x) {
+ if (auto array = MaybeUnbox<arrow::Array>("Array", x)) {
+ return array;
+ }
+
+ if (auto chunked_array = MaybeUnbox<arrow::ChunkedArray>("ChunkedArray", x)) {
+ return chunked_array;
+ }
+
+ if (auto batch = MaybeUnbox<arrow::RecordBatch>("RecordBatch", x)) {
+ return batch;
+ }
+
+ if (auto table = MaybeUnbox<arrow::Table>("Table", x)) {
+ return table;
+ }
+
+ if (auto scalar = MaybeUnbox<arrow::Scalar>("Scalar", x)) {
+ return scalar;
+ }
+
+ // This assumes that R objects have already been converted to Arrow objects;
+ // that seems right but should we do the wrapping here too/instead?
+ cpp11::stop("to_datum: Not implemented for type %s", Rf_type2char(TYPEOF(x)));
+}
+} // namespace cpp11
+
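+// Wrap a Datum back into the corresponding R6 object (Scalar, Array, ...).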
+SEXP from_datum(arrow::Datum datum) {
+ switch (datum.kind()) {
+ case arrow::Datum::SCALAR:
+ return cpp11::to_r6(datum.scalar());
+
+ case arrow::Datum::ARRAY:
+ return cpp11::to_r6(datum.make_array());
+
+ case arrow::Datum::CHUNKED_ARRAY:
+ return cpp11::to_r6(datum.chunked_array());
+
+ case arrow::Datum::RECORD_BATCH:
+ return cpp11::to_r6(datum.record_batch());
+
+ case arrow::Datum::TABLE:
+ return cpp11::to_r6(datum.table());
+
+ default:
+ break;
+ }
+
+ cpp11::stop("from_datum: Not implemented for Datum %s", datum.ToString().c_str());
+}
+
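+// Translate an R list of options into the FunctionOptions subclass that
+// `func_name` expects. Returns nullptr for functions that take no options.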
+std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
+ std::string func_name, cpp11::list options) {
+ if (func_name == "filter") {
+ using Options = arrow::compute::FilterOptions;
+ auto out = std::make_shared<Options>(Options::Defaults());
+ SEXP keep_na = options["keep_na"];
+ if (!Rf_isNull(keep_na) && cpp11::as_cpp<bool>(keep_na)) {
+ out->null_selection_behavior = Options::EMIT_NULL;
+ }
+ return out;
+ }
+
+ if (func_name == "take") {
+ using Options = arrow::compute::TakeOptions;
+ auto out = std::make_shared<Options>(Options::Defaults());
+ return out;
+ }
+
+ if (func_name == "array_sort_indices") {
+ using Order = arrow::compute::SortOrder;
+ using Options = arrow::compute::ArraySortOptions;
+    // true means descending, false means ascending
+ auto order = cpp11::as_cpp<bool>(options["order"]);
+ auto out =
+ std::make_shared<Options>(Options(order ? Order::Descending : Order::Ascending));
+ return out;
+ }
+
+ if (func_name == "sort_indices") {
+ using Key = arrow::compute::SortKey;
+ using Order = arrow::compute::SortOrder;
+ using Options = arrow::compute::SortOptions;
+ auto names = cpp11::as_cpp<std::vector<std::string>>(options["names"]);
+    // a positive value means descending, zero means ascending
+    // cpp11 does not support std::vector<bool> here, so use int
+ auto orders = cpp11::as_cpp<std::vector<int>>(options["orders"]);
+ std::vector<Key> keys;
+ for (size_t i = 0; i < names.size(); i++) {
+ keys.push_back(
+ Key(names[i], (orders[i] > 0) ? Order::Descending : Order::Ascending));
+ }
+ auto out = std::make_shared<Options>(Options(keys));
+ return out;
+ }
+
+ if (func_name == "all" || func_name == "hash_all" || func_name == "any" ||
+ func_name == "hash_any" || func_name == "approximate_median" ||
+ func_name == "hash_approximate_median" || func_name == "mean" ||
+ func_name == "hash_mean" || func_name == "min_max" || func_name == "hash_min_max" ||
+ func_name == "min" || func_name == "hash_min" || func_name == "max" ||
+ func_name == "hash_max" || func_name == "sum" || func_name == "hash_sum") {
+ using Options = arrow::compute::ScalarAggregateOptions;
+ auto out = std::make_shared<Options>(Options::Defaults());
+ if (!Rf_isNull(options["min_count"])) {
+ out->min_count = cpp11::as_cpp<int>(options["min_count"]);
+ }
+ if (!Rf_isNull(options["skip_nulls"])) {
+ out->skip_nulls = cpp11::as_cpp<bool>(options["skip_nulls"]);
+ }
+ return out;
+ }
+
+ if (func_name == "tdigest" || func_name == "hash_tdigest") {
+ using Options = arrow::compute::TDigestOptions;
+ auto out = std::make_shared<Options>(Options::Defaults());
+ if (!Rf_isNull(options["q"])) {
+ out->q = cpp11::as_cpp<std::vector<double>>(options["q"]);
+ }
+ if (!Rf_isNull(options["skip_nulls"])) {
+ out->skip_nulls = cpp11::as_cpp<bool>(options["skip_nulls"]);
+ }
+ return out;
+ }
+
+ if (func_name == "count") {
+ using Options = arrow::compute::CountOptions;
+ auto out = std::make_shared<Options>(Options::Defaults());
+ out->mode =
+ cpp11::as_cpp<bool>(options["na.rm"]) ? Options::ONLY_VALID : Options::ONLY_NULL;
+ return out;
+ }
+
+ if (func_name == "count_distinct" || func_name == "hash_count_distinct") {
+ using Options = arrow::compute::CountOptions;
+ auto out = std::make_shared<Options>(Options::Defaults());
+ out->mode =
+ cpp11::as_cpp<bool>(options["na.rm"]) ? Options::ONLY_VALID : Options::ALL;
+ return out;
+ }
+
+ if (func_name == "min_element_wise" || func_name == "max_element_wise") {
+ using Options = arrow::compute::ElementWiseAggregateOptions;
+ bool skip_nulls = true;
+ if (!Rf_isNull(options["skip_nulls"])) {
+ skip_nulls = cpp11::as_cpp<bool>(options["skip_nulls"]);
+ }
+ return std::make_shared<Options>(skip_nulls);
+ }
+
+ if (func_name == "quantile") {
+ using Options = arrow::compute::QuantileOptions;
+ auto out = std::make_shared<Options>(Options::Defaults());
+ SEXP q = options["q"];
+ if (!Rf_isNull(q) && TYPEOF(q) == REALSXP) {
+ out->q = cpp11::as_cpp<std::vector<double>>(q);
+ }
+ SEXP interpolation = options["interpolation"];
+ if (!Rf_isNull(interpolation) && TYPEOF(interpolation) == INTSXP &&
+ XLENGTH(interpolation) == 1) {
+ out->interpolation =
+ cpp11::as_cpp<enum arrow::compute::QuantileOptions::Interpolation>(
+ interpolation);
+ }
+ if (!Rf_isNull(options["min_count"])) {
+ out->min_count = cpp11::as_cpp<int64_t>(options["min_count"]);
+ }
+ if (!Rf_isNull(options["skip_nulls"])) {
+    out->skip_nulls = cpp11::as_cpp<bool>(options["skip_nulls"]);
+ }
+ return out;
+ }
+
+ if (func_name == "is_in" || func_name == "index_in") {
+ using Options = arrow::compute::SetLookupOptions;
+ return std::make_shared<Options>(cpp11::as_cpp<arrow::Datum>(options["value_set"]),
+ cpp11::as_cpp<bool>(options["skip_nulls"]));
+ }
+
+ if (func_name == "index") {
+ using Options = arrow::compute::IndexOptions;
+ return std::make_shared<Options>(
+ cpp11::as_cpp<std::shared_ptr<arrow::Scalar>>(options["value"]));
+ }
+
+ if (func_name == "is_null") {
+ using Options = arrow::compute::NullOptions;
+ auto out = std::make_shared<Options>(Options::Defaults());
+ if (!Rf_isNull(options["nan_is_null"])) {
+ out->nan_is_null = cpp11::as_cpp<bool>(options["nan_is_null"]);
+ }
+ return out;
+ }
+
+ if (func_name == "dictionary_encode") {
+ using Options = arrow::compute::DictionaryEncodeOptions;
+ auto out = std::make_shared<Options>(Options::Defaults());
+ if (!Rf_isNull(options["null_encoding_behavior"])) {
+ out->null_encoding_behavior = cpp11::as_cpp<
+ enum arrow::compute::DictionaryEncodeOptions::NullEncodingBehavior>(
+ options["null_encoding_behavior"]);
+ }
+ return out;
+ }
+
+ if (func_name == "cast") {
+ return make_cast_options(options);
+ }
+
+ if (func_name == "binary_join_element_wise") {
+ using Options = arrow::compute::JoinOptions;
+ auto out = std::make_shared<Options>(Options::Defaults());
+ if (!Rf_isNull(options["null_handling"])) {
+ out->null_handling =
+ cpp11::as_cpp<enum arrow::compute::JoinOptions::NullHandlingBehavior>(
+ options["null_handling"]);
+ }
+ if (!Rf_isNull(options["null_replacement"])) {
+ out->null_replacement = cpp11::as_cpp<std::string>(options["null_replacement"]);
+ }
+ return out;
+ }
+
+ if (func_name == "make_struct") {
+ using Options = arrow::compute::MakeStructOptions;
+ // TODO (ARROW-13371): accept `field_nullability` and `field_metadata` options
+ return std::make_shared<Options>(
+ cpp11::as_cpp<std::vector<std::string>>(options["field_names"]));
+ }
+
+ if (func_name == "match_substring" || func_name == "match_substring_regex" ||
+ func_name == "find_substring" || func_name == "find_substring_regex" ||
+ func_name == "match_like" || func_name == "starts_with" ||
+ func_name == "ends_with" || func_name == "count_substring" ||
+ func_name == "count_substring_regex") {
+ using Options = arrow::compute::MatchSubstringOptions;
+ bool ignore_case = false;
+ if (!Rf_isNull(options["ignore_case"])) {
+ ignore_case = cpp11::as_cpp<bool>(options["ignore_case"]);
+ }
+ return std::make_shared<Options>(cpp11::as_cpp<std::string>(options["pattern"]),
+ ignore_case);
+ }
+
+ if (func_name == "replace_substring" || func_name == "replace_substring_regex") {
+ using Options = arrow::compute::ReplaceSubstringOptions;
+ int64_t max_replacements = -1;
+ if (!Rf_isNull(options["max_replacements"])) {
+ max_replacements = cpp11::as_cpp<int64_t>(options["max_replacements"]);
+ }
+ return std::make_shared<Options>(cpp11::as_cpp<std::string>(options["pattern"]),
+ cpp11::as_cpp<std::string>(options["replacement"]),
+ max_replacements);
+ }
+
+ if (func_name == "extract_regex") {
+ using Options = arrow::compute::ExtractRegexOptions;
+ return std::make_shared<Options>(cpp11::as_cpp<std::string>(options["pattern"]));
+ }
+
+ if (func_name == "day_of_week") {
+ using Options = arrow::compute::DayOfWeekOptions;
+ bool count_from_zero = false;
+ if (!Rf_isNull(options["count_from_zero"])) {
+ count_from_zero = cpp11::as_cpp<bool>(options["count_from_zero"]);
+ }
+ return std::make_shared<Options>(count_from_zero,
+ cpp11::as_cpp<uint32_t>(options["week_start"]));
+ }
+
+ if (func_name == "iso_week") {
+ return std::make_shared<arrow::compute::WeekOptions>(
+ arrow::compute::WeekOptions::ISODefaults());
+ }
+
+ if (func_name == "us_week") {
+ return std::make_shared<arrow::compute::WeekOptions>(
+ arrow::compute::WeekOptions::USDefaults());
+ }
+
+ if (func_name == "week") {
+ using Options = arrow::compute::WeekOptions;
+ bool week_starts_monday = true;
+ bool count_from_zero = false;
+ bool first_week_is_fully_in_year = false;
+ if (!Rf_isNull(options["week_starts_monday"])) {
+ week_starts_monday = cpp11::as_cpp<bool>(options["week_starts_monday"]);
+ }
+ if (!Rf_isNull(options["count_from_zero"])) {
+ count_from_zero = cpp11::as_cpp<bool>(options["count_from_zero"]);
+ }
+    if (!Rf_isNull(options["first_week_is_fully_in_year"])) {
+      first_week_is_fully_in_year =
+          cpp11::as_cpp<bool>(options["first_week_is_fully_in_year"]);
+    }
+ return std::make_shared<Options>(week_starts_monday, count_from_zero,
+ first_week_is_fully_in_year);
+ }
+
+ if (func_name == "strptime") {
+ using Options = arrow::compute::StrptimeOptions;
+ return std::make_shared<Options>(
+ cpp11::as_cpp<std::string>(options["format"]),
+ cpp11::as_cpp<arrow::TimeUnit::type>(options["unit"]));
+ }
+
+ if (func_name == "strftime") {
+ using Options = arrow::compute::StrftimeOptions;
+ return std::make_shared<Options>(
+ Options(cpp11::as_cpp<std::string>(options["format"]),
+ cpp11::as_cpp<std::string>(options["locale"])));
+ }
+
+ if (func_name == "assume_timezone") {
+ using Options = arrow::compute::AssumeTimezoneOptions;
+  // Initialize to Arrow's raising defaults so the values are defined even
+  // when the R side passes NULL.
+  enum Options::Ambiguous ambiguous = Options::AMBIGUOUS_RAISE;
+  enum Options::Nonexistent nonexistent = Options::NONEXISTENT_RAISE;
+
+ if (!Rf_isNull(options["ambiguous"])) {
+ ambiguous = cpp11::as_cpp<enum Options::Ambiguous>(options["ambiguous"]);
+ }
+ if (!Rf_isNull(options["nonexistent"])) {
+ nonexistent = cpp11::as_cpp<enum Options::Nonexistent>(options["nonexistent"]);
+ }
+
+ return std::make_shared<Options>(cpp11::as_cpp<std::string>(options["timezone"]),
+ ambiguous, nonexistent);
+ }
+
+ if (func_name == "split_pattern" || func_name == "split_pattern_regex") {
+ using Options = arrow::compute::SplitPatternOptions;
+ int64_t max_splits = -1;
+ if (!Rf_isNull(options["max_splits"])) {
+ max_splits = cpp11::as_cpp<int64_t>(options["max_splits"]);
+ }
+ bool reverse = false;
+ if (!Rf_isNull(options["reverse"])) {
+ reverse = cpp11::as_cpp<bool>(options["reverse"]);
+ }
+ return std::make_shared<Options>(cpp11::as_cpp<std::string>(options["pattern"]),
+ max_splits, reverse);
+ }
+
+ if (func_name == "utf8_lpad" || func_name == "utf8_rpad" ||
+ func_name == "utf8_center" || func_name == "ascii_lpad" ||
+ func_name == "ascii_rpad" || func_name == "ascii_center") {
+ using Options = arrow::compute::PadOptions;
+ return std::make_shared<Options>(cpp11::as_cpp<int64_t>(options["width"]),
+ cpp11::as_cpp<std::string>(options["padding"]));
+ }
+
+ if (func_name == "utf8_split_whitespace" || func_name == "ascii_split_whitespace") {
+ using Options = arrow::compute::SplitOptions;
+ int64_t max_splits = -1;
+ if (!Rf_isNull(options["max_splits"])) {
+ max_splits = cpp11::as_cpp<int64_t>(options["max_splits"]);
+ }
+ bool reverse = false;
+ if (!Rf_isNull(options["reverse"])) {
+ reverse = cpp11::as_cpp<bool>(options["reverse"]);
+ }
+ return std::make_shared<Options>(max_splits, reverse);
+ }
+
+ if (func_name == "utf8_trim" || func_name == "utf8_ltrim" ||
+ func_name == "utf8_rtrim" || func_name == "ascii_trim" ||
+ func_name == "ascii_ltrim" || func_name == "ascii_rtrim") {
+ using Options = arrow::compute::TrimOptions;
+ return std::make_shared<Options>(cpp11::as_cpp<std::string>(options["characters"]));
+ }
+
+ if (func_name == "utf8_slice_codeunits") {
+ using Options = arrow::compute::SliceOptions;
+
+ int64_t step = 1;
+ if (!Rf_isNull(options["step"])) {
+ step = cpp11::as_cpp<int64_t>(options["step"]);
+ }
+
+ int64_t stop = std::numeric_limits<int32_t>::max();
+ if (!Rf_isNull(options["stop"])) {
+ stop = cpp11::as_cpp<int64_t>(options["stop"]);
+ }
+
+ return std::make_shared<Options>(cpp11::as_cpp<int64_t>(options["start"]), stop,
+ step);
+ }
+
+ if (func_name == "utf8_replace_slice" || func_name == "binary_replace_slice") {
+ using Options = arrow::compute::ReplaceSliceOptions;
+
+ return std::make_shared<Options>(cpp11::as_cpp<int64_t>(options["start"]),
+ cpp11::as_cpp<int64_t>(options["stop"]),
+ cpp11::as_cpp<std::string>(options["replacement"]));
+ }
+
+ if (func_name == "variance" || func_name == "stddev" || func_name == "hash_variance" ||
+ func_name == "hash_stddev") {
+ using Options = arrow::compute::VarianceOptions;
+ auto out = std::make_shared<Options>();
+ out->ddof = cpp11::as_cpp<int64_t>(options["ddof"]);
+ if (!Rf_isNull(options["min_count"])) {
+ out->min_count = cpp11::as_cpp<int64_t>(options["min_count"]);
+ }
+ if (!Rf_isNull(options["skip_nulls"])) {
+ out->skip_nulls = cpp11::as_cpp<bool>(options["skip_nulls"]);
+ }
+ return out;
+ }
+
+ if (func_name == "mode") {
+ using Options = arrow::compute::ModeOptions;
+ auto out = std::make_shared<Options>(Options::Defaults());
+ if (!Rf_isNull(options["n"])) {
+ out->n = cpp11::as_cpp<int64_t>(options["n"]);
+ }
+ if (!Rf_isNull(options["min_count"])) {
+ out->min_count = cpp11::as_cpp<uint32_t>(options["min_count"]);
+ }
+ if (!Rf_isNull(options["skip_nulls"])) {
+ out->skip_nulls = cpp11::as_cpp<bool>(options["skip_nulls"]);
+ }
+ return out;
+ }
+
+ if (func_name == "partition_nth_indices") {
+ using Options = arrow::compute::PartitionNthOptions;
+ return std::make_shared<Options>(cpp11::as_cpp<int64_t>(options["pivot"]));
+ }
+
+ if (func_name == "round") {
+ using Options = arrow::compute::RoundOptions;
+ auto out = std::make_shared<Options>(Options::Defaults());
+ if (!Rf_isNull(options["ndigits"])) {
+ out->ndigits = cpp11::as_cpp<int64_t>(options["ndigits"]);
+ }
+ SEXP round_mode = options["round_mode"];
+ if (!Rf_isNull(round_mode)) {
+ out->round_mode = cpp11::as_cpp<enum arrow::compute::RoundMode>(round_mode);
+ }
+ return out;
+ }
+
+ if (func_name == "round_to_multiple") {
+ using Options = arrow::compute::RoundToMultipleOptions;
+ auto out = std::make_shared<Options>(Options::Defaults());
+ if (!Rf_isNull(options["multiple"])) {
+ out->multiple = std::make_shared<arrow::DoubleScalar>(
+ cpp11::as_cpp<double>(options["multiple"]));
+ }
+ SEXP round_mode = options["round_mode"];
+ if (!Rf_isNull(round_mode)) {
+ out->round_mode = cpp11::as_cpp<enum arrow::compute::RoundMode>(round_mode);
+ }
+ return out;
+ }
+
+ return nullptr;
+}
+
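+// Build CastOptions from an R list. Shared by RecordBatch__cast, Table__cast,
+// and the "cast" branch of make_compute_options above.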
+std::shared_ptr<arrow::compute::CastOptions> make_cast_options(cpp11::list options) {
+ using Options = arrow::compute::CastOptions;
+ auto out = std::make_shared<Options>(true);
+ SEXP to_type = options["to_type"];
+ if (!Rf_isNull(to_type) && cpp11::as_cpp<std::shared_ptr<arrow::DataType>>(to_type)) {
+ out->to_type = cpp11::as_cpp<std::shared_ptr<arrow::DataType>>(to_type);
+ }
+
+ SEXP allow_float_truncate = options["allow_float_truncate"];
+ if (!Rf_isNull(allow_float_truncate) && cpp11::as_cpp<bool>(allow_float_truncate)) {
+ out->allow_float_truncate = cpp11::as_cpp<bool>(allow_float_truncate);
+ }
+
+ SEXP allow_time_truncate = options["allow_time_truncate"];
+ if (!Rf_isNull(allow_time_truncate) && cpp11::as_cpp<bool>(allow_time_truncate)) {
+ out->allow_time_truncate = cpp11::as_cpp<bool>(allow_time_truncate);
+ }
+
+ SEXP allow_int_overflow = options["allow_int_overflow"];
+ if (!Rf_isNull(allow_int_overflow) && cpp11::as_cpp<bool>(allow_int_overflow)) {
+ out->allow_int_overflow = cpp11::as_cpp<bool>(allow_int_overflow);
+ }
+ return out;
+}
+
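+// Generic entry point for invoking a named compute function from R; on the
+// R side this is what arrow's call_function() ultimately reaches.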
+// [[arrow::export]]
+SEXP compute__CallFunction(std::string func_name, cpp11::list args, cpp11::list options) {
+ auto opts = make_compute_options(func_name, options);
+ auto datum_args = arrow::r::from_r_list<arrow::Datum>(args);
+ auto out = ValueOrStop(
+ arrow::compute::CallFunction(func_name, datum_args, opts.get(), gc_context()));
+ return from_datum(std::move(out));
+}
+
+// [[arrow::export]]
+std::vector<std::string> compute__GetFunctionNames() {
+ return arrow::compute::GetFunctionRegistry()->GetFunctionNames();
+}
+
+#endif
diff --git a/src/arrow/r/src/config.cpp b/src/arrow/r/src/config.cpp
new file mode 100644
index 000000000..497843573
--- /dev/null
+++ b/src/arrow/r/src/config.cpp
@@ -0,0 +1,37 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+
+#include <arrow/config.h>
+
+// [[arrow::export]]
+std::vector<std::string> build_info() {
+ auto info = arrow::GetBuildInfo();
+ return {info.version_string, info.compiler_id, info.compiler_version,
+ info.compiler_flags, info.git_id};
+}
+
+// [[arrow::export]]
+std::vector<std::string> runtime_info() {
+ auto info = arrow::GetRuntimeInfo();
+ return {info.simd_level, info.detected_simd_level};
+}
+
+#endif
diff --git a/src/arrow/r/src/csv.cpp b/src/arrow/r/src/csv.cpp
new file mode 100644
index 000000000..93d07d82e
--- /dev/null
+++ b/src/arrow/r/src/csv.cpp
@@ -0,0 +1,205 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+
+#include <arrow/csv/reader.h>
+#include <arrow/csv/writer.h>
+#include <arrow/memory_pool.h>
+#include <arrow/util/value_parsing.h>
+
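+// Each csv___*Options__initialize function copies fields from an R named
+// list onto the corresponding Arrow CSV options struct, starting from
+// Defaults().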
+// [[arrow::export]]
+std::shared_ptr<arrow::csv::WriteOptions> csv___WriteOptions__initialize(
+ cpp11::list options) {
+ auto res =
+ std::make_shared<arrow::csv::WriteOptions>(arrow::csv::WriteOptions::Defaults());
+ res->include_header = cpp11::as_cpp<bool>(options["include_header"]);
+ res->batch_size = cpp11::as_cpp<int>(options["batch_size"]);
+ res->io_context = arrow::io::IOContext(gc_memory_pool());
+ return res;
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::csv::ReadOptions> csv___ReadOptions__initialize(
+ cpp11::list options) {
+ auto res =
+ std::make_shared<arrow::csv::ReadOptions>(arrow::csv::ReadOptions::Defaults());
+ res->use_threads = cpp11::as_cpp<bool>(options["use_threads"]);
+ res->block_size = cpp11::as_cpp<int>(options["block_size"]);
+ res->skip_rows = cpp11::as_cpp<int>(options["skip_rows"]);
+ res->column_names = cpp11::as_cpp<std::vector<std::string>>(options["column_names"]);
+ res->autogenerate_column_names =
+ cpp11::as_cpp<bool>(options["autogenerate_column_names"]);
+
+ return res;
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::csv::ParseOptions> csv___ParseOptions__initialize(
+ cpp11::list options) {
+ auto res =
+ std::make_shared<arrow::csv::ParseOptions>(arrow::csv::ParseOptions::Defaults());
+ res->delimiter = cpp11::as_cpp<char>(options["delimiter"]);
+ res->quoting = cpp11::as_cpp<bool>(options["quoting"]);
+ res->quote_char = cpp11::as_cpp<char>(options["quote_char"]);
+ res->double_quote = cpp11::as_cpp<bool>(options["double_quote"]);
+ res->escape_char = cpp11::as_cpp<char>(options["escape_char"]);
+ res->newlines_in_values = cpp11::as_cpp<bool>(options["newlines_in_values"]);
+ res->ignore_empty_lines = cpp11::as_cpp<bool>(options["ignore_empty_lines"]);
+ return res;
+}
+
+// [[arrow::export]]
+SEXP csv___ReadOptions__column_names(
+ const std::shared_ptr<arrow::csv::ReadOptions>& options) {
+ if (options->autogenerate_column_names) {
+ return R_NilValue;
+ }
+
+ return cpp11::as_sexp(options->column_names);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::csv::ConvertOptions> csv___ConvertOptions__initialize(
+ cpp11::list options) {
+ auto res = std::make_shared<arrow::csv::ConvertOptions>(
+ arrow::csv::ConvertOptions::Defaults());
+ res->check_utf8 = cpp11::as_cpp<bool>(options["check_utf8"]);
+ // Recognized spellings for null values
+ res->null_values = cpp11::as_cpp<std::vector<std::string>>(options["null_values"]);
+ // Whether string / binary columns can have null values.
+ // If true, then strings in "null_values" are considered null for string columns.
+ // If false, then all strings are valid string values.
+ res->strings_can_be_null = cpp11::as_cpp<bool>(options["strings_can_be_null"]);
+
+ res->true_values = cpp11::as_cpp<std::vector<std::string>>(options["true_values"]);
+ res->false_values = cpp11::as_cpp<std::vector<std::string>>(options["false_values"]);
+
+ SEXP col_types = options["col_types"];
+ if (Rf_inherits(col_types, "Schema")) {
+ auto schema = cpp11::as_cpp<std::shared_ptr<arrow::Schema>>(col_types);
+ std::unordered_map<std::string, std::shared_ptr<arrow::DataType>> column_types;
+ for (const auto& field : schema->fields()) {
+ column_types.insert(std::make_pair(field->name(), field->type()));
+ }
+ res->column_types = column_types;
+ }
+
+ res->auto_dict_encode = cpp11::as_cpp<bool>(options["auto_dict_encode"]);
+ res->auto_dict_max_cardinality =
+ cpp11::as_cpp<int>(options["auto_dict_max_cardinality"]);
+ res->include_columns =
+ cpp11::as_cpp<std::vector<std::string>>(options["include_columns"]);
+ res->include_missing_columns = cpp11::as_cpp<bool>(options["include_missing_columns"]);
+
+ SEXP op_timestamp_parsers = options["timestamp_parsers"];
+ if (!Rf_isNull(op_timestamp_parsers)) {
+ std::vector<std::shared_ptr<arrow::TimestampParser>> timestamp_parsers;
+
+    // if we have a character vector, make one strptime parser per format string
+ if (TYPEOF(op_timestamp_parsers) == STRSXP) {
+ cpp11::strings s_timestamp_parsers(op_timestamp_parsers);
+ for (cpp11::r_string s : s_timestamp_parsers) {
+ timestamp_parsers.push_back(arrow::TimestampParser::MakeStrptime(s));
+ }
+
+ } else if (TYPEOF(op_timestamp_parsers) == VECSXP) {
+ cpp11::list lst_parsers(op_timestamp_parsers);
+
+ for (SEXP x : lst_parsers) {
+ // handle scalar string and TimestampParser instances
+ if (TYPEOF(x) == STRSXP && XLENGTH(x) == 1) {
+ timestamp_parsers.push_back(
+ arrow::TimestampParser::MakeStrptime(CHAR(STRING_ELT(x, 0))));
+ } else if (Rf_inherits(x, "TimestampParser")) {
+ timestamp_parsers.push_back(
+ cpp11::as_cpp<std::shared_ptr<arrow::TimestampParser>>(x));
+ } else {
+ cpp11::stop(
+ "unsupported timestamp parser, must be a scalar string or a "
+ "<TimestampParser> object");
+ }
+ }
+
+ } else {
+ cpp11::stop(
+ "unsupported timestamp parser, must be character vector of strptime "
+ "specifications, or a list of <TimestampParser> objects");
+ }
+ res->timestamp_parsers = timestamp_parsers;
+ }
+
+ return res;
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::csv::TableReader> csv___TableReader__Make(
+ const std::shared_ptr<arrow::io::InputStream>& input,
+ const std::shared_ptr<arrow::csv::ReadOptions>& read_options,
+ const std::shared_ptr<arrow::csv::ParseOptions>& parse_options,
+ const std::shared_ptr<arrow::csv::ConvertOptions>& convert_options) {
+ return ValueOrStop(arrow::csv::TableReader::Make(arrow::io::IOContext(gc_memory_pool()),
+ input, *read_options, *parse_options,
+ *convert_options));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Table> csv___TableReader__Read(
+ const std::shared_ptr<arrow::csv::TableReader>& table_reader) {
+ return ValueOrStop(table_reader->Read());
+}
+
+// [[arrow::export]]
+std::string TimestampParser__kind(const std::shared_ptr<arrow::TimestampParser>& parser) {
+ return parser->kind();
+}
+
+// [[arrow::export]]
+std::string TimestampParser__format(
+ const std::shared_ptr<arrow::TimestampParser>& parser) {
+ return parser->format();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::TimestampParser> TimestampParser__MakeStrptime(
+ std::string format) {
+ return arrow::TimestampParser::MakeStrptime(format);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::TimestampParser> TimestampParser__MakeISO8601() {
+ return arrow::TimestampParser::MakeISO8601();
+}
+
+// [[arrow::export]]
+void csv___WriteCSV__Table(const std::shared_ptr<arrow::Table>& table,
+ const std::shared_ptr<arrow::csv::WriteOptions>& write_options,
+ const std::shared_ptr<arrow::io::OutputStream>& stream) {
+ StopIfNotOk(arrow::csv::WriteCSV(*table, *write_options, stream.get()));
+}
+
+// [[arrow::export]]
+void csv___WriteCSV__RecordBatch(
+ const std::shared_ptr<arrow::RecordBatch>& record_batch,
+ const std::shared_ptr<arrow::csv::WriteOptions>& write_options,
+ const std::shared_ptr<arrow::io::OutputStream>& stream) {
+ StopIfNotOk(arrow::csv::WriteCSV(*record_batch, *write_options, stream.get()));
+}
+
+#endif
diff --git a/src/arrow/r/src/dataset.cpp b/src/arrow/r/src/dataset.cpp
new file mode 100644
index 000000000..7e384aa54
--- /dev/null
+++ b/src/arrow/r/src/dataset.cpp
@@ -0,0 +1,543 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_DATASET)
+
+#include <arrow/array.h>
+#include <arrow/compute/api.h>
+#include <arrow/dataset/api.h>
+#include <arrow/filesystem/filesystem.h>
+#include <arrow/ipc/writer.h>
+#include <arrow/table.h>
+#include <arrow/util/checked_cast.h>
+#include <arrow/util/iterator.h>
+#include <parquet/properties.h>
+
+namespace ds = ::arrow::dataset;
+namespace fs = ::arrow::fs;
+namespace compute = ::arrow::compute;
+
+namespace cpp11 {
+
+const char* r6_class_name<ds::Dataset>::get(const std::shared_ptr<ds::Dataset>& dataset) {
+ auto type_name = dataset->type_name();
+
+ if (type_name == "union") {
+ return "UnionDataset";
+ } else if (type_name == "filesystem") {
+ return "FileSystemDataset";
+ } else if (type_name == "in-memory") {
+ return "InMemoryDataset";
+ } else {
+ return "Dataset";
+ }
+}
+
+const char* r6_class_name<ds::FileFormat>::get(
+ const std::shared_ptr<ds::FileFormat>& file_format) {
+ auto type_name = file_format->type_name();
+ if (type_name == "parquet") {
+ return "ParquetFileFormat";
+ } else if (type_name == "ipc") {
+ return "IpcFileFormat";
+ } else if (type_name == "csv") {
+ return "CsvFileFormat";
+ } else {
+ return "FileFormat";
+ }
+}
+
+} // namespace cpp11
+
+// Dataset, UnionDataset, FileSystemDataset
+
+// [[dataset::export]]
+std::shared_ptr<ds::ScannerBuilder> dataset___Dataset__NewScan(
+ const std::shared_ptr<ds::Dataset>& ds) {
+ auto builder = ValueOrStop(ds->NewScan());
+ StopIfNotOk(builder->Pool(gc_memory_pool()));
+ return builder;
+}
+
+// [[dataset::export]]
+std::shared_ptr<arrow::Schema> dataset___Dataset__schema(
+ const std::shared_ptr<ds::Dataset>& dataset) {
+ return dataset->schema();
+}
+
+// [[dataset::export]]
+std::string dataset___Dataset__type_name(const std::shared_ptr<ds::Dataset>& dataset) {
+ return dataset->type_name();
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::Dataset> dataset___Dataset__ReplaceSchema(
+ const std::shared_ptr<ds::Dataset>& dataset,
+ const std::shared_ptr<arrow::Schema>& schm) {
+ return ValueOrStop(dataset->ReplaceSchema(schm));
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::Dataset> dataset___UnionDataset__create(
+ const ds::DatasetVector& datasets, const std::shared_ptr<arrow::Schema>& schm) {
+ return ValueOrStop(ds::UnionDataset::Make(schm, datasets));
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::Dataset> dataset___InMemoryDataset__create(
+ const std::shared_ptr<arrow::Table>& table) {
+ return std::make_shared<ds::InMemoryDataset>(table);
+}
+
+// [[dataset::export]]
+cpp11::list dataset___UnionDataset__children(
+ const std::shared_ptr<ds::UnionDataset>& ds) {
+ return arrow::r::to_r_list(ds->children());
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::FileFormat> dataset___FileSystemDataset__format(
+ const std::shared_ptr<ds::FileSystemDataset>& dataset) {
+ return dataset->format();
+}
+
+// [[dataset::export]]
+std::shared_ptr<fs::FileSystem> dataset___FileSystemDataset__filesystem(
+ const std::shared_ptr<ds::FileSystemDataset>& dataset) {
+ return dataset->filesystem();
+}
+
+// [[dataset::export]]
+std::vector<std::string> dataset___FileSystemDataset__files(
+ const std::shared_ptr<ds::FileSystemDataset>& dataset) {
+ return dataset->files();
+}
+
+// DatasetFactory, UnionDatasetFactory, FileSystemDatasetFactory
+
+// [[dataset::export]]
+std::shared_ptr<ds::Dataset> dataset___DatasetFactory__Finish1(
+ const std::shared_ptr<ds::DatasetFactory>& factory, bool unify_schemas) {
+ ds::FinishOptions opts;
+ if (unify_schemas) {
+ opts.inspect_options.fragments = ds::InspectOptions::kInspectAllFragments;
+ }
+ return ValueOrStop(factory->Finish(opts));
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::Dataset> dataset___DatasetFactory__Finish2(
+ const std::shared_ptr<ds::DatasetFactory>& factory,
+ const std::shared_ptr<arrow::Schema>& schema) {
+ return ValueOrStop(factory->Finish(schema));
+}
+
+// [[dataset::export]]
+std::shared_ptr<arrow::Schema> dataset___DatasetFactory__Inspect(
+ const std::shared_ptr<ds::DatasetFactory>& factory, bool unify_schemas) {
+ ds::InspectOptions opts;
+ if (unify_schemas) {
+ opts.fragments = ds::InspectOptions::kInspectAllFragments;
+ }
+ return ValueOrStop(factory->Inspect(opts));
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::DatasetFactory> dataset___UnionDatasetFactory__Make(
+ const std::vector<std::shared_ptr<ds::DatasetFactory>>& children) {
+ return ValueOrStop(ds::UnionDatasetFactory::Make(children));
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::FileSystemDatasetFactory> dataset___FileSystemDatasetFactory__Make0(
+ const std::shared_ptr<fs::FileSystem>& fs, const std::vector<std::string>& paths,
+ const std::shared_ptr<ds::FileFormat>& format) {
+ // TODO(fsaintjacques): Make options configurable
+ auto options = ds::FileSystemFactoryOptions{};
+
+ return arrow::internal::checked_pointer_cast<ds::FileSystemDatasetFactory>(
+ ValueOrStop(ds::FileSystemDatasetFactory::Make(fs, paths, format, options)));
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::FileSystemDatasetFactory> dataset___FileSystemDatasetFactory__Make2(
+ const std::shared_ptr<fs::FileSystem>& fs,
+ const std::shared_ptr<fs::FileSelector>& selector,
+ const std::shared_ptr<ds::FileFormat>& format,
+ const std::shared_ptr<ds::Partitioning>& partitioning) {
+ // TODO(fsaintjacques): Make options configurable
+ auto options = ds::FileSystemFactoryOptions{};
+ if (partitioning != nullptr) {
+ options.partitioning = partitioning;
+ }
+
+ return arrow::internal::checked_pointer_cast<ds::FileSystemDatasetFactory>(
+ ValueOrStop(ds::FileSystemDatasetFactory::Make(fs, *selector, format, options)));
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::FileSystemDatasetFactory> dataset___FileSystemDatasetFactory__Make1(
+ const std::shared_ptr<fs::FileSystem>& fs,
+ const std::shared_ptr<fs::FileSelector>& selector,
+ const std::shared_ptr<ds::FileFormat>& format) {
+ return dataset___FileSystemDatasetFactory__Make2(fs, selector, format, nullptr);
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::FileSystemDatasetFactory> dataset___FileSystemDatasetFactory__Make3(
+ const std::shared_ptr<fs::FileSystem>& fs,
+ const std::shared_ptr<fs::FileSelector>& selector,
+ const std::shared_ptr<ds::FileFormat>& format,
+ const std::shared_ptr<ds::PartitioningFactory>& factory) {
+ // TODO(fsaintjacques): Make options configurable
+ auto options = ds::FileSystemFactoryOptions{};
+ if (factory != nullptr) {
+ options.partitioning = factory;
+ }
+
+ return arrow::internal::checked_pointer_cast<ds::FileSystemDatasetFactory>(
+ ValueOrStop(ds::FileSystemDatasetFactory::Make(fs, *selector, format, options)));
+}
+
+// FileFormat, ParquetFileFormat, IpcFileFormat
+
+// [[dataset::export]]
+std::string dataset___FileFormat__type_name(
+ const std::shared_ptr<ds::FileFormat>& format) {
+ return format->type_name();
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::FileWriteOptions> dataset___FileFormat__DefaultWriteOptions(
+ const std::shared_ptr<ds::FileFormat>& fmt) {
+ return fmt->DefaultWriteOptions();
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::ParquetFileFormat> dataset___ParquetFileFormat__Make(
+ const std::shared_ptr<ds::ParquetFragmentScanOptions>& options,
+ cpp11::strings dict_columns) {
+ auto fmt = std::make_shared<ds::ParquetFileFormat>();
+ fmt->default_fragment_scan_options = std::move(options);
+
+ auto dict_columns_vector = cpp11::as_cpp<std::vector<std::string>>(dict_columns);
+ auto& d = fmt->reader_options.dict_columns;
+ std::move(dict_columns_vector.begin(), dict_columns_vector.end(),
+ std::inserter(d, d.end()));
+
+ return fmt;
+}
+
+// [[dataset::export]]
+std::string dataset___FileWriteOptions__type_name(
+ const std::shared_ptr<ds::FileWriteOptions>& options) {
+ return options->type_name();
+}
+
+#if defined(ARROW_R_WITH_PARQUET)
+// [[dataset::export]]
+void dataset___ParquetFileWriteOptions__update(
+ const std::shared_ptr<ds::ParquetFileWriteOptions>& options,
+ const std::shared_ptr<parquet::WriterProperties>& writer_props,
+ const std::shared_ptr<parquet::ArrowWriterProperties>& arrow_writer_props) {
+ options->writer_properties = writer_props;
+ options->arrow_writer_properties = arrow_writer_props;
+}
+#endif
+
+// [[dataset::export]]
+void dataset___IpcFileWriteOptions__update2(
+ const std::shared_ptr<ds::IpcFileWriteOptions>& ipc_options, bool use_legacy_format,
+ const std::shared_ptr<arrow::util::Codec>& codec,
+ arrow::ipc::MetadataVersion metadata_version) {
+ ipc_options->options->write_legacy_ipc_format = use_legacy_format;
+ ipc_options->options->codec = codec;
+ ipc_options->options->metadata_version = metadata_version;
+}
+
+// [[dataset::export]]
+void dataset___IpcFileWriteOptions__update1(
+ const std::shared_ptr<ds::IpcFileWriteOptions>& ipc_options, bool use_legacy_format,
+ arrow::ipc::MetadataVersion metadata_version) {
+ ipc_options->options->write_legacy_ipc_format = use_legacy_format;
+ ipc_options->options->metadata_version = metadata_version;
+}
+
+// [[dataset::export]]
+void dataset___CsvFileWriteOptions__update(
+ const std::shared_ptr<ds::CsvFileWriteOptions>& csv_options,
+ const std::shared_ptr<arrow::csv::WriteOptions>& write_options) {
+ *csv_options->write_options = *write_options;
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::IpcFileFormat> dataset___IpcFileFormat__Make() {
+ return std::make_shared<ds::IpcFileFormat>();
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::CsvFileFormat> dataset___CsvFileFormat__Make(
+ const std::shared_ptr<arrow::csv::ParseOptions>& parse_options,
+ const std::shared_ptr<arrow::csv::ConvertOptions>& convert_options,
+ const std::shared_ptr<arrow::csv::ReadOptions>& read_options) {
+ auto format = std::make_shared<ds::CsvFileFormat>();
+ format->parse_options = *parse_options;
+ auto scan_options = std::make_shared<ds::CsvFragmentScanOptions>();
+ if (convert_options) scan_options->convert_options = *convert_options;
+ if (read_options) scan_options->read_options = *read_options;
+ format->default_fragment_scan_options = std::move(scan_options);
+ return format;
+}
+
+// FragmentScanOptions, CsvFragmentScanOptions, ParquetFragmentScanOptions
+
+// [[dataset::export]]
+std::string dataset___FragmentScanOptions__type_name(
+ const std::shared_ptr<ds::FragmentScanOptions>& fragment_scan_options) {
+ return fragment_scan_options->type_name();
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::CsvFragmentScanOptions> dataset___CsvFragmentScanOptions__Make(
+ const std::shared_ptr<arrow::csv::ConvertOptions>& convert_options,
+ const std::shared_ptr<arrow::csv::ReadOptions>& read_options) {
+ auto options = std::make_shared<ds::CsvFragmentScanOptions>();
+ options->convert_options = *convert_options;
+ options->read_options = *read_options;
+ return options;
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::ParquetFragmentScanOptions>
+dataset___ParquetFragmentScanOptions__Make(bool use_buffered_stream, int64_t buffer_size,
+ bool pre_buffer) {
+ auto options = std::make_shared<ds::ParquetFragmentScanOptions>();
+ if (use_buffered_stream) {
+ options->reader_properties->enable_buffered_stream();
+ } else {
+ options->reader_properties->disable_buffered_stream();
+ }
+ options->reader_properties->set_buffer_size(buffer_size);
+ options->arrow_reader_properties->set_pre_buffer(pre_buffer);
+ if (pre_buffer) {
+ options->arrow_reader_properties->set_cache_options(
+ arrow::io::CacheOptions::LazyDefaults());
+ }
+ return options;
+}
+
+// DirectoryPartitioning, HivePartitioning
+
+ds::SegmentEncoding GetSegmentEncoding(const std::string& segment_encoding) {
+ if (segment_encoding == "none") {
+ return ds::SegmentEncoding::None;
+ } else if (segment_encoding == "uri") {
+ return ds::SegmentEncoding::Uri;
+ }
+ cpp11::stop("invalid segment encoding: " + segment_encoding);
+ return ds::SegmentEncoding::None;
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::DirectoryPartitioning> dataset___DirectoryPartitioning(
+ const std::shared_ptr<arrow::Schema>& schm, const std::string& segment_encoding) {
+ ds::KeyValuePartitioningOptions options;
+ options.segment_encoding = GetSegmentEncoding(segment_encoding);
+ std::vector<std::shared_ptr<arrow::Array>> dictionaries;
+ return std::make_shared<ds::DirectoryPartitioning>(schm, dictionaries, options);
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::PartitioningFactory> dataset___DirectoryPartitioning__MakeFactory(
+ const std::vector<std::string>& field_names, const std::string& segment_encoding) {
+ ds::PartitioningFactoryOptions options;
+ options.segment_encoding = GetSegmentEncoding(segment_encoding);
+ return ds::DirectoryPartitioning::MakeFactory(field_names, options);
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::HivePartitioning> dataset___HivePartitioning(
+ const std::shared_ptr<arrow::Schema>& schm, const std::string& null_fallback,
+ const std::string& segment_encoding) {
+ ds::HivePartitioningOptions options;
+ options.null_fallback = null_fallback;
+ options.segment_encoding = GetSegmentEncoding(segment_encoding);
+ std::vector<std::shared_ptr<arrow::Array>> dictionaries;
+ return std::make_shared<ds::HivePartitioning>(schm, dictionaries, options);
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::PartitioningFactory> dataset___HivePartitioning__MakeFactory(
+ const std::string& null_fallback, const std::string& segment_encoding) {
+ ds::HivePartitioningFactoryOptions options;
+ options.null_fallback = null_fallback;
+ options.segment_encoding = GetSegmentEncoding(segment_encoding);
+ return ds::HivePartitioning::MakeFactory(options);
+}
+
+// ScannerBuilder, Scanner
+
+// [[dataset::export]]
+void dataset___ScannerBuilder__ProjectNames(const std::shared_ptr<ds::ScannerBuilder>& sb,
+ const std::vector<std::string>& cols) {
+ StopIfNotOk(sb->Project(cols));
+}
+
+// [[dataset::export]]
+void dataset___ScannerBuilder__ProjectExprs(
+ const std::shared_ptr<ds::ScannerBuilder>& sb,
+ const std::vector<std::shared_ptr<compute::Expression>>& exprs,
+ const std::vector<std::string>& names) {
+ // We have shared_ptrs of expressions but need the Expressions
+ std::vector<compute::Expression> expressions;
+ for (auto expr : exprs) {
+ expressions.push_back(*expr);
+ }
+ StopIfNotOk(sb->Project(expressions, names));
+}
+
+// [[dataset::export]]
+void dataset___ScannerBuilder__Filter(const std::shared_ptr<ds::ScannerBuilder>& sb,
+ const std::shared_ptr<compute::Expression>& expr) {
+ StopIfNotOk(sb->Filter(*expr));
+}
+
+// [[dataset::export]]
+void dataset___ScannerBuilder__UseThreads(const std::shared_ptr<ds::ScannerBuilder>& sb,
+ bool threads) {
+ StopIfNotOk(sb->UseThreads(threads));
+}
+
+// [[dataset::export]]
+void dataset___ScannerBuilder__UseAsync(const std::shared_ptr<ds::ScannerBuilder>& sb,
+ bool use_async) {
+ StopIfNotOk(sb->UseAsync(use_async));
+}
+
+// [[dataset::export]]
+void dataset___ScannerBuilder__BatchSize(const std::shared_ptr<ds::ScannerBuilder>& sb,
+ int64_t batch_size) {
+ StopIfNotOk(sb->BatchSize(batch_size));
+}
+
+// [[dataset::export]]
+void dataset___ScannerBuilder__FragmentScanOptions(
+ const std::shared_ptr<ds::ScannerBuilder>& sb,
+ const std::shared_ptr<ds::FragmentScanOptions>& options) {
+ StopIfNotOk(sb->FragmentScanOptions(options));
+}
+
+// [[dataset::export]]
+std::shared_ptr<arrow::Schema> dataset___ScannerBuilder__schema(
+ const std::shared_ptr<ds::ScannerBuilder>& sb) {
+ return sb->schema();
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::Scanner> dataset___ScannerBuilder__Finish(
+ const std::shared_ptr<ds::ScannerBuilder>& sb) {
+ return ValueOrStop(sb->Finish());
+}
+
+// [[dataset::export]]
+std::shared_ptr<ds::ScannerBuilder> dataset___ScannerBuilder__FromRecordBatchReader(
+ const std::shared_ptr<arrow::RecordBatchReader>& reader) {
+  return ds::ScannerBuilder::FromRecordBatchReader(reader);
+}
+
+// [[dataset::export]]
+std::shared_ptr<arrow::Table> dataset___Scanner__ToTable(
+ const std::shared_ptr<ds::Scanner>& scanner) {
+ return ValueOrStop(scanner->ToTable());
+}
+
+// [[dataset::export]]
+cpp11::list dataset___Scanner__ScanBatches(const std::shared_ptr<ds::Scanner>& scanner) {
+ auto it = ValueOrStop(scanner->ScanBatches());
+ arrow::RecordBatchVector batches;
+ StopIfNotOk(it.Visit([&](ds::TaggedRecordBatch tagged_batch) {
+ batches.push_back(std::move(tagged_batch.record_batch));
+ return arrow::Status::OK();
+ }));
+ return arrow::r::to_r_list(batches);
+}
+
+// [[dataset::export]]
+std::shared_ptr<arrow::RecordBatchReader> dataset___Scanner__ToRecordBatchReader(
+ const std::shared_ptr<ds::Scanner>& scanner) {
+ return ValueOrStop(scanner->ToRecordBatchReader());
+}
+
+// [[dataset::export]]
+std::shared_ptr<arrow::Table> dataset___Scanner__head(
+ const std::shared_ptr<ds::Scanner>& scanner, int n) {
+ // TODO: make this a full Slice with offset > 0
+ return ValueOrStop(scanner->Head(n));
+}
+
+// [[dataset::export]]
+std::shared_ptr<arrow::Schema> dataset___Scanner__schema(
+ const std::shared_ptr<ds::Scanner>& sc) {
+ return sc->options()->projected_schema;
+}
+
+// [[dataset::export]]
+cpp11::list dataset___ScanTask__get_batches(
+ const std::shared_ptr<ds::ScanTask>& scan_task) {
+  auto rbi = ValueOrStop(scan_task->Execute());
+  std::vector<std::shared_ptr<arrow::RecordBatch>> out;
+  for (auto b : rbi) {
+    out.push_back(ValueOrStop(b));
+  }
+ return arrow::r::to_r_list(out);
+}
+
+// [[dataset::export]]
+void dataset___Dataset__Write(
+ const std::shared_ptr<ds::FileWriteOptions>& file_write_options,
+ const std::shared_ptr<fs::FileSystem>& filesystem, std::string base_dir,
+ const std::shared_ptr<ds::Partitioning>& partitioning, std::string basename_template,
+ const std::shared_ptr<ds::Scanner>& scanner,
+ arrow::dataset::ExistingDataBehavior existing_data_behavior) {
+ ds::FileSystemDatasetWriteOptions opts;
+ opts.file_write_options = file_write_options;
+ opts.existing_data_behavior = existing_data_behavior;
+ opts.filesystem = filesystem;
+ opts.base_dir = base_dir;
+ opts.partitioning = partitioning;
+ opts.basename_template = basename_template;
+ StopIfNotOk(ds::FileSystemDataset::Write(opts, scanner));
+}
+
+// [[dataset::export]]
+std::shared_ptr<arrow::Table> dataset___Scanner__TakeRows(
+ const std::shared_ptr<ds::Scanner>& scanner,
+ const std::shared_ptr<arrow::Array>& indices) {
+ return ValueOrStop(scanner->TakeRows(*indices));
+}
+
+// [[dataset::export]]
+int64_t dataset___Scanner__CountRows(const std::shared_ptr<ds::Scanner>& scanner) {
+ return ValueOrStop(scanner->CountRows());
+}
+
+#endif
diff --git a/src/arrow/r/src/datatype.cpp b/src/arrow/r/src/datatype.cpp
new file mode 100644
index 000000000..ffbe1ecc5
--- /dev/null
+++ b/src/arrow/r/src/datatype.cpp
@@ -0,0 +1,426 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+#include <arrow/type.h>
+
+namespace cpp11 {
+
+const char* r6_class_name<arrow::DataType>::get(
+ const std::shared_ptr<arrow::DataType>& type) {
+ using arrow::Type;
+
+ switch (type->id()) {
+ case Type::NA:
+ return "Null";
+ case Type::BOOL:
+ return "Boolean";
+ case Type::UINT8:
+ return "UInt8";
+ case Type::UINT16:
+ return "UInt16";
+ case Type::UINT32:
+ return "UInt32";
+ case Type::UINT64:
+ return "UInt64";
+
+ case Type::INT8:
+ return "Int8";
+ case Type::INT16:
+ return "Int16";
+ case Type::INT32:
+ return "Int32";
+ case Type::INT64:
+ return "Int64";
+
+ case Type::HALF_FLOAT:
+ return "Float16";
+ case Type::FLOAT:
+ return "Float32";
+ case Type::DOUBLE:
+ return "Float64";
+
+ case Type::STRING:
+ return "Utf8";
+ case Type::LARGE_STRING:
+ return "LargeUtf8";
+
+ case Type::BINARY:
+ return "Binary";
+ case Type::FIXED_SIZE_BINARY:
+ return "FixedSizeBinary";
+ case Type::LARGE_BINARY:
+ return "LargeBinary";
+
+ case Type::DATE32:
+ return "Date32";
+ case Type::DATE64:
+ return "Date64";
+ case Type::TIMESTAMP:
+ return "Timestamp";
+
+ case Type::TIME32:
+ return "Time32";
+ case Type::TIME64:
+ return "Time64";
+
+ case Type::DECIMAL:
+ return "Decimal128Type";
+
+ case Type::LIST:
+ return "ListType";
+ case Type::LARGE_LIST:
+ return "LargeListType";
+ case Type::FIXED_SIZE_LIST:
+ return "FixedSizeListType";
+
+ case Type::STRUCT:
+ return "StructType";
+ case Type::DICTIONARY:
+ return "DictionaryType";
+
+ default:
+ break;
+ }
+
+ // No R6 classes are defined for:
+ // INTERVAL
+ // SPARSE_UNION
+ // DENSE_UNION
+ // MAP
+ // EXTENSION
+ // DURATION
+ //
+ // If a C++ function returns one of these, it will be wrapped as a DataType.
+
+ return "DataType";
+}
+
+} // namespace cpp11
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Int8__initialize() { return arrow::int8(); }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Int16__initialize() { return arrow::int16(); }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Int32__initialize() { return arrow::int32(); }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Int64__initialize() { return arrow::int64(); }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> UInt8__initialize() { return arrow::uint8(); }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> UInt16__initialize() { return arrow::uint16(); }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> UInt32__initialize() { return arrow::uint32(); }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> UInt64__initialize() { return arrow::uint64(); }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Float16__initialize() { return arrow::float16(); }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Float32__initialize() { return arrow::float32(); }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Float64__initialize() { return arrow::float64(); }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Boolean__initialize() { return arrow::boolean(); }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Utf8__initialize() { return arrow::utf8(); }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> LargeUtf8__initialize() { return arrow::large_utf8(); }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Binary__initialize() { return arrow::binary(); }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> LargeBinary__initialize() {
+ return arrow::large_binary();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Date32__initialize() { return arrow::date32(); }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Date64__initialize() { return arrow::date64(); }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Null__initialize() { return arrow::null(); }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Decimal128Type__initialize(int32_t precision,
+ int32_t scale) {
+ // Use the builder that validates inputs
+ return ValueOrStop(arrow::Decimal128Type::Make(precision, scale));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> FixedSizeBinary__initialize(R_xlen_t byte_width) {
+ if (byte_width == NA_INTEGER) {
+ cpp11::stop("'byte_width' cannot be NA");
+ }
+ if (byte_width < 1) {
+ cpp11::stop("'byte_width' must be > 0");
+ }
+ return arrow::fixed_size_binary(byte_width);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Timestamp__initialize(arrow::TimeUnit::type unit,
+ const std::string& timezone) {
+ return arrow::timestamp(unit, timezone);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Time32__initialize(arrow::TimeUnit::type unit) {
+ return arrow::time32(unit);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Time64__initialize(arrow::TimeUnit::type unit) {
+ return arrow::time64(unit);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> list__(SEXP x) {
+ if (Rf_inherits(x, "Field")) {
+ auto field = cpp11::as_cpp<std::shared_ptr<arrow::Field>>(x);
+ return arrow::list(field);
+ }
+
+ if (!Rf_inherits(x, "DataType")) {
+ cpp11::stop("incompatible");
+ }
+
+ auto type = cpp11::as_cpp<std::shared_ptr<arrow::DataType>>(x);
+ return arrow::list(type);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> large_list__(SEXP x) {
+ if (Rf_inherits(x, "Field")) {
+ auto field = cpp11::as_cpp<std::shared_ptr<arrow::Field>>(x);
+ return arrow::large_list(field);
+ }
+
+ if (!Rf_inherits(x, "DataType")) {
+ cpp11::stop("incompatible");
+ }
+
+ auto type = cpp11::as_cpp<std::shared_ptr<arrow::DataType>>(x);
+ return arrow::large_list(type);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> fixed_size_list__(SEXP x, int list_size) {
+ if (Rf_inherits(x, "Field")) {
+ auto field = cpp11::as_cpp<std::shared_ptr<arrow::Field>>(x);
+ return arrow::fixed_size_list(field, list_size);
+ }
+
+ if (!Rf_inherits(x, "DataType")) {
+ cpp11::stop("incompatible");
+ }
+
+ auto type = cpp11::as_cpp<std::shared_ptr<arrow::DataType>>(x);
+ return arrow::fixed_size_list(type, list_size);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> struct__(
+ const std::vector<std::shared_ptr<arrow::Field>>& fields) {
+ return arrow::struct_(fields);
+}
+
+// [[arrow::export]]
+std::string DataType__ToString(const std::shared_ptr<arrow::DataType>& type) {
+ return type->ToString();
+}
+
+// [[arrow::export]]
+std::string DataType__name(const std::shared_ptr<arrow::DataType>& type) {
+ return type->name();
+}
+
+// [[arrow::export]]
+bool DataType__Equals(const std::shared_ptr<arrow::DataType>& lhs,
+ const std::shared_ptr<arrow::DataType>& rhs) {
+ return lhs->Equals(*rhs);
+}
+
+// [[arrow::export]]
+int DataType__num_fields(const std::shared_ptr<arrow::DataType>& type) {
+ return type->num_fields();
+}
+
+// [[arrow::export]]
+cpp11::list DataType__fields(const std::shared_ptr<arrow::DataType>& type) {
+ return arrow::r::to_r_list(type->fields());
+}
+
+// [[arrow::export]]
+arrow::Type::type DataType__id(const std::shared_ptr<arrow::DataType>& type) {
+ return type->id();
+}
+
+// [[arrow::export]]
+std::string ListType__ToString(const std::shared_ptr<arrow::ListType>& type) {
+ return type->ToString();
+}
+
+// [[arrow::export]]
+int FixedWidthType__bit_width(const std::shared_ptr<arrow::FixedWidthType>& type) {
+ return type->bit_width();
+}
+
+// [[arrow::export]]
+arrow::DateUnit DateType__unit(const std::shared_ptr<arrow::DateType>& type) {
+ return type->unit();
+}
+
+// [[arrow::export]]
+arrow::TimeUnit::type TimeType__unit(const std::shared_ptr<arrow::TimeType>& type) {
+ return type->unit();
+}
+
+// [[arrow::export]]
+int32_t DecimalType__precision(const std::shared_ptr<arrow::DecimalType>& type) {
+ return type->precision();
+}
+
+// [[arrow::export]]
+int32_t DecimalType__scale(const std::shared_ptr<arrow::DecimalType>& type) {
+ return type->scale();
+}
+
+// [[arrow::export]]
+std::string TimestampType__timezone(const std::shared_ptr<arrow::TimestampType>& type) {
+ return type->timezone();
+}
+
+// [[arrow::export]]
+arrow::TimeUnit::type TimestampType__unit(
+ const std::shared_ptr<arrow::TimestampType>& type) {
+ return type->unit();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> DictionaryType__initialize(
+ const std::shared_ptr<arrow::DataType>& index_type,
+ const std::shared_ptr<arrow::DataType>& value_type, bool ordered) {
+ return ValueOrStop(arrow::DictionaryType::Make(index_type, value_type, ordered));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> DictionaryType__index_type(
+ const std::shared_ptr<arrow::DictionaryType>& type) {
+ return type->index_type();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> DictionaryType__value_type(
+ const std::shared_ptr<arrow::DictionaryType>& type) {
+ return type->value_type();
+}
+
+// [[arrow::export]]
+std::string DictionaryType__name(const std::shared_ptr<arrow::DictionaryType>& type) {
+ return type->name();
+}
+
+// [[arrow::export]]
+bool DictionaryType__ordered(const std::shared_ptr<arrow::DictionaryType>& type) {
+ return type->ordered();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Field> StructType__GetFieldByName(
+ const std::shared_ptr<arrow::StructType>& type, const std::string& name) {
+ return type->GetFieldByName(name);
+}
+
+// [[arrow::export]]
+int StructType__GetFieldIndex(const std::shared_ptr<arrow::StructType>& type,
+ const std::string& name) {
+ return type->GetFieldIndex(name);
+}
+
+// [[arrow::export]]
+std::vector<std::string> StructType__field_names(
+ const std::shared_ptr<arrow::StructType>& type) {
+ auto num_fields = type->num_fields();
+ std::vector<std::string> out(num_fields);
+ for (int i = 0; i < num_fields; i++) {
+ out[i] = type->field(i)->name();
+ }
+ return out;
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Field> ListType__value_field(
+ const std::shared_ptr<arrow::ListType>& type) {
+ return type->value_field();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> ListType__value_type(
+ const std::shared_ptr<arrow::ListType>& type) {
+ return type->value_type();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Field> LargeListType__value_field(
+ const std::shared_ptr<arrow::LargeListType>& type) {
+ return type->value_field();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> LargeListType__value_type(
+ const std::shared_ptr<arrow::LargeListType>& type) {
+ return type->value_type();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Field> FixedSizeListType__value_field(
+ const std::shared_ptr<arrow::FixedSizeListType>& type) {
+ return type->value_field();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> FixedSizeListType__value_type(
+ const std::shared_ptr<arrow::FixedSizeListType>& type) {
+ return type->value_type();
+}
+
+// [[arrow::export]]
+int FixedSizeListType__list_size(const std::shared_ptr<arrow::FixedSizeListType>& type) {
+ return type->list_size();
+}
+
+#endif
diff --git a/src/arrow/r/src/expression.cpp b/src/arrow/r/src/expression.cpp
new file mode 100644
index 000000000..97a8a746b
--- /dev/null
+++ b/src/arrow/r/src/expression.cpp
@@ -0,0 +1,102 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+
+#include <arrow/compute/api_scalar.h>
+#include <arrow/compute/exec/expression.h>
+
+namespace compute = ::arrow::compute;
+
+std::shared_ptr<compute::FunctionOptions> make_compute_options(std::string func_name,
+ cpp11::list options);
+
+// [[arrow::export]]
+bool compute___expr__equals(const std::shared_ptr<compute::Expression>& lhs,
+ const std::shared_ptr<compute::Expression>& rhs) {
+ return lhs->Equals(*rhs);
+}
+// [[arrow::export]]
+std::shared_ptr<compute::Expression> compute___expr__call(std::string func_name,
+ cpp11::list argument_list,
+ cpp11::list options) {
+ std::vector<compute::Expression> arguments;
+ for (SEXP argument : argument_list) {
+ auto argument_ptr = cpp11::as_cpp<std::shared_ptr<compute::Expression>>(argument);
+ arguments.push_back(*argument_ptr);
+ }
+
+ auto options_ptr = make_compute_options(func_name, options);
+
+ return std::make_shared<compute::Expression>(
+ compute::call(std::move(func_name), std::move(arguments), std::move(options_ptr)));
+}
+
+// [[arrow::export]]
+std::vector<std::string> field_names_in_expression(
+ const std::shared_ptr<compute::Expression>& x) {
+ std::vector<std::string> out;
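+ // FieldsInExpression() is found via ADL in arrow::compute; each returned
+ // FieldRef is expected to be a simple name reference here.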
+ auto field_refs = FieldsInExpression(*x);
+ for (auto f : field_refs) {
+ out.push_back(*f.name());
+ }
+ return out;
+}
+
+// [[arrow::export]]
+std::string compute___expr__get_field_ref_name(
+ const std::shared_ptr<compute::Expression>& x) {
+ if (auto field_ref = x->field_ref()) {
+ return *field_ref->name();
+ }
+ return "";
+}
+
+// [[arrow::export]]
+std::shared_ptr<compute::Expression> compute___expr__field_ref(std::string name) {
+ return std::make_shared<compute::Expression>(compute::field_ref(std::move(name)));
+}
+
+// [[arrow::export]]
+std::shared_ptr<compute::Expression> compute___expr__scalar(
+ const std::shared_ptr<arrow::Scalar>& x) {
+ return std::make_shared<compute::Expression>(compute::literal(x));
+}
+
+// [[arrow::export]]
+std::string compute___expr__ToString(const std::shared_ptr<compute::Expression>& x) {
+ return x->ToString();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> compute___expr__type(
+ const std::shared_ptr<compute::Expression>& x,
+ const std::shared_ptr<arrow::Schema>& schema) {
+ auto bound = ValueOrStop(x->Bind(*schema));
+ return bound.type();
+}
+
+// [[arrow::export]]
+arrow::Type::type compute___expr__type_id(const std::shared_ptr<compute::Expression>& x,
+ const std::shared_ptr<arrow::Schema>& schema) {
+ auto bound = ValueOrStop(x->Bind(*schema));
+ return bound.type()->id();
+}
+
+#endif
diff --git a/src/arrow/r/src/feather.cpp b/src/arrow/r/src/feather.cpp
new file mode 100644
index 000000000..1df992baa
--- /dev/null
+++ b/src/arrow/r/src/feather.cpp
@@ -0,0 +1,87 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+#include <arrow/ipc/feather.h>
+#include <arrow/type.h>
+
+// ---------- WriteFeather
+
+// [[arrow::export]]
+void ipc___WriteFeather__Table(const std::shared_ptr<arrow::io::OutputStream>& stream,
+ const std::shared_ptr<arrow::Table>& table, int version,
+ int chunk_size, arrow::Compression::type compression,
+ int compression_level) {
+ auto properties = arrow::ipc::feather::WriteProperties::Defaults();
+ properties.version = version;
+ properties.chunksize = chunk_size;
+ properties.compression = compression;
+ if (compression_level != -1) {
+ properties.compression_level = compression_level;
+ }
+ StopIfNotOk(arrow::ipc::feather::WriteTable(*table, stream.get(), properties));
+}
+
+// ----------- Reader
+
+// [[arrow::export]]
+int ipc___feather___Reader__version(
+ const std::shared_ptr<arrow::ipc::feather::Reader>& reader) {
+ return reader->version();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Table> ipc___feather___Reader__Read(
+ const std::shared_ptr<arrow::ipc::feather::Reader>& reader, SEXP columns) {
+ std::shared_ptr<arrow::Table> table;
+
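+ // 'columns' is either a character vector of column names or NULL,
+ // in which case the whole table is read.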
+ switch (TYPEOF(columns)) {
+ case STRSXP: {
+ R_xlen_t n = XLENGTH(columns);
+ std::vector<std::string> names(n);
+ for (R_xlen_t i = 0; i < n; i++) {
+ names[i] = CHAR(STRING_ELT(columns, i));
+ }
+ StopIfNotOk(reader->Read(names, &table));
+ break;
+ }
+ case NILSXP:
+ StopIfNotOk(reader->Read(&table));
+ break;
+ default:
+ cpp11::stop("incompatible column specification");
+ break;
+ }
+
+ return table;
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::ipc::feather::Reader> ipc___feather___Reader__Open(
+ const std::shared_ptr<arrow::io::RandomAccessFile>& stream) {
+ return ValueOrStop(arrow::ipc::feather::Reader::Open(stream));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Schema> ipc___feather___Reader__schema(
+ const std::shared_ptr<arrow::ipc::feather::Reader>& reader) {
+ return reader->schema();
+}
+
+#endif
diff --git a/src/arrow/r/src/field.cpp b/src/arrow/r/src/field.cpp
new file mode 100644
index 000000000..914d270c6
--- /dev/null
+++ b/src/arrow/r/src/field.cpp
@@ -0,0 +1,56 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+#include <arrow/type.h>
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Field> Field__initialize(
+ const std::string& name, const std::shared_ptr<arrow::DataType>& field,
+ bool nullable = true) {
+ return arrow::field(name, field, nullable);
+}
+
+// [[arrow::export]]
+std::string Field__ToString(const std::shared_ptr<arrow::Field>& field) {
+ return field->ToString();
+}
+
+// [[arrow::export]]
+std::string Field__name(const std::shared_ptr<arrow::Field>& field) {
+ return field->name();
+}
+
+// [[arrow::export]]
+bool Field__Equals(const std::shared_ptr<arrow::Field>& field,
+ const std::shared_ptr<arrow::Field>& other) {
+ return field->Equals(other);
+}
+
+// [[arrow::export]]
+bool Field__nullable(const std::shared_ptr<arrow::Field>& field) {
+ return field->nullable();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Field__type(const std::shared_ptr<arrow::Field>& field) {
+ return field->type();
+}
+
+#endif
diff --git a/src/arrow/r/src/filesystem.cpp b/src/arrow/r/src/filesystem.cpp
new file mode 100644
index 000000000..b7ec60536
--- /dev/null
+++ b/src/arrow/r/src/filesystem.cpp
@@ -0,0 +1,329 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+
+#include <arrow/filesystem/filesystem.h>
+#include <arrow/filesystem/localfs.h>
+
+namespace fs = ::arrow::fs;
+namespace io = ::arrow::io;
+
+namespace cpp11 {
+
+const char* r6_class_name<fs::FileSystem>::get(
+ const std::shared_ptr<fs::FileSystem>& file_system) {
+ auto type_name = file_system->type_name();
+
+ if (type_name == "local") {
+ return "LocalFileSystem";
+ } else if (type_name == "s3") {
+ return "S3FileSystem";
+ } else if (type_name == "subtree") {
+ return "SubTreeFileSystem";
+ } else {
+ return "FileSystem";
+ }
+}
+
+} // namespace cpp11
+
+// [[arrow::export]]
+fs::FileType fs___FileInfo__type(const std::shared_ptr<fs::FileInfo>& x) {
+ return x->type();
+}
+
+// [[arrow::export]]
+void fs___FileInfo__set_type(const std::shared_ptr<fs::FileInfo>& x, fs::FileType type) {
+ x->set_type(type);
+}
+
+// [[arrow::export]]
+std::string fs___FileInfo__path(const std::shared_ptr<fs::FileInfo>& x) {
+ return x->path();
+}
+
+// [[arrow::export]]
+void fs___FileInfo__set_path(const std::shared_ptr<fs::FileInfo>& x,
+ const std::string& path) {
+ x->set_path(path);
+}
+
+// [[arrow::export]]
+int64_t fs___FileInfo__size(const std::shared_ptr<fs::FileInfo>& x) { return x->size(); }
+
+// [[arrow::export]]
+void fs___FileInfo__set_size(const std::shared_ptr<fs::FileInfo>& x, int64_t size) {
+ x->set_size(size);
+}
+
+// [[arrow::export]]
+std::string fs___FileInfo__base_name(const std::shared_ptr<fs::FileInfo>& x) {
+ return x->base_name();
+}
+
+// [[arrow::export]]
+std::string fs___FileInfo__extension(const std::shared_ptr<fs::FileInfo>& x) {
+ return x->extension();
+}
+
+// [[arrow::export]]
+SEXP fs___FileInfo__mtime(const std::shared_ptr<fs::FileInfo>& x) {
+ SEXP res = PROTECT(Rf_allocVector(REALSXP, 1));
+ // .mtime() gets us nanoseconds since epoch, POSIXct is seconds since epoch as a double
+ REAL(res)[0] = static_cast<double>(x->mtime().time_since_epoch().count()) / 1000000000;
+ Rf_classgets(res, arrow::r::data::classes_POSIXct);
+ UNPROTECT(1);
+ return res;
+}
+
+// [[arrow::export]]
+void fs___FileInfo__set_mtime(const std::shared_ptr<fs::FileInfo>& x, SEXP time) {
+ auto nanosecs =
+ std::chrono::nanoseconds(static_cast<int64_t>(REAL(time)[0] * 1000000000));
+ x->set_mtime(fs::TimePoint(nanosecs));
+}
+
+// Selector
+
+// [[arrow::export]]
+std::string fs___FileSelector__base_dir(
+ const std::shared_ptr<fs::FileSelector>& selector) {
+ return selector->base_dir;
+}
+
+// [[arrow::export]]
+bool fs___FileSelector__allow_not_found(
+ const std::shared_ptr<fs::FileSelector>& selector) {
+ return selector->allow_not_found;
+}
+
+// [[arrow::export]]
+bool fs___FileSelector__recursive(const std::shared_ptr<fs::FileSelector>& selector) {
+ return selector->recursive;
+}
+
+// [[arrow::export]]
+std::shared_ptr<fs::FileSelector> fs___FileSelector__create(const std::string& base_dir,
+ bool allow_not_found,
+ bool recursive) {
+ auto selector = std::make_shared<fs::FileSelector>();
+ selector->base_dir = base_dir;
+ selector->allow_not_found = allow_not_found;
+ selector->recursive = recursive;
+ return selector;
+}
+
+// FileSystem
+
+template <typename T>
+std::vector<std::shared_ptr<T>> shared_ptr_vector(const std::vector<T>& vec) {
+ std::vector<std::shared_ptr<T>> res(vec.size());
+ std::transform(vec.begin(), vec.end(), res.begin(),
+ [](const T& x) { return std::make_shared<T>(x); });
+ return res;
+}
+
+// [[arrow::export]]
+cpp11::list fs___FileSystem__GetTargetInfos_Paths(
+ const std::shared_ptr<fs::FileSystem>& file_system,
+ const std::vector<std::string>& paths) {
+ auto results = ValueOrStop(file_system->GetFileInfo(paths));
+ return arrow::r::to_r_list(shared_ptr_vector(results));
+}
+
+// [[arrow::export]]
+cpp11::list fs___FileSystem__GetTargetInfos_FileSelector(
+ const std::shared_ptr<fs::FileSystem>& file_system,
+ const std::shared_ptr<fs::FileSelector>& selector) {
+ auto results = ValueOrStop(file_system->GetFileInfo(*selector));
+
+ return arrow::r::to_r_list(shared_ptr_vector(results));
+}
+
+// [[arrow::export]]
+void fs___FileSystem__CreateDir(const std::shared_ptr<fs::FileSystem>& file_system,
+ const std::string& path, bool recursive) {
+ StopIfNotOk(file_system->CreateDir(path, recursive));
+}
+
+// [[arrow::export]]
+void fs___FileSystem__DeleteDir(const std::shared_ptr<fs::FileSystem>& file_system,
+ const std::string& path) {
+ StopIfNotOk(file_system->DeleteDir(path));
+}
+
+// [[arrow::export]]
+void fs___FileSystem__DeleteDirContents(
+ const std::shared_ptr<fs::FileSystem>& file_system, const std::string& path) {
+ StopIfNotOk(file_system->DeleteDirContents(path));
+}
+
+// [[arrow::export]]
+void fs___FileSystem__DeleteFile(const std::shared_ptr<fs::FileSystem>& file_system,
+ const std::string& path) {
+ StopIfNotOk(file_system->DeleteFile(path));
+}
+
+// [[arrow::export]]
+void fs___FileSystem__DeleteFiles(const std::shared_ptr<fs::FileSystem>& file_system,
+ const std::vector<std::string>& paths) {
+ StopIfNotOk(file_system->DeleteFiles(paths));
+}
+
+// [[arrow::export]]
+void fs___FileSystem__Move(const std::shared_ptr<fs::FileSystem>& file_system,
+ const std::string& src, const std::string& dest) {
+ StopIfNotOk(file_system->Move(src, dest));
+}
+
+// [[arrow::export]]
+void fs___FileSystem__CopyFile(const std::shared_ptr<fs::FileSystem>& file_system,
+ const std::string& src, const std::string& dest) {
+ StopIfNotOk(file_system->CopyFile(src, dest));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::io::InputStream> fs___FileSystem__OpenInputStream(
+ const std::shared_ptr<fs::FileSystem>& file_system, const std::string& path) {
+ return ValueOrStop(file_system->OpenInputStream(path));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::io::RandomAccessFile> fs___FileSystem__OpenInputFile(
+ const std::shared_ptr<fs::FileSystem>& file_system, const std::string& path) {
+ return ValueOrStop(file_system->OpenInputFile(path));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::io::OutputStream> fs___FileSystem__OpenOutputStream(
+ const std::shared_ptr<fs::FileSystem>& file_system, const std::string& path) {
+ return ValueOrStop(file_system->OpenOutputStream(path));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::io::OutputStream> fs___FileSystem__OpenAppendStream(
+ const std::shared_ptr<fs::FileSystem>& file_system, const std::string& path) {
+ return ValueOrStop(file_system->OpenAppendStream(path));
+}
+
+// [[arrow::export]]
+std::string fs___FileSystem__type_name(
+ const std::shared_ptr<fs::FileSystem>& file_system) {
+ return file_system->type_name();
+}
+
+// [[arrow::export]]
+std::shared_ptr<fs::LocalFileSystem> fs___LocalFileSystem__create() {
+ // Affects OpenInputFile/OpenInputStream
+ auto io_context = arrow::io::IOContext(gc_memory_pool());
+ return std::make_shared<fs::LocalFileSystem>(io_context);
+}
+
+// [[arrow::export]]
+std::shared_ptr<fs::SubTreeFileSystem> fs___SubTreeFileSystem__create(
+ const std::string& base_path, const std::shared_ptr<fs::FileSystem>& base_fs) {
+ return std::make_shared<fs::SubTreeFileSystem>(base_path, base_fs);
+}
+
+// [[arrow::export]]
+std::shared_ptr<fs::FileSystem> fs___SubTreeFileSystem__base_fs(
+ const std::shared_ptr<fs::SubTreeFileSystem>& file_system) {
+ return file_system->base_fs();
+}
+
+// [[arrow::export]]
+std::string fs___SubTreeFileSystem__base_path(
+ const std::shared_ptr<fs::SubTreeFileSystem>& file_system) {
+ return file_system->base_path();
+}
+
+// [[arrow::export]]
+cpp11::writable::list fs___FileSystemFromUri(const std::string& path) {
+ using cpp11::literals::operator"" _nm;
+
+ std::string out_path;
+ return cpp11::writable::list(
+ {"fs"_nm = cpp11::to_r6(ValueOrStop(fs::FileSystemFromUri(path, &out_path))),
+ "path"_nm = out_path});
+}
+
+// [[arrow::export]]
+void fs___CopyFiles(const std::shared_ptr<fs::FileSystem>& source_fs,
+ const std::shared_ptr<fs::FileSelector>& source_sel,
+ const std::shared_ptr<fs::FileSystem>& destination_fs,
+ const std::string& destination_base_dir,
+ int64_t chunk_size = 1024 * 1024, bool use_threads = true) {
+ StopIfNotOk(fs::CopyFiles(source_fs, *source_sel, destination_fs, destination_base_dir,
+ io::default_io_context(), chunk_size, use_threads));
+}
+
+#endif
+
+#if defined(ARROW_R_WITH_S3)
+
+#include <arrow/filesystem/s3fs.h>
+
+// [[s3::export]]
+std::shared_ptr<fs::S3FileSystem> fs___S3FileSystem__create(
+ bool anonymous = false, std::string access_key = "", std::string secret_key = "",
+ std::string session_token = "", std::string role_arn = "",
+ std::string session_name = "", std::string external_id = "", int load_frequency = 900,
+ std::string region = "", std::string endpoint_override = "", std::string scheme = "",
+ bool background_writes = true) {
+ fs::S3Options s3_opts;
+ // Handle auth (anonymous, keys, default)
+ // (validation/internal coherence handled in R)
+ if (anonymous) {
+ s3_opts = fs::S3Options::Anonymous();
+ } else if (access_key != "" && secret_key != "") {
+ s3_opts = fs::S3Options::FromAccessKey(access_key, secret_key, session_token);
+ } else if (role_arn != "") {
+ s3_opts = fs::S3Options::FromAssumeRole(role_arn, session_name, external_id,
+ load_frequency);
+ } else {
+ s3_opts = fs::S3Options::Defaults();
+ }
+
+ // Now handle the rest of the options
+ /// AWS region to connect to (default determined by AWS SDK)
+ if (region != "") {
+ s3_opts.region = region;
+ }
+ /// If non-empty, override the region's default endpoint with a connect string such as "localhost:9000"
+ s3_opts.endpoint_override = endpoint_override;
+ /// S3 connection transport, default "https"
+ if (scheme != "") {
+ s3_opts.scheme = scheme;
+ }
+ /// Whether OutputStream writes will be issued in the background without blocking (default true)
+ s3_opts.background_writes = background_writes;
+
+ StopIfNotOk(fs::EnsureS3Initialized());
+ auto io_context = arrow::io::IOContext(gc_memory_pool());
+ return ValueOrStop(fs::S3FileSystem::Make(s3_opts, io_context));
+}
+
+// [[s3::export]]
+std::string fs___S3FileSystem__region(const std::shared_ptr<fs::S3FileSystem>& fs) {
+ return fs->region();
+}
+
+#endif
diff --git a/src/arrow/r/src/imports.cpp b/src/arrow/r/src/imports.cpp
new file mode 100644
index 000000000..f4174bab5
--- /dev/null
+++ b/src/arrow/r/src/imports.cpp
@@ -0,0 +1,43 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <R_ext/Rdynload.h> // for R_GetCCallable
+#include <Rdefines.h>
+
+namespace vctrs {
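+// Resolve vctrs' registered C callable via R_GetCCallable once, and cache the
+// function pointer in a function-local static (see vctrs_api() below).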
+struct vctrs_api_ptrs_t {
+ R_len_t (*short_vec_size)(SEXP x);
+
+ vctrs_api_ptrs_t() {
+ short_vec_size = (R_len_t(*)(SEXP))R_GetCCallable("vctrs", "short_vec_size");
+ }
+};
+
+const vctrs_api_ptrs_t& vctrs_api() {
+ static vctrs_api_ptrs_t ptrs;
+ return ptrs;
+}
+
+R_len_t vec_size(SEXP x) {
+ if (Rf_inherits(x, "data.frame") || TYPEOF(x) != VECSXP || Rf_inherits(x, "POSIXlt")) {
+ return vctrs_api().short_vec_size(x);
+ } else {
+ return Rf_length(x);
+ }
+}
+
+} // namespace vctrs
diff --git a/src/arrow/r/src/io.cpp b/src/arrow/r/src/io.cpp
new file mode 100644
index 000000000..6a912dd78
--- /dev/null
+++ b/src/arrow/r/src/io.cpp
@@ -0,0 +1,181 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+#include <arrow/io/file.h>
+#include <arrow/io/memory.h>
+
+// ------ arrow::io::Readable
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Buffer> io___Readable__Read(
+ const std::shared_ptr<arrow::io::Readable>& x, int64_t nbytes) {
+ return ValueOrStop(x->Read(nbytes));
+}
+
+// ------ arrow::io::InputStream
+
+// [[arrow::export]]
+void io___InputStream__Close(const std::shared_ptr<arrow::io::InputStream>& x) {
+ StopIfNotOk(x->Close());
+}
+
+// ------ arrow::io::OutputStream
+
+// [[arrow::export]]
+void io___OutputStream__Close(const std::shared_ptr<arrow::io::OutputStream>& x) {
+ StopIfNotOk(x->Close());
+}
+
+// ------ arrow::io::RandomAccessFile
+
+// [[arrow::export]]
+int64_t io___RandomAccessFile__GetSize(
+ const std::shared_ptr<arrow::io::RandomAccessFile>& x) {
+ return ValueOrStop(x->GetSize());
+}
+
+// [[arrow::export]]
+bool io___RandomAccessFile__supports_zero_copy(
+ const std::shared_ptr<arrow::io::RandomAccessFile>& x) {
+ return x->supports_zero_copy();
+}
+
+// [[arrow::export]]
+void io___RandomAccessFile__Seek(const std::shared_ptr<arrow::io::RandomAccessFile>& x,
+ int64_t position) {
+ StopIfNotOk(x->Seek(position));
+}
+
+// [[arrow::export]]
+int64_t io___RandomAccessFile__Tell(
+ const std::shared_ptr<arrow::io::RandomAccessFile>& x) {
+ return ValueOrStop(x->Tell());
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Buffer> io___RandomAccessFile__Read0(
+ const std::shared_ptr<arrow::io::RandomAccessFile>& x) {
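+ // Read from the current position to the end of the file.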
+ int64_t current = ValueOrStop(x->Tell());
+
+ int64_t n = ValueOrStop(x->GetSize());
+
+ return ValueOrStop(x->Read(n - current));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Buffer> io___RandomAccessFile__ReadAt(
+ const std::shared_ptr<arrow::io::RandomAccessFile>& x, int64_t position,
+ int64_t nbytes) {
+ return ValueOrStop(x->ReadAt(position, nbytes));
+}
+
+// ------ arrow::io::MemoryMappedFile
+
+// [[arrow::export]]
+std::shared_ptr<arrow::io::MemoryMappedFile> io___MemoryMappedFile__Create(
+ const std::string& path, int64_t size) {
+ return ValueOrStop(arrow::io::MemoryMappedFile::Create(path, size));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::io::MemoryMappedFile> io___MemoryMappedFile__Open(
+ const std::string& path, arrow::io::FileMode::type mode) {
+ return ValueOrStop(arrow::io::MemoryMappedFile::Open(path, mode));
+}
+
+// [[arrow::export]]
+void io___MemoryMappedFile__Resize(const std::shared_ptr<arrow::io::MemoryMappedFile>& x,
+ int64_t size) {
+ StopIfNotOk(x->Resize(size));
+}
+
+// ------ arrow::io::ReadableFile
+
+// [[arrow::export]]
+std::shared_ptr<arrow::io::ReadableFile> io___ReadableFile__Open(
+ const std::string& path) {
+ return ValueOrStop(arrow::io::ReadableFile::Open(path, gc_memory_pool()));
+}
+
+// ------ arrow::io::BufferReader
+
+// [[arrow::export]]
+std::shared_ptr<arrow::io::BufferReader> io___BufferReader__initialize(
+ const std::shared_ptr<arrow::Buffer>& buffer) {
+ return std::make_shared<arrow::io::BufferReader>(buffer);
+}
+
+// ------- arrow::io::Writable
+
+// [[arrow::export]]
+void io___Writable__write(const std::shared_ptr<arrow::io::Writable>& stream,
+ const std::shared_ptr<arrow::Buffer>& buf) {
+ StopIfNotOk(stream->Write(buf->data(), buf->size()));
+}
+
+// ------- arrow::io::OutputStream
+
+// [[arrow::export]]
+int64_t io___OutputStream__Tell(const std::shared_ptr<arrow::io::OutputStream>& stream) {
+ return ValueOrStop(stream->Tell());
+}
+
+// ------ arrow::io::FileOutputStream
+
+// [[arrow::export]]
+std::shared_ptr<arrow::io::FileOutputStream> io___FileOutputStream__Open(
+ const std::string& path) {
+ return ValueOrStop(arrow::io::FileOutputStream::Open(path));
+}
+
+// ------ arrow::BufferOutputStream
+
+// [[arrow::export]]
+std::shared_ptr<arrow::io::BufferOutputStream> io___BufferOutputStream__Create(
+ int64_t initial_capacity) {
+ return ValueOrStop(
+ arrow::io::BufferOutputStream::Create(initial_capacity, gc_memory_pool()));
+}
+
+// [[arrow::export]]
+int64_t io___BufferOutputStream__capacity(
+ const std::shared_ptr<arrow::io::BufferOutputStream>& stream) {
+ return stream->capacity();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Buffer> io___BufferOutputStream__Finish(
+ const std::shared_ptr<arrow::io::BufferOutputStream>& stream) {
+ return ValueOrStop(stream->Finish());
+}
+
+// [[arrow::export]]
+int64_t io___BufferOutputStream__Tell(
+ const std::shared_ptr<arrow::io::BufferOutputStream>& stream) {
+ return ValueOrStop(stream->Tell());
+}
+
+// [[arrow::export]]
+void io___BufferOutputStream__Write(
+ const std::shared_ptr<arrow::io::BufferOutputStream>& stream, cpp11::raws bytes) {
+ StopIfNotOk(stream->Write(RAW(bytes), bytes.size()));
+}
+
+#endif
diff --git a/src/arrow/r/src/json.cpp b/src/arrow/r/src/json.cpp
new file mode 100644
index 000000000..ec00e54be
--- /dev/null
+++ b/src/arrow/r/src/json.cpp
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+#if defined(ARROW_R_WITH_JSON)
+
+#include <arrow/json/reader.h>
+
+// [[json::export]]
+std::shared_ptr<arrow::json::ReadOptions> json___ReadOptions__initialize(bool use_threads,
+ int block_size) {
+ auto res =
+ std::make_shared<arrow::json::ReadOptions>(arrow::json::ReadOptions::Defaults());
+ res->use_threads = use_threads;
+ res->block_size = block_size;
+ return res;
+}
+
+// [[json::export]]
+std::shared_ptr<arrow::json::ParseOptions> json___ParseOptions__initialize1(
+ bool newlines_in_values) {
+ auto res =
+ std::make_shared<arrow::json::ParseOptions>(arrow::json::ParseOptions::Defaults());
+ res->newlines_in_values = newlines_in_values;
+ return res;
+}
+
+// [[json::export]]
+std::shared_ptr<arrow::json::ParseOptions> json___ParseOptions__initialize2(
+ bool newlines_in_values, const std::shared_ptr<arrow::Schema>& explicit_schema) {
+ auto res =
+ std::make_shared<arrow::json::ParseOptions>(arrow::json::ParseOptions::Defaults());
+ res->newlines_in_values = newlines_in_values;
+ res->explicit_schema = explicit_schema;
+ return res;
+}
+
+// [[json::export]]
+std::shared_ptr<arrow::json::TableReader> json___TableReader__Make(
+ const std::shared_ptr<arrow::io::InputStream>& input,
+ const std::shared_ptr<arrow::json::ReadOptions>& read_options,
+ const std::shared_ptr<arrow::json::ParseOptions>& parse_options) {
+ return ValueOrStop(arrow::json::TableReader::Make(gc_memory_pool(), input,
+ *read_options, *parse_options));
+}
+
+// [[json::export]]
+std::shared_ptr<arrow::Table> json___TableReader__Read(
+ const std::shared_ptr<arrow::json::TableReader>& table_reader) {
+ return ValueOrStop(table_reader->Read());
+}
+
+#endif
diff --git a/src/arrow/r/src/memorypool.cpp b/src/arrow/r/src/memorypool.cpp
new file mode 100644
index 000000000..b48e7ec71
--- /dev/null
+++ b/src/arrow/r/src/memorypool.cpp
@@ -0,0 +1,92 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+#if defined(ARROW_R_WITH_ARROW)
+#include <arrow/memory_pool.h>
+#include <arrow/util/mutex.h>
+
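+// A MemoryPool that delegates to the default pool but, on allocation failure,
+// triggers R's garbage collector once and retries the allocation.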
+class GcMemoryPool : public arrow::MemoryPool {
+ public:
+ GcMemoryPool() : pool_(arrow::default_memory_pool()) {}
+
+ arrow::Status Allocate(int64_t size, uint8_t** out) override {
+ return GcAndTryAgain([&] { return pool_->Allocate(size, out); });
+ }
+
+ arrow::Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override {
+ return GcAndTryAgain([&] { return pool_->Reallocate(old_size, new_size, ptr); });
+ }
+
+ void Free(uint8_t* buffer, int64_t size) override { pool_->Free(buffer, size); }
+
+ int64_t bytes_allocated() const override { return pool_->bytes_allocated(); }
+
+ int64_t max_memory() const override { return pool_->max_memory(); }
+
+ std::string backend_name() const override { return pool_->backend_name(); }
+
+ private:
+ template <typename Call>
+ arrow::Status GcAndTryAgain(const Call& call) {
+ if (call().ok()) {
+ return arrow::Status::OK();
+ } else {
+ auto lock = mutex_.Lock();
+
+ // ARROW-10080: Allocation may fail spuriously since the garbage collector is lazy.
+ // Force it to run then try again in case any reusable allocations have been freed.
+ static cpp11::function gc = cpp11::package("base")["gc"];
+ gc();
+ }
+ return call();
+ }
+
+ arrow::util::Mutex mutex_;
+ arrow::MemoryPool* pool_;
+};
+
+static GcMemoryPool g_pool;
+
+arrow::MemoryPool* gc_memory_pool() { return &g_pool; }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::MemoryPool> MemoryPool__default() {
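+ // g_pool is a static, so hand out a non-owning shared_ptr with a no-op deleter.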
+ return std::shared_ptr<arrow::MemoryPool>(&g_pool, [](...) {});
+}
+
+// [[arrow::export]]
+double MemoryPool__bytes_allocated(const std::shared_ptr<arrow::MemoryPool>& pool) {
+ return pool->bytes_allocated();
+}
+
+// [[arrow::export]]
+double MemoryPool__max_memory(const std::shared_ptr<arrow::MemoryPool>& pool) {
+ return pool->max_memory();
+}
+
+// [[arrow::export]]
+std::string MemoryPool__backend_name(const std::shared_ptr<arrow::MemoryPool>& pool) {
+ return pool->backend_name();
+}
+
+// [[arrow::export]]
+std::vector<std::string> supported_memory_backends() {
+ return arrow::SupportedMemoryBackendNames();
+}
+
+#endif
diff --git a/src/arrow/r/src/message.cpp b/src/arrow/r/src/message.cpp
new file mode 100644
index 000000000..f2524644a
--- /dev/null
+++ b/src/arrow/r/src/message.cpp
@@ -0,0 +1,105 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+#include <arrow/ipc/reader.h>
+#include <arrow/ipc/writer.h>
+
+// [[arrow::export]]
+int64_t ipc___Message__body_length(const std::unique_ptr<arrow::ipc::Message>& message) {
+ return message->body_length();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Buffer> ipc___Message__metadata(
+ const std::unique_ptr<arrow::ipc::Message>& message) {
+ return message->metadata();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Buffer> ipc___Message__body(
+ const std::unique_ptr<arrow::ipc::Message>& message) {
+ return message->body();
+}
+
+// [[arrow::export]]
+int64_t ipc___Message__Verify(const std::unique_ptr<arrow::ipc::Message>& message) {
+ return message->Verify();
+}
+
+// [[arrow::export]]
+arrow::ipc::MessageType ipc___Message__type(
+ const std::unique_ptr<arrow::ipc::Message>& message) {
+ return message->type();
+}
+
+// [[arrow::export]]
+bool ipc___Message__Equals(const std::unique_ptr<arrow::ipc::Message>& x,
+ const std::unique_ptr<arrow::ipc::Message>& y) {
+ return x->Equals(*y);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::RecordBatch> ipc___ReadRecordBatch__Message__Schema(
+ const std::unique_ptr<arrow::ipc::Message>& message,
+ const std::shared_ptr<arrow::Schema>& schema) {
+ // TODO: perhaps this should come from the R side
+ arrow::ipc::DictionaryMemo memo;
+ auto batch = ValueOrStop(arrow::ipc::ReadRecordBatch(
+ *message, schema, &memo, arrow::ipc::IpcReadOptions::Defaults()));
+ return batch;
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Schema> ipc___ReadSchema_InputStream(
+ const std::shared_ptr<arrow::io::InputStream>& stream) {
+ // TODO: promote to function argument
+ arrow::ipc::DictionaryMemo memo;
+ return ValueOrStop(arrow::ipc::ReadSchema(stream.get(), &memo));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Schema> ipc___ReadSchema_Message(
+ const std::unique_ptr<arrow::ipc::Message>& message) {
+ arrow::ipc::DictionaryMemo empty_memo;
+ return ValueOrStop(arrow::ipc::ReadSchema(*message, &empty_memo));
+}
+
+//--------- MessageReader
+
+// [[arrow::export]]
+std::shared_ptr<arrow::ipc::MessageReader> ipc___MessageReader__Open(
+ const std::shared_ptr<arrow::io::InputStream>& stream) {
+ return std::shared_ptr<arrow::ipc::MessageReader>(
+ arrow::ipc::MessageReader::Open(stream));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::ipc::Message> ipc___MessageReader__ReadNextMessage(
+ const std::unique_ptr<arrow::ipc::MessageReader>& reader) {
+ return ValueOrStop(reader->ReadNextMessage());
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::ipc::Message> ipc___ReadMessage(
+ const std::shared_ptr<arrow::io::InputStream>& stream) {
+ return ValueOrStop(arrow::ipc::ReadMessage(stream.get()));
+}
+
+#endif
diff --git a/src/arrow/r/src/nameof.h b/src/arrow/r/src/nameof.h
new file mode 100644
index 000000000..ae49880b8
--- /dev/null
+++ b/src/arrow/r/src/nameof.h
@@ -0,0 +1,93 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <string>
+
+namespace arrow {
+namespace util {
+namespace detail {
+
+#ifdef _MSC_VER
+#define ARROW_PRETTY_FUNCTION __FUNCSIG__
+#else
+#define ARROW_PRETTY_FUNCTION __PRETTY_FUNCTION__
+#endif
+
+template <typename T>
+const char* raw() {
+ return ARROW_PRETTY_FUNCTION;
+}
+
+template <typename T>
+size_t raw_sizeof() {
+ return sizeof(ARROW_PRETTY_FUNCTION);
+}
+
+#undef ARROW_PRETTY_FUNCTION
+
+constexpr bool starts_with(char const* haystack, char const* needle) {
+ return needle[0] == '\0' ||
+ (haystack[0] == needle[0] && starts_with(haystack + 1, needle + 1));
+}
+
+constexpr size_t search(char const* haystack, char const* needle) {
+ return haystack[0] == '\0' || starts_with(haystack, needle)
+ ? 0
+ : search(haystack + 1, needle) + 1;
+}
+
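+// Offset of the type name within ARROW_PRETTY_FUNCTION, computed by locating
+// the known substring "double" in raw<double>()'s signature string.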
+const size_t typename_prefix = search(raw<double>(), "double");
+
+template <typename T>
+size_t struct_class_prefix() {
+#ifdef _MSC_VER
+ return starts_with(raw<T>() + typename_prefix, "struct ")
+ ? 7
+ : starts_with(raw<T>() + typename_prefix, "class ") ? 6 : 0;
+#else
+ return 0;
+#endif
+}
+
+template <typename T>
+size_t typename_length() {
+ // raw_sizeof<T>() - raw_sizeof<double>() == (length of T's name) - strlen("double")
+ // (length of T's name) == raw_sizeof<T>() - raw_sizeof<double>() + strlen("double")
+ return raw_sizeof<T>() - struct_class_prefix<T>() - raw_sizeof<double>() + 6;
+}
+
+template <typename T>
+const char* typename_begin() {
+ return raw<T>() + struct_class_prefix<T>() + typename_prefix;
+}
+
+} // namespace detail
+
+template <typename T>
+std::string nameof(bool strip_namespace = false) {
+ std::string name{detail::typename_begin<T>(), detail::typename_length<T>()};
+ if (strip_namespace) {
+ auto i = name.find_last_of("::");
+ if (i != std::string::npos) {
+ name = name.substr(i + 1);
+ }
+ }
+ return name;
+}
+
+} // namespace util
+} // namespace arrow
diff --git a/src/arrow/r/src/parquet.cpp b/src/arrow/r/src/parquet.cpp
new file mode 100644
index 000000000..5de7ca8fa
--- /dev/null
+++ b/src/arrow/r/src/parquet.cpp
@@ -0,0 +1,326 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_PARQUET)
+
+#include <arrow/table.h>
+#include <parquet/arrow/reader.h>
+#include <parquet/arrow/writer.h>
+#include <parquet/exception.h>
+
+namespace parquet {
+
+class WriterPropertiesBuilder : public WriterProperties::Builder {
+ public:
+ using WriterProperties::Builder::Builder;
+};
+
+class ArrowWriterPropertiesBuilder : public ArrowWriterProperties::Builder {
+ public:
+ using ArrowWriterProperties::Builder::Builder;
+};
+
+} // namespace parquet
+
+// [[parquet::export]]
+std::shared_ptr<parquet::ArrowReaderProperties>
+parquet___arrow___ArrowReaderProperties__Make(bool use_threads) {
+ return std::make_shared<parquet::ArrowReaderProperties>(use_threads);
+}
+
+// [[parquet::export]]
+void parquet___arrow___ArrowReaderProperties__set_use_threads(
+ const std::shared_ptr<parquet::ArrowReaderProperties>& properties, bool use_threads) {
+ properties->set_use_threads(use_threads);
+}
+
+// [[parquet::export]]
+bool parquet___arrow___ArrowReaderProperties__get_use_threads(
+ const std::shared_ptr<parquet::ArrowReaderProperties>& properties, bool use_threads) {
+ return properties->use_threads();
+}
+
+// [[parquet::export]]
+bool parquet___arrow___ArrowReaderProperties__get_read_dictionary(
+ const std::shared_ptr<parquet::ArrowReaderProperties>& properties, int column_index) {
+ return properties->read_dictionary(column_index);
+}
+
+// [[parquet::export]]
+void parquet___arrow___ArrowReaderProperties__set_read_dictionary(
+ const std::shared_ptr<parquet::ArrowReaderProperties>& properties, int column_index,
+ bool read_dict) {
+ properties->set_read_dictionary(column_index, read_dict);
+}
+
+// [[parquet::export]]
+std::shared_ptr<parquet::arrow::FileReader> parquet___arrow___FileReader__OpenFile(
+ const std::shared_ptr<arrow::io::RandomAccessFile>& file,
+ const std::shared_ptr<parquet::ArrowReaderProperties>& props) {
+ std::unique_ptr<parquet::arrow::FileReader> reader;
+ parquet::arrow::FileReaderBuilder builder;
+ PARQUET_THROW_NOT_OK(builder.Open(file));
+ PARQUET_THROW_NOT_OK(
+ builder.memory_pool(gc_memory_pool())->properties(*props)->Build(&reader));
+ return std::move(reader);
+}
+
+// [[parquet::export]]
+std::shared_ptr<arrow::Table> parquet___arrow___FileReader__ReadTable1(
+ const std::shared_ptr<parquet::arrow::FileReader>& reader) {
+ std::shared_ptr<arrow::Table> table;
+ PARQUET_THROW_NOT_OK(reader->ReadTable(&table));
+ return table;
+}
+
+// [[parquet::export]]
+std::shared_ptr<arrow::Table> parquet___arrow___FileReader__ReadTable2(
+ const std::shared_ptr<parquet::arrow::FileReader>& reader,
+ const std::vector<int>& column_indices) {
+ std::shared_ptr<arrow::Table> table;
+ PARQUET_THROW_NOT_OK(reader->ReadTable(column_indices, &table));
+ return table;
+}
+
+// [[parquet::export]]
+std::shared_ptr<arrow::Table> parquet___arrow___FileReader__ReadRowGroup1(
+ const std::shared_ptr<parquet::arrow::FileReader>& reader, int i) {
+ std::shared_ptr<arrow::Table> table;
+ PARQUET_THROW_NOT_OK(reader->ReadRowGroup(i, &table));
+ return table;
+}
+
+// [[parquet::export]]
+std::shared_ptr<arrow::Table> parquet___arrow___FileReader__ReadRowGroup2(
+ const std::shared_ptr<parquet::arrow::FileReader>& reader, int i,
+ const std::vector<int>& column_indices) {
+ std::shared_ptr<arrow::Table> table;
+ PARQUET_THROW_NOT_OK(reader->ReadRowGroup(i, column_indices, &table));
+ return table;
+}
+
+// [[parquet::export]]
+std::shared_ptr<arrow::Table> parquet___arrow___FileReader__ReadRowGroups1(
+ const std::shared_ptr<parquet::arrow::FileReader>& reader,
+ const std::vector<int>& row_groups) {
+ std::shared_ptr<arrow::Table> table;
+ PARQUET_THROW_NOT_OK(reader->ReadRowGroups(row_groups, &table));
+ return table;
+}
+
+// [[parquet::export]]
+std::shared_ptr<arrow::Table> parquet___arrow___FileReader__ReadRowGroups2(
+ const std::shared_ptr<parquet::arrow::FileReader>& reader,
+ const std::vector<int>& row_groups, const std::vector<int>& column_indices) {
+ std::shared_ptr<arrow::Table> table;
+ PARQUET_THROW_NOT_OK(reader->ReadRowGroups(row_groups, column_indices, &table));
+ return table;
+}
+
+// [[parquet::export]]
+int64_t parquet___arrow___FileReader__num_rows(
+ const std::shared_ptr<parquet::arrow::FileReader>& reader) {
+ return reader->parquet_reader()->metadata()->num_rows();
+}
+
+// [[parquet::export]]
+int parquet___arrow___FileReader__num_columns(
+ const std::shared_ptr<parquet::arrow::FileReader>& reader) {
+ return reader->parquet_reader()->metadata()->num_columns();
+}
+
+// [[parquet::export]]
+int parquet___arrow___FileReader__num_row_groups(
+ const std::shared_ptr<parquet::arrow::FileReader>& reader) {
+ return reader->num_row_groups();
+}
+
+// [[parquet::export]]
+std::shared_ptr<arrow::ChunkedArray> parquet___arrow___FileReader__ReadColumn(
+ const std::shared_ptr<parquet::arrow::FileReader>& reader, int i) {
+ std::shared_ptr<arrow::ChunkedArray> array;
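+ // Convert R's 1-based column index to the reader's 0-based index.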
+ PARQUET_THROW_NOT_OK(reader->ReadColumn(i - 1, &array));
+ return array;
+}
+
+// [[parquet::export]]
+std::shared_ptr<parquet::ArrowWriterProperties> parquet___ArrowWriterProperties___create(
+ bool allow_truncated_timestamps, bool use_deprecated_int96_timestamps,
+ int timestamp_unit) {
+ auto builder = std::make_shared<parquet::ArrowWriterPropertiesBuilder>();
+ builder->store_schema();
+
+ if (allow_truncated_timestamps) {
+ builder->allow_truncated_timestamps();
+ }
+ if (use_deprecated_int96_timestamps) {
+ builder->enable_deprecated_int96_timestamps();
+ }
+ if (timestamp_unit > -1) {
+ // -1 is passed in for NULL/default
+ builder->coerce_timestamps(static_cast<arrow::TimeUnit::type>(timestamp_unit));
+ }
+
+ return builder->build();
+}
+
+// [[parquet::export]]
+std::shared_ptr<parquet::WriterPropertiesBuilder>
+parquet___WriterProperties___Builder__create() {
+ return std::make_shared<parquet::WriterPropertiesBuilder>();
+}
+
+// [[parquet::export]]
+void parquet___WriterProperties___Builder__version(
+ const std::shared_ptr<parquet::WriterPropertiesBuilder>& builder,
+ const parquet::ParquetVersion::type& version) {
+ builder->version(version);
+}
+
+// [[parquet::export]]
+void parquet___ArrowWriterProperties___Builder__set_compressions(
+ const std::shared_ptr<parquet::WriterPropertiesBuilder>& builder,
+ const std::vector<std::string>& paths, cpp11::integers types) {
+ auto n = types.size();
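+ // A single value sets the default compression for all columns;
+ // otherwise compression is set per column path.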
+ if (n == 1) {
+ builder->compression(static_cast<arrow::Compression::type>(types[0]));
+ } else {
+ for (decltype(n) i = 0; i < n; i++) {
+ builder->compression(paths[i], static_cast<arrow::Compression::type>(types[i]));
+ }
+ }
+}
+
+// [[parquet::export]]
+void parquet___ArrowWriterProperties___Builder__set_compression_levels(
+ const std::shared_ptr<parquet::WriterPropertiesBuilder>& builder,
+ const std::vector<std::string>& paths, cpp11::integers levels) {
+ auto n = levels.size();
+ if (n == 1) {
+ builder->compression_level(levels[0]);
+ } else {
+ for (decltype(n) i = 0; i < n; i++) {
+ builder->compression_level(paths[i], levels[i]);
+ }
+ }
+}
+
+// [[parquet::export]]
+void parquet___ArrowWriterProperties___Builder__set_use_dictionary(
+ const std::shared_ptr<parquet::WriterPropertiesBuilder>& builder,
+ const std::vector<std::string>& paths, cpp11::logicals use_dictionary) {
+ auto n = use_dictionary.size();
+ if (n == 1) {
+ if (use_dictionary[0] == TRUE) {
+ builder->enable_dictionary();
+ } else {
+ builder->disable_dictionary();
+ }
+ } else {
+ builder->disable_dictionary();
+ for (decltype(n) i = 0; i < n; i++) {
+ if (use_dictionary[i] == TRUE) {
+ builder->enable_dictionary(paths[i]);
+ } else {
+ builder->disable_dictionary(paths[i]);
+ }
+ }
+ }
+}
+
+// [[parquet::export]]
+void parquet___ArrowWriterProperties___Builder__set_write_statistics(
+ const std::shared_ptr<parquet::WriterPropertiesBuilder>& builder,
+ const std::vector<std::string>& paths, cpp11::logicals write_statistics) {
+ auto n = write_statistics.size();
+ if (n == 1) {
+ if (write_statistics[0] == TRUE) {
+ builder->enable_statistics();
+ } else {
+ builder->disable_statistics();
+ }
+ } else {
+ builder->disable_statistics();
+ for (decltype(n) i = 0; i < n; i++) {
+ if (write_statistics[i] == TRUE) {
+ builder->enable_statistics(paths[i]);
+ } else {
+ builder->disable_statistics(paths[i]);
+ }
+ }
+ }
+}
+
+// [[parquet::export]]
+void parquet___ArrowWriterProperties___Builder__data_page_size(
+ const std::shared_ptr<parquet::WriterPropertiesBuilder>& builder,
+ int64_t data_page_size) {
+ builder->data_pagesize(data_page_size);
+}
+
+// [[parquet::export]]
+std::shared_ptr<parquet::WriterProperties> parquet___WriterProperties___Builder__build(
+ const std::shared_ptr<parquet::WriterPropertiesBuilder>& builder) {
+ return builder->build();
+}
+
+// [[parquet::export]]
+std::shared_ptr<parquet::arrow::FileWriter> parquet___arrow___ParquetFileWriter__Open(
+ const std::shared_ptr<arrow::Schema>& schema,
+ const std::shared_ptr<arrow::io::OutputStream>& sink,
+ const std::shared_ptr<parquet::WriterProperties>& properties,
+ const std::shared_ptr<parquet::ArrowWriterProperties>& arrow_properties) {
+ std::unique_ptr<parquet::arrow::FileWriter> writer;
+ PARQUET_THROW_NOT_OK(parquet::arrow::FileWriter::Open(
+ *schema, gc_memory_pool(), sink, properties, arrow_properties, &writer));
+ return std::move(writer);
+}
+
+// [[parquet::export]]
+void parquet___arrow___FileWriter__WriteTable(
+ const std::shared_ptr<parquet::arrow::FileWriter>& writer,
+ const std::shared_ptr<arrow::Table>& table, int64_t chunk_size) {
+ PARQUET_THROW_NOT_OK(writer->WriteTable(*table, chunk_size));
+}
+
+// [[parquet::export]]
+void parquet___arrow___FileWriter__Close(
+ const std::shared_ptr<parquet::arrow::FileWriter>& writer) {
+ PARQUET_THROW_NOT_OK(writer->Close());
+}
+
+// [[parquet::export]]
+void parquet___arrow___WriteTable(
+ const std::shared_ptr<arrow::Table>& table,
+ const std::shared_ptr<arrow::io::OutputStream>& sink,
+ const std::shared_ptr<parquet::WriterProperties>& properties,
+ const std::shared_ptr<parquet::ArrowWriterProperties>& arrow_properties) {
+ PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(
+ *table, gc_memory_pool(), sink, table->num_rows(), properties, arrow_properties));
+}
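+
+// Illustrative sketch (not exported; assumes a `table` and an output `sink`
+// exist): the plain C++ equivalent of the write path above, building the
+// properties inline instead of through the R builder bindings:
+//
+//   auto props = parquet::WriterProperties::Builder()
+//                    .compression(arrow::Compression::SNAPPY)
+//                    ->data_pagesize(1 << 20)
+//                    ->build();
+//   PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(
+//       *table, arrow::default_memory_pool(), sink, table->num_rows(), props));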
+
+// [[parquet::export]]
+std::shared_ptr<arrow::Schema> parquet___arrow___FileReader__GetSchema(
+ const std::shared_ptr<parquet::arrow::FileReader>& reader) {
+ std::shared_ptr<arrow::Schema> schema;
+ StopIfNotOk(reader->GetSchema(&schema));
+ return schema;
+}
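+
+// Illustrative sketch (not exported; the file path is hypothetical): how the
+// reader bindings above compose in plain C++:
+//
+//   auto file = ValueOrStop(arrow::io::ReadableFile::Open("data.parquet"));
+//   std::unique_ptr<parquet::arrow::FileReader> reader;
+//   PARQUET_THROW_NOT_OK(
+//       parquet::arrow::OpenFile(file, arrow::default_memory_pool(), &reader));
+//   std::shared_ptr<arrow::Table> table;
+//   // row group 0, restricted to columns 0 and 2
+//   PARQUET_THROW_NOT_OK(reader->ReadRowGroup(0, {0, 2}, &table));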
+
+#endif
diff --git a/src/arrow/r/src/py-to-r.cpp b/src/arrow/r/src/py-to-r.cpp
new file mode 100644
index 000000000..80cd65c51
--- /dev/null
+++ b/src/arrow/r/src/py-to-r.cpp
@@ -0,0 +1,117 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+
+#include <arrow/c/bridge.h>
+
+// [[arrow::export]]
+arrow::r::Pointer<struct ArrowSchema> allocate_arrow_schema() { return {}; }
+
+// [[arrow::export]]
+void delete_arrow_schema(arrow::r::Pointer<struct ArrowSchema> ptr) { ptr.finalize(); }
+
+// [[arrow::export]]
+arrow::r::Pointer<struct ArrowArray> allocate_arrow_array() { return {}; }
+
+// [[arrow::export]]
+void delete_arrow_array(arrow::r::Pointer<struct ArrowArray> ptr) { ptr.finalize(); }
+
+// [[arrow::export]]
+arrow::r::Pointer<struct ArrowArrayStream> allocate_arrow_array_stream() { return {}; }
+
+// [[arrow::export]]
+void delete_arrow_array_stream(arrow::r::Pointer<struct ArrowArrayStream> ptr) {
+ ptr.finalize();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Array> ImportArray(arrow::r::Pointer<struct ArrowArray> array,
+ arrow::r::Pointer<struct ArrowSchema> schema) {
+ return ValueOrStop(arrow::ImportArray(array, schema));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::RecordBatch> ImportRecordBatch(
+ arrow::r::Pointer<struct ArrowArray> array,
+ arrow::r::Pointer<struct ArrowSchema> schema) {
+ return ValueOrStop(arrow::ImportRecordBatch(array, schema));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Schema> ImportSchema(
+ arrow::r::Pointer<struct ArrowSchema> schema) {
+ return ValueOrStop(arrow::ImportSchema(schema));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Field> ImportField(arrow::r::Pointer<struct ArrowSchema> field) {
+ return ValueOrStop(arrow::ImportField(field));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> ImportType(arrow::r::Pointer<struct ArrowSchema> type) {
+ return ValueOrStop(arrow::ImportType(type));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::RecordBatchReader> ImportRecordBatchReader(
+ arrow::r::Pointer<struct ArrowArrayStream> stream) {
+ return ValueOrStop(arrow::ImportRecordBatchReader(stream));
+}
+
+// [[arrow::export]]
+void ExportType(const std::shared_ptr<arrow::DataType>& type,
+ arrow::r::Pointer<struct ArrowSchema> ptr) {
+ StopIfNotOk(arrow::ExportType(*type, ptr));
+}
+
+// [[arrow::export]]
+void ExportField(const std::shared_ptr<arrow::Field>& field,
+ arrow::r::Pointer<struct ArrowSchema> ptr) {
+ StopIfNotOk(arrow::ExportField(*field, ptr));
+}
+
+// [[arrow::export]]
+void ExportSchema(const std::shared_ptr<arrow::Schema>& schema,
+ arrow::r::Pointer<struct ArrowSchema> ptr) {
+ StopIfNotOk(arrow::ExportSchema(*schema, ptr));
+}
+
+// [[arrow::export]]
+void ExportArray(const std::shared_ptr<arrow::Array>& array,
+ arrow::r::Pointer<struct ArrowArray> array_ptr,
+ arrow::r::Pointer<struct ArrowSchema> schema_ptr) {
+ StopIfNotOk(arrow::ExportArray(*array, array_ptr, schema_ptr));
+}
+
+// [[arrow::export]]
+void ExportRecordBatch(const std::shared_ptr<arrow::RecordBatch>& batch,
+ arrow::r::Pointer<struct ArrowArray> array_ptr,
+ arrow::r::Pointer<struct ArrowSchema> schema_ptr) {
+ StopIfNotOk(arrow::ExportRecordBatch(*batch, array_ptr, schema_ptr));
+}
+
+// [[arrow::export]]
+void ExportRecordBatchReader(const std::shared_ptr<arrow::RecordBatchReader>& reader,
+ arrow::r::Pointer<struct ArrowArrayStream> stream_ptr) {
+ StopIfNotOk(arrow::ExportRecordBatchReader(reader, stream_ptr));
+}
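+
+// Illustrative sketch (not exported): a same-process round trip through the
+// Arrow C data interface, which is what these wrappers enable across the
+// R/Python boundary. Assumes `array` is a std::shared_ptr<arrow::Array>.
+//
+//   struct ArrowArray c_array;
+//   struct ArrowSchema c_schema;
+//   StopIfNotOk(arrow::ExportArray(*array, &c_array, &c_schema));
+//   // the consumer imports and takes ownership of the C structures
+//   std::shared_ptr<arrow::Array> roundtripped =
+//       ValueOrStop(arrow::ImportArray(&c_array, &c_schema));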
+
+#endif
diff --git a/src/arrow/r/src/r_task_group.h b/src/arrow/r/src/r_task_group.h
new file mode 100644
index 000000000..723251cd9
--- /dev/null
+++ b/src/arrow/r/src/r_task_group.h
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#if defined(ARROW_R_WITH_ARROW)
+
+#include <arrow/util/parallel.h>
+#include <arrow/util/task_group.h>
+
+namespace arrow {
+namespace r {
+
+class RTasks {
+ public:
+ using Task = internal::FnOnce<Status()>;
+
+ explicit RTasks(bool use_threads);
+
+ // This Finish() method must never be called from a thread pool thread
+ // as this would deadlock.
+ //
+  // Usage:
+  // - create an RTasks instance on the main thread
+  // - add tasks with .Append()
+  // - then call .Finish() to wait until all the parallel tasks have finished
+ Status Finish();
+ void Append(bool parallel, Task&& task);
+
+ void Reset();
+
+ bool use_threads_;
+ StopSource stop_source_;
+ std::shared_ptr<arrow::internal::TaskGroup> parallel_tasks_;
+ std::vector<Task> delayed_serial_tasks_;
+};
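+
+// Illustrative usage, following the contract above (the task body is
+// hypothetical):
+//
+//   RTasks tasks(/*use_threads=*/true);
+//   tasks.Append(/*parallel=*/true, []() { return arrow::Status::OK(); });
+//   arrow::Status st = tasks.Finish();  // main thread only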
+
+} // namespace r
+} // namespace arrow
+
+#endif
diff --git a/src/arrow/r/src/r_to_arrow.cpp b/src/arrow/r/src/r_to_arrow.cpp
new file mode 100644
index 000000000..d3d3ac69f
--- /dev/null
+++ b/src/arrow/r/src/r_to_arrow.cpp
@@ -0,0 +1,1439 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+#include "./arrow_vctrs.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+#include <arrow/array/builder_base.h>
+#include <arrow/array/builder_binary.h>
+#include <arrow/array/builder_decimal.h>
+#include <arrow/array/builder_dict.h>
+#include <arrow/array/builder_nested.h>
+#include <arrow/array/builder_primitive.h>
+#include <arrow/table.h>
+#include <arrow/type_traits.h>
+#include <arrow/util/bitmap_writer.h>
+#include <arrow/util/checked_cast.h>
+#include <arrow/util/converter.h>
+#include <arrow/util/logging.h>
+
+#include "./r_task_group.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+using internal::Converter;
+using internal::DictionaryConverter;
+using internal::ListConverter;
+using internal::PrimitiveConverter;
+using internal::StructConverter;
+
+using internal::MakeChunker;
+using internal::MakeConverter;
+
+namespace r {
+
+struct RConversionOptions {
+ RConversionOptions() = default;
+
+ std::shared_ptr<arrow::DataType> type;
+ bool strict;
+ int64_t size;
+};
+
+enum RVectorType {
+ BOOLEAN,
+ UINT8,
+ INT32,
+ FLOAT64,
+ INT64,
+ COMPLEX,
+ STRING,
+ DATAFRAME,
+ DATE_INT,
+ DATE_DBL,
+ TIME,
+ POSIXCT,
+ POSIXLT,
+ BINARY,
+ LIST,
+ FACTOR,
+
+ OTHER
+};
+
+// Flattens the logical type of an R object, because TYPEOF() alone is not
+// detailed enough (factors, Dates, etc. share SEXP types). We cannot use
+// Arrow types here either, as there is no 1-1 mapping between the two.
+RVectorType GetVectorType(SEXP x) {
+ switch (TYPEOF(x)) {
+ case LGLSXP:
+ return BOOLEAN;
+ case RAWSXP:
+ return UINT8;
+ case INTSXP:
+ if (Rf_inherits(x, "factor")) {
+ return FACTOR;
+ } else if (Rf_inherits(x, "Date")) {
+ return DATE_INT;
+ }
+ return INT32;
+ case STRSXP:
+ return STRING;
+ case CPLXSXP:
+ return COMPLEX;
+ case REALSXP: {
+ if (Rf_inherits(x, "Date")) {
+ return DATE_DBL;
+ } else if (Rf_inherits(x, "integer64")) {
+ return INT64;
+ } else if (Rf_inherits(x, "POSIXct")) {
+ return POSIXCT;
+ } else if (Rf_inherits(x, "difftime")) {
+ return TIME;
+ } else {
+ return FLOAT64;
+ }
+ }
+ case VECSXP: {
+ if (Rf_inherits(x, "data.frame")) {
+ return DATAFRAME;
+ }
+
+ if (Rf_inherits(x, "POSIXlt")) {
+ return POSIXLT;
+ }
+
+ if (Rf_inherits(x, "arrow_binary")) {
+ return BINARY;
+ }
+
+ return LIST;
+ }
+ default:
+ break;
+ }
+ return OTHER;
+}
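+
+// For example (illustrative): a plain integer vector maps to INT32, but an
+// INTSXP inheriting from "factor" maps to FACTOR, and a Date stored as a
+// double maps to DATE_DBL rather than FLOAT64.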
+
+template <typename T>
+bool is_NA(T value);
+
+template <>
+bool is_NA<int>(int value) {
+ return value == NA_INTEGER;
+}
+
+template <>
+bool is_NA<double>(double value) {
+ return ISNA(value);
+}
+
+template <>
+bool is_NA<uint8_t>(uint8_t value) {
+ return false;
+}
+
+template <>
+bool is_NA<cpp11::r_bool>(cpp11::r_bool value) {
+ return value == NA_LOGICAL;
+}
+
+template <>
+bool is_NA<cpp11::r_string>(cpp11::r_string value) {
+ return value == NA_STRING;
+}
+
+template <>
+bool is_NA<SEXP>(SEXP value) {
+ return Rf_isNull(value);
+}
+
+template <>
+bool is_NA<int64_t>(int64_t value) {
+ return value == NA_INT64;
+}
+
+template <typename T>
+class RVectorIterator {
+ public:
+ using value_type = T;
+ RVectorIterator(SEXP x, int64_t start)
+ : ptr_x_(reinterpret_cast<const T*>(DATAPTR_RO(x)) + start) {}
+
+ RVectorIterator& operator++() {
+ ++ptr_x_;
+ return *this;
+ }
+
+ const T operator*() const { return *ptr_x_; }
+
+ private:
+ const T* ptr_x_;
+};
+
+template <typename T>
+class RVectorIterator_ALTREP {
+ public:
+ using value_type = T;
+ using data_type =
+ typename std::conditional<std::is_same<T, int64_t>::value, double, T>::type;
+ using r_vector_type = cpp11::r_vector<data_type>;
+ using r_vector_iterator = typename r_vector_type::const_iterator;
+
+ RVectorIterator_ALTREP(SEXP x, int64_t start)
+ : vector_(x), it_(vector_.begin() + start) {}
+
+ RVectorIterator_ALTREP& operator++() {
+ ++it_;
+ return *this;
+ }
+
+ const T operator*() const { return GetValue(*it_); }
+
+ static T GetValue(data_type x) { return x; }
+
+ private:
+ r_vector_type vector_;
+ r_vector_iterator it_;
+};
+
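+// bit64::integer64 vectors store the int64 bit pattern in the payload of an
+// R double, so the specialization below copies the 8 raw bytes rather than
+// casting the numeric value.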
+template <>
+int64_t RVectorIterator_ALTREP<int64_t>::GetValue(double x) {
+ int64_t value;
+ memcpy(&value, &x, sizeof(int64_t));
+ return value;
+}
+
+template <typename Iterator, typename AppendNull, typename AppendValue>
+Status VisitVector(Iterator it, int64_t n, AppendNull&& append_null,
+ AppendValue&& append_value) {
+ for (R_xlen_t i = 0; i < n; i++, ++it) {
+ auto value = *it;
+
+ if (is_NA<typename Iterator::value_type>(value)) {
+ RETURN_NOT_OK(append_null());
+ } else {
+ RETURN_NOT_OK(append_value(value));
+ }
+ }
+
+ return Status::OK();
+}
+
+class RConverter : public Converter<SEXP, RConversionOptions> {
+ public:
+ virtual Status Append(SEXP) { return Status::NotImplemented("Append"); }
+
+ virtual Status Extend(SEXP values, int64_t size, int64_t offset = 0) {
+ return Status::NotImplemented("Extend");
+ }
+
+  // By default, just delay the ->Extend(), i.e. do not run it in parallel.
+  // Implementations may override this so that ->Extend() runs in parallel.
+ virtual void DelayedExtend(SEXP values, int64_t size, RTasks& tasks) {
+ auto task = [this, values, size]() { return this->Extend(values, size); };
+ tasks.Append(false, task);
+ }
+
+ virtual Status ExtendMasked(SEXP values, SEXP mask, int64_t size, int64_t offset = 0) {
+ return Status::NotImplemented("ExtendMasked");
+ }
+};
+
+template <typename T, typename Enable = void>
+class RPrimitiveConverter;
+
+template <typename T>
+Result<T> CIntFromRScalarImpl(int64_t value) {
+ if (value < std::numeric_limits<T>::min() || value > std::numeric_limits<T>::max()) {
+ return Status::Invalid("value outside of range");
+ }
+ return static_cast<T>(value);
+}
+
+template <>
+Result<uint64_t> CIntFromRScalarImpl<uint64_t>(int64_t value) {
+ if (value < 0) {
+ return Status::Invalid("value outside of range");
+ }
+ return static_cast<uint64_t>(value);
+}
+
+// utility to convert single R values from (int, raw, double and int64)
+// vectors to Arrow integers and floating point values
+struct RConvert {
+ // ---- convert to an arrow integer
+ template <typename Type, typename From>
+ static enable_if_integer<Type, Result<typename Type::c_type>> Convert(Type*,
+ From from) {
+ return CIntFromRScalarImpl<typename Type::c_type>(from);
+ }
+
+ // ---- convert R integer types to double
+ template <typename Type, typename From>
+ static enable_if_t<std::is_same<Type, const DoubleType>::value &&
+ !std::is_same<From, double>::value,
+ Result<typename Type::c_type>>
+ Convert(Type*, From from) {
+ constexpr int64_t kDoubleMax = 1LL << 53;
+ constexpr int64_t kDoubleMin = -(1LL << 53);
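+    // 2^53 is the largest magnitude below which every integer is exactly
+    // representable by a double (53-bit significand); e.g. 2^53 + 1 already
+    // rounds back to 2^53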
+
+ if (from < kDoubleMin || from > kDoubleMax) {
+ return Status::Invalid("Integer value ", from, " is outside of the range exactly",
+ " representable by a IEEE 754 double precision value");
+ }
+ return static_cast<double>(from);
+ }
+
+ // ---- convert double to double
+ template <typename Type, typename From>
+ static enable_if_t<std::is_same<Type, const DoubleType>::value &&
+ std::is_same<From, double>::value,
+ Result<typename Type::c_type>>
+ Convert(Type*, From from) {
+ return from;
+ }
+
+ // ---- convert R integer types to float
+ template <typename Type, typename From>
+ static enable_if_t<std::is_same<Type, const FloatType>::value &&
+ !std::is_same<From, double>::value,
+ Result<typename Type::c_type>>
+ Convert(Type*, From from) {
+ constexpr int64_t kFloatMax = 1LL << 24;
+ constexpr int64_t kFloatMin = -(1LL << 24);
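+    // likewise, 2^24 bounds the integers exactly representable by a float
+    // (24-bit significand)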
+
+ if (from < kFloatMin || from > kFloatMax) {
+ return Status::Invalid("Integer value ", from, " is outside of the range exactly",
+ " representable by a IEEE 754 single precision value");
+ }
+ return static_cast<float>(from);
+ }
+
+ // ---- convert double to float
+ template <typename Type, typename From>
+ static enable_if_t<std::is_same<Type, const FloatType>::value &&
+ std::is_same<From, double>::value,
+ Result<typename Type::c_type>>
+ Convert(Type*, From from) {
+ return static_cast<float>(from);
+ }
+
+ // ---- convert to half float: not implemented
+ template <typename Type, typename From>
+ static enable_if_t<std::is_same<Type, const HalfFloatType>::value,
+ Result<typename Type::c_type>>
+ Convert(Type*, From from) {
+ return Status::Invalid("Cannot convert to Half Float");
+ }
+};
+
+template <typename T>
+class RPrimitiveConverter<T, enable_if_null<T>>
+ : public PrimitiveConverter<T, RConverter> {
+ public:
+ Status Extend(SEXP, int64_t size, int64_t offset = 0) override {
+ return this->primitive_builder_->AppendNulls(size - offset);
+ }
+};
+
+// TODO: extend this to BooleanType, but this needs some work in RConvert
+template <typename T>
+class RPrimitiveConverter<
+ T, enable_if_t<is_integer_type<T>::value || is_floating_type<T>::value>>
+ : public PrimitiveConverter<T, RConverter> {
+ public:
+ Status Extend(SEXP x, int64_t size, int64_t offset = 0) override {
+ auto rtype = GetVectorType(x);
+ switch (rtype) {
+ case UINT8:
+ return ExtendDispatch<unsigned char>(x, size, offset);
+ case INT32:
+ return ExtendDispatch<int>(x, size, offset);
+ case FLOAT64:
+ return ExtendDispatch<double>(x, size, offset);
+ case INT64:
+ return ExtendDispatch<int64_t>(x, size, offset);
+
+ default:
+ break;
+ }
+ // TODO: mention T in the error
+ return Status::Invalid("cannot convert");
+ }
+
+ void DelayedExtend(SEXP values, int64_t size, RTasks& tasks) override {
+ auto task = [this, values, size]() { return this->Extend(values, size); };
+ tasks.Append(!ALTREP(values), std::move(task));
+ }
+
+ private:
+ template <typename r_value_type>
+ Status ExtendDispatch(SEXP x, int64_t size, int64_t offset) {
+ if (ALTREP(x)) {
+ // `x` is an ALTREP R vector storing `r_value_type`
+ // and that type matches exactly the type of the array this is building
+ return Extend_impl(RVectorIterator_ALTREP<r_value_type>(x, offset), size);
+ } else {
+ // `x` is not an ALTREP vector so we have direct access to a range of values
+ return Extend_impl(RVectorIterator<r_value_type>(x, offset), size);
+ }
+ }
+
+ template <typename Iterator>
+ Status Extend_impl(Iterator it, int64_t size) {
+ using r_value_type = typename Iterator::value_type;
+ RETURN_NOT_OK(this->primitive_builder_->Reserve(size));
+
+ auto append_null = [this]() {
+ this->primitive_builder_->UnsafeAppendNull();
+ return Status::OK();
+ };
+
+ if (std::is_same<typename T::c_type, r_value_type>::value) {
+ auto append_value = [this](r_value_type value) {
+ this->primitive_builder_->UnsafeAppend(value);
+ return Status::OK();
+ };
+ return VisitVector(it, size, append_null, append_value);
+ } else {
+ auto append_value = [this](r_value_type value) {
+ ARROW_ASSIGN_OR_RAISE(auto converted,
+ RConvert::Convert(this->primitive_type_, value));
+ this->primitive_builder_->UnsafeAppend(converted);
+ return Status::OK();
+ };
+ return VisitVector(it, size, append_null, append_value);
+ }
+ }
+};
+
+template <typename T>
+class RPrimitiveConverter<T, enable_if_t<is_boolean_type<T>::value>>
+ : public PrimitiveConverter<T, RConverter> {
+ public:
+ Status Extend(SEXP x, int64_t size, int64_t offset = 0) override {
+ auto rtype = GetVectorType(x);
+ if (rtype != BOOLEAN) {
+ return Status::Invalid("Expecting a logical vector");
+ }
+
+ if (ALTREP(x)) {
+ return Extend_impl(RVectorIterator_ALTREP<cpp11::r_bool>(x, offset), size);
+ } else {
+ return Extend_impl(RVectorIterator<cpp11::r_bool>(x, offset), size);
+ }
+ }
+
+ void DelayedExtend(SEXP values, int64_t size, RTasks& tasks) override {
+ auto task = [this, values, size]() { return this->Extend(values, size); };
+ tasks.Append(!ALTREP(values), std::move(task));
+ }
+
+ private:
+ template <typename Iterator>
+ Status Extend_impl(Iterator it, int64_t size) {
+ RETURN_NOT_OK(this->Reserve(size));
+
+ auto append_null = [this]() {
+ this->primitive_builder_->UnsafeAppendNull();
+ return Status::OK();
+ };
+ auto append_value = [this](cpp11::r_bool value) {
+ this->primitive_builder_->UnsafeAppend(value == 1);
+ return Status::OK();
+ };
+ return VisitVector(it, size, append_null, append_value);
+ }
+};
+
+template <typename T>
+class RPrimitiveConverter<T, enable_if_t<is_date_type<T>::value>>
+ : public PrimitiveConverter<T, RConverter> {
+ public:
+ Status Extend(SEXP x, int64_t size, int64_t offset = 0) override {
+ switch (GetVectorType(x)) {
+ case DATE_INT:
+ return AppendRange_Date_dispatch<int>(x, size, offset);
+
+ case DATE_DBL:
+ return AppendRange_Date_dispatch<double>(x, size, offset);
+
+ case POSIXCT:
+ return AppendRange_Posixct_dispatch(x, size, offset);
+
+ default:
+ break;
+ }
+
+ return Status::Invalid("cannot convert to date type ");
+ }
+
+ void DelayedExtend(SEXP values, int64_t size, RTasks& tasks) override {
+ auto task = [this, values, size]() { return this->Extend(values, size); };
+ tasks.Append(!ALTREP(values), std::move(task));
+ }
+
+ private:
+ template <typename r_value_type>
+ Status AppendRange_Date_dispatch(SEXP x, int64_t size, int64_t offset) {
+ if (ALTREP(x)) {
+ return AppendRange_Date(RVectorIterator_ALTREP<r_value_type>(x, offset),
+ size - offset);
+ } else {
+ return AppendRange_Date(RVectorIterator<r_value_type>(x, offset), size - offset);
+ }
+ }
+
+ template <typename Iterator>
+ Status AppendRange_Date(Iterator it, int64_t size) {
+ using r_value_type = typename Iterator::value_type;
+ RETURN_NOT_OK(this->Reserve(size));
+
+ auto append_null = [this]() {
+ this->primitive_builder_->UnsafeAppendNull();
+ return Status::OK();
+ };
+ auto append_value = [this](r_value_type value) {
+ this->primitive_builder_->UnsafeAppend(FromRDate(this->primitive_type_, value));
+ return Status::OK();
+ };
+ return VisitVector(it, size, append_null, append_value);
+ }
+
+ Status AppendRange_Posixct_dispatch(SEXP x, int64_t size, int64_t offset) {
+ if (ALTREP(x)) {
+ return AppendRange_Posixct(RVectorIterator_ALTREP<double>(x, offset),
+ size - offset);
+ } else {
+ return AppendRange_Posixct(RVectorIterator<double>(x, offset), size - offset);
+ }
+ }
+
+ template <typename Iterator>
+ Status AppendRange_Posixct(Iterator it, int64_t size) {
+ using r_value_type = typename Iterator::value_type;
+ RETURN_NOT_OK(this->Reserve(size));
+
+ auto append_null = [this]() {
+ this->primitive_builder_->UnsafeAppendNull();
+ return Status::OK();
+ };
+ auto append_value = [this](r_value_type value) {
+ this->primitive_builder_->UnsafeAppend(FromPosixct(this->primitive_type_, value));
+ return Status::OK();
+ };
+ return VisitVector(it, size, append_null, append_value);
+ }
+
+ static int FromRDate(const Date32Type*, int from) { return from; }
+
+ static int64_t FromRDate(const Date64Type*, int from) {
+ constexpr int64_t kMilliSecondsPerDay = 86400000;
+ return from * kMilliSecondsPerDay;
+ }
+
+ static int FromPosixct(const Date32Type*, double from) {
+ constexpr int64_t kSecondsPerDay = 86400;
+ return from / kSecondsPerDay;
+ }
+
+ static int64_t FromPosixct(const Date64Type*, double from) { return from * 1000; }
+};
+
+int64_t get_TimeUnit_multiplier(TimeUnit::type unit) {
+ switch (unit) {
+ case TimeUnit::SECOND:
+ return 1;
+ case TimeUnit::MILLI:
+ return 1000;
+ case TimeUnit::MICRO:
+ return 1000000;
+ case TimeUnit::NANO:
+ return 1000000000;
+ default:
+ return 0;
+ }
+}
+
+template <typename T>
+class RPrimitiveConverter<T, enable_if_t<is_time_type<T>::value>>
+ : public PrimitiveConverter<T, RConverter> {
+ public:
+ Status Extend(SEXP x, int64_t size, int64_t offset = 0) override {
+ RETURN_NOT_OK(this->Reserve(size - offset));
+ auto rtype = GetVectorType(x);
+ if (rtype != TIME) {
+ return Status::Invalid("Invalid conversion to time");
+ }
+
+ // multiplier to get the number of seconds from the value stored in the R vector
+ int difftime_multiplier;
+ std::string unit(CHAR(STRING_ELT(Rf_getAttrib(x, symbols::units), 0)));
+ if (unit == "secs") {
+ difftime_multiplier = 1;
+ } else if (unit == "mins") {
+ difftime_multiplier = 60;
+ } else if (unit == "hours") {
+ difftime_multiplier = 3600;
+ } else if (unit == "days") {
+ difftime_multiplier = 86400;
+ } else if (unit == "weeks") {
+ difftime_multiplier = 604800;
+ } else {
+ return Status::Invalid("unknown difftime unit");
+ }
+
+ // then multiply the seconds by this to match the time unit
+ auto multiplier =
+ get_TimeUnit_multiplier(this->primitive_type_->unit()) * difftime_multiplier;
+
+ auto append_null = [this]() {
+ this->primitive_builder_->UnsafeAppendNull();
+ return Status::OK();
+ };
+ auto append_value = [this, multiplier](double value) {
+ auto converted = static_cast<typename T::c_type>(value * multiplier);
+ this->primitive_builder_->UnsafeAppend(converted);
+ return Status::OK();
+ };
+
+ if (ALTREP(x)) {
+ return VisitVector(RVectorIterator_ALTREP<double>(x, offset), size, append_null,
+ append_value);
+ } else {
+ return VisitVector(RVectorIterator<double>(x, offset), size, append_null,
+ append_value);
+ }
+ }
+
+ void DelayedExtend(SEXP values, int64_t size, RTasks& tasks) override {
+ auto task = [this, values, size]() { return this->Extend(values, size); };
+ tasks.Append(!ALTREP(values), std::move(task));
+ }
+};
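+
+// Worked example (illustrative): a difftime of 2.5 with units "mins" written
+// to a time32[s] column is appended as 2.5 * 60 * 1 = 150 seconds; written
+// to a time64[us] column it becomes 2.5 * 60 * 1000000 = 150000000.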
+
+template <typename T>
+class RPrimitiveConverter<T, enable_if_t<is_timestamp_type<T>::value>>
+ : public PrimitiveConverter<T, RConverter> {
+ public:
+ Status Extend(SEXP x, int64_t size, int64_t offset = 0) override {
+ RETURN_NOT_OK(this->Reserve(size - offset));
+
+ RVectorType rtype = GetVectorType(x);
+ if (rtype != POSIXCT) {
+ return Status::Invalid("Invalid conversion to timestamp");
+ }
+
+ int64_t multiplier = get_TimeUnit_multiplier(this->primitive_type_->unit());
+
+ auto append_value = [this, multiplier](double value) {
+ auto converted = static_cast<typename T::c_type>(value * multiplier);
+ this->primitive_builder_->UnsafeAppend(converted);
+ return Status::OK();
+ };
+ auto append_null = [this]() {
+ this->primitive_builder_->UnsafeAppendNull();
+ return Status::OK();
+ };
+
+ if (ALTREP(x)) {
+ return VisitVector(RVectorIterator_ALTREP<double>(x, offset), size, append_null,
+ append_value);
+ } else {
+ return VisitVector(RVectorIterator<double>(x, offset), size, append_null,
+ append_value);
+ }
+ }
+
+ void DelayedExtend(SEXP values, int64_t size, RTasks& tasks) override {
+ auto task = [this, values, size]() { return this->Extend(values, size); };
+ tasks.Append(!ALTREP(values), std::move(task));
+ }
+};
+
+template <typename T>
+class RPrimitiveConverter<T, enable_if_t<is_decimal_type<T>::value>>
+ : public PrimitiveConverter<T, RConverter> {
+ public:
+ Status Extend(SEXP x, int64_t size, int64_t offset = 0) override {
+ return Status::NotImplemented("Extend");
+ }
+};
+
+Status check_binary(SEXP x, int64_t size) {
+ RVectorType rtype = GetVectorType(x);
+ switch (rtype) {
+ case BINARY:
+ break;
+ case LIST: {
+ // check this is a list of raw vectors
+ const SEXP* p_x = VECTOR_PTR_RO(x);
+ for (R_xlen_t i = 0; i < size; i++, ++p_x) {
+ if (TYPEOF(*p_x) != RAWSXP) {
+ return Status::Invalid("invalid R type to convert to binary");
+ }
+ }
+ break;
+ }
+ default:
+ return Status::Invalid("invalid R type to convert to binary");
+ }
+ return Status::OK();
+}
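+
+// For example (illustrative, R side): list(as.raw(c(1, 2)), as.raw(3))
+// passes this check, while list("a") is rejected.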
+
+template <typename T>
+class RPrimitiveConverter<T, enable_if_binary<T>>
+ : public PrimitiveConverter<T, RConverter> {
+ public:
+ using OffsetType = typename T::offset_type;
+
+ Status Extend(SEXP x, int64_t size, int64_t offset = 0) override {
+ RETURN_NOT_OK(this->Reserve(size - offset));
+ RETURN_NOT_OK(check_binary(x, size));
+
+ auto append_null = [this]() {
+ this->primitive_builder_->UnsafeAppendNull();
+ return Status::OK();
+ };
+
+ auto append_value = [this](SEXP raw) {
+ R_xlen_t n = XLENGTH(raw);
+ ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(n));
+ this->primitive_builder_->UnsafeAppend(RAW_RO(raw), static_cast<OffsetType>(n));
+ return Status::OK();
+ };
+ return VisitVector(RVectorIterator<SEXP>(x, offset), size, append_null, append_value);
+ }
+
+ void DelayedExtend(SEXP values, int64_t size, RTasks& tasks) override {
+ auto task = [this, values, size]() { return this->Extend(values, size); };
+ tasks.Append(!ALTREP(values), std::move(task));
+ }
+};
+
+template <typename T>
+class RPrimitiveConverter<T, enable_if_t<std::is_same<T, FixedSizeBinaryType>::value>>
+ : public PrimitiveConverter<T, RConverter> {
+ public:
+ Status Extend(SEXP x, int64_t size, int64_t offset = 0) override {
+ RETURN_NOT_OK(this->Reserve(size - offset));
+ RETURN_NOT_OK(check_binary(x, size));
+
+ auto append_null = [this]() {
+ this->primitive_builder_->UnsafeAppendNull();
+ return Status::OK();
+ };
+
+ auto append_value = [this](SEXP raw) {
+ R_xlen_t n = XLENGTH(raw);
+
+ if (n != this->primitive_builder_->byte_width()) {
+ return Status::Invalid("invalid size");
+ }
+ ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(n));
+ this->primitive_builder_->UnsafeAppend(RAW_RO(raw));
+ return Status::OK();
+ };
+ return VisitVector(RVectorIterator<SEXP>(x, offset), size, append_null, append_value);
+ }
+
+ void DelayedExtend(SEXP values, int64_t size, RTasks& tasks) override {
+ auto task = [this, values, size]() { return this->Extend(values, size); };
+ tasks.Append(!ALTREP(values), std::move(task));
+ }
+};
+
+template <typename T>
+class RPrimitiveConverter<T, enable_if_string_like<T>>
+ : public PrimitiveConverter<T, RConverter> {
+ public:
+ using OffsetType = typename T::offset_type;
+
+ Status Extend(SEXP x, int64_t size, int64_t offset = 0) override {
+ RVectorType rtype = GetVectorType(x);
+ if (rtype != STRING) {
+ return Status::Invalid("Expecting a character vector");
+ }
+ return UnsafeAppendUtf8Strings(arrow::r::utf8_strings(x), size, offset);
+ }
+
+ void DelayedExtend(SEXP values, int64_t size, RTasks& tasks) override {
+ auto task = [this, values, size]() { return this->Extend(values, size); };
+    // TODO: refine this, e.g. extract setup from Extend()
+ tasks.Append(false, std::move(task));
+ }
+
+ private:
+ Status UnsafeAppendUtf8Strings(const cpp11::strings& s, int64_t size, int64_t offset) {
+ RETURN_NOT_OK(this->primitive_builder_->Reserve(s.size()));
+ const SEXP* p_strings = reinterpret_cast<const SEXP*>(DATAPTR_RO(s));
+
+ // we know all the R strings are utf8 already, so we can get
+ // a definite size and then use UnsafeAppend*()
+ int64_t total_length = 0;
+ for (R_xlen_t i = offset; i < size; i++, ++p_strings) {
+ SEXP si = *p_strings;
+ total_length += si == NA_STRING ? 0 : LENGTH(si);
+ }
+ RETURN_NOT_OK(this->primitive_builder_->ReserveData(total_length));
+
+ // append
+ p_strings = reinterpret_cast<const SEXP*>(DATAPTR_RO(s));
+ for (R_xlen_t i = offset; i < size; i++, ++p_strings) {
+ SEXP si = *p_strings;
+ if (si == NA_STRING) {
+ this->primitive_builder_->UnsafeAppendNull();
+ } else {
+ this->primitive_builder_->UnsafeAppend(CHAR(si), LENGTH(si));
+ }
+ }
+
+ return Status::OK();
+ }
+};
+
+template <typename T>
+class RPrimitiveConverter<T, enable_if_t<is_duration_type<T>::value>>
+ : public PrimitiveConverter<T, RConverter> {
+ public:
+ Status Extend(SEXP x, int64_t size, int64_t offset = 0) override {
+ // TODO: look in lubridate
+ return Status::NotImplemented("Extend");
+ }
+};
+
+template <typename T>
+class RListConverter;
+
+template <typename U, typename Enable = void>
+class RDictionaryConverter;
+
+template <typename U>
+class RDictionaryConverter<U, enable_if_has_c_type<U>>
+ : public DictionaryConverter<U, RConverter> {
+ public:
+ Status Extend(SEXP x, int64_t size, int64_t offset = 0) override {
+ return Status::NotImplemented("Extend");
+ }
+};
+
+template <typename ValueType>
+class RDictionaryConverter<ValueType, enable_if_has_string_view<ValueType>>
+ : public DictionaryConverter<ValueType, RConverter> {
+ public:
+ using BuilderType = DictionaryBuilder<ValueType>;
+
+ Status Extend(SEXP x, int64_t size, int64_t offset = 0) override {
+ RETURN_NOT_OK(ExtendSetup(x, size, offset));
+ return ExtendImpl(x, size, offset, GetCharLevels(x));
+ }
+
+ void DelayedExtend(SEXP values, int64_t size, RTasks& tasks) override {
+ // the setup runs synchronously first
+ Status setup = ExtendSetup(values, size, /*offset=*/0);
+
+ if (!setup.ok()) {
+ // if that fails, propagate the error
+ tasks.Append(false, [setup]() { return setup; });
+ } else {
+ auto char_levels = GetCharLevels(values);
+
+ tasks.Append(true, [this, values, size, char_levels]() {
+ return this->ExtendImpl(values, size, /*offset=*/0, char_levels);
+ });
+ }
+ }
+
+ Result<std::shared_ptr<Array>> ToArray() override {
+ ARROW_ASSIGN_OR_RAISE(auto result, this->builder_->Finish());
+
+ auto result_type = checked_cast<DictionaryType*>(result->type().get());
+ if (this->dict_type_->ordered() && !result_type->ordered()) {
+      // TODO: we should not have to do this; there is probably something
+      // wrong in the DictionaryBuilder code
+ result->data()->type =
+ arrow::dictionary(result_type->index_type(), result_type->value_type(), true);
+ }
+
+ return std::make_shared<DictionaryArray>(result->data());
+ }
+
+ private:
+ std::vector<const char*> GetCharLevels(SEXP x) {
+ SEXP levels = Rf_getAttrib(x, R_LevelsSymbol);
+ R_xlen_t n_levels = XLENGTH(levels);
+ std::vector<const char*> char_levels(XLENGTH(levels));
+ const SEXP* p_levels = reinterpret_cast<const SEXP*>(DATAPTR_RO(levels));
+ for (R_xlen_t i = 0; i < n_levels; i++, ++p_levels) {
+ char_levels[i] = CHAR(*p_levels);
+ }
+
+ return char_levels;
+ }
+
+ Status ExtendSetup(SEXP x, int64_t size, int64_t offset) {
+ RVectorType rtype = GetVectorType(x);
+ if (rtype != FACTOR) {
+ return Status::Invalid("invalid R type to convert to dictionary");
+ }
+
+ // first we need to handle the levels
+ SEXP levels = Rf_getAttrib(x, R_LevelsSymbol);
+ auto memo_array = arrow::r::vec_to_arrow(levels, utf8(), false);
+ RETURN_NOT_OK(this->value_builder_->InsertMemoValues(*memo_array));
+
+ // then we can proceed
+ return this->Reserve(size - offset);
+ }
+
+ Status ExtendImpl(SEXP values, int64_t size, int64_t offset,
+ const std::vector<const char*>& char_levels) {
+ auto append_null = [this]() { return this->value_builder_->AppendNull(); };
+ auto append_value = [this, &char_levels](int value) {
+ return this->value_builder_->Append(char_levels[value - 1]);
+ };
+
+ return VisitVector(RVectorIterator<int>(values, offset), size, append_null,
+ append_value);
+ }
+};
+
+template <typename T, typename Enable = void>
+struct RConverterTrait;
+
+template <typename T>
+struct RConverterTrait<
+ T, enable_if_t<!is_nested_type<T>::value && !is_interval_type<T>::value &&
+ !is_extension_type<T>::value>> {
+ using type = RPrimitiveConverter<T>;
+};
+
+template <typename T>
+struct RConverterTrait<T, enable_if_list_like<T>> {
+ using type = RListConverter<T>;
+};
+
+template <typename T>
+class RListConverter : public ListConverter<T, RConverter, RConverterTrait> {
+ public:
+ Status Extend(SEXP x, int64_t size, int64_t offset = 0) override {
+ RETURN_NOT_OK(this->Reserve(size));
+
+ RVectorType rtype = GetVectorType(x);
+ if (rtype != LIST) {
+ return Status::Invalid("Cannot convert to list type");
+ }
+
+ auto append_null = [this]() { return this->list_builder_->AppendNull(); };
+
+ auto append_value = [this](SEXP value) {
+ // TODO: if we decide that this can be run concurrently
+ // we'll have to do vec_size() upfront
+ int n = vctrs::vec_size(value);
+
+ RETURN_NOT_OK(this->list_builder_->ValidateOverflow(n));
+ RETURN_NOT_OK(this->list_builder_->Append());
+ return this->value_converter_.get()->Extend(value, n);
+ };
+
+ return VisitVector(RVectorIterator<SEXP>(x, offset), size, append_null, append_value);
+ }
+
+ void DelayedExtend(SEXP values, int64_t size, RTasks& tasks) override {
+    // NOTE: because the append_value lambda in Extend() calls Extend() on
+    // the value converter, which might require a setup step, it is
+    // complicated to run this task concurrently.
+ //
+ // TODO: perhaps allow running concurrently in some cases, e.g. list(int32(!altrep))
+ tasks.Append(false, [this, values, size]() { return this->Extend(values, size); });
+ }
+};
+
+class RStructConverter;
+
+template <>
+struct RConverterTrait<StructType> {
+ using type = RStructConverter;
+};
+
+class RStructConverter : public StructConverter<RConverter, RConverterTrait> {
+ public:
+ Status Extend(SEXP x, int64_t size, int64_t offset = 0) override {
+ RETURN_NOT_OK(ExtendSetup(x, size, offset));
+
+ auto fields = this->struct_type_->fields();
+ R_xlen_t n_columns = XLENGTH(x);
+ for (R_xlen_t i = offset; i < n_columns; i++) {
+ auto status = children_[i]->Extend(VECTOR_ELT(x, i), size);
+ if (!status.ok()) {
+ return Status::Invalid("Problem with column ", (i + 1), " (", fields[i]->name(),
+ "): ", status.ToString());
+ }
+ }
+
+ return Status::OK();
+ }
+
+ void DelayedExtend(SEXP values, int64_t size, RTasks& tasks) override {
+ // the setup runs synchronously first
+ Status setup = ExtendSetup(values, size, /*offset=*/0);
+
+ if (!setup.ok()) {
+ // if that fails, propagate the error
+ tasks.Append(false, [setup]() { return setup; });
+ } else {
+ // otherwise deal with each column, maybe concurrently
+ auto fields = this->struct_type_->fields();
+ R_xlen_t n_columns = XLENGTH(values);
+
+ for (R_xlen_t i = 0; i < n_columns; i++) {
+ children_[i]->DelayedExtend(VECTOR_ELT(values, i), size, tasks);
+ }
+ }
+ }
+
+ protected:
+ Status Init(MemoryPool* pool) override {
+ return StructConverter<RConverter, RConverterTrait>::Init(pool);
+ }
+
+ Status ExtendSetup(SEXP x, int64_t size, int64_t offset) {
+ // check that x is compatible
+ R_xlen_t n_columns = XLENGTH(x);
+
+ if (!Rf_inherits(x, "data.frame") && !Rf_inherits(x, "POSIXlt")) {
+ return Status::Invalid("Can only convert data frames to Struct type");
+ }
+
+ auto fields = this->struct_type_->fields();
+ if (n_columns != static_cast<R_xlen_t>(fields.size())) {
+ return Status::RError("Number of fields in struct (", fields.size(),
+ ") incompatible with number of columns in the data frame (",
+ n_columns, ")");
+ }
+
+ cpp11::strings x_names = Rf_getAttrib(x, R_NamesSymbol);
+
+ RETURN_NOT_OK(cpp11::unwind_protect([&] {
+ for (int i = 0; i < n_columns; i++) {
+ const char* name_i = arrow::r::unsafe::utf8_string(x_names[i]);
+ auto field_name = fields[i]->name();
+ if (field_name != name_i) {
+ return Status::RError(
+ "Field name in position ", i, " (", field_name,
+ ") does not match the name of the column of the data frame (", name_i, ")");
+ }
+ }
+
+ return Status::OK();
+ }));
+
+ for (R_xlen_t i = 0; i < n_columns; i++) {
+ SEXP x_i = VECTOR_ELT(x, i);
+ if (vctrs::vec_size(x_i) < size) {
+ return Status::RError("Degenerated data frame");
+ }
+ }
+
+ RETURN_NOT_OK(this->Reserve(size - offset));
+
+ for (R_xlen_t i = 0; i < size; i++) {
+ RETURN_NOT_OK(struct_builder_->Append());
+ }
+
+ return Status::OK();
+ }
+};
+
+template <>
+struct RConverterTrait<DictionaryType> {
+ template <typename T>
+ using dictionary_type = RDictionaryConverter<T>;
+};
+
+// ---- short circuit the Converter API entirely when we can do zero-copy
+
+// In some situations we can reuse the memory of the R object through an
+// RBuffer instead of going through ArrayBuilder, etc.
+bool can_reuse_memory(SEXP x, const std::shared_ptr<arrow::DataType>& type) {
+ // TODO: this probably should be disabled when x is an ALTREP object
+ // because MakeSimpleArray below will force materialization
+ switch (type->id()) {
+ case Type::INT32:
+ return TYPEOF(x) == INTSXP && !OBJECT(x);
+ case Type::DOUBLE:
+ return TYPEOF(x) == REALSXP && !OBJECT(x);
+ case Type::INT8:
+ return TYPEOF(x) == RAWSXP && !OBJECT(x);
+ case Type::INT64:
+ return TYPEOF(x) == REALSXP && Rf_inherits(x, "integer64");
+ default:
+ break;
+ }
+ return false;
+}
+
+// This is only used in special cases where the Arrow Array can directly use
+// the memory of the R object via an RBuffer, and hence be zero-copy.
+template <int RTYPE, typename RVector, typename Type>
+std::shared_ptr<Array> MakeSimpleArray(SEXP x) {
+ using value_type = typename arrow::TypeTraits<Type>::ArrayType::value_type;
+ RVector vec(x);
+ auto n = vec.size();
+ auto p_vec_start = reinterpret_cast<const value_type*>(DATAPTR_RO(vec));
+ auto p_vec_end = p_vec_start + n;
+ std::vector<std::shared_ptr<Buffer>> buffers{nullptr,
+ std::make_shared<RBuffer<RVector>>(vec)};
+
+ int null_count = 0;
+
+ auto first_na = std::find_if(p_vec_start, p_vec_end, is_NA<value_type>);
+ if (first_na < p_vec_end) {
+ auto null_bitmap =
+ ValueOrStop(AllocateBuffer(BitUtil::BytesForBits(n), gc_memory_pool()));
+ internal::FirstTimeBitmapWriter bitmap_writer(null_bitmap->mutable_data(), 0, n);
+
+    // first loop: set the validity bits before the first NA (all valid so far)
+ auto j = std::distance(p_vec_start, first_na);
+ int i = 0;
+ for (; i < j; i++, bitmap_writer.Next()) {
+ bitmap_writer.Set();
+ }
+
+ auto p_vec = first_na;
+ // then finish
+ for (; i < n; i++, bitmap_writer.Next(), ++p_vec) {
+ if (is_NA<value_type>(*p_vec)) {
+ bitmap_writer.Clear();
+ null_count++;
+ } else {
+ bitmap_writer.Set();
+ }
+ }
+
+ bitmap_writer.Finish();
+ buffers[0] = std::move(null_bitmap);
+ }
+
+ auto data = ArrayData::Make(std::make_shared<Type>(), LENGTH(x), std::move(buffers),
+ null_count, 0 /*offset*/);
+
+ // return the right Array class
+ return std::make_shared<typename TypeTraits<Type>::ArrayType>(data);
+}
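+
+// Illustrative sketch: for a plain (unclassed, non-ALTREP) integer vector the
+// whole conversion reduces to
+//   MakeSimpleArray<INTSXP, cpp11::integers, Int32Type>(x)
+// which wraps the vector's memory in an RBuffer without copying, and only
+// allocates a validity bitmap when at least one NA is present.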
+
+std::shared_ptr<arrow::Array> vec_to_arrow__reuse_memory(SEXP x) {
+ auto type = TYPEOF(x);
+
+ if (type == INTSXP) {
+ return MakeSimpleArray<INTSXP, cpp11::integers, Int32Type>(x);
+ } else if (type == REALSXP && Rf_inherits(x, "integer64")) {
+ return MakeSimpleArray<REALSXP, cpp11::doubles, Int64Type>(x);
+ } else if (type == REALSXP) {
+ return MakeSimpleArray<REALSXP, cpp11::doubles, DoubleType>(x);
+ } else if (type == RAWSXP) {
+ return MakeSimpleArray<RAWSXP, cpp11::raws, UInt8Type>(x);
+ }
+
+ cpp11::stop("Unreachable: you might need to fix can_reuse_memory()");
+}
+
+namespace altrep {
+std::shared_ptr<Array> vec_to_arrow_altrep_bypass(SEXP); // in altrep.cpp
+}
+
+std::shared_ptr<arrow::Array> vec_to_arrow(SEXP x,
+ const std::shared_ptr<arrow::DataType>& type,
+ bool type_inferred) {
+ // short circuit if `x` is already an Array
+ if (Rf_inherits(x, "Array")) {
+ return cpp11::as_cpp<std::shared_ptr<arrow::Array>>(x);
+ }
+
+  // short circuit if `x` is an altrep vector that wraps an Array
+ auto maybe = altrep::vec_to_arrow_altrep_bypass(x);
+ if (maybe.get()) {
+ return maybe;
+ }
+
+ RConversionOptions options;
+ options.strict = !type_inferred;
+ options.type = type;
+ options.size = vctrs::vec_size(x);
+
+ // maybe short circuit when zero-copy is possible
+ if (can_reuse_memory(x, options.type)) {
+ return vec_to_arrow__reuse_memory(x);
+ }
+
+ // otherwise go through the converter api
+ auto converter = ValueOrStop(MakeConverter<RConverter, RConverterTrait>(
+ options.type, options, gc_memory_pool()));
+
+ StopIfNotOk(converter->Extend(x, options.size));
+
+ return ValueOrStop(converter->ToArray());
+}
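+
+// Illustrative call (hypothetical caller): converting an R vector `x` when
+// the target type is already known, so inference is skipped and strict
+// checking applies:
+//
+//   std::shared_ptr<arrow::Array> a =
+//       vec_to_arrow(x, arrow::int32(), /*type_inferred=*/false);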
+
+// TODO: most of this is very similar to MakeSimpleArray, just adapted to
+// leverage concurrency. Maybe some refactoring needed.
+template <typename RVector, typename Type>
+bool vector_from_r_memory_impl(SEXP x, const std::shared_ptr<DataType>& type,
+ std::vector<std::shared_ptr<arrow::ChunkedArray>>& columns,
+ int j, RTasks& tasks) {
+ RVector vec(x);
+ using value_type = typename arrow::TypeTraits<Type>::ArrayType::value_type;
+ auto buffer = std::make_shared<RBuffer<RVector>>(vec);
+
+ tasks.Append(true, [buffer, x, &columns, j]() {
+ std::vector<std::shared_ptr<Buffer>> buffers{nullptr, buffer};
+
+ auto n = XLENGTH(x);
+ auto p_x_start = reinterpret_cast<const value_type*>(DATAPTR_RO(x));
+ auto p_x_end = p_x_start + n;
+
+ int null_count = 0;
+ auto first_na = std::find_if(p_x_start, p_x_end, is_NA<value_type>);
+
+ if (first_na < p_x_end) {
+ auto null_bitmap =
+ ValueOrStop(AllocateBuffer(BitUtil::BytesForBits(n), gc_memory_pool()));
+ internal::FirstTimeBitmapWriter bitmap_writer(null_bitmap->mutable_data(), 0, n);
+
+      // first loop: set the validity bits before the first NA (all valid so far)
+ auto k = std::distance(p_x_start, first_na);
+ int i = 0;
+ for (; i < k; i++, bitmap_writer.Next()) {
+ bitmap_writer.Set();
+ }
+
+ auto p_vec = first_na;
+ // then finish
+ for (; i < n; i++, bitmap_writer.Next(), ++p_vec) {
+ if (is_NA<value_type>(*p_vec)) {
+ bitmap_writer.Clear();
+ null_count++;
+ } else {
+ bitmap_writer.Set();
+ }
+ }
+
+ bitmap_writer.Finish();
+ buffers[0] = std::move(null_bitmap);
+ }
+
+ auto data = ArrayData::Make(std::make_shared<Type>(), n, std::move(buffers),
+ null_count, 0 /*offset*/);
+ auto array = std::make_shared<typename TypeTraits<Type>::ArrayType>(data);
+ columns[j] = std::make_shared<arrow::ChunkedArray>(array);
+
+ return Status::OK();
+ });
+
+ return true;
+}
+
+bool vector_from_r_memory(SEXP x, const std::shared_ptr<DataType>& type,
+ std::vector<std::shared_ptr<arrow::ChunkedArray>>& columns,
+ int j, RTasks& tasks) {
+ if (ALTREP(x)) return false;
+
+ switch (type->id()) {
+ case Type::INT32:
+ return TYPEOF(x) == INTSXP && !OBJECT(x) &&
+ vector_from_r_memory_impl<cpp11::integers, Int32Type>(x, type, columns, j,
+ tasks);
+
+ case Type::DOUBLE:
+ return TYPEOF(x) == REALSXP && !OBJECT(x) &&
+ vector_from_r_memory_impl<cpp11::doubles, DoubleType>(x, type, columns, j,
+ tasks);
+
+ case Type::UINT8:
+ return TYPEOF(x) == RAWSXP && !OBJECT(x) &&
+ vector_from_r_memory_impl<cpp11::raws, UInt8Type>(x, type, columns, j,
+ tasks);
+
+ case Type::INT64:
+ return TYPEOF(x) == REALSXP && Rf_inherits(x, "integer64") &&
+ vector_from_r_memory_impl<cpp11::doubles, Int64Type>(x, type, columns, j,
+ tasks);
+ default:
+ break;
+ }
+
+ return false;
+}
+
+} // namespace r
+} // namespace arrow
+
+arrow::Status check_consistent_column_length(
+ const std::vector<std::shared_ptr<arrow::ChunkedArray>>& columns) {
+ if (columns.size()) {
+ int64_t num_rows = columns[0]->length();
+
+ for (const auto& column : columns) {
+ if (column->length() != num_rows) {
+ return arrow::Status::Invalid("All columns must have the same length");
+ }
+ }
+ }
+
+ return arrow::Status::OK();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Table> Table__from_dots(SEXP lst, SEXP schema_sxp,
+ bool use_threads) {
+ bool infer_schema = !Rf_inherits(schema_sxp, "Schema");
+
+ int num_fields;
+ StopIfNotOk(arrow::r::count_fields(lst, &num_fields));
+
+ // schema + metadata
+ std::shared_ptr<arrow::Schema> schema;
+ StopIfNotOk(arrow::r::InferSchemaFromDots(lst, schema_sxp, num_fields, schema));
+ StopIfNotOk(arrow::r::AddMetadataFromDots(lst, num_fields, schema));
+
+ if (!infer_schema && schema->num_fields() != num_fields) {
+ cpp11::stop("incompatible. schema has %d fields, and %d columns are supplied",
+ schema->num_fields(), num_fields);
+ }
+
+ // table
+ std::vector<std::shared_ptr<arrow::ChunkedArray>> columns(num_fields);
+
+ if (!infer_schema) {
+ auto check_name = [&](int j, SEXP, cpp11::r_string name) {
+ std::string cpp_name(name);
+ if (schema->field(j)->name() != cpp_name) {
+ cpp11::stop("field at index %d has name '%s' != '%s'", j + 1,
+ schema->field(j)->name().c_str(), cpp_name.c_str());
+ }
+ };
+ arrow::r::TraverseDots(lst, num_fields, check_name);
+ }
+
+  // be careful not to call R's stop() until all the tasks are finished,
+  // i.e. until after tasks.Finish()
+ arrow::r::RTasks tasks(use_threads);
+
+ arrow::Status status = arrow::Status::OK();
+
+ auto flatten_lst = arrow::r::FlattenDots(lst, num_fields);
+ std::vector<std::unique_ptr<arrow::r::RConverter>> converters(num_fields);
+
+ // init converters
+ for (int j = 0; j < num_fields && status.ok(); j++) {
+ SEXP x = flatten_lst[j];
+
+ if (Rf_inherits(x, "ChunkedArray")) {
+ columns[j] = cpp11::as_cpp<std::shared_ptr<arrow::ChunkedArray>>(x);
+ } else if (Rf_inherits(x, "Array")) {
+ columns[j] = std::make_shared<arrow::ChunkedArray>(
+ cpp11::as_cpp<std::shared_ptr<arrow::Array>>(x));
+ } else {
+ arrow::r::RConversionOptions options;
+ options.strict = !infer_schema;
+ options.type = schema->field(j)->type();
+ options.size = vctrs::vec_size(x);
+
+ // first try to add a task to do a zero copy in parallel
+ if (arrow::r::vector_from_r_memory(x, options.type, columns, j, tasks)) {
+ continue;
+ }
+
+ // if unsuccessful: use RConverter api
+ auto converter_result =
+ arrow::MakeConverter<arrow::r::RConverter, arrow::r::RConverterTrait>(
+ options.type, options, gc_memory_pool());
+ if (!converter_result.ok()) {
+ status = converter_result.status();
+ break;
+ }
+ converters[j] = std::move(converter_result.ValueUnsafe());
+ }
+ }
+
+ // if the previous loop didn't break early, spawn
+ // tasks to Extend, maybe in parallel
+ if (status.ok()) {
+ for (int j = 0; j < num_fields; j++) {
+ auto& converter = converters[j];
+ if (converter != nullptr) {
+ converter->DelayedExtend(flatten_lst[j], converter->options().size, tasks);
+ }
+ }
+ }
+
+ // in any case, this needs to wait until all tasks are finished
+ status &= tasks.Finish();
+
+ // nothing is running in parallel here, so we have an opportunity to stop
+ StopIfNotOk(status);
+
+ // then finally convert to chunked arrays in parallel
+ tasks.Reset();
+
+ for (int j = 0; j < num_fields; j++) {
+ tasks.Append(true, [&columns, j, &converters]() {
+ auto& converter = converters[j];
+ if (converter != nullptr) {
+ ARROW_ASSIGN_OR_RAISE(auto array, converter->ToArray());
+ columns[j] = std::make_shared<arrow::ChunkedArray>(array);
+ }
+ return arrow::Status::OK();
+ });
+ }
+ status &= tasks.Finish();
+ StopIfNotOk(status);
+
+ status &= check_consistent_column_length(columns);
+ StopIfNotOk(status);
+
+ return arrow::Table::Make(schema, columns);
+}
+
+// [[arrow::export]]
+SEXP vec_to_arrow(SEXP x, SEXP s_type) {
+ if (Rf_inherits(x, "Array")) return x;
+ bool type_inferred = Rf_isNull(s_type);
+ std::shared_ptr<arrow::DataType> type;
+
+ if (type_inferred) {
+ type = arrow::r::InferArrowType(x);
+ } else {
+ type = cpp11::as_cpp<std::shared_ptr<arrow::DataType>>(s_type);
+ }
+ return cpp11::to_r6(arrow::r::vec_to_arrow(x, type, type_inferred));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Array> DictionaryArray__FromArrays(
+ const std::shared_ptr<arrow::DataType>& type,
+ const std::shared_ptr<arrow::Array>& indices,
+ const std::shared_ptr<arrow::Array>& dict) {
+ return ValueOrStop(arrow::DictionaryArray::FromArrays(type, indices, dict));
+}
+
+#endif
diff --git a/src/arrow/r/src/recordbatch.cpp b/src/arrow/r/src/recordbatch.cpp
new file mode 100644
index 000000000..81e20e9ec
--- /dev/null
+++ b/src/arrow/r/src/recordbatch.cpp
@@ -0,0 +1,309 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+#include <arrow/array/array_base.h>
+#include <arrow/io/file.h>
+#include <arrow/io/memory.h>
+#include <arrow/ipc/reader.h>
+#include <arrow/ipc/writer.h>
+#include <arrow/type.h>
+#include <arrow/util/key_value_metadata.h>
+
+// [[arrow::export]]
+int RecordBatch__num_columns(const std::shared_ptr<arrow::RecordBatch>& x) {
+ return x->num_columns();
+}
+
+// [[arrow::export]]
+int RecordBatch__num_rows(const std::shared_ptr<arrow::RecordBatch>& x) {
+ return x->num_rows();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Schema> RecordBatch__schema(
+ const std::shared_ptr<arrow::RecordBatch>& x) {
+ return x->schema();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::RecordBatch> RecordBatch__RenameColumns(
+ const std::shared_ptr<arrow::RecordBatch>& batch,
+ const std::vector<std::string>& names) {
+ int n = batch->num_columns();
+ if (names.size() != static_cast<size_t>(n)) {
+ cpp11::stop("RecordBatch has %d columns but %d names were provided", n, names.size());
+ }
+ std::vector<std::shared_ptr<arrow::Field>> fields(n);
+ for (int i = 0; i < n; i++) {
+ fields[i] = batch->schema()->field(i)->WithName(names[i]);
+ }
+ auto schema = std::make_shared<arrow::Schema>(std::move(fields));
+ return arrow::RecordBatch::Make(schema, batch->num_rows(), batch->columns());
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::RecordBatch> RecordBatch__ReplaceSchemaMetadata(
+ const std::shared_ptr<arrow::RecordBatch>& x, cpp11::strings metadata) {
+ auto vec_metadata = cpp11::as_cpp<std::vector<std::string>>(metadata);
+ auto names_metadata = cpp11::as_cpp<std::vector<std::string>>(metadata.names());
+ auto kv = std::shared_ptr<arrow::KeyValueMetadata>(
+ new arrow::KeyValueMetadata(names_metadata, vec_metadata));
+ return x->ReplaceSchemaMetadata(kv);
+}
+
+// [[arrow::export]]
+cpp11::list RecordBatch__columns(const std::shared_ptr<arrow::RecordBatch>& batch) {
+ auto nc = batch->num_columns();
+ arrow::ArrayVector res(nc);
+ for (int i = 0; i < nc; i++) {
+ res[i] = batch->column(i);
+ }
+ return arrow::r::to_r_list(res);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Array> RecordBatch__column(
+ const std::shared_ptr<arrow::RecordBatch>& batch, R_xlen_t i) {
+ arrow::r::validate_index(i, batch->num_columns());
+ return batch->column(i);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Array> RecordBatch__GetColumnByName(
+ const std::shared_ptr<arrow::RecordBatch>& batch, const std::string& name) {
+ return batch->GetColumnByName(name);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::RecordBatch> RecordBatch__SelectColumns(
+ const std::shared_ptr<arrow::RecordBatch>& batch, const std::vector<int>& indices) {
+ return ValueOrStop(batch->SelectColumns(indices));
+}
+
+// [[arrow::export]]
+bool RecordBatch__Equals(const std::shared_ptr<arrow::RecordBatch>& self,
+ const std::shared_ptr<arrow::RecordBatch>& other,
+ bool check_metadata) {
+ return self->Equals(*other, check_metadata);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::RecordBatch> RecordBatch__AddColumn(
+ const std::shared_ptr<arrow::RecordBatch>& batch, R_xlen_t i,
+ const std::shared_ptr<arrow::Field>& field,
+ const std::shared_ptr<arrow::Array>& column) {
+ return ValueOrStop(batch->AddColumn(i, field, column));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::RecordBatch> RecordBatch__SetColumn(
+ const std::shared_ptr<arrow::RecordBatch>& batch, R_xlen_t i,
+ const std::shared_ptr<arrow::Field>& field,
+ const std::shared_ptr<arrow::Array>& column) {
+ return ValueOrStop(batch->SetColumn(i, field, column));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::RecordBatch> RecordBatch__RemoveColumn(
+ const std::shared_ptr<arrow::RecordBatch>& batch, R_xlen_t i) {
+ arrow::r::validate_index(i, batch->num_columns());
+ return ValueOrStop(batch->RemoveColumn(i));
+}
+
+// [[arrow::export]]
+std::string RecordBatch__column_name(const std::shared_ptr<arrow::RecordBatch>& batch,
+ R_xlen_t i) {
+ arrow::r::validate_index(i, batch->num_columns());
+ return batch->column_name(i);
+}
+
+// [[arrow::export]]
+cpp11::writable::strings RecordBatch__names(
+ const std::shared_ptr<arrow::RecordBatch>& batch) {
+ int n = batch->num_columns();
+ cpp11::writable::strings names(n);
+ for (int i = 0; i < n; i++) {
+ names[i] = batch->column_name(i);
+ }
+ return names;
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::RecordBatch> RecordBatch__Slice1(
+ const std::shared_ptr<arrow::RecordBatch>& self, R_xlen_t offset) {
+ arrow::r::validate_slice_offset(offset, self->num_rows());
+ return self->Slice(offset);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::RecordBatch> RecordBatch__Slice2(
+ const std::shared_ptr<arrow::RecordBatch>& self, R_xlen_t offset, R_xlen_t length) {
+ arrow::r::validate_slice_offset(offset, self->num_rows());
+ arrow::r::validate_slice_length(length, self->num_rows() - offset);
+ return self->Slice(offset, length);
+}
+
+// [[arrow::export]]
+cpp11::raws ipc___SerializeRecordBatch__Raw(
+ const std::shared_ptr<arrow::RecordBatch>& batch) {
+  // how many bytes do we need?
+ int64_t size;
+ StopIfNotOk(arrow::ipc::GetRecordBatchSize(*batch, &size));
+
+ // allocate the result raw vector
+ cpp11::writable::raws out(size);
+
+ // serialize into the bytes of the raw vector
+ auto buffer = std::make_shared<arrow::r::RBuffer<cpp11::raws>>(out);
+ arrow::io::FixedSizeBufferWriter stream(buffer);
+ StopIfNotOk(arrow::ipc::SerializeRecordBatch(
+ *batch, arrow::ipc::IpcWriteOptions::Defaults(), &stream));
+ StopIfNotOk(stream.Close());
+
+ return out;
+}
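+
+// Usage sketch from R (assuming the generated wrapper is bound to the R6
+// serialize method, as elsewhere in this package):
+//
+//   batch <- arrow::record_batch(x = 1:3)
+//   bytes <- batch$serialize()  # a raw vector, filled in place via RBuffer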
+
+// [[arrow::export]]
+std::shared_ptr<arrow::RecordBatch> ipc___ReadRecordBatch__InputStream__Schema(
+ const std::shared_ptr<arrow::io::InputStream>& stream,
+ const std::shared_ptr<arrow::Schema>& schema) {
+ // TODO: promote to function arg
+ arrow::ipc::DictionaryMemo memo;
+ StopIfNotOk(memo.fields().AddSchemaFields(*schema));
+ return ValueOrStop(arrow::ipc::ReadRecordBatch(
+ schema, &memo, arrow::ipc::IpcReadOptions::Defaults(), stream.get()));
+}
+
+namespace arrow {
+namespace r {
+
+arrow::Status check_consistent_array_size(
+ const std::vector<std::shared_ptr<arrow::Array>>& arrays, int64_t* num_rows) {
+  if (!arrays.empty()) {
+ *num_rows = arrays[0]->length();
+
+ for (const auto& array : arrays) {
+ if (array->length() != *num_rows) {
+ return arrow::Status::Invalid("All arrays must have the same length");
+ }
+ }
+ }
+
+ return arrow::Status::OK();
+}
+
+Status count_fields(SEXP lst, int* out) {
+ int res = 0;
+ R_xlen_t n = XLENGTH(lst);
+ SEXP names = Rf_getAttrib(lst, R_NamesSymbol);
+ for (R_xlen_t i = 0; i < n; i++) {
+ if (LENGTH(STRING_ELT(names, i)) > 0) {
+ ++res;
+ } else {
+ SEXP x = VECTOR_ELT(lst, i);
+ if (Rf_inherits(x, "data.frame")) {
+ res += XLENGTH(x);
+ } else {
+        return Status::RError(
+            "only data frames are allowed as unnamed arguments to be auto-spliced");
+ }
+ }
+ }
+ *out = res;
+ return Status::OK();
+}
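+
+// For example (a sketch using the public R constructor): in
+//   record_batch(x = 1:3, tibble::tibble(y = 4:6, z = 7:9))
+// the unnamed data frame is auto-spliced, so count_fields() reports three
+// fields: x, y, and z.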
+
+} // namespace r
+} // namespace arrow
+
+std::shared_ptr<arrow::RecordBatch> RecordBatch__from_arrays__known_schema(
+ const std::shared_ptr<arrow::Schema>& schema, SEXP lst) {
+ int num_fields;
+ StopIfNotOk(arrow::r::count_fields(lst, &num_fields));
+
+ if (schema->num_fields() != num_fields) {
+ cpp11::stop("incompatible. schema has %d fields, and %d arrays are supplied",
+ schema->num_fields(), num_fields);
+ }
+
+ // convert lst to a vector of arrow::Array
+ std::vector<std::shared_ptr<arrow::Array>> arrays(num_fields);
+
+ auto fill_array = [&arrays, &schema](int j, SEXP x, std::string name) {
+ if (schema->field(j)->name() != name) {
+ cpp11::stop("field at index %d has name '%s' != '%s'", j + 1,
+ schema->field(j)->name().c_str(), name.c_str());
+ }
+ arrays[j] = arrow::r::vec_to_arrow(x, schema->field(j)->type(), false);
+ };
+
+ arrow::r::TraverseDots(lst, num_fields, fill_array);
+
+ int64_t num_rows = 0;
+ StopIfNotOk(arrow::r::check_consistent_array_size(arrays, &num_rows));
+ return arrow::RecordBatch::Make(schema, num_rows, arrays);
+}
+
+namespace arrow {
+namespace r {
+
+arrow::Status CollectRecordBatchArrays(
+ SEXP lst, const std::shared_ptr<arrow::Schema>& schema, int num_fields, bool inferred,
+ std::vector<std::shared_ptr<arrow::Array>>& arrays) {
+ auto extract_one_array = [&arrays, &schema, inferred](int j, SEXP x, cpp11::r_string) {
+ arrays[j] = arrow::r::vec_to_arrow(x, schema->field(j)->type(), inferred);
+ };
+ arrow::r::TraverseDots(lst, num_fields, extract_one_array);
+ return arrow::Status::OK();
+}
+
+} // namespace r
+} // namespace arrow
+
+// [[arrow::export]]
+std::shared_ptr<arrow::RecordBatch> RecordBatch__from_arrays(SEXP schema_sxp, SEXP lst) {
+ bool infer_schema = !Rf_inherits(schema_sxp, "Schema");
+
+ int num_fields;
+ StopIfNotOk(arrow::r::count_fields(lst, &num_fields));
+
+ // schema + metadata
+ std::shared_ptr<arrow::Schema> schema;
+ StopIfNotOk(arrow::r::InferSchemaFromDots(lst, schema_sxp, num_fields, schema));
+ StopIfNotOk(arrow::r::AddMetadataFromDots(lst, num_fields, schema));
+
+  // RecordBatch from the user-supplied schema
+ if (!infer_schema) {
+ return RecordBatch__from_arrays__known_schema(schema, lst);
+ }
+
+  // RecordBatch from the inferred schema
+ std::vector<std::shared_ptr<arrow::Array>> arrays(num_fields);
+ StopIfNotOk(
+ arrow::r::CollectRecordBatchArrays(lst, schema, num_fields, infer_schema, arrays));
+
+  // extract the number of rows and check that all arrays share it
+ int64_t num_rows = 0;
+ StopIfNotOk(arrow::r::check_consistent_array_size(arrays, &num_rows));
+
+ return arrow::RecordBatch::Make(schema, num_rows, arrays);
+}
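+
+// Usage sketch from R (assuming the generated wrappers in R/arrowExports.R):
+//
+//   s <- arrow::schema(x = arrow::int32(), y = arrow::utf8())
+//   batch <- arrow::record_batch(x = 1:3, y = c("a", "b", "c"), schema = s)
+//   batch$num_rows  # 3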
+
+#endif
diff --git a/src/arrow/r/src/recordbatchreader.cpp b/src/arrow/r/src/recordbatchreader.cpp
new file mode 100644
index 000000000..14af503b4
--- /dev/null
+++ b/src/arrow/r/src/recordbatchreader.cpp
@@ -0,0 +1,122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+#include <arrow/ipc/reader.h>
+#include <arrow/table.h>
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Schema> RecordBatchReader__schema(
+ const std::shared_ptr<arrow::RecordBatchReader>& reader) {
+ return reader->schema();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::RecordBatch> RecordBatchReader__ReadNext(
+ const std::shared_ptr<arrow::RecordBatchReader>& reader) {
+ std::shared_ptr<arrow::RecordBatch> batch;
+ StopIfNotOk(reader->ReadNext(&batch));
+ return batch;
+}
+
+// [[arrow::export]]
+cpp11::list RecordBatchReader__batches(
+ const std::shared_ptr<arrow::RecordBatchReader>& reader) {
+ std::vector<std::shared_ptr<arrow::RecordBatch>> res;
+ StopIfNotOk(reader->ReadAll(&res));
+ return arrow::r::to_r_list(res);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Table> Table__from_RecordBatchReader(
+ const std::shared_ptr<arrow::RecordBatchReader>& reader) {
+ std::shared_ptr<arrow::Table> table = nullptr;
+ StopIfNotOk(reader->ReadAll(&table));
+ return table;
+}
+
+// -------- RecordBatchStreamReader
+
+// [[arrow::export]]
+std::shared_ptr<arrow::ipc::RecordBatchStreamReader> ipc___RecordBatchStreamReader__Open(
+ const std::shared_ptr<arrow::io::InputStream>& stream) {
+ auto options = arrow::ipc::IpcReadOptions::Defaults();
+ options.memory_pool = gc_memory_pool();
+ return ValueOrStop(arrow::ipc::RecordBatchStreamReader::Open(stream, options));
+}
+
+// -------- RecordBatchFileReader
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Schema> ipc___RecordBatchFileReader__schema(
+ const std::shared_ptr<arrow::ipc::RecordBatchFileReader>& reader) {
+ return reader->schema();
+}
+
+// [[arrow::export]]
+int ipc___RecordBatchFileReader__num_record_batches(
+ const std::shared_ptr<arrow::ipc::RecordBatchFileReader>& reader) {
+ return reader->num_record_batches();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::RecordBatch> ipc___RecordBatchFileReader__ReadRecordBatch(
+ const std::shared_ptr<arrow::ipc::RecordBatchFileReader>& reader, int i) {
+  if (i < 0 || i >= reader->num_record_batches()) {
+ cpp11::stop("Record batch index out of bounds");
+ }
+ return ValueOrStop(reader->ReadRecordBatch(i));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::ipc::RecordBatchFileReader> ipc___RecordBatchFileReader__Open(
+ const std::shared_ptr<arrow::io::RandomAccessFile>& file) {
+ auto options = arrow::ipc::IpcReadOptions::Defaults();
+ options.memory_pool = gc_memory_pool();
+ return ValueOrStop(arrow::ipc::RecordBatchFileReader::Open(file, options));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Table> Table__from_RecordBatchFileReader(
+ const std::shared_ptr<arrow::ipc::RecordBatchFileReader>& reader) {
+  // RecordBatchStreamReader inherits from RecordBatchReader, but
+  // RecordBatchFileReader does not, so read each batch and assemble a Table
+ int num_batches = reader->num_record_batches();
+ std::vector<std::shared_ptr<arrow::RecordBatch>> batches(num_batches);
+ for (int i = 0; i < num_batches; i++) {
+ batches[i] = ValueOrStop(reader->ReadRecordBatch(i));
+ }
+
+ return ValueOrStop(arrow::Table::FromRecordBatches(std::move(batches)));
+}
+
+// [[arrow::export]]
+cpp11::list ipc___RecordBatchFileReader__batches(
+ const std::shared_ptr<arrow::ipc::RecordBatchFileReader>& reader) {
+ auto n = reader->num_record_batches();
+ std::vector<std::shared_ptr<arrow::RecordBatch>> res(n);
+
+ for (int i = 0; i < n; i++) {
+ res[i] = ValueOrStop(reader->ReadRecordBatch(i));
+ }
+
+ return arrow::r::to_r_list(res);
+}
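+
+// Usage sketch from R (assuming the public reader bindings):
+//
+//   reader <- arrow::RecordBatchFileReader$create("data.arrow")
+//   batches <- reader$batches()  # a list of RecordBatch objects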
+
+#endif
diff --git a/src/arrow/r/src/recordbatchwriter.cpp b/src/arrow/r/src/recordbatchwriter.cpp
new file mode 100644
index 000000000..00a617e55
--- /dev/null
+++ b/src/arrow/r/src/recordbatchwriter.cpp
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+#include <arrow/ipc/writer.h>
+
+// [[arrow::export]]
+void ipc___RecordBatchWriter__WriteRecordBatch(
+ const std::shared_ptr<arrow::ipc::RecordBatchWriter>& batch_writer,
+ const std::shared_ptr<arrow::RecordBatch>& batch) {
+ StopIfNotOk(batch_writer->WriteRecordBatch(*batch));
+}
+
+// [[arrow::export]]
+void ipc___RecordBatchWriter__WriteTable(
+ const std::shared_ptr<arrow::ipc::RecordBatchWriter>& batch_writer,
+ const std::shared_ptr<arrow::Table>& table) {
+ StopIfNotOk(batch_writer->WriteTable(*table));
+}
+
+// [[arrow::export]]
+void ipc___RecordBatchWriter__Close(
+ const std::shared_ptr<arrow::ipc::RecordBatchWriter>& batch_writer) {
+ StopIfNotOk(batch_writer->Close());
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::ipc::RecordBatchWriter> ipc___RecordBatchFileWriter__Open(
+ const std::shared_ptr<arrow::io::OutputStream>& stream,
+ const std::shared_ptr<arrow::Schema>& schema, bool use_legacy_format,
+ arrow::ipc::MetadataVersion metadata_version) {
+ auto options = arrow::ipc::IpcWriteOptions::Defaults();
+ options.write_legacy_ipc_format = use_legacy_format;
+ options.metadata_version = metadata_version;
+ options.memory_pool = gc_memory_pool();
+ return ValueOrStop(arrow::ipc::MakeFileWriter(stream, schema, options));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::ipc::RecordBatchWriter> ipc___RecordBatchStreamWriter__Open(
+ const std::shared_ptr<arrow::io::OutputStream>& stream,
+ const std::shared_ptr<arrow::Schema>& schema, bool use_legacy_format,
+ arrow::ipc::MetadataVersion metadata_version) {
+ auto options = arrow::ipc::IpcWriteOptions::Defaults();
+ options.write_legacy_ipc_format = use_legacy_format;
+ options.metadata_version = metadata_version;
+ options.memory_pool = gc_memory_pool();
+ return ValueOrStop(MakeStreamWriter(stream, schema, options));
+}
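+
+// Usage sketch from R (assuming the public writer bindings):
+//
+//   sink <- arrow::FileOutputStream$create(tempfile())
+//   writer <- arrow::RecordBatchFileWriter$create(sink, batch$schema)
+//   writer$write(batch)
+//   writer$close()
+//   sink$close()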
+
+#endif
diff --git a/src/arrow/r/src/scalar.cpp b/src/arrow/r/src/scalar.cpp
new file mode 100644
index 000000000..5450a6f0a
--- /dev/null
+++ b/src/arrow/r/src/scalar.cpp
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+
+#include <arrow/array/array_base.h>
+#include <arrow/array/util.h>
+#include <arrow/scalar.h>
+#include <arrow/type.h>
+
+namespace cpp11 {
+
+const char* r6_class_name<arrow::Scalar>::get(
+ const std::shared_ptr<arrow::Scalar>& scalar) {
+ if (scalar->type->id() == arrow::Type::STRUCT) {
+ return "StructScalar";
+ }
+ return "Scalar";
+}
+
+} // namespace cpp11
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Scalar> Array__GetScalar(const std::shared_ptr<arrow::Array>& x,
+ int64_t i) {
+ return ValueOrStop(x->GetScalar(i));
+}
+
+// [[arrow::export]]
+std::string Scalar__ToString(const std::shared_ptr<arrow::Scalar>& s) {
+ return s->ToString();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Scalar> StructScalar__field(
+ const std::shared_ptr<arrow::StructScalar>& s, int i) {
+ return ValueOrStop(s->field(i));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Scalar> StructScalar__GetFieldByName(
+ const std::shared_ptr<arrow::StructScalar>& s, const std::string& name) {
+ return ValueOrStop(s->field(name));
+}
+
+// [[arrow::export]]
+SEXP Scalar__as_vector(const std::shared_ptr<arrow::Scalar>& scalar) {
+ auto array = ValueOrStop(arrow::MakeArrayFromScalar(*scalar, 1, gc_memory_pool()));
+
+ // defined in array_to_vector.cpp
+ SEXP Array__as_vector(const std::shared_ptr<arrow::Array>& array);
+ return Array__as_vector(array);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Array> MakeArrayFromScalar(
+ const std::shared_ptr<arrow::Scalar>& scalar, int n) {
+ return ValueOrStop(arrow::MakeArrayFromScalar(*scalar, n, gc_memory_pool()));
+}
+
+// [[arrow::export]]
+bool Scalar__is_valid(const std::shared_ptr<arrow::Scalar>& s) { return s->is_valid; }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Scalar__type(const std::shared_ptr<arrow::Scalar>& s) {
+ return s->type;
+}
+
+// [[arrow::export]]
+bool Scalar__Equals(const std::shared_ptr<arrow::Scalar>& lhs,
+ const std::shared_ptr<arrow::Scalar>& rhs) {
+  return lhs->Equals(*rhs);
+}
+
+// [[arrow::export]]
+bool Scalar__ApproxEquals(const std::shared_ptr<arrow::Scalar>& lhs,
+ const std::shared_ptr<arrow::Scalar>& rhs) {
+ return lhs->ApproxEquals(*rhs);
+}
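+
+// Usage sketch from R (assuming the public Scalar bindings):
+//
+//   s <- arrow::Scalar$create(42L)
+//   s$is_valid      # TRUE
+//   s$as_vector()   # 42L, materialized through a length-1 Array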
+
+#endif
diff --git a/src/arrow/r/src/schema.cpp b/src/arrow/r/src/schema.cpp
new file mode 100644
index 000000000..dcb6ab36e
--- /dev/null
+++ b/src/arrow/r/src/schema.cpp
@@ -0,0 +1,146 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+#include <arrow/ipc/writer.h>
+#include <arrow/type.h>
+#include <arrow/util/key_value_metadata.h>
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Schema> schema_(
+ const std::vector<std::shared_ptr<arrow::Field>>& fields) {
+ return arrow::schema(fields);
+}
+
+// [[arrow::export]]
+std::string Schema__ToString(const std::shared_ptr<arrow::Schema>& s) {
+ return s->ToString();
+}
+
+// [[arrow::export]]
+int Schema__num_fields(const std::shared_ptr<arrow::Schema>& s) {
+ return s->num_fields();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Field> Schema__field(const std::shared_ptr<arrow::Schema>& s,
+ int i) {
+ if (i >= s->num_fields() || i < 0) {
+ cpp11::stop("Invalid field index for schema.");
+ }
+
+ return s->field(i);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Schema> Schema__AddField(
+ const std::shared_ptr<arrow::Schema>& s, int i,
+ const std::shared_ptr<arrow::Field>& field) {
+ return ValueOrStop(s->AddField(i, field));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Schema> Schema__SetField(
+ const std::shared_ptr<arrow::Schema>& s, int i,
+ const std::shared_ptr<arrow::Field>& field) {
+ return ValueOrStop(s->SetField(i, field));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Schema> Schema__RemoveField(
+ const std::shared_ptr<arrow::Schema>& s, int i) {
+ return ValueOrStop(s->RemoveField(i));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Field> Schema__GetFieldByName(
+ const std::shared_ptr<arrow::Schema>& s, std::string x) {
+ return s->GetFieldByName(x);
+}
+
+// [[arrow::export]]
+cpp11::list Schema__fields(const std::shared_ptr<arrow::Schema>& schema) {
+ return arrow::r::to_r_list(schema->fields());
+}
+
+// [[arrow::export]]
+std::vector<std::string> Schema__field_names(
+ const std::shared_ptr<arrow::Schema>& schema) {
+ return schema->field_names();
+}
+
+// [[arrow::export]]
+bool Schema__HasMetadata(const std::shared_ptr<arrow::Schema>& schema) {
+ return schema->HasMetadata();
+}
+
+// [[arrow::export]]
+cpp11::writable::list Schema__metadata(const std::shared_ptr<arrow::Schema>& schema) {
+ auto meta = schema->metadata();
+ int64_t n = 0;
+ if (schema->HasMetadata()) {
+ n = meta->size();
+ }
+
+ cpp11::writable::list out(n);
+ std::vector<std::string> names_out(n);
+
+ for (int i = 0; i < n; i++) {
+ auto key = meta->key(i);
+ out[i] = cpp11::as_sexp(meta->value(i));
+ if (key == "r") {
+ Rf_classgets(out[i], arrow::r::data::classes_metadata_r);
+ }
+ names_out[i] = key;
+ }
+ out.names() = names_out;
+ return out;
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Schema> Schema__WithMetadata(
+ const std::shared_ptr<arrow::Schema>& schema, cpp11::strings metadata) {
+ auto values = cpp11::as_cpp<std::vector<std::string>>(metadata);
+ auto names = cpp11::as_cpp<std::vector<std::string>>(metadata.attr("names"));
+
+ auto kv =
+ std::make_shared<arrow::KeyValueMetadata>(std::move(names), std::move(values));
+ return schema->WithMetadata(std::move(kv));
+}
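+
+// Usage sketch from R (assuming the R6 metadata binding forwards here):
+//
+//   s <- arrow::schema(x = arrow::int32())
+//   s$metadata <- list(origin = "sensor A")  # round-trips via KeyValueMetadata
+//   s$metadata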
+
+// [[arrow::export]]
+cpp11::writable::raws Schema__serialize(const std::shared_ptr<arrow::Schema>& schema) {
+ auto out = ValueOrStop(arrow::ipc::SerializeSchema(*schema));
+ auto n = out->size();
+ return cpp11::writable::raws(out->data(), out->data() + n);
+}
+
+// [[arrow::export]]
+bool Schema__Equals(const std::shared_ptr<arrow::Schema>& schema,
+ const std::shared_ptr<arrow::Schema>& other, bool check_metadata) {
+ return schema->Equals(*other, check_metadata);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Schema> arrow__UnifySchemas(
+ const std::vector<std::shared_ptr<arrow::Schema>>& schemas) {
+ return ValueOrStop(arrow::UnifySchemas(schemas));
+}
+
+#endif
diff --git a/src/arrow/r/src/symbols.cpp b/src/arrow/r/src/symbols.cpp
new file mode 100644
index 000000000..0cb32c462
--- /dev/null
+++ b/src/arrow/r/src/symbols.cpp
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+namespace arrow {
+namespace r {
+SEXP symbols::units = Rf_install("units");
+SEXP symbols::tzone = Rf_install("tzone");
+SEXP symbols::xp = Rf_install(".:xp:.");
+SEXP symbols::dot_Internal = Rf_install(".Internal");
+SEXP symbols::inspect = Rf_install("inspect");
+SEXP symbols::row_names = Rf_install("row.names");
+SEXP symbols::serialize_arrow_r_metadata = Rf_install(".serialize_arrow_r_metadata");
+SEXP symbols::as_list = Rf_install("as.list");
+SEXP symbols::ptype = Rf_install("ptype");
+SEXP symbols::byte_width = Rf_install("byte_width");
+SEXP symbols::list_size = Rf_install("list_size");
+SEXP symbols::arrow_attributes = Rf_install("arrow_attributes");
+SEXP symbols::new_ = Rf_install("new");
+SEXP symbols::create = Rf_install("create");
+SEXP symbols::arrow = Rf_install("arrow");
+
+// persistently protect `x` and return it
+SEXP precious(SEXP x) {
+ PROTECT(x);
+ R_PreserveObject(x);
+ UNPROTECT(1);
+ return x;
+}
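+
+// The statics below rely on this: each class vector is created once at load
+// time and rooted with R_PreserveObject(), so the garbage collector never
+// reclaims it for the lifetime of the package.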
+
+// returns the namespace environment for package `name`
+SEXP precious_namespace(std::string name) {
+ SEXP s_name = PROTECT(cpp11::writable::strings({name}));
+ SEXP ns = R_FindNamespace(s_name);
+ R_PreserveObject(ns);
+ UNPROTECT(1);
+
+ return ns;
+}
+SEXP data::classes_POSIXct = precious(cpp11::writable::strings({"POSIXct", "POSIXt"}));
+SEXP data::classes_metadata_r = precious(cpp11::writable::strings({"arrow_r_metadata"}));
+SEXP data::classes_vctrs_list_of =
+ precious(cpp11::writable::strings({"vctrs_list_of", "vctrs_vctr", "list"}));
+SEXP data::classes_tbl_df =
+ precious(cpp11::writable::strings({"tbl_df", "tbl", "data.frame"}));
+
+SEXP data::classes_arrow_binary =
+ precious(cpp11::writable::strings({"arrow_binary", "vctrs_vctr", "list"}));
+SEXP data::classes_arrow_large_binary =
+ precious(cpp11::writable::strings({"arrow_large_binary", "vctrs_vctr", "list"}));
+SEXP data::classes_arrow_fixed_size_binary =
+ precious(cpp11::writable::strings({"arrow_fixed_size_binary", "vctrs_vctr", "list"}));
+SEXP data::classes_factor = precious(cpp11::writable::strings({"factor"}));
+SEXP data::classes_ordered = precious(cpp11::writable::strings({"ordered", "factor"}));
+
+SEXP data::classes_arrow_list = precious(
+ cpp11::writable::strings({"arrow_list", "vctrs_list_of", "vctrs_vctr", "list"}));
+SEXP data::classes_arrow_large_list = precious(cpp11::writable::strings(
+ {"arrow_large_list", "vctrs_list_of", "vctrs_vctr", "list"}));
+SEXP data::classes_arrow_fixed_size_list = precious(cpp11::writable::strings(
+ {"arrow_fixed_size_list", "vctrs_list_of", "vctrs_vctr", "list"}));
+
+SEXP data::names_metadata = precious(cpp11::writable::strings({"attributes", "columns"}));
+
+SEXP ns::arrow = precious_namespace("arrow");
+
+void inspect(SEXP obj) {
+ SEXP call_inspect = PROTECT(Rf_lang2(symbols::inspect, obj));
+ SEXP call_internal = PROTECT(Rf_lang2(symbols::dot_Internal, call_inspect));
+ Rf_eval(call_internal, R_GlobalEnv);
+ UNPROTECT(2);
+}
+
+} // namespace r
+} // namespace arrow
diff --git a/src/arrow/r/src/table.cpp b/src/arrow/r/src/table.cpp
new file mode 100644
index 000000000..68adefcfd
--- /dev/null
+++ b/src/arrow/r/src/table.cpp
@@ -0,0 +1,286 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+#if defined(ARROW_R_WITH_ARROW)
+
+#include <arrow/array/array_base.h>
+#include <arrow/table.h>
+#include <arrow/util/key_value_metadata.h>
+
+// [[arrow::export]]
+int Table__num_columns(const std::shared_ptr<arrow::Table>& x) {
+ return x->num_columns();
+}
+
+// [[arrow::export]]
+int Table__num_rows(const std::shared_ptr<arrow::Table>& x) { return x->num_rows(); }
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Schema> Table__schema(const std::shared_ptr<arrow::Table>& x) {
+ return x->schema();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Table> Table__ReplaceSchemaMetadata(
+ const std::shared_ptr<arrow::Table>& x, cpp11::strings metadata) {
+ auto vec_metadata = cpp11::as_cpp<std::vector<std::string>>(metadata);
+ auto names_metadata = cpp11::as_cpp<std::vector<std::string>>(metadata.names());
+ auto kv = std::shared_ptr<arrow::KeyValueMetadata>(
+ new arrow::KeyValueMetadata(names_metadata, vec_metadata));
+ return x->ReplaceSchemaMetadata(kv);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::ChunkedArray> Table__column(
+ const std::shared_ptr<arrow::Table>& table, R_xlen_t i) {
+ arrow::r::validate_index(i, table->num_columns());
+ return table->column(i);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Field> Table__field(const std::shared_ptr<arrow::Table>& table,
+ R_xlen_t i) {
+ arrow::r::validate_index(i, table->num_columns());
+ return table->field(i);
+}
+
+// [[arrow::export]]
+cpp11::list Table__columns(const std::shared_ptr<arrow::Table>& table) {
+ auto nc = table->num_columns();
+ std::vector<std::shared_ptr<arrow::ChunkedArray>> res(nc);
+ for (int i = 0; i < nc; i++) {
+ res[i] = table->column(i);
+ }
+ return arrow::r::to_r_list(res);
+}
+
+// [[arrow::export]]
+std::vector<std::string> Table__ColumnNames(const std::shared_ptr<arrow::Table>& table) {
+ return table->ColumnNames();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Table> Table__RenameColumns(
+ const std::shared_ptr<arrow::Table>& table, const std::vector<std::string>& names) {
+ return ValueOrStop(table->RenameColumns(names));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Table> Table__Slice1(const std::shared_ptr<arrow::Table>& table,
+ R_xlen_t offset) {
+ arrow::r::validate_slice_offset(offset, table->num_rows());
+ return table->Slice(offset);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Table> Table__Slice2(const std::shared_ptr<arrow::Table>& table,
+ R_xlen_t offset, R_xlen_t length) {
+ arrow::r::validate_slice_offset(offset, table->num_rows());
+ arrow::r::validate_slice_length(length, table->num_rows() - offset);
+ return table->Slice(offset, length);
+}
+
+// [[arrow::export]]
+bool Table__Equals(const std::shared_ptr<arrow::Table>& lhs,
+ const std::shared_ptr<arrow::Table>& rhs, bool check_metadata) {
+  return lhs->Equals(*rhs, check_metadata);
+}
+
+// [[arrow::export]]
+bool Table__Validate(const std::shared_ptr<arrow::Table>& table) {
+ StopIfNotOk(table->Validate());
+ return true;
+}
+
+// [[arrow::export]]
+bool Table__ValidateFull(const std::shared_ptr<arrow::Table>& table) {
+ StopIfNotOk(table->ValidateFull());
+ return true;
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::ChunkedArray> Table__GetColumnByName(
+ const std::shared_ptr<arrow::Table>& table, const std::string& name) {
+ return table->GetColumnByName(name);
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Table> Table__RemoveColumn(
+ const std::shared_ptr<arrow::Table>& table, R_xlen_t i) {
+ return ValueOrStop(table->RemoveColumn(i));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Table> Table__AddColumn(
+ const std::shared_ptr<arrow::Table>& table, R_xlen_t i,
+ const std::shared_ptr<arrow::Field>& field,
+ const std::shared_ptr<arrow::ChunkedArray>& column) {
+ return ValueOrStop(table->AddColumn(i, field, column));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Table> Table__SetColumn(
+ const std::shared_ptr<arrow::Table>& table, R_xlen_t i,
+ const std::shared_ptr<arrow::Field>& field,
+ const std::shared_ptr<arrow::ChunkedArray>& column) {
+ return ValueOrStop(table->SetColumn(i, field, column));
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Table> Table__SelectColumns(
+ const std::shared_ptr<arrow::Table>& table, const std::vector<int>& indices) {
+ return ValueOrStop(table->SelectColumns(indices));
+}
+
+namespace arrow {
+namespace r {
+
+arrow::Status InferSchemaFromDots(SEXP lst, SEXP schema_sxp, int num_fields,
+ std::shared_ptr<arrow::Schema>& schema) {
+ // maybe a schema was given
+ if (Rf_inherits(schema_sxp, "Schema")) {
+ schema = cpp11::as_cpp<std::shared_ptr<arrow::Schema>>(schema_sxp);
+ return arrow::Status::OK();
+ }
+
+ if (!Rf_isNull(schema_sxp)) {
+ return arrow::Status::RError("`schema` must be an arrow::Schema or NULL");
+ }
+
+ // infer the schema from the `...`
+ std::vector<std::shared_ptr<arrow::Field>> fields(num_fields);
+
+ auto extract_one_field = [&fields](int j, SEXP x, std::string name) {
+ if (Rf_inherits(x, "ChunkedArray")) {
+ fields[j] = arrow::field(
+ name, cpp11::as_cpp<std::shared_ptr<arrow::ChunkedArray>>(x)->type());
+ } else if (Rf_inherits(x, "Array")) {
+ fields[j] =
+ arrow::field(name, cpp11::as_cpp<std::shared_ptr<arrow::Array>>(x)->type());
+ } else {
+ // TODO: we just need the type at this point
+ fields[j] = arrow::field(name, arrow::r::InferArrowType(x));
+ }
+ };
+ arrow::r::TraverseDots(lst, num_fields, extract_one_field);
+
+ schema = std::make_shared<arrow::Schema>(std::move(fields));
+
+ return arrow::Status::OK();
+}
+
+SEXP arrow_attributes(SEXP x, bool only_top_level) {
+ SEXP call = PROTECT(
+ Rf_lang3(arrow::r::symbols::arrow_attributes, x, Rf_ScalarLogical(only_top_level)));
+ SEXP att = Rf_eval(call, arrow::r::ns::arrow);
+ UNPROTECT(1);
+ return att;
+}
+
+SEXP CollectColumnMetadata(SEXP lst, int num_fields, bool& has_metadata) {
+ // Preallocate for the lambda to fill in
+ cpp11::writable::list metadata_columns(num_fields);
+
+ cpp11::writable::strings metadata_columns_names(num_fields);
+
+ auto extract_one_metadata = [&metadata_columns, &metadata_columns_names, &has_metadata](
+ int j, SEXP x, std::string name) {
+ metadata_columns_names[j] = name;
+
+ // no metadata for arrow R6 objects
+ if (Rf_inherits(x, "ArrowObject")) {
+ return;
+ }
+ metadata_columns[j] = arrow_attributes(x, false);
+
+ if (!Rf_isNull(metadata_columns[j])) {
+ has_metadata = true;
+ }
+ };
+ arrow::r::TraverseDots(lst, num_fields, extract_one_metadata);
+
+ metadata_columns.names() = metadata_columns_names;
+ return metadata_columns;
+}
+
+arrow::Status AddMetadataFromDots(SEXP lst, int num_fields,
+ std::shared_ptr<arrow::Schema>& schema) {
+ // Preallocate the r_metadata object: list(attributes=list(), columns=namedList(fields))
+
+ cpp11::writable::list metadata(2);
+ metadata.names() = arrow::r::data::names_metadata;
+
+ bool has_metadata = false;
+
+ // "top level" attributes, only relevant if the first object is not named and a data
+ // frame
+ cpp11::strings names = Rf_getAttrib(lst, R_NamesSymbol);
+ if (names[0] == "" && Rf_inherits(VECTOR_ELT(lst, 0), "data.frame")) {
+ SEXP top_level = metadata[0] = arrow_attributes(VECTOR_ELT(lst, 0), true);
+ if (!Rf_isNull(top_level) && XLENGTH(top_level) > 0) {
+ has_metadata = true;
+ }
+ }
+
+ // recurse to get all columns metadata
+ metadata[1] = CollectColumnMetadata(lst, num_fields, has_metadata);
+
+ if (has_metadata) {
+ SEXP serialise_call =
+ PROTECT(Rf_lang2(arrow::r::symbols::serialize_arrow_r_metadata, metadata));
+ SEXP serialised = PROTECT(Rf_eval(serialise_call, arrow::r::ns::arrow));
+
+ schema = schema->WithMetadata(
+ arrow::key_value_metadata({"r"}, {CHAR(STRING_ELT(serialised, 0))}));
+
+ UNPROTECT(2);
+ }
+
+ return arrow::Status::OK();
+}
+
+} // namespace r
+} // namespace arrow
+
+// [[arrow::export]]
+bool all_record_batches(SEXP lst) {
+ R_xlen_t n = XLENGTH(lst);
+ for (R_xlen_t i = 0; i < n; i++) {
+ if (!Rf_inherits(VECTOR_ELT(lst, i), "RecordBatch")) return false;
+ }
+ return true;
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Table> Table__from_record_batches(
+ const std::vector<std::shared_ptr<arrow::RecordBatch>>& batches, SEXP schema_sxp) {
+ bool infer_schema = !Rf_inherits(schema_sxp, "Schema");
+
+ std::shared_ptr<arrow::Table> tab;
+
+ if (infer_schema) {
+ tab = ValueOrStop(arrow::Table::FromRecordBatches(std::move(batches)));
+ } else {
+ auto schema = cpp11::as_cpp<std::shared_ptr<arrow::Schema>>(schema_sxp);
+ tab = ValueOrStop(arrow::Table::FromRecordBatches(schema, std::move(batches)));
+ }
+
+ return tab;
+}
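+
+// Usage sketch from R (assuming the public Table constructor):
+//
+//   b <- arrow::record_batch(x = 1:3)
+//   tab <- arrow::Table$create(b, b)  # schema inferred from the batches
+//   tab$num_rows  # 6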
+
+#endif
diff --git a/src/arrow/r/src/threadpool.cpp b/src/arrow/r/src/threadpool.cpp
new file mode 100644
index 000000000..fb5005517
--- /dev/null
+++ b/src/arrow/r/src/threadpool.cpp
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+
+#include <arrow/io/interfaces.h>
+#include <arrow/util/parallel.h>
+
+//' View and manage the capacity of the global thread pool
+//'
+//' `GetCpuThreadPoolCapacity()` returns the number of worker threads in the
+//' thread pool to which Arrow dispatches various CPU-bound tasks. This is an
+//' ideal number, not necessarily the exact number of threads at a given point
+//' in time. You can change this number using `SetCpuThreadPoolCapacity()`.
+//'
+//' @param threads the number of worker threads in the thread pool to which
+//' Arrow dispatches various CPU-bound tasks.
+//'
+//' @return `GetCpuThreadPoolCapacity()` returns the number of worker threads.
+//' `SetCpuThreadPoolCapacity()` returns nothing.
+//' @export
+//' @name threadpool
+// [[arrow::export]]
+int GetCpuThreadPoolCapacity() { return arrow::GetCpuThreadPoolCapacity(); }
+
+//' @rdname threadpool
+//' @export
+// [[arrow::export]]
+void SetCpuThreadPoolCapacity(int threads) {
+ StopIfNotOk(arrow::SetCpuThreadPoolCapacity(threads));
+}
+
+// [[arrow::export]]
+int GetIOThreadPoolCapacity() { return arrow::io::GetIOThreadPoolCapacity(); }
+
+// [[arrow::export]]
+void SetIOThreadPoolCapacity(int threads) {
+  StopIfNotOk(arrow::io::SetIOThreadPoolCapacity(threads));
+}
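+
+// Usage sketch from R: the CPU pair is wrapped by the package as
+// cpu_count() / set_cpu_count(); the IO pair is assumed to be wrapped
+// analogously.
+//
+//   arrow::set_cpu_count(4)
+//   arrow::cpu_count()  # 4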
+
+#endif
diff --git a/src/arrow/r/src/type_infer.cpp b/src/arrow/r/src/type_infer.cpp
new file mode 100644
index 000000000..022a29ea5
--- /dev/null
+++ b/src/arrow/r/src/type_infer.cpp
@@ -0,0 +1,202 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+
+#include "./arrow_types.h"
+#include "./arrow_vctrs.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+#include <arrow/array/array_base.h>
+
+namespace arrow {
+namespace r {
+
+static inline std::shared_ptr<arrow::DataType> IndexTypeForFactors(int n_factors) {
+ if (n_factors < INT8_MAX) {
+ return arrow::int8();
+ } else if (n_factors < INT16_MAX) {
+ return arrow::int16();
+ } else {
+ return arrow::int32();
+ }
+}
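+
+// For example: 100 levels fit int8 indices, 1000 levels need int16, and
+// 100000 levels fall through to int32.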
+
+std::shared_ptr<arrow::DataType> InferArrowTypeFromFactor(SEXP factor) {
+ SEXP factors = Rf_getAttrib(factor, R_LevelsSymbol);
+ auto index_type = IndexTypeForFactors(Rf_length(factors));
+ bool is_ordered = Rf_inherits(factor, "ordered");
+ return dictionary(index_type, arrow::utf8(), is_ordered);
+}
+
+template <int VectorType>
+std::shared_ptr<arrow::DataType> InferArrowTypeFromVector(SEXP x) {
+ cpp11::stop("Unknown vector type: ", VectorType);
+}
+
+template <>
+std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<ENVSXP>(SEXP x) {
+ if (Rf_inherits(x, "Array")) {
+ return cpp11::as_cpp<std::shared_ptr<arrow::Array>>(x)->type();
+ }
+
+ cpp11::stop("Unrecognized vector instance for type ENVSXP");
+}
+
+template <>
+std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<LGLSXP>(SEXP x) {
+ return Rf_inherits(x, "vctrs_unspecified") ? null() : boolean();
+}
+
+template <>
+std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<INTSXP>(SEXP x) {
+ if (Rf_isFactor(x)) {
+ return InferArrowTypeFromFactor(x);
+ } else if (Rf_inherits(x, "Date")) {
+ return date32();
+ } else if (Rf_inherits(x, "POSIXct")) {
+ auto tzone_sexp = Rf_getAttrib(x, symbols::tzone);
+ if (Rf_isNull(tzone_sexp)) {
+ return timestamp(TimeUnit::MICRO);
+ } else {
+ return timestamp(TimeUnit::MICRO, CHAR(STRING_ELT(tzone_sexp, 0)));
+ }
+ }
+ return int32();
+}
+
+template <>
+std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<REALSXP>(SEXP x) {
+ if (Rf_inherits(x, "Date")) {
+ return date32();
+ }
+ if (Rf_inherits(x, "POSIXct")) {
+ auto tzone_sexp = Rf_getAttrib(x, symbols::tzone);
+ if (Rf_isNull(tzone_sexp)) {
+ return timestamp(TimeUnit::MICRO);
+ } else {
+ return timestamp(TimeUnit::MICRO, CHAR(STRING_ELT(tzone_sexp, 0)));
+ }
+ }
+ if (Rf_inherits(x, "integer64")) {
+ return int64();
+ }
+ if (Rf_inherits(x, "difftime")) {
+ return time32(TimeUnit::SECOND);
+ }
+ return float64();
+}
+
+template <>
+std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<STRSXP>(SEXP x) {
+ return cpp11::unwind_protect([&] {
+ R_xlen_t n = XLENGTH(x);
+
+ int64_t size = 0;
+
+ for (R_xlen_t i = 0; i < n; i++) {
+ size += arrow::r::unsafe::r_string_size(STRING_ELT(x, i));
+ if (size > arrow::kBinaryMemoryLimit) {
+        // Exceeds the 2GB capacity of the utf8 type, so use large_utf8
+ return large_utf8();
+ }
+ }
+
+ return utf8();
+ });
+}
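+
+// For example, a character vector whose cumulative string size crosses the
+// 2GB offset limit (such as the output of make_big_string() in the test
+// helpers) infers large_utf8(); anything smaller infers utf8().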
+
+static inline std::shared_ptr<arrow::DataType> InferArrowTypeFromDataFrame(
+ cpp11::list x) {
+ R_xlen_t n = x.size();
+ cpp11::strings names(x.attr(R_NamesSymbol));
+ std::vector<std::shared_ptr<arrow::Field>> fields(n);
+ for (R_xlen_t i = 0; i < n; i++) {
+ fields[i] = arrow::field(names[i], InferArrowType(x[i]));
+ }
+ return arrow::struct_(std::move(fields));
+}
+
+template <>
+std::shared_ptr<arrow::DataType> InferArrowTypeFromVector<VECSXP>(SEXP x) {
+ if (Rf_inherits(x, "data.frame") || Rf_inherits(x, "POSIXlt")) {
+ return InferArrowTypeFromDataFrame(x);
+ } else {
+ // some known special cases
+ if (Rf_inherits(x, "arrow_fixed_size_binary")) {
+ SEXP byte_width = Rf_getAttrib(x, symbols::byte_width);
+ if (Rf_isNull(byte_width) || TYPEOF(byte_width) != INTSXP ||
+ XLENGTH(byte_width) != 1) {
+ cpp11::stop("malformed arrow_fixed_size_binary object");
+ }
+ return arrow::fixed_size_binary(INTEGER(byte_width)[0]);
+ }
+
+ if (Rf_inherits(x, "arrow_binary")) {
+ return arrow::binary();
+ }
+
+ if (Rf_inherits(x, "arrow_large_binary")) {
+ return arrow::large_binary();
+ }
+
+ SEXP ptype = Rf_getAttrib(x, symbols::ptype);
+ if (Rf_isNull(ptype)) {
+ if (XLENGTH(x) == 0) {
+        cpp11::stop(
+            "Requires at least one element to infer the value type of a list vector");
+ }
+
+ ptype = VECTOR_ELT(x, 0);
+ }
+
+ return arrow::list(InferArrowType(ptype));
+ }
+}
+
+std::shared_ptr<arrow::DataType> InferArrowType(SEXP x) {
+ switch (TYPEOF(x)) {
+ case ENVSXP:
+ return InferArrowTypeFromVector<ENVSXP>(x);
+ case LGLSXP:
+ return InferArrowTypeFromVector<LGLSXP>(x);
+ case INTSXP:
+ return InferArrowTypeFromVector<INTSXP>(x);
+ case REALSXP:
+ return InferArrowTypeFromVector<REALSXP>(x);
+ case RAWSXP:
+ return uint8();
+ case STRSXP:
+ return InferArrowTypeFromVector<STRSXP>(x);
+ case VECSXP:
+ return InferArrowTypeFromVector<VECSXP>(x);
+ default:
+ break;
+ }
+
+ cpp11::stop("Cannot infer type from vector");
+}
+
+} // namespace r
+} // namespace arrow
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> Array__infer_type(SEXP x) {
+ return arrow::r::InferArrowType(x);
+}
+
+#endif
diff --git a/src/arrow/r/tests/testthat.R b/src/arrow/r/tests/testthat.R
new file mode 100644
index 000000000..d0f5b1e0d
--- /dev/null
+++ b/src/arrow/r/tests/testthat.R
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+library(testthat)
+library(arrow)
+library(tibble)
+
+if (identical(tolower(Sys.getenv("ARROW_R_DEV", "false")), "true")) {
+ arrow_reporter <- MultiReporter$new(list(CheckReporter$new(), LocationReporter$new()))
+} else {
+ arrow_reporter <- check_reporter()
+}
+test_check("arrow", reporter = arrow_reporter)
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet b/src/arrow/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet
new file mode 100644
index 000000000..3394be241
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_lz4.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_lz4.feather
new file mode 100644
index 000000000..d91acd0cc
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_lz4.feather
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_uncompressed.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_uncompressed.feather
new file mode 100644
index 000000000..0198024ec
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_uncompressed.feather
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_zstd.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_zstd.feather
new file mode 100644
index 000000000..f6788231c
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_0.17.0_zstd.feather
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1.parquet b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1.parquet
new file mode 100644
index 000000000..e1d589bf0
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1.parquet
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_lz4.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_lz4.feather
new file mode 100644
index 000000000..f3a71435a
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_lz4.feather
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_uncompressed.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_uncompressed.feather
new file mode 100644
index 000000000..1188ac669
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_uncompressed.feather
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_zstd.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_zstd.feather
new file mode 100644
index 000000000..056b26c17
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_1.0.1_zstd.feather
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0.parquet b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0.parquet
new file mode 100644
index 000000000..6c5911560
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0.parquet
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_lz4.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_lz4.feather
new file mode 100644
index 000000000..b65da7234
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_lz4.feather
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_uncompressed.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_uncompressed.feather
new file mode 100644
index 000000000..508903cb4
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_uncompressed.feather
Binary files differ
diff --git a/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_zstd.feather b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_zstd.feather
new file mode 100644
index 000000000..39c829fda
--- /dev/null
+++ b/src/arrow/r/tests/testthat/golden-files/data-arrow_2.0.0_zstd.feather
Binary files differ
diff --git a/src/arrow/r/tests/testthat/helper-arrow.R b/src/arrow/r/tests/testthat/helper-arrow.R
new file mode 100644
index 000000000..545f2d044
--- /dev/null
+++ b/src/arrow/r/tests/testthat/helper-arrow.R
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Wrap testthat::test_that with a check for the C++ library
+options(..skip.tests = !arrow:::arrow_available())
+
+set.seed(1)
+
+MAX_INT <- 2147483647L
+
+# Make sure this is unset
+Sys.setenv(ARROW_PRE_0_15_IPC_FORMAT = "")
+
+# use the C locale for string collation (ARROW-12046)
+Sys.setlocale("LC_COLLATE", "C")
+
+# Set English language so that error messages aren't internationalized
+# (R CMD check does this, but in case you're running outside of check)
+Sys.setenv(LANGUAGE = "en")
+
+with_language <- function(lang, expr) {
+ old <- Sys.getenv("LANGUAGE")
+ # Check what this message is before changing languages; this will
+  # trigger caching the translations if the OS does that (some do).
+ # If the OS does cache, then we can't test changing languages safely.
+ before <- i18ize_error_messages()
+ Sys.setenv(LANGUAGE = lang)
+ on.exit({
+ Sys.setenv(LANGUAGE = old)
+ .cache$i18ized_error_pattern <<- NULL
+ })
+ if (!identical(before, i18ize_error_messages())) {
+ skip(paste("This OS either does not support changing languages to", lang, "or it caches translations"))
+ }
+ force(expr)
+}
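+
+# Usage sketch (hypothetical test body): run an expectation under a French
+# locale, skipping automatically if this OS caches translations:
+#
+# with_language("fr", {
+#   expect_error(stop("boom"))
+# })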
+
+test_that <- function(what, code) {
+ testthat::test_that(what, {
+ skip_if(getOption("..skip.tests", TRUE), "arrow C++ library not available")
+ code
+ })
+}
+
+# Wrapper to run tests that only touch R code even when the C++ library isn't
+# available (so that at least some tests are run on those platforms)
+r_only <- function(code) {
+ withr::with_options(list(..skip.tests = FALSE), code)
+}
+
+make_temp_dir <- function() {
+ path <- tempfile()
+ dir.create(path)
+ normalizePath(path, winslash = "/")
+}
diff --git a/src/arrow/r/tests/testthat/helper-data.R b/src/arrow/r/tests/testthat/helper-data.R
new file mode 100644
index 000000000..c693e84b2
--- /dev/null
+++ b/src/arrow/r/tests/testthat/helper-data.R
@@ -0,0 +1,191 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+example_data <- tibble::tibble(
+ int = c(1:3, NA_integer_, 5:10),
+ dbl = c(1:8, NA, 10) + .1,
+ dbl2 = rep(5, 10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ false = logical(10),
+ chr = letters[c(1:5, NA, 7:10)],
+ fct = factor(letters[c(1:4, NA, NA, 7:10)])
+)
+
+example_with_metadata <- tibble::tibble(
+ a = structure("one", class = "special_string"),
+ b = 2,
+ c = tibble::tibble(
+ c1 = structure("inner", extra_attr = "something"),
+ c2 = 4,
+ c3 = 50
+ ),
+ d = "four"
+)
+
+attr(example_with_metadata, "top_level") <- list(
+ field_one = 12,
+ field_two = "more stuff"
+)
+
+haven_data <- tibble::tibble(
+ num = structure(c(5.1, 4.9),
+ format.spss = "F8.2"
+ ),
+ cat_int = structure(c(3, 1),
+ format.spss = "F8.0",
+ labels = c(first = 1, second = 2, third = 3),
+ class = c("haven_labelled", "vctrs_vctr", "double")
+ ),
+ cat_chr = structure(c("B", "B"),
+ labels = c(Alpha = "A", Beta = "B"),
+ class = c("haven_labelled", "vctrs_vctr", "character")
+ )
+)
+
+example_with_times <- tibble::tibble(
+ date = Sys.Date() + 1:10,
+ posixct = lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10,
+ posixct_tz = lubridate::ymd_hms("2018-10-07 19:04:05", tz = "US/Eastern") + 1:10,
+ posixlt = as.POSIXlt(lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10),
+ posixlt_tz = as.POSIXlt(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "US/Eastern") + 1:10)
+)
+
+verses <- list(
+ # Since we tend to test with dataframes with 10 rows, here are verses from
+ # "Milonga del moro judío", by Jorge Drexler. They are décimas, 10-line
+ # poems with a particular meter and rhyme scheme.
+ # (They also have non-ASCII characters, which is nice for testing)
+ c(
+ "Por cada muro, un lamento",
+ "En Jerusalén la dorada",
+ "Y mil vidas malgastadas",
+ "Por cada mandamiento",
+ "Yo soy polvo de tu viento",
+ "Y aunque sangro de tu herida",
+ "Y cada piedra querida",
+ "Guarda mi amor más profundo",
+ "No hay una piedra en el mundo",
+ "Que valga lo que una vida"
+ ),
+ c(
+ "No hay muerto que no me duela",
+ "No hay un bando ganador",
+ "No hay nada más que dolor",
+ "Y otra vida que se vuela",
+ "La guerra es muy mala escuela",
+ "No importa el disfraz que viste",
+ "Perdonen que no me aliste",
+ "Bajo ninguna bandera",
+ "Vale más cualquier quimera",
+ "Que un trozo de tela triste"
+ ),
+ c(
+ "Y a nadie le di permiso",
+ "Para matar en mi nombre",
+ "Un hombre no es más que un hombre",
+ "Y si hay Dios, así lo quiso",
+ "El mismo suelo que piso",
+ "Seguirá, yo me habré ido",
+ "Rumbo también del olvido",
+ "No hay doctrina que no vaya",
+ "Y no hay pueblo que no se haya",
+ "Creído el pueblo elegido"
+ )
+)
+
+make_big_string <- function() {
+ # This creates a character vector that would exceed the capacity of BinaryArray
+ rep(purrr::map_chr(2047:2050, ~ paste(sample(letters, ., replace = TRUE), collapse = "")), 2^18)
+}
+
+make_random_string_of_size <- function(size = 1) {
+ purrr::map_chr(1000 * size, ~ paste(sample(letters, ., replace = TRUE), collapse = ""))
+}
+
+make_string_of_size <- function(size = 1) {
+ paste(rep(letters, length.out = 1000 * size), collapse = "")
+}
+
+example_with_extra_metadata <- example_with_metadata
+attributes(example_with_extra_metadata$b) <- list(lots = rep(make_string_of_size(1), 100))
+
+example_with_logical_factors <- tibble::tibble(
+ starting_a_fight = factor(c(FALSE, TRUE, TRUE, TRUE)),
+ consoling_a_child = factor(c(TRUE, FALSE, TRUE, TRUE)),
+ petting_a_dog = factor(c(TRUE, TRUE, FALSE, TRUE)),
+ saying = c(
+ "shhhhh, it's ok",
+ "you wanna go outside?",
+ "you want your mommy?",
+ "hey buddy"
+ )
+)
+
+# The values in each column of this tibble are in ascending order. There are
+# some ties, so tests should use two or more columns to ensure deterministic
+# sort order. The Arrow C++ library orders strings lexicographically as byte
+# strings. The order of a string array sorted by Arrow will not match the order
+# of an equivalent character vector sorted by R unless you set the R collation
+# locale to "C" by running: Sys.setlocale("LC_COLLATE", "C")
+# These test scripts set that, but if you are running individual tests you might
+# need to set it manually. When finished, you can restore the default
+# collation locale by running: Sys.setlocale("LC_COLLATE")
+# In the future, the string collation locale used by the Arrow C++ library might
+# be configurable (ARROW-12046).
+example_data_for_sorting <- tibble::tibble(
+ int = c(-.Machine$integer.max, -101L, -100L, 0L, 0L, 1L, 100L, 1000L, .Machine$integer.max, NA_integer_),
+ dbl = c(
+ -Inf, -.Machine$double.xmax, -.Machine$double.xmin, 0, .Machine$double.xmin,
+ pi, .Machine$double.xmax, Inf, NaN, NA_real_
+ ),
+ chr = c("", "", "\"", "&", "ABC", "NULL", "a", "abc", "zzz", NA_character_),
+ lgl = c(rep(FALSE, 4L), rep(TRUE, 5L), NA),
+ dttm = lubridate::ymd_hms(c(
+ "0000-01-01 00:00:00",
+ "1919-05-29 13:08:55",
+ "1955-06-20 04:10:42",
+ "1973-06-30 11:38:41",
+ "1987-03-29 12:49:47",
+ "1991-06-11 19:07:01",
+ NA_character_,
+ "2017-08-21 18:26:40",
+ "2017-08-21 18:26:40",
+ "9999-12-31 23:59:59"
+ )),
+ grp = c(rep("A", 5), rep("B", 5))
+)
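+
+# A minimal sketch of the collation setup described above (for interactive use):
+#   old_collate <- Sys.getlocale("LC_COLLATE")
+#   Sys.setlocale("LC_COLLATE", "C") # byte-wise collation, matching Arrow C++
+#   sort(example_data_for_sorting$chr)
+#   Sys.setlocale("LC_COLLATE", old_collate) # restore the previous locale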
+
+# For Dataset tests
+first_date <- lubridate::ymd_hms("2015-04-29 03:12:39")
+df1 <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+ chr = letters[1:10],
+ fct = factor(LETTERS[1:10]),
+ ts = first_date + lubridate::days(1:10)
+)
+
+second_date <- lubridate::ymd_hms("2017-03-09 07:01:02")
+df2 <- tibble::tibble(
+ int = 101:110,
+ dbl = c(as.numeric(51:59), NaN),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+ chr = letters[10:1],
+ fct = factor(LETTERS[10:1]),
+ ts = second_date + lubridate::days(10:1)
+)
diff --git a/src/arrow/r/tests/testthat/helper-expectation.R b/src/arrow/r/tests/testthat/helper-expectation.R
new file mode 100644
index 000000000..ef6142bb4
--- /dev/null
+++ b/src/arrow/r/tests/testthat/helper-expectation.R
@@ -0,0 +1,320 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+expect_as_vector <- function(x, y, ...) {
+ expect_equal(as.vector(x), y, ...)
+}
+
+expect_data_frame <- function(x, y, ...) {
+ expect_equal(as.data.frame(x), y, ...)
+}
+
+expect_r6_class <- function(object, class) {
+ expect_s3_class(object, class)
+ expect_s3_class(object, "R6")
+}
+
+#' Mask `testthat::expect_equal()` in order to compare ArrowObjects using their
+#' `Equals` methods from the C++ library.
+expect_equal <- function(object, expected, ignore_attr = FALSE, ..., info = NULL, label = NULL) {
+ if (inherits(object, "ArrowObject") && inherits(expected, "ArrowObject")) {
+ mc <- match.call()
+ expect_true(
+ all.equal(object, expected, check.attributes = !ignore_attr),
+ info = info,
+ label = paste(rlang::as_label(mc[["object"]]), "==", rlang::as_label(mc[["expected"]]))
+ )
+ } else {
+ testthat::expect_equal(object, expected, ignore_attr = ignore_attr, ..., info = info, label = label)
+ }
+}
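+
+# Hypothetical usage sketch: when both arguments are Arrow objects, the masked
+# expect_equal() above compares them via the C++ Equals method; otherwise it
+# falls back to testthat::expect_equal().
+#   expect_equal(Array$create(1:3), Array$create(1:3)) # compared via Equals
+#   expect_equal(1:3, 1:3)                             # plain testthat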
+
+expect_type_equal <- function(object, expected, ...) {
+ if (is.Array(object)) {
+ object <- object$type
+ }
+ if (is.Array(expected)) {
+ expected <- expected$type
+ }
+ expect_equal(object, expected, ...)
+}
+
+expect_match_arg_error <- function(object, values = c()) {
+ expect_error(object, paste0("'arg' .*", paste(dQuote(values), collapse = ", ")))
+}
+
+expect_deprecated <- expect_warning
+
+verify_output <- function(...) {
+ if (isTRUE(grepl("conda", R.Version()$platform))) {
+ skip("On conda")
+ }
+ testthat::verify_output(...)
+}
+
+#' Ensure that dplyr methods on Arrow objects return the same as for data frames
+#'
+#' This function compares the output of running a dplyr expression on a tibble
+#' or data.frame object against the output of the same expression run on
+#' Arrow Table and RecordBatch objects.
+#'
+#' @param expr A dplyr pipeline which must have `.input` as its start
+#' @param tbl A tibble or data.frame which will be substituted for `.input`
+#' @param skip_record_batch The skip message to show (if you should skip the
+#' RecordBatch test)
+#' @param skip_table The skip message to show (if you should skip the Table test)
+#' @param warning The expected warning from the RecordBatch and Table comparison
+#' paths, passed to `expect_warning()`. Special values:
+#' * `NA` (the default) for ensuring no warning message
+#' * `TRUE` is a special case to mean to check for the
+#' "not supported in Arrow; pulling data into R" message.
+#' @param ... additional arguments, passed to `expect_equal()`
+compare_dplyr_binding <- function(expr,
+ tbl,
+ skip_record_batch = NULL,
+ skip_table = NULL,
+ warning = NA,
+ ...) {
+
+ # Quote the contents of `expr` so that we can evaluate it a few different ways
+ expr <- rlang::enquo(expr)
+ # Get the expected output by evaluating expr on the .input data.frame using regular dplyr
+ expected <- rlang::eval_tidy(expr, rlang::new_data_mask(rlang::env(.input = tbl)))
+
+ if (isTRUE(warning)) {
+ # Special-case the simple warning:
+ # TODO: ARROW-13362 pick one of in or by and use it everywhere
+ warning <- "not supported (in|by) Arrow; pulling data into R"
+ }
+
+ skip_msg <- NULL
+
+ # Evaluate `expr` on a RecordBatch object and compare with `expected`
+ if (is.null(skip_record_batch)) {
+ expect_warning(
+ via_batch <- rlang::eval_tidy(
+ expr,
+ rlang::new_data_mask(rlang::env(.input = record_batch(tbl)))
+ ),
+ warning
+ )
+ expect_equal(via_batch, expected, ...)
+ } else {
+ skip_msg <- c(skip_msg, skip_record_batch)
+ }
+
+ # Evaluate `expr` on a Table object and compare with `expected`
+ if (is.null(skip_table)) {
+ expect_warning(
+ via_table <- rlang::eval_tidy(
+ expr,
+ rlang::new_data_mask(rlang::env(.input = arrow_table(tbl)))
+ ),
+ warning
+ )
+ expect_equal(via_table, expected, ...)
+ } else {
+ skip_msg <- c(skip_msg, skip_table)
+ }
+
+ if (!is.null(skip_msg)) {
+ skip(paste(skip_msg, collapse = "\n"))
+ }
+}
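+
+# A minimal usage sketch (hypothetical pipeline; `example_data` is a tibble
+# defined in the test helpers): the expression is evaluated three times, with
+# `.input` bound to the tibble, to record_batch(tbl), and to arrow_table(tbl),
+# and all three results must agree.
+#   compare_dplyr_binding(
+#     .input %>% filter(int > 5) %>% select(int, chr) %>% collect(),
+#     example_data
+#   )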
+
+#' Assert that Arrow dplyr methods error in the same way as methods on data.frame
+#'
+#' Compares the error message generated when running an expression on R objects
+#' against the error message generated by running the same expression on Arrow
+#' Tables and RecordBatches.
+#'
+#' @param expr A dplyr pipeline which must have `.input` as its start
+#' @param tbl A tibble or data.frame which will be substituted for `.input`
+#' @param ... additional arguments, passed to `expect_error()`
+compare_dplyr_error <- function(expr,
+ tbl,
+ ...) {
+ # ensure we have supplied tbl
+ force(tbl)
+
+ expr <- rlang::enquo(expr)
+ msg <- tryCatch(
+ rlang::eval_tidy(expr, rlang::new_data_mask(rlang::env(.input = tbl))),
+ error = function(e) {
+ msg <- conditionMessage(e)
+
+ # The error here is of the form:
+ #
+ # Problem with `filter()` .input `..1`.
+ # x object 'b_var' not found
+ # ℹ Input `..1` is `chr == b_var`.
+ #
+ # but what we really care about is the `x` block
+ # so (temporarily) let's pull those blocks out when we find them
+ pattern <- i18ize_error_messages()
+
+ if (grepl(pattern, msg)) {
+ msg <- sub(paste0("^.*(", pattern, ").*$"), "\\1", msg)
+ }
+ msg
+ }
+ )
+ # make sure msg is a character object (i.e. there has been an error)
+ # If it did not error, we would get a data.frame or whatever
+ # This expectation will tell us "dplyr on data.frame errored is not TRUE"
+ expect_true(identical(typeof(msg), "character"), label = "dplyr on data.frame errored")
+
+ expect_error(
+ rlang::eval_tidy(
+ expr,
+ rlang::new_data_mask(rlang::env(.input = record_batch(tbl)))
+ ),
+ msg,
+ ...
+ )
+ expect_error(
+ rlang::eval_tidy(
+ expr,
+ rlang::new_data_mask(rlang::env(.input = arrow_table(tbl)))
+ ),
+ msg,
+ ...
+ )
+}
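+
+# A minimal usage sketch (hypothetical): referencing an undefined variable
+# should produce the same error on the tibble and on the Arrow objects.
+#   compare_dplyr_error(
+#     .input %>% filter(chr == b_var),
+#     example_data
+#   )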
+
+#' Compares the output of running an expression on R vectors against the output
+#' of the same expression run on Arrow Arrays and ChunkedArrays.
+#'
+#' @param expr A vectorized R expression which must have `.input` as its start
+#' @param vec A vector which will be substituted for `.input`
+#' @param skip_array The skip message to show (if you should skip the Array test)
+#' @param skip_chunked_array The skip message to show (if you should skip the ChunkedArray test)
+#' @param ignore_attr Ignore differences in specified attributes?
+#' @param ... additional arguments, passed to `expect_as_vector()`
+compare_expression <- function(expr,
+ vec,
+ skip_array = NULL,
+ skip_chunked_array = NULL,
+ ignore_attr = FALSE,
+ ...) {
+ expr <- rlang::enquo(expr)
+ expected <- rlang::eval_tidy(expr, rlang::new_data_mask(rlang::env(.input = vec)))
+ skip_msg <- NULL
+
+ if (is.null(skip_array)) {
+ via_array <- rlang::eval_tidy(
+ expr,
+ rlang::new_data_mask(rlang::env(.input = Array$create(vec)))
+ )
+    expect_as_vector(via_array, expected, ignore_attr = ignore_attr, ...)
+ } else {
+ skip_msg <- c(skip_msg, skip_array)
+ }
+
+ if (is.null(skip_chunked_array)) {
+ # split input vector into two to exercise ChunkedArray with >1 chunk
+ split_vector <- split_vector_as_list(vec)
+
+ via_chunked <- rlang::eval_tidy(
+ expr,
+ rlang::new_data_mask(rlang::env(.input = ChunkedArray$create(split_vector[[1]], split_vector[[2]])))
+ )
+    expect_as_vector(via_chunked, expected, ignore_attr = ignore_attr, ...)
+ } else {
+ skip_msg <- c(skip_msg, skip_chunked_array)
+ }
+
+ if (!is.null(skip_msg)) {
+ skip(paste(skip_msg, collapse = "\n"))
+ }
+}
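+
+# For illustration (hypothetical expression): `.input` is bound in turn to the
+# R vector, to Array$create(vec), and to a two-chunk ChunkedArray, and the
+# results converted back to R must all match.
+#   compare_expression(.input + 1, c(1, 2, 3))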
+
+#' Compares the error message generated when running an expression on R objects
+#' against the error message generated by running the same expression on Arrow
+#' Arrays and ChunkedArrays.
+#'
+#' @param expr An R expression which must have `.input` as its start
+#' @param vec A vector which will be substituted for `.input`
+#' @param skip_array The skip message to show (if you should skip the Array test)
+#' @param skip_chunked_array The skip message to show (if you should skip the ChunkedArray test)
+#' @param ... additional arguments, passed to `expect_error()`
+compare_expression_error <- function(expr,
+ vec,
+ skip_array = NULL,
+ skip_chunked_array = NULL,
+ ...) {
+ expr <- rlang::enquo(expr)
+
+ msg <- tryCatch(
+ rlang::eval_tidy(expr, rlang::new_data_mask(rlang::env(.input = vec))),
+ error = function(e) {
+ msg <- conditionMessage(e)
+
+ pattern <- i18ize_error_messages()
+
+ if (grepl(pattern, msg)) {
+ msg <- sub(paste0("^.*(", pattern, ").*$"), "\\1", msg)
+ }
+ msg
+ }
+ )
+
+ expect_true(identical(typeof(msg), "character"), label = "vector errored")
+
+ skip_msg <- NULL
+
+ if (is.null(skip_array)) {
+ expect_error(
+ rlang::eval_tidy(
+ expr,
+ rlang::new_data_mask(rlang::env(.input = Array$create(vec)))
+ ),
+ msg,
+ ...
+ )
+ } else {
+ skip_msg <- c(skip_msg, skip_array)
+ }
+
+ if (is.null(skip_chunked_array)) {
+ # split input vector into two to exercise ChunkedArray with >1 chunk
+ split_vector <- split_vector_as_list(vec)
+
+ expect_error(
+ rlang::eval_tidy(
+ expr,
+ rlang::new_data_mask(rlang::env(.input = ChunkedArray$create(split_vector[[1]], split_vector[[2]])))
+ ),
+ msg,
+ ...
+ )
+ } else {
+ skip_msg <- c(skip_msg, skip_chunked_array)
+ }
+
+ if (!is.null(skip_msg)) {
+ skip(paste(skip_msg, collapse = "\n"))
+ }
+}
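+
+# For illustration (hypothetical expression): something that errors on the R
+# vector must raise a matching error on the Array and ChunkedArray paths.
+#   compare_expression_error(.input + "a", c(1, 2, 3))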
+
+split_vector_as_list <- function(vec) {
+ vec_split <- length(vec) %/% 2
+ vec1 <- vec[seq(from = min(1, length(vec) - 1), to = min(length(vec) - 1, vec_split), by = 1)]
+ vec2 <- vec[seq(from = min(length(vec), vec_split + 1), to = length(vec), by = 1)]
+ list(vec1, vec2)
+}
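+
+# For example, split_vector_as_list(1:5) returns list(1:2, 3:5): the first
+# chunk gets the first floor(length/2) elements and the second the remainder.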
diff --git a/src/arrow/r/tests/testthat/helper-parquet.R b/src/arrow/r/tests/testthat/helper-parquet.R
new file mode 100644
index 000000000..a0dd445bb
--- /dev/null
+++ b/src/arrow/r/tests/testthat/helper-parquet.R
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+expect_parquet_roundtrip <- function(tab, ...) {
+ expect_equal(parquet_roundtrip(tab, ...), tab)
+}
+
+parquet_roundtrip <- function(x, ..., as_data_frame = FALSE) {
+ # write/read parquet, returns Table
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write_parquet(x, tf, ...)
+ read_parquet(tf, as_data_frame = as_data_frame)
+}
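+
+# A minimal usage sketch (hypothetical Table): round-trip through a temporary
+# Parquet file and check the result is unchanged.
+#   expect_parquet_roundtrip(Table$create(tibble::tibble(x = 1:3)))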
diff --git a/src/arrow/r/tests/testthat/helper-roundtrip.R b/src/arrow/r/tests/testthat/helper-roundtrip.R
new file mode 100644
index 000000000..80bcb42f1
--- /dev/null
+++ b/src/arrow/r/tests/testthat/helper-roundtrip.R
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+expect_array_roundtrip <- function(x, type, as = NULL) {
+ a <- Array$create(x, type = as)
+ expect_equal(a$type, type)
+ expect_identical(length(a), length(x))
+ if (!inherits(type, c("ListType", "LargeListType", "FixedSizeListType"))) {
+ # TODO: revisit how missingness works with ListArrays
+ # R list objects don't handle missingness the same way as other vectors.
+ # Is there some vctrs thing we should do on the roundtrip back to R?
+ expect_as_vector(is.na(a), is.na(x))
+ }
+ roundtrip <- as.vector(a)
+ expect_equal(roundtrip, x, ignore_attr = TRUE)
+ # Make sure the storage mode is the same on roundtrip (esp. integer vs. numeric)
+ expect_identical(typeof(roundtrip), typeof(x))
+
+ if (length(x)) {
+ a_sliced <- a$Slice(1)
+ x_sliced <- x[-1]
+ expect_equal(a_sliced$type, type)
+ expect_identical(length(a_sliced), length(x_sliced))
+ if (!inherits(type, c("ListType", "LargeListType", "FixedSizeListType"))) {
+ expect_as_vector(is.na(a_sliced), is.na(x_sliced))
+ }
+ expect_as_vector(a_sliced, x_sliced, ignore_attr = TRUE)
+ }
+ invisible(a)
+}
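+
+# A minimal usage sketch: create an Array from an R vector, assert its type,
+# length, and missingness, convert back to R, and repeat on a Slice to
+# exercise non-zero offsets.
+#   expect_array_roundtrip(c(1L, 2L, NA_integer_), int32())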
diff --git a/src/arrow/r/tests/testthat/helper-skip.R b/src/arrow/r/tests/testthat/helper-skip.R
new file mode 100644
index 000000000..4256ec4ab
--- /dev/null
+++ b/src/arrow/r/tests/testthat/helper-skip.R
@@ -0,0 +1,81 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+build_features <- c(
+ arrow_info()$capabilities,
+ # Special handling for "uncompressed", for tests that iterate over compressions
+ uncompressed = TRUE
+)
+
+skip_if_not_available <- function(feature) {
+ if (feature == "re2") {
+ # RE2 does not support valgrind (on purpose): https://github.com/google/re2/issues/177
+ skip_on_valgrind()
+ }
+
+ yes <- feature %in% names(build_features) && build_features[feature]
+ if (!yes) {
+ skip(paste("Arrow C++ not built with", feature))
+ }
+}
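+
+# Usage sketch (hypothetical feature name): guard a test on an optional Arrow
+# C++ feature reported by arrow_info()$capabilities.
+#   test_that("zstd compression", {
+#     skip_if_not_available("zstd")
+#     ...
+#   })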
+
+skip_if_no_pyarrow <- function() {
+ skip_on_valgrind()
+ skip_on_os("windows")
+
+ skip_if_not_installed("reticulate")
+ if (!reticulate::py_module_available("pyarrow")) {
+ skip("pyarrow not available for testing")
+ }
+}
+
+skip_if_not_dev_mode <- function() {
+ skip_if_not(
+ identical(tolower(Sys.getenv("ARROW_R_DEV")), "true"),
+ "environment variable ARROW_R_DEV"
+ )
+}
+
+skip_if_not_running_large_memory_tests <- function() {
+ skip_if_not(
+ identical(tolower(Sys.getenv("ARROW_LARGE_MEMORY_TESTS")), "true"),
+ "environment variable ARROW_LARGE_MEMORY_TESTS"
+ )
+}
+
+skip_on_valgrind <- function() {
+ # This does not actually skip on valgrind because we can't exactly detect it.
+  # Instead, it skips on CRAN when the OS is Linux and the R version is devel
+  # (which is where valgrind runs as of this writing).
+ linux_dev <- identical(tolower(Sys.info()[["sysname"]]), "linux") &&
+ grepl("devel", R.version.string)
+
+ if (linux_dev) {
+ skip_on_cran()
+ }
+}
+
+skip_if_r_version <- function(r_version) {
+ if (getRversion() <= r_version) {
+ skip(paste("R version:", getRversion()))
+ }
+}
+
+process_is_running <- function(x) {
+ cmd <- sprintf("ps aux | grep '%s' | grep -v grep", x)
+ tryCatch(system(cmd, ignore.stdout = TRUE) == 0, error = function(e) FALSE)
+}
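+
+# Usage sketch (hypothetical process name): returns TRUE if a process matching
+# the string shows up in `ps aux`, e.g. a demo Flight server.
+#   if (!process_is_running("demo_flight_server")) skip("flight server not running")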
diff --git a/src/arrow/r/tests/testthat/latin1.R b/src/arrow/r/tests/testthat/latin1.R
new file mode 100644
index 000000000..150192d31
--- /dev/null
+++ b/src/arrow/r/tests/testthat/latin1.R
@@ -0,0 +1,76 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+x <- iconv("Veitingastaðir", to = "latin1")
+df <- tibble::tibble(
+ chr = x,
+ fct = as.factor(x)
+)
+names(df) <- iconv(paste(x, names(df), sep = "_"), to = "latin1")
+df_struct <- tibble::tibble(a = df)
+
+raw_schema <- list(utf8(), dictionary(int8(), utf8()))
+names(raw_schema) <- names(df)
+
+# Confirm setup
+expect_identical(Encoding(x), "latin1")
+expect_identical(Encoding(names(df)), c("latin1", "latin1"))
+expect_identical(Encoding(df[[1]]), "latin1")
+expect_identical(Encoding(levels(df[[2]])), "latin1")
+
+# Array
+expect_identical(as.vector(Array$create(x)), x)
+# struct
+expect_identical(as.vector(Array$create(df)), df)
+
+# ChunkedArray
+expect_identical(as.vector(ChunkedArray$create(x)), x)
+# struct
+expect_identical(as.vector(ChunkedArray$create(df)), df)
+
+# Table (including field name)
+expect_identical(as.data.frame(Table$create(df)), df)
+expect_identical(as.data.frame(Table$create(df_struct)), df_struct)
+
+# RecordBatch
+expect_identical(as.data.frame(record_batch(df)), df)
+expect_identical(as.data.frame(record_batch(df_struct)), df_struct)
+
+# Schema field name
+df_schema <- do.call(schema, raw_schema)
+expect_identical(names(df_schema), names(df))
+
+df_struct_schema <- schema(a = do.call(struct, raw_schema))
+# StructType doesn't expose names (in C++)
+# expect_identical(names(df_struct_schema$a), names(df))
+
+# Create table/batch with schema
+expect_identical(as.data.frame(Table$create(df, schema = df_schema)), df)
+expect_identical(as.data.frame(Table$create(df_struct, schema = df_struct_schema)), df_struct)
+expect_identical(as.data.frame(record_batch(df, schema = df_schema)), df)
+expect_identical(as.data.frame(record_batch(df_struct, schema = df_struct_schema)), df_struct)
+
+# Serialization
+feather_file <- tempfile()
+write_feather(df_struct, feather_file)
+expect_identical(read_feather(feather_file), df_struct)
+
+if (arrow_with_parquet()) {
+ parquet_file <- tempfile()
+ write_parquet(df, parquet_file) # Parquet doesn't yet support nested types
+ expect_identical(read_parquet(parquet_file), df)
+}
diff --git a/src/arrow/r/tests/testthat/test-Array.R b/src/arrow/r/tests/testthat/test-Array.R
new file mode 100644
index 000000000..ce23c2609
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-Array.R
@@ -0,0 +1,963 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("Integer Array", {
+ ints <- c(1:10, 1:10, 1:5)
+ x <- expect_array_roundtrip(ints, int32())
+})
+
+test_that("binary Array", {
+ # if the type is given, we just need a list of raw vectors
+ bin <- list(as.raw(1:10), as.raw(1:10))
+ expect_array_roundtrip(bin, binary(), as = binary())
+ expect_array_roundtrip(bin, large_binary(), as = large_binary())
+ expect_array_roundtrip(bin, fixed_size_binary(10), as = fixed_size_binary(10))
+
+ bin[[1L]] <- as.raw(1:20)
+ expect_error(Array$create(bin, fixed_size_binary(10)))
+
+ # otherwise the arrow type is deduced from the R classes
+ bin <- vctrs::new_vctr(
+ list(as.raw(1:10), as.raw(11:20)),
+ class = "arrow_binary"
+ )
+ expect_array_roundtrip(bin, binary())
+
+ bin <- vctrs::new_vctr(
+ list(as.raw(1:10), as.raw(11:20)),
+ class = "arrow_large_binary"
+ )
+ expect_array_roundtrip(bin, large_binary())
+
+ bin <- vctrs::new_vctr(
+ list(as.raw(1:10), as.raw(11:20)),
+ class = "arrow_fixed_size_binary",
+ byte_width = 10L
+ )
+ expect_array_roundtrip(bin, fixed_size_binary(byte_width = 10))
+
+ # degenerate cases
+ skip_on_valgrind() # valgrind errors on these tests ARROW-12638
+ bin <- vctrs::new_vctr(
+ list(1:10),
+ class = "arrow_binary"
+ )
+ expect_error(Array$create(bin))
+
+ bin <- vctrs::new_vctr(
+ list(1:10),
+ ptype = raw(),
+ class = "arrow_large_binary"
+ )
+ expect_error(Array$create(bin))
+
+ bin <- vctrs::new_vctr(
+ list(1:10),
+ class = "arrow_fixed_size_binary",
+ byte_width = 10
+ )
+ expect_error(Array$create(bin))
+
+ bin <- vctrs::new_vctr(
+ list(as.raw(1:5)),
+ class = "arrow_fixed_size_binary",
+ byte_width = 10
+ )
+ expect_error(Array$create(bin))
+
+ bin <- vctrs::new_vctr(
+ list(as.raw(1:5)),
+ class = "arrow_fixed_size_binary"
+ )
+ expect_error(Array$create(bin))
+})
+
+test_that("Slice() and RangeEquals()", {
+ ints <- c(1:10, 101:110, 201:205)
+ x <- Array$create(ints)
+
+ y <- x$Slice(10)
+ expect_equal(y$type, int32())
+ expect_equal(length(y), 15L)
+ expect_as_vector(y, c(101:110, 201:205))
+ expect_true(x$RangeEquals(y, 10, 24))
+ expect_false(x$RangeEquals(y, 9, 23))
+ expect_false(x$RangeEquals(y, 11, 24))
+
+ z <- x$Slice(10, 5)
+ expect_as_vector(z, c(101:105))
+ expect_true(x$RangeEquals(z, 10, 15, 0))
+
+ # Input validation
+ expect_error(x$Slice("ten"))
+ expect_error(x$Slice(NA_integer_), "Slice 'offset' cannot be NA")
+ expect_error(x$Slice(NA), "Slice 'offset' cannot be NA")
+ expect_error(x$Slice(10, "ten"))
+ expect_error(x$Slice(10, NA_integer_), "Slice 'length' cannot be NA")
+ expect_error(x$Slice(NA_integer_, NA_integer_), "Slice 'offset' cannot be NA")
+ expect_error(x$Slice(c(10, 10)))
+ expect_error(x$Slice(10, c(10, 10)))
+ expect_error(x$Slice(1000), "Slice 'offset' greater than array length")
+ expect_error(x$Slice(-1), "Slice 'offset' cannot be negative")
+ expect_error(z$Slice(10, 10), "Slice 'offset' greater than array length")
+ expect_error(x$Slice(10, -1), "Slice 'length' cannot be negative")
+ expect_error(x$Slice(-1, 10), "Slice 'offset' cannot be negative")
+
+ expect_warning(x$Slice(10, 15), NA)
+ expect_warning(
+ overslice <- x$Slice(10, 16),
+ "Slice 'length' greater than available length"
+ )
+ expect_equal(length(overslice), 15)
+ expect_warning(z$Slice(2, 10), "Slice 'length' greater than available length")
+
+ expect_error(x$RangeEquals(10, 24, 0), 'other must be a "Array"')
+ expect_error(x$RangeEquals(y, NA, 24), "'start_idx' cannot be NA")
+ expect_error(x$RangeEquals(y, 10, NA), "'end_idx' cannot be NA")
+ expect_error(x$RangeEquals(y, 10, 24, NA), "'other_start_idx' cannot be NA")
+ expect_error(x$RangeEquals(y, "ten", 24))
+
+ skip("TODO: (if anyone uses RangeEquals)")
+ expect_error(x$RangeEquals(y, 10, 2400, 0)) # does not error
+ expect_error(x$RangeEquals(y, 1000, 24, 0)) # does not error
+ expect_error(x$RangeEquals(y, 10, 24, 1000)) # does not error
+})
+
+test_that("Double Array", {
+ dbls <- c(1, 2, 3, 4, 5, 6)
+ x_dbl <- expect_array_roundtrip(dbls, float64())
+})
+
+test_that("Array print method includes type", {
+ x <- Array$create(c(1:10, 1:10, 1:5))
+ expect_output(print(x), "Array\n<int32>\n[\n", fixed = TRUE)
+})
+
+test_that("Array supports NA", {
+ x_int <- Array$create(as.integer(c(1:10, NA)))
+ x_dbl <- Array$create(as.numeric(c(1:10, NA)))
+ expect_true(x_int$IsValid(0))
+ expect_true(x_dbl$IsValid(0L))
+ expect_true(x_int$IsNull(10L))
+ expect_true(x_dbl$IsNull(10))
+
+ expect_as_vector(is.na(x_int), c(rep(FALSE, 10), TRUE))
+ expect_as_vector(is.na(x_dbl), c(rep(FALSE, 10), TRUE))
+
+ # Input validation
+ expect_error(x_int$IsValid("ten"))
+ expect_error(x_int$IsNull("ten"))
+ expect_error(x_int$IsValid(c(10, 10)))
+ expect_error(x_int$IsNull(c(10, 10)))
+ expect_error(x_int$IsValid(NA), "'i' cannot be NA")
+ expect_error(x_int$IsNull(NA), "'i' cannot be NA")
+ expect_error(x_int$IsValid(1000), "subscript out of bounds")
+ expect_error(x_int$IsValid(-1), "subscript out of bounds")
+ expect_error(x_int$IsNull(1000), "subscript out of bounds")
+ expect_error(x_int$IsNull(-1), "subscript out of bounds")
+})
+
+test_that("Array support null type (ARROW-7064)", {
+ expect_array_roundtrip(vctrs::unspecified(10), null())
+})
+
+test_that("Array supports logical vectors (ARROW-3341)", {
+ # with NA
+ x <- sample(c(TRUE, FALSE, NA), 1000, replace = TRUE)
+ expect_array_roundtrip(x, bool())
+
+ # without NA
+ x <- sample(c(TRUE, FALSE), 1000, replace = TRUE)
+ expect_array_roundtrip(x, bool())
+})
+
+test_that("Array supports character vectors (ARROW-3339)", {
+ # without NA
+ expect_array_roundtrip(c("itsy", "bitsy", "spider"), utf8())
+ expect_array_roundtrip(c("itsy", "bitsy", "spider"), large_utf8(), as = large_utf8())
+
+ # with NA
+ expect_array_roundtrip(c("itsy", NA, "spider"), utf8())
+ expect_array_roundtrip(c("itsy", NA, "spider"), large_utf8(), as = large_utf8())
+})
+
+test_that("Character vectors > 2GB become large_utf8", {
+ skip_on_cran()
+ skip_if_not_running_large_memory_tests()
+ big <- make_big_string()
+ expect_array_roundtrip(big, large_utf8())
+})
+
+test_that("empty arrays are supported", {
+ expect_array_roundtrip(character(), utf8())
+ expect_array_roundtrip(character(), large_utf8(), as = large_utf8())
+ expect_array_roundtrip(integer(), int32())
+ expect_array_roundtrip(numeric(), float64())
+ expect_array_roundtrip(factor(character()), dictionary(int8(), utf8()))
+ expect_array_roundtrip(logical(), bool())
+})
+
+test_that("array with all nulls are supported", {
+ nas <- c(NA, NA)
+ expect_array_roundtrip(as.character(nas), utf8())
+ expect_array_roundtrip(as.integer(nas), int32())
+ expect_array_roundtrip(as.numeric(nas), float64())
+ expect_array_roundtrip(as.factor(nas), dictionary(int8(), utf8()))
+ expect_array_roundtrip(as.logical(nas), bool())
+})
+
+test_that("Array supports unordered factors (ARROW-3355)", {
+ # without NA
+ f <- factor(c("itsy", "bitsy", "spider", "spider"))
+ expect_array_roundtrip(f, dictionary(int8(), utf8()))
+
+ # with NA
+ f <- factor(c("itsy", "bitsy", NA, "spider", "spider"))
+ expect_array_roundtrip(f, dictionary(int8(), utf8()))
+})
+
+test_that("Array supports ordered factors (ARROW-3355)", {
+ # without NA
+ f <- ordered(c("itsy", "bitsy", "spider", "spider"))
+ arr_fac <- expect_array_roundtrip(f, dictionary(int8(), utf8(), ordered = TRUE))
+ expect_true(arr_fac$ordered)
+
+ # with NA
+ f <- ordered(c("itsy", "bitsy", NA, "spider", "spider"))
+ expect_array_roundtrip(f, dictionary(int8(), utf8(), ordered = TRUE))
+})
+
+test_that("array supports Date (ARROW-3340)", {
+ d <- Sys.Date() + 1:10
+ expect_array_roundtrip(d, date32())
+
+ d[5] <- NA
+ expect_array_roundtrip(d, date32())
+})
+
+test_that("array supports POSIXct (ARROW-3340)", {
+ times <- lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10
+ expect_array_roundtrip(times, timestamp("us", "UTC"))
+
+ times[5] <- NA
+ expect_array_roundtrip(times, timestamp("us", "UTC"))
+
+ times2 <- lubridate::ymd_hms("2018-10-07 19:04:05", tz = "US/Eastern") + 1:10
+ expect_array_roundtrip(times2, timestamp("us", "US/Eastern"))
+})
+
+test_that("array supports POSIXct without timezone", {
+ # Make sure timezone is not set
+ withr::with_envvar(c(TZ = ""), {
+ times <- strptime("2019-02-03 12:34:56", format = "%Y-%m-%d %H:%M:%S") + 1:10
+ expect_array_roundtrip(times, timestamp("us", ""))
+
+ # Also test the INTSXP code path
+ skip("Ingest_POSIXct only implemented for REALSXP")
+ times_int <- as.integer(times)
+ attributes(times_int) <- attributes(times)
+ expect_array_roundtrip(times_int, timestamp("us", ""))
+ })
+})
+
+test_that("Timezone handling in Arrow roundtrip (ARROW-3543)", {
+ # Write a feather file as that's what the initial bug report used
+ df <- tibble::tibble(
+ no_tz = lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10,
+ yes_tz = lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Pacific/Marquesas") + 1:10
+ )
+ if (!identical(Sys.timezone(), "Pacific/Marquesas")) {
+ # Confirming that the columns are in fact different
+ expect_false(any(df$no_tz == df$yes_tz))
+ }
+ feather_file <- tempfile()
+ on.exit(unlink(feather_file))
+ write_feather(df, feather_file)
+ expect_identical(read_feather(feather_file), df)
+})
+
+test_that("array supports integer64", {
+ x <- bit64::as.integer64(1:10) + MAX_INT
+ expect_array_roundtrip(x, int64())
+
+ x[4] <- NA
+ expect_array_roundtrip(x, int64())
+
+ # all NA int64 (ARROW-3795)
+ all_na <- Array$create(bit64::as.integer64(NA))
+ expect_type_equal(all_na, int64())
+ expect_true(as.vector(is.na(all_na)))
+})
+
+test_that("array supports difftime", {
+ time <- hms::hms(56, 34, 12)
+ expect_array_roundtrip(c(time, time), time32("s"))
+ expect_array_roundtrip(vctrs::vec_c(NA, time), time32("s"))
+})
+
+test_that("support for NaN (ARROW-3615)", {
+ x <- c(1, NA, NaN, -1)
+ y <- Array$create(x)
+ expect_true(y$IsValid(2))
+ expect_equal(y$null_count, 1L)
+})
+
+test_that("is.nan() evalutes to FALSE on NA (for consistency with base R)", {
+ x <- c(1.0, NA, NaN, -1.0)
+ compare_expression(is.nan(.input), x)
+})
+
+test_that("is.nan() evalutes to FALSE on non-floats (for consistency with base R)", {
+ x <- c(1L, 2L, 3L)
+ y <- c("foo", "bar")
+ compare_expression(is.nan(.input), x)
+ compare_expression(is.nan(.input), y)
+})
+
+test_that("is.na() evalutes to TRUE on NaN (for consistency with base R)", {
+ x <- c(1, NA, NaN, -1)
+ compare_expression(is.na(.input), x)
+})
+
+test_that("integer types casts (ARROW-3741)", {
+ # Defining some type groups for use here and in the following tests
+ int_types <- c(int8(), int16(), int32(), int64())
+ uint_types <- c(uint8(), uint16(), uint32(), uint64())
+ float_types <- c(float32(), float64()) # float16() not really supported in C++ yet
+
+ a <- Array$create(c(1:10, NA))
+ for (type in c(int_types, uint_types)) {
+ casted <- a$cast(type)
+ expect_equal(casted$type, type)
+ expect_identical(as.vector(is.na(casted)), c(rep(FALSE, 10), TRUE))
+ }
+})
+
+test_that("integer types cast safety (ARROW-3741, ARROW-5541)", {
+ a <- Array$create(-(1:10))
+ for (type in uint_types) {
+ expect_error(a$cast(type), regexp = "Integer value -1 not in range")
+ expect_error(a$cast(type, safe = FALSE), NA)
+ }
+})
+
+test_that("float types casts (ARROW-3741)", {
+ x <- c(1, 2, 3, NA)
+ a <- Array$create(x)
+ for (type in float_types) {
+ casted <- a$cast(type)
+ expect_equal(casted$type, type)
+ expect_identical(as.vector(is.na(casted)), c(rep(FALSE, 3), TRUE))
+ expect_identical(as.vector(casted), x)
+ }
+})
+
+test_that("cast to half float works", {
+ skip("Need halffloat support: https://issues.apache.org/jira/browse/ARROW-3802")
+ a <- Array$create(1:4)
+ a_f16 <- a$cast(float16())
+  expect_type_equal(a_f16$type, float16())
+})
+
+test_that("cast input validation", {
+ a <- Array$create(1:4)
+ expect_error(a$cast("not a type"), "type must be a DataType, not character")
+})
+
+test_that("Array$create() supports the type= argument. conversion from INTSXP and int64 to all int types", {
+ num_int32 <- 12L
+ num_int64 <- bit64::as.integer64(10)
+
+ types <- c(
+ int_types,
+ uint_types,
+ float_types,
+    double() # not actually an Arrow type; base R's double() should be an alias for float64
+ )
+ for (type in types) {
+ expect_type_equal(Array$create(num_int32, type = type)$type, as_type(type))
+ expect_type_equal(Array$create(num_int64, type = type)$type, as_type(type))
+ }
+
+ # Input validation
+ expect_error(
+ Array$create(5, type = "not a type"),
+ "type must be a DataType, not character"
+ )
+})
+
+test_that("Array$create() aborts on overflow", {
+ expect_error(Array$create(128L, type = int8()))
+ expect_error(Array$create(-129L, type = int8()))
+
+ expect_error(Array$create(256L, type = uint8()))
+ expect_error(Array$create(-1L, type = uint8()))
+
+ expect_error(Array$create(32768L, type = int16()))
+ expect_error(Array$create(-32769L, type = int16()))
+
+ expect_error(Array$create(65536L, type = uint16()))
+ expect_error(Array$create(-1L, type = uint16()))
+
+ expect_error(Array$create(65536L, type = uint16()))
+ expect_error(Array$create(-1L, type = uint16()))
+
+ expect_error(Array$create(bit64::as.integer64(2^31), type = int32()))
+ expect_error(Array$create(bit64::as.integer64(2^32), type = uint32()))
+})
+
+test_that("Array$create() does not convert doubles to integer", {
+ for (type in c(int_types, uint_types)) {
+ a <- Array$create(10, type = type)
+ expect_type_equal(a$type, type)
+ expect_true(as.vector(a) == 10L)
+ }
+})
+
+test_that("Array$create() converts raw vectors to uint8 arrays (ARROW-3794)", {
+ expect_type_equal(Array$create(as.raw(1:10))$type, uint8())
+})
+
+test_that("Array<int8>$as_vector() converts to integer (ARROW-3794)", {
+ i8 <- (-128):127
+ a <- Array$create(i8)$cast(int8())
+ expect_type_equal(a, int8())
+ expect_as_vector(a, i8)
+
+ u8 <- 0:255
+ a <- Array$create(u8)$cast(uint8())
+ expect_type_equal(a, uint8())
+ expect_as_vector(a, u8)
+})
+
+test_that("Arrays of {,u}int{32,64} convert to integer if they can fit", {
+ u32 <- Array$create(1L)$cast(uint32())
+ expect_identical(as.vector(u32), 1L)
+
+ u64 <- Array$create(1L)$cast(uint64())
+ expect_identical(as.vector(u64), 1L)
+
+ i64 <- Array$create(bit64::as.integer64(1:10))
+ expect_identical(as.vector(i64), 1:10)
+})
+
+test_that("Arrays of uint{32,64} convert to numeric if they can't fit integer", {
+ u32 <- Array$create(bit64::as.integer64(1) + MAX_INT)$cast(uint32())
+ expect_identical(as.vector(u32), 1 + MAX_INT)
+
+ u64 <- Array$create(bit64::as.integer64(1) + MAX_INT)$cast(uint64())
+ expect_identical(as.vector(u64), 1 + MAX_INT)
+})
+
+test_that("Array$create() recognise arrow::Array (ARROW-3815)", {
+ a <- Array$create(1:10)
+ expect_equal(a, Array$create(a))
+})
+
+test_that("Array$create() handles data frame -> struct arrays (ARROW-3811)", {
+ df <- tibble::tibble(x = 1:10, y = x / 2, z = letters[1:10])
+ a <- Array$create(df)
+ expect_type_equal(a$type, struct(x = int32(), y = float64(), z = utf8()))
+ expect_as_vector(a, df)
+
+ df <- structure(
+ list(col = structure(list(structure(list(list(structure(1))), class = "inner")), class = "outer")),
+ class = "data.frame", row.names = c(NA, -1L)
+ )
+ a <- Array$create(df)
+ expect_type_equal(a$type, struct(col = list_of(list_of(list_of(float64())))))
+ expect_as_vector(a, df, ignore_attr = TRUE)
+})
+
+test_that("StructArray methods", {
+ df <- tibble::tibble(x = 1:10, y = x / 2, z = letters[1:10])
+ a <- Array$create(df)
+ expect_equal(a$x, Array$create(df$x))
+ expect_equal(a[["x"]], Array$create(df$x))
+ expect_equal(a[[1]], Array$create(df$x))
+ expect_identical(names(a), c("x", "y", "z"))
+ expect_identical(dim(a), c(10L, 3L))
+})
+
+test_that("Array$create() can handle data frame with custom struct type (not inferred)", {
+ df <- tibble::tibble(x = 1:10, y = 1:10)
+ type <- struct(x = float64(), y = int16())
+ a <- Array$create(df, type = type)
+ expect_type_equal(a$type, type)
+
+ type <- struct(x = float64(), y = int16(), z = int32())
+ expect_error(
+ Array$create(df, type = type),
+ regexp = "Number of fields in struct.* incompatible with number of columns in the data frame"
+ )
+
+ type <- struct(y = int16(), x = float64())
+ expect_error(
+ Array$create(df, type = type),
+ regexp = "Field name in position.*does not match the name of the column of the data frame"
+ )
+
+ type <- struct(x = float64(), y = utf8())
+ expect_error(Array$create(df, type = type), regexp = "Invalid")
+})
+
+test_that("Array$create() supports tibble with no columns (ARROW-8354)", {
+ df <- tibble::tibble()
+ expect_equal(Array$create(df)$as_vector(), df)
+})
+
+test_that("Array$create() handles vector -> list arrays (ARROW-7662)", {
+ # Should be able to create an empty list with a type hint.
+ expect_r6_class(Array$create(list(), list_of(bool())), "ListArray")
+
+ # logical
+ expect_array_roundtrip(list(NA), list_of(bool()))
+ expect_array_roundtrip(list(logical(0)), list_of(bool()))
+ expect_array_roundtrip(list(c(TRUE), c(FALSE), c(FALSE, TRUE)), list_of(bool()))
+ expect_array_roundtrip(list(c(TRUE), c(FALSE), NA, logical(0), c(FALSE, NA, TRUE)), list_of(bool()))
+
+ # integer
+ expect_array_roundtrip(list(NA_integer_), list_of(int32()))
+ expect_array_roundtrip(list(integer(0)), list_of(int32()))
+ expect_array_roundtrip(list(1:2, 3:4, 12:18), list_of(int32()))
+ expect_array_roundtrip(list(c(1:2), NA_integer_, integer(0), c(12:18, NA_integer_)), list_of(int32()))
+
+ # numeric
+ expect_array_roundtrip(list(NA_real_), list_of(float64()))
+ expect_array_roundtrip(list(numeric(0)), list_of(float64()))
+ expect_array_roundtrip(list(1, c(2, 3), 4), list_of(float64()))
+ expect_array_roundtrip(list(1, numeric(0), c(2, 3, NA_real_), 4), list_of(float64()))
+
+ # character
+ expect_array_roundtrip(list(NA_character_), list_of(utf8()))
+ expect_array_roundtrip(list(character(0)), list_of(utf8()))
+ expect_array_roundtrip(list("itsy", c("bitsy", "spider"), c("is")), list_of(utf8()))
+ expect_array_roundtrip(list("itsy", character(0), c("bitsy", "spider", NA_character_), c("is")), list_of(utf8()))
+
+ # factor
+ expect_array_roundtrip(list(factor(c("b", "a"), levels = c("a", "b"))), list_of(dictionary(int8(), utf8())))
+ expect_array_roundtrip(list(factor(NA, levels = c("a", "b"))), list_of(dictionary(int8(), utf8())))
+
+ # struct
+ expect_array_roundtrip(
+ list(tibble::tibble(a = integer(0), b = integer(0), c = character(0), d = logical(0))),
+ list_of(struct(a = int32(), b = int32(), c = utf8(), d = bool()))
+ )
+ expect_array_roundtrip(
+ list(tibble::tibble(a = list(integer()))),
+ list_of(struct(a = list_of(int32())))
+ )
+  # degenerate data frame (columns of unequal length)
+ df <- structure(list(x = 1:2, y = 1), class = "data.frame", row.names = 1:2)
+ expect_error(Array$create(list(df)))
+})
+
+test_that("Array$create() handles vector -> large list arrays", {
+ # Should be able to create an empty list with a type hint.
+ expect_r6_class(Array$create(list(), type = large_list_of(bool())), "LargeListArray")
+
+ # logical
+ expect_array_roundtrip(list(NA), large_list_of(bool()), as = large_list_of(bool()))
+ expect_array_roundtrip(list(logical(0)), large_list_of(bool()), as = large_list_of(bool()))
+ expect_array_roundtrip(list(c(TRUE), c(FALSE), c(FALSE, TRUE)), large_list_of(bool()), as = large_list_of(bool()))
+ expect_array_roundtrip(
+ list(c(TRUE), c(FALSE), NA, logical(0), c(FALSE, NA, TRUE)),
+ large_list_of(bool()),
+ as = large_list_of(bool())
+ )
+
+ # integer
+ expect_array_roundtrip(list(NA_integer_), large_list_of(int32()), as = large_list_of(int32()))
+ expect_array_roundtrip(list(integer(0)), large_list_of(int32()), as = large_list_of(int32()))
+ expect_array_roundtrip(list(1:2, 3:4, 12:18), large_list_of(int32()), as = large_list_of(int32()))
+ expect_array_roundtrip(
+ list(c(1:2), NA_integer_, integer(0), c(12:18, NA_integer_)),
+ large_list_of(int32()),
+ as = large_list_of(int32())
+ )
+
+ # numeric
+ expect_array_roundtrip(list(NA_real_), large_list_of(float64()), as = large_list_of(float64()))
+ expect_array_roundtrip(list(numeric(0)), large_list_of(float64()), as = large_list_of(float64()))
+ expect_array_roundtrip(list(1, c(2, 3), 4), large_list_of(float64()), as = large_list_of(float64()))
+ expect_array_roundtrip(
+ list(1, numeric(0), c(2, 3, NA_real_), 4),
+ large_list_of(float64()),
+ as = large_list_of(float64())
+ )
+
+ # character
+ expect_array_roundtrip(list(NA_character_), large_list_of(utf8()), as = large_list_of(utf8()))
+ expect_array_roundtrip(list(character(0)), large_list_of(utf8()), as = large_list_of(utf8()))
+ expect_array_roundtrip(
+ list("itsy", c("bitsy", "spider"), c("is")),
+ large_list_of(utf8()),
+ as = large_list_of(utf8())
+ )
+ expect_array_roundtrip(
+ list("itsy", character(0), c("bitsy", "spider", NA_character_), c("is")),
+ large_list_of(utf8()),
+ as = large_list_of(utf8())
+ )
+
+ # factor
+ expect_array_roundtrip(
+ list(factor(c("b", "a"), levels = c("a", "b"))),
+ large_list_of(dictionary(int8(), utf8())),
+ as = large_list_of(dictionary(int8(), utf8()))
+ )
+ expect_array_roundtrip(
+ list(factor(NA, levels = c("a", "b"))),
+ large_list_of(dictionary(int8(), utf8())),
+ as = large_list_of(dictionary(int8(), utf8()))
+ )
+
+ # struct
+ expect_array_roundtrip(
+ list(tibble::tibble(a = integer(0), b = integer(0), c = character(0), d = logical(0))),
+ large_list_of(struct(a = int32(), b = int32(), c = utf8(), d = bool())),
+ as = large_list_of(struct(a = int32(), b = int32(), c = utf8(), d = bool()))
+ )
+ expect_array_roundtrip(
+ list(tibble::tibble(a = list(integer()))),
+ large_list_of(struct(a = list_of(int32()))),
+ as = large_list_of(struct(a = list_of(int32())))
+ )
+})
+
+test_that("Array$create() handles vector -> fixed size list arrays", {
+ # Should be able to create an empty list with a type hint.
+ expect_r6_class(Array$create(list(), type = fixed_size_list_of(bool(), 20)), "FixedSizeListArray")
+
+ # logical
+ expect_array_roundtrip(list(NA), fixed_size_list_of(bool(), 1L), as = fixed_size_list_of(bool(), 1L))
+ expect_array_roundtrip(
+ list(c(TRUE, FALSE), c(FALSE, TRUE)),
+ fixed_size_list_of(bool(), 2L),
+ as = fixed_size_list_of(bool(), 2L)
+ )
+ expect_array_roundtrip(
+ list(c(TRUE), c(FALSE), NA),
+ fixed_size_list_of(bool(), 1L),
+ as = fixed_size_list_of(bool(), 1L)
+ )
+
+ # integer
+ expect_array_roundtrip(list(NA_integer_), fixed_size_list_of(int32(), 1L), as = fixed_size_list_of(int32(), 1L))
+ expect_array_roundtrip(list(1:2, 3:4, 11:12), fixed_size_list_of(int32(), 2L), as = fixed_size_list_of(int32(), 2L))
+ expect_array_roundtrip(
+ list(c(1:2), c(NA_integer_, 3L)),
+ fixed_size_list_of(int32(), 2L),
+ as = fixed_size_list_of(int32(), 2L)
+ )
+
+ # numeric
+ expect_array_roundtrip(list(NA_real_), fixed_size_list_of(float64(), 1L), as = fixed_size_list_of(float64(), 1L))
+ expect_array_roundtrip(
+ list(c(1, 2), c(2, 3)),
+ fixed_size_list_of(float64(), 2L),
+ as = fixed_size_list_of(float64(), 2L)
+ )
+ expect_array_roundtrip(
+ list(c(1, 2), c(NA_real_, 4)),
+ fixed_size_list_of(float64(), 2L),
+ as = fixed_size_list_of(float64(), 2L)
+ )
+
+ # character
+ expect_array_roundtrip(list(NA_character_), fixed_size_list_of(utf8(), 1L), as = fixed_size_list_of(utf8(), 1L))
+ expect_array_roundtrip(
+ list(c("itsy", "bitsy"), c("spider", "is"), c(NA_character_, NA_character_), c("", "")),
+ fixed_size_list_of(utf8(), 2L),
+ as = fixed_size_list_of(utf8(), 2L)
+ )
+
+ # factor
+ expect_array_roundtrip(
+ list(factor(c("b", "a"), levels = c("a", "b"))),
+ fixed_size_list_of(dictionary(int8(), utf8()), 2L),
+ as = fixed_size_list_of(dictionary(int8(), utf8()), 2L)
+ )
+
+ # struct
+ expect_array_roundtrip(
+ list(tibble::tibble(a = 1L, b = 1L, c = "", d = TRUE)),
+ fixed_size_list_of(struct(a = int32(), b = int32(), c = utf8(), d = bool()), 1L),
+ as = fixed_size_list_of(struct(a = int32(), b = int32(), c = utf8(), d = bool()), 1L)
+ )
+ expect_array_roundtrip(
+ list(tibble::tibble(a = list(1L))),
+ fixed_size_list_of(struct(a = list_of(int32())), 1L),
+ as = fixed_size_list_of(struct(a = list_of(int32())), 1L)
+ )
+ expect_array_roundtrip(
+ list(tibble::tibble(a = list(1L))),
+ list_of(struct(a = fixed_size_list_of(int32(), 1L))),
+ as = list_of(struct(a = fixed_size_list_of(int32(), 1L)))
+ )
+})
+
+test_that("Handling string data with embedded nuls", {
+ raws <- structure(list(
+ as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)),
+ as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)),
+ as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00
+ as.raw(c(0x66, 0x00, 0x00, 0x61, 0x00, 0x6e)), # multiple nuls
+ as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)),
+ as.raw(c(0x74, 0x76))
+ ),
+ class = c("arrow_binary", "vctrs_vctr", "list")
+ )
+ expect_error(
+ rawToChar(raws[[3]]),
+ "embedded nul in string: 'ma\\0n'", # See?
+ fixed = TRUE
+ )
+ array_with_nul <- Array$create(raws)$cast(utf8())
+
+ # The behavior of the warnings/errors is slightly different with and without
+  # altrep. Without it (i.e. on R 3.5.0 and below), the error triggers immediately
+  # on `as.vector()`, whereas with it, the error only happens on materialization.
+ skip_if_r_version("3.5.0")
+
+ # no error on conversion, because altrep laziness
+ v <- expect_error(as.vector(array_with_nul), NA)
+
+ # attempting materialization -> error
+
+ expect_error(v[],
+ paste0(
+ "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow ",
+ "to R, set options(arrow.skip_nul = TRUE)"
+ ),
+ fixed = TRUE
+ )
+
+ # also error on materializing v[3]
+ expect_error(v[3],
+ paste0(
+ "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow ",
+ "to R, set options(arrow.skip_nul = TRUE)"
+ ),
+ fixed = TRUE
+ )
+
+ withr::with_options(list(arrow.skip_nul = TRUE), {
+ # no warning yet because altrep laziness
+ v <- as.vector(array_with_nul)
+
+ expect_warning(
+ expect_identical(
+ v[],
+ c("person", "woman", "man", "fan", "camera", "tv")
+ ),
+ "Stripping '\\0' (nul) from character vector",
+ fixed = TRUE
+ )
+
+ v <- as.vector(array_with_nul)
+ expect_warning(
+ expect_identical(v[3], "man"),
+ "Stripping '\\0' (nul) from character vector",
+ fixed = TRUE
+ )
+
+ v <- as.vector(array_with_nul)
+ expect_warning(
+ expect_identical(v[4], "fan"),
+ "Stripping '\\0' (nul) from character vector",
+ fixed = TRUE
+ )
+ })
+})
+
+test_that("Array$create() should have helpful error", {
+ expect_error(Array$create(list(numeric(0)), list_of(bool())), "Expecting a logical vector")
+
+ lgl <- logical(0)
+ int <- integer(0)
+ num <- numeric(0)
+ char <- character(0)
+ expect_error(Array$create(list()), "Requires at least one element to infer")
+ expect_error(Array$create(list(lgl, lgl, int)), "Expecting a logical vector")
+ expect_error(Array$create(list(char, num, char)), "Expecting a character vector")
+})
+
+test_that("Array$View() (ARROW-6542)", {
+ a <- Array$create(1:3)
+ b <- a$View(float32())
+ expect_equal(b$type, float32())
+ expect_equal(length(b), 3L)
+
+ # Input validation
+ expect_error(a$View("not a type"), "type must be a DataType, not character")
+})
+
+test_that("Array$Validate()", {
+ a <- Array$create(1:10)
+ expect_error(a$Validate(), NA)
+})
+
+test_that("is.Array", {
+ a <- Array$create(1, type = int32())
+ expect_true(is.Array(a))
+ expect_true(is.Array(a, "int32"))
+ expect_true(is.Array(a, c("int32", "int16")))
+ expect_false(is.Array(a, "utf8"))
+  expect_true(is.Array(a$View(float32()), "float32"))
+ expect_false(is.Array(1))
+ expect_true(is.Array(ChunkedArray$create(1, 2)))
+})
+
+test_that("Array$Take()", {
+ a <- Array$create(10:20)
+ expect_as_vector(a$Take(c(4, 2)), c(14, 12))
+})
+
+test_that("[ method on Array", {
+ vec <- 11:20
+ a <- Array$create(vec)
+ expect_as_vector(a[5:9], vec[5:9])
+ expect_as_vector(a[c(9, 3, 5)], vec[c(9, 3, 5)])
+ expect_as_vector(a[rep(c(TRUE, FALSE), 5)], vec[c(1, 3, 5, 7, 9)])
+ expect_as_vector(a[rep(c(TRUE, FALSE, NA, FALSE, TRUE), 2)], c(11, NA, 15, 16, NA, 20))
+ expect_as_vector(a[-4], vec[-4])
+ expect_as_vector(a[-1], vec[-1])
+})
+
+test_that("[ accepts Arrays and otherwise handles bad input", {
+ vec <- 11:20
+ a <- Array$create(vec)
+ ind <- c(9, 3, 5)
+ expect_error(
+ a[Array$create(ind)],
+ "Cannot extract rows with an Array of type double"
+ )
+ expect_as_vector(a[Array$create(ind - 1, type = int8())], vec[ind])
+ expect_as_vector(a[Array$create(ind - 1, type = uint8())], vec[ind])
+ expect_as_vector(a[ChunkedArray$create(8, 2, 4, type = uint8())], vec[ind])
+
+ filt <- seq_along(vec) %in% ind
+ expect_as_vector(a[Array$create(filt)], vec[filt])
+
+ expect_error(
+ a["string"],
+ "Cannot extract rows with an object of class character"
+ )
+})
+
+test_that("%in% works on dictionary arrays", {
+ a1 <- Array$create(as.factor(c("A", "B", "C")))
+ a2 <- DictionaryArray$create(c(0L, 1L, 2L), c(4.5, 3.2, 1.1))
+ c1 <- Array$create(c(FALSE, TRUE, FALSE))
+ c2 <- Array$create(c(FALSE, FALSE, FALSE))
+ b1 <- Array$create("B")
+ b2 <- Array$create(5.4)
+
+ expect_equal(is_in(a1, b1), c1)
+ expect_equal(is_in(a2, b2), c2)
+ expect_error(is_in(a1, b2))
+})
+
+test_that("[ accepts Expressions", {
+ vec <- 11:20
+ a <- Array$create(vec)
+ b <- Array$create(1:10)
+ expect_as_vector(a[b > 4], vec[5:10])
+})
+
+test_that("Array head/tail", {
+ vec <- 11:20
+ a <- Array$create(vec)
+ expect_as_vector(head(a), head(vec))
+ expect_as_vector(head(a, 4), head(vec, 4))
+ expect_as_vector(head(a, 40), head(vec, 40))
+ expect_as_vector(head(a, -4), head(vec, -4))
+ expect_as_vector(head(a, -40), head(vec, -40))
+ expect_as_vector(tail(a), tail(vec))
+ expect_as_vector(tail(a, 4), tail(vec, 4))
+ expect_as_vector(tail(a, 40), tail(vec, 40))
+ expect_as_vector(tail(a, -40), tail(vec, -40))
+})
+
+test_that("Dictionary array: create from arrays, not factor", {
+ a <- DictionaryArray$create(c(2L, 1L, 1L, 2L, 0L), c(4.5, 3.2, 1.1))
+ expect_equal(a$type, dictionary(int32(), float64()))
+})
+
+test_that("Dictionary array: translate to R when dict isn't string", {
+ a <- DictionaryArray$create(c(2L, 1L, 1L, 2L, 0L), c(4.5, 3.2, 1.1))
+ expect_warning(
+ expect_identical(
+ as.vector(a),
+ factor(c(3, 2, 2, 3, 1), labels = c("4.5", "3.2", "1.1"))
+ )
+ )
+})
+
+test_that("Array$Equals", {
+ vec <- 11:20
+ a <- Array$create(vec)
+ b <- Array$create(vec)
+ d <- Array$create(3:4)
+ expect_equal(a, b)
+ expect_true(a$Equals(b))
+ expect_false(a$Equals(vec))
+ expect_false(a$Equals(d))
+})
+
+test_that("Array$ApproxEquals", {
+ vec <- c(1.0000000000001, 2.400000000000001)
+ a <- Array$create(vec)
+ b <- Array$create(round(vec, 1))
+ expect_false(a$Equals(b))
+ expect_true(a$ApproxEquals(b))
+ expect_false(a$ApproxEquals(vec))
+})
+
+test_that("auto int64 conversion to int can be disabled (ARROW-10093)", {
+ withr::with_options(list(arrow.int64_downcast = FALSE), {
+ a <- Array$create(1:10, int64())
+ expect_true(inherits(a$as_vector(), "integer64"))
+
+ batch <- RecordBatch$create(x = a)
+ expect_true(inherits(as.data.frame(batch)$x, "integer64"))
+
+ tab <- Table$create(x = a)
+    expect_true(inherits(as.data.frame(tab)$x, "integer64"))
+ })
+})
+
+test_that("Array to C-interface", {
+ # create a struct array since that's one of the more complicated array types
+ df <- tibble::tibble(x = 1:10, y = x / 2, z = letters[1:10])
+ arr <- Array$create(df)
+
+ # export the array via the C-interface
+ schema_ptr <- allocate_arrow_schema()
+ array_ptr <- allocate_arrow_array()
+ arr$export_to_c(array_ptr, schema_ptr)
+
+ # then import it and check that the roundtripped value is the same
+ circle <- Array$import_from_c(array_ptr, schema_ptr)
+ expect_equal(arr, circle)
+
+ # must clean up the pointers or we leak
+ delete_arrow_schema(schema_ptr)
+ delete_arrow_array(array_ptr)
+})
diff --git a/src/arrow/r/tests/testthat/test-RecordBatch.R b/src/arrow/r/tests/testthat/test-RecordBatch.R
new file mode 100644
index 000000000..d280754a3
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-RecordBatch.R
@@ -0,0 +1,690 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("RecordBatch", {
+ # Note that we're reusing `tbl` and `batch` throughout the tests in this file
+ tbl <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10],
+ fct = factor(letters[1:10])
+ )
+ batch <- record_batch(tbl)
+
+ expect_equal(batch, batch)
+ expect_equal(
+ batch$schema,
+ schema(
+ int = int32(), dbl = float64(),
+ lgl = boolean(), chr = utf8(),
+ fct = dictionary(int8(), utf8())
+ )
+ )
+ expect_equal(batch$num_columns, 5L)
+ expect_equal(batch$num_rows, 10L)
+ expect_equal(batch$column_name(0), "int")
+ expect_equal(batch$column_name(1), "dbl")
+ expect_equal(batch$column_name(2), "lgl")
+ expect_equal(batch$column_name(3), "chr")
+ expect_equal(batch$column_name(4), "fct")
+ expect_equal(names(batch), c("int", "dbl", "lgl", "chr", "fct"))
+
+ # input validation
+ expect_error(batch$column_name(NA), "'i' cannot be NA")
+ expect_error(batch$column_name(-1), "subscript out of bounds")
+ expect_error(batch$column_name(1000), "subscript out of bounds")
+ expect_error(batch$column_name(1:2))
+ expect_error(batch$column_name("one"))
+
+ col_int <- batch$column(0)
+ expect_true(inherits(col_int, "Array"))
+ expect_equal(col_int$as_vector(), tbl$int)
+ expect_equal(col_int$type, int32())
+
+ col_dbl <- batch$column(1)
+ expect_true(inherits(col_dbl, "Array"))
+ expect_equal(col_dbl$as_vector(), tbl$dbl)
+ expect_equal(col_dbl$type, float64())
+
+ col_lgl <- batch$column(2)
+  expect_true(inherits(col_lgl, "Array"))
+ expect_equal(col_lgl$as_vector(), tbl$lgl)
+ expect_equal(col_lgl$type, boolean())
+
+ col_chr <- batch$column(3)
+ expect_true(inherits(col_chr, "Array"))
+ expect_equal(col_chr$as_vector(), tbl$chr)
+ expect_equal(col_chr$type, utf8())
+
+ col_fct <- batch$column(4)
+ expect_true(inherits(col_fct, "Array"))
+ expect_equal(col_fct$as_vector(), tbl$fct)
+ expect_equal(col_fct$type, dictionary(int8(), utf8()))
+
+ # input validation
+ expect_error(batch$column(NA), "'i' cannot be NA")
+ expect_error(batch$column(-1), "subscript out of bounds")
+ expect_error(batch$column(1000), "subscript out of bounds")
+ expect_error(batch$column(1:2))
+ expect_error(batch$column("one"))
+
+ batch2 <- batch$RemoveColumn(0)
+ expect_equal(
+ batch2$schema,
+ schema(dbl = float64(), lgl = boolean(), chr = utf8(), fct = dictionary(int8(), utf8()))
+ )
+ expect_equal(batch2$column(0), batch$column(1))
+ expect_data_frame(batch2, tbl[, -1])
+
+ # input validation
+ expect_error(batch$RemoveColumn(NA), "'i' cannot be NA")
+ expect_error(batch$RemoveColumn(-1), "subscript out of bounds")
+ expect_error(batch$RemoveColumn(1000), "subscript out of bounds")
+ expect_error(batch$RemoveColumn(1:2))
+ expect_error(batch$RemoveColumn("one"))
+})
+
+test_that("RecordBatch S3 methods", {
+ tab <- RecordBatch$create(example_data)
+ for (f in c("dim", "nrow", "ncol", "dimnames", "colnames", "row.names", "as.list")) {
+ fun <- get(f)
+ expect_identical(fun(tab), fun(example_data), info = f)
+ }
+})
+
+test_that("RecordBatch$Slice", {
+ batch3 <- batch$Slice(5)
+ expect_data_frame(batch3, tbl[6:10, ])
+
+ batch4 <- batch$Slice(5, 2)
+ expect_data_frame(batch4, tbl[6:7, ])
+
+ # Input validation
+ expect_error(batch$Slice("ten"))
+ expect_error(batch$Slice(NA_integer_), "Slice 'offset' cannot be NA")
+ expect_error(batch$Slice(NA), "Slice 'offset' cannot be NA")
+ expect_error(batch$Slice(10, "ten"))
+ expect_error(batch$Slice(10, NA_integer_), "Slice 'length' cannot be NA")
+ expect_error(batch$Slice(NA_integer_, NA_integer_), "Slice 'offset' cannot be NA")
+ expect_error(batch$Slice(c(10, 10)))
+ expect_error(batch$Slice(10, c(10, 10)))
+ expect_error(batch$Slice(1000), "Slice 'offset' greater than array length")
+ expect_error(batch$Slice(-1), "Slice 'offset' cannot be negative")
+ expect_error(batch4$Slice(10, 10), "Slice 'offset' greater than array length")
+ expect_error(batch$Slice(10, -1), "Slice 'length' cannot be negative")
+ expect_error(batch$Slice(-1, 10), "Slice 'offset' cannot be negative")
+})
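+
+# Note that $Slice() takes a zero-based offset and an optional length, unlike
+# R's one-based `[`: batch$Slice(5) above corresponds to tbl[6:10, ] and
+# batch$Slice(5, 2) to tbl[6:7, ].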
+
+test_that("[ on RecordBatch", {
+ expect_data_frame(batch[6:7, ], tbl[6:7, ])
+ expect_data_frame(batch[c(6, 7), ], tbl[6:7, ])
+ expect_data_frame(batch[6:7, 2:4], tbl[6:7, 2:4])
+ expect_data_frame(batch[, c("dbl", "fct")], tbl[, c(2, 5)])
+ expect_identical(as.vector(batch[, "chr", drop = TRUE]), tbl$chr)
+ expect_data_frame(batch[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4])
+ expect_data_frame(
+ batch[rep(c(FALSE, TRUE), 5), ],
+ tbl[c(2, 4, 6, 8, 10), ]
+ )
+ # bool Array
+ expect_data_frame(batch[batch$lgl, ], tbl[tbl$lgl, ])
+ # int Array
+ expect_data_frame(batch[Array$create(5:6), 2:4], tbl[6:7, 2:4])
+
+ # input validation
+ expect_error(batch[, c("dbl", "NOTACOLUMN")], 'Column not found: "NOTACOLUMN"')
+ expect_error(batch[, c(6, NA)], "Column indices cannot be NA")
+ expect_error(batch[, c(2, -2)], "Invalid column index")
+})
+
+test_that("[[ and $ on RecordBatch", {
+ expect_as_vector(batch[["int"]], tbl$int)
+ expect_as_vector(batch$int, tbl$int)
+ expect_as_vector(batch[[4]], tbl$chr)
+ expect_null(batch$qwerty)
+ expect_null(batch[["asdf"]])
+ expect_error(batch[[c(4, 3)]])
+ expect_error(batch[[NA]], "'i' must be character or numeric, not logical")
+ expect_error(batch[[NULL]], "'i' must be character or numeric, not NULL")
+ expect_error(batch[[c("asdf", "jkl;")]], "name is not a string", fixed = TRUE)
+})
+
+test_that("[[<- assignment", {
+ tbl <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10],
+ fct = factor(letters[1:10])
+ )
+ batch <- RecordBatch$create(tbl)
+
+ # can remove a column
+ batch[["chr"]] <- NULL
+ expect_data_frame(batch, tbl[-4])
+
+ # can remove a column by index
+ batch[[4]] <- NULL
+ expect_data_frame(batch, tbl[1:3])
+
+ # can add a named column
+ batch[["new"]] <- letters[10:1]
+ expect_data_frame(batch, dplyr::bind_cols(tbl[1:3], new = letters[10:1]))
+
+ # can replace a column by index
+ batch[[2]] <- as.numeric(10:1)
+ expect_as_vector(batch[[2]], as.numeric(10:1))
+
+ # can add a column by index
+ batch[[5]] <- as.numeric(10:1)
+ expect_as_vector(batch[[5]], as.numeric(10:1))
+ expect_as_vector(batch[["5"]], as.numeric(10:1))
+
+ # can replace a column
+ batch[["int"]] <- 10:1
+ expect_as_vector(batch[["int"]], 10:1)
+
+ # can use $
+ batch$new <- NULL
+ expect_null(as.vector(batch$new))
+ expect_identical(dim(batch), c(10L, 4L))
+
+ batch$int <- 1:10
+ expect_as_vector(batch$int, 1:10)
+
+ # recycling
+ batch[["atom"]] <- 1L
+ expect_as_vector(batch[["atom"]], rep(1L, 10))
+
+ expect_error(
+ batch[["atom"]] <- 1:6,
+ "Can't recycle input of size 6 to size 10."
+ )
+
+ # assign Arrow array
+ array <- Array$create(c(10:1))
+ batch$array <- array
+ expect_as_vector(batch$array, 10:1)
+
+ # nonsense indexes
+ expect_error(batch[[NA]] <- letters[10:1], "'i' must be character or numeric, not logical")
+ expect_error(batch[[NULL]] <- letters[10:1], "'i' must be character or numeric, not NULL")
+ expect_error(batch[[NA_integer_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE)
+ expect_error(batch[[NA_real_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE)
+ expect_error(batch[[NA_character_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE)
+ expect_error(batch[[c(1, 4)]] <- letters[10:1], "length(i) not equal to 1", fixed = TRUE)
+})
+
+test_that("head and tail on RecordBatch", {
+ tbl <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10],
+ fct = factor(letters[1:10])
+ )
+ batch <- RecordBatch$create(tbl)
+ expect_data_frame(head(batch), head(tbl))
+ expect_data_frame(head(batch, 4), head(tbl, 4))
+ expect_data_frame(head(batch, 40), head(tbl, 40))
+ expect_data_frame(head(batch, -4), head(tbl, -4))
+ expect_data_frame(head(batch, -40), head(tbl, -40))
+ expect_data_frame(tail(batch), tail(tbl))
+ expect_data_frame(tail(batch, 4), tail(tbl, 4))
+ expect_data_frame(tail(batch, 40), tail(tbl, 40))
+ expect_data_frame(tail(batch, -4), tail(tbl, -4))
+ expect_data_frame(tail(batch, -40), tail(tbl, -40))
+})
+
+test_that("RecordBatch print method", {
+ expect_output(
+ print(batch),
+ paste(
+ "RecordBatch",
+ "10 rows x 5 columns",
+ "$int <int32>",
+ "$dbl <double>",
+ "$lgl <bool>",
+ "$chr <string>",
+ "$fct <dictionary<values=string, indices=int8>>",
+ sep = "\n"
+ ),
+ fixed = TRUE
+ )
+})
+
+test_that("RecordBatch with 0 rows are supported", {
+ tbl <- tibble::tibble(
+ int = integer(),
+ dbl = numeric(),
+ lgl = logical(),
+ chr = character(),
+ fct = factor(character(), levels = c("a", "b"))
+ )
+
+ batch <- record_batch(tbl)
+ expect_equal(batch$num_columns, 5L)
+ expect_equal(batch$num_rows, 0L)
+ expect_equal(
+ batch$schema,
+ schema(
+ int = int32(),
+ dbl = float64(),
+ lgl = boolean(),
+ chr = utf8(),
+ fct = dictionary(int8(), utf8())
+ )
+ )
+})
+
+test_that("RecordBatch cast (ARROW-3741)", {
+ batch <- record_batch(x = 1:10, y = 1:10)
+
+ expect_error(batch$cast(schema(x = int32())))
+ expect_error(batch$cast(schema(x = int32(), z = int32())))
+
+ s2 <- schema(x = int16(), y = int64())
+ batch2 <- batch$cast(s2)
+ expect_equal(batch2$schema, s2)
+ expect_equal(batch2$column(0L)$type, int16())
+ expect_equal(batch2$column(1L)$type, int64())
+})
+
+test_that("record_batch() handles schema= argument", {
+ s <- schema(x = int32(), y = int32())
+ batch <- record_batch(x = 1:10, y = 1:10, schema = s)
+ expect_equal(s, batch$schema)
+
+ s <- schema(x = int32(), y = float64())
+ batch <- record_batch(x = 1:10, y = 1:10, schema = s)
+ expect_equal(s, batch$schema)
+
+ s <- schema(x = int32(), y = utf8())
+ expect_error(record_batch(x = 1:10, y = 1:10, schema = s))
+})
+
+test_that("record_batch(schema=) does some basic consistency checking of the schema", {
+ s <- schema(x = int32())
+ expect_error(record_batch(x = 1:10, y = 1:10, schema = s))
+ expect_error(record_batch(z = 1:10, schema = s))
+})
+
+test_that("RecordBatch dim() and nrow() (ARROW-3816)", {
+ batch <- record_batch(x = 1:10, y = 1:10)
+ expect_equal(dim(batch), c(10L, 2L))
+ expect_equal(nrow(batch), 10L)
+})
+
+test_that("record_batch() handles Array", {
+ batch <- record_batch(x = 1:10, y = Array$create(1:10))
+ expect_equal(batch$schema, schema(x = int32(), y = int32()))
+})
+
+test_that("record_batch() handles data frame columns", {
+ tib <- tibble::tibble(x = 1:10, y = 1:10)
+ # because tib is named here, this becomes a struct array
+ batch <- record_batch(a = 1:10, b = tib)
+ expect_equal(
+ batch$schema,
+ schema(
+ a = int32(),
+ b = struct(x = int32(), y = int32())
+ )
+ )
+ out <- as.data.frame(batch)
+ expect_equal(out, tibble::tibble(a = 1:10, b = tib))
+
+ # if not named, columns from tib are auto spliced
+ batch2 <- record_batch(a = 1:10, tib)
+ expect_equal(
+ batch2$schema,
+ schema(a = int32(), x = int32(), y = int32())
+ )
+ out <- as.data.frame(batch2)
+ expect_equal(out, tibble::tibble(a = 1:10, !!!tib))
+})
+
+test_that("record_batch() handles data frame columns with schema spec", {
+ tib <- tibble::tibble(x = 1:10, y = 1:10)
+ tib_float <- tib
+ tib_float$y <- as.numeric(tib_float$y)
+ schema <- schema(a = int32(), b = struct(x = int16(), y = float64()))
+ batch <- record_batch(a = 1:10, b = tib, schema = schema)
+ expect_equal(batch$schema, schema)
+ out <- as.data.frame(batch)
+ expect_equal(out, tibble::tibble(a = 1:10, b = tib_float))
+
+ schema <- schema(a = int32(), b = struct(x = int16(), y = utf8()))
+ expect_error(record_batch(a = 1:10, b = tib, schema = schema))
+})
+
+test_that("record_batch() auto splices (ARROW-5718)", {
+ df <- tibble::tibble(x = 1:10, y = letters[1:10])
+ batch1 <- record_batch(df)
+ batch2 <- record_batch(!!!df)
+ expect_equal(batch1, batch2)
+ expect_equal(batch1$schema, schema(x = int32(), y = utf8()))
+ expect_data_frame(batch1, df)
+
+ batch3 <- record_batch(df, z = 1:10)
+ batch4 <- record_batch(!!!df, z = 1:10)
+ expect_equal(batch3, batch4)
+ expect_equal(batch3$schema, schema(x = int32(), y = utf8(), z = int32()))
+ expect_equal(
+ as.data.frame(batch3),
+ tibble::as_tibble(cbind(df, data.frame(z = 1:10)))
+ )
+
+ s <- schema(x = float64(), y = utf8())
+ batch5 <- record_batch(df, schema = s)
+ batch6 <- record_batch(!!!df, schema = s)
+ expect_equal(batch5, batch6)
+ expect_equal(batch5$schema, s)
+ expect_equal(as.data.frame(batch5), df)
+
+ s2 <- schema(x = float64(), y = utf8(), z = int16())
+ batch7 <- record_batch(df, z = 1:10, schema = s2)
+ batch8 <- record_batch(!!!df, z = 1:10, schema = s2)
+ expect_equal(batch7, batch8)
+ expect_equal(batch7$schema, s2)
+ expect_equal(
+ as.data.frame(batch7),
+ tibble::as_tibble(cbind(df, data.frame(z = 1:10)))
+ )
+})
+
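+# `!!!` is rlang's splice operator: record_batch(!!!df) passes each column of
+# df as its own named argument, which is why it builds the same batch as
+# record_batch(df) above.
+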
+test_that("record_batch() only auto splice data frames", {
+ expect_error(
+ record_batch(1:10),
+ regexp = "only data frames are allowed as unnamed arguments to be auto spliced"
+ )
+})
+
+test_that("record_batch() handles null type (ARROW-7064)", {
+ batch <- record_batch(a = 1:10, n = vctrs::unspecified(10))
+ expect_equal(
+ batch$schema,
+ schema(a = int32(), n = null()),
+ ignore_attr = TRUE
+ )
+})
+
+test_that("record_batch() scalar recycling with vectors", {
+ expect_data_frame(
+ record_batch(a = 1:10, b = 5),
+ tibble::tibble(a = 1:10, b = 5)
+ )
+})
+
+test_that("record_batch() scalar recycling with Scalars, Arrays, and ChunkedArrays", {
+ expect_data_frame(
+ record_batch(a = Array$create(1:10), b = Scalar$create(5)),
+ tibble::tibble(a = 1:10, b = 5)
+ )
+
+ expect_data_frame(
+ record_batch(a = Array$create(1:10), b = Array$create(5)),
+ tibble::tibble(a = 1:10, b = 5)
+ )
+
+ expect_data_frame(
+ record_batch(a = Array$create(1:10), b = ChunkedArray$create(5)),
+ tibble::tibble(a = 1:10, b = 5)
+ )
+})
+
+test_that("record_batch() no recycling with tibbles", {
+ expect_error(
+ record_batch(
+ tibble::tibble(a = 1:10),
+ tibble::tibble(a = 1, b = 5)
+ ),
+ regexp = "All input tibbles or data.frames must have the same number of rows"
+ )
+
+ expect_error(
+ record_batch(
+ tibble::tibble(a = 1:10),
+ tibble::tibble(a = 1)
+ ),
+ regexp = "All input tibbles or data.frames must have the same number of rows"
+ )
+})
+
+test_that("RecordBatch$Equals", {
+ df <- tibble::tibble(x = 1:10, y = letters[1:10])
+ a <- record_batch(df)
+ b <- record_batch(df)
+ expect_equal(a, b)
+ expect_true(a$Equals(b))
+ expect_false(a$Equals(df))
+})
+
+test_that("RecordBatch$Equals(check_metadata)", {
+ df <- tibble::tibble(x = 1:2, y = c("a", "b"))
+ rb1 <- record_batch(df)
+ rb2 <- record_batch(df, schema = rb1$schema$WithMetadata(list(some = "metadata")))
+
+ expect_r6_class(rb1, "RecordBatch")
+ expect_r6_class(rb2, "RecordBatch")
+ expect_false(rb1$schema$HasMetadata)
+ expect_true(rb2$schema$HasMetadata)
+ expect_identical(rb2$schema$metadata, list(some = "metadata"))
+
+ expect_true(rb1 == rb2)
+ expect_true(rb1$Equals(rb2))
+ expect_false(rb1$Equals(rb2, check_metadata = TRUE))
+
+ expect_failure(expect_equal(rb1, rb2)) # expect_equal has check_metadata=TRUE
+ expect_equal(rb1, rb2, ignore_attr = TRUE) # this passes check_metadata=FALSE
+
+ expect_false(rb1$Equals(24)) # Not a RecordBatch
+})
+
+test_that("RecordBatch name assignment", {
+ rb <- record_batch(x = 1:10, y = 1:10)
+ expect_identical(names(rb), c("x", "y"))
+ names(rb) <- c("a", "b")
+ expect_identical(names(rb), c("a", "b"))
+ expect_error(names(rb) <- "f")
+ expect_error(names(rb) <- letters)
+ expect_error(names(rb) <- character(0))
+ expect_error(names(rb) <- NULL)
+ expect_error(names(rb) <- c(TRUE, FALSE))
+})
+
+test_that("record_batch() with different length arrays", {
+ msg <- "All arrays must have the same length"
+ expect_error(record_batch(a = 1:5, b = 1:6), msg)
+})
+
+test_that("Handling string data with embedded nuls", {
+ raws <- Array$create(structure(list(
+ as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)),
+ as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)),
+ as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00
+ as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)),
+ as.raw(c(0x74, 0x76))
+ ),
+ class = c("arrow_binary", "vctrs_vctr", "list")
+ ))
+ batch_with_nul <- record_batch(a = 1:5, b = raws)
+ batch_with_nul$b <- batch_with_nul$b$cast(utf8())
+
+  # The behavior of the warnings/errors is slightly different with and without
+  # altrep. Without it (i.e. on R 3.5.0 and below), the error triggers
+  # immediately on `as.vector()`; with it, the error only happens on
+  # materialization.
+ skip_if_r_version("3.5.0")
+ df <- as.data.frame(batch_with_nul)
+
+ expect_error(
+ df$b[],
+ paste0(
+ "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, ",
+ "set options(arrow.skip_nul = TRUE)"
+ ),
+ fixed = TRUE
+ )
+
+ batch_with_nul <- record_batch(a = 1:5, b = raws)
+ batch_with_nul$b <- batch_with_nul$b$cast(utf8())
+
+ withr::with_options(list(arrow.skip_nul = TRUE), {
+ expect_warning(
+ expect_equal(
+ as.data.frame(batch_with_nul)$b,
+ c("person", "woman", "man", "camera", "tv"),
+ ignore_attr = TRUE
+ ),
+ "Stripping '\\0' (nul) from character vector",
+ fixed = TRUE
+ )
+ })
+})
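+
+# For reference, user code opts into nul-stripping globally with
+# options(arrow.skip_nul = TRUE); the conversion then warns instead of erroring.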
+
+test_that("ARROW-11769/ARROW-13860 - grouping preserved in record batch creation", {
+ skip_if_not_available("dataset")
+ library(dplyr, warn.conflicts = FALSE)
+
+ tbl <- tibble::tibble(
+ int = 1:10,
+ fct = factor(rep(c("A", "B"), 5)),
+ fct2 = factor(rep(c("C", "D"), each = 5)),
+ )
+
+ expect_r6_class(
+ tbl %>%
+ group_by(fct, fct2) %>%
+ record_batch(),
+ "RecordBatch"
+ )
+ expect_identical(
+ tbl %>%
+ group_by(fct, fct2) %>%
+ record_batch() %>%
+ group_vars(),
+ c("fct", "fct2")
+ )
+ expect_identical(
+ tbl %>%
+ group_by(fct, fct2) %>%
+ record_batch() %>%
+ ungroup() %>%
+ group_vars(),
+ NULL
+ )
+ expect_identical(
+ tbl %>%
+ group_by(fct, fct2) %>%
+ record_batch() %>%
+ select(-int) %>%
+ group_vars(),
+ c("fct", "fct2")
+ )
+})
+
+test_that("ARROW-12729 - length returns number of columns in RecordBatch", {
+ tbl <- tibble::tibble(
+ int = 1:10,
+ fct = factor(rep(c("A", "B"), 5)),
+ fct2 = factor(rep(c("C", "D"), each = 5)),
+ )
+
+ rb <- record_batch(!!!tbl)
+
+ expect_identical(length(rb), 3L)
+})
+
+test_that("RecordBatchReader to C-interface", {
+ skip_if_not_available("dataset")
+
+ tab <- Table$create(example_data)
+
+ # export the RecordBatchReader via the C-interface
+ stream_ptr <- allocate_arrow_array_stream()
+ scan <- Scanner$create(tab)
+ reader <- scan$ToRecordBatchReader()
+ reader$export_to_c(stream_ptr)
+
+ # then import it and check that the roundtripped value is the same
+ circle <- RecordBatchStreamReader$import_from_c(stream_ptr)
+ tab_from_c_new <- circle$read_table()
+ expect_equal(tab, tab_from_c_new)
+
+ # must clean up the pointer or we leak
+ delete_arrow_array_stream(stream_ptr)
+
+ # export the RecordBatchStreamReader via the C-interface
+ stream_ptr_new <- allocate_arrow_array_stream()
+ bytes <- write_to_raw(example_data)
+ expect_type(bytes, "raw")
+ reader_new <- RecordBatchStreamReader$create(bytes)
+ reader_new$export_to_c(stream_ptr_new)
+
+ # then import it and check that the roundtripped value is the same
+ circle_new <- RecordBatchStreamReader$import_from_c(stream_ptr_new)
+ tab_from_c_new <- circle_new$read_table()
+ expect_equal(tab, tab_from_c_new)
+
+ # must clean up the pointer or we leak
+ delete_arrow_array_stream(stream_ptr_new)
+})
+
+test_that("RecordBatch to C-interface", {
+ batch <- RecordBatch$create(example_data)
+
+ # export the RecordBatch via the C-interface
+ schema_ptr <- allocate_arrow_schema()
+ array_ptr <- allocate_arrow_array()
+ batch$export_to_c(array_ptr, schema_ptr)
+
+ # then import it and check that the roundtripped value is the same
+ circle <- RecordBatch$import_from_c(array_ptr, schema_ptr)
+  expect_equal(batch, circle)
+
+ # must clean up the pointers or we leak
+ delete_arrow_schema(schema_ptr)
+ delete_arrow_array(array_ptr)
+})
+
+test_that("RecordBatchReader to C-interface to arrow_dplyr_query", {
+ skip_if_not_available("dataset")
+
+ tab <- Table$create(example_data)
+
+ # export the RecordBatchReader via the C-interface
+ stream_ptr <- allocate_arrow_array_stream()
+ scan <- Scanner$create(tab)
+ reader <- scan$ToRecordBatchReader()
+ reader$export_to_c(stream_ptr)
+
+ # then import it and check that the roundtripped value is the same
+ circle <- RecordBatchStreamReader$import_from_c(stream_ptr)
+
+ # create an arrow_dplyr_query() from the recordbatch reader
+ reader_adq <- arrow_dplyr_query(circle)
+
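+  # compute() materializes the query as an Arrow Table; collect() would return
+  # a tibble instead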
+ tab_from_c_new <- reader_adq %>%
+ dplyr::compute()
+ expect_equal(tab_from_c_new, tab)
+
+ # must clean up the pointer or we leak
+ delete_arrow_array_stream(stream_ptr)
+})
diff --git a/src/arrow/r/tests/testthat/test-Table.R b/src/arrow/r/tests/testthat/test-Table.R
new file mode 100644
index 000000000..44144c00b
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-Table.R
@@ -0,0 +1,549 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("read_table handles various input streams (ARROW-3450, ARROW-3505)", {
+ tbl <- tibble::tibble(
+ int = 1:10, dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10]
+ )
+ tab <- Table$create(!!!tbl)
+
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ expect_deprecated(
+ write_arrow(tab, tf),
+ "write_feather"
+ )
+
+ tab1 <- read_feather(tf, as_data_frame = FALSE)
+ tab2 <- read_feather(normalizePath(tf), as_data_frame = FALSE)
+
+ readable_file <- ReadableFile$create(tf)
+ expect_deprecated(
+ tab3 <- read_arrow(readable_file, as_data_frame = FALSE),
+ "read_feather"
+ )
+ readable_file$close()
+
+ mmap_file <- mmap_open(tf)
+ mmap_file$close()
+
+ expect_equal(tab, tab1)
+ expect_equal(tab, tab2)
+ expect_equal(tab, tab3)
+})
+
+test_that("Table cast (ARROW-3741)", {
+ tab <- Table$create(x = 1:10, y = 1:10)
+
+ expect_error(tab$cast(schema(x = int32())))
+ expect_error(tab$cast(schema(x = int32(), z = int32())))
+
+ s2 <- schema(x = int16(), y = int64())
+ tab2 <- tab$cast(s2)
+ expect_equal(tab2$schema, s2)
+ expect_equal(tab2$column(0L)$type, int16())
+ expect_equal(tab2$column(1L)$type, int64())
+})
+
+test_that("Table S3 methods", {
+ tab <- Table$create(example_data)
+ for (f in c("dim", "nrow", "ncol", "dimnames", "colnames", "row.names", "as.list")) {
+ fun <- get(f)
+ expect_identical(fun(tab), fun(example_data), info = f)
+ }
+})
+
+test_that("Table $column and $field", {
+ tab <- Table$create(x = 1:10, y = 1:10)
+
+ expect_equal(tab$field(0), field("x", int32()))
+
+ # input validation
+ expect_error(tab$column(NA), "'i' cannot be NA")
+ expect_error(tab$column(-1), "subscript out of bounds")
+ expect_error(tab$column(1000), "subscript out of bounds")
+ expect_error(tab$column(1:2))
+ expect_error(tab$column("one"))
+
+ expect_error(tab$field(NA), "'i' cannot be NA")
+ expect_error(tab$field(-1), "subscript out of bounds")
+ expect_error(tab$field(1000), "subscript out of bounds")
+ expect_error(tab$field(1:2))
+ expect_error(tab$field("one"))
+})
+
+test_that("[, [[, $ for Table", {
+ tbl <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10],
+ fct = factor(letters[1:10])
+ )
+ tab <- Table$create(tbl)
+
+ expect_identical(names(tab), names(tbl))
+
+ expect_data_frame(tab[6:7, ], tbl[6:7, ])
+ expect_data_frame(tab[6:7, 2:4], tbl[6:7, 2:4])
+ expect_data_frame(tab[, c("dbl", "fct")], tbl[, c(2, 5)])
+ expect_as_vector(tab[, "chr", drop = TRUE], tbl$chr)
+ # Take within a single chunk
+ expect_data_frame(tab[c(7, 3, 5), 2:4], tbl[c(7, 3, 5), 2:4])
+ expect_data_frame(tab[rep(c(FALSE, TRUE), 5), ], tbl[c(2, 4, 6, 8, 10), ])
+ # bool ChunkedArray (with one chunk)
+ expect_data_frame(tab[tab$lgl, ], tbl[tbl$lgl, ])
+ # ChunkedArray with multiple chunks
+ c1 <- c(TRUE, FALSE, TRUE, TRUE, FALSE)
+ c2 <- c(FALSE, FALSE, TRUE, TRUE, FALSE)
+ ca <- ChunkedArray$create(c1, c2)
+ expect_data_frame(tab[ca, ], tbl[c(1, 3, 4, 8, 9), ])
+ # int Array
+ expect_data_frame(tab[Array$create(5:6), 2:4], tbl[6:7, 2:4])
+ # ChunkedArray
+ expect_data_frame(tab[ChunkedArray$create(5L, 6L), 2:4], tbl[6:7, 2:4])
+ # Expression
+ expect_data_frame(tab[tab$int > 6, ], tbl[tbl$int > 6, ])
+
+ expect_as_vector(tab[["int"]], tbl$int)
+ expect_as_vector(tab$int, tbl$int)
+ expect_as_vector(tab[[4]], tbl$chr)
+ expect_null(tab$qwerty)
+ expect_null(tab[["asdf"]])
+ # List-like column slicing
+ expect_data_frame(tab[2:4], tbl[2:4])
+ expect_data_frame(tab[c(2, 1)], tbl[c(2, 1)])
+ expect_data_frame(tab[-3], tbl[-3])
+
+ expect_error(tab[[c(4, 3)]])
+ expect_error(tab[[NA]], "'i' must be character or numeric, not logical")
+ expect_error(tab[[NULL]], "'i' must be character or numeric, not NULL")
+ expect_error(tab[[c("asdf", "jkl;")]], "length(name) not equal to 1", fixed = TRUE)
+ expect_error(tab[-3:3], "Invalid column index")
+ expect_error(tab[1000], "Invalid column index")
+ expect_error(tab[1:1000], "Invalid column index")
+
+ # input validation
+ expect_error(tab[, c("dbl", "NOTACOLUMN")], 'Column not found: "NOTACOLUMN"')
+ expect_error(tab[, c(6, NA)], "Column indices cannot be NA")
+
+ skip("Table with 0 cols doesn't know how many rows it should have")
+ expect_data_frame(tab[0], tbl[0])
+})
+
+test_that("[[<- assignment", {
+ tbl <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10],
+ fct = factor(letters[1:10])
+ )
+ tab <- Table$create(tbl)
+
+ # can remove a column
+ tab[["chr"]] <- NULL
+ expect_data_frame(tab, tbl[-4])
+
+ # can remove a column by index
+ tab[[4]] <- NULL
+ expect_data_frame(tab, tbl[1:3])
+
+ # can add a named column
+ tab[["new"]] <- letters[10:1]
+ expect_data_frame(tab, dplyr::bind_cols(tbl[1:3], new = letters[10:1]))
+
+ # can replace a column by index
+ tab[[2]] <- as.numeric(10:1)
+ expect_as_vector(tab[[2]], as.numeric(10:1))
+
+ # can add a column by index
+ tab[[5]] <- as.numeric(10:1)
+ expect_as_vector(tab[[5]], as.numeric(10:1))
+ expect_as_vector(tab[["5"]], as.numeric(10:1))
+
+ # can replace a column
+ tab[["int"]] <- 10:1
+ expect_as_vector(tab[["int"]], 10:1)
+
+ # can use $
+ tab$new <- NULL
+ expect_null(as.vector(tab$new))
+ expect_identical(dim(tab), c(10L, 4L))
+
+ tab$int <- 1:10
+ expect_as_vector(tab$int, 1:10)
+
+ # recycling
+ tab[["atom"]] <- 1L
+ expect_as_vector(tab[["atom"]], rep(1L, 10))
+
+ expect_error(
+ tab[["atom"]] <- 1:6,
+ "Can't recycle input of size 6 to size 10."
+ )
+
+ # assign Arrow array and chunked_array
+ array <- Array$create(c(10:1))
+ tab$array <- array
+ expect_as_vector(tab$array, 10:1)
+
+ tab$chunked <- chunked_array(1:10)
+ expect_as_vector(tab$chunked, 1:10)
+
+ # nonsense indexes
+ expect_error(tab[[NA]] <- letters[10:1], "'i' must be character or numeric, not logical")
+ expect_error(tab[[NULL]] <- letters[10:1], "'i' must be character or numeric, not NULL")
+ expect_error(tab[[NA_integer_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE)
+ expect_error(tab[[NA_real_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE)
+ expect_error(tab[[NA_character_]] <- letters[10:1], "!is.na(i) is not TRUE", fixed = TRUE)
+ expect_error(tab[[c(1, 4)]] <- letters[10:1], "length(i) not equal to 1", fixed = TRUE)
+})
+
+test_that("Table$Slice", {
+ tbl <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10],
+ fct = factor(letters[1:10])
+ )
+ tab <- Table$create(tbl)
+ tab2 <- tab$Slice(5)
+ expect_data_frame(tab2, tbl[6:10, ])
+
+ tab3 <- tab$Slice(5, 2)
+ expect_data_frame(tab3, tbl[6:7, ])
+
+ # Input validation
+ expect_error(tab$Slice("ten"))
+ expect_error(tab$Slice(NA_integer_), "Slice 'offset' cannot be NA")
+ expect_error(tab$Slice(NA), "Slice 'offset' cannot be NA")
+ expect_error(tab$Slice(10, "ten"))
+ expect_error(tab$Slice(10, NA_integer_), "Slice 'length' cannot be NA")
+ expect_error(tab$Slice(NA_integer_, NA_integer_), "Slice 'offset' cannot be NA")
+ expect_error(tab$Slice(c(10, 10)))
+ expect_error(tab$Slice(10, c(10, 10)))
+ expect_error(tab$Slice(1000), "Slice 'offset' greater than array length")
+ expect_error(tab$Slice(-1), "Slice 'offset' cannot be negative")
+ expect_error(tab3$Slice(10, 10), "Slice 'offset' greater than array length")
+ expect_error(tab$Slice(10, -1), "Slice 'length' cannot be negative")
+ expect_error(tab$Slice(-1, 10), "Slice 'offset' cannot be negative")
+})
+
+test_that("head and tail on Table", {
+ tbl <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10],
+ fct = factor(letters[1:10])
+ )
+ tab <- Table$create(tbl)
+
+ expect_data_frame(head(tab), head(tbl))
+ expect_data_frame(head(tab, 4), head(tbl, 4))
+ expect_data_frame(head(tab, 40), head(tbl, 40))
+ expect_data_frame(head(tab, -4), head(tbl, -4))
+ expect_data_frame(head(tab, -40), head(tbl, -40))
+ expect_data_frame(tail(tab), tail(tbl))
+ expect_data_frame(tail(tab, 4), tail(tbl, 4))
+ expect_data_frame(tail(tab, 40), tail(tbl, 40))
+ expect_data_frame(tail(tab, -4), tail(tbl, -4))
+ expect_data_frame(tail(tab, -40), tail(tbl, -40))
+})
+
+test_that("Table print method", {
+ expect_output(
+ print(tab),
+ paste(
+ "Table",
+ "10 rows x 5 columns",
+ "$int <int32>",
+ "$dbl <double>",
+ "$lgl <bool>",
+ "$chr <string>",
+ "$fct <dictionary<values=string, indices=int8>>",
+ sep = "\n"
+ ),
+ fixed = TRUE
+ )
+})
+
+test_that("table active bindings", {
+ tbl <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10],
+ fct = factor(letters[1:10])
+ )
+ tab <- Table$create(tbl)
+
+ expect_identical(dim(tbl), dim(tab))
+ expect_type(tab$columns, "list")
+ expect_equal(tab$columns[[1]], tab[[1]])
+})
+
+test_that("table() handles record batches with splicing", {
+ batch <- record_batch(x = 1:2, y = letters[1:2])
+ tab <- Table$create(batch, batch, batch)
+ expect_equal(tab$schema, batch$schema)
+ expect_equal(tab$num_rows, 6L)
+ expect_equal(
+ as.data.frame(tab),
+ vctrs::vec_rbind(as.data.frame(batch), as.data.frame(batch), as.data.frame(batch))
+ )
+
+ batches <- list(batch, batch, batch)
+ tab <- Table$create(!!!batches)
+ expect_equal(tab$schema, batch$schema)
+ expect_equal(tab$num_rows, 6L)
+ expect_equal(
+ as.data.frame(tab),
+ vctrs::vec_rbind(!!!purrr::map(batches, as.data.frame))
+ )
+})
+
+test_that("table() handles ... of arrays, chunked arrays, vectors", {
+ a <- Array$create(1:10)
+ ca <- chunked_array(1:5, 6:10)
+ v <- rnorm(10)
+ tbl <- tibble::tibble(x = 1:10, y = letters[1:10])
+
+ tab <- Table$create(a = a, b = ca, c = v, !!!tbl)
+ expect_equal(
+ tab$schema,
+ schema(a = int32(), b = int32(), c = float64(), x = int32(), y = utf8())
+ )
+ res <- as.data.frame(tab)
+ expect_equal(names(res), c("a", "b", "c", "x", "y"))
+ expect_equal(
+ res,
+ tibble::tibble(a = 1:10, b = 1:10, c = v, x = 1:10, y = letters[1:10])
+ )
+})
+
+test_that("table() auto splices (ARROW-5718)", {
+ df <- tibble::tibble(x = 1:10, y = letters[1:10])
+
+ tab1 <- Table$create(df)
+ tab2 <- Table$create(!!!df)
+ expect_equal(tab1, tab2)
+ expect_equal(tab1$schema, schema(x = int32(), y = utf8()))
+ expect_equal(as.data.frame(tab1), df)
+
+ s <- schema(x = float64(), y = utf8())
+ tab3 <- Table$create(df, schema = s)
+ tab4 <- Table$create(!!!df, schema = s)
+ expect_equal(tab3, tab4)
+ expect_equal(tab3$schema, s)
+ expect_equal(as.data.frame(tab3), df)
+})
+
+test_that("Validation when creating table with schema (ARROW-10953)", {
+ expect_error(
+ Table$create(data.frame(), schema = schema(a = int32())),
+ "incompatible. schema has 1 fields, and 0 columns are supplied",
+ fixed = TRUE
+ )
+ expect_error(
+ Table$create(data.frame(b = 1), schema = schema(a = int32())),
+ "field at index 1 has name 'a' != 'b'",
+ fixed = TRUE
+ )
+ expect_error(
+ Table$create(data.frame(b = 2, c = 3), schema = schema(a = int32())),
+ "incompatible. schema has 1 fields, and 2 columns are supplied",
+ fixed = TRUE
+ )
+})
+
+test_that("==.Table", {
+ tab1 <- Table$create(x = 1:2, y = c("a", "b"))
+ tab2 <- Table$create(x = 1:2, y = c("a", "b"))
+ tab3 <- Table$create(x = 1:2)
+ tab4 <- Table$create(x = 1:2, y = c("a", "b"), z = 3:4)
+
+ expect_true(tab1 == tab2)
+ expect_true(tab2 == tab1)
+
+ expect_false(tab1 == tab3)
+ expect_false(tab3 == tab1)
+
+ expect_false(tab1 == tab4)
+ expect_false(tab4 == tab1)
+
+ expect_true(all.equal(tab1, tab2))
+ expect_equal(tab1, tab2)
+})
+
+test_that("Table$Equals(check_metadata)", {
+ tab1 <- Table$create(x = 1:2, y = c("a", "b"))
+ tab2 <- Table$create(
+ x = 1:2, y = c("a", "b"),
+ schema = tab1$schema$WithMetadata(list(some = "metadata"))
+ )
+
+ expect_r6_class(tab1, "Table")
+ expect_r6_class(tab2, "Table")
+ expect_false(tab1$schema$HasMetadata)
+ expect_true(tab2$schema$HasMetadata)
+ expect_identical(tab2$schema$metadata, list(some = "metadata"))
+
+ expect_true(tab1 == tab2)
+ expect_true(tab1$Equals(tab2))
+ expect_false(tab1$Equals(tab2, check_metadata = TRUE))
+
+ expect_failure(expect_equal(tab1, tab2)) # expect_equal has check_metadata=TRUE
+ expect_equal(tab1, tab2, ignore_attr = TRUE) # this sets check_metadata=FALSE
+
+ expect_false(tab1$Equals(24)) # Not a Table
+})
+
+test_that("Table handles null type (ARROW-7064)", {
+ tab <- Table$create(a = 1:10, n = vctrs::unspecified(10))
+ expect_equal(tab$schema, schema(a = int32(), n = null()), ignore_attr = TRUE)
+})
+
+test_that("Can create table with specific dictionary types", {
+ fact <- example_data[, "fct"]
+ int_types <- c(int8(), int16(), int32(), int64())
+ # TODO: test uint types when format allows
+ # uint_types <- c(uint8(), uint16(), uint32(), uint64()) # nolint
+ for (i in int_types) {
+ sch <- schema(fct = dictionary(i, utf8()))
+ tab <- Table$create(fact, schema = sch)
+ expect_equal(sch, tab$schema)
+ if (i != int64()) {
+ # TODO: same downcast to int32 as we do for int64() type elsewhere
+ expect_identical(as.data.frame(tab), fact)
+ }
+ }
+})
+
+test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", {
+ b1 <- record_batch(f = factor(c("a"), levels = c("a", "b")))
+ b2 <- record_batch(f = factor(c("c"), levels = c("c", "d")))
+ b3 <- record_batch(f = factor(NA, levels = "a"))
+ b4 <- record_batch(f = factor())
+
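+  # conversion unifies the chunks' dictionaries into a single set of factor
+  # levels, in order of first appearance across the chunks: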
+ res <- tibble::tibble(f = factor(c("a", "c", NA), levels = c("a", "b", "c", "d")))
+ tab <- Table$create(b1, b2, b3, b4)
+
+ expect_identical(as.data.frame(tab), res)
+})
+
+test_that("Table$SelectColumns()", {
+ tab <- Table$create(x = 1:10, y = 1:10)
+
+ expect_equal(tab$SelectColumns(0L), Table$create(x = 1:10))
+
+ expect_error(tab$SelectColumns(2:4))
+ expect_error(tab$SelectColumns(""))
+})
+
+test_that("Table name assignment", {
+ tab <- Table$create(x = 1:10, y = 1:10)
+ expect_identical(names(tab), c("x", "y"))
+ names(tab) <- c("a", "b")
+ expect_identical(names(tab), c("a", "b"))
+ expect_error(names(tab) <- "f")
+ expect_error(names(tab) <- letters)
+ expect_error(names(tab) <- character(0))
+ expect_error(names(tab) <- NULL)
+ expect_error(names(tab) <- c(TRUE, FALSE))
+})
+
+test_that("Table$create() with different length columns", {
+ msg <- "All columns must have the same length"
+ expect_error(Table$create(a = 1:5, b = 1:6), msg)
+})
+
+test_that("Table$create() scalar recycling with vectors", {
+ expect_data_frame(
+ Table$create(a = 1:10, b = 5),
+ tibble::tibble(a = 1:10, b = 5)
+ )
+})
+
+test_that("Table$create() scalar recycling with Scalars, Arrays, and ChunkedArrays", {
+ expect_data_frame(
+ Table$create(a = Array$create(1:10), b = Scalar$create(5)),
+ tibble::tibble(a = 1:10, b = 5)
+ )
+
+ expect_data_frame(
+ Table$create(a = Array$create(1:10), b = Array$create(5)),
+ tibble::tibble(a = 1:10, b = 5)
+ )
+
+ expect_data_frame(
+ Table$create(a = Array$create(1:10), b = ChunkedArray$create(5)),
+ tibble::tibble(a = 1:10, b = 5)
+ )
+})
+
+test_that("Table$create() no recycling with tibbles", {
+ expect_error(
+ Table$create(
+ tibble::tibble(a = 1:10, b = 5),
+ tibble::tibble(a = 1, b = 5)
+ ),
+ regexp = "All input tibbles or data.frames must have the same number of rows"
+ )
+
+ expect_error(
+ Table$create(
+ tibble::tibble(a = 1:10, b = 5),
+ tibble::tibble(a = 1)
+ ),
+ regexp = "All input tibbles or data.frames must have the same number of rows"
+ )
+})
+
+test_that("ARROW-11769 - grouping preserved in table creation", {
+ skip_if_not_available("dataset")
+
+ tbl <- tibble::tibble(
+ int = 1:10,
+ fct = factor(rep(c("A", "B"), 5)),
+ fct2 = factor(rep(c("C", "D"), each = 5)),
+ )
+
+ expect_identical(
+ tbl %>%
+ dplyr::group_by(fct, fct2) %>%
+ Table$create() %>%
+ dplyr::group_vars(),
+ c("fct", "fct2")
+ )
+})
+
+test_that("ARROW-12729 - length returns number of columns in Table", {
+ tbl <- tibble::tibble(
+ int = 1:10,
+ fct = factor(rep(c("A", "B"), 5)),
+ fct2 = factor(rep(c("C", "D"), each = 5)),
+ )
+
+ tab <- Table$create(!!!tbl)
+
+ expect_identical(length(tab), 3L)
+})
diff --git a/src/arrow/r/tests/testthat/test-altrep.R b/src/arrow/r/tests/testthat/test-altrep.R
new file mode 100644
index 000000000..dff369438
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-altrep.R
@@ -0,0 +1,243 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_r_version("3.5.0")
+
+test_that("is_arrow_altrep() does not include base altrep", {
+ expect_false(is_arrow_altrep(1:10))
+})
+
+test_that("altrep vectors from int32 and dbl arrays with no nulls", {
+ withr::local_options(list(arrow.use_altrep = TRUE))
+ v_int <- Array$create(1:1000)
+ v_dbl <- Array$create(as.numeric(1:1000))
+ c_int <- ChunkedArray$create(1:1000)
+ c_dbl <- ChunkedArray$create(as.numeric(1:1000))
+
+ expect_true(is_arrow_altrep(as.vector(v_int)))
+ expect_true(is_arrow_altrep(as.vector(v_int$Slice(1))))
+ expect_true(is_arrow_altrep(as.vector(v_dbl)))
+ expect_true(is_arrow_altrep(as.vector(v_dbl$Slice(1))))
+
+ expect_equal(c_int$num_chunks, 1L)
+ expect_true(is_arrow_altrep(as.vector(c_int)))
+ expect_true(is_arrow_altrep(as.vector(c_int$Slice(1))))
+
+ expect_equal(c_dbl$num_chunks, 1L)
+ expect_true(is_arrow_altrep(as.vector(c_dbl)))
+ expect_true(is_arrow_altrep(as.vector(c_dbl$Slice(1))))
+
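+  # leaving the option unset restores the default, which is altrep on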
+ withr::local_options(list(arrow.use_altrep = NULL))
+ expect_true(is_arrow_altrep(as.vector(v_int)))
+ expect_true(is_arrow_altrep(as.vector(v_int$Slice(1))))
+ expect_true(is_arrow_altrep(as.vector(v_dbl)))
+ expect_true(is_arrow_altrep(as.vector(v_dbl$Slice(1))))
+
+ withr::local_options(list(arrow.use_altrep = FALSE))
+ expect_false(is_arrow_altrep(as.vector(v_int)))
+ expect_false(is_arrow_altrep(as.vector(v_int$Slice(1))))
+ expect_false(is_arrow_altrep(as.vector(v_dbl)))
+ expect_false(is_arrow_altrep(as.vector(v_dbl$Slice(1))))
+})
+
+test_that("altrep vectors from int32 and dbl arrays with nulls", {
+ withr::local_options(list(arrow.use_altrep = TRUE))
+ v_int <- Array$create(c(1L, NA, 3L))
+ v_dbl <- Array$create(c(1, NA, 3))
+ c_int <- ChunkedArray$create(c(1L, NA, 3L))
+ c_dbl <- ChunkedArray$create(c(1, NA, 3))
+
+ expect_true(is_arrow_altrep(as.vector(v_int)))
+ expect_true(is_arrow_altrep(as.vector(v_int$Slice(1))))
+ expect_true(is_arrow_altrep(as.vector(v_dbl)))
+ expect_true(is_arrow_altrep(as.vector(v_dbl$Slice(1))))
+ expect_true(is_arrow_altrep(as.vector(c_int)))
+ expect_true(is_arrow_altrep(as.vector(c_int$Slice(1))))
+ expect_true(is_arrow_altrep(as.vector(c_dbl)))
+ expect_true(is_arrow_altrep(as.vector(c_dbl$Slice(1))))
+
+ expect_true(is_arrow_altrep(as.vector(v_int$Slice(2))))
+ expect_true(is_arrow_altrep(as.vector(v_dbl$Slice(2))))
+ expect_true(is_arrow_altrep(as.vector(c_int$Slice(2))))
+ expect_true(is_arrow_altrep(as.vector(c_dbl$Slice(2))))
+
+ # chunked array with 2 chunks cannot be altrep
+ c_int <- ChunkedArray$create(0L, c(1L, NA, 3L))
+ c_dbl <- ChunkedArray$create(0, c(1, NA, 3))
+ expect_equal(c_int$num_chunks, 2L)
+ expect_equal(c_dbl$num_chunks, 2L)
+
+ expect_false(is_arrow_altrep(as.vector(c_int)))
+ expect_false(is_arrow_altrep(as.vector(c_dbl)))
+ expect_true(is_arrow_altrep(as.vector(c_int$Slice(3))))
+ expect_true(is_arrow_altrep(as.vector(c_dbl$Slice(3))))
+})
+
+test_that("empty vectors are not altrep", {
+ withr::local_options(list(arrow.use_altrep = TRUE))
+ v_int <- Array$create(integer())
+ v_dbl <- Array$create(numeric())
+
+ expect_false(is_arrow_altrep(as.vector(v_int)))
+ expect_false(is_arrow_altrep(as.vector(v_dbl)))
+})
+
+test_that("as.data.frame(<Table>, <RecordBatch>) can create altrep vectors", {
+ withr::local_options(list(arrow.use_altrep = TRUE))
+
+ table <- Table$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3), str = c("un", "deux", "trois"))
+ df_table <- as.data.frame(table)
+ expect_true(is_arrow_altrep(df_table$int))
+ expect_true(is_arrow_altrep(df_table$dbl))
+ expect_true(is_arrow_altrep(df_table$str))
+
+ batch <- RecordBatch$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3), str = c("un", "deux", "trois"))
+ df_batch <- as.data.frame(batch)
+ expect_true(is_arrow_altrep(df_batch$int))
+ expect_true(is_arrow_altrep(df_batch$dbl))
+ expect_true(is_arrow_altrep(df_batch$str))
+})
+
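+# Checks that fn() returns the same answer on an altrep vector as on the plain
+# R vector it came from, and that calling fn() does not degrade the vector's
+# altrep-ness.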
+expect_altrep_roundtrip <- function(x, fn, ...) {
+ alt <- Array$create(x)$as_vector()
+
+ expect_true(is_arrow_altrep(alt))
+ expect_identical(fn(x, ...), fn(alt, ...))
+ expect_true(is_arrow_altrep(alt))
+}
+
+test_that("altrep min/max/sum identical to R versions for double", {
+ x <- c(1, 2, 3)
+ expect_altrep_rountrip(x, min, na.rm = TRUE)
+ expect_altrep_rountrip(x, max, na.rm = TRUE)
+ expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+ expect_altrep_rountrip(x, min)
+ expect_altrep_rountrip(x, max)
+ expect_altrep_rountrip(x, sum)
+
+ x <- c(1, 2, NA_real_)
+ expect_altrep_rountrip(x, min, na.rm = TRUE)
+ expect_altrep_rountrip(x, max, na.rm = TRUE)
+ expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+ expect_altrep_rountrip(x, min)
+ expect_altrep_rountrip(x, max)
+ expect_altrep_rountrip(x, sum)
+
+ x <- rep(NA_real_, 3)
+ expect_warning(
+ expect_altrep_rountrip(x, min, na.rm = TRUE),
+ "no non-missing arguments to min"
+ )
+ expect_warning(
+ expect_altrep_rountrip(x, max, na.rm = TRUE),
+ "no non-missing arguments to max"
+ )
+ expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+ expect_altrep_rountrip(x, min)
+ expect_altrep_rountrip(x, max)
+ expect_altrep_rountrip(x, sum)
+})
+
+test_that("altrep min/max/sum identical to R versions for int", {
+ x <- c(1L, 2L, 3L)
+ expect_altrep_rountrip(x, min, na.rm = TRUE)
+ expect_altrep_rountrip(x, max, na.rm = TRUE)
+ expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+ expect_altrep_rountrip(x, min)
+ expect_altrep_rountrip(x, max)
+ expect_altrep_rountrip(x, sum)
+
+ x <- c(1L, 2L, NA_integer_)
+ expect_altrep_rountrip(x, min, na.rm = TRUE)
+ expect_altrep_rountrip(x, max, na.rm = TRUE)
+ expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+ expect_altrep_rountrip(x, min)
+ expect_altrep_rountrip(x, max)
+ expect_altrep_rountrip(x, sum)
+
+ x <- rep(NA_integer_, 3)
+ expect_warning(
+ expect_altrep_rountrip(x, min, na.rm = TRUE),
+ "no non-missing arguments to min"
+ )
+ expect_warning(
+ expect_altrep_rountrip(x, max, na.rm = TRUE),
+ "no non-missing arguments to max"
+ )
+ expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+ expect_altrep_rountrip(x, min)
+ expect_altrep_rountrip(x, max)
+ expect_altrep_rountrip(x, sum)
+
+ # sum(x) is INT_MIN -> convert to double.
+ x <- as.integer(c(-2^31 + 1L, -1L))
+ expect_altrep_rountrip(x, sum)
+})
+
+test_that("altrep vectors handle serialization", {
+ ints <- c(1L, 2L, NA_integer_)
+ dbls <- c(1, 2, NA_real_)
+ strs <- c("un", "deux", NA_character_)
+
+ expect_identical(ints, unserialize(serialize(Array$create(ints)$as_vector(), NULL)))
+ expect_identical(dbls, unserialize(serialize(Array$create(dbls)$as_vector(), NULL)))
+ expect_identical(strs, unserialize(serialize(Array$create(strs)$as_vector(), NULL)))
+ expect_identical(strs, unserialize(serialize(Array$create(strs, large_utf8())$as_vector(), NULL)))
+})
+
+test_that("altrep vectors handle coercion", {
+ ints <- c(1L, 2L, NA_integer_)
+ dbls <- c(1, 2, NA_real_)
+ strs <- c("1", "2", NA_character_)
+
+ expect_identical(ints, as.integer(Array$create(dbls)$as_vector()))
+ expect_identical(ints, as.integer(Array$create(strs)$as_vector()))
+
+ expect_identical(dbls, as.numeric(Array$create(ints)$as_vector()))
+ expect_identical(dbls, as.numeric(Array$create(strs)$as_vector()))
+
+ expect_identical(strs, as.character(Array$create(ints)$as_vector()))
+ expect_identical(strs, as.character(Array$create(dbls)$as_vector()))
+})
+
+test_that("columns of struct types may be altrep", {
+ st <- Array$create(data.frame(x = 1:10, y = runif(10)))
+ df <- st$as_vector()
+
+ expect_true(is_arrow_altrep(df$x))
+ expect_true(is_arrow_altrep(df$y))
+})
+
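+# Converting an altrep vector back to an Array should reuse, not copy, the
+# Array it came from; test_same_Array() checks that both handles point at the
+# same underlying Arrow array.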
+test_that("Conversion from altrep R vector to Array uses the existing Array", {
+ a_int <- Array$create(c(1L, 2L, 3L))
+ b_int <- Array$create(a_int$as_vector())
+ expect_true(test_same_Array(a_int$pointer(), b_int$pointer()))
+
+ a_dbl <- Array$create(c(1, 2, 3))
+ b_dbl <- Array$create(a_dbl$as_vector())
+ expect_true(test_same_Array(a_dbl$pointer(), b_dbl$pointer()))
+
+ a_str <- Array$create(c("un", "deux", "trois"))
+ b_str <- Array$create(a_str$as_vector())
+ expect_true(test_same_Array(a_str$pointer(), b_str$pointer()))
+})
diff --git a/src/arrow/r/tests/testthat/test-array-data.R b/src/arrow/r/tests/testthat/test-array-data.R
new file mode 100644
index 000000000..05d070d8a
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-array-data.R
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("string vectors with only empty strings and nulls don't allocate a data buffer (ARROW-3693)", {
+ a <- Array$create("")
+ expect_equal(a$length(), 1L)
+
+ buffers <- a$data()$buffers
+
+ # No nulls
+ expect_equal(buffers[[1]], NULL)
+
+ # Offsets has 2 elements
+ expect_equal(buffers[[2]]$size, 8L)
+
+  # As per ARROW-2744, the values buffer should preferably be non-null.
+ expect_equal(buffers[[3]]$size, 0L)
+ expect_equal(buffers[[3]]$capacity, 0L)
+})
diff --git a/src/arrow/r/tests/testthat/test-arrow-info.R b/src/arrow/r/tests/testthat/test-arrow-info.R
new file mode 100644
index 000000000..9eac60814
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-arrow-info.R
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("arrow_info()", {
+ expect_s3_class(arrow_info(), "arrow_info")
+ expect_output(print(arrow_info()), "Arrow package version")
+ options(arrow.foo = FALSE)
+ expect_output(print(arrow_info()), "arrow.foo")
+})
diff --git a/src/arrow/r/tests/testthat/test-arrow.R b/src/arrow/r/tests/testthat/test-arrow.R
new file mode 100644
index 000000000..48970ab89
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-arrow.R
@@ -0,0 +1,78 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+if (!identical(tolower(Sys.getenv("TEST_R_WITHOUT_LIBARROW")), "true")) {
+ testthat::test_that("Arrow C++ is available", {
+ skip_on_cran()
+ expect_true(arrow_available())
+ })
+}
+
+test_that("Can't $new() an object with anything other than a pointer", {
+ expect_error(
+ Array$new(1:5),
+ "Array$new() requires a pointer as input: did you mean $create() instead?",
+ fixed = TRUE
+ )
+})
+
+r_only({
+ test_that("assert_is", {
+ x <- 42
+ expect_true(assert_is(x, "numeric"))
+ expect_true(assert_is(x, c("numeric", "character")))
+ expect_error(assert_is(x, "factor"), 'x must be a "factor"')
+ expect_error(
+ assert_is(x, c("factor", "list")),
+ 'x must be a "factor" or "list"'
+ )
+ expect_error(
+ assert_is(x, c("factor", "character", "list")),
+ 'x must be a "factor", "character", or "list"'
+ )
+ })
+})
+
+test_that("arrow gracefully fails to load objects from other sessions (ARROW-10071)", {
+ a <- Array$create(1:10)
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ saveRDS(a, tf)
+
+ b <- readRDS(tf)
+ expect_error(b$length(), "Invalid <Array>")
+})
+
+test_that("check for an ArrowObject in functions use std::shared_ptr", {
+ expect_error(Array__length(1), "Invalid R object")
+})
+
+test_that("MemoryPool calls gc() to free memory when allocation fails (ARROW-10080)", {
+  # Valgrind reports an error on this test because the requested allocation
+  # cannot be satisfied, which is exactly what the test is checking, so we
+  # skip it there
+ skip_on_valgrind()
+
+ env <- new.env()
+ suppressMessages(trace(gc, print = FALSE, tracer = function() {
+ env$gc_was_called <- TRUE
+ }))
+ on.exit(suppressMessages(untrace(gc)))
+ # We expect this should fail because we don't have this much memory,
+ # but it should gc() and retry (and fail again)
+ expect_error(BufferOutputStream$create(2**60))
+ expect_true(env$gc_was_called)
+})
diff --git a/src/arrow/r/tests/testthat/test-backwards-compatibility.R b/src/arrow/r/tests/testthat/test-backwards-compatibility.R
new file mode 100644
index 000000000..32e86d5f6
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-backwards-compatibility.R
@@ -0,0 +1,121 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# nolint start
+# To write a new version of a test file for a current version:
+# write_parquet(example_with_metadata, test_path("golden-files/data-arrow_2.0.0.parquet"))
+
+# To write a new version of a test file for an old version, use docker(-compose)
+# to set up a Linux distribution and use RStudio's public package manager binary
+# repo to install the old version. The following commands should be run at the
+# root of the arrow repo directory and might need slight adjustments.
+# R_ORG=rstudio R_IMAGE=r-base R_TAG=4.0-focal docker-compose build --no-cache r
+# R_ORG=rstudio R_IMAGE=r-base R_TAG=4.0-focal docker-compose run r /bin/bash
+# R
+# options(repos = "https://packagemanager.rstudio.com/all/__linux__/focal/latest")
+# remotes::install_version("arrow", version = "1.0.1")
+# # get example data into the global env
+# write_parquet(example_with_metadata, "arrow/r/tests/testthat/golden-files/data-arrow_1.0.1.parquet")
+# quit()/exit
+# nolint end
+
+skip_if(getRversion() < "3.5.0", "The serialization format changed in 3.5")
+
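+# Like expect_identical(), but with top_level = FALSE it first strips all
+# attributes other than names/class/row.names from `expected`, since older
+# arrow versions did not save top-level metadata and so cannot round-trip it.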
+expect_identical_with_metadata <- function(object, expected, ..., top_level = TRUE) {
+ attrs_to_keep <- c("names", "class", "row.names")
+ if (!top_level) {
+ # remove not-tbl and not-data.frame attributes
+ for (attribute in names(attributes(expected))) {
+ if (attribute %in% attrs_to_keep) next
+ attributes(expected)[[attribute]] <- NULL
+ }
+ }
+ expect_identical(object, expected, ...)
+}
+
+test_that("reading a known Parquet file to dataframe with 3.0.0", {
+ skip_if_not_available("parquet")
+ skip_if_not_available("snappy")
+ pq_file <- test_path("golden-files/data-arrow-extra-meta_3.0.0.parquet")
+
+ df <- read_parquet(pq_file)
+ # this is equivalent to `expect_identical()`
+ expect_identical_with_metadata(df, example_with_extra_metadata)
+})
+
+test_that("reading a known Parquet file to dataframe with 2.0.0", {
+ skip_if_not_available("parquet")
+ skip_if_not_available("snappy")
+ pq_file <- test_path("golden-files/data-arrow_2.0.0.parquet")
+
+ df <- read_parquet(pq_file)
+ # this is equivalent to `expect_identical()`
+ expect_identical_with_metadata(df, example_with_metadata)
+})
+
+test_that("reading a known Parquet file to dataframe with 1.0.1", {
+ skip_if_not_available("parquet")
+ skip_if_not_available("snappy")
+ pq_file <- test_path("golden-files/data-arrow_1.0.1.parquet")
+
+ df <- read_parquet(pq_file)
+ # 1.0.1 didn't save top-level metadata, so we need to remove it.
+ expect_identical_with_metadata(df, example_with_metadata, top_level = FALSE)
+})
+
+for (comp in c("lz4", "uncompressed", "zstd")) {
+ # nolint start
+ # write_feather(example_with_metadata, test_path("golden-files/data-arrow_2.0.0_lz4.feather"), compression = "lz4")
+ # write_feather(example_with_metadata, test_path("golden-files/data-arrow_2.0.0_uncompressed.feather"), compression = "uncompressed")
+ # write_feather(example_with_metadata, test_path("golden-files/data-arrow_2.0.0_zstd.feather"), compression = "zstd")
+ # nolint end
+ test_that("reading a known Feather file to dataframe with 2.0.0", {
+ skip_if_not_available("parquet")
+ skip_if_not_available(comp)
+ feather_file <- test_path(paste0("golden-files/data-arrow_2.0.0_", comp, ".feather"))
+
+ df <- read_feather(feather_file)
+ expect_identical_with_metadata(df, example_with_metadata)
+ })
+
+ test_that("reading a known Feather file to dataframe with 1.0.1", {
+ skip_if_not_available("parquet")
+ skip_if_not_available(comp)
+ feather_file <- test_path(paste0("golden-files/data-arrow_1.0.1_", comp, ".feather"))
+
+ df <- read_feather(feather_file)
+ # 1.0.1 didn't save top-level metadata, so we need to remove it.
+ expect_identical_with_metadata(df, example_with_metadata, top_level = FALSE)
+ })
+
+ test_that("reading a known Feather file to dataframe with 0.17.0", {
+ skip_if_not_available("parquet")
+ skip_if_not_available(comp)
+ feather_file <- test_path(paste0("golden-files/data-arrow_0.17.0_", comp, ".feather"))
+
+ df <- read_feather(feather_file)
+    # The metadata from 0.17.0 lacks the top level, the special class is not
+    # maintained, and the embedded tibble's attributes are read back in the
+    # wrong order. Since this predates 1.0.0, we punt on checking attributes;
+    # classes are always checked, though, so the special class must be removed
+    # before comparing.
+ example_with_metadata_sans_special_class <- example_with_metadata
+ example_with_metadata_sans_special_class$a <- unclass(example_with_metadata_sans_special_class$a)
+ expect_equal(df, example_with_metadata_sans_special_class, ignore_attr = TRUE)
+ })
+}
+
+# TODO: streams(?)
diff --git a/src/arrow/r/tests/testthat/test-buffer-reader.R b/src/arrow/r/tests/testthat/test-buffer-reader.R
new file mode 100644
index 000000000..b790ed0da
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-buffer-reader.R
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("BufferReader can be created from R objects", {
+ num <- BufferReader$create(numeric(13))
+ int <- BufferReader$create(integer(13))
+ raw <- BufferReader$create(raw(16))
+
+ expect_r6_class(num, "BufferReader")
+ expect_r6_class(int, "BufferReader")
+ expect_r6_class(raw, "BufferReader")
+
+ expect_equal(num$GetSize(), 13 * 8)
+ expect_equal(int$GetSize(), 13 * 4)
+ expect_equal(raw$GetSize(), 16)
+})
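+
+# A note on the size assertions above: Buffer sizes are in bytes, and R
+# doubles are 8 bytes, integers 4 bytes, and raw elements 1 byte each, so
+# numeric(13) backs a buffer of 13 * 8 = 104 bytes.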
+
+test_that("BufferReader can be created from Buffer", {
+ buf <- buffer(raw(76))
+ reader <- BufferReader$create(buf)
+
+ expect_r6_class(reader, "BufferReader")
+ expect_equal(reader$GetSize(), 76)
+})
diff --git a/src/arrow/r/tests/testthat/test-buffer.R b/src/arrow/r/tests/testthat/test-buffer.R
new file mode 100644
index 000000000..9b3ebc6de
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-buffer.R
@@ -0,0 +1,97 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("Buffer can be created from raw vector", {
+ vec <- raw(123)
+ buf <- buffer(vec)
+ expect_r6_class(buf, "Buffer")
+ expect_equal(buf$size, 123)
+})
+
+test_that("Buffer can be created from integer vector", {
+ vec <- integer(17)
+ buf <- buffer(vec)
+ expect_r6_class(buf, "Buffer")
+ expect_equal(buf$size, 17 * 4)
+})
+
+test_that("Buffer can be created from numeric vector", {
+ vec <- numeric(17)
+ buf <- buffer(vec)
+ expect_r6_class(buf, "Buffer")
+ expect_equal(buf$size, 17 * 8)
+})
+
+test_that("Buffer can be created from complex vector", {
+ vec <- complex(3)
+ buf <- buffer(vec)
+ expect_r6_class(buf, "Buffer")
+ expect_equal(buf$size, 3 * 16)
+})
+
+test_that("buffer buffer buffers buffers", {
+ expect_r6_class(buffer(buffer(42)), "Buffer")
+})
+
+test_that("Other types can't be converted to Buffers", {
+ expect_error(
+ buffer(data.frame(a = "asdf")),
+ "Cannot convert object of class data.frame to arrow::Buffer"
+ )
+})
+
+test_that("can convert Buffer to raw", {
+ buf <- buffer(rnorm(10))
+ expect_equal(buf$data(), as.raw(buf))
+})
+
+test_that("can read remaining bytes of a RandomAccessFile", {
+ tbl <- tibble::tibble(
+ int = 1:10, dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10]
+ )
+ tab <- Table$create(!!!tbl)
+
+ tf <- tempfile()
+ all_bytes <- write_feather(tab, tf)
+
+ file <- ReadableFile$create(tf)
+ expect_equal(file$tell(), 0)
+ x <- file$Read(20)$data()
+ expect_equal(file$tell(), 20)
+ y <- file$Read()$data()
+
+ file <- ReadableFile$create(tf)
+ z <- file$Read()$data()
+
+ file <- ReadableFile$create(tf)
+ a <- file$ReadAt(20)$data()
+
+ expect_equal(file$GetSize(), length(x) + length(y))
+ expect_equal(z, c(x, y))
+ expect_equal(a, y)
+})
+
+test_that("Buffer$Equals", {
+ vec <- integer(17)
+ buf1 <- buffer(vec)
+ buf2 <- buffer(vec)
+ expect_equal(buf1, buf2)
+ expect_true(buf1$Equals(buf2))
+ expect_false(buf1$Equals(vec))
+})
diff --git a/src/arrow/r/tests/testthat/test-chunked-array.R b/src/arrow/r/tests/testthat/test-chunked-array.R
new file mode 100644
index 000000000..c931ddec5
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-chunked-array.R
@@ -0,0 +1,468 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+expect_chunked_roundtrip <- function(x, type) {
+ a <- ChunkedArray$create(!!!x)
+ flat_x <- unlist(x, recursive = FALSE)
+ attributes(flat_x) <- attributes(x[[1]])
+ expect_type_equal(a$type, type)
+ expect_identical(a$num_chunks, length(x))
+ expect_identical(length(a), length(flat_x))
+ if (!inherits(type, "ListType")) {
+ # TODO: revisit how missingness works with ListArrays
+ # R list objects don't handle missingness the same way as other vectors.
+ # Is there some vctrs thing we should do on the roundtrip back to R?
+ expect_identical(as.vector(is.na(a)), is.na(flat_x))
+ }
+ expect_as_vector(a, flat_x)
+ expect_as_vector(a$chunk(0), x[[1]])
+
+ if (length(flat_x)) {
+ a_sliced <- a$Slice(1)
+ x_sliced <- flat_x[-1]
+ expect_type_equal(a_sliced$type, type)
+ expect_identical(length(a_sliced), length(x_sliced))
+ if (!inherits(type, "ListType")) {
+ expect_identical(as.vector(is.na(a_sliced)), is.na(x_sliced))
+ }
+ expect_as_vector(a_sliced, x_sliced)
+ }
+ invisible(a)
+}
+
+test_that("ChunkedArray", {
+ x <- expect_chunked_roundtrip(list(1:10, 1:10, 1:5), int32())
+
+ y <- x$Slice(8)
+ expect_equal(y$type, int32())
+ expect_equal(y$num_chunks, 3L)
+ expect_equal(length(y), 17L)
+ expect_as_vector(y, c(9:10, 1:10, 1:5))
+
+ z <- x$Slice(8, 5)
+ expect_equal(z$type, int32())
+ expect_equal(z$num_chunks, 2L)
+ expect_equal(z$length(), 5L)
+ expect_equal(z$as_vector(), c(9:10, 1:3))
+
+ expect_chunked_roundtrip(list(c(1, 2, 3), c(4, 5, 6)), float64())
+
+ # input validation
+ expect_error(x$chunk(14), "subscript out of bounds")
+ expect_error(x$chunk("one"))
+ expect_error(x$chunk(NA_integer_), "'i' cannot be NA")
+ expect_error(x$chunk(-1), "subscript out of bounds")
+
+ expect_error(x$Slice("ten"))
+ expect_error(x$Slice(NA_integer_), "Slice 'offset' cannot be NA")
+ expect_error(x$Slice(NA), "Slice 'offset' cannot be NA")
+ expect_error(x$Slice(10, "ten"))
+ expect_error(x$Slice(10, NA_integer_), "Slice 'length' cannot be NA")
+ expect_error(x$Slice(NA_integer_, NA_integer_), "Slice 'offset' cannot be NA")
+ expect_error(x$Slice(c(10, 10)))
+ expect_error(x$Slice(10, c(10, 10)))
+ expect_error(x$Slice(1000), "Slice 'offset' greater than array length")
+ expect_error(x$Slice(-1), "Slice 'offset' cannot be negative")
+ expect_error(z$Slice(10, 10), "Slice 'offset' greater than array length")
+ expect_error(x$Slice(10, -1), "Slice 'length' cannot be negative")
+ expect_error(x$Slice(-1, 10), "Slice 'offset' cannot be negative")
+
+ expect_warning(x$Slice(10, 15), NA)
+ expect_warning(
+ overslice <- x$Slice(10, 16),
+ "Slice 'length' greater than available length"
+ )
+ expect_equal(length(overslice), 15)
+ expect_warning(z$Slice(2, 10), "Slice 'length' greater than available length")
+})
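+
+# A note on the Slice() assertions above: the offset is 0-based and applies
+# to the logical array across chunk boundaries, so x$Slice(8) on chunks of
+# lengths 10, 10, and 5 keeps elements 9 through 25 (17 values spread over
+# all three chunks).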
+
+test_that("print ChunkedArray", {
+ verify_output(test_path("test-chunked-array.txt"), {
+ chunked_array(c(1, 2, 3), c(4, 5, 6))
+ chunked_array(1:30, c(4, 5, 6))
+ chunked_array(1:30)
+ chunked_array(factor(c("a", "b")), factor(c("c", "d")))
+ })
+})
+
+test_that("ChunkedArray handles !!! splicing", {
+ data <- list(1, 2, 3)
+ x <- chunked_array(!!!data)
+ expect_equal(x$type, float64())
+ expect_equal(x$num_chunks, 3L)
+})
+
+test_that("ChunkedArray handles Inf", {
+ data <- list(c(Inf, 2:10), c(1:3, Inf, 5L), 1:10)
+ x <- chunked_array(!!!data)
+ expect_equal(x$type, float64())
+ expect_equal(x$num_chunks, 3L)
+ expect_equal(length(x), 25L)
+ expect_as_vector(x, c(c(Inf, 2:10), c(1:3, Inf, 5), 1:10))
+
+ chunks <- x$chunks
+ expect_as_vector(is.infinite(chunks[[2]]), is.infinite(data[[2]]))
+ expect_equal(
+ as.vector(is.infinite(x)),
+ c(is.infinite(data[[1]]), is.infinite(data[[2]]), is.infinite(data[[3]]))
+ )
+})
+
+test_that("ChunkedArray handles NA", {
+ data <- list(1:10, c(NA, 2:10), c(1:3, NA, 5L))
+ x <- chunked_array(!!!data)
+ expect_equal(x$type, int32())
+ expect_equal(x$num_chunks, 3L)
+ expect_equal(length(x), 25L)
+ expect_as_vector(x, c(1:10, c(NA, 2:10), c(1:3, NA, 5)))
+
+ chunks <- x$chunks
+ expect_as_vector(is.na(chunks[[2]]), is.na(data[[2]]))
+ expect_as_vector(is.na(x), c(is.na(data[[1]]), is.na(data[[2]]), is.na(data[[3]])))
+})
+
+test_that("ChunkedArray handles NaN", {
+ data <- list(as.numeric(1:10), c(NaN, 2:10), c(1:3, NaN, 5L))
+ x <- chunked_array(!!!data)
+
+ expect_equal(x$type, float64())
+ expect_equal(x$num_chunks, 3L)
+ expect_equal(length(x), 25L)
+ expect_as_vector(x, c(1:10, c(NaN, 2:10), c(1:3, NaN, 5)))
+
+ chunks <- x$chunks
+ expect_as_vector(is.nan(chunks[[2]]), is.nan(data[[2]]))
+ expect_as_vector(is.nan(x), c(is.nan(data[[1]]), is.nan(data[[2]]), is.nan(data[[3]])))
+})
+
+test_that("ChunkedArray supports logical vectors (ARROW-3341)", {
+ # with NA
+ data <- purrr::rerun(3, sample(c(TRUE, FALSE, NA), 100, replace = TRUE))
+ expect_chunked_roundtrip(data, bool())
+ # without NA
+ data <- purrr::rerun(3, sample(c(TRUE, FALSE), 100, replace = TRUE))
+ expect_chunked_roundtrip(data, bool())
+})
+
+test_that("ChunkedArray supports character vectors (ARROW-3339)", {
+ data <- list(
+ c("itsy", NA, "spider"),
+ c("Climbed", "up", "the", "water", "spout"),
+ c("Down", "came", "the", "rain"),
+ "And washed the spider out. "
+ )
+ expect_chunked_roundtrip(data, utf8())
+})
+
+test_that("ChunkedArray supports factors (ARROW-3716)", {
+ f <- factor(c("itsy", "bitsy", "spider", "spider"))
+ expect_chunked_roundtrip(list(f, f, f), dictionary(int8()))
+})
+
+test_that("ChunkedArray supports dates (ARROW-3716)", {
+ d <- Sys.Date() + 1:10
+ expect_chunked_roundtrip(list(d, d), date32())
+})
+
+test_that("ChunkedArray supports POSIXct (ARROW-3716)", {
+ times <- lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10
+ expect_chunked_roundtrip(list(times, times), timestamp("us", "UTC"))
+})
+
+test_that("ChunkedArray supports integer64 (ARROW-3716)", {
+ x <- bit64::as.integer64(1:10) + MAX_INT
+ expect_chunked_roundtrip(list(x, x), int64())
+ # Also with a first chunk that would downcast
+ zero <- Array$create(0L)$cast(int64())
+ expect_type_equal(zero, int64())
+ ca <- ChunkedArray$create(zero, x)
+ expect_type_equal(ca, int64())
+ expect_s3_class(as.vector(ca), "integer64")
+ expect_identical(as.vector(ca), c(bit64::as.integer64(0L), x))
+})
+
+test_that("ChunkedArray supports difftime", {
+ time <- hms::hms(56, 34, 12)
+ expect_chunked_roundtrip(list(time, time), time32("s"))
+})
+
+test_that("ChunkedArray supports empty arrays (ARROW-13761)", {
+ types <- c(
+ int8(), int16(), int32(), int64(), uint8(), uint16(), uint32(),
+ uint64(), float32(), float64(), timestamp("ns"), binary(),
+ large_binary(), fixed_size_binary(32), date32(), date64(),
+ decimal(4, 2), dictionary(), struct(x = int32())
+ )
+
+ empty_filter <- ChunkedArray$create(type = bool())
+ for (type in types) {
+ one_empty_chunk <- ChunkedArray$create(type = type)
+ expect_type_equal(one_empty_chunk$type, type)
+ if (type != struct(x = int32())) {
+ expect_identical(length(one_empty_chunk), length(as.vector(one_empty_chunk)))
+ } else {
+ # struct -> tbl and length(tbl) is num_columns instead of num_rows
+ expect_identical(length(as.vector(one_empty_chunk)), 1L)
+ }
+ zero_empty_chunks <- one_empty_chunk$Filter(empty_filter)
+ expect_equal(zero_empty_chunks$num_chunks, 0)
+ expect_type_equal(zero_empty_chunks$type, type)
+ if (type != struct(x = int32())) {
+ expect_identical(length(zero_empty_chunks), length(as.vector(zero_empty_chunks)))
+ } else {
+ expect_identical(length(as.vector(zero_empty_chunks)), 1L)
+ }
+ }
+})
+
+test_that("integer types casts for ChunkedArray (ARROW-3741)", {
+ int_types <- c(int8(), int16(), int32(), int64())
+ uint_types <- c(uint8(), uint16(), uint32(), uint64())
+ float_types <- c(float32(), float64()) # float16() not really supported in C++ yet
+ all_types <- c(
+ int_types,
+ uint_types,
+ float_types
+ )
+
+ a <- chunked_array(1:10, 1:10)
+ for (type in c(int_types, uint_types)) {
+ casted <- a$cast(type)
+ expect_r6_class(casted, "ChunkedArray")
+ expect_type_equal(casted$type, type)
+ }
+  # Also test casting with double(): not an Arrow type but a base R function,
+  # which should be treated as an alias for float64()
+ dbl <- a$cast(double())
+ expect_r6_class(dbl, "ChunkedArray")
+ expect_type_equal(dbl$type, float64())
+})
+
+test_that("chunked_array() supports the type= argument. conversion from INTSXP and int64 to all int types", {
+ num_int32 <- 12L
+ num_int64 <- bit64::as.integer64(10)
+ for (type in all_types) {
+ expect_type_equal(chunked_array(num_int32, type = type)$type, type)
+ expect_type_equal(chunked_array(num_int64, type = type)$type, type)
+ }
+ # also test creating with double() "type"
+ expect_type_equal(chunked_array(num_int32, type = double())$type, float64())
+})
+
+test_that("ChunkedArray$create() aborts on overflow", {
+ expect_error(chunked_array(128L, type = int8())$type)
+ expect_error(chunked_array(-129L, type = int8())$type)
+
+ expect_error(chunked_array(256L, type = uint8())$type)
+ expect_error(chunked_array(-1L, type = uint8())$type)
+
+ expect_error(chunked_array(32768L, type = int16())$type)
+ expect_error(chunked_array(-32769L, type = int16())$type)
+
+ expect_error(chunked_array(65536L, type = uint16())$type)
+ expect_error(chunked_array(-1L, type = uint16())$type)
+
+ expect_error(chunked_array(bit64::as.integer64(2^31), type = int32()))
+ expect_error(chunked_array(bit64::as.integer64(2^32), type = uint32()))
+})
+
+test_that("chunked_array() convert doubles to integers", {
+ for (type in c(int_types, uint_types)) {
+ a <- chunked_array(10, type = type)
+ expect_type_equal(a$type, type)
+ if (type != uint64()) {
+      # exception for unsigned integer 64, which
+      # we cannot handle yet
+ expect_true(as.vector(a) == 10)
+ }
+ }
+})
+
+test_that("chunked_array() uses the first ... to infer type", {
+ a <- chunked_array(10, 10L)
+ expect_type_equal(a$type, float64())
+})
+
+test_that("chunked_array() handles downcasting", {
+ a <- chunked_array(10L, 10)
+ expect_type_equal(a$type, int32())
+ expect_as_vector(a, c(10L, 10L))
+})
+
+test_that("chunked_array() makes chunks of the same type", {
+ a <- chunked_array(10L, bit64::as.integer64(13), type = int64())
+ for (chunk in a$chunks) {
+ expect_type_equal(chunk$type, int64())
+ }
+})
+
+test_that("chunked_array() handles 0 chunks if given a type", {
+ for (type in all_types) {
+ a <- chunked_array(type = type)
+ expect_type_equal(a$type, as_type(type))
+ expect_equal(length(a), 0L)
+ }
+})
+
+test_that("chunked_array() can ingest arrays (ARROW-3815)", {
+ expect_equal(
+ as.vector(chunked_array(1:5, Array$create(6:10))),
+ 1:10
+ )
+})
+
+test_that("chunked_array() handles data frame -> struct arrays (ARROW-3811)", {
+ df <- tibble::tibble(x = 1:10, y = x / 2, z = letters[1:10])
+ a <- chunked_array(df, df)
+ expect_type_equal(a$type, struct(x = int32(), y = float64(), z = utf8()))
+ expect_equal(a$as_vector(), rbind(df, df), ignore_attr = TRUE)
+})
+
+test_that("ChunkedArray$View() (ARROW-6542)", {
+ a <- ChunkedArray$create(1:3, 1:4)
+ b <- a$View(float32())
+ expect_equal(b$type, float32())
+ expect_equal(length(b), 7L)
+ expect_true(all(
+ sapply(b$chunks, function(.x) .x$type == float32())
+ ))
+ # Input validation
+ expect_error(a$View("not a type"), "type must be a DataType, not character")
+})
+
+test_that("ChunkedArray$Validate()", {
+ a <- ChunkedArray$create(1:10)
+ expect_error(a$Validate(), NA)
+})
+
+test_that("[ ChunkedArray", {
+ one_chunk <- chunked_array(2:11)
+ x <- chunked_array(1:10, 31:40, 51:55)
+ # Slice
+ expect_as_vector(x[8:12], c(8:10, 31:32))
+ # Take from same chunk
+ expect_as_vector(x[c(11, 15, 12)], c(31, 35, 32))
+ # Take from multiple chunks (calls Concatenate)
+ expect_as_vector(x[c(2, 11, 15, 12, 3)], c(2, 31, 35, 32, 3))
+ # Take with Array (note these are 0-based)
+ take1 <- Array$create(c(10L, 14L, 11L))
+ expect_as_vector(x[take1], c(31, 35, 32))
+ # Take with ChunkedArray
+ take2 <- ChunkedArray$create(c(10L, 14L), 11L)
+ expect_as_vector(x[take2], c(31, 35, 32))
+
+ # Filter (with recycling)
+ expect_as_vector(
+ one_chunk[c(FALSE, TRUE, FALSE, FALSE, TRUE)],
+ c(3, 6, 8, 11)
+ )
+ # Filter where both are 1-chunk
+ expect_as_vector(
+ one_chunk[ChunkedArray$create(rep(c(FALSE, TRUE, FALSE, FALSE, TRUE), 2))],
+ c(3, 6, 8, 11)
+ )
+ # Filter multi-chunk with logical (-> Array)
+ expect_as_vector(
+ x[c(FALSE, TRUE, FALSE, FALSE, TRUE)],
+ c(2, 5, 7, 10, 32, 35, 37, 40, 52, 55)
+ )
+ # Filter with a chunked array with different sized chunks
+ p1 <- c(FALSE, TRUE, FALSE, FALSE, TRUE)
+ p2 <- c(TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE)
+ filt <- ChunkedArray$create(p1, p2, p2)
+ expect_as_vector(
+ x[filt],
+ c(2, 5, 6, 8, 9, 35, 36, 38, 39, 55)
+ )
+})
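+
+# Illustrative sketch (not part of the original suite): `[` on Arrow objects
+# is 1-based like base R, while Take() with an Arrow Array of indices is
+# 0-based, so R index i corresponds to Arrow index i - 1.
+# nolint start
+# x <- chunked_array(11:15)
+# as.vector(x[2])                      # 12, R-style 1-based indexing
+# as.vector(x$Take(Array$create(1L)))  # 12, the same element, 0-based
+# nolint end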
+
+test_that("ChunkedArray head/tail", {
+ vec <- 11:20
+ a <- ChunkedArray$create(11:15, 16:20)
+ expect_as_vector(head(a), head(vec))
+ expect_as_vector(head(a, 4), head(vec, 4))
+ expect_as_vector(head(a, 40), head(vec, 40))
+ expect_as_vector(head(a, -4), head(vec, -4))
+ expect_as_vector(head(a, -40), head(vec, -40))
+ expect_as_vector(tail(a), tail(vec))
+ expect_as_vector(tail(a, 4), tail(vec, 4))
+ expect_as_vector(tail(a, 40), tail(vec, 40))
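+  expect_as_vector(tail(a, -4), tail(vec, -4)) # mirrors the head() cases above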
+ expect_as_vector(tail(a, -40), tail(vec, -40))
+})
+
+test_that("ChunkedArray$Equals", {
+ vec <- 11:20
+ a <- ChunkedArray$create(vec[1:5], vec[6:10])
+ b <- ChunkedArray$create(vec[1:5], vec[6:10])
+ expect_equal(a, b)
+ expect_true(a$Equals(b))
+ expect_false(a$Equals(vec))
+})
+
+test_that("Converting a chunked array unifies factors (ARROW-8374)", {
+ f1 <- factor(c("a"), levels = c("a", "b"))
+ f2 <- factor(c("c"), levels = c("c", "d"))
+ f3 <- factor(NA, levels = "a")
+ f4 <- factor()
+
+ res <- factor(c("a", "c", NA), levels = c("a", "b", "c", "d"))
+ ca <- ChunkedArray$create(f1, f2, f3, f4)
+
+ expect_identical(ca$as_vector(), res)
+})
+
+test_that("Handling string data with embedded nuls", {
+ raws <- structure(list(
+ as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)),
+ as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)),
+ as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00
+ as.raw(c(0x66, 0x00, 0x00, 0x61, 0x00, 0x6e)), # multiple nuls
+ as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)),
+ as.raw(c(0x74, 0x76))
+ ),
+ class = c("arrow_binary", "vctrs_vctr", "list")
+ )
+ chunked_array_with_nul <- ChunkedArray$create(raws)$cast(utf8())
+
+  # The behavior of the warnings/errors is slightly different with and without
+  # altrep. Without it (i.e. R 3.5.0 and below), the error triggers immediately
+  # on `as.vector()`, whereas with it the error only happens on materialization.
+ skip_if_r_version("3.5.0")
+
+ v <- expect_error(as.vector(chunked_array_with_nul), NA)
+
+ expect_error(
+ v[],
+ paste0(
+ "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, ",
+ "set options(arrow.skip_nul = TRUE)"
+ ),
+ fixed = TRUE
+ )
+
+ withr::with_options(list(arrow.skip_nul = TRUE), {
+ v <- expect_warning(as.vector(chunked_array_with_nul), NA)
+ expect_warning(
+ expect_identical(v[3], "man"),
+ "Stripping '\\0' (nul) from character vector",
+ fixed = TRUE
+ )
+ })
+})
diff --git a/src/arrow/r/tests/testthat/test-chunked-array.txt b/src/arrow/r/tests/testthat/test-chunked-array.txt
new file mode 100644
index 000000000..c7101359d
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-chunked-array.txt
@@ -0,0 +1,103 @@
+> chunked_array(c(1, 2, 3), c(4, 5, 6))
+ChunkedArray
+[
+ [
+ 1,
+ 2,
+ 3
+ ],
+ [
+ 4,
+ 5,
+ 6
+ ]
+]
+
+> chunked_array(1:30, c(4, 5, 6))
+ChunkedArray
+[
+ [
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 8,
+ 9,
+ 10,
+ ...
+ 21,
+ 22,
+ 23,
+ 24,
+ 25,
+ 26,
+ 27,
+ 28,
+ 29,
+ 30
+ ],
+ [
+ 4,
+ 5,
+ 6
+ ]
+]
+
+> chunked_array(1:30)
+ChunkedArray
+[
+ [
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 8,
+ 9,
+ 10,
+ ...
+ 21,
+ 22,
+ 23,
+ 24,
+ 25,
+ 26,
+ 27,
+ 28,
+ 29,
+ 30
+ ]
+]
+
+> chunked_array(factor(c("a", "b")), factor(c("c", "d")))
+ChunkedArray
+[
+
+ -- dictionary:
+ [
+ "a",
+ "b"
+ ]
+ -- indices:
+ [
+ 0,
+ 1
+ ],
+
+ -- dictionary:
+ [
+ "c",
+ "d"
+ ]
+ -- indices:
+ [
+ 0,
+ 1
+ ]
+]
+
diff --git a/src/arrow/r/tests/testthat/test-compressed.R b/src/arrow/r/tests/testthat/test-compressed.R
new file mode 100644
index 000000000..d796e3e75
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-compressed.R
@@ -0,0 +1,73 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("codec_is_available", {
+ expect_true(codec_is_available("uncompressed")) # Always true
+ expect_match_arg_error(codec_is_available("sdfasdf"))
+ skip_if_not_available("gzip")
+ expect_true(codec_is_available("gzip"))
+ expect_true(codec_is_available("GZIP"))
+})
+
+if (identical(Sys.getenv("APPVEYOR"), "True")) {
+ test_that("Compression codecs are included in the Windows build", {
+ expect_true(codec_is_available("lz4"))
+ expect_true(codec_is_available("zstd"))
+ })
+}
+
+test_that("Codec attributes", {
+ skip_if_not_available("gzip")
+ cod <- Codec$create("gzip")
+ expect_equal(cod$name, "gzip")
+ # TODO: implement $level
+ expect_error(cod$level)
+})
+
+test_that("can write Buffer to CompressedOutputStream and read back in CompressedInputStream", {
+ skip_if_not_available("gzip")
+ buf <- buffer(as.raw(sample(0:255, size = 1024, replace = TRUE)))
+
+ tf1 <- tempfile()
+ stream1 <- CompressedOutputStream$create(tf1)
+ expect_equal(stream1$tell(), 0)
+ stream1$write(buf)
+ expect_equal(stream1$tell(), buf$size)
+ stream1$close()
+
+ tf2 <- tempfile()
+ sink2 <- FileOutputStream$create(tf2)
+ stream2 <- CompressedOutputStream$create(sink2)
+ expect_equal(stream2$tell(), 0)
+ stream2$write(buf)
+ expect_equal(stream2$tell(), buf$size)
+ stream2$close()
+ sink2$close()
+
+ input1 <- CompressedInputStream$create(tf1)
+ buf1 <- input1$Read(1024L)
+
+ file2 <- ReadableFile$create(tf2)
+ input2 <- CompressedInputStream$create(file2)
+ buf2 <- input2$Read(1024L)
+
+ expect_equal(buf, buf1)
+ expect_equal(buf, buf2)
+
+ unlink(tf1)
+ unlink(tf2)
+})
diff --git a/src/arrow/r/tests/testthat/test-compute-aggregate.R b/src/arrow/r/tests/testthat/test-compute-aggregate.R
new file mode 100644
index 000000000..018279d4b
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-compute-aggregate.R
@@ -0,0 +1,434 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("list_compute_functions", {
+ allfuncs <- list_compute_functions()
+ expect_false(all(grepl("min", allfuncs)))
+ justmins <- list_compute_functions("^min")
+ expect_true(length(justmins) > 0)
+ expect_true(all(grepl("min", justmins)))
+ no_hash_funcs <- list_compute_functions("^hash")
+ expect_true(length(no_hash_funcs) == 0)
+})
+
+test_that("sum.Array", {
+ ints <- 1:5
+ a <- Array$create(ints)
+ expect_r6_class(sum(a), "Scalar")
+ expect_identical(as.integer(sum(a)), sum(ints))
+
+ floats <- c(1.3, 2.4, 3)
+ f <- Array$create(floats)
+ expect_identical(as.numeric(sum(f)), sum(floats))
+
+ floats <- c(floats, NA)
+ na <- Array$create(floats)
+ if (!grepl("devel", R.version.string)) {
+ # Valgrind on R-devel confuses NaN and NA_real_
+ # https://r.789695.n4.nabble.com/Difference-in-NA-behavior-in-R-devel-running-under-valgrind-td4768731.html
+ expect_identical(as.numeric(sum(na)), sum(floats))
+ }
+ expect_r6_class(sum(na, na.rm = TRUE), "Scalar")
+ expect_identical(as.numeric(sum(na, na.rm = TRUE)), sum(floats, na.rm = TRUE))
+
+ bools <- c(TRUE, NA, TRUE, FALSE)
+ b <- Array$create(bools)
+ expect_identical(as.integer(sum(b)), sum(bools))
+ expect_identical(as.integer(sum(b, na.rm = TRUE)), sum(bools, na.rm = TRUE))
+})
+
+test_that("sum.ChunkedArray", {
+ a <- ChunkedArray$create(1:4, c(1:4, NA), 1:5)
+ expect_r6_class(sum(a), "Scalar")
+ expect_true(is.na(as.vector(sum(a))))
+ expect_identical(as.numeric(sum(a, na.rm = TRUE)), 35)
+})
+
+test_that("sum dots", {
+ a1 <- Array$create(1:4)
+ a2 <- ChunkedArray$create(1:4, c(1:4, NA), 1:5)
+ expect_identical(as.numeric(sum(a1, a2, na.rm = TRUE)), 45)
+})
+
+test_that("sum.Scalar", {
+ s <- Scalar$create(4)
+ expect_identical(as.numeric(s), as.numeric(sum(s)))
+})
+
+test_that("mean.Array", {
+ ints <- 1:4
+ a <- Array$create(ints)
+ expect_r6_class(mean(a), "Scalar")
+ expect_identical(as.vector(mean(a)), mean(ints))
+
+ floats <- c(1.3, 2.4, 3)
+ f <- Array$create(floats)
+ expect_identical(as.vector(mean(f)), mean(floats))
+
+ floats <- c(floats, NA)
+ na <- Array$create(floats)
+ if (!grepl("devel", R.version.string)) {
+ # Valgrind on R-devel confuses NaN and NA_real_
+ # https://r.789695.n4.nabble.com/Difference-in-NA-behavior-in-R-devel-running-under-valgrind-td4768731.html
+ expect_identical(as.vector(mean(na)), mean(floats))
+ }
+ expect_r6_class(mean(na, na.rm = TRUE), "Scalar")
+ expect_identical(as.vector(mean(na, na.rm = TRUE)), mean(floats, na.rm = TRUE))
+
+ bools <- c(TRUE, NA, TRUE, FALSE)
+ b <- Array$create(bools)
+ expect_identical(as.vector(mean(b)), mean(bools))
+  expect_identical(as.vector(mean(b, na.rm = TRUE)), mean(bools, na.rm = TRUE))
+})
+
+test_that("mean.ChunkedArray", {
+ a <- ChunkedArray$create(1:4, c(1:4, NA), 1:5)
+ expect_r6_class(mean(a), "Scalar")
+ expect_true(is.na(as.vector(mean(a))))
+ expect_identical(as.vector(mean(a, na.rm = TRUE)), 35 / 13)
+})
+
+test_that("mean.Scalar", {
+ s <- Scalar$create(4)
+ expect_equal(s, mean(s))
+})
+
+test_that("Bad input handling of call_function", {
+ expect_error(
+ call_function("sum", 2, 3),
+ 'Argument 1 is of class numeric but it must be one of "Array", "ChunkedArray", "RecordBatch", "Table", or "Scalar"'
+ )
+})
+
+test_that("min.Array", {
+ ints <- 1:4
+ a <- Array$create(ints)
+ expect_r6_class(min(a), "Scalar")
+ expect_identical(as.vector(min(a)), min(ints))
+
+ floats <- c(1.3, 3, 2.4)
+ f <- Array$create(floats)
+ expect_identical(as.vector(min(f)), min(floats))
+
+ floats <- c(floats, NA)
+ na <- Array$create(floats)
+ expect_identical(as.vector(min(na)), min(floats))
+ expect_r6_class(min(na, na.rm = TRUE), "Scalar")
+ expect_identical(as.vector(min(na, na.rm = TRUE)), min(floats, na.rm = TRUE))
+
+ bools <- c(TRUE, TRUE, FALSE)
+ b <- Array$create(bools)
+ # R is inconsistent here: typeof(min(NA)) == "integer", not "logical"
+ expect_identical(as.vector(min(b)), as.logical(min(bools)))
+})
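+
+# A base R illustration of the inconsistency noted above (base R behavior,
+# not arrow): min() and max() coerce logical input to integer, so
+# typeof(min(NA)) and typeof(max(c(TRUE, FALSE))) are both "integer", which
+# is why the tests compare against as.logical(min(bools)).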
+
+test_that("max.Array", {
+ ints <- 1:4
+ a <- Array$create(ints)
+ expect_r6_class(max(a), "Scalar")
+ expect_identical(as.vector(max(a)), max(ints))
+
+ floats <- c(1.3, 3, 2.4)
+ f <- Array$create(floats)
+ expect_identical(as.vector(max(f)), max(floats))
+
+ floats <- c(floats, NA)
+ na <- Array$create(floats)
+ expect_identical(as.vector(max(na)), max(floats))
+ expect_r6_class(max(na, na.rm = TRUE), "Scalar")
+ expect_identical(as.vector(max(na, na.rm = TRUE)), max(floats, na.rm = TRUE))
+
+ bools <- c(TRUE, TRUE, FALSE)
+ b <- Array$create(bools)
+ # R is inconsistent here: typeof(max(NA)) == "integer", not "logical"
+ expect_identical(as.vector(max(b)), as.logical(max(bools)))
+})
+
+test_that("min.ChunkedArray", {
+ ints <- 1:4
+ a <- ChunkedArray$create(ints)
+ expect_r6_class(min(a), "Scalar")
+ expect_identical(as.vector(min(a)), min(ints))
+
+ floats <- c(1.3, 3, 2.4)
+ f <- ChunkedArray$create(floats)
+ expect_identical(as.vector(min(f)), min(floats))
+
+ floats <- c(floats, NA)
+ na <- ChunkedArray$create(floats)
+ expect_identical(as.vector(min(na)), min(floats))
+ expect_r6_class(min(na, na.rm = TRUE), "Scalar")
+ expect_identical(as.vector(min(na, na.rm = TRUE)), min(floats, na.rm = TRUE))
+
+ bools <- c(TRUE, TRUE, FALSE)
+ b <- ChunkedArray$create(bools)
+ # R is inconsistent here: typeof(min(NA)) == "integer", not "logical"
+ expect_identical(as.vector(min(b)), as.logical(min(bools)))
+})
+
+test_that("max.ChunkedArray", {
+ ints <- 1:4
+ a <- ChunkedArray$create(ints)
+ expect_r6_class(max(a), "Scalar")
+ expect_identical(as.vector(max(a)), max(ints))
+
+ floats <- c(1.3, 3, 2.4)
+ f <- ChunkedArray$create(floats)
+ expect_identical(as.vector(max(f)), max(floats))
+
+ floats <- c(floats, NA)
+ na <- ChunkedArray$create(floats)
+ expect_identical(as.vector(max(na)), max(floats))
+ expect_r6_class(max(na, na.rm = TRUE), "Scalar")
+ expect_identical(as.vector(max(na, na.rm = TRUE)), max(floats, na.rm = TRUE))
+
+ bools <- c(TRUE, TRUE, FALSE)
+ b <- ChunkedArray$create(bools)
+ # R is inconsistent here: typeof(max(NA)) == "integer", not "logical"
+ expect_identical(as.vector(max(b)), as.logical(max(bools)))
+})
+
+test_that("Edge cases", {
+ a <- Array$create(NA)
+ for (type in c(int32(), float64(), bool())) {
+ expect_as_vector(sum(a$cast(type), na.rm = TRUE), sum(NA, na.rm = TRUE))
+ expect_as_vector(mean(a$cast(type), na.rm = TRUE), mean(NA, na.rm = TRUE))
+ expect_as_vector(
+ min(a$cast(type), na.rm = TRUE),
+ # Suppress the base R warning about no non-missing arguments
+ suppressWarnings(min(NA, na.rm = TRUE))
+ )
+ expect_as_vector(
+ max(a$cast(type), na.rm = TRUE),
+ suppressWarnings(max(NA, na.rm = TRUE))
+ )
+ }
+})
+
+test_that("quantile.Array and quantile.ChunkedArray", {
+ a <- Array$create(c(0, 1, 2, 3))
+ ca <- ChunkedArray$create(c(0, 1), c(2, 3))
+ probs <- c(0.49, 0.51)
+ for (ad in list(a, ca)) {
+ for (type in c(int32(), uint64(), float64())) {
+ expect_equal(
+ quantile(ad$cast(type), probs = probs, interpolation = "linear"),
+ Array$create(c(1.47, 1.53))
+ )
+ expect_equal(
+ quantile(ad$cast(type), probs = probs, interpolation = "lower"),
+ Array$create(c(1, 1))$cast(type)
+ )
+ expect_equal(
+ quantile(ad$cast(type), probs = probs, interpolation = "higher"),
+ Array$create(c(2, 2))$cast(type)
+ )
+ expect_equal(
+ quantile(ad$cast(type), probs = probs, interpolation = "nearest"),
+ Array$create(c(1, 2))$cast(type)
+ )
+ expect_equal(
+ quantile(ad$cast(type), probs = probs, interpolation = "midpoint"),
+ Array$create(c(1.5, 1.5))
+ )
+ }
+ }
+})
+
+test_that("quantile and median NAs, edge cases, and exceptions", {
+ expect_equal(
+ quantile(Array$create(c(1, 2)), probs = c(0, 1)),
+ Array$create(c(1, 2))
+ )
+ expect_error(
+ quantile(Array$create(c(1, 2, NA))),
+ "Missing values not allowed if 'na.rm' is FALSE"
+ )
+ expect_equal(
+ quantile(Array$create(numeric(0))),
+ Array$create(rep(NA_real_, 5))
+ )
+ expect_equal(
+ quantile(Array$create(rep(NA_integer_, 3)), na.rm = TRUE),
+ Array$create(rep(NA_real_, 5))
+ )
+ expect_equal(
+ quantile(Scalar$create(0L)),
+ Array$create(rep(0, 5))
+ )
+ expect_equal(
+ median(Scalar$create(1L)),
+ Scalar$create(1)
+ )
+ expect_error(
+ quantile(Array$create(1:3), type = 9),
+ "not supported"
+ )
+})
+
+test_that("median passes ... args to quantile", {
+ skip_if(
+ !"..." %in% names(formals(median)),
+ "The median generic lacks dots in R 3.3.0 and earlier"
+ )
+ expect_equal(
+ median(Array$create(c(1, 2)), interpolation = "higher"),
+ Scalar$create(2)
+ )
+ expect_error(
+ median(Array$create(c(1, 2)), probs = c(.25, .75))
+ )
+})
+
+test_that("median.Array and median.ChunkedArray", {
+ compare_expression(
+ median(.input),
+ 1:4
+ )
+ compare_expression(
+ median(.input),
+ 1:5
+ )
+ compare_expression(
+ median(.input),
+ numeric(0)
+ )
+ compare_expression(
+ median(.input, na.rm = FALSE),
+ c(1, 2, NA)
+ )
+ compare_expression(
+ median(.input, na.rm = TRUE),
+ c(1, 2, NA)
+ )
+ compare_expression(
+ median(.input, na.rm = TRUE),
+ NA_real_
+ )
+})
+
+test_that("unique.Array", {
+ a <- Array$create(c(1, 4, 3, 1, 1, 3, 4))
+ expect_equal(unique(a), Array$create(c(1, 4, 3)))
+ ca <- ChunkedArray$create(a, a)
+ expect_equal(unique(ca), Array$create(c(1, 4, 3)))
+})
+
+test_that("match_arrow", {
+ a <- Array$create(c(1, 4, 3, 1, 1, 3, 4))
+ tab <- c(4, 3, 2, 1)
+ expect_equal(match_arrow(a, tab), Array$create(c(3L, 0L, 1L, 3L, 3L, 1L, 0L)))
+
+ ca <- ChunkedArray$create(c(1, 4, 3, 1, 1, 3, 4))
+ expect_equal(match_arrow(ca, tab), ChunkedArray$create(c(3L, 0L, 1L, 3L, 3L, 1L, 0L)))
+
+ sc <- Scalar$create(3)
+ expect_equal(match_arrow(sc, tab), Scalar$create(1L))
+
+ vec <- c(1, 2)
+ expect_equal(match_arrow(vec, tab), Array$create(c(3L, 2L)))
+})
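+
+# Note for readers: base::match() returns 1-based positions (here it would
+# give c(4L, 1L, 2L, 4L, 4L, 2L, 1L)), whereas match_arrow() returns 0-based
+# indices into the lookup table.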
+
+test_that("is_in", {
+ a <- Array$create(c(9, 4, 3))
+ tab <- c(4, 3, 2, 1)
+ expect_equal(is_in(a, tab), Array$create(c(FALSE, TRUE, TRUE)))
+
+ ca <- ChunkedArray$create(c(9, 4, 3))
+ expect_equal(is_in(ca, tab), ChunkedArray$create(c(FALSE, TRUE, TRUE)))
+
+ sc <- Scalar$create(3)
+ expect_equal(is_in(sc, tab), Scalar$create(TRUE))
+
+ vec <- c(1, 9)
+ expect_equal(is_in(vec, tab), Array$create(c(TRUE, FALSE)))
+})
+
+test_that("value_counts", {
+ a <- Array$create(c(1, 4, 3, 1, 1, 3, 4))
+ result_df <- tibble::tibble(
+ values = c(1, 4, 3),
+ counts = c(3L, 2L, 2L)
+ )
+ result <- Array$create(
+ result_df,
+ type = struct(values = float64(), counts = int64())
+ )
+ expect_equal(value_counts(a), result)
+ expect_identical(as.data.frame(value_counts(a)), result_df)
+ expect_identical(as.vector(value_counts(a)$counts), result_df$counts)
+})
+
+test_that("any.Array and any.ChunkedArray", {
+ data <- c(1:10, NA, NA)
+
+ compare_expression(any(.input > 5), data)
+ compare_expression(any(.input > 5, na.rm = TRUE), data)
+ compare_expression(any(.input < 1), data)
+ compare_expression(any(.input < 1, na.rm = TRUE), data)
+
+ data_logical <- c(TRUE, FALSE, TRUE, NA, FALSE)
+
+ compare_expression(any(.input), data_logical)
+ compare_expression(any(.input, na.rm = FALSE), data_logical)
+ compare_expression(any(.input, na.rm = TRUE), data_logical)
+})
+
+test_that("all.Array and all.ChunkedArray", {
+ data <- c(1:10, NA, NA)
+
+ compare_expression(all(.input > 5), data)
+ compare_expression(all(.input > 5, na.rm = TRUE), data)
+
+ compare_expression(all(.input < 11), data)
+ compare_expression(all(.input < 11, na.rm = TRUE), data)
+
+ data_logical <- c(TRUE, TRUE, NA)
+
+ compare_expression(all(.input), data_logical)
+ compare_expression(all(.input, na.rm = TRUE), data_logical)
+})
+
+test_that("variance", {
+ data <- c(-37, 267, 88, -120, 9, 101, -65, -23, NA)
+ arr <- Array$create(data)
+ chunked_arr <- ChunkedArray$create(data)
+
+ expect_equal(call_function("variance", arr, options = list(ddof = 5)), Scalar$create(34596))
+ expect_equal(call_function("variance", chunked_arr, options = list(ddof = 5)), Scalar$create(34596))
+})
+
+test_that("stddev", {
+ data <- c(-37, 267, 88, -120, 9, 101, -65, -23, NA)
+ arr <- Array$create(data)
+ chunked_arr <- ChunkedArray$create(data)
+
+ expect_equal(call_function("stddev", arr, options = list(ddof = 5)), Scalar$create(186))
+ expect_equal(call_function("stddev", chunked_arr, options = list(ddof = 5)), Scalar$create(186))
+})
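+
+# Worked check for the ddof tests above: variance = sum((x - mean(x))^2) /
+# (n - ddof). The eight non-NA values have mean 27.5 and squared deviations
+# summing to 103788, so 103788 / (8 - 5) = 34596, and sqrt(34596) = 186.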
diff --git a/src/arrow/r/tests/testthat/test-compute-arith.R b/src/arrow/r/tests/testthat/test-compute-arith.R
new file mode 100644
index 000000000..e8674e315
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-compute-arith.R
@@ -0,0 +1,129 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("Addition", {
+ a <- Array$create(c(1:4, NA_integer_))
+ expect_type_equal(a, int32())
+ expect_type_equal(a + 4L, int32())
+ expect_type_equal(a + 4, float64())
+ expect_equal(a + 4L, Array$create(c(5:8, NA_integer_)))
+ expect_identical(as.vector(a + 4L), c(5:8, NA_integer_))
+ expect_equal(a + 4L, Array$create(c(5:8, NA_integer_)))
+ expect_as_vector(a + 4L, c(5:8, NA_integer_))
+ expect_equal(a + NA_integer_, Array$create(rep(NA_integer_, 5)))
+
+ a8 <- a$cast(int8())
+ expect_type_equal(a8 + Scalar$create(1, int8()), int8())
+
+ # int8 will be promoted to int32 when added to int32
+ expect_type_equal(a8 + 127L, int32())
+ expect_equal(a8 + 127L, Array$create(c(128:131, NA_integer_)))
+
+ b <- Array$create(c(4:1, NA_integer_))
+ expect_type_equal(a8 + b, int32())
+ expect_equal(a8 + b, Array$create(c(5L, 5L, 5L, 5L, NA_integer_)))
+
+ expect_type_equal(a + 4.1, float64())
+ expect_equal(a + 4.1, Array$create(c(5.1, 6.1, 7.1, 8.1, NA_real_)))
+})
+
+test_that("Subtraction", {
+ a <- Array$create(c(1:4, NA_integer_))
+ expect_equal(a - 3L, Array$create(c(-2:1, NA_integer_)))
+
+ expect_equal(
+ Array$create(c(5.1, 6.1, 7.1, 8.1, NA_real_)) - a,
+ Array$create(c(4.1, 4.1, 4.1, 4.1, NA_real_))
+ )
+})
+
+test_that("Multiplication", {
+ a <- Array$create(c(1:4, NA_integer_))
+ expect_equal(a * 2L, Array$create(c(1:4 * 2L, NA_integer_)))
+
+ expect_equal(
+ (a * 0.5) * 3L,
+ Array$create(c(1.5, 3, 4.5, 6, NA_real_))
+ )
+})
+
+test_that("Division", {
+ a <- Array$create(c(1:4, NA_integer_))
+ expect_equal(a / 2, Array$create(c(1:4 / 2, NA_real_)))
+ expect_equal(a %/% 2, Array$create(c(0L, 1L, 1L, 2L, NA_integer_)))
+ expect_equal(a / 2 / 2, Array$create(c(1:4 / 2 / 2, NA_real_)))
+ expect_equal(a %/% 2 %/% 2, Array$create(c(0L, 0L, 0L, 1L, NA_integer_)))
+ expect_equal(a / 0, Array$create(c(Inf, Inf, Inf, Inf, NA_real_)))
+ # TODO add tests for integer division %/% by 0
+ # see https://issues.apache.org/jira/browse/ARROW-14297
+
+ b <- a$cast(float64())
+ expect_equal(b / 2, Array$create(c(1:4 / 2, NA_real_)))
+ expect_equal(b %/% 2, Array$create(c(0L, 1L, 1L, 2L, NA_integer_)))
+ expect_equal(b / 0, Array$create(c(Inf, Inf, Inf, Inf, NA_real_)))
+ # TODO add tests for integer division %/% by 0
+ # see https://issues.apache.org/jira/browse/ARROW-14297
+
+  # the behavior of %/% matches R's (i.e. the floor of the quotient, not
+  # integer division after truncating the divisor)
+ expect_equal(b / 2.2, Array$create(c(1:4 / 2.2, NA_real_)))
+ # nolint start
+ # c(1:4) %/% 2.2 != c(1:4) %/% as.integer(2.2)
+ # c(1:4) %/% 2.2 == c(0L, 0L, 1L, 1L)
+ # c(1:4) %/% as.integer(2.2) == c(0L, 1L, 1L, 2L)
+ # nolint end
+ expect_equal(b %/% 2.2, Array$create(c(0L, 0L, 1L, 1L, NA_integer_)))
+
+ expect_equal(a %% 2, Array$create(c(1L, 0L, 1L, 0L, NA_integer_)))
+
+ expect_equal(b %% 2, Array$create(c(1:4 %% 2, NA_real_)))
+})
+
+test_that("Power", {
+ a <- Array$create(c(1:4, NA_integer_))
+ b <- a$cast(float64())
+ c <- a$cast(int64())
+ d <- a$cast(uint64())
+
+ expect_equal(a^0, Array$create(c(1, 1, 1, 1, NA_real_)))
+ expect_equal(a^2, Array$create(c(1, 4, 9, 16, NA_real_)))
+ expect_equal(a^(-1), Array$create(c(1, 1 / 2, 1 / 3, 1 / 4, NA_real_)))
+ expect_equal(a^(.5), Array$create(c(1, sqrt(2), sqrt(3), sqrt(4), NA_real_)))
+
+ expect_equal(b^0, Array$create(c(1, 1, 1, 1, NA_real_)))
+ expect_equal(b^2, Array$create(c(1, 4, 9, 16, NA_real_)))
+ expect_equal(b^(-1), Array$create(c(1, 1 / 2, 1 / 3, 1 / 4, NA_real_)))
+ expect_equal(b^(.5), Array$create(c(1, sqrt(2), sqrt(3), sqrt(4), NA_real_)))
+
+ expect_equal(c^0, Array$create(c(1, 1, 1, 1, NA_real_)))
+ expect_equal(c^2, Array$create(c(1, 4, 9, 16, NA_real_)))
+ expect_equal(c^(-1), Array$create(c(1, 1 / 2, 1 / 3, 1 / 4, NA_real_)))
+ expect_equal(c^(.5), Array$create(c(1, sqrt(2), sqrt(3), sqrt(4), NA_real_)))
+
+ expect_equal(d^0, Array$create(c(1, 1, 1, 1, NA_real_)))
+ expect_equal(d^2, Array$create(c(1, 4, 9, 16, NA_real_)))
+ expect_equal(d^(-1), Array$create(c(1, 1 / 2, 1 / 3, 1 / 4, NA_real_)))
+ expect_equal(d^(.5), Array$create(c(1, sqrt(2), sqrt(3), sqrt(4), NA_real_)))
+})
+
+test_that("Dates casting", {
+ a <- Array$create(c(Sys.Date() + 1:4, NA_integer_))
+
+ skip("ARROW-11090 (date/datetime arithmetic)")
+ # Error: NotImplemented: Function add_checked has no kernel matching input types (array[date32[day]], scalar[double])
+  expect_equal(a + 2, Array$create(c((Sys.Date() + 1:4) + 2, NA_integer_)))
+})
diff --git a/src/arrow/r/tests/testthat/test-compute-no-bindings.R b/src/arrow/r/tests/testthat/test-compute-no-bindings.R
new file mode 100644
index 000000000..a51d797a4
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-compute-no-bindings.R
@@ -0,0 +1,201 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("non-bound compute kernels using TrimOptions", {
+ skip_if_not_available("utf8proc")
+ expect_equal(
+ call_function(
+ "utf8_trim",
+ Scalar$create("abracadabra"),
+ options = list(characters = "ab")
+ ),
+ Scalar$create("racadabr")
+ )
+
+ expect_equal(
+ call_function(
+ "utf8_ltrim",
+ Scalar$create("abracadabra"),
+ options = list(characters = "ab")
+ ),
+ Scalar$create("racadabra")
+ )
+
+ expect_equal(
+ call_function(
+ "utf8_rtrim",
+ Scalar$create("abracadabra"),
+ options = list(characters = "ab")
+ ),
+ Scalar$create("abracadabr")
+ )
+
+ expect_equal(
+ call_function(
+ "ascii_ltrim",
+ Scalar$create("abracadabra"),
+ options = list(characters = "ab")
+ ),
+ Scalar$create("racadabra")
+ )
+
+ expect_equal(
+ call_function(
+ "ascii_rtrim",
+ Scalar$create("abracadabra"),
+ options = list(characters = "ab")
+ ),
+ Scalar$create("abracadabr")
+ )
+})
+
+test_that("non-bound compute kernels using ReplaceSliceOptions", {
+ skip_if_not_available("utf8proc")
+
+ expect_equal(
+ call_function(
+ "binary_replace_slice",
+ Array$create("I need to fix this string"),
+ options = list(start = 1, stop = 1, replacement = " don't")
+ ),
+ Array$create("I don't need to fix this string")
+ )
+
+ expect_equal(
+ call_function(
+ "utf8_replace_slice",
+ Array$create("I need to fix this string"),
+ options = list(start = 1, stop = 1, replacement = " don't")
+ ),
+ Array$create("I don't need to fix this string")
+ )
+})
+
+test_that("non-bound compute kernels using ModeOptions", {
+ expect_equal(
+ as.vector(
+ call_function("mode", Array$create(c(1:10, 10, 9, NA)), options = list(n = 3))
+ ),
+ tibble::tibble("mode" = c(9, 10, 1), "count" = c(2L, 2L, 1L))
+ )
+
+ expect_equal(
+ as.vector(
+ call_function("mode", Array$create(c(1:10, 10, 9, NA)), options = list(n = 3, skip_nulls = FALSE))
+ ),
+ tibble::tibble("mode" = numeric(), "count" = integer())
+ )
+})
+
+test_that("non-bound compute kernels using PartitionNthOptions", {
+ result <- call_function(
+ "partition_nth_indices",
+ Array$create(c(11:20)),
+ options = list(pivot = 3)
+ )
+ # Order of indices on either side of the pivot is not deterministic
+ # (depends on C++ standard library implementation)
+ expect_true(all(as.vector(result[1:3]) < 3))
+ expect_true(all(as.vector(result[4:10]) >= 3))
+})
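+
+# partition_nth_indices behaves like C++ std::nth_element: it returns a
+# permutation of indices such that everything before position `pivot` sorts
+# at or below everything from `pivot` onward, without fully sorting either
+# side; hence the order-agnostic assertions above.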
+
+
+test_that("non-bound compute kernels using MatchSubstringOptions", {
+ skip_if_not_available("utf8proc")
+
+ # Remove this test when ARROW-13924 has been completed
+ expect_equal(
+ call_function(
+ "starts_with",
+ Array$create(c("abracadabra", "abacus", "abdicate", "abrasive")),
+ options = list(pattern = "abr")
+ ),
+ Array$create(c(TRUE, FALSE, FALSE, TRUE))
+ )
+
+ # Remove this test when ARROW-13924 has been completed
+ expect_equal(
+ call_function(
+ "ends_with",
+ Array$create(c("abracadabra", "abacus", "abdicate", "abrasive")),
+ options = list(pattern = "e")
+ ),
+ Array$create(c(FALSE, FALSE, TRUE, TRUE))
+ )
+
+ # Remove this test when ARROW-13156 has been completed
+ expect_equal(
+ as.vector(
+ call_function(
+ "count_substring",
+ Array$create(c("abracadabra", "abacus", "abdicate", "abrasive")),
+ options = list(pattern = "e")
+ )
+ ),
+ c(0, 0, 1, 1)
+ )
+
+ skip_if_not_available("re2")
+
+ # Remove this test when ARROW-13156 has been completed
+ expect_equal(
+ as.vector(
+ call_function(
+ "count_substring_regex",
+ Array$create(c("abracadabra", "abacus", "abdicate", "abrasive")),
+ options = list(pattern = "e")
+ )
+ ),
+ c(0, 0, 1, 1)
+ )
+})
+
+test_that("non-bound compute kernels using ExtractRegexOptions", {
+ skip_if_not_available("re2")
+ expect_equal(
+ call_function("extract_regex", Scalar$create("abracadabra"), options = list(pattern = "(?P<letter>[a])")),
+ Scalar$create(tibble::tibble(letter = "a"))
+ )
+})
+
+test_that("non-bound compute kernels using IndexOptions", {
+ expect_equal(
+ as.vector(
+ call_function("index", Array$create(c(10, 20, 30, 40)), options = list(value = Scalar$create(40)))
+ ),
+ 3
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-compute-sort.R b/src/arrow/r/tests/testthat/test-compute-sort.R
new file mode 100644
index 000000000..e3574d86f
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-compute-sort.R
@@ -0,0 +1,155 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+library(dplyr, warn.conflicts = FALSE)
+
+# randomize order of rows in test data
+tbl <- slice_sample(example_data_for_sorting, prop = 1L)
+
+test_that("sort(Scalar) is identity function", {
+ int <- Scalar$create(42L)
+ expect_equal(sort(int), int)
+ dbl <- Scalar$create(3.14)
+ expect_equal(sort(dbl), dbl)
+ chr <- Scalar$create("foo")
+ expect_equal(sort(chr), chr)
+})
+
+test_that("Array$SortIndices()", {
+ int <- tbl$int
+ # Remove ties because they could give non-deterministic sort indices, and this
+ # test compares sort indices. Other tests compare sorted values, which are
+ # deterministic in the case of ties.
+ int <- int[!duplicated(int)]
+ expect_equal(
+ Array$create(int)$SortIndices(),
+ Array$create(order(int) - 1L, type = uint64())
+ )
+ # Need to remove NAs because ARROW-12063
+ int <- na.omit(int)
+ expect_equal(
+ Array$create(int)$SortIndices(descending = TRUE),
+ Array$create(rev(order(int)) - 1, type = uint64())
+ )
+})
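+
+# Note for readers: SortIndices() returns 0-based positions (as a uint64
+# Array), while base R's order() is 1-based, hence the `- 1L` adjustments
+# when comparing.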
+
+test_that("ChunkedArray$SortIndices()", {
+ int <- tbl$int
+ # Remove ties because they could give non-deterministic sort indices, and this
+ # test compares sort indices. Other tests compare sorted values, which are
+ # deterministic in the case of ties.
+ int <- int[!duplicated(int)]
+ expect_equal(
+ ChunkedArray$create(int[1:4], int[5:length(int)])$SortIndices(),
+ Array$create(order(int) - 1L, type = uint64())
+ )
+ # Need to remove NAs because ARROW-12063
+ int <- na.omit(int)
+ expect_equal(
+ ChunkedArray$create(int[1:4], int[5:length(int)])$SortIndices(descending = TRUE),
+ Array$create(rev(order(int)) - 1, type = uint64())
+ )
+})
+
+test_that("sort(vector), sort(Array), sort(ChunkedArray) give equivalent results on integers", {
+ compare_expression(
+ sort(.input),
+ tbl$int
+ )
+ compare_expression(
+ sort(.input, na.last = NA),
+ tbl$int
+ )
+ compare_expression(
+ sort(.input, na.last = TRUE),
+ tbl$int
+ )
+ compare_expression(
+ sort(.input, na.last = FALSE),
+ tbl$int
+ )
+ compare_expression(
+ sort(.input, decreasing = TRUE),
+ tbl$int,
+ )
+ compare_expression(
+ sort(.input, decreasing = TRUE, na.last = TRUE),
+ tbl$int,
+ )
+ compare_expression(
+ sort(.input, decreasing = TRUE, na.last = FALSE),
+ tbl$int,
+ )
+})
+
+test_that("sort(vector), sort(Array), sort(ChunkedArray) give equivalent results on strings", {
+ compare_expression(
+ sort(.input, decreasing = TRUE, na.last = FALSE),
+ tbl$chr
+ )
+})
+
+test_that("sort(vector), sort(Array), sort(ChunkedArray) give equivalent results on floats", {
+ compare_expression(
+ sort(.input, decreasing = TRUE, na.last = TRUE),
+ tbl$dbl
+ )
+ compare_expression(
+ sort(.input, decreasing = FALSE, na.last = TRUE),
+ tbl$dbl
+ )
+ compare_expression(
+ sort(.input, decreasing = TRUE, na.last = NA),
+ tbl$dbl
+ )
+ compare_expression(
+ sort(.input, decreasing = TRUE, na.last = FALSE),
+ tbl$dbl,
+ )
+ compare_expression(
+ sort(.input, decreasing = FALSE, na.last = NA),
+ tbl$dbl
+ )
+ compare_expression(
+ sort(.input, decreasing = FALSE, na.last = FALSE),
+ tbl$dbl,
+ )
+})
+
+test_that("Table$SortIndices()", {
+ x <- Table$create(tbl)
+ expect_identical(
+ as.vector(x$Take(x$SortIndices("chr"))$chr),
+ sort(tbl$chr, na.last = TRUE)
+ )
+ expect_identical(
+ as.data.frame(x$Take(x$SortIndices(c("int", "dbl"), c(FALSE, FALSE)))),
+ tbl %>% arrange(int, dbl)
+ )
+})
+
+test_that("RecordBatch$SortIndices()", {
+ x <- record_batch(tbl)
+ expect_identical(
+ as.data.frame(x$Take(x$SortIndices(c("chr", "int", "dbl"), TRUE))),
+ tbl %>% arrange(desc(chr), desc(int), desc(dbl))
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-compute-vector.R b/src/arrow/r/tests/testthat/test-compute-vector.R
new file mode 100644
index 000000000..345da5656
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-compute-vector.R
@@ -0,0 +1,133 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+expect_bool_function_equal <- function(array_exp, r_exp) {
+ # Assert that the Array operation returns a boolean array
+ # and that its contents are equal to expected
+ expect_r6_class(array_exp, "ArrowDatum")
+ expect_type_equal(array_exp, bool())
+ expect_identical(as.vector(array_exp), r_exp)
+}
+
+expect_array_compares <- function(x, compared_to) {
+ r_values <- as.vector(x)
+ r_compared_to <- as.vector(compared_to)
+ # Iterate over all comparison functions
+ expect_bool_function_equal(x == compared_to, r_values == r_compared_to)
+ expect_bool_function_equal(x != compared_to, r_values != r_compared_to)
+ expect_bool_function_equal(x > compared_to, r_values > r_compared_to)
+ expect_bool_function_equal(x >= compared_to, r_values >= r_compared_to)
+ expect_bool_function_equal(x < compared_to, r_values < r_compared_to)
+ expect_bool_function_equal(x <= compared_to, r_values <= r_compared_to)
+}
+
+test_that("compare ops with Array", {
+ a <- Array$create(1:5)
+ expect_array_compares(a, 4L)
+ expect_array_compares(a, 4) # implicit casting
+ expect_array_compares(a, Scalar$create(4))
+ expect_array_compares(Array$create(c(NA, 1:5)), 4)
+ expect_array_compares(Array$create(as.numeric(c(NA, 1:5))), 4)
+ expect_array_compares(Array$create(c(NA, 1:5)), Array$create(rev(c(NA, 1:5))))
+ expect_array_compares(Array$create(c(NA, 1:5)), Array$create(rev(c(NA, 1:5)), type = double()))
+})
+
+test_that("compare ops with ChunkedArray", {
+ expect_array_compares(ChunkedArray$create(1:3, 4:5), 4L)
+ expect_array_compares(ChunkedArray$create(1:3, 4:5), 4) # implicit casting
+ expect_array_compares(ChunkedArray$create(1:3, 4:5), Scalar$create(4))
+ expect_array_compares(ChunkedArray$create(c(NA, 1:3), 4:5), 4)
+ expect_array_compares(
+ ChunkedArray$create(c(NA, 1:3), 4:5),
+ ChunkedArray$create(4:5, c(NA, 1:3))
+ )
+ expect_array_compares(
+ ChunkedArray$create(c(NA, 1:3), 4:5),
+ Array$create(c(NA, 1:5))
+ )
+ expect_array_compares(
+ Array$create(c(NA, 1:5)),
+ ChunkedArray$create(c(NA, 1:3), 4:5)
+ )
+})
+
+test_that("logic ops with Array", {
+ truth <- expand.grid(left = c(TRUE, FALSE, NA), right = c(TRUE, FALSE, NA))
+ a_left <- Array$create(truth$left)
+ a_right <- Array$create(truth$right)
+ expect_bool_function_equal(a_left & a_right, truth$left & truth$right)
+ expect_bool_function_equal(a_left | a_right, truth$left | truth$right)
+ expect_bool_function_equal(a_left == a_right, truth$left == truth$right)
+ expect_bool_function_equal(a_left != a_right, truth$left != truth$right)
+ expect_bool_function_equal(!a_left, !truth$left)
+
+ # More complexity
+ isEqualTo <- function(x, y) x == y & !is.na(x)
+ expect_bool_function_equal(
+ isEqualTo(a_left, a_right),
+ isEqualTo(truth$left, truth$right)
+ )
+})
+
+test_that("logic ops with ChunkedArray", {
+ truth <- expand.grid(left = c(TRUE, FALSE, NA), right = c(TRUE, FALSE, NA))
+ a_left <- ChunkedArray$create(truth$left)
+ a_right <- ChunkedArray$create(truth$right)
+ expect_bool_function_equal(a_left & a_right, truth$left & truth$right)
+ expect_bool_function_equal(a_left | a_right, truth$left | truth$right)
+ expect_bool_function_equal(a_left == a_right, truth$left == truth$right)
+ expect_bool_function_equal(a_left != a_right, truth$left != truth$right)
+ expect_bool_function_equal(!a_left, !truth$left)
+
+ # More complexity
+ isEqualTo <- function(x, y) x == y & !is.na(x)
+ expect_bool_function_equal(
+ isEqualTo(a_left, a_right),
+ isEqualTo(truth$left, truth$right)
+ )
+})
+
+test_that("call_function validation", {
+ expect_error(
+ call_function("filter", 4),
+ 'Argument 1 is of class numeric but it must be one of "Array", "ChunkedArray", "RecordBatch", "Table", or "Scalar"'
+ )
+ expect_error(
+ call_function("filter", Array$create(1:4), 3),
+ "Argument 2 is of class numeric"
+ )
+ expect_error(
+ call_function("filter",
+ Array$create(1:4),
+ Array$create(c(TRUE, FALSE, TRUE)),
+ options = list(keep_na = TRUE)
+ ),
+ "Array arguments must all be the same length"
+ )
+ expect_error(
+ call_function("filter",
+ record_batch(a = 1:3),
+ Array$create(c(TRUE, FALSE, TRUE)),
+ options = list(keep_na = TRUE)
+ ),
+ NA
+ )
+ expect_error(
+ call_function("filter", options = list(keep_na = TRUE)),
+ "accepts 2 arguments"
+ )
+})
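+
+# For reference, the non-erroring call above can be used directly (a sketch,
+# relying only on what the expectation asserts): it filters the RecordBatch
+# down to the rows where the boolean mask is TRUE.
+#   call_function("filter",
+#     record_batch(a = 1:3),
+#     Array$create(c(TRUE, FALSE, TRUE)),
+#     options = list(keep_na = TRUE)
+#   ) # keeps the rows where a is 1 and 3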
diff --git a/src/arrow/r/tests/testthat/test-csv.R b/src/arrow/r/tests/testthat/test-csv.R
new file mode 100644
index 000000000..023eee92e
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-csv.R
@@ -0,0 +1,357 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Not all types round-trip via CSV 100% identically by default
+tbl <- example_data[, c("dbl", "lgl", "false", "chr")]
+tbl_no_dates <- tbl
+# Add a date to test its parsing
+tbl$date <- Sys.Date() + 1:10
+
+csv_file <- tempfile()
+
+test_that("Can read csv file", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write.csv(tbl, tf, row.names = FALSE)
+
+ tab0 <- Table$create(tbl)
+ tab1 <- read_csv_arrow(tf, as_data_frame = FALSE)
+ expect_equal(tab0, tab1)
+ tab2 <- read_csv_arrow(mmap_open(tf), as_data_frame = FALSE)
+ expect_equal(tab0, tab2)
+ tab3 <- read_csv_arrow(ReadableFile$create(tf), as_data_frame = FALSE)
+ expect_equal(tab0, tab3)
+})
+
+test_that("read_csv_arrow(as_data_frame=TRUE)", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write.csv(tbl, tf, row.names = FALSE)
+ tab1 <- read_csv_arrow(tf, as_data_frame = TRUE)
+ expect_equal(tbl, tab1)
+})
+
+test_that("read_delim_arrow parsing options: delim", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write.table(tbl, tf, sep = "\t", row.names = FALSE)
+ tab1 <- read_tsv_arrow(tf)
+ tab2 <- read_delim_arrow(tf, delim = "\t")
+ expect_equal(tab1, tab2)
+ expect_equal(tbl, tab1)
+})
+
+test_that("read_delim_arrow parsing options: quote", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ df <- data.frame(a = c(1, 2), b = c("'abc'", "'def'"))
+ write.table(df, sep = ";", tf, row.names = FALSE, quote = FALSE)
+ tab1 <- read_delim_arrow(tf, delim = ";", quote = "'")
+
+ # Is this a problem?
+ # Component "a": target is integer64, current is numeric
+ tab1$a <- as.numeric(tab1$a)
+ expect_equal(
+ tab1,
+ tibble::tibble(a = c(1, 2), b = c("abc", "def"))
+ )
+})
+
+test_that("read_csv_arrow parsing options: col_names", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ # Writing the CSV without the header
+ write.table(tbl, tf, sep = ",", row.names = FALSE, col.names = FALSE)
+
+ # Reading with col_names = FALSE autogenerates names
+ no_names <- read_csv_arrow(tf, col_names = FALSE)
+ expect_equal(no_names$f0, tbl[[1]])
+
+ tab1 <- read_csv_arrow(tf, col_names = names(tbl))
+
+ expect_identical(names(tab1), names(tbl))
+ expect_equal(tbl, tab1)
+
+ # This errors (correctly) because I haven't given enough names
+ # but the error message is "Invalid: Empty CSV file", which is not accurate
+ expect_error(
+ read_csv_arrow(tf, col_names = names(tbl)[1])
+ )
+ # Same here
+ expect_error(
+ read_csv_arrow(tf, col_names = c(names(tbl), names(tbl)))
+ )
+})
+
+test_that("read_csv_arrow parsing options: skip", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ # Adding two garbage lines at the start of the CSV
+ cat("asdf\nqwer\n", file = tf)
+ suppressWarnings(write.table(tbl, tf, sep = ",", row.names = FALSE, append = TRUE))
+
+ tab1 <- read_csv_arrow(tf, skip = 2)
+
+ expect_identical(names(tab1), names(tbl))
+ expect_equal(tbl, tab1)
+})
+
+test_that("read_csv_arrow parsing options: skip_empty_rows", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write.csv(tbl, tf, row.names = FALSE)
+ cat("\n\n", file = tf, append = TRUE)
+
+ tab1 <- read_csv_arrow(tf, skip_empty_rows = FALSE)
+
+ expect_equal(nrow(tab1), nrow(tbl) + 2)
+ expect_true(is.na(tail(tab1, 1)[[1]]))
+})
+
+test_that("read_csv_arrow parsing options: na strings", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ df <- data.frame(
+ a = c(1.2, NA, NA, 3.4),
+ b = c(NA, "B", "C", NA),
+ stringsAsFactors = FALSE
+ )
+ write.csv(df, tf, row.names = FALSE)
+ expect_equal(grep("NA", readLines(tf)), 2:5)
+
+ tab1 <- read_csv_arrow(tf)
+ expect_equal(is.na(tab1$a), is.na(df$a))
+ expect_equal(is.na(tab1$b), is.na(df$b))
+
+ unlink(tf) # Delete and write to the same file name again
+
+ write.csv(df, tf, row.names = FALSE, na = "asdf")
+ expect_equal(grep("asdf", readLines(tf)), 2:5)
+
+ tab2 <- read_csv_arrow(tf, na = "asdf")
+ expect_equal(is.na(tab2$a), is.na(df$a))
+ expect_equal(is.na(tab2$b), is.na(df$b))
+})
+
+test_that("read_csv_arrow() respects col_select", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write.csv(tbl, tf, row.names = FALSE, quote = FALSE)
+
+ tab <- read_csv_arrow(tf, col_select = ends_with("l"), as_data_frame = FALSE)
+ expect_equal(tab, Table$create(example_data[, c("dbl", "lgl")]))
+
+ tib <- read_csv_arrow(tf, col_select = ends_with("l"), as_data_frame = TRUE)
+ expect_equal(tib, example_data[, c("dbl", "lgl")])
+})
+
+test_that("read_csv_arrow() can detect compression from file name", {
+ skip_if_not_available("gzip")
+ tf <- tempfile(fileext = ".csv.gz")
+ on.exit(unlink(tf))
+
+ write.csv(tbl, gzfile(tf), row.names = FALSE, quote = FALSE)
+ tab1 <- read_csv_arrow(tf)
+ expect_equal(tbl, tab1)
+})
+
+test_that("read_csv_arrow(schema=)", {
+ tbl <- example_data[, "int"]
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ write.csv(tbl, tf, row.names = FALSE)
+
+ df <- read_csv_arrow(tf, schema = schema(int = float64()), skip = 1)
+ expect_identical(df, tibble::tibble(int = as.numeric(tbl$int)))
+})
+
+test_that("read_csv_arrow(col_types = <Schema>)", {
+ tbl <- example_data[, "int"]
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ write.csv(tbl, tf, row.names = FALSE)
+
+ df <- read_csv_arrow(tf, col_types = schema(int = float64()))
+ expect_identical(df, tibble::tibble(int = as.numeric(tbl$int)))
+})
+
+test_that("read_csv_arrow(col_types=string, col_names)", {
+ tbl <- example_data[, "int"]
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ write.csv(tbl, tf, row.names = FALSE)
+
+ df <- read_csv_arrow(tf, col_names = "int", col_types = "d", skip = 1)
+ expect_identical(df, tibble::tibble(int = as.numeric(tbl$int)))
+
+ expect_error(read_csv_arrow(tf, col_types = c("i", "d")))
+ expect_error(read_csv_arrow(tf, col_types = "d"))
+ expect_error(read_csv_arrow(tf, col_types = "i", col_names = c("a", "b")))
+ expect_error(read_csv_arrow(tf, col_types = "y", col_names = "a"))
+})
+
+test_that("read_csv_arrow() can read timestamps", {
+ tbl <- tibble::tibble(time = as.POSIXct("2020-07-20 16:20", tz = "UTC"))
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ write.csv(tbl, tf, row.names = FALSE)
+
+ df <- read_csv_arrow(tf, col_types = schema(time = timestamp(timezone = "UTC")))
+ expect_equal(tbl, df)
+
+ # timestamps are read back time zone-naive, hence ignore_attr = "tzone"
+ df <- read_csv_arrow(tf, col_types = "T", col_names = "time", skip = 1)
+ expect_equal(tbl, df, ignore_attr = "tzone")
+})
+
+test_that("read_csv_arrow(timestamp_parsers=)", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ tbl <- tibble::tibble(time = "23/09/2020")
+ write.csv(tbl, tf, row.names = FALSE)
+
+ df <- read_csv_arrow(
+ tf,
+ col_types = schema(time = timestamp(timezone = "UTC")),
+ timestamp_parsers = "%d/%m/%Y"
+ )
+ expect_equal(df$time, as.POSIXct(tbl$time, format = "%d/%m/%Y", tz = "UTC"))
+})
+
+test_that("Skipping columns with null()", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ cols <- c("dbl", "lgl", "false", "chr")
+ tbl <- example_data[, cols]
+ write.csv(tbl, tf, row.names = FALSE)
+
+ df <- read_csv_arrow(tf, col_types = "d-_c", col_names = cols, skip = 1)
+ expect_identical(df, tbl[, c("dbl", "chr")])
+})
+
+test_that("Mix of guessing and declaring types", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ cols <- c("dbl", "lgl", "false", "chr")
+ tbl <- example_data[, cols]
+ write.csv(tbl, tf, row.names = FALSE)
+
+ tab <- read_csv_arrow(tf, col_types = schema(dbl = float32()), as_data_frame = FALSE)
+ expect_equal(tab$schema, schema(dbl = float32(), lgl = bool(), false = bool(), chr = utf8()))
+
+ df <- read_csv_arrow(tf, col_types = "d-?c", col_names = cols, skip = 1)
+ expect_identical(df, tbl[, c("dbl", "false", "chr")])
+})
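+
+# A note on the compact col_types codes exercised above (readr-style
+# shorthand): "d" = double, "c" = character, "i" = integer, "T" = timestamp,
+# "t" = time, "?" = guess the type, and "-" or "_" = skip the column.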
+
+
+test_that("Write a CSV file with header", {
+ tbl_out <- write_csv_arrow(tbl_no_dates, csv_file)
+ expect_true(file.exists(csv_file))
+ expect_identical(tbl_out, tbl_no_dates)
+
+ tbl_in <- read_csv_arrow(csv_file)
+ expect_identical(tbl_in, tbl_no_dates)
+
+ tbl_out <- write_csv_arrow(tbl, csv_file)
+ expect_true(file.exists(csv_file))
+ expect_identical(tbl_out, tbl)
+
+ tbl_in <- read_csv_arrow(csv_file)
+ expect_identical(tbl_in, tbl)
+})
+
+
+test_that("Write a CSV file with no header", {
+ tbl_out <- write_csv_arrow(tbl_no_dates, csv_file, include_header = FALSE)
+ expect_true(file.exists(csv_file))
+ expect_identical(tbl_out, tbl_no_dates)
+ tbl_in <- read_csv_arrow(csv_file, col_names = FALSE)
+
+ tbl_expected <- tbl_no_dates
+ names(tbl_expected) <- c("f0", "f1", "f2", "f3")
+
+ expect_identical(tbl_in, tbl_expected)
+})
+
+test_that("Write a CSV file with different batch sizes", {
+ tbl_out1 <- write_csv_arrow(tbl_no_dates, csv_file, batch_size = 1)
+ expect_true(file.exists(csv_file))
+ expect_identical(tbl_out1, tbl_no_dates)
+ tbl_in1 <- read_csv_arrow(csv_file)
+ expect_identical(tbl_in1, tbl_no_dates)
+
+ tbl_out2 <- write_csv_arrow(tbl_no_dates, csv_file, batch_size = 2)
+ expect_true(file.exists(csv_file))
+ expect_identical(tbl_out2, tbl_no_dates)
+ tbl_in2 <- read_csv_arrow(csv_file)
+ expect_identical(tbl_in2, tbl_no_dates)
+
+ tbl_out3 <- write_csv_arrow(tbl_no_dates, csv_file, batch_size = 12)
+ expect_true(file.exists(csv_file))
+ expect_identical(tbl_out3, tbl_no_dates)
+ tbl_in3 <- read_csv_arrow(csv_file)
+ expect_identical(tbl_in3, tbl_no_dates)
+})
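+
+# (batch_size only controls how many rows are written at a time; as the three
+# round trips above show, the data reads back identically regardless.)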
+
+test_that("Write a CSV file with invalid input type", {
+ bad_input <- Array$create(1:5)
+ expect_error(
+ write_csv_arrow(bad_input, csv_file),
+ regexp = "x must be an object of class 'data.frame', 'RecordBatch', or 'Table', not 'Array'."
+ )
+})
+
+test_that("Write a CSV file with invalid batch size", {
+ expect_error(
+ write_csv_arrow(tbl_no_dates, csv_file, batch_size = -1),
+ regexp = "batch_size not greater than 0"
+ )
+})
+
+test_that("time mapping work as expected (ARROW-13624)", {
+ tbl <- tibble::tibble(
+ dt = as.POSIXct(c("2020-07-20 16:20", NA), tz = "UTC"),
+ time = c(hms::as_hms("16:20:00"), NA)
+ )
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ write.csv(tbl, tf, row.names = FALSE)
+
+ df <- read_csv_arrow(tf,
+ col_names = c("dt", "time"),
+ col_types = "Tt",
+ skip = 1
+ )
+
+ expect_error(
+ read_csv_arrow(tf,
+ col_names = c("dt", "time"),
+ col_types = "tT", skip = 1
+ )
+ )
+
+ expect_equal(df, tbl, ignore_attr = "tzone")
+})
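+
+# (col_types is positional with respect to col_names: "Tt" maps dt to a
+# timestamp and time to a time type, so the reversed "tT" tries to parse the
+# timestamp column as a time and fails.)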
diff --git a/src/arrow/r/tests/testthat/test-data-type.R b/src/arrow/r/tests/testthat/test-data-type.R
new file mode 100644
index 000000000..a9d0879b8
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-data-type.R
@@ -0,0 +1,429 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("null type works as expected", {
+ x <- null()
+ expect_equal(x$id, 0L)
+ expect_equal(x$name, "null")
+ expect_equal(x$ToString(), "null")
+ expect_true(x == x)
+ expect_false(x == int8())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+})
+
+test_that("boolean type work as expected", {
+ x <- boolean()
+ expect_equal(x$id, Type$BOOL)
+ expect_equal(x$name, "bool")
+ expect_equal(x$ToString(), "bool")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 1L)
+})
+
+test_that("int types works as expected", {
+ x <- uint8()
+ expect_equal(x$id, Type$UINT8)
+ expect_equal(x$name, "uint8")
+ expect_equal(x$ToString(), "uint8")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 8L)
+
+ x <- int8()
+ expect_equal(x$id, Type$INT8)
+ expect_equal(x$name, "int8")
+ expect_equal(x$ToString(), "int8")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 8L)
+
+ x <- uint16()
+ expect_equal(x$id, Type$UINT16)
+ expect_equal(x$name, "uint16")
+ expect_equal(x$ToString(), "uint16")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 16L)
+
+ x <- int16()
+ expect_equal(x$id, Type$INT16)
+ expect_equal(x$name, "int16")
+ expect_equal(x$ToString(), "int16")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 16L)
+
+ x <- uint32()
+ expect_equal(x$id, Type$UINT32)
+ expect_equal(x$name, "uint32")
+ expect_equal(x$ToString(), "uint32")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 32L)
+
+ x <- int32()
+ expect_equal(x$id, Type$INT32)
+ expect_equal(x$name, "int32")
+ expect_equal(x$ToString(), "int32")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 32L)
+
+ x <- uint64()
+ expect_equal(x$id, Type$UINT64)
+ expect_equal(x$name, "uint64")
+ expect_equal(x$ToString(), "uint64")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 64L)
+
+ x <- int64()
+ expect_equal(x$id, Type$INT64)
+ expect_equal(x$name, "int64")
+ expect_equal(x$ToString(), "int64")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 64L)
+})
+
+test_that("float types work as expected", {
+ x <- float16()
+ expect_equal(x$id, Type$HALF_FLOAT)
+ expect_equal(x$name, "halffloat")
+ expect_equal(x$ToString(), "halffloat")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 16L)
+
+ x <- float32()
+ expect_equal(x$id, Type$FLOAT)
+ expect_equal(x$name, "float")
+ expect_equal(x$ToString(), "float")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 32L)
+
+ x <- float64()
+ expect_equal(x$id, Type$DOUBLE)
+ expect_equal(x$name, "double")
+ expect_equal(x$ToString(), "double")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 64L)
+})
+
+test_that("utf8 type works as expected", {
+ x <- utf8()
+ expect_equal(x$id, Type$STRING)
+ expect_equal(x$name, "utf8")
+ expect_equal(x$ToString(), "string")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+})
+
+test_that("date types work as expected", {
+ x <- date32()
+ expect_equal(x$id, Type$DATE32)
+ expect_equal(x$name, "date32")
+ expect_equal(x$ToString(), "date32[day]")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$unit(), unclass(DateUnit$DAY))
+
+ x <- date64()
+ expect_equal(x$id, Type$DATE64)
+ expect_equal(x$name, "date64")
+ expect_equal(x$ToString(), "date64[ms]")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$unit(), unclass(DateUnit$MILLI))
+})
+
+test_that("timestamp type works as expected", {
+ x <- timestamp(TimeUnit$SECOND)
+ expect_equal(x$id, Type$TIMESTAMP)
+ expect_equal(x$name, "timestamp")
+ expect_equal(x$ToString(), "timestamp[s]")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 64L)
+ expect_equal(x$timezone(), "")
+ expect_equal(x$unit(), unclass(TimeUnit$SECOND))
+
+ x <- timestamp(TimeUnit$MILLI)
+ expect_equal(x$id, Type$TIMESTAMP)
+ expect_equal(x$name, "timestamp")
+ expect_equal(x$ToString(), "timestamp[ms]")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 64L)
+ expect_equal(x$timezone(), "")
+ expect_equal(x$unit(), unclass(TimeUnit$MILLI))
+
+ x <- timestamp(TimeUnit$MICRO)
+ expect_equal(x$id, Type$TIMESTAMP)
+ expect_equal(x$name, "timestamp")
+ expect_equal(x$ToString(), "timestamp[us]")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 64L)
+ expect_equal(x$timezone(), "")
+ expect_equal(x$unit(), unclass(TimeUnit$MICRO))
+
+ x <- timestamp(TimeUnit$NANO)
+ expect_equal(x$id, Type$TIMESTAMP)
+ expect_equal(x$name, "timestamp")
+ expect_equal(x$ToString(), "timestamp[ns]")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 64L)
+ expect_equal(x$timezone(), "")
+ expect_equal(x$unit(), unclass(TimeUnit$NANO))
+})
+
+test_that("timestamp with timezone", {
+ expect_equal(timestamp(timezone = "EST")$ToString(), "timestamp[s, tz=EST]")
+})
+
+test_that("time32 types work as expected", {
+ x <- time32(TimeUnit$SECOND)
+ expect_equal(x$id, Type$TIME32)
+ expect_equal(x$name, "time32")
+ expect_equal(x$ToString(), "time32[s]")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 32L)
+ expect_equal(x$unit(), unclass(TimeUnit$SECOND))
+
+ x <- time32(TimeUnit$MILLI)
+ expect_equal(x$id, Type$TIME32)
+ expect_equal(x$name, "time32")
+ expect_equal(x$ToString(), "time32[ms]")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 32L)
+ expect_equal(x$unit(), unclass(TimeUnit$MILLI))
+})
+
+test_that("time64 types work as expected", {
+ x <- time64(TimeUnit$MICRO)
+ expect_equal(x$id, Type$TIME64)
+ expect_equal(x$name, "time64")
+ expect_equal(x$ToString(), "time64[us]")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 64L)
+ expect_equal(x$unit(), unclass(TimeUnit$MICRO))
+
+ x <- time64(TimeUnit$NANO)
+ expect_equal(x$id, Type$TIME64)
+ expect_equal(x$name, "time64")
+ expect_equal(x$ToString(), "time64[ns]")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 0L)
+ expect_equal(x$fields(), list())
+ expect_equal(x$bit_width, 64L)
+ expect_equal(x$unit(), unclass(TimeUnit$NANO))
+})
+
+test_that("time type unit validation", {
+ expect_equal(time32(TimeUnit$SECOND), time32("s"))
+ expect_equal(time32(TimeUnit$MILLI), time32("ms"))
+ expect_equal(time32(), time32(TimeUnit$MILLI))
+ expect_error(time32(4), '"unit" should be one of 1 or 0')
+ expect_error(time32(NULL), '"unit" should be one of "ms" or "s"')
+ expect_match_arg_error(time32("years"))
+
+ expect_equal(time64(TimeUnit$NANO), time64("n"))
+ expect_equal(time64(TimeUnit$MICRO), time64("us"))
+ expect_equal(time64(), time64(TimeUnit$NANO))
+ expect_error(time64(4), '"unit" should be one of 3 or 2')
+ expect_error(time64(NULL), '"unit" should be one of "ns" or "us"')
+ expect_match_arg_error(time64("years"))
+})
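+
+# The integer codes in the error messages above are the TimeUnit enum values
+# (assuming the usual Arrow mapping of SECOND = 0, MILLI = 1, MICRO = 2,
+# NANO = 3), so e.g. time32(0L) is time32("s") and time64(3L) is time64("ns").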
+
+test_that("timestamp type input validation", {
+ expect_equal(timestamp("ms"), timestamp(TimeUnit$MILLI))
+ expect_equal(timestamp(), timestamp(TimeUnit$SECOND))
+ expect_error(
+ timestamp(NULL),
+ '"unit" should be one of "ns", "us", "ms", or "s"'
+ )
+ expect_error(
+ timestamp(timezone = 1231231),
+ "timezone is not a string"
+ )
+ expect_error(
+ timestamp(timezone = c("not", "a", "timezone")),
+ "timezone is not a string"
+ )
+})
+
+test_that("list type works as expected", {
+ x <- list_of(int32())
+ expect_equal(x$id, Type$LIST)
+ expect_equal(x$name, "list")
+ expect_equal(x$ToString(), "list<item: int32>")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 1L)
+ expect_equal(
+ x$fields()[[1]],
+ field("item", int32())
+ )
+ expect_equal(x$value_type, int32())
+ expect_equal(x$value_field, field("item", int32()))
+})
+
+test_that("struct type works as expected", {
+ x <- struct(x = int32(), y = boolean())
+ expect_equal(x$id, Type$STRUCT)
+ expect_equal(x$name, "struct")
+ expect_equal(x$ToString(), "struct<x: int32, y: bool>")
+ expect_true(x == x)
+ expect_false(x == null())
+ expect_equal(x$num_fields, 2L)
+ expect_equal(
+ x$fields()[[1]],
+ field("x", int32())
+ )
+ expect_equal(
+ x$fields()[[2]],
+ field("y", boolean())
+ )
+ expect_equal(x$GetFieldIndex("x"), 0L)
+ expect_equal(x$GetFieldIndex("y"), 1L)
+ expect_equal(x$GetFieldIndex("z"), -1L)
+
+ expect_equal(x$GetFieldByName("x"), field("x", int32()))
+ expect_equal(x$GetFieldByName("y"), field("y", boolean()))
+ expect_null(x$GetFieldByName("z"))
+})
+
+test_that("DictionaryType works as expected (ARROW-3355)", {
+ d <- dictionary(int32(), utf8())
+ expect_equal(d, d)
+ expect_true(d == d)
+ expect_false(d == int32())
+ expect_equal(d$id, Type$DICTIONARY)
+ expect_equal(d$bit_width, 32L)
+ expect_equal(d$ToString(), "dictionary<values=string, indices=int32>")
+ expect_equal(d$index_type, int32())
+ expect_equal(d$value_type, utf8())
+ ord <- dictionary(ordered = TRUE)
+ expect_equal(ord$ToString(), "dictionary<values=string, indices=int32, ordered>")
+})
+
+test_that("DictionaryType validation", {
+ expect_error(
+ dictionary(utf8(), int32()),
+ "Dictionary index type should be .*integer, got string"
+ )
+ expect_error(dictionary(4, utf8()), 'index_type must be a "DataType"')
+ expect_error(dictionary(int8(), "strings"), 'value_type must be a "DataType"')
+})
+
+test_that("decimal type and validation", {
+ expect_error(decimal())
+ expect_error(decimal("four"), '"precision" must be an integer')
+ expect_error(decimal(4))
+ expect_error(decimal(4, "two"), '"scale" must be an integer')
+ expect_error(decimal(NA, 2), '"precision" must be an integer')
+ expect_error(decimal(0, 2), "Invalid: Decimal precision out of range: 0")
+ expect_error(decimal(100, 2), "Invalid: Decimal precision out of range: 100")
+ expect_error(decimal(4, NA), '"scale" must be an integer')
+
+ expect_r6_class(decimal(4, 2), "Decimal128Type")
+})
+
+test_that("Binary", {
+ expect_r6_class(binary(), "Binary")
+ expect_equal(binary()$ToString(), "binary")
+})
+
+test_that("FixedSizeBinary", {
+ expect_r6_class(fixed_size_binary(4), "FixedSizeBinary")
+ expect_equal(fixed_size_binary(4)$ToString(), "fixed_size_binary[4]")
+
+ # input validation
+ expect_error(fixed_size_binary(NA), "'byte_width' cannot be NA")
+ expect_error(fixed_size_binary(-1), "'byte_width' must be > 0")
+ expect_error(fixed_size_binary("four"))
+ expect_error(fixed_size_binary(c(2, 4)))
+})
+
+test_that("DataType to C-interface", {
+ datatype <- timestamp("ms", timezone = "Pacific/Marquesas")
+
+ # export the datatype via the C-interface
+ ptr <- allocate_arrow_schema()
+ datatype$export_to_c(ptr)
+
+ # then import it and check that the roundtripped value is the same
+ circle <- DataType$import_from_c(ptr)
+ expect_equal(circle, datatype)
+
+ # must clean up the pointer or we leak
+ delete_arrow_schema(ptr)
+})
diff --git a/src/arrow/r/tests/testthat/test-dataset-csv.R b/src/arrow/r/tests/testthat/test-dataset-csv.R
new file mode 100644
index 000000000..ab6693148
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dataset-csv.R
@@ -0,0 +1,290 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+
+csv_dir <- make_temp_dir()
+tsv_dir <- make_temp_dir()
+
+test_that("Setup (putting data in the dirs)", {
+ dir.create(file.path(csv_dir, 5))
+ dir.create(file.path(csv_dir, 6))
+ write.csv(df1, file.path(csv_dir, 5, "file1.csv"), row.names = FALSE)
+ write.csv(df2, file.path(csv_dir, 6, "file2.csv"), row.names = FALSE)
+ expect_length(dir(csv_dir, recursive = TRUE), 2)
+
+ # Now, tab-delimited
+ dir.create(file.path(tsv_dir, 5))
+ dir.create(file.path(tsv_dir, 6))
+ write.table(df1, file.path(tsv_dir, 5, "file1.tsv"), row.names = FALSE, sep = "\t")
+ write.table(df2, file.path(tsv_dir, 6, "file2.tsv"), row.names = FALSE, sep = "\t")
+ expect_length(dir(tsv_dir, recursive = TRUE), 2)
+})
+
+test_that("CSV dataset", {
+ ds <- open_dataset(csv_dir, partitioning = "part", format = "csv")
+ expect_r6_class(ds$format, "CsvFileFormat")
+ expect_r6_class(ds$filesystem, "LocalFileSystem")
+ expect_identical(names(ds), c(names(df1), "part"))
+ if (getRversion() >= "4.0.0") {
+ # CountRows segfaults on RTools35/R 3.6, so don't test it there
+ expect_identical(dim(ds), c(20L, 7L))
+ }
+ expect_equal(
+ ds %>%
+ select(string = chr, integer = int, part) %>%
+ filter(integer > 6 & part == 5) %>%
+ collect() %>%
+ summarize(mean = mean(as.numeric(integer))), # as.numeric because they're parsed as int64
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+ # Collecting virtual partition column works
+ expect_equal(
+ collect(ds) %>% arrange(part) %>% pull(part),
+ c(rep(5, 10), rep(6, 10))
+ )
+})
+
+test_that("CSV scan options", {
+ options <- FragmentScanOptions$create("text")
+ expect_equal(options$type, "csv")
+ options <- FragmentScanOptions$create("csv",
+ null_values = c("mynull"),
+ strings_can_be_null = TRUE
+ )
+ expect_equal(options$type, "csv")
+
+ dst_dir <- make_temp_dir()
+ dst_file <- file.path(dst_dir, "data.csv")
+ df <- tibble(chr = c("foo", "mynull"))
+ write.csv(df, dst_file, row.names = FALSE, quote = FALSE)
+
+ ds <- open_dataset(dst_dir, format = "csv")
+ expect_equal(ds %>% collect(), df)
+
+ sb <- ds$NewScan()
+ sb$FragmentScanOptions(options)
+
+ tab <- sb$Finish()$ToTable()
+ expect_equal(as.data.frame(tab), tibble(chr = c("foo", NA)))
+
+ # Set default convert options in CsvFileFormat
+ csv_format <- CsvFileFormat$create(
+ null_values = c("mynull"),
+ strings_can_be_null = TRUE
+ )
+ ds <- open_dataset(dst_dir, format = csv_format)
+ expect_equal(ds %>% collect(), tibble(chr = c("foo", NA)))
+
+ # Set both parse and convert options
+ df <- tibble(chr = c("foo", "mynull"), chr2 = c("bar", "baz"))
+ write.table(df, dst_file, row.names = FALSE, quote = FALSE, sep = "\t")
+ ds <- open_dataset(dst_dir,
+ format = "csv",
+ delimiter = "\t",
+ null_values = c("mynull"),
+ strings_can_be_null = TRUE
+ )
+ expect_equal(ds %>% collect(), tibble(
+ chr = c("foo", NA),
+ chr2 = c("bar", "baz")
+ ))
+ expect_equal(
+ ds %>%
+ group_by(chr2) %>%
+ summarize(na = all(is.na(chr))) %>%
+ arrange(chr2) %>%
+ collect(),
+ tibble(
+ chr2 = c("bar", "baz"),
+ na = c(FALSE, TRUE)
+ )
+ )
+})
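+
+# (The test above sets the same null handling three ways: per scan via
+# FragmentScanOptions, as format defaults via CsvFileFormat$create(), and
+# directly through open_dataset() arguments.)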
+
+test_that("compressed CSV dataset", {
+ skip_if_not_available("gzip")
+ dst_dir <- make_temp_dir()
+ dst_file <- file.path(dst_dir, "data.csv.gz")
+ write.csv(df1, gzfile(dst_file), row.names = FALSE, quote = FALSE)
+ format <- FileFormat$create("csv")
+ ds <- open_dataset(dst_dir, format = format)
+ expect_r6_class(ds$format, "CsvFileFormat")
+ expect_r6_class(ds$filesystem, "LocalFileSystem")
+
+ expect_equal(
+ ds %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6 & integer < 11) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+test_that("CSV dataset options", {
+ dst_dir <- make_temp_dir()
+ dst_file <- file.path(dst_dir, "data.csv")
+ df <- tibble(chr = letters[1:10])
+ write.csv(df, dst_file, row.names = FALSE, quote = FALSE)
+
+ format <- FileFormat$create("csv", skip_rows = 1)
+ ds <- open_dataset(dst_dir, format = format)
+
+ expect_equal(
+ ds %>%
+ select(string = a) %>%
+ collect(),
+ df1[-1, ] %>%
+ select(string = chr)
+ )
+
+ ds <- open_dataset(dst_dir, format = "csv", column_names = c("foo"))
+
+ expect_equal(
+ ds %>%
+ select(string = foo) %>%
+ collect(),
+ tibble(string = c(c("chr"), letters[1:10]))
+ )
+})
+
+test_that("Other text delimited dataset", {
+ ds1 <- open_dataset(tsv_dir, partitioning = "part", format = "tsv")
+ expect_equal(
+ ds1 %>%
+ select(string = chr, integer = int, part) %>%
+ filter(integer > 6 & part == 5) %>%
+ collect() %>%
+ summarize(mean = mean(as.numeric(integer))), # as.numeric because they're parsed as int64
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+
+ ds2 <- open_dataset(tsv_dir, partitioning = "part", format = "text", delimiter = "\t")
+ expect_equal(
+ ds2 %>%
+ select(string = chr, integer = int, part) %>%
+ filter(integer > 6 & part == 5) %>%
+ collect() %>%
+ summarize(mean = mean(as.numeric(integer))), # as.numeric because they're parsed as int64
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+test_that("readr parse options", {
+ arrow_opts <- names(formals(CsvParseOptions$create))
+ readr_opts <- names(formals(readr_to_csv_parse_options))
+
+ # Arrow and readr parse options must be mutually exclusive, or else the code
+ # in `csv_file_format_parse_options()` will error or behave incorrectly. A
+ # failure of this test indicates that these two sets of option names are not
+ # mutually exclusive.
+ expect_equal(
+ intersect(arrow_opts, readr_opts),
+ character(0)
+ )
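+
+ # (For example, and assuming the current signatures: CsvParseOptions$create()
+ # takes Arrow-style names such as "delimiter" and "quoting", while
+ # readr_to_csv_parse_options() takes readr-style names such as "delim" and
+ # "quote", which is why the intersection above is empty.)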
+
+ # With readr parse options that are not yet supported (ARROW-8631)
+ expect_error(
+ open_dataset(tsv_dir, partitioning = "part", delim = "\t", na = "\\N"),
+ "supported"
+ )
+
+ # With unrecognized (garbage) parse options
+ expect_error(
+ open_dataset(
+ tsv_dir,
+ partitioning = "part",
+ format = "text",
+ asdfg = "\\"
+ ),
+ "Unrecognized"
+ )
+
+ # With both Arrow and readr parse options (disallowed)
+ expect_error(
+ open_dataset(
+ tsv_dir,
+ partitioning = "part",
+ format = "text",
+ quote = "\"",
+ quoting = TRUE
+ ),
+ "either"
+ )
+
+ # With ambiguous partial option names (disallowed)
+ expect_error(
+ open_dataset(
+ tsv_dir,
+ partitioning = "part",
+ format = "text",
+ quo = "\"",
+ ),
+ "Ambiguous"
+ )
+
+ # With only readr parse options (and omitting format = "text")
+ ds1 <- open_dataset(tsv_dir, partitioning = "part", delim = "\t")
+ expect_equal(
+ ds1 %>%
+ select(string = chr, integer = int, part) %>%
+ filter(integer > 6 & part == 5) %>%
+ collect() %>%
+ summarize(mean = mean(as.numeric(integer))), # as.numeric because they're parsed as int64
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+# see https://issues.apache.org/jira/browse/ARROW-12791
+test_that("Error if no format specified and files are not parquet", {
+ expect_error(
+ open_dataset(csv_dir, partitioning = "part"),
+ "Did you mean to specify a 'format' other than the default (parquet)?",
+ fixed = TRUE
+ )
+ expect_error(
+ open_dataset(csv_dir, partitioning = "part", format = "parquet"),
+ "Parquet magic bytes not found"
+ )
+})
+
+test_that("Column names inferred from schema for headerless CSVs (ARROW-14063)", {
+ headerless_csv_dir <- make_temp_dir()
+ tbl <- df1[, c("int", "dbl")]
+ write.table(tbl, file.path(headerless_csv_dir, "file1.csv"), sep = ",", row.names = FALSE, col.names = FALSE)
+
+ ds <- open_dataset(headerless_csv_dir, format = "csv", schema = schema(int = int32(), dbl = float64()))
+ expect_equal(ds %>% collect(), tbl)
+})
diff --git a/src/arrow/r/tests/testthat/test-dataset-dplyr.R b/src/arrow/r/tests/testthat/test-dataset-dplyr.R
new file mode 100644
index 000000000..b4519377c
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dataset-dplyr.R
@@ -0,0 +1,340 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+skip_if_not_available("parquet")
+
+library(dplyr, warn.conflicts = FALSE)
+
+dataset_dir <- make_temp_dir()
+hive_dir <- make_temp_dir()
+
+test_that("Setup (putting data in the dir)", {
+ dir.create(file.path(dataset_dir, 1))
+ dir.create(file.path(dataset_dir, 2))
+ write_parquet(df1, file.path(dataset_dir, 1, "file1.parquet"))
+ write_parquet(df2, file.path(dataset_dir, 2, "file2.parquet"))
+ expect_length(dir(dataset_dir, recursive = TRUE), 2)
+
+ dir.create(file.path(hive_dir, "subdir", "group=1", "other=xxx"), recursive = TRUE)
+ dir.create(file.path(hive_dir, "subdir", "group=2", "other=yyy"), recursive = TRUE)
+ write_parquet(df1, file.path(hive_dir, "subdir", "group=1", "other=xxx", "file1.parquet"))
+ write_parquet(df2, file.path(hive_dir, "subdir", "group=2", "other=yyy", "file2.parquet"))
+ expect_length(dir(hive_dir, recursive = TRUE), 2)
+})
+
+test_that("filter() with is.nan()", {
+ ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
+ expect_equal(
+ ds %>%
+ select(part, dbl) %>%
+ filter(!is.nan(dbl), part == 2) %>%
+ collect(),
+ tibble(part = 2L, dbl = df2$dbl[!is.nan(df2$dbl)])
+ )
+})
+
+test_that("filter() with %in%", {
+ ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
+ expect_equal(
+ ds %>%
+ select(int, part) %>%
+ filter(int %in% c(6, 4, 3, 103, 107), part == 1) %>%
+ collect(),
+ tibble(int = df1$int[c(3, 4, 6)], part = 1)
+ )
+
+ # ARROW-9606: bug in %in% filter on partition column with >1 partition columns
+ ds <- open_dataset(hive_dir)
+ expect_equal(
+ ds %>%
+ filter(group %in% 2) %>%
+ select(names(df2)) %>%
+ collect(),
+ df2
+ )
+})
+
+test_that("filter() on timestamp columns", {
+ ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
+ expect_equal(
+ ds %>%
+ filter(ts >= lubridate::ymd_hms("2015-05-04 03:12:39")) %>%
+ filter(part == 1) %>%
+ select(ts) %>%
+ collect(),
+ df1[5:10, c("ts")],
+ )
+
+ # Now with Date
+ expect_equal(
+ ds %>%
+ filter(ts >= as.Date("2015-05-04")) %>%
+ filter(part == 1) %>%
+ select(ts) %>%
+ collect(),
+ df1[5:10, c("ts")],
+ )
+
+ # Now with bare string date
+ skip("Implement more aggressive implicit casting for scalars (ARROW-11402)")
+ expect_equal(
+ ds %>%
+ filter(ts >= "2015-05-04") %>%
+ filter(part == 1) %>%
+ select(ts) %>%
+ collect(),
+ df1[5:10, c("ts")],
+ )
+})
+
+test_that("filter() on date32 columns", {
+ tmp <- tempfile()
+ dir.create(tmp)
+ df <- data.frame(date = as.Date(c("2020-02-02", "2020-02-03")))
+ write_parquet(df, file.path(tmp, "file.parquet"))
+
+ expect_equal(
+ open_dataset(tmp) %>%
+ filter(date > as.Date("2020-02-02")) %>%
+ collect() %>%
+ nrow(),
+ 1L
+ )
+
+ # Also with timestamp scalar
+ expect_equal(
+ open_dataset(tmp) %>%
+ filter(date > lubridate::ymd_hms("2020-02-02 00:00:00")) %>%
+ collect() %>%
+ nrow(),
+ 1L
+ )
+})
+
+
+test_that("mutate()", {
+ ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
+ mutated <- ds %>%
+ select(chr, dbl, int) %>%
+ filter(dbl * 2 > 14 & dbl - 50 < 3L) %>%
+ mutate(twice = int * 2)
+ expect_output(
+ print(mutated),
+ "FileSystemDataset (query)
+chr: string
+dbl: double
+int: int32
+twice: double (multiply_checked(int, 2))
+
+* Filter: ((multiply_checked(dbl, 2) > 14) and (subtract_checked(dbl, 50) < 3))
+See $.data for the source Arrow object",
+ fixed = TRUE
+ )
+ expect_equal(
+ mutated %>%
+ collect() %>%
+ arrange(dbl),
+ rbind(
+ df1[8:10, c("chr", "dbl", "int")],
+ df2[1:2, c("chr", "dbl", "int")]
+ ) %>%
+ mutate(
+ twice = int * 2
+ )
+ )
+})
+
+test_that("mutate() features not yet implemented", {
+ expect_error(
+ ds %>%
+ group_by(int) %>%
+ mutate(avg = mean(int)),
+ "window functions not currently supported in Arrow\nCall collect() first to pull data into R.",
+ fixed = TRUE
+ )
+})
+
+test_that("filter scalar validation doesn't crash (ARROW-7772)", {
+ expect_error(
+ ds %>%
+ filter(int == "fff", part == 1) %>%
+ collect(),
+ "equal has no kernel matching input types .array.int32., scalar.string.."
+ )
+})
+
+test_that("collect() on Dataset works (if fits in memory)", {
+ expect_equal(
+ collect(open_dataset(dataset_dir)) %>% arrange(int),
+ rbind(df1, df2)
+ )
+})
+
+test_that("count()", {
+ ds <- open_dataset(dataset_dir)
+ df <- rbind(df1, df2)
+ expect_equal(
+ ds %>%
+ filter(int > 6, int < 108) %>%
+ count(chr) %>%
+ arrange(chr) %>%
+ collect(),
+ df %>%
+ filter(int > 6, int < 108) %>%
+ count(chr)
+ )
+})
+
+test_that("arrange()", {
+ ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
+ arranged <- ds %>%
+ select(chr, dbl, int) %>%
+ filter(dbl * 2 > 14 & dbl - 50 < 3L) %>%
+ mutate(twice = int * 2) %>%
+ arrange(chr, desc(twice), dbl + int)
+ expect_output(
+ print(arranged),
+ "FileSystemDataset (query)
+chr: string
+dbl: double
+int: int32
+twice: double (multiply_checked(int, 2))
+
+* Filter: ((multiply_checked(dbl, 2) > 14) and (subtract_checked(dbl, 50) < 3))
+* Sorted by chr [asc], multiply_checked(int, 2) [desc], add_checked(dbl, int) [asc]
+See $.data for the source Arrow object",
+ fixed = TRUE
+ )
+ expect_equal(
+ arranged %>%
+ collect(),
+ rbind(
+ df1[8, c("chr", "dbl", "int")],
+ df2[2, c("chr", "dbl", "int")],
+ df1[9, c("chr", "dbl", "int")],
+ df2[1, c("chr", "dbl", "int")],
+ df1[10, c("chr", "dbl", "int")]
+ ) %>%
+ mutate(
+ twice = int * 2
+ )
+ )
+})
+
+test_that("compute()/collect(as_data_frame=FALSE)", {
+ ds <- open_dataset(dataset_dir)
+
+ tab1 <- ds %>% compute()
+ expect_r6_class(tab1, "Table")
+
+ tab2 <- ds %>% collect(as_data_frame = FALSE)
+ expect_r6_class(tab2, "Table")
+
+ tab3 <- ds %>%
+ mutate(negint = -int) %>%
+ filter(negint > -100) %>%
+ arrange(chr) %>%
+ select(negint) %>%
+ compute()
+
+ expect_r6_class(tab3, "Table")
+
+ expect_equal(
+ tab3 %>% collect(),
+ tibble(negint = -1:-10)
+ )
+
+ tab4 <- ds %>%
+ mutate(negint = -int) %>%
+ filter(negint > -100) %>%
+ arrange(chr) %>%
+ select(negint) %>%
+ collect(as_data_frame = FALSE)
+
+ expect_r6_class(tab4, "Table")
+
+ expect_equal(
+ tab4 %>% collect(),
+ tibble(negint = -1:-10)
+ )
+
+ tab5 <- ds %>%
+ mutate(negint = -int) %>%
+ group_by(fct) %>%
+ compute()
+
+ # the group_by() prevents compute() from returning a Table...
+ expect_s3_class(tab5, "arrow_dplyr_query")
+
+ # ... but $.data is an InMemoryDataset (wrapping the computed Table)...
+ expect_r6_class(tab5$.data, "InMemoryDataset")
+ # ... and the mutate() was evaluated
+ expect_true("negint" %in% names(tab5$.data))
+})
+
+test_that("head/tail on query on dataset", {
+ # head/tail on arrow_dplyr_query does not have deterministic order,
+ # so without sorting we can only assert the correct number of rows
+ ds <- open_dataset(dataset_dir)
+
+ expect_identical(
+ ds %>%
+ filter(int > 6) %>%
+ head(5) %>%
+ compute() %>%
+ nrow(),
+ 5L
+ )
+
+ expect_equal(
+ ds %>%
+ filter(int > 6) %>%
+ arrange(int) %>%
+ head() %>%
+ collect(),
+ rbind(df1[7:10, ], df2[1:2, ])
+ )
+
+ expect_equal(
+ ds %>%
+ filter(int < 105) %>%
+ tail(4) %>%
+ compute() %>%
+ nrow(),
+ 4L
+ )
+
+ expect_equal(
+ ds %>%
+ filter(int < 105) %>%
+ arrange(int) %>%
+ tail() %>%
+ collect(),
+ rbind(df1[9:10, ], df2[1:4, ])
+ )
+})
+
+test_that("dplyr method not implemented messages", {
+ ds <- open_dataset(dataset_dir)
+ # This one is more nuanced: filters containing an aggregate such as max(dbl)
+ # are not supported for Arrow Datasets
+ expect_error(
+ ds %>% filter(int > 6, dbl > max(dbl)),
+ "Filter expression not supported for Arrow Datasets: dbl > max(dbl)\nCall collect() first to pull data into R.",
+ fixed = TRUE
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dataset-uri.R b/src/arrow/r/tests/testthat/test-dataset-uri.R
new file mode 100644
index 000000000..bdcccf282
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dataset-uri.R
@@ -0,0 +1,123 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_on_os("windows")
+skip_if_not_available("parquet")
+skip_if_not_available("dataset")
+
+
+library(dplyr, warn.conflicts = FALSE)
+
+dataset_dir <- make_temp_dir()
+
+test_that("Setup (putting data in the dir)", {
+ dir.create(file.path(dataset_dir, 1))
+ dir.create(file.path(dataset_dir, 2))
+ write_parquet(df1, file.path(dataset_dir, 1, "file1.parquet"))
+ write_parquet(df2, file.path(dataset_dir, 2, "file2.parquet"))
+ expect_length(dir(dataset_dir, recursive = TRUE), 2)
+})
+
+files <- c(
+ file.path(dataset_dir, 1, "file1.parquet", fsep = "/"),
+ file.path(dataset_dir, 2, "file2.parquet", fsep = "/")
+)
+
+
+test_that("dataset from single local file path", {
+ ds <- open_dataset(files[1])
+ expect_r6_class(ds, "Dataset")
+ expect_equal(
+ ds %>%
+ select(chr, dbl) %>%
+ filter(dbl > 7) %>%
+ collect() %>%
+ arrange(dbl),
+ df1[8:10, c("chr", "dbl")]
+ )
+})
+
+test_that("dataset from vector of file paths", {
+ ds <- open_dataset(files)
+ expect_r6_class(ds, "Dataset")
+ expect_equal(
+ ds %>%
+ select(chr, dbl) %>%
+ filter(dbl > 7 & dbl < 53L) %>%
+ collect() %>%
+ arrange(dbl),
+ rbind(
+ df1[8:10, c("chr", "dbl")],
+ df2[1:2, c("chr", "dbl")]
+ )
+ )
+})
+
+test_that("dataset from directory URI", {
+ uri <- paste0("file://", dataset_dir)
+ ds <- open_dataset(uri, partitioning = schema(part = uint8()))
+ expect_r6_class(ds, "Dataset")
+ expect_equal(
+ ds %>%
+ select(chr, dbl) %>%
+ filter(dbl > 7 & dbl < 53L) %>%
+ collect() %>%
+ arrange(dbl),
+ rbind(
+ df1[8:10, c("chr", "dbl")],
+ df2[1:2, c("chr", "dbl")]
+ )
+ )
+})
+
+test_that("dataset from single file URI", {
+ uri <- paste0("file://", files[1])
+ ds <- open_dataset(uri)
+ expect_r6_class(ds, "Dataset")
+ expect_equal(
+ ds %>%
+ select(chr, dbl) %>%
+ filter(dbl > 7) %>%
+ collect() %>%
+ arrange(dbl),
+ df1[8:10, c("chr", "dbl")]
+ )
+})
+
+test_that("dataset from vector of file URIs", {
+ uris <- paste0("file://", files)
+ ds <- open_dataset(uris)
+ expect_r6_class(ds, "Dataset")
+ expect_equal(
+ ds %>%
+ select(chr, dbl) %>%
+ filter(dbl > 7 & dbl < 53L) %>%
+ collect() %>%
+ arrange(dbl),
+ rbind(
+ df1[8:10, c("chr", "dbl")],
+ df2[1:2, c("chr", "dbl")]
+ )
+ )
+})
+
+test_that("open_dataset errors on mixed paths and URIs", {
+ expect_error(
+ open_dataset(c(files[1], paste0("file://", files[2]))),
+ "Vectors of mixed paths and URIs are not supported"
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dataset-write.R b/src/arrow/r/tests/testthat/test-dataset-write.R
new file mode 100644
index 000000000..8e7c077e6
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dataset-write.R
@@ -0,0 +1,454 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+
+
+hive_dir <- make_temp_dir()
+csv_dir <- make_temp_dir()
+
+test_that("Setup (putting data in the dirs)", {
+ if (arrow_with_parquet()) {
+ dir.create(file.path(hive_dir, "subdir", "group=1", "other=xxx"), recursive = TRUE)
+ dir.create(file.path(hive_dir, "subdir", "group=2", "other=yyy"), recursive = TRUE)
+ write_parquet(df1, file.path(hive_dir, "subdir", "group=1", "other=xxx", "file1.parquet"))
+ write_parquet(df2, file.path(hive_dir, "subdir", "group=2", "other=yyy", "file2.parquet"))
+ expect_length(dir(hive_dir, recursive = TRUE), 2)
+ }
+
+ # Now, CSV
+ dir.create(file.path(csv_dir, 5))
+ dir.create(file.path(csv_dir, 6))
+ write.csv(df1, file.path(csv_dir, 5, "file1.csv"), row.names = FALSE)
+ write.csv(df2, file.path(csv_dir, 6, "file2.csv"), row.names = FALSE)
+ expect_length(dir(csv_dir, recursive = TRUE), 2)
+})
+
+test_that("Writing a dataset: CSV->IPC", {
+ ds <- open_dataset(csv_dir, partitioning = "part", format = "csv")
+ dst_dir <- make_temp_dir()
+ write_dataset(ds, dst_dir, format = "feather", partitioning = "int")
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "=")))
+
+ new_ds <- open_dataset(dst_dir, format = "feather")
+
+ expect_equal(
+ new_ds %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6 & integer < 11) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+
+ # Check whether "int" is present in the files or just in the dirs
+ first <- read_feather(
+ dir(dst_dir, pattern = ".feather$", recursive = TRUE, full.names = TRUE)[1],
+ as_data_frame = FALSE
+ )
+ # It shouldn't be there
+ expect_false("int" %in% names(first))
+})
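+
+# (The partitioning column is encoded in the directory names, e.g. "int=1",
+# rather than in the Feather files themselves, which is what the check above
+# asserts.)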
+
+test_that("Writing a dataset: Parquet->IPC", {
+ skip_if_not_available("parquet")
+ ds <- open_dataset(hive_dir)
+ dst_dir <- make_temp_dir()
+ write_dataset(ds, dst_dir, format = "feather", partitioning = "int")
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "=")))
+
+ new_ds <- open_dataset(dst_dir, format = "feather")
+
+ expect_equal(
+ new_ds %>%
+ select(string = chr, integer = int, group) %>%
+ filter(integer > 6 & group == 1) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+test_that("Writing a dataset: CSV->Parquet", {
+ skip_if_not_available("parquet")
+ ds <- open_dataset(csv_dir, partitioning = "part", format = "csv")
+ dst_dir <- make_temp_dir()
+ write_dataset(ds, dst_dir, format = "parquet", partitioning = "int")
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "=")))
+
+ new_ds <- open_dataset(dst_dir)
+
+ expect_equal(
+ new_ds %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6 & integer < 11) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+test_that("Writing a dataset: Parquet->Parquet (default)", {
+ skip_if_not_available("parquet")
+ ds <- open_dataset(hive_dir)
+ dst_dir <- make_temp_dir()
+ write_dataset(ds, dst_dir, partitioning = "int")
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "=")))
+
+ new_ds <- open_dataset(dst_dir)
+
+ expect_equal(
+ new_ds %>%
+ select(string = chr, integer = int, group) %>%
+ filter(integer > 6 & group == 1) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+test_that("Writing a dataset: existing data behavior", {
+ # This test does not work on Windows because unlink does not immediately
+ # delete the data.
+ skip_on_os("windows")
+ ds <- open_dataset(csv_dir, partitioning = "part", format = "csv")
+ dst_dir <- make_temp_dir()
+ write_dataset(ds, dst_dir, format = "feather", partitioning = "int")
+ expect_true(dir.exists(dst_dir))
+
+ check_dataset <- function() {
+ new_ds <- open_dataset(dst_dir, format = "feather")
+
+ expect_equal(
+ new_ds %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6 & integer < 11) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+ }
+
+ check_dataset()
+ # By default we should overwrite
+ write_dataset(ds, dst_dir, format = "feather", partitioning = "int")
+ check_dataset()
+ write_dataset(ds, dst_dir, format = "feather", partitioning = "int", existing_data_behavior = "overwrite")
+ check_dataset()
+ expect_error(
+ write_dataset(ds, dst_dir, format = "feather", partitioning = "int", existing_data_behavior = "error"),
+ "directory is not empty"
+ )
+ unlink(dst_dir, recursive = TRUE)
+ write_dataset(ds, dst_dir, format = "feather", partitioning = "int", existing_data_behavior = "error")
+ check_dataset()
+})
+
+test_that("Writing a dataset: no format specified", {
+ dst_dir <- make_temp_dir()
+ write_dataset(example_data, dst_dir)
+ new_ds <- open_dataset(dst_dir)
+ expect_equal(
+ list.files(dst_dir, pattern = "parquet"),
+ "part-0.parquet"
+ )
+ expect_true(
+ inherits(new_ds$format, "ParquetFileFormat")
+ )
+ expect_equal(
+ new_ds %>% collect(),
+ example_data
+ )
+})
+
+test_that("Dataset writing: dplyr methods", {
+ skip_if_not_available("parquet")
+ ds <- open_dataset(hive_dir)
+ dst_dir <- tempfile()
+ # Specify partition vars by group_by
+ ds %>%
+ group_by(int) %>%
+ write_dataset(dst_dir, format = "feather")
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "=")))
+
+ # select to specify schema (and rename)
+ dst_dir2 <- tempfile()
+ ds %>%
+ group_by(int) %>%
+ select(chr, dubs = dbl) %>%
+ write_dataset(dst_dir2, format = "feather")
+ new_ds <- open_dataset(dst_dir2, format = "feather")
+
+ expect_equal(
+ collect(new_ds) %>% arrange(int),
+ rbind(df1[c("chr", "dbl", "int")], df2[c("chr", "dbl", "int")]) %>% rename(dubs = dbl)
+ )
+
+ # filter to restrict written rows
+ dst_dir3 <- tempfile()
+ ds %>%
+ filter(int == 4) %>%
+ write_dataset(dst_dir3, format = "feather")
+ new_ds <- open_dataset(dst_dir3, format = "feather")
+
+ expect_equal(
+ new_ds %>% select(names(df1)) %>% collect(),
+ df1 %>% filter(int == 4)
+ )
+
+ # mutate
+ dst_dir3 <- tempfile()
+ ds %>%
+ filter(int == 4) %>%
+ mutate(twice = int * 2) %>%
+ write_dataset(dst_dir3, format = "feather")
+ new_ds <- open_dataset(dst_dir3, format = "feather")
+
+ expect_equal(
+ new_ds %>% select(c(names(df1), "twice")) %>% collect(),
+ df1 %>% filter(int == 4) %>% mutate(twice = int * 2)
+ )
+})
+
+test_that("Dataset writing: non-hive", {
+ skip_if_not_available("parquet")
+ ds <- open_dataset(hive_dir)
+ dst_dir <- tempfile()
+ write_dataset(ds, dst_dir, format = "feather", partitioning = "int", hive_style = FALSE)
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), sort(as.character(c(1:10, 101:110))))
+})
+
+test_that("Dataset writing: no partitioning", {
+ skip_if_not_available("parquet")
+ ds <- open_dataset(hive_dir)
+ dst_dir <- tempfile()
+ write_dataset(ds, dst_dir, format = "feather", partitioning = NULL)
+ expect_true(dir.exists(dst_dir))
+ expect_true(length(dir(dst_dir)) > 0)
+})
+
+test_that("Dataset writing: partition on null", {
+ ds <- open_dataset(hive_dir)
+
+ dst_dir <- tempfile()
+ partitioning <- hive_partition(lgl = boolean())
+ write_dataset(ds, dst_dir, partitioning = partitioning)
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), c("lgl=__HIVE_DEFAULT_PARTITION__", "lgl=false", "lgl=true"))
+
+ dst_dir <- tempfile()
+ partitioning <- hive_partition(lgl = boolean(), null_fallback = "xyz")
+ write_dataset(ds, dst_dir, partitioning = partitioning)
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), c("lgl=false", "lgl=true", "lgl=xyz"))
+
+ ds_readback <- open_dataset(dst_dir, partitioning = hive_partition(lgl = boolean(), null_fallback = "xyz"))
+
+ expect_identical(
+ ds %>%
+ select(int, lgl) %>%
+ collect() %>%
+ arrange(lgl, int),
+ ds_readback %>%
+ select(int, lgl) %>%
+ collect() %>%
+ arrange(lgl, int)
+ )
+})
+
+test_that("Dataset writing: from data.frame", {
+ dst_dir <- tempfile()
+ stacked <- rbind(df1, df2)
+ stacked %>%
+ group_by(int) %>%
+ write_dataset(dst_dir, format = "feather")
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "=")))
+
+ new_ds <- open_dataset(dst_dir, format = "feather")
+
+ expect_equal(
+ new_ds %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6 & integer < 11) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+test_that("Dataset writing: from RecordBatch", {
+ dst_dir <- tempfile()
+ stacked <- record_batch(rbind(df1, df2))
+ stacked %>%
+ group_by(int) %>%
+ write_dataset(dst_dir, format = "feather")
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "=")))
+
+ new_ds <- open_dataset(dst_dir, format = "feather")
+
+ expect_equal(
+ new_ds %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6 & integer < 11) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+test_that("Writing a dataset: Ipc format options & compression", {
+ ds <- open_dataset(csv_dir, partitioning = "part", format = "csv")
+ dst_dir <- make_temp_dir()
+
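+  # Use zstd compression only when this build of Arrow supports it;
+  # otherwise write uncompressed files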
+ codec <- NULL
+ if (codec_is_available("zstd")) {
+ codec <- Codec$create("zstd")
+ }
+
+ write_dataset(ds, dst_dir, format = "feather", codec = codec)
+ expect_true(dir.exists(dst_dir))
+
+ new_ds <- open_dataset(dst_dir, format = "feather")
+ expect_equal(
+ new_ds %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6 & integer < 11) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+test_that("Writing a dataset: Parquet format options", {
+ skip_if_not_available("parquet")
+ ds <- open_dataset(csv_dir, partitioning = "part", format = "csv")
+ dst_dir <- make_temp_dir()
+ dst_dir_no_truncated_timestamps <- make_temp_dir()
+
+ # Use trace() to confirm that options are passed in
+ suppressMessages(trace(
+ "parquet___ArrowWriterProperties___create",
+ tracer = quote(warning("allow_truncated_timestamps == ", allow_truncated_timestamps)),
+ print = FALSE,
+ where = write_dataset
+ ))
+ expect_warning(
+ write_dataset(ds, dst_dir_no_truncated_timestamps, format = "parquet", partitioning = "int"),
+ "allow_truncated_timestamps == FALSE"
+ )
+ expect_warning(
+ write_dataset(ds, dst_dir, format = "parquet", partitioning = "int", allow_truncated_timestamps = TRUE),
+ "allow_truncated_timestamps == TRUE"
+ )
+ suppressMessages(untrace(
+ "parquet___ArrowWriterProperties___create",
+ where = write_dataset
+ ))
+
+ # Now confirm we can read back what we sent
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "=")))
+
+ new_ds <- open_dataset(dst_dir)
+
+ expect_equal(
+ new_ds %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6 & integer < 11) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+test_that("Writing a dataset: CSV format options", {
+ df <- tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+ chr = letters[1:10],
+ )
+
+ dst_dir <- make_temp_dir()
+ write_dataset(df, dst_dir, format = "csv")
+ expect_true(dir.exists(dst_dir))
+ new_ds <- open_dataset(dst_dir, format = "csv")
+ expect_equal(new_ds %>% collect(), df)
+
+ dst_dir <- make_temp_dir()
+ write_dataset(df, dst_dir, format = "csv", include_header = FALSE)
+ expect_true(dir.exists(dst_dir))
+ new_ds <- open_dataset(dst_dir,
+ format = "csv",
+ column_names = c("int", "dbl", "lgl", "chr")
+ )
+ expect_equal(new_ds %>% collect(), df)
+})
+
+test_that("Dataset writing: unsupported features/input validation", {
+ skip_if_not_available("parquet")
+ expect_error(write_dataset(4), 'dataset must be a "Dataset"')
+
+ ds <- open_dataset(hive_dir)
+ expect_error(
+ write_dataset(ds, partitioning = c("int", "NOTACOLUMN"), format = "ipc"),
+ 'Invalid field name: "NOTACOLUMN"'
+ )
+ expect_error(
+ write_dataset(ds, tempfile(), basename_template = "something_without_i")
+ )
+ expect_error(
+ write_dataset(ds, tempfile(), basename_template = NULL)
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dataset.R b/src/arrow/r/tests/testthat/test-dataset.R
new file mode 100644
index 000000000..4403b479a
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dataset.R
@@ -0,0 +1,696 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+
+dataset_dir <- make_temp_dir()
+hive_dir <- make_temp_dir()
+ipc_dir <- make_temp_dir()
+
+test_that("Setup (putting data in the dir)", {
+ if (arrow_with_parquet()) {
+ dir.create(file.path(dataset_dir, 1))
+ dir.create(file.path(dataset_dir, 2))
+ write_parquet(df1, file.path(dataset_dir, 1, "file1.parquet"))
+ write_parquet(df2, file.path(dataset_dir, 2, "file2.parquet"))
+ expect_length(dir(dataset_dir, recursive = TRUE), 2)
+
+ dir.create(file.path(hive_dir, "subdir", "group=1", "other=xxx"), recursive = TRUE)
+ dir.create(file.path(hive_dir, "subdir", "group=2", "other=yyy"), recursive = TRUE)
+ write_parquet(df1, file.path(hive_dir, "subdir", "group=1", "other=xxx", "file1.parquet"))
+ write_parquet(df2, file.path(hive_dir, "subdir", "group=2", "other=yyy", "file2.parquet"))
+ expect_length(dir(hive_dir, recursive = TRUE), 2)
+ }
+
+ # Now, an IPC format dataset
+ dir.create(file.path(ipc_dir, 3))
+ dir.create(file.path(ipc_dir, 4))
+ write_feather(df1, file.path(ipc_dir, 3, "file1.arrow"))
+ write_feather(df2, file.path(ipc_dir, 4, "file2.arrow"))
+ expect_length(dir(ipc_dir, recursive = TRUE), 2)
+})
+
+test_that("IPC/Feather format data", {
+ ds <- open_dataset(ipc_dir, partitioning = "part", format = "feather")
+ expect_r6_class(ds$format, "IpcFileFormat")
+ expect_r6_class(ds$filesystem, "LocalFileSystem")
+ expect_identical(names(ds), c(names(df1), "part"))
+ expect_identical(dim(ds), c(20L, 7L))
+
+ expect_equal(
+ ds %>%
+ select(string = chr, integer = int, part) %>%
+ filter(integer > 6 & part == 3) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+
+ # Collecting virtual partition column works
+ expect_equal(
+ ds %>% arrange(part) %>% pull(part),
+ c(rep(3, 10), rep(4, 10))
+ )
+})
+
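+# Helper for the manual-assembly tests below: build a scanner that projects
+# two columns and filters on dbl == 8, then check the scanned Table against
+# the matching row of df1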
+expect_scan_result <- function(ds, schm) {
+ sb <- ds$NewScan()
+ expect_r6_class(sb, "ScannerBuilder")
+ expect_equal(sb$schema, schm)
+
+ sb$Project(c("chr", "lgl"))
+ sb$Filter(Expression$field_ref("dbl") == 8)
+ scn <- sb$Finish()
+ expect_r6_class(scn, "Scanner")
+
+ tab <- scn$ToTable()
+ expect_r6_class(tab, "Table")
+
+ expect_equal(
+ as.data.frame(tab),
+ df1[8, c("chr", "lgl")]
+ )
+}
+
+test_that("URI-decoding with directory partitioning", {
+ root <- make_temp_dir()
+ fmt <- FileFormat$create("feather")
+ fs <- LocalFileSystem$create()
+ selector <- FileSelector$create(root, recursive = TRUE)
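+  # "%3A" and "%24" are the percent-encodings of ":" and "$"; with the default
+  # segment_encoding = "uri" they should be decoded when partitions are read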
+ dir1 <- file.path(root, "2021-05-04 00%3A00%3A00", "%24")
+ dir.create(dir1, recursive = TRUE)
+ write_feather(df1, file.path(dir1, "data.feather"))
+
+ partitioning <- DirectoryPartitioning$create(
+ schema(date = timestamp(unit = "s"), string = utf8())
+ )
+ factory <- FileSystemDatasetFactory$create(
+ fs, selector, NULL, fmt,
+ partitioning = partitioning
+ )
+ schm <- factory$Inspect()
+ ds <- factory$Finish(schm)
+ expect_scan_result(ds, schm)
+
+ partitioning <- DirectoryPartitioning$create(
+ schema(date = timestamp(unit = "s"), string = utf8()),
+ segment_encoding = "none"
+ )
+ factory <- FileSystemDatasetFactory$create(
+ fs, selector, NULL, fmt,
+ partitioning = partitioning
+ )
+ schm <- factory$Inspect()
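+  # With segment_encoding = "none", the raw segment "2021-05-04 00%3A00%3A00"
+  # cannot be parsed as a timestamp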
+ expect_error(factory$Finish(schm), "Invalid: error parsing")
+
+ partitioning_factory <- DirectoryPartitioningFactory$create(
+ c("date", "string")
+ )
+ factory <- FileSystemDatasetFactory$create(
+ fs, selector, NULL, fmt, partitioning_factory
+ )
+ schm <- factory$Inspect()
+ ds <- factory$Finish(schm)
+ # Can't directly inspect partition expressions, so do it implicitly via scan
+ expect_equal(
+ ds %>%
+ filter(date == "2021-05-04 00:00:00", string == "$") %>%
+ select(int) %>%
+ collect(),
+ df1 %>% select(int) %>% collect()
+ )
+
+ partitioning_factory <- DirectoryPartitioningFactory$create(
+ c("date", "string"),
+ segment_encoding = "none"
+ )
+ factory <- FileSystemDatasetFactory$create(
+ fs, selector, NULL, fmt, partitioning_factory
+ )
+ schm <- factory$Inspect()
+ ds <- factory$Finish(schm)
+ expect_equal(
+ ds %>%
+ filter(date == "2021-05-04 00%3A00%3A00", string == "%24") %>%
+ select(int) %>%
+ collect(),
+ df1 %>% select(int) %>% collect()
+ )
+})
+
+test_that("URI-decoding with hive partitioning", {
+ root <- make_temp_dir()
+ fmt <- FileFormat$create("feather")
+ fs <- LocalFileSystem$create()
+ selector <- FileSelector$create(root, recursive = TRUE)
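+  # The partition segments again contain percent-encoded ":" (%3A) and "$" (%24)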
+ dir1 <- file.path(root, "date=2021-05-04 00%3A00%3A00", "string=%24")
+ dir.create(dir1, recursive = TRUE)
+ write_feather(df1, file.path(dir1, "data.feather"))
+
+ partitioning <- hive_partition(
+ date = timestamp(unit = "s"), string = utf8()
+ )
+ factory <- FileSystemDatasetFactory$create(
+ fs, selector, NULL, fmt,
+ partitioning = partitioning
+ )
+  schm <- factory$Inspect()
+  ds <- factory$Finish(schm)
+ expect_scan_result(ds, schm)
+
+ partitioning <- hive_partition(
+ date = timestamp(unit = "s"), string = utf8(), segment_encoding = "none"
+ )
+ factory <- FileSystemDatasetFactory$create(
+ fs, selector, NULL, fmt,
+ partitioning = partitioning
+ )
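+  # As above, disabling URI decoding leaves the timestamp segment unparseable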
+ expect_error(factory$Finish(schm), "Invalid: error parsing")
+
+ partitioning_factory <- hive_partition()
+ factory <- FileSystemDatasetFactory$create(
+ fs, selector, NULL, fmt, partitioning_factory
+ )
+ schm <- factory$Inspect()
+ ds <- factory$Finish(schm)
+ # Can't directly inspect partition expressions, so do it implicitly via scan
+ expect_equal(
+ ds %>%
+ filter(date == "2021-05-04 00:00:00", string == "$") %>%
+ select(int) %>%
+ collect(),
+ df1 %>% select(int) %>% collect()
+ )
+
+ partitioning_factory <- hive_partition(segment_encoding = "none")
+ factory <- FileSystemDatasetFactory$create(
+ fs, selector, NULL, fmt, partitioning_factory
+ )
+ schm <- factory$Inspect()
+ ds <- factory$Finish(schm)
+ expect_equal(
+ ds %>%
+ filter(date == "2021-05-04 00%3A00%3A00", string == "%24") %>%
+ select(int) %>%
+ collect(),
+ df1 %>% select(int) %>% collect()
+ )
+})
+
+# Everything else below here is using parquet files
+skip_if_not_available("parquet")
+
+files <- c(
+ file.path(dataset_dir, 1, "file1.parquet", fsep = "/"),
+ file.path(dataset_dir, 2, "file2.parquet", fsep = "/")
+)
+
+test_that("Simple interface for datasets", {
+ ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
+ expect_r6_class(ds$format, "ParquetFileFormat")
+ expect_r6_class(ds$filesystem, "LocalFileSystem")
+ expect_r6_class(ds, "Dataset")
+ expect_equal(
+ ds %>%
+ select(chr, dbl) %>%
+ filter(dbl > 7 & dbl < 53L) %>% # Testing the auto-casting of scalars
+ collect() %>%
+ arrange(dbl),
+ rbind(
+ df1[8:10, c("chr", "dbl")],
+ df2[1:2, c("chr", "dbl")]
+ )
+ )
+
+ expect_equal(
+ ds %>%
+ select(string = chr, integer = int, part) %>%
+ filter(integer > 6 & part == 1) %>% # 6 not 6L to test autocasting
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+
+ # Collecting virtual partition column works
+ expect_equal(
+ ds %>% arrange(part) %>% pull(part),
+ c(rep(1, 10), rep(2, 10))
+ )
+})
+
+test_that("dim method returns the correct number of rows and columns", {
+ ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
+ expect_identical(dim(ds), c(20L, 7L))
+})
+
+
+test_that("dim() correctly determine numbers of rows and columns on arrow_dplyr_query object", {
+ ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
+
+ expect_identical(
+ ds %>%
+ filter(chr == "a") %>%
+ dim(),
+ c(2L, 7L)
+ )
+  expect_identical(
+ ds %>%
+ select(chr, fct, int) %>%
+ dim(),
+ c(20L, 3L)
+ )
+ expect_identical(
+ ds %>%
+ select(chr, fct, int) %>%
+ filter(chr == "a") %>%
+ dim(),
+ c(2L, 3L)
+ )
+})
+
+test_that("Simple interface for datasets (custom ParquetFileFormat)", {
+ ds <- open_dataset(dataset_dir,
+ partitioning = schema(part = uint8()),
+ format = FileFormat$create("parquet", dict_columns = c("chr"))
+ )
+ expect_type_equal(ds$schema$GetFieldByName("chr")$type, dictionary())
+})
+
+test_that("Hive partitioning", {
+ ds <- open_dataset(hive_dir, partitioning = hive_partition(other = utf8(), group = uint8()))
+ expect_r6_class(ds, "Dataset")
+ expect_equal(
+ ds %>%
+ filter(group == 2) %>%
+ select(chr, dbl) %>%
+ filter(dbl > 7 & dbl < 53) %>%
+ collect() %>%
+ arrange(dbl),
+ df2[1:2, c("chr", "dbl")]
+ )
+})
+
+test_that("input validation", {
+ expect_error(
+ open_dataset(hive_dir, hive_partition(other = utf8(), group = uint8()))
+ )
+})
+
+test_that("Partitioning inference", {
+ # These are the same tests as above, just using the *PartitioningFactory
+ ds1 <- open_dataset(dataset_dir, partitioning = "part")
+ expect_identical(names(ds1), c(names(df1), "part"))
+ expect_equal(
+ ds1 %>%
+ select(string = chr, integer = int, part) %>%
+ filter(integer > 6 & part == 1) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+
+ ds2 <- open_dataset(hive_dir)
+ expect_identical(names(ds2), c(names(df1), "group", "other"))
+ expect_equal(
+ ds2 %>%
+ filter(group == 2) %>%
+ select(chr, dbl) %>%
+ filter(dbl > 7 & dbl < 53) %>%
+ collect() %>%
+ arrange(dbl),
+ df2[1:2, c("chr", "dbl")]
+ )
+})
+
+test_that("Dataset with multiple file formats", {
+ skip("https://issues.apache.org/jira/browse/ARROW-7653")
+ ds <- open_dataset(list(
+ open_dataset(dataset_dir, format = "parquet", partitioning = "part"),
+ open_dataset(ipc_dir, format = "arrow", partitioning = "part")
+ ))
+ expect_identical(names(ds), c(names(df1), "part"))
+ expect_equal(
+ ds %>%
+ filter(int > 6 & part %in% c(1, 3)) %>%
+ select(string = chr, integer = int) %>%
+ collect(),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ filter(integer > 6) %>%
+ rbind(., .) # Stack it twice
+ )
+})
+
+test_that("Creating UnionDataset", {
+ ds1 <- open_dataset(file.path(dataset_dir, 1))
+ ds2 <- open_dataset(file.path(dataset_dir, 2))
+ union1 <- open_dataset(list(ds1, ds2))
+ expect_r6_class(union1, "UnionDataset")
+ expect_equal(
+ union1 %>%
+ select(chr, dbl) %>%
+ filter(dbl > 7 & dbl < 53L) %>% # Testing the auto-casting of scalars
+ collect() %>%
+ arrange(dbl),
+ rbind(
+ df1[8:10, c("chr", "dbl")],
+ df2[1:2, c("chr", "dbl")]
+ )
+ )
+
+ # Now with the c() method
+ union2 <- c(ds1, ds2)
+ expect_r6_class(union2, "UnionDataset")
+ expect_equal(
+ union2 %>%
+ select(chr, dbl) %>%
+ filter(dbl > 7 & dbl < 53L) %>% # Testing the auto-casting of scalars
+ collect() %>%
+ arrange(dbl),
+ rbind(
+ df1[8:10, c("chr", "dbl")],
+ df2[1:2, c("chr", "dbl")]
+ )
+ )
+
+ # Confirm c() method error handling
+ expect_error(c(ds1, 42), "character")
+})
+
+test_that("map_batches", {
+ skip("map_batches() is broken (ARROW-14029)")
+ ds <- open_dataset(dataset_dir, partitioning = "part")
+ expect_equal(
+ ds %>%
+ filter(int > 5) %>%
+ select(int, lgl) %>%
+ map_batches(~ summarize(., min_int = min(int))),
+ tibble(min_int = c(6L, 101L))
+ )
+})
+
+test_that("partitioning = NULL to ignore partition information (but why?)", {
+ ds <- open_dataset(hive_dir, partitioning = NULL)
+ expect_identical(names(ds), names(df1)) # i.e. not c(names(df1), "group", "other")
+})
+
+test_that("head/tail", {
+ # head/tail with no query are still deterministic order
+ ds <- open_dataset(dataset_dir)
+ expect_equal(as.data.frame(head(ds)), head(df1))
+ expect_equal(
+ as.data.frame(head(ds, 12)),
+ rbind(df1, df2[1:2, ])
+ )
+
+ expect_equal(as.data.frame(tail(ds)), tail(df2))
+ expect_equal(
+ as.data.frame(tail(ds, 12)),
+ rbind(df1[9:10, ], df2)
+ )
+})
+
+test_that("Dataset [ (take by index)", {
+ ds <- open_dataset(dataset_dir)
+ # Taking only from one file
+ expect_equal(
+ as.data.frame(ds[c(4, 5, 9), 3:4]),
+ df1[c(4, 5, 9), 3:4]
+ )
+ # Taking from more than one
+ expect_equal(
+ as.data.frame(ds[c(4, 5, 9, 12, 13), 3:4]),
+ rbind(df1[c(4, 5, 9), 3:4], df2[2:3, 3:4])
+ )
+ # Taking out of order
+ expect_equal(
+ as.data.frame(ds[c(4, 13, 9, 12, 5), ]),
+ rbind(
+ df1[4, ],
+ df2[3, ],
+ df1[9, ],
+ df2[2, ],
+ df1[5, ]
+ )
+ )
+
+ # Take from a query
+ ds2 <- ds %>%
+ filter(int > 6) %>%
+ select(int, lgl)
+ expect_equal(
+ as.data.frame(ds2[c(2, 5), ]),
+ rbind(
+ df1[8, c("int", "lgl")],
+ df2[1, c("int", "lgl")]
+ )
+ )
+})
+
+test_that("Dataset and query print methods", {
+ ds <- open_dataset(hive_dir)
+ expect_output(
+ print(ds),
+ paste(
+ "FileSystemDataset with 2 Parquet files",
+ "int: int32",
+ "dbl: double",
+ "lgl: bool",
+ "chr: string",
+ "fct: dictionary<values=string, indices=int32>",
+ "ts: timestamp[us, tz=UTC]",
+ "group: int32",
+ "other: string",
+ sep = "\n"
+ ),
+ fixed = TRUE
+ )
+ expect_type(ds$metadata, "list")
+ q <- select(ds, string = chr, lgl, integer = int)
+ expect_output(
+ print(q),
+ paste(
+ "Dataset (query)",
+ "string: string",
+ "lgl: bool",
+ "integer: int32",
+ "",
+ "See $.data for the source Arrow object",
+ sep = "\n"
+ ),
+ fixed = TRUE
+ )
+ expect_output(
+ print(q %>% filter(integer == 6) %>% group_by(lgl)),
+ paste(
+ "Dataset (query)",
+ "string: string",
+ "lgl: bool",
+ "integer: int32",
+ "",
+ "* Filter: (int == 6)",
+ "* Grouped by lgl",
+ "See $.data for the source Arrow object",
+ sep = "\n"
+ ),
+ fixed = TRUE
+ )
+})
+
+test_that("Scanner$ScanBatches", {
+ ds <- open_dataset(ipc_dir, format = "feather")
+ batches <- ds$NewScan()$Finish()$ScanBatches()
+ table <- Table$create(!!!batches)
+ expect_equal(as.data.frame(table), rbind(df1, df2))
+
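+  # The async scanner should produce the same rows as the sync scan above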
+ batches <- ds$NewScan()$UseAsync(TRUE)$Finish()$ScanBatches()
+ table <- Table$create(!!!batches)
+ expect_equal(as.data.frame(table), rbind(df1, df2))
+})
+
+test_that("Scanner$ToRecordBatchReader()", {
+ ds <- open_dataset(dataset_dir, partitioning = "part")
+ scan <- ds %>%
+ filter(part == 1) %>%
+ select(int, lgl) %>%
+ filter(int > 6) %>%
+ Scanner$create()
+ reader <- scan$ToRecordBatchReader()
+ expect_r6_class(reader, "RecordBatchReader")
+ expect_identical(
+ as.data.frame(reader$read_table()),
+ df1[df1$int > 6, c("int", "lgl")]
+ )
+})
+
+test_that("Scanner$create() filter/projection pushdown", {
+ ds <- open_dataset(dataset_dir, partitioning = "part")
+
+ # the standard to compare all Scanner$create()s against
+ scan_one <- ds %>%
+ filter(int > 7 & dbl < 57) %>%
+ select(int, dbl, lgl) %>%
+ mutate(int_plus = int + 1, dbl_minus = dbl - 1) %>%
+ Scanner$create()
+
+ # select a column in projection
+ scan_two <- ds %>%
+ filter(int > 7 & dbl < 57) %>%
+    # select an extra column (chr), which the explicit projection below drops
+ select(int, dbl, lgl, chr) %>%
+ mutate(int_plus = int + 1, dbl_minus = dbl - 1) %>%
+ Scanner$create(projection = c("int", "dbl", "lgl", "int_plus", "dbl_minus"))
+ expect_identical(
+ as.data.frame(scan_one$ToRecordBatchReader()$read_table()),
+ as.data.frame(scan_two$ToRecordBatchReader()$read_table())
+ )
+
+ # adding filters to Scanner$create
+ scan_three <- ds %>%
+ filter(int > 7) %>%
+ select(int, dbl, lgl) %>%
+ mutate(int_plus = int + 1, dbl_minus = dbl - 1) %>%
+ Scanner$create(
+ filter = Expression$create("less", Expression$field_ref("dbl"), Expression$scalar(57))
+ )
+ expect_identical(
+ as.data.frame(scan_one$ToRecordBatchReader()$read_table()),
+ as.data.frame(scan_three$ToRecordBatchReader()$read_table())
+ )
+
+ expect_error(
+ ds %>%
+ select(int, dbl, lgl) %>%
+ Scanner$create(projection = "not_a_col"),
+ # Full message is "attempting to project with unknown columns" >= 4.0.0, but
+ # prior versions have a less nice "all(projection %in% names(proj)) is not TRUE"
+ "project"
+ )
+
+ expect_error(
+ ds %>%
+ select(int, dbl, lgl) %>%
+ Scanner$create(filter = list("foo", "bar")),
+ "filter expressions must be either an expression or a list of expressions"
+ )
+})
+
+test_that("Assembling a Dataset manually and getting a Table", {
+ fs <- LocalFileSystem$create()
+ selector <- FileSelector$create(dataset_dir, recursive = TRUE)
+ partitioning <- DirectoryPartitioning$create(schema(part = double()))
+
+ fmt <- FileFormat$create("parquet")
+ factory <- FileSystemDatasetFactory$create(fs, selector, NULL, fmt, partitioning = partitioning)
+ expect_r6_class(factory, "FileSystemDatasetFactory")
+
+ schm <- factory$Inspect()
+ expect_r6_class(schm, "Schema")
+
+ phys_schm <- ParquetFileReader$create(files[1])$GetSchema()
+ expect_equal(names(phys_schm), names(df1))
+ expect_equal(names(schm), c(names(phys_schm), "part"))
+
+ child <- factory$Finish(schm)
+ expect_r6_class(child, "FileSystemDataset")
+ expect_r6_class(child$schema, "Schema")
+ expect_r6_class(child$format, "ParquetFileFormat")
+ expect_equal(names(schm), names(child$schema))
+ expect_equal(child$files, files)
+
+ ds <- Dataset$create(list(child), schm)
+ expect_scan_result(ds, schm)
+})
+
+test_that("Assembling multiple DatasetFactories with DatasetFactory", {
+ factory1 <- dataset_factory(file.path(dataset_dir, 1), format = "parquet")
+ expect_r6_class(factory1, "FileSystemDatasetFactory")
+ factory2 <- dataset_factory(file.path(dataset_dir, 2), format = "parquet")
+ expect_r6_class(factory2, "FileSystemDatasetFactory")
+
+ factory <- DatasetFactory$create(list(factory1, factory2))
+ expect_r6_class(factory, "DatasetFactory")
+
+ schm <- factory$Inspect()
+ expect_r6_class(schm, "Schema")
+
+ phys_schm <- ParquetFileReader$create(files[1])$GetSchema()
+ expect_equal(names(phys_schm), names(df1))
+
+ ds <- factory$Finish(schm)
+ expect_r6_class(ds, "UnionDataset")
+ expect_r6_class(ds$schema, "Schema")
+ expect_equal(names(schm), names(ds$schema))
+ expect_equal(unlist(map(ds$children, ~ .$files)), files)
+
+ expect_scan_result(ds, schm)
+})
+
+# see https://issues.apache.org/jira/browse/ARROW-11328
+test_that("Collecting zero columns from a dataset doesn't return entire dataset", {
+ tmp <- tempfile()
+ write_dataset(mtcars, tmp, format = "parquet")
+ expect_equal(
+ open_dataset(tmp) %>% select() %>% collect() %>% dim(),
+ c(32, 0)
+ )
+})
+
+
+test_that("dataset RecordBatchReader to C-interface to arrow_dplyr_query", {
+ ds <- open_dataset(ipc_dir, partitioning = "part", format = "feather")
+
+ # export the RecordBatchReader via the C-interface
+ stream_ptr <- allocate_arrow_array_stream()
+ scan <- Scanner$create(ds)
+ reader <- scan$ToRecordBatchReader()
+ reader$export_to_c(stream_ptr)
+
+ # then import it and check that the roundtripped value is the same
+ circle <- RecordBatchStreamReader$import_from_c(stream_ptr)
+
+ # create an arrow_dplyr_query() from the recordbatch reader
+ reader_adq <- arrow_dplyr_query(circle)
+
+ # TODO: ARROW-14321 should be able to arrange then collect
+ tab_from_c_new <- reader_adq %>%
+    filter(int < 8 | int > 55) %>%
+ mutate(part_plus = part + 6) %>%
+ collect()
+ expect_equal(
+ tab_from_c_new %>%
+ arrange(dbl),
+ ds %>%
+      filter(int < 8 | int > 55) %>%
+ mutate(part_plus = part + 6) %>%
+ collect() %>%
+ arrange(dbl)
+ )
+
+ # must clean up the pointer or we leak
+ delete_arrow_array_stream(stream_ptr)
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-arrange.R b/src/arrow/r/tests/testthat/test-dplyr-arrange.R
new file mode 100644
index 000000000..d22f64a7c
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-arrange.R
@@ -0,0 +1,205 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+
+# randomize order of rows in test data
+tbl <- slice_sample(example_data_for_sorting, prop = 1L)
+
+test_that("arrange() on integer, double, and character columns", {
+ compare_dplyr_binding(
+ .input %>%
+ arrange(int, chr) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ arrange(int, desc(dbl)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ arrange(int, desc(desc(dbl))) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ arrange(int) %>%
+ arrange(desc(dbl)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ arrange(int + dbl, chr) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+      mutate(zzz = int + dbl) %>%
+ arrange(zzz, chr) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(zzz = int + dbl) %>%
+ arrange(int + dbl, chr) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(int + dbl) %>%
+ arrange(int + dbl, chr) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(grp) %>%
+ arrange(int, dbl) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(grp) %>%
+ arrange(int, dbl, .by_group = TRUE) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(grp, grp2) %>%
+ arrange(int, dbl, .by_group = TRUE) %>%
+ collect(),
+ tbl %>%
+ mutate(grp2 = ifelse(is.na(lgl), 1L, as.integer(lgl)))
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(grp) %>%
+ arrange(.by_group = TRUE) %>%
+ pull(grp),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ arrange() %>%
+ collect(),
+ tbl %>%
+ group_by(grp)
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(grp) %>%
+ arrange() %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ arrange() %>%
+ collect(),
+ tbl
+ )
+ test_sort_col <- "chr"
+ compare_dplyr_binding(
+ .input %>%
+ arrange(!!sym(test_sort_col)) %>%
+ collect(),
+ tbl %>%
+ select(chr, lgl)
+ )
+ test_sort_cols <- c("int", "dbl")
+ compare_dplyr_binding(
+ .input %>%
+ arrange(!!!syms(test_sort_cols)) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("arrange() on datetime columns", {
+ compare_dplyr_binding(
+ .input %>%
+ arrange(dttm, int) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ arrange(dttm) %>%
+ collect(),
+ tbl %>%
+ select(dttm, grp)
+ )
+})
+
+test_that("arrange() on logical columns", {
+ compare_dplyr_binding(
+ .input %>%
+ arrange(lgl, int) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("arrange() with bad inputs", {
+ expect_error(
+ tbl %>%
+ Table$create() %>%
+ arrange(1),
+ "does not contain any field names",
+ fixed = TRUE
+ )
+ expect_error(
+ tbl %>%
+ Table$create() %>%
+ arrange(2 + 2),
+ "does not contain any field names",
+ fixed = TRUE
+ )
+ expect_error(
+ tbl %>%
+ Table$create() %>%
+ arrange(aertidjfgjksertyj),
+ "not found",
+ fixed = TRUE
+ )
+ expect_error(
+ tbl %>%
+ Table$create() %>%
+ arrange(desc(aertidjfgjksertyj + iaermxiwerksxsdqq)),
+ "not found",
+ fixed = TRUE
+ )
+ expect_error(
+ tbl %>%
+ Table$create() %>%
+ arrange(desc(int, chr)),
+ "expects only one argument",
+ fixed = TRUE
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-collapse.R b/src/arrow/r/tests/testthat/test-dplyr-collapse.R
new file mode 100644
index 000000000..c7281b62d
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-collapse.R
@@ -0,0 +1,235 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+withr::local_options(list(arrow.summarise.sort = TRUE))
+
+library(dplyr, warn.conflicts = FALSE)
+library(stringr)
+
+tbl <- example_data
+# Add some better string data
+tbl$verses <- verses[[1]]
+# c(" a ", " b ", " c ", ...) increasing padding
+# nchar = 3 5 7 9 11 13 15 17 19 21
+tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both")
+tbl$some_grouping <- rep(c(1, 2), 5)
+
+tab <- Table$create(tbl)
+
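+# implicit_schema() derives the output schema of a query without evaluating it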
+test_that("implicit_schema with select", {
+ expect_equal(
+ tab %>%
+ select(int, lgl) %>%
+ implicit_schema(),
+ schema(int = int32(), lgl = bool())
+ )
+})
+
+test_that("implicit_schema with rename", {
+ expect_equal(
+ tab %>%
+ select(numbers = int, lgl) %>%
+ implicit_schema(),
+ schema(numbers = int32(), lgl = bool())
+ )
+})
+
+test_that("implicit_schema with mutate", {
+ expect_equal(
+ tab %>%
+ transmute(
+ numbers = int * 4,
+ words = as.character(int)
+ ) %>%
+ implicit_schema(),
+ schema(numbers = float64(), words = utf8())
+ )
+})
+
+test_that("implicit_schema with summarize", {
+ expect_equal(
+ tab %>%
+ summarize(
+ avg = mean(int)
+ ) %>%
+ implicit_schema(),
+ schema(avg = float64())
+ )
+})
+
+test_that("implicit_schema with group_by summarize", {
+ expect_equal(
+ tab %>%
+ group_by(some_grouping) %>%
+ summarize(
+ avg = mean(int * 5L)
+ ) %>%
+ implicit_schema(),
+ schema(some_grouping = float64(), avg = float64())
+ )
+})
+
+test_that("collapse", {
+ q <- tab %>%
+ filter(dbl > 2, chr == "d" | chr == "f") %>%
+ select(chr, int, lgl) %>%
+ mutate(twice = int * 2L)
+ expect_false(is_collapsed(q))
+ expect_true(is_collapsed(collapse(q)))
+ expect_false(is_collapsed(collapse(q)$.data))
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl > 2, chr == "d" | chr == "f") %>%
+ select(chr, int, lgl) %>%
+ mutate(twice = int * 2L) %>%
+ collapse() %>%
+ filter(int < 5) %>%
+ select(int, twice) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl > 2, chr == "d" | chr == "f") %>%
+ collapse() %>%
+ select(chr, int, lgl) %>%
+ collapse() %>%
+ filter(int < 5) %>%
+ select(int, chr) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl > 2, chr == "d" | chr == "f") %>%
+ collapse() %>%
+ group_by(chr) %>%
+ select(chr, int, lgl) %>%
+ collapse() %>%
+ filter(int < 5) %>%
+ select(int, chr) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Properties of collapsed query", {
+ q <- tab %>%
+ filter(dbl > 2) %>%
+ select(chr, int, lgl) %>%
+ mutate(twice = int * 2L) %>%
+ group_by(lgl) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ mutate(extra = total * 5)
+
+ # print(tbl %>%
+ # filter(dbl > 2) %>%
+ # select(chr, int, lgl) %>%
+ # mutate(twice = int * 2L) %>%
+ # group_by(lgl) %>%
+ # summarize(total = sum(int, na.rm = TRUE)) %>%
+ # mutate(extra = total * 5))
+
+ # # A tibble: 3 × 3
+ # lgl total extra
+ # <lgl> <int> <dbl>
+ # 1 FALSE 8 40
+ # 2 TRUE 8 40
+ # 3 NA 25 125
+
+ # Avoid evaluating just for nrow
+ expect_identical(dim(q), c(NA_integer_, 3L))
+
+ expect_output(
+ print(q),
+ "InMemoryDataset (query)
+lgl: bool
+total: int32
+extra: double (multiply_checked(total, 5))
+
+See $.data for the source Arrow object",
+ fixed = TRUE
+ )
+ expect_output(
+ print(q$.data),
+ "InMemoryDataset (query)
+int: int32
+lgl: bool
+
+* Aggregations:
+total: sum(int)
+* Filter: (dbl > 2)
+* Grouped by lgl
+See $.data for the source Arrow object",
+ fixed = TRUE
+ )
+
+ skip_if(getRversion() < "3.6.0", "TODO investigate why these aren't equal")
+ # On older R versions:
+ # ── Failure (test-dplyr-collapse.R:172:3): Properties of collapsed query ────────
+ # head(q, 1) %>% collect() not equal to tibble::tibble(lgl = FALSE, total = 8L, extra = 40).
+ # Component "total": Mean relative difference: 0.3846154
+ # Component "extra": Mean relative difference: 0.3846154
+ # ── Failure (test-dplyr-collapse.R:176:3): Properties of collapsed query ────────
+ # tail(q, 1) %>% collect() not equal to tibble::tibble(lgl = NA, total = 25L, extra = 125).
+ # Component "total": Mean relative difference: 0.9230769
+ # Component "extra": Mean relative difference: 0.9230769
+ expect_equal(
+ q %>% head(1) %>% collect(),
+ tibble::tibble(lgl = FALSE, total = 8L, extra = 40)
+ )
+ skip("TODO (ARROW-1XXXX): implement sorting option about where NAs go")
+ expect_equal(
+ q %>% tail(1) %>% collect(),
+ tibble::tibble(lgl = NA, total = 25L, extra = 125)
+ )
+})
+
+test_that("query_on_dataset handles collapse()", {
+ expect_false(query_on_dataset(
+ tab %>%
+ select(int, chr)
+ ))
+ expect_false(query_on_dataset(
+ tab %>%
+ select(int, chr) %>%
+ collapse() %>%
+ select(int)
+ ))
+
+ ds_dir <- tempfile()
+ dir.create(ds_dir)
+ on.exit(unlink(ds_dir))
+ write_parquet(tab, file.path(ds_dir, "file.parquet"))
+ ds <- open_dataset(ds_dir)
+
+ expect_true(query_on_dataset(
+ ds %>%
+ select(int, chr)
+ ))
+ expect_true(query_on_dataset(
+ ds %>%
+ select(int, chr) %>%
+ collapse() %>%
+ select(int)
+ ))
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-count.R b/src/arrow/r/tests/testthat/test-dplyr-count.R
new file mode 100644
index 000000000..8af9b57aa
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-count.R
@@ -0,0 +1,92 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+
+tbl <- example_data
+tbl$some_grouping <- rep(c(1, 2), 5)
+
+test_that("count/tally", {
+ compare_dplyr_binding(
+ .input %>%
+ count() %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ tally() %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("count/tally with wt and grouped data", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ count(wt = int) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ tally(wt = int) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("count/tally with sort", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ count(wt = int, sort = TRUE) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ tally(wt = int, sort = TRUE) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("count/tally with name arg", {
+ compare_dplyr_binding(
+ .input %>%
+ count(name = "new_col") %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ tally(name = "new_col") %>%
+ collect(),
+ tbl
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-distinct.R b/src/arrow/r/tests/testthat/test-dplyr-distinct.R
new file mode 100644
index 000000000..3a44c7372
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-distinct.R
@@ -0,0 +1,104 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+
+tbl <- example_data
+tbl$some_grouping <- rep(c(1, 2), 5)
+
+test_that("distinct()", {
+ compare_dplyr_binding(
+ .input %>%
+ distinct(some_grouping, lgl) %>%
+ collect() %>%
+ arrange(some_grouping, lgl),
+ tbl
+ )
+})
+
+test_that("distinct() works without any variables", {
+ compare_dplyr_binding(
+ .input %>%
+ distinct() %>%
+ arrange(int) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(x = int + 1) %>%
+ distinct() %>%
+ # Even though we have group_by(x), all cols (including int) are kept
+ arrange(int) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("distinct() can retain groups", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping, int) %>%
+ distinct(lgl) %>%
+ collect() %>%
+ arrange(lgl, int),
+ tbl
+ )
+
+ # With expressions here
+ compare_dplyr_binding(
+ .input %>%
+ group_by(y = some_grouping, int) %>%
+ distinct(x = lgl) %>%
+ collect() %>%
+ arrange(int),
+ tbl
+ )
+})
+
+test_that("distinct() can contain expressions", {
+ compare_dplyr_binding(
+ .input %>%
+ distinct(lgl, x = some_grouping + 1) %>%
+ collect() %>%
+ arrange(lgl, x),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(lgl, int) %>%
+ distinct(x = some_grouping + 1) %>%
+ collect() %>%
+ arrange(int),
+ tbl
+ )
+})
+
+test_that("distinct() can return all columns", {
+ skip("ARROW-13993 - need this to return correct rows from other cols")
+ compare_dplyr_binding(
+ .input %>%
+ distinct(lgl, .keep_all = TRUE) %>%
+ collect() %>%
+ arrange(int),
+ tbl
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-filter.R b/src/arrow/r/tests/testthat/test-dplyr-filter.R
new file mode 100644
index 000000000..72a64229c
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-filter.R
@@ -0,0 +1,412 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+library(stringr)
+
+tbl <- example_data
+# Add some better string data
+tbl$verses <- verses[[1]]
+# c(" a ", " b ", " c ", ...) increasing padding
+# nchar = 3 5 7 9 11 13 15 17 19 21
+tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both")
+tbl$some_negative <- tbl$int * (-1)^(1:nrow(tbl)) # nolint
+
+test_that("filter() on is.na()", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(is.na(lgl)) %>%
+ select(chr, int, lgl) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("filter() with NAs in selection", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(lgl) %>%
+ select(chr, int, lgl) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Filter returning an empty Table should not segfault (ARROW-8354)", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(false) %>%
+ select(chr, int, lgl) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("filtering with expression", {
+ char_sym <- "b"
+ compare_dplyr_binding(
+ .input %>%
+ filter(chr == char_sym) %>%
+ select(string = chr, int) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("filtering with arithmetic", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl + 1 > 3) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl / 2 > 3) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl / 2L > 3) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(int / 2 > 3) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(int / 2L > 3) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl %/% 2 > 3) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl^2 > 3) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("filtering with expression + autocasting", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl + 1 > 3L) %>% # test autocasting with comparison to 3L
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(int + 1 > 3) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(int^2 > 3) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("More complex select/filter", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl > 2, chr == "d" | chr == "f") %>%
+ select(chr, int, lgl) %>%
+ filter(int < 5) %>%
+ select(int, chr) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("filter() with %in%", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl > 2, chr %in% c("d", "f")) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Negative scalar values", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(some_negative > -2) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(some_negative %in% -1) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(int == -some_negative) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("filter() with between()", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(between(dbl, 1, 2)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(between(dbl, 0.5, 2)) %>%
+ collect(),
+ tbl
+ )
+
+ expect_identical(
+ tbl %>%
+ record_batch() %>%
+ filter(between(dbl, int, dbl2)) %>%
+ collect(),
+ tbl %>%
+ filter(dbl >= int, dbl <= dbl2)
+ )
+
+ expect_error(
+ tbl %>%
+ record_batch() %>%
+ filter(between(dbl, 1, "2")) %>%
+ collect()
+ )
+
+ expect_error(
+ tbl %>%
+ record_batch() %>%
+ filter(between(dbl, 1, NA)) %>%
+ collect()
+ )
+
+ expect_error(
+ tbl %>%
+ record_batch() %>%
+ filter(between(chr, 1, 2)) %>%
+ collect()
+ )
+})
+
+test_that("filter() with string ops", {
+ skip_if_not_available("utf8proc")
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl > 2, str_length(verses) > 25) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl > 2, str_length(str_trim(padded_strings, "left")) > 5) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("filter environment scope", {
+ # "object 'b_var' not found"
+ compare_dplyr_error(.input %>% filter(chr == b_var), tbl)
+
+ b_var <- "b"
+ compare_dplyr_binding(
+ .input %>%
+ filter(chr == b_var) %>%
+ collect(),
+ tbl
+ )
+ # Also for functions
+ # 'could not find function "isEqualTo"' because we haven't defined it yet
+ compare_dplyr_error(.input %>% filter(isEqualTo(int, 4)), tbl)
+
+ # This works but only because there are S3 methods for those operations
+ isEqualTo <- function(x, y) x == y & !is.na(x)
+ compare_dplyr_binding(
+ .input %>%
+ select(-fct) %>% # factor levels aren't identical
+ filter(isEqualTo(int, 4)) %>%
+ collect(),
+ tbl
+ )
+ # Try something that needs to call another nse_func
+ compare_dplyr_binding(
+ .input %>%
+ select(-fct) %>%
+ filter(nchar(padded_strings) < 10) %>%
+ collect(),
+ tbl
+ )
+ isShortString <- function(x) nchar(x) < 10
+ skip("TODO: 14071")
+ compare_dplyr_binding(
+ .input %>%
+ select(-fct) %>%
+ filter(isShortString(padded_strings)) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Filtering on a column that doesn't exist errors correctly", {
+ with_language("fr", {
+ # expect_warning(., NA) because the usual behavior when it hits a filter
+ # that it can't evaluate is to raise a warning, collect() to R, and retry
+ # the filter. But we want this to error the first time because it's
+ # a user error, not solvable by retrying in R
+ expect_warning(
+ expect_error(
+ tbl %>% record_batch() %>% filter(not_a_col == 42) %>% collect(),
+ "objet 'not_a_col' introuvable"
+ ),
+ NA
+ )
+ })
+ with_language("en", {
+ expect_warning(
+ expect_error(
+ tbl %>% record_batch() %>% filter(not_a_col == 42) %>% collect(),
+ "object 'not_a_col' not found"
+ ),
+ NA
+ )
+ })
+})
+
+test_that("Filtering with unsupported functions", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(int > 2, pnorm(dbl) > .99) %>%
+ collect(),
+ tbl,
+ warning = "Expression pnorm\\(dbl\\) > 0.99 not supported in Arrow; pulling data into R"
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(
+ nchar(chr, type = "bytes", allowNA = TRUE) == 1, # bad, Arrow msg
+ int > 2, # good
+ pnorm(dbl) > .99 # bad, opaque
+ ) %>%
+ collect(),
+ tbl,
+ warning = '\\* In nchar\\(chr, type = "bytes", allowNA = TRUE\\) == 1, allowNA = TRUE not supported by Arrow
+\\* Expression pnorm\\(dbl\\) > 0.99 not supported in Arrow
+pulling data into R'
+ )
+})
+
+test_that("Calling Arrow compute functions 'directly'", {
+ expect_equal(
+ tbl %>%
+ record_batch() %>%
+ filter(arrow_add(dbl, 1) > 3L) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl %>%
+ filter(dbl + 1 > 3L) %>%
+ select(string = chr, int, dbl)
+ )
+
+  expect_equal(
+ tbl %>%
+ record_batch() %>%
+ filter(arrow_greater(arrow_add(dbl, 1), 3L)) %>%
+ select(string = chr, int, dbl) %>%
+ collect(),
+ tbl %>%
+ filter(dbl + 1 > 3L) %>%
+ select(string = chr, int, dbl)
+ )
+})
+
+test_that("filter() with .data pronoun", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(.data$dbl > 4) %>%
+ select(.data$chr, .data$int, .data$lgl) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(is.na(.data$lgl)) %>%
+ select(.data$chr, .data$int, .data$lgl) %>%
+ collect(),
+ tbl
+ )
+
+ # and the .env pronoun too!
+ chr <- 4
+ compare_dplyr_binding(
+ .input %>%
+ filter(.data$dbl > .env$chr) %>%
+ select(.data$chr, .data$int, .data$lgl) %>%
+ collect(),
+ tbl
+ )
+
+ skip("test now faulty - code no longer gives error & outputs a empty tibble")
+ # but there is an error if we don't override the masking with `.env`
+ compare_dplyr_error(
+ .input %>%
+ filter(.data$dbl > chr) %>%
+ select(.data$chr, .data$int, .data$lgl) %>%
+ collect(),
+ tbl
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-funcs-conditional.R b/src/arrow/r/tests/testthat/test-dplyr-funcs-conditional.R
new file mode 100644
index 000000000..4f2700795
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-funcs-conditional.R
@@ -0,0 +1,409 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+suppressPackageStartupMessages(library(bit64))
+
+
+tbl <- example_data
+tbl$verses <- verses[[1]]
+tbl$another_chr <- tail(letters, 10)
+
+test_that("if_else and ifelse", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(int > 5, 1, 0)
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(int > 5, int, 0L)
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ expect_error(
+ Table$create(tbl) %>%
+ mutate(
+ y = if_else(int > 5, 1, FALSE)
+ ) %>%
+ collect(),
+ "NotImplemented: Function if_else has no kernel matching input types"
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(int > 5, 1, NA_real_)
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = ifelse(int > 5, 1, 0)
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(dbl > 5, TRUE, FALSE)
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(chr %in% letters[1:3], 1L, 3L)
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(int > 5, "one", "zero")
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(int > 5, chr, another_chr)
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(int > 5, "true", chr, missing = "MISSING")
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ # TODO: remove the mutate + warning after ARROW-13358 is merged and Arrow
+ # supports factors in if(_)else
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(int > 5, fct, factor("a"))
+ ) %>%
+ collect() %>%
+ # This is a no-op on the Arrow side, but necessary to make the results equal
+ mutate(y = as.character(y)),
+ tbl,
+ warning = "Dictionaries .* are currently converted to strings .* in if_else and ifelse"
+ )
+
+ # detecting NA and NaN works just fine
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(is.na(dbl), chr, "false", missing = "MISSING")
+ ) %>%
+ collect(),
+ example_data_for_sorting
+ )
+
+ # However, currently comparisons with NaNs return false and not NaNs or NAs
+ skip("ARROW-13364")
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y = if_else(dbl > 5, chr, another_chr, missing = "MISSING")
+ ) %>%
+ collect(),
+ example_data_for_sorting
+ )
+
+ skip("TODO: could? should? we support the autocasting in ifelse")
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = ifelse(int > 5, 1, FALSE)) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("case_when()", {
+ compare_dplyr_binding(
+ .input %>%
+ transmute(cw = case_when(lgl ~ dbl, !false ~ dbl + dbl2)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(cw = case_when(int > 5 ~ 1, TRUE ~ 0)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(cw = case_when(chr %in% letters[1:3] ~ 1L) + 41L) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(case_when(
+ dbl + int - 1.1 == dbl2 ~ TRUE,
+ NA ~ NA,
+ TRUE ~ FALSE
+ ) & !is.na(dbl2)) %>%
+ collect(),
+ tbl
+ )
+
+ # dplyr::case_when() errors if values on right side of formulas do not have
+ # exactly the same type, but the Arrow case_when kernel allows compatible types
+ expect_equal(
+ tbl %>%
+ mutate(i64 = as.integer64(1e10)) %>%
+ Table$create() %>%
+ transmute(cw = case_when(
+ is.na(fct) ~ int,
+ is.na(chr) ~ dbl,
+ TRUE ~ i64
+ )) %>%
+ collect(),
+ tbl %>%
+ transmute(
+ cw = ifelse(is.na(fct), int, ifelse(is.na(chr), dbl, 1e10))
+ )
+ )
+
+ # expected errors (which are caught by abandon_ship() and changed to warnings)
+ # TODO: Find a way to test these directly without abandon_ship() interfering
+ expect_error(
+ # no cases
+ expect_warning(
+ tbl %>%
+ Table$create() %>%
+ transmute(cw = case_when()),
+ "case_when"
+ )
+ )
+ expect_error(
+ # argument not a formula
+ expect_warning(
+ tbl %>%
+ Table$create() %>%
+ transmute(cw = case_when(TRUE ~ FALSE, TRUE)),
+ "case_when"
+ )
+ )
+ expect_error(
+ # non-logical R scalar on left side of formula
+ expect_warning(
+ tbl %>%
+ Table$create() %>%
+ transmute(cw = case_when(0L ~ FALSE, TRUE ~ FALSE)),
+ "case_when"
+ )
+ )
+ expect_error(
+ # non-logical Arrow column reference on left side of formula
+ expect_warning(
+ tbl %>%
+ Table$create() %>%
+ transmute(cw = case_when(int ~ FALSE)),
+ "case_when"
+ )
+ )
+ expect_error(
+ # non-logical Arrow expression on left side of formula
+ expect_warning(
+ tbl %>%
+ Table$create() %>%
+ transmute(cw = case_when(dbl + 3.14159 ~ TRUE)),
+ "case_when"
+ )
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ transmute(cw = case_when(lgl ~ "abc")) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(cw = case_when(lgl ~ verses, !false ~ paste(chr, chr))) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ cw = case_when(!(!(!(lgl))) ~ factor(chr), TRUE ~ fct)
+ ) %>%
+ collect(),
+ tbl,
+ warning = TRUE
+ )
+})
+
+test_that("coalesce()", {
+ # character
+ df <- tibble(
+ w = c(NA_character_, NA_character_, NA_character_),
+ x = c(NA_character_, NA_character_, "c"),
+ y = c(NA_character_, "b", "c"),
+ z = c("a", "b", "c")
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ cw = coalesce(w),
+ cz = coalesce(z),
+ cwx = coalesce(w, x),
+ cwxy = coalesce(w, x, y),
+ cwxyz = coalesce(w, x, y, z)
+ ) %>%
+ collect(),
+ df
+ )
+
+ # integer
+ df <- tibble(
+ w = c(NA_integer_, NA_integer_, NA_integer_),
+ x = c(NA_integer_, NA_integer_, 3L),
+ y = c(NA_integer_, 2L, 3L),
+ z = 1:3
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ cw = coalesce(w),
+ cz = coalesce(z),
+ cwx = coalesce(w, x),
+ cwxy = coalesce(w, x, y),
+ cwxyz = coalesce(w, x, y, z)
+ ) %>%
+ collect(),
+ df
+ )
+
+ # double with NaNs
+ df <- tibble(
+ w = c(NA_real_, NaN, NA_real_),
+ x = c(NA_real_, NaN, 3.3),
+ y = c(NA_real_, 2.2, 3.3),
+ z = c(1.1, 2.2, 3.3)
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ cw = coalesce(w),
+ cz = coalesce(z),
+ cwx = coalesce(w, x),
+ cwxy = coalesce(w, x, y),
+ cwxyz = coalesce(w, x, y, z)
+ ) %>%
+ collect(),
+ df
+ )
+ # NaNs stay NaN and are not converted to NA in the results
+ # (testing this requires expect_identical())
+ expect_identical(
+ df %>% Table$create() %>% mutate(cwx = coalesce(w, x)) %>% collect(),
+ df %>% mutate(cwx = coalesce(w, x))
+ )
+ expect_identical(
+ df %>% Table$create() %>% transmute(cw = coalesce(w)) %>% collect(),
+ df %>% transmute(cw = coalesce(w))
+ )
+ expect_identical(
+ df %>% Table$create() %>% transmute(cn = coalesce(NaN)) %>% collect(),
+ df %>% transmute(cn = coalesce(NaN))
+ )
+ # singles stay single
+ expect_equal(
+ (df %>%
+ Table$create(schema = schema(
+ w = float32(),
+ x = float32(),
+ y = float32(),
+ z = float32()
+ )) %>%
+ transmute(c = coalesce(w, x, y, z)) %>%
+ compute()
+ )$schema[[1]]$type,
+ float32()
+ )
+ # with R literal values
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ c1 = coalesce(4.4),
+ c2 = coalesce(NA_real_),
+ c3 = coalesce(NaN),
+ c4 = coalesce(w, x, y, 5.5),
+ c5 = coalesce(w, x, y, NA_real_),
+ c6 = coalesce(w, x, y, NaN)
+ ) %>%
+ collect(),
+ df
+ )
+
+ # factors
+ # TODO: remove the mutate + warning after ARROW-14167 is merged and Arrow
+ # supports factors in coalesce
+ df <- tibble(
+ x = factor("a", levels = c("a", "z")),
+ y = factor("b", levels = c("a", "b", "c"))
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(c = coalesce(x, y)) %>%
+ collect() %>%
+ # This is a no-op on the Arrow side, but necessary to make the results equal
+ mutate(c = as.character(c)),
+ df,
+ warning = "Dictionaries .* are currently converted to strings .* in coalesce"
+ )
+
+ # no arguments
+ expect_error(
+ nse_funcs$coalesce(),
+ "At least one argument must be supplied to coalesce()",
+ fixed = TRUE
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-funcs-datetime.R b/src/arrow/r/tests/testthat/test-dplyr-funcs-datetime.R
new file mode 100644
index 000000000..5cb515e69
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-funcs-datetime.R
@@ -0,0 +1,304 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(lubridate, warn.conflicts = FALSE)
+library(dplyr, warn.conflicts = FALSE)
+
+# base::strptime() defaults to local timezone
+# but arrow's strptime defaults to UTC.
+# So that tests are consistent, set the local timezone to UTC
+# TODO: consider reevaluating this workaround after ARROW-12980
+withr::local_timezone("UTC")
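+# To make that difference concrete (a sketch, assuming a non-UTC local zone
+# such as "Pacific/Marquesas"): base::strptime("2018-10-07 19:04:05",
+# "%Y-%m-%d %H:%M:%S") would be read in the local zone, while Arrow's
+# strptime kernel would read the same string as UTC, so the two instants
+# would differ by the local UTC offset. Pinning the session to UTC removes
+# that gap.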
+
+# TODO: We should test on windows once ARROW-13168 is resolved.
+if (tolower(Sys.info()[["sysname"]]) == "windows") {
+ test_date <- as.POSIXct("2017-01-01 00:00:11.3456789", tz = "")
+} else {
+ test_date <- as.POSIXct("2017-01-01 00:00:11.3456789", tz = "Pacific/Marquesas")
+}
+
+
+test_df <- tibble::tibble(
+ # test_date + 1 turns the tzone = "" into NULL, which is functionally
+ # equivalent and lets us run some tests on Windows while skirting around
+ # https://issues.apache.org/jira/browse/ARROW-13588
+ # That issue is tough because in C++, "" is the "no timezone" value
+ # due to static typing, so we can't distinguish a literal "" from NULL
+ datetime = c(test_date, NA) + 1,
+ date = c(as.Date("2021-09-09"), NA)
+)
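+# (Concretely, per the comment in test_df above: attr(test_date, "tzone") is
+# "" when created with tz = "", but R's datetime arithmetic drops an empty
+# tzone, so attr(test_date + 1, "tzone") is NULL.)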
+
+# These tests test component extraction from timestamp objects
+
+test_that("extract year from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = year(datetime)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract isoyear from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = isoyear(datetime)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract quarter from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = quarter(datetime)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract month from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = month(datetime)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract isoweek from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = isoweek(datetime)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract epiweek from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = epiweek(datetime)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract day from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = day(datetime)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract wday from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = wday(datetime)) %>%
+ collect(),
+ test_df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = wday(date, week_start = 3)) %>%
+ collect(),
+ test_df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = wday(date, week_start = 1)) %>%
+ collect(),
+ test_df
+ )
+
+ skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = wday(date, label = TRUE)) %>%
+ mutate(x = as.character(x)) %>%
+ collect(),
+ test_df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = wday(datetime, label = TRUE, abbr = TRUE)) %>%
+ mutate(x = as.character(x)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract yday from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = yday(datetime)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract hour from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = hour(datetime)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract minute from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = minute(datetime)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract second from timestamp", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = second(datetime)) %>%
+ collect(),
+ test_df,
+ # arrow supports nanosecond resolution but lubridate does not
+ tolerance = 1e-6
+ )
+})
+
+# These tests test extraction of components from date32 objects
+
+test_that("extract year from date", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = year(date)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract isoyear from date", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = isoyear(date)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract quarter from date", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = quarter(date)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract month from date", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = month(date)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract isoweek from date", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = isoweek(date)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract epiweek from date", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = epiweek(date)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract day from date", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = day(date)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract wday from date", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = wday(date)) %>%
+ collect(),
+ test_df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = wday(date, week_start = 3)) %>%
+ collect(),
+ test_df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = wday(date, week_start = 1)) %>%
+ collect(),
+ test_df
+ )
+
+ skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = wday(date, label = TRUE, abbr = TRUE)) %>%
+ mutate(x = as.character(x)) %>%
+ collect(),
+ test_df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = wday(date, label = TRUE)) %>%
+ mutate(x = as.character(x)) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("extract yday from date", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = yday(date)) %>%
+ collect(),
+ test_df
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-funcs-math.R b/src/arrow/r/tests/testthat/test-dplyr-funcs-math.R
new file mode 100644
index 000000000..b66630675
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-funcs-math.R
@@ -0,0 +1,309 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+
+
+test_that("abs()", {
+ df <- tibble(x = c(-127, -10, -1, -0, 0, 1, 10, 127, NA))
+
+ compare_dplyr_binding(
+ .input %>%
+ transmute(abs = abs(x)) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("sign()", {
+ df <- tibble(x = c(-127, -10, -1, -0, 0, 1, 10, 127, NA))
+
+ compare_dplyr_binding(
+ .input %>%
+ transmute(sign = sign(x)) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("ceiling(), floor(), trunc(), round()", {
+ df <- tibble(x = c(-1, -0.55, -0.5, -0.1, 0, 0.1, 0.5, 0.55, 1, NA, NaN))
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ c = ceiling(x),
+ f = floor(x),
+ t = trunc(x),
+ r = round(x)
+ ) %>%
+ collect(),
+ df
+ )
+
+ # with digits set to 1
+ compare_dplyr_binding(
+ .input %>%
+ filter(x %% 0.5 == 0) %>% # filter out indeterminate cases (see below)
+ mutate(r = round(x, 1)) %>%
+ collect(),
+ df
+ )
+
+ # with digits set to -1
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ rd = round(floor(x * 111), -1), # double
+ y = ifelse(is.nan(x), NA_integer_, x),
+ ri = round(as.integer(y * 111), -1) # integer (with the NaN removed)
+ ) %>%
+ collect(),
+ df
+ )
+
+ # round(x, -2) is equivalent to round_to_multiple(x, 100)
+ expect_equal(
+ Table$create(x = 1111.1) %>%
+ mutate(r = round(x, -2)) %>%
+ collect(),
+ Table$create(x = 1111.1) %>%
+ mutate(r = arrow_round_to_multiple(x, options = list(multiple = 100))) %>%
+ collect()
+ )
+
+ # For consistency with base R, the binding for round() uses the Arrow
+ # library's HALF_TO_EVEN round mode, but the expectations *above* would pass
+ # even if another round mode were used. The expectations *below* should fail
+ # with other round modes. However, some decimal numbers cannot be represented
+ # exactly as floating point numbers, and for the ones that also end in 5 (such
+ # as 0.55), R's rounding behavior is indeterminate: it will vary depending on
+ # the OS. In practice, this seems to affect Windows, so we skip these tests
+ # on Windows and on CRAN.
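+ # (Concretely: base R's half-to-even rounding gives round(0.5) == 0 and
+ # round(1.5) == 2, whereas HALF_TOWARDS_ZERO would give 0 and 1.)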
+
+ skip_on_cran()
+ skip_on_os("windows")
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(r = round(x, 1)) %>%
+ collect(),
+ df
+ )
+
+ # Verify that round mode HALF_TO_EVEN, which is what the round() binding uses,
+ # yields results consistent with R...
+ expect_equal(
+ as.vector(
+ call_function(
+ "round",
+ Array$create(df$x),
+ options = list(ndigits = 1L, round_mode = RoundMode$HALF_TO_EVEN)
+ )
+ ),
+ round(df$x, 1)
+ )
+ # ...but that the round mode HALF_TOWARDS_ZERO does not. If the expectation
+ # below fails, it means that the expectation above is not effectively testing
+ # that Arrow is using the HALF_TO_EVEN mode.
+ expect_false(
+ isTRUE(all.equal(
+ as.vector(
+ call_function(
+ "round",
+ Array$create(df$x),
+ options = list(ndigits = 1L, round_mode = RoundMode$HALF_TOWARDS_ZERO)
+ )
+ ),
+ round(df$x, 1)
+ ))
+ )
+})
+
+test_that("log functions", {
+ df <- tibble(x = c(1:10, NA, NA))
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = log(x)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = log(x, base = exp(1))) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = log(x, base = 2)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = log(x, base = 10)) %>%
+ collect(),
+ df
+ )
+
+ # test log() with a length-1 numeric base
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = log(x, base = 5)) %>%
+ collect(),
+ df
+ )
+
+ # test log() with a base of length != 1 (errors)
+ expect_error(
+ nse_funcs$log(10, base = 5:6),
+ "base must be a column or a length-1 numeric; other values not supported by Arrow",
+ fixed = TRUE
+ )
+
+ # test log() with x of length != 1 (errors)
+ expect_error(
+ nse_funcs$log(10:11),
+ "x must be a column or a length-1 numeric; other values not supported by Arrow",
+ fixed = TRUE
+ )
+
+ # test log() with base as an Expression
+ compare_dplyr_binding(
+ .input %>%
+ # test cases where base = 1 below
+ filter(x != 1) %>%
+ mutate(
+ y = log(x, base = x),
+ z = log(2, base = x)
+ ) %>%
+ collect(),
+ df
+ )
+
+ # log(1, base = 1) is NaN in both R and Arrow
+ # suppress the R warning because R warns but Arrow does not
+ suppressWarnings(
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = log(x, base = y)) %>%
+ collect(),
+ tibble(x = 1, y = 1)
+ )
+ )
+
+ # log(n != 1, base = 1) is Inf in R and Arrow
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = log(x, base = y)) %>%
+ collect(),
+ tibble(x = 10, y = 1)
+ )
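+ # (Both results follow from the change-of-base identity
+ # log(x, base) = log(x) / log(base): log(1)/log(1) = 0/0 = NaN, and
+ # log(10)/log(1) = log(10)/0 = Inf.)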
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = logb(x)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = log1p(x)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = log2(x)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = log10(x)) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("trig functions", {
+ df <- tibble(x = c(seq(from = 0, to = 1, by = 0.1), NA))
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = sin(x)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = cos(x)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = tan(x)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = asin(x)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = acos(x)) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("arith functions ", {
+ df <- tibble(x = c(1:5, NA))
+
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ int_div = x %/% 2,
+ addition = x + 1,
+ multiplication = x * 3,
+ subtraction = x - 5,
+ division = x / 2,
+ power = x^3,
+ modulo = x %% 3
+ ) %>%
+ collect(),
+ df
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-funcs-string.R b/src/arrow/r/tests/testthat/test-dplyr-funcs-string.R
new file mode 100644
index 000000000..5e092f4e3
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-funcs-string.R
@@ -0,0 +1,1399 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+skip_if_not_available("utf8proc")
+
+library(dplyr, warn.conflicts = FALSE)
+library(lubridate)
+library(stringr)
+library(stringi)
+
+test_that("paste, paste0, and str_c", {
+ df <- tibble(
+ v = c("A", "B", "C"),
+ w = c("a", "b", "c"),
+ x = c("d", NA_character_, "f"),
+ y = c(NA_character_, "h", "i"),
+ z = c(1.1, 2.2, NA)
+ )
+ x <- Expression$field_ref("x")
+ y <- Expression$field_ref("y")
+
+ # no NAs in data
+ compare_dplyr_binding(
+ .input %>%
+ transmute(paste(v, w)) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(paste(v, w, sep = "-")) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(paste0(v, w)) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(str_c(v, w)) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(str_c(v, w, sep = "+")) %>%
+ collect(),
+ df
+ )
+
+ # NAs in data
+ compare_dplyr_binding(
+ .input %>%
+ transmute(paste(x, y)) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(paste(x, y, sep = "-")) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(str_c(x, y)) %>%
+ collect(),
+ df
+ )
+
+ # non-character column in dots
+ compare_dplyr_binding(
+ .input %>%
+ transmute(paste0(x, y, z)) %>%
+ collect(),
+ df
+ )
+
+ # literal string in dots
+ compare_dplyr_binding(
+ .input %>%
+ transmute(paste(x, "foo", y)) %>%
+ collect(),
+ df
+ )
+
+ # literal NA in dots
+ compare_dplyr_binding(
+ .input %>%
+ transmute(paste(x, NA, y)) %>%
+ collect(),
+ df
+ )
+
+ # expressions in dots
+ compare_dplyr_binding(
+ .input %>%
+ transmute(paste0(x, toupper(y), as.character(z))) %>%
+ collect(),
+ df
+ )
+
+ # sep is literal NA
+ # errors in paste() (consistent with base::paste())
+ expect_error(
+ nse_funcs$paste(x, y, sep = NA_character_),
+ "Invalid separator"
+ )
+ # emits null in str_c() (consistent with stringr::str_c())
+ compare_dplyr_binding(
+ .input %>%
+ transmute(str_c(x, y, sep = NA_character_)) %>%
+ collect(),
+ df
+ )
+
+ # sep passed in dots to paste0 (which doesn't take a sep argument)
+ compare_dplyr_binding(
+ .input %>%
+ transmute(paste0(x, y, sep = "-")) %>%
+ collect(),
+ df
+ )
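+ # (base R treats that sep value as just another string to concatenate, so
+ # each result gains a trailing "-"; the Arrow binding matches this.)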
+
+ # known differences
+
+ # arrow allows the separator to be an array
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ transmute(result = paste(x, y, sep = w)) %>%
+ collect(),
+ df %>%
+ transmute(result = paste(x, w, y, sep = ""))
+ )
+
+ # expected errors
+
+ # collapse argument not supported
+ expect_error(
+ nse_funcs$paste(x, y, collapse = ""),
+ "collapse"
+ )
+ expect_error(
+ nse_funcs$paste0(x, y, collapse = ""),
+ "collapse"
+ )
+ expect_error(
+ nse_funcs$str_c(x, y, collapse = ""),
+ "collapse"
+ )
+
+ # literal vectors of length != 1 not supported
+ expect_error(
+ nse_funcs$paste(x, character(0), y),
+ "Literal vectors of length != 1 not supported in string concatenation"
+ )
+ expect_error(
+ nse_funcs$paste(x, c(",", ";"), y),
+ "Literal vectors of length != 1 not supported in string concatenation"
+ )
+})
+
+test_that("grepl with ignore.case = FALSE and fixed = TRUE", {
+ df <- tibble(x = c("Foo", "bar"))
+ compare_dplyr_binding(
+ .input %>%
+ filter(grepl("o", x, fixed = TRUE)) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("sub and gsub with ignore.case = FALSE and fixed = TRUE", {
+ df <- tibble(x = c("Foo", "bar"))
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = sub("Foo", "baz", x, fixed = TRUE)) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = gsub("o", "u", x, fixed = TRUE)) %>%
+ collect(),
+ df
+ )
+})
+
+# many of the remaining tests require RE2
+skip_if_not_available("re2")
+
+test_that("grepl", {
+ df <- tibble(x = c("Foo", "bar"))
+
+ for (fixed in c(TRUE, FALSE)) {
+ compare_dplyr_binding(
+ .input %>%
+ filter(grepl("Foo", x, fixed = fixed)) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = grepl("^B.+", x, ignore.case = FALSE, fixed = fixed)) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(grepl("Foo", x, ignore.case = FALSE, fixed = fixed)) %>%
+ collect(),
+ df
+ )
+ }
+})
+
+test_that("grepl with ignore.case = TRUE and fixed = TRUE", {
+ df <- tibble(x = c("Foo", "bar"))
+
+ # base::grepl() ignores ignore.case = TRUE with a warning when fixed = TRUE,
+ # so we can't use compare_dplyr_binding() for these tests
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ filter(grepl("O", x, ignore.case = TRUE, fixed = TRUE)) %>%
+ collect(),
+ tibble(x = "Foo")
+ )
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ filter(grepl("^B.+", x, ignore.case = TRUE, fixed = TRUE)) %>%
+ collect(),
+ tibble(x = character(0))
+ )
+})
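+
+# For reference, the base R behavior that rules out compare_dplyr_binding()
+# here (a sketch of a plain R session, not run as part of the suite):
+#   grepl("O", "Foo", ignore.case = TRUE, fixed = TRUE)
+#   #> Warning: argument 'ignore.case = TRUE' will be ignored
+#   #> [1] FALSE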
+
+test_that("str_detect", {
+ df <- tibble(x = c("Foo", "bar"))
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_detect(x, regex("^F"))) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_detect(x, regex("^f[A-Z]{2}", ignore_case = TRUE))) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_detect(x, regex("^f[A-Z]{2}", ignore_case = TRUE), negate = TRUE)) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_detect(x, fixed("o"))) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_detect(x, fixed("O"))) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_detect(x, fixed("O", ignore_case = TRUE))) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_detect(x, fixed("O", ignore_case = TRUE), negate = TRUE)) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("sub and gsub", {
+ df <- tibble(x = c("Foo", "bar"))
+
+ for (fixed in c(TRUE, FALSE)) {
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = sub("Foo", "baz", x, fixed = fixed)) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = sub("^B.+", "baz", x, ignore.case = FALSE, fixed = fixed)) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = sub("Foo", "baz", x, ignore.case = FALSE, fixed = fixed)) %>%
+ collect(),
+ df
+ )
+ }
+})
+
+test_that("sub and gsub with ignore.case = TRUE and fixed = TRUE", {
+ df <- tibble(x = c("Foo", "bar"))
+
+ # base::sub() and base::gsub() ignore ignore.case = TRUE with a warning when
+ # fixed = TRUE, so we can't use compare_dplyr_binding() for these tests
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ transmute(x = sub("O", "u", x, ignore.case = TRUE, fixed = TRUE)) %>%
+ collect(),
+ tibble(x = c("Fuo", "bar"))
+ )
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ transmute(x = gsub("o", "u", x, ignore.case = TRUE, fixed = TRUE)) %>%
+ collect(),
+ tibble(x = c("Fuu", "bar"))
+ )
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ transmute(x = sub("^B.+", "baz", x, ignore.case = TRUE, fixed = TRUE)) %>%
+ collect(),
+ df # unchanged
+ )
+})
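+
+# Likewise in base R (sketch): sub("O", "u", "Foo", ignore.case = TRUE,
+# fixed = TRUE) warns that ignore.case will be ignored and returns "Foo"
+# unchanged, whereas the Arrow binding honors ignore.case and yields "Fuo".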
+
+test_that("str_replace and str_replace_all", {
+ df <- tibble(x = c("Foo", "bar"))
+
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_replace_all(x, "^F", "baz")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_replace_all(x, regex("^F"), "baz")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_replace(x, "^F[a-z]{2}", "baz")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_replace(x, regex("^f[A-Z]{2}", ignore_case = TRUE), "baz")) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_replace_all(x, fixed("o"), "u")) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_replace(x, fixed("O"), "u")) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_replace(x, fixed("O", ignore_case = TRUE), "u")) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("strsplit and str_split", {
+ df <- tibble(x = c("Foo and bar", "baz and qux and quux"))
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = strsplit(x, "and")) %>%
+ collect(),
+ df,
+ # `ignore_attr = TRUE` because the vctr coming back from arrow (ListArray)
+ # has type information in it, but it's just a bare list from R/dplyr.
+ ignore_attr = TRUE
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>%
+ collect(),
+ df,
+ ignore_attr = TRUE
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = strsplit(x, " +and +")) %>%
+ collect(),
+ df,
+ ignore_attr = TRUE
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_split(x, "and")) %>%
+ collect(),
+ df,
+ ignore_attr = TRUE
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_split(x, "and", n = 2)) %>%
+ collect(),
+ df,
+ ignore_attr = TRUE
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_split(x, fixed("and"), n = 2)) %>%
+ collect(),
+ df,
+ ignore_attr = TRUE
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_split(x, regex("and"), n = 2)) %>%
+ collect(),
+ df,
+ ignore_attr = TRUE
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_split(x, "Foo|bar", n = 2)) %>%
+ collect(),
+ df,
+ ignore_attr = TRUE
+ )
+})
+
+test_that("str_to_lower, str_to_upper, and str_to_title", {
+ df <- tibble(x = c("foo1", " \tB a R\n", "!apACHe aRroW!"))
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ x_lower = str_to_lower(x),
+ x_upper = str_to_upper(x),
+ x_title = str_to_title(x)
+ ) %>%
+ collect(),
+ df
+ )
+
+ # Error checking a single function because they all use the same code path.
+ expect_error(
+ nse_funcs$str_to_lower("Apache Arrow", locale = "sp"),
+ "Providing a value for 'locale' other than the default ('en') is not supported by Arrow",
+ fixed = TRUE
+ )
+})
+
+test_that("arrow_*_split_whitespace functions", {
+ # use only ASCII whitespace characters
+ df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux"))
+
+ # use only non-ASCII whitespace characters
+ df_utf8 <- tibble(x = c("Foo\u00A0and\u2000bar", "baz\u2006and\u1680qux\u3000and\u2008quux"))
+
+ df_split <- tibble(x = list(c("Foo", "and", "bar"), c("baz", "and", "qux", "and", "quux")))
+
+ # use default option values
+ expect_equal(
+ df_ascii %>%
+ Table$create() %>%
+ mutate(x = arrow_ascii_split_whitespace(x)) %>%
+ collect(),
+ df_split,
+ ignore_attr = TRUE
+ )
+ expect_equal(
+ df_utf8 %>%
+ Table$create() %>%
+ mutate(x = arrow_utf8_split_whitespace(x)) %>%
+ collect(),
+ df_split,
+ ignore_attr = TRUE
+ )
+
+ # specify non-default option values
+ expect_equal(
+ df_ascii %>%
+ Table$create() %>%
+ mutate(
+ x = arrow_ascii_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE))
+ ) %>%
+ collect(),
+ tibble(x = list(c("Foo\nand", "bar"), c("baz\tand qux and", "quux"))),
+ ignore_attr = TRUE
+ )
+ expect_equal(
+ df_utf8 %>%
+ Table$create() %>%
+ mutate(
+ x = arrow_utf8_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE))
+ ) %>%
+ collect(),
+ tibble(x = list(c("Foo\u00A0and", "bar"), c("baz\u2006and\u1680qux\u3000and", "quux"))),
+ ignore_attr = TRUE
+ )
+})
+
+test_that("errors and warnings in string splitting", {
+ # These conditions generate an error, but abandon_ship() catches the error,
+ # issues a warning, and pulls the data into R (if computing on InMemoryDataset)
+ # Elsewhere we test that abandon_ship() works,
+ # so here we can just call the functions directly
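+ # (In practice the same calls inside mutate()/filter() on a Table therefore
+ # surface as a warning plus a fallback to R evaluation, not as errors.)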
+
+ x <- Expression$field_ref("x")
+ expect_error(
+ nse_funcs$str_split(x, fixed("and", ignore_case = TRUE)),
+ "Case-insensitive string splitting not supported by Arrow"
+ )
+ expect_error(
+ nse_funcs$str_split(x, coll("and.?")),
+ "Pattern modifier `coll()` not supported by Arrow",
+ fixed = TRUE
+ )
+ expect_error(
+ nse_funcs$str_split(x, boundary(type = "word")),
+ "Pattern modifier `boundary()` not supported by Arrow",
+ fixed = TRUE
+ )
+ expect_error(
+ nse_funcs$str_split(x, "and", n = 0),
+ "Splitting strings into zero parts not supported by Arrow"
+ )
+
+ # This condition generates a warning
+ expect_warning(
+ nse_funcs$str_split(x, fixed("and"), simplify = TRUE),
+ "Argument 'simplify = TRUE' will be ignored"
+ )
+})
+
+test_that("errors and warnings in string detection and replacement", {
+ x <- Expression$field_ref("x")
+
+ expect_error(
+ nse_funcs$str_detect(x, boundary(type = "character")),
+ "Pattern modifier `boundary()` not supported by Arrow",
+ fixed = TRUE
+ )
+ expect_error(
+ nse_funcs$str_replace_all(x, coll("o", locale = "en"), "ó"),
+ "Pattern modifier `coll()` not supported by Arrow",
+ fixed = TRUE
+ )
+
+ # This condition generates a warning
+ expect_warning(
+ nse_funcs$str_replace_all(x, regex("o", multiline = TRUE), "u"),
+ "Ignoring pattern modifier argument not supported in Arrow: \"multiline\""
+ )
+})
+
+test_that("backreferences in pattern in string detection", {
+ skip("RE2 does not support backreferences in pattern (https://github.com/google/re2/issues/101)")
+ df <- tibble(x = c("Foo", "bar"))
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_detect(x, regex("F([aeiou])\\1"))) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("backreferences (substitutions) in string replacement", {
+ df <- tibble(x = c("Foo", "bar"))
+
+ compare_dplyr_binding(
+ .input %>%
+ transmute(desc = sub(
+ "(?:https?|ftp)://([^/\r\n]+)(/[^\r\n]*)?",
+ "path `\\2` on server `\\1`",
+ url
+ )) %>%
+ collect(),
+ tibble(url = "https://arrow.apache.org/docs/r/")
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_replace(x, "^(\\w)o(.*)", "\\1\\2p")) %>%
+ collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = str_replace(x, regex("^(\\w)o(.*)", ignore_case = TRUE), "\\1\\2p")) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("edge cases in string detection and replacement", {
+ # in case-insensitive fixed match/replace, test that "\\E" in the search
+ # string and backslashes in the replacement string are interpreted literally.
+ # this test does not use compare_dplyr_binding() because base::sub() and
+ # base::grepl() do not support ignore.case = TRUE when fixed = TRUE.
+ expect_equal(
+ tibble(x = c("\\Q\\e\\D")) %>%
+ Table$create() %>%
+ filter(grepl("\\E", x, ignore.case = TRUE, fixed = TRUE)) %>%
+ collect(),
+ tibble(x = c("\\Q\\e\\D"))
+ )
+ expect_equal(
+ tibble(x = c("\\Q\\e\\D")) %>%
+ Table$create() %>%
+ transmute(x = sub("\\E", "\\L", x, ignore.case = TRUE, fixed = TRUE)) %>%
+ collect(),
+ tibble(x = c("\\Q\\L\\D"))
+ )
+
+ # test that a user's "(?i)" prefix does not break the "(?i)" prefix that's
+ # added in case-insensitive regex match/replace
+ compare_dplyr_binding(
+ .input %>%
+ filter(grepl("(?i)^[abc]{3}$", x, ignore.case = TRUE, fixed = FALSE)) %>%
+ collect(),
+ tibble(x = c("ABC"))
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = sub("(?i)^[abc]{3}$", "123", x, ignore.case = TRUE, fixed = FALSE)) %>%
+ collect(),
+ tibble(x = c("ABC"))
+ )
+})
+
+test_that("strptime", {
+ # base::strptime() defaults to local timezone
+ # but arrow's strptime defaults to UTC.
+ # So that tests are consistent, set the local timezone to UTC
+ # TODO: consider reevaluating this workaround after ARROW-12980
+ withr::local_timezone("UTC")
+
+ t_string <- tibble(x = c("2018-10-07 19:04:05", NA))
+ t_stamp <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA))
+
+ expect_equal(
+ t_string %>%
+ Table$create() %>%
+ mutate(
+ x = strptime(x)
+ ) %>%
+ collect(),
+ t_stamp,
+ ignore_attr = "tzone"
+ )
+
+ expect_equal(
+ t_string %>%
+ Table$create() %>%
+ mutate(
+ x = strptime(x, format = "%Y-%m-%d %H:%M:%S")
+ ) %>%
+ collect(),
+ t_stamp,
+ ignore_attr = "tzone"
+ )
+
+ expect_equal(
+ t_string %>%
+ Table$create() %>%
+ mutate(
+ x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "ns")
+ ) %>%
+ collect(),
+ t_stamp,
+ ignore_attr = "tzone"
+ )
+
+ expect_equal(
+ t_string %>%
+ Table$create() %>%
+ mutate(
+ x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "s")
+ ) %>%
+ collect(),
+ t_stamp,
+ ignore_attr = "tzone"
+ )
+
+ tstring <- tibble(x = c("08-05-2008", NA))
+ tstamp <- strptime(c("08-05-2008", NA), format = "%m-%d-%Y")
+
+ expect_equal(
+ tstring %>%
+ Table$create() %>%
+ mutate(
+ x = strptime(x, format = "%m-%d-%Y")
+ ) %>%
+ pull(),
+ # R's strptime returns POSIXlt (list type)
+ as.POSIXct(tstamp),
+ ignore_attr = "tzone"
+ )
+})
+
+test_that("errors in strptime", {
+ # Error when tz is passed
+ x <- Expression$field_ref("x")
+ expect_error(
+ nse_funcs$strptime(x, tz = "PDT"),
+ "Time zone argument not supported by Arrow"
+ )
+})
+
+test_that("strftime", {
+ skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+ times <- tibble(
+ datetime = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA),
+ date = c(as.Date("2021-01-01"), NA)
+ )
+ formats <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %z %Z %j %U %W %x %X %% %G %V %u"
+ formats_date <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %j %U %W %x %X %% %G %V %u"
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = strftime(datetime, format = formats)) %>%
+ collect(),
+ times
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = strftime(date, format = formats_date)) %>%
+ collect(),
+ times
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = strftime(datetime, format = formats, tz = "Pacific/Marquesas")) %>%
+ collect(),
+ times
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = strftime(datetime, format = formats, tz = "EST", usetz = TRUE)) %>%
+ collect(),
+ times
+ )
+
+ withr::with_timezone(
+ "Pacific/Marquesas",
+ {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ x = strftime(datetime, format = formats, tz = "EST"),
+ x_date = strftime(date, format = formats_date, tz = "EST")
+ ) %>%
+ collect(),
+ times
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ x = strftime(datetime, format = formats),
+ x_date = strftime(date, format = formats_date)
+ ) %>%
+ collect(),
+ times
+ )
+ }
+ )
+
+ # This check is due to differences in the way %c currently works in Arrow and R's strftime.
+ # We can revisit after https://github.com/HowardHinnant/date/issues/704 is resolved.
+ expect_error(
+ times %>%
+ Table$create() %>%
+ mutate(x = strftime(datetime, format = "%c")) %>%
+ collect(),
+ "%c flag is not supported in non-C locales."
+ )
+
+ # Output precision of %S depends on the input timestamp precision.
+ # Timestamps with second precision are represented as integers while
+ # milliseconds, microseconds and nanoseconds are represented as fixed floating
+ # point numbers with 3, 6 and 9 decimal places respectively.
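+ # (For example, %S renders a whole second as "05" for second-precision
+ # input but as "05.000000000" for nanosecond-precision input.)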
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = strftime(datetime, format = "%S")) %>%
+ transmute(as.double(substr(x, 1, 2))) %>%
+ collect(),
+ times,
+ tolerance = 1e-6
+ )
+})
+
+test_that("format_ISO8601", {
+ skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+ times <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA))
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = format_ISO8601(x, precision = "ymd", usetz = FALSE)) %>%
+ collect(),
+ times
+ )
+
+ if (getRversion() < "3.5") {
+ # before 3.5, times$x will have no timezone attribute, so Arrow faithfully
+ # errors that there is no timezone to format:
+ expect_error(
+ times %>%
+ Table$create() %>%
+ mutate(x = format_ISO8601(x, precision = "ymd", usetz = TRUE)) %>%
+ collect(),
+ "Timezone not present, cannot convert to string with timezone: %Y-%m-%d%z"
+ )
+
+ # See comment regarding %S flag in strftime tests
+ expect_error(
+ times %>%
+ Table$create() %>%
+ mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = TRUE)) %>%
+ mutate(x = gsub("\\.0*", "", x)) %>%
+ collect(),
+ "Timezone not present, cannot convert to string with timezone: %Y-%m-%dT%H:%M:%S%z"
+ )
+ } else {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = format_ISO8601(x, precision = "ymd", usetz = TRUE)) %>%
+ collect(),
+ times
+ )
+
+ # See comment regarding %S flag in strftime tests
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = TRUE)) %>%
+ mutate(x = gsub("\\.0*", "", x)) %>%
+ collect(),
+ times
+ )
+ }
+
+
+ # See comment regarding %S flag in strftime tests
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = FALSE)) %>%
+ mutate(x = gsub("\\.0*", "", x)) %>%
+ collect(),
+ times
+ )
+})
+
+test_that("arrow_find_substring and arrow_find_substring_regex", {
+ df <- tibble(x = c("Foo and Bar", "baz and qux and quux"))
+
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ mutate(x = arrow_find_substring(x, options = list(pattern = "b"))) %>%
+ collect(),
+ tibble(x = c(-1, 0))
+ )
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ mutate(x = arrow_find_substring(
+ x,
+ options = list(pattern = "b", ignore_case = TRUE)
+ )) %>%
+ collect(),
+ tibble(x = c(8, 0))
+ )
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ mutate(x = arrow_find_substring_regex(
+ x,
+ options = list(pattern = "^[fb]")
+ )) %>%
+ collect(),
+ tibble(x = c(-1, 0))
+ )
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ mutate(x = arrow_find_substring_regex(
+ x,
+ options = list(pattern = "[AEIOU]", ignore_case = TRUE)
+ )) %>%
+ collect(),
+ tibble(x = c(1, 1))
+ )
+})
+
+test_that("stri_reverse and arrow_ascii_reverse functions", {
+ df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux"))
+
+ df_utf8 <- tibble(x = c("Foo\u00A0\u0061nd\u00A0bar", "\u0062az\u00A0and\u00A0qux\u3000and\u00A0quux"))
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = stri_reverse(x)) %>%
+ collect(),
+ df_utf8
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = stri_reverse(x)) %>%
+ collect(),
+ df_ascii
+ )
+
+ expect_equal(
+ df_ascii %>%
+ Table$create() %>%
+ mutate(x = arrow_ascii_reverse(x)) %>%
+ collect(),
+ tibble(x = c("rab dna\nooF", "xuuq dna xuq dna\tzab"))
+ )
+
+ expect_error(
+ df_utf8 %>%
+ Table$create() %>%
+ mutate(x = arrow_ascii_reverse(x)) %>%
+ collect(),
+ "Invalid: Non-ASCII sequence in input"
+ )
+})
+
+test_that("str_like", {
+ df <- tibble(x = c("Foo and bar", "baz and qux and quux"))
+
+ # TODO: After a new version of stringr with str_like has been released, update
+ # all these tests to use compare_dplyr_binding()
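+ # str_like() follows SQL LIKE semantics: the pattern must match the whole
+ # string, "%" matches any run of characters (including none), and "_"
+ # matches exactly one character, as the cases below illustrate.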
+
+ # No match - entire string
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ mutate(x = str_like(x, "baz")) %>%
+ collect(),
+ tibble(x = c(FALSE, FALSE))
+ )
+
+ # Match - entire string
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ mutate(x = str_like(x, "Foo and bar")) %>%
+ collect(),
+ tibble(x = c(TRUE, FALSE))
+ )
+
+ # Wildcard
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ mutate(x = str_like(x, "f%", ignore_case = TRUE)) %>%
+ collect(),
+ tibble(x = c(TRUE, FALSE))
+ )
+
+ # Case-sensitive match (ignore_case = FALSE)
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ mutate(x = str_like(x, "f%", ignore_case = FALSE)) %>%
+ collect(),
+ tibble(x = c(FALSE, FALSE))
+ )
+
+ # Single character
+ expect_equal(
+ df %>%
+ Table$create() %>%
+ mutate(x = str_like(x, "_a%")) %>%
+ collect(),
+ tibble(x = c(FALSE, TRUE))
+ )
+
+ # This will give an error until a new version of stringr with str_like has been released
+ skip_if_not(packageVersion("stringr") > "1.4.0")
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_like(x, "%baz%")) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("str_pad", {
+ df <- tibble(x = c("Foo and bar", "baz and qux and quux"))
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_pad(x, width = 31)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_pad(x, width = 30, side = "right")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_pad(x, width = 31, side = "left", pad = "+")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_pad(x, width = 10, side = "left", pad = "+")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(x = str_pad(x, width = 31, side = "both")) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("substr", {
+ df <- tibble(x = "Apache Arrow")
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = substr(x, 1, 6)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = substr(x, 0, 6)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = substr(x, -1, 6)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = substr(x, 6, 1)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = substr(x, -1, -2)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = substr(x, 9, 6)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = substr(x, 1, 6)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = substr(x, 8, 12)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = substr(x, -5, -1)) %>%
+ collect(),
+ df
+ )
+
+ expect_error(
+ nse_funcs$substr("Apache Arrow", c(1, 2), 3),
+ "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ expect_error(
+ nse_funcs$substr("Apache Arrow", 1, c(2, 3)),
+ "`stop` must be length 1 - other lengths are not supported in Arrow"
+ )
+})
+
+test_that("substring", {
+ # nse_funcs$substring just calls nse_funcs$substr, tested extensively above
+ df <- tibble(x = "Apache Arrow")
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = substring(x, 1, 6)) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("str_sub", {
+ df <- tibble(x = "Apache Arrow")
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = str_sub(x, 1, 6)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = str_sub(x, 0, 6)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = str_sub(x, -1, 6)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = str_sub(x, 6, 1)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = str_sub(x, -1, -2)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = str_sub(x, -1, 3)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = str_sub(x, 9, 6)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = str_sub(x, 1, 6)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = str_sub(x, 8, 12)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(y = str_sub(x, -5, -1)) %>%
+ collect(),
+ df
+ )
+
+ expect_error(
+ nse_funcs$str_sub("Apache Arrow", c(1, 2), 3),
+ "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ expect_error(
+ nse_funcs$str_sub("Apache Arrow", 1, c(2, 3)),
+ "`end` must be length 1 - other lengths are not supported in Arrow"
+ )
+})
+
+test_that("str_starts, str_ends, startsWith, endsWith", {
+ df <- tibble(x = c("Foo", "bar", "baz", "qux"))
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_starts(x, "b.*")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_starts(x, "b.*", negate = TRUE)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_starts(x, fixed("b.*"))) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_starts(x, fixed("b"))) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_ends(x, "r")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_ends(x, "r", negate = TRUE)) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_ends(x, fixed("r$"))) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(str_ends(x, fixed("r"))) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(startsWith(x, "b")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(endsWith(x, "r")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(startsWith(x, "b.*")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(endsWith(x, "r$")) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("str_count", {
+ df <- tibble(
+ cities = c("Kolkata", "Dar es Salaam", "Tel Aviv", "San Antonio", "Cluj Napoca", "Bern", "Bogota"),
+ dots = c("a.", "...", ".a.a", "a..a.", "ab...", "dse....", ".f..d..")
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(a_count = str_count(cities, pattern = "a")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(p_count = str_count(cities, pattern = "d")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(p_count = str_count(cities,
+ pattern = regex("d", ignore_case = TRUE)
+ )) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(e_count = str_count(cities, pattern = "u")) %>%
+ collect(),
+ df
+ )
+
+ # nse_funcs$str_count() is not vectorised over pattern
+ compare_dplyr_binding(
+ .input %>%
+ mutate(let_count = str_count(cities, pattern = c("a", "b", "e", "g", "p", "n", "s"))) %>%
+ collect(),
+ df,
+ warning = TRUE
+ )
+
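+ # In a regex "." matches any character, so str_count(dots, ".") counts
+ # characters; fixed(".") counts only literal dots.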
+ compare_dplyr_binding(
+ .input %>%
+ mutate(dots_count = str_count(dots, ".")) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(dots_count = str_count(dots, fixed("."))) %>%
+ collect(),
+ df
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-funcs-type.R b/src/arrow/r/tests/testthat/test-dplyr-funcs-type.R
new file mode 100644
index 000000000..859dc14b9
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-funcs-type.R
@@ -0,0 +1,627 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+suppressPackageStartupMessages(library(bit64))
+
+
+tbl <- example_data
+
+test_that("explicit type conversions with cast()", {
+ num_int32 <- 12L
+ num_int64 <- bit64::as.integer64(10)
+
+ int_types <- c(int8(), int16(), int32(), int64())
+ uint_types <- c(uint8(), uint16(), uint32(), uint64())
+ float_types <- c(float32(), float64())
+
+ types <- c(
+ int_types,
+ uint_types,
+ float_types,
+ double(), # not actually an Arrow type (it's a base R function), but it should alias float64
+ string()
+ )
+
+ for (type in types) {
+ expect_type_equal(
+ object = {
+ t1 <- Table$create(x = num_int32) %>%
+ transmute(x = cast(x, type)) %>%
+ compute()
+ t1$schema[[1]]$type
+ },
+ as_type(type)
+ )
+ expect_type_equal(
+ object = {
+ t1 <- Table$create(x = num_int64) %>%
+ transmute(x = cast(x, type)) %>%
+ compute()
+ t1$schema[[1]]$type
+ },
+ as_type(type)
+ )
+ }
+
+ # Arrow errors when truncating floats...
+ expect_error(
+ expect_type_equal(
+ object = {
+ t1 <- Table$create(pi = pi) %>%
+ transmute(three = cast(pi, int32())) %>%
+ compute()
+ t1$schema[[1]]$type
+ },
+ int32()
+ ),
+ "truncated"
+ )
+
+ # ... unless safe = FALSE (or allow_float_truncate = TRUE)
+ expect_type_equal(
+ object = {
+ t1 <- Table$create(pi = pi) %>%
+ transmute(three = cast(pi, int32(), safe = FALSE)) %>%
+ compute()
+ t1$schema[[1]]$type
+ },
+ int32()
+ )
+})
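+
+# As the comment in the test above notes, the same truncation can be allowed
+# via the cast option directly (a sketch of the equivalent call; assumes
+# allow_float_truncate is passed through to CastOptions):
+# Table$create(pi = pi) %>%
+#   transmute(three = cast(pi, int32(), allow_float_truncate = TRUE)) %>%
+#   compute()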
+
+test_that("explicit type conversions with as.*()", {
+ library(bit64)
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ int2chr = as.character(int),
+ int2dbl = as.double(int),
+ int2int = as.integer(int),
+ int2num = as.numeric(int),
+ dbl2chr = as.character(dbl),
+ dbl2dbl = as.double(dbl),
+ dbl2int = as.integer(dbl),
+ dbl2num = as.numeric(dbl),
+ ) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ chr2chr = as.character(chr),
+ chr2dbl = as.double(chr),
+ chr2int = as.integer(chr),
+ chr2num = as.numeric(chr)
+ ) %>%
+ collect(),
+ tibble(chr = c("1", "2", "3"))
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ chr2i64 = as.integer64(chr),
+ dbl2i64 = as.integer64(dbl),
+ i642i64 = as.integer64(i64),
+ ) %>%
+ collect(),
+ tibble(chr = "10000000000", dbl = 10000000000, i64 = as.integer64(1e10))
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ chr2lgl = as.logical(chr),
+ dbl2lgl = as.logical(dbl),
+ int2lgl = as.logical(int)
+ ) %>%
+ collect(),
+ tibble(
+ chr = c("TRUE", "FALSE", "true", "false"),
+ dbl = c(1, 0, -99, 0),
+ int = c(1L, 0L, -99L, 0L)
+ )
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ dbl2chr = as.character(dbl),
+ dbl2dbl = as.double(dbl),
+ dbl2int = as.integer(dbl),
+ dbl2lgl = as.logical(dbl),
+ int2chr = as.character(int),
+ int2dbl = as.double(int),
+ int2int = as.integer(int),
+ int2lgl = as.logical(int),
+ lgl2chr = as.character(lgl), # Arrow returns "true", "false" here ...
+ lgl2dbl = as.double(lgl),
+ lgl2int = as.integer(lgl),
+ lgl2lgl = as.logical(lgl)
+ ) %>%
+ collect() %>%
+ # need to use toupper() *after* collect(); doing it in Arrow would require
+ # utf8proc and force skipping this test when utf8proc is unavailable
+ mutate(lgl2chr = toupper(lgl2chr)), # ... but we need "TRUE", "FALSE"
+ tibble(
+ dbl = c(1, 0, NA_real_),
+ int = c(1L, 0L, NA_integer_),
+ lgl = c(TRUE, FALSE, NA)
+ )
+ )
+})
+
+test_that("is.finite(), is.infinite(), is.nan()", {
+ df <- tibble(x = c(
+ -4.94065645841246544e-324, 1.79769313486231570e+308, 0,
+ NA_real_, NaN, Inf, -Inf
+ ))
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ is_fin = is.finite(x),
+ is_inf = is.infinite(x)
+ ) %>%
+ collect(),
+ df
+ )
+ # is.nan() evaluates to FALSE on NA_real_ (ARROW-12850)
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ is_nan = is.nan(x)
+ ) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("is.na() evaluates to TRUE on NaN (ARROW-12055)", {
+ df <- tibble(x = c(1.1, 2.2, NA_real_, 4.4, NaN, 6.6, 7.7))
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ is_na = is.na(x)
+ ) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("type checks with is() giving Arrow types", {
+ # with class2=DataType
+ expect_equal(
+ Table$create(
+ i32 = Array$create(1, int32()),
+ dec = Array$create(pi)$cast(decimal(3, 2)),
+ f64 = Array$create(1.1, float64()),
+ str = Array$create("a", arrow::string())
+ ) %>% transmute(
+ i32_is_i32 = is(i32, int32()),
+ i32_is_dec = is(i32, decimal(3, 2)),
+ i32_is_f64 = is(i32, float64()),
+ i32_is_str = is(i32, arrow::string()),
+ dec_is_i32 = is(dec, int32()),
+ dec_is_dec = is(dec, decimal(3, 2)),
+ dec_is_f64 = is(dec, float64()),
+ dec_is_str = is(dec, arrow::string()),
+ f64_is_i32 = is(f64, int32()),
+ f64_is_dec = is(f64, decimal(3, 2)),
+ f64_is_f64 = is(f64, float64()),
+ f64_is_str = is(f64, arrow::string()),
+ str_is_i32 = is(str, int32()),
+ str_is_dec = is(str, decimal(3, 2)),
+ str_is_f64 = is(str, float64()),
+ str_is_str = is(str, arrow::string())
+ ) %>%
+ collect() %>%
+ t() %>%
+ as.vector(),
+ c(
+ TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE,
+ FALSE, FALSE, FALSE, FALSE, TRUE
+ )
+ )
+ # with class2=string
+ expect_equal(
+ Table$create(
+ i32 = Array$create(1, int32()),
+ f64 = Array$create(1.1, float64()),
+ str = Array$create("a", arrow::string())
+ ) %>% transmute(
+ i32_is_i32 = is(i32, "int32"),
+ i32_is_i64 = is(i32, "double"),
+ i32_is_str = is(i32, "string"),
+ f64_is_i32 = is(f64, "int32"),
+ f64_is_i64 = is(f64, "double"),
+ f64_is_str = is(f64, "string"),
+ str_is_i32 = is(str, "int32"),
+ str_is_i64 = is(str, "double"),
+ str_is_str = is(str, "string")
+ ) %>%
+ collect() %>%
+ t() %>%
+ as.vector(),
+ c(TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE)
+ )
+ # with class2=string alias
+ expect_equal(
+ Table$create(
+ f16 = Array$create(NA_real_, halffloat()),
+ f32 = Array$create(1.1, float()),
+ f64 = Array$create(2.2, float64()),
+ lgl = Array$create(TRUE, bool()),
+ str = Array$create("a", arrow::string())
+ ) %>% transmute(
+ f16_is_f16 = is(f16, "float16"),
+ f16_is_f32 = is(f16, "float32"),
+ f16_is_f64 = is(f16, "float64"),
+ f16_is_lgl = is(f16, "boolean"),
+ f16_is_str = is(f16, "utf8"),
+ f32_is_f16 = is(f32, "float16"),
+ f32_is_f32 = is(f32, "float32"),
+ f32_is_f64 = is(f32, "float64"),
+ f32_is_lgl = is(f32, "boolean"),
+ f32_is_str = is(f32, "utf8"),
+ f64_is_f16 = is(f64, "float16"),
+ f64_is_f32 = is(f64, "float32"),
+ f64_is_f64 = is(f64, "float64"),
+ f64_is_lgl = is(f64, "boolean"),
+ f64_is_str = is(f64, "utf8"),
+ lgl_is_f16 = is(lgl, "float16"),
+ lgl_is_f32 = is(lgl, "float32"),
+ lgl_is_f64 = is(lgl, "float64"),
+ lgl_is_lgl = is(lgl, "boolean"),
+ lgl_is_str = is(lgl, "utf8"),
+ str_is_f16 = is(str, "float16"),
+ str_is_f32 = is(str, "float32"),
+ str_is_f64 = is(str, "float64"),
+ str_is_lgl = is(str, "boolean"),
+ str_is_str = is(str, "utf8")
+ ) %>%
+ collect() %>%
+ t() %>%
+ as.vector(),
+ c(
+ TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE,
+ FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,
+ FALSE, FALSE, TRUE
+ )
+ )
+})
+
+test_that("type checks with is() giving R types", {
+ library(bit64)
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ chr_is_chr = is(chr, "character"),
+ chr_is_fct = is(chr, "factor"),
+ chr_is_int = is(chr, "integer"),
+ chr_is_i64 = is(chr, "integer64"),
+ chr_is_lst = is(chr, "list"),
+ chr_is_lgl = is(chr, "logical"),
+ chr_is_num = is(chr, "numeric"),
+ dbl_is_chr = is(dbl, "character"),
+ dbl_is_fct = is(dbl, "factor"),
+ dbl_is_int = is(dbl, "integer"),
+ dbl_is_i64 = is(dbl, "integer64"),
+ dbl_is_lst = is(dbl, "list"),
+ dbl_is_lgl = is(dbl, "logical"),
+ dbl_is_num = is(dbl, "numeric"),
+ fct_is_chr = is(fct, "character"),
+ fct_is_fct = is(fct, "factor"),
+ fct_is_int = is(fct, "integer"),
+ fct_is_i64 = is(fct, "integer64"),
+ fct_is_lst = is(fct, "list"),
+ fct_is_lgl = is(fct, "logical"),
+ fct_is_num = is(fct, "numeric"),
+ int_is_chr = is(int, "character"),
+ int_is_fct = is(int, "factor"),
+ int_is_int = is(int, "integer"),
+ int_is_i64 = is(int, "integer64"),
+ int_is_lst = is(int, "list"),
+ int_is_lgl = is(int, "logical"),
+ int_is_num = is(int, "numeric"),
+ lgl_is_chr = is(lgl, "character"),
+ lgl_is_fct = is(lgl, "factor"),
+ lgl_is_int = is(lgl, "integer"),
+ lgl_is_i64 = is(lgl, "integer64"),
+ lgl_is_lst = is(lgl, "list"),
+ lgl_is_lgl = is(lgl, "logical"),
+ lgl_is_num = is(lgl, "numeric")
+ ) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ i64_is_chr = is(i64, "character"),
+ i64_is_fct = is(i64, "factor"),
+ # we want Arrow to return TRUE, but bit64 returns FALSE
+ # i64_is_int = is(i64, "integer"),
+ i64_is_i64 = is(i64, "integer64"),
+ i64_is_lst = is(i64, "list"),
+ i64_is_lgl = is(i64, "logical"),
+ # we want Arrow to return TRUE, but bit64 returns FALSE
+ # i64_is_num = is(i64, "numeric"),
+ lst_is_chr = is(lst, "character"),
+ lst_is_fct = is(lst, "factor"),
+ lst_is_int = is(lst, "integer"),
+ lst_is_i64 = is(lst, "integer64"),
+ lst_is_lst = is(lst, "list"),
+ lst_is_lgl = is(lst, "logical"),
+ lst_is_num = is(lst, "numeric")
+ ) %>%
+ collect(),
+ tibble(
+ i64 = as.integer64(1:3),
+ lst = list(c("a", "b"), c("d", "e"), c("f", "g"))
+ )
+ )
+})
+
+test_that("type checks with is.*()", {
+ library(bit64)
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ chr_is_chr = is.character(chr),
+ chr_is_dbl = is.double(chr),
+ chr_is_fct = is.factor(chr),
+ chr_is_int = is.integer(chr),
+ chr_is_i64 = is.integer64(chr),
+ chr_is_lst = is.list(chr),
+ chr_is_lgl = is.logical(chr),
+ chr_is_num = is.numeric(chr),
+ dbl_is_chr = is.character(dbl),
+ dbl_is_dbl = is.double(dbl),
+ dbl_is_fct = is.factor(dbl),
+ dbl_is_int = is.integer(dbl),
+ dbl_is_i64 = is.integer64(dbl),
+ dbl_is_lst = is.list(dbl),
+ dbl_is_lgl = is.logical(dbl),
+ dbl_is_num = is.numeric(dbl),
+ fct_is_chr = is.character(fct),
+ fct_is_dbl = is.double(fct),
+ fct_is_fct = is.factor(fct),
+ fct_is_int = is.integer(fct),
+ fct_is_i64 = is.integer64(fct),
+ fct_is_lst = is.list(fct),
+ fct_is_lgl = is.logical(fct),
+ fct_is_num = is.numeric(fct),
+ int_is_chr = is.character(int),
+ int_is_dbl = is.double(int),
+ int_is_fct = is.factor(int),
+ int_is_int = is.integer(int),
+ int_is_i64 = is.integer64(int),
+ int_is_lst = is.list(int),
+ int_is_lgl = is.logical(int),
+ int_is_num = is.numeric(int),
+ lgl_is_chr = is.character(lgl),
+ lgl_is_dbl = is.double(lgl),
+ lgl_is_fct = is.factor(lgl),
+ lgl_is_int = is.integer(lgl),
+ lgl_is_i64 = is.integer64(lgl),
+ lgl_is_lst = is.list(lgl),
+ lgl_is_lgl = is.logical(lgl),
+ lgl_is_num = is.numeric(lgl)
+ ) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ i64_is_chr = is.character(i64),
+ # TODO: investigate why this is not matching when testthat runs it
+ # i64_is_dbl = is.double(i64),
+ i64_is_fct = is.factor(i64),
+ # we want Arrow to return TRUE, but bit64 returns FALSE
+ # i64_is_int = is.integer(i64),
+ i64_is_i64 = is.integer64(i64),
+ i64_is_lst = is.list(i64),
+ i64_is_lgl = is.logical(i64),
+ i64_is_num = is.numeric(i64),
+ lst_is_chr = is.character(lst),
+ lst_is_dbl = is.double(lst),
+ lst_is_fct = is.factor(lst),
+ lst_is_int = is.integer(lst),
+ lst_is_i64 = is.integer64(lst),
+ lst_is_lst = is.list(lst),
+ lst_is_lgl = is.logical(lst),
+ lst_is_num = is.numeric(lst)
+ ) %>%
+ collect(),
+ tibble(
+ i64 = as.integer64(1:3),
+ lst = list(c("a", "b"), c("d", "e"), c("f", "g"))
+ )
+ )
+})
+
+test_that("type checks with is_*()", {
+ library(rlang, warn.conflicts = FALSE)
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ chr_is_chr = is_character(chr),
+ chr_is_dbl = is_double(chr),
+ chr_is_int = is_integer(chr),
+ chr_is_lst = is_list(chr),
+ chr_is_lgl = is_logical(chr),
+ dbl_is_chr = is_character(dbl),
+ dbl_is_dbl = is_double(dbl),
+ dbl_is_int = is_integer(dbl),
+ dbl_is_lst = is_list(dbl),
+ dbl_is_lgl = is_logical(dbl),
+ int_is_chr = is_character(int),
+ int_is_dbl = is_double(int),
+ int_is_int = is_integer(int),
+ int_is_lst = is_list(int),
+ int_is_lgl = is_logical(int),
+ lgl_is_chr = is_character(lgl),
+ lgl_is_dbl = is_double(lgl),
+ lgl_is_int = is_integer(lgl),
+ lgl_is_lst = is_list(lgl),
+ lgl_is_lgl = is_logical(lgl)
+ ) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("type checks on expressions", {
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ a = is.character(as.character(int)),
+ b = is.integer(as.character(int)),
+ c = is.integer(int + int),
+ d = is.double(int + dbl),
+ e = is.logical(dbl > pi)
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ # the code in the expectation below depends on RE2
+ skip_if_not_available("re2")
+
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ a = is.logical(grepl("[def]", chr))
+ ) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("type checks on R scalar literals", {
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ chr_is_chr = is.character("foo"),
+ int_is_chr = is.character(42L),
+ int_is_int = is.integer(42L),
+ chr_is_int = is.integer("foo"),
+ dbl_is_num = is.numeric(3.14159),
+ int_is_num = is.numeric(42L),
+ chr_is_num = is.numeric("foo"),
+ dbl_is_dbl = is.double(3.14159),
+ chr_is_dbl = is.double("foo"),
+ lgl_is_lgl = is.logical(TRUE),
+ chr_is_lgl = is.logical("foo"),
+ fct_is_fct = is.factor(factor("foo", levels = c("foo", "bar", "baz"))),
+ chr_is_fct = is.factor("foo"),
+ lst_is_lst = is.list(list(c(a = "foo", b = "bar"))),
+ chr_is_lst = is.list("foo")
+ ) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("as.factor()/dictionary_encode()", {
+ skip("ARROW-12632: ExecuteScalarExpression cannot Execute non-scalar expression")
+ df1 <- tibble(x = c("C", "D", "B", NA, "D", "B", "S", "A", "B", "Z", "B"))
+ df2 <- tibble(x = c(5, 5, 5, NA, 2, 3, 6, 8))
+
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = as.factor(x)) %>%
+ collect(),
+ df1
+ )
+
+ expect_warning(
+ compare_dplyr_binding(
+ .input %>%
+ transmute(x = as.factor(x)) %>%
+ collect(),
+ df2
+ ),
+ "Coercing dictionary values to R character factor levels"
+ )
+
+ # dictionary values with default null encoding behavior ("mask") omits
+ # nulls from the dictionary values
+ expect_equal(
+ object = {
+ rb1 <- df1 %>%
+ record_batch() %>%
+ transmute(x = dictionary_encode(x)) %>%
+ compute()
+ dict <- rb1$x$dictionary()
+ as.vector(dict$Take(dict$SortIndices()))
+ },
+ sort(unique(df1$x), na.last = NA)
+ )
+
+ # dictionary values with "encode" null encoding behavior includes nulls in
+ # the dictionary values
+ expect_equal(
+ object = {
+ rb1 <- df1 %>%
+ record_batch() %>%
+ transmute(x = dictionary_encode(x, null_encoding_behavior = "encode")) %>%
+ compute()
+ dict <- rb1$x$dictionary()
+ as.vector(dict$Take(dict$SortIndices()))
+ },
+ sort(unique(df1$x), na.last = TRUE)
+ )
+})
+
+test_that("bad explicit type conversions with as.*()", {
+
+ # Arrow returns lowercase "true", "false" (instead of "TRUE", "FALSE" like R)
+ expect_error(
+ compare_dplyr_binding(
+ .input %>%
+ transmute(lgl2chr = as.character(lgl)) %>%
+ collect(),
+ tibble(lgl = c(TRUE, FALSE, NA))
+ )
+ )
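+
+  # A hedged sketch of that difference, relying only on the claim in the
+  # comment above: the Arrow result compares equal to an explicit lowercase
+  # character vector.
+  expect_equal(
+    record_batch(lgl = c(TRUE, FALSE, NA)) %>%
+      transmute(lgl2chr = as.character(lgl)) %>%
+      collect(),
+    tibble(lgl2chr = c("true", "false", NA))
+  )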
+
+ # Arrow fails to parse these strings as numbers (instead of returning NAs with
+ # a warning like R does)
+ expect_error(
+ expect_warning(
+ compare_dplyr_binding(
+ .input %>%
+ transmute(chr2num = as.numeric(chr)) %>%
+ collect(),
+ tibble(chr = c("l.O", "S.S", ""))
+ )
+ )
+ )
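+
+  # Base-R side of that difference, for reference: unparseable strings become
+  # NA with a coercion warning.
+  expect_warning(
+    expect_identical(
+      as.numeric(c("l.O", "S.S", "")),
+      c(NA_real_, NA_real_, NA_real_)
+    ),
+    "NAs introduced by coercion"
+  )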
+
+ # Arrow fails to parse these strings as Booleans (instead of returning NAs
+ # like R does)
+ expect_error(
+ compare_dplyr_binding(
+ .input %>%
+ transmute(chr2lgl = as.logical(chr)) %>%
+ collect(),
+ tibble(chr = c("TRU", "FAX", ""))
+ )
+ )
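+
+  # Base-R side, for reference: as.logical() quietly maps unrecognized strings
+  # to NA, with no warning raised.
+  expect_identical(as.logical(c("TRU", "FAX", "")), c(NA, NA, NA))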
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-group-by.R b/src/arrow/r/tests/testthat/test-dplyr-group-by.R
new file mode 100644
index 000000000..7cfcfb5c9
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-group-by.R
@@ -0,0 +1,158 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+library(stringr)
+
+tbl <- example_data
+
+test_that("group_by groupings are recorded", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(chr) %>%
+ select(int, chr) %>%
+ filter(int > 5) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("group_by supports creating/renaming", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(chr, numbers = int) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(chr, numbers = int * 4) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(int > 4, lgl, foo = int > 5) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("ungroup", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(chr) %>%
+ select(int, chr) %>%
+ ungroup() %>%
+ filter(int > 5) %>%
+ collect(),
+ tbl
+ )
+
+ # to confirm that the above expectation is actually testing what we think it's
+ # testing, verify that compare_dplyr_binding() distinguishes between grouped and
+ # ungrouped tibbles
+ expect_error(
+ compare_dplyr_binding(
+ .input %>%
+ group_by(chr) %>%
+ select(int, chr) %>%
+ (function(x) if (inherits(x, "tbl_df")) ungroup(x) else x) %>%
+ filter(int > 5) %>%
+ collect(),
+ tbl
+ )
+ )
+})
+
+test_that("group_by then rename", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(chr) %>%
+ select(string = chr, int) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("group_by with .drop", {
+ test_groups <- c("starting_a_fight", "consoling_a_child", "petting_a_dog")
+ compare_dplyr_binding(
+ .input %>%
+ group_by(!!!syms(test_groups), .drop = TRUE) %>%
+ collect(),
+ example_with_logical_factors
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(!!!syms(test_groups), .drop = FALSE) %>%
+ collect(),
+ example_with_logical_factors
+ )
+ expect_equal(
+ example_with_logical_factors %>%
+ group_by(!!!syms(test_groups), .drop = TRUE) %>%
+ collect() %>%
+ n_groups(),
+ 4L
+ )
+ expect_equal(
+ example_with_logical_factors %>%
+ group_by(!!!syms(test_groups), .drop = FALSE) %>%
+ collect() %>%
+ n_groups(),
+ 8L
+ )
+ expect_equal(
+ example_with_logical_factors %>%
+ group_by(!!!syms(test_groups), .drop = FALSE) %>%
+ group_by_drop_default(),
+ FALSE
+ )
+ expect_equal(
+ example_with_logical_factors %>%
+ group_by(!!!syms(test_groups), .drop = TRUE) %>%
+ group_by_drop_default(),
+ TRUE
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(.drop = FALSE) %>% # no group by vars
+ group_by_drop_default(),
+ example_with_logical_factors
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by_drop_default(),
+ example_with_logical_factors
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(!!!syms(test_groups)) %>%
+ group_by_drop_default(),
+ example_with_logical_factors
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(!!!syms(test_groups), .drop = FALSE) %>%
+ ungroup() %>%
+ group_by_drop_default(),
+ example_with_logical_factors
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-join.R b/src/arrow/r/tests/testthat/test-dplyr-join.R
new file mode 100644
index 000000000..d8239f810
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-join.R
@@ -0,0 +1,175 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+
+left <- example_data
+left$some_grouping <- rep(c(1, 2), 5)
+
+left_tab <- Table$create(left)
+
+to_join <- tibble::tibble(
+ some_grouping = c(1, 2),
+ capital_letters = c("A", "B"),
+ another_column = TRUE
+)
+to_join_tab <- Table$create(to_join)
+
+
+test_that("left_join", {
+ expect_message(
+ compare_dplyr_binding(
+ .input %>%
+ left_join(to_join) %>%
+ collect(),
+ left
+ ),
+ 'Joining, by = "some_grouping"'
+ )
+})
+
+test_that("left_join `by` args", {
+ compare_dplyr_binding(
+ .input %>%
+ left_join(to_join, by = "some_grouping") %>%
+ collect(),
+ left
+ )
+ compare_dplyr_binding(
+ .input %>%
+ left_join(
+ to_join %>%
+ rename(the_grouping = some_grouping),
+ by = c(some_grouping = "the_grouping")
+ ) %>%
+ collect(),
+ left
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ rename(the_grouping = some_grouping) %>%
+ left_join(
+ to_join,
+ by = c(the_grouping = "some_grouping")
+ ) %>%
+ collect(),
+ left
+ )
+})
+
+test_that("join two tables", {
+ expect_identical(
+ left_tab %>%
+ left_join(to_join_tab, by = "some_grouping") %>%
+ collect(),
+ left %>%
+ left_join(to_join, by = "some_grouping") %>%
+ collect()
+ )
+})
+
+test_that("Error handling", {
+ expect_error(
+ left_tab %>%
+ left_join(to_join, by = "not_a_col") %>%
+ collect(),
+ "all(names(by) %in% names(x)) is not TRUE",
+ fixed = TRUE
+ )
+})
+
+# TODO: test duplicate col names
+# TODO: casting: int and float columns?
+
+test_that("right_join", {
+ compare_dplyr_binding(
+ .input %>%
+ right_join(to_join, by = "some_grouping") %>%
+ collect(),
+ left
+ )
+})
+
+test_that("inner_join", {
+ compare_dplyr_binding(
+ .input %>%
+ inner_join(to_join, by = "some_grouping") %>%
+ collect(),
+ left
+ )
+})
+
+test_that("full_join", {
+ compare_dplyr_binding(
+ .input %>%
+ full_join(to_join, by = "some_grouping") %>%
+ collect(),
+ left
+ )
+})
+
+test_that("semi_join", {
+ compare_dplyr_binding(
+ .input %>%
+ semi_join(to_join, by = "some_grouping") %>%
+ collect(),
+ left
+ )
+})
+
+test_that("anti_join", {
+ compare_dplyr_binding(
+ .input %>%
+      # Factor levels don't match when there are no rows in the data
+ # TODO: use better anti_join test data
+ select(-fct) %>%
+ anti_join(to_join, by = "some_grouping") %>%
+ collect(),
+ left
+ )
+})
+
+test_that("mutate then join", {
+ left <- Table$create(
+ one = c("a", "b"),
+ two = 1:2
+ )
+ right <- Table$create(
+ three = TRUE,
+ dos = 2L
+ )
+
+ expect_equal(
+ left %>%
+ rename(dos = two) %>%
+ mutate(one = toupper(one)) %>%
+ left_join(
+ right %>%
+ mutate(three = !three)
+ ) %>%
+ arrange(dos) %>%
+ collect(),
+ tibble(
+ one = c("A", "B"),
+ dos = 1:2,
+ three = c(NA, FALSE)
+ )
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-mutate.R b/src/arrow/r/tests/testthat/test-dplyr-mutate.R
new file mode 100644
index 000000000..886ec9e42
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-mutate.R
@@ -0,0 +1,522 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+library(stringr)
+
+tbl <- example_data
+# Add some better string data
+tbl$verses <- verses[[1]]
+# c(" a ", " b ", " c ", ...) increasing padding
+# nchar = 3 5 7 9 11 13 15 17 19 21
+tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both")
+
+test_that("mutate() is lazy", {
+ expect_s3_class(
+ tbl %>% record_batch() %>% mutate(int = int + 6L),
+ "arrow_dplyr_query"
+ )
+})
+
+test_that("basic mutate", {
+ compare_dplyr_binding(
+ .input %>%
+ select(int, chr) %>%
+ filter(int > 5) %>%
+ mutate(int = int + 6L) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("mutate() with NULL inputs", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(int = NULL) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("empty mutate()", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate() %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("transmute", {
+ compare_dplyr_binding(
+ .input %>%
+ select(int, chr) %>%
+ filter(int > 5) %>%
+ transmute(int = int + 6L) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("transmute() with NULL inputs", {
+ compare_dplyr_binding(
+ .input %>%
+ transmute(int = NULL) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("empty transmute()", {
+ compare_dplyr_binding(
+ .input %>%
+ transmute() %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("transmute() with unsupported arguments", {
+ expect_error(
+ tbl %>%
+ Table$create() %>%
+ transmute(int = int + 42L, .keep = "all"),
+ "`transmute()` does not support the `.keep` argument",
+ fixed = TRUE
+ )
+ expect_error(
+ tbl %>%
+ Table$create() %>%
+ transmute(int = int + 42L, .before = lgl),
+ "`transmute()` does not support the `.before` argument",
+ fixed = TRUE
+ )
+ expect_error(
+ tbl %>%
+ Table$create() %>%
+ transmute(int = int + 42L, .after = chr),
+ "`transmute()` does not support the `.after` argument",
+ fixed = TRUE
+ )
+})
+
+test_that("transmute() defuses dots arguments (ARROW-13262)", {
+ expect_warning(
+ tbl %>%
+ Table$create() %>%
+ transmute(stringr::str_c(chr, chr)) %>%
+ collect(),
+ "Expression stringr::str_c(chr, chr) not supported in Arrow; pulling data into R",
+ fixed = TRUE
+ )
+})
+
+test_that("mutate and refer to previous mutants", {
+ compare_dplyr_binding(
+ .input %>%
+ select(int, verses) %>%
+ mutate(
+ line_lengths = nchar(verses),
+ longer = line_lengths * 10
+ ) %>%
+ filter(line_lengths > 15) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("nchar() arguments", {
+ compare_dplyr_binding(
+ .input %>%
+ select(int, verses) %>%
+ mutate(
+ line_lengths = nchar(verses, type = "bytes"),
+ longer = line_lengths * 10
+ ) %>%
+ filter(line_lengths > 15) %>%
+ collect(),
+ tbl
+ )
+ # This tests the whole abandon_ship() machinery
+ compare_dplyr_binding(
+ .input %>%
+ select(int, verses) %>%
+ mutate(
+ line_lengths = nchar(verses, type = "bytes", allowNA = TRUE),
+ longer = line_lengths * 10
+ ) %>%
+ filter(line_lengths > 15) %>%
+ collect(),
+ tbl,
+ warning = paste0(
+ "In nchar\\(verses, type = \"bytes\", allowNA = TRUE\\), ",
+ "allowNA = TRUE not supported by Arrow; pulling data into R"
+ )
+ )
+})
+
+test_that("mutate with .data pronoun", {
+ compare_dplyr_binding(
+ .input %>%
+ select(int, verses) %>%
+ mutate(
+ line_lengths = str_length(verses),
+ longer = .data$line_lengths * 10
+ ) %>%
+ filter(line_lengths > 15) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("mutate with unnamed expressions", {
+ compare_dplyr_binding(
+ .input %>%
+ select(int, padded_strings) %>%
+ mutate(
+ int, # bare column name
+ nchar(padded_strings) # expression
+ ) %>%
+ filter(int > 5) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("mutate with reassigning same name", {
+ compare_dplyr_binding(
+ .input %>%
+ transmute(
+ new = lgl,
+ new = chr
+ ) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("mutate with single value for recycling", {
+ compare_dplyr_binding(
+ .input %>%
+ select(int, padded_strings) %>%
+ mutate(
+ dr_bronner = 1 # ALL ONE!
+ ) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("dplyr::mutate's examples", {
+ # Newly created variables are available immediately
+ compare_dplyr_binding(
+ .input %>%
+ select(name, mass) %>%
+ mutate(
+ mass2 = mass * 2,
+ mass2_squared = mass2 * mass2
+ ) %>%
+ collect(),
+ starwars # this is a test tibble that ships with dplyr
+ )
+
+ # As well as adding new variables, you can use mutate() to
+ # remove variables and modify existing variables.
+ compare_dplyr_binding(
+ .input %>%
+ select(name, height, mass, homeworld) %>%
+ mutate(
+ mass = NULL,
+ height = height * 0.0328084 # convert to feet
+ ) %>%
+ collect(),
+ starwars
+ )
+
+ # Examples we don't support should succeed
+ # but warn that they're pulling data into R to do so
+
+ # across and autosplicing: ARROW-11699
+ compare_dplyr_binding(
+ .input %>%
+ select(name, homeworld, species) %>%
+ mutate(across(!name, as.factor)) %>%
+ collect(),
+ starwars,
+ warning = "Expression across.*not supported in Arrow"
+ )
+
+ # group_by then mutate
+ compare_dplyr_binding(
+ .input %>%
+ select(name, mass, homeworld) %>%
+ group_by(homeworld) %>%
+ mutate(rank = min_rank(desc(mass))) %>%
+ collect(),
+ starwars,
+ warning = TRUE
+ )
+
+ # `.before` and `.after` experimental args: ARROW-11701
+ df <- tibble(x = 1, y = 2)
+ compare_dplyr_binding(
+ .input %>% mutate(z = x + y) %>% collect(),
+ df
+ )
+ #> # A tibble: 1 x 3
+ #> x y z
+ #> <dbl> <dbl> <dbl>
+ #> 1 1 2 3
+
+ compare_dplyr_binding(
+ .input %>% mutate(z = x + y, .before = 1) %>% collect(),
+ df
+ )
+ #> # A tibble: 1 x 3
+ #> z x y
+ #> <dbl> <dbl> <dbl>
+ #> 1 3 1 2
+ compare_dplyr_binding(
+ .input %>% mutate(z = x + y, .after = x) %>% collect(),
+ df
+ )
+ #> # A tibble: 1 x 3
+ #> x z y
+ #> <dbl> <dbl> <dbl>
+ #> 1 1 3 2
+
+ # By default, mutate() keeps all columns from the input data.
+ # Experimental: You can override with `.keep`
+ df <- tibble(x = 1, y = 2, a = "a", b = "b")
+ compare_dplyr_binding(
+ .input %>% mutate(z = x + y, .keep = "all") %>% collect(), # the default
+ df
+ )
+ #> # A tibble: 1 x 5
+ #> x y a b z
+ #> <dbl> <dbl> <chr> <chr> <dbl>
+ #> 1 1 2 a b 3
+ compare_dplyr_binding(
+ .input %>% mutate(z = x + y, .keep = "used") %>% collect(),
+ df
+ )
+ #> # A tibble: 1 x 3
+ #> x y z
+ #> <dbl> <dbl> <dbl>
+ #> 1 1 2 3
+ compare_dplyr_binding(
+ .input %>% mutate(z = x + y, .keep = "unused") %>% collect(),
+ df
+ )
+ #> # A tibble: 1 x 3
+ #> a b z
+ #> <chr> <chr> <dbl>
+ #> 1 a b 3
+ compare_dplyr_binding(
+ .input %>% mutate(z = x + y, .keep = "none") %>% collect(), # same as transmute()
+ df
+ )
+ #> # A tibble: 1 x 1
+ #> z
+ #> <dbl>
+ #> 1 3
+
+ # Grouping ----------------------------------------
+ # The mutate operation may yield different results on grouped
+ # tibbles because the expressions are computed within groups.
+ # The following normalises `mass` by the global average:
+ # TODO: ARROW-13926
+ compare_dplyr_binding(
+ .input %>%
+ select(name, mass, species) %>%
+ mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) %>%
+ collect(),
+ starwars,
+ warning = "window function"
+ )
+})
+
+test_that("Can mutate after group_by as long as there are no aggregations", {
+ compare_dplyr_binding(
+ .input %>%
+ select(int, chr) %>%
+ group_by(chr) %>%
+ mutate(int = int + 6L) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ select(mean = int, chr) %>%
+ # rename `int` to `mean` and use `mean` in `mutate()` to test that
+ # `all_funs()` does not incorrectly identify it as an aggregate function
+ group_by(chr) %>%
+ mutate(mean = mean + 6L) %>%
+ collect(),
+ tbl
+ )
+ expect_warning(
+ tbl %>%
+ Table$create() %>%
+ select(int, chr) %>%
+ group_by(chr) %>%
+ mutate(avg_int = mean(int)) %>%
+ collect(),
+ "window functions not currently supported in Arrow; pulling data into R",
+ fixed = TRUE
+ )
+ expect_warning(
+ tbl %>%
+ Table$create() %>%
+ select(mean = int, chr) %>%
+ # rename `int` to `mean` and use `mean(mean)` in `mutate()` to test that
+ # `all_funs()` detects `mean()` despite the collision with a column name
+ group_by(chr) %>%
+ mutate(avg_int = mean(mean)) %>%
+ collect(),
+ "window functions not currently supported in Arrow; pulling data into R",
+ fixed = TRUE
+ )
+})
+
+test_that("handle bad expressions", {
+ # TODO: search for functions other than mean() (see above test)
+ # that need to be forced to fail because they error ambiguously
+
+ with_language("fr", {
+    # expect_warning(., NA) because the usual behavior when Arrow hits an
+    # expression it can't evaluate is to raise a warning, collect() to R, and
+    # retry the expression there. But we want this to error the first time
+    # because it's a user error, not solvable by retrying in R
+ expect_warning(
+ expect_error(
+ Table$create(tbl) %>% mutate(newvar = NOTAVAR + 2),
+ "objet 'NOTAVAR' introuvable"
+ ),
+ NA
+ )
+ })
+})
+
+test_that("Can't just add a vector column with mutate()", {
+ expect_warning(
+ expect_equal(
+ Table$create(tbl) %>%
+ select(int) %>%
+ mutate(again = 1:10),
+ tibble::tibble(int = tbl$int, again = 1:10)
+ ),
+ "In again = 1:10, only values of size one are recycled; pulling data into R"
+ )
+})
+
+test_that("print a mutated table", {
+ expect_output(
+ Table$create(tbl) %>%
+ select(int) %>%
+ mutate(twice = int * 2) %>%
+ print(),
+ "InMemoryDataset (query)
+int: int32
+twice: double (multiply_checked(int, 2))
+
+See $.data for the source Arrow object",
+ fixed = TRUE
+ )
+})
+
+test_that("mutate and write_dataset", {
+ skip_if_not_available("dataset")
+ # See related test in test-dataset.R
+
+ first_date <- lubridate::ymd_hms("2015-04-29 03:12:39")
+ df1 <- tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+ chr = letters[1:10],
+ fct = factor(LETTERS[1:10]),
+ ts = first_date + lubridate::days(1:10)
+ )
+
+ second_date <- lubridate::ymd_hms("2017-03-09 07:01:02")
+ df2 <- tibble(
+ int = 101:110,
+ dbl = c(as.numeric(51:59), NaN),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+ chr = letters[10:1],
+ fct = factor(LETTERS[10:1]),
+ ts = second_date + lubridate::days(10:1)
+ )
+
+ dst_dir <- tempfile()
+ stacked <- record_batch(rbind(df1, df2))
+ stacked %>%
+ mutate(twice = int * 2) %>%
+ group_by(int) %>%
+ write_dataset(dst_dir, format = "feather")
+ expect_true(dir.exists(dst_dir))
+ expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "=")))
+
+ new_ds <- open_dataset(dst_dir, format = "feather")
+
+ expect_equal(
+ new_ds %>%
+ select(string = chr, integer = int, twice) %>%
+ filter(integer > 6 & integer < 11) %>%
+ collect() %>%
+ summarize(mean = mean(integer)),
+ df1 %>%
+ select(string = chr, integer = int) %>%
+ mutate(twice = integer * 2) %>%
+ filter(integer > 6) %>%
+ summarize(mean = mean(integer))
+ )
+})
+
+test_that("mutate and pmin/pmax", {
+ df <- tibble(
+ city = c("Chillan", "Valdivia", "Osorno"),
+ val1 = c(200, 300, NA),
+ val2 = c(100, NA, NA),
+ val3 = c(0, NA, NA)
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ max_val_1 = pmax(val1, val2, val3),
+ max_val_2 = pmax(val1, val2, val3, na.rm = TRUE),
+ min_val_1 = pmin(val1, val2, val3),
+ min_val_2 = pmin(val1, val2, val3, na.rm = TRUE)
+ ) %>%
+ collect(),
+ df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ max_val_1 = pmax(val1 - 100, 200, val1 * 100, na.rm = TRUE),
+ min_val_1 = pmin(val1 - 100, 100, val1 * 100, na.rm = TRUE),
+ ) %>%
+ collect(),
+ df
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-query.R b/src/arrow/r/tests/testthat/test-dplyr-query.R
new file mode 100644
index 000000000..21a55f4b4
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-query.R
@@ -0,0 +1,296 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+library(stringr)
+
+tbl <- example_data
+# Add some better string data
+tbl$verses <- verses[[1]]
+# c(" a ", " b ", " c ", ...) increasing padding
+# nchar = 3 5 7 9 11 13 15 17 19 21
+tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both")
+tbl$another_chr <- tail(letters, 10)
+
+test_that("basic select/filter/collect", {
+ batch <- record_batch(tbl)
+
+ b2 <- batch %>%
+ select(int, chr) %>%
+ filter(int > 5)
+
+ expect_s3_class(b2, "arrow_dplyr_query")
+ t2 <- collect(b2)
+ expect_equal(t2, tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")])
+ # Test that the original object is not affected
+ expect_identical(collect(batch), tbl)
+})
+
+test_that("dim() on query", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(int > 5) %>%
+ select(int, chr) %>%
+ dim(),
+ tbl
+ )
+})
+
+test_that("Print method", {
+ expect_output(
+ record_batch(tbl) %>%
+ filter(dbl > 2, chr == "d" | chr == "f") %>%
+ select(chr, int, lgl) %>%
+ filter(int < 5) %>%
+ select(int, chr) %>%
+ print(),
+ 'InMemoryDataset (query)
+int: int32
+chr: string
+
+* Filter: (((dbl > 2) and ((chr == "d") or (chr == "f"))) and (int < 5))
+See $.data for the source Arrow object',
+ fixed = TRUE
+ )
+})
+
+test_that("pull", {
+ compare_dplyr_binding(
+ .input %>% pull(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>% pull(1),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>% pull(chr),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ filter(int > 4) %>%
+ rename(strng = chr) %>%
+ pull(strng),
+ tbl
+ )
+})
+
+test_that("collect(as_data_frame=FALSE)", {
+ batch <- record_batch(tbl)
+
+ b1 <- batch %>% collect(as_data_frame = FALSE)
+
+ expect_r6_class(b1, "RecordBatch")
+
+ b2 <- batch %>%
+ select(int, chr) %>%
+ filter(int > 5) %>%
+ collect(as_data_frame = FALSE)
+
+ # collect(as_data_frame = FALSE) always returns Table now
+ expect_r6_class(b2, "Table")
+ expected <- tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")]
+ expect_equal(as.data.frame(b2), expected)
+
+ b3 <- batch %>%
+ select(int, strng = chr) %>%
+ filter(int > 5) %>%
+ collect(as_data_frame = FALSE)
+ expect_r6_class(b3, "Table")
+ expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng")))
+
+ b4 <- batch %>%
+ select(int, strng = chr) %>%
+ filter(int > 5) %>%
+ group_by(int) %>%
+ collect(as_data_frame = FALSE)
+ expect_s3_class(b4, "arrow_dplyr_query")
+ expect_equal(
+ as.data.frame(b4),
+ expected %>%
+ rename(strng = chr) %>%
+ group_by(int)
+ )
+})
+
+test_that("compute()", {
+ batch <- record_batch(tbl)
+
+ b1 <- batch %>% compute()
+
+ expect_r6_class(b1, "RecordBatch")
+
+ b2 <- batch %>%
+ select(int, chr) %>%
+ filter(int > 5) %>%
+ compute()
+
+ expect_r6_class(b2, "Table")
+ expected <- tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")]
+ expect_equal(as.data.frame(b2), expected)
+
+ b3 <- batch %>%
+ select(int, strng = chr) %>%
+ filter(int > 5) %>%
+ compute()
+ expect_r6_class(b3, "Table")
+ expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng")))
+
+ b4 <- batch %>%
+ select(int, strng = chr) %>%
+ filter(int > 5) %>%
+ group_by(int) %>%
+ compute()
+ expect_s3_class(b4, "arrow_dplyr_query")
+ expect_equal(
+ as.data.frame(b4),
+ expected %>%
+ rename(strng = chr) %>%
+ group_by(int)
+ )
+})
+
+test_that("head", {
+ batch <- record_batch(tbl)
+
+ b2 <- batch %>%
+ select(int, chr) %>%
+ filter(int > 5) %>%
+ head(2)
+ expect_s3_class(b2, "arrow_dplyr_query")
+ expected <- tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")][1:2, ]
+ expect_equal(collect(b2), expected)
+
+ b3 <- batch %>%
+ select(int, strng = chr) %>%
+ filter(int > 5) %>%
+ head(2)
+ expect_s3_class(b3, "arrow_dplyr_query")
+ expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng")))
+
+ b4 <- batch %>%
+ select(int, strng = chr) %>%
+ filter(int > 5) %>%
+ group_by(int) %>%
+ head(2)
+ expect_s3_class(b4, "arrow_dplyr_query")
+ expect_equal(
+ as.data.frame(b4),
+ expected %>%
+ rename(strng = chr) %>%
+ group_by(int)
+ )
+
+ expect_equal(
+ batch %>%
+ select(int, strng = chr) %>%
+ filter(int > 5) %>%
+ head(2) %>%
+ mutate(twice = int * 2) %>%
+ collect(),
+ expected %>%
+ rename(strng = chr) %>%
+ mutate(twice = int * 2)
+ )
+
+ # This would fail if we evaluated head() after filter()
+ expect_equal(
+ batch %>%
+ select(int, strng = chr) %>%
+ head(2) %>%
+ filter(int > 5) %>%
+ collect(),
+ expected %>%
+ rename(strng = chr) %>%
+ filter(FALSE)
+ )
+})
+
+test_that("arrange then head returns the right data (ARROW-14162)", {
+
+ compare_dplyr_binding(
+ .input %>%
+ # mpg has ties so we need to sort by two things to get deterministic order
+ arrange(mpg, disp) %>%
+ head(4) %>%
+ collect(),
+ mtcars,
+ ignore_attr = "row.names"
+ )
+})
+
+test_that("arrange then tail returns the right data", {
+ compare_dplyr_binding(
+ .input %>%
+ # mpg has ties so we need to sort by two things to get deterministic order
+ arrange(mpg, disp) %>%
+ tail(4) %>%
+ collect(),
+ mtcars,
+ ignore_attr = "row.names"
+ )
+})
+
+test_that("tail", {
+ batch <- record_batch(tbl)
+
+ b2 <- batch %>%
+ select(int, chr) %>%
+ filter(int > 5) %>%
+ arrange(int) %>%
+ tail(2)
+
+ expect_s3_class(b2, "arrow_dplyr_query")
+ expected <- tail(tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")], 2)
+ expect_equal(as.data.frame(b2), expected)
+
+ b3 <- batch %>%
+ select(int, strng = chr) %>%
+ filter(int > 5) %>%
+ arrange(int) %>%
+ tail(2)
+ expect_s3_class(b3, "arrow_dplyr_query")
+ expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng")))
+
+ b4 <- batch %>%
+ select(int, strng = chr) %>%
+ filter(int > 5) %>%
+ group_by(int) %>%
+ arrange(int) %>%
+ tail(2)
+ expect_s3_class(b4, "arrow_dplyr_query")
+ expect_equal(
+ as.data.frame(b4),
+ expected %>%
+ rename(strng = chr) %>%
+ group_by(int)
+ )
+})
+
+test_that("No duplicate field names are allowed in an arrow_dplyr_query", {
+ expect_error(
+ Table$create(tbl, tbl) %>%
+ filter(int > 0),
+ regexp = paste0(
+ 'The following field names were found more than once in the data: "int", "dbl", ',
+ '"dbl2", "lgl", "false", "chr", "fct", "verses", "padded_strings"'
+ )
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-select.R b/src/arrow/r/tests/testthat/test-dplyr-select.R
new file mode 100644
index 000000000..2ca2b100e
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-select.R
@@ -0,0 +1,146 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+library(dplyr, warn.conflicts = FALSE)
+library(stringr)
+
+tbl <- example_data
+
+test_that("Empty select returns no columns", {
+ compare_dplyr_binding(
+ .input %>% select() %>% collect(),
+ tbl,
+ skip_table = "Table with 0 cols doesn't know how many rows it should have"
+ )
+})
+test_that("Empty select still includes the group_by columns", {
+ expect_message(
+ compare_dplyr_binding(
+ .input %>% group_by(chr) %>% select() %>% collect(),
+ tbl
+ ),
+ "Adding missing grouping variables"
+ )
+})
+
+test_that("select/rename", {
+ compare_dplyr_binding(
+ .input %>%
+ select(string = chr, int) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ rename(string = chr) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ rename(strng = chr) %>%
+ rename(other = strng) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("select/rename with selection helpers", {
+
+ # TODO: add some passing tests here
+
+ expect_error(
+ compare_dplyr_binding(
+ .input %>%
+ select(where(is.numeric)) %>%
+ collect(),
+ tbl
+ ),
+ "Unsupported selection helper"
+ )
+})
+
+test_that("filtering with rename", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(chr == "b") %>%
+ select(string = chr, int) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ select(string = chr, int) %>%
+ filter(string == "b") %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("relocate", {
+ df <- tibble(a = 1, b = 1, c = 1, d = "a", e = "a", f = "a")
+ compare_dplyr_binding(
+ .input %>% relocate(f) %>% collect(),
+ df,
+ )
+ compare_dplyr_binding(
+ .input %>% relocate(a, .after = c) %>% collect(),
+ df,
+ )
+ compare_dplyr_binding(
+ .input %>% relocate(f, .before = b) %>% collect(),
+ df,
+ )
+ compare_dplyr_binding(
+ .input %>% relocate(a, .after = last_col()) %>% collect(),
+ df,
+ )
+ compare_dplyr_binding(
+ .input %>% relocate(ff = f) %>% collect(),
+ df,
+ )
+})
+
+test_that("relocate with selection helpers", {
+ df <- tibble(a = 1, b = 1, c = 1, d = "a", e = "a", f = "a")
+ compare_dplyr_binding(
+ .input %>% relocate(any_of(c("a", "e", "i", "o", "u"))) %>% collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>% relocate(where(is.character)) %>% collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>% relocate(a, b, c, .after = where(is.character)) %>% collect(),
+ df
+ )
+ compare_dplyr_binding(
+ .input %>% relocate(d, e, f, .before = where(is.numeric)) %>% collect(),
+ df
+ )
+ # works after other dplyr verbs
+ compare_dplyr_binding(
+ .input %>%
+ mutate(c = as.character(c)) %>%
+ relocate(d, e, f, .after = where(is.numeric)) %>%
+ collect(),
+ df
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-dplyr-summarize.R b/src/arrow/r/tests/testthat/test-dplyr-summarize.R
new file mode 100644
index 000000000..3988412b8
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-dplyr-summarize.R
@@ -0,0 +1,881 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("dataset")
+
+withr::local_options(list(arrow.summarise.sort = TRUE))
+
+library(dplyr, warn.conflicts = FALSE)
+library(stringr)
+
+tbl <- example_data
+# Add some better string data
+tbl$verses <- verses[[1]]
+# c(" a ", " b ", " c ", ...) increasing padding
+# nchar = 3 5 7 9 11 13 15 17 19 21
+tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both")
+tbl$some_grouping <- rep(c(1, 2), 5)
+
+test_that("summarize() doesn't evaluate eagerly", {
+ expect_s3_class(
+ Table$create(tbl) %>%
+ summarize(total = sum(int)),
+ "arrow_dplyr_query"
+ )
+ expect_r6_class(
+ Table$create(tbl) %>%
+ summarize(total = sum(int)) %>%
+ compute(),
+ "ArrowTabular"
+ )
+})
+
+test_that("Can aggregate in Arrow", {
+ compare_dplyr_binding(
+ .input %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ summarize(total = sum(int)) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Group by sum on dataset", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(total = sum(int * 4, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(total = sum(int)) %>%
+ collect(),
+ tbl,
+ )
+})
+
+test_that("Group by mean on dataset", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(mean = mean(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(mean = mean(int, na.rm = FALSE)) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Group by sd on dataset", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(sd = sd(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(sd = sd(int, na.rm = FALSE)) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Group by var on dataset", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(var = var(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(var = var(int, na.rm = FALSE)) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("n()", {
+ compare_dplyr_binding(
+ .input %>%
+ summarize(counts = n()) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(counts = n()) %>%
+ arrange(some_grouping) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Group by any/all", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(any(lgl, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(all(lgl, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(any(lgl, na.rm = FALSE)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(all(lgl, na.rm = FALSE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(has_words = nchar(verses) < 0) %>%
+ group_by(some_grouping) %>%
+ summarize(any(has_words, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(has_words = nchar(verses) < 0) %>%
+ group_by(some_grouping) %>%
+ summarize(all(has_words, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(has_words = all(nchar(verses) < 0, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("n_distinct() on dataset", {
+ # With groupby
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(distinct = n_distinct(lgl, na.rm = FALSE)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(distinct = n_distinct(lgl, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+ # Without groupby
+ compare_dplyr_binding(
+ .input %>%
+ summarize(distinct = n_distinct(lgl, na.rm = FALSE)) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ summarize(distinct = n_distinct(lgl, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ summarize(distinct = n_distinct(int, lgl)) %>%
+ collect(),
+ tbl,
+ warning = "Multiple arguments"
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(distinct = n_distinct(int, lgl)) %>%
+ collect(),
+ tbl,
+ warning = "Multiple arguments"
+ )
+})
+
+test_that("Functions that take ... but we only accept a single arg", {
+ compare_dplyr_binding(
+ .input %>%
+ summarize(distinct = n_distinct()) %>%
+ collect(),
+ tbl,
+ warning = "0 arguments"
+ )
+ compare_dplyr_binding(
+ .input %>%
+ summarize(distinct = n_distinct(int, lgl)) %>%
+ collect(),
+ tbl,
+ warning = "Multiple arguments"
+ )
+ # Now that we've demonstrated that the whole machinery works, let's test
+ # the agg_funcs directly
+ expect_error(agg_funcs$n_distinct(), "n_distinct() with 0 arguments", fixed = TRUE)
+ expect_error(agg_funcs$sum(), "sum() with 0 arguments", fixed = TRUE)
+ expect_error(agg_funcs$any(), "any() with 0 arguments", fixed = TRUE)
+ expect_error(agg_funcs$all(), "all() with 0 arguments", fixed = TRUE)
+ expect_error(agg_funcs$min(), "min() with 0 arguments", fixed = TRUE)
+ expect_error(agg_funcs$max(), "max() with 0 arguments", fixed = TRUE)
+ expect_error(agg_funcs$n_distinct(1, 2), "Multiple arguments to n_distinct()")
+ expect_error(agg_funcs$sum(1, 2), "Multiple arguments to sum")
+ expect_error(agg_funcs$any(1, 2), "Multiple arguments to any()")
+ expect_error(agg_funcs$all(1, 2), "Multiple arguments to all()")
+ expect_error(agg_funcs$min(1, 2), "Multiple arguments to min()")
+ expect_error(agg_funcs$max(1, 2), "Multiple arguments to max()")
+})
+
+test_that("median()", {
+ # When medians are integer-valued, stats::median() sometimes returns output of
+  # type integer, whereas the Arrow approx_median kernels always return
+ # output of type float64. The calls to median(int, ...) in the tests below
+ # are enclosed in as.double() to work around this known difference.
+
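+  # Minimal base-R illustration of that type difference: stats::median()
+  # keeps the integer type for odd-length integer input.
+  expect_identical(median(1:5), 3L)
+  expect_identical(as.double(median(1:5)), 3)
+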
+ # Use old testthat behavior here so we don't have to assert the same warning
+ # over and over
+ local_edition(2)
+
+ # with groups
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(
+ med_dbl = median(dbl),
+ med_int = as.double(median(int)),
+ med_dbl_narmf = median(dbl, FALSE),
+ med_int_narmf = as.double(median(int, na.rm = FALSE)),
+ med_dbl_narmt = median(dbl, na.rm = TRUE),
+ med_int_narmt = as.double(median(int, TRUE))
+ ) %>%
+ arrange(some_grouping) %>%
+ collect(),
+ tbl,
+ warning = "median\\(\\) currently returns an approximate median in Arrow"
+ )
+ # without groups, with na.rm = TRUE
+ compare_dplyr_binding(
+ .input %>%
+ summarize(
+ med_dbl_narmt = median(dbl, na.rm = TRUE),
+ med_int_narmt = as.double(median(int, TRUE))
+ ) %>%
+ collect(),
+ tbl,
+ warning = "median\\(\\) currently returns an approximate median in Arrow"
+ )
+ # without groups, with na.rm = FALSE (the default)
+ compare_dplyr_binding(
+ .input %>%
+ summarize(
+ med_dbl = median(dbl),
+ med_int = as.double(median(int)),
+ med_dbl_narmf = median(dbl, FALSE),
+ med_int_narmf = as.double(median(int, na.rm = FALSE))
+ ) %>%
+ collect(),
+ tbl,
+ warning = "median\\(\\) currently returns an approximate median in Arrow"
+ )
+ local_edition(3)
+})
+
+test_that("quantile()", {
+ # The default method for stats::quantile() throws an error when na.rm = FALSE
+ # and the input contains NA or NaN, whereas the Arrow tdigest kernels return
+ # null in this situation. To work around this known difference, the tests
+ # below always use na.rm = TRUE when the data contains NA or NaN.
+
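+  # Base-R side of that first difference, as a sketch: the default method
+  # refuses missing values when na.rm = FALSE.
+  expect_error(
+    quantile(c(1, NA), probs = 0.5, na.rm = FALSE),
+    "missing values"
+  )
+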
+ # The default method for stats::quantile() has an argument `names` that
+ # controls whether the result has a names attribute. It defaults to
+ # names = TRUE. With Arrow, it is not possible to give the result a names
+ # attribute, so the quantile() binding in Arrow does not accept a `names`
+ # argument. Differences in this names attribute cause compare_dplyr_binding() to
+ # report that the objects are not equal, so we do not use compare_dplyr_binding()
+ # in the tests below.
+
+ # The tests below all use probs = 0.5 because other values cause differences
+ # between the exact quantiles returned by R and the approximate quantiles
+ # returned by Arrow.
+
+ # When quantiles are integer-valued, stats::quantile() sometimes returns
+  # output of type integer, whereas the Arrow tdigest kernels always
+ # return output of type float64. The calls to quantile(int, ...) in the tests
+ # below are enclosed in as.double() to work around this known difference.
+
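+  # Base-R side of the `names` behavior, as a sketch: the default method
+  # labels the result unless names = FALSE is given.
+  expect_named(quantile(1:10, probs = 0.5), "50%")
+  expect_null(names(quantile(1:10, probs = 0.5, names = FALSE)))
+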
+ local_edition(2)
+ # with groups
+ expect_warning(
+ expect_equal(
+ tbl %>%
+ group_by(some_grouping) %>%
+ summarize(
+ q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE, names = FALSE),
+ q_int = as.double(
+ quantile(int, probs = 0.5, na.rm = TRUE, names = FALSE)
+ )
+ ) %>%
+ arrange(some_grouping),
+ Table$create(tbl) %>%
+ group_by(some_grouping) %>%
+ summarize(
+ q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE),
+ q_int = as.double(quantile(int, probs = 0.5, na.rm = TRUE))
+ ) %>%
+ arrange(some_grouping) %>%
+ collect()
+ ),
+ "quantile() currently returns an approximate quantile in Arrow",
+ fixed = TRUE
+ )
+
+ # without groups
+ expect_warning(
+ expect_equal(
+ tbl %>%
+ summarize(
+ q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE, names = FALSE),
+ q_int = as.double(
+ quantile(int, probs = 0.5, na.rm = TRUE, names = FALSE)
+ )
+ ),
+ Table$create(tbl) %>%
+ summarize(
+ q_dbl = quantile(dbl, probs = 0.5, na.rm = TRUE),
+ q_int = as.double(quantile(int, probs = 0.5, na.rm = TRUE))
+ ) %>%
+ collect()
+ ),
+ "quantile() currently returns an approximate quantile in Arrow",
+ fixed = TRUE
+ )
+
+ # with missing values and na.rm = FALSE
+ expect_warning(
+ expect_equal(
+ tibble(
+ q_dbl = NA_real_,
+ q_int = NA_real_
+ ),
+ Table$create(tbl) %>%
+ summarize(
+ q_dbl = quantile(dbl, probs = 0.5, na.rm = FALSE),
+ q_int = as.double(quantile(int, probs = 0.5, na.rm = FALSE))
+ ) %>%
+ collect()
+ ),
+ "quantile() currently returns an approximate quantile in Arrow",
+ fixed = TRUE
+ )
+ local_edition(3)
+
+ # with a vector of 2+ probs
+ expect_warning(
+ Table$create(tbl) %>%
+ summarize(q = quantile(dbl, probs = c(0.2, 0.8), na.rm = TRUE)),
+ "quantile() with length(probs) != 1 not supported by Arrow",
+ fixed = TRUE
+ )
+})
+
+test_that("summarize() with min() and max()", {
+ compare_dplyr_binding(
+ .input %>%
+ select(int, chr) %>%
+ filter(int > 5) %>% # this filters out the NAs in `int`
+ summarize(min_int = min(int), max_int = max(int)) %>%
+ collect(),
+ tbl,
+ )
+ compare_dplyr_binding(
+ .input %>%
+ select(int, chr) %>%
+ filter(int > 5) %>% # this filters out the NAs in `int`
+ summarize(
+ min_int = min(int + 4) / 2,
+ max_int = 3 / max(42 - int)
+ ) %>%
+ collect(),
+ tbl,
+ )
+ compare_dplyr_binding(
+ .input %>%
+ select(int, chr) %>%
+ summarize(min_int = min(int), max_int = max(int)) %>%
+ collect(),
+ tbl,
+ )
+ compare_dplyr_binding(
+ .input %>%
+ select(int) %>%
+ summarize(
+ min_int = min(int, na.rm = TRUE),
+ max_int = max(int, na.rm = TRUE)
+ ) %>%
+ collect(),
+ tbl,
+ )
+ compare_dplyr_binding(
+ .input %>%
+ select(dbl, int) %>%
+ summarize(
+ min_int = -min(log(ceiling(dbl)), na.rm = TRUE),
+ max_int = log(max(as.double(int), na.rm = TRUE))
+ ) %>%
+ collect(),
+ tbl,
+ )
+
+ # multiple dots arguments to min(), max() not supported
+ compare_dplyr_binding(
+ .input %>%
+ summarize(min_mult = min(dbl, int)) %>%
+ collect(),
+ tbl,
+ warning = "Multiple arguments to min\\(\\) not supported by Arrow"
+ )
+ compare_dplyr_binding(
+ .input %>%
+ select(int, dbl, dbl2) %>%
+ summarize(max_mult = max(int, dbl, dbl2)) %>%
+ collect(),
+ tbl,
+ warning = "Multiple arguments to max\\(\\) not supported by Arrow"
+ )
+
+ # min(logical) or max(logical) yields integer in R
+ # min(Boolean) or max(Boolean) yields Boolean in Arrow
+ compare_dplyr_binding(
+ .input %>%
+ select(lgl) %>%
+ summarize(
+ max_lgl = as.logical(max(lgl, na.rm = TRUE)),
+ min_lgl = as.logical(min(lgl, na.rm = TRUE))
+ ) %>%
+ collect(),
+ tbl,
+ )
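+
+  # Base-R side of the type difference noted above, as a quick sanity check:
+  # min() and max() on logicals promote to integer.
+  expect_identical(max(c(TRUE, FALSE)), 1L)
+  expect_identical(min(c(TRUE, FALSE)), 0L)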
+})
+
+test_that("min() and max() on character strings", {
+ compare_dplyr_binding(
+ .input %>%
+ summarize(
+ min_chr = min(chr, na.rm = TRUE),
+ max_chr = max(chr, na.rm = TRUE)
+ ) %>%
+ collect(),
+ tbl,
+ )
+ skip("Strings not supported by hash_min_max (ARROW-13988)")
+ compare_dplyr_binding(
+ .input %>%
+ group_by(fct) %>%
+ summarize(
+ min_chr = min(chr, na.rm = TRUE),
+ max_chr = max(chr, na.rm = TRUE)
+ ) %>%
+ collect(),
+ tbl,
+ )
+})
+
+test_that("summarise() with !!sym()", {
+ test_chr_col <- "int"
+ test_dbl_col <- "dbl"
+ test_lgl_col <- "lgl"
+ compare_dplyr_binding(
+ .input %>%
+ group_by(false) %>%
+ summarise(
+ sum = sum(!!sym(test_dbl_col)),
+ any = any(!!sym(test_lgl_col)),
+ all = all(!!sym(test_lgl_col)),
+ mean = mean(!!sym(test_dbl_col)),
+ sd = sd(!!sym(test_dbl_col)),
+ var = var(!!sym(test_dbl_col)),
+ n_distinct = n_distinct(!!sym(test_chr_col)),
+ min = min(!!sym(test_dbl_col)),
+ max = max(!!sym(test_dbl_col))
+ ) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Filter and aggregate", {
+ compare_dplyr_binding(
+ .input %>%
+ filter(some_grouping == 2) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(int > 5) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(some_grouping == 2) %>%
+ group_by(some_grouping) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(int > 5) %>%
+ group_by(some_grouping) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Group by edge cases", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping * 2) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(alt = some_grouping * 2) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Do things after summarize", {
+ group2_sum <- tbl %>%
+ group_by(some_grouping) %>%
+ filter(int > 5) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ pull() %>%
+ tail(1)
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ filter(int > 5) %>%
+ summarize(total = sum(int, na.rm = TRUE)) %>%
+ filter(total == group2_sum) %>%
+ mutate(extra = total * 5) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ filter(dbl > 2) %>%
+ select(chr, int, lgl) %>%
+ mutate(twice = int * 2L) %>%
+ group_by(lgl) %>%
+ summarize(
+ count = n(),
+ total = sum(twice, na.rm = TRUE)
+ ) %>%
+ mutate(mean = total / count) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Expressions on aggregations", {
+ # This is what it effectively is
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(
+ any = any(lgl),
+ all = all(lgl)
+ ) %>%
+ ungroup() %>% # TODO: loosen the restriction on mutate after group_by
+ mutate(some = any & !all) %>%
+ select(some_grouping, some) %>%
+ collect(),
+ tbl
+ )
+ # More concisely:
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(any(lgl) & !all(lgl)) %>%
+ collect(),
+ tbl
+ )
+
+ # Save one of the aggregates first
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(
+ any_lgl = any(lgl),
+ some = any_lgl & !all(lgl)
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ # Make sure order of columns in result is correct
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(
+ any_lgl = any(lgl),
+ some = any_lgl & !all(lgl),
+ n()
+ ) %>%
+ collect(),
+ tbl
+ )
+
+ # Aggregate on an aggregate (trivial but dplyr allows)
+ skip("Aggregate on an aggregate not supported")
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(
+ any_lgl = any(any(lgl))
+ ) %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Summarize with 0 arguments", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize() %>%
+ collect(),
+ tbl
+ )
+})
+
+test_that("Not (yet) supported: implicit join", {
+ withr::local_options(list(arrow.debug = TRUE))
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(
+ sum((dbl - mean(dbl))^2)
+ ) %>%
+ collect(),
+ tbl,
+ warning = "Expression sum\\(\\(dbl - mean\\(dbl\\)\\)\\^2\\) not supported in Arrow; pulling data into R"
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(
+ sum(dbl - mean(dbl))
+ ) %>%
+ collect(),
+ tbl,
+ warning = "Expression sum\\(dbl - mean\\(dbl\\)\\) not supported in Arrow; pulling data into R"
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(
+ sqrt(sum((dbl - mean(dbl))^2) / (n() - 1L))
+ ) %>%
+ collect(),
+ tbl,
+ warning = "Expression sum\\(\\(dbl - mean\\(dbl\\)\\)\\^2\\) not supported in Arrow; pulling data into R"
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(
+ dbl - mean(dbl)
+ ) %>%
+ collect(),
+ tbl,
+ warning = "Expression dbl - mean\\(dbl\\) not supported in Arrow; pulling data into R"
+ )
+
+ # This one could possibly be supported--in mutate()
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping) %>%
+ summarize(
+ dbl - int
+ ) %>%
+ collect(),
+ tbl,
+ warning = "Expression dbl - int not supported in Arrow; pulling data into R"
+ )
+})
+
+test_that(".groups argument", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping, int < 6) %>%
+ summarize(count = n()) %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping, int < 6) %>%
+ summarize(count = n(), .groups = "drop_last") %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping, int < 6) %>%
+ summarize(count = n(), .groups = "keep") %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping, int < 6) %>%
+ summarize(count = n(), .groups = "drop") %>%
+ collect(),
+ tbl
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(some_grouping, int < 6) %>%
+ summarize(count = n(), .groups = "rowwise") %>%
+ collect(),
+ tbl,
+ warning = TRUE
+ )
+
+ # abandon_ship() raises the warning, then dplyr itself errors
+ # This isn't ideal but it's fine and won't be an issue on Datasets
+ expect_error(
+ expect_warning(
+ Table$create(tbl) %>%
+ group_by(some_grouping, int < 6) %>%
+ summarize(count = n(), .groups = "NOTVALID"),
+ "Invalid .groups argument"
+ ),
+ "NOTVALID"
+ )
+})
+
+test_that("summarize() handles group_by .drop", {
+  # Sorting the result errors with "Type error: Sorting not supported for type
+  # dictionary<values=string, indices=int8, ordered=0>", so disable the
+  # arrow.summarise.sort option for this test
+ withr::local_options(list(arrow.summarise.sort = FALSE))
+
+ tbl <- tibble(
+ x = 1:10,
+ y = factor(rep(c("a", "c"), each = 5), levels = c("a", "b", "c"))
+ )
+ compare_dplyr_binding(
+ .input %>%
+ group_by(y) %>%
+ count() %>%
+ collect() %>%
+ arrange(y),
+ tbl
+ )
+ # Not supported: check message
+ compare_dplyr_binding(
+ .input %>%
+ group_by(y, .drop = FALSE) %>%
+ count() %>%
+ collect() %>%
+ # Because it's not supported, we have to filter out the (empty) row
+      # that dplyr keeps, so that the two results compare equal
+ filter(y != "b") %>%
+ arrange(y),
+ tbl,
+ warning = ".drop = FALSE currently not supported in Arrow aggregation"
+ )
+
+ # But this is ok because there is no factor group
+ compare_dplyr_binding(
+ .input %>%
+ group_by(y, .drop = FALSE) %>%
+ count() %>%
+ collect() %>%
+ arrange(y),
+ tibble(
+ x = 1:10,
+ y = rep(c("a", "c"), each = 5)
+ )
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-duckdb.R b/src/arrow/r/tests/testthat/test-duckdb.R
new file mode 100644
index 000000000..decd6e80e
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-duckdb.R
@@ -0,0 +1,217 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_installed("duckdb", minimum_version = "0.2.8")
+skip_if_not_installed("dbplyr")
+skip_if_not_available("dataset")
+skip_on_cran()
+
+library(duckdb, quietly = TRUE)
+library(dplyr, warn.conflicts = FALSE)
+
+test_that("to_duckdb", {
+ ds <- InMemoryDataset$create(example_data)
+
+ expect_identical(
+ ds %>%
+ to_duckdb() %>%
+ collect() %>%
+ # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879
+ select(!fct),
+ select(example_data, !fct)
+ )
+
+ expect_identical(
+ ds %>%
+ select(int, lgl, dbl) %>%
+ to_duckdb() %>%
+ group_by(lgl) %>%
+ summarise(mean_int = mean(int, na.rm = TRUE), mean_dbl = mean(dbl, na.rm = TRUE)) %>%
+ collect(),
+ tibble::tibble(
+ lgl = c(TRUE, NA, FALSE),
+ mean_int = c(3, 6.25, 8.5),
+ mean_dbl = c(3.1, 6.35, 6.1)
+ )
+ )
+
+ # can group_by before the to_duckdb
+ expect_identical(
+ ds %>%
+ select(int, lgl, dbl) %>%
+ group_by(lgl) %>%
+ to_duckdb() %>%
+ summarise(mean_int = mean(int, na.rm = TRUE), mean_dbl = mean(dbl, na.rm = TRUE)) %>%
+ collect(),
+ tibble::tibble(
+ lgl = c(TRUE, NA, FALSE),
+ mean_int = c(3, 6.25, 8.5),
+ mean_dbl = c(3.1, 6.35, 6.1)
+ )
+ )
+})
+
+test_that("to_duckdb then to_arrow", {
+ ds <- InMemoryDataset$create(example_data)
+
+ ds_rt <- ds %>%
+ to_duckdb() %>%
+ # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879
+ select(-fct) %>%
+ to_arrow()
+
+ expect_identical(
+ collect(ds_rt),
+ ds %>%
+ select(-fct) %>%
+ collect()
+ )
+
+ # And we can continue the pipeline
+ ds_rt <- ds %>%
+ to_duckdb() %>%
+ # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879
+ select(-fct) %>%
+ to_arrow() %>%
+ filter(int > 5)
+
+ expect_identical(
+ collect(ds_rt),
+ ds %>%
+ select(-fct) %>%
+ filter(int > 5) %>%
+ collect()
+ )
+
+ # Now check errors
+ ds_rt <- ds %>%
+ to_duckdb() %>%
+ # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879
+ select(-fct)
+
+ # alter the class of ds_rt's connection to simulate some other database
+ class(ds_rt$src$con) <- "some_other_connection"
+
+ expect_error(
+ to_arrow(ds_rt),
+ "to_arrow\\(\\) currently only supports Arrow tables, Arrow datasets,"
+ )
+})
+
+# The next set of tests uses an already-extant connection to test features of
+# persistence and querying against the table without using the `tbl` itself, so
+# we need to create a connection separate from the ephemeral one that is made
+# with arrow_duck_connection()
+con <- dbConnect(duckdb::duckdb())
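+# run with two threads so these tests also exercise parallel execution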
+dbExecute(con, "PRAGMA threads=2")
+on.exit(dbDisconnect(con, shutdown = TRUE), add = TRUE)
+
+# write one table to the connection so it is kept open
+DBI::dbWriteTable(con, "mtcars", mtcars)
+
+test_that("Joining, auto-cleanup enabled", {
+ ds <- InMemoryDataset$create(example_data)
+
+ table_one_name <- "my_arrow_table_1"
+ table_one <- to_duckdb(ds, con = con, table_name = table_one_name, auto_disconnect = TRUE)
+ table_two_name <- "my_arrow_table_2"
+ table_two <- to_duckdb(ds, con = con, table_name = table_two_name, auto_disconnect = TRUE)
+
+ res <- dbGetQuery(
+ con,
+ paste0(
+ "SELECT * FROM ", table_one_name,
+ " INNER JOIN ", table_two_name,
+ " ON ", table_one_name, ".int = ", table_two_name, ".int"
+ )
+ )
+ expect_identical(dim(res), c(9L, 14L))
+
+  # removing the R objects (plus a gc) cleans up the tables
+ expect_true(all(c(table_one_name, table_two_name) %in% DBI::dbListTables(con)))
+ rm(table_one, table_two)
+ gc()
+ expect_false(any(c(table_one_name, table_two_name) %in% DBI::dbListTables(con)))
+})
+
+test_that("Joining, auto-cleanup disabled", {
+ ds <- InMemoryDataset$create(example_data)
+
+ table_three_name <- "my_arrow_table_3"
+ table_three <- to_duckdb(ds, con = con, table_name = table_three_name)
+
+  # removing the R objects does *not* clean up these tables
+ expect_true(table_three_name %in% DBI::dbListTables(con))
+ rm(table_three)
+ gc()
+  # but because we aren't auto-disconnecting, we still have this table
+ expect_true(table_three_name %in% DBI::dbListTables(con))
+})
+
+test_that("to_duckdb with a table", {
+ tab <- Table$create(example_data)
+
+ expect_identical(
+ tab %>%
+ to_duckdb() %>%
+ group_by(int > 4) %>%
+ summarise(
+ int_mean = mean(int, na.rm = TRUE),
+ dbl_mean = mean(dbl, na.rm = TRUE)
+ ) %>%
+ collect(),
+ tibble::tibble(
+ "int > 4" = c(FALSE, NA, TRUE),
+ int_mean = c(2, NA, 7.5),
+ dbl_mean = c(2.1, 4.1, 7.3)
+ )
+ )
+})
+
+test_that("to_duckdb passing a connection", {
+ ds <- InMemoryDataset$create(example_data)
+
+ con_separate <- dbConnect(duckdb::duckdb())
+ # we always want to test in parallel
+ dbExecute(con_separate, "PRAGMA threads=2")
+ on.exit(dbDisconnect(con_separate, shutdown = TRUE), add = TRUE)
+
+  # create a table we know is in con_separate to join against
+ new_df <- data.frame(
+ int = 1:10,
+ char = letters[26:17],
+ stringsAsFactors = FALSE
+ )
+ DBI::dbWriteTable(con_separate, "separate_join_table", new_df)
+
+ table_four <- ds %>%
+ select(int, lgl, dbl) %>%
+ to_duckdb(con = con_separate, auto_disconnect = FALSE)
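+  # dbplyr stores the remote table name in its ops tree; pull it out for raw SQL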
+ table_four_name <- table_four$ops$x
+
+ result <- DBI::dbGetQuery(
+ con_separate,
+ paste0(
+ "SELECT * FROM ", table_four_name,
+ " INNER JOIN separate_join_table ",
+ "ON separate_join_table.int = ", table_four_name, ".int"
+ )
+ )
+
+ expect_identical(dim(result), c(9L, 5L))
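+  # example_data$int has an NA where 4 would be, so that row drops out of the inner join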
+ expect_identical(result$char, new_df[new_df$int != 4, ]$char)
+})
diff --git a/src/arrow/r/tests/testthat/test-expression.R b/src/arrow/r/tests/testthat/test-expression.R
new file mode 100644
index 000000000..c4aab718d
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-expression.R
@@ -0,0 +1,128 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("C++ expressions", {
+ skip_if_not_available("dataset")
+ f <- Expression$field_ref("f")
+ expect_identical(f$field_name, "f")
+ g <- Expression$field_ref("g")
+ date <- Expression$scalar(as.Date("2020-01-15"))
+ ts <- Expression$scalar(as.POSIXct("2020-01-17 11:11:11"))
+ i64 <- Expression$scalar(bit64::as.integer64(42))
+ time <- Expression$scalar(hms::hms(56, 34, 12))
+
+ expect_r6_class(f == g, "Expression")
+ expect_r6_class(f == 4, "Expression")
+ expect_r6_class(f == "", "Expression")
+ expect_r6_class(f == NULL, "Expression")
+ expect_r6_class(f == date, "Expression")
+ expect_r6_class(f == i64, "Expression")
+ expect_r6_class(f == time, "Expression")
+ # can't seem to make this work right now because of R Ops.method dispatch
+ # expect_r6_class(f == as.Date("2020-01-15"), "Expression") # nolint
+ expect_r6_class(f == ts, "Expression")
+ expect_r6_class(f <= 2L, "Expression")
+ expect_r6_class(f != FALSE, "Expression")
+ expect_r6_class(f > 4, "Expression")
+ expect_r6_class(f < 4 & f > 2, "Expression")
+ expect_r6_class(f < 4 | f > 2, "Expression")
+ expect_r6_class(!(f < 4), "Expression")
+ expect_output(
+ print(f > 4),
+ "Expression\n(f > 4)",
+ fixed = TRUE
+ )
+ expect_equal(
+ f$type(schema(f = float64())),
+ float64()
+ )
+ expect_equal(
+ (f > 4)$type(schema(f = float64())),
+ bool()
+ )
+ # Interprets that as a list type
+ expect_r6_class(f == c(1L, 2L), "Expression")
+
+ expect_error(
+ Expression$create("add", 1, 2),
+ "Expression arguments must be Expression objects"
+ )
+})
+
+test_that("Field reference expression schemas and types", {
+ x <- Expression$field_ref("x")
+
+ # type() throws error when schema is NULL
+ expect_error(x$type(), "schema")
+
+ # type() returns type when schema is set
+ x$schema <- Schema$create(x = int32())
+ expect_equal(x$type(), int32())
+})
+
+test_that("Scalar expression schemas and types", {
+ # type() works on scalars without setting the schema
+ expect_equal(
+ Expression$scalar("foo")$type(),
+ arrow::string()
+ )
+ expect_equal(
+ Expression$scalar(42L)$type(),
+ int32()
+ )
+})
+
+test_that("Expression schemas and types", {
+ x <- Expression$field_ref("x")
+ y <- Expression$field_ref("y")
+ z <- Expression$scalar(42L)
+
+ # type() throws error when both schemas are unset
+ expect_error(
+ Expression$create("add_checked", x, y)$type(),
+ "schema"
+ )
+
+ # type() throws error when left schema is unset
+ y$schema <- Schema$create(y = float64())
+ expect_error(
+ Expression$create("add_checked", x, y)$type(),
+ "schema"
+ )
+
+ # type() throws error when right schema is unset
+ x$schema <- Schema$create(x = int32())
+ y$schema <- NULL
+ expect_error(
+ Expression$create("add_checked", x, y)$type(),
+ "schema"
+ )
+
+ # type() returns type when both schemas are set
+ y$schema <- Schema$create(y = float64())
+ expect_equal(
+ Expression$create("add_checked", x, y)$type(),
+ float64()
+ )
+
+ # type() returns type when one arg has schema set and one is scalar
+ expect_equal(
+ Expression$create("add_checked", x, z)$type(),
+ int32()
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-feather.R b/src/arrow/r/tests/testthat/test-feather.R
new file mode 100644
index 000000000..136474dea
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-feather.R
@@ -0,0 +1,256 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+feather_file <- tempfile()
+tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10])
+
+test_that("Write a feather file", {
+ tib_out <- write_feather(tib, feather_file)
+ expect_true(file.exists(feather_file))
+ # Input is returned unmodified
+ expect_identical(tib_out, tib)
+})
+
+expect_feather_roundtrip <- function(write_fun) {
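+  # Writes `tib` with `write_fun` both to a file path and to an OutputStream,
+  # then reads it back several ways and checks that every result equals `tib`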
+ tf2 <- normalizePath(tempfile(), mustWork = FALSE)
+ tf3 <- tempfile()
+ on.exit({
+ unlink(tf2)
+ unlink(tf3)
+ })
+
+  # Write two ways; these are the parts that vary between runs
+ write_fun(tib, tf2)
+ expect_true(file.exists(tf2))
+
+ stream <- FileOutputStream$create(tf3)
+ write_fun(tib, stream)
+ stream$close()
+ expect_true(file.exists(tf3))
+
+ # Read both back
+ tab2 <- read_feather(tf2)
+ expect_s3_class(tab2, "data.frame")
+
+ tab3 <- read_feather(tf3)
+ expect_s3_class(tab3, "data.frame")
+
+ # reading directly from arrow::io::MemoryMappedFile
+ tab4 <- read_feather(mmap_open(tf3))
+ expect_s3_class(tab4, "data.frame")
+
+ # reading directly from arrow::io::ReadableFile
+ tab5 <- read_feather(ReadableFile$create(tf3))
+ expect_s3_class(tab5, "data.frame")
+
+ expect_equal(tib, tab2)
+ expect_equal(tib, tab3)
+ expect_equal(tib, tab4)
+ expect_equal(tib, tab5)
+}
+
+test_that("feather read/write round trip", {
+ expect_feather_roundtrip(function(x, f) write_feather(x, f, version = 1))
+ expect_feather_roundtrip(function(x, f) write_feather(x, f, version = 2))
+ expect_feather_roundtrip(function(x, f) write_feather(x, f, chunk_size = 32))
+ if (codec_is_available("lz4")) {
+ expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "lz4"))
+ }
+ if (codec_is_available("zstd")) {
+ expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "zstd"))
+ expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "zstd", compression_level = 3))
+ }
+
+ # Write from Arrow data structures
+ expect_feather_roundtrip(function(x, f) write_feather(RecordBatch$create(x), f))
+ expect_feather_roundtrip(function(x, f) write_feather(Table$create(x), f))
+})
+
+test_that("write_feather option error handling", {
+ tf <- tempfile()
+ expect_false(file.exists(tf))
+ expect_error(
+ write_feather(tib, tf, version = 1, chunk_size = 1024),
+ "Feather version 1 does not support the 'chunk_size' option"
+ )
+ expect_error(
+ write_feather(tib, tf, version = 1, compression = "lz4"),
+ "Feather version 1 does not support the 'compression' option"
+ )
+ expect_error(
+ write_feather(tib, tf, version = 1, compression_level = 1024),
+ "Feather version 1 does not support the 'compression_level' option"
+ )
+ expect_error(
+ write_feather(tib, tf, compression_level = 1024),
+ "Can only specify a 'compression_level' when 'compression' is 'zstd'"
+ )
+ expect_match_arg_error(write_feather(tib, tf, compression = "bz2"))
+ expect_false(file.exists(tf))
+})
+
+test_that("write_feather with invalid input type", {
+ bad_input <- Array$create(1:5)
+ expect_error(
+ write_feather(bad_input, feather_file),
+ regexp = "x must be an object of class 'data.frame', 'RecordBatch', or 'Table', not 'Array'."
+ )
+})
+
+test_that("read_feather supports col_select = <names>", {
+ tab1 <- read_feather(feather_file, col_select = c("x", "y"))
+ expect_s3_class(tab1, "data.frame")
+
+ expect_equal(tib$x, tab1$x)
+ expect_equal(tib$y, tab1$y)
+})
+
+test_that("feather handles col_select = <integer>", {
+ tab1 <- read_feather(feather_file, col_select = 1:2)
+ expect_s3_class(tab1, "data.frame")
+
+ expect_equal(tib$x, tab1$x)
+ expect_equal(tib$y, tab1$y)
+})
+
+test_that("feather handles col_select = <tidyselect helper>", {
+ tab1 <- read_feather(feather_file, col_select = everything())
+ expect_identical(tib, tab1)
+
+ tab2 <- read_feather(feather_file, col_select = starts_with("x"))
+ expect_identical(tab2, tib[, "x", drop = FALSE])
+
+ tab3 <- read_feather(feather_file, col_select = c(starts_with("x"), contains("y")))
+ expect_identical(tab3, tib[, c("x", "y"), drop = FALSE])
+
+ tab4 <- read_feather(feather_file, col_select = -z)
+ expect_identical(tab4, tib[, c("x", "y"), drop = FALSE])
+})
+
+test_that("feather read/write round trip", {
+ tab1 <- read_feather(feather_file, as_data_frame = FALSE)
+ expect_r6_class(tab1, "Table")
+
+ expect_equal(tib, as.data.frame(tab1))
+})
+
+test_that("Read feather from raw vector", {
+ test_raw <- readBin(feather_file, what = "raw", n = 5000)
+ df <- read_feather(test_raw)
+ expect_s3_class(df, "data.frame")
+})
+
+test_that("FeatherReader", {
+ v1 <- tempfile()
+ v2 <- tempfile()
+ on.exit({
+ unlink(v1)
+ unlink(v2)
+ })
+ write_feather(tib, v1, version = 1)
+ write_feather(tib, v2)
+ f1 <- make_readable_file(v1)
+ reader1 <- FeatherReader$create(f1)
+ f1$close()
+ expect_identical(reader1$version, 1L)
+ f2 <- make_readable_file(v2)
+ reader2 <- FeatherReader$create(f2)
+ expect_identical(reader2$version, 2L)
+ f2$close()
+})
+
+test_that("read_feather requires RandomAccessFile and errors nicely otherwise (ARROW-8615)", {
+ skip_if_not_available("gzip")
+ expect_error(
+ read_feather(CompressedInputStream$create(feather_file)),
+ 'file must be a "RandomAccessFile"'
+ )
+})
+
+test_that("read_feather closes connection to file", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ write_feather(tib, sink = tf)
+ expect_true(file.exists(tf))
+ read_feather(tf)
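+  # expect_error(expr, NA) asserts that no error is raised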
+ expect_error(file.remove(tf), NA)
+ expect_false(file.exists(tf))
+})
+
+test_that("Character vectors > 2GB can write to feather", {
+ skip_on_cran()
+ skip_if_not_running_large_memory_tests()
+ df <- tibble::tibble(big = make_big_string())
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ write_feather(df, tf)
+ expect_identical(read_feather(tf), df)
+})
+
+test_that("FeatherReader methods", {
+ # Setup a feather file to use in the test
+ feather_temp <- tempfile()
+ on.exit({
+ unlink(feather_temp)
+ })
+ write_feather(tib, feather_temp)
+ feather_temp_RA <- make_readable_file(feather_temp)
+
+ reader <- FeatherReader$create(feather_temp_RA)
+ feather_temp_RA$close()
+
+ # column_names
+ expect_identical(
+ reader$column_names,
+ c("x", "y", "z")
+ )
+
+ # print method
+ expect_identical(
+ capture.output(print(reader)),
+ # TODO: can we get rows/columns?
+ c("FeatherReader:", "Schema", "x: int32", "y: double", "z: string")
+ )
+})
+
+unlink(feather_file)
+
+ft_file <- test_path("golden-files/data-arrow_2.0.0_lz4.feather")
+
+test_that("Error messages are shown when the compression algorithm lz4 is not found", {
+ msg <- paste0(
+ "NotImplemented: Support for codec 'lz4' not built\nIn order to read this file, ",
+ "you will need to reinstall arrow with additional features enabled.\nSet one of ",
+ "these environment variables before installing:\n\n * LIBARROW_MINIMAL=false ",
+ "(for all optional features, including 'lz4')\n * ARROW_WITH_LZ4=ON (for just 'lz4')",
+ "\n\nSee https://arrow.apache.org/docs/r/articles/install.html for details"
+ )
+
+ if (codec_is_available("lz4")) {
+ d <- read_feather(ft_file)
+ expect_s3_class(d, "data.frame")
+ } else {
+ expect_error(read_feather(ft_file), msg, fixed = TRUE)
+ }
+})
+
+test_that("Error is created when feather reads a parquet file", {
+ expect_error(
+ read_feather(system.file("v0.7.1.parquet", package = "arrow")),
+ "Not a Feather V1 or Arrow IPC file"
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-field.R b/src/arrow/r/tests/testthat/test-field.R
new file mode 100644
index 000000000..1be36c064
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-field.R
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("field() factory", {
+ x <- field("x", int32())
+ expect_equal(x$type, int32())
+ expect_equal(x$name, "x")
+ expect_true(x$nullable)
+ expect_true(x == x)
+ expect_false(x == field("x", int64()))
+})
+
+test_that("Field with nullable values", {
+ x <- field("x", int32(), nullable = FALSE)
+ expect_equal(x$type, int32())
+ expect_false(x$nullable)
+ expect_true(x == x)
+ expect_false(x == field("x", int32()))
+})
+
+test_that("Field validation", {
+ expect_error(schema(b = 32), "b must be a DataType, not numeric")
+})
+
+test_that("Print method for field", {
+ expect_output(print(field("x", int32())), "Field\nx: int32")
+ expect_output(
+ print(field("zz", dictionary())),
+ "Field\nzz: dictionary<values=string, indices=int32>"
+ )
+
+ expect_output(
+ print(field("x", int32(), nullable = FALSE)),
+ "Field\nx: int32 not null"
+ )
+})
+
+test_that("Field to C-interface", {
+ field <- field("x", time32("s"))
+
+ # export the field via the C-interface
+ ptr <- allocate_arrow_schema()
+ field$export_to_c(ptr)
+
+ # then import it and check that the roundtripped value is the same
+ circle <- Field$import_from_c(ptr)
+ expect_equal(circle, field)
+
+ # must clean up the pointer or we leak
+ delete_arrow_schema(ptr)
+})
diff --git a/src/arrow/r/tests/testthat/test-filesystem.R b/src/arrow/r/tests/testthat/test-filesystem.R
new file mode 100644
index 000000000..5ee096f13
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-filesystem.R
@@ -0,0 +1,178 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("LocalFilesystem", {
+ fs <- LocalFileSystem$create()
+ expect_identical(fs$type_name, "local")
+ DESCRIPTION <- system.file("DESCRIPTION", package = "arrow")
+ info <- fs$GetFileInfo(DESCRIPTION)[[1]]
+ expect_equal(info$base_name(), "DESCRIPTION")
+ expect_equal(info$extension(), "")
+ expect_equal(info$type, FileType$File)
+ expect_equal(info$path, DESCRIPTION)
+  local_info <- file.info(DESCRIPTION)
+
+  expect_equal(info$size, local_info$size)
+  # This fails due to a subsecond difference on Appveyor on Windows with R 3.3 only
+  # So add a greater tolerance to allow for that
+  expect_equal(info$mtime, local_info$mtime, tolerance = 1)
+
+ tf <- tempfile(fileext = ".txt")
+ fs$CopyFile(DESCRIPTION, tf)
+ info <- fs$GetFileInfo(tf)[[1]]
+ expect_equal(info$extension(), "txt")
+  expect_equal(info$size, file.size(tf))
+ expect_equal(readLines(DESCRIPTION), readLines(tf))
+
+ tf2 <- tempfile(fileext = ".txt")
+ fs$Move(tf, tf2)
+ infos <- fs$GetFileInfo(c(tf, tf2, dirname(tf)))
+ expect_equal(infos[[1]]$type, FileType$NotFound)
+ expect_equal(infos[[2]]$type, FileType$File)
+ expect_equal(infos[[3]]$type, FileType$Directory)
+
+ fs$DeleteFile(tf2)
+ expect_equal(fs$GetFileInfo(tf2)[[1L]]$type, FileType$NotFound)
+ expect_true(!file.exists(tf2))
+
+ expect_equal(fs$GetFileInfo(tf)[[1L]]$type, FileType$NotFound)
+ expect_true(!file.exists(tf))
+
+ td <- tempfile()
+ fs$CreateDir(td)
+ expect_equal(fs$GetFileInfo(td)[[1L]]$type, FileType$Directory)
+ fs$CopyFile(DESCRIPTION, file.path(td, "DESCRIPTION"))
+ fs$DeleteDirContents(td)
+ expect_equal(length(dir(td)), 0L)
+ fs$DeleteDir(td)
+ expect_equal(fs$GetFileInfo(td)[[1L]]$type, FileType$NotFound)
+
+ tf3 <- tempfile()
+ os <- fs$OpenOutputStream(path = tf3)
+ bytes <- as.raw(1:40)
+ os$write(bytes)
+ os$close()
+
+ is <- fs$OpenInputStream(tf3)
+ buf <- is$Read(40)
+ expect_equal(buf$data(), bytes)
+ is$close()
+})
+
+test_that("SubTreeFilesystem", {
+ dir.create(td <- tempfile())
+ DESCRIPTION <- system.file("DESCRIPTION", package = "arrow")
+ file.copy(DESCRIPTION, file.path(td, "DESCRIPTION"))
+
+ st_fs <- SubTreeFileSystem$create(td)
+ expect_r6_class(st_fs, "SubTreeFileSystem")
+ expect_r6_class(st_fs, "FileSystem")
+ expect_r6_class(st_fs$base_fs, "LocalFileSystem")
+ expect_identical(
+ capture.output(print(st_fs)),
+ paste0("SubTreeFileSystem: ", "file://", st_fs$base_path)
+ )
+
+ # FIXME windows has a trailing slash for one but not the other
+ # expect_identical(normalizePath(st_fs$base_path), normalizePath(td)) # nolint
+
+ st_fs$CreateDir("test")
+ st_fs$CopyFile("DESCRIPTION", "DESC.txt")
+ infos <- st_fs$GetFileInfo(c("DESCRIPTION", "test", "nope", "DESC.txt"))
+ expect_equal(infos[[1L]]$type, FileType$File)
+ expect_equal(infos[[2L]]$type, FileType$Directory)
+ expect_equal(infos[[3L]]$type, FileType$NotFound)
+ expect_equal(infos[[4L]]$type, FileType$File)
+ expect_equal(infos[[4L]]$extension(), "txt")
+
+ local_fs <- LocalFileSystem$create()
+ local_fs$DeleteDirContents(td)
+ infos <- st_fs$GetFileInfo(c("DESCRIPTION", "test", "nope", "DESC.txt"))
+ expect_equal(infos[[1L]]$type, FileType$NotFound)
+ expect_equal(infos[[2L]]$type, FileType$NotFound)
+ expect_equal(infos[[3L]]$type, FileType$NotFound)
+ expect_equal(infos[[4L]]$type, FileType$NotFound)
+})
+
+test_that("LocalFileSystem + Selector", {
+ fs <- LocalFileSystem$create()
+ dir.create(td <- tempfile())
+ writeLines("blah blah", file.path(td, "one.txt"))
+ writeLines("yada yada", file.path(td, "two.txt"))
+ dir.create(file.path(td, "dir"))
+ writeLines("...", file.path(td, "dir", "three.txt"))
+
+ selector <- FileSelector$create(td, recursive = TRUE)
+ infos <- fs$GetFileInfo(selector)
+ expect_equal(length(infos), 4L)
+ types <- sapply(infos, function(.x) .x$type)
+ expect_equal(sum(types == FileType$File), 3L)
+ expect_equal(sum(types == FileType$Directory), 1L)
+
+ selector <- FileSelector$create(td, recursive = FALSE)
+ infos <- fs$GetFileInfo(selector)
+ expect_equal(length(infos), 3L)
+ types <- sapply(infos, function(.x) .x$type)
+ expect_equal(sum(types == FileType$File), 2L)
+ expect_equal(sum(types == FileType$Directory), 1L)
+})
+
+test_that("FileSystem$from_uri", {
+ skip_on_cran()
+ skip_if_not_available("s3")
+ skip_if_offline()
+ fs_and_path <- FileSystem$from_uri("s3://ursa-labs-taxi-data")
+ expect_r6_class(fs_and_path$fs, "S3FileSystem")
+ expect_identical(fs_and_path$fs$region, "us-east-2")
+})
+
+test_that("SubTreeFileSystem$create() with URI", {
+ skip_on_cran()
+ skip_if_not_available("s3")
+ skip_if_offline()
+ fs <- SubTreeFileSystem$create("s3://ursa-labs-taxi-data")
+ expect_r6_class(fs, "SubTreeFileSystem")
+ expect_identical(
+ capture.output(print(fs)),
+ "SubTreeFileSystem: s3://ursa-labs-taxi-data/"
+ )
+})
+
+test_that("S3FileSystem", {
+ skip_on_cran()
+ skip_if_not_available("s3")
+ skip_if_offline()
+ s3fs <- S3FileSystem$create()
+ expect_r6_class(s3fs, "S3FileSystem")
+})
+
+test_that("s3_bucket", {
+ skip_on_cran()
+ skip_if_not_available("s3")
+ skip_if_offline()
+ bucket <- s3_bucket("ursa-labs-r-test")
+ expect_r6_class(bucket, "SubTreeFileSystem")
+ expect_r6_class(bucket$base_fs, "S3FileSystem")
+ expect_identical(bucket$region, "us-west-2")
+ expect_identical(
+ capture.output(print(bucket)),
+ "SubTreeFileSystem: s3://ursa-labs-r-test/"
+ )
+ skip_on_os("windows") # FIXME
+ expect_identical(bucket$base_path, "ursa-labs-r-test/")
+})
diff --git a/src/arrow/r/tests/testthat/test-install-arrow.R b/src/arrow/r/tests/testthat/test-install-arrow.R
new file mode 100644
index 000000000..977f9d77d
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-install-arrow.R
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+r_only({
+ test_that("arrow_repos", {
+ cran <- "https://cloud.r-project.org/"
+ ours <- "https://dl.example.com/ursalabs/fake_repo"
+ other <- "https://cran.fiocruz.br/"
+
+ opts <- list(
+ repos = c(CRAN = "@CRAN@"), # Restore defaul
+ arrow.dev_repo = ours
+ )
+ withr::with_options(opts, {
+ expect_identical(arrow_repos(), cran)
+ expect_identical(arrow_repos(c(cran, ours)), cran)
+ expect_identical(arrow_repos(c(ours, other)), other)
+ expect_identical(arrow_repos(nightly = TRUE), c(ours, cran))
+ expect_identical(arrow_repos(c(cran, ours), nightly = TRUE), c(ours, cran))
+ expect_identical(arrow_repos(c(ours, other), nightly = TRUE), c(ours, other))
+ })
+ })
+})
diff --git a/src/arrow/r/tests/testthat/test-json.R b/src/arrow/r/tests/testthat/test-json.R
new file mode 100644
index 000000000..825511b97
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-json.R
@@ -0,0 +1,255 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("json")
+
+test_that("Can read json file with scalars columns (ARROW-5503)", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ writeLines('
+ { "hello": 3.5, "world": false, "yo": "thing" }
+ { "hello": 3.25, "world": null }
+ { "hello": 3.125, "world": null, "yo": "\u5fcd" }
+ { "hello": 0.0, "world": true, "yo": null }
+ ', tf, useBytes = TRUE)
+
+ tab1 <- read_json_arrow(tf, as_data_frame = FALSE)
+ tab2 <- read_json_arrow(mmap_open(tf), as_data_frame = FALSE)
+ tab3 <- read_json_arrow(ReadableFile$create(tf), as_data_frame = FALSE)
+
+ expect_equal(tab1, tab2)
+ expect_equal(tab1, tab3)
+
+ expect_equal(
+ tab1$schema,
+ schema(hello = float64(), world = boolean(), yo = utf8())
+ )
+ tib <- as.data.frame(tab1)
+ expect_equal(tib$hello, c(3.5, 3.25, 3.125, 0))
+ expect_equal(tib$world, c(FALSE, NA, NA, TRUE))
+ expect_equal(tib$yo, c("thing", NA, "\u5fcd", NA))
+})
+
+test_that("read_json_arrow() converts to tibble", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ writeLines('
+ { "hello": 3.5, "world": false, "yo": "thing" }
+ { "hello": 3.25, "world": null }
+ { "hello": 3.125, "world": null, "yo": "\u5fcd" }
+ { "hello": 0.0, "world": true, "yo": null }
+ ', tf, useBytes = TRUE)
+
+ tab1 <- read_json_arrow(tf)
+ tab2 <- read_json_arrow(mmap_open(tf))
+ tab3 <- read_json_arrow(ReadableFile$create(tf))
+
+ expect_s3_class(tab1, "tbl_df")
+ expect_s3_class(tab2, "tbl_df")
+ expect_s3_class(tab3, "tbl_df")
+
+ expect_equal(tab1, tab2)
+ expect_equal(tab1, tab3)
+
+ expect_equal(tab1$hello, c(3.5, 3.25, 3.125, 0))
+ expect_equal(tab1$world, c(FALSE, NA, NA, TRUE))
+ expect_equal(tab1$yo, c("thing", NA, "\u5fcd", NA))
+})
+
+test_that("read_json_arrow() supports col_select=", {
+ tf <- tempfile()
+ writeLines('
+ { "hello": 3.5, "world": false, "yo": "thing" }
+ { "hello": 3.25, "world": null }
+ { "hello": 3.125, "world": null, "yo": "\u5fcd" }
+ { "hello": 0.0, "world": true, "yo": null }
+ ', tf)
+
+ tab1 <- read_json_arrow(tf, col_select = c(hello, world))
+ expect_equal(names(tab1), c("hello", "world"))
+
+ tab2 <- read_json_arrow(tf, col_select = 1:2)
+ expect_equal(names(tab2), c("hello", "world"))
+})
+
+test_that("read_json_arrow(schema=) with empty schema", {
+ tf <- tempfile()
+ writeLines('
+ { "hello": 3.5, "world": 2, "third_col": 99}
+ { "hello": 3.25, "world": 5, "third_col": 98}
+ { "hello": 3.125, "world": 8, "third_col": 97 }
+ { "hello": 0.0, "world": 10, "third_col": 96}
+ ', tf)
+
+ tab1 <- read_json_arrow(tf, schema = schema())
+
+ expect_identical(
+ tab1,
+ tibble::tibble(
+ hello = c(3.5, 3.25, 3.125, 0),
+ world = c(2L, 5L, 8L, 10L),
+ third_col = c(99L, 98L, 97L, 96L)
+ )
+ )
+})
+
+test_that("read_json_arrow(schema=) with partial schema", {
+ tf <- tempfile()
+ writeLines('
+ { "hello": 3.5, "world": 2, "third_col": 99}
+ { "hello": 3.25, "world": 5, "third_col": 98}
+ { "hello": 3.125, "world": 8, "third_col": 97 }
+ { "hello": 0.0, "world": 10, "third_col": 96}
+ ', tf)
+
+ tab1 <- read_json_arrow(tf, schema = schema(third_col = float64(), world = float64()))
+
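+  # columns named in the schema come first, in schema order; the rest are inferred and appended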
+ expect_identical(
+ tab1,
+ tibble::tibble(
+ third_col = c(99, 98, 97, 96),
+ world = c(2, 5, 8, 10),
+ hello = c(3.5, 3.25, 3.125, 0)
+ )
+ )
+
+ tf2 <- tempfile()
+ writeLines('
+ { "hello": 3.5, "world": 2, "third_col": "99"}
+ { "hello": 3.25, "world": 5, "third_col": "98"}
+ { "hello": 3.125, "world": 8, "third_col": "97"}
+ ', tf2)
+
+ tab2 <- read_json_arrow(tf2, schema = schema(third_col = string(), world = float64()))
+
+ expect_identical(
+ tab2,
+ tibble::tibble(
+ third_col = c("99", "98", "97"),
+ world = c(2, 5, 8),
+ hello = c(3.5, 3.25, 3.125)
+ )
+ )
+})
+
+test_that("read_json_arrow(schema=) with full schema", {
+ tf <- tempfile()
+ writeLines('
+ { "hello": 3.5, "world": 2, "third_col": 99}
+ { "hello": 3.25, "world": 5, "third_col": 98}
+ { "hello": 3.125, "world": 8, "third_col": 97}
+ { "hello": 0.0, "world": 10, "third_col": 96}
+ ', tf)
+
+ tab1 <- read_json_arrow(
+ tf,
+ schema = schema(
+ hello = float64(),
+ third_col = float64(),
+ world = float64()
+ )
+ )
+
+ expect_identical(
+ tab1,
+ tibble::tibble(
+ hello = c(3.5, 3.25, 3.125, 0),
+ third_col = c(99, 98, 97, 96),
+ world = c(2, 5, 8, 10)
+ )
+ )
+})
+
+test_that("Can read json file with nested columns (ARROW-5503)", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ writeLines('
+ { "arr": [1.0, 2.0, 3.0], "nuf": {} }
+ { "arr": [2.0], "nuf": null }
+ { "arr": [], "nuf": { "ps": 78.0, "hello": "hi" } }
+ { "arr": null, "nuf": { "ps": 90.0, "hello": "bonjour" } }
+ { "arr": [5.0], "nuf": { "hello": "ciao" } }
+ { "arr": [5.0, 6.0], "nuf": { "ps": 19 } }
+ ', tf)
+
+ tab1 <- read_json_arrow(tf, as_data_frame = FALSE)
+ tab2 <- read_json_arrow(mmap_open(tf), as_data_frame = FALSE)
+ tab3 <- read_json_arrow(ReadableFile$create(tf), as_data_frame = FALSE)
+
+ expect_equal(tab1, tab2)
+ expect_equal(tab1, tab3)
+
+ expect_equal(
+ tab1$schema,
+ schema(
+ arr = list_of(float64()),
+ nuf = struct(ps = float64(), hello = utf8())
+ )
+ )
+
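+  # column 1 is "nuf"; its first chunk is a StructArray we can inspect field by field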
+ struct_array <- tab1$column(1)$chunk(0)
+ ps <- Array$create(c(NA, NA, 78, 90, NA, 19))
+ hello <- Array$create(c(NA, NA, "hi", "bonjour", "ciao", NA))
+ expect_equal(struct_array$field(0L), ps)
+ expect_equal(struct_array$GetFieldByName("ps"), ps)
+ struct_cols <- struct_array$Flatten()
+ expect_identical(length(struct_cols), 2L)
+ expect_equal(struct_cols[[1]], ps)
+ expect_equal(struct_cols[[2]], hello)
+ expect_equal(
+ as.vector(struct_array),
+ tibble::tibble(ps = ps$as_vector(), hello = hello$as_vector())
+ )
+
+ list_array_r <- list(
+ c(1, 2, 3),
+ c(2),
+ numeric(),
+ NULL,
+ 5,
+ c(5, 6)
+ )
+ list_array <- tab1$column(0)
+ expect_equal(
+ list_array$as_vector(),
+ list_array_r,
+ ignore_attr = TRUE
+ )
+
+ tib <- as.data.frame(tab1)
+ expect_equal(
+ tib,
+ tibble::tibble(
+ arr = list_array_r,
+ nuf = tibble::tibble(ps = ps$as_vector(), hello = hello$as_vector())
+ ),
+ ignore_attr = TRUE
+ )
+})
+
+test_that("Can read json file with list<struct<T...>> nested columns (ARROW-7740)", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ writeLines('
+ {"a":[{"b":1.0},{"b":2.0}]}
+ {"a":[{"b":1.0},{"b":2.0}]}
+ ', tf)
+
+ one <- tibble::tibble(b = c(1, 2))
+ expected <- tibble::tibble(a = c(list(one), list(one)))
+ expect_equal(read_json_arrow(tf), expected, ignore_attr = TRUE)
+})
diff --git a/src/arrow/r/tests/testthat/test-memory-pool.R b/src/arrow/r/tests/testthat/test-memory-pool.R
new file mode 100644
index 000000000..0aa18aadc
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-memory-pool.R
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("default_memory_pool and its attributes", {
+ pool <- default_memory_pool()
+  # Not integer because values can exceed 2 GB, so we cast to double
+ expect_type(pool$bytes_allocated, "double")
+ expect_type(pool$max_memory, "double")
+ expect_true(pool$backend_name %in% c("system", "jemalloc", "mimalloc"))
+
+ expect_true(all(supported_memory_backends() %in% c("system", "jemalloc", "mimalloc")))
+})
diff --git a/src/arrow/r/tests/testthat/test-message-reader.R b/src/arrow/r/tests/testthat/test-message-reader.R
new file mode 100644
index 000000000..44f3fe4f7
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-message-reader.R
@@ -0,0 +1,85 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("MessageReader can be created from raw vectors", {
+ batch <- record_batch(x = 1:10)
+ bytes <- batch$serialize()
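+  # serialize() returns the batch's raw IPC message bytes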
+
+ reader <- MessageReader$create(bytes)
+
+ message <- reader$ReadNextMessage()
+ expect_r6_class(message, "Message")
+ expect_equal(message$type, MessageType$RECORD_BATCH)
+ expect_r6_class(message$body, "Buffer")
+ expect_r6_class(message$metadata, "Buffer")
+
+ message <- reader$ReadNextMessage()
+ expect_null(message)
+
+ schema <- schema(x = int32())
+ bytes <- schema$serialize()
+
+ reader <- MessageReader$create(bytes)
+
+ message <- reader$ReadNextMessage()
+ expect_r6_class(message, "Message")
+ expect_equal(message$type, MessageType$SCHEMA)
+ expect_r6_class(message$body, "Buffer")
+ expect_r6_class(message$metadata, "Buffer")
+
+ message <- reader$ReadNextMessage()
+ expect_null(message)
+})
+
+test_that("MessageReader can be created from input stream", {
+ batch <- record_batch(x = 1:10)
+ bytes <- batch$serialize()
+
+ stream <- BufferReader$create(bytes)
+ expect_r6_class(stream, "BufferReader")
+
+ reader <- MessageReader$create(stream)
+ expect_r6_class(reader, "MessageReader")
+
+ message <- reader$ReadNextMessage()
+ expect_r6_class(message, "Message")
+ expect_equal(message$type, MessageType$RECORD_BATCH)
+ expect_r6_class(message$body, "Buffer")
+ expect_r6_class(message$metadata, "Buffer")
+
+ message <- reader$ReadNextMessage()
+ expect_null(message)
+
+ schema <- schema(x = int32())
+ bytes <- schema$serialize()
+
+ stream <- BufferReader$create(bytes)
+ expect_r6_class(stream, "BufferReader")
+
+ reader <- MessageReader$create(stream)
+ expect_r6_class(reader, "MessageReader")
+
+ message <- reader$ReadNextMessage()
+ expect_r6_class(message, "Message")
+ expect_equal(message$type, MessageType$SCHEMA)
+ expect_r6_class(message$body, "Buffer")
+ expect_r6_class(message$metadata, "Buffer")
+
+ message <- reader$ReadNextMessage()
+ expect_null(message)
+})
diff --git a/src/arrow/r/tests/testthat/test-message.R b/src/arrow/r/tests/testthat/test-message.R
new file mode 100644
index 000000000..c9ee4cb72
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-message.R
@@ -0,0 +1,63 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("read_message can read from input stream", {
+ batch <- record_batch(x = 1:10)
+ bytes <- batch$serialize()
+ stream <- BufferReader$create(bytes)
+
+ message <- read_message(stream)
+ expect_r6_class(message, "Message")
+ expect_equal(message$type, MessageType$RECORD_BATCH)
+ expect_r6_class(message$body, "Buffer")
+ expect_r6_class(message$metadata, "Buffer")
+
+  # the stream is exhausted, so further reads return NULL
+  expect_null(read_message(stream))
+})
+
+test_that("read_message() can read Schema messages", {
+ bytes <- schema(x = int32())$serialize()
+ stream <- BufferReader$create(bytes)
+ message <- read_message(stream)
+
+ expect_r6_class(message, "Message")
+ expect_equal(message$type, MessageType$SCHEMA)
+ expect_r6_class(message$body, "Buffer")
+ expect_r6_class(message$metadata, "Buffer")
+
+  # the stream is exhausted, so further reads return NULL
+  expect_null(read_message(stream))
+})
+
+test_that("read_message() can handle raw vectors", {
+ batch <- record_batch(x = 1:10)
+ bytes <- batch$serialize()
+ stream <- BufferReader$create(bytes)
+
+ message_stream <- read_message(stream)
+ message_raw <- read_message(bytes)
+ expect_equal(message_stream, message_raw)
+
+ bytes <- schema(x = int32())$serialize()
+ stream <- BufferReader$create(bytes)
+ message_stream <- read_message(stream)
+ message_raw <- read_message(bytes)
+
+ expect_equal(message_stream, message_raw)
+})
diff --git a/src/arrow/r/tests/testthat/test-metadata.R b/src/arrow/r/tests/testthat/test-metadata.R
new file mode 100644
index 000000000..4c4d8a767
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-metadata.R
@@ -0,0 +1,369 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("Schema metadata", {
+ s <- schema(b = double())
+ expect_equal(s$metadata, empty_named_list())
+ expect_false(s$HasMetadata)
+ s$metadata <- list(test = TRUE)
+ expect_identical(s$metadata, list(test = "TRUE"))
+ expect_true(s$HasMetadata)
+ s$metadata$foo <- 42
+ expect_identical(s$metadata, list(test = "TRUE", foo = "42"))
+ expect_true(s$HasMetadata)
+ s$metadata$foo <- NULL
+ expect_identical(s$metadata, list(test = "TRUE"))
+ expect_true(s$HasMetadata)
+ s$metadata <- NULL
+ expect_equal(s$metadata, empty_named_list())
+ expect_false(s$HasMetadata)
+ expect_error(
+ s$metadata <- 4,
+ "Key-value metadata must be a named list or character vector"
+ )
+})
+
+test_that("Table metadata", {
+ tab <- Table$create(x = 1:2, y = c("a", "b"))
+ expect_equal(tab$metadata, empty_named_list())
+ tab$metadata <- list(test = TRUE)
+ expect_identical(tab$metadata, list(test = "TRUE"))
+ tab$metadata$foo <- 42
+ expect_identical(tab$metadata, list(test = "TRUE", foo = "42"))
+ tab$metadata$foo <- NULL
+ expect_identical(tab$metadata, list(test = "TRUE"))
+ tab$metadata <- NULL
+ expect_equal(tab$metadata, empty_named_list())
+})
+
+test_that("Table R metadata", {
+ tab <- Table$create(example_with_metadata)
+ expect_output(print(tab$metadata), "arrow_r_metadata")
+ expect_identical(as.data.frame(tab), example_with_metadata)
+})
+
+test_that("R metadata is not stored for types that map to Arrow types (factor, Date, etc.)", {
+ tab <- Table$create(example_data[1:6])
+ expect_null(tab$metadata$r)
+
+ expect_null(Table$create(example_with_times[1:3])$metadata$r)
+})
+
+test_that("classes are not stored for arrow_binary/arrow_large_binary/arrow_fixed_size_binary (ARROW-14140)", {
+ raws <- charToRaw("bonjour")
+
+ binary <- Array$create(list(raws), binary())
+ large_binary <- Array$create(list(raws), large_binary())
+ fixed_size_binary <- Array$create(list(raws), fixed_size_binary(7L))
+
+ expect_null(RecordBatch$create(b = binary)$metadata$r)
+ expect_null(RecordBatch$create(b = large_binary)$metadata$r)
+ expect_null(RecordBatch$create(b = fixed_size_binary)$metadata$r)
+
+ expect_null(Table$create(b = binary)$metadata$r)
+ expect_null(Table$create(b = large_binary)$metadata$r)
+ expect_null(Table$create(b = fixed_size_binary)$metadata$r)
+})
+
+test_that("Garbage R metadata doesn't break things", {
+ tab <- Table$create(example_data[1:6])
+ tab$metadata$r <- "garbage"
+ expect_warning(
+ expect_identical(as.data.frame(tab), example_data[1:6]),
+ "Invalid metadata$r",
+ fixed = TRUE
+ )
+ # serialize data like .serialize_arrow_r_metadata does, but don't call that
+ # directly since it checks to ensure that the data is a list
+ tab$metadata$r <- rawToChar(serialize("garbage", NULL, ascii = TRUE))
+ expect_warning(
+ expect_identical(as.data.frame(tab), example_data[1:6]),
+ "Invalid metadata$r",
+ fixed = TRUE
+ )
+})
+
+test_that("Metadata serialization compression", {
+ # attributes that (when serialized) are just under 100kb are not compressed,
+ # and simply serialized
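+  # (make_string_of_size(n) is a test helper that builds a string of roughly n kb)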
+ strings <- as.list(rep(make_string_of_size(1), 98))
+ small <- .serialize_arrow_r_metadata(strings)
+ expect_equal(
+ object.size(small),
+ object.size(rawToChar(serialize(strings, NULL, ascii = TRUE)))
+ )
+
+ # Large strings will be compressed
+ large_strings <- as.list(rep(make_string_of_size(1), 100))
+ large <- .serialize_arrow_r_metadata(large_strings)
+ expect_lt(
+ object.size(large),
+ object.size(rawToChar(serialize(large_strings, NULL, ascii = TRUE)))
+ )
+ # and this compression ends up being smaller than even the "small" strings
+ expect_lt(object.size(large), object.size(small))
+
+ # However strings where compression + serialization is not effective are no
+ # worse than only serialization alone
+ large_few_strings <- as.list(rep(make_random_string_of_size(50), 2))
+ large_few <- .serialize_arrow_r_metadata(large_few_strings)
+ expect_equal(
+ object.size(large_few),
+ object.size(rawToChar(serialize(large_few_strings, NULL, ascii = TRUE)))
+ )
+
+ # But we can disable compression
+ op <- options(arrow.compress_metadata = FALSE)
+ on.exit(options(op))
+
+ large_strings <- as.list(rep(make_string_of_size(1), 100))
+ large <- .serialize_arrow_r_metadata(large_strings)
+ expect_equal(
+ object.size(large),
+ object.size(rawToChar(serialize(large_strings, NULL, ascii = TRUE)))
+ )
+})
+
+test_that("RecordBatch metadata", {
+ rb <- RecordBatch$create(x = 1:2, y = c("a", "b"))
+ expect_equal(rb$metadata, empty_named_list())
+ rb$metadata <- list(test = TRUE)
+ expect_identical(rb$metadata, list(test = "TRUE"))
+ rb$metadata$foo <- 42
+ expect_identical(rb$metadata, list(test = "TRUE", foo = "42"))
+ rb$metadata$foo <- NULL
+ expect_identical(rb$metadata, list(test = "TRUE"))
+ rb$metadata <- NULL
+ expect_equal(rb$metadata, empty_named_list())
+})
+
+test_that("RecordBatch R metadata", {
+ expect_identical(as.data.frame(record_batch(example_with_metadata)), example_with_metadata)
+})
+
+test_that("R metadata roundtrip via parquet", {
+ skip_if_not_available("parquet")
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write_parquet(example_with_metadata, tf)
+ expect_identical(read_parquet(tf), example_with_metadata)
+})
+
+test_that("R metadata roundtrip via feather", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write_feather(example_with_metadata, tf)
+ expect_identical(read_feather(tf), example_with_metadata)
+})
+
+test_that("haven types roundtrip via feather", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write_feather(haven_data, tf)
+ expect_identical(read_feather(tf), haven_data)
+})
+
+test_that("Date/time type roundtrip", {
+ rb <- record_batch(example_with_times)
+ expect_r6_class(rb$schema$posixlt$type, "StructType")
+ expect_identical(as.data.frame(rb), example_with_times)
+})
+
+test_that("metadata keeps attribute of top level data frame", {
+ df <- structure(data.frame(x = 1, y = 2), foo = "bar")
+ tab <- Table$create(df)
+ expect_identical(attr(as.data.frame(tab), "foo"), "bar")
+ expect_identical(as.data.frame(tab), df)
+})
+
+
+test_that("metadata drops readr's problems attribute", {
+ readr_like <- tibble::tibble(
+ dbl = 1.1,
+ not_here = NA_character_
+ )
+ attributes(readr_like) <- append(
+ attributes(readr_like),
+ list(problems = tibble::tibble(
+ row = 1L,
+ col = NA_character_,
+ expected = "2 columns",
+ actual = "1 columns",
+ file = "'test'"
+ ))
+ )
+
+ tab <- Table$create(readr_like)
+ expect_null(attr(as.data.frame(tab), "problems"))
+})
+
+test_that("Row-level metadata (does not by default) roundtrip", {
+ # First tracked at ARROW-10386, though it was later determined that row-level
+ # metadata should be handled separately ARROW-14020, ARROW-12542
+ df <- data.frame(x = I(list(structure(1, foo = "bar"), structure(2, baz = "qux"))))
+ tab <- Table$create(df)
+ r_metadata <- tab$r_metadata
+ expect_type(r_metadata, "list")
+ expect_null(r_metadata$columns$x$columns)
+
+ # But we can re-enable this / read data that has already been written with
+ # row-level metadata
+ withr::with_options(
+ list("arrow.preserve_row_level_metadata" = TRUE), {
+ tab <- Table$create(df)
+ expect_identical(attr(as.data.frame(tab)$x[[1]], "foo"), "bar")
+ expect_identical(attr(as.data.frame(tab)$x[[2]], "baz"), "qux")
+ })
+})
+
+
+test_that("Row-level metadata (does not) roundtrip in datasets", {
+ # First tracked at ARROW-10386, though it was later determined that row-level
+ # metadata should be handled separately ARROW-14020, ARROW-12542
+ skip_if_not_available("dataset")
+ skip_if_not_available("parquet")
+
+ library(dplyr, warn.conflicts = FALSE)
+
+ df <- tibble::tibble(
+ metadata = list(
+ structure(1, my_value_as_attr = 1),
+ structure(2, my_value_as_attr = 2),
+ structure(3, my_value_as_attr = 3),
+ structure(4, my_value_as_attr = 3)
+ ),
+ int = 1L:4L,
+ part = c(1, 3, 2, 1)
+ )
+
+ dst_dir <- make_temp_dir()
+
+ withr::with_options(
+ list("arrow.preserve_row_level_metadata" = TRUE), {
+ expect_warning(
+ write_dataset(df, dst_dir, partitioning = "part"),
+ "Row-level metadata is not compatible with datasets and will be discarded"
+ )
+
+ # Reset directory as previous write will have created some files and the default
+ # behavior is to error on existing
+ dst_dir <- make_temp_dir()
+ # but we need to write a dataset with row-level metadata to make sure when
+ # reading ones that have been written with them we warn appropriately
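+      # (calling write_dataset under another name sidesteps its call-stack check,
+      # so the row-level metadata actually gets written this time)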
+ fake_func_name <- write_dataset
+ fake_func_name(df, dst_dir, partitioning = "part")
+
+ ds <- open_dataset(dst_dir)
+ expect_warning(
+ df_from_ds <- collect(ds),
+ "Row-level metadata is not compatible with this operation and has been ignored"
+ )
+ expect_equal(
+ arrange(df_from_ds, int),
+ arrange(df, int),
+ ignore_attr = TRUE
+ )
+
+ # however there is *no* warning if we don't select the metadata column
+ expect_warning(
+ df_from_ds <- ds %>% select(int) %>% collect(),
+ NA
+ )
+ })
+})
+
+test_that("When we encounter SF cols, we warn", {
+ df <- data.frame(x = I(list(structure(1, foo = "bar"), structure(2, baz = "qux"))))
+ class(df$x) <- c("sfc_MULTIPOLYGON", "sfc", "list")
+
+ expect_warning(
+ tab <- Table$create(df),
+ "One of the columns given appears to be an"
+ )
+
+ # but the table was read fine, just sans (row-level) metadata
+ r_metadata <- .unserialize_arrow_r_metadata(tab$metadata$r)
+ expect_null(r_metadata$columns$x$columns)
+
+ # But we can re-enable this / read data that has already been written with
+ # row-level metadata without a warning
+ withr::with_options(
+ list("arrow.preserve_row_level_metadata" = TRUE), {
+ expect_warning(tab <- Table$create(df), NA)
+ expect_identical(attr(as.data.frame(tab)$x[[1]], "foo"), "bar")
+ expect_identical(attr(as.data.frame(tab)$x[[2]], "baz"), "qux")
+ })
+})
+
+test_that("dplyr with metadata", {
+ skip_if_not_available("dataset")
+
+ compare_dplyr_binding(
+ .input %>%
+ collect(),
+ example_with_metadata
+ )
+ compare_dplyr_binding(
+ .input %>%
+ select(a) %>%
+ collect(),
+ example_with_metadata
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(z = b * 4) %>%
+ select(z, a) %>%
+ collect(),
+ example_with_metadata
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(z = nchar(a)) %>%
+ select(z, a) %>%
+ collect(),
+ example_with_metadata
+ )
+ # dplyr drops top-level attributes if you do summarize, though attributes
+ # of grouping columns appear to come through
+ compare_dplyr_binding(
+ .input %>%
+ group_by(a) %>%
+ summarize(n()) %>%
+ collect(),
+ example_with_metadata
+ )
+ # Same name in output but different data, so the column metadata shouldn't
+ # carry through
+ compare_dplyr_binding(
+ .input %>%
+ mutate(a = nchar(a)) %>%
+ select(a) %>%
+ collect(),
+ example_with_metadata
+ )
+})
+
+test_that("grouped_df metadata is recorded (efficiently)", {
+ grouped <- group_by(tibble(a = 1:2, b = 3:4), a)
+ expect_s3_class(grouped, "grouped_df")
+ grouped_tab <- Table$create(grouped)
+ expect_r6_class(grouped_tab, "Table")
+ expect_equal(grouped_tab$r_metadata$attributes$.group_vars, "a")
+})
diff --git a/src/arrow/r/tests/testthat/test-na-omit.R b/src/arrow/r/tests/testthat/test-na-omit.R
new file mode 100644
index 000000000..fafebb4ff
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-na-omit.R
@@ -0,0 +1,94 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+data_no_na <- 2:10
+data_na <- c(data_no_na, NA_real_)
+
+test_that("na.fail on Scalar", {
+ scalar_na <- Scalar$create(NA)
+ scalar_one <- Scalar$create(1)
+ expect_as_vector(na.fail(scalar_one), 1)
+ expect_error(na.fail(scalar_na), "missing values in object")
+})
+
+test_that("na.omit on Array and ChunkedArray", {
+ compare_expression(na.omit(.input), data_no_na)
+ compare_expression(na.omit(.input), data_na, ignore_attr = TRUE)
+})
+
+test_that("na.exclude on Array and ChunkedArray", {
+ compare_expression(na.exclude(.input), data_no_na)
+ compare_expression(na.exclude(.input), data_na, ignore_attr = TRUE)
+})
+
+test_that("na.fail on Array and ChunkedArray", {
+ compare_expression(na.fail(.input), data_no_na, ignore_attr = TRUE)
+ compare_expression_error(na.fail(.input), data_na)
+})
+
+test_that("na.fail on Scalar", {
+  scalar_na <- Scalar$create(NA)
+  scalar_one <- Scalar$create(1)
+ expect_error(na.fail(scalar_na), regexp = "missing values in object")
+ expect_as_vector(na.fail(scalar_one), na.fail(1))
+})
+
+test_that("na.omit on Table", {
+ tbl <- Table$create(example_data)
+ expect_equal(
+ as.data.frame(na.omit(tbl)),
+ na.omit(example_data),
+ # We don't include an attribute with the rows omitted
+ ignore_attr = "na.action"
+ )
+})
+
+test_that("na.exclude on Table", {
+ tbl <- Table$create(example_data)
+ expect_equal(
+ as.data.frame(na.exclude(tbl)),
+ na.exclude(example_data),
+ ignore_attr = "na.action"
+ )
+})
+
+test_that("na.fail on Table", {
+ tbl <- Table$create(example_data)
+ expect_error(na.fail(tbl), "missing values in object")
+})
+
+test_that("na.omit on RecordBatch", {
+ batch <- record_batch(example_data)
+ expect_equal(
+ as.data.frame(na.omit(batch)),
+ na.omit(example_data),
+ ignore_attr = "na.action"
+ )
+})
+
+test_that("na.exclude on RecordBatch", {
+ batch <- record_batch(example_data)
+ expect_equal(
+ as.data.frame(na.exclude(batch)),
+    na.exclude(example_data),
+ ignore_attr = "na.action"
+ )
+})
+
+test_that("na.fail on RecordBatch", {
+ batch <- record_batch(example_data)
+ expect_error(na.fail(batch), "missing values in object")
+})
diff --git a/src/arrow/r/tests/testthat/test-parquet.R b/src/arrow/r/tests/testthat/test-parquet.R
new file mode 100644
index 000000000..55d86b532
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-parquet.R
@@ -0,0 +1,274 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+skip_if_not_available("parquet")
+
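+# A Parquet file written by Arrow 0.7.1 ships with the package so we can check
+# that current versions still read old files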
+pq_file <- system.file("v0.7.1.parquet", package = "arrow")
+
+test_that("reading a known Parquet file to tibble", {
+ skip_if_not_available("snappy")
+ df <- read_parquet(pq_file)
+ expect_true(tibble::is_tibble(df))
+ expect_identical(dim(df), c(10L, 11L))
+ # TODO: assert more about the contents
+})
+
+test_that("simple int column roundtrip", {
+ df <- tibble::tibble(x = 1:5)
+  pq_tmp_file <- tempfile() # a .parquet extension could be added here, but it isn't required
+
+ write_parquet(df, pq_tmp_file)
+ df_read <- read_parquet(pq_tmp_file)
+ expect_equal(df, df_read)
+ # Make sure file connection is cleaned up
+ expect_error(file.remove(pq_tmp_file), NA)
+ expect_false(file.exists(pq_tmp_file))
+})
+
+test_that("read_parquet() supports col_select", {
+ skip_if_not_available("snappy")
+ df <- read_parquet(pq_file, col_select = c(x, y, z))
+ expect_equal(names(df), c("x", "y", "z"))
+
+ df <- read_parquet(pq_file, col_select = starts_with("c"))
+ expect_equal(names(df), c("carat", "cut", "color", "clarity"))
+})
+
+test_that("read_parquet() with raw data", {
+ skip_if_not_available("snappy")
+ test_raw <- readBin(pq_file, what = "raw", n = 5000)
+ df <- read_parquet(test_raw)
+ expect_identical(dim(df), c(10L, 11L))
+})
+
+test_that("write_parquet() handles various compression= specs", {
+ skip_if_not_available("snappy")
+ tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5)
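+
+  # compression= accepts a single codec (applied to every column), one codec
+  # per column, or a named vector mapping column names to codecs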
+
+ expect_parquet_roundtrip(tab, compression = "snappy")
+ expect_parquet_roundtrip(tab, compression = rep("snappy", 3L))
+ expect_parquet_roundtrip(tab, compression = c(x1 = "snappy", x2 = "snappy"))
+})
+
+test_that("write_parquet() handles various compression_level= specs", {
+ skip_if_not_available("gzip")
+ tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5)
+
+ expect_parquet_roundtrip(tab, compression = "gzip", compression_level = 4)
+ expect_parquet_roundtrip(tab, compression = "gzip", compression_level = rep(4L, 3L))
+ expect_parquet_roundtrip(tab, compression = "gzip", compression_level = c(x1 = 5L, x2 = 3L))
+})
+
+test_that("write_parquet() handles various use_dictionary= specs", {
+ tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5)
+
+ expect_parquet_roundtrip(tab, use_dictionary = TRUE)
+ expect_parquet_roundtrip(tab, use_dictionary = c(TRUE, FALSE, TRUE))
+ expect_parquet_roundtrip(tab, use_dictionary = c(x1 = TRUE, x2 = TRUE))
+ expect_error(
+ write_parquet(tab, tempfile(), use_dictionary = c(TRUE, FALSE)),
+ "unsupported use_dictionary= specification"
+ )
+ expect_error(
+ write_parquet(tab, tempfile(), use_dictionary = 12),
+ "is.logical(use_dictionary) is not TRUE",
+ fixed = TRUE
+ )
+})
+
+test_that("write_parquet() handles various write_statistics= specs", {
+ tab <- Table$create(x1 = 1:5, x2 = 1:5, y = 1:5)
+
+ expect_parquet_roundtrip(tab, write_statistics = TRUE)
+ expect_parquet_roundtrip(tab, write_statistics = c(TRUE, FALSE, TRUE))
+ expect_parquet_roundtrip(tab, write_statistics = c(x1 = TRUE, x2 = TRUE))
+})
+
+test_that("write_parquet() accepts RecordBatch too", {
+ batch <- RecordBatch$create(x1 = 1:5, x2 = 1:5, y = 1:5)
+ tab <- parquet_roundtrip(batch)
+ expect_equal(tab, Table$create(batch))
+})
+
+test_that("write_parquet() handles grouped_df", {
+ library(dplyr, warn.conflicts = FALSE)
+ df <- tibble::tibble(a = 1:4, b = 5) %>% group_by(b)
+ # Since `df` is a "grouped_df", this test asserts that we get a grouped_df back
+ expect_parquet_roundtrip(df, as_data_frame = TRUE)
+})
+
+test_that("write_parquet() with invalid input type", {
+ bad_input <- Array$create(1:5)
+ expect_error(
+ write_parquet(bad_input, tempfile()),
+ regexp = "x must be an object of class 'data.frame', 'RecordBatch', or 'Table', not 'Array'."
+ )
+})
+
+test_that("write_parquet() can truncate timestamps", {
+ tab <- Table$create(x1 = as.POSIXct("2020/06/03 18:00:00", tz = "UTC"))
+ expect_type_equal(tab$x1, timestamp("us", "UTC"))
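+
+  # coerce_timestamps casts timestamps to the requested unit on write; values
+  # that would lose sub-unit precision raise an error unless
+  # allow_truncated_timestamps = TRUE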
+
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write_parquet(tab, tf, coerce_timestamps = "ms", allow_truncated_timestamps = TRUE)
+ new <- read_parquet(tf, as_data_frame = FALSE)
+ expect_type_equal(new$x1, timestamp("ms", "UTC"))
+ expect_equal(as.data.frame(tab), as.data.frame(new))
+})
+
+test_that("make_valid_version()", {
+ expect_equal(make_valid_version("1.0"), ParquetVersionType$PARQUET_1_0)
+ expect_equal(make_valid_version("2.0"), ParquetVersionType$PARQUET_2_0)
+
+ expect_equal(make_valid_version(1), ParquetVersionType$PARQUET_1_0)
+ expect_equal(make_valid_version(2), ParquetVersionType$PARQUET_2_0)
+
+ expect_equal(make_valid_version(1.0), ParquetVersionType$PARQUET_1_0)
+ expect_equal(make_valid_version(2.0), ParquetVersionType$PARQUET_2_0)
+})
+
+test_that("write_parquet() defaults to snappy compression", {
+ skip_if_not_available("snappy")
+ tmp1 <- tempfile()
+ tmp2 <- tempfile()
+ write_parquet(mtcars, tmp1)
+ write_parquet(mtcars, tmp2, compression = "snappy")
+ expect_equal(file.size(tmp1), file.size(tmp2))
+})
+
+test_that("Factors are preserved when writing/reading from Parquet", {
+ fct <- factor(c("a", "b"), levels = c("c", "a", "b"))
+ ord <- factor(c("a", "b"), levels = c("c", "a", "b"), ordered = TRUE)
+ chr <- c("a", "b")
+ df <- tibble::tibble(fct = fct, ord = ord, chr = chr)
+
+ pq_tmp_file <- tempfile()
+ on.exit(unlink(pq_tmp_file))
+
+ write_parquet(df, pq_tmp_file)
+ df_read <- read_parquet(pq_tmp_file)
+ expect_equal(df, df_read)
+})
+
+test_that("Lists are preserved when writing/reading from Parquet", {
+ bool <- list(logical(0), NA, c(TRUE, FALSE))
+ int <- list(integer(0), NA_integer_, 1:4)
+ num <- list(numeric(0), NA_real_, c(1, 2))
+ char <- list(character(0), NA_character_, c("itsy", "bitsy"))
+ df <- tibble::tibble(bool = bool, int = int, num = num, char = char)
+
+ pq_tmp_file <- tempfile()
+ on.exit(unlink(pq_tmp_file))
+
+ write_parquet(df, pq_tmp_file)
+ df_read <- read_parquet(pq_tmp_file)
+ expect_equal(df, df_read, ignore_attr = TRUE)
+})
+
+test_that("write_parquet() to stream", {
+ df <- tibble::tibble(x = 1:5)
+ tf <- tempfile()
+ con <- FileOutputStream$create(tf)
+ on.exit(unlink(tf))
+ write_parquet(df, con)
+ con$close()
+ expect_equal(read_parquet(tf), df)
+})
+
+test_that("write_parquet() returns its input", {
+ df <- tibble::tibble(x = 1:5)
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ df_out <- write_parquet(df, tf)
+ expect_equal(df, df_out)
+})
+
+test_that("write_parquet() handles version argument", {
+ df <- tibble::tibble(x = 1:5)
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ purrr::walk(list("1.0", "2.0", 1.0, 2.0, 1L, 2L), ~ {
+ write_parquet(df, tf, version = .x)
+ expect_identical(read_parquet(tf), df)
+ })
+ purrr::walk(list("3.0", 3.0, 3L, "A"), ~ {
+ expect_error(write_parquet(df, tf, version = .x))
+ })
+})
+
+test_that("ParquetFileWriter raises an error for non-OutputStream sink", {
+ sch <- schema(a = float32())
+ # ARROW-9946
+ expect_error(
+ ParquetFileWriter$create(schema = sch, sink = tempfile()),
+ regex = "OutputStream"
+ )
+})
+
+test_that("ParquetFileReader $ReadRowGroup(s) methods", {
+ tab <- Table$create(x = 1:100)
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ write_parquet(tab, tf, chunk_size = 10)
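+  # chunk_size = 10 splits the 100 rows into 10 row groups, which the methods
+  # below address by 0-based index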
+
+ reader <- ParquetFileReader$create(tf)
+ expect_true(reader$ReadRowGroup(0) == Table$create(x = 1:10))
+ expect_true(reader$ReadRowGroup(9) == Table$create(x = 91:100))
+ expect_error(reader$ReadRowGroup(-1), "Some index in row_group_indices")
+ expect_error(reader$ReadRowGroup(111), "Some index in row_group_indices")
+ expect_error(reader$ReadRowGroup(c(1, 2)))
+ expect_error(reader$ReadRowGroup("a"))
+
+ expect_true(reader$ReadRowGroups(c(0, 1)) == Table$create(x = 1:20))
+ expect_error(reader$ReadRowGroups(c(0, 1, -2))) # although it gives a weird error
+ expect_error(reader$ReadRowGroups(c(0, 1, 31))) # ^^
+ expect_error(reader$ReadRowGroups(c("a", "b")))
+
+ ## -- with column_indices
+ expect_true(reader$ReadRowGroup(0, 0) == Table$create(x = 1:10))
+ expect_error(reader$ReadRowGroup(0, 1))
+
+ expect_true(reader$ReadRowGroups(c(0, 1), 0) == Table$create(x = 1:20))
+ expect_error(reader$ReadRowGroups(c(0, 1), 1))
+})
+
+test_that("Error messages are shown when the compression algorithm snappy is not found", {
+ msg <- paste0(
+ "NotImplemented: Support for codec 'snappy' not built\nIn order to read this file, ",
+ "you will need to reinstall arrow with additional features enabled.\nSet one of these ",
+ "environment variables before installing:\n\n * LIBARROW_MINIMAL=false (for all optional ",
+ "features, including 'snappy')\n * ARROW_WITH_SNAPPY=ON (for just 'snappy')\n\n",
+ "See https://arrow.apache.org/docs/r/articles/install.html for details"
+ )
+
+ if (codec_is_available("snappy")) {
+ d <- read_parquet(pq_file)
+ expect_s3_class(d, "data.frame")
+ } else {
+ expect_error(read_parquet(pq_file), msg, fixed = TRUE)
+ }
+})
+
+test_that("Error is created when parquet reads a feather file", {
+ expect_error(
+ read_parquet(test_path("golden-files/data-arrow_2.0.0_lz4.feather")),
+ "Parquet magic bytes not found in footer"
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-python-flight.R b/src/arrow/r/tests/testthat/test-python-flight.R
new file mode 100644
index 000000000..c87f3a562
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-python-flight.R
@@ -0,0 +1,62 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Assumes:
+# * We've already done arrow::install_pyarrow()
+# * R -e 'arrow::load_flight_server("demo_flight_server")$DemoFlightServer(port = 8089)$serve()'
+# TODO: set up CI job to test this, or some way of running a background process
+if (process_is_running("demo_flight_server")) {
+ client <- flight_connect(port = 8089)
+ flight_obj <- tempfile()
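+  # tempfile() is only used to generate a unique flight path name here; no
+  # file is created on disk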
+
+ test_that("flight_path_exists", {
+ expect_false(flight_path_exists(client, flight_obj))
+ expect_false(flight_obj %in% list_flights(client))
+ })
+
+ test_that("flight_put", {
+ flight_put(client, example_data, path = flight_obj)
+ expect_true(flight_path_exists(client, flight_obj))
+ expect_true(flight_obj %in% list_flights(client))
+ })
+
+ test_that("flight_get", {
+ expect_identical(as.data.frame(flight_get(client, flight_obj)), example_data)
+ })
+
+ test_that("flight_put with RecordBatch", {
+ flight_obj2 <- tempfile()
+ flight_put(client, RecordBatch$create(example_data), path = flight_obj2)
+ expect_identical(as.data.frame(flight_get(client, flight_obj2)), example_data)
+ })
+
+ test_that("flight_put with overwrite = FALSE", {
+ expect_error(
+ flight_put(client, example_with_times, path = flight_obj, overwrite = FALSE),
+ "exists"
+ )
+ # Default is TRUE so this will overwrite
+ flight_put(client, example_with_times, path = flight_obj)
+ expect_identical(as.data.frame(flight_get(client, flight_obj)), example_with_times)
+ })
+} else {
+ # Kinda hacky, let's put a skipped test here, just so we note that the tests
+ # didn't run
+ test_that("Flight tests", {
+ skip("Flight server is not running")
+ })
+}
diff --git a/src/arrow/r/tests/testthat/test-python.R b/src/arrow/r/tests/testthat/test-python.R
new file mode 100644
index 000000000..5ad7513fb
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-python.R
@@ -0,0 +1,145 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("install_pyarrow", {
+ skip_on_cran()
+ skip_if_offline()
+ skip_if_not_dev_mode()
+ # Windows CI machine doesn't pick up the right python or something
+ skip_on_os("windows")
+ skip_if_not_installed("reticulate")
+
+ venv <- try(reticulate::virtualenv_create("arrow-test"))
+ # Bail out if virtualenv isn't available
+ skip_if(inherits(venv, "try-error"))
+ expect_error(install_pyarrow("arrow-test", nightly = TRUE), NA)
+ # Set this up for the following tests
+ reticulate::use_virtualenv("arrow-test")
+})
+
+skip_if_no_pyarrow()
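+
+# The tests below exercise reticulate's two-way conversion between R arrow
+# objects and pyarrow objects; convert = FALSE keeps Python results
+# unconverted so we can assert on their pyarrow classes before converting back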
+
+test_that("Array from Python", {
+ pa <- reticulate::import("pyarrow")
+ py <- pa$array(c(1, 2, 3))
+ expect_equal(py, Array$create(c(1, 2, 3)))
+})
+
+test_that("Array to Python", {
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ r <- Array$create(c(1, 2, 3))
+ py <- pa$concat_arrays(list(r))
+ expect_s3_class(py, "pyarrow.lib.Array")
+ expect_equal(reticulate::py_to_r(py), r)
+})
+
+test_that("RecordBatch to/from Python", {
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ batch <- record_batch(col1 = c(1, 2, 3), col2 = letters[1:3])
+ py <- reticulate::r_to_py(batch)
+ expect_s3_class(py, "pyarrow.lib.RecordBatch")
+ expect_equal(reticulate::py_to_r(py), batch)
+})
+
+test_that("Table and ChunkedArray from Python", {
+ pa <- reticulate::import("pyarrow", convert = FALSE)
+ batch <- record_batch(col1 = c(1, 2, 3), col2 = letters[1:3])
+ tab <- Table$create(batch, batch)
+ pybatch <- reticulate::r_to_py(batch)
+ pytab <- pa$Table$from_batches(list(pybatch, pybatch))
+ expect_s3_class(pytab, "pyarrow.lib.Table")
+ expect_s3_class(pytab[0], "pyarrow.lib.ChunkedArray")
+ expect_equal(reticulate::py_to_r(pytab[0]), tab$col1)
+ expect_equal(reticulate::py_to_r(pytab), tab)
+})
+
+test_that("Table and ChunkedArray to Python", {
+ batch <- record_batch(col1 = c(1, 2, 3), col2 = letters[1:3])
+ tab <- Table$create(batch, batch)
+
+ pychunked <- reticulate::r_to_py(tab$col1)
+ expect_s3_class(pychunked, "pyarrow.lib.ChunkedArray")
+ expect_equal(reticulate::py_to_r(pychunked), tab$col1)
+
+ pytab <- reticulate::r_to_py(tab)
+ expect_s3_class(pytab, "pyarrow.lib.Table")
+ expect_equal(reticulate::py_to_r(pytab), tab)
+})
+
+test_that("RecordBatch with metadata roundtrip", {
+ batch <- RecordBatch$create(example_with_times)
+ pybatch <- reticulate::r_to_py(batch)
+ expect_s3_class(pybatch, "pyarrow.lib.RecordBatch")
+ expect_equal(reticulate::py_to_r(pybatch), batch)
+ expect_identical(as.data.frame(reticulate::py_to_r(pybatch)), example_with_times)
+})
+
+test_that("Table with metadata roundtrip", {
+ tab <- Table$create(example_with_times)
+ pytab <- reticulate::r_to_py(tab)
+ expect_s3_class(pytab, "pyarrow.lib.Table")
+ expect_equal(reticulate::py_to_r(pytab), tab)
+ expect_identical(as.data.frame(reticulate::py_to_r(pytab)), example_with_times)
+})
+
+test_that("DataType roundtrip", {
+ r <- timestamp("ms", timezone = "Pacific/Marquesas")
+ py <- reticulate::r_to_py(r)
+ expect_s3_class(py, "pyarrow.lib.DataType")
+ expect_equal(reticulate::py_to_r(py), r)
+})
+
+test_that("Field roundtrip", {
+ r <- field("x", time32("s"))
+ py <- reticulate::r_to_py(r)
+ expect_s3_class(py, "pyarrow.lib.Field")
+ expect_equal(reticulate::py_to_r(py), r)
+})
+
+test_that("RecordBatchReader to python", {
+ library(dplyr)
+
+ tab <- Table$create(example_data)
+ scan <- tab %>%
+ select(int, lgl) %>%
+ filter(int > 6) %>%
+ Scanner$create()
+ reader <- scan$ToRecordBatchReader()
+ pyreader <- reticulate::r_to_py(reader)
+ expect_s3_class(pyreader, "pyarrow.lib.RecordBatchReader")
+ pytab <- pyreader$read_all()
+ expect_s3_class(pytab, "pyarrow.lib.Table")
+ back_to_r <- reticulate::py_to_r(pytab)
+ expect_r6_class(back_to_r, "Table")
+ expect_identical(
+ as.data.frame(back_to_r),
+ example_data %>%
+ select(int, lgl) %>%
+ filter(int > 6)
+ )
+})
+
+test_that("RecordBatchReader from python", {
+ tab <- Table$create(example_data)
+ scan <- Scanner$create(tab)
+ reader <- scan$ToRecordBatchReader()
+ pyreader <- reticulate::r_to_py(reader)
+ back_to_r <- reticulate::py_to_r(pyreader)
+ rt_table <- back_to_r$read_table()
+ expect_r6_class(rt_table, "Table")
+ expect_identical(as.data.frame(rt_table), example_data)
+})
diff --git a/src/arrow/r/tests/testthat/test-read-record-batch.R b/src/arrow/r/tests/testthat/test-read-record-batch.R
new file mode 100644
index 000000000..ba109da6c
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-read-record-batch.R
@@ -0,0 +1,78 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("RecordBatchFileWriter / RecordBatchFileReader roundtrips", {
+ tab <- Table$create(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10]
+ )
+
+ tf <- tempfile()
+ expect_error(
+ RecordBatchFileWriter$create(tf, tab$schema),
+ "RecordBatchFileWriter$create() requires an Arrow InputStream. Try providing FileOutputStream$create(tf)",
+ fixed = TRUE
+ )
+
+ stream <- FileOutputStream$create(tf)
+ writer <- RecordBatchFileWriter$create(stream, tab$schema)
+ expect_r6_class(writer, "RecordBatchWriter")
+ writer$write_table(tab)
+ writer$close()
+ stream$close()
+
+ expect_equal(read_feather(tf, as_data_frame = FALSE), tab)
+ # Make sure connections are closed
+ expect_error(file.remove(tf), NA)
+ skip_on_os("windows") # This should pass, we've closed the stream
+ expect_false(file.exists(tf))
+})
+
+test_that("record_batch() handles (raw|Buffer|InputStream, Schema) (ARROW-3450, ARROW-3505)", {
+ tbl <- tibble::tibble(
+ int = 1:10, dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
+ chr = letters[1:10]
+ )
+ batch <- record_batch(!!!tbl)
+ schema <- batch$schema
+
+ raw <- batch$serialize()
+ batch2 <- record_batch(raw, schema = schema)
+ batch3 <- record_batch(buffer(raw), schema = schema)
+  stream <- BufferReader$create(raw)
+  batch4 <- record_batch(stream, schema = schema)
+  stream$close()
+
+  expect_equal(batch, batch2)
+  expect_equal(batch, batch3)
+  expect_equal(batch, batch4)
+})
+
+test_that("record_batch() can handle (Message, Schema) parameters (ARROW-3499)", {
+ batch <- record_batch(x = 1:10)
+ schema <- batch$schema
+
+ raw <- batch$serialize()
+ stream <- BufferReader$create(raw)
+
+ message <- read_message(stream)
+ batch2 <- record_batch(message, schema = schema)
+ expect_equal(batch, batch2)
+ stream$close()
+})
diff --git a/src/arrow/r/tests/testthat/test-read-write.R b/src/arrow/r/tests/testthat/test-read-write.R
new file mode 100644
index 000000000..66f6db56d
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-read-write.R
@@ -0,0 +1,125 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("table round trip", {
+ tbl <- tibble::tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ raw = as.raw(1:10)
+ )
+
+ tab <- Table$create(!!!tbl)
+ expect_equal(tab$num_columns, 3L)
+ expect_equal(tab$num_rows, 10L)
+
+ # ChunkedArray
+ chunked_array_int <- tab$column(0)
+ expect_equal(chunked_array_int$length(), 10L)
+ expect_equal(chunked_array_int$null_count, 0L)
+ expect_equal(chunked_array_int$as_vector(), tbl$int)
+
+ # Array
+ chunks_int <- chunked_array_int$chunks
+ expect_equal(length(chunks_int), chunked_array_int$num_chunks)
+ for (i in seq_along(chunks_int)) {
+ expect_equal(chunked_array_int$chunk(i - 1L), chunks_int[[i]])
+ }
+
+ # ChunkedArray
+ chunked_array_dbl <- tab$column(1)
+ expect_equal(chunked_array_dbl$length(), 10L)
+ expect_equal(chunked_array_dbl$null_count, 0L)
+ expect_equal(chunked_array_dbl$as_vector(), tbl$dbl)
+
+ # Array
+ chunks_dbl <- chunked_array_dbl$chunks
+ expect_equal(length(chunks_dbl), chunked_array_dbl$num_chunks)
+ for (i in seq_along(chunks_dbl)) {
+ expect_equal(chunked_array_dbl$chunk(i - 1L), chunks_dbl[[i]])
+ }
+
+ # ChunkedArray
+ chunked_array_raw <- tab$column(2)
+ expect_equal(chunked_array_raw$length(), 10L)
+ expect_equal(chunked_array_raw$null_count, 0L)
+ expect_equal(chunked_array_raw$as_vector(), as.integer(tbl$raw))
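+  # raw columns become uint8 in Arrow, which converts back to integer, not raw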
+
+ # Array
+ chunks_raw <- chunked_array_raw$chunks
+ expect_equal(length(chunks_raw), chunked_array_raw$num_chunks)
+ for (i in seq_along(chunks_raw)) {
+ expect_equal(chunked_array_raw$chunk(i - 1L), chunks_raw[[i]])
+ }
+ tf <- tempfile()
+ write_feather(tbl, tf)
+
+ res <- read_feather(tf)
+ expect_identical(tbl$int, res$int)
+ expect_identical(tbl$dbl, res$dbl)
+ expect_identical(as.integer(tbl$raw), res$raw)
+ unlink(tf)
+})
+
+test_that("table round trip handles NA in integer and numeric", {
+ tbl <- tibble::tibble(
+ int = c(NA, 2:10),
+ dbl = as.numeric(c(1:5, NA, 7:9, NA)),
+ raw = as.raw(1:10)
+ )
+
+ tab <- Table$create(!!!tbl)
+ expect_equal(tab$num_columns, 3L)
+ expect_equal(tab$num_rows, 10L)
+
+ expect_equal(tab$column(0)$length(), 10L)
+ expect_equal(tab$column(1)$length(), 10L)
+ expect_equal(tab$column(2)$length(), 10L)
+
+ expect_equal(tab$column(0)$null_count, 1L)
+ expect_equal(tab$column(1)$null_count, 2L)
+ expect_equal(tab$column(2)$null_count, 0L)
+
+ expect_equal(tab$column(0)$type, int32())
+ expect_equal(tab$column(1)$type, float64())
+ expect_equal(tab$column(2)$type, uint8())
+
+ tf <- tempfile()
+ write_feather(tbl, tf)
+
+ res <- read_feather(tf)
+ expect_identical(tbl$int, res$int)
+ expect_identical(tbl$dbl, res$dbl)
+ expect_identical(as.integer(tbl$raw), res$raw)
+
+ expect_true(is.na(res$int[1]))
+ expect_true(is.na(res$dbl[6]))
+ expect_true(is.na(res$dbl[10]))
+ unlink(tf)
+})
+
+test_that("reading/writing a raw vector (sparklyr integration)", {
+ # These are effectively what sparklyr calls to get data to/from Spark
+ read_from_raw_test <- function(x) {
+ as.data.frame(RecordBatchStreamReader$create(x)$read_next_batch())
+ }
+ bytes <- write_to_raw(example_data)
+ expect_type(bytes, "raw")
+ expect_identical(read_from_raw_test(bytes), example_data)
+ # this could just be `read_ipc_stream(x)`; propose that
+ expect_identical(read_ipc_stream(bytes), example_data)
+})
diff --git a/src/arrow/r/tests/testthat/test-record-batch-reader.R b/src/arrow/r/tests/testthat/test-record-batch-reader.R
new file mode 100644
index 000000000..3992670dc
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-record-batch-reader.R
@@ -0,0 +1,141 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("RecordBatchStreamReader / Writer", {
+ tbl <- tibble::tibble(
+ x = 1:10,
+ y = letters[1:10]
+ )
+ batch <- record_batch(tbl)
+ tab <- Table$create(tbl)
+
+ sink <- BufferOutputStream$create()
+ expect_equal(sink$tell(), 0)
+ writer <- RecordBatchStreamWriter$create(sink, batch$schema)
+ expect_r6_class(writer, "RecordBatchWriter")
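+  # $write() accepts a RecordBatch, a Table, or a data frame, so the stream
+  # ends up containing three identical batches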
+ writer$write(batch)
+ writer$write(tab)
+ writer$write(tbl)
+ expect_true(sink$tell() > 0)
+ writer$close()
+
+ buf <- sink$finish()
+ expect_r6_class(buf, "Buffer")
+
+ reader <- RecordBatchStreamReader$create(buf)
+ expect_r6_class(reader, "RecordBatchStreamReader")
+
+ batch1 <- reader$read_next_batch()
+ expect_r6_class(batch1, "RecordBatch")
+ expect_equal(batch, batch1)
+ batch2 <- reader$read_next_batch()
+ expect_r6_class(batch2, "RecordBatch")
+ expect_equal(batch, batch2)
+ batch3 <- reader$read_next_batch()
+ expect_r6_class(batch3, "RecordBatch")
+ expect_equal(batch, batch3)
+ expect_null(reader$read_next_batch())
+})
+
+test_that("RecordBatchFileReader / Writer", {
+ sink <- BufferOutputStream$create()
+ writer <- RecordBatchFileWriter$create(sink, batch$schema)
+ expect_r6_class(writer, "RecordBatchWriter")
+ writer$write(batch)
+ writer$write(tab)
+ writer$write(tbl)
+ writer$close()
+
+ buf <- sink$finish()
+ expect_r6_class(buf, "Buffer")
+
+ reader <- RecordBatchFileReader$create(buf)
+ expect_r6_class(reader, "RecordBatchFileReader")
+
+ batch1 <- reader$get_batch(0)
+ expect_r6_class(batch1, "RecordBatch")
+ expect_equal(batch, batch1)
+
+ expect_equal(reader$num_record_batches, 3)
+})
+
+test_that("StreamReader read_table", {
+ sink <- BufferOutputStream$create()
+ writer <- RecordBatchStreamWriter$create(sink, batch$schema)
+ expect_r6_class(writer, "RecordBatchWriter")
+ writer$write(batch)
+ writer$write(tab)
+ writer$write(tbl)
+ writer$close()
+ buf <- sink$finish()
+
+ reader <- RecordBatchStreamReader$create(buf)
+ out <- reader$read_table()
+ expect_identical(dim(out), c(30L, 2L))
+})
+
+test_that("FileReader read_table", {
+ sink <- BufferOutputStream$create()
+ writer <- RecordBatchFileWriter$create(sink, batch$schema)
+ expect_r6_class(writer, "RecordBatchWriter")
+ writer$write(batch)
+ writer$write(tab)
+ writer$write(tbl)
+ writer$close()
+ buf <- sink$finish()
+
+ reader <- RecordBatchFileReader$create(buf)
+ out <- reader$read_table()
+ expect_identical(dim(out), c(30L, 2L))
+})
+
+test_that("MetadataFormat", {
+ expect_identical(get_ipc_metadata_version(5), 4L)
+ expect_identical(get_ipc_metadata_version("V4"), 3L)
+ expect_identical(get_ipc_metadata_version(NULL), 4L)
+ Sys.setenv(ARROW_PRE_0_15_IPC_FORMAT = 1)
+ expect_identical(get_ipc_metadata_version(NULL), 3L)
+ Sys.setenv(ARROW_PRE_0_15_IPC_FORMAT = "")
+
+ expect_identical(get_ipc_metadata_version(NULL), 4L)
+ Sys.setenv(ARROW_PRE_1_0_METADATA_VERSION = 1)
+ expect_identical(get_ipc_metadata_version(NULL), 3L)
+ Sys.setenv(ARROW_PRE_1_0_METADATA_VERSION = "")
+
+ expect_error(
+ get_ipc_metadata_version(99),
+ "99 is not a valid IPC MetadataVersion"
+ )
+ expect_error(
+ get_ipc_metadata_version("45"),
+ '"45" is not a valid IPC MetadataVersion'
+ )
+})
+
+test_that("reader with 0 batches", {
+ # IPC stream containing only a schema (ARROW-10642)
+ sink <- BufferOutputStream$create()
+ writer <- RecordBatchStreamWriter$create(sink, schema(a = int32()))
+ writer$close()
+ buf <- sink$finish()
+
+ reader <- RecordBatchStreamReader$create(buf)
+ tab <- reader$read_table()
+ expect_r6_class(tab, "Table")
+ expect_identical(dim(tab), c(0L, 1L))
+})
diff --git a/src/arrow/r/tests/testthat/test-s3-minio.R b/src/arrow/r/tests/testthat/test-s3-minio.R
new file mode 100644
index 000000000..e2c1dc2e7
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-s3-minio.R
@@ -0,0 +1,228 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+if (arrow_with_s3() && process_is_running("minio server")) {
+ # Get minio config, with expected defaults
+ minio_key <- Sys.getenv("MINIO_ACCESS_KEY", "minioadmin")
+ minio_secret <- Sys.getenv("MINIO_SECRET_KEY", "minioadmin")
+ minio_port <- Sys.getenv("MINIO_PORT", "9000")
+
+ # Helper function for minio URIs
+ minio_uri <- function(...) {
+ template <- "s3://%s:%s@%s?scheme=http&endpoint_override=localhost%s%s"
+ sprintf(template, minio_key, minio_secret, minio_path(...), "%3A", minio_port)
+ }
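+  # ("%3A" is a percent-encoded ":" so that endpoint_override=localhost:PORT
+  # survives URI parsing)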
+ minio_path <- function(...) paste(now, ..., sep = "/")
+
+ test_that("minio setup", {
+ # Create a "bucket" on minio for this test run, which we'll delete when done.
+ fs <- S3FileSystem$create(
+ access_key = minio_key,
+ secret_key = minio_secret,
+ scheme = "http",
+ endpoint_override = paste0("localhost:", minio_port)
+ )
+ expect_r6_class(fs, "S3FileSystem")
+ now <- as.character(as.numeric(Sys.time()))
+ # If minio isn't running, this will hang for a few seconds and fail with a
+ # curl timeout, causing `run_these` to be set to FALSE and skipping the tests
+ fs$CreateDir(now)
+ })
+ # Clean up when we're all done
+ on.exit(fs$DeleteDir(now))
+
+ test_that("read/write Feather on minio", {
+ write_feather(example_data, minio_uri("test.feather"))
+ expect_identical(read_feather(minio_uri("test.feather")), example_data)
+ })
+
+ test_that("read/write Feather by filesystem, not URI", {
+ write_feather(example_data, fs$path(minio_path("test2.feather")))
+ expect_identical(
+ read_feather(fs$path(minio_path("test2.feather"))),
+ example_data
+ )
+ })
+
+ test_that("read/write stream", {
+ write_ipc_stream(example_data, fs$path(minio_path("test3.ipc")))
+ expect_identical(
+ read_ipc_stream(fs$path(minio_path("test3.ipc"))),
+ example_data
+ )
+ })
+
+ test_that("read/write Parquet on minio", {
+ skip_if_not_available("parquet")
+    write_parquet(example_data, minio_uri("test.parquet"))
+ expect_identical(read_parquet(minio_uri("test.parquet")), example_data)
+ })
+
+ if (arrow_with_dataset()) {
+ library(dplyr)
+
+ make_temp_dir <- function() {
+ path <- tempfile()
+ dir.create(path)
+ normalizePath(path, winslash = "/")
+ }
+
+ test_that("open_dataset with an S3 file (not directory) URI", {
+ skip_if_not_available("parquet")
+ expect_identical(
+ open_dataset(minio_uri("test.parquet")) %>% collect() %>% arrange(int),
+ example_data %>% arrange(int)
+ )
+ })
+
+ test_that("open_dataset with vector of S3 file URIs", {
+ expect_identical(
+ open_dataset(
+ c(minio_uri("test.feather"), minio_uri("test2.feather")),
+ format = "feather"
+ ) %>%
+ arrange(int) %>%
+ collect(),
+ rbind(example_data, example_data) %>% arrange(int)
+ )
+ })
+
+ test_that("open_dataset errors on URIs for different file systems", {
+ td <- make_temp_dir()
+ expect_error(
+ open_dataset(
+ c(
+ minio_uri("test.feather"),
+ paste0("file://", file.path(td, "fake.feather"))
+ ),
+ format = "feather"
+ ),
+ "Vectors of URIs for different file systems are not supported"
+ )
+ })
+
+ # Dataset test setup, cf. test-dataset.R
+ first_date <- lubridate::ymd_hms("2015-04-29 03:12:39")
+ df1 <- tibble(
+ int = 1:10,
+ dbl = as.numeric(1:10),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+ chr = letters[1:10],
+ fct = factor(LETTERS[1:10]),
+ ts = first_date + lubridate::days(1:10)
+ )
+
+ second_date <- lubridate::ymd_hms("2017-03-09 07:01:02")
+ df2 <- tibble(
+ int = 101:110,
+ dbl = as.numeric(51:60),
+ lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
+ chr = letters[10:1],
+ fct = factor(LETTERS[10:1]),
+ ts = second_date + lubridate::days(10:1)
+ )
+
+ # This is also to set up the dataset tests
+ test_that("write_parquet with filesystem arg", {
+ skip_if_not_available("parquet")
+ fs$CreateDir(minio_path("hive_dir", "group=1", "other=xxx"))
+ fs$CreateDir(minio_path("hive_dir", "group=2", "other=yyy"))
+ expect_length(fs$ls(minio_path("hive_dir")), 2)
+ write_parquet(df1, fs$path(minio_path("hive_dir", "group=1", "other=xxx", "file1.parquet")))
+ write_parquet(df2, fs$path(minio_path("hive_dir", "group=2", "other=yyy", "file2.parquet")))
+ expect_identical(
+ read_parquet(fs$path(minio_path("hive_dir", "group=1", "other=xxx", "file1.parquet"))),
+ df1
+ )
+ })
+
+ test_that("open_dataset with fs", {
+ ds <- open_dataset(fs$path(minio_path("hive_dir")))
+ expect_identical(
+ ds %>% select(int, dbl, lgl) %>% collect() %>% arrange(int),
+ rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int)
+ )
+ })
+
+ test_that("write_dataset with fs", {
+ ds <- open_dataset(fs$path(minio_path("hive_dir")))
+ write_dataset(ds, fs$path(minio_path("new_dataset_dir")))
+ expect_length(fs$ls(minio_path("new_dataset_dir")), 1)
+ })
+
+ test_that("Let's test copy_files too", {
+ td <- make_temp_dir()
+ copy_files(minio_uri("hive_dir"), td)
+ expect_length(dir(td), 2)
+ ds <- open_dataset(td)
+ expect_identical(
+ ds %>% select(int, dbl, lgl) %>% collect() %>% arrange(int),
+ rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int)
+ )
+
+ # Let's copy the other way and use a SubTreeFileSystem rather than URI
+ copy_files(td, fs$path(minio_path("hive_dir2")))
+ ds2 <- open_dataset(fs$path(minio_path("hive_dir2")))
+ expect_identical(
+ ds2 %>% select(int, dbl, lgl) %>% collect() %>% arrange(int),
+ rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int)
+ )
+ })
+ }
+
+ test_that("S3FileSystem input validation", {
+ expect_error(
+ S3FileSystem$create(access_key = "foo"),
+ "Key authentication requires both access_key and secret_key"
+ )
+ expect_error(
+ S3FileSystem$create(secret_key = "foo"),
+ "Key authentication requires both access_key and secret_key"
+ )
+ expect_error(
+ S3FileSystem$create(session_token = "foo"),
+ paste0(
+ "In order to initialize a session with temporary credentials, ",
+ "both secret_key and access_key must be provided ",
+ "in addition to session_token."
+ )
+ )
+ expect_error(
+ S3FileSystem$create(access_key = "foo", secret_key = "asdf", anonymous = TRUE),
+ 'Cannot specify "access_key" and "secret_key" when anonymous = TRUE'
+ )
+ expect_error(
+ S3FileSystem$create(access_key = "foo", secret_key = "asdf", role_arn = "qwer"),
+ "Cannot provide both key authentication and role_arn"
+ )
+ expect_error(
+ S3FileSystem$create(access_key = "foo", secret_key = "asdf", external_id = "qwer"),
+ 'Cannot specify "external_id" without providing a role_arn string'
+ )
+ expect_error(
+ S3FileSystem$create(external_id = "foo"),
+ 'Cannot specify "external_id" without providing a role_arn string'
+ )
+ })
+} else {
+ # Kinda hacky, let's put a skipped test here, just so we note that the tests
+ # didn't run
+ test_that("S3FileSystem tests with Minio", {
+ skip("Minio is not running")
+ })
+}
diff --git a/src/arrow/r/tests/testthat/test-s3.R b/src/arrow/r/tests/testthat/test-s3.R
new file mode 100644
index 000000000..298b15bb8
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-s3.R
@@ -0,0 +1,55 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+run_these <- tryCatch(
+ expr = {
+ if (arrow_with_s3() &&
+ identical(tolower(Sys.getenv("ARROW_R_DEV")), "true") &&
+ !identical(Sys.getenv("AWS_ACCESS_KEY_ID"), "") &&
+ !identical(Sys.getenv("AWS_SECRET_ACCESS_KEY"), "")) {
+ # See if we have access to the test bucket
+ bucket <- s3_bucket("ursa-labs-r-test")
+ bucket$GetFileInfo("")
+ TRUE
+ } else {
+ FALSE
+ }
+ },
+ error = function(e) FALSE
+)
+
+bucket_uri <- function(..., bucket = "s3://ursa-labs-r-test/%s?region=us-west-2") {
+ segments <- paste(..., sep = "/")
+ sprintf(bucket, segments)
+}
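+# e.g. bucket_uri("a", "b") returns "s3://ursa-labs-r-test/a/b?region=us-west-2"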
+
+if (run_these) {
+ now <- as.numeric(Sys.time())
+ on.exit(bucket$DeleteDir(now))
+
+ test_that("read/write Feather on S3", {
+ write_feather(example_data, bucket_uri(now, "test.feather"))
+ expect_identical(read_feather(bucket_uri(now, "test.feather")), example_data)
+ })
+
+ test_that("read/write Parquet on S3", {
+ skip_if_not_available("parquet")
+ write_parquet(example_data, bucket_uri(now, "test.parquet"))
+ expect_identical(read_parquet(bucket_uri(now, "test.parquet")), example_data)
+ })
+}
diff --git a/src/arrow/r/tests/testthat/test-scalar.R b/src/arrow/r/tests/testthat/test-scalar.R
new file mode 100644
index 000000000..3afccf743
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-scalar.R
@@ -0,0 +1,112 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+expect_scalar_roundtrip <- function(x, type) {
+ s <- Scalar$create(x)
+ expect_r6_class(s, "Scalar")
+ expect_equal(s$type, type)
+ expect_identical(length(s), 1L)
+ if (inherits(type, "NestedType")) {
+    # Should this be missing if all elements are missing?
+ # expect_identical(is.na(s), all(is.na(x))) # nolint
+ } else {
+ expect_identical(as.vector(is.na(s)), is.na(x))
+ # MakeArrayFromScalar not implemented for list types
+ expect_as_vector(s, x)
+ }
+}
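+# e.g. expect_scalar_roundtrip(2, float64()) checks that Scalar$create(2) has
+# type float64 and length 1, reports the right NA-ness, and converts back to 2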
+
+test_that("Scalar object roundtrip", {
+ expect_scalar_roundtrip(2, float64())
+ expect_scalar_roundtrip(2L, int32())
+ expect_scalar_roundtrip(c(2, 4), list_of(float64()))
+ expect_scalar_roundtrip(c(NA, NA), list_of(bool()))
+ expect_scalar_roundtrip(data.frame(a = 2, b = 4L), struct(a = double(), b = int32()))
+})
+
+test_that("Scalar print", {
+ expect_output(print(Scalar$create(4)), "Scalar\n4")
+})
+
+test_that("Creating Scalars of a different type and casting them", {
+ expect_equal(Scalar$create(4L, int8())$type, int8())
+ expect_equal(Scalar$create(4L)$cast(float32())$type, float32())
+})
+
+test_that("Scalar to Array", {
+ a <- Scalar$create(42)
+ expect_equal(a$as_array(), Array$create(42))
+ expect_equal(Array$create(a), Array$create(42))
+})
+
+test_that("Scalar$Equals", {
+ a <- Scalar$create(42)
+ aa <- Array$create(42)
+ b <- Scalar$create(42)
+ d <- Scalar$create(43)
+ expect_equal(a, b)
+ expect_true(a$Equals(b))
+ expect_false(a$Equals(d))
+ expect_false(a$Equals(aa))
+})
+
+test_that("Scalar$ApproxEquals", {
+ a <- Scalar$create(1.0000000000001)
+ aa <- Array$create(1.0000000000001)
+ b <- Scalar$create(1.0)
+ d <- 2.400000000000001
+ expect_false(a$Equals(b))
+ expect_true(a$ApproxEquals(b))
+ expect_false(a$ApproxEquals(d))
+ expect_false(a$ApproxEquals(aa))
+})
+
+test_that("Handling string data with embedded nuls", {
+ raws <- as.raw(c(0x6d, 0x61, 0x00, 0x6e))
+ expect_error(
+ rawToChar(raws),
+ "embedded nul in string: 'ma\\0n'", # See?
+ fixed = TRUE
+ )
+ scalar_with_nul <- Scalar$create(raws, binary())$cast(utf8())
+
+  # The behavior of the warnings/errors is slightly different with and without
+  # altrep. Without it (i.e. on R 3.5.0 and below), the error triggers
+  # immediately on `as.vector()`, whereas with altrep it only happens when the
+  # vector is materialized.
+ skip_if_r_version("3.5.0")
+ v <- expect_error(as.vector(scalar_with_nul), NA)
+ expect_error(
+ v[1],
+ paste0(
+ "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, ",
+ "set options(arrow.skip_nul = TRUE)"
+ ),
+ fixed = TRUE
+ )
+
+ withr::with_options(list(arrow.skip_nul = TRUE), {
+ expect_warning(
+ expect_identical(
+ as.vector(scalar_with_nul)[],
+ "man"
+ ),
+ "Stripping '\\0' (nul) from character vector",
+ fixed = TRUE
+ )
+ })
+})
diff --git a/src/arrow/r/tests/testthat/test-schema.R b/src/arrow/r/tests/testthat/test-schema.R
new file mode 100644
index 000000000..8473550df
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-schema.R
@@ -0,0 +1,220 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("Alternate type names are supported", {
+ expect_equal(
+ schema(b = double(), c = bool(), d = string(), e = float(), f = halffloat()),
+ schema(b = float64(), c = boolean(), d = utf8(), e = float32(), f = float16())
+ )
+ expect_equal(names(schema(b = double(), c = bool(), d = string())), c("b", "c", "d"))
+})
+
+test_that("Schema print method", {
+ expect_output(
+ print(schema(b = double(), c = bool(), d = string())),
+ paste(
+ "Schema",
+ "b: double",
+ "c: bool",
+ "d: string",
+ sep = "\n"
+ ),
+ fixed = TRUE
+ )
+})
+
+test_that("Schema with non-nullable fields", {
+ expect_output(
+ print(schema(field("b", double()),
+ field("c", bool(), nullable = FALSE),
+ field("d", string()))),
+ paste(
+ "Schema",
+ "b: double",
+ "c: bool not null",
+ "d: string",
+ sep = "\n"
+ ),
+ fixed = TRUE
+ )
+})
+
+test_that("Schema $GetFieldByName", {
+ schm <- schema(b = double(), c = string())
+ expect_equal(schm$GetFieldByName("b"), field("b", double()))
+ expect_null(schm$GetFieldByName("f"))
+ # TODO: schema(b = double(), b = string())$GetFieldByName("b") # nolint
+ # also returns NULL and probably should error bc duplicated names
+})
+
+test_that("Schema extract (returns Field)", {
+ # TODO: should this return a Field or the Type?
+ # I think of Schema like list(name = type, name = type, ...)
+ # but in practice it is more like list(list(name, type), list(name, type), ...)
+ # -> Field names in a Schema may be duplicated
+ # -> Fields may have metadata (though we don't really handle that in R)
+ schm <- schema(b = double(), c = string())
+ expect_equal(schm$b, field("b", double()))
+ expect_equal(schm[["b"]], field("b", double()))
+ expect_equal(schm[[1]], field("b", double()))
+
+ expect_null(schm[["ZZZ"]])
+ expect_error(schm[[42]]) # Should have better error message
+})
+
+test_that("Schema slicing", {
+ schm <- schema(b = double(), c = string(), d = int8())
+ expect_equal(schm[2:3], schema(c = string(), d = int8()))
+ expect_equal(schm[-1], schema(c = string(), d = int8()))
+ expect_equal(schm[c("d", "c")], schema(d = int8(), c = string()))
+ expect_equal(schm[c(FALSE, TRUE, TRUE)], schema(c = string(), d = int8()))
+ expect_error(schm[c("c", "ZZZ")], 'Invalid field name: "ZZZ"')
+ expect_error(schm[c("XXX", "c", "ZZZ")], 'Invalid field names: "XXX" and "ZZZ"')
+})
+
+test_that("Schema modification", {
+ schm <- schema(b = double(), c = string(), d = int8())
+ schm$c <- boolean()
+ expect_equal(schm, schema(b = double(), c = boolean(), d = int8()))
+ schm[["d"]] <- int16()
+ expect_equal(schm, schema(b = double(), c = boolean(), d = int16()))
+ schm$b <- NULL
+ expect_equal(schm, schema(c = boolean(), d = int16()))
+  # Assigning NULL to a name that doesn't exist leaves the schema unchanged
+ schm$zzzz <- NULL
+ expect_equal(schm, schema(c = boolean(), d = int16()))
+ # Adding a field
+ schm$fff <- int32()
+ expect_equal(schm, schema(c = boolean(), d = int16(), fff = int32()))
+
+ # By index
+ schm <- schema(b = double(), c = string(), d = int8())
+ schm[[2]] <- int32()
+ expect_equal(schm, schema(b = double(), c = int32(), d = int8()))
+
+ # Adding actual Fields
+ # If assigning by name, note that this can modify the resulting name
+ schm <- schema(b = double(), c = string(), d = int8())
+ schm$c <- field("x", int32())
+ expect_equal(schm, schema(b = double(), x = int32(), d = int8()))
+ schm[[2]] <- field("y", int64())
+ expect_equal(schm, schema(b = double(), y = int64(), d = int8()))
+
+ # Error handling
+ expect_error(schm$c <- 4, "value must be a DataType")
+ expect_error(schm[[-3]] <- int32(), "i not greater than 0")
+ expect_error(schm[[0]] <- int32(), "i not greater than 0")
+ expect_error(schm[[NA_integer_]] <- int32(), "!is.na(i) is not TRUE", fixed = TRUE)
+ expect_error(schm[[TRUE]] <- int32(), "i is not a numeric or integer vector")
+ expect_error(schm[[c(2, 4)]] <- int32(), "length(i) not equal to 1", fixed = TRUE)
+})
+
+test_that("Metadata is preserved when modifying Schema", {
+ schm <- schema(b = double(), c = string(), d = int8())
+ schm$metadata$foo <- "bar"
+ expect_identical(schm$metadata, list(foo = "bar"))
+ schm$c <- field("x", int32())
+ expect_identical(schm$metadata, list(foo = "bar"))
+})
+
+test_that("reading schema from Buffer", {
+ # TODO: this uses the streaming format, i.e. from RecordBatchStreamWriter
+ # maybe there is an easier way to serialize a schema
+ batch <- record_batch(x = 1:10)
+ expect_r6_class(batch, "RecordBatch")
+
+ stream <- BufferOutputStream$create()
+ writer <- RecordBatchStreamWriter$create(stream, batch$schema)
+ expect_r6_class(writer, "RecordBatchWriter")
+ writer$close()
+
+ buffer <- stream$finish()
+ expect_r6_class(buffer, "Buffer")
+
+ reader <- MessageReader$create(buffer)
+ expect_r6_class(reader, "MessageReader")
+
+ message <- reader$ReadNextMessage()
+ expect_r6_class(message, "Message")
+ expect_equal(message$type, MessageType$SCHEMA)
+
+ stream <- BufferReader$create(buffer)
+ expect_r6_class(stream, "BufferReader")
+ message <- read_message(stream)
+ expect_r6_class(message, "Message")
+ expect_equal(message$type, MessageType$SCHEMA)
+})
+
+test_that("Input validation when creating a table with a schema", {
+ expect_error(
+ Table$create(b = 1, schema = c(b = float64())), # list not Schema
+ "`schema` must be an arrow::Schema or NULL"
+ )
+})
+
+test_that("Schema$Equals", {
+ a <- schema(b = double(), c = bool())
+ b <- a$WithMetadata(list(some = "metadata"))
+
+ # different metadata
+ expect_failure(expect_equal(a, b))
+ expect_false(a$Equals(b, check_metadata = TRUE))
+
+ # Metadata not checked
+ expect_equal(a, b, ignore_attr = TRUE)
+
+ # Non-schema object
+ expect_false(a$Equals(42))
+})
+
+test_that("unify_schemas", {
+ a <- schema(b = double(), c = bool())
+ z <- schema(b = double(), k = utf8())
+ expect_equal(
+ unify_schemas(a, z),
+ schema(b = double(), c = bool(), k = utf8())
+ )
+ # returns NULL when any arg is NULL
+ expect_null(
+ unify_schemas(a, NULL, z)
+ )
+ # returns NULL when all args are NULL
+ expect_null(
+ unify_schemas(NULL, NULL)
+ )
+ # errors when no args
+ expect_error(
+ unify_schemas(),
+ "Must provide at least one schema to unify"
+ )
+})
+
+test_that("Schema to C-interface", {
+ schema <- schema(b = double(), c = bool())
+
+ # export the schema via the C-interface
+ ptr <- allocate_arrow_schema()
+ schema$export_to_c(ptr)
+
+ # then import it and check that the roundtripped value is the same
+ circle <- Schema$import_from_c(ptr)
+ expect_equal(circle, schema)
+
+ # must clean up the pointer or we leak
+ delete_arrow_schema(ptr)
+})
diff --git a/src/arrow/r/tests/testthat/test-thread-pool.R b/src/arrow/r/tests/testthat/test-thread-pool.R
new file mode 100644
index 000000000..baf410368
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-thread-pool.R
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("can set/get cpu thread pool capacity", {
+ old <- cpu_count()
+ set_cpu_count(19)
+ expect_equal(cpu_count(), 19L)
+ set_cpu_count(old)
+ expect_equal(cpu_count(), old)
+})
+
+test_that("can set/get I/O thread pool capacity", {
+ old <- io_thread_count()
+ set_io_thread_count(19)
+ expect_equal(io_thread_count(), 19L)
+ set_io_thread_count(old)
+ expect_equal(io_thread_count(), old)
+})
diff --git a/src/arrow/r/tests/testthat/test-type.R b/src/arrow/r/tests/testthat/test-type.R
new file mode 100644
index 000000000..3821fb450
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-type.R
@@ -0,0 +1,211 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("type() gets the right type for arrow::Array", {
+ a <- Array$create(1:10)
+ expect_equal(type(a), a$type)
+})
+
+test_that("type() gets the right type for ChunkedArray", {
+ a <- chunked_array(1:10, 1:10)
+ expect_equal(type(a), a$type)
+})
+
+test_that("type() infers from R type", {
+ expect_equal(type(1:10), int32())
+ expect_equal(type(1), float64())
+ expect_equal(type(TRUE), boolean())
+ expect_equal(type(raw()), uint8())
+ expect_equal(type(""), utf8())
+ expect_equal(
+ type(example_data$fct),
+ dictionary(int8(), utf8(), FALSE)
+ )
+ expect_equal(
+ type(lubridate::ymd_hms("2019-02-14 13:55:05")),
+ timestamp(TimeUnit$MICRO, "UTC")
+ )
+ expect_equal(
+ type(hms::hms(56, 34, 12)),
+ time32(unit = TimeUnit$SECOND)
+ )
+ expect_equal(
+ type(bit64::integer64()),
+ int64()
+ )
+})
+
+test_that("type() can infer struct types from data frames", {
+ df <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10])
+ expect_equal(type(df), struct(x = int32(), y = float64(), z = utf8()))
+})
+
+test_that("DataType$Equals", {
+ a <- int32()
+ b <- int32()
+ z <- float64()
+ expect_true(a == b)
+ expect_true(a$Equals(b))
+ expect_false(a == z)
+ expect_equal(a, b)
+  expect_failure(expect_equal(a, z))
+ expect_false(a$Equals(32L))
+})
+
+test_that("Masked data type functions still work", {
+ skip("Work around masking of data type functions (ARROW-12322)")
+
+ # Works when type function is masked
+ string <- rlang::string
+  expect_type_equal(
+ Array$create("abc", type = string()),
+ arrow::string()
+ )
+ rm(string)
+
+  # Works with a non-Arrow function that returns an Arrow type, when the
+  # non-Arrow function has the same name as a base R function...
+  str <- arrow::string
+  expect_type_equal(
+ Array$create("abc", type = str()),
+ arrow::string()
+ )
+ rm(str)
+
+ # ... and when it has the same name as an Arrow function
+ type <- arrow::string
+  expect_type_equal(
+ Array$create("abc", type = type()),
+ arrow::string()
+ )
+ rm(type)
+
+ # Works with local variable whose value is an Arrow type
+ type <- arrow::string()
+  expect_type_equal(
+ Array$create("abc", type = type),
+ arrow::string()
+ )
+ rm(type)
+})
+
+test_that("Type strings are correctly canonicalized", {
+ # data types without arguments
+ expect_equal(canonical_type_str("int8"), int8()$ToString())
+ expect_equal(canonical_type_str("int16"), int16()$ToString())
+ expect_equal(canonical_type_str("int32"), int32()$ToString())
+ expect_equal(canonical_type_str("int64"), int64()$ToString())
+ expect_equal(canonical_type_str("uint8"), uint8()$ToString())
+ expect_equal(canonical_type_str("uint16"), uint16()$ToString())
+ expect_equal(canonical_type_str("uint32"), uint32()$ToString())
+ expect_equal(canonical_type_str("uint64"), uint64()$ToString())
+ expect_equal(canonical_type_str("float16"), float16()$ToString())
+ expect_equal(canonical_type_str("halffloat"), halffloat()$ToString())
+ expect_equal(canonical_type_str("float32"), float32()$ToString())
+ expect_equal(canonical_type_str("float"), float()$ToString())
+ expect_equal(canonical_type_str("float64"), float64()$ToString())
+ expect_equal(canonical_type_str("double"), float64()$ToString())
+ expect_equal(canonical_type_str("boolean"), boolean()$ToString())
+ expect_equal(canonical_type_str("bool"), bool()$ToString())
+ expect_equal(canonical_type_str("utf8"), utf8()$ToString())
+ expect_equal(canonical_type_str("large_utf8"), large_utf8()$ToString())
+ expect_equal(canonical_type_str("large_string"), large_utf8()$ToString())
+ expect_equal(canonical_type_str("binary"), binary()$ToString())
+ expect_equal(canonical_type_str("large_binary"), large_binary()$ToString())
+ expect_equal(canonical_type_str("string"), arrow::string()$ToString())
+ expect_equal(canonical_type_str("null"), null()$ToString())
+
+ # data types with arguments
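+  # (sub() strips everything from the first "(", "[", or "<", leaving just the
+  # bare type name)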
+ expect_equal(
+ canonical_type_str("fixed_size_binary"),
+ sub("^([^([<]+).*$", "\\1", fixed_size_binary(42)$ToString())
+ )
+ expect_equal(
+ canonical_type_str("date32"),
+ sub("^([^([<]+).*$", "\\1", date32()$ToString())
+ )
+ expect_equal(
+ canonical_type_str("date64"),
+ sub("^([^([<]+).*$", "\\1", date64()$ToString())
+ )
+ expect_equal(
+ canonical_type_str("time32"),
+ sub("^([^([<]+).*$", "\\1", time32()$ToString())
+ )
+ expect_equal(
+ canonical_type_str("time64"),
+ sub("^([^([<]+).*$", "\\1", time64()$ToString())
+ )
+ expect_equal(
+ canonical_type_str("timestamp"),
+ sub("^([^([<]+).*$", "\\1", timestamp()$ToString())
+ )
+ expect_equal(
+ canonical_type_str("decimal"),
+ sub("^([^([<]+).*$", "\\1", decimal(3, 2)$ToString())
+ )
+ expect_equal(
+ canonical_type_str("struct"),
+ sub("^([^([<]+).*$", "\\1", struct(foo = int32())$ToString())
+ )
+ expect_equal(
+ canonical_type_str("list_of"),
+ sub("^([^([<]+).*$", "\\1", list_of(int32())$ToString())
+ )
+ expect_equal(
+ canonical_type_str("list"),
+ sub("^([^([<]+).*$", "\\1", list_of(int32())$ToString())
+ )
+ expect_equal(
+ canonical_type_str("large_list_of"),
+ sub("^([^([<]+).*$", "\\1", large_list_of(int32())$ToString())
+ )
+ expect_equal(
+ canonical_type_str("large_list"),
+ sub("^([^([<]+).*$", "\\1", large_list_of(int32())$ToString())
+ )
+ expect_equal(
+ canonical_type_str("fixed_size_list_of"),
+ sub("^([^([<]+).*$", "\\1", fixed_size_list_of(int32(), 42)$ToString())
+ )
+ expect_equal(
+ canonical_type_str("fixed_size_list"),
+ sub("^([^([<]+).*$", "\\1", fixed_size_list_of(int32(), 42)$ToString())
+ )
+
+ # unsupported data types
+ expect_error(
+ canonical_type_str("decimal128(3, 2)"),
+ "parameters"
+ )
+ expect_error(
+ canonical_type_str("list<item: int32>"),
+ "parameters"
+ )
+ expect_error(
+ canonical_type_str("time32[s]"),
+ "parameters"
+ )
+
+ # unrecognized data types
+ expect_error(
+ canonical_type_str("foo"),
+ "Unrecognized"
+ )
+})
diff --git a/src/arrow/r/tests/testthat/test-utf.R b/src/arrow/r/tests/testthat/test-utf.R
new file mode 100644
index 000000000..69d196274
--- /dev/null
+++ b/src/arrow/r/tests/testthat/test-utf.R
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+test_that("We handle non-UTF strings", {
+ # Move the code with non-UTF strings to a separate file so that we don't
+ # get a parse error on *cough* certain platforms
+ skip_on_cran()
+ source("latin1.R", encoding = "latin1")
+})
diff --git a/src/arrow/r/tools/autobrew b/src/arrow/r/tools/autobrew
new file mode 100644
index 000000000..d40729e18
--- /dev/null
+++ b/src/arrow/r/tools/autobrew
@@ -0,0 +1,66 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# https://github.com/jeroen/autobrew/blob/gh-pages/apache-arrow
+export HOMEBREW_NO_ANALYTICS=1
+export HOMEBREW_NO_AUTO_UPDATE=1
+
+# Official Homebrew no longer supports El Capitan, so use the autobrew fork instead
+UPSTREAM_ORG="autobrew"
+
+if [ "$DISABLE_AUTOBREW" ]; then return 0; fi
+AUTOBREW=${TMPDIR-/tmp}
+export HOMEBREW_TEMP="$AUTOBREW/hbtmp"
+BREWDIR="$AUTOBREW/build-$PKG_BREW_NAME"
+BREW="$BREWDIR/bin/brew"
+rm -Rf $BREWDIR
+mkdir -p $BREWDIR
+echo "$(date): Auto-brewing $PKG_BREW_NAME in $BREWDIR..."
+curl -fsSL https://github.com/$UPSTREAM_ORG/brew/tarball/master | tar xz --strip 1 -C $BREWDIR
+
+# Install bottle + dependencies
+export HOMEBREW_CACHE="$AUTOBREW"
+LOCAL_FORMULA="tools/${PKG_BREW_NAME}.rb"
+if [ -f "$LOCAL_FORMULA" ]; then
+ # Use the local brew formula and install --HEAD
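+  # Print the dependency list to the build log, then capture it for install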
+ $BREW deps -n "$LOCAL_FORMULA" 2>/dev/null
+ BREW_DEPS=$($BREW deps -n "$LOCAL_FORMULA" 2>/dev/null)
+ $BREW install --force-bottle $BREW_DEPS 2>&1 | perl -pe 's/Warning/Note/gi'
+ $BREW install -v --build-from-source --HEAD "$LOCAL_FORMULA" 2>&1 | perl -pe 's/Warning/Note/gi'
+else
+ $BREW install --force-bottle $BREW_DEPS $PKG_BREW_NAME 2>&1 | perl -pe 's/Warning/Note/gi'
+fi
+
+# Hardcoded link flags for this custom autobrew build; remove shared libs to force static linking
+rm -f $BREWDIR/lib/*.dylib
+AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-management -laws-cpp-sdk-cognito-identity -laws-cpp-sdk-sts -laws-cpp-sdk-s3 -laws-cpp-sdk-core -laws-c-event-stream -laws-checksums -laws-c-common -lpthread -lcurl"
+PKG_LIBS="-lparquet -larrow_dataset -larrow -larrow_bundled_dependencies -lthrift -llz4 -lsnappy -lzstd $AWS_LIBS"
+PKG_DIRS="-L$BREWDIR/lib"
+
+# Prevent CRAN builder from linking against old libs in /usr/local/lib
+for FILE in $BREWDIR/Cellar/*/*/lib/*.a; do
+ BASENAME=`basename $FILE`
+ LIBNAME=`echo "${BASENAME%.*}" | cut -c4-`
+ cp -f $FILE $BREWDIR/lib/libbrew$LIBNAME.a
+ echo "created $BREWDIR/lib/libbrew$LIBNAME.a"
+ PKG_LIBS=`echo $PKG_LIBS | sed "s/-l$LIBNAME/-lbrew$LIBNAME/g"`
+done
+
+PKG_CFLAGS="-I$BREWDIR/opt/$PKG_BREW_NAME/include -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET -DARROW_R_WITH_JSON -DARROW_R_WITH_S3"
+
+unset HOMEBREW_NO_ANALYTICS
+unset HOMEBREW_NO_AUTO_UPDATE
diff --git a/src/arrow/r/tools/nixlibs.R b/src/arrow/r/tools/nixlibs.R
new file mode 100644
index 000000000..869e0abcf
--- /dev/null
+++ b/src/arrow/r/tools/nixlibs.R
@@ -0,0 +1,601 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+args <- commandArgs(TRUE)
+VERSION <- args[1]
+dst_dir <- paste0("libarrow/arrow-", VERSION)
+
+arrow_repo <- "https://arrow-r-nightly.s3.amazonaws.com/libarrow/"
+
+if (getRversion() < 3.4 && is.null(getOption("download.file.method"))) {
+ # default method doesn't work on R 3.3, nor does libcurl
+ options(download.file.method = "wget")
+}
+
+options(.arrow.cleanup = character()) # To collect dirs to rm on exit
+on.exit(unlink(getOption(".arrow.cleanup")))
+
+env_is <- function(var, value) identical(tolower(Sys.getenv(var)), value)
+
+try_download <- function(from_url, to_file) {
+ status <- try(
+ suppressWarnings(
+ download.file(from_url, to_file, quiet = quietly)
+ ),
+ silent = quietly
+ )
+ # Return whether the download was successful
+ !inherits(status, "try-error") && status == 0
+}
+
+# For local debugging, set ARROW_R_DEV=TRUE to make this script print more
+quietly <- !env_is("ARROW_R_DEV", "true")
+
+# Default is build from source, not download a binary
+build_ok <- !env_is("LIBARROW_BUILD", "false")
+binary_ok <- env_is("LIBARROW_BINARY", "true")
+
+# Check if we're doing an offline build.
+# (Note that cmake will still be downloaded if necessary
+# https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds)
+download_ok <- !env_is("TEST_OFFLINE_BUILD", "true") && try_download("https://github.com", tempfile())
+
+# This "tools/thirdparty_dependencies" path, within the tar file, might exist if
+# create_package_with_all_dependencies() was run, or if someone has created it
+# manually before running make build.
+# If you change this path, you also need to edit
+# `create_package_with_all_dependencies()` in install-arrow.R
+thirdparty_dependency_dir <- Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR", "tools/thirdparty_dependencies")
+
+
+download_binary <- function(os = identify_os()) {
+ libfile <- tempfile()
+ if (!is.null(os)) {
+ # See if we can map this os-version to one we have binaries for
+ os <- find_available_binary(os)
+ binary_url <- paste0(arrow_repo, "bin/", os, "/arrow-", VERSION, ".zip")
+ if (try_download(binary_url, libfile)) {
+ cat(sprintf("*** Successfully retrieved C++ binaries for %s\n", os))
+ if (!identical(os, "centos-7")) {
+ # centos-7 uses gcc 4.8 so the binary doesn't have ARROW_S3=ON but the others do
+ # TODO: actually check for system requirements?
+ cat("**** Binary package requires libcurl and openssl\n")
+ cat("**** If installation fails, retry after installing those system requirements\n")
+ }
+ } else {
+ cat(sprintf("*** No C++ binaries found for %s\n", os))
+ libfile <- NULL
+ }
+ } else {
+ libfile <- NULL
+ }
+ libfile
+}
+
+# Function to figure out which flavor of binary we should download, if at all.
+# By default (unset or "FALSE"), it will not download a precompiled library,
+# but you can override this by setting the env var LIBARROW_BINARY to:
+# * `TRUE` (not case-sensitive), to try to discover your current OS, or
+# * some other string, presumably a related "distro-version" that has binaries
+# built that work for your OS
+identify_os <- function(os = Sys.getenv("LIBARROW_BINARY")) {
+ if (tolower(os) %in% c("", "false")) {
+ # Env var says not to download a binary
+ return(NULL)
+ } else if (!identical(tolower(os), "true")) {
+ # Env var provided an os-version to use--maybe you're on Ubuntu 18.10 but
+ # we only build for 18.04 and that's fine--so use what the user set
+ return(os)
+ }
+
+ linux <- distro()
+ if (is.null(linux)) {
+ cat("*** Unable to identify current OS/version\n")
+ return(NULL)
+ }
+ paste(linux$id, linux$short_version, sep = "-")
+}
+
+#### start distro ####
+
+distro <- function() {
+ # The code in this script is a (potentially stale) copy of the distro package
+ if (requireNamespace("distro", quietly = TRUE)) {
+ # Use the version from the package, which may be updated from this
+ return(distro::distro())
+ }
+
+ out <- lsb_release()
+ if (is.null(out)) {
+ out <- os_release()
+ if (is.null(out)) {
+ out <- system_release()
+ }
+ }
+ if (is.null(out)) {
+ return(NULL)
+ }
+
+ out$id <- tolower(out$id)
+  # Debian unstable & testing: lsb_release "version" doesn't include numbers, but we can map the codename to one
+ if (is.null(out$version) || out$version %in% c("testing", "unstable")) {
+ if (grepl("bullseye", out$codename)) {
+ out$short_version <- "11"
+ } else if (grepl("bookworm", out$codename)) {
+ out$short_version <- "12"
+ }
+ } else if (out$id == "ubuntu") {
+ # Keep major.minor version
+ out$short_version <- sub('^"?([0-9]+\\.[0-9]+).*"?.*$', "\\1", out$version)
+ } else {
+ # Only major version number
+ out$short_version <- sub('^"?([0-9]+).*"?.*$', "\\1", out$version)
+ }
+ out
+}
+
+lsb_release <- function() {
+ if (have_lsb_release()) {
+ list(
+ id = call_lsb("-is"),
+ version = call_lsb("-rs"),
+ codename = call_lsb("-cs")
+ )
+ } else {
+ NULL
+ }
+}
+
+have_lsb_release <- function() nzchar(Sys.which("lsb_release"))
+call_lsb <- function(args) system(paste("lsb_release", args), intern = TRUE)
+
+os_release <- function() {
+ rel_data <- read_os_release()
+ if (!is.null(rel_data)) {
+ vals <- as.list(sub('^.*="?(.*?)"?$', "\\1", rel_data))
+ names(vals) <- sub("^(.*)=.*$", "\\1", rel_data)
+
+ out <- list(
+ id = vals[["ID"]],
+ version = vals[["VERSION_ID"]]
+ )
+ if ("VERSION_CODENAME" %in% names(vals)) {
+ out$codename <- vals[["VERSION_CODENAME"]]
+ } else {
+ # This probably isn't right, maybe could extract codename from pretty name?
+ out$codename <- vals[["PRETTY_NAME"]]
+ }
+ out
+ } else {
+ NULL
+ }
+}
+
+read_os_release <- function() {
+ if (file.exists("/etc/os-release")) {
+ readLines("/etc/os-release")
+ }
+}
+
+system_release <- function() {
+ rel_data <- read_system_release()
+ if (!is.null(rel_data)) {
+ # Something like "CentOS Linux release 7.7.1908 (Core)"
+ list(
+ id = sub("^([a-zA-Z]+) .* ([0-9.]+).*$", "\\1", rel_data),
+ version = sub("^([a-zA-Z]+) .* ([0-9.]+).*$", "\\2", rel_data),
+ codename = NA
+ )
+ } else {
+ NULL
+ }
+}
+
+read_system_release <- function() {
+ if (file.exists("/etc/system-release")) {
+ readLines("/etc/system-release")[1]
+ }
+}
+
+#### end distro ####
+
+find_available_binary <- function(os) {
+ # Download a csv that maps one to the other, columns "actual" and "use_this"
+ u <- "https://raw.githubusercontent.com/ursa-labs/arrow-r-nightly/master/linux/distro-map.csv"
+ lookup <- try(utils::read.csv(u, stringsAsFactors = FALSE), silent = quietly)
+ if (!inherits(lookup, "try-error") && os %in% lookup$actual) {
+ new <- lookup$use_this[lookup$actual == os]
+ if (length(new) == 1 && !is.na(new)) { # Just some sanity checking
+ cat(sprintf("*** Using %s binary for %s\n", new, os))
+ os <- new
+ }
+ }
+ os
+}
+
+find_local_source <- function() {
+ # We'll take the first of these that exists
+ # The first case probably occurs if we're in the arrow git repo
+ # The second probably occurs if we're installing the arrow R package
+ cpp_dir_options <- c(
+ file.path(Sys.getenv("ARROW_SOURCE_HOME", ".."), "cpp"),
+ "tools/cpp"
+ )
+ for (cpp_dir in cpp_dir_options) {
+ if (file.exists(file.path(cpp_dir, "src/arrow/api.h"))) {
+ cat(paste0("*** Found local C++ source: '", cpp_dir, "'\n"))
+ return(cpp_dir)
+ }
+ }
+ NULL
+}
+
+env_vars_as_string <- function(env_var_list) {
+ # Do some basic checks on env_var_list:
+ # Check that env_var_list has names, that those names are valid POSIX
+ # environment variables, and that none of the values contain `'`.
+ stopifnot(
+ length(env_var_list) == length(names(env_var_list)),
+ all(grepl("^[^0-9]", names(env_var_list))),
+ all(grepl("^[A-Z0-9_]+$", names(env_var_list))),
+ !any(grepl("'", env_var_list, fixed = TRUE))
+ )
+ env_var_string <- paste0(names(env_var_list), "='", env_var_list, "'", collapse = " ")
+ if (nchar(env_var_string) > 30000) {
+ # This could happen if the full paths in *_SOURCE_URL were *very* long.
+ # A more formal check would look at getconf ARG_MAX, but this shouldn't matter
+ cat("*** Warning: Environment variables are very long. This could cause issues on some shells.\n")
+ }
+ env_var_string
+}
+
+build_libarrow <- function(src_dir, dst_dir) {
+ # We'll need to compile R bindings with these libs, so delete any .o files
+ system("rm src/*.o", ignore.stdout = TRUE, ignore.stderr = TRUE)
+ # Set up make for parallel building
+ makeflags <- Sys.getenv("MAKEFLAGS")
+ if (makeflags == "") {
+ # CRAN policy says not to use more than 2 cores during checks
+ # If you have more and want to use more, set MAKEFLAGS
+ ncores <- min(parallel::detectCores(), 2)
+ makeflags <- sprintf("-j%s", ncores)
+ Sys.setenv(MAKEFLAGS = makeflags)
+ }
+ if (!quietly) {
+ cat("*** Building with MAKEFLAGS=", makeflags, "\n")
+ }
+ # Check for libarrow build dependencies:
+ # * cmake
+ cmake <- ensure_cmake()
+
+ # Optionally build somewhere not in tmp so we can dissect the build if it fails
+ debug_dir <- Sys.getenv("LIBARROW_DEBUG_DIR")
+ if (nzchar(debug_dir)) {
+ build_dir <- debug_dir
+ } else {
+ # But normally we'll just build in a tmp dir
+ build_dir <- tempfile()
+ }
+ options(.arrow.cleanup = c(getOption(".arrow.cleanup"), build_dir))
+
+ R_CMD_config <- function(var) {
+ if (getRversion() < 3.4) {
+ # var names were called CXX1X instead of CXX11
+ var <- sub("^CXX11", "CXX1X", var)
+ }
+    # tools::Rcmd was introduced in R 3.3
+ tools::Rcmd(paste("config", var), stdout = TRUE)
+ }
+ env_var_list <- c(
+ SOURCE_DIR = src_dir,
+ BUILD_DIR = build_dir,
+ DEST_DIR = dst_dir,
+ CMAKE = cmake,
+ # EXTRA_CMAKE_FLAGS will often be "", but it's convenient later to have it defined
+ EXTRA_CMAKE_FLAGS = Sys.getenv("EXTRA_CMAKE_FLAGS"),
+ # Make sure we build with the same compiler settings that R is using
+ CC = R_CMD_config("CC"),
+ CXX = paste(R_CMD_config("CXX11"), R_CMD_config("CXX11STD")),
+ # CXXFLAGS = R_CMD_config("CXX11FLAGS"), # We don't want the same debug symbols
+ LDFLAGS = R_CMD_config("LDFLAGS")
+ )
+ env_var_list <- with_s3_support(env_var_list)
+ env_var_list <- with_mimalloc(env_var_list)
+
+ # turn_off_all_optional_features() needs to happen after with_mimalloc() and
+ # with_s3_support(), since those might turn features ON.
+ thirdparty_deps_unavailable <- !download_ok &&
+ !dir.exists(thirdparty_dependency_dir) &&
+ !env_is("ARROW_DEPENDENCY_SOURCE", "system")
+ on_solaris <- tolower(Sys.info()[["sysname"]]) %in% "sunos"
+ do_minimal_build <- on_solaris || env_is("LIBARROW_MINIMAL", "true")
+
+ if (do_minimal_build) {
+    # Note that JSON support does work on Solaris, but it is turned off along
+    # with the rest of the optional dependencies.
+    # The other dependencies either don't compile (e.g. thrift, jemalloc, and
+    # xsimd) or compile but then `ar` fails to build
+    # libarrow_bundled_dependencies (e.g. re2 and utf8proc).
+ env_var_list <- turn_off_all_optional_features(env_var_list)
+ } else if (thirdparty_deps_unavailable) {
+ cat(paste0(
+ "*** Building C++ library from source, but downloading thirdparty dependencies\n",
+ " is not possible, so this build will turn off all thirdparty features.\n",
+ " See install vignette for details:\n",
+ " https://cran.r-project.org/web/packages/arrow/vignettes/install.html\n"
+ ))
+ env_var_list <- turn_off_all_optional_features(env_var_list)
+ } else if (dir.exists(thirdparty_dependency_dir)) {
+ # Add the *_SOURCE_URL env vars
+ env_var_list <- set_thirdparty_urls(env_var_list)
+ }
+ env_vars <- env_vars_as_string(env_var_list)
+
+ cat("**** arrow", ifelse(quietly, "", paste("with", env_vars)), "\n")
+ status <- suppressWarnings(system(
+ paste(env_vars, "inst/build_arrow_static.sh"),
+ ignore.stdout = quietly, ignore.stderr = quietly
+ ))
+ if (status != 0) {
+ # It failed :(
+ cat(
+ "**** Error building Arrow C++.",
+ ifelse(env_is("ARROW_R_DEV", "true"), "", "Re-run with ARROW_R_DEV=true for debug information."),
+ "\n"
+ )
+ }
+ invisible(status)
+}
+
+ensure_cmake <- function() {
+ cmake <- find_cmake(c(
+ Sys.getenv("CMAKE"),
+ Sys.which("cmake"),
+ Sys.which("cmake3")
+ ))
+
+ if (is.null(cmake)) {
+ # If not found, download it
+ cat("**** cmake\n")
+ CMAKE_VERSION <- Sys.getenv("CMAKE_VERSION", "3.19.2")
+ if (tolower(Sys.info()[["sysname"]]) %in% "darwin") {
+ postfix <- "-macos-universal.tar.gz"
+ } else {
+ postfix <- "-Linux-x86_64.tar.gz"
+ }
+ cmake_binary_url <- paste0(
+ "https://github.com/Kitware/CMake/releases/download/v", CMAKE_VERSION,
+ "/cmake-", CMAKE_VERSION, postfix
+ )
+ cmake_tar <- tempfile()
+ cmake_dir <- tempfile()
+ download_successful <- try_download(cmake_binary_url, cmake_tar)
+ if (!download_successful) {
+ cat(paste0(
+ "*** cmake was not found locally and download failed.\n",
+ " Make sure cmake >= 3.10 is installed and available on your PATH,\n",
+ " or download ", cmake_binary_url, "\n",
+ " and define the CMAKE environment variable.\n"
+ ))
+ }
+ untar(cmake_tar, exdir = cmake_dir)
+ unlink(cmake_tar)
+ options(.arrow.cleanup = c(getOption(".arrow.cleanup"), cmake_dir))
+ cmake <- paste0(
+ cmake_dir,
+ "/cmake-", CMAKE_VERSION, sub(".tar.gz", "", postfix, fixed = TRUE),
+ "/bin/cmake"
+ )
+ }
+ cmake
+}
+
+find_cmake <- function(paths, version_required = 3.10) {
+ # Given a list of possible cmake paths, return the first one that exists and is new enough
+ for (path in paths) {
+ if (nzchar(path) && cmake_version(path) >= version_required) {
+ # Sys.which() returns a named vector, but that plays badly with c() later
+ names(path) <- NULL
+ return(path)
+ }
+ }
+ # If none found, return NULL
+ NULL
+}
+
+cmake_version <- function(cmd = "cmake") {
+ tryCatch(
+ {
+ raw_version <- system(paste(cmd, "--version"), intern = TRUE, ignore.stderr = TRUE)
+ pat <- ".* ([0-9\\.]+).*?"
+ which_line <- grep(pat, raw_version)
+ package_version(sub(pat, "\\1", raw_version[which_line]))
+ },
+ error = function(e) {
+ return(0)
+ }
+ )
+}
+
+turn_off_all_optional_features <- function(env_var_list) {
+ # Because these are done as environment variables (as opposed to build flags),
+ # setting these to "OFF" overrides any previous setting. We don't need to
+ # check the existing value.
+ turn_off <- c(
+ "ARROW_MIMALLOC" = "OFF",
+ "ARROW_JEMALLOC" = "OFF",
+ "ARROW_JSON" = "OFF",
+ "ARROW_PARQUET" = "OFF", # depends on thrift
+ "ARROW_DATASET" = "OFF", # depends on parquet
+ "ARROW_S3" = "OFF",
+ "ARROW_WITH_BROTLI" = "OFF",
+ "ARROW_WITH_BZ2" = "OFF",
+ "ARROW_WITH_LZ4" = "OFF",
+ "ARROW_WITH_SNAPPY" = "OFF",
+ "ARROW_WITH_ZLIB" = "OFF",
+ "ARROW_WITH_ZSTD" = "OFF",
+ "ARROW_WITH_RE2" = "OFF",
+ "ARROW_WITH_UTF8PROC" = "OFF",
+ # The syntax to turn off XSIMD is different.
+ # Pull existing value of EXTRA_CMAKE_FLAGS first (must be defined)
+ "EXTRA_CMAKE_FLAGS" = paste(
+ env_var_list[["EXTRA_CMAKE_FLAGS"]],
+ "-DARROW_SIMD_LEVEL=NONE -DARROW_RUNTIME_SIMD_LEVEL=NONE"
+ )
+ )
+ # Create a new env_var_list, with the values of turn_off set.
+ # replace() also adds new values if they didn't exist before
+ replace(env_var_list, names(turn_off), turn_off)
+}
+
+set_thirdparty_urls <- function(env_var_list) {
+ # This function does *not* check if existing *_SOURCE_URL variables are set.
+ # The directory tools/thirdparty_dependencies is created by
+ # create_package_with_all_dependencies() and saved in the tar file.
+ files <- list.files(thirdparty_dependency_dir, full.names = FALSE)
+ url_env_varname <- toupper(sub("(.*?)-.*", "ARROW_\\1_URL", files))
+ # Special handling for the aws dependencies, which have extra `-`
+ aws <- grepl("^aws", files)
+ url_env_varname[aws] <- sub(
+ "AWS_SDK_CPP", "AWSSDK",
+ gsub(
+ "-", "_",
+ sub(
+ "(AWS.*)-.*", "ARROW_\\1_URL",
+ toupper(files[aws])
+ )
+ )
+ )
+ full_filenames <- file.path(normalizePath(thirdparty_dependency_dir), files)
+
+ env_var_list <- replace(env_var_list, url_env_varname, full_filenames)
+ if (!quietly) {
+ env_var_list <- replace(env_var_list, "ARROW_VERBOSE_THIRDPARTY_BUILD", "ON")
+ }
+ env_var_list
+}
+
+is_feature_requested <- function(env_varname, default = env_is("LIBARROW_MINIMAL", "false")) {
+ env_value <- tolower(Sys.getenv(env_varname))
+ if (identical(env_value, "off")) {
+ # If e.g. ARROW_MIMALLOC=OFF explicitly, override default
+ requested <- FALSE
+ } else if (identical(env_value, "on")) {
+ requested <- TRUE
+ } else {
+ requested <- default
+ }
+ requested
+}
+
+with_mimalloc <- function(env_var_list) {
+ arrow_mimalloc <- is_feature_requested("ARROW_MIMALLOC")
+ if (arrow_mimalloc) {
+ # User wants mimalloc. If they're using gcc, let's make sure the version is >= 4.9
+ if (isTRUE(cmake_gcc_version(env_var_list) < "4.9")) {
+ cat("**** mimalloc support not available for gcc < 4.9; building with ARROW_MIMALLOC=OFF\n")
+ arrow_mimalloc <- FALSE
+ }
+ }
+ replace(env_var_list, "ARROW_MIMALLOC", ifelse(arrow_mimalloc, "ON", "OFF"))
+}
+
+with_s3_support <- function(env_var_list) {
+ arrow_s3 <- is_feature_requested("ARROW_S3")
+ if (arrow_s3) {
+ # User wants S3 support. If they're using gcc, let's make sure the version is >= 4.9
+ # and make sure that we have curl and openssl system libs
+ if (isTRUE(cmake_gcc_version(env_var_list) < "4.9")) {
+ cat("**** S3 support not available for gcc < 4.9; building with ARROW_S3=OFF\n")
+ arrow_s3 <- FALSE
+ } else if (!cmake_find_package("CURL", NULL, env_var_list)) {
+ # curl on macos should be installed, so no need to alter this for macos
+ cat("**** S3 support requires libcurl-devel (rpm) or libcurl4-openssl-dev (deb); building with ARROW_S3=OFF\n")
+ arrow_s3 <- FALSE
+ } else if (!cmake_find_package("OpenSSL", "1.0.2", env_var_list)) {
+ cat("**** S3 support requires version >= 1.0.2 of openssl-devel (rpm), libssl-dev (deb), or openssl (brew); building with ARROW_S3=OFF\n")
+ arrow_s3 <- FALSE
+ }
+ }
+ replace(env_var_list, "ARROW_S3", ifelse(arrow_s3, "ON", "OFF"))
+}
+
+cmake_gcc_version <- function(env_var_list) {
+ # This function returns NA if using a non-gcc compiler
+ # Always enclose calls to it in isTRUE() or isFALSE()
+ vals <- cmake_cxx_compiler_vars(env_var_list)
+ if (!identical(vals[["CMAKE_CXX_COMPILER_ID"]], "GNU")) {
+ return(NA)
+ }
+ package_version(vals[["CMAKE_CXX_COMPILER_VERSION"]])
+}
+
+cmake_cxx_compiler_vars <- function(env_var_list) {
+ env_vars <- env_vars_as_string(env_var_list)
+ info <- system(paste("export", env_vars, "&& $CMAKE --system-information"), intern = TRUE)
+ info <- grep("^[A-Z_]* .*$", info, value = TRUE)
+ vals <- as.list(sub('^.*? "?(.*?)"?$', "\\1", info))
+ names(vals) <- sub("^(.*?) .*$", "\\1", info)
+ vals[grepl("^CMAKE_CXX_COMPILER_?", names(vals))]
+}
+
+cmake_find_package <- function(pkg, version = NULL, env_var_list) {
+ td <- tempfile()
+ dir.create(td)
+ options(.arrow.cleanup = c(getOption(".arrow.cleanup"), td))
+ find_package <- paste0("find_package(", pkg, " ", version, " REQUIRED)")
+ writeLines(find_package, file.path(td, "CMakeLists.txt"))
+ env_vars <- env_vars_as_string(env_var_list)
+ cmake_cmd <- paste0(
+ "export ", env_vars,
+ " && cd ", td,
+ " && $CMAKE ",
+ " -DCMAKE_EXPORT_NO_PACKAGE_REGISTRY=ON",
+ " -DCMAKE_FIND_PACKAGE_NO_PACKAGE_REGISTRY=ON",
+ " ."
+ )
+ system(cmake_cmd, ignore.stdout = TRUE, ignore.stderr = TRUE) == 0
+}
+
+#####
+
+if (!file.exists(paste0(dst_dir, "/include/arrow/api.h"))) {
+ # If we're working in a local checkout and have already built the libs, we
+ # don't need to do anything. Otherwise,
+ # (1) Look for a prebuilt binary for this version
+ bin_file <- src_dir <- NULL
+ if (download_ok && binary_ok) {
+ bin_file <- download_binary()
+ }
+ if (!is.null(bin_file)) {
+ # Extract them
+ dir.create(dst_dir, showWarnings = !quietly, recursive = TRUE)
+ unzip(bin_file, exdir = dst_dir)
+ unlink(bin_file)
+ } else if (build_ok) {
+ # (2) Find source and build it
+ src_dir <- find_local_source()
+ if (!is.null(src_dir)) {
+ cat("*** Building C++ libraries\n")
+ build_libarrow(src_dir, dst_dir)
+ } else {
+ cat("*** Proceeding without C++ dependencies\n")
+ }
+ } else {
+ cat("*** Proceeding without C++ dependencies\n")
+ }
+}
diff --git a/src/arrow/r/tools/ubsan.supp b/src/arrow/r/tools/ubsan.supp
new file mode 100644
index 000000000..ff88cf984
--- /dev/null
+++ b/src/arrow/r/tools/ubsan.supp
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+vptr:include/c++/8/bits/shared_ptr_base.h
diff --git a/src/arrow/r/tools/winlibs.R b/src/arrow/r/tools/winlibs.R
new file mode 100644
index 000000000..ccaa5c95d
--- /dev/null
+++ b/src/arrow/r/tools/winlibs.R
@@ -0,0 +1,65 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+args <- commandArgs(TRUE)
+VERSION <- args[1]
+if (!file.exists(sprintf("windows/arrow-%s/include/arrow/api.h", VERSION))) {
+ if (length(args) > 1) {
+ # Arg 2 would be the path/to/lib.zip
+ localfile <- args[2]
+ cat(sprintf("*** Using RWINLIB_LOCAL %s\n", localfile))
+ if (!file.exists(localfile)) {
+ cat(sprintf("*** %s does not exist; build will fail\n", localfile))
+ }
+ file.copy(localfile, "lib.zip")
+ } else {
+ # Download static arrow from rwinlib
+ if (getRversion() < "3.3.0") setInternet2()
+ quietly <- !identical(tolower(Sys.getenv("ARROW_R_DEV")), "true")
+ get_file <- function(template, version) {
+ try(
+ suppressWarnings(
+ download.file(sprintf(template, version), "lib.zip", quiet = quietly)
+ ),
+ silent = quietly
+ )
+ }
+ # URL templates
+ nightly <- "https://arrow-r-nightly.s3.amazonaws.com/libarrow/bin/windows/arrow-%s.zip"
+ rwinlib <- "https://github.com/rwinlib/arrow/archive/v%s.zip"
+ # First look for a nightly
+ get_file(nightly, VERSION)
+ # If not found, then check rwinlib
+ if (!file.exists("lib.zip")) {
+ get_file(rwinlib, VERSION)
+ }
+ if (!file.exists("lib.zip")) {
+ # Try a different version
+ # First, try pruning off a dev number, i.e. go from 0.14.1.1 to 0.14.1
+ VERSION <- sub("^([0-9]+\\.[0-9]+\\.[0-9]+).*$", "\\1", VERSION)
+ get_file(rwinlib, VERSION)
+ }
+ if (!file.exists("lib.zip")) {
+ # Next, try without a patch release, i.e. go from 0.14.1 to 0.14.0
+ VERSION <- sub("^([0-9]+\\.[0-9]+\\.).*$", "\\10", VERSION)
+ get_file(rwinlib, VERSION)
+ }
+ }
+ dir.create("windows", showWarnings = FALSE)
+ unzip("lib.zip", exdir = "windows")
+ unlink("lib.zip")
+}
diff --git a/src/arrow/r/vignettes/arrow.Rmd b/src/arrow/r/vignettes/arrow.Rmd
new file mode 100644
index 000000000..ff6bf7ce0
--- /dev/null
+++ b/src/arrow/r/vignettes/arrow.Rmd
@@ -0,0 +1,225 @@
+---
+title: "Using the Arrow C++ Library in R"
+description: "This document describes the low-level interface to the Apache Arrow C++ library in R and reviews the patterns and conventions of the R package."
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Using the Arrow C++ Library in R}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+The Apache Arrow C++ library provides rich, powerful features for working with columnar data. The `arrow` R package provides both a low-level interface to the C++ library and some higher-level, R-flavored tools for working with it. This vignette provides an overview of how the pieces fit together, and it describes the conventions that the classes and methods follow in R.
+
+# Features
+
+## Multi-file datasets
+
+The `arrow` package lets you work efficiently with large, multi-file datasets
+using `dplyr` methods. See `vignette("dataset", package = "arrow")` for an overview.
+
+## Reading and writing files
+
+`arrow` provides some simple functions for using the Arrow C++ library to read and write files.
+These functions are designed to drop into your normal R workflow
+without requiring any knowledge of the Arrow C++ library
+and use naming conventions and arguments that follow popular R packages, particularly `readr`.
+The readers return `data.frame`s
+(or if you use the `tibble` package, they will act like `tbl_df`s),
+and the writers take `data.frame`s.
+
+Importantly, `arrow` provides basic read and write support for the [Apache
+Parquet](https://parquet.apache.org/) columnar data file format.
+
+```r
+library(arrow)
+df <- read_parquet("path/to/file.parquet")
+```
+
+Just as you can read, you can write Parquet files:
+
+```r
+write_parquet(df, "path/to/different_file.parquet")
+```
+
+The `arrow` package also includes a faster and more robust implementation of the
+[Feather](https://github.com/wesm/feather) file format, providing `read_feather()` and
+`write_feather()`. This implementation depends
+on the same underlying C++ library as the Python version does,
+resulting in more reliable and consistent behavior across the two languages, as
+well as [improved performance](https://wesmckinney.com/blog/feather-arrow-future/).
+`arrow` also by default writes the Feather V2 format,
+which supports a wider range of data types, as well as compression.
+
+For CSV and line-delimited JSON, there are `read_csv_arrow()` and `read_json_arrow()`, respectively.
+While `read_csv_arrow()` currently has fewer parsing options for dealing with
+every CSV format variation in the wild, for the files it can read, it is
+often significantly faster than other R CSV readers, such as
+`base::read.csv`, `readr::read_csv`, and `data.table::fread`.
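+
+As a minimal sketch (the file paths here are placeholders), the Feather and
+CSV readers follow the same pattern as `read_parquet()`:
+
+```r
+library(arrow)
+
+# Write a data.frame to a Feather (V2) file and read it back
+write_feather(mtcars, "path/to/file.feather")
+df <- read_feather("path/to/file.feather")
+
+# Read a CSV with the Arrow C++ CSV reader
+df2 <- read_csv_arrow("path/to/file.csv")
+```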
+
+## Working with Arrow data in Python
+
+Using [`reticulate`](https://rstudio.github.io/reticulate/), `arrow` lets you
+share data between R and Python (`pyarrow`) efficiently, enabling you to take
+advantage of the vibrant ecosystem of Python packages that build on top of
+Apache Arrow. See `vignette("python", package = "arrow")` for details.
+
+## Access to Arrow messages, buffers, and streams
+
+The `arrow` package also provides many lower-level bindings to the C++ library, which enable you
+to access and manipulate Arrow objects. You can use these to build connectors
+to other applications and services that use Arrow. One example is Spark: the
+[`sparklyr`](https://spark.rstudio.com/) package has support for using Arrow to
+move data to and from Spark, yielding [significant performance
+gains](http://arrow.apache.org/blog/2019/01/25/r-spark-improvements/).
+
+# Object hierarchy
+
+## Metadata objects
+
+Arrow defines the following classes for representing metadata:
+
+| Class | Description | How to create an instance |
+| ---------- | -------------------------------------------------- | -------------------------------- |
+| `DataType` | attribute controlling how values are represented | functions in `help("data-type")` |
+| `Field` | a character string name and a `DataType` | `field(name, type)` |
+| `Schema` | list of `Field`s | `schema(...)` |
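+
+For instance (a minimal sketch; the field names are arbitrary), you can build
+a schema from fields like this:
+
+```r
+library(arrow)
+
+f <- field("x", int32()) # a named field with a DataType
+s <- schema(x = int32(), y = utf8()) # a Schema is a list of Fields
+```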
+
+## Data objects
+
+Arrow defines the following classes for representing zero-dimensional (scalar),
+one-dimensional (array/vector-like), and two-dimensional (tabular/data
+frame-like) data:
+
+| Dim | Class | Description | How to create an instance |
+| --- | -------------- | ----------------------------------------- | ------------------------------------------------------------------------------------------------------|
+| 0 | `Scalar` | single value and its `DataType` | `Scalar$create(value, type)` |
+| 1 | `Array` | vector of values and its `DataType` | `Array$create(vector, type)` |
+| 1 | `ChunkedArray` | vectors of values and their `DataType` | `ChunkedArray$create(..., type)` or alias `chunked_array(..., type)` |
+| 2 | `RecordBatch` | list of `Array`s with a `Schema` | `RecordBatch$create(...)` or alias `record_batch(...)` |
+| 2 | `Table` | list of `ChunkedArray` with a `Schema` | `Table$create(...)`, alias `arrow_table(...)`, or `arrow::read_*(file, as_data_frame = FALSE)` |
+| 2 | `Dataset` | list of `Table`s with the same `Schema` | `Dataset$create(sources, schema)` or alias `open_dataset(sources, schema)` |
+
+Each of these is defined as an `R6` class in the `arrow` R package and
+corresponds to a class of the same name in the Arrow C++ library. The `arrow`
+package provides a variety of `R6` and S3 methods for interacting with instances
+of these classes.
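+
+As a small illustration (the values here are arbitrary), instances of these
+classes can be created from ordinary R objects:
+
+```r
+library(arrow)
+
+a <- Array$create(1:5) # one contiguous vector of int32 values
+ca <- chunked_array(1:5, 6:10) # two chunks with a shared DataType
+rb <- record_batch(x = 1:3, y = c("a", "b", "c"))
+tab <- Table$create(x = 1:3, y = c("a", "b", "c"))
+```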
+
+For convenience, the `arrow` package also defines several synthetic classes that
+do not exist in the C++ library, including:
+
+* `ArrowDatum`: inherited by `Scalar`, `Array`, and `ChunkedArray`
+* `ArrowTabular`: inherited by `RecordBatch` and `Table`
+* `ArrowObject`: inherited by all Arrow objects
+
+# Internals
+
+## Mapping of R <--> Arrow types
+
+Arrow has a rich data type system that includes direct parallels with R's data types and much more.
+
+In the tables, entries with a `-` are not currently implemented.
+
+### R to Arrow
+
+| R type | Arrow type |
+|--------------------------|------------|
+| logical | boolean |
+| integer | int32 |
+| double ("numeric") | float64^1^ |
+| character | utf8^2^ |
+| factor | dictionary |
+| raw | uint8 |
+| Date | date32 |
+| POSIXct | timestamp |
+| POSIXlt | struct |
+| data.frame | struct |
+| list^3^ | list |
+| bit64::integer64 | int64 |
+| difftime | time32 |
+| vctrs::vctrs_unspecified | null |
+
+
+
+^1^: `float64` and `double` are the same concept and data type in Arrow C++;
+however, only `float64()` is used in arrow because the function `double()`
+already exists in base R
+
+^2^: If the character vector exceeds 2GB of strings, it will be converted to a
+`large_utf8` Arrow type
+
+^3^: Only lists where all elements are the same type are able to be translated
+to Arrow list type (which is a "list of" some type).
+
+
+### Arrow to R
+
+| Arrow type | R type |
+|-------------------|------------------------------|
+| boolean | logical |
+| int8 | integer |
+| int16 | integer |
+| int32 | integer |
+| int64 | integer^1^ |
+| uint8 | integer |
+| uint16 | integer |
+| uint32 | integer^1^ |
+| uint64 | integer^1^ |
+| float16 | -^2^ |
+| float32 | double |
+| float64 | double |
+| utf8 | character |
+| large_utf8 | character |
+| binary | arrow_binary ^3^ |
+| large_binary | arrow_large_binary ^3^ |
+| fixed_size_binary | arrow_fixed_size_binary ^3^ |
+| date32 | Date |
+| date64 | POSIXct |
+| time32 | hms::difftime |
+| time64 | hms::difftime |
+| timestamp | POSIXct |
+| duration | -^2^ |
+| decimal | double |
+| dictionary | factor^4^ |
+| list | arrow_list ^5^ |
+| large_list | arrow_large_list ^5^ |
+| fixed_size_list | arrow_fixed_size_list ^5^ |
+| struct | data.frame |
+| null | vctrs::vctrs_unspecified |
+| map | -^2^ |
+| union | -^2^ |
+
+^1^: These integer types may contain values that exceed the range of R's
+`integer` type (32-bit signed integer). When they do, `uint32` and `uint64` are
+converted to `double` ("numeric") and `int64` is converted to
+`bit64::integer64`. This conversion can be disabled (so that `int64` always
+yields a `bit64::integer64` vector) by setting `options(arrow.int64_downcast = FALSE)`.
+
+^2^: Some Arrow data types do not currently have an R equivalent and will raise an error
+if cast to or mapped to via a schema.
+
+^3^: `arrow*_binary` classes are implemented as lists of raw vectors.
+
+^4^: Due to the limitation of R factors, Arrow `dictionary` values are coerced
+to string when translated to R if they are not already strings.
+
+^5^: `arrow*_list` classes are implemented as subclasses of `vctrs_list_of`
+with a `ptype` attribute set to what an empty Array of the value type converts to.
+
+
+### R object attributes
+
+Arrow supports custom key-value metadata attached to Schemas. When we convert a `data.frame` to an Arrow Table or RecordBatch, the package stores any `attributes()` attached to the columns of the `data.frame` in the Arrow object's Schema. These attributes are stored under the "r" key; you can assign additional string metadata under any other key you wish, like `x$metadata$new_key <- "new value"`.
+
+This metadata is preserved when writing the table to Feather or Parquet, and when reading those files into R, or when calling `as.data.frame()` on a Table/RecordBatch, the column attributes are restored to the columns of the resulting `data.frame`. This means that custom data types, including `haven::labelled`, `vctrs` annotations, and others, are preserved when doing a round-trip through Arrow.
+
+Note that the `attributes()` stored in `$metadata$r` are only understood by R. If you write a `data.frame` with `haven` columns to a Feather file and read that in Pandas, the `haven` metadata won't be recognized there. (Similarly, Pandas writes its own custom metadata, which the R package does not consume.) You are free, however, to define custom metadata conventions for your application and assign any (string) values you want to other metadata keys. For more details, see the documentation for `schema()`.
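+
+A brief sketch of how this looks in practice (the column attribute and the
+metadata key are made up for illustration):
+
+```r
+library(arrow)
+
+df <- data.frame(x = 1:3)
+attr(df$x, "label") <- "a labelled column"
+tab <- Table$create(df)
+tab$metadata$r # the serialized column attributes, stored under the "r" key
+tab$metadata$my_key <- "my value" # custom string metadata under another key
+as.data.frame(tab)$x # attributes are restored on conversion back to R
+```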
+
+## Class structure and package conventions
+
+C++ is an object-oriented language, so the core logic of the Arrow library is encapsulated in classes and methods. In the R package, these classes are implemented as `R6` reference classes, most of which are exported from the namespace.
+
+In order to match the C++ naming conventions, the `R6` classes are in TitleCase, e.g. `RecordBatch`. This makes it easy to look up the relevant C++ implementations in the [code](https://github.com/apache/arrow/tree/master/cpp) or [documentation](https://arrow.apache.org/docs/cpp/). To simplify things in R, the C++ library namespaces are generally dropped or flattened; that is, where the C++ library has `arrow::io::FileOutputStream`, it is just `FileOutputStream` in the R package. One exception is for the file readers, where the namespace is necessary to disambiguate. So `arrow::csv::TableReader` becomes `CsvTableReader`, and `arrow::json::TableReader` becomes `JsonTableReader`.
+
+Some of these classes are not meant to be instantiated directly; they may be base classes or other kinds of helpers. For those that you should be able to create, use the `$create()` method to instantiate an object. For example, `rb <- RecordBatch$create(int = 1:10, dbl = as.numeric(1:10))` will create a `RecordBatch`. Many of these factory methods that an R user might most often encounter also have a `snake_case` alias, in order to be more familiar for contemporary R users. So `record_batch(int = 1:10, dbl = as.numeric(1:10))` would do the same as `RecordBatch$create()` above.
+
+The typical user of the `arrow` R package may never deal directly with the `R6` objects. We provide more R-friendly wrapper functions as a higher-level interface to the C++ library. An R user can call `read_parquet()` without knowing or caring that they're instantiating a `ParquetFileReader` object and calling the `$ReadFile()` method on it. The classes are there and available to the advanced programmer who wants fine-grained control over how the C++ library is used.
diff --git a/src/arrow/r/vignettes/dataset.Rmd b/src/arrow/r/vignettes/dataset.Rmd
new file mode 100644
index 000000000..3f33cbae4
--- /dev/null
+++ b/src/arrow/r/vignettes/dataset.Rmd
@@ -0,0 +1,421 @@
+---
+title: "Working with Arrow Datasets and dplyr"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Working with Arrow Datasets and dplyr}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+Apache Arrow lets you work efficiently with large, multi-file datasets.
+The arrow R package provides a [dplyr](https://dplyr.tidyverse.org/) interface to Arrow Datasets,
+and other tools for interactive exploration of Arrow data.
+
+This vignette introduces Datasets and shows how to use dplyr to analyze them.
+
+## Example: NYC taxi data
+
+The [New York City taxi trip record data](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
+is widely used in big data exercises and competitions.
+For demonstration purposes, we have hosted a Parquet-formatted version
+of about ten years of the trip data in a public Amazon S3 bucket.
+
+The total file size is around 37 gigabytes, even in the efficient Parquet file
+format. That's bigger than memory on most people's computers, so you can't just
+read it all in and stack it into a single data frame.
+
+On Windows (for R > 3.6) and macOS, the binary packages include S3 support.
+On Linux, when installing from source, S3 support is not enabled by default,
+and it has additional system requirements.
+See `vignette("install", package = "arrow")` for details.
+To see if your arrow installation has S3 support, run:
+
+```{r}
+arrow::arrow_with_s3()
+```
+
+Even with S3 support enabled, network speed will be a bottleneck unless your
+machine is located in the same AWS region as the data. So, for this vignette,
+we assume that the NYC taxi dataset has been downloaded locally in an "nyc-taxi"
+directory.
+
+### Retrieving data from a public Amazon S3 bucket
+
+If your arrow build has S3 support, you can sync the data locally with:
+
+```{r, eval = FALSE}
+arrow::copy_files("s3://ursa-labs-taxi-data", "nyc-taxi")
+```
+
+If your arrow build doesn't have S3 support, you can download the files
+with some additional code:
+
+```{r, eval = FALSE}
+bucket <- "https://ursa-labs-taxi-data.s3.us-east-2.amazonaws.com"
+for (year in 2009:2019) {
+ if (year == 2019) {
+ # We only have through June 2019 there
+ months <- 1:6
+ } else {
+ months <- 1:12
+ }
+ for (month in sprintf("%02d", months)) {
+ dir.create(file.path("nyc-taxi", year, month), recursive = TRUE)
+ try(download.file(
+ paste(bucket, year, month, "data.parquet", sep = "/"),
+ file.path("nyc-taxi", year, month, "data.parquet"),
+ mode = "wb"
+ ), silent = TRUE)
+ }
+}
+```
+
+Note that these download steps in the vignette are not executed: if you want to
+run with live data, you'll have to run them yourself.
+Given the size, if you're running this locally and don't have a fast connection,
+feel free to grab only a year or two of data.
+
+If you don't have the taxi data downloaded, the vignette will still run and will
+yield previously cached output for reference. To be explicit about which version
+is running, let's check whether you're running with live data:
+
+```{r}
+dir.exists("nyc-taxi")
+```
+
+## Opening the dataset
+
+Because dplyr is not necessary for many Arrow workflows,
+it is an optional (`Suggests`) dependency. So, to work with Datasets,
+you need to load both arrow and dplyr.
+
+```{r}
+library(arrow, warn.conflicts = FALSE)
+library(dplyr, warn.conflicts = FALSE)
+```
+
+The first step is to create a Dataset object, pointing at the directory of data.
+
+```{r, eval = file.exists("nyc-taxi")}
+ds <- open_dataset("nyc-taxi", partitioning = c("year", "month"))
+```
+
+The file format for `open_dataset()` is controlled by the `format` parameter,
+which has a default value of `"parquet"`. If you had a directory
+of Arrow format files, you could instead specify `format = "arrow"` in the call.
+
+Other supported formats include:
+
+* `"feather"` or `"ipc"` (aliases for `"arrow"`, as Feather v2 is the Arrow file format)
+* `"csv"` (comma-delimited files) and `"tsv"` (tab-delimited files)
+* `"text"` (generic text-delimited files - use the `delimiter` argument to specify which to use)
+
+For text files, you can pass the following parsing options to `open_dataset()`:
+
+* `delim`
+* `quote`
+* `escape_double`
+* `escape_backslash`
+* `skip_empty_rows`
+
+For more information on the usage of these parameters, see `?read_delim_arrow()`.
+
+The `partitioning` argument lets you specify how the file paths provide information
+about how the dataset is chunked into different files. The files in this example
+have file paths like
+
+```
+2009/01/data.parquet
+2009/02/data.parquet
+...
+```
+
+By providing `c("year", "month")` to the `partitioning` argument, you're saying that the first
+path segment gives the value for `year`, and the second segment is `month`.
+Every row in `2009/01/data.parquet` has a value of 2009 for `year`
+and 1 for `month`, even though those columns may not be present in the file.
+
+Indeed, when you look at the dataset, you can see that in addition to the columns present
+in every file, there are also columns `year` and `month` even though they are not present in the files themselves.
+
+```{r, eval = file.exists("nyc-taxi")}
+ds
+```
+```{r, echo = FALSE, eval = !file.exists("nyc-taxi")}
+cat("
+FileSystemDataset with 125 Parquet files
+vendor_id: string
+pickup_at: timestamp[us]
+dropoff_at: timestamp[us]
+passenger_count: int8
+trip_distance: float
+pickup_longitude: float
+pickup_latitude: float
+rate_code_id: null
+store_and_fwd_flag: string
+dropoff_longitude: float
+dropoff_latitude: float
+payment_type: string
+fare_amount: float
+extra: float
+mta_tax: float
+tip_amount: float
+tolls_amount: float
+total_amount: float
+year: int32
+month: int32
+
+See $metadata for additional Schema metadata
+")
+```
+
+The other form of partitioning currently supported is [Hive](https://hive.apache.org/)-style,
+in which the partition variable names are included in the path segments.
+If you had saved your files in paths like:
+
+```
+year=2009/month=01/data.parquet
+year=2009/month=02/data.parquet
+...
+```
+
+you would not have had to provide the names in `partitioning`;
+you could have just called `ds <- open_dataset("nyc-taxi")` and the partitions
+would have been detected automatically.
+
+## Querying the dataset
+
+Up to this point, you haven't loaded any data. You've walked directories to find
+files, you've parsed file paths to identify partitions, and you've read the
+headers of the Parquet files to inspect their schemas so that you can make sure
+they all are as expected.
+
+In the current release, arrow supports the dplyr verbs `mutate()`,
+`transmute()`, `select()`, `rename()`, `relocate()`, `filter()`, and
+`arrange()`. Aggregation is not yet supported, so before you call `summarise()`
+or other verbs with aggregate functions, use `collect()` to pull the selected
+subset of the data into an in-memory R data frame.
+
+If you attempt to call unsupported dplyr verbs or unimplemented functions in
+your query on an Arrow Dataset, the arrow package raises an error. However,
+for dplyr queries on Arrow Table objects (which are already in memory), the
+package automatically calls `collect()` before processing that dplyr verb.
+
+Here's an example: suppose that you are curious about tipping behavior among the
+longest taxi rides. Let's find the median tip percentage for rides with
+fares greater than $100 in 2015, broken down by the number of passengers:
+
+```{r, eval = file.exists("nyc-taxi")}
+system.time(ds %>%
+ filter(total_amount > 100, year == 2015) %>%
+ select(tip_amount, total_amount, passenger_count) %>%
+ mutate(tip_pct = 100 * tip_amount / total_amount) %>%
+ group_by(passenger_count) %>%
+ collect() %>%
+ summarise(
+ median_tip_pct = median(tip_pct),
+ n = n()
+ ) %>%
+ print())
+```
+
+```{r, echo = FALSE, eval = !file.exists("nyc-taxi")}
+cat("
+# A tibble: 10 x 3
+ passenger_count median_tip_pct n
+ <int> <dbl> <int>
+ 1 0 9.84 380
+ 2 1 16.7 143087
+ 3 2 16.6 34418
+ 4 3 14.4 8922
+ 5 4 11.4 4771
+ 6 5 16.7 5806
+ 7 6 16.7 3338
+ 8 7 16.7 11
+ 9 8 16.7 32
+10 9 16.7 42
+
+ user system elapsed
+ 4.436 1.012 1.402
+")
+```
+
+You've just selected a subset out of a dataset with around 2 billion rows, computed
+a new column, and aggregated it in under 2 seconds on a modern laptop. How does
+this work?
+
+First, `mutate()`/`transmute()`, `select()`/`rename()`/`relocate()`, `filter()`,
+`group_by()`, and `arrange()` record their actions but don't evaluate on the
+data until you run `collect()`.
+
+```{r, eval = file.exists("nyc-taxi")}
+ds %>%
+ filter(total_amount > 100, year == 2015) %>%
+ select(tip_amount, total_amount, passenger_count) %>%
+ mutate(tip_pct = 100 * tip_amount / total_amount) %>%
+ group_by(passenger_count)
+```
+
+```{r, echo = FALSE, eval = !file.exists("nyc-taxi")}
+cat("
+FileSystemDataset (query)
+tip_amount: float
+total_amount: float
+passenger_count: int8
+tip_pct: expr
+
+* Filter: ((total_amount > 100) and (year == 2015))
+* Grouped by passenger_count
+See $.data for the source Arrow object
+")
+```
+
+This code returns an output instantly and shows the manipulations you've made, without
+loading data from the files. Because the evaluation of these queries is deferred,
+you can build up a query that selects down to a small subset without generating
+intermediate datasets that would potentially be large.
+
+Second, all work is pushed down to the individual data files,
+and depending on the file format, chunks of data within the files. As a result,
+you can select a subset of data from a much larger dataset by collecting the
+smaller slices from each file—you don't have to load the whole dataset in
+memory to slice from it.
+
+Third, because of partitioning, you can ignore some files entirely.
+In this example, by filtering `year == 2015`, all files corresponding to other years
+are immediately excluded: you don't have to load them in order to find that no
+rows match the filter. Relatedly, since Parquet files contain row groups with
+statistics on the data within, there may be entire chunks of data you can
+avoid scanning because they have no rows where `total_amount > 100`.
+
+## More dataset options
+
+There are a few ways you can control the Dataset creation to adapt to special use cases.
+
+### Work with files in a directory
+
+If you are working with a single file or a set of files that are not all in the
+same directory, you can provide a file path or a vector of multiple file paths
+to `open_dataset()`. This is useful if, for example, you have a single CSV file
+that is too big to read into memory. You could pass the file path to
+`open_dataset()`, use `group_by()` to partition the Dataset into manageable chunks,
+then use `write_dataset()` to write each chunk to a separate Parquet file—all
+without needing to read the full CSV file into R.
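+
+A sketch of that workflow (the file paths and the `region` column are
+hypothetical):
+
+```r
+library(arrow)
+library(dplyr)
+
+open_dataset("huge-file.csv", format = "csv") %>%
+  group_by(region) %>%
+  write_dataset("some/dir", format = "parquet")
+```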
+
+### Explicitly declare column names and data types
+
+You can specify the `schema` argument to `open_dataset()` to declare the columns
+and their data types. This is useful if you have data files that have different
+storage schema (for example, a column could be `int32` in one and `int8` in
+another) and you want to ensure that the resulting Dataset has a specific type.
+
+To be clear, it's not necessary to specify a schema, even in this example of
+mixed integer types, because the Dataset constructor will reconcile differences
+like these. The schema specification just lets you declare what you want the
+result to be.
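+
+For example (assuming hypothetical columns `id` and `value`), you could
+declare the types up front:
+
+```r
+ds <- open_dataset(
+  "some/dir",
+  schema = schema(id = int32(), value = float64())
+)
+```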
+
+### Explicitly declare partition format
+
+Similarly, you can provide a Schema in the `partitioning` argument of `open_dataset()`
+in order to declare the types of the virtual columns that define the partitions.
+This would be useful, in the taxi dataset example, if you wanted to keep
+`month` as a string instead of an integer.
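+
+Continuing the taxi example, a sketch of that call might look like:
+
+```r
+ds <- open_dataset(
+  "nyc-taxi",
+  partitioning = schema(year = int32(), month = utf8())
+)
+```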
+
+### Work with multiple data sources
+
+Another feature of Datasets is that they can be composed of multiple data sources.
+That is, you may have a directory of partitioned Parquet files in one location,
+and in another directory, files that haven't been partitioned.
+Or, you could point to an S3 bucket of Parquet data and a directory
+of CSVs on the local file system and query them together as a single dataset.
+To create a multi-source dataset, provide a list of datasets to `open_dataset()`
+instead of a file path, or simply concatenate them like `big_dataset <- c(ds1, ds2)`.
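+
+Sketching both options (the paths are placeholders):
+
+```r
+ds1 <- open_dataset("some/parquet/dir")
+ds2 <- open_dataset("some/csv/dir", format = "csv")
+
+big_dataset <- open_dataset(list(ds1, ds2)) # or: c(ds1, ds2)
+```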
+
+## Writing datasets
+
+As you can see, querying a large dataset can be made quite fast by storage in an
+efficient binary columnar format like Parquet or Feather and partitioning based on
+columns commonly used for filtering. However, data isn't always stored that way.
+Sometimes you might start with one giant CSV. The first step in analyzing data
+is cleaning it up and reshaping it into a more usable form.
+
+The `write_dataset()` function allows you to take a Dataset or another tabular
+data object—an Arrow Table or RecordBatch, or an R data frame—and write
+it to a different file format, partitioned into multiple files.
+
+Assume that you have a version of the NYC Taxi data as CSV:
+
+```r
+ds <- open_dataset("nyc-taxi/csv/", format = "csv")
+```
+
+You can write it to a new location and translate the files to the Feather format
+by calling `write_dataset()` on it:
+
+```r
+write_dataset(ds, "nyc-taxi/feather", format = "feather")
+```
+
+Next, let's imagine that the `payment_type` column is something you often filter
+on, so you want to partition the data by that variable. By doing so you ensure
+that a filter like `payment_type == "Cash"` will touch only a subset of files
+where `payment_type` is always `"Cash"`.
+
+One natural way to express the columns you want to partition on is to use the
+`group_by()` method:
+
+```r
+ds %>%
+ group_by(payment_type) %>%
+ write_dataset("nyc-taxi/feather", format = "feather")
+```
+
+This will write files to a directory tree that looks like this:
+
+```r
+system("tree nyc-taxi/feather")
+```
+
+```
+## feather
+## ├── payment_type=1
+## │ └── part-18.feather
+## ├── payment_type=2
+## │ └── part-19.feather
+## ...
+## └── payment_type=UNK
+## └── part-17.feather
+##
+## 18 directories, 23 files
+```
+
+Note that the directory names are `payment_type=Cash` and similar:
+this is the Hive-style partitioning described above. This means that when
+you call `open_dataset()` on this directory, you don't have to declare what the
+partitions are because they can be read from the file paths.
+(To instead write bare values for partition segments, i.e. `Cash` rather than
+`payment_type=Cash`, call `write_dataset()` with `hive_style = FALSE`.)
+
+Perhaps, though, `payment_type == "Cash"` is the only data you ever care about,
+and you just want to drop the rest and have a smaller working set.
+For this, you can `filter()` them out when writing:
+
+```r
+ds %>%
+ filter(payment_type == "Cash") %>%
+ write_dataset("nyc-taxi/feather", format = "feather")
+```
+
+The other thing you can do when writing datasets is select a subset of columns
+or reorder them. Suppose you never care about `vendor_id`. As a string column,
+it can take up a lot of space when you read it in, so let's drop it:
+
+```r
+ds %>%
+ group_by(payment_type) %>%
+ select(-vendor_id) %>%
+ write_dataset("nyc-taxi/feather", format = "feather")
+```
+
+Note that while you can select a subset of columns,
+you cannot currently rename columns when writing a dataset.
diff --git a/src/arrow/r/vignettes/developing.Rmd b/src/arrow/r/vignettes/developing.Rmd
new file mode 100644
index 000000000..5cff5e560
--- /dev/null
+++ b/src/arrow/r/vignettes/developing.Rmd
@@ -0,0 +1,605 @@
+---
+title: "Arrow R Developer Guide"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Arrow R Developer Guide}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+```{r setup-options, include=FALSE}
+knitr::opts_chunk$set(error = TRUE, eval = FALSE)
+# Get environment variables describing what to evaluate
+run <- tolower(Sys.getenv("RUN_DEVDOCS", "false")) == "true"
+macos <- tolower(Sys.getenv("DEVDOCS_MACOS", "false")) == "true"
+ubuntu <- tolower(Sys.getenv("DEVDOCS_UBUNTU", "false")) == "true"
+sys_install <- tolower(Sys.getenv("DEVDOCS_SYSTEM_INSTALL", "false")) == "true"
+# Update the source knit_hook to save the chunk (if it is marked to be saved)
+knit_hooks_source <- knitr::knit_hooks$get("source")
+knitr::knit_hooks$set(source = function(x, options) {
+  # Extra paranoia about when this will write the chunks to the script: we will
+  # only save when:
+ # * CI is true
+ # * RUN_DEVDOCS is true
+ # * options$save is TRUE (and a check that not NULL won't crash it)
+ if (as.logical(Sys.getenv("CI", FALSE)) && run && !is.null(options$save) && options$save)
+ cat(x, file = "script.sh", append = TRUE, sep = "\n")
+ # but hide the blocks we want hidden:
+ if (!is.null(options$hide) && options$hide) {
+ return(NULL)
+ }
+ knit_hooks_source(x, options)
+})
+```
+
+```{bash, save=run, hide=TRUE}
+# Stop on failure, echo input as we go
+set -e
+set -x
+```
+
+If you're looking to contribute to arrow, this vignette can help you set up a development environment that will enable you to write code and run tests locally. It outlines:
+
+* how to build the components that make up the Arrow project and R package
+* workflows that developers use
+* some common troubleshooting steps and solutions
+
+This document is intended only for **developers** of Apache Arrow or the Arrow R package. R package users do not need to do any of this setup. If you're looking for how to install Arrow, see [the instructions in the readme](https://arrow.apache.org/docs/r/#installation).
+
+This document is a work in progress and will grow and change as the Apache Arrow project grows and changes. We have tried to make these steps as robust as possible (in fact, we even test exactly these instructions on our nightly CI to ensure they don't become stale!), but custom configurations might conflict with these instructions and there are differences of opinion across developers about how to set up development environments like this.
+
+We welcome any feedback you have about things that are confusing or additions you would like to see here - please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues) if you have any suggestions or requests.
+
+# Developer environment setup
+
+## R-only {.tabset}
+
+If you wish to contribute to the R package and
+don't need to alter libarrow (Arrow's C++ library), you may be able to obtain a
+recent version of the library without building it from source.
+
+### Linux
+
+On Linux, you can download a .zip file containing libarrow from the
+nightly repository.
+
+To see what nightlies are available, you can use arrow's (or any other S3 client's) S3 listing functionality to see what is in the bucket `s3://arrow-r-nightly/libarrow/bin`:
+
+```r
+nightly <- s3_bucket("arrow-r-nightly")
+nightly$ls("libarrow/bin")
+```
+Version numbers in that repository correspond to dates.
+
+You'll need to create a `libarrow` directory inside the R package directory and unzip the zip file containing the compiled libarrow binary files into it.
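+
+For example, a minimal sketch in R, run from the `r/` directory of your
+`arrow` checkout (the zip file name is hypothetical):
+
+```r
+dir.create("libarrow", showWarnings = FALSE)
+utils::unzip("arrow-nightly-libarrow.zip", exdir = "libarrow")
+```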
+
+### macOS
+On macOS, you can install libarrow using [Homebrew](https://brew.sh/):
+
+```bash
+# For the released version:
+brew install apache-arrow
+# Or for a development version, you can try:
+brew install apache-arrow --HEAD
+```
+
+### Windows
+
+On Windows, you can download a .zip file containing libarrow from the nightly repository.
+
+To see what nightlies are available, you can use arrow's (or any other S3 client's) S3 listing functionality to see what is in the bucket `s3://arrow-r-nightly/libarrow/bin`:
+
+```r
+nightly <- s3_bucket("arrow-r-nightly")
+nightly$ls("libarrow/bin")
+```
+Version numbers in that repository correspond to dates.
+
+You can set the `RWINLIB_LOCAL` environment variable to point to the zip file containing libarrow before installing the arrow R package.
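+
+For example, a sketch of doing this from R (the zip path and package location
+are hypothetical):
+
+```r
+# Point the build at the downloaded libarrow zip, then install from source
+Sys.setenv(RWINLIB_LOCAL = "C:/Users/you/Downloads/libarrow.zip")
+remotes::install_local("path/to/arrow/r")
+```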
+
+
+## R and C++
+
+If you need to alter both libarrow and the R package code, or if you can't get a binary version of the latest libarrow elsewhere, you'll need to build it from source. This section discusses how to set up a C++ libarrow build configured to work with the R package. For more general resources, see the [Arrow C++ developer guide](https://arrow.apache.org/docs/developers/cpp/building.html).
+
+There are five major steps to the process.
+
+### Step 1 - Install dependencies {.tabset}
+
+When building libarrow, by default, system dependencies will be used if suitable versions are found. If system dependencies are not present, libarrow will build them during its own build process. The only dependencies that you need to install _outside_ of the build process are [cmake](https://cmake.org/) (for configuring the build) and [openssl](https://www.openssl.org/) if you are building with S3 support.
+
+For a faster build, you may choose to pre-install more C++ library dependencies (such as [lz4](http://lz4.github.io/lz4/), [zstd](https://facebook.github.io/zstd/), etc.) on the system so that they don't need to be built from source in the libarrow build.
+
+#### Ubuntu
+```{bash, save=run & ubuntu}
+sudo apt install -y cmake libcurl4-openssl-dev libssl-dev
+```
+
+#### macOS
+```{bash, save=run & macos}
+brew install cmake openssl
+```
+
+#### Windows
+
+Currently, the R package cannot be made to work with a local libarrow build. This will be resolved in a future release.
+
+### Step 2 - Configure the libarrow build
+
+We recommend that you configure libarrow to be built to a user-level directory rather than a system directory for your development work. This is so that the development version you are using doesn't overwrite a released version of libarrow you may already have installed, and so that you are also able to work with more than one version of libarrow (by using different `ARROW_HOME` directories for the different versions).
+
+In the example below, libarrow is installed to a directory called `dist` that has the same parent directory as the `arrow` checkout. Your installation of the Arrow R package can point to any directory with any name, though we recommend *not* placing it inside of the `arrow` git checkout directory, since unwanted changes could stop it from working properly.
+
+```{bash, save=run & !sys_install}
+export ARROW_HOME=$(pwd)/dist
+mkdir $ARROW_HOME
+```
+
+_Special instructions on Linux:_ Before launching R and using arrow, you will need to set `LD_LIBRARY_PATH` to include the `lib` directory under where you set `$ARROW_HOME`. One way to do this is to add it to your profile (we use `~/.bash_profile` here, but you might need to put this in a different file depending on your setup, e.g. if you use a shell other than `bash`). On macOS you do not need to do this because the macOS shared library paths are hardcoded to their locations at build time.
+
+```{bash, save=run & ubuntu & !sys_install}
+export LD_LIBRARY_PATH=$ARROW_HOME/lib:$LD_LIBRARY_PATH
+echo "export LD_LIBRARY_PATH=$ARROW_HOME/lib:$LD_LIBRARY_PATH" >> ~/.bash_profile
+```
+
+Start by navigating in a terminal to the `arrow` repository. You will need to create a directory into which the C++ build will put its contents. We recommend that you make a `build` directory inside of the `cpp` directory of the Arrow git repository (it is git-ignored, so you won't accidentally check it in). Next, change directories to be inside `cpp/build`:
+
+```{bash, save=run & !sys_install}
+pushd arrow
+mkdir -p cpp/build
+pushd cpp/build
+```
+
+You'll first call `cmake` to configure the build and then `make install`. For the R package, you'll need to enable several features in libarrow using `-D` flags:
+
+```{bash, save=run & !sys_install}
+cmake \
+ -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
+ -DCMAKE_INSTALL_LIBDIR=lib \
+ -DARROW_COMPUTE=ON \
+ -DARROW_CSV=ON \
+ -DARROW_DATASET=ON \
+ -DARROW_EXTRA_ERROR_CONTEXT=ON \
+ -DARROW_FILESYSTEM=ON \
+ -DARROW_INSTALL_NAME_RPATH=OFF \
+ -DARROW_JEMALLOC=ON \
+ -DARROW_JSON=ON \
+ -DARROW_PARQUET=ON \
+ -DARROW_WITH_SNAPPY=ON \
+ -DARROW_WITH_ZLIB=ON \
+ ..
+```
+
+`..` refers to the C++ source directory: you're in `cpp/build` and the source is in `cpp`.
+
+#### Enabling more Arrow features
+
+To enable optional features, including S3 support, an alternative memory allocator, and additional compression libraries, add some or all of these flags to your call to `cmake` (the trailing `\` makes them easier to paste into a bash shell on a new line):
+
+```bash
+ -DARROW_MIMALLOC=ON \
+ -DARROW_S3=ON \
+ -DARROW_WITH_BROTLI=ON \
+ -DARROW_WITH_BZ2=ON \
+ -DARROW_WITH_LZ4=ON \
+ -DARROW_WITH_SNAPPY=ON \
+ -DARROW_WITH_ZSTD=ON \
+```
+
+Other flags that may be useful:
+
+* `-DBoost_SOURCE=BUNDLED` and `-DThrift_SOURCE=BUNDLED`, for example, or any other dependency `*_SOURCE`, if you have a system version of a C++ dependency that doesn't work correctly with Arrow. This tells the build to compile its own version of the dependency from source.
+
+* `-DCMAKE_BUILD_TYPE=debug` or `-DCMAKE_BUILD_TYPE=relwithdebinfo` can be useful for debugging. You probably don't want to do this generally because a debug build is much slower at runtime than the default `release` build.
+
+_Note_ that `cmake` is particularly sensitive to whitespace. If you see errors, check that you don't have any errant whitespace in your command.
+
+### Step 3 - Building libarrow
+
+Run `make install` to build and install libarrow. You can add `-j#` between `make` and `install` to speed up compilation by running it in parallel (where `#` is the number of cores you have available):
+
+```{bash, save=run & !(sys_install & ubuntu)}
+make -j8 install
+```
+
+### Step 4 - Build the Arrow R package
+
+Once you've built libarrow, you can install the R package and its
+dependencies, along with additional dev dependencies, from the git
+checkout:
+
+```{bash, save=run}
+popd # To go back to the root directory of the project, from cpp/build
+pushd r
+R -e 'install.packages("remotes"); remotes::install_deps(dependencies = TRUE)'
+R CMD INSTALL .
+```
+
+#### Compilation flags
+
+If you need to set any compilation flags while building the C++
+extensions, you can use the `ARROW_R_CXXFLAGS` environment variable. For
+example, if you are using `perf` to profile the R extensions, you may
+need to set
+
+```bash
+export ARROW_R_CXXFLAGS=-fno-omit-frame-pointer
+```
+
+#### Recompiling the C++ code
+
+With the setup described here, you should not need to rebuild the Arrow library or even the C++ source in the R package as you iterate and work on the R package. The only time those should need to be rebuilt is if you have changed the C++ in the R package (and even then, `R CMD INSTALL .` should only need to recompile the files that have changed) _or_ if the libarrow C++ has changed and there is a mismatch between libarrow and the R package. If you find yourself rebuilding either or both each time you install the package or run tests, something is probably wrong with your setup.
+
+<details>
+<summary>For a full build: a `cmake` command with all of the R-relevant optional dependencies turned on. Development with other languages might require different flags as well. For example, to develop Python, you would need to also add `-DARROW_PYTHON=ON` (though all of the other flags used for Python are already included here).</summary>
+<p>
+
+```bash
+cmake \
+ -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
+ -DCMAKE_INSTALL_LIBDIR=lib \
+ -DARROW_COMPUTE=ON \
+ -DARROW_CSV=ON \
+ -DARROW_DATASET=ON \
+ -DARROW_EXTRA_ERROR_CONTEXT=ON \
+ -DARROW_FILESYSTEM=ON \
+ -DARROW_INSTALL_NAME_RPATH=OFF \
+ -DARROW_JEMALLOC=ON \
+ -DARROW_JSON=ON \
+ -DARROW_MIMALLOC=ON \
+ -DARROW_PARQUET=ON \
+ -DARROW_S3=ON \
+ -DARROW_WITH_BROTLI=ON \
+ -DARROW_WITH_BZ2=ON \
+ -DARROW_WITH_LZ4=ON \
+ -DARROW_WITH_SNAPPY=ON \
+ -DARROW_WITH_ZLIB=ON \
+ -DARROW_WITH_ZSTD=ON \
+ ..
+```
+</p>
+</details>
+
+## Installing a version of the R package with a specific git reference
+
+If you need an arrow installation from a specific repository or git reference, on most platforms except Windows, you can run:
+
+```{r}
+remotes::install_github("apache/arrow/r", build = FALSE)
+```
+
+The `build = FALSE` argument is important so that the installation can access the
+C++ source in the `cpp/` directory in `apache/arrow`.
+
+As with other installation methods, setting the environment variables `LIBARROW_MINIMAL=false` and `ARROW_R_DEV=true` will provide a more full-featured version of Arrow and provide more verbose output, respectively.
+
+For example, to install from the (fictional) branch `bugfix` from `apache/arrow` you could run:
+
+```r
+Sys.setenv(LIBARROW_MINIMAL="false")
+remotes::install_github("apache/arrow/r@bugfix", build = FALSE)
+```
+
+Developers may wish to use this method of installing a specific commit
+separate from another Arrow development environment or system installation
+(e.g. we use this in [arrowbench](https://github.com/ursacomputing/arrowbench)
+to install development versions of libarrow isolated from the system install). If
+you already have libarrow installed system-wide, you may need to set
+some additional variables in order to isolate this build from your system libraries:
+
+* Setting the environment variable `FORCE_BUNDLED_BUILD` to `true` will skip the `pkg-config` search for libarrow and attempt to build from the same source at the repository+ref given.
+
+* You may also need to set the Makevars `CPPFLAGS` and `LDFLAGS` to `""` in order to prevent the installation process from attempting to link to already installed system versions of libarrow. One way to do this temporarily is wrapping your `remotes::install_github()` call like so:
+```{r}
+withr::with_makevars(list(CPPFLAGS = "", LDFLAGS = ""), remotes::install_github(...))
+```
+
+# Common developer workflow tasks
+
+The `arrow/r` directory contains a `Makefile` to help with some common tasks from the command line (e.g. `make test`, `make doc`, `make clean`, etc.).
+
+## Loading arrow
+
+You can load the R package via `devtools::load_all()`.
+
+## Rebuilding the documentation
+
+The R documentation uses the [`@examplesIf`](https://roxygen2.r-lib.org/articles/rd.html#functions) tag introduced in `roxygen2` version 7.1.1.9001, which hasn't yet been released on CRAN at the time of writing. If you are making changes which require updating the documentation, please install the development version of `roxygen2` from GitHub.
+
+```{r}
+remotes::install_github("r-lib/roxygen2")
+```
+
+You can use `devtools::document()` and `pkgdown::build_site()` to rebuild the documentation and preview the results.
+
+```r
+# Update roxygen documentation
+devtools::document()
+
+# To preview the documentation website
+pkgdown::build_site(preview=TRUE)
+```
+
+## Styling and linting
+
+### R code
+
+The R code in the package follows [the tidyverse style](https://style.tidyverse.org/). On PR submission (and on pushes) our CI will run linting and will flag possible errors on the pull request with annotations.
+
+To run [lintr](https://github.com/jimhester/lintr) locally, install the lintr package (note that we currently use a fork that includes fixes not yet accepted upstream; see how lintr is installed in `ci/docker/linux-apt-lint.dockerfile` for the current status) and then run
+
+```{r}
+lintr::lint_package("arrow/r")
+```
+
+You can automatically change the formatting of the code in the package using the [styler](https://styler.r-lib.org/) package. There are two ways to do this:
+
+1. Use the comment bot to do this automatically: comment `@github-actions autotune` on a PR, and it will style the code and commit the changes back to the branch.
+
+2. Run the styler locally either via Makefile commands:
+
+```bash
+make style # (for only the files changed)
+make style-all # (for all files)
+```
+
+or in R:
+
+```{r}
+# note the two excluded files which should not be styled
+styler::style_pkg(exclude_files = c("tests/testthat/latin1.R", "data-raw/codegen.R"))
+```
+
+The styler package will fix many styling errors, though not all lintr errors are automatically fixable with styler. The list of files we intentionally do not style is in `r/.styler_excludes.R`.
+
+### C++ code
+
+The arrow package uses some customized tools on top of [cpp11](https://cpp11.r-lib.org/) to prepare its
+C++ code in `src/`. This is because some features are conditionally enabled
+and built at build time. If you change C++ code in the R
+package, you will need to set the `ARROW_R_DEV` environment variable to `true`
+(optionally, add it to your `~/.Renviron` file to persist across sessions) so
+that the `data-raw/codegen.R` file is used for code generation. The `Makefile`
+commands also handle this automatically.
+
+We use Google C++ style in our C++ code. The easiest way to accomplish this is
+to use an editor/IDE that formats your code for you. Many popular editors/IDEs
+have support for running `clang-format` on C++ files when you save them.
+Installing/enabling the appropriate plugin may save you much frustration.
+
+Check for style errors with
+
+```bash
+./lint.sh
+```
+
+Fix any style issues before committing with
+
+```bash
+./lint.sh --fix
+```
+
+The lint script requires Python 3 and `clang-format-8`. If the command
+isn't found, you can explicitly provide the path to it like:
+
+```bash
+CLANG_FORMAT=$(which clang-format-8) ./lint.sh
+```
+
+On macOS, you can get this by installing LLVM via Homebrew and running the script as:
+```bash
+CLANG_FORMAT=$(brew --prefix llvm@8)/bin/clang-format ./lint.sh
+```
+
+_Note_ that the lint script requires Python 3 and the following Python dependencies
+(`cmake_format` is pinned to a specific version):
+
+* autopep8
+* flake8
+* cmake_format==0.5.2
+
+## Running tests
+
+Tests can be run either using `devtools::test()` or the Makefile alternative.
+
+```r
+# Run the test suite, optionally filtering file names
+devtools::test(filter="^regexp$")
+
+# or the Makefile alternative from the arrow/r directory in a shell:
+make test file=regexp
+```
+
+Some tests are conditionally enabled based on the availability of certain
+features in the package build (S3 support, compression libraries, etc.).
+Others are generally skipped by default but can be enabled with environment
+variables or other settings (a short sketch follows this list):
+
+* All tests are skipped on Linux if the package builds without the C++ libarrow.
+ To make the build fail if libarrow is not available (as in, to test that
+ the C++ build was successful), set `TEST_R_WITH_ARROW=true`
+
+* Some tests are disabled unless `ARROW_R_DEV=true`
+
+* Tests that require allocating >2GB of memory to test Large types are disabled
+ unless `ARROW_LARGE_MEMORY_TESTS=true`
+
+* Integration tests against a real S3 bucket are disabled unless credentials
+ are set in `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`; these are available
+ on request
+
+* S3 tests using [MinIO](https://min.io/) locally are enabled if the
+ `minio server` process is found running. If you're running MinIO with custom
+ settings, you can set `MINIO_ACCESS_KEY`, `MINIO_SECRET_KEY`, and
+ `MINIO_PORT` to override the defaults.
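+
+For example, a sketch of opting in to a couple of these for the current
+session (which settings you enable depends on what you're testing):
+
+```r
+# Enable dev-only and large-memory tests for this session
+Sys.setenv(
+  ARROW_R_DEV = "true",
+  ARROW_LARGE_MEMORY_TESTS = "true"
+)
+devtools::test()
+```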
+
+## Running checks
+
+You can run package checks by using `devtools::check()` and check test coverage
+with `covr::package_coverage()`.
+
+```r
+# All package checks
+devtools::check()
+
+# See test coverage statistics
+covr::report()
+covr::package_coverage()
+```
+
+For full package validation, you can run the following commands from a terminal.
+
+```bash
+R CMD build .
+R CMD check arrow_*.tar.gz --as-cran
+```
+
+
+## Running additional CI checks
+
+On a pull request, there are some actions you can trigger by commenting on the
+PR. We have additional CI checks that run nightly and can be requested on demand
+using an internal tool called
+[crossbow](https://arrow.apache.org/docs/developers/crossbow.html).
+A few important GitHub comment commands are shown below.
+
+#### Run all extended R CI tasks
+```
+@github-actions crossbow submit -g r
+```
+
+This runs each of the R-related CI tasks.
+
+#### Run a specific task
+```
+@github-actions crossbow submit {task-name}
+```
+
+See the `r:` group definition near the beginning of the [crossbow configuration](https://github.com/apache/arrow/blob/master/dev/tasks/tasks.yml)
+for a list of glob expression patterns that match names of items in the `tasks:`
+list below it.
+
+#### Run linting and documentation building tasks
+
+```
+@github-actions autotune
+```
+
+This will fix C++ linting errors, regenerate R documentation (among other
+cleanup tasks), run styler on any changed R code, and commit the resulting
+updates to the branch.
+
+# Summary of environment variables
+
+* See the user-facing [Install vignette](install.html) for a large number of
+ environment variables that determine how the build works and what features
+ get built.
+* `TEST_OFFLINE_BUILD`: When set to `true`, the build script will not download
+  the prebuilt C++ library binary.
+ It will turn off any features that require a download, unless they're available
+ in the `tools/cpp/thirdparty/download/` subfolder of the tar.gz file.
+ `create_package_with_all_dependencies()` creates that subfolder.
+ Regardless of this flag's value, `cmake` will be downloaded if it's unavailable.
+* `TEST_R_WITHOUT_LIBARROW`: When set to `true`, skip tests that would require
+ the C++ Arrow library (that is, almost everything).
+
+# Troubleshooting
+
+Note that after any change to libarrow, you must reinstall it and
+run `make clean` or `git clean -fdx .` to remove any cached object code
+in the `r/src/` directory before reinstalling the R package. This is
+only necessary if you make changes to libarrow source; you do not
+need to manually purge object files if you are only editing R or C++
+code inside `r/`.
+
+## Arrow library - R package mismatches
+
+If libarrow and the R package have diverged, you will see errors like:
+
+```
+Error: package or namespace load failed for ‘arrow' in dyn.load(file, DLLpath = DLLpath, ...):
+ unable to load shared object '/Library/Frameworks/R.framework/Versions/4.0/Resources/library/00LOCK-r/00new/arrow/libs/arrow.so':
+ dlopen(/Library/Frameworks/R.framework/Versions/4.0/Resources/library/00LOCK-r/00new/arrow/libs/arrow.so, 6): Symbol not found: __ZN5arrow2io16RandomAccessFile9ReadAsyncERKNS0_9IOContextExx
+ Referenced from: /Library/Frameworks/R.framework/Versions/4.0/Resources/library/00LOCK-r/00new/arrow/libs/arrow.so
+ Expected in: flat namespace
+ in /Library/Frameworks/R.framework/Versions/4.0/Resources/library/00LOCK-r/00new/arrow/libs/arrow.so
+Error: loading failed
+Execution halted
+ERROR: loading failed
+```
+
+To resolve this, try [rebuilding the Arrow library](#step-3-building-arrow).
+
+## Multiple versions of libarrow
+
+If you are installing from a user-level directory, and you already have a
+previous installation of libarrow in a system directory, you may get
+errors like the following when you install the R package:
+
+```
+Error: package or namespace load failed for ‘arrow' in dyn.load(file, DLLpath = DLLpath, ...):
+ unable to load shared object '/Library/Frameworks/R.framework/Versions/4.0/Resources/library/00LOCK-r/00new/arrow/libs/arrow.so':
+ dlopen(/Library/Frameworks/R.framework/Versions/4.0/Resources/library/00LOCK-r/00new/arrow/libs/arrow.so, 6): Library not loaded: /usr/local/lib/libarrow.400.dylib
+ Referenced from: /usr/local/lib/libparquet.400.dylib
+ Reason: image not found
+```
+
+If this happens, you need to make sure that you don't let R link to your system
+library when building arrow. You can do this in a number of different ways:
+
+* Setting the `MAKEFLAGS` environment variable to `"LDFLAGS="` (see below for an example); this is the recommended way to accomplish this
+* Using {withr}'s `with_makevars(list(LDFLAGS = ""), ...)`
+* Adding `LDFLAGS=` to your `~/.R/Makevars` file (the least recommended way, though it is a common debugging approach suggested online)
+
+```{bash, save=run & !sys_install & macos, hide=TRUE}
+# Setup troubleshooting section
+# install a system-level arrow on macOS
+brew install apache-arrow
+```
+
+
+```{bash, save=run & !sys_install & ubuntu, hide=TRUE}
+# Setup troubleshooting section
+# install a system-level arrow on Ubuntu
+sudo apt update
+sudo apt install -y -V ca-certificates lsb-release wget
+wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
+sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
+sudo apt update
+sudo apt install -y -V libarrow-dev
+```
+
+```{bash, save=run & !sys_install & macos}
+MAKEFLAGS="LDFLAGS=" R CMD INSTALL .
+```
+
+
+## `rpath` issues
+
+If the package fails to install/load with an error like this:
+
+```
+ ** testing if installed package can be loaded from temporary location
+ Error: package or namespace load failed for 'arrow' in dyn.load(file, DLLpath = DLLpath, ...):
+ unable to load shared object '/Users/you/R/00LOCK-r/00new/arrow/libs/arrow.so':
+ dlopen(/Users/you/R/00LOCK-r/00new/arrow/libs/arrow.so, 6): Library not loaded: @rpath/libarrow.14.dylib
+```
+
+ensure that `-DARROW_INSTALL_NAME_RPATH=OFF` was passed (this is important on
+macOS to prevent problems at link time and is a no-op on other platforms).
+Alternatively, try setting the environment variable `R_LD_LIBRARY_PATH` to
+wherever Arrow C++ was put in `make install`, e.g. `export
+R_LD_LIBRARY_PATH=/usr/local/lib`, and retry installing the R package.
+
+When installing from source, if the R and C++ library versions do not
+match, installation may fail. If you've previously installed the
+libraries and want to upgrade the R package, you'll need to update the
+Arrow C++ library first.
+
+For any other build/configuration challenges, see the [C++ developer
+guide](https://arrow.apache.org/docs/developers/cpp/building.html).
+
+## Other installation issues
+
+There are a number of scripts that are triggered when the arrow R package is installed. For package users who are not interacting with the underlying code, these should all just work without configuration and pull in the most complete pieces (e.g. official binaries that we host). However, knowing about these scripts can help package developers troubleshoot if things go wrong in them or things go wrong in an install. See [the installation vignette](./install.html#how-dependencies-are-resolved) for more information.
diff --git a/src/arrow/r/vignettes/flight.Rmd b/src/arrow/r/vignettes/flight.Rmd
new file mode 100644
index 000000000..e8af5cad6
--- /dev/null
+++ b/src/arrow/r/vignettes/flight.Rmd
@@ -0,0 +1,87 @@
+---
+title: "Connecting to Flight RPC Servers"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Connecting to Flight RPC Servers}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+[**Flight**](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/)
+is a general-purpose client-server framework for high performance
+transport of large datasets over network interfaces, built as part of the
+[Apache Arrow](https://arrow.apache.org) project.
+
+Flight allows for highly efficient data transfer as it:
+
+* removes the need for deserialization during data transfer
+* allows for parallel data streaming
+* is highly optimized to take advantage of Arrow's columnar format.
+
+The arrow package provides methods for connecting to Flight RPC servers
+to send and receive data.
+
+## Getting Started
+
+The `flight` functions in the package use [reticulate](https://rstudio.github.io/reticulate/) to call methods in the
+[pyarrow](https://arrow.apache.org/docs/python/api/flight.html) Python package.
+
+Before using them for the first time,
+you'll need to be sure you have reticulate and pyarrow installed:
+
+```r
+install.packages("reticulate")
+arrow::install_pyarrow()
+```
+
+See `vignette("python", package = "arrow")` for more details on setting up
+`pyarrow`.
+
+## Example
+
+The package includes methods for starting a Python-based Flight server, as well
+as methods for connecting to a Flight server running elsewhere.
+
+To illustrate both sides, in one process let's start a demo server:
+
+```r
+library(arrow)
+demo_server <- load_flight_server("demo_flight_server")
+server <- demo_server$DemoFlightServer(port = 8089)
+server$serve()
+```
+
+We'll leave that one running.
+
+In a different R process, let's connect to it and put some data in it.
+
+```r
+library(arrow)
+client <- flight_connect(port = 8089)
+# Upload some data to our server so there's something to demo
+flight_put(client, iris, path = "test_data/iris")
+```
+
+Now, in a new R process, let's connect to the server and pull the data we
+put there:
+
+```r
+library(arrow)
+library(dplyr)
+client <- flight_connect(port = 8089)
+client %>%
+ flight_get("test_data/iris") %>%
+ group_by(Species) %>%
+ summarize(max_petal = max(Petal.Length))
+
+## # A tibble: 3 x 2
+## Species max_petal
+## <fct> <dbl>
+## 1 setosa 1.9
+## 2 versicolor 5.1
+## 3 virginica 6.9
+```
+
+Because `flight_get()` returns an Arrow data structure, you can directly pipe
+its result into a [dplyr](https://dplyr.tidyverse.org/) workflow.
+See `vignette("dataset", package = "arrow")` for more information on working with Arrow objects via a dplyr interface.
diff --git a/src/arrow/r/vignettes/fs.Rmd b/src/arrow/r/vignettes/fs.Rmd
new file mode 100644
index 000000000..5d699c49d
--- /dev/null
+++ b/src/arrow/r/vignettes/fs.Rmd
@@ -0,0 +1,130 @@
+---
+title: "Working with Cloud Storage (S3)"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Working with Cloud Storage (S3)}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+The Arrow C++ library includes a generic filesystem interface and specific
+implementations for some cloud storage systems. This setup allows various
+parts of the project to read and write data with different storage
+backends. In the `arrow` R package, support has been enabled for AWS S3.
+This vignette provides an overview of working with S3 data using Arrow.
+
+> In Windows and macOS binary packages, S3 support is included. On Linux when installing from source, S3 support is not enabled by default, and it has additional system requirements. See `vignette("install", package = "arrow")` for details.
+
+## URIs
+
+File readers and writers (`read_parquet()`, `write_feather()`, et al.)
+accept an S3 URI as the source or destination file,
+as do `open_dataset()` and `write_dataset()`.
+An S3 URI looks like:
+
+```
+s3://[access_key:secret_key@]bucket/path[?region=]
+```
+
+For example, one of the NYC taxi data files used in `vignette("dataset", package = "arrow")` is found at
+
+```
+s3://ursa-labs-taxi-data/2019/06/data.parquet
+```
+
+Given this URI, we can pass it to `read_parquet()` just as if it were a local file path:
+
+```r
+df <- read_parquet("s3://ursa-labs-taxi-data/2019/06/data.parquet")
+```
+
+Note that this will be slower to read than if the file were local,
+though if you're running on a machine in the same AWS region as the file in S3,
+the cost of reading the data over the network should be much lower.
+
+## Creating a FileSystem object
+
+Another way to connect to S3 is to create a `FileSystem` object once and pass
+that to the read/write functions.
+`S3FileSystem` objects can be created with the `s3_bucket()` function, which
+automatically detects the bucket's AWS region. Additionally, the resulting
+`FileSystem` will consider paths relative to the bucket's path (so for example
+you don't need to prefix the bucket path when listing a directory).
+This may be convenient when dealing with
+long URIs, and it's necessary for some options and authentication methods
+that aren't supported in the URI format.
+
+With a `FileSystem` object, we can point to specific files in it with the `$path()` method.
+In the previous example, this would look like:
+
+```r
+bucket <- s3_bucket("ursa-labs-taxi-data")
+df <- read_parquet(bucket$path("2019/06/data.parquet"))
+```
+
+See the help for `FileSystem` for a list of options that `s3_bucket()` and `S3FileSystem$create()`
+can take. `region`, `scheme`, and `endpoint_override` can be encoded as query
+parameters in the URI (though `region` will be auto-detected in `s3_bucket()` or from the URI if omitted).
+`access_key` and `secret_key` can also be included,
+but other options are not supported in the URI.
+
+The object that `s3_bucket()` returns is technically a `SubTreeFileSystem`, which holds a path and a file system to which it corresponds. `SubTreeFileSystem`s can be useful for holding a reference to a subdirectory somewhere, on S3 or elsewhere.
+
+One way to get a subtree is to call the `$cd()` method on a `FileSystem`:
+
+```r
+june2019 <- bucket$cd("2019/06")
+df <- read_parquet(june2019$path("data.parquet"))
+```
+
+`SubTreeFileSystem` can also be made from a URI:
+
+```r
+june2019 <- SubTreeFileSystem$create("s3://ursa-labs-taxi-data/2019/06")
+```
+
+## Authentication
+
+To access private S3 buckets, you typically need two secret parameters:
+an `access_key`, which is like a user id,
+and a `secret_key`, which is like a token.
+There are a few options for passing these credentials (the second is sketched after this list):
+
+1. Include them in the URI, like `s3://access_key:secret_key@bucket-name/path/to/file`. Be sure to [URL-encode](https://en.wikipedia.org/wiki/Percent-encoding) your secrets if they contain special characters like "/".
+
+2. Pass them as `access_key` and `secret_key` to `S3FileSystem$create()` or `s3_bucket()`
+
+3. Set them as environment variables named `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`, respectively.
+
+4. Define them in a `~/.aws/credentials` file, according to the [AWS documentation](https://docs.aws.amazon.com/sdk-for-cpp/v1/developer-guide/credentials.html).
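+
+For example, a sketch of the second option with a hypothetical bucket name
+(here the keys are read from the environment rather than hard-coded):
+
+```r
+bucket <- s3_bucket(
+  "my-private-bucket",
+  access_key = Sys.getenv("AWS_ACCESS_KEY_ID"),
+  secret_key = Sys.getenv("AWS_SECRET_ACCESS_KEY")
+)
+```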
+
+You can also use an [AccessRole](https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRole.html)
+for temporary access by passing the `role_arn` identifier to `S3FileSystem$create()` or `s3_bucket()`.
+
+## File systems that emulate S3
+
+The `S3FileSystem` machinery enables you to work with any file system that
+provides an S3-compatible interface. For example, [MinIO](https://min.io/) is
+an object-storage server that emulates the S3 API. If you were to
+run `minio server` locally with its default settings, you could connect to
+it with `arrow` using `S3FileSystem` like this:
+
+```r
+minio <- S3FileSystem$create(
+ access_key = "minioadmin",
+ secret_key = "minioadmin",
+ scheme = "http",
+ endpoint_override = "localhost:9000"
+)
+```
+
+or, as a URI, it would be
+
+```
+s3://minioadmin:minioadmin@?scheme=http&endpoint_override=localhost%3A9000
+```
+
+(note the URL escaping of the `:` in `endpoint_override`).
+
+Among other applications, this can be useful for testing out code locally before
+running on a remote S3 bucket.
diff --git a/src/arrow/r/vignettes/install.Rmd b/src/arrow/r/vignettes/install.Rmd
new file mode 100644
index 000000000..5bd76a371
--- /dev/null
+++ b/src/arrow/r/vignettes/install.Rmd
@@ -0,0 +1,448 @@
+---
+title: "Installing the Arrow Package on Linux"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Installing the Arrow Package on Linux}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+On macOS and Windows, when you `install.packages("arrow")`,
+you get a binary package that contains Arrow’s C++ dependencies along with it.
+On Linux, `install.packages()` retrieves a source package that has to be compiled locally,
+and C++ dependencies need to be resolved as well.
+Generally for R packages with C++ dependencies,
+this requires either installing system packages, which you may not have privileges to do,
+or building the C++ dependencies separately,
+which introduces all sorts of additional ways for things to go wrong.
+
+Our goal is to make `install.packages("arrow")` "just work" for as many Linux distributions,
+versions, and configurations as possible.
+This document describes how it works and the options for fine-tuning Linux installation.
+The intended audience for this document is `arrow` R package users on Linux, not developers.
+If you're contributing to the Arrow project, see `vignette("developing", package = "arrow")` for guidance on setting up your development environment.
+
+Note also that if you use `conda` to manage your R environment, this document does not apply.
+You can `conda install -c conda-forge --strict-channel-priority r-arrow` and you'll get the latest official
+release of the R package along with any C++ dependencies.
+
+> Having trouble installing `arrow`? See the "Troubleshooting" section below.
+
+# Installation basics
+
+Install the latest release of `arrow` from CRAN with
+
+```r
+install.packages("arrow")
+```
+
+Daily development builds, which are not official releases,
+can be installed from the Ursa Labs repository:
+
+```r
+install.packages("arrow", repos = "https://arrow-r-nightly.s3.amazonaws.com")
+```
+
+or for conda users via:
+
+```
+conda install -c arrow-nightlies -c conda-forge --strict-channel-priority r-arrow
+```
+
+You can also install the R package from a git checkout:
+
+```shell
+git clone https://github.com/apache/arrow
+cd arrow/r
+R CMD INSTALL .
+```
+
+If you don't already have the Arrow C++ libraries on your system,
+installing the R package from source will also download and build
+the Arrow C++ libraries for you. To speed up installation, you can set
+
+```shell
+export LIBARROW_BINARY=true
+```
+
+to look for C++ binaries prebuilt for your Linux distribution/version.
+Alternatively, you can set
+
+```shell
+export LIBARROW_MINIMAL=false
+```
+
+to build the Arrow libraries from source with optional features such as compression libraries
+enabled. This will increase the build time but provides many useful features.
+Prebuilt binaries are built with this flag enabled, so you get the full
+functionality by using them as well.
+
+Both of these variables are also set this way if you have the `NOT_CRAN=true`
+environment variable set.
+
+## Helper function: install_arrow()
+
+If you already have `arrow` installed and want to upgrade to a different version,
+install a development build, or try to reinstall and fix issues with Linux
+C++ binaries, you can call `install_arrow()`.
+`install_arrow()` provides some convenience wrappers around the various
+environment variables described below.
+This function is part of the `arrow` package,
+and it is also available as a standalone script, so you can
+access it for convenience without first installing the package:
+
+```r
+source("https://raw.githubusercontent.com/apache/arrow/master/r/R/install-arrow.R")
+```
+
+`install_arrow()` will install from CRAN,
+while `install_arrow(nightly = TRUE)` will give you a development build.
+`install_arrow()` does not require environment variables to be set in order to
+satisfy C++ dependencies.
+
+> Note that, unlike packages like `tensorflow`, `blogdown`, and others that require external dependencies, you do not need to run `install_arrow()` after a successful `arrow` installation.
+
+## Offline installation
+
+The `install-arrow.R` file also includes the `create_package_with_all_dependencies()`
+function. Normally, when installing on a computer with internet access, the
+build process will download third-party dependencies as needed.
+This function provides a way to download them in advance.
+Doing so may be useful when installing Arrow on a computer without internet access.
+Note that Arrow _can_ be installed on a computer without internet access without doing this, but
+many useful features will be disabled, as they depend on third-party components.
+More precisely, `arrow::arrow_info()$capabilities()` will be `FALSE` for every
+capability.
+One approach to add more capabilities in an offline install is to prepare a
+package with pre-downloaded dependencies. The
+`create_package_with_all_dependencies()` function does this preparation.
+
+If you're using binary packages you shouldn't need to follow these steps. You
+should download the appropriate binary from your package repository, transfer
+that to the offline computer, and install that. Any OS can create the source
+bundle, but it cannot be installed on Windows. (Instead, use a standard
+Windows binary package.)
+
+Note if you're using RStudio Package Manager on Linux: If you still want to
+make a source bundle with this function, make sure to set the first repo in
+`options("repos")` to be a mirror that contains source packages (that is:
+something other than the RSPM binary mirror URLs).
+
+### Using a computer with internet access, pre-download the dependencies:
+* Install the `arrow` package _or_ run
+ `source("https://raw.githubusercontent.com/apache/arrow/master/r/R/install-arrow.R")`
+* Run `create_package_with_all_dependencies("my_arrow_pkg.tar.gz")`
+* Copy the newly created `my_arrow_pkg.tar.gz` to the computer without internet access
+
+### On the computer without internet access, install the prepared package:
+* Install the `arrow` package from the copied file
+ * `install.packages("my_arrow_pkg.tar.gz", dependencies = c("Depends", "Imports", "LinkingTo"))`
+ * This installation will build from source, so `cmake` must be available
+* Run `arrow_info()` to check installed capabilities
+
+#### Alternative, hands-on approach
+* Download the dependency files (`cpp/thirdparty/download_dependencies.sh` may be helpful)
+* Copy the directory of dependencies to the offline computer
+* Create the environment variable `ARROW_THIRDPARTY_DEPENDENCY_DIR` on the offline computer, pointing to the copied directory.
+* Install the `arrow` package as usual.
+
+## S3 support
+
+The `arrow` package allows you to work with data in AWS S3 or in other cloud
+storage systems that emulate S3. However, support for working with S3 is not
+enabled in the default build, and it has additional system requirements. To
+enable it, set the environment variable `LIBARROW_MINIMAL=false` or
+`NOT_CRAN=true` to choose the full-featured build, or more selectively set
+`ARROW_S3=ON`. You also need the following system dependencies:
+
+* `gcc` >= 4.9 or `clang` >= 3.3; note that the default compiler on CentOS 7 is gcc 4.8.5, which is not sufficient
+* CURL: install `libcurl-devel` (rpm) or `libcurl4-openssl-dev` (deb)
+* OpenSSL >= 1.0.2: install `openssl-devel` (rpm) or `libssl-dev` (deb)
+
+The prebuilt C++ binaries come with S3 support enabled, so you will need to meet
+these system requirements in order to use them--the package will not install
+without them. If you're building everything from source, the install script
+will check for the presence of these dependencies and turn off S3 support in the
+build if the prerequisites are not met--installation will succeed but without
+S3 functionality. If afterwards you install the missing system requirements,
+you'll need to reinstall the package in order to enable S3 support.
+
+# How dependencies are resolved
+
+In order for the `arrow` R package to work, it needs the Arrow C++ library.
+There are a number of ways you can get it: a system package; a library you've
+built yourself outside of the context of installing the R package;
+or, if you don't already have it, the R package will attempt to resolve it
+automatically when it installs.
+
+If you are authorized to install system packages and you're installing a CRAN release,
+you may want to use the official Apache Arrow release packages corresponding to the R package version (though there are some drawbacks: see "Troubleshooting" below).
+See the [Arrow project installation page](https://arrow.apache.org/install/)
+to find pre-compiled binary packages for some common Linux distributions,
+including Debian, Ubuntu, and CentOS.
+You'll need to install `libparquet-dev` on Debian and Ubuntu, or `parquet-devel` on CentOS.
+This will also automatically install the Arrow C++ library as a dependency.
+
+When you install the `arrow` R package on Linux,
+it will first attempt to find the Arrow C++ libraries on your system using
+the `pkg-config` command.
+This will find either installed system packages or libraries you've built yourself.
+In order for `install.packages("arrow")` to work with these system packages,
+you'll need to install them before installing the R package.
+
+If no Arrow C++ libraries are found on the system,
+the R package installation script will next attempt to download
+prebuilt static Arrow C++ libraries
+that match both your local operating system and `arrow` R package version.
+C++ binaries will only be retrieved if you have set the environment variable
+`LIBARROW_BINARY` or `NOT_CRAN`.
+If found, they will be downloaded and bundled when your R package compiles.
+For a list of supported distributions and versions,
+see the [arrow-r-nightly](https://github.com/ursa-labs/arrow-r-nightly/blob/master/README.md) project.
+
+If no C++ library binary is found, the script will attempt to build the library locally.
+First, it will look to see whether you are in
+a checkout of the `apache/arrow` git repository and thus have the C++ source there.
+Otherwise, it builds from the C++ files included in the package.
+Depending on your system, building Arrow C++ from source may be slow.
+
+For the specific mechanics of how all this works, see the R package `configure` script,
+which calls `tools/nixlibs.R`.
+
+If the C++ library is built from source, `inst/build_arrow_static.sh` is executed.
+This build script is also what is used to generate the prebuilt binaries.
+
+## How the package is installed - advanced
+
+This subsection contains information that is mostly relevant to Arrow
+developers; it is not necessary reading for users installing Arrow.
+
+There are a number of scripts that are triggered when `R CMD INSTALL .` is run.
+For Arrow users, these should all just work without configuration and pull in
+the most complete pieces (e.g. official binaries that we host).
+
+An overview of these scripts is shown below:
+
+* `configure` and `configure.win` - these scripts are triggered during
+`R CMD INSTALL .` on non-Windows and Windows platforms, respectively. They
+handle finding the Arrow library, setting up the build variables necessary, and
+writing the package Makevars file that is used to compile the C++ code in the R
+package.
+
+* `tools/nixlibs.R` - this script is sometimes called by `configure` on Linux
+(or on any non-Windows OS with the environment variable
+`FORCE_BUNDLED_BUILD=true`). It sets up the build process for our bundled
+builds (which is the default on Linux). The operative logic is at the end of
+the script; it works through the following steps, stopping at the first one
+that succeeds (some steps are only attempted if they are enabled via an
+environment variable):
+ * Check if there is an already built libarrow in `arrow/r/libarrow-{version}`,
+ use that to link against if it exists.
+ * Check if a binary is available from our hosted unofficial builds.
+ * Download the Arrow source and build the Arrow Library from source.
+ * `*** Proceed without C++` dependencies (this is an error and the package
+ will not work, but if you see this message you know the previous steps have
+ not succeeded/were not enabled)
+
+* `inst/build_arrow_static.sh` - called by `tools/nixlibs.R` when the Arrow
+library is being built. It builds Arrow for a bundled, static build, and
+mirrors the steps described in the ["Arrow R Developer Guide" vignette](./developing.html)
+
+# Troubleshooting
+
+The intent is that `install.packages("arrow")` will just work and handle all C++
+dependencies, but depending on your system, you may have better results if you
+tune one of several parameters. Here are some known complications and ways to address them.
+
+## Package failed to build C++ dependencies
+
+If you see a message like
+
+```
+------------------------- NOTE ---------------------------
+There was an issue preparing the Arrow C++ libraries.
+See https://arrow.apache.org/docs/r/articles/install.html
+---------------------------------------------------------
+```
+
+in the output when the package fails to install,
+that means that installation failed to retrieve or build C++ libraries
+compatible with the current version of the R package.
+
+It is expected that C++ dependencies should be built successfully
+on all Linux distributions, so you should not see this message. If you do,
+please check the "Known installation issues" below to see if any apply.
+If none apply, set the environment variable `ARROW_R_DEV=TRUE`
+so that details on what failed are shown, and try installing again. Then,
+please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues)
+and include the full verbose installation output.
+
+## Using system libraries
+
+If a system library or other installed Arrow is found but it doesn't match the R package version
+(for example, you have libarrow 1.0.0 on your system and are installing R package 2.0.0),
+it is likely that the R bindings will fail to compile.
+Because the Apache Arrow project is under active development,
+it is essential that versions of the C++ and R libraries match.
+When `install.packages("arrow")` has to download the C++ libraries,
+the install script ensures that you fetch the C++ libraries that correspond to your R package version.
+However, if you are using Arrow libraries already on your system, version match isn't guaranteed.
+
+To fix version mismatch, you can either update your system packages to match the R package version,
+or set the environment variable `ARROW_USE_PKG_CONFIG=FALSE`
+to tell the configure script not to look for system Arrow packages.
+(The latter is the default of `install_arrow()`.)
+System packages are available corresponding to all CRAN releases
+but not for nightly or dev versions, so depending on the R package version you're installing,
+system packages may not be an option.
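+
+For example, a minimal sketch of the second approach:
+
+```r
+Sys.setenv(ARROW_USE_PKG_CONFIG = "FALSE")
+install.packages("arrow")
+```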
+
+Note also that once you have a working R package installation based on system (shared) libraries,
+if you update your system Arrow, you'll need to reinstall the R package to match its version.
+Similarly, if you're using Arrow system libraries, running `update.packages()`
+after a new release of the `arrow` package will likely fail unless you first
+update the system packages.
+
+## Using prebuilt binaries
+
+If the R package finds and downloads a prebuilt binary of the C++ library,
+but then the `arrow` package can't be loaded, perhaps with "undefined symbols" errors,
+please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues).
+This is likely a compiler mismatch and may be resolvable by setting some
+environment variables to instruct R to compile the packages to match the C++ library.
+
+A workaround would be to set the environment variable `LIBARROW_BINARY=FALSE`
+and retry installation: this value instructs the package to build the C++ library from source
+instead of downloading the prebuilt binary.
+That should guarantee that the compiler settings match.
+
+If a prebuilt binary wasn't found for your operating system but you think it should have been,
+check the logs for a message that says `*** Unable to identify current OS/version`,
+or a message that says `*** No C++ binaries found for` an invalid OS.
+If you see either, please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues).
+You may also set the environment variable `ARROW_R_DEV=TRUE` for additional
+debug messages.
+
+A workaround would be to set the environment variable `LIBARROW_BINARY`
+to a `distribution-version` that exists in the Ursa Labs repository.
+Setting `LIBARROW_BINARY` is also an option when there's not an exact match
+for your OS but a similar version would work,
+such as if you're on `ubuntu-18.10` and there's only a binary for `ubuntu-18.04`.
+
+If that workaround works for you, and you believe that it should work for everyone else too,
+you may propose [adding an entry to this lookup table](https://github.com/ursa-labs/arrow-r-nightly/edit/master/linux/distro-map.csv).
+This table is checked during the installation process
+and tells the script to use binaries built on a different operating system/version
+because they're known to work.
+
+## Building C++ from source
+
+If building the C++ library from source fails, check the error message.
+(If you don't see an error message, only the `----- NOTE -----`,
+set the environment variable `ARROW_R_DEV=TRUE` to increase verbosity and retry installation.)
+The install script should work everywhere, so if the C++ library fails to compile,
+please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues)
+so that we can improve the script.
+
+## Known installation issues
+
+* On CentOS, if you are using a more modern `devtoolset`, you may need to set
+the environment variables `CC` and `CXX` either in the shell or in R's `Makeconf`.
+For CentOS 7 and above, both the Arrow system packages and the C++ binaries
+for R are built with the default system compilers. If you want to use either of these
+and you have a `devtoolset` installed, set `CC=/usr/bin/gcc CXX=/usr/bin/g++`
+to use the system compilers instead of the `devtoolset`.
+Alternatively, if you want to build `arrow` with the newer `devtoolset` compilers,
+set both `ARROW_USE_PKG_CONFIG` and `LIBARROW_BINARY` to `false` so that
+you build the Arrow C++ from source using those compilers.
+Compiler mismatch between the arrow system libraries and the R
+package may cause R to segfault when `arrow` package functions are used.
+See discussions [here](https://issues.apache.org/jira/browse/ARROW-8586)
+and [here](https://issues.apache.org/jira/browse/ARROW-10780).
+
+* If you have multiple versions of `zstd` installed on your system,
+installation by building the C++ from source may fail with an undefined symbols
+error. Workarounds include (1) setting `LIBARROW_BINARY` to use a C++ binary; (2)
+setting `ARROW_WITH_ZSTD=OFF` to build without `zstd`; or (3) uninstalling
+the conflicting `zstd`.
+See discussion [here](https://issues.apache.org/jira/browse/ARROW-8556).
+
+## Summary of build environment variables
+
+Some features are optional when you build Arrow from source. With the exception of `ARROW_S3`, these are all `ON` by default in the bundled C++ build, but you can set them to `OFF` to disable them.
+
+* `ARROW_S3`: If set to `ON`, S3 support will be built as long as the
+  dependencies are met; if they are not, the build script will turn this `OFF`
+* `ARROW_JEMALLOC` for the `jemalloc` memory allocator
+* `ARROW_MIMALLOC` for the `mimalloc` memory allocator
+* `ARROW_PARQUET`
+* `ARROW_DATASET`
+* `ARROW_JSON` for the JSON parsing library
+* `ARROW_WITH_RE2` for the RE2 regular expression library, used in some string compute functions
+* `ARROW_WITH_UTF8PROC` for the UTF8Proc string library, used in many other string compute functions
+* `ARROW_WITH_BROTLI`, `ARROW_WITH_BZ2`, `ARROW_WITH_LZ4`, `ARROW_WITH_SNAPPY`, `ARROW_WITH_ZLIB`, and `ARROW_WITH_ZSTD` for various compression algorithms
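+
+For example, to toggle one of these in a source build (a sketch using `zstd`; the other flags work the same way):
+
+```r
+Sys.setenv(LIBARROW_BINARY = "FALSE", ARROW_WITH_ZSTD = "OFF")
+install.packages("arrow")
+```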
+
+
+There are a number of other variables that affect the `configure` script and the bundled build script.
+By default, these are all unset. All boolean variables are case-insensitive. A combined example follows the list.
+
+* `ARROW_USE_PKG_CONFIG`: If set to `false`, the configure script
+ won't look for Arrow libraries on your system and instead will look to download/build them.
+ Use this if you have a version mismatch between installed system libraries
+ and the version of the R package you're installing.
+* `LIBARROW_BINARY`: If set to `true`, the script will try to download a binary
+  C++ library built for your operating system.
+  You may also set it to a specific `distro-version` string for a related
+  distribution whose binaries are known to work on your OS.
+  If no binary is found, installation will fall back to building the C++
+  dependencies from source.
+* `LIBARROW_BUILD`: If set to `false`, the build script
+ will not attempt to build the C++ from source. This means you will only get
+ a working `arrow` R package if a prebuilt binary is found.
+ Use this if you want to avoid compiling the C++ library, which may be slow
+ and resource-intensive, and ensure that you only use a prebuilt binary.
+* `LIBARROW_MINIMAL`: If set to `false`, the build script
+ will enable some optional features, including compression libraries, S3
+ support, and additional alternative memory allocators. This will increase the
+ source build time but results in a more fully functional library.
+* `NOT_CRAN`: If this variable is set to `true`, as the `devtools` package does,
+  the build script will set `LIBARROW_BINARY=true` and `LIBARROW_MINIMAL=false`
+  unless those environment variables are already set. This provides a more
+  complete and faster installation experience for users who already have
+  `NOT_CRAN=true` as part of their workflow, without requiring additional
+  environment variables to be set.
+* `ARROW_R_DEV`: If set to `true`, more verbose messaging will be printed
+  in the build script. `arrow::install_arrow(verbose = TRUE)` sets this.
+  This variable is also needed if you're modifying C++
+  code in the package: see the developer guide vignette.
+* `LIBARROW_DEBUG_DIR`: If building the C++ library from source fails (in `cmake`),
+  there may be messages telling you to check a log file in the build directory.
+  However, when the library is built during R package installation,
+  that location is a temp directory that has already been deleted.
+  To capture those logs, set this variable to an absolute (not relative) path
+  and the log files will be copied there.
+  The directory will be created if it does not exist.
+* `CMAKE`: When building the C++ library from source, you can specify a
+  `/path/to/cmake` to use a different version than whatever is found on the `$PATH`.
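+
+Putting several of these together, a verbose source build that also captures `cmake` logs might look like this (a sketch; the log directory is an arbitrary example):
+
+```r
+Sys.setenv(
+  ARROW_R_DEV = "true",       # verbose build messages
+  LIBARROW_BINARY = "false",  # skip prebuilt binaries, build C++ from source
+  LIBARROW_MINIMAL = "false", # enable optional features
+  LIBARROW_DEBUG_DIR = "/tmp/arrow-build-logs" # keep cmake logs on failure
+)
+install.packages("arrow")
+```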
+
+# Contributing
+
+As mentioned above, please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues)
+if you encounter ways to improve this. If you find that your Linux distribution
+or version is not supported, we welcome the contribution of Docker images
+(hosted on Docker Hub) that we can use in our continuous integration. These
+Docker images should be minimal, containing only R and the dependencies it
+requires. (For reference, see the images that
+[R-hub](https://github.com/r-hub/rhub-linux-builders) uses.)
+
+You can test the `arrow` R package installation using the `docker-compose`
+setup included in the `apache/arrow` git repository. For example,
+
+```
+R_ORG=rhub R_IMAGE=ubuntu-gcc-release R_TAG=latest docker-compose build r
+R_ORG=rhub R_IMAGE=ubuntu-gcc-release R_TAG=latest docker-compose run r
+```
+
+installs the `arrow` R package, including the C++ source build, on the
+[rhub/ubuntu-gcc-release](https://hub.docker.com/r/rhub/ubuntu-gcc-release)
+image.
diff --git a/src/arrow/r/vignettes/python.Rmd b/src/arrow/r/vignettes/python.Rmd
new file mode 100644
index 000000000..c05ee7dc7
--- /dev/null
+++ b/src/arrow/r/vignettes/python.Rmd
@@ -0,0 +1,131 @@
+---
+title: "Apache Arrow in Python and R with reticulate"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Apache Arrow in Python and R with reticulate}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+The `arrow` package provides `reticulate` methods for passing data between
+R and Python in the same process. This document provides a brief overview.
+
+## Installing
+
+To use `arrow` in Python, at a minimum you'll need the `pyarrow` library.
+To install it in a virtualenv,
+
+```r
+library(reticulate)
+virtualenv_create("arrow-env")
+install_pyarrow("arrow-env")
+```
+
+If you want to install a development version of `pyarrow`,
+add `nightly = TRUE`:
+
+```r
+install_pyarrow("arrow-env", nightly = TRUE)
+```
+
+`install_pyarrow()` also works with `conda` environments
+(`conda_create()` instead of `virtualenv_create()`).
+
+For more on installing and configuring Python,
+see the [reticulate docs](https://rstudio.github.io/reticulate/articles/python_packages.html).
+
+## Using
+
+To start, load `arrow` and `reticulate`, and then import `pyarrow`.
+
+```r
+library(arrow)
+library(reticulate)
+use_virtualenv("arrow-env")
+pa <- import("pyarrow")
+```
+
+The package includes support for sharing Arrow `Array` and `RecordBatch`
+objects in-process between R and Python. For example, let's create an `Array`
+in `pyarrow`.
+
+```r
+a <- pa$array(c(1, 2, 3))
+a
+
+## Array
+## <double>
+## [
+## 1,
+## 2,
+## 3
+## ]
+```
+
+`a` is now an `Array` object in our R session, even though we created it in Python.
+We can apply R methods to it:
+
+```r
+a[a > 1]
+
+## Array
+## <double>
+## [
+## 2,
+## 3
+## ]
+```
+
+We can send data both ways. One reason we might want to use `pyarrow` in R is
+to take advantage of functionality that is better supported in Python than in R.
+For example, `pyarrow` has a `concat_arrays` function, but as of 0.17, this
+function is not implemented in the `arrow` R package, so we can call it
+efficiently through `reticulate`.
+
+```r
+b <- Array$create(c(5, 6, 7, 8, 9))
+a_and_b <- pa$concat_arrays(list(a, b))
+a_and_b
+
+## Array
+## <double>
+## [
+## 1,
+## 2,
+## 3,
+## 5,
+## 6,
+## 7,
+## 8,
+## 9
+## ]
+```
+
+Now we have a single `Array` in R.
+
+"Send", however, isn't the correct word. Internally, we're passing pointers to
+the data between the R and Python interpreters running together in the same
+process, without copying anything. Nothing is being sent: we're sharing and
+accessing the same internal Arrow memory buffers.
+
+## Troubleshooting
+
+If you get an error like
+
+```
+Error in py_get_attr_impl(x, name, silent) :
+ AttributeError: 'pyarrow.lib.DoubleArray' object has no attribute '_export_to_c'
+```
+
+it means that the version of `pyarrow` you're using is too old.
+Support for passing data to and from R is included in versions 0.17 and greater.
+Check your pyarrow version like this:
+
+```r
+pa$`__version__`
+
+## [1] "0.16.0"
+```
+
+Note that your `pyarrow` and `arrow` versions don't themselves need to match:
+they just need to be 0.17 or greater.
diff --git a/src/arrow/ruby/Gemfile b/src/arrow/ruby/Gemfile
new file mode 100644
index 000000000..002a2a0b2
--- /dev/null
+++ b/src/arrow/ruby/Gemfile
@@ -0,0 +1,22 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+source "https://rubygems.org/"
+
+gem "pkg-config"
diff --git a/src/arrow/ruby/README.md b/src/arrow/ruby/README.md
new file mode 100644
index 000000000..fbcf61520
--- /dev/null
+++ b/src/arrow/ruby/README.md
@@ -0,0 +1,36 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Apache Arrow Ruby
+
+These are the official Ruby bindings for Apache Arrow.
+
+[Red Arrow](https://github.com/apache/arrow/tree/master/ruby/red-arrow) provides the base Apache Arrow bindings.
+
+[Red Arrow CUDA](https://github.com/apache/arrow/tree/master/ruby/red-arrow-cuda) provides the bindings for the CUDA part of Apache Arrow.
+
+[Red Arrow Dataset](https://github.com/apache/arrow/tree/master/ruby/red-arrow-dataset) provides the Apache Arrow Dataset bindings.
+
+[Red Gandiva](https://github.com/apache/arrow/tree/master/ruby/red-gandiva) provides the Gandiva bindings.
+
+[Red Plasma](https://github.com/apache/arrow/tree/master/ruby/red-plasma) provides the Plasma bindings.
+
+[Red Parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) provides the Parquet bindings.
+
diff --git a/src/arrow/ruby/Rakefile b/src/arrow/ruby/Rakefile
new file mode 100644
index 000000000..64559eff9
--- /dev/null
+++ b/src/arrow/ruby/Rakefile
@@ -0,0 +1,56 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "pkg-config"
+
+base_dir = File.join(__dir__)
+
+packages = []
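+# Include only the gems whose corresponding GLib C library
+# (e.g. arrow-glib for red-arrow) is installed.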
+Dir.glob("#{base_dir}/*/*.gemspec") do |gemspec|
+ package = File.basename(File.dirname(gemspec))
+ glib_package_name = package.gsub(/\Ared-/, "") + "-glib"
+ next unless PKGConfig.exist?(glib_package_name)
+ packages << package
+end
+
+packages.each do |package|
+ desc "Run test for #{package}"
+ task package do
+ cd(File.join(base_dir, package)) do
+ if ENV["USE_BUNDLER"]
+ sh("bundle", "exec", "rake")
+ else
+ ruby("-S", "rake")
+ end
+ end
+ end
+end
+
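+# Sort so that red-arrow, the base gem the other packages depend on, is tested first.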
+sorted_packages = packages.sort_by do |package|
+ if package == "red-arrow"
+ "000-#{package}"
+ else
+ package
+ end
+end
+
+desc "Run test for all packages"
+task all: sorted_packages
+
+task default: :all
diff --git a/src/arrow/ruby/red-arrow-cuda/.gitignore b/src/arrow/ruby/red-arrow-cuda/.gitignore
new file mode 100644
index 000000000..afd93a168
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-cuda/.gitignore
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+/Gemfile.lock
+/pkg/
diff --git a/src/arrow/ruby/red-arrow-cuda/Gemfile b/src/arrow/ruby/red-arrow-cuda/Gemfile
new file mode 100644
index 000000000..7c4cefcf3
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-cuda/Gemfile
@@ -0,0 +1,24 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+source "https://rubygems.org/"
+
+gemspec
+
+gem "red-arrow", path: "../red-arrow"
diff --git a/src/arrow/ruby/red-arrow-cuda/LICENSE.txt b/src/arrow/ruby/red-arrow-cuda/LICENSE.txt
new file mode 100644
index 000000000..d64569567
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-cuda/LICENSE.txt
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/src/arrow/ruby/red-arrow-cuda/NOTICE.txt b/src/arrow/ruby/red-arrow-cuda/NOTICE.txt
new file mode 100644
index 000000000..e08aeda8a
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-cuda/NOTICE.txt
@@ -0,0 +1,2 @@
+Apache Arrow
+Copyright 2016 The Apache Software Foundation
diff --git a/src/arrow/ruby/red-arrow-cuda/README.md b/src/arrow/ruby/red-arrow-cuda/README.md
new file mode 100644
index 000000000..f05e6640a
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-cuda/README.md
@@ -0,0 +1,60 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Red Arrow CUDA - Apache Arrow CUDA Ruby
+
+Red Arrow CUDA provides the Ruby bindings for Apache Arrow CUDA. It is based on GObject Introspection.
+
+[Apache Arrow CUDA](https://arrow.apache.org/) is an in-memory columnar data store on GPUs.
+
+[GObject Introspection](https://wiki.gnome.org/action/show/Projects/GObjectIntrospection) is middleware for generating language bindings for C libraries. It can generate language bindings automatically at runtime.
+
+Red Arrow CUDA uses [Apache Arrow CUDA GLib](https://github.com/apache/arrow/tree/master/c_glib) and the [gobject-introspection gem](https://rubygems.org/gems/gobject-introspection) to generate Ruby bindings for Apache Arrow CUDA.
+
+Apache Arrow CUDA GLib is a C wrapper for [Apache Arrow CUDA C++](https://github.com/apache/arrow/tree/master/cpp). GObject Introspection can't use Apache Arrow CUDA C++ directly, so Apache Arrow CUDA GLib acts as a bridge between Apache Arrow CUDA C++ and GObject Introspection.
+
+The gobject-introspection gem provides the Ruby bindings for GObject Introspection. Red Arrow CUDA uses GObject Introspection through this gem.
+
+## Install
+
+Install Apache Arrow CUDA GLib before installing Red Arrow CUDA, and Apache Arrow GLib before installing Red Arrow. See the [Apache Arrow install document](https://arrow.apache.org/install/) for details.
+
+Install Red Arrow CUDA after you install Apache Arrow CUDA GLib:
+
+```text
+% gem install red-arrow-cuda
+```
+
+## Usage
+
+```ruby
+require "arrow-cuda"
+
+manager = ArrowCUDA::DeviceManager.new
+if manager.n_devices.zero?
+ raise "No GPU is found"
+end
+
+context = manager[0]
+buffer = ArrowCUDA::Buffer.new(context, 128)
+ArrowCUDA::BufferOutputStream.open(buffer) do |stream|
+ stream.write("Hello World")
+end
+puts buffer.copy_to_host(0, 11) # => "Hello World"
+```
diff --git a/src/arrow/ruby/red-arrow-cuda/Rakefile b/src/arrow/ruby/red-arrow-cuda/Rakefile
new file mode 100644
index 000000000..2bbe6e761
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-cuda/Rakefile
@@ -0,0 +1,41 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "rubygems"
+require "bundler/gem_helper"
+
+base_dir = File.join(File.dirname(__FILE__))
+
+helper = Bundler::GemHelper.new(base_dir)
+helper.install
+
+release_task = Rake::Task["release"]
+release_task.prerequisites.replace(["build", "release:rubygem_push"])
+
+desc "Run tests"
+task :test do
+ cd(base_dir) do
+ cd("dependency-check") do
+ ruby("-S", "rake")
+ end
+ ruby("test/run-test.rb")
+ end
+end
+
+task default: :test
diff --git a/src/arrow/ruby/red-arrow-cuda/dependency-check/Rakefile b/src/arrow/ruby/red-arrow-cuda/dependency-check/Rakefile
new file mode 100644
index 000000000..518c1a65c
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-cuda/dependency-check/Rakefile
@@ -0,0 +1,47 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "pkg-config"
+require "native-package-installer"
+require_relative "../lib/arrow-cuda/version"
+
+case RUBY_PLATFORM
+when /mingw|mswin/
+ task :default => "nothing"
+else
+ task :default => "dependency:check"
+end
+
+task :nothing do
+end
+
+namespace :dependency do
+ desc "Check dependency"
+ task :check do
+ unless PKGConfig.check_version?("arrow-cuda-glib",
+ ArrowCUDA::Version::MAJOR,
+ ArrowCUDA::Version::MINOR,
+ ArrowCUDA::Version::MICRO)
+ unless NativePackageInstaller.install(:debian => "libarrow-cuda-glib-dev",
+ :redhat => "arrow-cuda-glib-devel")
+ exit(false)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-cuda/lib/arrow-cuda.rb b/src/arrow/ruby/red-arrow-cuda/lib/arrow-cuda.rb
new file mode 100644
index 000000000..1fc13d0a0
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-cuda/lib/arrow-cuda.rb
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "arrow"
+
+require "arrow-cuda/version"
+
+require "arrow-cuda/loader"
+
+module ArrowCUDA
+ class Error < StandardError
+ end
+
+ Loader.load
+end
diff --git a/src/arrow/ruby/red-arrow-cuda/lib/arrow-cuda/device-manager.rb b/src/arrow/ruby/red-arrow-cuda/lib/arrow-cuda/device-manager.rb
new file mode 100644
index 000000000..bbef74972
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-cuda/lib/arrow-cuda/device-manager.rb
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowCUDA
+ class DeviceManager
+ # Experimental.
+ #
+    # Can we treat the device manager as a container of contexts?
+ alias_method :[], :get_context
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-cuda/lib/arrow-cuda/loader.rb b/src/arrow/ruby/red-arrow-cuda/lib/arrow-cuda/loader.rb
new file mode 100644
index 000000000..6b2afc404
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-cuda/lib/arrow-cuda/loader.rb
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowCUDA
+ class Loader < GObjectIntrospection::Loader
+ class << self
+ def load
+ super("ArrowCUDA", ArrowCUDA)
+ end
+ end
+
+ private
+ def post_load(repository, namespace)
+ require_libraries
+ end
+
+ def require_libraries
+ require "arrow-cuda/device-manager"
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb b/src/arrow/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb
new file mode 100644
index 000000000..cc7fe0153
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-cuda/lib/arrow-cuda/version.rb
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowCUDA
+ VERSION = "6.0.1"
+
+ module Version
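+    # Split an optional pre-release tag from the numeric part (e.g. "6.0.1-rc0").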
+ numbers, TAG = VERSION.split("-")
+ MAJOR, MINOR, MICRO = numbers.split(".").collect(&:to_i)
+ STRING = VERSION
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-cuda/red-arrow-cuda.gemspec b/src/arrow/ruby/red-arrow-cuda/red-arrow-cuda.gemspec
new file mode 100644
index 000000000..7bb34c6c2
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-cuda/red-arrow-cuda.gemspec
@@ -0,0 +1,51 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require_relative "lib/arrow-cuda/version"
+
+Gem::Specification.new do |spec|
+ spec.name = "red-arrow-cuda"
+ version_components = [
+ ArrowCUDA::Version::MAJOR.to_s,
+ ArrowCUDA::Version::MINOR.to_s,
+ ArrowCUDA::Version::MICRO.to_s,
+ ArrowCUDA::Version::TAG,
+ ]
+ spec.version = version_components.compact.join(".")
+ spec.homepage = "https://arrow.apache.org/"
+ spec.authors = ["Apache Arrow Developers"]
+ spec.email = ["dev@arrow.apache.org"]
+
+ spec.summary = "Red Arrow CUDA is the Ruby bindings of Apache Arrow CUDA"
+ spec.description =
+ "Apache Arrow CUDA is a common in-memory columnar data store on CUDA. " +
+ "It's useful to share and process large data."
+ spec.license = "Apache-2.0"
+ spec.files = ["README.md", "Rakefile", "Gemfile", "#{spec.name}.gemspec"]
+ spec.files += ["LICENSE.txt", "NOTICE.txt"]
+ spec.files += Dir.glob("lib/**/*.rb")
+ spec.test_files += Dir.glob("test/**/*")
+ spec.extensions = ["dependency-check/Rakefile"]
+
+ spec.add_runtime_dependency("red-arrow", "= #{spec.version}")
+
+ spec.add_development_dependency("bundler")
+ spec.add_development_dependency("rake")
+ spec.add_development_dependency("test-unit")
+end
diff --git a/src/arrow/ruby/red-arrow-cuda/test/helper.rb b/src/arrow/ruby/red-arrow-cuda/test/helper.rb
new file mode 100644
index 000000000..045eb10ee
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-cuda/test/helper.rb
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "arrow-cuda"
+
+require "test-unit"
diff --git a/src/arrow/ruby/red-arrow-cuda/test/run-test.rb b/src/arrow/ruby/red-arrow-cuda/test/run-test.rb
new file mode 100755
index 000000000..48d2c49e1
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-cuda/test/run-test.rb
@@ -0,0 +1,50 @@
+#!/usr/bin/env ruby
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+$VERBOSE = true
+
+require "pathname"
+
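+# Make extra DLL directories (from ARROW_DLL_PATH) visible; RubyInstaller
+# on Windows needs this to find the Arrow C libraries.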
+(ENV["ARROW_DLL_PATH"] || "").split(File::PATH_SEPARATOR).each do |path|
+ RubyInstaller::Runtime.add_dll_directory(path)
+end
+
+base_dir = Pathname.new(__dir__).parent.expand_path
+arrow_base_dir = base_dir.parent + "red-arrow"
+
+lib_dir = base_dir + "lib"
+test_dir = base_dir + "test"
+
+arrow_lib_dir = arrow_base_dir + "lib"
+arrow_ext_dir = arrow_base_dir + "ext" + "arrow"
+
+build_dir = ENV["BUILD_DIR"]
+if build_dir
+ arrow_build_dir = Pathname.new(build_dir) + "red-arrow"
+else
+ arrow_build_dir = arrow_ext_dir
+end
+
+$LOAD_PATH.unshift(arrow_build_dir.to_s)
+$LOAD_PATH.unshift(arrow_lib_dir.to_s)
+$LOAD_PATH.unshift(lib_dir.to_s)
+
+require_relative "helper"
+
+exit(Test::Unit::AutoRunner.run(true, test_dir.to_s))
diff --git a/src/arrow/ruby/red-arrow-cuda/test/test-cuda.rb b/src/arrow/ruby/red-arrow-cuda/test/test-cuda.rb
new file mode 100644
index 000000000..a48b687d3
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-cuda/test/test-cuda.rb
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestCUDA < Test::Unit::TestCase
+ def setup
+ @manager = ArrowCUDA::DeviceManager.new
+ omit("At least one GPU is required") if @manager.n_devices.zero?
+ @context = @manager[0]
+ end
+
+ sub_test_case("BufferOutputStream") do
+ def setup
+ super
+ @buffer = ArrowCUDA::Buffer.new(@context, 128)
+ end
+
+ def test_new
+ ArrowCUDA::BufferOutputStream.open(@buffer) do |stream|
+ stream.write("Hello World")
+ end
+ assert_equal("Hello World", @buffer.copy_to_host(0, 11).to_s)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/.gitignore b/src/arrow/ruby/red-arrow-dataset/.gitignore
new file mode 100644
index 000000000..afd93a168
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/.gitignore
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+/Gemfile.lock
+/pkg/
diff --git a/src/arrow/ruby/red-arrow-dataset/Gemfile b/src/arrow/ruby/red-arrow-dataset/Gemfile
new file mode 100644
index 000000000..7c4cefcf3
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/Gemfile
@@ -0,0 +1,24 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+source "https://rubygems.org/"
+
+gemspec
+
+gem "red-arrow", path: "../red-arrow"
diff --git a/src/arrow/ruby/red-arrow-dataset/LICENSE.txt b/src/arrow/ruby/red-arrow-dataset/LICENSE.txt
new file mode 100644
index 000000000..d64569567
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/LICENSE.txt
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/src/arrow/ruby/red-arrow-dataset/NOTICE.txt b/src/arrow/ruby/red-arrow-dataset/NOTICE.txt
new file mode 100644
index 000000000..e08aeda8a
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/NOTICE.txt
@@ -0,0 +1,2 @@
+Apache Arrow
+Copyright 2016 The Apache Software Foundation
diff --git a/src/arrow/ruby/red-arrow-dataset/README.md b/src/arrow/ruby/red-arrow-dataset/README.md
new file mode 100644
index 000000000..b48ef0b6c
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/README.md
@@ -0,0 +1,50 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Red Arrow Dataset - Apache Arrow Dataset Ruby
+
+Red Arrow Dataset provides the Ruby bindings of Apache Arrow Dataset. It is based on GObject Introspection.
+
+[Apache Arrow Dataset](https://arrow.apache.org/) is an Apache Arrow component for reading and writing semantic datasets stored in different locations and formats.
+
+[GObject Introspection](https://wiki.gnome.org/action/show/Projects/GObjectIntrospection) is a middleware for building language bindings of C libraries. It can generate language bindings automatically at runtime.
+
+Red Arrow Dataset uses [Apache Arrow Dataset GLib](https://github.com/apache/arrow/tree/master/c_glib) and [gobject-introspection gem](https://rubygems.org/gems/gobject-introspection) to generate Ruby bindings of Apache Arrow Dataset.
+
+Apache Arrow Dataset GLib is a C wrapper for [Apache Arrow Dataset C++](https://github.com/apache/arrow/tree/master/cpp). GObject Introspection can't use Apache Arrow Dataset C++ directly, so Apache Arrow Dataset GLib serves as a bridge between Apache Arrow Dataset C++ and GObject Introspection.
+
+The gobject-introspection gem provides the Ruby bindings of GObject Introspection. Red Arrow Dataset uses GObject Introspection via the gobject-introspection gem.
+
+## Install
+
+Install Apache Arrow Dataset GLib before installing Red Arrow Dataset, and Apache Arrow GLib before installing Red Arrow. See [Apache Arrow install document](https://arrow.apache.org/install/) for details.
+
+Install Red Arrow Dataset after you install Apache Arrow Dataset GLib:
+
+```console
+$ gem install red-arrow-dataset
+```
+
+## Usage
+
+```ruby
+require "arrow-dataset"
+
+# TODO
+```
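+
+Until the TODO above is filled in, here is a minimal sketch based on this
+package's own test suite (`test/test-arrow-table.rb`); the paths and
+column names are illustrative:
+
+```ruby
+require "arrow-dataset"
+
+table = Arrow::Table.new(visible: [true, false], point: [1, 2])
+
+# Saving to a file:// URI goes through Apache Arrow Dataset.
+table.save(URI("file:///tmp/data/table1.arrow"))
+
+# Loading a directory reads all of its files as one dataset.
+Arrow::Table.load("/tmp/data")
+
+# Scanner options such as :filter are passed through:
+Arrow::Table.load("/tmp/data", filter: ["equal", :visible, true])
+```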
diff --git a/src/arrow/ruby/red-arrow-dataset/Rakefile b/src/arrow/ruby/red-arrow-dataset/Rakefile
new file mode 100644
index 000000000..2bbe6e761
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/Rakefile
@@ -0,0 +1,41 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "rubygems"
+require "bundler/gem_helper"
+
+base_dir = File.dirname(__FILE__)
+
+helper = Bundler::GemHelper.new(base_dir)
+helper.install
+
+release_task = Rake::Task["release"]
+release_task.prerequisites.replace(["build", "release:rubygem_push"])
+
+desc "Run tests"
+task :test do
+ cd(base_dir) do
+ cd("dependency-check") do
+ ruby("-S", "rake")
+ end
+ ruby("test/run-test.rb")
+ end
+end
+
+task default: :test
diff --git a/src/arrow/ruby/red-arrow-dataset/dependency-check/Rakefile b/src/arrow/ruby/red-arrow-dataset/dependency-check/Rakefile
new file mode 100644
index 000000000..df2e24905
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/dependency-check/Rakefile
@@ -0,0 +1,47 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "pkg-config"
+require "native-package-installer"
+require_relative "../lib/arrow-dataset/version"
+
+case RUBY_PLATFORM
+when /mingw|mswin/
+  # There is no system package for Arrow Dataset GLib to install on
+  # Windows, so skip the dependency check there.
+  task :default => "nothing"
+else
+ task :default => "dependency:check"
+end
+
+task :nothing do
+end
+
+namespace :dependency do
+ desc "Check dependency"
+ task :check do
+ unless PKGConfig.check_version?("arrow-dataset-glib",
+ ArrowDataset::Version::MAJOR,
+ ArrowDataset::Version::MINOR,
+ ArrowDataset::Version::MICRO)
+ unless NativePackageInstaller.install(:debian => "libarrow-dataset-glib-dev",
+ :redhat => "arrow-dataset-glib-devel")
+ exit(false)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset.rb
new file mode 100644
index 000000000..fe4f2d518
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset.rb
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "arrow"
+
+require "arrow-dataset/version"
+
+require "arrow-dataset/loader"
+
+module ArrowDataset
+ class Error < StandardError
+ end
+
+ Loader.load
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-loadable.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-loadable.rb
new file mode 100644
index 000000000..14c8dce6f
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-loadable.rb
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowDataset
+ module ArrowTableLoadable
+ private
+ def path_to_uri(path)
+ absolute_path = ::File.expand_path(path)
+ if absolute_path.start_with?("/")
+ URI("file://#{absolute_path}")
+      else
+        # Windows absolute paths ("C:/...") have no leading slash, but
+        # file URIs need one: file:///C:/...
+        URI("file:///#{absolute_path}")
+ end
+ end
+
+ def load_from_directory
+ internal_load_from_uri(path_to_uri(@input))
+ end
+
+ def load_from_uri
+ internal_load_from_uri(@input)
+ end
+
+ def internal_load_from_uri(uri)
+ format = FileFormat.resolve(@options[:format])
+ dataset = FileSystemDataset.build(format) do |factory|
+ factory.file_system_uri = uri
+ end
+ scanner_builder = dataset.begin_scan
+      # Forward the remaining options (e.g. :filter) to the scanner
+      # builder, skipping any it has no setter for.
+      @options.each do |key, value|
+ next if key == :format
+ next if value.nil?
+ setter = "#{key}="
+ next unless scanner_builder.respond_to?(setter)
+ scanner_builder.public_send(setter, value)
+ end
+ scanner = scanner_builder.finish
+ scanner.to_table
+ end
+ end
+end
+
+module Arrow
+ class TableLoader
+ include ArrowDataset::ArrowTableLoadable
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-savable.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-savable.rb
new file mode 100644
index 000000000..30ad6c292
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-savable.rb
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowDataset
+ module ArrowTableSavable
+ private
+ def save_to_uri
+ format = FileFormat.resolve(@options[:format])
+ options = FileSystemDatasetWriteOptions.new
+ options.file_write_options = format.default_write_options
+ path = @output.path
+ if @output.scheme.nil?
+ options.file_system = Arrow::LocalFileSystem.new
+ else
+ options.file_system = Arrow::FileSystem.create(@output.to_s)
+        # URI paths on Windows look like "/C:/..."; strip the leading
+        # slash to get a usable local path ("C:/...").
+ unless File.expand_path(".").start_with?("/")
+ path = path.gsub(/\A\//, "")
+ end
+ end
+ partitioning = @options[:partitioning]
+ if partitioning
+ # TODO
+ options.base_dir = File.dirname(path)
+ options.base_name_template = File.basename(path)
+        options.partitioning = Partitioning.resolve(partitioning)
+ scanner_builder = ScannerBuilder.new(@table)
+ scanner_builder.use_async(true)
+ scanner = scanner_builder.finish
+ FileSystemDataset.write_scanner(scanner, options)
+ else
+ dir = File.dirname(path)
+ unless File.exist?(dir)
+ options.file_system.create_dir(dir, true)
+ end
+ options.file_system.open_output_stream(path) do |output_stream|
+ format.open_writer(output_stream,
+ options.file_system,
+ path,
+ @table.schema,
+ format.default_write_options) do |writer|
+ reader = Arrow::TableBatchReader.new(@table)
+ writer.write_record_batch_reader(reader)
+ end
+ end
+ end
+ end
+ end
+end
+
+module Arrow
+ class TableSaver
+ include ArrowDataset::ArrowTableSavable
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/dataset.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/dataset.rb
new file mode 100644
index 000000000..a658fc3f2
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/dataset.rb
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
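+# Usage sketch (mirroring test/test-file-system-dataset.rb in this
+# patch): Dataset.build resolves the factory class by appending
+# "Factory" to the subclass name, yields it for configuration, and
+# returns the built dataset:
+#
+#   dataset = ArrowDataset::FileSystemDataset.build(format) do |factory|
+#     factory.file_system = Arrow::LocalFileSystem.new
+#     factory.add_path("/tmp/data/table.arrow")
+#   end
+#   dataset.to_table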
+module ArrowDataset
+ class Dataset
+ class << self
+ def build(*args)
+ factory_class = ArrowDataset.const_get("#{name}Factory")
+ factory = factory_class.new(*args)
+ yield(factory)
+ factory.finish
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-format.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-format.rb
new file mode 100644
index 000000000..83e61c4b2
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-format.rb
@@ -0,0 +1,59 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
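+# FileFormat.resolve maps the Symbol format names used by Arrow::Table
+# load/save options to format objects, for example:
+#
+#   ArrowDataset::FileFormat.resolve(:parquet) # => ParquetFileFormat
+#   ArrowDataset::FileFormat.resolve(:csv)     # => CSVFileFormat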
+module ArrowDataset
+ class FileFormat
+ class << self
+ def resolve(format)
+ case format
+ when :arrow, :arrow_file, :arrow_streaming
+ IPCFileFormat.new
+ when :parquet
+ ParquetFileFormat.new
+ when :csv
+ CSVFileFormat.new
+ else
+ available_formats = [
+ :arrow,
+ :arrow_file,
+ :arrow_streaming,
+ :parquet,
+ :csv,
+ ]
+        message = "Arrow::Table load/save format must be one of ["
+        message << available_formats.join(", ")
+        # Use the format argument here: resolve is a class method, so
+        # @options would always be nil.
+        message << "]: #{format.inspect}"
+ raise ArgumentError, message
+ end
+ end
+ end
+
+    # Wrap the raw GLib open_writer so it can take a block; the writer
+    # is finished (its footer written) even if the block raises.
+    alias_method :open_writer_raw, :open_writer
+ def open_writer(destination, file_system, path, schema, options)
+ writer = open_writer_raw(destination, file_system, path, schema, options)
+ if block_given?
+ begin
+ yield(writer)
+ ensure
+ writer.finish
+ end
+ else
+ writer
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-system-dataset-factory.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-system-dataset-factory.rb
new file mode 100644
index 000000000..111a29a3c
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-system-dataset-factory.rb
@@ -0,0 +1,39 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowDataset
+ class FileSystemDatasetFactory
+    # Accept URI objects in addition to strings: scheme-less URIs are
+    # expanded to absolute file:// URIs before reaching the raw GLib
+    # setter.
+    alias_method :set_file_system_uri_raw, :set_file_system_uri
+ def set_file_system_uri(uri)
+ if uri.is_a?(URI)
+ if uri.scheme.nil?
+ uri = uri.dup
+ absolute_path = File.expand_path(uri.path)
+ if absolute_path.start_with?("/")
+ uri.path = absolute_path
+          else
+            # Windows absolute paths ("C:/...") need a leading slash to
+            # be valid URI paths.
+            uri.path = "/#{absolute_path}"
+ end
+ uri.scheme = "file"
+ end
+ uri = uri.to_s
+ end
+ set_file_system_uri_raw(uri)
+ end
+ alias_method :file_system_uri=, :set_file_system_uri
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb
new file mode 100644
index 000000000..b1be000f7
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb
@@ -0,0 +1,39 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowDataset
+ class Loader < GObjectIntrospection::Loader
+ class << self
+ def load
+ super("ArrowDataset", ArrowDataset)
+ end
+ end
+
+ private
+ def post_load(repository, namespace)
+ require_libraries
+ end
+
+ def require_libraries
+ require "arrow-dataset/arrow-table-loadable"
+ require "arrow-dataset/arrow-table-savable"
+ require "arrow-dataset/dataset"
+ require "arrow-dataset/file-format"
+ require "arrow-dataset/file-system-dataset-factory"
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb
new file mode 100644
index 000000000..1a37139d1
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowDataset
+ VERSION = "6.0.1"
+
+  module Version
+    # "6.0.1" -> MAJOR: 6, MINOR: 0, MICRO: 1, TAG: nil. A pre-release
+    # such as "7.0.0-SNAPSHOT" would set TAG to "SNAPSHOT".
+    numbers, TAG = VERSION.split("-")
+ MAJOR, MINOR, MICRO = numbers.split(".").collect(&:to_i)
+ STRING = VERSION
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/red-arrow-dataset.gemspec b/src/arrow/ruby/red-arrow-dataset/red-arrow-dataset.gemspec
new file mode 100644
index 000000000..0a60925e4
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/red-arrow-dataset.gemspec
@@ -0,0 +1,51 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require_relative "lib/arrow-dataset/version"
+
+Gem::Specification.new do |spec|
+ spec.name = "red-arrow-dataset"
+ version_components = [
+ ArrowDataset::Version::MAJOR.to_s,
+ ArrowDataset::Version::MINOR.to_s,
+ ArrowDataset::Version::MICRO.to_s,
+ ArrowDataset::Version::TAG,
+ ]
+ spec.version = version_components.compact.join(".")
+ spec.homepage = "https://arrow.apache.org/"
+ spec.authors = ["Apache Arrow Developers"]
+ spec.email = ["dev@arrow.apache.org"]
+
+  spec.summary = "Red Arrow Dataset provides the Ruby bindings of Apache Arrow Dataset"
+  spec.description =
+    "Apache Arrow Dataset is an Apache Arrow component for reading and " +
+    "writing semantic datasets stored in different locations and formats."
+ spec.license = "Apache-2.0"
+ spec.files = ["README.md", "Rakefile", "Gemfile", "#{spec.name}.gemspec"]
+ spec.files += ["LICENSE.txt", "NOTICE.txt"]
+ spec.files += Dir.glob("lib/**/*.rb")
+ spec.test_files += Dir.glob("test/**/*")
+ spec.extensions = ["dependency-check/Rakefile"]
+
+ spec.add_runtime_dependency("red-arrow", "= #{spec.version}")
+
+ spec.add_development_dependency("bundler")
+ spec.add_development_dependency("rake")
+ spec.add_development_dependency("test-unit")
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/test/helper.rb b/src/arrow/ruby/red-arrow-dataset/test/helper.rb
new file mode 100644
index 000000000..7231eb1cb
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/test/helper.rb
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "arrow-dataset"
+
+require "tmpdir"
+
+require "test-unit"
diff --git a/src/arrow/ruby/red-arrow-dataset/test/run-test.rb b/src/arrow/ruby/red-arrow-dataset/test/run-test.rb
new file mode 100755
index 000000000..48d2c49e1
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/test/run-test.rb
@@ -0,0 +1,50 @@
+#!/usr/bin/env ruby
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+$VERBOSE = true
+
+require "pathname"
+
+# On Windows (RubyInstaller), the directories that hold the Arrow DLLs
+# must be registered before the extensions are loaded.
+(ENV["ARROW_DLL_PATH"] || "").split(File::PATH_SEPARATOR).each do |path|
+  RubyInstaller::Runtime.add_dll_directory(path)
+end
+
+base_dir = Pathname.new(__dir__).parent.expand_path
+arrow_base_dir = base_dir.parent + "red-arrow"
+
+lib_dir = base_dir + "lib"
+test_dir = base_dir + "test"
+
+arrow_lib_dir = arrow_base_dir + "lib"
+arrow_ext_dir = arrow_base_dir + "ext" + "arrow"
+
+build_dir = ENV["BUILD_DIR"]
+if build_dir
+ arrow_build_dir = Pathname.new(build_dir) + "red-arrow"
+else
+ arrow_build_dir = arrow_ext_dir
+end
+
+$LOAD_PATH.unshift(arrow_build_dir.to_s)
+$LOAD_PATH.unshift(arrow_lib_dir.to_s)
+$LOAD_PATH.unshift(lib_dir.to_s)
+
+require_relative "helper"
+
+exit(Test::Unit::AutoRunner.run(true, test_dir.to_s))
diff --git a/src/arrow/ruby/red-arrow-dataset/test/test-arrow-table.rb b/src/arrow/ruby/red-arrow-dataset/test/test-arrow-table.rb
new file mode 100644
index 000000000..191306374
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/test/test-arrow-table.rb
@@ -0,0 +1,80 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestArrowTable < Test::Unit::TestCase
+  def setup
+    # test-unit runs the test body inside the block passed to setup, so
+    # the temporary directory lives for the whole test.
+    Dir.mktmpdir do |tmpdir|
+ @dir = tmpdir
+ @path1 = File.join(@dir, "data", "table1.arrow")
+ @table1 = Arrow::Table.new(visible: [true, false, true],
+ point: [1, 2, 3])
+ @path2 = File.join(@dir, "data", "table2.arrow")
+ @table2 = Arrow::Table.new(visible: [true],
+ point: [10])
+ yield
+ end
+ end
+
+ def build_file_uri(path)
+ absolute_path = File.expand_path(path)
+ if absolute_path.start_with?("/")
+ URI("file://#{absolute_path}")
+ else
+ URI("file:///#{absolute_path}")
+ end
+ end
+
+ sub_test_case("load") do
+ def test_no_scheme
+ Dir.chdir(@dir) do
+ uri = URI(File.basename(@path1))
+ @table1.save(uri)
+ assert_equal(@table1, Arrow::Table.load(uri))
+ end
+ end
+
+ def test_file
+ uri = build_file_uri(@path1)
+ @table1.save(uri)
+ assert_equal(@table1, Arrow::Table.load(uri))
+ end
+
+ def test_directory_uri
+ uri = build_file_uri(@dir)
+ @table1.save(build_file_uri(@path1))
+ @table2.save(build_file_uri(@path2))
+ assert_equal(@table1.concatenate([@table2]),
+ Arrow::Table.load(uri))
+ end
+
+ def test_directory_path
+ @table1.save(build_file_uri(@path1))
+ @table2.save(build_file_uri(@path2))
+ assert_equal(@table1.concatenate([@table2]),
+ Arrow::Table.load(@dir))
+ end
+
+ def test_filter
+ @table1.save(build_file_uri(@path1))
+ @table2.save(build_file_uri(@path2))
+ assert_equal(Arrow::Table.new(visible: [true, true, true],
+ point: [1, 3, 10]),
+ Arrow::Table.load(@dir,
+ filter: ["equal", :visible, true]))
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/test/test-file-system-dataset.rb b/src/arrow/ruby/red-arrow-dataset/test/test-file-system-dataset.rb
new file mode 100644
index 000000000..17cbcb88d
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/test/test-file-system-dataset.rb
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestFileSystemDataset < Test::Unit::TestCase
+ def setup
+ Dir.mktmpdir do |tmpdir|
+ @dir = tmpdir
+ @path = File.join(@dir, "table.arrow")
+ @table = Arrow::Table.new(visible: [true, false, true],
+ point: [1, 2, 3])
+ @table.save(@path)
+ @format = ArrowDataset::IPCFileFormat.new
+ yield
+ end
+ end
+
+ test(".build") do
+ dataset = ArrowDataset::FileSystemDataset.build(@format) do |factory|
+ factory.file_system = Arrow::LocalFileSystem.new
+ factory.add_path(File.expand_path(@path))
+ end
+ assert_equal(@table, dataset.to_table)
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-flight/.gitignore b/src/arrow/ruby/red-arrow-flight/.gitignore
new file mode 100644
index 000000000..779545d90
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/.gitignore
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+/pkg/
diff --git a/src/arrow/ruby/red-arrow-flight/Gemfile b/src/arrow/ruby/red-arrow-flight/Gemfile
new file mode 100644
index 000000000..7c4cefcf3
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/Gemfile
@@ -0,0 +1,24 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+source "https://rubygems.org/"
+
+gemspec
+
+gem "red-arrow", path: "../red-arrow"
diff --git a/src/arrow/ruby/red-arrow-flight/LICENSE.txt b/src/arrow/ruby/red-arrow-flight/LICENSE.txt
new file mode 100644
index 000000000..d64569567
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/LICENSE.txt
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/src/arrow/ruby/red-arrow-flight/NOTICE.txt b/src/arrow/ruby/red-arrow-flight/NOTICE.txt
new file mode 100644
index 000000000..e08aeda8a
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/NOTICE.txt
@@ -0,0 +1,2 @@
+Apache Arrow
+Copyright 2016 The Apache Software Foundation
diff --git a/src/arrow/ruby/red-arrow-flight/README.md b/src/arrow/ruby/red-arrow-flight/README.md
new file mode 100644
index 000000000..e81f50f9a
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/README.md
@@ -0,0 +1,50 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Red Arrow Flight - Apache Arrow Flight Ruby
+
+Red Arrow Flight provides the Ruby bindings of Apache Arrow Flight. It is based on GObject Introspection.
+
+[Apache Arrow Flight](https://arrow.apache.org/) is an Apache Arrow component: a general-purpose client-server framework for high-performance transport of large datasets over network interfaces.
+
+[GObject Introspection](https://wiki.gnome.org/action/show/Projects/GObjectIntrospection) is a middleware for building language bindings of C libraries. It can generate language bindings automatically at runtime.
+
+Red Arrow Flight uses [Apache Arrow Flight GLib](https://github.com/apache/arrow/tree/master/c_glib) and [gobject-introspection gem](https://rubygems.org/gems/gobject-introspection) to generate Ruby bindings of Apache Arrow Flight.
+
+Apache Arrow Flight GLib is a C wrapper for [Apache Arrow Flight C++](https://github.com/apache/arrow/tree/master/cpp). GObject Introspection can't use Apache Arrow Flight C++ directly, so Apache Arrow Flight GLib serves as a bridge between Apache Arrow Flight C++ and GObject Introspection.
+
+The gobject-introspection gem provides the Ruby bindings of GObject Introspection. Red Arrow Flight uses GObject Introspection via the gobject-introspection gem.
+
+## Install
+
+Install Apache Arrow Flight GLib before installing Red Arrow Flight, and Apache Arrow GLib before installing Red Arrow. See [Apache Arrow install document](https://arrow.apache.org/install/) for details.
+
+Install Red Arrow Flight after you install Apache Arrow Flight GLib:
+
+```console
+$ gem install red-arrow-flight
+```
+
+## Usage
+
+```ruby
+require "arrow-flight"
+
+# TODO
+```
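+
+Until the TODO above is filled in, here is a minimal sketch of creating
+a client; it assumes a Flight server is already listening on the given
+(illustrative) URI, and `ArrowFlight::Client#list_flights` comes from
+Apache Arrow Flight GLib via GObject Introspection:
+
+```ruby
+require "arrow-flight"
+
+location = ArrowFlight::Location.new("grpc://127.0.0.1:2929")
+client = ArrowFlight::Client.new(location)
+
+# List the flights the server exposes.
+p client.list_flights
+```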
diff --git a/src/arrow/ruby/red-arrow-flight/Rakefile b/src/arrow/ruby/red-arrow-flight/Rakefile
new file mode 100644
index 000000000..2bbe6e761
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/Rakefile
@@ -0,0 +1,41 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "rubygems"
+require "bundler/gem_helper"
+
+base_dir = File.dirname(__FILE__)
+
+helper = Bundler::GemHelper.new(base_dir)
+helper.install
+
+release_task = Rake::Task["release"]
+release_task.prerequisites.replace(["build", "release:rubygem_push"])
+
+desc "Run tests"
+task :test do
+ cd(base_dir) do
+ cd("dependency-check") do
+ ruby("-S", "rake")
+ end
+ ruby("test/run-test.rb")
+ end
+end
+
+task default: :test
diff --git a/src/arrow/ruby/red-arrow-flight/dependency-check/Rakefile b/src/arrow/ruby/red-arrow-flight/dependency-check/Rakefile
new file mode 100644
index 000000000..6aca19609
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/dependency-check/Rakefile
@@ -0,0 +1,47 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "pkg-config"
+require "native-package-installer"
+require_relative "../lib/arrow-flight/version"
+
+case RUBY_PLATFORM
+when /mingw|mswin/
+  # There is no system package for Arrow Flight GLib to install on
+  # Windows, so skip the dependency check there.
+  task :default => "nothing"
+else
+ task :default => "dependency:check"
+end
+
+task :nothing do
+end
+
+namespace :dependency do
+ desc "Check dependency"
+ task :check do
+ unless PKGConfig.check_version?("arrow-flight-glib",
+ ArrowFlight::Version::MAJOR,
+ ArrowFlight::Version::MINOR,
+ ArrowFlight::Version::MICRO)
+ unless NativePackageInstaller.install(:debian => "libarrow-flight-glib-dev",
+ :redhat => "arrow-flight-glib-devel")
+ exit(false)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-flight/lib/arrow-flight.rb b/src/arrow/ruby/red-arrow-flight/lib/arrow-flight.rb
new file mode 100644
index 000000000..2070f354a
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/lib/arrow-flight.rb
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "arrow"
+
+require "arrow-flight/version"
+
+require "arrow-flight/loader"
+
+module ArrowFlight
+ class Error < StandardError
+ end
+
+ Loader.load
+end
diff --git a/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/call-options.rb b/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/call-options.rb
new file mode 100644
index 000000000..2030b2d33
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/call-options.rb
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowFlight
+ class CallOptions
+ class << self
+ def try_convert(value)
+ case value
+ when Hash
+ options = new
+ value.each do |name, value|
+ options.__send__("#{name}=", value)
+ end
+ options
+ else
+ nil
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/client-options.rb b/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/client-options.rb
new file mode 100644
index 000000000..2294b2133
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/client-options.rb
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowFlight
+ class ClientOptions
+ class << self
+ def try_convert(value)
+ case value
+ when Hash
+ options = new
+ value.each do |name, value|
+ options.__send__("#{name}=", value)
+ end
+ options
+ else
+ nil
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/loader.rb b/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/loader.rb
new file mode 100644
index 000000000..2e8878d69
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/loader.rb
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowFlight
+ class Loader < GObjectIntrospection::Loader
+ class << self
+ def load
+ super("ArrowFlight", ArrowFlight)
+ end
+ end
+
+ private
+ def post_load(repository, namespace)
+ require_libraries
+      # Flight servers and clients invoke Ruby callbacks from gRPC
+      # threads; run a dispatcher thread to execute them safely.
+      self.class.start_callback_dispatch_thread
+ end
+
+ def require_libraries
+ require "arrow-flight/call-options"
+ require "arrow-flight/client-options"
+ require "arrow-flight/location"
+ require "arrow-flight/server-options"
+ require "arrow-flight/ticket"
+ end
+
+    def should_unlock_gvl?(info, klass)
+      # Release Ruby's GVL during Flight calls so blocking network I/O
+      # doesn't stall other Ruby threads.
+      true
+    end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/location.rb b/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/location.rb
new file mode 100644
index 000000000..d49178d04
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/location.rb
@@ -0,0 +1,31 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
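+# Location.try_convert turns a URI string into a Location; the URI below
+# is illustrative:
+#
+#   ArrowFlight::Location.try_convert("grpc://127.0.0.1:2929")
+#   # => #<ArrowFlight::Location>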
+module ArrowFlight
+ class Location
+ class << self
+ def try_convert(value)
+ case value
+ when String
+ new(value)
+ else
+ nil
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/server-options.rb b/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/server-options.rb
new file mode 100644
index 000000000..f28aed87e
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/server-options.rb
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
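+# ServerOptions.try_convert accepts a Location, a Hash with a mandatory
+# :location entry (the remaining entries are assigned via setters), or
+# anything Location.try_convert understands, for example:
+#
+#   ArrowFlight::ServerOptions.try_convert("grpc://127.0.0.1:0")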
+module ArrowFlight
+ class ServerOptions
+ class << self
+ def try_convert(value)
+ case value
+ when Location
+ new(value)
+ when Hash
+ return nil unless value.key?(:location)
+ options = new(value[:location])
+ value.each do |name, value|
+ next if name == :location
+ options.__send__("#{name}=", value)
+ end
+ options
+ else
+ value = Location.try_convert(value)
+ return nil if value.nil?
+ try_convert(value)
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/ticket.rb b/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/ticket.rb
new file mode 100644
index 000000000..92afad386
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/ticket.rb
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowFlight
+ class Ticket
+ class << self
+ # @api private
+ def try_convert(value)
+ case value
+ when String
+ new(value)
+ else
+ nil
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/version.rb b/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/version.rb
new file mode 100644
index 000000000..b4a252b15
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/lib/arrow-flight/version.rb
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowFlight
+ VERSION = "6.0.1"
+
+ module Version
+ numbers, TAG = VERSION.split("-")
+ MAJOR, MINOR, MICRO = numbers.split(".").collect(&:to_i)
+ STRING = VERSION
+ end
+end
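With `VERSION = "6.0.1"` the constants decompose as follows; `TAG` would only be non-nil for a pre-release string such as the hypothetical `"7.0.0-SNAPSHOT"`:

```ruby
ArrowFlight::Version::MAJOR   # => 6
ArrowFlight::Version::MINOR   # => 0
ArrowFlight::Version::MICRO   # => 1
ArrowFlight::Version::TAG     # => nil ("6.0.1" contains no "-")
ArrowFlight::Version::STRING  # => "6.0.1"
```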
diff --git a/src/arrow/ruby/red-arrow-flight/red-arrow-flight.gemspec b/src/arrow/ruby/red-arrow-flight/red-arrow-flight.gemspec
new file mode 100644
index 000000000..efe868ca7
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/red-arrow-flight.gemspec
@@ -0,0 +1,52 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require_relative "lib/arrow-flight/version"
+
+Gem::Specification.new do |spec|
+ spec.name = "red-arrow-flight"
+ version_components = [
+ ArrowFlight::Version::MAJOR.to_s,
+ ArrowFlight::Version::MINOR.to_s,
+ ArrowFlight::Version::MICRO.to_s,
+ ArrowFlight::Version::TAG,
+ ]
+ spec.version = version_components.compact.join(".")
+ spec.homepage = "https://arrow.apache.org/"
+ spec.authors = ["Apache Arrow Developers"]
+ spec.email = ["dev@arrow.apache.org"]
+
+ spec.summary = "Red Arrow Flight is the Ruby bindings of Apache Arrow Flight"
+ spec.description =
+ "Apache Arrow Flight is a general-purpose client-server framework to " +
+ "simplify high performance transport of large datasets over " +
+ "network interfaces."
+ spec.license = "Apache-2.0"
+ spec.files = ["README.md", "Rakefile", "Gemfile", "#{spec.name}.gemspec"]
+ spec.files += ["LICENSE.txt", "NOTICE.txt"]
+ spec.files += Dir.glob("lib/**/*.rb")
+ spec.test_files += Dir.glob("test/**/*")
+ spec.extensions = ["dependency-check/Rakefile"]
+
+ spec.add_runtime_dependency("red-arrow", "= #{spec.version}")
+
+ spec.add_development_dependency("bundler")
+ spec.add_development_dependency("rake")
+ spec.add_development_dependency("test-unit")
+end
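The `compact.join(".")` above exists to drop a nil `TAG`. A worked example (the pre-release string is hypothetical):

```ruby
["6", "0", "1", nil].compact.join(".")         # => "6.0.1"
["7", "0", "0", "SNAPSHOT"].compact.join(".")  # => "7.0.0.SNAPSHOT"
```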
diff --git a/src/arrow/ruby/red-arrow-flight/test/helper.rb b/src/arrow/ruby/red-arrow-flight/test/helper.rb
new file mode 100644
index 000000000..cddfdea5f
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/test/helper.rb
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "arrow-flight"
+
+require "test-unit"
+
+require_relative "helper/server"
diff --git a/src/arrow/ruby/red-arrow-flight/test/helper/info-generator.rb b/src/arrow/ruby/red-arrow-flight/test/helper/info-generator.rb
new file mode 100644
index 000000000..e5430fd6b
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/test/helper/info-generator.rb
@@ -0,0 +1,57 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Helper
+ class InfoGenerator
+ def page_view_table
+ Arrow::Table.new("count" => Arrow::UInt64Array.new([1, 2, 3]),
+ "private" => Arrow::BooleanArray.new([true, false, true]))
+ end
+
+ def page_view_descriptor
+ ArrowFlight::PathDescriptor.new(["page-view"])
+ end
+
+ def page_view_ticket
+ "page-view"
+ end
+
+ def page_view_endpoints
+ locations = [
+ ArrowFlight::Location.new("grpc+tcp://127.0.0.1:10000"),
+ ArrowFlight::Location.new("grpc+tcp://127.0.0.1:10001"),
+ ]
+ [
+ ArrowFlight::Endpoint.new(page_view_ticket, locations),
+ ]
+ end
+
+ def page_view
+ table = page_view_table
+ descriptor = page_view_descriptor
+ endpoints = page_view_endpoints
+ output = Arrow::ResizableBuffer.new(0)
+ table.save(output, format: :stream)
+ ArrowFlight::Info.new(table.schema,
+ descriptor,
+ endpoints,
+ table.n_rows,
+ output.size)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-flight/test/helper/server.rb b/src/arrow/ruby/red-arrow-flight/test/helper/server.rb
new file mode 100644
index 000000000..269bb5f3d
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/test/helper/server.rb
@@ -0,0 +1,39 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require_relative "info-generator"
+
+module Helper
+ class Server < ArrowFlight::Server
+ type_register
+
+ private
+ def virtual_do_list_flights(context, criteria)
+ generator = InfoGenerator.new
+ [generator.page_view]
+ end
+
+ def virtual_do_do_get(context, ticket)
+ generator = InfoGenerator.new
+ if ticket.data.to_s != generator.page_view_ticket
+ raise Arrow::Error::Invalid.new("invalid ticket")
+ end
+ table = generator.page_view_table
+ ArrowFlight::RecordBatchStream.new(table)
+ end
+ end
+end
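A sketch of running this helper server standalone, using only calls that appear in the tests below (`listen`, `port`, `shutdown`); the relative path is hypothetical:

```ruby
require "arrow-flight"
require_relative "helper/server"

server = Helper::Server.new
server.listen("grpc://127.0.0.1:0")  # port 0: let the OS pick a free port
puts "Flight server listening on port #{server.port}"
# ... serve clients ...
server.shutdown
```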
diff --git a/src/arrow/ruby/red-arrow-flight/test/run-test.rb b/src/arrow/ruby/red-arrow-flight/test/run-test.rb
new file mode 100755
index 000000000..48d2c49e1
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/test/run-test.rb
@@ -0,0 +1,50 @@
+#!/usr/bin/env ruby
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+$VERBOSE = true
+
+require "pathname"
+
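+# ARROW_DLL_PATH is typically set only on Windows, where RubyInstaller
+# provides add_dll_directory; elsewhere the variable is unset and the
+# loop body never runs.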
+(ENV["ARROW_DLL_PATH"] || "").split(File::PATH_SEPARATOR).each do |path|
+ RubyInstaller::Runtime.add_dll_directory(path)
+end
+
+base_dir = Pathname.new(__dir__).parent.expand_path
+arrow_base_dir = base_dir.parent + "red-arrow"
+
+lib_dir = base_dir + "lib"
+test_dir = base_dir + "test"
+
+arrow_lib_dir = arrow_base_dir + "lib"
+arrow_ext_dir = arrow_base_dir + "ext" + "arrow"
+
+build_dir = ENV["BUILD_DIR"]
+if build_dir
+ arrow_build_dir = Pathname.new(build_dir) + "red-arrow"
+else
+ arrow_build_dir = arrow_ext_dir
+end
+
+$LOAD_PATH.unshift(arrow_build_dir.to_s)
+$LOAD_PATH.unshift(arrow_lib_dir.to_s)
+$LOAD_PATH.unshift(lib_dir.to_s)
+
+require_relative "helper"
+
+exit(Test::Unit::AutoRunner.run(true, test_dir.to_s))
diff --git a/src/arrow/ruby/red-arrow-flight/test/test-client.rb b/src/arrow/ruby/red-arrow-flight/test/test-client.rb
new file mode 100644
index 000000000..850d6f457
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/test/test-client.rb
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestClient < Test::Unit::TestCase
+ def setup
+ @server = nil
+ omit("Unstable on Windows") if Gem.win_platform?
+ @server = Helper::Server.new
+ @server.listen("grpc://127.0.0.1:0")
+ @location = "grpc://127.0.0.1:#{@server.port}"
+ end
+
+ def teardown
+ return if @server.nil?
+ @server.shutdown
+ end
+
+ def test_list_flights
+ client = ArrowFlight::Client.new(@location)
+ generator = Helper::InfoGenerator.new
+ assert_equal([generator.page_view],
+ client.list_flights)
+ end
+
+ def test_do_get
+ client = ArrowFlight::Client.new(@location)
+ generator = Helper::InfoGenerator.new
+ reader = client.do_get(generator.page_view_ticket)
+ assert_equal(generator.page_view_table,
+ reader.read_all)
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-flight/test/test-location.rb b/src/arrow/ruby/red-arrow-flight/test/test-location.rb
new file mode 100644
index 000000000..5edd5594f
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/test/test-location.rb
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestLocation < Test::Unit::TestCase
+ sub_test_case(".try_convert") do
+ def test_string
+ location = ArrowFlight::Location.try_convert("grpc://127.0.0.1:2929")
+ assert_equal("grpc://127.0.0.1:2929",
+ location.to_s)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-flight/test/test-ticket.rb b/src/arrow/ruby/red-arrow-flight/test/test-ticket.rb
new file mode 100644
index 000000000..d8668be74
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-flight/test/test-ticket.rb
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestTicket < Test::Unit::TestCase
+ sub_test_case(".try_convert") do
+ def test_string
+ ticket = ArrowFlight::Ticket.try_convert("data")
+ assert_equal("data",
+ ticket.data.to_s)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/.gitignore b/src/arrow/ruby/red-arrow/.gitignore
new file mode 100644
index 000000000..3330f8657
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/.gitignore
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+/.yardoc/
+/Gemfile.lock
+/doc/reference/
+/ext/arrow/Makefile
+/ext/arrow/mkmf.log
+/pkg/
diff --git a/src/arrow/ruby/red-arrow/.yardopts b/src/arrow/ruby/red-arrow/.yardopts
new file mode 100644
index 000000000..67159b1dc
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/.yardopts
@@ -0,0 +1,6 @@
+--output-dir doc/reference
+--markup markdown
+--no-private
+lib/**/*.rb
+-
+doc/text/*
diff --git a/src/arrow/ruby/red-arrow/Gemfile b/src/arrow/ruby/red-arrow/Gemfile
new file mode 100644
index 000000000..3907918c8
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/Gemfile
@@ -0,0 +1,22 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+source "https://rubygems.org/"
+
+gemspec
diff --git a/src/arrow/ruby/red-arrow/LICENSE.txt b/src/arrow/ruby/red-arrow/LICENSE.txt
new file mode 100644
index 000000000..d64569567
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/LICENSE.txt
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/src/arrow/ruby/red-arrow/NOTICE.txt b/src/arrow/ruby/red-arrow/NOTICE.txt
new file mode 100644
index 000000000..e08aeda8a
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/NOTICE.txt
@@ -0,0 +1,2 @@
+Apache Arrow
+Copyright 2016 The Apache Software Foundation
diff --git a/src/arrow/ruby/red-arrow/README.md b/src/arrow/ruby/red-arrow/README.md
new file mode 100644
index 000000000..4249eeae6
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/README.md
@@ -0,0 +1,75 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Red Arrow - Apache Arrow Ruby
+
+Red Arrow provides the Ruby bindings for Apache Arrow. It is built on GObject Introspection.
+
+[Apache Arrow](https://arrow.apache.org/) is an in-memory columnar data format used by many products for data analytics.
+
+[GObject Introspection](https://wiki.gnome.org/action/show/Projects/GObjectIntrospection) is middleware for building language bindings of C libraries. It can generate language bindings automatically at runtime.
+
+Red Arrow uses [Apache Arrow GLib](https://github.com/apache/arrow/tree/master/c_glib) and the [gobject-introspection gem](https://rubygems.org/gems/gobject-introspection) to generate the Ruby bindings for Apache Arrow.
+
+Apache Arrow GLib is a C wrapper for [Apache Arrow C++](https://github.com/apache/arrow/tree/master/cpp). GObject Introspection can't consume Apache Arrow C++ directly, so Apache Arrow GLib acts as the bridge between the two.
+
+The gobject-introspection gem provides the Ruby bindings for GObject Introspection; Red Arrow uses GObject Introspection through it.
+
+## Install
+
+Install Apache Arrow GLib before installing Red Arrow. See the [Apache Arrow install document](https://arrow.apache.org/install/) for details.
+
+Once Apache Arrow GLib is available, install the gem:
+
+```console
+% gem install red-arrow
+```
+
+## Usage
+
+```ruby
+require "arrow"
+
+table = Arrow::Table.load("/dev/shm/data.arrow")
+# Process data in table
+table.save("/dev/shm/data-processed.arrow")
+```
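+
+A slightly larger sketch (the file names are hypothetical; column types are
+inferred from the Ruby values, and the save/load format is inferred from the
+file extension):
+
+```ruby
+require "arrow"
+
+table = Arrow::Table.new("name" => ["alice", "bob"],
+                         "score" => [10, 20])
+table.save("scores.arrow")  # Arrow IPC file format
+table.save("scores.csv")    # CSV
+reloaded = Arrow::Table.load("scores.arrow")
+```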
+
+## Development
+
+Note that you need to install Apache Arrow C++/GLib from the master branch before setting up Red Arrow for development. See also:
+
+ * For Apache Arrow C++: https://arrow.apache.org/docs/developers/cpp/building.html
+ * For Apache Arrow GLib: https://github.com/apache/arrow/blob/master/c_glib/README.md
+
+```console
+$ cd ruby/red-arrow
+$ bundle install
+$ bundle exec rake test
+```
+
+### For macOS with Homebrew
+
+```console
+$ cd ruby/red-arrow
+$ bundle install
+$ brew install apache-arrow --head
+$ brew install apache-arrow-glib --head
+$ bundle exec rake test
+```
\ No newline at end of file
diff --git a/src/arrow/ruby/red-arrow/Rakefile b/src/arrow/ruby/red-arrow/Rakefile
new file mode 100644
index 000000000..dd2c310b6
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/Rakefile
@@ -0,0 +1,100 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "bundler/gem_helper"
+require "rake/clean"
+require "yard"
+
+base_dir = File.join(__dir__)
+
+helper = Bundler::GemHelper.new(base_dir)
+helper.install
+spec = helper.gemspec
+
+release_task = Rake::Task["release"]
+release_task.prerequisites.replace(["build", "release:rubygem_push"])
+
+def run_extconf(build_dir, extension_dir, *arguments)
+ cd(build_dir) do
+ ruby(File.join(extension_dir, "extconf.rb"),
+ *arguments)
+ end
+end
+
+spec.extensions.each do |extension|
+ extension_dir = File.join(base_dir, File.dirname(extension))
+ build_dir = ENV["BUILD_DIR"]
+ if build_dir
+ build_dir = File.join(build_dir, "red-arrow")
+ directory build_dir
+ else
+ build_dir = extension_dir
+ end
+ CLOBBER << File.join(build_dir, "Makefile")
+ CLOBBER << File.join(build_dir, "mkmf.log")
+
+ makefile = File.join(build_dir, "Makefile")
+ file makefile => build_dir do
+ run_extconf(build_dir, extension_dir)
+ end
+
+ desc "Configure"
+ task :configure => build_dir do
+ run_extconf(build_dir, extension_dir)
+ end
+
+ desc "Compile"
+ task :compile => makefile do
+ cd(build_dir) do
+ sh("make")
+ end
+ end
+
+ task :clean do
+ cd(build_dir) do
+ sh("make", "clean") if File.exist?("Makefile")
+ end
+ end
+end
+
+desc "Run tests"
+task :test do
+ cd(base_dir) do
+ ruby("test/run-test.rb")
+ end
+end
+
+task default: :test
+
+desc "Run benchmarks"
+task :benchmark do
+ benchmarks = if ENV["BENCHMARKS"]
+ ENV["BENCHMARKS"].split
+ else
+ FileList["benchmark/{,*/**/}*.yml"]
+ end
+ cd(base_dir) do
+ benchmarks.each do |benchmark|
+ sh("benchmark-driver", benchmark)
+ end
+ end
+end
+
+YARD::Rake::YardocTask.new do |task|
+end
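A sketch of a typical out-of-tree build with the tasks defined above (the `BUILD_DIR` path is hypothetical; `test/run-test.rb` honors the same variable):

```console
$ cd ruby/red-arrow
$ bundle install
$ BUILD_DIR=/tmp/red-arrow-build bundle exec rake configure
$ BUILD_DIR=/tmp/red-arrow-build bundle exec rake compile
$ BUILD_DIR=/tmp/red-arrow-build bundle exec rake test
```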
diff --git a/src/arrow/ruby/red-arrow/benchmark/raw-records/boolean.yml b/src/arrow/ruby/red-arrow/benchmark/raw-records/boolean.yml
new file mode 100644
index 000000000..5e2551e2c
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/benchmark/raw-records/boolean.yml
@@ -0,0 +1,65 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+contexts:
+ - name: master
+ prelude: |
+ $LOAD_PATH.unshift(File.expand_path("ext/arrow"))
+ $LOAD_PATH.unshift(File.expand_path("lib"))
+prelude: |-
+ require "arrow"
+ require "faker"
+
+ state = ENV.fetch("FAKER_RANDOM_SEED", 17).to_i
+ Faker::Config.random = Random.new(state)
+
+ n_rows = 1000
+ n_columns = 10
+ type = :boolean
+
+ fields = {}
+ arrays = {}
+ n_columns.times do |i|
+ column_name = "column_#{i}"
+ fields[column_name] = type
+ arrays[column_name] = n_rows.times.map { Faker::Boolean.boolean }
+ end
+ record_batch = Arrow::RecordBatch.new(fields, arrays)
+
+ def pure_ruby_raw_records(record_batch)
+ n_rows = record_batch.n_rows
+ n_columns = record_batch.n_columns
+ columns = record_batch.columns
+ records = []
+ i = 0
+ while i < n_rows
+ record = []
+ j = 0
+ while j < n_columns
+ record << columns[j][i]
+ j += 1
+ end
+ records << record
+ i += 1
+ end
+ records
+ end
+benchmark:
+ pure_ruby: |-
+ pure_ruby_raw_records(record_batch)
+ raw_records: |-
+ record_batch.raw_records
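This and the following YAML files are benchmark-driver configurations comparing a pure-Ruby conversion loop against the C-implemented `raw_records`. A sketch of running one (the `faker` gem must be installed and the extension compiled first):

```console
$ cd ruby/red-arrow
$ bundle exec rake compile
$ benchmark-driver benchmark/raw-records/boolean.yml
$ bundle exec rake benchmark   # or run every benchmark via the Rakefile task
```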
diff --git a/src/arrow/ruby/red-arrow/benchmark/raw-records/decimal128.yml b/src/arrow/ruby/red-arrow/benchmark/raw-records/decimal128.yml
new file mode 100644
index 000000000..367e7c713
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/benchmark/raw-records/decimal128.yml
@@ -0,0 +1,68 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+contexts:
+ - name: master
+ prelude: |
+ $LOAD_PATH.unshift(File.expand_path("ext/arrow"))
+ $LOAD_PATH.unshift(File.expand_path("lib"))
+prelude: |-
+ require "arrow"
+ require "faker"
+
+ state = ENV.fetch("FAKER_RANDOM_SEED", 17).to_i
+ Faker::Config.random = Random.new(state)
+
+ n_rows = 1000
+ n_columns = 10
+ type = Arrow::Decimal128DataType.new(10, 5)
+
+ fields = {}
+ arrays = {}
+ n_columns.times do |i|
+ column_name = "column_#{i}"
+ fields[column_name] = type
+ arrays[column_name] = n_rows.times.map do
+ Faker::Number.decimal(l_digits: 10, r_digits: 5)
+ end
+ end
+ record_batch = Arrow::RecordBatch.new(fields, arrays)
+
+ def pure_ruby_raw_records(record_batch)
+ n_rows = record_batch.n_rows
+ n_columns = record_batch.n_columns
+ columns = record_batch.columns
+ records = []
+ i = 0
+ while i < n_rows
+ record = []
+ j = 0
+ while j < n_columns
+ x = columns[j][i]
+ record << BigDecimal(x.to_s)
+ j += 1
+ end
+ records << record
+ i += 1
+ end
+ records
+ end
+benchmark:
+ pure_ruby: |-
+ pure_ruby_raw_records(record_batch)
+ raw_records: |-
+    record_batch.raw_records
diff --git a/src/arrow/ruby/red-arrow/benchmark/raw-records/dictionary.yml b/src/arrow/ruby/red-arrow/benchmark/raw-records/dictionary.yml
new file mode 100644
index 000000000..151bb412f
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/benchmark/raw-records/dictionary.yml
@@ -0,0 +1,75 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+contexts:
+ - name: master
+ prelude: |
+ $LOAD_PATH.unshift(File.expand_path("ext/arrow"))
+ $LOAD_PATH.unshift(File.expand_path("lib"))
+prelude: |-
+ require "arrow"
+ require "faker"
+
+ state = ENV.fetch("FAKER_RANDOM_SEED", 17).to_i
+ Faker::Config.random = Random.new(state)
+
+ n_rows = 1000
+ n_columns = 10
+ type = Arrow::DictionaryDataType.new(:int8, :string, true)
+
+ fields = n_columns.times.map {|i| ["column_#{i}".to_sym, type] }.to_h
+ schema = Arrow::Schema.new(**fields)
+ dictionary = Arrow::StringArray.new(
+ 100.times.map { Faker::Book.genre }.uniq.sort
+ )
+ indices = Arrow::Int8Array.new(
+ n_rows.times.map {
+ Faker::Number.within(range: 0 ... dictionary.length)
+ }
+ )
+ arrays = n_columns.times.map do
+ Arrow::DictionaryArray.new(
+ type,
+ indices,
+ dictionary,
+ )
+ end
+ record_batch = Arrow::RecordBatch.new(schema, n_rows, arrays)
+
+ def pure_ruby_raw_records(record_batch)
+ n_rows = record_batch.n_rows
+ n_columns = record_batch.n_columns
+ columns = record_batch.columns
+ records = []
+ i = 0
+ while i < n_rows
+ record = []
+ j = 0
+ while j < n_columns
+ record << columns[j].data.indices[i]
+ j += 1
+ end
+ records << record
+ i += 1
+ end
+ records
+ end
+benchmark:
+ pure_ruby: |-
+ pure_ruby_raw_records(record_batch)
+ raw_records: |-
+ record_batch.raw_records
diff --git a/src/arrow/ruby/red-arrow/benchmark/raw-records/int64.yml b/src/arrow/ruby/red-arrow/benchmark/raw-records/int64.yml
new file mode 100644
index 000000000..bd03ab942
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/benchmark/raw-records/int64.yml
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+contexts:
+ - name: master
+ prelude: |
+ $LOAD_PATH.unshift(File.expand_path("ext/arrow"))
+ $LOAD_PATH.unshift(File.expand_path("lib"))
+prelude: |-
+ require "arrow"
+ require "faker"
+
+ state = ENV.fetch("FAKER_RANDOM_SEED", 17).to_i
+ Faker::Config.random = Random.new(state)
+
+ n_rows = 1000
+ n_columns = 10
+ type = :int64
+
+ fields = {}
+ arrays = {}
+ n_columns.times do |i|
+ column_name = "column_#{i}"
+ fields[column_name] = type
+ arrays[column_name] = n_rows.times.map do
+ Faker::Number.number(digits: 18).to_i
+ end
+ end
+ record_batch = Arrow::RecordBatch.new(fields, arrays)
+
+ def pure_ruby_raw_records(record_batch)
+ n_rows = record_batch.n_rows
+ n_columns = record_batch.n_columns
+ columns = record_batch.columns
+ records = []
+ i = 0
+ while i < n_rows
+ record = []
+ j = 0
+ while j < n_columns
+ record << columns[j][i]
+ j += 1
+ end
+ records << record
+ i += 1
+ end
+ records
+ end
+benchmark:
+ pure_ruby: |-
+ pure_ruby_raw_records(record_batch)
+ raw_records: |-
+ record_batch.raw_records
diff --git a/src/arrow/ruby/red-arrow/benchmark/raw-records/list.yml b/src/arrow/ruby/red-arrow/benchmark/raw-records/list.yml
new file mode 100644
index 000000000..b9a526710
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/benchmark/raw-records/list.yml
@@ -0,0 +1,70 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+contexts:
+ - name: master
+ prelude: |
+ $LOAD_PATH.unshift(File.expand_path("ext/arrow"))
+ $LOAD_PATH.unshift(File.expand_path("lib"))
+prelude: |-
+ require "arrow"
+ require "faker"
+
+ state = ENV.fetch("FAKER_RANDOM_SEED", 17).to_i
+ Faker::Config.random = Random.new(state)
+
+ n_rows = 1000
+ n_columns = 10
+ type = Arrow::ListDataType.new(name: "values", type: :double)
+
+ fields = {}
+ arrays = {}
+ n_columns.times do |i|
+ column_name = "column_#{i}"
+ fields[column_name] = type
+ arrays[column_name] = n_rows.times.map do
+ n_elements = Faker::Number.within(range: 1 ... 100)
+ n_elements.times.map do
+ Faker::Number.normal(mean: 0, standard_deviation: 1e+6)
+ end
+ end
+ end
+ record_batch = Arrow::RecordBatch.new(fields, arrays)
+
+ def pure_ruby_raw_records(record_batch)
+ n_rows = record_batch.n_rows
+ n_columns = record_batch.n_columns
+ columns = record_batch.columns
+ records = []
+ i = 0
+ while i < n_rows
+ record = []
+ j = 0
+ while j < n_columns
+ record << columns[j][i]
+ j += 1
+ end
+ records << record
+ i += 1
+ end
+ records
+ end
+benchmark:
+ pure_ruby: |-
+ pure_ruby_raw_records(record_batch)
+ raw_records: |-
+ record_batch.raw_records
diff --git a/src/arrow/ruby/red-arrow/benchmark/raw-records/string.yml b/src/arrow/ruby/red-arrow/benchmark/raw-records/string.yml
new file mode 100644
index 000000000..2854a376b
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/benchmark/raw-records/string.yml
@@ -0,0 +1,65 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+contexts:
+ - name: master
+ prelude: |
+ $LOAD_PATH.unshift(File.expand_path("ext/arrow"))
+ $LOAD_PATH.unshift(File.expand_path("lib"))
+prelude: |-
+ require "arrow"
+ require "faker"
+
+ state = ENV.fetch("FAKER_RANDOM_SEED", 17).to_i
+ Faker::Config.random = Random.new(state)
+
+ n_rows = 1000
+ n_columns = 10
+ type = :string
+
+ fields = {}
+ arrays = {}
+ n_columns.times do |i|
+ column_name = "column_#{i}"
+ fields[column_name] = type
+ arrays[column_name] = n_rows.times.map { Faker::Name.name }
+ end
+ record_batch = Arrow::RecordBatch.new(fields, arrays)
+
+ def pure_ruby_raw_records(record_batch)
+ n_rows = record_batch.n_rows
+ n_columns = record_batch.n_columns
+ columns = record_batch.columns
+ records = []
+ i = 0
+ while i < n_rows
+ record = []
+ j = 0
+ while j < n_columns
+ record << columns[j][i]
+ j += 1
+ end
+ records << record
+ i += 1
+ end
+ records
+ end
+benchmark:
+ pure_ruby: |-
+ pure_ruby_raw_records(record_batch)
+ raw_records: |-
+ record_batch.raw_records
diff --git a/src/arrow/ruby/red-arrow/benchmark/raw-records/timestamp.yml b/src/arrow/ruby/red-arrow/benchmark/raw-records/timestamp.yml
new file mode 100644
index 000000000..9b65b790a
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/benchmark/raw-records/timestamp.yml
@@ -0,0 +1,75 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+contexts:
+ - name: master
+ prelude: |
+ $LOAD_PATH.unshift(File.expand_path("ext/arrow"))
+ $LOAD_PATH.unshift(File.expand_path("lib"))
+prelude: |-
+ require "arrow"
+ require "faker"
+
+ state = ENV.fetch("FAKER_RANDOM_SEED", 17).to_i
+ Faker::Config.random = Random.new(state)
+
+ n_rows = 1000
+ n_columns = 10
+ type = Arrow::TimestampDataType.new(:micro)
+ base_timestamp = Time.at(Faker::Number.within(range: 0 ... 1_000_000_000))
+ thirty_days_in_sec = 30*24*3600
+ timestamp_range = {
+ from: base_timestamp - thirty_days_in_sec,
+ to: base_timestamp + thirty_days_in_sec,
+ }
+
+ fields = {}
+ arrays = {}
+ n_columns.times do |i|
+ column_name = "column_#{i}"
+ fields[column_name] = type
+ arrays[column_name] = n_rows.times.map do
+      sec = Faker::Time.between(**timestamp_range).to_i
+ micro = Faker::Number.within(range: 0 ... 1_000_000)
+ sec * 1_000_000 + micro
+ end
+ end
+ record_batch = Arrow::RecordBatch.new(fields, arrays)
+
+ def pure_ruby_raw_records(record_batch)
+ n_rows = record_batch.n_rows
+ n_columns = record_batch.n_columns
+ columns = record_batch.columns
+ records = []
+ i = 0
+ while i < n_rows
+ record = []
+ j = 0
+ while j < n_columns
+ record << columns[j][i]
+ j += 1
+ end
+ records << record
+ i += 1
+ end
+ records
+ end
+benchmark:
+ pure_ruby: |-
+ pure_ruby_raw_records(record_batch)
+ raw_records: |-
+ record_batch.raw_records
diff --git a/src/arrow/ruby/red-arrow/benchmark/values/boolean.yml b/src/arrow/ruby/red-arrow/benchmark/values/boolean.yml
new file mode 100644
index 000000000..45abff523
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/benchmark/values/boolean.yml
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+contexts:
+ - name: master
+ prelude: |
+ $LOAD_PATH.unshift(File.expand_path("ext/arrow"))
+ $LOAD_PATH.unshift(File.expand_path("lib"))
+prelude: |-
+ require "arrow"
+ require "faker"
+
+ state = ENV.fetch("FAKER_RANDOM_SEED", 17).to_i
+ Faker::Config.random = Random.new(state)
+
+ n_values = 1000
+ values = n_values.times.map { Faker::Boolean.boolean }
+ array = Arrow::BooleanArray.new(values)
+benchmark:
+ pure_ruby: |-
+ array.collect.to_a
+ values: |-
+ array.values
diff --git a/src/arrow/ruby/red-arrow/benchmark/values/decimal128.yml b/src/arrow/ruby/red-arrow/benchmark/values/decimal128.yml
new file mode 100644
index 000000000..4a2a5bff5
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/benchmark/values/decimal128.yml
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+contexts:
+ - name: master
+ prelude: |
+ $LOAD_PATH.unshift(File.expand_path("ext/arrow"))
+ $LOAD_PATH.unshift(File.expand_path("lib"))
+prelude: |-
+ require "arrow"
+ require "faker"
+
+ state = ENV.fetch("FAKER_RANDOM_SEED", 17).to_i
+ Faker::Config.random = Random.new(state)
+
+ n_values = 1000
+ type = Arrow::Decimal128DataType.new(10, 5)
+ values = n_values.times.map { Faker::Number.decimal(l_digits: 10, r_digits: 5) }
+ array = Arrow::Decimal128Array.new(type, values)
+benchmark:
+ pure_ruby: |-
+ array.collect.to_a
+ values: |-
+ array.values
diff --git a/src/arrow/ruby/red-arrow/benchmark/values/dictionary.yml b/src/arrow/ruby/red-arrow/benchmark/values/dictionary.yml
new file mode 100644
index 000000000..5b4f20dc8
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/benchmark/values/dictionary.yml
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+contexts:
+ - name: master
+ prelude: |
+ $LOAD_PATH.unshift(File.expand_path("ext/arrow"))
+ $LOAD_PATH.unshift(File.expand_path("lib"))
+prelude: |-
+ require "arrow"
+ require "faker"
+
+ state = ENV.fetch("FAKER_RANDOM_SEED", 17).to_i
+ Faker::Config.random = Random.new(state)
+
+ n_values = 1000
+ type = Arrow::DictionaryDataType.new(:int8, :string, true)
+
+ dictionary = Arrow::StringArray.new(
+ 100.times.map { Faker::Book.genre }.uniq.sort
+ )
+ indices = Arrow::Int8Array.new(
+ n_values.times.map {
+ Faker::Number.within(range: 0 ... dictionary.length)
+ }
+ )
+ array = Arrow::DictionaryArray.new(type, indices, dictionary)
+benchmark:
+ pure_ruby: |-
+ array.length.times.collect {|i| array.indices[i]}
+ values: |-
+ array.values
diff --git a/src/arrow/ruby/red-arrow/benchmark/values/int64.yml b/src/arrow/ruby/red-arrow/benchmark/values/int64.yml
new file mode 100644
index 000000000..d9e89261a
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/benchmark/values/int64.yml
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+contexts:
+ - name: master
+ prelude: |
+ $LOAD_PATH.unshift(File.expand_path("ext/arrow"))
+ $LOAD_PATH.unshift(File.expand_path("lib"))
+prelude: |-
+ require "arrow"
+ require "faker"
+
+ state = ENV.fetch("FAKER_RANDOM_SEED", 17).to_i
+ Faker::Config.random = Random.new(state)
+
+ n_values = 1000
+ values = n_values.times.map { Faker::Number.number(digits: 18).to_i }
+ array = Arrow::Int64Array.new(values)
+benchmark:
+ pure_ruby: |-
+ array.collect.to_a
+ values: |-
+ array.values
diff --git a/src/arrow/ruby/red-arrow/benchmark/values/list.yml b/src/arrow/ruby/red-arrow/benchmark/values/list.yml
new file mode 100644
index 000000000..2764c1a61
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/benchmark/values/list.yml
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+contexts:
+ - name: master
+ prelude: |
+ $LOAD_PATH.unshift(File.expand_path("ext/arrow"))
+ $LOAD_PATH.unshift(File.expand_path("lib"))
+prelude: |-
+ require "arrow"
+ require "faker"
+
+ state = ENV.fetch("FAKER_RANDOM_SEED", 17).to_i
+ Faker::Config.random = Random.new(state)
+
+ n_values = 1000
+ type = Arrow::ListDataType.new(name: "values", type: :double)
+
+ values = n_values.times.map do
+ n_elements = Faker::Number.within(range: 1 ... 100)
+ n_elements.times.map do
+ Faker::Number.normal(mean: 0, standard_deviation: 1e+6)
+ end
+ end
+ array = Arrow::ListArray.new(type, values)
+benchmark:
+ pure_ruby: |-
+ array.collect.to_a
+ values: |-
+ array.values
diff --git a/src/arrow/ruby/red-arrow/benchmark/values/string.yml b/src/arrow/ruby/red-arrow/benchmark/values/string.yml
new file mode 100644
index 000000000..8a40deaa0
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/benchmark/values/string.yml
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+contexts:
+ - name: master
+ prelude: |
+ $LOAD_PATH.unshift(File.expand_path("ext/arrow"))
+ $LOAD_PATH.unshift(File.expand_path("lib"))
+prelude: |-
+ require "arrow"
+ require "faker"
+
+ state = ENV.fetch("FAKER_RANDOM_SEED", 17).to_i
+ Faker::Config.random = Random.new(state)
+
+ n_values = 1000
+
+ values = n_values.times.map { Faker::Name.name }
+ array = Arrow::StringArray.new(values)
+benchmark:
+ pure_ruby: |-
+ array.collect.to_a
+ values: |-
+ array.values
diff --git a/src/arrow/ruby/red-arrow/benchmark/values/timestamp.yml b/src/arrow/ruby/red-arrow/benchmark/values/timestamp.yml
new file mode 100644
index 000000000..4af46d1db
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/benchmark/values/timestamp.yml
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+contexts:
+ - name: master
+ prelude: |
+ $LOAD_PATH.unshift(File.expand_path("ext/arrow"))
+ $LOAD_PATH.unshift(File.expand_path("lib"))
+prelude: |-
+ require "arrow"
+ require "faker"
+
+ state = ENV.fetch("FAKER_RANDOM_SEED", 17).to_i
+ Faker::Config.random = Random.new(state)
+
+ n_values = 1000
+ type = Arrow::TimestampDataType.new(:micro)
+ base_timestamp = Time.at(Faker::Number.within(range: 0 ... 1_000_000_000))
+ thirty_days_in_sec = 30*24*3600
+ timestamp_range = {
+ from: base_timestamp - thirty_days_in_sec,
+ to: base_timestamp + thirty_days_in_sec,
+ }
+
+ values = n_values.times.map do
+ sec = Faker::Time.between(timestamp_range).to_i
+ micro = Faker::Number.within(range: 0 ... 1_000_000)
+ sec * 1_000_000 + micro
+ end
+ array = Arrow::TimestampArray.new(type, values)
+benchmark:
+ pure_ruby: |-
+ array.collect.to_a
+ values: |-
+ array.values
diff --git a/src/arrow/ruby/red-arrow/doc/text/development.md b/src/arrow/ruby/red-arrow/doc/text/development.md
new file mode 100644
index 000000000..cc86de35f
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/doc/text/development.md
@@ -0,0 +1,34 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Development
+
+## Naming convention
+
+### Reader and Writer
+
+Reader and Writer require an opened IO stream.
+
+### Loader and Saver
+
+Loader and Saver require a path. They are convenience classes.
+
+Loader opens the path and reads data with a Reader.
+
+Saver opens the path and writes data with a Writer.
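In code, the convention reads roughly like this (a sketch; Arrow::Table.load is assumed as the Loader-backed convenience entry point, while the Reader classes are used directly on a stream the caller opened, as in the examples below):

    require "arrow"

    # Reader: the caller opens and manages the IO stream.
    Arrow::MemoryMappedInputStream.open("/tmp/file.arrow") do |input|
      reader = Arrow::RecordBatchFileReader.new(input)
      reader.each do |record_batch|
        p record_batch
      end
    end

    # Loader: takes a path; opening and reading happen internally.
    table = Arrow::Table.load("/tmp/file.arrow")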
diff --git a/src/arrow/ruby/red-arrow/example/read-file.rb b/src/arrow/ruby/red-arrow/example/read-file.rb
new file mode 100755
index 000000000..9a99d3377
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/example/read-file.rb
@@ -0,0 +1,36 @@
+#!/usr/bin/env ruby
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "arrow"
+
+Arrow::MemoryMappedInputStream.open("/tmp/file.arrow") do |input|
+ reader = Arrow::RecordBatchFileReader.new(input)
+ fields = reader.schema.fields
+ reader.each_with_index do |record_batch, i|
+ puts("=" * 40)
+ puts("record-batch[#{i}]:")
+ fields.each do |field|
+ field_name = field.name
+ values = record_batch.collect do |record|
+ record[field_name]
+ end
+ puts(" #{field_name}: #{values.inspect}")
+ end
+ end
+end
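(The read examples expect their input files to exist: example/write-file.rb and example/write-stream.rb below create /tmp/file.arrow and /tmp/stream.arrow first.)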
diff --git a/src/arrow/ruby/red-arrow/example/read-stream.rb b/src/arrow/ruby/red-arrow/example/read-stream.rb
new file mode 100755
index 000000000..c7197120c
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/example/read-stream.rb
@@ -0,0 +1,36 @@
+#!/usr/bin/env ruby
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "arrow"
+
+Arrow::MemoryMappedInputStream.open("/tmp/stream.arrow") do |input|
+ reader = Arrow::RecordBatchStreamReader.new(input)
+ fields = reader.schema.fields
+ reader.each_with_index do |record_batch, i|
+ puts("=" * 40)
+ puts("record-batch[#{i}]:")
+ fields.each do |field|
+ field_name = field.name
+ values = record_batch.collect do |record|
+ record[field_name]
+ end
+ puts(" #{field_name}: #{values.inspect}")
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/example/write-file.rb b/src/arrow/ruby/red-arrow/example/write-file.rb
new file mode 100755
index 000000000..c55ab2ef2
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/example/write-file.rb
@@ -0,0 +1,63 @@
+#!/usr/bin/env ruby
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "arrow"
+
+fields = [
+ Arrow::Field.new("uint8", :uint8),
+ Arrow::Field.new("uint16", :uint16),
+ Arrow::Field.new("uint32", :uint32),
+ Arrow::Field.new("uint64", :uint64),
+ Arrow::Field.new("int8", :int8),
+ Arrow::Field.new("int16", :int16),
+ Arrow::Field.new("int32", :int32),
+ Arrow::Field.new("int64", :int64),
+ Arrow::Field.new("float", :float),
+ Arrow::Field.new("double", :double),
+]
+schema = Arrow::Schema.new(fields)
+
+Arrow::FileOutputStream.open("/tmp/file.arrow", false) do |output|
+ Arrow::RecordBatchFileWriter.open(output, schema) do |writer|
+ uints = [1, 2, 4, 8]
+ ints = [1, -2, 4, -8]
+ floats = [1.1, -2.2, 4.4, -8.8]
+ columns = [
+ Arrow::UInt8Array.new(uints),
+ Arrow::UInt16Array.new(uints),
+ Arrow::UInt32Array.new(uints),
+ Arrow::UInt64Array.new(uints),
+ Arrow::Int8Array.new(ints),
+ Arrow::Int16Array.new(ints),
+ Arrow::Int32Array.new(ints),
+ Arrow::Int64Array.new(ints),
+ Arrow::FloatArray.new(floats),
+ Arrow::DoubleArray.new(floats),
+ ]
+
+ record_batch = Arrow::RecordBatch.new(schema, 4, columns)
+ writer.write_record_batch(record_batch)
+
+ sliced_columns = columns.collect do |column|
+ column.slice(1, 3)
+ end
+ record_batch = Arrow::RecordBatch.new(schema, 3, sliced_columns)
+ writer.write_record_batch(record_batch)
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/example/write-stream.rb b/src/arrow/ruby/red-arrow/example/write-stream.rb
new file mode 100755
index 000000000..fde486206
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/example/write-stream.rb
@@ -0,0 +1,63 @@
+#!/usr/bin/env ruby
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "arrow"
+
+fields = [
+ Arrow::Field.new("uint8", :uint8),
+ Arrow::Field.new("uint16", :uint16),
+ Arrow::Field.new("uint32", :uint32),
+ Arrow::Field.new("uint64", :uint64),
+ Arrow::Field.new("int8", :int8),
+ Arrow::Field.new("int16", :int16),
+ Arrow::Field.new("int32", :int32),
+ Arrow::Field.new("int64", :int64),
+ Arrow::Field.new("float", :float),
+ Arrow::Field.new("double", :double),
+]
+schema = Arrow::Schema.new(fields)
+
+Arrow::FileOutputStream.open("/tmp/stream.arrow", false) do |output|
+ Arrow::RecordBatchStreamWriter.open(output, schema) do |writer|
+ uints = [1, 2, 4, 8]
+ ints = [1, -2, 4, -8]
+ floats = [1.1, -2.2, 4.4, -8.8]
+ columns = [
+ Arrow::UInt8Array.new(uints),
+ Arrow::UInt16Array.new(uints),
+ Arrow::UInt32Array.new(uints),
+ Arrow::UInt64Array.new(uints),
+ Arrow::Int8Array.new(ints),
+ Arrow::Int16Array.new(ints),
+ Arrow::Int32Array.new(ints),
+ Arrow::Int64Array.new(ints),
+ Arrow::FloatArray.new(floats),
+ Arrow::DoubleArray.new(floats),
+ ]
+
+ record_batch = Arrow::RecordBatch.new(schema, 4, columns)
+ writer.write_record_batch(record_batch)
+
+ sliced_columns = columns.collect do |column|
+ column.slice(1, 3)
+ end
+ record_batch = Arrow::RecordBatch.new(schema, 3, sliced_columns)
+ writer.write_record_batch(record_batch)
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/ext/arrow/arrow.cpp b/src/arrow/ruby/red-arrow/ext/arrow/arrow.cpp
new file mode 100644
index 000000000..86c8c8fb6
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/ext/arrow/arrow.cpp
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "red-arrow.hpp"
+#include "memory-view.hpp"
+
+#include <ruby.hpp>
+
+namespace red_arrow {
+ VALUE cDate;
+
+ VALUE cArrowTime;
+
+ VALUE ArrowTimeUnitSECOND;
+ VALUE ArrowTimeUnitMILLI;
+ VALUE ArrowTimeUnitMICRO;
+ VALUE ArrowTimeUnitNANO;
+
+ ID id_BigDecimal;
+ ID id_jd;
+ ID id_new;
+ ID id_to_datetime;
+}
+
+extern "C" void Init_arrow() {
+ auto mArrow = rb_const_get_at(rb_cObject, rb_intern("Arrow"));
+
+ auto cArrowArray = rb_const_get_at(mArrow, rb_intern("Array"));
+ rb_define_method(cArrowArray, "values",
+ reinterpret_cast<rb::RawMethod>(red_arrow::array_values),
+ 0);
+
+ auto cArrowChunkedArray = rb_const_get_at(mArrow, rb_intern("ChunkedArray"));
+ rb_define_method(cArrowChunkedArray, "values",
+ reinterpret_cast<rb::RawMethod>(red_arrow::chunked_array_values),
+ 0);
+
+ auto cArrowRecordBatch = rb_const_get_at(mArrow, rb_intern("RecordBatch"));
+ rb_define_method(cArrowRecordBatch, "raw_records",
+ reinterpret_cast<rb::RawMethod>(red_arrow::record_batch_raw_records),
+ 0);
+
+ auto cArrowTable = rb_const_get_at(mArrow, rb_intern("Table"));
+ rb_define_method(cArrowTable, "raw_records",
+ reinterpret_cast<rb::RawMethod>(red_arrow::table_raw_records),
+ 0);
+
+ red_arrow::cDate = rb_const_get(rb_cObject, rb_intern("Date"));
+
+ red_arrow::cArrowTime = rb_const_get_at(mArrow, rb_intern("Time"));
+
+ auto cArrowTimeUnit = rb_const_get_at(mArrow, rb_intern("TimeUnit"));
+ red_arrow::ArrowTimeUnitSECOND =
+ rb_const_get_at(cArrowTimeUnit, rb_intern("SECOND"));
+ red_arrow::ArrowTimeUnitMILLI =
+ rb_const_get_at(cArrowTimeUnit, rb_intern("MILLI"));
+ red_arrow::ArrowTimeUnitMICRO =
+ rb_const_get_at(cArrowTimeUnit, rb_intern("MICRO"));
+ red_arrow::ArrowTimeUnitNANO =
+ rb_const_get_at(cArrowTimeUnit, rb_intern("NANO"));
+
+ red_arrow::id_BigDecimal = rb_intern("BigDecimal");
+ red_arrow::id_jd = rb_intern("jd");
+ red_arrow::id_new = rb_intern("new");
+ red_arrow::id_to_datetime = rb_intern("to_datetime");
+
+ red_arrow::memory_view::init(mArrow);
+}
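Init_arrow attaches the native fast paths onto classes the pure-Ruby side of the gem already defines; the effect from Ruby is sketched below (hash-based Table construction assumed from Red Arrow's API):

    require "arrow"  # requiring the gem also loads this C++ extension

    array = Arrow::StringArray.new(["a", "b"])
    array.values             # Arrow::Array#values, defined natively above

    table = Arrow::Table.new("x" => [1, 2])
    table.raw_records        # Arrow::Table#raw_records, defined natively above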
diff --git a/src/arrow/ruby/red-arrow/ext/arrow/converters.cpp b/src/arrow/ruby/red-arrow/ext/arrow/converters.cpp
new file mode 100644
index 000000000..f3bfa6f34
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/ext/arrow/converters.cpp
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "converters.hpp"
+
+namespace red_arrow {
+ VALUE ArrayValueConverter::convert(const arrow::ListArray& array,
+ const int64_t i) {
+ return list_array_value_converter_->convert(array, i);
+ }
+
+ VALUE ArrayValueConverter::convert(const arrow::StructArray& array,
+ const int64_t i) {
+ return struct_array_value_converter_->convert(array, i);
+ }
+
+ VALUE ArrayValueConverter::convert(const arrow::MapArray& array,
+ const int64_t i) {
+ return map_array_value_converter_->convert(array, i);
+ }
+
+ VALUE ArrayValueConverter::convert(const arrow::UnionArray& array,
+ const int64_t i) {
+ return union_array_value_converter_->convert(array, i);
+ }
+
+ VALUE ArrayValueConverter::convert(const arrow::DictionaryArray& array,
+ const int64_t i) {
+ return dictionary_array_value_converter_->convert(array, i);
+ }
+}
diff --git a/src/arrow/ruby/red-arrow/ext/arrow/converters.hpp b/src/arrow/ruby/red-arrow/ext/arrow/converters.hpp
new file mode 100644
index 000000000..f7532f951
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/ext/arrow/converters.hpp
@@ -0,0 +1,795 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "red-arrow.hpp"
+
+#include <ruby.hpp>
+#include <ruby/encoding.h>
+
+#include <arrow-glib/error.hpp>
+
+#include <arrow/util/logging.h>
+
+namespace red_arrow {
+ class ListArrayValueConverter;
+ class StructArrayValueConverter;
+ class MapArrayValueConverter;
+ class UnionArrayValueConverter;
+ class DictionaryArrayValueConverter;
+
+ class ArrayValueConverter {
+ public:
+ ArrayValueConverter()
+ : decimal_buffer_(),
+ list_array_value_converter_(nullptr),
+ struct_array_value_converter_(nullptr),
+ map_array_value_converter_(nullptr),
+ union_array_value_converter_(nullptr),
+ dictionary_array_value_converter_(nullptr) {
+ }
+
+ inline void set_sub_value_converters(ListArrayValueConverter* list_array_value_converter,
+ StructArrayValueConverter* struct_array_value_converter,
+ MapArrayValueConverter* map_array_value_converter,
+ UnionArrayValueConverter* union_array_value_converter,
+ DictionaryArrayValueConverter* dictionary_array_value_converter) {
+ list_array_value_converter_ = list_array_value_converter;
+ struct_array_value_converter_ = struct_array_value_converter;
+ map_array_value_converter_ = map_array_value_converter;
+ union_array_value_converter_ = union_array_value_converter;
+ dictionary_array_value_converter_ = dictionary_array_value_converter;
+ }
+
+ inline VALUE convert(const arrow::NullArray& array,
+ const int64_t i) {
+ return Qnil;
+ }
+
+ inline VALUE convert(const arrow::BooleanArray& array,
+ const int64_t i) {
+ return array.Value(i) ? Qtrue : Qfalse;
+ }
+
+ inline VALUE convert(const arrow::Int8Array& array,
+ const int64_t i) {
+ return INT2NUM(array.Value(i));
+ }
+
+ inline VALUE convert(const arrow::Int16Array& array,
+ const int64_t i) {
+ return INT2NUM(array.Value(i));
+ }
+
+ inline VALUE convert(const arrow::Int32Array& array,
+ const int64_t i) {
+ return INT2NUM(array.Value(i));
+ }
+
+ inline VALUE convert(const arrow::Int64Array& array,
+ const int64_t i) {
+ return LL2NUM(array.Value(i));
+ }
+
+ inline VALUE convert(const arrow::UInt8Array& array,
+ const int64_t i) {
+ return UINT2NUM(array.Value(i));
+ }
+
+ inline VALUE convert(const arrow::UInt16Array& array,
+ const int64_t i) {
+ return UINT2NUM(array.Value(i));
+ }
+
+ inline VALUE convert(const arrow::UInt32Array& array,
+ const int64_t i) {
+ return UINT2NUM(array.Value(i));
+ }
+
+ inline VALUE convert(const arrow::UInt64Array& array,
+ const int64_t i) {
+ return ULL2NUM(array.Value(i));
+ }
+
+ // TODO
+ // inline VALUE convert(const arrow::HalfFloatArray& array,
+ // const int64_t i) {
+ // }
+
+ inline VALUE convert(const arrow::FloatArray& array,
+ const int64_t i) {
+ return DBL2NUM(array.Value(i));
+ }
+
+ inline VALUE convert(const arrow::DoubleArray& array,
+ const int64_t i) {
+ return DBL2NUM(array.Value(i));
+ }
+
+ inline VALUE convert(const arrow::BinaryArray& array,
+ const int64_t i) {
+ int32_t length;
+ const auto value = array.GetValue(i, &length);
+ // TODO: encoding support
+ return rb_enc_str_new(reinterpret_cast<const char*>(value),
+ length,
+ rb_ascii8bit_encoding());
+ }
+
+ inline VALUE convert(const arrow::StringArray& array,
+ const int64_t i) {
+ int32_t length;
+ const auto value = array.GetValue(i, &length);
+ return rb_utf8_str_new(reinterpret_cast<const char*>(value),
+ length);
+ }
+
+ inline VALUE convert(const arrow::FixedSizeBinaryArray& array,
+ const int64_t i) {
+ return rb_enc_str_new(reinterpret_cast<const char*>(array.Value(i)),
+ array.byte_width(),
+ rb_ascii8bit_encoding());
+ }
+
+ constexpr static int32_t JULIAN_DATE_UNIX_EPOCH = 2440588;
+ inline VALUE convert(const arrow::Date32Array& array,
+ const int64_t i) {
+ const auto value = array.Value(i);
+ const auto days_in_julian = value + JULIAN_DATE_UNIX_EPOCH;
+ return rb_funcall(cDate, id_jd, 1, LONG2NUM(days_in_julian));
+ }
+
+ inline VALUE convert(const arrow::Date64Array& array,
+ const int64_t i) {
+ const auto value = array.Value(i);
+ auto msec = LL2NUM(value);
+ auto sec = rb_rational_new(msec, INT2NUM(1000));
+ auto time_value = rb_time_num_new(sec, Qnil);
+ return rb_funcall(time_value, id_to_datetime, 0, 0);
+ }
+
+ inline VALUE convert(const arrow::Time32Array& array,
+ const int64_t i) {
+ const auto type =
+ arrow::internal::checked_cast<const arrow::Time32Type*>(array.type().get());
+ const auto value = array.Value(i);
+ return rb_funcall(red_arrow::cArrowTime,
+ id_new,
+ 2,
+ time_unit_to_enum(type->unit()),
+ INT2NUM(value));
+ }
+
+ inline VALUE convert(const arrow::Time64Array& array,
+ const int64_t i) {
+ const auto type =
+ arrow::internal::checked_cast<const arrow::Time64Type*>(array.type().get());
+ const auto value = array.Value(i);
+ return rb_funcall(red_arrow::cArrowTime,
+ id_new,
+ 2,
+ time_unit_to_enum(type->unit()),
+ LL2NUM(value));
+ }
+
+ inline VALUE convert(const arrow::TimestampArray& array,
+ const int64_t i) {
+ const auto type =
+ arrow::internal::checked_cast<const arrow::TimestampType*>(array.type().get());
+ auto scale = time_unit_to_scale(type->unit());
+ auto value = array.Value(i);
+ auto sec = rb_rational_new(LL2NUM(value), scale);
+ return rb_time_num_new(sec, Qnil);
+ }
+
+ // TODO
+ // inline VALUE convert(const arrow::IntervalArray& array,
+ // const int64_t i) {
+ // };
+
+ VALUE convert(const arrow::ListArray& array,
+ const int64_t i);
+
+ VALUE convert(const arrow::StructArray& array,
+ const int64_t i);
+
+ VALUE convert(const arrow::MapArray& array,
+ const int64_t i);
+
+ VALUE convert(const arrow::UnionArray& array,
+ const int64_t i);
+
+ VALUE convert(const arrow::DictionaryArray& array,
+ const int64_t i);
+
+ inline VALUE convert(const arrow::Decimal128Array& array,
+ const int64_t i) {
+ return convert_decimal(std::move(array.FormatValue(i)));
+ }
+
+ inline VALUE convert(const arrow::Decimal256Array& array,
+ const int64_t i) {
+ return convert_decimal(std::move(array.FormatValue(i)));
+ }
+
+ private:
+ inline VALUE convert_decimal(std::string&& value) {
+ decimal_buffer_ = value;
+ return rb_funcall(rb_cObject,
+ id_BigDecimal,
+ 1,
+ rb_enc_str_new(decimal_buffer_.data(),
+ decimal_buffer_.length(),
+ rb_ascii8bit_encoding()));
+ }
+
+ std::string decimal_buffer_;
+ ListArrayValueConverter* list_array_value_converter_;
+ StructArrayValueConverter* struct_array_value_converter_;
+ MapArrayValueConverter* map_array_value_converter_;
+ UnionArrayValueConverter* union_array_value_converter_;
+ DictionaryArrayValueConverter* dictionary_array_value_converter_;
+ };
+
+ class ListArrayValueConverter : public arrow::ArrayVisitor {
+ public:
+ explicit ListArrayValueConverter(ArrayValueConverter* converter)
+ : array_value_converter_(converter),
+ offset_(0),
+ length_(0),
+ result_(Qnil) {}
+
+ VALUE convert(const arrow::ListArray& array, const int64_t index) {
+ auto values = array.values().get();
+ auto offset_keep = offset_;
+ auto length_keep = length_;
+ offset_ = array.value_offset(index);
+ length_ = array.value_length(index);
+ auto result_keep = result_;
+ result_ = rb_ary_new_capa(length_);
+ check_status(values->Accept(this),
+ "[raw-records][list-array]");
+ offset_ = offset_keep;
+ length_ = length_keep;
+ auto result_return = result_;
+ result_ = result_keep;
+ return result_return;
+ }
+
+#define VISIT(TYPE) \
+ arrow::Status Visit(const arrow::TYPE ## Array& array) override { \
+ return visit_value(array); \
+ }
+
+ VISIT(Null)
+ VISIT(Boolean)
+ VISIT(Int8)
+ VISIT(Int16)
+ VISIT(Int32)
+ VISIT(Int64)
+ VISIT(UInt8)
+ VISIT(UInt16)
+ VISIT(UInt32)
+ VISIT(UInt64)
+ // TODO
+ // VISIT(HalfFloat)
+ VISIT(Float)
+ VISIT(Double)
+ VISIT(Binary)
+ VISIT(String)
+ VISIT(FixedSizeBinary)
+ VISIT(Date32)
+ VISIT(Date64)
+ VISIT(Time32)
+ VISIT(Time64)
+ VISIT(Timestamp)
+ // TODO
+ // VISIT(Interval)
+ VISIT(List)
+ VISIT(Struct)
+ VISIT(Map)
+ VISIT(SparseUnion)
+ VISIT(DenseUnion)
+ VISIT(Dictionary)
+ VISIT(Decimal128)
+ VISIT(Decimal256)
+ // TODO
+ // VISIT(Extension)
+
+#undef VISIT
+
+ private:
+ template <typename ArrayType>
+ inline VALUE convert_value(const ArrayType& array,
+ const int64_t i) {
+ return array_value_converter_->convert(array, i);
+ }
+
+ template <typename ArrayType>
+ arrow::Status visit_value(const ArrayType& array) {
+ if (array.null_count() > 0) {
+ for (int64_t i = 0; i < length_; ++i) {
+ auto value = Qnil;
+ if (!array.IsNull(i + offset_)) {
+ value = convert_value(array, i + offset_);
+ }
+ rb_ary_push(result_, value);
+ }
+ } else {
+ for (int64_t i = 0; i < length_; ++i) {
+ rb_ary_push(result_, convert_value(array, i + offset_));
+ }
+ }
+ return arrow::Status::OK();
+ }
+
+ ArrayValueConverter* array_value_converter_;
+ int32_t offset_;
+ int32_t length_;
+ VALUE result_;
+ };
+
+ class StructArrayValueConverter : public arrow::ArrayVisitor {
+ public:
+ explicit StructArrayValueConverter(ArrayValueConverter* converter)
+ : array_value_converter_(converter),
+ key_(Qnil),
+ index_(0),
+ result_(Qnil) {}
+
+ VALUE convert(const arrow::StructArray& array,
+ const int64_t index) {
+ auto index_keep = index_;
+ auto result_keep = result_;
+ index_ = index;
+ result_ = rb_hash_new();
+ const auto struct_type = array.struct_type();
+ const auto n = struct_type->num_fields();
+ for (int i = 0; i < n; ++i) {
+ const auto field_type = struct_type->field(i).get();
+ const auto& field_name = field_type->name();
+ auto key_keep = key_;
+ key_ = rb_utf8_str_new(field_name.data(), field_name.length());
+ const auto field_array = array.field(i).get();
+ check_status(field_array->Accept(this),
+ "[raw-records][struct-array]");
+ key_ = key_keep;
+ }
+ auto result_return = result_;
+ result_ = result_keep;
+ index_ = index_keep;
+ return result_return;
+ }
+
+#define VISIT(TYPE) \
+ arrow::Status Visit(const arrow::TYPE ## Array& array) override { \
+ fill_field(array); \
+ return arrow::Status::OK(); \
+ }
+
+ VISIT(Null)
+ VISIT(Boolean)
+ VISIT(Int8)
+ VISIT(Int16)
+ VISIT(Int32)
+ VISIT(Int64)
+ VISIT(UInt8)
+ VISIT(UInt16)
+ VISIT(UInt32)
+ VISIT(UInt64)
+ // TODO
+ // VISIT(HalfFloat)
+ VISIT(Float)
+ VISIT(Double)
+ VISIT(Binary)
+ VISIT(String)
+ VISIT(FixedSizeBinary)
+ VISIT(Date32)
+ VISIT(Date64)
+ VISIT(Time32)
+ VISIT(Time64)
+ VISIT(Timestamp)
+ // TODO
+ // VISIT(Interval)
+ VISIT(List)
+ VISIT(Struct)
+ VISIT(Map)
+ VISIT(SparseUnion)
+ VISIT(DenseUnion)
+ VISIT(Dictionary)
+ VISIT(Decimal128)
+ VISIT(Decimal256)
+ // TODO
+ // VISIT(Extension)
+
+#undef VISIT
+
+ private:
+ template <typename ArrayType>
+ inline VALUE convert_value(const ArrayType& array,
+ const int64_t i) {
+ return array_value_converter_->convert(array, i);
+ }
+
+ template <typename ArrayType>
+ void fill_field(const ArrayType& array) {
+ if (array.IsNull(index_)) {
+ rb_hash_aset(result_, key_, Qnil);
+ } else {
+ rb_hash_aset(result_, key_, convert_value(array, index_));
+ }
+ }
+
+ ArrayValueConverter* array_value_converter_;
+ VALUE key_;
+ int64_t index_;
+ VALUE result_;
+ };
+
+ class MapArrayValueConverter : public arrow::ArrayVisitor {
+ public:
+ explicit MapArrayValueConverter(ArrayValueConverter* converter)
+ : array_value_converter_(converter),
+ offset_(0),
+ length_(0),
+ values_(Qnil) {}
+
+ VALUE convert(const arrow::MapArray& array,
+ const int64_t index) {
+ auto key_array = array.keys().get();
+ auto item_array = array.items().get();
+ auto offset_keep = offset_;
+ auto length_keep = length_;
+ auto values_keep = values_;
+ offset_ = array.value_offset(index);
+ length_ = array.value_length(index);
+ auto keys = rb_ary_new_capa(length_);
+ values_ = keys;
+ check_status(key_array->Accept(this),
+ "[raw-records][map-array][keys]");
+ auto items = rb_ary_new_capa(length_);
+ values_ = items;
+ check_status(item_array->Accept(this),
+ "[raw-records][map-array][items]");
+ auto map = rb_hash_new();
+ auto n = RARRAY_LEN(keys);
+ auto raw_keys = RARRAY_CONST_PTR(keys);
+ auto raw_items = RARRAY_CONST_PTR(items);
+ for (long i = 0; i < n; ++i) {
+ rb_hash_aset(map, raw_keys[i], raw_items[i]);
+ }
+ offset_ = offset_keep;
+ length_ = length_keep;
+ values_ = values_keep;
+ return map;
+ }
+
+#define VISIT(TYPE) \
+ arrow::Status Visit(const arrow::TYPE ## Array& array) override { \
+ return visit_value(array); \
+ }
+
+ VISIT(Null)
+ VISIT(Boolean)
+ VISIT(Int8)
+ VISIT(Int16)
+ VISIT(Int32)
+ VISIT(Int64)
+ VISIT(UInt8)
+ VISIT(UInt16)
+ VISIT(UInt32)
+ VISIT(UInt64)
+ // TODO
+ // VISIT(HalfFloat)
+ VISIT(Float)
+ VISIT(Double)
+ VISIT(Binary)
+ VISIT(String)
+ VISIT(FixedSizeBinary)
+ VISIT(Date32)
+ VISIT(Date64)
+ VISIT(Time32)
+ VISIT(Time64)
+ VISIT(Timestamp)
+ // TODO
+ // VISIT(Interval)
+ VISIT(List)
+ VISIT(Struct)
+ VISIT(Map)
+ VISIT(SparseUnion)
+ VISIT(DenseUnion)
+ VISIT(Dictionary)
+ VISIT(Decimal128)
+ VISIT(Decimal256)
+ // TODO
+ // VISIT(Extension)
+
+#undef VISIT
+
+ private:
+ template <typename ArrayType>
+ inline VALUE convert_value(const ArrayType& array,
+ const int64_t i) {
+ return array_value_converter_->convert(array, i);
+ }
+
+ template <typename ArrayType>
+ arrow::Status visit_value(const ArrayType& array) {
+ if (array.null_count() > 0) {
+ for (int64_t i = 0; i < length_; ++i) {
+ auto value = Qnil;
+ if (!array.IsNull(i + offset_)) {
+ value = convert_value(array, i + offset_);
+ }
+ rb_ary_push(values_, value);
+ }
+ } else {
+ for (int64_t i = 0; i < length_; ++i) {
+ rb_ary_push(values_, convert_value(array, i + offset_));
+ }
+ }
+ return arrow::Status::OK();
+ }
+
+ ArrayValueConverter* array_value_converter_;
+ int32_t offset_;
+ int32_t length_;
+ VALUE values_;
+ };
+
+ class UnionArrayValueConverter : public arrow::ArrayVisitor {
+ public:
+ explicit UnionArrayValueConverter(ArrayValueConverter* converter)
+ : array_value_converter_(converter),
+ index_(0),
+ result_(Qnil) {}
+
+ VALUE convert(const arrow::UnionArray& array,
+ const int64_t index) {
+ const auto index_keep = index_;
+ const auto result_keep = result_;
+ index_ = index;
+ switch (array.mode()) {
+ case arrow::UnionMode::SPARSE:
+ convert_sparse(static_cast<const arrow::SparseUnionArray&>(array));
+ break;
+ case arrow::UnionMode::DENSE:
+ convert_dense(static_cast<const arrow::DenseUnionArray&>(array));
+ break;
+ default:
+ rb_raise(rb_eArgError, "Invalid union mode");
+ break;
+ }
+ auto result_return = result_;
+ index_ = index_keep;
+ result_ = result_keep;
+ return result_return;
+ }
+
+#define VISIT(TYPE) \
+ arrow::Status Visit(const arrow::TYPE ## Array& array) override { \
+ convert_value(array); \
+ return arrow::Status::OK(); \
+ }
+
+ VISIT(Null)
+ VISIT(Boolean)
+ VISIT(Int8)
+ VISIT(Int16)
+ VISIT(Int32)
+ VISIT(Int64)
+ VISIT(UInt8)
+ VISIT(UInt16)
+ VISIT(UInt32)
+ VISIT(UInt64)
+ // TODO
+ // VISIT(HalfFloat)
+ VISIT(Float)
+ VISIT(Double)
+ VISIT(Binary)
+ VISIT(String)
+ VISIT(FixedSizeBinary)
+ VISIT(Date32)
+ VISIT(Date64)
+ VISIT(Time32)
+ VISIT(Time64)
+ VISIT(Timestamp)
+ // TODO
+ // VISIT(Interval)
+ VISIT(List)
+ VISIT(Struct)
+ VISIT(Map)
+ VISIT(SparseUnion)
+ VISIT(DenseUnion)
+ VISIT(Dictionary)
+ VISIT(Decimal128)
+ VISIT(Decimal256)
+ // TODO
+ // VISIT(Extension)
+
+#undef VISIT
+
+ private:
+ template <typename ArrayType>
+ inline void convert_value(const ArrayType& array) {
+ auto result = rb_hash_new();
+ if (array.IsNull(index_)) {
+ rb_hash_aset(result, field_name_, Qnil);
+ } else {
+ rb_hash_aset(result,
+ field_name_,
+ array_value_converter_->convert(array, index_));
+ }
+ result_ = result;
+ }
+
+ uint8_t compute_field_index(const arrow::UnionArray& array,
+ arrow::UnionType* type,
+ const char* tag) {
+ const auto type_code = array.raw_type_codes()[index_];
+ if (type_code >= 0 && type_code <= arrow::UnionType::kMaxTypeCode) {
+ const auto field_id = type->child_ids()[type_code];
+ if (field_id >= 0) {
+ return field_id;
+ }
+ }
+ check_status(arrow::Status::Invalid("Unknown type ID: ", type_code),
+ tag);
+ return 0;
+ }
+
+ void convert_sparse(const arrow::SparseUnionArray& array) {
+ const auto type =
+ std::static_pointer_cast<arrow::UnionType>(array.type()).get();
+ const auto tag = "[raw-records][union-sparse-array]";
+ const auto index = compute_field_index(array, type, tag);
+ const auto field = type->field(index).get();
+ const auto& field_name = field->name();
+ const auto field_name_keep = field_name_;
+ field_name_ = rb_utf8_str_new(field_name.data(), field_name.length());
+ const auto field_array = array.field(index).get();
+ check_status(field_array->Accept(this), tag);
+ field_name_ = field_name_keep;
+ }
+
+ void convert_dense(const arrow::DenseUnionArray& array) {
+ const auto type =
+ std::static_pointer_cast<arrow::UnionType>(array.type()).get();
+ const auto tag = "[raw-records][union-dense-array]";
+ const auto index = compute_field_index(array, type, tag);
+ const auto field = type->field(index).get();
+ const auto& field_name = field->name();
+ const auto field_name_keep = field_name_;
+ field_name_ = rb_utf8_str_new(field_name.data(), field_name.length());
+ const auto field_array = array.field(index);
+ const auto index_keep = index_;
+ index_ = array.value_offset(index_);
+ check_status(field_array->Accept(this), tag);
+ index_ = index_keep;
+ field_name_ = field_name_keep;
+ }
+
+ ArrayValueConverter* array_value_converter_;
+ int64_t index_;
+ VALUE field_name_;
+ VALUE result_;
+ };
+
+ class DictionaryArrayValueConverter : public arrow::ArrayVisitor {
+ public:
+ explicit DictionaryArrayValueConverter(ArrayValueConverter* converter)
+ : array_value_converter_(converter),
+ value_index_(0),
+ result_(Qnil) {
+ }
+
+ VALUE convert(const arrow::DictionaryArray& array,
+ const int64_t index) {
+ value_index_ = array.GetValueIndex(index);
+ auto dictionary = array.dictionary().get();
+ check_status(dictionary->Accept(this),
+ "[raw-records][dictionary-array]");
+ return result_;
+ }
+
+#define VISIT(TYPE) \
+ arrow::Status Visit(const arrow::TYPE ## Array& array) override { \
+ result_ = convert_value(array, value_index_); \
+ return arrow::Status::OK(); \
+ }
+
+ VISIT(Null)
+ VISIT(Boolean)
+ VISIT(Int8)
+ VISIT(Int16)
+ VISIT(Int32)
+ VISIT(Int64)
+ VISIT(UInt8)
+ VISIT(UInt16)
+ VISIT(UInt32)
+ VISIT(UInt64)
+ // TODO
+ // VISIT(HalfFloat)
+ VISIT(Float)
+ VISIT(Double)
+ VISIT(Binary)
+ VISIT(String)
+ VISIT(FixedSizeBinary)
+ VISIT(Date32)
+ VISIT(Date64)
+ VISIT(Time32)
+ VISIT(Time64)
+ VISIT(Timestamp)
+ // TODO
+ // VISIT(Interval)
+ VISIT(List)
+ VISIT(Struct)
+ VISIT(Map)
+ VISIT(SparseUnion)
+ VISIT(DenseUnion)
+ VISIT(Dictionary)
+ VISIT(Decimal128)
+ VISIT(Decimal256)
+ // TODO
+ // VISIT(Extension)
+
+#undef VISIT
+
+ private:
+ template <typename ArrayType>
+ inline VALUE convert_value(const ArrayType& array,
+ const int64_t i) {
+ return array_value_converter_->convert(array, i);
+ }
+
+ ArrayValueConverter* array_value_converter_;
+ int64_t value_index_;
+ VALUE result_;
+ };
+
+ class Converter {
+ public:
+ explicit Converter()
+ : array_value_converter_(),
+ list_array_value_converter_(&array_value_converter_),
+ struct_array_value_converter_(&array_value_converter_),
+ map_array_value_converter_(&array_value_converter_),
+ union_array_value_converter_(&array_value_converter_),
+ dictionary_array_value_converter_(&array_value_converter_) {
+ array_value_converter_.
+ set_sub_value_converters(&list_array_value_converter_,
+ &struct_array_value_converter_,
+ &map_array_value_converter_,
+ &union_array_value_converter_,
+ &dictionary_array_value_converter_);
+ }
+
+ template <typename ArrayType>
+ inline VALUE convert_value(const ArrayType& array,
+ const int64_t i) {
+ return array_value_converter_.convert(array, i);
+ }
+
+ ArrayValueConverter array_value_converter_;
+ ListArrayValueConverter list_array_value_converter_;
+ StructArrayValueConverter struct_array_value_converter_;
+ MapArrayValueConverter map_array_value_converter_;
+ UnionArrayValueConverter union_array_value_converter_;
+ DictionaryArrayValueConverter dictionary_array_value_converter_;
+ };
+}
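Each nested converter above saves and restores its per-call state (offset, length, result) around the visitor dispatch, which is what lets a single converter instance recurse through arbitrarily nested types. The observable result from Ruby, sketched with the list type already used in the benchmarks (hash-based Table construction assumed):

    require "arrow"

    list_type = Arrow::ListDataType.new(name: "value", type: :int32)
    lists = Arrow::ListArray.new(list_type, [[1, 2], [3]])
    table = Arrow::Table.new("values" => lists)
    table.raw_records  # => [[[1, 2]], [[3]]]
                       # lists become Arrays, structs become Hashes keyed
                       # by field name, unions single-entry Hashes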
diff --git a/src/arrow/ruby/red-arrow/ext/arrow/extconf.rb b/src/arrow/ruby/red-arrow/ext/arrow/extconf.rb
new file mode 100644
index 000000000..9e92bd316
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/ext/arrow/extconf.rb
@@ -0,0 +1,76 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "extpp"
+require "mkmf-gnome"
+require_relative "../../lib/arrow/version"
+
+arrow_pkg_config_path = ENV["ARROW_PKG_CONFIG_PATH"]
+if arrow_pkg_config_path
+ pkg_config_paths = [arrow_pkg_config_path, ENV["PKG_CONFIG_PATH"]].compact
+ ENV["PKG_CONFIG_PATH"] = pkg_config_paths.join(File::PATH_SEPARATOR)
+end
+
+checking_for(checking_message("Homebrew")) do
+ platform = NativePackageInstaller::Platform.detect
+ if platform.is_a?(NativePackageInstaller::Platform::Homebrew)
+ openssl_prefix = `brew --prefix openssl@1.1`.chomp
+ unless openssl_prefix.empty?
+ PKGConfig.add_path("#{openssl_prefix}/lib/pkgconfig")
+ end
+ true
+ else
+ false
+ end
+end
+
+unless required_pkg_config_package([
+ "arrow",
+ Arrow::Version::MAJOR,
+ Arrow::Version::MINOR,
+ Arrow::Version::MICRO,
+ ],
+ debian: "libarrow-dev",
+ redhat: "arrow-devel",
+ homebrew: "apache-arrow",
+ msys2: "arrow")
+ exit(false)
+end
+
+unless required_pkg_config_package([
+ "arrow-glib",
+ Arrow::Version::MAJOR,
+ Arrow::Version::MINOR,
+ Arrow::Version::MICRO,
+ ],
+ debian: "libarrow-glib-dev",
+ redhat: "arrow-glib-devel",
+ homebrew: "apache-arrow-glib",
+ msys2: "arrow")
+ exit(false)
+end
+
+[
+ ["glib2", "ext/glib2"],
+].each do |name, relative_source_dir|
+ spec = find_gem_spec(name)
+ source_dir = File.join(spec.full_gem_path, relative_source_dir)
+ build_dir = source_dir
+ add_depend_package_path(name, source_dir, build_dir)
+end
+
+create_makefile("arrow")
diff --git a/src/arrow/ruby/red-arrow/ext/arrow/memory-view.cpp b/src/arrow/ruby/red-arrow/ext/arrow/memory-view.cpp
new file mode 100644
index 000000000..a3135310c
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/ext/arrow/memory-view.cpp
@@ -0,0 +1,311 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "memory-view.hpp"
+
+#include <arrow-glib/arrow-glib.hpp>
+#include <rbgobject.h>
+
+#include <ruby/version.h>
+
+#if RUBY_API_VERSION_MAJOR >= 3
+# define HAVE_MEMORY_VIEW
+# define private memory_view_private
+# include <ruby/memory_view.h>
+# undef private
+#endif
+
+#include <sstream>
+
+namespace red_arrow {
+ namespace memory_view {
+#ifdef HAVE_MEMORY_VIEW
+ // This is a workaround for the following rb_memory_view_t problems
+ // in C++:
+ //
+ // * Can't use "private" as member name
+ // * Can't assign a value to "rb_memory_view_t::private"
+ //
+ // This has a compatible layout with rb_memory_view_t.
+ struct memory_view {
+ VALUE obj;
+ void *data;
+ ssize_t byte_size;
+ bool readonly;
+ const char *format;
+ ssize_t item_size;
+ struct {
+ const rb_memory_view_item_component_t *components;
+ size_t length;
+ } item_desc;
+ ssize_t ndim;
+ const ssize_t *shape;
+ const ssize_t *strides;
+ const ssize_t *sub_offsets;
+ void *private_data;
+ };
+
+ struct PrivateData {
+ std::string format;
+ };
+
+ class PrimitiveArrayGetter : public arrow::ArrayVisitor {
+ public:
+ explicit PrimitiveArrayGetter(memory_view *view)
+ : view_(view) {
+ }
+
+ arrow::Status Visit(const arrow::BooleanArray& array) override {
+ fill(static_cast<const arrow::Array&>(array));
+ // Memory view doesn't support bit streams, so we use one byte
+ // for 8 elements. Users can't recover the number of elements
+ // from the memory view, but that's a limitation of memory view itself.
+#ifdef ARROW_LITTLE_ENDIAN
+ view_->format = "b8";
+#else
+ view_->format = "B8";
+#endif
+ view_->item_size = 1;
+ view_->byte_size = (array.length() + 7) / 8;
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::Int8Array& array) override {
+ fill(static_cast<const arrow::Array&>(array));
+ view_->format = "c";
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::Int16Array& array) override {
+ fill(static_cast<const arrow::Array&>(array));
+ view_->format = "s";
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::Int32Array& array) override {
+ fill(static_cast<const arrow::Array&>(array));
+ view_->format = "l";
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::Int64Array& array) override {
+ fill(static_cast<const arrow::Array&>(array));
+ view_->format = "q";
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::UInt8Array& array) override {
+ fill(static_cast<const arrow::Array&>(array));
+ view_->format = "C";
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::UInt16Array& array) override {
+ fill(static_cast<const arrow::Array&>(array));
+ view_->format = "S";
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::UInt32Array& array) override {
+ fill(static_cast<const arrow::Array&>(array));
+ view_->format = "L";
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::UInt64Array& array) override {
+ fill(static_cast<const arrow::Array&>(array));
+ view_->format = "Q";
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::FloatArray& array) override {
+ fill(static_cast<const arrow::Array&>(array));
+ view_->format = "f";
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::DoubleArray& array) override {
+ fill(static_cast<const arrow::Array&>(array));
+ view_->format = "d";
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::FixedSizeBinaryArray& array) override {
+ fill(static_cast<const arrow::Array&>(array));
+ auto priv = static_cast<PrivateData *>(view_->private_data);
+ const auto type =
+ std::static_pointer_cast<const arrow::FixedSizeBinaryType>(
+ array.type());
+ std::ostringstream output;
+ output << "C" << type->byte_width();
+ priv->format = output.str();
+ view_->format = priv->format.c_str();
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::Date32Array& array) override {
+ fill(static_cast<const arrow::Array&>(array));
+ view_->format = "l";
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::Date64Array& array) override {
+ fill(static_cast<const arrow::Array&>(array));
+ view_->format = "q";
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::Time32Array& array) override {
+ fill(static_cast<const arrow::Array&>(array));
+ view_->format = "l";
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::Time64Array& array) override {
+ fill(static_cast<const arrow::Array&>(array));
+ view_->format = "q";
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::TimestampArray& array) override {
+ fill(static_cast<const arrow::Array&>(array));
+ view_->format = "q";
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::Decimal128Array& array) override {
+ fill(static_cast<const arrow::Array&>(array));
+ view_->format = "q2";
+ return arrow::Status::OK();
+ }
+
+ arrow::Status Visit(const arrow::Decimal256Array& array) override {
+ fill(static_cast<const arrow::Array&>(array));
+ view_->format = "q4";
+ return arrow::Status::OK();
+ }
+
+ private:
+ void fill(const arrow::Array& array) {
+ const auto array_data = array.data();
+ const auto data = array_data->GetValuesSafe<uint8_t>(1);
+ view_->data = const_cast<void *>(reinterpret_cast<const void *>(data));
+ const auto type =
+ std::static_pointer_cast<const arrow::FixedWidthType>(array.type());
+ view_->item_size = type->bit_width() / 8;
+ view_->byte_size = view_->item_size * array.length();
+ }
+
+ memory_view *view_;
+ };
+
+ bool primitive_array_get(VALUE obj, rb_memory_view_t *view, int flags) {
+ if (flags != RUBY_MEMORY_VIEW_SIMPLE) {
+ return false;
+ }
+ auto view_ = reinterpret_cast<memory_view *>(view);
+ view_->obj = obj;
+ view_->private_data = new PrivateData();
+ auto array = GARROW_ARRAY(RVAL2GOBJ(obj));
+ auto arrow_array = garrow_array_get_raw(array);
+ PrimitiveArrayGetter getter(view_);
+ auto status = arrow_array->Accept(&getter);
+ if (!status.ok()) {
+ return false;
+ }
+ view_->readonly = true;
+ view_->ndim = 1;
+ view_->shape = NULL;
+ view_->strides = NULL;
+ view_->sub_offsets = NULL;
+ return true;
+ }
+
+ bool primitive_array_release(VALUE obj, rb_memory_view_t *view) {
+ auto view_ = reinterpret_cast<memory_view *>(view);
+ delete static_cast<PrivateData *>(view_->private_data);
+ return true;
+ }
+
+ bool primitive_array_available_p(VALUE obj) {
+ return true;
+ }
+
+ rb_memory_view_entry_t primitive_array_entry = {
+ primitive_array_get,
+ primitive_array_release,
+ primitive_array_available_p,
+ };
+
+ bool buffer_get(VALUE obj, rb_memory_view_t *view, int flags) {
+ if (flags != RUBY_MEMORY_VIEW_SIMPLE) {
+ return false;
+ }
+ auto view_ = reinterpret_cast<memory_view *>(view);
+ view_->obj = obj;
+ auto buffer = GARROW_BUFFER(RVAL2GOBJ(obj));
+ auto arrow_buffer = garrow_buffer_get_raw(buffer);
+ view_->data =
+ const_cast<void *>(reinterpret_cast<const void *>(arrow_buffer->data()));
+ // Memory view doesn't support bit streams, so we use one byte
+ // for 8 elements. Users can't recover the number of elements
+ // from the memory view, but that's a limitation of memory view itself.
+#ifdef ARROW_LITTLE_ENDIAN
+ view_->format = "b8";
+#else
+ view_->format = "B8";
+#endif
+ view_->item_size = 1;
+ view_->byte_size = arrow_buffer->size();
+ view_->readonly = true;
+ view_->ndim = 1;
+ view_->shape = NULL;
+ view_->strides = NULL;
+ view_->sub_offsets = NULL;
+ return true;
+ }
+
+ bool buffer_release(VALUE obj, rb_memory_view_t *view) {
+ return true;
+ }
+
+ bool buffer_available_p(VALUE obj) {
+ return true;
+ }
+
+ rb_memory_view_entry_t buffer_entry = {
+ buffer_get,
+ buffer_release,
+ buffer_available_p,
+ };
+#endif
+
+ void init(VALUE mArrow) {
+#ifdef HAVE_MEMORY_VIEW
+ auto cPrimitiveArray =
+ rb_const_get_at(mArrow, rb_intern("PrimitiveArray"));
+ rb_memory_view_register(cPrimitiveArray,
+ &(red_arrow::memory_view::primitive_array_entry));
+
+ auto cBuffer = rb_const_get_at(mArrow, rb_intern("Buffer"));
+ rb_memory_view_register(cBuffer, &(red_arrow::memory_view::buffer_entry));
+#endif
+ }
+ }
+}
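The registration above only happens on Ruby 3 or later, where the C-level memory view API exists. Any consumer of that API can then borrow an array's buffer without copying; a sketch assuming Fiddle::MemoryView (bundled with Ruby 3.0+) as the consumer:

    require "arrow"
    require "fiddle"

    array = Arrow::Int32Array.new([1, 2, 3])
    view = Fiddle::MemoryView.new(array)
    view.byte_size  # => 12 (three 4-byte items; format "l" per the visitor above)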
diff --git a/src/arrow/ruby/red-arrow/ext/arrow/memory-view.hpp b/src/arrow/ruby/red-arrow/ext/arrow/memory-view.hpp
new file mode 100644
index 000000000..7a7764622
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/ext/arrow/memory-view.hpp
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <ruby.hpp>
+
+namespace red_arrow {
+ namespace memory_view {
+ void init(VALUE mArrow);
+ }
+}
diff --git a/src/arrow/ruby/red-arrow/ext/arrow/raw-records.cpp b/src/arrow/ruby/red-arrow/ext/arrow/raw-records.cpp
new file mode 100644
index 000000000..16261b895
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/ext/arrow/raw-records.cpp
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "converters.hpp"
+
+namespace red_arrow {
+ namespace {
+ class RawRecordsBuilder : private Converter, public arrow::ArrayVisitor {
+ public:
+ explicit RawRecordsBuilder(VALUE records, int n_columns)
+ : Converter(),
+ records_(records),
+ n_columns_(n_columns) {
+ }
+
+ void build(const arrow::RecordBatch& record_batch) {
+ rb::protect([&] {
+ const auto n_rows = record_batch.num_rows();
+ for (int64_t i = 0; i < n_rows; ++i) {
+ auto record = rb_ary_new_capa(n_columns_);
+ rb_ary_push(records_, record);
+ }
+ row_offset_ = 0;
+ for (int i = 0; i < n_columns_; ++i) {
+ const auto array = record_batch.column(i).get();
+ column_index_ = i;
+ check_status(array->Accept(this),
+ "[record-batch][raw-records]");
+ }
+ return Qnil;
+ });
+ }
+
+ void build(const arrow::Table& table) {
+ rb::protect([&] {
+ const auto n_rows = table.num_rows();
+ for (int64_t i = 0; i < n_rows; ++i) {
+ auto record = rb_ary_new_capa(n_columns_);
+ rb_ary_push(records_, record);
+ }
+ for (int i = 0; i < n_columns_; ++i) {
+ const auto& chunked_array = table.column(i).get();
+ column_index_ = i;
+ row_offset_ = 0;
+ for (const auto array : chunked_array->chunks()) {
+ check_status(array->Accept(this),
+ "[table][raw-records]");
+ row_offset_ += array->length();
+ }
+ }
+ return Qnil;
+ });
+ }
+
+#define VISIT(TYPE) \
+ arrow::Status Visit(const arrow::TYPE ## Array& array) override { \
+ convert(array); \
+ return arrow::Status::OK(); \
+ }
+
+ VISIT(Null)
+ VISIT(Boolean)
+ VISIT(Int8)
+ VISIT(Int16)
+ VISIT(Int32)
+ VISIT(Int64)
+ VISIT(UInt8)
+ VISIT(UInt16)
+ VISIT(UInt32)
+ VISIT(UInt64)
+ // TODO
+ // VISIT(HalfFloat)
+ VISIT(Float)
+ VISIT(Double)
+ VISIT(Binary)
+ VISIT(String)
+ VISIT(FixedSizeBinary)
+ VISIT(Date32)
+ VISIT(Date64)
+ VISIT(Time32)
+ VISIT(Time64)
+ VISIT(Timestamp)
+ // TODO
+ // VISIT(Interval)
+ VISIT(List)
+ VISIT(Struct)
+ VISIT(Map)
+ VISIT(SparseUnion)
+ VISIT(DenseUnion)
+ VISIT(Dictionary)
+ VISIT(Decimal128)
+ VISIT(Decimal256)
+ // TODO
+ // VISIT(Extension)
+
+#undef VISIT
+
+ private:
+ template <typename ArrayType>
+ void convert(const ArrayType& array) {
+ const auto n = array.length();
+ if (array.null_count() > 0) {
+ for (int64_t i = 0, ii = row_offset_; i < n; ++i, ++ii) {
+ auto value = Qnil;
+ if (!array.IsNull(i)) {
+ value = convert_value(array, i);
+ }
+ auto record = rb_ary_entry(records_, ii);
+ rb_ary_store(record, column_index_, value);
+ }
+ } else {
+ for (int64_t i = 0, ii = row_offset_; i < n; ++i, ++ii) {
+ auto record = rb_ary_entry(records_, ii);
+ rb_ary_store(record, column_index_, convert_value(array, i));
+ }
+ }
+ }
+
+ // Destination for converted records.
+ VALUE records_;
+
+ // The current column index.
+ int column_index_;
+
+ // The current row offset.
+ int64_t row_offset_;
+
+ // The number of columns.
+ const int n_columns_;
+ };
+ }
+
+ VALUE
+ record_batch_raw_records(VALUE rb_record_batch) {
+ auto garrow_record_batch = GARROW_RECORD_BATCH(RVAL2GOBJ(rb_record_batch));
+ auto record_batch = garrow_record_batch_get_raw(garrow_record_batch).get();
+ const auto n_rows = record_batch->num_rows();
+ const auto n_columns = record_batch->num_columns();
+ auto records = rb_ary_new_capa(n_rows);
+
+ try {
+ RawRecordsBuilder builder(records, n_columns);
+ builder.build(*record_batch);
+ } catch (rb::State& state) {
+ state.jump();
+ }
+
+ return records;
+ }
+
+ VALUE
+ table_raw_records(VALUE rb_table) {
+ auto garrow_table = GARROW_TABLE(RVAL2GOBJ(rb_table));
+ auto table = garrow_table_get_raw(garrow_table).get();
+ const auto n_rows = table->num_rows();
+ const auto n_columns = table->num_columns();
+ auto records = rb_ary_new_capa(n_rows);
+
+ try {
+ RawRecordsBuilder builder(records, n_columns);
+ builder.build(*table);
+ } catch (rb::State& state) {
+ state.jump();
+ }
+
+ return records;
+ }
+}
diff --git a/src/arrow/ruby/red-arrow/ext/arrow/red-arrow.hpp b/src/arrow/ruby/red-arrow/ext/arrow/red-arrow.hpp
new file mode 100644
index 000000000..c3301dc7b
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/ext/arrow/red-arrow.hpp
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <arrow/api.h>
+
+#ifdef _WIN32
+# define gmtime_r gmtime_r_ruby_win32
+# define localtime_r localtime_r_ruby_win32
+# include <ruby.h>
+# undef gmtime_r
+# undef localtime_r
+#endif
+
+#include <arrow-glib/arrow-glib.hpp>
+#include <rbgobject.h>
+
+namespace red_arrow {
+ extern VALUE cDate;
+
+ extern VALUE cArrowTime;
+
+ extern VALUE ArrowTimeUnitSECOND;
+ extern VALUE ArrowTimeUnitMILLI;
+ extern VALUE ArrowTimeUnitMICRO;
+ extern VALUE ArrowTimeUnitNANO;
+
+ extern ID id_BigDecimal;
+ extern ID id_jd;
+ extern ID id_new;
+ extern ID id_to_datetime;
+
+ VALUE array_values(VALUE obj);
+ VALUE chunked_array_values(VALUE obj);
+
+ VALUE record_batch_raw_records(VALUE obj);
+ VALUE table_raw_records(VALUE obj);
+
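+  // Maps an Arrow time unit to the number of sub-units per second,
+  // e.g. time_unit_to_scale(arrow::TimeUnit::MILLI) == INT2FIX(1000).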
+ inline VALUE time_unit_to_scale(const arrow::TimeUnit::type unit) {
+ switch (unit) {
+ case arrow::TimeUnit::SECOND:
+ return INT2FIX(1);
+ case arrow::TimeUnit::MILLI:
+ return INT2FIX(1000);
+ case arrow::TimeUnit::MICRO:
+ return INT2FIX(1000 * 1000);
+ case arrow::TimeUnit::NANO:
+ // NOTE: INT2FIX works for 1e+9 because: FIXNUM_MAX >= (1<<30) - 1 > 1e+9
+ return INT2FIX(1000 * 1000 * 1000);
+ default:
+ rb_raise(rb_eArgError, "invalid arrow::TimeUnit: %d", unit);
+ return Qnil;
+ }
+ }
+
+ inline VALUE time_unit_to_enum(const arrow::TimeUnit::type unit) {
+ switch (unit) {
+ case arrow::TimeUnit::SECOND:
+ return red_arrow::ArrowTimeUnitSECOND;
+ case arrow::TimeUnit::MILLI:
+ return red_arrow::ArrowTimeUnitMILLI;
+ case arrow::TimeUnit::MICRO:
+ return red_arrow::ArrowTimeUnitMICRO;
+ case arrow::TimeUnit::NANO:
+ return red_arrow::ArrowTimeUnitNANO;
+ default:
+ rb_raise(rb_eArgError, "invalid arrow::TimeUnit: %d", unit);
+ return Qnil;
+ }
+ }
+
+ inline void check_status(const arrow::Status&& status, const char* context) {
+ GError* error = nullptr;
+ if (!garrow_error_check(&error, status, context)) {
+ RG_RAISE_ERROR(error);
+ }
+ }
+}
diff --git a/src/arrow/ruby/red-arrow/ext/arrow/values.cpp b/src/arrow/ruby/red-arrow/ext/arrow/values.cpp
new file mode 100644
index 000000000..a8a5775b9
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/ext/arrow/values.cpp
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "converters.hpp"
+
+namespace red_arrow {
+ namespace {
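+    // Flattens an Array or ChunkedArray into the given Ruby Array of
+    // values; row_offset_ tracks the write position across chunks.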
+ class ValuesBuilder : private Converter, public arrow::ArrayVisitor {
+ public:
+ explicit ValuesBuilder(VALUE values)
+ : Converter(),
+ values_(values),
+ row_offset_(0) {
+ }
+
+ void build(const arrow::Array& array, VALUE rb_array) {
+ rb::protect([&] {
+ check_status(array.Accept(this),
+ "[array][values]");
+ return Qnil;
+ });
+ }
+
+ void build(const arrow::ChunkedArray& chunked_array,
+ VALUE rb_chunked_array) {
+ rb::protect([&] {
+ for (const auto& array : chunked_array.chunks()) {
+ check_status(array->Accept(this),
+ "[chunked-array][values]");
+ row_offset_ += array->length();
+ }
+ return Qnil;
+ });
+ }
+
+#define VISIT(TYPE) \
+ arrow::Status Visit(const arrow::TYPE ## Array& array) override { \
+ convert(array); \
+ return arrow::Status::OK(); \
+ }
+
+ VISIT(Null)
+ VISIT(Boolean)
+ VISIT(Int8)
+ VISIT(Int16)
+ VISIT(Int32)
+ VISIT(Int64)
+ VISIT(UInt8)
+ VISIT(UInt16)
+ VISIT(UInt32)
+ VISIT(UInt64)
+ // TODO
+ // VISIT(HalfFloat)
+ VISIT(Float)
+ VISIT(Double)
+ VISIT(Binary)
+ VISIT(String)
+ VISIT(FixedSizeBinary)
+ VISIT(Date32)
+ VISIT(Date64)
+ VISIT(Time32)
+ VISIT(Time64)
+ VISIT(Timestamp)
+ // TODO
+ // VISIT(Interval)
+ VISIT(List)
+ VISIT(Struct)
+ VISIT(Map)
+ VISIT(SparseUnion)
+ VISIT(DenseUnion)
+ VISIT(Dictionary)
+ VISIT(Decimal128)
+ VISIT(Decimal256)
+ // TODO
+ // VISIT(Extension)
+
+#undef VISIT
+
+ private:
+ template <typename ArrayType>
+ void convert(const ArrayType& array) {
+ const auto n = array.length();
+ if (array.null_count() > 0) {
+ for (int64_t i = 0, ii = row_offset_; i < n; ++i, ++ii) {
+ auto value = Qnil;
+ if (!array.IsNull(i)) {
+ value = convert_value(array, i);
+ }
+ rb_ary_store(values_, ii, value);
+ }
+ } else {
+ for (int64_t i = 0, ii = row_offset_; i < n; ++i, ++ii) {
+ rb_ary_store(values_, ii, convert_value(array, i));
+ }
+ }
+ }
+
+ // Destination for converted values.
+ VALUE values_;
+
+ // The current row offset.
+ int64_t row_offset_;
+ };
+ }
+
+ VALUE
+ array_values(VALUE rb_array) {
+ auto garrow_array = GARROW_ARRAY(RVAL2GOBJ(rb_array));
+ auto array = garrow_array_get_raw(garrow_array).get();
+ const auto n_rows = array->length();
+ auto values = rb_ary_new_capa(n_rows);
+
+ try {
+ ValuesBuilder builder(values);
+ builder.build(*array, rb_array);
+ } catch (rb::State& state) {
+ state.jump();
+ }
+
+ return values;
+ }
+
+ VALUE
+ chunked_array_values(VALUE rb_chunked_array) {
+ auto garrow_chunked_array =
+ GARROW_CHUNKED_ARRAY(RVAL2GOBJ(rb_chunked_array));
+ auto chunked_array =
+ garrow_chunked_array_get_raw(garrow_chunked_array).get();
+ const auto n_rows = chunked_array->length();
+ auto values = rb_ary_new_capa(n_rows);
+
+ try {
+ ValuesBuilder builder(values);
+ builder.build(*chunked_array, rb_chunked_array);
+ } catch (rb::State& state) {
+ state.jump();
+ }
+
+ return values;
+ }
+}
diff --git a/src/arrow/ruby/red-arrow/image/red-arrow.png b/src/arrow/ruby/red-arrow/image/red-arrow.png
new file mode 100644
index 000000000..6db9b4b7a
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/image/red-arrow.png
Binary files differ
diff --git a/src/arrow/ruby/red-arrow/lib/arrow.rb b/src/arrow/ruby/red-arrow/lib/arrow.rb
new file mode 100644
index 000000000..8fbc537bc
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow.rb
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "extpp/setup"
+require "gio2"
+
+require "arrow/version"
+
+require "arrow/loader"
+
+module Arrow
+ class Error < StandardError
+ end
+
+ Loader.load
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/aggregate-node-options.rb b/src/arrow/ruby/red-arrow/lib/arrow/aggregate-node-options.rb
new file mode 100644
index 000000000..f3a6ace58
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/aggregate-node-options.rb
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class AggregateNodeOptions
+ class << self
+ # @api private
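+      #
+      # A sketch of the accepted Hash form (assuming the constructor
+      # accepts the aggregation Hash form of Aggregation.try_convert):
+      #
+      #   Arrow::AggregateNodeOptions.try_convert(
+      #     aggregations: [{function: :sum, input: :score}],
+      #     keys: [:group])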
+ def try_convert(value)
+ case value
+ when Hash
+ aggregations = value[:aggregations]
+ return nil if aggregations.nil?
+ keys = value[:keys]
+ new(aggregations, keys)
+ else
+ nil
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/aggregation.rb b/src/arrow/ruby/red-arrow/lib/arrow/aggregation.rb
new file mode 100644
index 000000000..9aac8239d
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/aggregation.rb
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Aggregation
+ class << self
+ # @api private
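+      #
+      # A sketch of the accepted Hash form; the function name is
+      # normalized to its hash_-prefixed variant and the output name
+      # defaults to "function(input)":
+      #
+      #   Arrow::Aggregation.try_convert(function: :sum, input: :score)
+      #   # Equivalent to: new("hash_sum", nil, :score, "sum(score)")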
+ def try_convert(value)
+ case value
+ when Hash
+ function = value[:function]
+ return nil if function.nil?
+ function = function.to_s if function.is_a?(Symbol)
+ return nil unless function.is_a?(String)
+          # TODO: Improve this when we have non-hash-based aggregate functions
+ function = "hash_#{function}" unless function.start_with?("hash_")
+ options = value[:options]
+ input = value[:input]
+ return nil if input.nil?
+ output = value[:output]
+ if output.nil?
+ normalized_function = function.gsub(/\Ahash_/, "")
+ output = "#{normalized_function}(#{input})"
+ end
+ new(function, options, input, output)
+ else
+ nil
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/array-builder.rb b/src/arrow/ruby/red-arrow/lib/arrow/array-builder.rb
new file mode 100644
index 000000000..651aed962
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/array-builder.rb
@@ -0,0 +1,214 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "date"
+
+module Arrow
+ class ArrayBuilder
+ class << self
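+      # Builds an Arrow::Array from plain Ruby values by detecting a
+      # suitable concrete builder from the values. A minimal sketch of
+      # the dispatch below:
+      #
+      #   Arrow::ArrayBuilder.build([1, 2, nil])  # unsigned integer array
+      #   Arrow::ArrayBuilder.build([1, -2])      # signed integer array
+      #   Arrow::ArrayBuilder.build(["a", "b"])   # string array
+      #   Arrow::ArrayBuilder.build([])           # falls back to strings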
+ def build(values)
+ if self != ArrayBuilder
+ builder = new
+ return builder.build(values)
+ end
+
+ builder_info = nil
+ values.each do |value|
+ builder_info = detect_builder_info(value, builder_info)
+ break if builder_info and builder_info[:detected]
+ end
+ if builder_info
+ builder = builder_info[:builder]
+ builder.build(values)
+ else
+ Arrow::StringArray.new(values)
+ end
+ end
+
+ def buildable?(args)
+ args.size == method(:build).arity
+ end
+
+ private
+ def detect_builder_info(value, builder_info)
+ case value
+ when nil
+ builder_info
+ when true, false
+ {
+ builder: BooleanArrayBuilder.new,
+ detected: true,
+ }
+ when String
+ {
+ builder: StringArrayBuilder.new,
+ detected: true,
+ }
+ when Symbol
+ {
+ builder: StringDictionaryArrayBuilder.new,
+ detected: true,
+ }
+ when Float
+ {
+ builder: DoubleArrayBuilder.new,
+ detected: true,
+ }
+ when Integer
+ if value < 0
+ {
+ builder: IntArrayBuilder.new,
+ detected: true,
+ }
+ else
+ {
+ builder: UIntArrayBuilder.new,
+ }
+ end
+ when Time
+ data_type = value.data_type
+ case data_type.unit
+ when TimeUnit::SECOND
+ builder_info || {
+ builder: Time32ArrayBuilder.new(data_type)
+ }
+ when TimeUnit::MILLI
+ if builder_info and builder_info[:builder].is_a?(Time64ArrayBuilder)
+ builder_info
+ else
+ {
+ builder: Time32ArrayBuilder.new(data_type),
+ }
+ end
+ when TimeUnit::MICRO
+ {
+ builder: Time64ArrayBuilder.new(data_type),
+ }
+ when TimeUnit::NANO
+ {
+ builder: Time64ArrayBuilder.new(data_type),
+ detected: true
+ }
+ end
+ when ::Time
+ data_type = TimestampDataType.new(:nano)
+ {
+ builder: TimestampArrayBuilder.new(data_type),
+ detected: true,
+ }
+ when DateTime
+ {
+ builder: Date64ArrayBuilder.new,
+ detected: true,
+ }
+ when Date
+ {
+ builder: Date32ArrayBuilder.new,
+ detected: true,
+ }
+ when BigDecimal
+ if value.to_arrow.is_a?(Decimal128)
+ {
+ builder: Decimal128ArrayBuilder.new,
+ }
+ else
+ {
+ builder: Decimal256ArrayBuilder.new,
+ detected: true,
+ }
+ end
+ when ::Array
+ sub_builder_info = nil
+ value.each do |sub_value|
+ sub_builder_info = detect_builder_info(sub_value, sub_builder_info)
+ break if sub_builder_info and sub_builder_info[:detected]
+ end
+ if sub_builder_info and sub_builder_info[:detected]
+ sub_value_data_type = sub_builder_info[:builder].value_data_type
+ field = Field.new("item", sub_value_data_type)
+ {
+ builder: ListArrayBuilder.new(ListDataType.new(field)),
+ detected: true,
+ }
+ else
+ builder_info
+ end
+ else
+ {
+ builder: StringArrayBuilder.new,
+ detected: true,
+ }
+ end
+ end
+ end
+
+ def build(values)
+ append(*values)
+ finish
+ end
+
+ # @since 0.12.0
+ def append(*values)
+ value_convertable = respond_to?(:convert_to_arrow_value, true)
+ start_index = 0
+ current_index = 0
+ status = :value
+
+ values.each do |value|
+ if value.nil?
+ if status == :value
+ if start_index != current_index
+ target_values = values[start_index...current_index]
+ if value_convertable
+ target_values = target_values.collect do |v|
+ convert_to_arrow_value(v)
+ end
+ end
+ append_values(target_values, nil)
+ start_index = current_index
+ end
+ status = :null
+ end
+ else
+ if status == :null
+ append_nulls(current_index - start_index)
+ start_index = current_index
+ status = :value
+ end
+ end
+ current_index += 1
+ end
+ if start_index != current_index
+ if status == :value
+ if start_index == 0 and current_index == values.size
+ target_values = values
+ else
+ target_values = values[start_index...current_index]
+ end
+ if value_convertable
+ target_values = target_values.collect do |v|
+ convert_to_arrow_value(v)
+ end
+ end
+ append_values(target_values, nil)
+ else
+ append_nulls(current_index - start_index)
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/array.rb b/src/arrow/ruby/red-arrow/lib/arrow/array.rb
new file mode 100644
index 000000000..c6c0daaec
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/array.rb
@@ -0,0 +1,234 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Array
+ include Enumerable
+ include GenericFilterable
+ include GenericTakeable
+
+ class << self
+ def new(*args)
+ _builder_class = builder_class
+ return super if _builder_class.nil?
+ return super unless _builder_class.buildable?(args)
+ _builder_class.build(*args)
+ end
+
+ def builder_class
+ builder_class_name = "#{name}Builder"
+ return nil unless const_defined?(builder_class_name)
+ const_get(builder_class_name)
+ end
+ end
+
+    # @param i [Integer]
+    #   The index of the value to retrieve.
+    #
+    #   You can specify a negative index, as with `::Array#[]`.
+    #
+    # @return [Object, nil]
+    #   The `i`-th value.
+    #
+    #   `nil` for a NULL value or an out-of-range `i`.
+ def [](i)
+ i += length if i < 0
+ return nil if i < 0 or i >= length
+ if null?(i)
+ nil
+ else
+ get_value(i)
+ end
+ end
+
+ # @param other [Arrow::Array] The array to be compared.
+ # @param options [Arrow::EqualOptions, Hash] (nil)
+    #   The options to customize how to compare.
+ #
+ # @return [Boolean]
+ # `true` if both of them have the same data, `false` otherwise.
+ #
+ # @since 5.0.0
+ def equal_array?(other, options=nil)
+ equal_options(other, options)
+ end
+
+ def each
+ return to_enum(__method__) unless block_given?
+
+ length.times do |i|
+ yield(self[i])
+ end
+ end
+
+ def reverse_each
+ return to_enum(__method__) unless block_given?
+
+ (length - 1).downto(0) do |i|
+ yield(self[i])
+ end
+ end
+
+ def to_arrow
+ self
+ end
+
+ alias_method :value_data_type_raw, :value_data_type
+ def value_data_type
+ @value_data_type ||= value_data_type_raw
+ end
+
+ def to_a
+ values
+ end
+
+ alias_method :is_in_raw, :is_in
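+    # Returns a boolean array that tells whether each element of the
+    # receiver is in `values`; a raw Ruby Array is converted with the
+    # receiver's builder first. Roughly:
+    #
+    #   Arrow::Int32Array.new([1, 2, 3]).is_in([2, 4])
+    #   # => boolean array of [false, true, false]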
+ def is_in(values)
+ case values
+ when ::Array
+ if self.class.builder_class.buildable?([values])
+ values = self.class.new(values)
+ else
+ values = self.class.new(value_data_type, values)
+ end
+ is_in_raw(values)
+ when ChunkedArray
+ is_in_chunked_array(values)
+ else
+ is_in_raw(values)
+ end
+ end
+
+ # @api private
+ alias_method :concatenate_raw, :concatenate
+ # Concatenates the given other arrays to the array.
+ #
+ # @param other_arrays [::Array, Arrow::Array] The arrays to be
+ # concatenated.
+ #
+    # Each of the other arrays is processed by {#resolve} before
+    # being concatenated.
+ #
+ # @example Raw Ruby Array
+ # array = Arrow::Int32Array.new([1])
+ # array.concatenate([2, 3], [4]) # => Arrow::Int32Array.new([1, 2, 3, 4])
+ #
+ # @example Arrow::Array
+ # array = Arrow::Int32Array.new([1])
+ # array.concatenate(Arrow::Int32Array.new([2, 3]),
+ # Arrow::Int8Array.new([4])) # => Arrow::Int32Array.new([1, 2, 3, 4])
+ #
+ # @since 4.0.0
+ def concatenate(*other_arrays)
+ other_arrays = other_arrays.collect do |other_array|
+ resolve(other_array)
+ end
+ concatenate_raw(other_arrays)
+ end
+
+ # Concatenates the given other array to the array.
+ #
+ # If you have multiple arrays to be concatenated, you should use
+ # {#concatenate} to concatenate multiple arrays at once.
+ #
+ # @param other_array [::Array, Arrow::Array] The array to be concatenated.
+ #
+    #   `other_array` is processed by {#resolve} before it's
+    #   concatenated.
+ #
+ # @example Raw Ruby Array
+ # Arrow::Int32Array.new([1]) + [2, 3] # => Arrow::Int32Array.new([1, 2, 3])
+ #
+ # @example Arrow::Array
+ # Arrow::Int32Array.new([1]) +
+ # Arrow::Int32Array.new([2, 3]) # => Arrow::Int32Array.new([1, 2, 3])
+ #
+ # @since 4.0.0
+ def +(other_array)
+ concatenate(other_array)
+ end
+
+    # Ensures returning an array that has the same data type as `self`.
+ #
+ # @return [Arrow::Array]
+ #
+ # @overload resolve(other_raw_array)
+ #
+ # @param other_raw_array [::Array] A raw Ruby Array. A new Arrow::Array
+ # is built by `self.class.new`.
+ #
+ # @example Raw Ruby Array
+ # int32_array = Arrow::Int32Array.new([1])
+ # other_array = int32_array.resolve([2, 3, 4])
+ # other_array # => Arrow::Int32Array.new([2, 3, 4])
+ #
+ # @overload resolve(other_array)
+ #
+ # @param other_array [Arrow::Array] Another Arrow::Array.
+ #
+    #     If the given other array has the same data type as `self`,
+    #     it's returned as-is.
+    #
+    #     If the given other array has a different data type, it's
+    #     cast to the data type of `self`.
+ #
+ # @example Same data type
+ # int32_array = Arrow::Int32Array.new([1])
+ # other_int32_array = Arrow::Int32Array.new([2, 3, 4])
+ # other_array = int32_array.resolve(other_int32_array)
+ # other_array.object_id == other_int32_array.object_id
+ #
+ # @example Other data type
+ # int32_array = Arrow::Int32Array.new([1])
+ # other_int8_array = Arrow::Int8Array.new([2, 3, 4])
+    #   other_array = int32_array.resolve(other_int8_array)
+ # other_array #=> Arrow::Int32Array.new([2, 3, 4])
+ #
+ # @since 4.0.0
+ def resolve(other_array)
+ if other_array.is_a?(::Array)
+ builder_class = self.class.builder_class
+ if builder_class.nil?
+ message =
+ "[array][resolve] can't build #{value_data_type} array " +
+ "from raw Ruby Array"
+ raise ArgumentError, message
+ end
+ if builder_class.buildable?([other_array])
+ other_array = builder_class.build(other_array)
+ elsif builder_class.buildable?([value_data_type, other_array])
+ other_array = builder_class.build(value_data_type, other_array)
+ else
+          message =
+            "[array][resolve] need to implement " +
+            "a feature that builds #{value_data_type} array " +
+            "from raw Ruby Array"
+          raise NotImplementedError, message
+ end
+ other_array
+ elsif other_array.respond_to?(:value_data_type)
+ return other_array if value_data_type == other_array.value_data_type
+ other_array.cast(value_data_type)
+ else
+ message =
+ "[array][resolve] can't build #{value_data_type} array: " +
+ "#{other_array.inspect}"
+ raise ArgumentError, message
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/bigdecimal-extension.rb b/src/arrow/ruby/red-arrow/lib/arrow/bigdecimal-extension.rb
new file mode 100644
index 000000000..338efe696
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/bigdecimal-extension.rb
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "bigdecimal"
+
+class BigDecimal
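+  # Converts to the narrowest Arrow decimal that can hold this
+  # value's precision (Decimal128 up to
+  # Arrow::Decimal128DataType::MAX_PRECISION digits), e.g.:
+  #
+  #   BigDecimal("1.1").to_arrow    # => Arrow::Decimal128
+  #   BigDecimal("1" * 39).to_arrow # => Arrow::Decimal256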
+ def to_arrow
+ if precision <= Arrow::Decimal128DataType::MAX_PRECISION
+ Arrow::Decimal128.new(to_s)
+ else
+ Arrow::Decimal256.new(to_s)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/binary-dictionary-array-builder.rb b/src/arrow/ruby/red-arrow/lib/arrow/binary-dictionary-array-builder.rb
new file mode 100644
index 000000000..6d05e2c41
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/binary-dictionary-array-builder.rb
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class BinaryDictionaryArrayBuilder
+ include SymbolValuesAppendable
+
+ private
+ def create_values_array_builder
+ BinaryArrayBuilder.new
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/block-closable.rb b/src/arrow/ruby/red-arrow/lib/arrow/block-closable.rb
new file mode 100644
index 000000000..ec236bd15
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/block-closable.rb
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ module BlockClosable
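+    # With a block, the opened object is closed automatically when the
+    # block returns, e.g.:
+    #
+    #   Arrow::MemoryMappedInputStream.open("data.arrow") do |input|
+    #     # ... use input ...
+    #   end # input is closed here unless it's already closed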
+ def open(*args, &block)
+ io = new(*args)
+ return io unless block
+
+ begin
+ yield(io)
+ ensure
+ if io.respond_to?(:closed?)
+ io.close unless io.closed?
+ else
+ io.close
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/buffer.rb b/src/arrow/ruby/red-arrow/lib/arrow/buffer.rb
new file mode 100644
index 000000000..9f3a3f61b
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/buffer.rb
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Buffer
+ class << self
+ # @api private
+ def try_convert(value)
+ case value
+ when String
+ new(value)
+ else
+ nil
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/chunked-array.rb b/src/arrow/ruby/red-arrow/lib/arrow/chunked-array.rb
new file mode 100644
index 000000000..30dffa856
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/chunked-array.rb
@@ -0,0 +1,91 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class ChunkedArray
+ include Enumerable
+ include GenericFilterable
+ include GenericTakeable
+
+ alias_method :size, :n_rows
+ unless method_defined?(:length)
+ alias_method :length, :n_rows
+ end
+
+ alias_method :chunks_raw, :chunks
+ def chunks
+ @chunks ||= chunks_raw
+ end
+
+ def null?(i)
+ chunks.each do |array|
+ return array.null?(i) if i < array.length
+ i -= array.length
+ end
+ nil
+ end
+
+ def valid?(i)
+ chunks.each do |array|
+ return array.valid?(i) if i < array.length
+ i -= array.length
+ end
+ nil
+ end
+
+ def [](i)
+ i += length if i < 0
+ chunks.each do |array|
+ return array[i] if i < array.length
+ i -= array.length
+ end
+ nil
+ end
+
+ def each(&block)
+ return to_enum(__method__) unless block_given?
+
+ chunks.each do |array|
+ array.each(&block)
+ end
+ end
+
+ def reverse_each(&block)
+ return to_enum(__method__) unless block_given?
+
+ chunks.reverse_each do |array|
+ array.reverse_each(&block)
+ end
+ end
+
+ def each_chunk(&block)
+ chunks.each(&block)
+ end
+
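+    # Packs all chunks into a single Array. A sketch:
+    #
+    #   chunked = Arrow::ChunkedArray.new([Arrow::Int32Array.new([1]),
+    #                                      Arrow::Int32Array.new([2, 3])])
+    #   chunked.pack # => Arrow::Int32Array.new([1, 2, 3])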
+ def pack
+ first_chunk = chunks.first
+ data_type = first_chunk.value_data_type
+ case data_type
+ when TimestampDataType
+ builder = TimestampArrayBuilder.new(data_type)
+ builder.build(to_a)
+ else
+ first_chunk.class.new(to_a)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/column-containable.rb b/src/arrow/ruby/red-arrow/lib/arrow/column-containable.rb
new file mode 100644
index 000000000..7d7de66bd
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/column-containable.rb
@@ -0,0 +1,147 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ module ColumnContainable
+ def columns
+ @columns ||= schema.n_fields.times.collect do |i|
+ Column.new(self, i)
+ end
+ end
+
+ def each_column(&block)
+ columns.each(&block)
+ end
+
+    # @overload find_column(name)
+ # Find a column that has the given name.
+ #
+ # @param name [String, Symbol] The column name to be found.
+ # @return [Column] The found column.
+ #
+    # @overload find_column(index)
+ # Find the `index`-th column.
+ #
+ # @param index [Integer] The index to be found.
+ # @return [Column] The found column.
+ def find_column(name_or_index)
+ case name_or_index
+ when String, Symbol
+ name = name_or_index.to_s
+ index = schema.get_field_index(name)
+ return nil if index == -1
+ Column.new(self, index)
+ when Integer
+ index = name_or_index
+ index += n_columns if index < 0
+ return nil if index < 0 or index >= n_columns
+ Column.new(self, index)
+ else
+ message = "column name or index must be String, Symbol or Integer: "
+ message << name_or_index.inspect
+ raise ArgumentError, message
+ end
+ end
+
+    # Creates a new container that only has the columns selected by
+    # `selectors` and/or `block`.
+ #
+ # @param selectors [Array<String, Symbol, Integer, Range>]
+ # If a selector is `String`, `Symbol` or `Integer`, the selector
+ # selects a column by {#find_column}.
+ #
+ # If a selector is `Range`, the selector selects columns by `::Array#[]`.
+ # @yield [column] Gives a column to the block to select columns.
+ # This uses `::Array#select`.
+ # @yieldparam column [Column] A target column.
+ # @yieldreturn [Boolean] Whether the given column is selected or not.
+ # @return [self.class] The newly created container that only has selected
+ # columns.
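+    #
+    # @example Select by name and by range (assuming a table with
+    #   columns a, b, c and d)
+    #   table.select_columns(:a, :c) # => columns a and c
+    #   table.select_columns(1..2)   # => columns b and c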
+ def select_columns(*selectors, &block)
+ if selectors.empty?
+ return to_enum(__method__) unless block_given?
+ selected_columns = columns.select(&block)
+ else
+ selected_columns = []
+ selectors.each do |selector|
+ case selector
+ when Range
+ selected_columns.concat(columns[selector])
+ else
+ column = find_column(selector)
+ if column.nil?
+ case selector
+ when String, Symbol
+ message = "unknown column: #{selector.inspect}: #{inspect}"
+ raise KeyError.new(message)
+ else
+ message = "out of index (0..#{n_columns - 1}): "
+ message << "#{selector.inspect}: #{inspect}"
+ raise IndexError.new(message)
+ end
+ end
+ selected_columns << column
+ end
+ end
+ selected_columns = selected_columns.select(&block) if block_given?
+ end
+ self.class.new(selected_columns)
+ end
+
+ # @overload [](name)
+ # Find a column that has the given name.
+ #
+ # @param name [String, Symbol] The column name to be found.
+ # @return [Column] The found column.
+ # @see #find_column
+ #
+ # @overload [](index)
+ # Find the `index`-th column.
+ #
+ # @param index [Integer] The index to be found.
+ # @return [Column] The found column.
+ # @see #find_column
+ #
+ # @overload [](range)
+ # Selects columns that are in `range` and creates a new container
+ # only with the selected columns.
+ #
+ # @param range [Range] The range to be selected.
+ # @return [self.class] The newly created container that only has selected
+ # columns.
+ # @see #select_columns
+ #
+ # @overload [](selectors)
+    #   Creates a new container that only has the columns selected by
+    #   `selectors`.
+ #
+ # @param selectors [Array] The selectors that are used to select columns.
+ # @return [self.class] The newly created container that only has selected
+ # columns.
+ # @see #select_columns
+ def [](selector)
+ case selector
+ when ::Array
+ select_columns(*selector)
+ when Range
+ select_columns(selector)
+ else
+ find_column(selector)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/column.rb b/src/arrow/ruby/red-arrow/lib/arrow/column.rb
new file mode 100644
index 000000000..06f3dbdc0
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/column.rb
@@ -0,0 +1,76 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Column
+ include Enumerable
+
+ attr_reader :container
+ attr_reader :field
+ attr_reader :data
+ def initialize(container, index)
+ @container = container
+ @index = index
+ @field = @container.schema[@index]
+ @data = @container.get_column_data(@index)
+ end
+
+ def name
+ @field.name
+ end
+
+ def data_type
+ @field.data_type
+ end
+
+ def null?(i)
+ @data.null?(i)
+ end
+
+ def valid?(i)
+ @data.valid?(i)
+ end
+
+ def [](i)
+ @data[i]
+ end
+
+ def each(&block)
+ @data.each(&block)
+ end
+
+ def reverse_each(&block)
+ @data.reverse_each(&block)
+ end
+
+ def n_rows
+ @data.n_rows
+ end
+ alias_method :size, :n_rows
+ alias_method :length, :n_rows
+
+ def n_nulls
+ @data.n_nulls
+ end
+
+ def ==(other)
+ other.is_a?(self.class) and
+ @field == other.field and
+ @data == other.data
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/compression-type.rb b/src/arrow/ruby/red-arrow/lib/arrow/compression-type.rb
new file mode 100644
index 000000000..b913e48ff
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/compression-type.rb
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class CompressionType
+ EXTENSIONS = {}
+ values.each do |value|
+ case value
+ when UNCOMPRESSED
+ when GZIP
+ EXTENSIONS["gz"] = value
+ else
+ EXTENSIONS[value.nick] = value
+ end
+ end
+
+ class << self
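+      # Maps a file extension to its compression type; the extension
+      # is the codec's nick except "gz" for GZIP, e.g.:
+      #
+      #   Arrow::CompressionType.resolve_extension("gz")  # => GZIP
+      #   Arrow::CompressionType.resolve_extension(:zstd) # => ZSTD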
+ def resolve_extension(extension)
+ EXTENSIONS[extension.to_s]
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/constructor-arguments-gc-guardable.rb b/src/arrow/ruby/red-arrow/lib/arrow/constructor-arguments-gc-guardable.rb
new file mode 100644
index 000000000..16669be93
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/constructor-arguments-gc-guardable.rb
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ module ConstructorArgumentsGCGuardable
+ def initialize(*args)
+ super
+ @arguments = args
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/csv-loader.rb b/src/arrow/ruby/red-arrow/lib/arrow/csv-loader.rb
new file mode 100644
index 000000000..f82263e46
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/csv-loader.rb
@@ -0,0 +1,384 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "csv"
+require "pathname"
+require "time"
+
+module Arrow
+ class CSVLoader
+ class << self
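+      # Loads CSV data into an Arrow::Table from a path or a data
+      # String; unknown options fall back to Ruby's CSV parser, e.g.:
+      #
+      #   Arrow::CSVLoader.load(Pathname.new("data.csv"))
+      #   Arrow::CSVLoader.load("a,b\n1,2\n", headers: true)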
+ def load(path_or_data, **options)
+ new(path_or_data, **options).load
+ end
+ end
+
+ def initialize(path_or_data, **options)
+ @path_or_data = path_or_data
+ @options = options
+ if @options.key?(:delimiter)
+ @options[:col_sep] = @options.delete(:delimiter)
+ end
+ @compression = @options.delete(:compression)
+ end
+
+ def load
+ case @path_or_data
+ when Pathname
+ load_from_path(@path_or_data.to_path)
+ when /\A.+\.csv\z/i
+ load_from_path(@path_or_data)
+ else
+ load_data(@path_or_data)
+ end
+ end
+
+ private
+ def open_csv(path, **options)
+ CSV.open(path, **options) do |csv|
+ yield(csv)
+ end
+ end
+
+ def parse_csv_data(data, **options)
+ csv = CSV.new(data, **options)
+ begin
+ yield(csv)
+ ensure
+ csv.close
+ end
+ end
+
+ def read_csv(csv)
+ values_set = []
+ csv.each do |row|
+ if row.is_a?(CSV::Row)
+ row = row.collect(&:last)
+ end
+ row.each_with_index do |value, i|
+ values = (values_set[i] ||= [])
+ values << value
+ end
+ end
+ return nil if values_set.empty?
+
+ arrays = values_set.collect.with_index do |values, i|
+ ArrayBuilder.build(values)
+ end
+ if csv.headers
+ names = csv.headers
+ else
+ names = arrays.size.times.collect(&:to_s)
+ end
+ raw_table = {}
+ names.each_with_index do |name, i|
+ raw_table[name] = arrays[i]
+ end
+ Table.new(raw_table)
+ end
+
+ def reader_options
+ options = CSVReadOptions.new
+ @options.each do |key, value|
+ case key
+ when :headers
+ case value
+ when ::Array
+ options.column_names = value
+ when String
+ return nil
+ else
+ if value
+ options.generate_column_names = false
+ else
+ options.generate_column_names = true
+ end
+ end
+ when :column_types
+ value.each do |name, type|
+ options.add_column_type(name, type)
+ end
+ when :schema
+ options.add_schema(value)
+ when :encoding
+ # process encoding on opening input
+ when :col_sep
+ options.delimiter = value
+ else
+ setter = "#{key}="
+ if options.respond_to?(setter)
+ options.__send__(setter, value)
+ else
+ return nil
+ end
+ end
+ end
+ options
+ end
+
+ def open_decompress_input(raw_input)
+ if @compression
+ codec = Codec.new(@compression)
+ CompressedInputStream.open(codec, raw_input) do |input|
+ yield(input)
+ end
+ else
+ yield(raw_input)
+ end
+ end
+
+ def open_encoding_convert_stream(raw_input, &block)
+ encoding = @options[:encoding]
+ if encoding
+ converter = Gio::CharsetConverter.new("UTF-8", encoding)
+ convert_input_stream =
+ Gio::ConverterInputStream.new(raw_input, converter)
+ GIOInputStream.open(convert_input_stream, &block)
+ else
+ yield(raw_input)
+ end
+ end
+
+ def wrap_input(raw_input)
+ open_decompress_input(raw_input) do |input_|
+ open_encoding_convert_stream(input_) do |input__|
+ yield(input__)
+ end
+ end
+ end
+
+ def load_from_path(path)
+ options = reader_options
+ if options
+ begin
+ MemoryMappedInputStream.open(path) do |raw_input|
+ wrap_input(raw_input) do |input|
+ return CSVReader.new(input, options).read
+ end
+ end
+ rescue Arrow::Error::Invalid, Gio::Error
+ end
+ end
+
+ options = update_csv_parse_options(@options, :open_csv, path)
+ open_csv(path, **options) do |csv|
+ read_csv(csv)
+ end
+ end
+
+ def load_data(data)
+ options = reader_options
+ if options
+ begin
+ BufferInputStream.open(Buffer.new(data)) do |raw_input|
+ wrap_input(raw_input) do |input|
+ return CSVReader.new(input, options).read
+ end
+ end
+ rescue Arrow::Error::Invalid, Gio::Error
+ end
+ end
+
+ options = update_csv_parse_options(@options, :parse_csv_data, data)
+ parse_csv_data(data, **options) do |csv|
+ read_csv(csv)
+ end
+ end
+
+ def selective_converter(target_index)
+ lambda do |field, field_info|
+ if target_index.nil? or field_info.index == target_index
+ yield(field)
+ else
+ field
+ end
+ end
+ end
+
+ BOOLEAN_CONVERTER = lambda do |field|
+ begin
+ encoded_field = field.encode(CSV::ConverterEncoding)
+ rescue EncodingError
+ field
+ else
+ case encoded_field
+ when "true"
+ true
+ when "false"
+ false
+ else
+ field
+ end
+ end
+ end
+
+ ISO8601_CONVERTER = lambda do |field|
+ begin
+ encoded_field = field.encode(CSV::ConverterEncoding)
+ rescue EncodingError
+ field
+ else
+ begin
+ ::Time.iso8601(encoded_field)
+ rescue ArgumentError
+ field
+ end
+ end
+ end
+
+ AVAILABLE_CSV_PARSE_OPTIONS = {}
+ CSV.instance_method(:initialize).parameters.each do |type, name|
+ AVAILABLE_CSV_PARSE_OPTIONS[name] = true if type == :key
+ end
+
+ def update_csv_parse_options(options, create_csv, *args)
+ if options.key?(:converters)
+ new_options = options.dup
+ else
+ converters = [:all, BOOLEAN_CONVERTER, ISO8601_CONVERTER]
+ new_options = options.merge(converters: converters)
+ end
+
+ # TODO: Support :schema and :column_types
+
+ unless AVAILABLE_CSV_PARSE_OPTIONS.empty?
+ new_options.select! do |key, value|
+ AVAILABLE_CSV_PARSE_OPTIONS.key?(key)
+ end
+ end
+
+ unless options.key?(:headers)
+ __send__(create_csv, *args, **new_options) do |csv|
+ new_options[:headers] = have_header?(csv)
+ end
+ end
+ unless options.key?(:converters)
+ __send__(create_csv, *args, **new_options) do |csv|
+ new_options[:converters] = detect_robust_converters(csv)
+ end
+ end
+
+ new_options
+ end
+
+ def have_header?(csv)
+ if @options.key?(:headers)
+ return @options[:headers]
+ end
+
+ row1 = csv.shift
+ return false if row1.nil?
+ return false if row1.any?(&:nil?)
+
+ row2 = csv.shift
+ return nil if row2.nil?
+ return true if row2.any?(&:nil?)
+
+ return false if row1.any? {|value| not value.is_a?(String)}
+
+ if row1.collect(&:class) != row2.collect(&:class)
+ return true
+ end
+
+ nil
+ end
+
+ def detect_robust_converters(csv)
+ column_types = []
+ csv.each do |row|
+ if row.is_a?(CSV::Row)
+ each_value = Enumerator.new do |yielder|
+ row.each do |_name, value|
+ yielder << value
+ end
+ end
+ else
+ each_value = row.each
+ end
+ each_value.with_index do |value, i|
+ current_column_type = column_types[i]
+ next if current_column_type == :string
+
+ candidate_type = nil
+ case value
+ when nil
+ next
+ when "true", "false", true, false
+ candidate_type = :boolean
+ when Integer
+ candidate_type = :integer
+ if current_column_type == :float
+ candidate_type = :float
+ end
+ when Float
+ candidate_type = :float
+ if current_column_type == :integer
+ column_types[i] = candidate_type
+ end
+ when ::Time
+ candidate_type = :time
+ when DateTime
+ candidate_type = :date_time
+ when Date
+ candidate_type = :date
+ when String
+ next if value.empty?
+ candidate_type = :string
+ else
+ candidate_type = :string
+ end
+
+ column_types[i] ||= candidate_type
+ if column_types[i] != candidate_type
+ column_types[i] = :string
+ end
+ end
+ end
+
+ converters = []
+ column_types.each_with_index do |type, i|
+ case type
+ when :boolean
+ converters << selective_converter(i, &BOOLEAN_CONVERTER)
+ when :integer
+ converters << selective_converter(i) do |field|
+ if field.nil? or field.empty?
+ nil
+ else
+ CSV::Converters[:integer].call(field)
+ end
+ end
+ when :float
+ converters << selective_converter(i) do |field|
+ if field.nil? or field.empty?
+ nil
+ else
+ CSV::Converters[:float].call(field)
+ end
+ end
+ when :time
+ converters << selective_converter(i, &ISO8601_CONVERTER)
+ when :date_time
+ converters << selective_converter(i, &CSV::Converters[:date_time])
+ when :date
+ converters << selective_converter(i, &CSV::Converters[:date])
+ end
+ end
+ converters
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/csv-read-options.rb b/src/arrow/ruby/red-arrow/lib/arrow/csv-read-options.rb
new file mode 100644
index 000000000..dec3dec95
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/csv-read-options.rb
@@ -0,0 +1,43 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class CSVReadOptions
+ alias_method :add_column_type_raw, :add_column_type
+ def add_column_type(name, type)
+ add_column_type_raw(name, DataType.resolve(type))
+ end
+
+ alias_method :delimiter_raw, :delimiter
+ def delimiter
+ delimiter_raw.chr
+ end
+
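+    # The delimiter is exposed as a 1-byte String on the Ruby side
+    # and stored as its byte value, e.g.:
+    #
+    #   options = Arrow::CSVReadOptions.new
+    #   options.delimiter = "\t"
+    #   options.delimiter # => "\t"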
+ alias_method :delimiter_raw=, :delimiter=
+ def delimiter=(delimiter)
+ case delimiter
+ when String
+ if delimiter.bytesize != 1
+ message = "delimiter must be 1 byte character: #{delimiter.inspect}"
+ raise ArgumentError, message
+ end
+ delimiter = delimiter.ord
+ end
+ self.delimiter_raw = delimiter
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/data-type.rb b/src/arrow/ruby/red-arrow/lib/arrow/data-type.rb
new file mode 100644
index 000000000..07b452521
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/data-type.rb
@@ -0,0 +1,198 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class DataType
+ class << self
+      # Ensures returning a suitable {Arrow::DataType}.
+ #
+ # @overload resolve(data_type)
+ #
+      #   Returns the given data type itself. This is convenient when
+      #   you use this method as an {Arrow::DataType} converter.
+ #
+ # @param data_type [Arrow::DataType] The data type.
+ #
+ # @return [Arrow::DataType] The given data type itself.
+ #
+ # @overload resolve(name)
+ #
+ # Creates a suitable data type from the given type name. For
+ # example, you can create {Arrow::BooleanDataType} from
+ # `:boolean`.
+ #
+ # @param name [String, Symbol] The type name of the data type.
+ #
+ # @return [Arrow::DataType] A new suitable data type.
+ #
+ # @example Create a boolean data type
+ # Arrow::DataType.resolve(:boolean)
+ #
+ # @overload resolve(name_with_arguments)
+ #
+ # Creates a new suitable data type from the given type name
+ # with arguments.
+ #
+ # @param name_with_arguments [::Array<String, ...>]
+ # The type name of the data type as the first element.
+ #
+      #     The remaining elements are additional information for the data type.
+ #
+ # For example, {Arrow::TimestampDataType} needs unit as
+ # additional information.
+ #
+ # @return [Arrow::DataType] A new suitable data type.
+ #
+ # @example Create a boolean data type
+ # Arrow::DataType.resolve([:boolean])
+ #
+ # @example Create a milliseconds unit timestamp data type
+ # Arrow::DataType.resolve([:timestamp, :milli])
+ #
+ # @overload resolve(description)
+ #
+ # Creates a new suitable data type from the given data type
+ # description.
+ #
+      #   A data type description is a raw `Hash` that must have a
+      #   `:type` value; `:type` is the type of the data type.
+      #
+      #   If the type needs additional information, you need to
+      #   specify it. See the constructor documentation for what
+      #   information is needed. For example,
+      #   {Arrow::ListDataType#initialize} needs a `:field` value.
+ #
+ # @param description [Hash] The description of the data type.
+ #
+ # @option description [String, Symbol] :type The type name of
+ # the data type.
+ #
+ # @return [Arrow::DataType] A new suitable data type.
+ #
+ # @example Create a boolean data type
+ # Arrow::DataType.resolve(type: :boolean)
+ #
+ # @example Create a list data type
+ # Arrow::DataType.resolve(type: :list,
+ # field: {name: "visible", type: :boolean})
+ def resolve(data_type)
+ case data_type
+ when DataType
+ data_type
+ when String, Symbol
+ resolve_class(data_type).new
+ when ::Array
+ type, *arguments = data_type
+ resolve_class(type).new(*arguments)
+ when Hash
+ type = nil
+ description = {}
+ data_type.each do |key, value|
+ key = key.to_sym
+ case key
+ when :type
+ type = value
+ else
+ description[key] = value
+ end
+ end
+ if type.nil?
+ message =
+ "data type description must have :type value: #{data_type.inspect}"
+ raise ArgumentError, message
+ end
+ data_type_class = resolve_class(type)
+ if description.empty?
+ data_type_class.new
+ else
+ data_type_class.new(description)
+ end
+ else
+ message =
+ "data type must be " +
+ "Arrow::DataType, String, Symbol, [String, ...], [Symbol, ...] " +
+ "{type: String, ...} or {type: Symbol, ...}: #{data_type.inspect}"
+ raise ArgumentError, message
+ end
+ end
+
+ def sub_types
+ types = {}
+ gtype.children.each do |child|
+ sub_type = child.to_class
+ types[sub_type] = true
+ sub_type.sub_types.each do |sub_sub_type|
+ types[sub_sub_type] = true
+ end
+ end
+ types.keys
+ end
+
+ def try_convert(value)
+ begin
+ resolve(value)
+ rescue ArgumentError
+ nil
+ end
+ end
+
+ private
+ def resolve_class(data_type)
+ components = data_type.to_s.split("_").collect(&:capitalize)
+ data_type_name = components.join.gsub(/\AUint/, "UInt")
+ data_type_class_name = "#{data_type_name}DataType"
+ unless Arrow.const_defined?(data_type_class_name)
+ available_types = []
+ Arrow.constants.each do |name|
+ name = name.to_s
+ next if name == "DataType"
+ next unless name.end_with?("DataType")
+ name = name.gsub(/DataType\z/, "")
+ components = name.scan(/(UInt[0-9]+|[A-Z][a-z\d]+)/).flatten
+ available_types << components.collect(&:downcase).join("_").to_sym
+ end
+ message =
+ "unknown type: <#{data_type.inspect}>: " +
+ "available types: #{available_types.inspect}"
+ raise ArgumentError, message
+ end
+ data_type_class = Arrow.const_get(data_type_class_name)
+ if data_type_class.gtype.abstract?
+ not_abstract_types = data_type_class.sub_types.find_all do |sub_type|
+ not sub_type.gtype.abstract?
+ end
+ not_abstract_types = not_abstract_types.sort_by do |type|
+ type.name
+ end
+ message =
+ "abstract type: <#{data_type.inspect}>: " +
+ "use one of not abstract type: #{not_abstract_types.inspect}"
+ raise ArgumentError, message
+ end
+ data_type_class
+ end
+ end
+
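+    # Builds an array of this data type from plain Ruby values, e.g.:
+    #
+    #   Arrow::BooleanDataType.new.build_array([true, nil, false])
+    #   # => Arrow::BooleanArray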
+ def build_array(values)
+ base_name = self.class.name.gsub(/DataType\z/, "")
+ builder_class = self.class.const_get("#{base_name}ArrayBuilder")
+ args = [values]
+ args.unshift(self) unless builder_class.buildable?(args)
+ builder_class.build(*args)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/date32-array-builder.rb b/src/arrow/ruby/red-arrow/lib/arrow/date32-array-builder.rb
new file mode 100644
index 000000000..dedbba85e
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/date32-array-builder.rb
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Date32ArrayBuilder
+ private
+ UNIX_EPOCH = Date.new(1970, 1, 1)
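+    # Converts a date-like value (anything responding to #to_date) to
+    # days since the UNIX epoch (illustrative):
+    #   convert_to_arrow_value(Date.new(1970, 1, 2)) # => 1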
+ def convert_to_arrow_value(value)
+ value = value.to_date if value.respond_to?(:to_date)
+
+ if value.is_a?(Date)
+ (value - UNIX_EPOCH).to_i
+ else
+ value
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/date32-array.rb b/src/arrow/ruby/red-arrow/lib/arrow/date32-array.rb
new file mode 100644
index 000000000..121dbcb55
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/date32-array.rb
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Date32Array
+ def get_value(i)
+ to_date(get_raw_value(i))
+ end
+
+ private
+ UNIX_EPOCH = 2440588
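+    # UNIX_EPOCH is the Julian Day Number of 1970-01-01, so e.g.
+    #   to_date(1) # => Date.new(1970, 1, 2)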
+ def to_date(raw_value)
+ Date.jd(UNIX_EPOCH + raw_value)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/date64-array-builder.rb b/src/arrow/ruby/red-arrow/lib/arrow/date64-array-builder.rb
new file mode 100644
index 000000000..658118122
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/date64-array-builder.rb
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Date64ArrayBuilder
+ private
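+    # Converts a Time-like value to milliseconds since the UNIX epoch
+    # (illustrative):
+    #   convert_to_arrow_value(::Time.at(1, 500_000)) # => 1_500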
+ def convert_to_arrow_value(value)
+ if value.respond_to?(:to_time) and not value.is_a?(::Time)
+ value = value.to_time
+ end
+
+ if value.is_a?(::Time)
+ value.to_i * 1_000 + value.usec / 1_000
+ else
+ value
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/date64-array.rb b/src/arrow/ruby/red-arrow/lib/arrow/date64-array.rb
new file mode 100644
index 000000000..9b8a92476
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/date64-array.rb
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Date64Array
+ def get_value(i)
+ to_datetime(get_raw_value(i))
+ end
+
+ private
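+    # Converts milliseconds since the UNIX epoch to a DateTime
+    # (illustrative): to_datetime(1_500) # => 1970-01-01 00:00:01.5 local time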
+ def to_datetime(raw_value)
+      # divmod(1_000) yields [seconds, milliseconds]; Time.at treats the
+      # second argument as microseconds unless the unit is given.
+      ::Time.at(*raw_value.divmod(1_000), :millisecond).to_datetime
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/datum.rb b/src/arrow/ruby/red-arrow/lib/arrow/datum.rb
new file mode 100644
index 000000000..196a18f54
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/datum.rb
@@ -0,0 +1,105 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Datum
+ class << self
+ # @api private
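+      # A few illustrative conversions (not exhaustive):
+      #   try_convert(1)    # => ScalarDatum wrapping a UInt8Scalar
+      #   try_convert(-1)   # => ScalarDatum wrapping an Int8Scalar
+      #   try_convert(1.5)  # => ScalarDatum wrapping a DoubleScalar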
+ def try_convert(value)
+ case value
+ when Table
+ TableDatum.new(value)
+ when Array
+ ArrayDatum.new(value)
+ when ChunkedArray
+ ChunkedArrayDatum.new(value)
+ when Scalar
+ ScalarDatum.new(value)
+ when ::Array
+ ArrayDatum.new(ArrayBuilder.build(value))
+ when Integer
+ case value
+ when (0..((2 ** 8) - 1))
+ try_convert(UInt8Scalar.new(value))
+ when ((-(2 ** 7))..((2 ** 7) - 1))
+ try_convert(Int8Scalar.new(value))
+ when (0..((2 ** 16) - 1))
+ try_convert(UInt16Scalar.new(value))
+ when ((-(2 ** 15))..((2 ** 15) - 1))
+ try_convert(Int16Scalar.new(value))
+ when (0..((2 ** 32) - 1))
+ try_convert(UInt32Scalar.new(value))
+ when ((-(2 ** 31))..((2 ** 31) - 1))
+ try_convert(Int32Scalar.new(value))
+ when (0..((2 ** 64) - 1))
+ try_convert(UInt64Scalar.new(value))
+ when ((-(2 ** 63))..((2 ** 63) - 1))
+ try_convert(Int64Scalar.new(value))
+ else
+ nil
+ end
+ when Float
+ try_convert(DoubleScalar.new(value))
+ when true, false
+ try_convert(BooleanScalar.new(value))
+ when String
+ if value.ascii_only? or value.encoding == Encoding::UTF_8
+ if value.bytesize <= ((2 ** 31) - 1)
+ try_convert(StringScalar.new(value))
+ else
+ try_convert(LargeStringScalar.new(value))
+ end
+ else
+ if value.bytesize <= ((2 ** 31) - 1)
+ try_convert(BinaryScalar.new(value))
+ else
+ try_convert(LargeBinaryScalar.new(value))
+ end
+ end
+ when Date
+ date32_value = (value - Date32ArrayBuilder::UNIX_EPOCH).to_i
+ try_convert(Date32Scalar.new(date32_value))
+ when Time
+ case value.unit
+ when TimeUnit::SECOND, TimeUnit::MILLI
+ data_type = Time32DataType.new(value.unit)
+ scalar_class = Time32Scalar
+ else
+ data_type = Time64DataType.new(value.unit)
+ scalar_class = Time64Scalar
+ end
+ try_convert(scalar_class.new(data_type, value.value))
+ when ::Time
+ data_type = TimestampDataType.new(:nano)
+ timestamp_value = value.to_i * 1_000_000_000 + value.nsec
+ try_convert(TimestampScalar.new(data_type, timestamp_value))
+        when Decimal128
+          # Assumption: use MAX_PRECISION with scale 0, since the value
+          # itself carries no precision/scale information.
+          data_type = Decimal128DataType.new(Decimal128DataType::MAX_PRECISION, 0)
+          try_convert(Decimal128Scalar.new(data_type, value))
+ else
+ nil
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/decimal128-array-builder.rb b/src/arrow/ruby/red-arrow/lib/arrow/decimal128-array-builder.rb
new file mode 100644
index 000000000..d380ce070
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/decimal128-array-builder.rb
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Decimal128ArrayBuilder
+ class << self
+ def build(data_type, values)
+ builder = new(data_type)
+ builder.build(values)
+ end
+ end
+
+ alias_method :append_value_raw, :append_value
+ def append_value(value)
+ append_value_raw(normalize_value(value))
+ end
+
+ alias_method :append_values_raw, :append_values
+ def append_values(values, is_valids=nil)
+ if values.is_a?(::Array)
+ values = values.collect do |value|
+ normalize_value(value)
+ end
+ append_values_raw(values, is_valids)
+ else
+ append_values_packed(values, is_valids)
+ end
+ end
+
+ private
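+    # Normalizes String, Float and BigDecimal values to Decimal128
+    # (illustrative):
+    #   normalize_value("1.5") # => Arrow::Decimal128.new("1.5")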
+ def normalize_value(value)
+ case value
+ when String
+ Decimal128.new(value)
+ when Float
+ Decimal128.new(value.to_s)
+ when BigDecimal
+ Decimal128.new(value.to_s)
+ else
+ value
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/decimal128-array.rb b/src/arrow/ruby/red-arrow/lib/arrow/decimal128-array.rb
new file mode 100644
index 000000000..a5ee53be7
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/decimal128-array.rb
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Decimal128Array
+ def get_value(i)
+ BigDecimal(format_value(i))
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/decimal128-data-type.rb b/src/arrow/ruby/red-arrow/lib/arrow/decimal128-data-type.rb
new file mode 100644
index 000000000..4b5583896
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/decimal128-data-type.rb
@@ -0,0 +1,71 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Decimal128DataType
+ MAX_PRECISION = max_precision
+
+ alias_method :initialize_raw, :initialize
+ private :initialize_raw
+
+ # Creates a new {Arrow::Decimal128DataType}.
+ #
+ # @overload initialize(precision, scale)
+ #
+ # @param precision [Integer] The precision of the decimal data
+ # type. It's the number of digits including the number of
+ # digits after the decimal point.
+ #
+ # @param scale [Integer] The scale of the decimal data
+ # type. It's the number of digits after the decimal point.
+ #
+ # @example Create a decimal data type for "XXXXXX.YY" decimal
+ # Arrow::Decimal128DataType.new(8, 2)
+ #
+ # @overload initialize(description)
+ #
+ # @param description [Hash] The description of the decimal data
+ # type. It must have `:precision` and `:scale` values.
+ #
+ # @option description [Integer] :precision The precision of the
+ # decimal data type. It's the number of digits including the
+ # number of digits after the decimal point.
+ #
+ # @option description [Integer] :scale The scale of the decimal
+ # data type. It's the number of digits after the decimal
+ # point.
+ #
+ # @example Create a decimal data type for "XXXXXX.YY" decimal
+ # Arrow::Decimal128DataType.new(precision: 8,
+ # scale: 2)
+ def initialize(*args)
+ n_args = args.size
+ case n_args
+ when 1
+ description = args[0]
+ precision = description[:precision]
+ scale = description[:scale]
+ when 2
+ precision, scale = args
+ else
+        message = "wrong number of arguments (given #{n_args}, expected 1..2)"
+ raise ArgumentError, message
+ end
+ initialize_raw(precision, scale)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/decimal128.rb b/src/arrow/ruby/red-arrow/lib/arrow/decimal128.rb
new file mode 100644
index 000000000..bf853ae7f
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/decimal128.rb
@@ -0,0 +1,63 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Decimal128
+ alias_method :to_s_raw, :to_s
+
+ # @overload to_s
+ #
+ # @return [String]
+ # The string representation of the decimal.
+ #
+ # @overload to_s(scale)
+ #
+ # @param scale [Integer] The scale of the decimal.
+ # @return [String]
+ # The string representation of the decimal including the scale.
+ #
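+    # @example Interpret the raw value 123 with scale 2 (illustrative)
+    #   Arrow::Decimal128.new("123").to_s(2) # => "1.23"
+    #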
+ # @since 0.13.0
+ def to_s(scale=nil)
+ if scale
+ to_string_scale(scale)
+ else
+ to_s_raw
+ end
+ end
+
+ alias_method :abs!, :abs
+
+ # @since 3.0.0
+ def abs
+ copied = dup
+ copied.abs!
+ copied
+ end
+
+ alias_method :negate!, :negate
+
+ # @since 3.0.0
+ def negate
+ copied = dup
+ copied.negate!
+ copied
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/decimal256-array-builder.rb b/src/arrow/ruby/red-arrow/lib/arrow/decimal256-array-builder.rb
new file mode 100644
index 000000000..fb89ff00b
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/decimal256-array-builder.rb
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Decimal256ArrayBuilder
+ class << self
+ # @since 3.0.0
+ def build(data_type, values)
+ builder = new(data_type)
+ builder.build(values)
+ end
+ end
+
+ alias_method :append_value_raw, :append_value
+ # @since 3.0.0
+ def append_value(value)
+ append_value_raw(normalize_value(value))
+ end
+
+ alias_method :append_values_raw, :append_values
+ # @since 3.0.0
+ def append_values(values, is_valids=nil)
+ if values.is_a?(::Array)
+ values = values.collect do |value|
+ normalize_value(value)
+ end
+ append_values_raw(values, is_valids)
+ else
+ append_values_packed(values, is_valids)
+ end
+ end
+
+ private
+ def normalize_value(value)
+ case value
+ when String
+ Decimal256.new(value)
+ when Float
+ Decimal256.new(value.to_s)
+ when BigDecimal
+ Decimal256.new(value.to_s)
+ else
+ value
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/decimal256-array.rb b/src/arrow/ruby/red-arrow/lib/arrow/decimal256-array.rb
new file mode 100644
index 000000000..8c2306dfe
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/decimal256-array.rb
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Decimal256Array
+ # @since 3.0.0
+ def get_value(i)
+ BigDecimal(format_value(i))
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/decimal256-data-type.rb b/src/arrow/ruby/red-arrow/lib/arrow/decimal256-data-type.rb
new file mode 100644
index 000000000..8264e388e
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/decimal256-data-type.rb
@@ -0,0 +1,73 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Decimal256DataType
+ MAX_PRECISION = max_precision
+
+ alias_method :initialize_raw, :initialize
+ private :initialize_raw
+
+ # Creates a new {Arrow::Decimal256DataType}.
+ #
+ # @overload initialize(precision, scale)
+ #
+ # @param precision [Integer] The precision of the decimal data
+ # type. It's the number of digits including the number of
+ # digits after the decimal point.
+ #
+ # @param scale [Integer] The scale of the decimal data
+ # type. It's the number of digits after the decimal point.
+ #
+ # @example Create a decimal data type for "XXXXXX.YY" decimal
+ # Arrow::Decimal256DataType.new(8, 2)
+ #
+ # @overload initialize(description)
+ #
+ # @param description [Hash] The description of the decimal data
+ # type. It must have `:precision` and `:scale` values.
+ #
+ # @option description [Integer] :precision The precision of the
+ # decimal data type. It's the number of digits including the
+ # number of digits after the decimal point.
+ #
+ # @option description [Integer] :scale The scale of the decimal
+ # data type. It's the number of digits after the decimal
+ # point.
+ #
+ # @example Create a decimal data type for "XXXXXX.YY" decimal
+ # Arrow::Decimal256DataType.new(precision: 8,
+ # scale: 2)
+ #
+ # @since 3.0.0
+ def initialize(*args)
+ n_args = args.size
+ case n_args
+ when 1
+ description = args[0]
+ precision = description[:precision]
+ scale = description[:scale]
+ when 2
+ precision, scale = args
+ else
+        message = "wrong number of arguments (given #{n_args}, expected 1..2)"
+ raise ArgumentError, message
+ end
+ initialize_raw(precision, scale)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/decimal256.rb b/src/arrow/ruby/red-arrow/lib/arrow/decimal256.rb
new file mode 100644
index 000000000..1a7097a4d
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/decimal256.rb
@@ -0,0 +1,60 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Decimal256
+ alias_method :to_s_raw, :to_s
+
+ # @overload to_s
+ #
+ # @return [String]
+ # The string representation of the decimal.
+ #
+ # @overload to_s(scale)
+ #
+ # @param scale [Integer] The scale of the decimal.
+ # @return [String]
+ # The string representation of the decimal including the scale.
+ #
+ # @since 3.0.0
+ def to_s(scale=nil)
+ if scale
+ to_string_scale(scale)
+ else
+ to_s_raw
+ end
+ end
+
+ alias_method :abs!, :abs
+
+ # @since 3.0.0
+ def abs
+ copied = dup
+ copied.abs!
+ copied
+ end
+
+ alias_method :negate!, :negate
+
+ # @since 3.0.0
+ def negate
+ copied = dup
+ copied.negate!
+ copied
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/dense-union-data-type.rb b/src/arrow/ruby/red-arrow/lib/arrow/dense-union-data-type.rb
new file mode 100644
index 000000000..6d2bf5e70
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/dense-union-data-type.rb
@@ -0,0 +1,90 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class DenseUnionDataType
+ alias_method :initialize_raw, :initialize
+ private :initialize_raw
+
+ # Creates a new {Arrow::DenseUnionDataType}.
+ #
+ # @overload initialize(fields, type_codes)
+ #
+ # @param fields [::Array<Arrow::Field, Hash>] The fields of the
+ # dense union data type. You can mix {Arrow::Field} and field
+ # description in the fields.
+ #
+ # See {Arrow::Field.new} how to specify field description.
+ #
+    #   @param type_codes [::Array<Integer>] The IDs that indicate the
+    #     corresponding fields.
+ #
+ # @example Create a dense union data type for `{2: visible, 9: count}`
+ # fields = [
+ # Arrow::Field.new("visible", :boolean),
+ # {
+ # name: "count",
+ # type: :int32,
+ # },
+ # ]
+ # Arrow::DenseUnionDataType.new(fields, [2, 9])
+ #
+ # @overload initialize(description)
+ #
+ # @param description [Hash] The description of the dense union
+ # data type. It must have `:fields` and `:type_codes` values.
+ #
+ # @option description [::Array<Arrow::Field, Hash>] :fields The
+ # fields of the dense union data type. You can mix
+ # {Arrow::Field} and field description in the fields.
+ #
+ # See {Arrow::Field.new} how to specify field description.
+ #
+    #   @option description [::Array<Integer>] :type_codes The IDs
+    #     that indicate the corresponding fields.
+ #
+ # @example Create a dense union data type for `{2: visible, 9: count}`
+ # fields = [
+ # Arrow::Field.new("visible", :boolean),
+ # {
+ # name: "count",
+ # type: :int32,
+ # },
+ # ]
+ # Arrow::DenseUnionDataType.new(fields: fields,
+ # type_codes: [2, 9])
+ def initialize(*args)
+ n_args = args.size
+ case n_args
+ when 1
+ description = args[0]
+ fields = description[:fields]
+ type_codes = description[:type_codes]
+ when 2
+ fields, type_codes = args
+ else
+        message = "wrong number of arguments (given #{n_args}, expected 1..2)"
+ raise ArgumentError, message
+ end
+ fields = fields.collect do |field|
+ field = Field.new(field) unless field.is_a?(Field)
+ field
+ end
+ initialize_raw(fields, type_codes)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/dictionary-array.rb b/src/arrow/ruby/red-arrow/lib/arrow/dictionary-array.rb
new file mode 100644
index 000000000..70591ab7c
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/dictionary-array.rb
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class DictionaryArray
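+    # Resolves the i-th index against the dictionary (illustrative):
+    # with dictionary ["a", "b"] and indices [1, 0], get_value(0) is "b".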
+ def get_value(i)
+ dictionary[indices[i]]
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/dictionary-data-type.rb b/src/arrow/ruby/red-arrow/lib/arrow/dictionary-data-type.rb
new file mode 100644
index 000000000..8396e311c
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/dictionary-data-type.rb
@@ -0,0 +1,117 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class DictionaryDataType
+ alias_method :initialize_raw, :initialize
+ private :initialize_raw
+
+ # Creates a new {Arrow::DictionaryDataType}.
+ #
+ # @overload initialize(index_data_type, value_data_type, ordered)
+ #
+ # @param index_data_type [Arrow::DataType, Hash, String, Symbol]
+    #     The index data type of the dictionary data type. It must be
+    #     a signed integer data type. The available signed integer
+    #     data types are:
+ #
+ # * Arrow::Int8DataType
+ # * Arrow::Int16DataType
+ # * Arrow::Int32DataType
+ # * Arrow::Int64DataType
+ #
+ # You can specify data type as a description by `Hash`.
+ #
+ # See {Arrow::DataType.resolve} how to specify data type
+ # description.
+ #
+ # @param value_data_type [Arrow::DataType, Hash, String, Symbol]
+ # The value data type of the dictionary data type.
+ #
+ # You can specify data type as a description by `Hash`.
+ #
+ # See {Arrow::DataType.resolve} how to specify data type
+ # description.
+ #
+ # @param ordered [Boolean] Whether dictionary contents are
+ # ordered or not.
+ #
+ # @example Create a dictionary data type for `{0: "Hello", 1: "World"}`
+ # index_data_type = :int8
+ # value_data_type = :string
+ # ordered = true
+ # Arrow::DictionaryDataType.new(index_data_type,
+ # value_data_type,
+ # ordered)
+ #
+ # @overload initialize(description)
+ #
+ # @param description [Hash] The description of the dictionary
+    #     data type. It must have `:index_data_type`, `:value_data_type`
+    #     and `:ordered` values.
+ #
+ # @option description [Arrow::DataType, Hash, String, Symbol]
+ # :index_data_type The index data type of the dictionary data
+    #     type. It must be a signed integer data type. The available
+    #     signed integer data types are:
+ #
+ # * Arrow::Int8DataType
+ # * Arrow::Int16DataType
+ # * Arrow::Int32DataType
+ # * Arrow::Int64DataType
+ #
+ # You can specify data type as a description by `Hash`.
+ #
+ # See {Arrow::DataType.resolve} how to specify data type
+ # description.
+ #
+ # @option description [Arrow::DataType, Hash, String, Symbol]
+ # :value_data_type
+ # The value data type of the dictionary data type.
+ #
+ # You can specify data type as a description by `Hash`.
+ #
+ # See {Arrow::DataType.resolve} how to specify data type
+ # description.
+ #
+ # @option description [Boolean] :ordered Whether dictionary
+ # contents are ordered or not.
+ #
+ # @example Create a dictionary data type for `{0: "Hello", 1: "World"}`
+ # Arrow::DictionaryDataType.new(index_data_type: :int8,
+ # value_data_type: :string,
+ # ordered: true)
+ def initialize(*args)
+ n_args = args.size
+ case n_args
+ when 1
+ description = args[0]
+ index_data_type = description[:index_data_type]
+ value_data_type = description[:value_data_type]
+ ordered = description[:ordered]
+ when 3
+ index_data_type, value_data_type, ordered = args
+ else
+        message = "wrong number of arguments (given #{n_args}, expected 1 or 3)"
+ raise ArgumentError, message
+ end
+ index_data_type = DataType.resolve(index_data_type)
+ value_data_type = DataType.resolve(value_data_type)
+ initialize_raw(index_data_type, value_data_type, ordered)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/equal-options.rb b/src/arrow/ruby/red-arrow/lib/arrow/equal-options.rb
new file mode 100644
index 000000000..4eb9964ad
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/equal-options.rb
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class EqualOptions
+ class << self
+ # @api private
+ def try_convert(value)
+ case value
+ when Hash
+ options = new
+ value.each do |k, v|
+ setter = :"#{k}="
+ return unless options.respond_to?(setter)
+ options.__send__(setter, v)
+ end
+ options
+ else
+ nil
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/expression.rb b/src/arrow/ruby/red-arrow/lib/arrow/expression.rb
new file mode 100644
index 000000000..a33cc53c2
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/expression.rb
@@ -0,0 +1,48 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Expression
+ class << self
+ # @api private
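+      # A few illustrative conversions:
+      #   try_convert(:visible)  # => FieldExpression for column "visible"
+      #   try_convert(1)         # => LiteralExpression via Datum.try_convert
+      #   An Array such as ["func", arg, ...] becomes a CallExpression.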
+ def try_convert(value)
+ case value
+ when Symbol
+ FieldExpression.new(value.to_s)
+ when ::Array
+ function_name, *arguments = value
+ case function_name
+ when String, Symbol
+ function_name = function_name.to_s
+ else
+ return nil
+ end
+ if arguments.last.is_a?(FunctionOptions)
+ options = arguments.pop
+ else
+ options = nil
+ end
+ CallExpression.new(function_name, arguments, options)
+ else
+ datum = Datum.try_convert(value)
+ return nil if datum.nil?
+ LiteralExpression.new(datum)
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/field-containable.rb b/src/arrow/ruby/red-arrow/lib/arrow/field-containable.rb
new file mode 100644
index 000000000..e4dbf4ec2
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/field-containable.rb
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ module FieldContainable
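+    # Finds a field by name (String or Symbol) or by position (Integer;
+    # a negative index counts from the end). Returns nil for an
+    # out-of-range index, e.g. find_field(-1) is the last field.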
+ def find_field(name_or_index)
+ case name_or_index
+ when String, Symbol
+ name = name_or_index
+ get_field_by_name(name)
+ when Integer
+ index = name_or_index
+ index += n_fields if index < 0
+ return nil if index < 0 or index >= n_fields
+ get_field(index)
+ else
+ message = "field name or index must be String, Symbol or Integer"
+ message << ": <#{name_or_index.inspect}>"
+ raise ArgumentError, message
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/field.rb b/src/arrow/ruby/red-arrow/lib/arrow/field.rb
new file mode 100644
index 000000000..e439cb960
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/field.rb
@@ -0,0 +1,118 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Field
+ alias_method :initialize_raw, :initialize
+ private :initialize_raw
+
+ # Creates a new {Arrow::Field}.
+ #
+ # @overload initialize(name, data_type)
+ #
+ # @param name [String, Symbol] The name of the field.
+ #
+ # @param data_type [Arrow::DataType, Hash, String, Symbol] The
+ # data type of the field.
+ #
+ # You can specify data type as a description by `Hash`.
+ #
+ # See {Arrow::DataType.resolve} how to specify data type
+ # description.
+ #
+ # @example Create a field with {Arrow::DataType}s
+ # Arrow::Field.new("visible", Arrow::BooleanDataType.new)
+ #
+ # @example Create a field with data type description
+ # Arrow::Field.new("visible", :boolean)
+ #
+ # @example Create a field with name as `Symbol`
+ # Arrow::Field.new(:visible, :boolean)
+ #
+ # @overload initialize(description)
+ #
+ # @param description [Hash] The description of the field.
+ #
+ # Field description is a raw `Hash`. Field description must
+ # have `:name` and `:data_type` values. `:name` is the name of
+ # the field. `:data_type` is the data type of the field. You
+ # can use {Arrow::DataType} or data type description as
+ # `:data_type` value.
+ #
+ # See {Arrow::DataType.resolve} how to specify data type
+ # description.
+ #
+ # There is a shortcut for convenience. If field description
+ # doesn't have `:data_type`, all keys except `:name` are
+    #     processed as data type description. For example, the
+ # following field descriptions are the same:
+ #
+ # ```ruby
+ # {name: "visible", data_type: {type: :boolean}}
+ # {name: "visible", type: :boolean} # Shortcut version
+ # ```
+ #
+ # @option description [String, Symbol] :name The name of the field.
+ #
+ # @option description [Arrow::DataType, Hash] :data_type The
+ # data type of the field. You can specify data type description
+ # by `Hash`.
+ #
+ # See {Arrow::DataType.resolve} how to specify data type
+ # description.
+ #
+ # @example Create a field with {Arrow::DataType}s
+ # Arrow::Field.new(name: "visible",
+ # data_type: Arrow::BooleanDataType.new)
+ #
+ # @example Create a field with data type description
+    #   Arrow::Field.new(name: "visible", data_type: {type: :boolean})
+ #
+ # @example Create a field with shortcut form
+ # Arrow::Field.new(name: "visible", type: :boolean)
+ def initialize(*args)
+ n_args = args.size
+ case n_args
+ when 1
+ description = args[0]
+ name = nil
+ data_type = nil
+ data_type_description = {}
+ description.each do |key, value|
+ key = key.to_sym
+ case key
+ when :name
+ name = value
+ when :data_type
+ data_type = DataType.resolve(value)
+ else
+ data_type_description[key] = value
+ end
+ end
+ data_type ||= DataType.resolve(data_type_description)
+ when 2
+ name = args[0]
+ data_type = DataType.resolve(args[1])
+ else
+ message = "wrong number of arguments (given #{n_args}, expected 1..2)"
+ raise ArgumentError, message
+ end
+
+ initialize_raw(name, data_type)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/file-output-stream.rb b/src/arrow/ruby/red-arrow/lib/arrow/file-output-stream.rb
new file mode 100644
index 000000000..f39ad14ca
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/file-output-stream.rb
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class FileOutputStream
+ alias_method :initialize_raw, :initialize
+ private :initialize_raw
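+    # The options may be a Hash or a bare boolean (illustrative):
+    #   Arrow::FileOutputStream.new("data.arrow", append: true)
+    #   Arrow::FileOutputStream.new("data.arrow", true) # equivalent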
+ def initialize(path, options={})
+ append = nil
+ case options
+ when true, false
+ append = options
+ when Hash
+ append = options[:append]
+ end
+ append = false if append.nil?
+ initialize_raw(path, append)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/file-system.rb b/src/arrow/ruby/red-arrow/lib/arrow/file-system.rb
new file mode 100644
index 000000000..7d105b42a
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/file-system.rb
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class FileSystem
+ alias_method :open_output_stream_raw, :open_output_stream
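+    # With a block, the stream is closed automatically (illustrative):
+    #   file_system.open_output_stream("data.arrow") do |stream|
+    #     stream.write(data)
+    #   end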
+ def open_output_stream(path)
+ stream = open_output_stream_raw(path)
+ if block_given?
+ begin
+ yield(stream)
+ ensure
+ stream.close
+ end
+ else
+ stream
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/fixed-size-binary-array-builder.rb b/src/arrow/ruby/red-arrow/lib/arrow/fixed-size-binary-array-builder.rb
new file mode 100644
index 000000000..516d8143d
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/fixed-size-binary-array-builder.rb
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class FixedSizeBinaryArrayBuilder
+ class << self
+ # @since 3.0.0
+ def build(data_type, values)
+ builder = new(data_type)
+ builder.build(values)
+ end
+ end
+
+ alias_method :append_values_raw, :append_values
+ # @since 3.0.0
+ def append_values(values, is_valids=nil)
+ if values.is_a?(::Array)
+ append_values_raw(values, is_valids)
+ else
+ append_values_packed(values, is_valids)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/fixed-size-binary-array.rb b/src/arrow/ruby/red-arrow/lib/arrow/fixed-size-binary-array.rb
new file mode 100644
index 000000000..37c121d8e
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/fixed-size-binary-array.rb
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class FixedSizeBinaryArray
+ alias_method :get_value_raw, :get_value
+ # @since 3.0.0
+ def get_value(i)
+ get_value_raw(i).to_s
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/generic-filterable.rb b/src/arrow/ruby/red-arrow/lib/arrow/generic-filterable.rb
new file mode 100644
index 000000000..50a79142a
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/generic-filterable.rb
@@ -0,0 +1,43 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ module GenericFilterable
+ class << self
+ def included(base)
+ base.__send__(:alias_method, :filter_raw, :filter)
+ base.__send__(:alias_method, :filter, :filter_generic)
+ end
+ end
+
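+    # Accepts a plain Ruby Array or a ChunkedArray as the filter in
+    # addition to a BooleanArray (illustrative):
+    #   array.filter([true, false, true])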
+ def filter_generic(filter, options=nil)
+ case filter
+ when ::Array
+ filter_raw(BooleanArray.new(filter), options)
+ when ChunkedArray
+ if respond_to?(:filter_chunked_array)
+ filter_chunked_array(filter, options)
+ else
+ # TODO: Implement this in C++
+ filter_raw(filter.pack, options)
+ end
+ else
+ filter_raw(filter, options)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/generic-takeable.rb b/src/arrow/ruby/red-arrow/lib/arrow/generic-takeable.rb
new file mode 100644
index 000000000..f32b43f22
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/generic-takeable.rb
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ module GenericTakeable
+ class << self
+ def included(base)
+ base.__send__(:alias_method, :take_raw, :take)
+ base.__send__(:alias_method, :take, :take_generic)
+ end
+ end
+
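+    # Accepts a plain Ruby Array of indices as well (illustrative):
+    #   array.take([2, 0, 1])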
+ def take_generic(indices)
+ case indices
+ when ::Array
+ take_raw(IntArrayBuilder.build(indices))
+ when ChunkedArray
+ take_chunked_array(indices)
+ else
+ take_raw(indices)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/group.rb b/src/arrow/ruby/red-arrow/lib/arrow/group.rb
new file mode 100644
index 000000000..7827ac0bd
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/group.rb
@@ -0,0 +1,164 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Group
+ def initialize(table, keys)
+ @table = table
+ @keys = keys
+ end
+
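+    # Usage sketch (illustrative; assumes the table has "category" and
+    # "price" columns):
+    #   Arrow::Group.new(table, ["category"]).sum("price")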
+ def count(*target_names)
+ aggregate(*build_aggregations("hash_count", target_names))
+ end
+
+ def sum(*target_names)
+ aggregate(*build_aggregations("hash_sum", target_names))
+ end
+
+ def product(*target_names)
+ aggregate(*build_aggregations("hash_product", target_names))
+ end
+
+ def mean(*target_names)
+ aggregate(*build_aggregations("hash_mean", target_names))
+ end
+
+ def min(*target_names)
+ aggregate(*build_aggregations("hash_min", target_names))
+ end
+
+ def max(*target_names)
+ aggregate(*build_aggregations("hash_max", target_names))
+ end
+
+ def stddev(*target_names)
+ aggregate(*build_aggregations("hash_stddev", target_names))
+ end
+
+ def variance(*target_names)
+ aggregate(*build_aggregations("hash_variance", target_names))
+ end
+
+ def aggregate(aggregation, *more_aggregations)
+ aggregations = [aggregation] + more_aggregations
+ normalized_aggregations = normalize_aggregations(aggregations)
+ plan = ExecutePlan.new
+ source_node = plan.build_source_node(@table)
+ aggregate_node =
+ plan.build_aggregate_node(source_node,
+ {
+ aggregations: normalized_aggregations,
+ keys: @keys
+ })
+ sink_node_options = SinkNodeOptions.new
+ plan.build_sink_node(aggregate_node, sink_node_options)
+ plan.validate
+ plan.start
+ plan.wait
+ reader = sink_node_options.get_reader(aggregate_node.output_schema)
+ reader.read_all
+ end
+
+ private
+ def build_aggregations(function_name, target_names)
+ if target_names.empty?
+ [function_name]
+ else
+ target_names.collect do |name|
+ "#{function_name}(#{name})"
+ end
+ end
+ end
+
+ def normalize_aggregations(aggregations)
+ normalized_aggregations = []
+ aggregations.each do |aggregation|
+ case aggregation
+ when :all
+ all_functions = [
+ "hash_count",
+ "hash_sum",
+ "hash_product",
+ "hash_mean",
+ "hash_stddev",
+ "hash_variance",
+ # "hash_tdigest",
+ "hash_min",
+ "hash_max",
+ "hash_any",
+ "hash_all",
+ ]
+ normalized_aggregations.concat(normalize_aggregations(all_functions))
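+        # e.g. "hash_sum(price)" => {function: "hash_sum", input: "price"}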
+        when /\A([a-zA-Z0-9_]+)\((.+?)\)\z/
+ function = $1
+ input = $2.strip
+ normalized_aggregations << {function: function, input: input}
+ when "count", "hash_count"
+ function = aggregation
+ target_columns.each do |column|
+ normalized_aggregations << {function: function, input: column.name}
+ end
+ when "any", "hash_any", "all", "hash_all"
+ function = aggregation
+ boolean_target_columns.each do |column|
+ normalized_aggregations << {function: function, input: column.name}
+ end
+ when String
+ function = aggregation
+ numeric_target_columns.each do |column|
+ normalized_aggregations << {function: function, input: column.name}
+ end
+ else
+ normalized_aggregations << aggregation
+ end
+ end
+ normalized_aggregations
+ end
+
+ def target_columns
+ @target_columns ||= find_target_columns
+ end
+
+ def find_target_columns
+ key_names = @keys.collect(&:to_s)
+ @table.columns.find_all do |column|
+ not key_names.include?(column.name)
+ end
+ end
+
+ def boolean_target_columns
+ @boolean_target_columns ||= find_boolean_target_columns
+ end
+
+ def find_boolean_target_columns
+ target_columns.find_all do |column|
+ column.data_type.is_a?(BooleanDataType)
+ end
+ end
+
+ def numeric_target_columns
+ @numeric_target_columns ||= find_numeric_target_columns
+ end
+
+ def find_numeric_target_columns
+ target_columns.find_all do |column|
+ column.data_type.is_a?(NumericDataType)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/list-array-builder.rb b/src/arrow/ruby/red-arrow/lib/arrow/list-array-builder.rb
new file mode 100644
index 000000000..d889c8a0c
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/list-array-builder.rb
@@ -0,0 +1,96 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class ListArrayBuilder
+ class << self
+ def build(data_type, values)
+ builder = new(data_type)
+ builder.build(values)
+ end
+ end
+
+ alias_method :append_value_raw, :append_value
+
+ # @overload append_value
+ #
+    #   Starts appending a list record. You also need to append the
+    #   list values with {#value_builder}.
+ #
+ # @overload append_value(list)
+ #
+ # Appends a list record including list value.
+ #
+ # @param value [nil, ::Array] The list value of the record.
+ #
+ # If this is `nil`, the list record is null.
+ #
+ # If this is `Array`, it's the list value of the record.
+ #
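+    # @example Append list records (illustrative)
+    #   builder.append_value([1, 2, 3])
+    #   builder.append_value(nil) # null record
+    #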
+ # @since 0.12.0
+ def append_value(*args)
+ n_args = args.size
+
+ case n_args
+ when 0
+ append_value_raw
+ when 1
+ value = args[0]
+ case value
+ when nil
+ append_null
+ when ::Array
+ append_value_raw
+ @value_builder ||= value_builder
+ @value_builder.append(*value)
+ else
+ message = "list value must be nil or Array: #{value.inspect}"
+ raise ArgumentError, message
+ end
+ else
+ message = "wrong number of arguments (given #{n_args}, expected 0..1)"
+ raise ArgumentError, message
+ end
+ end
+
+ def append_values(lists, is_valids=nil)
+ if is_valids
+ is_valids.each_with_index do |is_valid, i|
+ if is_valid
+ append_value(lists[i])
+ else
+ append_null
+ end
+ end
+ else
+ lists.each do |list|
+ append_value(list)
+ end
+ end
+ end
+
+ # @since 0.12.0
+ def append(*values)
+ if values.empty?
+ # For backward compatibility
+ append_value
+ else
+ super
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/list-data-type.rb b/src/arrow/ruby/red-arrow/lib/arrow/list-data-type.rb
new file mode 100644
index 000000000..cfcdd2a9e
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/list-data-type.rb
@@ -0,0 +1,118 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class ListDataType
+ alias_method :initialize_raw, :initialize
+ private :initialize_raw
+
+ # Creates a new {Arrow::ListDataType}.
+ #
+ # @overload initialize(field)
+ #
+ # @param field [Arrow::Field, Hash] The field of the list data
+ # type. You can also specify field description by `Hash`.
+ #
+ # See {Arrow::Field.new} how to specify field description.
+ #
+ # @example Create a list data type with {Arrow::Field}
+ # visible_field = Arrow::Field.new("visible", :boolean)
+ # Arrow::ListDataType.new(visible_field)
+ #
+ # @example Create a list data type with field description
+ # Arrow::ListDataType.new(name: "visible", type: :boolean)
+ #
+ # @overload initialize(description)
+ #
+ # @param description [Hash] The description of the list data
+ # type. It must have `:field` value.
+ #
+    #   @option description [Arrow::Field, Hash] :field The field of
+    #     the list data type. You can also specify the field as a
+    #     description `Hash`.
+    #
+    #     See {Arrow::Field.new} for how to specify a field description.
+ #
+ # @example Create a list data type with {Arrow::Field}
+ # visible_field = Arrow::Field.new("visible", :boolean)
+ # Arrow::ListDataType.new(field: visible_field)
+ #
+ # @example Create a list data type with field description
+ # Arrow::ListDataType.new(field: {name: "visible", type: :boolean})
+ #
+ # @overload initialize(data_type)
+ #
+ # @param data_type [Arrow::DataType, String, Symbol,
+ # ::Array<String>, ::Array<Symbol>, Hash] The element data
+ # type of the list data type. A field is created with the
+ # default name `"item"` from the data type automatically.
+ #
+    #     See {Arrow::DataType.resolve} for how to specify a data type.
+ #
+ # @example Create a list data type with {Arrow::DataType}
+ # Arrow::ListDataType.new(Arrow::BooleanDataType.new)
+ #
+ # @example Create a list data type with data type name as String
+ # Arrow::ListDataType.new("boolean")
+ #
+ # @example Create a list data type with data type name as Symbol
+ # Arrow::ListDataType.new(:boolean)
+ #
+ # @example Create a list data type with data type as Array
+ # Arrow::ListDataType.new([:time32, :milli])
+ def initialize(arg)
+ data_type = resolve_data_type(arg)
+ if data_type
+ field = Field.new(default_field_name, data_type)
+ else
+ field = resolve_field(arg)
+ end
+ initialize_raw(field)
+ end
+
+ private
+ def resolve_data_type(arg)
+ case arg
+ when DataType, String, Symbol, ::Array
+ DataType.resolve(arg)
+ when Hash
+ return nil if arg[:name]
+ return nil unless arg[:type]
+ DataType.resolve(arg)
+ else
+ nil
+ end
+ end
+
+ def default_field_name
+ "item"
+ end
+
+ def resolve_field(arg)
+ if arg.is_a?(Hash) and arg.key?(:field)
+ description = arg
+ arg = description[:field]
+ end
+ if arg.is_a?(Hash)
+ field_description = arg
+ Field.new(field_description)
+ else
+ arg
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/loader.rb b/src/arrow/ruby/red-arrow/lib/arrow/loader.rb
new file mode 100644
index 000000000..804a94894
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/loader.rb
@@ -0,0 +1,216 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "arrow/block-closable"
+
+module Arrow
+ class Loader < GObjectIntrospection::Loader
+ class << self
+ def load
+ super("Arrow", Arrow)
+ end
+ end
+
+ private
+ def post_load(repository, namespace)
+ require_libraries
+ require_extension_library
+ gc_guard
+ end
+
+ def require_libraries
+ require "arrow/column-containable"
+ require "arrow/field-containable"
+ require "arrow/generic-filterable"
+ require "arrow/generic-takeable"
+ require "arrow/record-containable"
+ require "arrow/symbol-values-appendable"
+
+ require "arrow/aggregate-node-options"
+ require "arrow/aggregation"
+ require "arrow/array"
+ require "arrow/array-builder"
+ require "arrow/bigdecimal-extension"
+ require "arrow/binary-dictionary-array-builder"
+ require "arrow/buffer"
+ require "arrow/chunked-array"
+ require "arrow/column"
+ require "arrow/compression-type"
+ require "arrow/csv-loader"
+ require "arrow/csv-read-options"
+ require "arrow/data-type"
+ require "arrow/date32-array"
+ require "arrow/date32-array-builder"
+ require "arrow/date64-array"
+ require "arrow/date64-array-builder"
+ require "arrow/datum"
+ require "arrow/decimal128"
+ require "arrow/decimal128-array"
+ require "arrow/decimal128-array-builder"
+ require "arrow/decimal128-data-type"
+ require "arrow/decimal256"
+ require "arrow/decimal256-array"
+ require "arrow/decimal256-array-builder"
+ require "arrow/decimal256-data-type"
+ require "arrow/dense-union-data-type"
+ require "arrow/dictionary-array"
+ require "arrow/dictionary-data-type"
+ require "arrow/equal-options"
+ require "arrow/expression"
+ require "arrow/field"
+ require "arrow/file-output-stream"
+ require "arrow/file-system"
+ require "arrow/fixed-size-binary-array"
+ require "arrow/fixed-size-binary-array-builder"
+ require "arrow/group"
+ require "arrow/list-array-builder"
+ require "arrow/list-data-type"
+ require "arrow/map-array"
+ require "arrow/map-array-builder"
+ require "arrow/map-data-type"
+ require "arrow/null-array"
+ require "arrow/null-array-builder"
+ require "arrow/path-extension"
+ require "arrow/record"
+ require "arrow/record-batch"
+ require "arrow/record-batch-builder"
+ require "arrow/record-batch-file-reader"
+ require "arrow/record-batch-iterator"
+ require "arrow/record-batch-reader"
+ require "arrow/record-batch-stream-reader"
+ require "arrow/rolling-window"
+ require "arrow/scalar"
+ require "arrow/schema"
+ require "arrow/slicer"
+ require "arrow/sort-key"
+ require "arrow/sort-options"
+ require "arrow/source-node-options"
+ require "arrow/sparse-union-data-type"
+ require "arrow/string-dictionary-array-builder"
+ require "arrow/struct-array"
+ require "arrow/struct-array-builder"
+ require "arrow/struct-data-type"
+ require "arrow/table"
+ require "arrow/table-concatenate-options"
+ require "arrow/table-formatter"
+ require "arrow/table-list-formatter"
+ require "arrow/table-table-formatter"
+ require "arrow/table-loader"
+ require "arrow/table-saver"
+ require "arrow/tensor"
+ require "arrow/time"
+ require "arrow/time32-array"
+ require "arrow/time32-array-builder"
+ require "arrow/time32-data-type"
+ require "arrow/time64-array"
+ require "arrow/time64-array-builder"
+ require "arrow/time64-data-type"
+ require "arrow/timestamp-array"
+ require "arrow/timestamp-array-builder"
+ require "arrow/timestamp-data-type"
+ require "arrow/writable"
+ end
+
+ def require_extension_library
+ require "arrow.so"
+ end
+
+ def gc_guard
+ require "arrow/constructor-arguments-gc-guardable"
+
+ [
+ @base_module::BinaryScalar,
+ @base_module::Buffer,
+ @base_module::DenseUnionScalar,
+ @base_module::FixedSizeBinaryScalar,
+ @base_module::LargeBinaryScalar,
+ @base_module::LargeListScalar,
+ @base_module::LargeStringScalar,
+ @base_module::ListScalar,
+ @base_module::MapScalar,
+ @base_module::SparseUnionScalar,
+ @base_module::StringScalar,
+ @base_module::StructScalar,
+ ].each do |klass|
+ klass.prepend(ConstructorArgumentsGCGuardable)
+ end
+ end
+
+ def load_object_info(info)
+ super
+
+ klass = @base_module.const_get(rubyish_class_name(info))
+ if klass.method_defined?(:close)
+ klass.extend(BlockClosable)
+ end
+ end
+
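+    # Renames some methods while loading them from GObject
+    # Introspection so that the Ruby-level wrappers in this library
+    # can take over the natural names (for example, `get_value` is
+    # loaded as `get_raw_value` so that classes such as
+    # {Arrow::MapArray} can define a richer `get_value` on top of it).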
+ def load_method_info(info, klass, method_name)
+ case klass.name
+ when /Array\z/
+ case method_name
+ when "values"
+ method_name = "values_raw"
+ end
+ end
+
+ case klass.name
+ when /Builder\z/
+ case method_name
+ when "append"
+ return
+ else
+ super
+ end
+ when "Arrow::StringArray"
+ case method_name
+ when "get_value"
+ method_name = "get_raw_value"
+ when "get_string"
+ method_name = "get_value"
+ end
+ super(info, klass, method_name)
+ when "Arrow::Date32Array",
+ "Arrow::Date64Array",
+ "Arrow::Decimal128Array",
+ "Arrow::Decimal256Array",
+ "Arrow::Time32Array",
+ "Arrow::Time64Array",
+ "Arrow::TimestampArray"
+ case method_name
+ when "get_value"
+ method_name = "get_raw_value"
+ end
+ super(info, klass, method_name)
+ when "Arrow::Decimal128", "Arrow::Decimal256"
+ case method_name
+ when "copy"
+ method_name = "dup"
+ end
+ super(info, klass, method_name)
+ when "Arrow::BooleanScalar"
+ case method_name
+ when "value?"
+ method_name = "value"
+ end
+ super(info, klass, method_name)
+ else
+ super
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/map-array-builder.rb b/src/arrow/ruby/red-arrow/lib/arrow/map-array-builder.rb
new file mode 100644
index 000000000..9e269d1c5
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/map-array-builder.rb
@@ -0,0 +1,109 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class MapArrayBuilder
+ class << self
+ def build(data_type, values)
+ builder = new(data_type)
+ builder.build(values)
+ end
+ end
+
+ alias_method :append_value_raw, :append_value
+
+ # @overload append_value
+ #
+    #   Starts appending a map record. You also need to append the
+    #   map's keys and items via {#key_builder} and {#item_builder}.
+ #
+ # @overload append_value(value)
+ #
+ # Appends a map record including key and item values.
+ #
+ # @param value [nil, #each] The map record.
+ #
+ # If this is `nil`, the map record is null.
+ #
+    #     If this is an `Object` that has `#each`, each yielded value
+    #     is a key-item pair.
+ #
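+    #   @example Minimal sketch: append a map record and a null record
+    #     # The string key type and int8 item type are just for
+    #     # illustration.
+    #     data_type = Arrow::MapDataType.new(key: :string, item: :int8)
+    #     builder = Arrow::MapArrayBuilder.new(data_type)
+    #     builder.append_value({"a" => 1, "b" => 2})
+    #     builder.append_value(nil) # null record
+    #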
+ # @since 6.0.0
+ def append_value(*args)
+ n_args = args.size
+
+ case n_args
+ when 0
+ append_value_raw
+ when 1
+ value = args[0]
+ case value
+ when nil
+ append_null
+ else
+ unless value.respond_to?(:each)
+ message = "map value must be nil, Hash or Object that has #each: #{value.inspect}"
+ raise ArgumentError, message
+ end
+ append_value_raw
+ @key_builder ||= key_builder
+ @item_builder ||= item_builder
+ case value
+ when Hash
+ keys = value.keys
+ values = value.values
+ else
+ keys = []
+ values = []
+ value.each do |key, item|
+ keys << key
+ values << item
+ end
+ end
+ @key_builder.append(*keys)
+ @item_builder.append(*values)
+ end
+ else
+ message = "wrong number of arguments (given #{n_args}, expected 0..1)"
+ raise ArgumentError, message
+ end
+ end
+
+ alias_method :append_values_raw, :append_values
+
+ def append_values(values, is_valids=nil)
+ value = values[0]
+ case value
+ when Integer
+ append_values_raw(values, is_valids)
+ else
+ if is_valids
+ is_valids.each_with_index do |is_valid, i|
+ if is_valid
+ append_value(values[i])
+ else
+ append_null
+ end
+ end
+ else
+ values.each do |value|
+ append_value(value)
+ end
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/map-array.rb b/src/arrow/ruby/red-arrow/lib/arrow/map-array.rb
new file mode 100644
index 000000000..96b8c01b1
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/map-array.rb
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class MapArray
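+    # Returns the `i`-th map record as a Ruby `Hash` instead of the
+    # raw list of `{"key" => ..., "value" => ...}` structs returned
+    # by the underlying implementation.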
+ def get_value(i)
+ super.each_with_object({}) do |item, result|
+ result[item["key"]] = item["value"]
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/map-data-type.rb b/src/arrow/ruby/red-arrow/lib/arrow/map-data-type.rb
new file mode 100644
index 000000000..67e134329
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/map-data-type.rb
@@ -0,0 +1,89 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class MapDataType
+ alias_method :initialize_raw, :initialize
+ private :initialize_raw
+
+ # Creates a new {Arrow::MapDataType}.
+ #
+ # @overload initialize(key, item)
+ #
+ # @param key [Arrow::DataType, Hash, String, Symbol]
+ # The key data type of the map data type.
+ #
+    #     You can also specify the data type as a description `Hash`.
+    #
+    #     See {Arrow::DataType.resolve} for how to specify a data type
+    #     description.
+ #
+ # @param item [Arrow::DataType, Hash, String, Symbol]
+ # The item data type of the map data type.
+ #
+    #     You can also specify the data type as a description `Hash`.
+    #
+    #     See {Arrow::DataType.resolve} for how to specify a data type
+    #     description.
+ #
+ # @example Create a map data type for `{0: "Hello", 1: "World"}`
+ # key = :int8
+ # item = :string
+ # Arrow::MapDataType.new(key, item)
+ #
+ # @overload initialize(description)
+ #
+ # @param description [Hash] The description of the map data
+ # type. It must have `:key`, `:item` values.
+ #
+ # @option description [Arrow::DataType, Hash, String, Symbol]
+ # :key The key data type of the map data type.
+ #
+    #     You can also specify the data type as a description `Hash`.
+    #
+    #     See {Arrow::DataType.resolve} for how to specify a data type
+    #     description.
+ #
+ # @option description [Arrow::DataType, Hash, String, Symbol]
+ # :item The item data type of the map data type.
+ #
+    #     You can also specify the data type as a description `Hash`.
+    #
+    #     See {Arrow::DataType.resolve} for how to specify a data type
+    #     description.
+ #
+ # @example Create a map data type for `{0: "Hello", 1: "World"}`
+ # Arrow::MapDataType.new(key: :int8, item: :string)
+ def initialize(*args)
+ n_args = args.size
+ case n_args
+ when 1
+ description = args[0]
+ key = description[:key]
+ item = description[:item]
+ when 2
+ key, item = args
+ else
+        message = "wrong number of arguments (given #{n_args}, expected 1..2)"
+ raise ArgumentError, message
+ end
+ key = DataType.resolve(key)
+ item = DataType.resolve(item)
+ initialize_raw(key, item)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/null-array-builder.rb b/src/arrow/ruby/red-arrow/lib/arrow/null-array-builder.rb
new file mode 100644
index 000000000..26e58ccdc
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/null-array-builder.rb
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class NullArrayBuilder
+ class << self
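+      # NOTE: a single Integer argument is excluded on the
+      # assumption that it means "an array of that many nulls",
+      # which `Arrow::NullArray.new(n)` can handle without a builder.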
+ def buildable?(args)
+ super and not (args.size == 1 and args[0].is_a?(Integer))
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/null-array.rb b/src/arrow/ruby/red-arrow/lib/arrow/null-array.rb
new file mode 100644
index 000000000..7426bb345
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/null-array.rb
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class NullArray
+ def get_value(i)
+ nil
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/path-extension.rb b/src/arrow/ruby/red-arrow/lib/arrow/path-extension.rb
new file mode 100644
index 000000000..1273f298c
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/path-extension.rb
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class PathExtension
+ def initialize(path)
+ @path = path
+ end
+
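+    # Extracts the format (and the compression, if any) from the
+    # path's extension(s).
+    #
+    # @example Minimal sketch (assuming "gz" resolves to gzip)
+    #   Arrow::PathExtension.new("data.csv").extract
+    #   # => {format: "csv"}
+    #   Arrow::PathExtension.new("data.csv.gz").extract
+    #   # => {format: "csv", compression: Arrow::CompressionType::GZIP}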
+ def extract
+ basename = ::File.basename(@path)
+ components = basename.split(".")
+ return {} if components.size < 2
+
+ extension = components.last.downcase
+ if components.size > 2
+ compression = CompressionType.resolve_extension(extension)
+ if compression
+ {
+ format: components[-2].downcase,
+ compression: compression,
+ }
+ else
+ {format: extension}
+ end
+ else
+ {format: extension}
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/raw-table-converter.rb b/src/arrow/ruby/red-arrow/lib/arrow/raw-table-converter.rb
new file mode 100644
index 000000000..41d331fb3
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/raw-table-converter.rb
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class RawTableConverter
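+    # Converts a "raw table" (an ::Array of Arrow::Column or a Hash
+    # that maps column names to values) into a schema, value arrays
+    # and a row count.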
+ attr_reader :n_rows
+ attr_reader :schema
+ attr_reader :values
+ def initialize(raw_table)
+ @raw_table = raw_table
+ convert
+ end
+
+ private
+ def convert
+ if @raw_table.is_a?(::Array) and @raw_table[0].is_a?(Column)
+ fields = @raw_table.collect(&:field)
+ @schema = Schema.new(fields)
+ @values = @raw_table.collect(&:data)
+ else
+ fields = []
+ @values = []
+ @raw_table.each do |name, array|
+ array = ArrayBuilder.build(array) if array.is_a?(::Array)
+ fields << Field.new(name.to_s, array.value_data_type)
+ @values << array
+ end
+ @schema = Schema.new(fields)
+ end
+ @n_rows = @values[0].length
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/record-batch-builder.rb b/src/arrow/ruby/red-arrow/lib/arrow/record-batch-builder.rb
new file mode 100644
index 000000000..dc20312f2
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/record-batch-builder.rb
@@ -0,0 +1,114 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class RecordBatchBuilder
+ class << self
+ # @since 0.12.0
+ def build(schema, data)
+ builder = new(schema)
+ builder.append(data)
+ builder.flush
+ end
+ end
+
+ alias_method :initialize_raw, :initialize
+ private :initialize_raw
+ def initialize(schema)
+ unless schema.is_a?(Schema)
+ schema = Schema.new(schema)
+ end
+ initialize_raw(schema)
+ @name_to_index = {}
+ schema.fields.each_with_index do |field, i|
+ @name_to_index[field.name] = i
+ end
+ end
+
+ # @since 0.12.0
+ def [](name_or_index)
+ case name_or_index
+ when String, Symbol
+ name = name_or_index
+ self[resolve_name(name)]
+ else
+ index = name_or_index
+ column_builders[index]
+ end
+ end
+
+ # @since 0.12.0
+ def append(*values)
+ values.each do |value|
+ case value
+ when Hash
+ append_columns(value)
+ else
+ append_records(value)
+ end
+ end
+ end
+
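+    # @example Minimal sketch: records as Arrays, Hashes and nil
+    #   # The schema definition is just for illustration.
+    #   schema = Arrow::Schema.new(count: :int32, visible: :boolean)
+    #   builder = Arrow::RecordBatchBuilder.new(schema)
+    #   builder.append_records([
+    #     [1, true],
+    #     {"count" => 2}, # missing columns become null
+    #     nil,            # all columns become null
+    #   ])
+    #   record_batch = builder.flush
+    #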
+ # @since 0.12.0
+ def append_records(records)
+ n = n_columns
+ columns = n.times.collect do
+ []
+ end
+ records.each_with_index do |record, nth_record|
+ case record
+ when nil
+ when Hash
+ record.each do |name, value|
+ nth_column = resolve_name(name)
+ next if nth_column.nil?
+ columns[nth_column] << value
+ end
+ else
+ record.each_with_index do |value, nth_column|
+ columns[nth_column] << value
+ end
+ end
+ columns.each do |column|
+ column << nil if column.size != (nth_record + 1)
+ end
+ end
+ columns.each_with_index do |column, i|
+ self[i].append(*column)
+ end
+ end
+
+ # @since 0.12.0
+ def append_columns(columns)
+ columns.each do |name, values|
+ self[name].append(*values)
+ end
+ end
+
+ # @since 0.13.0
+ def column_builders
+ @column_builders ||= n_columns.times.collect do |i|
+ get_column_builder(i)
+ end
+ end
+
+ private
+ def resolve_name(name)
+ @name_to_index[name.to_s]
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/record-batch-file-reader.rb b/src/arrow/ruby/red-arrow/lib/arrow/record-batch-file-reader.rb
new file mode 100644
index 000000000..86a757e32
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/record-batch-file-reader.rb
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class RecordBatchFileReader
+ include Enumerable
+
+ def each
+ n_record_batches.times do |i|
+ yield(get_record_batch(i))
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/record-batch-iterator.rb b/src/arrow/ruby/red-arrow/lib/arrow/record-batch-iterator.rb
new file mode 100644
index 000000000..4b828c6dc
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/record-batch-iterator.rb
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class RecordBatchIterator
+ alias_method :to_a, :to_list
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/record-batch-reader.rb b/src/arrow/ruby/red-arrow/lib/arrow/record-batch-reader.rb
new file mode 100644
index 000000000..e030e4f3b
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/record-batch-reader.rb
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class RecordBatchReader
+ class << self
+ # @api private
+ def try_convert(value)
+ case value
+ when ::Array
+ return nil if value.empty?
+ if value.all? {|v| v.is_a?(RecordBatch)}
+ new(value)
+ else
+ nil
+ end
+ when RecordBatch
+ new([value])
+ when Table
+ TableBatchReader.new(value)
+ else
+ nil
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/record-batch-stream-reader.rb b/src/arrow/ruby/red-arrow/lib/arrow/record-batch-stream-reader.rb
new file mode 100644
index 000000000..fa15c8000
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/record-batch-stream-reader.rb
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class RecordBatchStreamReader
+ include Enumerable
+
+ def each
+ loop do
+ record_batch = next_record_batch
+ break if record_batch.nil?
+ yield(record_batch)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/record-batch.rb b/src/arrow/ruby/red-arrow/lib/arrow/record-batch.rb
new file mode 100644
index 000000000..c5aaf876b
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/record-batch.rb
@@ -0,0 +1,75 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "arrow/raw-table-converter"
+
+module Arrow
+ class RecordBatch
+ include ColumnContainable
+ include RecordContainable
+ include Enumerable
+
+ class << self
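+      # @example Minimal sketch: the supported argument patterns
+      #   # `schema`, `raw_records`, `n_rows` and `arrays` are
+      #   # placeholders.
+      #   Arrow::RecordBatch.new(count: [1, 2, 3])       # raw table
+      #   Arrow::RecordBatch.new(schema, raw_records)    # via RecordBatchBuilder
+      #   Arrow::RecordBatch.new(schema, n_rows, arrays) # raw constructor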
+ def new(*args)
+ n_args = args.size
+ case n_args
+ when 1
+ raw_table_converter = RawTableConverter.new(args[0])
+ n_rows = raw_table_converter.n_rows
+ schema = raw_table_converter.schema
+ values = raw_table_converter.values
+ super(schema, n_rows, values)
+ when 2
+ schema, data = args
+ RecordBatchBuilder.build(schema, data)
+ when 3
+ super
+ else
+ message = "wrong number of arguments (given #{n_args}, expected 1..3)"
+ raise ArgumentError, message
+ end
+ end
+ end
+
+ alias_method :each, :each_record
+
+ alias_method :size, :n_rows
+ alias_method :length, :n_rows
+
+ # Converts the record batch to {Arrow::Table}.
+ #
+ # @return [Arrow::Table]
+ #
+ # @since 0.12.0
+ def to_table
+ Table.new(schema, [self])
+ end
+
+ def respond_to_missing?(name, include_private)
+ return true if find_column(name)
+ super
+ end
+
+ def method_missing(name, *args, &block)
+ if args.empty?
+ column = find_column(name)
+ return column if column
+ end
+ super
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/record-containable.rb b/src/arrow/ruby/red-arrow/lib/arrow/record-containable.rb
new file mode 100644
index 000000000..20c9ac2f5
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/record-containable.rb
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ module RecordContainable
+ def each_record(reuse_record: false)
+ unless block_given?
+ return to_enum(__method__, reuse_record: reuse_record)
+ end
+
+ if reuse_record
+ record = Record.new(self, nil)
+ n_rows.times do |i|
+ record.index = i
+ yield(record)
+ end
+ else
+ n_rows.times do |i|
+ yield(Record.new(self, i))
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/record.rb b/src/arrow/ruby/red-arrow/lib/arrow/record.rb
new file mode 100644
index 000000000..6f83dded0
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/record.rb
@@ -0,0 +1,60 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Record
+ attr_reader :container
+ attr_accessor :index
+ def initialize(container, index)
+ @container = container
+ @index = index
+ end
+
+ def [](column_name_or_column_index)
+ column = @container.find_column(column_name_or_column_index)
+ return nil if column.nil?
+ column[@index]
+ end
+
+ def to_a
+ @container.columns.collect do |column|
+ column[@index]
+ end
+ end
+
+ def to_h
+ attributes = {}
+ @container.columns.each do |column|
+ attributes[column.name] = column[@index]
+ end
+ attributes
+ end
+
+ def respond_to_missing?(name, include_private)
+ return true if @container.find_column(name)
+ super
+ end
+
+ def method_missing(name, *args, &block)
+ if args.empty?
+ column = @container.find_column(name)
+ return column[@index] if column
+ end
+ super
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/rolling-window.rb b/src/arrow/ruby/red-arrow/lib/arrow/rolling-window.rb
new file mode 100644
index 000000000..1db03bb23
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/rolling-window.rb
@@ -0,0 +1,48 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ # Experimental
+ #
+  # TODO: Most of this code should be implemented in Apache Arrow C++.
+ class RollingWindow
+ def initialize(table, size)
+ @table = table
+ @size = size
+ end
+
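+    # Computes the difference between each value and the value
+    # `diff` positions before it in the `key` column.
+    #
+    # @example Minimal sketch: lag over a whole column (no window size)
+    #   table = Arrow::Table.new(count: [1, 3, 6])
+    #   window = Arrow::RollingWindow.new(table, nil)
+    #   window.lag(:count).to_a # => [nil, 2, 3]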
+ def lag(key, diff: 1)
+ column = @table[key]
+ if @size
+ windows = column.each_slice(@size)
+ else
+ windows = column
+ end
+ lag_values = [nil] * diff
+ windows.each_cons(diff + 1) do |values|
+ target = values[0]
+ current = values[1]
+ if target.nil? or current.nil?
+ lag_values << nil
+ else
+ lag_values << current - target
+ end
+ end
+ ArrayBuilder.build(lag_values)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/scalar.rb b/src/arrow/ruby/red-arrow/lib/arrow/scalar.rb
new file mode 100644
index 000000000..b2bf1ac59
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/scalar.rb
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Scalar
+ # @param other [Arrow::Scalar] The scalar to be compared.
+ # @param options [Arrow::EqualOptions, Hash] (nil)
+    #   The options that customize how the scalars are compared.
+ #
+ # @return [Boolean]
+ # `true` if both of them have the same data, `false` otherwise.
+ #
+ # @since 5.0.0
+ def equal_scalar?(other, options=nil)
+ equal_options(other, options)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/schema.rb b/src/arrow/ruby/red-arrow/lib/arrow/schema.rb
new file mode 100644
index 000000000..03354c862
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/schema.rb
@@ -0,0 +1,100 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Schema
+ include FieldContainable
+
+ alias_method :initialize_raw, :initialize
+ private :initialize_raw
+
+ # Creates a new {Arrow::Schema}.
+ #
+ # @overload initialize(fields)
+ #
+ # @param fields [::Array<Arrow::Field, Hash>] The fields of the
+    #     schema. You can mix {Arrow::Field}s and field descriptions
+    #     in the fields.
+    #
+    #     See {Arrow::Field.new} for how to specify a field description.
+ #
+ # @example Create a schema with {Arrow::Field}s
+ # visible_field = Arrow::Field.new("visible", :boolean)
+ # Arrow::Schema.new([visible_field])
+ #
+ # @example Create a schema with field descriptions
+ # visible_field_description = {
+ # name: "visible",
+ # data_type: :boolean,
+ # }
+ # Arrow::Schema.new([visible_field_description])
+ #
+ # @example Create a schema with {Arrow::Field}s and field descriptions
+ # fields = [
+ # Arrow::Field.new("visible", :boolean),
+ # {
+ # name: "count",
+ # type: :int32,
+ # },
+ # ]
+ # Arrow::Schema.new(fields)
+ #
+ # @overload initialize(fields)
+ #
+ # @param fields [Hash{String, Symbol => Arrow::DataType, Hash}]
+ # The pairs of field name and field data type of the schema.
+    #     You can mix {Arrow::DataType}s and data type descriptions
+    #     for the field data types.
+    #
+    #     See {Arrow::DataType.new} for how to specify a data type description.
+ #
+ # @example Create a schema with fields
+ # fields = {
+ # "visible" => Arrow::BooleanDataType.new,
+ # :count => :int32,
+ # :tags => {
+ # type: :list,
+ # field: {
+ # name: "tag",
+ # type: :string,
+ # },
+ # },
+ # }
+ # Arrow::Schema.new(fields)
+ def initialize(fields)
+ case fields
+ when ::Array
+ fields = fields.collect do |field|
+ field = Field.new(field) unless field.is_a?(Field)
+ field
+ end
+ when Hash
+ fields = fields.collect do |name, data_type|
+ Field.new(name, data_type)
+ end
+ end
+ initialize_raw(fields)
+ end
+
+ alias_method :[], :find_field
+
+ alias_method :to_s_raw, :to_s
+ def to_s(show_metadata: false)
+ to_string_metadata(show_metadata)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/slicer.rb b/src/arrow/ruby/red-arrow/lib/arrow/slicer.rb
new file mode 100644
index 000000000..6cca7f75e
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/slicer.rb
@@ -0,0 +1,355 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
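+  # Slicer builds boolean filter conditions over a table's columns.
+  # A condition is created per column and conditions can be
+  # combined with `&`, `|` and `^`. It is typically used through
+  # {Arrow::Table#slice} with a block, which yields a Slicer.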
+ class Slicer
+ def initialize(table)
+ @table = table
+ end
+
+ def [](column_name)
+ column = @table[column_name]
+ return nil if column.nil?
+ ColumnCondition.new(column)
+ end
+
+ def respond_to_missing?(name, include_private)
+ return true if self[name]
+ super
+ end
+
+ def method_missing(name, *args, &block)
+ if args.empty?
+ column_condition = self[name]
+ return column_condition if column_condition
+ end
+ super
+ end
+
+ module Helper
+ class << self
+ def ensure_boolean(column)
+ case column.data_type
+ when Arrow::BooleanDataType
+ column.data
+ else
+ options = CastOptions.new
+ options.to_data_type = Arrow::BooleanDataType.new
+ Function.find("cast").execute([column.data], options).value
+ end
+ end
+ end
+ end
+
+ class Condition
+ def evaluate
+ message = "Slicer::Condition must define \#evaluate: #{inspect}"
+ raise NotImplementedError.new(message)
+ end
+
+ def &(condition)
+ AndCondition.new(self, condition)
+ end
+
+ def |(condition)
+ OrCondition.new(self, condition)
+ end
+
+ def ^(condition)
+ XorCondition.new(self, condition)
+ end
+ end
+
+ class LogicalCondition < Condition
+ def initialize(condition1, condition2)
+ @condition1 = condition1
+ @condition2 = condition2
+ end
+
+ def evaluate
+ function.execute([@condition1.evaluate, @condition2.evaluate]).value
+ end
+ end
+
+ class AndCondition < LogicalCondition
+ private
+ def function
+ Function.find("and")
+ end
+ end
+
+ class OrCondition < LogicalCondition
+ private
+ def function
+ Function.find("or")
+ end
+ end
+
+ class XorCondition < LogicalCondition
+ private
+ def function
+ Function.find("xor")
+ end
+ end
+
+ class ColumnCondition < Condition
+ def initialize(column)
+ @column = column
+ end
+
+ def evaluate
+ Helper.ensure_boolean(@column)
+ end
+
+ def !@
+ NotColumnCondition.new(@column)
+ end
+
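+      # NOTE: `==` and `!=` below are overloaded to build
+      # conditions, so `self == nil` builds an "is null" condition
+      # rather than testing object identity.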
+ def null?
+ self == nil
+ end
+
+ def valid?
+ self != nil
+ end
+
+ def ==(value)
+ EqualCondition.new(@column, value)
+ end
+
+ def !=(value)
+ NotEqualCondition.new(@column, value)
+ end
+
+ def <(value)
+ LessCondition.new(@column, value)
+ end
+
+ def <=(value)
+ LessEqualCondition.new(@column, value)
+ end
+
+ def >(value)
+ GreaterCondition.new(@column, value)
+ end
+
+ def >=(value)
+ GreaterEqualCondition.new(@column, value)
+ end
+
+ def in?(values)
+ InCondition.new(@column, values)
+ end
+
+ def select(&block)
+ SelectCondition.new(@column, block)
+ end
+
+ def reject(&block)
+ RejectCondition.new(@column, block)
+ end
+ end
+
+ class NotColumnCondition < Condition
+ def initialize(column)
+ @column = column
+ end
+
+ def evaluate
+ data = Helper.ensure_boolean(@column)
+ Function.find("invert").execute([data]).value
+ end
+
+ def !@
+ ColumnCondition.new(@column)
+ end
+ end
+
+ class EqualCondition < Condition
+ def initialize(column, value)
+ @column = column
+ @value = value
+ end
+
+ def !@
+ NotEqualCondition.new(@column, @value)
+ end
+
+ def evaluate
+ if @value.nil?
+ Function.find("is_null").execute([@column.data]).value
+ else
+ Function.find("equal").execute([@column.data, @value]).value
+ end
+ end
+ end
+
+ class NotEqualCondition < Condition
+ def initialize(column, value)
+ @column = column
+ @value = value
+ end
+
+ def !@
+ EqualCondition.new(@column, @value)
+ end
+
+ def evaluate
+ if @value.nil?
+ Function.find("is_valid").execute([@column.data]).value
+ else
+ Function.find("not_equal").execute([@column.data, @value]).value
+ end
+ end
+ end
+
+ class LessCondition < Condition
+ def initialize(column, value)
+ @column = column
+ @value = value
+ end
+
+ def !@
+ GreaterEqualCondition.new(@column, @value)
+ end
+
+ def evaluate
+ Function.find("less").execute([@column.data, @value]).value
+ end
+ end
+
+ class LessEqualCondition < Condition
+ def initialize(column, value)
+ @column = column
+ @value = value
+ end
+
+ def !@
+ GreaterCondition.new(@column, @value)
+ end
+
+ def evaluate
+ Function.find("less_equal").execute([@column.data, @value]).value
+ end
+ end
+
+ class GreaterCondition < Condition
+ def initialize(column, value)
+ @column = column
+ @value = value
+ end
+
+ def !@
+ LessEqualCondition.new(@column, @value)
+ end
+
+ def evaluate
+ Function.find("greater").execute([@column.data, @value]).value
+ end
+ end
+
+ class GreaterEqualCondition < Condition
+ def initialize(column, value)
+ @column = column
+ @value = value
+ end
+
+ def !@
+ LessCondition.new(@column, @value)
+ end
+
+ def evaluate
+ Function.find("greater_equal").execute([@column.data, @value]).value
+ end
+ end
+
+ class InCondition < Condition
+ def initialize(column, values)
+ @column = column
+ @values = values
+ end
+
+ def !@
+ NotInCondition.new(@column, @values)
+ end
+
+ def evaluate
+ values = @values
+ values = Array.new(values) unless values.is_a?(Array)
+ options = SetLookupOptions.new(values)
+ Function.find("is_in").execute([@column.data], options).value
+ end
+ end
+
+ class NotInCondition < Condition
+ def initialize(column, values)
+ @column = column
+ @values = values
+ end
+
+ def !@
+ InCondition.new(@column, @values)
+ end
+
+ def evaluate
+ values = @values
+ values = Array.new(values) unless values.is_a?(Array)
+ options = SetLookupOptions.new(values)
+ booleans = Function.find("is_in").execute([@column.data], options).value
+ Function.find("invert").execute([booleans]).value
+ end
+ end
+
+ class SelectCondition < Condition
+ def initialize(column, block)
+ @column = column
+ @block = block
+ end
+
+ def !@
+ RejectCondition.new(@column, @block)
+ end
+
+ def evaluate
+ BooleanArray.new(@column.collect(&@block))
+ end
+ end
+
+ class RejectCondition < Condition
+ def initialize(column, block)
+ @column = column
+ @block = block
+ end
+
+ def !@
+ SelectCondition.new(@column, @block)
+ end
+
+ def evaluate
+ raw_array = @column.collect do |value|
+ evaluated_value = @block.call(value)
+ if evaluated_value.nil?
+ nil
+ else
+ not evaluated_value
+ end
+ end
+ BooleanArray.new(raw_array)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/sort-key.rb b/src/arrow/ruby/red-arrow/lib/arrow/sort-key.rb
new file mode 100644
index 000000000..987027256
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/sort-key.rb
@@ -0,0 +1,193 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class SortKey
+ class << self
+      # Ensures that a suitable {Arrow::SortKey} is returned.
+ #
+ # @overload resolve(sort_key)
+ #
+      #   Returns the given sort key itself. This is convenient when
+      #   you use this method as an {Arrow::SortKey} converter.
+ #
+ # @param sort_key [Arrow::SortKey] The sort key.
+ #
+ # @return [Arrow::SortKey] The given sort key itself.
+ #
+ # @overload resolve(name)
+ #
+      #   Creates a new suitable sort key from a column name with a
+      #   leading order mark. See {#initialize} for details about the
+      #   order mark.
+ #
+ # @return [Arrow::SortKey] A new suitable sort key.
+ #
+ # @overload resolve(name, order)
+ #
+      #   Creates a new suitable sort key from a column name (without
+      #   a leading order mark) and an order. See {#initialize} for
+      #   details.
+ #
+ # @return [Arrow::SortKey] A new suitable sort key.
+ #
+ # @since 4.0.0
+ def resolve(name, order=nil)
+ return name if name.is_a?(self)
+ new(name, order)
+ end
+
+ # @api private
+ def try_convert(value)
+ case value
+ when Symbol, String
+ new(value.to_s, :ascending)
+ else
+ nil
+ end
+ end
+ end
+
+ alias_method :initialize_raw, :initialize
+ private :initialize_raw
+ # Creates a new {Arrow::SortKey}.
+ #
+ # @overload initialize(name)
+ #
+ # @param name [Symbol, String] The name of the sort column.
+ #
+    #     If `name` is a String, the first character may be processed
+    #     as a "leading order mark". If the first character is `"+"`
+    #     or `"-"`, it is treated as a leading order mark: it is
+    #     removed from the sort column name and the corresponding
+    #     order is used. `"+"` means ascending order and `"-"` means
+    #     descending order.
+    #
+    #     If `name` is not a String or doesn't start with a leading
+    #     order mark, the sort column name is `name` as-is and
+    #     ascending order is used.
+ #
+ # @example String without the leading order mark
+ # key = Arrow::SortKey.new("count")
+ # key.name # => "count"
+ # key.order # => Arrow::SortOrder::ASCENDING
+ #
+ # @example String with the "+" leading order mark
+ # key = Arrow::SortKey.new("+count")
+ # key.name # => "count"
+ # key.order # => Arrow::SortOrder::ASCENDING
+ #
+ # @example String with the "-" leading order mark
+ # key = Arrow::SortKey.new("-count")
+ # key.name # => "count"
+ # key.order # => Arrow::SortOrder::DESCENDING
+ #
+ # @example Symbol that starts with "-"
+ # key = Arrow::SortKey.new(:"-count")
+ # key.name # => "-count"
+ # key.order # => Arrow::SortOrder::ASCENDING
+ #
+ # @overload initialize(name, order)
+ #
+ # @param name [Symbol, String] The name of the sort column.
+ #
+ # No leading order mark processing. The given `name` is used
+ # as-is.
+ #
+ # @param order [Symbol, String, Arrow::SortOrder] How to order
+ # by this sort key.
+ #
+ # If this is a Symbol or String, this must be `:ascending`,
+ # `"ascending"`, `:asc`, `"asc"`, `:descending`,
+ # `"descending"`, `:desc` or `"desc"`.
+ #
+ # @example No leading order mark processing
+ # key = Arrow::SortKey.new("-count", :ascending)
+ # key.name # => "-count"
+ # key.order # => Arrow::SortOrder::ASCENDING
+ #
+ # @example Order by abbreviated name with Symbol
+ # key = Arrow::SortKey.new("count", :desc)
+ # key.name # => "count"
+ # key.order # => Arrow::SortOrder::DESCENDING
+ #
+ # @example Order by String
+ # key = Arrow::SortKey.new("count", "descending")
+ # key.name # => "count"
+ # key.order # => Arrow::SortOrder::DESCENDING
+ #
+ # @example Order by Arrow::SortOrder
+ # key = Arrow::SortKey.new("count", Arrow::SortOrder::DESCENDING)
+ # key.name # => "count"
+ # key.order # => Arrow::SortOrder::DESCENDING
+ #
+ # @since 4.0.0
+ def initialize(name, order=nil)
+ name, order = normalize_name(name, order)
+ order = normalize_order(order) || :ascending
+ initialize_raw(name, order)
+ end
+
+ # @return [String] The string representation of this sort key. You
+    #   can recreate the {Arrow::SortKey} with
+ # `Arrow::SortKey.new(key.to_s)`.
+ #
+ # @example Recreate Arrow::SortKey
+ # key = Arrow::SortKey.new("-count")
+ # key.to_s # => "-count"
+ # key == Arrow::SortKey.new(key.to_s) # => true
+ #
+ # @since 4.0.0
+ def to_s
+ if order == SortOrder::ASCENDING
+ "+#{name}"
+ else
+ "-#{name}"
+ end
+ end
+
+ private
+ def normalize_name(name, order)
+ case name
+ when Symbol
+ return name.to_s, order
+ when String
+ return name, order if order
+ if name.start_with?("-")
+ return name[1..-1], order || :descending
+ elsif name.start_with?("+")
+ return name[1..-1], order || :ascending
+ else
+ return name, order
+ end
+ else
+ return name, order
+ end
+ end
+
+ def normalize_order(order)
+ case order
+ when :asc, "asc"
+ :ascending
+ when :desc, "desc"
+ :descending
+ else
+ order
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/sort-options.rb b/src/arrow/ruby/red-arrow/lib/arrow/sort-options.rb
new file mode 100644
index 000000000..a7c2d6431
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/sort-options.rb
@@ -0,0 +1,109 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class SortOptions
+ class << self
+ # @api private
+ def try_convert(value)
+ case value
+ when Symbol, String
+ new(value)
+ when ::Array
+ new(*value)
+ else
+ nil
+ end
+ end
+ end
+
+ alias_method :initialize_raw, :initialize
+ private :initialize_raw
+ # @param sort_keys [::Array<String, Symbol, Arrow::SortKey>] The
+    #   sort keys to be used. See {Arrow::SortKey.resolve} for how
+    #   each sort key in `sort_keys` is resolved.
+ #
+ # You can add more sort keys by {#add_sort_key} later.
+ #
+ # @example No initial sort keys
+ # options = Arrow::SortOptions.new
+ # options.sort_keys # => []
+ #
+ # @example String sort keys
+ # options = Arrow::SortOptions.new("count", "-age")
+ # options.sort_keys.collect(&:to_s) # => ["+count", "-age"]
+ #
+ # @example Symbol sort keys
+ # options = Arrow::SortOptions.new(:count, :age)
+ # options.sort_keys.collect(&:to_s) # => ["+count", "+age"]
+ #
+ # @example Mixed sort keys
+ # options = Arrow::SortOptions.new(:count, "-age")
+ # options.sort_keys.collect(&:to_s) # => ["+count", "-age"]
+ #
+ # @since 4.0.0
+ def initialize(*sort_keys)
+ initialize_raw
+ sort_keys.each do |sort_key|
+ add_sort_key(sort_key)
+ end
+ end
+
+ # @api private
+ alias_method :add_sort_key_raw, :add_sort_key
+ # Add a sort key.
+ #
+ # @return [void]
+ #
+ # @overload add_sort_key(key)
+ #
+ # @param key [Arrow::SortKey] The sort key to be added.
+ #
+ # @example Add a key to sort by "price" column in descending order
+ # options = Arrow::SortOptions.new
+ # options.add_sort_key(Arrow::SortKey.new(:price, :descending))
+ # options.sort_keys.collect(&:to_s) # => ["-price"]
+ #
+ # @overload add_sort_key(name)
+ #
+ # @param name [Symbol, String] The sort key name to be
+ # added. See also {Arrow::SortKey#initialize} for the leading
+ # order mark for String name.
+ #
+ # @example Add a key to sort by "price" column in descending order
+ # options = Arrow::SortOptions.new
+ # options.add_sort_key("-price")
+ # options.sort_keys.collect(&:to_s) # => ["-price"]
+ #
+ # @overload add_sort_key(name, order)
+ #
+ # @param name [Symbol, String] The sort key name.
+ #
+ # @param order [Symbol, String, Arrow::SortOrder] The sort
+ # order. See {Arrow::SortKey#initialize} for details.
+ #
+ # @example Add a key to sort by "price" column in descending order
+ # options = Arrow::SortOptions.new
+ # options.add_sort_key("price", :desc)
+ # options.sort_keys.collect(&:to_s) # => ["-price"]
+ #
+ # @since 4.0.0
+ def add_sort_key(name, order=nil)
+ add_sort_key_raw(SortKey.resolve(name, order))
+ end
+ end
+end
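Taken together with try_convert, this lets APIs accept a bare name, a "-"-prefixed name, or an array of them wherever sort options are expected. A short usage sketch grounded in the doc examples above:

    options = Arrow::SortOptions.new("count", "-age")
    options.add_sort_key("price", :desc)
    options.sort_keys.collect(&:to_s) # => ["+count", "-age", "-price"]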
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/source-node-options.rb b/src/arrow/ruby/red-arrow/lib/arrow/source-node-options.rb
new file mode 100644
index 000000000..402ea85f7
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/source-node-options.rb
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class SourceNodeOptions
+ class << self
+ # @api private
+ def try_convert(value)
+ case value
+ when RecordBatchReader, RecordBatch, Table
+ new(value)
+ else
+ nil
+ end
+ end
+ end
+ end
+end
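SourceNodeOptions.try_convert follows the same conversion protocol as the other options classes in this patch: a record source is wrapped, anything else yields nil. A minimal sketch:

    table = Arrow::Table.new("count" => [1, 2, 3])
    Arrow::SourceNodeOptions.try_convert(table)  # => an options object
    Arrow::SourceNodeOptions.try_convert("oops") # => nil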
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/sparse-union-data-type.rb b/src/arrow/ruby/red-arrow/lib/arrow/sparse-union-data-type.rb
new file mode 100644
index 000000000..14f3e5a7e
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/sparse-union-data-type.rb
@@ -0,0 +1,90 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class SparseUnionDataType
+ alias_method :initialize_raw, :initialize
+ private :initialize_raw
+
+ # Creates a new {Arrow::SparseUnionDataType}.
+ #
+ # @overload initialize(fields, type_codes)
+ #
+ # @param fields [::Array<Arrow::Field, Hash>] The fields of the
+ #     sparse union data type. You can mix {Arrow::Field}s and field
+ #     descriptions in the fields.
+ #
+ #     See {Arrow::Field.new} for how to specify a field description.
+ #
+ #   @param type_codes [::Array<Integer>] The IDs that indicate
+ #     the corresponding fields.
+ #
+ # @example Create a sparse union data type for `{2: visible, 9: count}`
+ # fields = [
+ # Arrow::Field.new("visible", :boolean),
+ # {
+ # name: "count",
+ # type: :int32,
+ # },
+ # ]
+ # Arrow::SparseUnionDataType.new(fields, [2, 9])
+ #
+ # @overload initialize(description)
+ #
+ # @param description [Hash] The description of the sparse union
+ # data type. It must have `:fields` and `:type_codes` values.
+ #
+ # @option description [::Array<Arrow::Field, Hash>] :fields The
+ # fields of the sparse union data type. You can mix
+ #     {Arrow::Field}s and field descriptions in the fields.
+ #
+ #     See {Arrow::Field.new} for how to specify a field description.
+ #
+ #   @option description [::Array<Integer>] :type_codes The IDs
+ #     that indicate the corresponding fields.
+ #
+ # @example Create a sparse union data type for `{2: visible, 9: count}`
+ # fields = [
+ # Arrow::Field.new("visible", :boolean),
+ # {
+ # name: "count",
+ # type: :int32,
+ # },
+ # ]
+ # Arrow::SparseUnionDataType.new(fields: fields,
+ # type_codes: [2, 9])
+ def initialize(*args)
+ n_args = args.size
+ case n_args
+ when 1
+ description = args[0]
+ fields = description[:fields]
+ type_codes = description[:type_codes]
+ when 2
+ fields, type_codes = args
+ else
+ message = "wrong number of arguments (given #{n_args}, expected 1..2)"
+ raise ArgumentError, message
+ end
+ fields = fields.collect do |field|
+ field = Field.new(field) unless field.is_a?(Field)
+ field
+ end
+ initialize_raw(fields, type_codes)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/string-dictionary-array-builder.rb b/src/arrow/ruby/red-arrow/lib/arrow/string-dictionary-array-builder.rb
new file mode 100644
index 000000000..fc2f90b80
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/string-dictionary-array-builder.rb
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class StringDictionaryArrayBuilder
+ include SymbolValuesAppendable
+
+ private
+ def create_values_array_builder
+ StringArrayBuilder.new
+ end
+ end
+end
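Because SymbolValuesAppendable (defined later in this patch) stringifies Symbol values before routing them through a plain StringArrayBuilder, symbols and strings can be mixed freely. A minimal sketch, assuming the generic builder finish API:

    builder = Arrow::StringDictionaryArrayBuilder.new
    builder.append_values([:ruby, "arrow", :ruby])
    array = builder.finish # a dictionary array with three entries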
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/struct-array-builder.rb b/src/arrow/ruby/red-arrow/lib/arrow/struct-array-builder.rb
new file mode 100644
index 000000000..ce883166a
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/struct-array-builder.rb
@@ -0,0 +1,146 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class StructArrayBuilder
+ class << self
+ def build(data_type, values)
+ builder = new(data_type)
+ builder.build(values)
+ end
+ end
+
+ def [](index_or_name)
+ find_field_builder(index_or_name)
+ end
+
+ def find_field_builder(index_or_name)
+ case index_or_name
+ when String, Symbol
+ name = index_or_name
+ cached_name_to_builder[name.to_s]
+ else
+ index = index_or_name
+ cached_field_builders[index]
+ end
+ end
+
+ alias_method :append_value_raw, :append_value
+
+ # @overload append_value
+ #
+ #   Starts appending a struct record. You then need to append the
+ #   values of its fields.
+ #
+ # @overload append_value(value)
+ #
+ # Appends a struct record including values of fields.
+ #
+ # @param value [nil, ::Array, Hash] The struct record value.
+ #
+ # If this is `nil`, the struct record is null.
+ #
+ #   If this is an `Array` or a `Hash`, it holds the values of the fields.
+ #
+ # @since 0.12.0
+ def append_value(*args)
+ n_args = args.size
+
+ case n_args
+ when 0
+ append_value_raw
+ when 1
+ value = args[0]
+ case value
+ when nil
+ append_null
+ when ::Array
+ append_value_raw
+ cached_field_builders.zip(value) do |builder, sub_value|
+ builder.append(sub_value)
+ end
+ when Hash
+ append_value_raw
+ local_name_to_builder = cached_name_to_builder.dup
+ value.each do |name, sub_value|
+ builder = local_name_to_builder.delete(name.to_s)
+ builder.append(sub_value)
+ end
+ local_name_to_builder.each do |_, builder|
+ builder.append_null
+ end
+ else
+ message =
+ "struct value must be nil, Array or Hash: #{value.inspect}"
+ raise ArgumentError, message
+ end
+ else
+ message = "wrong number of arguments (given #{n_args}, expected 0..1)"
+ raise ArgumentError, message
+ end
+ end
+
+ def append_values(values, is_valids=nil)
+ if is_valids
+ is_valids.each_with_index do |is_valid, i|
+ if is_valid
+ append_value(values[i])
+ else
+ append_null
+ end
+ end
+ else
+ values.each do |value|
+ append_value(value)
+ end
+ end
+ end
+
+ alias_method :append_null_raw, :append_null
+ def append_null
+ append_null_raw
+ end
+
+ # @since 0.12.0
+ def append(*values)
+ if values.empty?
+ # For backward compatibility
+ append_value_raw
+ else
+ super
+ end
+ end
+
+ private
+ def cached_field_builders
+ @field_builders ||= field_builders
+ end
+
+ def build_name_to_builder
+ name_to_builder = {}
+ builders = cached_field_builders
+ value_data_type.fields.each_with_index do |field, i|
+ name_to_builder[field.name] = builders[i]
+ end
+ name_to_builder
+ end
+
+ def cached_name_to_builder
+ @name_to_builder ||= build_name_to_builder
+ end
+ end
+end
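append_value accepts three shapes: no argument (start a record and fill the field builders yourself), an Array (positional field values), or a Hash (named field values; fields omitted from the Hash are filled with null). A sketch, assuming the struct data type forms documented later in this patch:

    data_type = Arrow::StructDataType.new("visible" => :boolean,
                                          "count" => :int32)
    builder = Arrow::StructArrayBuilder.new(data_type)
    builder.append_value([true, 1])  # positional values
    builder.append_value(count: 2)   # "visible" is filled with null
    builder.append_value(nil)        # a null struct record
    array = builder.finish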
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/struct-array.rb b/src/arrow/ruby/red-arrow/lib/arrow/struct-array.rb
new file mode 100644
index 000000000..0b293dfc1
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/struct-array.rb
@@ -0,0 +1,68 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class StructArray
+ # @param i [Integer]
+ #   The index of the value to be retrieved.
+ #
+ # You can use {Arrow::Array#[]} for convenient value access.
+ #
+ # @return [Hash] The `i`-th struct.
+ def get_value(i)
+ value = {}
+ value_data_type.fields.zip(fields) do |field, field_array|
+ value[field.name] = field_array[i]
+ end
+ value
+ end
+
+ # @overload find_field(index)
+ # @param index [Integer] The index of the field to be found.
+ # @return [Arrow::Array, nil]
+ # The `index`-th field or `nil` for out of range.
+ #
+ # @overload find_field(name)
+ #   @param name [String, Symbol] The name of the field to be found.
+ # @return [Arrow::Array, nil]
+ # The field that has `name` or `nil` for nonexistent name.
+ def find_field(index_or_name)
+ case index_or_name
+ when String, Symbol
+ name = index_or_name
+ (@name_to_field ||= build_name_to_field)[name.to_s]
+ else
+ index = index_or_name
+ fields[index]
+ end
+ end
+
+ alias_method :fields_raw, :fields
+ def fields
+ @fields ||= fields_raw
+ end
+
+ private
+ def build_name_to_field
+ name_to_field = {}
+ value_data_type.fields.zip(fields) do |field, field_array|
+ name_to_field[field.name] = field_array
+ end
+ name_to_field
+ end
+ end
+end
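get_value materializes one record as a Hash keyed by field name, while find_field accepts either an index or a name (cached in @name_to_field). A minimal sketch:

    data_type = Arrow::StructDataType.new("visible" => :boolean,
                                          "count" => :int32)
    array = Arrow::StructArrayBuilder.build(data_type,
                                            [{visible: true, count: 1}])
    array.get_value(0)        # => {"visible" => true, "count" => 1}
    array.find_field("count") # the same field array as array.find_field(1)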
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/struct-data-type.rb b/src/arrow/ruby/red-arrow/lib/arrow/struct-data-type.rb
new file mode 100644
index 000000000..a89a01689
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/struct-data-type.rb
@@ -0,0 +1,128 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class StructDataType
+ include FieldContainable
+
+ alias_method :initialize_raw, :initialize
+ private :initialize_raw
+
+ # Creates a new {Arrow::StructDataType}.
+ #
+ # @overload initialize(fields)
+ #
+ # @param fields [::Array<Arrow::Field, Hash>] The fields of the
+ #     struct data type. You can also specify a field description as
+ #     a field. You can mix {Arrow::Field}s and field descriptions.
+ #
+ #     See {Arrow::Field.new} for how to specify a field description.
+ #
+ # @example Create a struct data type with {Arrow::Field}s
+ # visible_field = Arrow::Field.new("visible", :boolean)
+ # count_field = Arrow::Field.new("count", :int32)
+ # Arrow::StructDataType.new([visible_field, count_field])
+ #
+ # @example Create a struct data type with field descriptions
+ # field_descriptions = [
+ # {name: "visible", type: :boolean},
+ # {name: "count", type: :int32},
+ # ]
+ # Arrow::StructDataType.new(field_descriptions)
+ #
+ # @example Create a struct data type with {Arrow::Field} and field description
+ # fields = [
+ # Arrow::Field.new("visible", :boolean),
+ # {name: "count", type: :int32},
+ # ]
+ # Arrow::StructDataType.new(fields)
+ #
+ # @overload initialize(fields)
+ #
+ # @param fields [Hash{String, Symbol => Arrow::DataType, Hash}]
+ # The pairs of field name and field data type of the struct
+ #     data type. You can also specify a data type description as a
+ #     `Hash`. You can mix {Arrow::DataType}s and data type descriptions.
+ #
+ #     See {Arrow::DataType.resolve} for how to specify a data type
+ #     description.
+ #
+ # @example Create a struct data type with {Arrow::DataType}s
+ # fields = {
+ # "visible" => Arrow::BooleanDataType.new,
+ # "count" => Arrow::Int32DataType.new,
+ # }
+ # Arrow::StructDataType.new(fields)
+ #
+ # @example Create a struct data type with data type descriptions
+ # fields = {
+ # "visible" => :boolean,
+ # "count" => {type: :int32},
+ # }
+ # Arrow::StructDataType.new(fields)
+ #
+ # @example Create a struct data type with {Arrow::DataType} and data type description
+ # fields = {
+ # "visible" => Arrow::BooleanDataType.new,
+ # "count" => {type: :int32},
+ # }
+ # Arrow::StructDataType.new(fields)
+ #
+ # @overload initialize(description)
+ #
+ # @param description [Hash] The description of the struct data
+ # type. It must have `:fields` value.
+ #
+ # @option description
+ # [::Array<Arrow::Field, Hash>,
+ # Hash{String, Symbol => Arrow::DataType, Hash, String, Symbol}]
+ # :fields The fields of the struct data type.
+ #
+ # @example Create a struct data type with {Arrow::Field} and field description
+ # fields = [
+ # Arrow::Field.new("visible", :boolean),
+ # {name: "count", type: :int32},
+ # ]
+ # Arrow::StructDataType.new(fields: fields)
+ #
+ # @example Create a struct data type with {Arrow::DataType} and data type description
+ # fields = {
+ # "visible" => Arrow::BooleanDataType.new,
+ # "count" => {type: :int32},
+ # }
+ # Arrow::StructDataType.new(fields: fields)
+ def initialize(fields)
+ if fields.is_a?(Hash) and fields.key?(:fields)
+ description = fields
+ fields = description[:fields]
+ end
+ if fields.is_a?(Hash)
+ fields = fields.collect do |name, data_type|
+ Field.new(name, data_type)
+ end
+ else
+ fields = fields.collect do |field|
+ field = Field.new(field) unless field.is_a?(Field)
+ field
+ end
+ end
+ initialize_raw(fields)
+ end
+
+ alias_method :[], :find_field
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/symbol-values-appendable.rb b/src/arrow/ruby/red-arrow/lib/arrow/symbol-values-appendable.rb
new file mode 100644
index 000000000..66ab0a490
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/symbol-values-appendable.rb
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ module SymbolValuesAppendable
+ def append_values(values, is_valids=nil)
+ builder = create_values_array_builder
+ values = values.collect do |value|
+ case value
+ when Symbol
+ value.to_s
+ else
+ value
+ end
+ end
+ builder.append_values(values, is_valids)
+ append_array(builder.finish)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/table-concatenate-options.rb b/src/arrow/ruby/red-arrow/lib/arrow/table-concatenate-options.rb
new file mode 100644
index 000000000..730bce1c8
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/table-concatenate-options.rb
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class TableConcatenateOptions
+ class << self
+ # @api private
+ def try_convert(value)
+ case value
+ when Hash
+ options = new
+ value.each do |k, v|
+ options.public_send("#{k}=", v)
+ end
+ options
+ else
+ nil
+ end
+ end
+ end
+ end
+end
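This is the conversion hook that lets table concatenation accept a plain Hash of option names and values; each value is assigned to its writer via public_send. A sketch, assuming the unify_schemas writer exposed by the underlying GLib options object:

    options = Arrow::TableConcatenateOptions.try_convert(unify_schemas: true)
    # Equivalent to:
    #   options = Arrow::TableConcatenateOptions.new
    #   options.unify_schemas = true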
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/table-formatter.rb b/src/arrow/ruby/red-arrow/lib/arrow/table-formatter.rb
new file mode 100644
index 000000000..d039679f9
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/table-formatter.rb
@@ -0,0 +1,190 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ # TODO: Most of this code should be implemented in Apache Arrow C++.
+ class TableFormatter
+ # @private
+ class ColumnFormatter
+ attr_reader :column
+ attr_reader :head_values
+ attr_reader :tail_values
+ attr_reader :sample_values
+ def initialize(column, head_values, tail_values)
+ @column = column
+ @head_values = head_values
+ @tail_values = tail_values
+ @sample_values = head_values + tail_values
+ @field_value_widths = {}
+ end
+
+ def data_type
+ @data_type ||= @column.data_type
+ end
+
+ def name
+ @name ||= @column.name
+ end
+
+ def aligned_name
+ @aligned_name ||= format_aligned_name(name, data_type, @sample_values)
+ end
+
+ FLOAT_N_DIGITS = 10
+ FORMATTED_NULL = "(null)"
+
+ def format_value(value, width=0)
+ case value
+ when ::Time
+ value.iso8601
+ when Float
+ "%*f" % [[width, FLOAT_N_DIGITS].max, value]
+ when Integer
+ "%*d" % [width, value]
+ when Hash
+ formatted_values = data_type.fields.collect do |field|
+ field_name = field.name
+ field_value_width = compute_field_value_width(field, @sample_values)
+ formatted_name = format_value(field_name, 0)
+ formatted_value = format_value(value[field_name], field_value_width)
+ "#{formatted_name}: #{formatted_value}"
+ end
+ formatted = "{"
+ formatted << formatted_values.join(", ")
+ formatted << "}"
+ "%-*s" % [width, formatted]
+ when nil
+ "%*s" % [width, FORMATTED_NULL]
+ else
+ "%-*s" % [width, value.to_s]
+ end
+ end
+
+ private
+ def compute_field_value_width(field, sample_values)
+ unless @field_value_widths.key?(field)
+ field_name = field.name
+ field_sample_values = sample_values.collect do |v|
+ (v || {})[field_name]
+ end
+ field_aligned_name = format_aligned_name("",
+ field.data_type,
+ field_sample_values)
+ @field_value_widths[field] = field_aligned_name.size
+ end
+ @field_value_widths[field]
+ end
+
+ def format_aligned_name(name, data_type, sample_values)
+ case data_type
+ when TimestampDataType
+ "%*s" % [::Time.now.iso8601.size, name]
+ when IntegerDataType
+ have_null = false
+ have_negative = false
+ max_value = nil
+ sample_values.each do |value|
+ if value.nil?
+ have_null = true
+ else
+ if max_value.nil?
+ max_value = value.abs
+ else
+ max_value = [value.abs, max_value].max
+ end
+ have_negative = true if value.negative?
+ end
+ end
+ if max_value.nil?
+ width = 0
+ elsif max_value.zero?
+ width = 1
+ else
+ width = (Math.log10(max_value) + 1).truncate
+ end
+ width += 1 if have_negative # Need "-"
+ width = [width, FORMATTED_NULL.size].max if have_null
+ "%*s" % [width, name]
+ when FloatDataType, DoubleDataType
+ "%*s" % [FLOAT_N_DIGITS, name]
+ when StructDataType
+ field_widths = data_type.fields.collect do |field|
+ field_value_width = compute_field_value_width(field, sample_values)
+ field.name.size + ": ".size + field_value_width
+ end
+ width = "{}".size + field_widths.sum
+ if field_widths.size > 0
+ width += (", ".size * (field_widths.size - 1))
+ end
+ "%*s" % [width, name]
+ else
+ name
+ end
+ end
+ end
+
+ def initialize(table, options={})
+ @table = table
+ @options = options
+ end
+
+ def format
+ text = ""
+ n_rows = @table.n_rows
+ border = @options[:border] || 10
+
+ head_limit = [border, n_rows].min
+
+ tail_start = [border, n_rows - border].max
+ tail_limit = n_rows - tail_start
+
+ column_formatters = @table.columns.collect do |column|
+ head_values = column.each.take(head_limit)
+ if tail_limit > 0
+ tail_values = column.reverse_each.take(tail_limit).reverse
+ else
+ tail_values = []
+ end
+ ColumnFormatter.new(column, head_values, tail_values)
+ end
+
+ format_header(text, column_formatters)
+ return text if n_rows.zero?
+
+ n_digits = (Math.log10(n_rows) + 1).truncate
+ format_rows(text,
+ column_formatters,
+ column_formatters.collect(&:head_values).transpose,
+ n_digits,
+ 0)
+ return text if n_rows <= border
+
+ if head_limit != tail_start
+ format_ellipsis(text)
+ end
+
+ format_rows(text,
+ column_formatters,
+ column_formatters.collect(&:tail_values).transpose,
+ n_digits,
+ tail_start)
+
+ text
+ end
+ end
+end
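format prints up to border head rows and, when the table is longer, the matching tail rows with an ellipsis between them; ColumnFormatter right-aligns numeric columns based on the widest sampled value. A usage sketch via Table#to_s, which is defined later in this patch and forwards its options here:

    table = Arrow::Table.new("n" => (1..25).to_a)
    puts table.to_s            # default border: first 10 and last 10 rows
    puts table.to_s(border: 3) # first 3 and last 3 rows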
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/table-list-formatter.rb b/src/arrow/ruby/red-arrow/lib/arrow/table-list-formatter.rb
new file mode 100644
index 000000000..4fe293416
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/table-list-formatter.rb
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ # TODO: Most of this code should be implemented in Apache Arrow C++.
+ class TableListFormatter < TableFormatter
+ private
+ def format_header(text, columns)
+ end
+
+ def format_rows(text, column_formatters, rows, n_digits, start_offset)
+ rows.each_with_index do |row, nth_row|
+ text << ("=" * 20 + " #{start_offset + nth_row} " + "=" * 20 + "\n")
+ row.each_with_index do |column_value, nth_column|
+ column_formatter = column_formatters[nth_column]
+ formatted_name = column_formatter.name
+ formatted_value = column_formatter.format_value(column_value)
+ text << "#{formatted_name}: #{formatted_value}\n"
+ end
+ end
+ end
+
+ def format_ellipsis(text)
+ text << "...\n"
+ end
+ end
+end
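The list formatter emits one name: value block per record, separated by numbered rules, and is selected with format: :list. The shape of its output:

    table = Arrow::Table.new("count" => [1, 2])
    puts table.to_s(format: :list)
    # ==================== 0 ====================
    # count: 1
    # ==================== 1 ====================
    # count: 2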
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/table-loader.rb b/src/arrow/ruby/red-arrow/lib/arrow/table-loader.rb
new file mode 100644
index 000000000..8f43b69df
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/table-loader.rb
@@ -0,0 +1,225 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "uri"
+
+module Arrow
+ class TableLoader
+ class << self
+ def load(input, options={})
+ new(input, options).load
+ end
+ end
+
+ def initialize(input, options={})
+ input = input.to_path if input.respond_to?(:to_path)
+ @input = input
+ @options = options
+ fill_options
+ end
+
+ def load
+ if @input.is_a?(URI)
+ custom_load_method = "load_from_uri"
+ elsif @input.is_a?(String) and ::File.directory?(@input)
+ custom_load_method = "load_from_directory"
+ else
+ custom_load_method = "load_from_file"
+ end
+ unless respond_to?(custom_load_method, true)
+ available_schemes = []
+ (methods(true) | private_methods(true)).each do |name|
+ match_data = /\Aload_from_/.match(name.to_s)
+ if match_data
+ available_schemes << match_data.post_match
+ end
+ end
+ message = "Arrow::Table load source must be one of ["
+ message << available_schemes.join(", ")
+ message << "]: #{@input.inspect}"
+ raise ArgumentError, message
+ end
+ __send__(custom_load_method)
+ end
+
+ private
+ def load_from_file
+ format = @options[:format]
+ custom_load_method = "load_as_#{format}"
+ unless respond_to?(custom_load_method, true)
+ available_formats = []
+ (methods(true) | private_methods(true)).each do |name|
+ match_data = /\Aload_as_/.match(name.to_s)
+ if match_data
+ available_formats << match_data.post_match
+ end
+ end
+ deprecated_formats = ["batch", "stream"]
+ available_formats -= deprecated_formats
+ message = "Arrow::Table load format must be one of ["
+ message << available_formats.join(", ")
+ message << "]: #{format.inspect}"
+ raise ArgumentError, message
+ end
+ if method(custom_load_method).arity.zero?
+ __send__(custom_load_method)
+ else
+ # For backward compatibility.
+ __send__(custom_load_method, @input)
+ end
+ end
+
+ def fill_options
+ if @options[:format] and @options.key?(:compression)
+ return
+ end
+
+ case @input
+ when Buffer
+ info = {}
+ when URI
+ extension = PathExtension.new(@input.path)
+ info = extension.extract
+ else
+ extension = PathExtension.new(@input)
+ info = extension.extract
+ end
+ format = info[:format]
+ @options = @options.dup
+ if format
+ @options[:format] ||= format.to_sym
+ else
+ @options[:format] ||= :arrow
+ end
+ unless @options.key?(:compression)
+ @options[:compression] = info[:compression]
+ end
+ end
+
+ def open_input_stream
+ if @input.is_a?(Buffer)
+ BufferInputStream.new(@input)
+ else
+ MemoryMappedInputStream.new(@input)
+ end
+ end
+
+ def load_raw(input, reader)
+ schema = reader.schema
+ record_batches = []
+ reader.each do |record_batch|
+ record_batches << record_batch
+ end
+ table = Table.new(schema, record_batches)
+ table.instance_variable_set(:@input, input)
+ table
+ end
+
+ def load_as_arrow
+ input = nil
+ reader = nil
+ error = nil
+ reader_class_candidates = [
+ RecordBatchFileReader,
+ RecordBatchStreamReader,
+ ]
+ reader_class_candidates.each do |reader_class_candidate|
+ input = open_input_stream
+ begin
+ reader = reader_class_candidate.new(input)
+ rescue Arrow::Error
+ error = $!
+ else
+ break
+ end
+ end
+ raise error if reader.nil?
+ load_raw(input, reader)
+ end
+
+ # @since 1.0.0
+ def load_as_arrow_file
+ input = open_input_stream
+ reader = RecordBatchFileReader.new(input)
+ load_raw(input, reader)
+ end
+
+ # @deprecated Use `format: :arrow_file` instead.
+ def load_as_batch
+ load_as_arrow_file
+ end
+
+ # @since 1.0.0
+ def load_as_arrow_streaming
+ input = open_input_stream
+ reader = RecordBatchStreamReader.new(input)
+ load_raw(input, reader)
+ end
+
+ # @deprecated Use `format: :arrow_streaming` instead.
+ def load_as_stream
+ load_as_arrow_streaming
+ end
+
+ if Arrow.const_defined?(:ORCFileReader)
+ def load_as_orc
+ input = open_input_stream
+ reader = ORCFileReader.new(input)
+ field_indexes = @options[:field_indexes]
+ reader.set_field_indexes(field_indexes) if field_indexes
+ table = reader.read_stripes
+ table.instance_variable_set(:@input, input)
+ table
+ end
+ end
+
+ def csv_load(options)
+ options.delete(:format)
+ if @input.is_a?(Buffer)
+ CSVLoader.load(@input.data.to_s, **options)
+ else
+ CSVLoader.load(Pathname.new(@input), **options)
+ end
+ end
+
+ def load_as_csv
+ csv_load(@options.dup)
+ end
+
+ def load_as_tsv
+ options = @options.dup
+ options[:delimiter] = "\t"
+ csv_load(options)
+ end
+
+ def load_as_feather
+ input = open_input_stream
+ reader = FeatherFileReader.new(input)
+ table = reader.read
+ table.instance_variable_set(:@input, input)
+ table
+ end
+
+ def load_as_json
+ input = open_input_stream
+ reader = JSONReader.new(input)
+ table = reader.read
+ table.instance_variable_set(:@input, input)
+ table
+ end
+ end
+end
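fill_options infers :format and :compression from the path extension (via PathExtension, defined elsewhere in this library), so most loads need no explicit options; unknown extensions fall back to :arrow. A sketch:

    # Format (CSV) and compression (gzip) inferred from the extension:
    table = Arrow::Table.load("data.csv.gz")
    # Explicit format for an extension-less path; TSV is CSV with "\t":
    table = Arrow::Table.load("dump", format: :tsv)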
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/table-saver.rb b/src/arrow/ruby/red-arrow/lib/arrow/table-saver.rb
new file mode 100644
index 000000000..207a10a82
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/table-saver.rb
@@ -0,0 +1,195 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class TableSaver
+ class << self
+ def save(table, output, options={})
+ new(table, output, options).save
+ end
+ end
+
+ def initialize(table, output, options={})
+ @table = table
+ output = output.to_path if output.respond_to?(:to_path)
+ @output = output
+ @options = options
+ fill_options
+ end
+
+ def save
+ if @output.is_a?(URI)
+ custom_save_method = "save_to_uri"
+ else
+ custom_save_method = "save_to_file"
+ end
+ unless respond_to?(custom_save_method, true)
+ available_schemes = []
+ (methods(true) | private_methods(true)).each do |name|
+ match_data = /\Asave_to_/.match(name.to_s)
+ if match_data
+ available_schemes << match_data.post_match
+ end
+ end
+ message = "Arrow::Table save destination must be one of ["
+ message << available_schemes.join(", ")
+ message << "]: #{@output.scheme.inspect}"
+ raise ArgumentError, message
+ end
+ __send__(custom_save_method)
+ end
+
+ private
+ def save_to_file
+ format = @options[:format]
+ custom_save_method = "save_as_#{format}"
+ unless respond_to?(custom_save_method, true)
+ available_formats = []
+ (methods(true) | private_methods(true)).each do |name|
+ match_data = /\Asave_as_/.match(name.to_s)
+ if match_data
+ available_formats << match_data.post_match
+ end
+ end
+ deprecated_formats = ["batch", "stream"]
+ available_formats -= deprecated_formats
+ message = "Arrow::Table save format must be one of ["
+ message << available_formats.join(", ")
+ message << "]: #{format.inspect}"
+ raise ArgumentError, message
+ end
+ if method(custom_save_method).arity.zero?
+ __send__(custom_save_method)
+ else
+ # For backward compatibility.
+ __send__(custom_save_method, @output)
+ end
+ end
+
+ def fill_options
+ if @options[:format] and @options.key?(:compression)
+ return
+ end
+
+ case @output
+ when Buffer
+ info = {}
+ when URI
+ extension = PathExtension.new(@output.path)
+ info = extension.extract
+ else
+ extension = PathExtension.new(@output)
+ info = extension.extract
+ end
+ format = info[:format]
+ @options = @options.dup
+ if format
+ @options[:format] ||= format.to_sym
+ else
+ @options[:format] ||= :arrow
+ end
+ unless @options.key?(:compression)
+ @options[:compression] = info[:compression]
+ end
+ end
+
+ def open_raw_output_stream(&block)
+ if @output.is_a?(Buffer)
+ BufferOutputStream.open(@output, &block)
+ else
+ FileOutputStream.open(@output, false, &block)
+ end
+ end
+
+ def open_output_stream(&block)
+ compression = @options[:compression]
+ if compression
+ codec = Codec.new(compression)
+ open_raw_output_stream do |raw_output|
+ CompressedOutputStream.open(codec, raw_output) do |output|
+ yield(output)
+ end
+ end
+ else
+ open_raw_output_stream(&block)
+ end
+ end
+
+ def save_raw(writer_class)
+ open_output_stream do |output|
+ writer_class.open(output, @table.schema) do |writer|
+ writer.write_table(@table)
+ end
+ end
+ end
+
+ def save_as_arrow
+ save_as_arrow_file
+ end
+
+ # @since 1.0.0
+ def save_as_arrow_file
+ save_raw(RecordBatchFileWriter)
+ end
+
+ # @deprecated Use `format: :arrow_file` instead.
+ def save_as_batch
+ save_as_arrow_file
+ end
+
+ # @since 1.0.0
+ def save_as_arrow_streaming
+ save_raw(RecordBatchStreamWriter)
+ end
+
+ # @deprecated Use `format: :arrow_streaming` instead.
+ def save_as_stream
+ save_as_arrow_streaming
+ end
+
+ def csv_save(**options)
+ open_output_stream do |output|
+ csv = CSV.new(output, **options)
+ names = @table.schema.fields.collect(&:name)
+ csv << names
+ @table.raw_records.each do |record|
+ csv << record
+ end
+ end
+ end
+
+ def save_as_csv
+ csv_save
+ end
+
+ def save_as_tsv
+ csv_save(col_sep: "\t")
+ end
+
+ def save_as_feather
+ properties = FeatherWriteProperties.new
+ properties.class.properties.each do |name|
+ value = @options[name.to_sym]
+ next if value.nil?
+ properties.__send__("#{name}=", value)
+ end
+ open_raw_output_stream do |output|
+ @table.write_as_feather(output, properties)
+ end
+ end
+ end
+end
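Saving mirrors loading: the writer is chosen by save_as_#{format}, and outputs other than Feather are wrapped in a CompressedOutputStream when a codec is inferred or given. A sketch, assuming an existing Arrow::Table in table:

    table.save("data.feather") # format inferred from the extension
    table.save("data.csv.gz")  # CSV written through a gzip stream
    table.save("data.arrows", format: :arrow_streaming) # explicit format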
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/table-table-formatter.rb b/src/arrow/ruby/red-arrow/lib/arrow/table-table-formatter.rb
new file mode 100644
index 000000000..36121e1b6
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/table-table-formatter.rb
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "time"
+
+module Arrow
+ # TODO: Most of this code should be implemented in Apache Arrow C++.
+ class TableTableFormatter < TableFormatter
+ private
+ def format_header(text, column_formatters)
+ column_formatters.each do |column_formatter|
+ text << "\t"
+ text << column_formatter.aligned_name
+ end
+ text << "\n"
+ end
+
+ def format_rows(text, column_formatters, rows, n_digits, start_offset)
+ rows.each_with_index do |row, nth_row|
+ text << ("%*d" % [n_digits, start_offset + nth_row])
+ row.each_with_index do |column_value, nth_column|
+ text << "\t"
+ column_formatter = column_formatters[nth_column]
+ aligned_name = column_formatter.aligned_name
+ text << column_formatter.format_value(column_value, aligned_name.size)
+ end
+ text << "\n"
+ end
+ end
+
+ def format_ellipsis(text)
+ text << "...\n"
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/table.rb b/src/arrow/ruby/red-arrow/lib/arrow/table.rb
new file mode 100644
index 000000000..e8aa39bac
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/table.rb
@@ -0,0 +1,519 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "arrow/raw-table-converter"
+
+module Arrow
+ class Table
+ include ColumnContainable
+ include GenericFilterable
+ include GenericTakeable
+ include RecordContainable
+
+ class << self
+ def load(path, options={})
+ TableLoader.load(path, options)
+ end
+ end
+
+ alias_method :initialize_raw, :initialize
+ private :initialize_raw
+
+ # Creates a new {Arrow::Table}.
+ #
+ # @overload initialize(columns)
+ #
+ # @param columns [::Array<Arrow::Column>] The columns of the table.
+ #
+ # @example Create a table from columns
+ # count_field = Arrow::Field.new("count", :uint32)
+ # count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
+ # count_column = Arrow::Column.new(count_field, count_array)
+ # visible_field = Arrow::Field.new("visible", :boolean)
+ # visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
+ # visible_column = Arrow::Column.new(visible_field, visible_array)
+ # Arrow::Table.new([count_column, visible_column])
+ #
+ # @overload initialize(raw_table)
+ #
+ # @param raw_table [Hash<String, Arrow::Array>]
+ #     The pairs of column name and values of the table. Each column's
+ #     values are given as an `Arrow::Array`.
+ #
+ # @example Create a table from column name and values
+ # Arrow::Table.new("count" => Arrow::UInt32Array.new([0, 2, nil, 4]),
+ # "visible" => Arrow::BooleanArray.new([true, nil, nil, false]))
+ #
+ # @overload initialize(raw_table)
+ #
+ # @param raw_table [Hash<String, Arrow::ChunkedArray>]
+ #     The pairs of column name and values of the table. Each column's
+ #     values are given as an `Arrow::ChunkedArray`.
+ #
+ # @example Create a table from column name and values
+ # count_chunks = [
+ # Arrow::UInt32Array.new([0, 2]),
+ # Arrow::UInt32Array.new([nil, 4]),
+ # ]
+ # visible_chunks = [
+ # Arrow::BooleanArray.new([true]),
+ # Arrow::BooleanArray.new([nil, nil, false]),
+ # ]
+ # Arrow::Table.new("count" => Arrow::ChunkedArray.new(count_chunks),
+ # "visible" => Arrow::ChunkedArray.new(visible_chunks))
+ #
+ # @overload initialize(raw_table)
+ #
+ # @param raw_table [Hash<String, ::Array>]
+ #     The pairs of column name and values of the table. Each column's
+ #     values are given as a plain `Array`.
+ #
+ # @example Create a table from column name and values
+ # Arrow::Table.new("count" => [0, 2, nil, 4],
+ # "visible" => [true, nil, nil, false])
+ #
+ # @overload initialize(schema, columns)
+ #
+ # @param schema [Arrow::Schema] The schema of the table.
+ # You can also specify schema as primitive Ruby objects.
+ # See {Arrow::Schema#initialize} for details.
+ #
+ # @param columns [::Array<Arrow::Column>] The data of the table.
+ #
+ # @example Create a table from schema and columns
+ # count_field = Arrow::Field.new("count", :uint32)
+ # count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
+ # count_column = Arrow::Column.new(count_field, count_array)
+ # visible_field = Arrow::Field.new("visible", :boolean)
+ # visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
+ # visible_column = Arrow::Column.new(visible_field, visible_array)
+ # Arrow::Table.new(Arrow::Schema.new([count_field, visible_field]),
+ # [count_column, visible_column])
+ #
+ # @overload initialize(schema, arrays)
+ #
+ # @param schema [Arrow::Schema] The schema of the table.
+ # You can also specify schema as primitive Ruby objects.
+ # See {Arrow::Schema#initialize} for details.
+ #
+ # @param arrays [::Array<Arrow::Array>] The data of the table.
+ #
+ # @example Create a table from schema and arrays
+ # count_field = Arrow::Field.new("count", :uint32)
+ # count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
+ # visible_field = Arrow::Field.new("visible", :boolean)
+ # visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
+ # Arrow::Table.new(Arrow::Schema.new([count_field, visible_field]),
+ # [count_array, visible_array])
+ #
+ # @overload initialize(schema, record_batches)
+ #
+ # @param schema [Arrow::Schema] The schema of the table.
+ # You can also specify schema as primitive Ruby objects.
+ # See {Arrow::Schema#initialize} for details.
+ #
+ #   @param record_batches [::Array<Arrow::RecordBatch>] The data of the table.
+ #
+ # @example Create a table from schema and record batches
+ # count_field = Arrow::Field.new("count", :uint32)
+ # visible_field = Arrow::Field.new("visible", :boolean)
+ # schema = Arrow::Schema.new([count_field, visible_field])
+ # record_batches = [
+ # Arrow::RecordBatch.new(schema, [[0, true], [2, nil], [nil, nil]]),
+ # Arrow::RecordBatch.new(schema, [[4, false]]),
+ # ]
+ # Arrow::Table.new(schema, record_batches)
+ #
+ # @overload initialize(schema, raw_records)
+ #
+ # @param schema [Arrow::Schema] The schema of the table.
+ # You can also specify schema as primitive Ruby objects.
+ # See {Arrow::Schema#initialize} for details.
+ #
+ #   @param raw_records [::Array<::Array>] The data of the table as primitive
+ # Ruby objects.
+ #
+ # @example Create a table from schema and raw records
+ # schema = {
+ # count: :uint32,
+ # visible: :boolean,
+ # }
+ # raw_records = [
+ # [0, true],
+ # [2, nil],
+ # [nil, nil],
+ # [4, false],
+ # ]
+ # Arrow::Table.new(schema, raw_records)
+ def initialize(*args)
+ n_args = args.size
+ case n_args
+ when 1
+ raw_table_converter = RawTableConverter.new(args[0])
+ schema = raw_table_converter.schema
+ values = raw_table_converter.values
+ when 2
+ schema = args[0]
+ schema = Schema.new(schema) unless schema.is_a?(Schema)
+ values = args[1]
+ case values[0]
+ when ::Array
+ values = [RecordBatch.new(schema, values)]
+ when Column
+ values = values.collect(&:data)
+ end
+ else
+ message = "wrong number of arguments (given #{n_args}, expected 1..2)"
+ raise ArgumentError, message
+ end
+ initialize_raw(schema, values)
+ end
+
+ def each_record_batch
+ return to_enum(__method__) unless block_given?
+
+ reader = TableBatchReader.new(self)
+ while record_batch = reader.read_next
+ yield(record_batch)
+ end
+ end
+
+ alias_method :size, :n_rows
+ alias_method :length, :n_rows
+
+ alias_method :slice_raw, :slice
+
+ # @overload slice(offset, length)
+ #
+ #   @param offset [Integer] The offset of the sub Arrow::Table.
+ #   @param length [Integer] The length of the sub Arrow::Table.
+ #   @return [Arrow::Table]
+ #     The sub `Arrow::Table` that covers only the
+ #     `offset` to `offset + length` range.
+ #
+ # @overload slice(index)
+ #
+ # @param index [Integer] The index in this table.
+ # @return [Arrow::Record]
+ #     The `Arrow::Record` at the given index in
+ #     the table.
+ #
+ # @overload slice(booleans)
+ #
+ # @param booleans [::Array<Boolean>]
+ # The values indicating the target rows.
+ # @return [Arrow::Table]
+ #     The sub `Arrow::Table` that covers only the rows whose
+ #     corresponding value in `booleans` is true.
+ #
+ # @overload slice(boolean_array)
+ #
+ #   @param boolean_array [Arrow::BooleanArray]
+ #     The values indicating the target rows.
+ #   @return [Arrow::Table]
+ #     The sub `Arrow::Table` that covers only the rows whose
+ #     corresponding value in `boolean_array` is true.
+ #
+ # @overload slice(range)
+ #
+ #   @param range [Range] The range indicating the target rows.
+ # @return [Arrow::Table]
+ # The sub `Arrow::Table` that covers only rows of the range of indices.
+ #
+ # @overload slice(conditions)
+ #
+ # @param conditions [Hash] The conditions to select records.
+ # @return [Arrow::Table]
+ #     The sub `Arrow::Table` that covers only the rows matched by the conditions.
+ #
+ # @overload slice
+ #
+ #   @yield [slicer] Gives a slicer that constructs a condition to select records.
+ #   @yieldparam slicer [Arrow::Slicer] The slicer that helps us
+ #     build a condition.
+ # @yieldreturn [Arrow::Slicer::Condition, ::Array<Arrow::Slicer::Condition>]
+ # The condition to select records.
+ # @return [Arrow::Table]
+ #     The sub `Arrow::Table` that covers only the rows matched by the
+ #     condition specified by the slicer.
+ def slice(*args)
+ slicers = []
+ if block_given?
+ unless args.empty?
+ raise ArgumentError, "must not specify both arguments and block"
+ end
+ block_slicer = yield(Slicer.new(self))
+ case block_slicer
+ when ::Array
+ slicers.concat(block_slicer)
+ else
+ slicers << block_slicer
+ end
+ else
+ expected_n_args = nil
+ case args.size
+ when 1
+ case args[0]
+ when Integer
+ index = args[0]
+ index += n_rows if index < 0
+ return nil if index < 0
+ return nil if index >= n_rows
+ return Record.new(self, index)
+ when Hash
+ condition_pairs = args[0]
+ slicer = Slicer.new(self)
+ conditions = []
+ condition_pairs.each do |key, value|
+ case value
+ when Range
+ # TODO: Optimize the "begin <= key <= end" case once the missing "between" kernel is implemented:
+ # https://issues.apache.org/jira/browse/ARROW-9843
+ unless value.begin.nil?
+ conditions << (slicer[key] >= value.begin)
+ end
+ unless value.end.nil?
+ if value.exclude_end?
+ conditions << (slicer[key] < value.end)
+ else
+ conditions << (slicer[key] <= value.end)
+ end
+ end
+ else
+ conditions << (slicer[key] == value)
+ end
+ end
+ slicers << conditions.inject(:&)
+ else
+ slicers << args[0]
+ end
+ when 2
+ offset, length = args
+ slicers << (offset...(offset + length))
+ else
+ expected_n_args = "1..2"
+ end
+ if expected_n_args
+ message = "wrong number of arguments " +
+ "(given #{args.size}, expected #{expected_n_args})"
+ raise ArgumentError, message
+ end
+ end
+
+ filter_options = Arrow::FilterOptions.new
+ filter_options.null_selection_behavior = :emit_null
+ sliced_tables = []
+ slicers.each do |slicer|
+ slicer = slicer.evaluate if slicer.respond_to?(:evaluate)
+ case slicer
+ when Integer
+ slicer += n_rows if slicer < 0
+ sliced_tables << slice_by_range(slicer, n_rows - 1)
+ when Range
+ original_from = from = slicer.first
+ to = slicer.last
+ to -= 1 if slicer.exclude_end?
+ from += n_rows if from < 0
+ if from < 0 or from >= n_rows
+ message =
+ "offset is out of range (-#{n_rows + 1},#{n_rows}): " +
+ "#{original_from}"
+ raise ArgumentError, message
+ end
+ to += n_rows if to < 0
+ sliced_tables << slice_by_range(from, to)
+ when ::Array, BooleanArray, ChunkedArray
+ sliced_tables << filter(slicer, filter_options)
+ else
+ message = "slicer must be Integer, Range, (from, to), " +
+ "Arrow::ChunkedArray of Arrow::BooleanArray, " +
+ "Arrow::BooleanArray or Arrow::Slicer::Condition: #{slicer.inspect}"
+ raise ArgumentError, message
+ end
+ end
+ if sliced_tables.size > 1
+ sliced_tables[0].concatenate(sliced_tables[1..-1])
+ else
+ sliced_tables[0]
+ end
+ end
+
+ # TODO
+ #
+ # @return [Arrow::Table]
+ def merge(other)
+ added_columns = {}
+ removed_columns = {}
+
+ case other
+ when Hash
+ other.each do |name, value|
+ name = name.to_s
+ if value
+ added_columns[name] = ensure_raw_column(name, value)
+ else
+ removed_columns[name] = true
+ end
+ end
+ when Table
+ added_columns = {}
+ other.columns.each do |column|
+ name = column.name
+ added_columns[name] = ensure_raw_column(name, column)
+ end
+ else
+ message = "merge target must be Hash or Arrow::Table: " +
+ "<#{other.inspect}>: #{inspect}"
+ raise ArgumentError, message
+ end
+
+ new_columns = []
+ columns.each do |column|
+ column_name = column.name
+ new_column = added_columns.delete(column_name)
+ if new_column
+ new_columns << new_column
+ next
+ end
+ next if removed_columns.key?(column_name)
+ new_columns << ensure_raw_column(column_name, column)
+ end
+ added_columns.each do |name, new_column|
+ new_columns << new_column
+ end
+ new_fields = []
+ new_arrays = []
+ new_columns.each do |new_column|
+ new_fields << new_column[:field]
+ new_arrays << new_column[:data]
+ end
+ self.class.new(new_fields, new_arrays)
+ end
+
+ alias_method :remove_column_raw, :remove_column
+ def remove_column(name_or_index)
+ case name_or_index
+ when String, Symbol
+ name = name_or_index.to_s
+ index = columns.index {|column| column.name == name}
+ if index.nil?
+ message = "unknown column: #{name_or_index.inspect}: #{inspect}"
+ raise KeyError.new(message)
+ end
+ else
+ index = name_or_index
+ index += n_columns if index < 0
+ if index < 0 or index >= n_columns
+ message = "index out of range (0..#{n_columns - 1}): " +
+ "#{name_or_index.inspect}: #{inspect}"
+ raise IndexError.new(message)
+ end
+ end
+ remove_column_raw(index)
+ end
+
+ # Experimental
+ def group(*keys)
+ Group.new(self, keys)
+ end
+
+ # Experimental
+ def window(size: nil)
+ RollingWindow.new(self, size)
+ end
+
+ def save(output, options={})
+ saver = TableSaver.new(self, output, options)
+ saver.save
+ end
+
+ def pack
+ packed_arrays = columns.collect do |column|
+ column.data.pack
+ end
+ self.class.new(schema, packed_arrays)
+ end
+
+ alias_method :to_s_raw, :to_s
+ def to_s(options={})
+ format = options[:format]
+ case format
+ when :column
+ return to_s_raw
+ when :list
+ formatter_class = TableListFormatter
+ when :table, nil
+ formatter_class = TableTableFormatter
+ else
+ message = ":format must be :column, :list, :table or nil"
+ raise ArgumentError, "#{message}: <#{format.inspect}>"
+ end
+ formatter = formatter_class.new(self, options)
+ formatter.format
+ end
+
+ alias_method :inspect_raw, :inspect
+ def inspect
+ "#{super}\n#{to_s}"
+ end
+
+ def respond_to_missing?(name, include_private)
+ return true if find_column(name)
+ super
+ end
+
+ def method_missing(name, *args, &block)
+ if args.empty?
+ column = find_column(name)
+ return column if column
+ end
+ super
+ end
+
+ private
+ def slice_by_range(from, to)
+ slice_raw(from, to - from + 1)
+ end
+
+ def ensure_raw_column(name, data)
+ case data
+ when Array
+ {
+ field: Field.new(name, data.value_data_type),
+ data: ChunkedArray.new([data]),
+ }
+ when ChunkedArray
+ {
+ field: Field.new(name, data.value_data_type),
+ data: data,
+ }
+ when Column
+ column = data
+ data = column.data
+ data = ChunkedArray.new([data]) unless data.is_a?(ChunkedArray)
+ {
+ field: column.field,
+ data: data,
+ }
+ else
+ message = "column must be Arrow::Array or Arrow::Column: " +
+ "<#{name}>: <#{data.inspect}>: #{inspect}"
+ raise ArgumentError, message
+ end
+ end
+ end
+end
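slice is heavily overloaded: a Hash builds Slicer conditions (a Range value becomes a >=/<= pair), a boolean mask goes through filter with :emit_null behavior, and a block receives an Arrow::Slicer. A few usage sketches drawn from the overloads documented above:

    table = Arrow::Table.new("count" => [1, 2, 3, 4])
    table.slice(1)                    # => the second Arrow::Record
    table.slice(1, 2)                 # rows 1..2 as a sub-table
    table.slice(count: 2..3)          # slicer["count"] >= 2 and <= 3
    table.slice {|s| s[:count] > 2}   # block form with Arrow::Slicer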
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/tensor.rb b/src/arrow/ruby/red-arrow/lib/arrow/tensor.rb
new file mode 100644
index 000000000..fdcc6c1ae
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/tensor.rb
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Tensor
+ def to_arrow
+ self
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/time.rb b/src/arrow/ruby/red-arrow/lib/arrow/time.rb
new file mode 100644
index 000000000..3d25a6403
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/time.rb
@@ -0,0 +1,159 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Time
+ attr_reader :unit
+ attr_reader :value
+ def initialize(unit, value)
+ @unit = unit
+ @value = value
+ @unconstructed = false
+ end
+
+ def ==(other)
+ other.is_a?(self.class) and
+ positive? == other.positive? and
+ hour == other.hour and
+ minute == other.minute and
+ second == other.second and
+ nano_second == other.nano_second
+ end
+
+ def cast(target_unit)
+ return self.class.new(@unit, @value) if @unit == target_unit
+
+ target_value = (hour * 60 * 60) + (minute * 60) + second
+ case target_unit
+ when TimeUnit::MILLI
+ target_value *= 1000
+ target_value += nano_second / 1000 / 1000
+ when TimeUnit::MICRO
+ target_value *= 1000 * 1000
+ target_value += nano_second / 1000
+ when TimeUnit::NANO
+ target_value *= 1000 * 1000 * 1000
+ target_value += nano_second
+ end
+ target_value = -target_value if negative?
+ self.class.new(target_unit, target_value)
+ end
+
+ def to_f
+ case @unit
+ when TimeUnit::SECOND
+ @value.to_f
+ when TimeUnit::MILLI
+ @value.to_f / 1000.0
+ when TimeUnit::MICRO
+ @value.to_f / 1000.0 / 1000.0
+ when TimeUnit::NANO
+ @value.to_f / 1000.0 / 1000.0 / 1000.0
+ end
+ end
+
+ def positive?
+ @value.positive?
+ end
+
+ def negative?
+ @value.negative?
+ end
+
+ def hour
+ unconstruct
+ @hour
+ end
+
+ def minute
+ unconstruct
+ @minute
+ end
+ alias_method :min, :minute
+
+ def second
+ unconstruct
+ @second
+ end
+ alias_method :sec, :second
+
+ def nano_second
+ unconstruct
+ @nano_second
+ end
+ alias_method :nsec, :nano_second
+
+ def to_s
+ unconstruct
+ if @nano_second.zero?
+ nano_second_string = ""
+ else
+ nano_second_string = (".%09d" % @nano_second).gsub(/0+\z/, "")
+ end
+ "%s%02d:%02d:%02d%s" % [
+ @value.negative? ? "-" : "",
+ @hour,
+ @minute,
+ @second,
+ nano_second_string,
+ ]
+ end
+
+ private
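+    # Lazily decomposes @value into @hour, @minute, @second and
+    # @nano_second according to the unit; runs at most once.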
+ def unconstruct
+ return if @unconstructed
+ abs_value = @value.abs
+ case unit
+ when TimeUnit::SECOND
+ unconstruct_second(abs_value)
+ @nano_second = 0
+ when TimeUnit::MILLI
+ unconstruct_second(abs_value / 1000)
+ @nano_second = (abs_value % 1000) * 1000 * 1000
+ when TimeUnit::MICRO
+ unconstruct_second(abs_value / 1000 / 1000)
+ @nano_second = (abs_value % (1000 * 1000)) * 1000
+ when TimeUnit::NANO
+ unconstruct_second(abs_value / 1000 / 1000 / 1000)
+ @nano_second = abs_value % (1000 * 1000 * 1000)
+ else
+ raise ArgumentError, "invalid unit: #{@unit.inspect}"
+ end
+ @unconstructed = true
+ end
+
+ def unconstruct_second(abs_value_in_second)
+ if abs_value_in_second < 60
+ hour = 0
+ minute = 0
+ second = abs_value_in_second
+ elsif abs_value_in_second < (60 * 60)
+ hour = 0
+ minute = abs_value_in_second / 60
+ second = abs_value_in_second % 60
+ else
+ in_minute = abs_value_in_second / 60
+ hour = in_minute / 60
+ minute = in_minute % 60
+ second = abs_value_in_second % 60
+ end
+ @hour = hour
+ @minute = minute
+ @second = second
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/time32-array-builder.rb b/src/arrow/ruby/red-arrow/lib/arrow/time32-array-builder.rb
new file mode 100644
index 000000000..088f37c4e
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/time32-array-builder.rb
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Time32ArrayBuilder
+ class << self
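+      # Convenience constructor. Values may be Arrow::Time objects or
+      # raw integers in the given unit. Illustrative:
+      #
+      #   Arrow::Time32ArrayBuilder.build(:second, [0, 60, nil])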
+ def build(unit_or_data_type, values)
+ builder = new(unit_or_data_type)
+ builder.build(values)
+ end
+ end
+
+ alias_method :initialize_raw, :initialize
+ def initialize(unit_or_data_type)
+ case unit_or_data_type
+ when DataType
+ data_type = unit_or_data_type
+ else
+ unit = unit_or_data_type
+ data_type = Time32DataType.new(unit)
+ end
+ initialize_raw(data_type)
+ end
+
+ def unit
+ @unit ||= value_data_type.unit
+ end
+
+ private
+ def convert_to_arrow_value(value)
+ return value unless value.is_a?(Time)
+ value.cast(unit).value
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/time32-array.rb b/src/arrow/ruby/red-arrow/lib/arrow/time32-array.rb
new file mode 100644
index 000000000..e01dd9732
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/time32-array.rb
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Time32Array
+ def get_value(i)
+ Time.new(unit, get_raw_value(i))
+ end
+
+ def unit
+ @unit ||= value_data_type.unit
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/time32-data-type.rb b/src/arrow/ruby/red-arrow/lib/arrow/time32-data-type.rb
new file mode 100644
index 000000000..be1d04fa2
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/time32-data-type.rb
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Time32DataType
+ alias_method :initialize_raw, :initialize
+ private :initialize_raw
+
+ # Creates a new {Arrow::Time32DataType}.
+ #
+ # @overload initialize(unit)
+ #
+ # @param unit [Arrow::TimeUnit, Symbol] The unit of the
+ # time32 data type.
+ #
+ # The unit must be second or millisecond.
+ #
+ # @example Create a time32 data type with Arrow::TimeUnit
+ # Arrow::Time32DataType.new(Arrow::TimeUnit::MILLI)
+ #
+ # @example Create a time32 data type with Symbol
+ # Arrow::Time32DataType.new(:milli)
+ #
+ # @overload initialize(description)
+ #
+ # @param description [Hash] The description of the time32 data
+ # type. It must have `:unit` value.
+ #
+ # @option description [Arrow::TimeUnit, Symbol] :unit The unit of
+ # the time32 data type.
+ #
+ # The unit must be second or millisecond.
+ #
+ # @example Create a time32 data type with Arrow::TimeUnit
+ # Arrow::Time32DataType.new(unit: Arrow::TimeUnit::MILLI)
+ #
+ # @example Create a time32 data type with Symbol
+ # Arrow::Time32DataType.new(unit: :milli)
+ def initialize(unit)
+ if unit.is_a?(Hash)
+ description = unit
+ unit = description[:unit]
+ end
+ initialize_raw(unit)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/time64-array-builder.rb b/src/arrow/ruby/red-arrow/lib/arrow/time64-array-builder.rb
new file mode 100644
index 000000000..dec15b8bf
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/time64-array-builder.rb
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Time64ArrayBuilder
+ class << self
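+      # Convenience constructor, mirroring Time32ArrayBuilder.build.
+      # Illustrative:
+      #
+      #   Arrow::Time64ArrayBuilder.build(:nano, [0, 1_000_000_000, nil])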
+ def build(unit_or_data_type, values)
+ builder = new(unit_or_data_type)
+ builder.build(values)
+ end
+ end
+
+ alias_method :initialize_raw, :initialize
+ def initialize(unit_or_data_type)
+ case unit_or_data_type
+ when DataType
+ data_type = unit_or_data_type
+ else
+ unit = unit_or_data_type
+ data_type = Time64DataType.new(unit)
+ end
+ initialize_raw(data_type)
+ end
+
+ def unit
+ @unit ||= value_data_type.unit
+ end
+
+ private
+ def convert_to_arrow_value(value)
+ return value unless value.is_a?(Time)
+ value.cast(unit).value
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/time64-array.rb b/src/arrow/ruby/red-arrow/lib/arrow/time64-array.rb
new file mode 100644
index 000000000..7fc2fd9ab
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/time64-array.rb
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Time64Array
+ def get_value(i)
+ Time.new(unit, get_raw_value(i))
+ end
+
+ def unit
+ @unit ||= value_data_type.unit
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/time64-data-type.rb b/src/arrow/ruby/red-arrow/lib/arrow/time64-data-type.rb
new file mode 100644
index 000000000..13795aa83
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/time64-data-type.rb
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Time64DataType
+ alias_method :initialize_raw, :initialize
+ private :initialize_raw
+
+ # Creates a new {Arrow::Time64DataType}.
+ #
+ # @overload initialize(unit)
+ #
+ # @param unit [Arrow::TimeUnit, Symbol] The unit of the
+ # time64 data type.
+ #
+ # The unit must be microsecond or nanosecond.
+ #
+ # @example Create a time64 data type with Arrow::TimeUnit
+ # Arrow::Time64DataType.new(Arrow::TimeUnit::NANO)
+ #
+ # @example Create a time64 data type with Symbol
+ # Arrow::Time64DataType.new(:nano)
+ #
+ # @overload initialize(description)
+ #
+ # @param description [Hash] The description of the time64 data
+ # type. It must have `:unit` value.
+ #
+ # @option description [Arrow::TimeUnit, Symbol] :unit The unit of
+ # the time64 data type.
+ #
+ # The unit must be microsecond or nanosecond.
+ #
+ # @example Create a time64 data type with Arrow::TimeUnit
+ # Arrow::Time64DataType.new(unit: Arrow::TimeUnit::NANO)
+ #
+ # @example Create a time64 data type with Symbol
+ # Arrow::Time64DataType.new(unit: :nano)
+ def initialize(unit)
+ if unit.is_a?(Hash)
+ description = unit
+ unit = description[:unit]
+ end
+ initialize_raw(unit)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/timestamp-array-builder.rb b/src/arrow/ruby/red-arrow/lib/arrow/timestamp-array-builder.rb
new file mode 100644
index 000000000..68bcb0fec
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/timestamp-array-builder.rb
@@ -0,0 +1,65 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class TimestampArrayBuilder
+ class << self
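+      # Convenience constructor. Ruby Time values are converted to raw
+      # integers in the data type's unit. Illustrative:
+      #
+      #   Arrow::TimestampArrayBuilder.build(:milli, [Time.now, nil])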
+ def build(unit_or_data_type, values)
+ builder = new(unit_or_data_type)
+ builder.build(values)
+ end
+ end
+
+ alias_method :initialize_raw, :initialize
+ def initialize(unit_or_data_type)
+ case unit_or_data_type
+ when DataType
+ data_type = unit_or_data_type
+ else
+ unit = unit_or_data_type
+ data_type = TimestampDataType.new(unit)
+ end
+ initialize_raw(data_type)
+ end
+
+ private
+ def unit_id
+ @unit_id ||= value_data_type.unit.nick.to_sym
+ end
+
+ def convert_to_arrow_value(value)
+ if value.respond_to?(:to_time) and not value.is_a?(Time)
+ value = value.to_time
+ end
+
+ if value.is_a?(::Time)
+ case unit_id
+ when :second
+ value.to_i
+ when :milli
+ value.to_i * 1_000 + value.usec / 1_000
+ when :micro
+ value.to_i * 1_000_000 + value.usec
+ else
+ value.to_i * 1_000_000_000 + value.nsec
+ end
+ else
+ value
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/timestamp-array.rb b/src/arrow/ruby/red-arrow/lib/arrow/timestamp-array.rb
new file mode 100644
index 000000000..011273487
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/timestamp-array.rb
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class TimestampArray
+ def get_value(i)
+ cast_to_time(get_raw_value(i))
+ end
+
+ def unit
+ @unit ||= value_data_type.unit
+ end
+
+ private
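+    # Interprets a raw timestamp integer according to the unit and
+    # returns a ::Time (in the local time zone, as Time.at does).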
+ def cast_to_time(raw_value)
+ case unit
+ when TimeUnit::SECOND
+ ::Time.at(raw_value)
+ when TimeUnit::MILLI
+        # divmod yields [seconds, milliseconds]; Time.at treats a bare
+        # second argument as microseconds, so name the unit explicitly.
+        ::Time.at(*raw_value.divmod(1_000), :millisecond)
+ when TimeUnit::MICRO
+ ::Time.at(*raw_value.divmod(1_000_000))
+ else
+        # Use integer divmod to keep nanosecond precision that Float
+        # division would lose for large timestamp values.
+        ::Time.at(*raw_value.divmod(1_000_000_000), :nanosecond)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/timestamp-data-type.rb b/src/arrow/ruby/red-arrow/lib/arrow/timestamp-data-type.rb
new file mode 100644
index 000000000..cd91f567d
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/timestamp-data-type.rb
@@ -0,0 +1,57 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class TimestampDataType
+ alias_method :initialize_raw, :initialize
+ private :initialize_raw
+
+ # Creates a new {Arrow::TimestampDataType}.
+ #
+ # @overload initialize(unit)
+ #
+ # @param unit [Arrow::TimeUnit, Symbol] The unit of the
+ # timestamp data type.
+ #
+ # @example Create a timestamp data type with Arrow::TimeUnit
+ # Arrow::TimestampDataType.new(Arrow::TimeUnit::MILLI)
+ #
+ # @example Create a timestamp data type with Symbol
+ # Arrow::TimestampDataType.new(:milli)
+ #
+ # @overload initialize(description)
+ #
+ # @param description [Hash] The description of the timestamp data
+ # type. It must have `:unit` value.
+ #
+ # @option description [Arrow::TimeUnit, Symbol] :unit The unit of
+ # the timestamp data type.
+ #
+ # @example Create a timestamp data type with Arrow::TimeUnit
+ # Arrow::TimestampDataType.new(unit: Arrow::TimeUnit::MILLI)
+ #
+ # @example Create a timestamp data type with Symbol
+ # Arrow::TimestampDataType.new(unit: :milli)
+ def initialize(unit)
+ if unit.is_a?(Hash)
+ description = unit
+ unit = description[:unit]
+ end
+ initialize_raw(unit)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/version.rb b/src/arrow/ruby/red-arrow/lib/arrow/version.rb
new file mode 100644
index 000000000..f830ff895
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/version.rb
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ VERSION = "6.0.1"
+
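+  # For VERSION "6.0.1": MAJOR == 6, MINOR == 0, MICRO == 1 and
+  # TAG == nil (no pre-release suffix).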
+ module Version
+ numbers, TAG = VERSION.split("-")
+ MAJOR, MINOR, MICRO = numbers.split(".").collect(&:to_i)
+ STRING = VERSION
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/lib/arrow/writable.rb b/src/arrow/ruby/red-arrow/lib/arrow/writable.rb
new file mode 100644
index 000000000..02be9ddfc
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/lib/arrow/writable.rb
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ module Writable
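+    # Allows `writer << data` as a shorthand for `writer.write(data)`.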
+ alias_method :<<, :write
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/red-arrow.gemspec b/src/arrow/ruby/red-arrow/red-arrow.gemspec
new file mode 100644
index 000000000..2fcc31b60
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/red-arrow.gemspec
@@ -0,0 +1,67 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require_relative "lib/arrow/version"
+
+Gem::Specification.new do |spec|
+ spec.name = "red-arrow"
+ version_components = [
+ Arrow::Version::MAJOR.to_s,
+ Arrow::Version::MINOR.to_s,
+ Arrow::Version::MICRO.to_s,
+ Arrow::Version::TAG,
+ ]
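+  # compact drops the nil TAG for final releases, yielding
+  # "MAJOR.MINOR.MICRO" (or "MAJOR.MINOR.MICRO.TAG" for pre-releases).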
+ spec.version = version_components.compact.join(".")
+ spec.homepage = "https://arrow.apache.org/"
+ spec.authors = ["Apache Arrow Developers"]
+ spec.email = ["dev@arrow.apache.org"]
+
+ spec.summary = "Red Arrow is the Ruby bindings of Apache Arrow"
+ spec.description =
+ "Apache Arrow is a common in-memory columnar data store. " +
+ "It's useful to share and process large data."
+ spec.license = "Apache-2.0"
+ spec.files = ["README.md", "Rakefile", "Gemfile", "#{spec.name}.gemspec"]
+ spec.files += ["LICENSE.txt", "NOTICE.txt"]
+ spec.files += Dir.glob("ext/**/*.{cpp,hpp,rb}")
+ spec.files += Dir.glob("lib/**/*.rb")
+ spec.files += Dir.glob("image/*.*")
+ spec.files += Dir.glob("doc/text/*")
+ spec.test_files += Dir.glob("test/**/*")
+ spec.extensions = ["ext/arrow/extconf.rb"]
+
+ spec.add_runtime_dependency("bigdecimal", ">= 2.0.3")
+ spec.add_runtime_dependency("extpp", ">= 0.0.7")
+ spec.add_runtime_dependency("gio2", ">= 3.4.9")
+ spec.add_runtime_dependency("native-package-installer")
+ spec.add_runtime_dependency("pkg-config")
+
+ spec.add_development_dependency("benchmark-driver")
+ spec.add_development_dependency("bundler")
+ spec.add_development_dependency("faker")
+ spec.add_development_dependency("fiddle", ">= 1.0.9")
+ spec.add_development_dependency("rake")
+ spec.add_development_dependency("redcarpet")
+ spec.add_development_dependency("test-unit")
+ spec.add_development_dependency("yard")
+
+ required_msys2_package_version = version_components[0, 3].join(".")
+ spec.metadata["msys2_mingw_dependencies"] =
+ "arrow>=#{required_msys2_package_version}"
+end
diff --git a/src/arrow/ruby/red-arrow/test/fixture/TestOrcFile.test1.orc b/src/arrow/ruby/red-arrow/test/fixture/TestOrcFile.test1.orc
new file mode 100644
index 000000000..4fb0beff8
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/fixture/TestOrcFile.test1.orc
Binary files differ
diff --git a/src/arrow/ruby/red-arrow/test/fixture/float-integer.csv b/src/arrow/ruby/red-arrow/test/fixture/float-integer.csv
new file mode 100644
index 000000000..5eae562bc
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/fixture/float-integer.csv
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+score
+2.9
+10
+-1.1
diff --git a/src/arrow/ruby/red-arrow/test/fixture/integer-float.csv b/src/arrow/ruby/red-arrow/test/fixture/integer-float.csv
new file mode 100644
index 000000000..da7614199
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/fixture/integer-float.csv
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+score
+10
+2.9
+-1.1
diff --git a/src/arrow/ruby/red-arrow/test/fixture/null-with-double-quote.csv b/src/arrow/ruby/red-arrow/test/fixture/null-with-double-quote.csv
new file mode 100644
index 000000000..d84545928
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/fixture/null-with-double-quote.csv
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+name,score
+alice,10
+bob,""
+chris,-1
diff --git a/src/arrow/ruby/red-arrow/test/fixture/null-without-double-quote.csv b/src/arrow/ruby/red-arrow/test/fixture/null-without-double-quote.csv
new file mode 100644
index 000000000..c91c8880a
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/fixture/null-without-double-quote.csv
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+name,score
+alice,10
+bob,
+chris,-1
diff --git a/src/arrow/ruby/red-arrow/test/fixture/with-header-float.csv b/src/arrow/ruby/red-arrow/test/fixture/with-header-float.csv
new file mode 100644
index 000000000..f62fc00b6
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/fixture/with-header-float.csv
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+name,score
+alice,10.1
+bob,29.2
+chris,-1.3
diff --git a/src/arrow/ruby/red-arrow/test/fixture/with-header.csv b/src/arrow/ruby/red-arrow/test/fixture/with-header.csv
new file mode 100644
index 000000000..a93fc5aec
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/fixture/with-header.csv
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+name,score
+alice,10
+bob,29
+chris,-1
diff --git a/src/arrow/ruby/red-arrow/test/fixture/without-header-float.csv b/src/arrow/ruby/red-arrow/test/fixture/without-header-float.csv
new file mode 100644
index 000000000..584a20996
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/fixture/without-header-float.csv
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+alice,10.1
+bob,29.2
+chris,-1.3
diff --git a/src/arrow/ruby/red-arrow/test/fixture/without-header.csv b/src/arrow/ruby/red-arrow/test/fixture/without-header.csv
new file mode 100644
index 000000000..1f775eae4
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/fixture/without-header.csv
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+alice,10
+bob,29
+chris,-1
diff --git a/src/arrow/ruby/red-arrow/test/helper.rb b/src/arrow/ruby/red-arrow/test/helper.rb
new file mode 100644
index 000000000..29e5f9cbc
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/helper.rb
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "arrow"
+
+require "fiddle"
+require "pathname"
+require "tempfile"
+require "zlib"
+
+require "test-unit"
+
+require_relative "helper/fixture"
+require_relative "helper/omittable"
diff --git a/src/arrow/ruby/red-arrow/test/helper/fixture.rb b/src/arrow/ruby/red-arrow/test/helper/fixture.rb
new file mode 100644
index 000000000..24445a7e4
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/helper/fixture.rb
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Helper
+ module Fixture
+ def fixture_dir
+ Pathname.new(__dir__).join("..", "fixture").expand_path
+ end
+
+ def fixture_path(*components)
+ fixture_dir.join(*components)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/helper/omittable.rb b/src/arrow/ruby/red-arrow/test/helper/omittable.rb
new file mode 100644
index 000000000..a1c0334b6
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/helper/omittable.rb
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Helper
+ module Omittable
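+    # Skips the running test (via test-unit's omit) when the installed
+    # GLib / GObject Introspection bindings are older than required.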
+ def require_gi_bindings(major, minor, micro)
+ return if GLib.check_binding_version?(major, minor, micro)
+ message =
+ "Require gobject-introspection #{major}.#{minor}.#{micro} or later: " +
+ GLib::BINDING_VERSION.join(".")
+ omit(message)
+ end
+
+ def require_gi(major, minor, micro)
+ return if GObjectIntrospection::Version.or_later?(major, minor, micro)
+ message =
+ "Require GObject Introspection #{major}.#{minor}.#{micro} or later: " +
+ GObjectIntrospection::Version::STRING
+ omit(message)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/raw-records/test-basic-arrays.rb b/src/arrow/ruby/red-arrow/test/raw-records/test-basic-arrays.rb
new file mode 100644
index 000000000..c80020666
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/raw-records/test-basic-arrays.rb
@@ -0,0 +1,365 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module RawRecordsBasicArraysTests
+ def test_null
+ records = [
+ [nil],
+ [nil],
+ [nil],
+ [nil],
+ ]
+ target = build({column: :null}, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_boolean
+ records = [
+ [true],
+ [nil],
+ [false],
+ ]
+ target = build({column: :boolean}, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int8
+ records = [
+ [-(2 ** 7)],
+ [nil],
+ [(2 ** 7) - 1],
+ ]
+ target = build({column: :int8}, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint8
+ records = [
+ [0],
+ [nil],
+ [(2 ** 8) - 1],
+ ]
+ target = build({column: :uint8}, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int16
+ records = [
+ [-(2 ** 15)],
+ [nil],
+ [(2 ** 15) - 1],
+ ]
+ target = build({column: :int16}, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint16
+ records = [
+ [0],
+ [nil],
+ [(2 ** 16) - 1],
+ ]
+ target = build({column: :uint16}, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int32
+ records = [
+ [-(2 ** 31)],
+ [nil],
+ [(2 ** 31) - 1],
+ ]
+ target = build({column: :int32}, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint32
+ records = [
+ [0],
+ [nil],
+ [(2 ** 32) - 1],
+ ]
+ target = build({column: :uint32}, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int64
+ records = [
+ [-(2 ** 63)],
+ [nil],
+ [(2 ** 63) - 1],
+ ]
+ target = build({column: :int64}, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint64
+ records = [
+ [0],
+ [nil],
+ [(2 ** 64) - 1],
+ ]
+ target = build({column: :uint64}, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_float
+ records = [
+ [-1.0],
+ [nil],
+ [1.0],
+ ]
+ target = build({column: :float}, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_double
+ records = [
+ [-1.0],
+ [nil],
+ [1.0],
+ ]
+ target = build({column: :double}, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_binary
+ records = [
+ ["\x00".b],
+ [nil],
+ ["\xff".b],
+ ]
+ target = build({column: :binary}, records)
+ assert_equal(records, target.raw_records)
+ end
+
+  def test_string
+ records = [
+ ["Ruby"],
+ [nil],
+ ["\u3042"], # U+3042 HIRAGANA LETTER A
+ ]
+ target = build({column: :string}, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_date32
+ records = [
+ [Date.new(1960, 1, 1)],
+ [nil],
+ [Date.new(2017, 8, 23)],
+ ]
+ target = build({column: :date32}, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_date64
+ records = [
+ [DateTime.new(1960, 1, 1, 2, 9, 30)],
+ [nil],
+ [DateTime.new(2017, 8, 23, 14, 57, 2)],
+ ]
+ target = build({column: :date64}, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_second
+ records = [
+ [Time.parse("1960-01-01T02:09:30Z")],
+ [nil],
+ [Time.parse("2017-08-23T14:57:02Z")],
+ ]
+ target = build({
+ column: {
+ type: :timestamp,
+ unit: :second,
+ }
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_milli
+ records = [
+ [Time.parse("1960-01-01T02:09:30.123Z")],
+ [nil],
+ [Time.parse("2017-08-23T14:57:02.987Z")],
+ ]
+ target = build({
+ column: {
+ type: :timestamp,
+ unit: :milli,
+ }
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_micro
+ records = [
+ [Time.parse("1960-01-01T02:09:30.123456Z")],
+ [nil],
+ [Time.parse("2017-08-23T14:57:02.987654Z")],
+ ]
+ target = build({
+ column: {
+ type: :timestamp,
+ unit: :micro,
+ }
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_nano
+ records = [
+ [Time.parse("1960-01-01T02:09:30.123456789Z")],
+ [nil],
+ [Time.parse("2017-08-23T14:57:02.987654321Z")],
+ ]
+ target = build({
+ column: {
+ type: :timestamp,
+ unit: :nano,
+ }
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time32_second
+ unit = Arrow::TimeUnit::SECOND
+ records = [
+ [Arrow::Time.new(unit, 60 * 10)], # 00:10:00
+ [nil],
+ [Arrow::Time.new(unit, 60 * 60 * 2 + 9)], # 02:00:09
+ ]
+ target = build({
+ column: {
+ type: :time32,
+ unit: :second,
+ }
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time32_milli
+ unit = Arrow::TimeUnit::MILLI
+ records = [
+ [Arrow::Time.new(unit, (60 * 10) * 1000 + 123)], # 00:10:00.123
+ [nil],
+ [Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1000 + 987)], # 02:00:09.987
+ ]
+ target = build({
+ column: {
+ type: :time32,
+ unit: :milli,
+ }
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time64_micro
+ unit = Arrow::TimeUnit::MICRO
+ records = [
+ # 00:10:00.123456
+ [Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456)],
+ [nil],
+ # 02:00:09.987654
+ [Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1_000_000 + 987_654)],
+ ]
+ target = build({
+ column: {
+ type: :time64,
+ unit: :micro,
+ }
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time64_nano
+ unit = Arrow::TimeUnit::NANO
+ records = [
+ # 00:10:00.123456789
+ [Arrow::Time.new(unit, (60 * 10) * 1_000_000_000 + 123_456_789)],
+ [nil],
+ # 02:00:09.987654321
+ [Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1_000_000_000 + 987_654_321)],
+ ]
+ target = build({
+ column: {
+ type: :time64,
+ unit: :nano,
+ }
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_decimal128
+ records = [
+ [BigDecimal("92.92")],
+ [nil],
+ [BigDecimal("29.29")],
+ ]
+ target = build({
+ column: {
+ type: :decimal128,
+ precision: 8,
+ scale: 2,
+ }
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_decimal256
+ records = [
+ [BigDecimal("92.92")],
+ [nil],
+ [BigDecimal("29.29")],
+ ]
+ target = build({
+ column: {
+ type: :decimal256,
+ precision: 38,
+ scale: 2,
+ }
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+end
+
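+# The same assertions run against two targets: a RecordBatch and a
+# Table built from the same schema description and records.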
+class RawRecordsRecordBatchBasicArraysTest < Test::Unit::TestCase
+ include RawRecordsBasicArraysTests
+
+ def build(schema, records)
+ Arrow::RecordBatch.new(schema, records)
+ end
+end
+
+class RawRecordsTableBasicArraysTest < Test::Unit::TestCase
+ include RawRecordsBasicArraysTests
+
+ def build(schema, records)
+ Arrow::Table.new(schema, records)
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/raw-records/test-dense-union-array.rb b/src/arrow/ruby/red-arrow/test/raw-records/test-dense-union-array.rb
new file mode 100644
index 000000000..8d94a77fe
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/raw-records/test-dense-union-array.rb
@@ -0,0 +1,494 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module RawRecordsDenseUnionArrayTests
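+  # Builds a schema description for a dense union column with two
+  # fields, named "0" and "1", that share the given value type.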
+ def build_schema(type, type_codes)
+ field_description = {}
+ if type.is_a?(Hash)
+ field_description = field_description.merge(type)
+ else
+ field_description[:type] = type
+ end
+ {
+ column: {
+ type: :dense_union,
+ fields: [
+ field_description.merge(name: "0"),
+ field_description.merge(name: "1"),
+ ],
+ type_codes: type_codes,
+ },
+ }
+ end
+
+ # TODO: Use Arrow::RecordBatch.new(build_schema(type, type_codes), records)
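+  # Assembles the dense union by hand: one child array per field,
+  # an Int8Array of type IDs selecting a child for each record, and
+  # an Int32Array of offsets into the selected child.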
+ def build_record_batch(type, records)
+ type_codes = [0, 1]
+ schema = Arrow::Schema.new(build_schema(type, type_codes))
+ type_ids = []
+ offsets = []
+ arrays = schema.fields[0].data_type.fields.collect do |field|
+ sub_schema = Arrow::Schema.new([field])
+ sub_records = []
+ records.each do |record|
+ column = record[0]
+ next if column.nil?
+ next unless column.key?(field.name)
+ sub_records << [column[field.name]]
+ end
+ sub_record_batch = Arrow::RecordBatch.new(sub_schema,
+ sub_records)
+ sub_record_batch.columns[0].data
+ end
+ records.each do |record|
+ column = record[0]
+ if column.key?("0")
+ type_id = type_codes[0]
+ type_ids << type_id
+ offsets << (type_ids.count(type_id) - 1)
+ elsif column.key?("1")
+ type_id = type_codes[1]
+ type_ids << type_id
+ offsets << (type_ids.count(type_id) - 1)
+ end
+ end
+ union_array = Arrow::DenseUnionArray.new(schema.fields[0].data_type,
+ Arrow::Int8Array.new(type_ids),
+ Arrow::Int32Array.new(offsets),
+ arrays)
+ schema = Arrow::Schema.new(column: union_array.value_data_type)
+ Arrow::RecordBatch.new(schema,
+ records.size,
+ [union_array])
+ end
+
+ def test_null
+ records = [
+ [{"0" => nil}],
+ ]
+ target = build(:null, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_boolean
+ records = [
+ [{"0" => true}],
+ [{"1" => nil}],
+ ]
+ target = build(:boolean, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int8
+ records = [
+ [{"0" => -(2 ** 7)}],
+ [{"1" => nil}],
+ ]
+ target = build(:int8, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint8
+ records = [
+ [{"0" => (2 ** 8) - 1}],
+ [{"1" => nil}],
+ ]
+ target = build(:uint8, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int16
+ records = [
+ [{"0" => -(2 ** 15)}],
+ [{"1" => nil}],
+ ]
+ target = build(:int16, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint16
+ records = [
+ [{"0" => (2 ** 16) - 1}],
+ [{"1" => nil}],
+ ]
+ target = build(:uint16, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int32
+ records = [
+ [{"0" => -(2 ** 31)}],
+ [{"1" => nil}],
+ ]
+ target = build(:int32, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint32
+ records = [
+ [{"0" => (2 ** 32) - 1}],
+ [{"1" => nil}],
+ ]
+ target = build(:uint32, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int64
+ records = [
+ [{"0" => -(2 ** 63)}],
+ [{"1" => nil}],
+ ]
+ target = build(:int64, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint64
+ records = [
+ [{"0" => (2 ** 64) - 1}],
+ [{"1" => nil}],
+ ]
+ target = build(:uint64, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_float
+ records = [
+ [{"0" => -1.0}],
+ [{"1" => nil}],
+ ]
+ target = build(:float, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_double
+ records = [
+ [{"0" => -1.0}],
+ [{"1" => nil}],
+ ]
+ target = build(:double, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_binary
+ records = [
+ [{"0" => "\xff".b}],
+ [{"1" => nil}],
+ ]
+ target = build(:binary, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_string
+ records = [
+ [{"0" => "Ruby"}],
+ [{"1" => nil}],
+ ]
+ target = build(:string, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_date32
+ records = [
+ [{"0" => Date.new(1960, 1, 1)}],
+ [{"1" => nil}],
+ ]
+ target = build(:date32, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_date64
+ records = [
+ [{"0" => DateTime.new(1960, 1, 1, 2, 9, 30)}],
+ [{"1" => nil}],
+ ]
+ target = build(:date64, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_second
+ records = [
+ [{"0" => Time.parse("1960-01-01T02:09:30Z")}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :second,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_milli
+ records = [
+ [{"0" => Time.parse("1960-01-01T02:09:30.123Z")}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :milli,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_micro
+ records = [
+ [{"0" => Time.parse("1960-01-01T02:09:30.123456Z")}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :micro,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_nano
+ records = [
+ [{"0" => Time.parse("1960-01-01T02:09:30.123456789Z")}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :nano,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time32_second
+ unit = Arrow::TimeUnit::SECOND
+ records = [
+ # 00:10:00
+ [{"0" => Arrow::Time.new(unit, 60 * 10)}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :time32,
+ unit: :second,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time32_milli
+ unit = Arrow::TimeUnit::MILLI
+ records = [
+ # 00:10:00.123
+ [{"0" => Arrow::Time.new(unit, (60 * 10) * 1000 + 123)}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :time32,
+ unit: :milli,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time64_micro
+ unit = Arrow::TimeUnit::MICRO
+ records = [
+ # 00:10:00.123456
+ [{"0" => Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456)}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :time64,
+ unit: :micro,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time64_nano
+ unit = Arrow::TimeUnit::NANO
+ records = [
+ # 00:10:00.123456789
+ [{"0" => Arrow::Time.new(unit, (60 * 10) * 1_000_000_000 + 123_456_789)}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :time64,
+ unit: :nano,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_decimal128
+ records = [
+ [{"0" => BigDecimal("92.92")}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :decimal128,
+ precision: 8,
+ scale: 2,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_decimal256
+ records = [
+ [{"0" => BigDecimal("92.92")}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :decimal256,
+ precision: 38,
+ scale: 2,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_list
+ records = [
+ [{"0" => [true, nil, false]}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :list,
+ field: {
+ name: :sub_element,
+ type: :boolean,
+ },
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_struct
+ records = [
+ [{"0" => {"sub_field" => true}}],
+ [{"1" => nil}],
+ [{"0" => {"sub_field" => nil}}],
+ ]
+ target = build({
+ type: :struct,
+ fields: [
+ {
+ name: :sub_field,
+ type: :boolean,
+ },
+ ],
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_map
+ records = [
+ [{"0" => {"key1" => true, "key2" => nil}}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :map,
+ key: :string,
+ item: :boolean,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_sparse_union
+ omit("Need to add support for SparseUnionArrayBuilder")
+ records = [
+ [{"0" => {"field1" => true}}],
+ [{"1" => nil}],
+ [{"0" => {"field2" => nil}}],
+ ]
+ target = build({
+ type: :sparse_union,
+ fields: [
+ {
+ name: :field1,
+ type: :boolean,
+ },
+ {
+ name: :field2,
+ type: :uint8,
+ },
+ ],
+ type_codes: [0, 1],
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_dense_union
+ omit("Need to add support for DenseUnionArrayBuilder")
+ records = [
+ [{"0" => {"field1" => true}}],
+ [{"1" => nil}],
+ [{"0" => {"field2" => nil}}],
+ ]
+ target = build({
+ type: :dense_union,
+ fields: [
+ {
+ name: :field1,
+ type: :boolean,
+ },
+ {
+ name: :field2,
+ type: :uint8,
+ },
+ ],
+ type_codes: [0, 1],
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_dictionary
+ omit("Need to add support for DictionaryArrayBuilder")
+ records = [
+ [{"0" => "Ruby"}],
+ [{"1" => nil}],
+ [{"0" => "GLib"}],
+ ]
+ dictionary = Arrow::StringArray.new(["GLib", "Ruby"])
+ target = build({
+ type: :dictionary,
+ index_data_type: :int8,
+ dictionary: dictionary,
+ ordered: true,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+end
+
+class RawRecordsRecordBatchDenseUnionArrayTest < Test::Unit::TestCase
+ include RawRecordsDenseUnionArrayTests
+
+ def build(type, records)
+ build_record_batch(type, records)
+ end
+end
+
+class RawRecordsTableDenseUnionArrayTest < Test::Unit::TestCase
+ include RawRecordsDenseUnionArrayTests
+
+ def build(type, records)
+ build_record_batch(type, records).to_table
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/raw-records/test-list-array.rb b/src/arrow/ruby/red-arrow/test/raw-records/test-list-array.rb
new file mode 100644
index 000000000..6d7d4c079
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/raw-records/test-list-array.rb
@@ -0,0 +1,571 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module RawRecordsListArrayTests
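+  # Builds a schema description for a list column whose elements
+  # have the given type (or type description).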
+ def build_schema(type)
+ field_description = {
+ name: :element,
+ }
+ if type.is_a?(Hash)
+ field_description = field_description.merge(type)
+ else
+ field_description[:type] = type
+ end
+ {
+ column: {
+ type: :list,
+ field: field_description,
+ },
+ }
+ end
+
+ def test_null
+ records = [
+ [[nil, nil, nil]],
+ [nil],
+ ]
+ target = build(:null, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_boolean
+ records = [
+ [[true, nil, false]],
+ [nil],
+ ]
+ target = build(:boolean, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int8
+ records = [
+ [[-(2 ** 7), nil, (2 ** 7) - 1]],
+ [nil],
+ ]
+ target = build(:int8, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint8
+ records = [
+ [[0, nil, (2 ** 8) - 1]],
+ [nil],
+ ]
+ target = build(:uint8, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int16
+ records = [
+ [[-(2 ** 15), nil, (2 ** 15) - 1]],
+ [nil],
+ ]
+ target = build(:int16, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint16
+ records = [
+ [[0, nil, (2 ** 16) - 1]],
+ [nil],
+ ]
+ target = build(:uint16, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int32
+ records = [
+ [[-(2 ** 31), nil, (2 ** 31) - 1]],
+ [nil],
+ ]
+ target = build(:int32, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint32
+ records = [
+ [[0, nil, (2 ** 32) - 1]],
+ [nil],
+ ]
+ target = build(:uint32, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int64
+ records = [
+ [[-(2 ** 63), nil, (2 ** 63) - 1]],
+ [nil],
+ ]
+ target = build(:int64, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint64
+ records = [
+ [[0, nil, (2 ** 64) - 1]],
+ [nil],
+ ]
+ target = build(:uint64, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_float
+ records = [
+ [[-1.0, nil, 1.0]],
+ [nil],
+ ]
+ target = build(:float, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_double
+ records = [
+ [[-1.0, nil, 1.0]],
+ [nil],
+ ]
+ target = build(:double, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_binary
+ records = [
+ [["\x00".b, nil, "\xff".b]],
+ [nil],
+ ]
+ target = build(:binary, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_string
+ records = [
+ [
+ [
+ "Ruby",
+ nil,
+ "\u3042", # U+3042 HIRAGANA LETTER A
+ ],
+ ],
+ [nil],
+ ]
+ target = build(:string, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_date32
+ records = [
+ [
+ [
+ Date.new(1960, 1, 1),
+ nil,
+ Date.new(2017, 8, 23),
+ ],
+ ],
+ [nil],
+ ]
+ target = build(:date32, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_date64
+ records = [
+ [
+ [
+ DateTime.new(1960, 1, 1, 2, 9, 30),
+ nil,
+ DateTime.new(2017, 8, 23, 14, 57, 2),
+ ],
+ ],
+ [nil],
+ ]
+ target = build(:date64, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_second
+ records = [
+ [
+ [
+ Time.parse("1960-01-01T02:09:30Z"),
+ nil,
+ Time.parse("2017-08-23T14:57:02Z"),
+ ],
+ ],
+ [nil],
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :second,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_milli
+ records = [
+ [
+ [
+ Time.parse("1960-01-01T02:09:30.123Z"),
+ nil,
+ Time.parse("2017-08-23T14:57:02.987Z"),
+ ],
+ ],
+ [nil],
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :milli,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_micro
+ records = [
+ [
+ [
+ Time.parse("1960-01-01T02:09:30.123456Z"),
+ nil,
+ Time.parse("2017-08-23T14:57:02.987654Z"),
+ ],
+ ],
+ [nil],
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :micro,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_nano
+ records = [
+ [
+ [
+ Time.parse("1960-01-01T02:09:30.123456789Z"),
+ nil,
+ Time.parse("2017-08-23T14:57:02.987654321Z"),
+ ],
+ ],
+ [nil],
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :nano,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time32_second
+ unit = Arrow::TimeUnit::SECOND
+ records = [
+ [
+ [
+ # 00:10:00
+ Arrow::Time.new(unit, 60 * 10),
+ nil,
+ # 02:00:09
+ Arrow::Time.new(unit, 60 * 60 * 2 + 9),
+ ],
+ ],
+ [nil],
+ ]
+ target = build({
+ type: :time32,
+ unit: :second,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time32_milli
+ unit = Arrow::TimeUnit::MILLI
+ records = [
+ [
+ [
+ # 00:10:00.123
+ Arrow::Time.new(unit, (60 * 10) * 1000 + 123),
+ nil,
+ # 02:00:09.987
+ Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1000 + 987),
+ ],
+ ],
+ [nil],
+ ]
+ target = build({
+ type: :time32,
+ unit: :milli,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time64_micro
+ unit = Arrow::TimeUnit::MICRO
+ records = [
+ [
+ [
+ # 00:10:00.123456
+ Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456),
+ nil,
+ # 02:00:09.987654
+ Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1_000_000 + 987_654),
+ ],
+ ],
+ [nil],
+ ]
+ target = build({
+ type: :time64,
+ unit: :micro,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time64_nano
+ unit = Arrow::TimeUnit::NANO
+ records = [
+ [
+ [
+ # 00:10:00.123456789
+ Arrow::Time.new(unit, (60 * 10) * 1_000_000_000 + 123_456_789),
+ nil,
+ # 02:00:09.987654321
+ Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1_000_000_000 + 987_654_321),
+ ],
+ ],
+ [nil],
+ ]
+ target = build({
+ type: :time64,
+ unit: :nano,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_decimal128
+ records = [
+ [
+ [
+ BigDecimal("92.92"),
+ nil,
+ BigDecimal("29.29"),
+ ],
+ ],
+ [nil],
+ ]
+ target = build({
+ type: :decimal128,
+ precision: 8,
+ scale: 2,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_decimal256
+ records = [
+ [
+ [
+ BigDecimal("92.92"),
+ nil,
+ BigDecimal("29.29"),
+ ],
+ ],
+ [nil],
+ ]
+ target = build({
+ type: :decimal256,
+ precision: 38,
+ scale: 2,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_list
+ records = [
+ [
+ [
+ [
+ true,
+ nil,
+ ],
+ nil,
+ [
+ nil,
+ false,
+ ],
+ ],
+ ],
+ [nil],
+ ]
+ target = build({
+ type: :list,
+ field: {
+ name: :sub_element,
+ type: :boolean,
+ },
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_struct
+ records = [
+ [
+ [
+ {"field" => true},
+ nil,
+ {"field" => nil},
+ ],
+ ],
+ [nil],
+ ]
+ target = build({
+ type: :struct,
+ fields: [
+ {
+ name: :field,
+ type: :boolean,
+ },
+ ],
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_map
+ records = [
+ [
+ [
+ {"key1" => true, "key2" => nil},
+ nil,
+ ],
+ ],
+ [nil],
+ ]
+ target = build({
+ type: :map,
+ key: :string,
+ item: :boolean,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+  def test_sparse_union
+ omit("Need to add support for SparseUnionArrayBuilder")
+ records = [
+ [
+ [
+ {"field1" => true},
+ nil,
+ {"field2" => nil},
+ ],
+ ],
+ [nil],
+ ]
+ target = build({
+ type: :sparse_union,
+ fields: [
+ {
+ name: :field1,
+ type: :boolean,
+ },
+ {
+ name: :field2,
+ type: :uint8,
+ },
+ ],
+ type_codes: [0, 1],
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+  def test_dense_union
+ omit("Need to add support for DenseUnionArrayBuilder")
+ records = [
+ [
+ [
+ {"field1" => true},
+ nil,
+ {"field2" => nil},
+ ],
+ ],
+ [nil],
+ ]
+ target = build({
+ type: :dense_union,
+ fields: [
+ {
+ name: :field1,
+ type: :boolean,
+ },
+ {
+ name: :field2,
+ type: :uint8,
+ },
+ ],
+ type_codes: [0, 1],
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_dictionary
+ omit("Need to add support for DictionaryArrayBuilder")
+ records = [
+ [
+ [
+ "Ruby",
+ nil,
+ "GLib",
+ ],
+ ],
+ [nil],
+ ]
+ dictionary = Arrow::StringArray.new(["GLib", "Ruby"])
+ target = build({
+ type: :dictionary,
+ index_data_type: :int8,
+ dictionary: dictionary,
+ ordered: true,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+end
+
+class RawRecordsRecordBatchListArrayTest < Test::Unit::TestCase
+ include RawRecordsListArrayTests
+
+ def build(type, records)
+ Arrow::RecordBatch.new(build_schema(type), records)
+ end
+end
+
+class RawRecordsTableListArrayTest < Test::Unit::TestCase
+ include RawRecordsListArrayTests
+
+ def build(type, records)
+ Arrow::Table.new(build_schema(type), records)
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/raw-records/test-map-array.rb b/src/arrow/ruby/red-arrow/test/raw-records/test-map-array.rb
new file mode 100644
index 000000000..c5abb7d77
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/raw-records/test-map-array.rb
@@ -0,0 +1,441 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module RawRecordsMapArrayTests
+ def build_schema(type)
+ {
+ column: {
+ type: :map,
+ key: :string,
+        item: type,
+ },
+ }
+ end
+
+ def test_null
+ records = [
+ [{"key1" => nil}],
+ [nil],
+ ]
+ target = build(:null, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_boolean
+ records = [
+ [{"key1" => true, "key2" => nil}],
+ [nil],
+ ]
+ target = build(:boolean, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int8
+ records = [
+ [{"key1" => -(2 ** 7), "key2" => nil}],
+ [nil],
+ ]
+ target = build(:int8, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint8
+ records = [
+ [{"key1" => (2 ** 8) - 1, "key2" => nil}],
+ [nil],
+ ]
+ target = build(:uint8, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int16
+ records = [
+ [{"key1" => -(2 ** 15), "key2" => nil}],
+ [nil],
+ ]
+ target = build(:int16, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint16
+ records = [
+ [{"key1" => (2 ** 16) - 1, "key2" => nil}],
+ [nil],
+ ]
+ target = build(:uint16, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int32
+ records = [
+ [{"key1" => -(2 ** 31), "key2" => nil}],
+ [nil],
+ ]
+ target = build(:int32, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint32
+ records = [
+ [{"key1" => (2 ** 32) - 1, "key2" => nil}],
+ [nil],
+ ]
+ target = build(:uint32, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int64
+ records = [
+ [{"key1" => -(2 ** 63), "key2" => nil}],
+ [nil],
+ ]
+ target = build(:int64, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint64
+ records = [
+ [{"key1" => (2 ** 64) - 1, "key2" => nil}],
+ [nil],
+ ]
+ target = build(:uint64, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_float
+ records = [
+ [{"key1" => -1.0, "key2" => nil}],
+ [nil],
+ ]
+ target = build(:float, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_double
+ records = [
+ [{"key1" => -1.0, "key2" => nil}],
+ [nil],
+ ]
+ target = build(:double, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_binary
+ records = [
+ [{"key1" => "\xff".b, "key2" => nil}],
+ [nil],
+ ]
+ target = build(:binary, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_string
+ records = [
+ [{"key1" => "Ruby", "key2" => nil}],
+ [nil],
+ ]
+ target = build(:string, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_date32
+ records = [
+ [{"key1" => Date.new(1960, 1, 1), "key2" => nil}],
+ [nil],
+ ]
+ target = build(:date32, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_date64
+ records = [
+ [{"key1" => DateTime.new(1960, 1, 1, 2, 9, 30), "key2" => nil}],
+ [nil],
+ ]
+ target = build(:date64, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_second
+ records = [
+ [{"key1" => Time.parse("1960-01-01T02:09:30Z"), "key2" => nil}],
+ [nil],
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :second,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_milli
+ records = [
+ [{"key1" => Time.parse("1960-01-01T02:09:30.123Z"), "key2" => nil}],
+ [nil],
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :milli,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_micro
+ records = [
+ [{"key1" => Time.parse("1960-01-01T02:09:30.123456Z"), "key2" => nil}],
+ [nil],
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :micro,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_nano
+ records = [
+ [{"key1" => Time.parse("1960-01-01T02:09:30.123456789Z"), "key2" => nil}],
+ [nil],
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :nano,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time32_second
+ unit = Arrow::TimeUnit::SECOND
+ records = [
+ # 00:10:00
+ [{"key1" => Arrow::Time.new(unit, 60 * 10), "key2" => nil}],
+ [nil],
+ ]
+ target = build({
+ type: :time32,
+ unit: :second,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time32_milli
+ unit = Arrow::TimeUnit::MILLI
+ records = [
+ # 00:10:00.123
+ [{"key1" => Arrow::Time.new(unit, (60 * 10) * 1000 + 123), "key2" => nil}],
+ [nil],
+ ]
+ target = build({
+ type: :time32,
+ unit: :milli,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time64_micro
+ unit = Arrow::TimeUnit::MICRO
+ records = [
+ # 00:10:00.123456
+ [{"key1" => Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456), "key2" => nil}],
+ [nil],
+ ]
+ target = build({
+ type: :time64,
+ unit: :micro,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time64_nano
+ unit = Arrow::TimeUnit::NANO
+ records = [
+ # 00:10:00.123456789
+ [{"key1" => Arrow::Time.new(unit, (60 * 10) * 1_000_000_000 + 123_456_789), "key2" => nil}],
+ [nil],
+ ]
+ target = build({
+ type: :time64,
+ unit: :nano,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_decimal128
+ records = [
+ [{"key1" => BigDecimal("92.92"), "key2" => nil}],
+ [nil],
+ ]
+ target = build({
+ type: :decimal128,
+ precision: 8,
+ scale: 2,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_decimal256
+ records = [
+ [{"key1" => BigDecimal("92.92"), "key2" => nil}],
+ [nil],
+ ]
+ target = build({
+ type: :decimal256,
+ precision: 38,
+ scale: 2,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_list
+ records = [
+ [{"key1" => [true, nil, false], "key2" => nil}],
+ [nil],
+ ]
+ target = build({
+ type: :list,
+ field: {
+ name: :element,
+ type: :boolean,
+ },
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_struct
+ records = [
+ [{"key1" => {"field" => true}, "key2" => nil, "key3" => {"field" => nil}}],
+ [nil],
+ ]
+ target = build({
+ type: :struct,
+ fields: [
+ {
+ name: :field,
+ type: :boolean,
+ },
+ ],
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_map
+ records = [
+ [{"key1" => {"sub_key1" => true, "sub_key2" => nil}, "key2" => nil}],
+ [nil],
+ ]
+ target = build({
+ type: :map,
+ key: :string,
+ item: :boolean,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_sparse_union
+ omit("Need to add support for SparseUnionArrayBuilder")
+ records = [
+      [{"key1" => {"field1" => true}, "key2" => nil, "key3" => {"field2" => nil}}],
+ [nil],
+ ]
+ target = build({
+ type: :sparse_union,
+ fields: [
+ {
+ name: :field1,
+ type: :boolean,
+ },
+ {
+ name: :field2,
+ type: :uint8,
+ },
+ ],
+ type_codes: [0, 1],
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_dense_union
+ omit("Need to add support for DenseUnionArrayBuilder")
+ records = [
+ [{"key1" => {"field1" => true}, "key2" => nil, "key3" => {"field2" => nil}}],
+ [nil],
+ ]
+ target = build({
+ type: :dense_union,
+ fields: [
+ {
+ name: :field1,
+ type: :boolean,
+ },
+ {
+ name: :field2,
+ type: :uint8,
+ },
+ ],
+ type_codes: [0, 1],
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_dictionary
+ omit("Need to add support for DictionaryArrayBuilder")
+ records = [
+ [{"key1" => "Ruby", "key2" => nil, "key3" => "GLib"}],
+ [nil],
+ ]
+ dictionary = Arrow::StringArray.new(["GLib", "Ruby"])
+ target = build({
+ type: :dictionary,
+ index_data_type: :int8,
+ dictionary: dictionary,
+ ordered: true,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+end
+
+class RawRecordsRecordBatchMapArrayTest < Test::Unit::TestCase
+ include RawRecordsMapArrayTests
+
+ def build(type, records)
+ Arrow::RecordBatch.new(build_schema(type), records)
+ end
+end
+
+class RawRecordsTableMapArrayTest < Test::Unit::TestCase
+ include RawRecordsMapArrayTests
+
+ def build(type, records)
+ Arrow::Table.new(build_schema(type), records)
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/raw-records/test-multiple-columns.rb b/src/arrow/ruby/red-arrow/test/raw-records/test-multiple-columns.rb
new file mode 100644
index 000000000..50dff67ce
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/raw-records/test-multiple-columns.rb
@@ -0,0 +1,65 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module RawRecordsMultipleColumnsTests
+ def test_3_elements
+ records = [
+ [true, nil, "Ruby"],
+ [nil, 0, "GLib"],
+ [false, 2 ** 8 - 1, nil],
+ ]
+ target = build([
+ {name: :column0, type: :boolean},
+ {name: :column1, type: :uint8},
+ {name: :column2, type: :string},
+ ],
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_4_elements
+ records = [
+ [true, nil, "Ruby", -(2 ** 63)],
+ [nil, 0, "GLib", nil],
+ [false, 2 ** 8 - 1, nil, (2 ** 63) - 1],
+ ]
+ target = build([
+ {name: :column0, type: :boolean},
+ {name: :column1, type: :uint8},
+ {name: :column2, type: :string},
+ {name: :column3, type: :int64},
+ ],
+ records)
+ assert_equal(records, target.raw_records)
+ end
+end
+
+class RawRecordsRecordBatchMultipleColumnsTest < Test::Unit::TestCase
+ include RawRecordsMultipleColumnsTests
+
+ def build(schema, records)
+ Arrow::RecordBatch.new(schema, records)
+ end
+end
+
+class RawRecordsTableMultipleColumnsTest < Test::Unit::TestCase
+ include RawRecordsMultipleColumnsTests
+
+ def build(schema, records)
+ Arrow::Table.new(schema, records)
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/raw-records/test-sparse-union-array.rb b/src/arrow/ruby/red-arrow/test/raw-records/test-sparse-union-array.rb
new file mode 100644
index 000000000..415401216
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/raw-records/test-sparse-union-array.rb
@@ -0,0 +1,484 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module RawRecordsSparseUnionArrayTests
+ def build_schema(type, type_codes)
+ field_description = {}
+ if type.is_a?(Hash)
+ field_description = field_description.merge(type)
+ else
+ field_description[:type] = type
+ end
+ {
+ column: {
+ type: :sparse_union,
+ fields: [
+ field_description.merge(name: "0"),
+ field_description.merge(name: "1"),
+ ],
+ type_codes: type_codes,
+ },
+ }
+ end
+
+ # TODO: Use Arrow::RecordBatch.new(build_schema(type, type_codes), records)
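+  # Builds each union child as its own single-column record batch, then
+  # assembles a SparseUnionArray from the child arrays and an int8 type
+  # ID array derived from which key ("0" or "1") each record uses.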
+ def build_record_batch(type, records)
+ type_codes = [0, 1]
+ schema = Arrow::Schema.new(build_schema(type, type_codes))
+ type_ids = []
+ arrays = schema.fields[0].data_type.fields.collect do |field|
+ sub_schema = Arrow::Schema.new([field])
+ sub_records = records.collect do |record|
+ [record[0].nil? ? nil : record[0][field.name]]
+ end
+ sub_record_batch = Arrow::RecordBatch.new(sub_schema,
+ sub_records)
+ sub_record_batch.columns[0].data
+ end
+ records.each do |record|
+ column = record[0]
+ if column.key?("0")
+ type_ids << type_codes[0]
+ elsif column.key?("1")
+ type_ids << type_codes[1]
+ end
+ end
+ union_array = Arrow::SparseUnionArray.new(schema.fields[0].data_type,
+ Arrow::Int8Array.new(type_ids),
+ arrays)
+ schema = Arrow::Schema.new(column: union_array.value_data_type)
+ Arrow::RecordBatch.new(schema,
+ records.size,
+ [union_array])
+ end
+
+ def test_null
+ records = [
+ [{"0" => nil}],
+ ]
+ target = build(:null, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_boolean
+ records = [
+ [{"0" => true}],
+ [{"1" => nil}],
+ ]
+ target = build(:boolean, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int8
+ records = [
+ [{"0" => -(2 ** 7)}],
+ [{"1" => nil}],
+ ]
+ target = build(:int8, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint8
+ records = [
+ [{"0" => (2 ** 8) - 1}],
+ [{"1" => nil}],
+ ]
+ target = build(:uint8, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int16
+ records = [
+ [{"0" => -(2 ** 15)}],
+ [{"1" => nil}],
+ ]
+ target = build(:int16, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint16
+ records = [
+ [{"0" => (2 ** 16) - 1}],
+ [{"1" => nil}],
+ ]
+ target = build(:uint16, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int32
+ records = [
+ [{"0" => -(2 ** 31)}],
+ [{"1" => nil}],
+ ]
+ target = build(:int32, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint32
+ records = [
+ [{"0" => (2 ** 32) - 1}],
+ [{"1" => nil}],
+ ]
+ target = build(:uint32, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int64
+ records = [
+ [{"0" => -(2 ** 63)}],
+ [{"1" => nil}],
+ ]
+ target = build(:int64, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint64
+ records = [
+ [{"0" => (2 ** 64) - 1}],
+ [{"1" => nil}],
+ ]
+ target = build(:uint64, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_float
+ records = [
+ [{"0" => -1.0}],
+ [{"1" => nil}],
+ ]
+ target = build(:float, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_double
+ records = [
+ [{"0" => -1.0}],
+ [{"1" => nil}],
+ ]
+ target = build(:double, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_binary
+ records = [
+ [{"0" => "\xff".b}],
+ [{"1" => nil}],
+ ]
+ target = build(:binary, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_string
+ records = [
+ [{"0" => "Ruby"}],
+ [{"1" => nil}],
+ ]
+ target = build(:string, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_date32
+ records = [
+ [{"0" => Date.new(1960, 1, 1)}],
+ [{"1" => nil}],
+ ]
+ target = build(:date32, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_date64
+ records = [
+ [{"0" => DateTime.new(1960, 1, 1, 2, 9, 30)}],
+ [{"1" => nil}],
+ ]
+ target = build(:date64, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_second
+ records = [
+ [{"0" => Time.parse("1960-01-01T02:09:30Z")}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :second,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_milli
+ records = [
+ [{"0" => Time.parse("1960-01-01T02:09:30.123Z")}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :milli,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_micro
+ records = [
+ [{"0" => Time.parse("1960-01-01T02:09:30.123456Z")}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :micro,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_nano
+ records = [
+ [{"0" => Time.parse("1960-01-01T02:09:30.123456789Z")}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :nano,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time32_second
+ unit = Arrow::TimeUnit::SECOND
+ records = [
+ # 00:10:00
+ [{"0" => Arrow::Time.new(unit, 60 * 10)}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :time32,
+ unit: :second,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time32_milli
+ unit = Arrow::TimeUnit::MILLI
+ records = [
+ # 00:10:00.123
+ [{"0" => Arrow::Time.new(unit, (60 * 10) * 1000 + 123)}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :time32,
+ unit: :milli,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time64_micro
+ unit = Arrow::TimeUnit::MICRO
+ records = [
+ # 00:10:00.123456
+ [{"0" => Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456)}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :time64,
+ unit: :micro,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time64_nano
+ unit = Arrow::TimeUnit::NANO
+ records = [
+ # 00:10:00.123456789
+ [{"0" => Arrow::Time.new(unit, (60 * 10) * 1_000_000_000 + 123_456_789)}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :time64,
+ unit: :nano,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_decimal128
+ records = [
+ [{"0" => BigDecimal("92.92")}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :decimal128,
+ precision: 8,
+ scale: 2,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_decimal256
+ records = [
+ [{"0" => BigDecimal("92.92")}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :decimal256,
+ precision: 38,
+ scale: 2,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_list
+ records = [
+ [{"0" => [true, nil, false]}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :list,
+ field: {
+ name: :sub_element,
+ type: :boolean,
+ },
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_struct
+ records = [
+ [{"0" => {"sub_field" => true}}],
+ [{"1" => nil}],
+ [{"0" => {"sub_field" => nil}}],
+ ]
+ target = build({
+ type: :struct,
+ fields: [
+ {
+ name: :sub_field,
+ type: :boolean,
+ },
+ ],
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_map
+ records = [
+ [{"0" => {"key1" => true, "key2" => nil}}],
+ [{"1" => nil}],
+ ]
+ target = build({
+ type: :map,
+ key: :string,
+ item: :boolean,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_sparse_union
+ omit("Need to add support for SparseUnionArrayBuilder")
+ records = [
+ [{"0" => {"field1" => true}}],
+ [{"1" => nil}],
+ [{"0" => {"field2" => nil}}],
+ ]
+ target = build({
+ type: :sparse_union,
+ fields: [
+ {
+ name: :field1,
+ type: :boolean,
+ },
+ {
+ name: :field2,
+ type: :uint8,
+ },
+ ],
+ type_codes: [0, 1],
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_dense_union
+ omit("Need to add support for DenseUnionArrayBuilder")
+ records = [
+ [{"0" => {"field1" => true}}],
+ [{"1" => nil}],
+ [{"0" => {"field2" => nil}}],
+ ]
+ target = build({
+ type: :dense_union,
+ fields: [
+ {
+ name: :field1,
+ type: :boolean,
+ },
+ {
+ name: :field2,
+ type: :uint8,
+ },
+ ],
+ type_codes: [0, 1],
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_dictionary
+ omit("Need to add support for DictionaryArrayBuilder")
+ records = [
+ [{"0" => "Ruby"}],
+ [{"1" => nil}],
+ [{"0" => "GLib"}],
+ ]
+ dictionary = Arrow::StringArray.new(["GLib", "Ruby"])
+ target = build({
+ type: :dictionary,
+ index_data_type: :int8,
+ dictionary: dictionary,
+ ordered: true,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+end
+
+class RawRecordsRecordBatchSparseUnionArrayTest < Test::Unit::TestCase
+ include RawRecordsSparseUnionArrayTests
+
+ def build(type, records)
+ build_record_batch(type, records)
+ end
+end
+
+class RawRecordsTableSparseUnionArrayTest < Test::Unit::TestCase
+ include RawRecordsSparseUnionArrayTests
+
+ def build(type, records)
+ build_record_batch(type, records).to_table
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/raw-records/test-struct-array.rb b/src/arrow/ruby/red-arrow/test/raw-records/test-struct-array.rb
new file mode 100644
index 000000000..6c01facf8
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/raw-records/test-struct-array.rb
@@ -0,0 +1,485 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module RawRecordsStructArrayTests
+ def build_schema(type)
+ field_description = {
+ name: :field,
+ }
+ if type.is_a?(Hash)
+ field_description = field_description.merge(type)
+ else
+ field_description[:type] = type
+ end
+ {
+ column: {
+ type: :struct,
+ fields: [
+ field_description,
+ ],
+ },
+ }
+ end
+
+ def test_null
+ records = [
+ [{"field" => nil}],
+ [nil],
+ ]
+ target = build(:null, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_boolean
+ records = [
+ [{"field" => true}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build(:boolean, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int8
+ records = [
+ [{"field" => -(2 ** 7)}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build(:int8, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint8
+ records = [
+ [{"field" => (2 ** 8) - 1}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build(:uint8, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int16
+ records = [
+ [{"field" => -(2 ** 15)}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build(:int16, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint16
+ records = [
+ [{"field" => (2 ** 16) - 1}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build(:uint16, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int32
+ records = [
+ [{"field" => -(2 ** 31)}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build(:int32, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint32
+ records = [
+ [{"field" => (2 ** 32) - 1}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build(:uint32, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_int64
+ records = [
+ [{"field" => -(2 ** 63)}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build(:int64, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_uint64
+ records = [
+ [{"field" => (2 ** 64) - 1}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build(:uint64, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_float
+ records = [
+ [{"field" => -1.0}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build(:float, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_double
+ records = [
+ [{"field" => -1.0}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build(:double, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_binary
+ records = [
+ [{"field" => "\xff".b}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build(:binary, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_string
+ records = [
+ [{"field" => "Ruby"}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build(:string, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_date32
+ records = [
+ [{"field" => Date.new(1960, 1, 1)}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build(:date32, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_date64
+ records = [
+ [{"field" => DateTime.new(1960, 1, 1, 2, 9, 30)}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build(:date64, records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_second
+ records = [
+ [{"field" => Time.parse("1960-01-01T02:09:30Z")}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :second,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_milli
+ records = [
+ [{"field" => Time.parse("1960-01-01T02:09:30.123Z")}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :milli,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_micro
+ records = [
+ [{"field" => Time.parse("1960-01-01T02:09:30.123456Z")}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :micro,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_timestamp_nano
+ records = [
+ [{"field" => Time.parse("1960-01-01T02:09:30.123456789Z")}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :nano,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time32_second
+ unit = Arrow::TimeUnit::SECOND
+ records = [
+ # 00:10:00
+ [{"field" => Arrow::Time.new(unit, 60 * 10)}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build({
+ type: :time32,
+ unit: :second,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time32_milli
+ unit = Arrow::TimeUnit::MILLI
+ records = [
+ # 00:10:00.123
+ [{"field" => Arrow::Time.new(unit, (60 * 10) * 1000 + 123)}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build({
+ type: :time32,
+ unit: :milli,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time64_micro
+ unit = Arrow::TimeUnit::MICRO
+ records = [
+ # 00:10:00.123456
+ [{"field" => Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456)}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build({
+ type: :time64,
+ unit: :micro,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_time64_nano
+ unit = Arrow::TimeUnit::NANO
+ records = [
+ # 00:10:00.123456789
+ [{"field" => Arrow::Time.new(unit, (60 * 10) * 1_000_000_000 + 123_456_789)}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build({
+ type: :time64,
+ unit: :nano,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_decimal128
+ records = [
+ [{"field" => BigDecimal("92.92")}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build({
+ type: :decimal128,
+ precision: 8,
+ scale: 2,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_decimal256
+ records = [
+ [{"field" => BigDecimal("92.92")}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build({
+ type: :decimal256,
+ precision: 38,
+ scale: 2,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_list
+ records = [
+ [{"field" => [true, nil, false]}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build({
+ type: :list,
+ field: {
+ name: :sub_element,
+ type: :boolean,
+ },
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_struct
+ records = [
+ [{"field" => {"sub_field" => true}}],
+ [nil],
+ [{"field" => nil}],
+ [{"field" => {"sub_field" => nil}}],
+ ]
+ target = build({
+ type: :struct,
+ fields: [
+ {
+ name: :sub_field,
+ type: :boolean,
+ },
+ ],
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_map
+ records = [
+ [{"field" => {"key1" => true, "key2" => nil}}],
+ [nil],
+ [{"field" => nil}],
+ ]
+ target = build({
+ type: :map,
+ key: :string,
+ item: :boolean,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_sparse_union
+ omit("Need to add support for SparseUnionArrayBuilder")
+ records = [
+ [{"field" => {"field1" => true}}],
+ [nil],
+ [{"field" => nil}],
+ [{"field" => {"field2" => nil}}],
+ ]
+ target = build({
+ type: :sparse_union,
+ fields: [
+ {
+ name: :field1,
+ type: :boolean,
+ },
+ {
+ name: :field2,
+ type: :uint8,
+ },
+ ],
+ type_codes: [0, 1],
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_dense_union
+ omit("Need to add support for DenseUnionArrayBuilder")
+ records = [
+ [{"field" => {"field1" => true}}],
+ [nil],
+ [{"field" => nil}],
+ [{"field" => {"field2" => nil}}],
+ ]
+ target = build({
+ type: :dense_union,
+ fields: [
+ {
+ name: :field1,
+ type: :boolean,
+ },
+ {
+ name: :field2,
+ type: :uint8,
+ },
+ ],
+ type_codes: [0, 1],
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+
+ def test_dictionary
+ omit("Need to add support for DictionaryArrayBuilder")
+ records = [
+ [{"field" => "Ruby"}],
+ [nil],
+ [{"field" => nil}],
+ [{"field" => "GLib"}],
+ ]
+ dictionary = Arrow::StringArray.new(["GLib", "Ruby"])
+ target = build({
+ type: :dictionary,
+ index_data_type: :int8,
+ dictionary: dictionary,
+ ordered: true,
+ },
+ records)
+ assert_equal(records, target.raw_records)
+ end
+end
+
+class RawRecordsRecordBatchStructArrayTest < Test::Unit::TestCase
+ include RawRecordsStructArrayTests
+
+ def build(type, records)
+ Arrow::RecordBatch.new(build_schema(type), records)
+ end
+end
+
+class RawRecordsTableStructArrayTest < Test::Unit::TestCase
+ include RawRecordsStructArrayTests
+
+ def build(type, records)
+ Arrow::Table.new(build_schema(type), records)
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/raw-records/test-table.rb b/src/arrow/ruby/red-arrow/test/raw-records/test-table.rb
new file mode 100644
index 000000000..ae90217c2
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/raw-records/test-table.rb
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class RawRecordsTableTest < Test::Unit::TestCase
+  test("2 record batches") do
+ raw_record_batches = [
+ [
+ [true, nil, "Ruby"],
+ [nil, 0, "GLib"],
+ [false, 2 ** 8 - 1, nil],
+ ],
+ [
+ [nil, 10, "A"],
+ [true, 20, "B"],
+ [false, nil, "C"],
+ [nil, 40, nil],
+      ],
+ ]
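+    # A table built from multiple record batches must yield all rows,
+    # in order, across the chunk boundary.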
+ raw_records = raw_record_batches.inject do |all_records, record_batch|
+ all_records + record_batch
+ end
+ schema = [
+ {name: :column0, type: :boolean},
+ {name: :column1, type: :uint8},
+ {name: :column2, type: :string},
+ ]
+ record_batches = raw_record_batches.collect do |record_batch|
+ Arrow::RecordBatch.new(schema, record_batch)
+ end
+ table = Arrow::Table.new(schema, record_batches)
+ assert_equal(raw_records, table.raw_records)
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/run-test.rb b/src/arrow/ruby/red-arrow/test/run-test.rb
new file mode 100755
index 000000000..41ab73cb6
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/run-test.rb
@@ -0,0 +1,71 @@
+#!/usr/bin/env ruby
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+$VERBOSE = true
+
+require "fileutils"
+require "pathname"
+
+(ENV["ARROW_DLL_PATH"] || "").split(File::PATH_SEPARATOR).each do |path|
+ RubyInstaller::Runtime.add_dll_directory(path)
+end
+
+base_dir = Pathname.new(__dir__).parent.expand_path
+
+lib_dir = base_dir + "lib"
+ext_dir = base_dir + "ext" + "arrow"
+test_dir = base_dir + "test"
+
+build_dir = ENV["BUILD_DIR"]
+if build_dir
+ build_dir = File.join(build_dir, "red-arrow")
+ FileUtils.mkdir_p(build_dir)
+else
+ build_dir = ext_dir
+end
+
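+# Build the C extension before running the tests unless NO_MAKE=yes,
+# preferring $MAKE, then gmake, then make.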
+make = nil
+if ENV["NO_MAKE"] != "yes"
+ if ENV["MAKE"]
+ make = ENV["MAKE"]
+ elsif system("which gmake > #{File::NULL} 2>&1")
+ make = "gmake"
+ elsif system("which make > #{File::NULL} 2>&1")
+ make = "make"
+ end
+end
+if make
+ Dir.chdir(build_dir.to_s) do
+ unless File.exist?("Makefile")
+ system(RbConfig.ruby,
+ (ext_dir + "extconf.rb").to_s,
+ "--enable-debug-build") or exit(false)
+ end
+ system("#{make} > #{File::NULL}") or exit(false)
+ end
+end
+
+$LOAD_PATH.unshift(build_dir.to_s)
+$LOAD_PATH.unshift(lib_dir.to_s)
+
+require_relative "helper"
+
+ENV["TEST_UNIT_MAX_DIFF_TARGET_STRING_SIZE"] ||= "10000"
+
+exit(Test::Unit::AutoRunner.run(true, test_dir.to_s))
diff --git a/src/arrow/ruby/red-arrow/test/test-array-builder.rb b/src/arrow/ruby/red-arrow/test/test-array-builder.rb
new file mode 100644
index 000000000..318167d51
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-array-builder.rb
@@ -0,0 +1,136 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class ArrayBuilderTest < Test::Unit::TestCase
+ sub_test_case(".build") do
+ def assert_build(builder_class, raw_array)
+ array = builder_class.build(raw_array)
+ assert_equal(raw_array, array.to_a)
+ end
+
+ sub_test_case("generic builder") do
+ test("strings") do
+ assert_build(Arrow::ArrayBuilder,
+ ["Hello", nil, "World"])
+ end
+
+ test("symbols") do
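+        # Symbols are dictionary-encoded instead of being built as a
+        # plain string array.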
+ array = Arrow::ArrayBuilder.build([:hello, nil, :world])
+ expected_builder = Arrow::StringDictionaryArrayBuilder.new
+ assert_equal(expected_builder.build(["hello", nil, "world"]),
+ array)
+ end
+
+ test("boolean") do
+ assert_build(Arrow::ArrayBuilder,
+ [true, nil, false])
+ end
+
+ test("positive integers") do
+ assert_build(Arrow::ArrayBuilder,
+ [1, nil, 2, nil, 3])
+ end
+
+ test("negative integers") do
+ assert_build(Arrow::ArrayBuilder,
+ [nil, -1, nil, -2, nil, -3])
+ end
+
+ test("times") do
+ assert_build(Arrow::ArrayBuilder,
+ [Time.at(0), Time.at(1), Time.at(2)])
+ end
+
+ test("dates") do
+ assert_build(Arrow::ArrayBuilder,
+ [Date.new(2018, 1, 4), Date.new(2018, 1, 5)])
+ end
+
+ test("datetimes") do
+ assert_build(Arrow::ArrayBuilder,
+ [
+ DateTime.new(2018, 1, 4, 23, 18, 23),
+ DateTime.new(2018, 1, 5, 0, 23, 21),
+ ])
+ end
+
+ test("list<boolean>s") do
+ assert_build(Arrow::ArrayBuilder,
+ [
+ [nil, true, false],
+ nil,
+ [false],
+ ])
+ end
+
+ test("list<string>s") do
+ assert_build(Arrow::ArrayBuilder,
+ [
+ ["Hello", "World"],
+ ["Apache Arrow"],
+ ])
+ end
+ end
+
+ sub_test_case("specific builder") do
+ test("empty") do
+ assert_build(Arrow::Int32ArrayBuilder,
+ [])
+ end
+
+ test("values") do
+ assert_build(Arrow::Int32ArrayBuilder,
+ [1, -2])
+ end
+
+ test("values, nils") do
+ assert_build(Arrow::Int32ArrayBuilder,
+ [1, -2, nil, nil])
+ end
+
+ test("values, nils, values") do
+ assert_build(Arrow::Int32ArrayBuilder,
+ [1, -2, nil, nil, 3, -4])
+ end
+
+ test("values, nils, values, nils") do
+ assert_build(Arrow::Int32ArrayBuilder,
+ [1, -2, nil, nil, 3, -4, nil, nil])
+ end
+
+ test("nils") do
+ assert_build(Arrow::Int32ArrayBuilder,
+ [nil, nil])
+ end
+
+ test("nils, values") do
+ assert_build(Arrow::Int32ArrayBuilder,
+ [nil, nil, 3, -4])
+ end
+
+ test("nils, values, nil") do
+ assert_build(Arrow::Int32ArrayBuilder,
+ [nil, nil, 3, -4, nil, nil])
+ end
+
+ test("nils, values, nil, values") do
+ assert_build(Arrow::Int32ArrayBuilder,
+ [nil, nil, 3, -4, nil, nil, 5, -6])
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-array.rb b/src/arrow/ruby/red-arrow/test/test-array.rb
new file mode 100644
index 000000000..2b7112da6
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-array.rb
@@ -0,0 +1,325 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class ArrayTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ test("Boolean") do
+ array = Arrow::BooleanArray.new([true, false, true])
+ assert_equal([true, false, true],
+ array.to_a)
+ end
+ end
+
+ sub_test_case("instance methods") do
+ def setup
+ @values = [true, false, nil, true]
+ @array = Arrow::BooleanArray.new(@values)
+ end
+
+ test("#each") do
+ assert_equal(@values, @array.to_a)
+ end
+
+ sub_test_case("#[]") do
+ test("valid range") do
+ assert_equal(@values,
+ @array.length.times.collect {|i| @array[i]})
+ end
+
+ test("out of range") do
+ assert_nil(@array[@array.length])
+ end
+
+ test("negative index") do
+ assert_equal(@values.last,
+ @array[-1])
+ end
+ end
+
+ sub_test_case("#==") do
+ test("Arrow::Array") do
+ assert do
+ @array == @array
+ end
+ end
+
+ test("not Arrow::Array") do
+ assert do
+ not (@array == 29)
+ end
+ end
+ end
+
+ sub_test_case("#equal_array?") do
+ test("no options") do
+ array1 = Arrow::FloatArray.new([1.1, Float::NAN])
+ array2 = Arrow::FloatArray.new([1.1, Float::NAN])
+ assert do
+ not array1.equal_array?(array2)
+ end
+ end
+
+ test("approx") do
+ array1 = Arrow::FloatArray.new([1.1])
+ array2 = Arrow::FloatArray.new([1.100001])
+ assert do
+ array1.equal_array?(array2, approx: true)
+ end
+ end
+
+ test("nans-equal") do
+ array1 = Arrow::FloatArray.new([1.1, Float::NAN])
+ array2 = Arrow::FloatArray.new([1.1, Float::NAN])
+ assert do
+ array1.equal_array?(array2, nans_equal: true)
+ end
+ end
+
+ test("absolute-tolerance") do
+ array1 = Arrow::FloatArray.new([1.1])
+ array2 = Arrow::FloatArray.new([1.101])
+ assert do
+ array1.equal_array?(array2, approx: true, absolute_tolerance: 0.01)
+ end
+ end
+ end
+
+ sub_test_case("#cast") do
+ test("Symbol") do
+ assert_equal(Arrow::Int32Array.new([1, 2, 3]),
+ Arrow::StringArray.new(["1", "2", "3"]).cast(:int32))
+ end
+ end
+ end
+
+ sub_test_case("#filter") do
+ def setup
+ values = [true, false, false, true]
+ @array = Arrow::BooleanArray.new(values)
+ @options = Arrow::FilterOptions.new
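+      # :emit_null keeps a null in the output wherever the filter is null.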
+ @options.null_selection_behavior = :emit_null
+ end
+
+ test("Array: boolean") do
+ filter = [nil, true, true, false]
+ filtered_array = Arrow::BooleanArray.new([nil, false, false])
+ assert_equal(filtered_array,
+ @array.filter(filter, @options))
+ end
+
+ test("Arrow::BooleanArray") do
+ filter = Arrow::BooleanArray.new([nil, true, true, false])
+ filtered_array = Arrow::BooleanArray.new([nil, false, false])
+ assert_equal(filtered_array,
+ @array.filter(filter, @options))
+ end
+
+ test("Arrow::ChunkedArray") do
+ chunks = [
+ Arrow::BooleanArray.new([nil, true]),
+ Arrow::BooleanArray.new([true, false]),
+ ]
+ filter = Arrow::ChunkedArray.new(chunks)
+ filtered_array = Arrow::BooleanArray.new([nil, false, false])
+ assert_equal(filtered_array,
+ @array.filter(filter, @options))
+ end
+ end
+
+ sub_test_case("#take") do
+ def setup
+      values = [1, 0, 2]
+ @array = Arrow::Int16Array.new(values)
+ end
+
+    test("Array") do
+ indices = [1, 0, 2]
+ assert_equal(Arrow::Int16Array.new([0, 1, 2]),
+ @array.take(indices))
+ end
+
+ test("Arrow::Array") do
+ indices = Arrow::Int16Array.new([1, 0, 2])
+ assert_equal(Arrow::Int16Array.new([0, 1, 2]),
+ @array.take(indices))
+ end
+
+ test("Arrow::ChunkedArray") do
+ taken_chunks = [
+ Arrow::Int16Array.new([0, 1]),
+ Arrow::Int16Array.new([2])
+ ]
+ taken_chunked_array = Arrow::ChunkedArray.new(taken_chunks)
+ indices_chunks = [
+ Arrow::Int16Array.new([1, 0]),
+ Arrow::Int16Array.new([2])
+ ]
+ indices = Arrow::ChunkedArray.new(indices_chunks)
+ assert_equal(taken_chunked_array,
+ @array.take(indices))
+ end
+ end
+
+ sub_test_case("#is_in") do
+ def setup
+ values = [1, 0, 1, 2]
+ @array = Arrow::Int16Array.new(values)
+ end
+
+    test("Array") do
+ right = [2, 0]
+ assert_equal(Arrow::BooleanArray.new([false, true, false, true]),
+ @array.is_in(right))
+ end
+
+ test("Arrow::Array") do
+ right = Arrow::Int16Array.new([2, 0])
+ assert_equal(Arrow::BooleanArray.new([false, true, false, true]),
+ @array.is_in(right))
+ end
+
+ test("Arrow::ChunkedArray") do
+ chunks = [
+ Arrow::Int16Array.new([1, 4]),
+ Arrow::Int16Array.new([0, 3])
+ ]
+ right = Arrow::ChunkedArray.new(chunks)
+ assert_equal(Arrow::BooleanArray.new([true, true, true, false]),
+ @array.is_in(right))
+ end
+ end
+
+ sub_test_case("#concatenate") do
+ test("Arrow::Array: same") do
+      assert_equal(Arrow::Int32Array.new([1, 2, nil, 4, 5, 6]),
+ Arrow::Int32Array.new([1, 2, nil]).
+ concatenate(Arrow::Int32Array.new([4, 5]),
+ Arrow::Int32Array.new([6])))
+ end
+
+ test("Arrow::Array: castable") do
+      assert_equal(Arrow::Int32Array.new([1, 2, nil, 4, 5, 6]),
+ Arrow::Int32Array.new([1, 2, nil]).
+ concatenate(Arrow::Int8Array.new([4, 5]),
+ Arrow::UInt32Array.new([6])))
+ end
+
+ test("Arrow::Array: non-castable") do
+ assert_raise(Arrow::Error::Invalid) do
+ Arrow::Int32Array.new([1, 2, nil]).
+ concatenate(Arrow::StringArray.new(["X"]))
+ end
+ end
+
+ test("Array") do
+      assert_equal(Arrow::Int32Array.new([1, 2, nil, 4, nil, 6]),
+ Arrow::Int32Array.new([1, 2, nil]).
+ concatenate([4, nil],
+ [6]))
+ end
+
+ test("invalid") do
+ message = "[array][resolve] can't build int32 array: 4"
+ assert_raise(ArgumentError.new(message)) do
+ Arrow::Int32Array.new([1, 2, nil]).
+ concatenate(4)
+ end
+ end
+ end
+
+ sub_test_case("#+") do
+ test("Arrow::Array: same") do
+      assert_equal(Arrow::Int32Array.new([1, 2, nil, 4, 5, 6]),
+ Arrow::Int32Array.new([1, 2, nil]) +
+ Arrow::Int32Array.new([4, 5, 6]))
+ end
+
+ test("Arrow::Array: castable") do
+      assert_equal(Arrow::Int32Array.new([1, 2, nil, 4, 5, 6]),
+ Arrow::Int32Array.new([1, 2, nil]) +
+ Arrow::Int8Array.new([4, 5, 6]))
+ end
+
+ test("Arrow::Array: non-castable") do
+ assert_raise(Arrow::Error::Invalid) do
+ Arrow::Int32Array.new([1, 2, nil]) +
+ Arrow::StringArray.new(["X"])
+ end
+ end
+
+ test("Array") do
+      assert_equal(Arrow::Int32Array.new([1, 2, nil, 4, nil, 6]),
+ Arrow::Int32Array.new([1, 2, nil]) +
+ [4, nil, 6])
+ end
+
+ test("invalid") do
+ message = "[array][resolve] can't build int32 array: 4"
+ assert_raise(ArgumentError.new(message)) do
+ Arrow::Int32Array.new([1, 2, nil]) + 4
+ end
+ end
+ end
+
+ sub_test_case("#resolve") do
+ test("Arrow::Array: same") do
+ assert_equal(Arrow::Int32Array.new([1, 2, nil]),
+ Arrow::Int32Array.new([]).
+ resolve(Arrow::Int32Array.new([1, 2, nil])))
+ end
+
+ test("Arrow::Array: castable") do
+ assert_equal(Arrow::Int32Array.new([1, 2, nil]),
+ Arrow::Int32Array.new([]).
+ resolve(Arrow::Int8Array.new([1, 2, nil])))
+ end
+
+ test("Arrow::Array: non-castable") do
+ assert_raise(Arrow::Error::Invalid) do
+        Arrow::Int32Array.new([]).
+          resolve(Arrow::StringArray.new(["X"]))
+ end
+ end
+
+ test("Array: non-parametric") do
+ assert_equal(Arrow::Int32Array.new([1, 2, nil]),
+ Arrow::Int32Array.new([]).
+ resolve([1, 2, nil]))
+ end
+
+ test("Array: parametric") do
+ list_data_type = Arrow::ListDataType.new(name: "visible", type: :boolean)
+ list_array = Arrow::ListArray.new(list_data_type, [])
+ assert_equal(Arrow::ListArray.new(list_data_type,
+ [
+ [true, false],
+ nil,
+ ]),
+ list_array.resolve([
+ [true, false],
+ nil,
+ ]))
+ end
+
+ test("invalid") do
+ message = "[array][resolve] can't build int32 array: 4"
+ assert_raise(ArgumentError.new(message)) do
+ Arrow::Int32Array.new([]).resolve(4)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-bigdecimal.rb b/src/arrow/ruby/red-arrow/test/test-bigdecimal.rb
new file mode 100644
index 000000000..424f12d39
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-bigdecimal.rb
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class BigDecimalTest < Test::Unit::TestCase
+ sub_test_case("#to_arrow") do
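+    # Values up to 38 significant digits fit in Decimal128; wider values
+    # need Decimal256.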
+ def test_128_positive
+ assert_equal(Arrow::Decimal128.new("0.1e38"),
+ BigDecimal("0.1e38").to_arrow)
+ end
+
+ def test_128_negative
+ assert_equal(Arrow::Decimal128.new("-0.1e38"),
+ BigDecimal("-0.1e38").to_arrow)
+ end
+
+ def test_256_positive
+ assert_equal(Arrow::Decimal256.new("0.1e39"),
+ BigDecimal("0.1e39").to_arrow)
+ end
+
+ def test_256_negative
+ assert_equal(Arrow::Decimal256.new("-0.1e39"),
+ BigDecimal("-0.1e39").to_arrow)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-binary-dictionary-array-builder.rb b/src/arrow/ruby/red-arrow/test/test-binary-dictionary-array-builder.rb
new file mode 100644
index 000000000..743dbae5e
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-binary-dictionary-array-builder.rb
@@ -0,0 +1,103 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class BinaryDictionaryArrayBuilderTest < Test::Unit::TestCase
+ def setup
+ @builder = Arrow::BinaryDictionaryArrayBuilder.new
+ end
+
+ sub_test_case("#append_values") do
+ test("[nil]") do
+ @builder.append_values([nil])
+ array = @builder.finish
+ assert_equal([
+ [],
+ [nil],
+ ],
+ [
+ array.dictionary.to_a,
+ array.indices.to_a,
+ ])
+ end
+
+ test("[String]") do
+ @builder.append_values(["he\xffllo"])
+ array = @builder.finish
+ assert_equal([
+ ["he\xffllo".b],
+ [0],
+ ],
+ [
+ array.dictionary.to_a,
+ array.indices.to_a,
+ ])
+ end
+
+ test("[Symbol]") do
+ @builder.append_values([:hello])
+ array = @builder.finish
+ assert_equal([
+ ["hello"],
+ [0],
+ ],
+ [
+ array.dictionary.to_a,
+ array.indices.to_a,
+ ])
+ end
+
+ test("[nil, String, Symbol]") do
+ @builder.append_values([
+ nil,
+ "He\xffllo",
+ :world,
+ "world",
+ ])
+ array = @builder.finish
+ assert_equal([
+ ["He\xffllo".b, "world"],
+ [nil, 0, 1, 1],
+ ],
+ [
+ array.dictionary.to_a,
+ array.indices.to_a,
+ ])
+ end
+
+ test("is_valids") do
+ @builder.append_values([
+ "He\xffllo",
+ :world,
+ :goodbye,
+ ],
+ [
+ true,
+ false,
+ true,
+ ])
+ array = @builder.finish
+ assert_equal([
+ ["He\xffllo".b, "goodbye"],
+ [0, nil, 1],
+ ],
+ [
+ array.dictionary.to_a,
+ array.indices.to_a,
+ ])
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-boolean-scalar.rb b/src/arrow/ruby/red-arrow/test/test-boolean-scalar.rb
new file mode 100644
index 000000000..1053d1716
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-boolean-scalar.rb
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class BooleanScalarTest < Test::Unit::TestCase
+ def setup
+ @scalar = Arrow::BooleanScalar.new(true)
+ end
+
+ test("#value") do
+ assert_equal(true, @scalar.value)
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-buffer.rb b/src/arrow/ruby/red-arrow/test/test-buffer.rb
new file mode 100644
index 000000000..b47a1abba
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-buffer.rb
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class BufferTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ test("GC") do
+ data = "Hello"
+ data_id = data.object_id
+ _buffer = Arrow::Buffer.new(data)
+ data = nil
+ GC.start
+ assert_equal("Hello", ObjectSpace._id2ref(data_id))
+ end
+ end
+
+ sub_test_case("instance methods") do
+ def setup
+ @buffer = Arrow::Buffer.new("Hello")
+ end
+
+ sub_test_case("#==") do
+ test("Arrow::Buffer") do
+ assert do
+ @buffer == @buffer
+ end
+ end
+
+ test("not Arrow::Buffer") do
+ assert do
+ not (@buffer == 29)
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-chunked-array.rb b/src/arrow/ruby/red-arrow/test/test-chunked-array.rb
new file mode 100644
index 000000000..3785e9868
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-chunked-array.rb
@@ -0,0 +1,183 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class ChunkedArrayTest < Test::Unit::TestCase
+ test("#each") do
+ arrays = [
+ Arrow::BooleanArray.new([true, false]),
+ Arrow::BooleanArray.new([nil, true]),
+ ]
+ chunked_array = Arrow::ChunkedArray.new(arrays)
+ assert_equal([true, false, nil, true],
+ chunked_array.to_a)
+ end
+
+ sub_test_case("#pack") do
+ test("basic array") do
+ arrays = [
+ Arrow::BooleanArray.new([true, false]),
+ Arrow::BooleanArray.new([nil, true]),
+ ]
+ chunked_array = Arrow::ChunkedArray.new(arrays)
+ packed_chunked_array = chunked_array.pack
+ assert_equal([
+ Arrow::BooleanArray,
+ [true, false, nil, true],
+ ],
+ [
+ packed_chunked_array.class,
+ packed_chunked_array.to_a,
+ ])
+ end
+
+ test("TimestampArray") do
+ type = Arrow::TimestampDataType.new(:nano)
+ arrays = [
+ Arrow::TimestampArrayBuilder.new(type).build([Time.at(0)]),
+ Arrow::TimestampArrayBuilder.new(type).build([Time.at(1)]),
+ ]
+ chunked_array = Arrow::ChunkedArray.new(arrays)
+ packed_chunked_array = chunked_array.pack
+ assert_equal([
+ Arrow::TimestampArray,
+ [Time.at(0), Time.at(1)],
+ ],
+ [
+ packed_chunked_array.class,
+ packed_chunked_array.to_a,
+ ])
+ end
+ end
+
+ sub_test_case("#==") do
+ def setup
+ arrays = [
+ Arrow::BooleanArray.new([true]),
+ Arrow::BooleanArray.new([false, true]),
+ ]
+ @chunked_array = Arrow::ChunkedArray.new(arrays)
+ end
+
+ test("Arrow::ChunkedArray") do
+ assert do
+ @chunked_array == @chunked_array
+ end
+ end
+
+ test("not Arrow::ChunkedArray") do
+ assert do
+ not (@chunked_array == 29)
+ end
+ end
+ end
+
+ sub_test_case("#filter") do
+ def setup
+ arrays = [
+ Arrow::BooleanArray.new([false, true]),
+ Arrow::BooleanArray.new([false, true, false]),
+ ]
+ @chunked_array = Arrow::ChunkedArray.new(arrays)
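+ # :emit_null makes a nil entry in the filter produce a null in the
+ # output rather than dropping the row.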
+ @options = Arrow::FilterOptions.new
+ @options.null_selection_behavior = :emit_null
+ end
+
+ test("Array: boolean") do
+ filter = [nil, true, true, false, true]
+ chunks = [
+ Arrow::BooleanArray.new([nil, true]),
+ Arrow::BooleanArray.new([false, false]),
+ ]
+ filtered_chunked_array = Arrow::ChunkedArray.new(chunks)
+ assert_equal(filtered_chunked_array,
+ @chunked_array.filter(filter, @options))
+ end
+
+ test("Arrow::BooleanArray") do
+ filter = Arrow::BooleanArray.new([nil, true, true, false, true])
+ chunks = [
+ Arrow::BooleanArray.new([nil, true]),
+ Arrow::BooleanArray.new([false, false]),
+ ]
+ filtered_chunked_array = Arrow::ChunkedArray.new(chunks)
+ assert_equal(filtered_chunked_array,
+ @chunked_array.filter(filter, @options))
+ end
+
+ test("Arrow::ChunkedArray") do
+ chunks = [
+ Arrow::BooleanArray.new([nil, true]),
+ Arrow::BooleanArray.new([true, false, true]),
+ ]
+ filter = Arrow::ChunkedArray.new(chunks)
+ filtered_chunks = [
+ Arrow::BooleanArray.new([nil, true]),
+ Arrow::BooleanArray.new([false, false]),
+ ]
+ filtered_chunked_array = Arrow::ChunkedArray.new(filtered_chunks)
+ assert_equal(filtered_chunked_array,
+ @chunked_array.filter(filter, @options))
+ end
+ end
+
+ sub_test_case("#take") do
+ def setup
+ chunks = [
+ Arrow::Int16Array.new([1, 0]),
+ Arrow::Int16Array.new([2]),
+ ]
+ @chunked_array = Arrow::ChunkedArray.new(chunks)
+ end
+
+ test("Arrow: boolean") do
+ chunks = [
+ Arrow::Int16Array.new([0, 1]),
+ Arrow::Int16Array.new([2])
+ ]
+ taken_chunked_array = Arrow::ChunkedArray.new(chunks)
+ indices = [1, 0, 2]
+ assert_equal(taken_chunked_array,
+ @chunked_array.take(indices))
+ end
+
+ test("Arrow::Array") do
+ chunks = [
+ Arrow::Int16Array.new([0, 1]),
+ Arrow::Int16Array.new([2])
+ ]
+ taken_chunked_array = Arrow::ChunkedArray.new(chunks)
+ indices = Arrow::Int16Array.new([1, 0, 2])
+ assert_equal(taken_chunked_array,
+ @chunked_array.take(indices))
+ end
+
+ test("Arrow::ChunkedArray") do
+ taken_chunks = [
+ Arrow::Int16Array.new([0, 1]),
+ Arrow::Int16Array.new([2])
+ ]
+ taken_chunked_array = Arrow::ChunkedArray.new(taken_chunks)
+ indices_chunks = [
+ Arrow::Int16Array.new([1, 0]),
+ Arrow::Int16Array.new([2])
+ ]
+ indices = Arrow::ChunkedArray.new(indices_chunks)
+ assert_equal(taken_chunked_array,
+ @chunked_array.take(indices))
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-column.rb b/src/arrow/ruby/red-arrow/test/test-column.rb
new file mode 100644
index 000000000..613b01ccc
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-column.rb
@@ -0,0 +1,92 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
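+# Arrow::Table exposes each column as a reader method (table.visible below);
+# the returned column knows its name, type, validity and row counts.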
+class ColumnTest < Test::Unit::TestCase
+ def setup
+ table = Arrow::Table.new("visible" => [true, nil, false])
+ @column = table.visible
+ end
+
+ test("#name") do
+ assert_equal("visible", @column.name)
+ end
+
+ test("#data_type") do
+ assert_equal(Arrow::BooleanDataType.new, @column.data_type)
+ end
+
+ test("#null?") do
+ assert do
+ @column.null?(1)
+ end
+ end
+
+ test("#valid?") do
+ assert do
+ @column.valid?(0)
+ end
+ end
+
+ test("#each") do
+ assert_equal([true, nil, false], @column.each.to_a)
+ end
+
+ test("#reverse_each") do
+ assert_equal([false, nil, true], @column.reverse_each.to_a)
+ end
+
+ test("#n_rows") do
+ assert_equal(3, @column.n_rows)
+ end
+
+ test("#n_nulls") do
+ assert_equal(1, @column.n_nulls)
+ end
+
+ sub_test_case("#==") do
+ test("same value") do
+ table1 = Arrow::Table.new("visible" => [true, false])
+ table2 = Arrow::Table.new("visible" => [true, false])
+ assert do
+ table1.visible == table2.visible
+ end
+ end
+
+ test("different name") do
+ table1 = Arrow::Table.new("visible" => [true, false])
+ table2 = Arrow::Table.new("invisible" => [true, false])
+ assert do
+ not table1.visible == table2.invisible
+ end
+ end
+
+ test("different value") do
+ table1 = Arrow::Table.new("visible" => [true, false])
+ table2 = Arrow::Table.new("visible" => [true, true])
+ assert do
+ not table1.visible == table2.visible
+ end
+ end
+
+ test("not Arrow::Column") do
+ table = Arrow::Table.new("visible" => [true, false])
+ assert do
+ not table.visible == 29
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-csv-loader.rb b/src/arrow/ruby/red-arrow/test/test-csv-loader.rb
new file mode 100644
index 000000000..7f7f23498
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-csv-loader.rb
@@ -0,0 +1,250 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class CSVLoaderTest < Test::Unit::TestCase
+ include Helper::Fixture
+
+ def load_csv(input)
+ Arrow::CSVLoader.load(input, skip_lines: /^#/)
+ end
+
+ sub_test_case(".load") do
+ test("String: data: with header") do
+ data = fixture_path("with-header-float.csv").read
+ assert_equal(<<-TABLE, load_csv(data).to_s)
+ name score
+0 alice 10.100000
+1 bob 29.200000
+2 chris -1.300000
+ TABLE
+ end
+
+ test("String: data: without header") do
+ data = fixture_path("without-header-float.csv").read
+ assert_equal(<<-TABLE, load_csv(data).to_s)
+ 0 1
+0 alice 10.100000
+1 bob 29.200000
+2 chris -1.300000
+ TABLE
+ end
+
+ test("String: path: with header") do
+ path = fixture_path("with-header-float.csv").to_s
+ assert_equal(<<-TABLE, load_csv(path).to_s)
+ name score
+0 alice 10.100000
+1 bob 29.200000
+2 chris -1.300000
+ TABLE
+ end
+
+ test("String: path: without header") do
+ path = fixture_path("without-header-float.csv").to_s
+ assert_equal(<<-TABLE, load_csv(path).to_s)
+ 0 1
+0 alice 10.100000
+1 bob 29.200000
+2 chris -1.300000
+ TABLE
+ end
+
+ test("Pathname: with header") do
+ path = fixture_path("with-header-float.csv")
+ assert_equal(<<-TABLE, load_csv(path).to_s)
+ name score
+0 alice 10.100000
+1 bob 29.200000
+2 chris -1.300000
+ TABLE
+ end
+
+ test("Pathname: without header") do
+ path = fixture_path("without-header-float.csv")
+ assert_equal(<<-TABLE, load_csv(path).to_s)
+ 0 1
+0 alice 10.100000
+1 bob 29.200000
+2 chris -1.300000
+ TABLE
+ end
+
+ test("null: with double quote") do
+ path = fixture_path("null-with-double-quote.csv").to_s
+ assert_equal(<<-TABLE, load_csv(path).to_s)
+ name score
+0 alice 10
+1 bob (null)
+2 chris -1
+ TABLE
+ end
+
+ test("null: without double quote") do
+ path = fixture_path("null-without-double-quote.csv").to_s
+ assert_equal(<<-TABLE, load_csv(path).to_s)
+ name score
+0 alice 10
+1 bob (null)
+2 chris -1
+ TABLE
+ end
+
+ test("number: float, integer") do
+ path = fixture_path("float-integer.csv").to_s
+ assert_equal([2.9, 10, -1.1],
+ load_csv(path)[:score].to_a)
+ end
+
+ test("number: integer, float") do
+ path = fixture_path("integer-float.csv").to_s
+ assert_equal([10.0, 2.9, -1.1],
+ load_csv(path)[:score].to_a)
+ end
+ end
+
+ sub_test_case("CSVReader") do
+ def load_csv(data, **options)
+ Arrow::CSVLoader.load(data, **options)
+ end
+
+ sub_test_case(":headers") do
+ test("true") do
+ values = Arrow::StringArray.new(["a", "b", "c"])
+ assert_equal(Arrow::Table.new(value: values),
+ load_csv(<<-CSV, headers: true))
+value
+a
+b
+c
+ CSV
+ end
+
+ test(":first_line") do
+ values = Arrow::StringArray.new(["a", "b", "c"])
+ assert_equal(Arrow::Table.new(value: values),
+ load_csv(<<-CSV, headers: :first_line))
+value
+a
+b
+c
+ CSV
+ end
+
+ test("truthy") do
+ values = Arrow::StringArray.new(["a", "b", "c"])
+ assert_equal(Arrow::Table.new(value: values),
+ load_csv(<<-CSV, headers: 0))
+value
+a
+b
+c
+ CSV
+ end
+
+ test("Array of column names") do
+ values = Arrow::StringArray.new(["a", "b", "c"])
+ assert_equal(Arrow::Table.new(column: values),
+ load_csv(<<-CSV, headers: ["column"]))
+a
+b
+c
+ CSV
+ end
+
+ test("false") do
+ values = Arrow::StringArray.new(["a", "b", "c"])
+ assert_equal(Arrow::Table.new(f0: values),
+ load_csv(<<-CSV, headers: false))
+a
+b
+c
+ CSV
+ end
+
+ test("nil") do
+ values = Arrow::StringArray.new(["a", "b", "c"])
+ assert_equal(Arrow::Table.new(f0: values),
+ load_csv(<<-CSV, headers: nil))
+a
+b
+c
+ CSV
+ end
+
+ test("string") do
+ values = Arrow::StringArray.new(["a", "b", "c"])
+ assert_equal(Arrow::Table.new(column: values),
+ load_csv(<<-CSV, headers: "column"))
+a
+b
+c
+ CSV
+ end
+ end
+
+ test(":column_types") do
+ assert_equal(Arrow::Table.new(:count => Arrow::UInt16Array.new([1, 2, 4])),
+ load_csv(<<-CSV, column_types: {count: :uint16}))
+count
+1
+2
+4
+ CSV
+ end
+
+ test(":schema") do
+ table = Arrow::Table.new(:count => Arrow::UInt16Array.new([1, 2, 4]))
+ assert_equal(table,
+ load_csv(<<-CSV, schema: table.schema))
+count
+1
+2
+4
+ CSV
+ end
+
+ test(":encoding") do
+ messages = [
+ "\u3042", # U+3042 HIRAGANA LETTER A
+ "\u3044", # U+3044 HIRAGANA LETTER I
+ "\u3046", # U+3046 HIRAGANA LETTER U
+ ]
+ table = Arrow::Table.new(:message => Arrow::StringArray.new(messages))
+ encoding = "cp932"
+ assert_equal(table,
+ load_csv((["message"] + messages).join("\n").encode(encoding),
+ schema: table.schema,
+ encoding: encoding))
+ end
+
+ test(":encoding and :compression") do
+ messages = [
+ "\u3042", # U+3042 HIRAGANA LETTER A
+ "\u3044", # U+3044 HIRAGANA LETTER I
+ "\u3046", # U+3046 HIRAGANA LETTER U
+ ]
+ table = Arrow::Table.new(:message => Arrow::StringArray.new(messages))
+ encoding = "cp932"
+ csv = (["message"] + messages).join("\n").encode(encoding)
+ assert_equal(table,
+ load_csv(Zlib::Deflate.deflate(csv),
+ schema: table.schema,
+ encoding: encoding,
+ compression: :gzip))
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-data-type.rb b/src/arrow/ruby/red-arrow/test/test-data-type.rb
new file mode 100644
index 000000000..f54831780
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-data-type.rb
@@ -0,0 +1,83 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class DataTypeTest < Test::Unit::TestCase
+ sub_test_case(".resolve") do
+ test("DataType") do
+ assert_equal(Arrow::BooleanDataType.new,
+ Arrow::DataType.resolve(Arrow::BooleanDataType.new))
+ end
+
+ test("String") do
+ assert_equal(Arrow::BooleanDataType.new,
+ Arrow::DataType.resolve("boolean"))
+ end
+
+ test("Symbol") do
+ assert_equal(Arrow::BooleanDataType.new,
+ Arrow::DataType.resolve(:boolean))
+ end
+
+ test("Array") do
+ field = Arrow::Field.new(:visible, :boolean)
+ assert_equal(Arrow::ListDataType.new(field),
+ Arrow::DataType.resolve([:list, field]))
+ end
+
+ test("Hash") do
+ field = Arrow::Field.new(:visible, :boolean)
+ assert_equal(Arrow::ListDataType.new(field),
+ Arrow::DataType.resolve(type: :list, field: field))
+ end
+
+ test("_") do
+ assert_equal(Arrow::FixedSizeBinaryDataType.new(10),
+ Arrow::DataType.resolve([:fixed_size_binary, 10]))
+ end
+
+ test("abstract") do
+ message =
+ "abstract type: <:floating_point>: " +
+ "use one of not abstract type: [" +
+ "Arrow::DoubleDataType, " +
+ "Arrow::FloatDataType]"
+ assert_raise(ArgumentError.new(message)) do
+ Arrow::DataType.resolve(:floating_point)
+ end
+ end
+ end
+
+ sub_test_case("instance methods") do
+ def setup
+ @data_type = Arrow::StringDataType.new
+ end
+
+ sub_test_case("#==") do
+ test("Arrow::DataType") do
+ assert do
+ @data_type == @data_type
+ end
+ end
+
+ test("not Arrow::DataType") do
+ assert do
+ not (@data_type == 29)
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-date32-array.rb b/src/arrow/ruby/red-arrow/test/test-date32-array.rb
new file mode 100644
index 000000000..6918b48db
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-date32-array.rb
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class Date32ArrayTest < Test::Unit::TestCase
+ test("#[]") do
+ n_days_since_epoch = 17406 # 2017-08-28
+ array = Arrow::Date32Array.new([n_days_since_epoch])
+ assert_equal(Date.new(2017, 8, 28), array[0])
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-date64-array.rb b/src/arrow/ruby/red-arrow/test/test-date64-array.rb
new file mode 100644
index 000000000..ec1c6db7c
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-date64-array.rb
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class Date64ArrayTest < Test::Unit::TestCase
+ test("#[]") do
+ n_msecs_since_epoch = 1503878400000 # 2017-08-28T00:00:00Z
+ array = Arrow::Date64Array.new([n_msecs_since_epoch])
+ assert_equal(DateTime.new(2017, 8, 28, 0, 0, 0),
+ array[0])
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-decimal128-array-builder.rb b/src/arrow/ruby/red-arrow/test/test-decimal128-array-builder.rb
new file mode 100644
index 000000000..31d58bd58
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-decimal128-array-builder.rb
@@ -0,0 +1,112 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class Decimal128ArrayBuilderTest < Test::Unit::TestCase
+ def setup
+ @data_type = Arrow::Decimal128DataType.new(3, 1)
+ @builder = Arrow::Decimal128ArrayBuilder.new(@data_type)
+ end
+
+ sub_test_case("#append_value") do
+ test("nil") do
+ @builder.append_value(nil)
+ array = @builder.finish
+ assert_equal(nil, array[0])
+ end
+
+ test("Arrow::Decimal128") do
+ @builder.append_value(Arrow::Decimal128.new("10.1"))
+ array = @builder.finish
+ assert_equal(BigDecimal("10.1"),
+ array[0])
+ end
+
+ test("String") do
+ @builder.append_value("10.1")
+ array = @builder.finish
+ assert_equal(BigDecimal("10.1"),
+ array[0])
+ end
+
+ test("Float") do
+ @builder.append_value(10.1)
+ array = @builder.finish
+ assert_equal(BigDecimal("10.1"),
+ array[0])
+ end
+
+ test("BigDecimal") do
+ @builder.append_value(BigDecimal("10.1"))
+ array = @builder.finish
+ assert_equal(BigDecimal("10.1"),
+ array[0])
+ end
+ end
+
+ sub_test_case("#append_values") do
+ test("mixed") do
+ @builder.append_values([
+ Arrow::Decimal128.new("10.1"),
+ nil,
+ "10.1",
+ 10.1,
+ BigDecimal("10.1"),
+ ])
+ array = @builder.finish
+ assert_equal([
+ BigDecimal("10.1"),
+ nil,
+ BigDecimal("10.1"),
+ BigDecimal("10.1"),
+ BigDecimal("10.1"),
+ ],
+ array.to_a)
+ end
+
+ test("is_valids") do
+ @builder.append_values([
+ Arrow::Decimal128.new("10.1"),
+ Arrow::Decimal128.new("10.1"),
+ Arrow::Decimal128.new("10.1"),
+ ],
+ [
+ true,
+ false,
+ true,
+ ])
+ array = @builder.finish
+ assert_equal([
+ BigDecimal("10.1"),
+ nil,
+ BigDecimal("10.1"),
+ ],
+ array.to_a)
+ end
+
+ test("packed") do
+ @builder.append_values(Arrow::Decimal128.new("10.1").to_bytes.to_s * 3,
+ [true, false, true])
+ array = @builder.finish
+ assert_equal([
+ BigDecimal("10.1"),
+ nil,
+ BigDecimal("10.1"),
+ ],
+ array.to_a)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-decimal128-array.rb b/src/arrow/ruby/red-arrow/test/test-decimal128-array.rb
new file mode 100644
index 000000000..88ab1c26c
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-decimal128-array.rb
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class Decimal128ArrayTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ test("build") do
+ data_type = Arrow::Decimal128DataType.new(3, 1)
+ values = [
+ 10.1,
+ nil,
+ "10.1",
+ BigDecimal("10.1"),
+ ]
+ array = Arrow::Decimal128Array.new(data_type, values)
+ assert_equal([
+ BigDecimal("10.1"),
+ nil,
+ BigDecimal("10.1"),
+ BigDecimal("10.1"),
+ ],
+ array.to_a)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-decimal128-data-type.rb b/src/arrow/ruby/red-arrow/test/test-decimal128-data-type.rb
new file mode 100644
index 000000000..5390a7a44
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-decimal128-data-type.rb
@@ -0,0 +1,31 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class Decimal128DataTypeTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ test("ordered arguments") do
+ assert_equal("decimal128(8, 2)",
+ Arrow::Decimal128DataType.new(8, 2).to_s)
+ end
+
+ test("description") do
+ assert_equal("decimal128(8, 2)",
+ Arrow::Decimal128DataType.new(precision: 8,
+ scale: 2).to_s)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-decimal128.rb b/src/arrow/ruby/red-arrow/test/test-decimal128.rb
new file mode 100644
index 000000000..9e7f8792c
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-decimal128.rb
@@ -0,0 +1,102 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class Decimal128Test < Test::Unit::TestCase
+ sub_test_case("instance methods") do
+ def setup
+ @decimal128 = Arrow::Decimal128.new("10.1")
+ end
+
+ sub_test_case("#==") do
+ test("Arrow::Decimal128") do
+ assert do
+ @decimal128 == @decimal128
+ end
+ end
+
+ test("not Arrow::Decimal128") do
+ assert do
+ not (@decimal128 == 10.1)
+ end
+ end
+ end
+
+ sub_test_case("#!=") do
+ test("Arrow::Decimal128") do
+ assert do
+ not (@decimal128 != @decimal128)
+ end
+ end
+
+ test("not Arrow::Decimal128") do
+ assert do
+ @decimal128 != 10.1
+ end
+ end
+ end
+
+ sub_test_case("#to_s") do
+ test("default") do
+ assert_equal("101",
+ @decimal128.to_s)
+ end
+
+ test("scale") do
+ assert_equal("10.1",
+ @decimal128.to_s(1))
+ end
+ end
+
+ test("#abs") do
+ decimal128 = Arrow::Decimal128.new("-10.1")
+ assert_equal([
+ Arrow::Decimal128.new("-10.1"),
+ Arrow::Decimal128.new("10.1"),
+ ],
+ [
+ decimal128,
+ decimal128.abs,
+ ])
+ end
+
+ test("#abs!") do
+ decimal128 = Arrow::Decimal128.new("-10.1")
+ decimal128.abs!
+ assert_equal(Arrow::Decimal128.new("10.1"),
+ decimal128)
+ end
+
+ test("#negate") do
+ decimal128 = Arrow::Decimal128.new("-10.1")
+ assert_equal([
+ Arrow::Decimal128.new("-10.1"),
+ Arrow::Decimal128.new("10.1"),
+ ],
+ [
+ decimal128,
+ decimal128.negate,
+ ])
+ end
+
+ test("#negate!") do
+ decimal128 = Arrow::Decimal128.new("-10.1")
+ decimal128.negate!
+ assert_equal(Arrow::Decimal128.new("10.1"),
+ decimal128)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-decimal256-array-builder.rb b/src/arrow/ruby/red-arrow/test/test-decimal256-array-builder.rb
new file mode 100644
index 000000000..f0769b662
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-decimal256-array-builder.rb
@@ -0,0 +1,112 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class Decimal256ArrayBuilderTest < Test::Unit::TestCase
+ def setup
+ @data_type = Arrow::Decimal256DataType.new(3, 1)
+ @builder = Arrow::Decimal256ArrayBuilder.new(@data_type)
+ end
+
+ sub_test_case("#append_value") do
+ test("nil") do
+ @builder.append_value(nil)
+ array = @builder.finish
+ assert_equal(nil, array[0])
+ end
+
+ test("Arrow::Decimal256") do
+ @builder.append_value(Arrow::Decimal256.new("10.1"))
+ array = @builder.finish
+ assert_equal(BigDecimal("10.1"),
+ array[0])
+ end
+
+ test("String") do
+ @builder.append_value("10.1")
+ array = @builder.finish
+ assert_equal(BigDecimal("10.1"),
+ array[0])
+ end
+
+ test("Float") do
+ @builder.append_value(10.1)
+ array = @builder.finish
+ assert_equal(BigDecimal("10.1"),
+ array[0])
+ end
+
+ test("BigDecimal") do
+ @builder.append_value(BigDecimal("10.1"))
+ array = @builder.finish
+ assert_equal(BigDecimal("10.1"),
+ array[0])
+ end
+ end
+
+ sub_test_case("#append_values") do
+ test("mixed") do
+ @builder.append_values([
+ Arrow::Decimal256.new("10.1"),
+ nil,
+ "10.1",
+ 10.1,
+ BigDecimal("10.1"),
+ ])
+ array = @builder.finish
+ assert_equal([
+ BigDecimal("10.1"),
+ nil,
+ BigDecimal("10.1"),
+ BigDecimal("10.1"),
+ BigDecimal("10.1"),
+ ],
+ array.to_a)
+ end
+
+ test("is_valids") do
+ @builder.append_values([
+ Arrow::Decimal256.new("10.1"),
+ Arrow::Decimal256.new("10.1"),
+ Arrow::Decimal256.new("10.1"),
+ ],
+ [
+ true,
+ false,
+ true,
+ ])
+ array = @builder.finish
+ assert_equal([
+ BigDecimal("10.1"),
+ nil,
+ BigDecimal("10.1"),
+ ],
+ array.to_a)
+ end
+
+ test("packed") do
+ @builder.append_values(Arrow::Decimal256.new("10.1").to_bytes.to_s * 3,
+ [true, false, true])
+ array = @builder.finish
+ assert_equal([
+ BigDecimal("10.1"),
+ nil,
+ BigDecimal("10.1"),
+ ],
+ array.to_a)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-decimal256-array.rb b/src/arrow/ruby/red-arrow/test/test-decimal256-array.rb
new file mode 100644
index 000000000..7049a4509
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-decimal256-array.rb
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class Decimal256ArrayTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ test("build") do
+ data_type = Arrow::Decimal256DataType.new(3, 1)
+ values = [
+ 10.1,
+ nil,
+ "10.1",
+ BigDecimal("10.1"),
+ ]
+ array = Arrow::Decimal256Array.new(data_type, values)
+ assert_equal([
+ BigDecimal("10.1"),
+ nil,
+ BigDecimal("10.1"),
+ BigDecimal("10.1"),
+ ],
+ array.to_a)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-decimal256-data-type.rb b/src/arrow/ruby/red-arrow/test/test-decimal256-data-type.rb
new file mode 100644
index 000000000..96b2a505b
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-decimal256-data-type.rb
@@ -0,0 +1,31 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class Decimal256DataTypeTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ test("ordered arguments") do
+ assert_equal("decimal256(8, 2)",
+ Arrow::Decimal256DataType.new(8, 2).to_s)
+ end
+
+ test("description") do
+ assert_equal("decimal256(8, 2)",
+ Arrow::Decimal256DataType.new(precision: 8,
+ scale: 2).to_s)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-decimal256.rb b/src/arrow/ruby/red-arrow/test/test-decimal256.rb
new file mode 100644
index 000000000..422167f99
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-decimal256.rb
@@ -0,0 +1,102 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class Decimal256Test < Test::Unit::TestCase
+ sub_test_case("instance methods") do
+ def setup
+ @decimal256 = Arrow::Decimal256.new("10.1")
+ end
+
+ sub_test_case("#==") do
+ test("Arrow::Decimal256") do
+ assert do
+ @decimal256 == @decimal256
+ end
+ end
+
+ test("not Arrow::Decimal256") do
+ assert do
+ not (@decimal256 == 10.1)
+ end
+ end
+ end
+
+ sub_test_case("#!=") do
+ test("Arrow::Decimal256") do
+ assert do
+ not (@decimal256 != @decimal256)
+ end
+ end
+
+ test("not Arrow::Decimal256") do
+ assert do
+ @decimal256 != 10.1
+ end
+ end
+ end
+
+ sub_test_case("#to_s") do
+ test("default") do
+ assert_equal("101",
+ @decimal256.to_s)
+ end
+
+ test("scale") do
+ assert_equal("10.1",
+ @decimal256.to_s(1))
+ end
+ end
+
+ test("#abs") do
+ decimal256 = Arrow::Decimal256.new("-10.1")
+ assert_equal([
+ Arrow::Decimal256.new("-10.1"),
+ Arrow::Decimal256.new("10.1"),
+ ],
+ [
+ decimal256,
+ decimal256.abs,
+ ])
+ end
+
+ test("#abs!") do
+ decimal256 = Arrow::Decimal256.new("-10.1")
+ decimal256.abs!
+ assert_equal(Arrow::Decimal256.new("10.1"),
+ decimal256)
+ end
+
+ test("#negate") do
+ decimal256 = Arrow::Decimal256.new("-10.1")
+ assert_equal([
+ Arrow::Decimal256.new("-10.1"),
+ Arrow::Decimal256.new("10.1"),
+ ],
+ [
+ decimal256,
+ decimal256.negate,
+ ])
+ end
+
+ test("#negate!") do
+ decimal256 = Arrow::Decimal256.new("-10.1")
+ decimal256.negate!
+ assert_equal(Arrow::Decimal256.new("10.1"),
+ decimal256)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-dense-union-data-type.rb b/src/arrow/ruby/red-arrow/test/test-dense-union-data-type.rb
new file mode 100644
index 000000000..d8da6f772
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-dense-union-data-type.rb
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class DenseUnionDataTypeTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ def setup
+ @fields = [
+ Arrow::Field.new("visible", :boolean),
+ {
+ name: "count",
+ type: :int32,
+ },
+ ]
+ end
+
+ test("ordered arguments") do
+ assert_equal("dense_union<visible: bool=2, count: int32=9>",
+ Arrow::DenseUnionDataType.new(@fields, [2, 9]).to_s)
+ end
+
+ test("description") do
+ assert_equal("dense_union<visible: bool=2, count: int32=9>",
+ Arrow::DenseUnionDataType.new(fields: @fields,
+ type_codes: [2, 9]).to_s)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-dictionary-array.rb b/src/arrow/ruby/red-arrow/test/test-dictionary-array.rb
new file mode 100644
index 000000000..83368e9ec
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-dictionary-array.rb
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
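+# dictionary_encode turns a StringArray into a dictionary-encoded array that
+# still reads back the original values by index.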
+class DictionaryArrayTest < Test::Unit::TestCase
+ sub_test_case("instance methods") do
+ def setup
+ @values = ["a", "b", "c", "b", "a"]
+ @string_array = Arrow::StringArray.new(@values)
+ @array = @string_array.dictionary_encode
+ end
+
+ test("#[]") do
+ assert_equal(@values, @array.to_a)
+ end
+
+ test("#get_value") do
+ assert_equal([
+ @values[0],
+ @values[3],
+ ],
+ [
+ @array.get_value(0),
+ @array.get_value(3),
+ ])
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-dictionary-data-type.rb b/src/arrow/ruby/red-arrow/test/test-dictionary-data-type.rb
new file mode 100644
index 000000000..c5b6dd1bf
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-dictionary-data-type.rb
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class DictionaryDataTypeTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ def setup
+ @index_data_type = :int8
+ @value_data_type = :string
+ @ordered = true
+ end
+
+ test("ordered arguments") do
+ assert_equal("dictionary<values=string, indices=int8, ordered=1>",
+ Arrow::DictionaryDataType.new(@index_data_type,
+ @value_data_type,
+ @ordered).to_s)
+ end
+
+ test("description") do
+ assert_equal("dictionary<values=string, indices=int8, ordered=1>",
+ Arrow::DictionaryDataType.new(index_data_type: @index_data_type,
+ value_data_type: @value_data_type,
+ ordered: @ordered).to_s)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-expression.rb b/src/arrow/ruby/red-arrow/test/test-expression.rb
new file mode 100644
index 000000000..e172e78be
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-expression.rb
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestExpression < Test::Unit::TestCase
+ sub_test_case(".try_convert") do
+ test("Symbol") do
+ assert_equal(Arrow::FieldExpression.new("visible"),
+ Arrow::Expression.try_convert(:visible))
+ end
+
+ test("[String]") do
+ assert_equal(Arrow::CallExpression.new("func", []),
+ Arrow::Expression.try_convert(["func"]))
+ end
+
+ test("[Symbol]") do
+ assert_equal(Arrow::CallExpression.new("func", []),
+ Arrow::Expression.try_convert([:func]))
+ end
+
+ test("[String, String]") do
+ assert_equal(Arrow::CallExpression.new("func", ["argument1"]),
+ Arrow::Expression.try_convert(["func", "argument1"]))
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-feather.rb b/src/arrow/ruby/red-arrow/test/test-feather.rb
new file mode 100644
index 000000000..21d8a2c31
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-feather.rb
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
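+# Table#save and Table.load round-trip through Feather here; the format is
+# presumably picked from the .feather extension, and compression: :zstd
+# selects Zstandard-compressed Feather.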
+class FeatherTest < Test::Unit::TestCase
+ include Helper::Fixture
+
+ def setup
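+ # test-unit invokes setup with a block wrapping the test body, so yield
+ # runs the test and the ensure clause reliably removes the Tempfile.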
+ columns = {
+ "message" => Arrow::StringArray.new(["Start", "Crash", "Shutdown"]),
+ "is_critical" => Arrow::BooleanArray.new([false, true, false]),
+ }
+ @table = Arrow::Table.new(columns)
+
+ @output = Tempfile.new(["red-arrow", ".feather"])
+ begin
+ yield(@output)
+ ensure
+ @output.close!
+ end
+ end
+
+ def test_default
+ @table.save(@output.path)
+ @output.close
+
+ assert_equal(@table, Arrow::Table.load(@output.path))
+ end
+
+ def test_compression
+ @table.save(@output.path, compression: :zstd)
+ @output.close
+
+ assert_equal(@table, Arrow::Table.load(@output.path))
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-field.rb b/src/arrow/ruby/red-arrow/test/test-field.rb
new file mode 100644
index 000000000..1b6bc4b17
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-field.rb
@@ -0,0 +1,91 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class FieldTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ test("String, Arrow::DataType") do
+ assert_equal("visible: bool",
+ Arrow::Field.new("visible", Arrow::BooleanDataType.new).to_s)
+ end
+
+ test("Symbol, Arrow::DataType") do
+ assert_equal("visible: bool",
+ Arrow::Field.new(:visible, Arrow::BooleanDataType.new).to_s)
+ end
+
+ test("String, Symbol") do
+ assert_equal("visible: bool",
+ Arrow::Field.new(:visible, :boolean).to_s)
+ end
+
+ test("String, Hash") do
+ assert_equal("visible: bool",
+ Arrow::Field.new(:visible, type: :boolean).to_s)
+ end
+
+ test("description: String") do
+ assert_equal("visible: bool",
+ Arrow::Field.new(name: "visible",
+ data_type: :boolean).to_s)
+ end
+
+ test("description: Symbol") do
+ assert_equal("visible: bool",
+ Arrow::Field.new(name: :visible,
+ data_type: :boolean).to_s)
+ end
+
+ test("description: shortcut") do
+ assert_equal("visible: bool",
+ Arrow::Field.new(name: :visible,
+ type: :boolean).to_s)
+ end
+
+ test("Hash: shortcut: additional") do
+ description = {
+ name: :tags,
+ type: :list,
+ field: {
+ name: "tag",
+ type: :string,
+ },
+ }
+ assert_equal("tags: list<tag: string>",
+ Arrow::Field.new(description).to_s)
+ end
+ end
+
+ sub_test_case("instance methods") do
+ def setup
+ @field = Arrow::Field.new("count", :uint32)
+ end
+
+ sub_test_case("#==") do
+ test("Arrow::Field") do
+ assert do
+ @field == @field
+ end
+ end
+
+ test("not Arrow::Field") do
+ assert do
+ not (@field == 29)
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-file-output-stream.rb b/src/arrow/ruby/red-arrow/test/test-file-output-stream.rb
new file mode 100644
index 000000000..559406a4e
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-file-output-stream.rb
@@ -0,0 +1,54 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestFileOutputStream < Test::Unit::TestCase
+ sub_test_case(".open") do
+ def setup
+ @file = Tempfile.open("arrow-file-output-stream")
+ @file.write("Hello")
+ @file.close
+ end
+
+ def test_default
+ Arrow::FileOutputStream.open(@file.path) do |file|
+ file.write(" World")
+ end
+ assert_equal(" World", File.read(@file.path))
+ end
+
+ def test_options_append
+ Arrow::FileOutputStream.open(@file.path, append: true) do |file|
+ file.write(" World")
+ end
+ assert_equal("Hello World", File.read(@file.path))
+ end
+
+ def test_append_true
+ Arrow::FileOutputStream.open(@file.path, true) do |file|
+ file.write(" World")
+ end
+ assert_equal("Hello World", File.read(@file.path))
+ end
+
+ def test_append_false
+ Arrow::FileOutputStream.open(@file.path, false) do |file|
+ file.write(" World")
+ end
+ assert_equal(" World", File.read(@file.path))
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-fixed-size-binary-array-builder.rb b/src/arrow/ruby/red-arrow/test/test-fixed-size-binary-array-builder.rb
new file mode 100644
index 000000000..fae79f285
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-fixed-size-binary-array-builder.rb
@@ -0,0 +1,92 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class FixedSizeBinaryArrayBuilderTest < Test::Unit::TestCase
+ def setup
+ @data_type = Arrow::FixedSizeBinaryDataType.new(4)
+ @builder = Arrow::FixedSizeBinaryArrayBuilder.new(@data_type)
+ end
+
+ sub_test_case("#append_value") do
+ test("nil") do
+ @builder.append_value(nil)
+ array = @builder.finish
+ assert_equal(nil, array[0])
+ end
+
+ test("String") do
+ @builder.append_value("0123")
+ array = @builder.finish
+ assert_equal("0123", array[0])
+ end
+
+ test("GLib::Bytes") do
+ @builder.append_value(GLib::Bytes.new("0123"))
+ array = @builder.finish
+ assert_equal("0123", array[0])
+ end
+ end
+
+ sub_test_case("#append_values") do
+ test("mixed") do
+ @builder.append_values([
+ "0123",
+ nil,
+ GLib::Bytes.new("abcd"),
+ ])
+ array = @builder.finish
+ assert_equal([
+ "0123",
+ nil,
+ "abcd",
+ ],
+ array.to_a)
+ end
+
+ test("is_valids") do
+ @builder.append_values([
+ "0123",
+ "0123",
+ "0123",
+ ],
+ [
+ true,
+ false,
+ true,
+ ])
+ array = @builder.finish
+ assert_equal([
+ "0123",
+ nil,
+ "0123",
+ ],
+ array.to_a)
+ end
+
+ test("packed") do
+ @builder.append_values("0123" * 3,
+ [true, false, true])
+ array = @builder.finish
+ assert_equal([
+ "0123",
+ nil,
+ "0123",
+ ],
+ array.to_a)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-fixed-size-binary-array.rb b/src/arrow/ruby/red-arrow/test/test-fixed-size-binary-array.rb
new file mode 100644
index 000000000..3cb46b964
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-fixed-size-binary-array.rb
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class FixedSizeBinaryArrayTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ test("build") do
+ data_type = Arrow::FixedSizeBinaryDataType.new(4)
+ values = [
+ "0123",
+ nil,
+ GLib::Bytes.new("abcd"),
+ ]
+ array = Arrow::FixedSizeBinaryArray.new(data_type, values)
+ assert_equal([
+ "0123",
+ nil,
+ "abcd",
+ ],
+ array.to_a)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-float-scalar.rb b/src/arrow/ruby/red-arrow/test/test-float-scalar.rb
new file mode 100644
index 000000000..1117d7728
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-float-scalar.rb
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class FloatScalarTest < Test::Unit::TestCase
+ sub_test_case("#equal_scalar?") do
+ test("no options") do
+ scalar1 = Arrow::FloatScalar.new(1.1)
+ scalar2 = Arrow::FloatScalar.new(1.1000001)
+ assert do
+ not scalar1.equal_scalar?(scalar2)
+ end
+ end
+
+ test(":approx") do
+ scalar1 = Arrow::FloatScalar.new(1.1)
+ scalar2 = Arrow::FloatScalar.new(1.1000001)
+ assert do
+ scalar1.equal_scalar?(scalar2, approx: true)
+ end
+ end
+
+ test(":absolute_tolerance") do
+ scalar1 = Arrow::FloatScalar.new(1.1)
+ scalar2 = Arrow::FloatScalar.new(1.1001)
+ assert do
+ scalar1.equal_scalar?(scalar2,
+ approx: true,
+ absolute_tolerance: 0.001)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-function.rb b/src/arrow/ruby/red-arrow/test/test-function.rb
new file mode 100644
index 000000000..95667e66c
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-function.rb
@@ -0,0 +1,176 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class FunctionTest < Test::Unit::TestCase
+ sub_test_case("#execute") do
+ test("Arrow::Array") do
+ or_function = Arrow::Function.find("or")
+ args = [
+ Arrow::BooleanArray.new([true, false, false]),
+ Arrow::BooleanArray.new([true, false, true]),
+ ]
+ assert_equal([true, false, true],
+ or_function.execute(args).value.to_a)
+ end
+
+ test("Array") do
+ or_function = Arrow::Function.find("or")
+ args = [
+ [true, false, false],
+ [true, false, true],
+ ]
+ assert_equal([true, false, true],
+ or_function.execute(args).value.to_a)
+ end
+
+ test("Arrow::ChunkedArray") do
+ or_function = Arrow::Function.find("or")
+ args = [
+ Arrow::ChunkedArray.new([
+ Arrow::BooleanArray.new([true]),
+ Arrow::BooleanArray.new([false, false]),
+ ]),
+ Arrow::ChunkedArray.new([
+ Arrow::BooleanArray.new([true, false]),
+ Arrow::BooleanArray.new([true]),
+ ]),
+ ]
+ assert_equal([true, false, true],
+ or_function.execute(args).value.to_a)
+ end
+
+ test("Arrow::Scalar") do
+ add_function = Arrow::Function.find("add")
+ args = [
+ Arrow::Int8Array.new([1, 2, 3]),
+ Arrow::Int8Scalar.new(5),
+ ]
+ assert_equal([6, 7, 8],
+ add_function.execute(args).value.to_a)
+ end
+
+ test("Integer") do
+ add_function = Arrow::Function.find("add")
+ args = [
+ [1, 2, 3],
+ 5,
+ ]
+ assert_equal([6, 7, 8],
+ add_function.execute(args).value.to_a)
+ end
+
+ test("Float") do
+ add_function = Arrow::Function.find("add")
+ args = [
+ [1, 2, 3],
+ 5.1,
+ ]
+ assert_equal([6.1, 7.1, 8.1],
+ add_function.execute(args).value.to_a)
+ end
+
+ test("true") do
+ and_function = Arrow::Function.find("and")
+ args = [
+ Arrow::BooleanArray.new([true, false, false]),
+ true,
+ ]
+ assert_equal([true, false, false],
+ and_function.execute(args).value.to_a)
+ end
+
+ test("false") do
+ or_function = Arrow::Function.find("or")
+ args = [
+ Arrow::BooleanArray.new([true, false, false]),
+ false,
+ ]
+ assert_equal([true, false, false],
+ or_function.execute(args).value.to_a)
+ end
+
+ test("String") do
+ ascii_upper_function = Arrow::Function.find("ascii_upper")
+ args = [
+ "Hello",
+ ]
+ assert_equal("HELLO",
+ ascii_upper_function.execute(args).value.to_s)
+ end
+
+ test("Date") do
+ cast_function = Arrow::Function.find("cast")
+ date = Date.new(2021, 6, 12)
+ args = [date]
+ options = Arrow::CastOptions.new
+ options.to_data_type = Arrow::TimestampDataType.new(:second)
+ time = Time.utc(date.year,
+ date.month,
+ date.day)
+ assert_equal(Arrow::TimestampScalar.new(options.to_data_type,
+ time.to_i),
+ cast_function.execute(args, options).value)
+ end
+
+ test("Arrow::Time: second") do
+ cast_function = Arrow::Function.find("cast")
+ arrow_time = Arrow::Time.new(Arrow::TimeUnit::SECOND,
+ # 00:10:00
+ 60 * 10)
+ args = [arrow_time]
+ options = Arrow::CastOptions.new
+ options.to_data_type = Arrow::Time64DataType.new(:micro)
+ assert_equal(Arrow::Time64Scalar.new(options.to_data_type,
+ # 00:10:00.000000
+ 60 * 10 * 1000 * 1000),
+ cast_function.execute(args, options).value)
+ end
+
+ test("Arrow::Time: micro") do
+ cast_function = Arrow::Function.find("cast")
+ arrow_time = Arrow::Time.new(Arrow::TimeUnit::MICRO,
+ # 00:10:00.000000
+ 60 * 10 * 1000 * 1000)
+ args = [arrow_time]
+ options = Arrow::CastOptions.new
+ options.to_data_type = Arrow::Time32DataType.new(:second)
+ options.allow_time_truncate = true
+ assert_equal(Arrow::Time32Scalar.new(options.to_data_type,
+ # 00:10:00
+ 60 * 10),
+ cast_function.execute(args, options).value)
+ end
+
+ test("Time") do
+ cast_function = Arrow::Function.find("cast")
+ time = Time.utc(2021, 6, 12, 1, 2, 3, 1)
+ args = [time]
+ options = Arrow::CastOptions.new
+ options.to_data_type = Arrow::TimestampDataType.new(:second)
+ options.allow_time_truncate = true
+ time = Time.utc(time.year,
+ time.month,
+ time.day,
+ time.hour,
+ time.min,
+ time.sec)
+ assert_equal(Arrow::TimestampScalar.new(options.to_data_type,
+ time.to_i),
+ cast_function.execute(args, options).value)
+ end
+ end
+end
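+
+# The pattern behind every case above, as a minimal sketch: look a
+# compute function up by name, execute it with a list of arguments
+# (Arrow arrays, chunked arrays, scalars, or plain Ruby objects that
+# red-arrow converts for you), optionally pass a function-specific
+# options object, and unwrap the result with #value:
+#
+#   add = Arrow::Function.find("add")
+#   add.execute([[1, 2, 3], 5]).value.to_a # => [6, 7, 8]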
diff --git a/src/arrow/ruby/red-arrow/test/test-group.rb b/src/arrow/ruby/red-arrow/test/test-group.rb
new file mode 100644
index 000000000..2823977d5
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-group.rb
@@ -0,0 +1,180 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class GroupTest < Test::Unit::TestCase
+ include Helper::Fixture
+
+ def setup
+ raw_table = {
+ :group_key1 => Arrow::UInt8Array.new([1, 1, 2, 3, 3, 3]),
+ :group_key2 => Arrow::UInt8Array.new([1, 1, 1, 1, 2, 2]),
+ :int => Arrow::Int32Array.new([-1, -2, nil, -4, -5, -6]),
+ :uint => Arrow::UInt32Array.new([1, nil, 3, 4, 5, 6]),
+ :float => Arrow::FloatArray.new([nil, 2.2, 3.3, 4.4, 5.5, 6.6]),
+ :string => Arrow::StringArray.new(["a", "b", "c", nil, "e", "f"]),
+ }
+ @table = Arrow::Table.new(raw_table)
+ end
+
+ sub_test_case("key") do
+ test("Time") do
+ time_values = [
+ Time.parse("2018-01-29"),
+ Time.parse("2018-01-30"),
+ ]
+ raw_table = {
+ :time => Arrow::ArrayBuilder.build(time_values),
+ :int => Arrow::Int32Array.new([-1, -2]),
+ }
+ table = Arrow::Table.new(raw_table)
+ assert_equal(<<-TABLE, table.group(:time).count.to_s)
+ count(int) time
+0 1 #{time_values[0].iso8601}
+1 1 #{time_values[1].iso8601}
+ TABLE
+ end
+ end
+
+ sub_test_case("#count") do
+ test("single") do
+ assert_equal(<<-TABLE, @table.group(:group_key1).count.to_s)
+ count(group_key2) count(int) count(uint) count(float) count(string) group_key1
+0 2 2 1 1 2 1
+1 1 0 1 1 1 2
+2 3 3 3 3 2 3
+ TABLE
+ end
+
+ test("multiple") do
+ assert_equal(<<-TABLE, @table.group(:group_key1, :group_key2).count.to_s)
+ count(int) count(uint) count(float) count(string) group_key1 group_key2
+0 2 1 1 2 1 1
+1 0 1 1 1 2 1
+2 1 1 1 0 3 1
+3 2 2 2 2 3 2
+ TABLE
+ end
+
+ test("column") do
+ group = @table.group(:group_key1, :group_key2)
+ assert_equal(<<-TABLE, group.count(:int, :uint).to_s)
+ count(int) count(uint) group_key1 group_key2
+0 2 1 1 1
+1 0 1 2 1
+2 1 1 3 1
+3 2 2 3 2
+ TABLE
+ end
+ end
+
+ sub_test_case("#sum") do
+ test("single") do
+ assert_equal(<<-TABLE, @table.group(:group_key1).sum.to_s)
+ sum(group_key2) sum(int) sum(uint) sum(float) group_key1
+0 2 -3 1 2.200000 1
+1 1 (null) 3 3.300000 2
+2 5 -15 15 16.500000 3
+ TABLE
+ end
+
+ test("multiple") do
+ assert_equal(<<-TABLE, @table.group(:group_key1, :group_key2).sum.to_s)
+ sum(int) sum(uint) sum(float) group_key1 group_key2
+0 -3 1 2.200000 1 1
+1 (null) 3 3.300000 2 1
+2 -4 4 4.400000 3 1
+3 -11 11 12.100000 3 2
+ TABLE
+ end
+ end
+
+ sub_test_case("#mean") do
+ test("single") do
+ assert_equal(<<-TABLE, @table.group(:group_key1).mean.to_s)
+ mean(group_key2) mean(int) mean(uint) mean(float) group_key1
+0 1.000000 -1.500000 1.000000 2.200000 1
+1 1.000000 (null) 3.000000 3.300000 2
+2 1.666667 -5.000000 5.000000 5.500000 3
+ TABLE
+ end
+
+ test("multiple") do
+ assert_equal(<<-TABLE, @table.group(:group_key1, :group_key2).mean.to_s)
+ mean(int) mean(uint) mean(float) group_key1 group_key2
+0 -1.500000 1.000000 2.200000 1 1
+1 (null) 3.000000 3.300000 2 1
+2 -4.000000 4.000000 4.400000 3 1
+3 -5.500000 5.500000 6.050000 3 2
+ TABLE
+ end
+ end
+
+ sub_test_case("#min") do
+ test("single") do
+ assert_equal(<<-TABLE, @table.group(:group_key1).min.to_s)
+ min(group_key2) min(int) min(uint) min(float) group_key1
+0 1 -2 1 2.200000 1
+1 1 (null) 3 3.300000 2
+2 1 -6 4 4.400000 3
+ TABLE
+ end
+
+ test("multiple") do
+ assert_equal(<<-TABLE, @table.group(:group_key1, :group_key2).min.to_s)
+ min(int) min(uint) min(float) group_key1 group_key2
+0 -2 1 2.200000 1 1
+1 (null) 3 3.300000 2 1
+2 -4 4 4.400000 3 1
+3 -6 5 5.500000 3 2
+ TABLE
+ end
+ end
+
+ sub_test_case("#max") do
+ test("single") do
+ assert_equal(<<-TABLE, @table.group(:group_key1).max.to_s)
+ max(group_key2) max(int) max(uint) max(float) group_key1
+0 1 -1 1 2.200000 1
+1 1 (null) 3 3.300000 2
+2 2 -4 6 6.600000 3
+ TABLE
+ end
+
+ test("multiple") do
+ assert_equal(<<-TABLE, @table.group(:group_key1, :group_key2).max.to_s)
+ max(int) max(uint) max(float) group_key1 group_key2
+0 -1 1 2.200000 1 1
+1 (null) 3 3.300000 2 1
+2 -4 4 4.400000 3 1
+3 -5 6 6.600000 3 2
+ TABLE
+ end
+ end
+
+ sub_test_case("#aggregate") do
+ test("function()") do
+ group = @table.group(:group_key1, :group_key2)
+ assert_equal(<<-TABLE, group.aggregate("count(int)", "sum(uint)").to_s)
+ count(int) sum(uint) group_key1 group_key2
+0 2 1 1 1
+1 0 3 2 1
+2 1 4 3 1
+3 2 11 3 2
+ TABLE
+ end
+ end
+end
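+
+# The aggregation flow exercised above, in brief: Table#group groups on
+# the given key columns; #count, #sum, #mean, #min and #max aggregate
+# every remaining column, or only the named ones when columns are
+# passed; #aggregate takes "function(column)" strings. A sketch over
+# the fixture table from #setup:
+#
+#   @table.group(:group_key1).sum                      # aggregate all other columns
+#   @table.group(:group_key1, :group_key2).count(:int) # only the :int column
+#   group.aggregate("count(int)", "sum(uint)")         # mixed aggregations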
diff --git a/src/arrow/ruby/red-arrow/test/test-list-array-builder.rb b/src/arrow/ruby/red-arrow/test/test-list-array-builder.rb
new file mode 100644
index 000000000..aee31e73b
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-list-array-builder.rb
@@ -0,0 +1,79 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class ListArrayBuilderTest < Test::Unit::TestCase
+ def setup
+ @data_type = Arrow::ListDataType.new(name: "visible", type: :boolean)
+ @builder = Arrow::ListArrayBuilder.new(@data_type)
+ end
+
+ sub_test_case("#append_value") do
+ test("nil") do
+ @builder.append_value(nil)
+ array = @builder.finish
+ assert_equal(nil, array[0])
+ end
+
+ test("Array") do
+ @builder.append_value([true, false, true])
+ array = @builder.finish
+ assert_equal([true, false, true], array[0].to_a)
+ end
+ end
+
+ sub_test_case("#append_values") do
+ test("[nil, Array]") do
+ @builder.append_values([[false], nil, [true, false, true]])
+ array = @builder.finish
+ assert_equal([
+ [false],
+ nil,
+ [true, false, true],
+ ],
+ array.collect {|list| list ? list.to_a : nil})
+ end
+
+ test("is_valids") do
+ @builder.append_values([[false], [true, true], [true, false, true]],
+ [true, false, true])
+ array = @builder.finish
+ assert_equal([
+ [false],
+ nil,
+ [true, false, true],
+ ],
+ array.collect {|list| list ? list.to_a : nil})
+ end
+ end
+
+ sub_test_case("#append") do
+ test("backward compatibility") do
+ @builder.append
+ @builder.value_builder.append(true)
+ @builder.value_builder.append(false)
+ @builder.append
+ @builder.value_builder.append(true)
+ array = @builder.finish
+
+ assert_equal([
+ [true, false],
+ [true],
+ ],
+ array.collect(&:to_a))
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-list-array.rb b/src/arrow/ruby/red-arrow/test/test-list-array.rb
new file mode 100644
index 000000000..c1f762492
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-list-array.rb
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class ListArrayTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ test("build") do
+ data_type = Arrow::ListDataType.new(name: "visible", type: :boolean)
+ values = [
+ [true, false],
+ nil,
+ [false, true, false],
+ ]
+ array = Arrow::ListArray.new(data_type, values)
+ assert_equal(values,
+ array.collect {|value| value ? value.to_a : nil})
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-list-data-type.rb b/src/arrow/ruby/red-arrow/test/test-list-data-type.rb
new file mode 100644
index 000000000..ada46394d
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-list-data-type.rb
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class ListDataTypeTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ test("Arrow::Field") do
+ field = Arrow::Field.new(:tag, :string)
+ assert_equal("list<tag: string>",
+ Arrow::ListDataType.new(field).to_s)
+ end
+
+ test("name: String") do
+ assert_equal("list<tag: string>",
+ Arrow::ListDataType.new(name: "tag", type: :string).to_s)
+ end
+
+ test("field: Arrow::Field") do
+ field = Arrow::Field.new(:tag, :string)
+ assert_equal("list<tag: string>",
+ Arrow::ListDataType.new(field: field).to_s)
+ end
+
+ test("field: Hash") do
+ field_description = {name: "tag", type: :string}
+ assert_equal("list<tag: string>",
+ Arrow::ListDataType.new(field: field_description).to_s)
+ end
+
+ test("Arrow::DataType") do
+ data_type = Arrow::BooleanDataType.new
+ assert_equal("list<item: bool>",
+ Arrow::ListDataType.new(data_type).to_s)
+ end
+
+ test("String") do
+ assert_equal("list<item: bool>",
+ Arrow::ListDataType.new("boolean").to_s)
+ end
+
+ test("Symbol") do
+ assert_equal("list<item: bool>",
+ Arrow::ListDataType.new(:boolean).to_s)
+ end
+
+ test("[data type name, additional information]") do
+ assert_equal("list<item: time32[ms]>",
+ Arrow::ListDataType.new([:time32, :milli]).to_s)
+ end
+
+ test("type: Symbol") do
+ assert_equal("list<item: bool>",
+ Arrow::ListDataType.new(type: :boolean).to_s)
+ end
+ end
+end
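+
+# To recap the constructor forms covered above, Arrow::ListDataType.new
+# accepts: an Arrow::Field; name:/type: keywords; field: with an
+# Arrow::Field or a Hash description; a bare Arrow::DataType, String,
+# or Symbol (the element field is then named "item"); a
+# [type name, detail] pair such as [:time32, :milli]; or type: alone.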
diff --git a/src/arrow/ruby/red-arrow/test/test-map-array-builder.rb b/src/arrow/ruby/red-arrow/test/test-map-array-builder.rb
new file mode 100644
index 000000000..80e571448
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-map-array-builder.rb
@@ -0,0 +1,110 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class MapArrayBuilderTest < Test::Unit::TestCase
+ def setup
+ key_type = Arrow::StringDataType.new
+ item_type = Arrow::Int16DataType.new
+ data_type = Arrow::MapDataType.new(key_type, item_type)
+ @builder = Arrow::MapArrayBuilder.new(data_type)
+ end
+
+ sub_test_case("#append_value") do
+ test("nil") do
+ @builder.append_value(nil)
+ array = @builder.finish
+ assert_equal([nil], array.collect {|value| value})
+ end
+
+ test("Hash") do
+ @builder.append_value({"a" => 0, "b" => 1})
+ @builder.append_value({"c" => 0, "d" => 1})
+ array = @builder.finish
+ assert_equal([
+ {"a" => 0, "b" => 1},
+ {"c" => 0, "d" => 1}
+ ],
+ array.collect {|value| value})
+ end
+
+ test("#each") do
+ @builder.append_value([["a", 0], ["b", 1]])
+ @builder.append_value([["c", 0], ["d", 1]])
+ array = @builder.finish
+ assert_equal([
+ {"a" => 0, "b" => 1},
+ {"c" => 0, "d" => 1}
+ ],
+ array.collect {|value| value})
+ end
+ end
+
+ sub_test_case("#append_values") do
+ test("[nil]") do
+ @builder.append_values([nil])
+ array = @builder.finish
+ assert_equal([nil], array.collect {|value| value})
+ end
+
+ test("[Hash]") do
+ @builder.append_values([{"a" => 0, "b" => 1}, {"c" => 0, "d" => 1}])
+ array = @builder.finish
+ assert_equal([
+ {"a" => 0, "b" => 1},
+ {"c" => 0, "d" => 1}
+ ],
+ array.collect {|value| value})
+ end
+
+ test("[#each]") do
+ @builder.append_values([[["a", 0], ["b", 1]], [["c", 0], ["d", 1]]])
+ array = @builder.finish
+ assert_equal([
+ {"a" => 0, "b" => 1},
+ {"c" => 0, "d" => 1}
+ ],
+ array.collect {|value| value})
+ end
+
+ test("[nil, Hash, #each]") do
+ @builder.append_values([nil, {"a" => 0, "b" => 1}, [["c", 0], ["d", 1]]])
+ array = @builder.finish
+ assert_equal([
+ nil,
+ {"a" => 0, "b" => 1},
+ {"c" => 0, "d" => 1}
+ ],
+ array.collect {|value| value})
+ end
+
+ test("is_valids") do
+ @builder.append_values([
+ {"a" => 0, "b" => 1},
+ {"c" => 0, "d" => 1},
+ {"e" => 0, "f" => 1}
+ ],
+ [true, false, true])
+ array = @builder.finish
+ assert_equal([
+ {"a" => 0, "b" => 1},
+ nil,
+ {"e" => 0, "f" => 1}
+ ],
+ array.collect {|value| value})
+ end
+ end
+end
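+
+# Recap: MapArrayBuilder#append_value accepts nil, a Hash, or any
+# object whose #each yields [key, item] pairs; #append_values takes a
+# list of such values plus the optional is_valids boolean array shown
+# above.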
diff --git a/src/arrow/ruby/red-arrow/test/test-map-array.rb b/src/arrow/ruby/red-arrow/test/test-map-array.rb
new file mode 100644
index 000000000..9f4c1ff57
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-map-array.rb
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class MapArrayTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ test("build") do
+ key_type = Arrow::StringDataType.new
+ item_type = Arrow::Int16DataType.new
+ data_type = Arrow::MapDataType.new(key_type, item_type)
+ values = [
+ {"a" => 0, "b" => 1},
+ nil,
+ {"c" => 0, "d" => 1}
+ ]
+ array = Arrow::MapArray.new(data_type, values)
+ assert_equal(values, array.collect {|value| value})
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-map-data-type.rb b/src/arrow/ruby/red-arrow/test/test-map-data-type.rb
new file mode 100644
index 000000000..cdbbc2ed1
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-map-data-type.rb
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class MapDataTypeTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ def setup
+ @key = :int8
+ @item = :string
+ end
+
+ test("ordered arguments") do
+ assert_equal("map<int8, string>",
+ Arrow::MapDataType.new(@key, @item).to_s)
+ end
+
+ test("description") do
+ assert_equal("map<int8, string>",
+ Arrow::MapDataType.new(key: @key,
+ item: @item).to_s)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-memory-view.rb b/src/arrow/ruby/red-arrow/test/test-memory-view.rb
new file mode 100644
index 000000000..0b9c98c40
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-memory-view.rb
@@ -0,0 +1,434 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class MemoryViewTest < Test::Unit::TestCase
+ def setup
+ unless Fiddle.const_defined?(:MemoryView)
+ omit("Fiddle::MemoryView is needed")
+ end
+ unless Fiddle::MemoryView.respond_to?(:export)
+ omit("Fiddle::MemoryView.export is needed")
+ end
+ end
+
+  # Native-order and explicitly little-endian packings agree only on
+  # little-endian platforms.
+  def little_endian?
+    [1].pack("s") == [1].pack("s<")
+  end
+
+ test("BooleanArray") do
+ array = Arrow::BooleanArray.new([true] * 9)
+ Fiddle::MemoryView.export(array) do |memory_view|
+ if little_endian?
+ template = "b"
+ else
+ template = "B"
+ end
+ assert_equal([
+ "#{template}8",
+ 1,
+ 2,
+ [(("1" * 9) + ("0" * 7))].pack("#{template}*"),
+ ],
+ [
+ memory_view.format,
+ memory_view.item_size,
+ memory_view.byte_size,
+ memory_view.to_s,
+ ])
+ end
+ end
+
+ test("Int8Array") do
+ values = [-(2 ** 7), 0, (2 ** 7) - 1]
+ array = Arrow::Int8Array.new(values)
+ Fiddle::MemoryView.export(array) do |memory_view|
+ assert_equal([
+ "c",
+ 1,
+ values.size,
+ values.pack("c*"),
+ ],
+ [
+ memory_view.format,
+ memory_view.item_size,
+ memory_view.byte_size,
+ memory_view.to_s,
+ ])
+ end
+ end
+
+ test("Int16Array") do
+ values = [-(2 ** 15), 0, (2 ** 15) - 1]
+ array = Arrow::Int16Array.new(values)
+ Fiddle::MemoryView.export(array) do |memory_view|
+ assert_equal([
+ "s",
+ 2,
+ 2 * values.size,
+ values.pack("s*"),
+ ],
+ [
+ memory_view.format,
+ memory_view.item_size,
+ memory_view.byte_size,
+ memory_view.to_s,
+ ])
+ end
+ end
+
+ test("Int32Array") do
+ values = [-(2 ** 31), 0, (2 ** 31) - 1]
+ array = Arrow::Int32Array.new(values)
+ Fiddle::MemoryView.export(array) do |memory_view|
+ assert_equal([
+ "l",
+ 4,
+ 4 * values.size,
+ values.pack("l*"),
+ ],
+ [
+ memory_view.format,
+ memory_view.item_size,
+ memory_view.byte_size,
+ memory_view.to_s,
+ ])
+ end
+ end
+
+ test("Int64Array") do
+ values = [-(2 ** 63), 0, (2 ** 63) - 1]
+ array = Arrow::Int64Array.new(values)
+ Fiddle::MemoryView.export(array) do |memory_view|
+ assert_equal([
+ "q",
+ 8,
+ 8 * values.size,
+ values.pack("q*"),
+ ],
+ [
+ memory_view.format,
+ memory_view.item_size,
+ memory_view.byte_size,
+ memory_view.to_s,
+ ])
+ end
+ end
+
+ test("UInt8Array") do
+ values = [0, (2 ** 8) - 1]
+ array = Arrow::UInt8Array.new(values)
+ Fiddle::MemoryView.export(array) do |memory_view|
+ assert_equal([
+ "C",
+ 1,
+ values.size,
+ values.pack("C*"),
+ ],
+ [
+ memory_view.format,
+ memory_view.item_size,
+ memory_view.byte_size,
+ memory_view.to_s,
+ ])
+ end
+ end
+
+ test("UInt16Array") do
+ values = [0, (2 ** 16) - 1]
+ array = Arrow::UInt16Array.new(values)
+ Fiddle::MemoryView.export(array) do |memory_view|
+ assert_equal([
+ "S",
+ 2,
+ 2 * values.size,
+ values.pack("S*"),
+ ],
+ [
+ memory_view.format,
+ memory_view.item_size,
+ memory_view.byte_size,
+ memory_view.to_s,
+ ])
+ end
+ end
+
+ test("UInt32Array") do
+ values = [0, (2 ** 32) - 1]
+ array = Arrow::UInt32Array.new(values)
+ Fiddle::MemoryView.export(array) do |memory_view|
+ assert_equal([
+ "L",
+ 4,
+ 4 * values.size,
+ values.pack("L*"),
+ ],
+ [
+ memory_view.format,
+ memory_view.item_size,
+ memory_view.byte_size,
+ memory_view.to_s,
+ ])
+ end
+ end
+
+ test("UInt64Array") do
+ values = [(2 ** 64) - 1]
+ array = Arrow::UInt64Array.new(values)
+ Fiddle::MemoryView.export(array) do |memory_view|
+ assert_equal([
+ "Q",
+ 8,
+ 8 * values.size,
+ values.pack("Q*"),
+ ],
+ [
+ memory_view.format,
+ memory_view.item_size,
+ memory_view.byte_size,
+ memory_view.to_s,
+ ])
+ end
+ end
+
+ test("FloatArray") do
+ values = [-1.1, 0.0, 1.1]
+ array = Arrow::FloatArray.new(values)
+ Fiddle::MemoryView.export(array) do |memory_view|
+ assert_equal([
+ "f",
+ 4,
+ 4 * values.size,
+ values.pack("f*"),
+ ],
+ [
+ memory_view.format,
+ memory_view.item_size,
+ memory_view.byte_size,
+ memory_view.to_s,
+ ])
+ end
+ end
+
+ test("DoubleArray") do
+ values = [-1.1, 0.0, 1.1]
+ array = Arrow::DoubleArray.new(values)
+ Fiddle::MemoryView.export(array) do |memory_view|
+ assert_equal([
+ "d",
+ 8,
+ 8 * values.size,
+ values.pack("d*"),
+ ],
+ [
+ memory_view.format,
+ memory_view.item_size,
+ memory_view.byte_size,
+ memory_view.to_s,
+ ])
+ end
+ end
+
+ test("FixedSizeBinaryArray") do
+ values = ["\x01\x02", "\x03\x04", "\x05\x06"]
+ data_type = Arrow::FixedSizeBinaryDataType.new(2)
+ array = Arrow::FixedSizeBinaryArray.new(data_type, values)
+ Fiddle::MemoryView.export(array) do |memory_view|
+ assert_equal([
+ "C2",
+ 2,
+ 2 * values.size,
+ values.join("").b,
+ ],
+ [
+ memory_view.format,
+ memory_view.item_size,
+ memory_view.byte_size,
+ memory_view.to_s,
+ ])
+ end
+ end
+
+ test("Date32Array") do
+ n_days_since_epoch = 17406 # 2017-08-28
+ values = [n_days_since_epoch]
+ array = Arrow::Date32Array.new(values)
+ Fiddle::MemoryView.export(array) do |memory_view|
+ assert_equal([
+ "l",
+ 4,
+ 4 * values.size,
+ values.pack("l*"),
+ ],
+ [
+ memory_view.format,
+ memory_view.item_size,
+ memory_view.byte_size,
+ memory_view.to_s,
+ ])
+ end
+ end
+
+ test("Date64Array") do
+ n_msecs_since_epoch = 1503878400000 # 2017-08-28T00:00:00Z
+ values = [n_msecs_since_epoch]
+ array = Arrow::Date64Array.new(values)
+ Fiddle::MemoryView.export(array) do |memory_view|
+ assert_equal([
+ "q",
+ 8,
+ 8 * values.size,
+ values.pack("q*"),
+ ],
+ [
+ memory_view.format,
+ memory_view.item_size,
+ memory_view.byte_size,
+ memory_view.to_s,
+ ])
+ end
+ end
+
+ test("Time32Array") do
+ values = [1, 2, 3]
+ array = Arrow::Time32Array.new(:milli, values)
+ Fiddle::MemoryView.export(array) do |memory_view|
+ assert_equal([
+ "l",
+ 4,
+ 4 * values.size,
+ values.pack("l*"),
+ ],
+ [
+ memory_view.format,
+ memory_view.item_size,
+ memory_view.byte_size,
+ memory_view.to_s,
+ ])
+ end
+ end
+
+ test("Time64Array") do
+ values = [1, 2, 3]
+ array = Arrow::Time64Array.new(:nano, values)
+ Fiddle::MemoryView.export(array) do |memory_view|
+ assert_equal([
+ "q",
+ 8,
+ 8 * values.size,
+ values.pack("q*"),
+ ],
+ [
+ memory_view.format,
+ memory_view.item_size,
+ memory_view.byte_size,
+ memory_view.to_s,
+ ])
+ end
+ end
+
+ test("TimestampArray") do
+ values = [1, 2, 3]
+ array = Arrow::TimestampArray.new(:micro, values)
+ Fiddle::MemoryView.export(array) do |memory_view|
+ assert_equal([
+ "q",
+ 8,
+ 8 * values.size,
+ values.pack("q*"),
+ ],
+ [
+ memory_view.format,
+ memory_view.item_size,
+ memory_view.byte_size,
+ memory_view.to_s,
+ ])
+ end
+ end
+
+ test("Decimal128Array") do
+ values = [
+ Arrow::Decimal128.new("10.1"),
+ Arrow::Decimal128.new("11.1"),
+ Arrow::Decimal128.new("10.2"),
+ ]
+ data_type = Arrow::Decimal128DataType.new(3, 1)
+ array = Arrow::Decimal128Array.new(data_type, values)
+ Fiddle::MemoryView.export(array) do |memory_view|
+ assert_equal([
+ "q2",
+ 16,
+ 16 * values.size,
+ values.collect {|value| value.to_bytes.to_s}.join(""),
+ ],
+ [
+ memory_view.format,
+ memory_view.item_size,
+ memory_view.byte_size,
+ memory_view.to_s,
+ ])
+ end
+ end
+
+ test("Decimal256Array") do
+ values = [
+ Arrow::Decimal256.new("10.1"),
+ Arrow::Decimal256.new("11.1"),
+ Arrow::Decimal256.new("10.2"),
+ ]
+ data_type = Arrow::Decimal256DataType.new(3, 1)
+ array = Arrow::Decimal256Array.new(data_type, values)
+ Fiddle::MemoryView.export(array) do |memory_view|
+ assert_equal([
+ "q4",
+ 32,
+ 32 * values.size,
+ values.collect {|value| value.to_bytes.to_s}.join(""),
+ ],
+ [
+ memory_view.format,
+ memory_view.item_size,
+ memory_view.byte_size,
+ memory_view.to_s,
+ ])
+ end
+ end
+
+ test("Buffer") do
+ values = [0, nil, nil] * 3
+ array = Arrow::Int8Array.new(values)
+ buffer = array.null_bitmap
+ Fiddle::MemoryView.export(buffer) do |memory_view|
+ if little_endian?
+ template = "b"
+ else
+ template = "B"
+ end
+ assert_equal([
+ "#{template}8",
+ 1,
+ 2,
+ ["100" * 3].pack("#{template}*"),
+ ],
+ [
+ memory_view.format,
+ memory_view.item_size,
+ memory_view.byte_size,
+ memory_view.to_s,
+ ])
+ end
+ end
+end
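+
+# The export pattern used throughout this file, as a sketch: Arrow
+# arrays and buffers can be exposed to Ruby's Fiddle as memory views;
+# format follows String#unpack templates, item_size is the width of one
+# element in bytes, and byte_size covers the whole exported buffer:
+#
+#   array = Arrow::Int32Array.new([1, 2, 3])
+#   Fiddle::MemoryView.export(array) do |view|
+#     view.format    # => "l"
+#     view.item_size # => 4
+#     view.byte_size # => 12
+#   end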
diff --git a/src/arrow/ruby/red-arrow/test/test-null-array.rb b/src/arrow/ruby/red-arrow/test/test-null-array.rb
new file mode 100644
index 000000000..c5d061636
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-null-array.rb
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class NullArrayTest < Test::Unit::TestCase
+ test("#[]") do
+ array = Arrow::NullArray.new(1)
+ assert_nil(array[0])
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-orc.rb b/src/arrow/ruby/red-arrow/test/test-orc.rb
new file mode 100644
index 000000000..b882da0a1
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-orc.rb
@@ -0,0 +1,173 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class ORCTest < Test::Unit::TestCase
+ include Helper::Fixture
+
+ def setup
+ omit("Require Apache Arrow ORC") unless Arrow.const_defined?(:ORCFileReader)
+ @orc_path = fixture_path("TestOrcFile.test1.orc")
+ end
+
+ def pp_values(values)
+ "[\n " + values.collect(&:inspect).join(",\n ") + "\n]"
+ end
+
+ sub_test_case("load") do
+ test("default") do
+ table = Arrow::Table.load(@orc_path)
+ dump = table.columns.collect do |column|
+ [
+ column.field.to_s,
+ column.data.chunks.collect(&:to_s),
+ ]
+ end
+ assert_equal([
+ ["boolean1: bool", [pp_values([false, true])]],
+ ["byte1: int8", [pp_values([1, 100])]],
+ ["short1: int16", [pp_values([1024, 2048])]],
+ ["int1: int32", [pp_values([65536, 65536])]],
+ [
+ "long1: int64",
+ [pp_values([9223372036854775807, 9223372036854775807])],
+ ],
+ ["float1: float", [pp_values([1, 2])]],
+ ["double1: double", [pp_values([-15, -5])]],
+ ["bytes1: binary", ["[\n 0001020304,\n \n]"]],
+ ["string1: string", [pp_values(["hi", "bye"])]],
+ [
+ "middle: " +
+ "struct<list: " +
+ "list<item: struct<int1: int32, string1: string>>>",
+ [
+ <<-STRUCT.chomp
+-- is_valid: all not null
+-- child 0 type: list<item: struct<int1: int32, string1: string>>
+ [
+ -- is_valid: all not null
+ -- child 0 type: int32
+ [
+ 1,
+ 2
+ ]
+ -- child 1 type: string
+ [
+ "bye",
+ "sigh"
+ ],
+ -- is_valid: all not null
+ -- child 0 type: int32
+ [
+ 1,
+ 2
+ ]
+ -- child 1 type: string
+ [
+ "bye",
+ "sigh"
+ ]
+ ]
+ STRUCT
+ ]
+ ],
+ [
+ "list: list<item: struct<int1: int32, string1: string>>",
+ [
+ <<-LIST.chomp
+[
+ -- is_valid: all not null
+ -- child 0 type: int32
+ [
+ 3,
+ 4
+ ]
+ -- child 1 type: string
+ [
+ "good",
+ "bad"
+ ],
+ -- is_valid: all not null
+ -- child 0 type: int32
+ [
+ 100000000,
+ -100000,
+ 1234
+ ]
+ -- child 1 type: string
+ [
+ "cat",
+ "in",
+ "hat"
+ ]
+]
+ LIST
+ ]
+ ],
+ [
+ "map: map<string, struct<int1: int32, string1: string>>",
+ [
+ <<-MAP.chomp
+[
+ keys:
+ []
+ values:
+ -- is_valid: all not null
+ -- child 0 type: int32
+ []
+ -- child 1 type: string
+ [],
+ keys:
+ [
+ "chani",
+ "mauddib"
+ ]
+ values:
+ -- is_valid: all not null
+ -- child 0 type: int32
+ [
+ 5,
+ 1
+ ]
+ -- child 1 type: string
+ [
+ "chani",
+ "mauddib"
+ ]
+]
+ MAP
+ ],
+ ],
+ ],
+ dump)
+ end
+
+ test(":field_indexes") do
+ table = Arrow::Table.load(@orc_path, field_indexes: [1, 3])
+ dump = table.columns.collect do |column|
+ [
+ column.field.to_s,
+ column.data.chunks.collect(&:to_s),
+ ]
+ end
+ assert_equal([
+ ["boolean1: bool", [pp_values([false, true])]],
+ ["short1: int16", [pp_values([1024, 2048])]],
+ ],
+ dump)
+ end
+ end
+end
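+
+# In brief: Arrow::Table.load reads the ORC file with every column by
+# default, and field_indexes selects a subset of columns by position
+# (a sketch over the fixture used above):
+#
+#   Arrow::Table.load(orc_path)                        # all columns
+#   Arrow::Table.load(orc_path, field_indexes: [1, 3]) # boolean1, short1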
diff --git a/src/arrow/ruby/red-arrow/test/test-record-batch-builder.rb b/src/arrow/ruby/red-arrow/test/test-record-batch-builder.rb
new file mode 100644
index 000000000..988e02043
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-record-batch-builder.rb
@@ -0,0 +1,125 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class RecordBatchBuilderTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ test("Schema") do
+ schema = Arrow::Schema.new(visible: :boolean,
+ count: :uint32)
+ builder = Arrow::RecordBatchBuilder.new(schema)
+ assert_equal(schema,
+ builder.schema)
+ end
+
+ test("Hash") do
+ builder = Arrow::RecordBatchBuilder.new(visible: :boolean,
+ count: :uint32)
+ assert_equal(Arrow::Schema.new(visible: :boolean,
+ count: :uint32),
+ builder.schema)
+ end
+ end
+
+ sub_test_case("instance methods") do
+ def setup
+ @schema = Arrow::Schema.new(visible: :boolean,
+ count: :uint32)
+ @builder = Arrow::RecordBatchBuilder.new(@schema)
+ end
+
+ sub_test_case("#[]") do
+ test("String") do
+ assert_equal(Arrow::BooleanDataType.new,
+ @builder["visible"].value_data_type)
+ end
+
+ test("Symbol") do
+ assert_equal(Arrow::BooleanDataType.new,
+ @builder[:visible].value_data_type)
+ end
+
+ test("Integer") do
+ assert_equal(Arrow::UInt32DataType.new,
+ @builder[1].value_data_type)
+ end
+ end
+
+ test("#append") do
+ records = [
+ {visible: true, count: 1},
+ ]
+ columns = {
+ visible: [false],
+ count: [2],
+ }
+ arrays = [
+ Arrow::BooleanArray.new([true, false]),
+ Arrow::UInt32Array.new([1, 2]),
+ ]
+ @builder.append(records, columns)
+ assert_equal(Arrow::RecordBatch.new(@schema,
+ arrays[0].length,
+ arrays),
+ @builder.flush)
+ end
+
+ test("#append_records") do
+ records = [
+ {visible: true, count: 1},
+ {visible: true, count: 2, garbage: "garbage"},
+ {visible: true},
+ [false, 4],
+ nil,
+ [true],
+ ]
+ arrays = [
+ Arrow::BooleanArray.new([true, true, true, false, nil, true]),
+ Arrow::UInt32Array.new([1, 2, nil, 4, nil, nil]),
+ ]
+ @builder.append_records(records)
+ assert_equal(Arrow::RecordBatch.new(@schema,
+ arrays[0].length,
+ arrays),
+ @builder.flush)
+ end
+
+ test("#append_columns") do
+ columns = {
+ visible: [true, true, true, false, nil, true],
+ count: [1, 2, nil, 4, nil, nil],
+ }
+ arrays = [
+ Arrow::BooleanArray.new(columns[:visible]),
+ Arrow::UInt32Array.new(columns[:count]),
+ ]
+ @builder.append_columns(columns)
+ assert_equal(Arrow::RecordBatch.new(@schema,
+ arrays[0].length,
+ arrays),
+ @builder.flush)
+ end
+
+ test("#column_builders") do
+ column_builders = [
+ @builder.get_column_builder(0),
+ @builder.get_column_builder(1),
+ ]
+ assert_equal(column_builders,
+ @builder.column_builders)
+ end
+ end
+end
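+
+# A minimal sketch of the builder flow shown above: construct a builder
+# from a schema (or the Hash shorthand), feed it records (Hashes or
+# Arrays), columns (name => values), or both via #append, then
+# materialize the accumulated rows with #flush:
+#
+#   builder = Arrow::RecordBatchBuilder.new(visible: :boolean, count: :uint32)
+#   builder.append_records([{visible: true, count: 1}, [false, 2]])
+#   record_batch = builder.flush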
diff --git a/src/arrow/ruby/red-arrow/test/test-record-batch-file-reader.rb b/src/arrow/ruby/red-arrow/test/test-record-batch-file-reader.rb
new file mode 100644
index 000000000..57b02abf9
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-record-batch-file-reader.rb
@@ -0,0 +1,115 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class RecordBatchFileReaderTest < Test::Unit::TestCase
+ test("write/read") do
+ fields = [
+ Arrow::Field.new("uint8", :uint8),
+ Arrow::Field.new("uint16", :uint16),
+ Arrow::Field.new("uint32", :uint32),
+ Arrow::Field.new("uint64", :uint64),
+ Arrow::Field.new("int8", :int8),
+ Arrow::Field.new("int16", :int16),
+ Arrow::Field.new("int32", :int32),
+ Arrow::Field.new("int64", :int64),
+ Arrow::Field.new("float", :float),
+ Arrow::Field.new("double", :double),
+ ]
+ schema = Arrow::Schema.new(fields)
+
+ tempfile = Tempfile.new(["batch", ".arrow"])
+ Arrow::FileOutputStream.open(tempfile.path, false) do |output|
+ Arrow::RecordBatchFileWriter.open(output, schema) do |writer|
+ uints = [1, 2, 4, 8]
+ ints = [1, -2, 4, -8]
+ floats = [1.1, -2.2, 4.4, -8.8]
+ columns = [
+ Arrow::UInt8Array.new(uints),
+ Arrow::UInt16Array.new(uints),
+ Arrow::UInt32Array.new(uints),
+ Arrow::UInt64Array.new(uints),
+ Arrow::Int8Array.new(ints),
+ Arrow::Int16Array.new(ints),
+ Arrow::Int32Array.new(ints),
+ Arrow::Int64Array.new(ints),
+ Arrow::FloatArray.new(floats),
+ Arrow::DoubleArray.new(floats),
+ ]
+
+ record_batch = Arrow::RecordBatch.new(schema, 4, columns)
+ writer.write_record_batch(record_batch)
+ end
+ end
+
+ Arrow::MemoryMappedInputStream.open(tempfile.path) do |input|
+ reader = Arrow::RecordBatchFileReader.new(input)
+ reader.each do |record_batch|
+ assert_equal([
+ {
+ "uint8" => 1,
+ "uint16" => 1,
+ "uint32" => 1,
+ "uint64" => 1,
+ "int8" => 1,
+ "int16" => 1,
+ "int32" => 1,
+ "int64" => 1,
+ "float" => 1.100000023841858,
+ "double" => 1.1,
+ },
+ {
+ "uint8" => 2,
+ "uint16" => 2,
+ "uint32" => 2,
+ "uint64" => 2,
+ "int8" => -2,
+ "int16" => -2,
+ "int32" => -2,
+ "int64" => -2,
+ "float" => -2.200000047683716,
+ "double" => -2.2,
+ },
+ {
+ "uint8" => 4,
+ "uint16" => 4,
+ "uint32" => 4,
+ "uint64" => 4,
+ "int8" => 4,
+ "int16" => 4,
+ "int32" => 4,
+ "int64" => 4,
+ "float" => 4.400000095367432,
+ "double" => 4.4,
+ },
+ {
+ "uint8" => 8,
+ "uint16" => 8,
+ "uint32" => 8,
+ "uint64" => 8,
+ "int8" => -8,
+ "int16" => -8,
+ "int32" => -8,
+ "int64" => -8,
+ "float" => -8.800000190734863,
+ "double" => -8.8,
+ },
+ ],
+ record_batch.collect(&:to_h))
+ end
+ end
+ end
+end
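+
+# The round-trip pattern exercised above, in brief: write batches with
+# Arrow::RecordBatchFileWriter over a FileOutputStream, read them back
+# with Arrow::RecordBatchFileReader, here via a MemoryMappedInputStream
+# (a sketch; schema and record_batch as built in the test, and
+# "batch.arrow" a placeholder path):
+#
+#   Arrow::FileOutputStream.open("batch.arrow", false) do |output|
+#     Arrow::RecordBatchFileWriter.open(output, schema) do |writer|
+#       writer.write_record_batch(record_batch)
+#     end
+#   end
+#   Arrow::MemoryMappedInputStream.open("batch.arrow") do |input|
+#     Arrow::RecordBatchFileReader.new(input).each do |batch|
+#       # process each batch
+#     end
+#   end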
diff --git a/src/arrow/ruby/red-arrow/test/test-record-batch-iterator.rb b/src/arrow/ruby/red-arrow/test/test-record-batch-iterator.rb
new file mode 100644
index 000000000..88f3ecaac
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-record-batch-iterator.rb
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class RecordBatchIteratorTest < Test::Unit::TestCase
+ def setup
+ @schema = Arrow::Schema.new(visible: :boolean,
+ count: :uint32)
+ @record_batches = [
+ Arrow::RecordBatch.new(@schema,
+ visible: [true],
+ count: [1]),
+ Arrow::RecordBatch.new(@schema,
+ visible: [false, nil],
+ count: [nil, 3]),
+ ]
+ @iterator = Arrow::RecordBatchIterator.new(@record_batches)
+ end
+
+ def test_to_a
+ assert_equal(@record_batches,
+ @iterator.to_a)
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-record-batch-reader.rb b/src/arrow/ruby/red-arrow/test/test-record-batch-reader.rb
new file mode 100644
index 000000000..1becdf5b6
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-record-batch-reader.rb
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestRecordBatchReader < Test::Unit::TestCase
+ sub_test_case(".try_convert") do
+ test("Arrow::RecordBatch") do
+ record_batch =
+ Arrow::RecordBatch.new("count" => [1, 2, 3],
+ "private" => [true, false, true])
+ reader = Arrow::RecordBatchReader.try_convert(record_batch)
+ assert_equal(record_batch,
+ reader.read_next)
+ end
+
+ test("[Arrow::RecordBatch]") do
+ record_batch =
+ Arrow::RecordBatch.new("count" => [1, 2, 3],
+ "private" => [true, false, true])
+ reader = Arrow::RecordBatchReader.try_convert([record_batch])
+ assert_equal(record_batch,
+ reader.read_next)
+ end
+
+ test("Arrow::Table") do
+ table = Arrow::Table.new("count" => [1, 2, 3],
+ "private" => [true, false, true])
+ reader = Arrow::RecordBatchReader.try_convert(table)
+ assert_equal(table,
+ reader.read_all)
+ end
+ end
+end
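+
+# try_convert is the usual entry point for APIs that accept anything
+# record-batch-like: a record batch, an Array of record batches, or a
+# table, as covered above. A sketch of such an API (process_batches is
+# a hypothetical helper; the nil guard assumes the standard Ruby
+# try_convert convention of returning nil on failure):
+#
+#   def process_batches(source)
+#     reader = Arrow::RecordBatchReader.try_convert(source)
+#     raise ArgumentError, "not record-batch-like" if reader.nil?
+#     while (batch = reader.read_next)
+#       yield batch
+#     end
+#   end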
diff --git a/src/arrow/ruby/red-arrow/test/test-record-batch.rb b/src/arrow/ruby/red-arrow/test/test-record-batch.rb
new file mode 100644
index 000000000..e94c26f2e
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-record-batch.rb
@@ -0,0 +1,182 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class RecordBatchTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ def setup
+ @schema = Arrow::Schema.new(visible: :boolean,
+ count: :uint32)
+ end
+
+ test("[raw_table]") do
+ raw_table = {
+ visible: [true, nil, false],
+ count: [1, nil, 3],
+ }
+ record_batch = Arrow::RecordBatch.new(raw_table)
+ assert_equal([
+ {"visible" => true, "count" => 1},
+ {"visible" => nil, "count" => nil},
+ {"visible" => false, "count" => 3},
+ ],
+ record_batch.each_record.collect(&:to_h))
+ end
+
+ test("[Schema, records]") do
+ records = [
+ {visible: true, count: 1},
+ nil,
+ [false, 3],
+ ]
+ record_batch = Arrow::RecordBatch.new(@schema, records)
+ assert_equal([
+ {"visible" => true, "count" => 1},
+ {"visible" => nil, "count" => nil},
+ {"visible" => false, "count" => 3},
+ ],
+ record_batch.each_record.collect(&:to_h))
+ end
+
+ test("[Schema, columns]") do
+ columns = {
+ visible: [true, nil, false],
+ count: [1, 2, nil],
+ }
+ record_batch = Arrow::RecordBatch.new(@schema, columns)
+ assert_equal([
+ {"visible" => true, "count" => 1},
+ {"visible" => nil, "count" => 2},
+ {"visible" => false, "count" => nil},
+ ],
+ record_batch.each_record.collect(&:to_h))
+ end
+
+ test("[Schema, n_rows, columns]") do
+ columns = [
+ Arrow::BooleanArray.new([true, nil, false]),
+ Arrow::UInt32Array.new([1, 2, nil]),
+ ]
+ n_rows = columns[0].length
+ record_batch = Arrow::RecordBatch.new(@schema, n_rows, columns)
+ assert_equal([
+ {"visible" => true, "count" => 1},
+ {"visible" => nil, "count" => 2},
+ {"visible" => false, "count" => nil},
+ ],
+ record_batch.each_record.collect(&:to_h))
+ end
+ end
+
+ sub_test_case("instance methods") do
+ def setup
+ @schema = Arrow::Schema.new(count: :uint32)
+ @counts = Arrow::UInt32Array.new([1, 2, 4, 8])
+ @record_batch = Arrow::RecordBatch.new(@schema, @counts.length, [@counts])
+ end
+
+ sub_test_case("#each") do
+ test("default") do
+ records = []
+ @record_batch.each do |record|
+ records << [record, record.index]
+ end
+ assert_equal([
+ [0, 0],
+ [1, 1],
+ [2, 2],
+ [3, 3],
+ ],
+ records.collect {|record, i| [record.index, i]})
+ end
+
+ test("reuse_record: true") do
+ records = []
+ @record_batch.each(reuse_record: true) do |record|
+ records << [record, record.index]
+ end
+ assert_equal([
+ [3, 0],
+ [3, 1],
+ [3, 2],
+ [3, 3],
+ ],
+ records.collect {|record, i| [record.index, i]})
+ end
+ end
+
+ test("#to_table") do
+ assert_equal(Arrow::Table.new(@schema, [@counts]),
+ @record_batch.to_table)
+ end
+
+ sub_test_case("#==") do
+ test("Arrow::RecordBatch") do
+ assert do
+ @record_batch == @record_batch
+ end
+ end
+
+ test("not Arrow::RecordBatch") do
+ assert do
+ not (@record_batch == 29)
+ end
+ end
+ end
+
+ sub_test_case("#[]") do
+ def setup
+ @record_batch = Arrow::RecordBatch.new(a: [true],
+ b: [true],
+ c: [true],
+ d: [true],
+ e: [true],
+ f: [true],
+ g: [true])
+ end
+
+ test("[String]") do
+ assert_equal(Arrow::Column.new(@record_batch, 0),
+ @record_batch["a"])
+ end
+
+ test("[Symbol]") do
+ assert_equal(Arrow::Column.new(@record_batch, 1),
+ @record_batch[:b])
+ end
+
+ test("[Integer]") do
+ assert_equal(Arrow::Column.new(@record_batch, 6),
+ @record_batch[-1])
+ end
+
+ test("[Range]") do
+ assert_equal(Arrow::RecordBatch.new(d: [true],
+ e: [true]),
+ @record_batch[3..4])
+ end
+
+ test("[[Symbol, String, Integer, Range]]") do
+ assert_equal(Arrow::RecordBatch.new(c: [true],
+ a: [true],
+ g: [true],
+ d: [true],
+ e: [true]),
+ @record_batch[[:c, "a", -1, 3..4]])
+ end
+ end
+ end
+end
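+
+# Recap of the indexing forms above: RecordBatch#[] accepts a String or
+# Symbol column name, an Integer position (negative counts from the
+# end), a Range of positions, or an Array mixing all of these, which
+# selects columns in the order given.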
diff --git a/src/arrow/ruby/red-arrow/test/test-rolling-window.rb b/src/arrow/ruby/red-arrow/test/test-rolling-window.rb
new file mode 100644
index 000000000..4158ad162
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-rolling-window.rb
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class RollingWindowTest < Test::Unit::TestCase
+ include Helper::Fixture
+
+ def setup
+ raw_table = {
+ :number => Arrow::Int32Array.new([1, -2, nil, 4, 6, 3]),
+ }
+ @table = Arrow::Table.new(raw_table)
+ end
+
+ test("#lag") do
+ assert_equal(<<-ARRAY.chomp, @table.window.lag(:number).to_s)
+[
+ null,
+ -3,
+ null,
+ null,
+ 2,
+ -3
+]
+ ARRAY
+ end
+end
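+
+# Note: as the expectation above shows, Table#window.lag(:number)
+# yields, per row, the difference between the current value and the
+# previous one; the first row, and any row where either side is null,
+# comes out null.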
diff --git a/src/arrow/ruby/red-arrow/test/test-schema.rb b/src/arrow/ruby/red-arrow/test/test-schema.rb
new file mode 100644
index 000000000..20d73b272
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-schema.rb
@@ -0,0 +1,134 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class SchemaTest < Test::Unit::TestCase
+ include Helper::Omittable
+
+ def setup
+ @count_field = Arrow::Field.new("count", :uint32)
+ @visible_field = Arrow::Field.new("visible", :boolean)
+ end
+
+ sub_test_case(".new") do
+ test("[Arrow::Field]") do
+ fields = [
+ @count_field,
+ @visible_field,
+ ]
+ assert_equal("count: uint32\n" +
+ "visible: bool",
+ Arrow::Schema.new(fields).to_s)
+ end
+
+ test("[Arrow::Field, Hash]") do
+ fields = [
+ @count_field,
+ {name: "visible", type: :boolean},
+ ]
+ assert_equal("count: uint32\n" +
+ "visible: bool",
+ Arrow::Schema.new(fields).to_s)
+ end
+
+ test("{String, Symbol => Arrow::DataType}") do
+ fields = {
+ "count" => Arrow::UInt32DataType.new,
+ :visible => :boolean,
+ }
+ assert_equal("count: uint32\n" +
+ "visible: bool",
+ Arrow::Schema.new(fields).to_s)
+ end
+
+ test("{String, Symbol => Hash}") do
+ fields = {
+ "count" => {type: :uint32},
+ :tags => {
+ type: :list,
+ field: {
+ name: "tag",
+ type: :string,
+ },
+ },
+ }
+ assert_equal("count: uint32\n" +
+ "tags: list<tag: string>",
+ Arrow::Schema.new(fields).to_s)
+ end
+ end
+
+ sub_test_case("instance methods") do
+ def setup
+ super
+ @schema = Arrow::Schema.new([@count_field, @visible_field])
+ end
+
+ sub_test_case("#[]") do
+ test("[String]") do
+ assert_equal([@count_field, @visible_field],
+ [@schema["count"], @schema["visible"]])
+ end
+
+ test("[Symbol]") do
+ assert_equal([@count_field, @visible_field],
+ [@schema[:count], @schema[:visible]])
+ end
+
+ test("[Integer]") do
+ assert_equal([@count_field, @visible_field],
+ [@schema[0], @schema[1]])
+ end
+
+ test("[invalid]") do
+ invalid = []
+ message = "field name or index must be String, Symbol or Integer"
+ message << ": <#{invalid.inspect}>"
+ assert_raise(ArgumentError.new(message)) do
+ @schema[invalid]
+ end
+ end
+ end
+
+ sub_test_case("#==") do
+ test("Arrow::Schema") do
+ assert do
+ @schema == @schema
+ end
+ end
+
+ test("not Arrow::Schema") do
+ assert do
+ not (@schema == 29)
+ end
+ end
+ end
+
+ sub_test_case("#to_s") do
+ test("show_metadata") do
+ require_gi_bindings(3, 4, 2)
+
+ schema = @schema.with_metadata("key" => "value")
+ assert_equal(<<-SCHEMA.chomp, schema.to_s(show_metadata: true))
+count: uint32
+visible: bool
+-- metadata --
+key: value
+ SCHEMA
+ end
+ end
+ end
+end
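+
+# To recap the constructor forms above: Arrow::Schema.new accepts an
+# Array of Arrow::Field objects (or Hash field descriptions), or a Hash
+# mapping String/Symbol names to types (Arrow::DataType instances,
+# Symbols, or nested Hash descriptions such as the list field above).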
diff --git a/src/arrow/ruby/red-arrow/test/test-slicer.rb b/src/arrow/ruby/red-arrow/test/test-slicer.rb
new file mode 100644
index 000000000..420086690
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-slicer.rb
@@ -0,0 +1,487 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class SlicerTest < Test::Unit::TestCase
+ def setup
+ @count_field = Arrow::Field.new("count", :uint32)
+ @visible_field = Arrow::Field.new("visible", :boolean)
+ schema = Arrow::Schema.new([@count_field, @visible_field])
+ count_arrays = [
+ Arrow::UInt32Array.new([0, 1, 2]),
+ Arrow::UInt32Array.new([4, 8, 16]),
+ Arrow::UInt32Array.new([32, 64, nil]),
+ Arrow::UInt32Array.new([256]),
+ ]
+ visible_arrays = [
+ Arrow::BooleanArray.new([nil, true, false, nil]),
+ Arrow::BooleanArray.new([true]),
+ Arrow::BooleanArray.new([true, false]),
+ Arrow::BooleanArray.new([nil]),
+ Arrow::BooleanArray.new([nil]),
+ Arrow::BooleanArray.new([true]),
+ ]
+ @count_array = Arrow::ChunkedArray.new(count_arrays)
+ @visible_array = Arrow::ChunkedArray.new(visible_arrays)
+ @table = Arrow::Table.new(schema, [@count_array, @visible_array])
+ end
+
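+ # A bare column reference works as a condition: boolean columns
+ # select their true rows, non-boolean columns select non-zero values,
+ # and rows where the condition is null come through as null rows.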
+ sub_test_case("column") do
+ test("BooleanArray") do
+ sliced_table = @table.slice do |slicer|
+ slicer.visible
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 (null) (null)
+1 1 true
+2 (null) (null)
+3 8 true
+4 16 true
+5 (null) (null)
+6 (null) (null)
+7 256 true
+ TABLE
+ end
+
+ test("not BooleanArray") do
+ sliced_table = @table.slice do |slicer|
+ slicer.count
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 1 true
+1 2 false
+2 4 (null)
+3 8 true
+4 16 true
+5 32 false
+6 64 (null)
+7 (null) (null)
+8 256 true
+ TABLE
+ end
+ end
+
+ sub_test_case("!column") do
+ test("BooleanArray") do
+ sliced_table = @table.slice do |slicer|
+ !slicer.visible
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 (null) (null)
+1 2 false
+2 (null) (null)
+3 32 false
+4 (null) (null)
+5 (null) (null)
+ TABLE
+ end
+
+ test("not BooleanArray") do
+ sliced_table = @table.slice do |slicer|
+ !slicer.count
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 0 (null)
+1 (null) (null)
+ TABLE
+ end
+ end
+
+ test("column.null?") do
+ sliced_table = @table.slice do |slicer|
+ slicer.visible.null?
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 0 (null)
+1 4 (null)
+2 64 (null)
+3 (null) (null)
+ TABLE
+ end
+
+ test("column.valid?") do
+ sliced_table = @table.slice do |slicer|
+ slicer.visible.valid?
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 1 true
+1 2 false
+2 8 true
+3 16 true
+4 32 false
+5 256 true
+ TABLE
+ end
+
+ sub_test_case("column ==") do
+ test("nil") do
+ sliced_table = @table.slice do |slicer|
+ slicer.visible == nil
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 0 (null)
+1 4 (null)
+2 64 (null)
+3 (null) (null)
+ TABLE
+ end
+
+ test("value") do
+ sliced_table = @table.slice do |slicer|
+ slicer.visible == true
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 (null) (null)
+1 1 true
+2 (null) (null)
+3 8 true
+4 16 true
+5 (null) (null)
+6 (null) (null)
+7 256 true
+ TABLE
+ end
+ end
+
+ sub_test_case("!(column ==)") do
+ test("nil") do
+ sliced_table = @table.slice do |slicer|
+ !(slicer.visible == nil)
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 1 true
+1 2 false
+2 8 true
+3 16 true
+4 32 false
+5 256 true
+ TABLE
+ end
+
+ test("value") do
+ sliced_table = @table.slice do |slicer|
+ !(slicer.visible == true)
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 (null) (null)
+1 2 false
+2 (null) (null)
+3 32 false
+4 (null) (null)
+5 (null) (null)
+ TABLE
+ end
+ end
+
+ sub_test_case("column !=") do
+ test("nil") do
+ sliced_table = @table.slice do |slicer|
+ slicer.visible != nil
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 1 true
+1 2 false
+2 8 true
+3 16 true
+4 32 false
+5 256 true
+ TABLE
+ end
+
+ test("value") do
+ sliced_table = @table.slice do |slicer|
+ slicer.visible != true
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 (null) (null)
+1 2 false
+2 (null) (null)
+3 32 false
+4 (null) (null)
+5 (null) (null)
+ TABLE
+ end
+ end
+
+ test("column < value") do
+ sliced_table = @table.slice do |slicer|
+ slicer.count < 16
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 0 (null)
+1 1 true
+2 2 false
+3 4 (null)
+4 8 true
+5 (null) (null)
+ TABLE
+ end
+
+ test("!(column < value)") do
+ sliced_table = @table.slice do |slicer|
+ !(slicer.count < 16)
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 16 true
+1 32 false
+2 64 (null)
+3 (null) (null)
+4 256 true
+ TABLE
+ end
+
+ test("column <= value") do
+ sliced_table = @table.slice do |slicer|
+ slicer.count <= 16
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 0 (null)
+1 1 true
+2 2 false
+3 4 (null)
+4 8 true
+5 16 true
+6 (null) (null)
+ TABLE
+ end
+
+ test("!(column <= value)") do
+ sliced_table = @table.slice do |slicer|
+ !(slicer.count <= 16)
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 32 false
+1 64 (null)
+2 (null) (null)
+3 256 true
+ TABLE
+ end
+
+ test("column > value") do
+ sliced_table = @table.slice do |slicer|
+ slicer.count > 16
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 32 false
+1 64 (null)
+2 (null) (null)
+3 256 true
+ TABLE
+ end
+
+ test("!(column > value)") do
+ sliced_table = @table.slice do |slicer|
+ !(slicer.count > 16)
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 0 (null)
+1 1 true
+2 2 false
+3 4 (null)
+4 8 true
+5 16 true
+6 (null) (null)
+ TABLE
+ end
+
+ test("column >= value") do
+ sliced_table = @table.slice do |slicer|
+ slicer.count >= 16
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 16 true
+1 32 false
+2 64 (null)
+3 (null) (null)
+4 256 true
+ TABLE
+ end
+
+ test("!(column >= value)") do
+ sliced_table = @table.slice do |slicer|
+ !(slicer.count >= 16)
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 0 (null)
+1 1 true
+2 2 false
+3 4 (null)
+4 8 true
+5 (null) (null)
+ TABLE
+ end
+
+ test("column.in") do
+ sliced_table = @table.slice do |slicer|
+ slicer.count.in?([1, 4, 16, 64])
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 1 true
+1 4 (null)
+2 16 true
+3 64 (null)
+ TABLE
+ end
+
+ test("!column.in") do
+ sliced_table = @table.slice do |slicer|
+ !slicer.count.in?([1, 4, 16, 64])
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 0 (null)
+1 2 false
+2 8 true
+3 32 false
+4 (null) (null)
+5 256 true
+ TABLE
+ end
+
+ test("condition & condition") do
+ sliced_table = @table.slice do |slicer|
+ slicer.visible & (slicer.count >= 16)
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 (null) (null)
+1 (null) (null)
+2 16 true
+3 (null) (null)
+4 (null) (null)
+5 256 true
+ TABLE
+ end
+
+ test("condition | condition") do
+ sliced_table = @table.slice do |slicer|
+ slicer.visible | (slicer.count >= 16)
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 (null) (null)
+1 1 true
+2 (null) (null)
+3 8 true
+4 16 true
+5 32 false
+6 (null) (null)
+7 (null) (null)
+8 256 true
+ TABLE
+ end
+
+ test("condition ^ condition") do
+ sliced_table = @table.slice do |slicer|
+ slicer.visible ^ (slicer.count >= 16)
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 (null) (null)
+1 1 true
+2 (null) (null)
+3 8 true
+4 32 false
+5 (null) (null)
+6 (null) (null)
+ TABLE
+ end
+
+ test("select") do
+ sliced_table = @table.slice do |slicer|
+ slicer.visible.select do |value|
+ value.nil? or value
+ end
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 0 (null)
+1 1 true
+2 4 (null)
+3 8 true
+4 16 true
+5 64 (null)
+6 (null) (null)
+7 256 true
+ TABLE
+ end
+
+ test("!select") do
+ sliced_table = @table.slice do |slicer|
+ !slicer.visible.select do |value|
+ value.nil? or value
+ end
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 2 false
+1 32 false
+ TABLE
+ end
+
+ test("reject") do
+ sliced_table = @table.slice do |slicer|
+ slicer.visible.reject do |value|
+ value.nil? or value
+ end
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 2 false
+1 32 false
+ TABLE
+ end
+
+ test("!reject") do
+ sliced_table = @table.slice do |slicer|
+ !slicer.visible.reject do |value|
+ value.nil? or value
+ end
+ end
+ assert_equal(<<-TABLE, sliced_table.to_s)
+ count visible
+0 0 (null)
+1 1 true
+2 4 (null)
+3 8 true
+4 16 true
+5 64 (null)
+6 (null) (null)
+7 256 true
+ TABLE
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-sort-indices.rb b/src/arrow/ruby/red-arrow/test/test-sort-indices.rb
new file mode 100644
index 000000000..b177831fe
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-sort-indices.rb
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class SortIndicesTest < Test::Unit::TestCase
+ def setup
+ @table = Arrow::Table.new(number1: [16, -1, 2, 32, -4, -4, -8],
+ number2: [32, 2, -16, 8, 1, 4, 1])
+ end
+
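+ # sort_indices returns the row indices that would sort the table;
+ # a "-" prefix on a column name requests descending order.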
+ sub_test_case("Table") do
+ test("Symbol") do
+ assert_equal(Arrow::UInt64Array.new([6, 4, 5, 1, 2, 0, 3]),
+ @table.sort_indices(:number1))
+ end
+
+ test("-String") do
+ assert_equal(Arrow::UInt64Array.new([3, 0, 2, 1, 4, 5, 6]),
+ @table.sort_indices("-number1"))
+ end
+
+ test("Symbol, -String") do
+ assert_equal(Arrow::UInt64Array.new([6, 5, 4, 1, 2, 0, 3]),
+ @table.sort_indices([:number1, "-number2"]))
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-sort-key.rb b/src/arrow/ruby/red-arrow/test/test-sort-key.rb
new file mode 100644
index 000000000..0a31f8461
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-sort-key.rb
@@ -0,0 +1,81 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class SortKeyTest < Test::Unit::TestCase
+ sub_test_case(".resolve") do
+ test("SortKey") do
+ assert_equal(Arrow::SortKey.new("-count"),
+ Arrow::SortKey.resolve(Arrow::SortKey.new("-count")))
+ end
+
+ test("-String") do
+ assert_equal(Arrow::SortKey.new("-count"),
+ Arrow::SortKey.resolve("-count"))
+ end
+
+ test("Symbol, Symbol") do
+ assert_equal(Arrow::SortKey.new("-count"),
+ Arrow::SortKey.resolve(:count, :desc))
+ end
+ end
+
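+ # #to_s always carries an explicit order prefix: "+" for ascending,
+ # "-" for descending. A Symbol is taken as a literal column name, so
+ # :"-count" below becomes the ascending key "+-count".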
+ sub_test_case("#initialize") do
+ test("String") do
+ assert_equal("+count",
+ Arrow::SortKey.new("count").to_s)
+ end
+
+ test("+String") do
+ assert_equal("+count",
+ Arrow::SortKey.new("+count").to_s)
+ end
+
+ test("-String") do
+ assert_equal("-count",
+ Arrow::SortKey.new("-count").to_s)
+ end
+
+ test("Symbol") do
+ assert_equal("+-count",
+ Arrow::SortKey.new(:"-count").to_s)
+ end
+
+ test("String, Symbol") do
+ assert_equal("--count",
+ Arrow::SortKey.new("-count", :desc).to_s)
+ end
+
+ test("String, String") do
+ assert_equal("--count",
+ Arrow::SortKey.new("-count", "desc").to_s)
+ end
+
+ test("String, SortOrder") do
+ assert_equal("--count",
+ Arrow::SortKey.new("-count",
+ Arrow::SortOrder::DESCENDING).to_s)
+ end
+ end
+
+ sub_test_case("#to_s") do
+ test("recreatable") do
+ key = Arrow::SortKey.new("-count", :desc)
+ assert_equal(key,
+ Arrow::SortKey.new(key.to_s))
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-sort-options.rb b/src/arrow/ruby/red-arrow/test/test-sort-options.rb
new file mode 100644
index 000000000..0afd65b0f
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-sort-options.rb
@@ -0,0 +1,58 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class SortOptionsTest < Test::Unit::TestCase
+ sub_test_case("#initialize") do
+ test("none") do
+ options = Arrow::SortOptions.new
+ assert_equal([],
+ options.sort_keys.collect(&:to_s))
+ end
+
+ test("-String, Symbol") do
+ options = Arrow::SortOptions.new("-count", :age)
+ assert_equal(["-count", "+age"],
+ options.sort_keys.collect(&:to_s))
+ end
+ end
+
+ sub_test_case("instance methods") do
+ setup do
+ @options = Arrow::SortOptions.new
+ end
+
+ sub_test_case("#add_sort_key") do
+ test("-String") do
+ @options.add_sort_key("-count")
+ assert_equal(["-count"],
+ @options.sort_keys.collect(&:to_s))
+ end
+
+ test("-String, Symbol") do
+ @options.add_sort_key("-count", :desc)
+ assert_equal(["--count"],
+ @options.sort_keys.collect(&:to_s))
+ end
+
+ test("SortKey") do
+ @options.add_sort_key(Arrow::SortKey.new("-count"))
+ assert_equal(["-count"],
+ @options.sort_keys.collect(&:to_s))
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-sparse-union-data-type.rb b/src/arrow/ruby/red-arrow/test/test-sparse-union-data-type.rb
new file mode 100644
index 000000000..e672f82d4
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-sparse-union-data-type.rb
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class SparseUnionDataTypeTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ def setup
+ @fields = [
+ Arrow::Field.new("visible", :boolean),
+ {
+ name: "count",
+ type: :int32,
+ },
+ ]
+ end
+
+ test("ordered arguments") do
+ assert_equal("sparse_union<visible: bool=2, count: int32=9>",
+ Arrow::SparseUnionDataType.new(@fields, [2, 9]).to_s)
+ end
+
+ test("description") do
+ assert_equal("sparse_union<visible: bool=2, count: int32=9>",
+ Arrow::SparseUnionDataType.new(fields: @fields,
+ type_codes: [2, 9]).to_s)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-string-dictionary-array-builder.rb b/src/arrow/ruby/red-arrow/test/test-string-dictionary-array-builder.rb
new file mode 100644
index 000000000..d6df509ed
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-string-dictionary-array-builder.rb
@@ -0,0 +1,103 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class StringDictionaryArrayBuilderTest < Test::Unit::TestCase
+ def setup
+ @builder = Arrow::StringDictionaryArrayBuilder.new
+ end
+
+ sub_test_case("#append_values") do
+ test("[nil]") do
+ @builder.append_values([nil])
+ array = @builder.finish
+ assert_equal([
+ [],
+ [nil],
+ ],
+ [
+ array.dictionary.to_a,
+ array.indices.to_a,
+ ])
+ end
+
+ test("[String]") do
+ @builder.append_values(["hello"])
+ array = @builder.finish
+ assert_equal([
+ ["hello"],
+ [0],
+ ],
+ [
+ array.dictionary.to_a,
+ array.indices.to_a,
+ ])
+ end
+
+ test("[Symbol]") do
+ @builder.append_values([:hello])
+ array = @builder.finish
+ assert_equal([
+ ["hello"],
+ [0],
+ ],
+ [
+ array.dictionary.to_a,
+ array.indices.to_a,
+ ])
+ end
+
+ test("[nil, String, Symbol]") do
+ @builder.append_values([
+ nil,
+ "Hello",
+ :world,
+ "world",
+ ])
+ array = @builder.finish
+ assert_equal([
+ ["Hello", "world"],
+ [nil, 0, 1, 1],
+ ],
+ [
+ array.dictionary.to_a,
+ array.indices.to_a,
+ ])
+ end
+
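+ # The optional second argument is a validity vector: entries flagged
+ # false become null indices and their values are not added to the
+ # dictionary.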
+ test("is_valids") do
+ @builder.append_values([
+ "Hello",
+ :world,
+ :goodbye,
+ ],
+ [
+ true,
+ false,
+ true,
+ ])
+ array = @builder.finish
+ assert_equal([
+ ["Hello", "goodbye"],
+ [0, nil, 1],
+ ],
+ [
+ array.dictionary.to_a,
+ array.indices.to_a,
+ ])
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-struct-array-builder.rb b/src/arrow/ruby/red-arrow/test/test-struct-array-builder.rb
new file mode 100644
index 000000000..ab0aa5edf
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-struct-array-builder.rb
@@ -0,0 +1,184 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class StructArrayBuilderTest < Test::Unit::TestCase
+ def setup
+ @data_type = Arrow::StructDataType.new(visible: {type: :boolean},
+ count: {type: :uint64})
+ @builder = Arrow::StructArrayBuilder.new(@data_type)
+ end
+
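+ # Appending nil produces a null struct row, but the child builders
+ # still record default values (false/0), as the first test's
+ # assertions on the child fields show.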
+ sub_test_case("#append_value") do
+ test("nil") do
+ @builder.append_value(nil)
+ array = @builder.finish
+ assert_equal([
+ [false],
+ [0],
+ ],
+ [
+ array.find_field(0).to_a,
+ array.find_field(1).to_a,
+ ])
+ end
+
+ test("Array") do
+ @builder.append_value([true, 1])
+ @builder.append_value([])
+ @builder.append_value([false])
+ array = @builder.finish
+ assert_equal([
+ [true, nil, false],
+ [1, nil, nil],
+ ],
+ [
+ array.find_field(0).to_a,
+ array.find_field(1).to_a,
+ ])
+ end
+
+ test("Arrow::Struct") do
+ source_array = Arrow::StructArray.new(@data_type, [[true, 1]])
+ struct = source_array.get_value(0)
+ @builder.append_value(struct)
+ array = @builder.finish
+ assert_equal([
+ [true],
+ [1],
+ ],
+ [
+ array.find_field(0).to_a,
+ array.find_field(1).to_a,
+ ])
+ end
+
+ test("Hash") do
+ @builder.append_value(count: 1, visible: true)
+ @builder.append_value(visible: false)
+ @builder.append_value(count: 2)
+ array = @builder.finish
+ assert_equal([
+ [true, false, nil],
+ [1, nil, 2],
+ ],
+ [
+ array.find_field(0).to_a,
+ array.find_field(1).to_a,
+ ])
+ end
+ end
+
+ sub_test_case("#append_values") do
+ test("[nil]") do
+ @builder.append_values([nil])
+ array = @builder.finish
+ assert_equal([
+ [false],
+ [0],
+ ],
+ [
+ array.find_field(0).to_a,
+ array.find_field(1).to_a,
+ ])
+ end
+
+ test("[Array]") do
+ @builder.append_values([[true, 1]])
+ array = @builder.finish
+ assert_equal([
+ [true],
+ [1],
+ ],
+ [
+ array.find_field(0).to_a,
+ array.find_field(1).to_a,
+ ])
+ end
+
+ test("[Hash]") do
+ @builder.append_values([{count: 1, visible: true}])
+ array = @builder.finish
+ assert_equal([
+ [true],
+ [1],
+ ],
+ [
+ array.find_field(0).to_a,
+ array.find_field(1).to_a,
+ ])
+ end
+
+ test("[nil, Array, Hash]") do
+ @builder.append_values([
+ nil,
+ [true, 1],
+ {count: 2, visible: false},
+ ])
+ array = @builder.finish
+ assert_equal([
+ [false, true, false],
+ [0, 1, 2],
+ ],
+ [
+ array.find_field(0).to_a,
+ array.find_field(1).to_a,
+ ])
+ end
+
+ test("is_valids") do
+ @builder.append_values([
+ [true, 1],
+ [false, 2],
+ [true, 3],
+ ],
+ [
+ true,
+ false,
+ true,
+ ])
+ array = @builder.finish
+ assert_equal([
+ [true, false, true],
+ [1, 0, 3],
+ ],
+ [
+ array.find_field(0).to_a,
+ array.find_field(1).to_a,
+ ])
+ end
+ end
+
+ sub_test_case("#append") do
+ test("backward compatibility") do
+ @builder.append
+ @builder.get_field_builder(0).append(true)
+ @builder.get_field_builder(1).append(1)
+ @builder.append
+ @builder.get_field_builder(0).append(false)
+ @builder.get_field_builder(1).append(2)
+ array = @builder.finish
+ assert_equal([
+ {"visible" => true, "count" => 1},
+ {"visible" => false, "count" => 2},
+ ],
+ [
+ array.get_value(0),
+ array.get_value(1),
+ ])
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-struct-array.rb b/src/arrow/ruby/red-arrow/test/test-struct-array.rb
new file mode 100644
index 000000000..2c01f33ef
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-struct-array.rb
@@ -0,0 +1,94 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class StructArrayTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ test("build") do
+ data_type = Arrow::StructDataType.new(visible: :boolean,
+ count: :uint64)
+ values = [
+ [true, 1],
+ nil,
+ [false, 2],
+ ]
+ array = Arrow::StructArray.new(data_type, values)
+ assert_equal([
+ [true, false, false],
+ [1, 0, 2],
+ ],
+ [
+ array.find_field(0).to_a,
+ array.find_field(1).to_a,
+ ])
+ end
+ end
+
+ sub_test_case("instance methods") do
+ def setup
+ @data_type = Arrow::StructDataType.new(visible: {type: :boolean},
+ count: {type: :uint64})
+ @values = [
+ [true, 1],
+ [false, 2],
+ ]
+ @array = Arrow::StructArray.new(@data_type, @values)
+ end
+
+ test("#[]") do
+ assert_equal([
+ {"visible" => true, "count" => 1},
+ {"visible" => false, "count" => 2},
+ ],
+ @array.to_a)
+ end
+
+ test("#get_value") do
+ assert_equal([
+ {"visible" => true, "count" => 1},
+ {"visible" => false, "count" => 2},
+ ],
+ [
+ @array.get_value(0),
+ @array.get_value(1),
+ ])
+ end
+
+ sub_test_case("#find_field") do
+ test("Integer") do
+ assert_equal([
+ [true, false],
+ [1, 2],
+ ],
+ [
+ @array.find_field(0).to_a,
+ @array.find_field(1).to_a,
+ ])
+ end
+
+ test("String, Symbol") do
+ assert_equal([
+ [true, false],
+ [1, 2],
+ ],
+ [
+ @array.find_field("visible").to_a,
+ @array.find_field(:count).to_a,
+ ])
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-struct-data-type.rb b/src/arrow/ruby/red-arrow/test/test-struct-data-type.rb
new file mode 100644
index 000000000..d106e38b1
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-struct-data-type.rb
@@ -0,0 +1,112 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class StructDataTypeTest < Test::Unit::TestCase
+ def setup
+ @count_field = Arrow::Field.new("count", :uint32)
+ @visible_field = Arrow::Field.new("visible", :boolean)
+ end
+
+ sub_test_case(".new") do
+ test("[Arrow::Field]") do
+ fields = [
+ @count_field,
+ @visible_field,
+ ]
+ assert_equal("struct<count: uint32, visible: bool>",
+ Arrow::StructDataType.new(fields).to_s)
+ end
+
+ test("[Hash]") do
+ fields = [
+ {name: "count", data_type: :uint32},
+ {name: "visible", data_type: :boolean},
+ ]
+ assert_equal("struct<count: uint32, visible: bool>",
+ Arrow::StructDataType.new(fields).to_s)
+ end
+
+ test("[Arrow::Field, Hash]") do
+ fields = [
+ @count_field,
+ {name: "visible", data_type: :boolean},
+ ]
+ assert_equal("struct<count: uint32, visible: bool>",
+ Arrow::StructDataType.new(fields).to_s)
+ end
+
+ test("{Arrow::DataType}") do
+ fields = {
+ "count" => Arrow::UInt32DataType.new,
+ "visible" => Arrow::BooleanDataType.new,
+ }
+ assert_equal("struct<count: uint32, visible: bool>",
+ Arrow::StructDataType.new(fields).to_s)
+ end
+
+ test("{Hash}") do
+ fields = {
+ "count" => {type: :uint32},
+ "visible" => {type: :boolean},
+ }
+ assert_equal("struct<count: uint32, visible: bool>",
+ Arrow::StructDataType.new(fields).to_s)
+ end
+
+ test("{String, Symbol}") do
+ fields = {
+ "count" => "uint32",
+ "visible" => :boolean,
+ }
+ assert_equal("struct<count: uint32, visible: bool>",
+ Arrow::StructDataType.new(fields).to_s)
+ end
+ end
+
+ sub_test_case("instance methods") do
+ def setup
+ super
+ @data_type = Arrow::StructDataType.new([@count_field, @visible_field])
+ end
+
+ sub_test_case("#[]") do
+ test("[String]") do
+ assert_equal([@count_field, @visible_field],
+ [@data_type["count"], @data_type["visible"]])
+ end
+
+ test("[Symbol]") do
+ assert_equal([@count_field, @visible_field],
+ [@data_type[:count], @data_type[:visible]])
+ end
+
+ test("[Integer]") do
+ assert_equal([@count_field, @visible_field],
+ [@data_type[0], @data_type[1]])
+ end
+
+ test("[invalid]") do
+ invalid = []
+ message = "field name or index must be String, Symbol or Integer"
+ message << ": <#{invalid.inspect}>"
+ assert_raise(ArgumentError.new(message)) do
+ @data_type[invalid]
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-table.rb b/src/arrow/ruby/red-arrow/test/test-table.rb
new file mode 100644
index 000000000..78361a824
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-table.rb
@@ -0,0 +1,925 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TableTest < Test::Unit::TestCase
+ include Helper::Fixture
+
+ def setup
+ @count_field = Arrow::Field.new("count", :uint8)
+ @visible_field = Arrow::Field.new("visible", :boolean)
+ schema = Arrow::Schema.new([@count_field, @visible_field])
+ count_arrays = [
+ Arrow::UInt8Array.new([1, 2]),
+ Arrow::UInt8Array.new([4, 8, 16]),
+ Arrow::UInt8Array.new([32, 64]),
+ Arrow::UInt8Array.new([128]),
+ ]
+ visible_arrays = [
+ Arrow::BooleanArray.new([true, false, nil]),
+ Arrow::BooleanArray.new([true]),
+ Arrow::BooleanArray.new([true, false]),
+ Arrow::BooleanArray.new([nil]),
+ Arrow::BooleanArray.new([nil]),
+ ]
+ @count_array = Arrow::ChunkedArray.new(count_arrays)
+ @visible_array = Arrow::ChunkedArray.new(visible_arrays)
+ @table = Arrow::Table.new(schema, [@count_array, @visible_array])
+ end
+
+ test("#columns") do
+ assert_equal([
+ Arrow::Column.new(@table, 0),
+ Arrow::Column.new(@table, 1),
+ ],
+ @table.columns)
+ end
+
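+ # #slice accepts several selectors: a boolean mask (Array or
+ # Arrow::BooleanArray), a row index, a Range, an offset/length pair,
+ # or a Hash of per-column conditions. Null mask entries emit null rows.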
+ sub_test_case("#slice") do
+ test("Arrow::BooleanArray") do
+ target_rows_raw = [nil, true, true, false, true, false, true, true]
+ target_rows = Arrow::BooleanArray.new(target_rows_raw)
+ assert_equal(<<-TABLE, @table.slice(target_rows).to_s)
+ count visible
+0 (null) (null)
+1 2 false
+2 4 (null)
+3 16 true
+4 64 (null)
+5 128 (null)
+ TABLE
+ end
+
+ test("Array: boolean") do
+ target_rows_raw = [nil, true, true, false, true, false, true, true]
+ assert_equal(<<-TABLE, @table.slice(target_rows_raw).to_s)
+ count visible
+0 (null) (null)
+1 2 false
+2 4 (null)
+3 16 true
+4 64 (null)
+5 128 (null)
+ TABLE
+ end
+
+ test("Integer: positive") do
+ assert_equal({"count" => 128, "visible" => nil},
+ @table.slice(@table.n_rows - 1).to_h)
+ end
+
+ test("Integer: negative") do
+ assert_equal({"count" => 1, "visible" => true},
+ @table.slice(-@table.n_rows).to_h)
+ end
+
+ test("Integer: out of index") do
+ assert_equal([
+ nil,
+ nil,
+ ],
+ [
+ @table.slice(@table.n_rows),
+ @table.slice(-(@table.n_rows + 1)),
+ ])
+ end
+
+ test("Range: positive: include end") do
+ assert_equal(<<-TABLE, @table.slice(2..4).to_s)
+ count visible
+0 4 (null)
+1 8 true
+2 16 true
+ TABLE
+ end
+
+ test("Range: positive: exclude end") do
+ assert_equal(<<-TABLE, @table.slice(2...4).to_s)
+ count visible
+0 4 (null)
+1 8 true
+ TABLE
+ end
+
+ test("Range: negative: include end") do
+ assert_equal(<<-TABLE, @table.slice(-4..-2).to_s)
+ count visible
+0 16 true
+1 32 false
+2 64 (null)
+ TABLE
+ end
+
+ test("Range: negative: exclude end") do
+ assert_equal(<<-TABLE, @table.slice(-4...-2).to_s)
+ count visible
+0 16 true
+1 32 false
+ TABLE
+ end
+
+ test("[from, to]: positive") do
+ assert_equal(<<-TABLE, @table.slice(0, 2).to_s)
+ count visible
+0 1 true
+1 2 false
+ TABLE
+ end
+
+ test("[from, to]: negative") do
+ assert_equal(<<-TABLE, @table.slice(-4, 2).to_s)
+ count visible
+0 16 true
+1 32 false
+ TABLE
+ end
+
+ test("{key: Number}") do
+ assert_equal(<<-TABLE, @table.slice(count: 16).to_s)
+ count visible
+0 16 true
+ TABLE
+ end
+
+ test("{key: String}") do
+ table = Arrow::Table.new(name: Arrow::StringArray.new(["a", "b", "c"]))
+ assert_equal(<<-TABLE, table.slice(name: 'b').to_s)
+ name
+0 b
+ TABLE
+ end
+
+ test("{key: true}") do
+ assert_equal(<<-TABLE, @table.slice(visible: true).to_s)
+ count visible
+0 1 true
+1 (null) (null)
+2 8 true
+3 16 true
+4 (null) (null)
+5 (null) (null)
+ TABLE
+ end
+
+ test("{key: false}") do
+ assert_equal(<<-TABLE, @table.slice(visible: false).to_s)
+ count visible
+0 2 false
+1 (null) (null)
+2 32 false
+3 (null) (null)
+4 (null) (null)
+ TABLE
+ end
+
+ test("{key: Range}: beginless include end") do
+ assert_equal(<<-TABLE, @table.slice(count: ..8).to_s)
+ count visible
+0 1 true
+1 2 false
+2 4 (null)
+3 8 true
+ TABLE
+ end
+
+ test("{key: Range}: beginless exclude end") do
+ assert_equal(<<-TABLE, @table.slice(count: ...8).to_s)
+ count visible
+0 1 true
+1 2 false
+2 4 (null)
+ TABLE
+ end
+
+ test("{key: Range}: endless") do
+ assert_equal(<<-TABLE, @table.slice(count: 16..).to_s)
+ count visible
+0 16 true
+1 32 false
+2 64 (null)
+3 128 (null)
+ TABLE
+ end
+
+ test("{key: Range}: include end") do
+ assert_equal(<<-TABLE, @table.slice(count: 1..16).to_s)
+ count visible
+0 1 true
+1 2 false
+2 4 (null)
+3 8 true
+4 16 true
+ TABLE
+ end
+
+ test("{key: Range}: exclude end") do
+ assert_equal(<<-TABLE, @table.slice(count: 1...16).to_s)
+ count visible
+0 1 true
+1 2 false
+2 4 (null)
+3 8 true
+ TABLE
+ end
+
+ test("{key1: Range, key2: true}") do
+ assert_equal(<<-TABLE, @table.slice(count: 0..8, visible: false).to_s)
+ count visible
+0 2 false
+1 (null) (null)
+2 (null) (null)
+3 (null) (null)
+ TABLE
+ end
+
+ sub_test_case("wrong argument") do
+ test("no arguments") do
+ message = "wrong number of arguments (given 0, expected 1..2)"
+ assert_raise(ArgumentError.new(message)) do
+ @table.slice
+ end
+ end
+
+ test("too many arguments") do
+ message = "wrong number of arguments (given 3, expected 1..2)"
+ assert_raise(ArgumentError.new(message)) do
+ @table.slice(1, 2, 3)
+ end
+ end
+
+ test("arguments: with block") do
+ message = "must not specify both arguments and block"
+ assert_raise(ArgumentError.new(message)) do
+ @table.slice(1, 2) {}
+ end
+ end
+
+ test("offset: too small") do
+ n_rows = @table.n_rows
+ offset = -(n_rows + 1)
+ message = "offset is out of range (-#{n_rows + 1},#{n_rows}): #{offset}"
+ assert_raise(ArgumentError.new(message)) do
+ @table.slice(offset, 1)
+ end
+ end
+
+ test("offset: too large") do
+ n_rows = @table.n_rows
+ offset = n_rows
+ message = "offset is out of range (-#{n_rows + 1},#{n_rows}): #{offset}"
+ assert_raise(ArgumentError.new(message)) do
+ @table.slice(offset, 1)
+ end
+ end
+ end
+ end
+
+ sub_test_case("#[]") do
+ def setup
+ @table = Arrow::Table.new(a: [true],
+ b: [true],
+ c: [true],
+ d: [true],
+ e: [true],
+ f: [true],
+ g: [true])
+ end
+
+ test("[String]") do
+ assert_equal(Arrow::Column.new(@table, 0),
+ @table["a"])
+ end
+
+ test("[Symbol]") do
+ assert_equal(Arrow::Column.new(@table, 1),
+ @table[:b])
+ end
+
+ test("[Integer]") do
+ assert_equal(Arrow::Column.new(@table, 6),
+ @table[-1])
+ end
+
+ test("[Range]") do
+ assert_equal(Arrow::Table.new(d: [true],
+ e: [true]),
+ @table[3..4])
+ end
+
+ test("[[Symbol, String, Integer, Range]]") do
+ assert_equal(Arrow::Table.new(c: [true],
+ a: [true],
+ g: [true],
+ d: [true],
+ e: [true]),
+ @table[[:c, "a", -1, 3..4]])
+ end
+ end
+
+ sub_test_case("#merge") do
+ sub_test_case("Hash") do
+ test("add") do
+ name_array = Arrow::StringArray.new(["a", "b", "c", "d", "e", "f", "g", "h"])
+ assert_equal(<<-TABLE, @table.merge(:name => name_array).to_s)
+ count visible name
+0 1 true a
+1 2 false b
+2 4 (null) c
+3 8 true d
+4 16 true e
+5 32 false f
+6 64 (null) g
+7 128 (null) h
+ TABLE
+ end
+
+ test("remove") do
+ assert_equal(<<-TABLE, @table.merge(:visible => nil).to_s)
+ count
+0 1
+1 2
+2 4
+3 8
+4 16
+5 32
+6 64
+7 128
+ TABLE
+ end
+
+ test("replace") do
+ visible_array = Arrow::Int32Array.new([1] * @visible_array.length)
+ assert_equal(<<-TABLE, @table.merge(:visible => visible_array).to_s)
+ count visible
+0 1 1
+1 2 1
+2 4 1
+3 8 1
+4 16 1
+5 32 1
+6 64 1
+7 128 1
+ TABLE
+ end
+ end
+
+ sub_test_case("Arrow::Table") do
+ test("add") do
+ name_array = Arrow::StringArray.new(["a", "b", "c", "d", "e", "f", "g", "h"])
+ table = Arrow::Table.new("name" => name_array)
+ assert_equal(<<-TABLE, @table.merge(table).to_s)
+ count visible name
+0 1 true a
+1 2 false b
+2 4 (null) c
+3 8 true d
+4 16 true e
+5 32 false f
+6 64 (null) g
+7 128 (null) h
+ TABLE
+ end
+
+ test("replace") do
+ visible_array = Arrow::Int32Array.new([1] * @visible_array.length)
+ table = Arrow::Table.new("visible" => visible_array)
+ assert_equal(<<-TABLE, @table.merge(table).to_s)
+ count visible
+0 1 1
+1 2 1
+2 4 1
+3 8 1
+4 16 1
+5 32 1
+6 64 1
+7 128 1
+ TABLE
+ end
+ end
+ end
+
+ test("column name getter") do
+ assert_equal(Arrow::Column.new(@table, 1),
+ @table.visible)
+ end
+
+ sub_test_case("#remove_column") do
+ test("String") do
+ assert_equal(<<-TABLE, @table.remove_column("visible").to_s)
+ count
+0 1
+1 2
+2 4
+3 8
+4 16
+5 32
+6 64
+7 128
+ TABLE
+ end
+
+ test("Symbol") do
+ assert_equal(<<-TABLE, @table.remove_column(:visible).to_s)
+ count
+0 1
+1 2
+2 4
+3 8
+4 16
+5 32
+6 64
+7 128
+ TABLE
+ end
+
+ test("unknown column name") do
+ assert_raise(KeyError) do
+ @table.remove_column(:nonexistent)
+ end
+ end
+
+ test("Integer") do
+ assert_equal(<<-TABLE, @table.remove_column(1).to_s)
+ count
+0 1
+1 2
+2 4
+3 8
+4 16
+5 32
+6 64
+7 128
+ TABLE
+ end
+
+ test("negative integer") do
+ assert_equal(<<-TABLE, @table.remove_column(-1).to_s)
+ count
+0 1
+1 2
+2 4
+3 8
+4 16
+5 32
+6 64
+7 128
+ TABLE
+ end
+
+ test("too small index") do
+ assert_raise(IndexError) do
+ @table.remove_column(-3)
+ end
+ end
+
+ test("too large index") do
+ assert_raise(IndexError) do
+ @table.remove_column(2)
+ end
+ end
+ end
+
+ sub_test_case("#select_columns") do
+ def setup
+ raw_table = {
+ :a => Arrow::UInt8Array.new([1]),
+ :b => Arrow::UInt8Array.new([1]),
+ :c => Arrow::UInt8Array.new([1]),
+ :d => Arrow::UInt8Array.new([1]),
+ :e => Arrow::UInt8Array.new([1]),
+ }
+ @table = Arrow::Table.new(raw_table)
+ end
+
+ test("names") do
+ assert_equal(<<-TABLE, @table.select_columns(:c, :a).to_s)
+ c a
+0 1 1
+ TABLE
+ end
+
+ test("range") do
+ assert_equal(<<-TABLE, @table.select_columns(2...4).to_s)
+ c d
+0 1 1
+ TABLE
+ end
+
+ test("indexes") do
+ assert_equal(<<-TABLE, @table.select_columns(0, -1, 2).to_s)
+ a e c
+0 1 1 1
+ TABLE
+ end
+
+ test("mixed") do
+ assert_equal(<<-TABLE, @table.select_columns(:a, -1, 2..3).to_s)
+ a e c d
+0 1 1 1 1
+ TABLE
+ end
+
+ test("block") do
+ selected_table = @table.select_columns.with_index do |column, i|
+ column.name == "a" or i.odd?
+ end
+ assert_equal(<<-TABLE, selected_table.to_s)
+ a b d
+0 1 1 1
+ TABLE
+ end
+
+ test("names, indexes and block") do
+ selected_table = @table.select_columns(:a, -1) do |column|
+ column.name == "a"
+ end
+ assert_equal(<<-TABLE, selected_table.to_s)
+ a
+0 1
+ TABLE
+ end
+ end
+
+ sub_test_case("#save and .load") do
+ module SaveLoadFormatTests
+ def test_default
+ output = create_output(".arrow")
+ @table.save(output)
+ assert_equal(@table, Arrow::Table.load(output))
+ end
+
+ def test_arrow_file
+ output = create_output(".arrow")
+ @table.save(output, format: :arrow_file)
+ assert_equal(@table, Arrow::Table.load(output, format: :arrow_file))
+ end
+
+ def test_batch
+ output = create_output(".arrow")
+ @table.save(output, format: :batch)
+ assert_equal(@table, Arrow::Table.load(output, format: :batch))
+ end
+
+ def test_arrow_streaming
+ output = create_output(".arrow")
+ @table.save(output, format: :arrow_streaming)
+ assert_equal(@table, Arrow::Table.load(output, format: :arrow_streaming))
+ end
+
+ def test_stream
+ output = create_output(".arrow")
+ @table.save(output, format: :stream)
+ assert_equal(@table, Arrow::Table.load(output, format: :stream))
+ end
+
+ def test_csv
+ output = create_output(".csv")
+ @table.save(output, format: :csv)
+ assert_equal(@table,
+ Arrow::Table.load(output,
+ format: :csv,
+ schema: @table.schema))
+ end
+
+ def test_csv_gz
+ output = create_output(".csv.gz")
+ @table.save(output,
+ format: :csv,
+ compression: :gzip)
+ assert_equal(@table,
+ Arrow::Table.load(output,
+ format: :csv,
+ compression: :gzip,
+ schema: @table.schema))
+ end
+
+ def test_tsv
+ output = create_output(".tsv")
+ @table.save(output, format: :tsv)
+ assert_equal(@table,
+ Arrow::Table.load(output,
+ format: :tsv,
+ schema: @table.schema))
+ end
+ end
+
+ sub_test_case("path") do
+ sub_test_case(":format") do
+ include SaveLoadFormatTests
+
+ def create_output(extension)
+ @file = Tempfile.new(["red-arrow", extension])
+ @file.path
+ end
+
+ sub_test_case("save: auto detect") do
+ test("csv") do
+ output = create_output(".csv")
+ @table.save(output)
+ assert_equal(@table,
+ Arrow::Table.load(output,
+ format: :csv,
+ schema: @table.schema))
+ end
+
+ test("csv.gz") do
+ output = create_output(".csv.gz")
+ @table.save(output)
+ assert_equal(@table,
+ Arrow::Table.load(output,
+ format: :csv,
+ compression: :gzip,
+ schema: @table.schema))
+ end
+
+ test("tsv") do
+ output = create_output(".tsv")
+ @table.save(output)
+ assert_equal(@table,
+ Arrow::Table.load(output,
+ format: :tsv,
+ schema: @table.schema))
+ end
+ end
+
+ sub_test_case("load: auto detect") do
+ test("arrow: file") do
+ output = create_output(".arrow")
+ @table.save(output, format: :arrow_file)
+ assert_equal(@table, Arrow::Table.load(output))
+ end
+
+ test("arrow: streaming") do
+ output = create_output(".arrow")
+ @table.save(output, format: :arrow_streaming)
+ assert_equal(@table, Arrow::Table.load(output))
+ end
+
+ test("csv") do
+ path = fixture_path("with-header.csv")
+ table = Arrow::Table.load(path, skip_lines: /^\#/)
+ assert_equal(<<-TABLE, table.to_s)
+ name score
+0 alice 10
+1 bob 29
+2 chris -1
+ TABLE
+ end
+
+ test("csv.gz") do
+ file = Tempfile.new(["red-arrow", ".csv.gz"])
+ file.close
+ Zlib::GzipWriter.open(file.path) do |gz|
+ gz.write(<<-CSV)
+name,score
+alice,10
+bob,29
+chris,-1
+ CSV
+ end
+ assert_equal(<<-TABLE, Arrow::Table.load(file.path).to_s)
+ name score
+0 alice 10
+1 bob 29
+2 chris -1
+ TABLE
+ end
+
+ test("tsv") do
+ file = Tempfile.new(["red-arrow", ".tsv"])
+ file.puts(<<-TSV)
+name\tscore
+alice\t10
+bob\t29
+chris\t-1
+ TSV
+ file.close
+ table = Arrow::Table.load(file.path)
+ assert_equal(<<-TABLE, table.to_s)
+ name score
+0 alice 10
+1 bob 29
+2 chris -1
+ TABLE
+ end
+ end
+ end
+ end
+
+ sub_test_case("Buffer") do
+ sub_test_case(":format") do
+ include SaveLoadFormatTests
+
+ def create_output(extension)
+ Arrow::ResizableBuffer.new(1024)
+ end
+ end
+ end
+ end
+
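+ # #pack concatenates each column's chunks into a single chunk; the
+ # [1, 1] n_chunks assertion below verifies both columns were packed.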
+ test("#pack") do
+ packed_table = @table.pack
+ column_n_chunks = packed_table.columns.collect {|c| c.data.n_chunks}
+ assert_equal([[1, 1], <<-TABLE], [column_n_chunks, packed_table.to_s])
+ count visible
+0 1 true
+1 2 false
+2 4 (null)
+3 8 true
+4 16 true
+5 32 false
+6 64 (null)
+7 128 (null)
+ TABLE
+ end
+
+ sub_test_case("#to_s") do
+ sub_test_case(":format") do
+ def setup
+ columns = {
+ "count" => Arrow::UInt8Array.new([1, 2]),
+ "visible" => Arrow::BooleanArray.new([true, false]),
+ }
+ @table = Arrow::Table.new(columns)
+ end
+
+ test(":column") do
+ assert_equal(<<-TABLE, @table.to_s(format: :column))
+count: uint8
+visible: bool
+----
+count:
+ [
+ [
+ 1,
+ 2
+ ]
+ ]
+visible:
+ [
+ [
+ true,
+ false
+ ]
+ ]
+ TABLE
+ end
+
+ test(":list") do
+ assert_equal(<<-TABLE, @table.to_s(format: :list))
+==================== 0 ====================
+count: 1
+visible: true
+==================== 1 ====================
+count: 2
+visible: false
+ TABLE
+ end
+
+ test(":table") do
+ assert_equal(<<-TABLE, @table.to_s(format: :table))
+ count visible
+0 1 true
+1 2 false
+ TABLE
+ end
+
+ test("invalid") do
+ message = ":format must be :column, :list, :table or nil: <:invalid>"
+ assert_raise(ArgumentError.new(message)) do
+ @table.to_s(format: :invalid)
+ end
+ end
+ end
+
+ sub_test_case("#==") do
+ test("Arrow::Table") do
+ assert do
+ @table == @table
+ end
+ end
+
+ test("not Arrow::Table") do
+ assert do
+ not (@table == 29)
+ end
+ end
+ end
+ end
+
+ sub_test_case("#filter") do
+ def setup
+ super
+ @options = Arrow::FilterOptions.new
+ @options.null_selection_behavior = :emit_null
+ end
+
+ test("Array: boolean") do
+ filter = [nil, true, true, false, true, false, true, true]
+ assert_equal(<<-TABLE, @table.filter(filter, @options).to_s)
+ count visible
+0 (null) (null)
+1 2 false
+2 4 (null)
+3 16 true
+4 64 (null)
+5 128 (null)
+ TABLE
+ end
+
+ test("Arrow::BooleanArray") do
+ array = [nil, true, true, false, true, false, true, true]
+ filter = Arrow::BooleanArray.new(array)
+ assert_equal(<<-TABLE, @table.filter(filter, @options).to_s)
+ count visible
+0 (null) (null)
+1 2 false
+2 4 (null)
+3 16 true
+4 64 (null)
+5 128 (null)
+ TABLE
+ end
+
+ test("Arrow::ChunkedArray") do
+ filter_chunks = [
+ Arrow::BooleanArray.new([nil, true, true]),
+ Arrow::BooleanArray.new([false, true, false]),
+ Arrow::BooleanArray.new([true, true]),
+ ]
+ filter = Arrow::ChunkedArray.new(filter_chunks)
+ assert_equal(<<-TABLE, @table.filter(filter, @options).to_s)
+ count visible
+0 (null) (null)
+1 2 false
+2 4 (null)
+3 16 true
+4 64 (null)
+5 128 (null)
+ TABLE
+ end
+ end
+
+ sub_test_case("#take") do
+ test("Arrow: boolean") do
+ indices = [1, 0, 2]
+ assert_equal(<<-TABLE, @table.take(indices).to_s)
+ count visible
+0 2 false
+1 1 true
+2 4 (null)
+ TABLE
+ end
+
+ test("Arrow::Array") do
+ indices = Arrow::Int16Array.new([1, 0, 2])
+ assert_equal(<<-TABLE, @table.take(indices).to_s)
+ count visible
+0 2 false
+1 1 true
+2 4 (null)
+ TABLE
+ end
+
+ test("Arrow::ChunkedArray") do
+ chunks = [
+ Arrow::Int16Array.new([1, 0]),
+ Arrow::Int16Array.new([2])
+ ]
+ indices = Arrow::ChunkedArray.new(chunks)
+ assert_equal(<<-TABLE, @table.take(indices).to_s)
+ count visible
+0 2 false
+1 1 true
+2 4 (null)
+ TABLE
+ end
+ end
+
+ sub_test_case("#concatenate") do
+ test("options: :unify_schemas") do
+ table1 = Arrow::Table.new(a: [true],
+ b: [false])
+ table2 = Arrow::Table.new(b: [false])
+ concatenated = table1.concatenate([table2], unify_schemas: true)
+ assert_equal(<<-TABLE, concatenated.to_s)
+ a b
+0 true false
+1 (null) false
+ TABLE
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-tensor.rb b/src/arrow/ruby/red-arrow/test/test-tensor.rb
new file mode 100644
index 000000000..ffa1e3241
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-tensor.rb
@@ -0,0 +1,56 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TensorTest < Test::Unit::TestCase
+ sub_test_case("instance methods") do
+ def setup
+ raw_data = [
+ 1, 2,
+ 3, 4,
+
+ 5, 6,
+ 7, 8,
+
+ 9, 10,
+ 11, 12,
+ ]
+ data = Arrow::Buffer.new(raw_data.pack("c*"))
+ shape = [3, 2, 2]
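+ # Empty strides request the default contiguous (row-major) layout.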
+ strides = []
+ names = ["a", "b", "c"]
+ @tensor = Arrow::Tensor.new(Arrow::Int8DataType.new,
+ data,
+ shape,
+ strides,
+ names)
+ end
+
+ sub_test_case("#==") do
+ test("Arrow::Tensor") do
+ assert do
+ @tensor == @tensor
+ end
+ end
+
+ test("not Arrow::Tensor") do
+ assert do
+ not (@tensor == 29)
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-time.rb b/src/arrow/ruby/red-arrow/test/test-time.rb
new file mode 100644
index 000000000..37c098c69
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-time.rb
@@ -0,0 +1,288 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TimeTest < Test::Unit::TestCase
+ sub_test_case("#==") do
+ test("same unit") do
+ assert do
+ Arrow::Time.new(:second, 10) == Arrow::Time.new(:second, 10)
+ end
+ end
+
+ test("different unit") do
+ assert do
+ Arrow::Time.new(:second, 10) == Arrow::Time.new(:milli, 10 * 1000)
+ end
+ end
+
+ test("false") do
+ assert do
+ not(Arrow::Time.new(:second, 10) == Arrow::Time.new(:second, 11))
+ end
+ end
+ end
+
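+ # Casting to a coarser unit drops the sub-unit remainder (10_200 ms
+ # becomes 10 s below); casting to a finer unit multiplies exactly.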
+ sub_test_case("#cast") do
+ test("same unit") do
+ time = Arrow::Time.new(Arrow::TimeUnit::SECOND, 10)
+ casted_time = time.cast(Arrow::TimeUnit::SECOND)
+ assert_equal([time.unit, time.value],
+ [casted_time.unit, casted_time.value])
+ end
+
+ test("second -> milli") do
+ time = Arrow::Time.new(Arrow::TimeUnit::SECOND, 10)
+ casted_time = time.cast(Arrow::TimeUnit::MILLI)
+ assert_equal([
+ Arrow::TimeUnit::MILLI,
+ time.value * 1000,
+ ],
+ [
+ casted_time.unit,
+ casted_time.value,
+ ])
+ end
+
+ test("second -> micro") do
+ time = Arrow::Time.new(Arrow::TimeUnit::SECOND, 10)
+ casted_time = time.cast(Arrow::TimeUnit::MICRO)
+ assert_equal([
+ Arrow::TimeUnit::MICRO,
+ time.value * 1000 * 1000,
+ ],
+ [
+ casted_time.unit,
+ casted_time.value,
+ ])
+ end
+
+ test("second -> nano") do
+ time = Arrow::Time.new(Arrow::TimeUnit::SECOND, 10)
+ casted_time = time.cast(Arrow::TimeUnit::NANO)
+ assert_equal([
+ Arrow::TimeUnit::NANO,
+ time.value * 1000 * 1000 * 1000,
+ ],
+ [
+ casted_time.unit,
+ casted_time.value,
+ ])
+ end
+
+ test("milli -> second") do
+ time = Arrow::Time.new(Arrow::TimeUnit::MILLI, 10_200)
+ casted_time = time.cast(Arrow::TimeUnit::SECOND)
+ assert_equal([
+ Arrow::TimeUnit::SECOND,
+ 10,
+ ],
+ [
+ casted_time.unit,
+ casted_time.value,
+ ])
+ end
+
+ test("milli -> micro") do
+ time = Arrow::Time.new(Arrow::TimeUnit::MILLI, 10_200)
+ casted_time = time.cast(Arrow::TimeUnit::MICRO)
+ assert_equal([
+ Arrow::TimeUnit::MICRO,
+ time.value * 1000,
+ ],
+ [
+ casted_time.unit,
+ casted_time.value,
+ ])
+ end
+
+ test("milli -> nano") do
+ time = Arrow::Time.new(Arrow::TimeUnit::MILLI, 10_200)
+ casted_time = time.cast(Arrow::TimeUnit::NANO)
+ assert_equal([
+ Arrow::TimeUnit::NANO,
+ time.value * 1000 * 1000,
+ ],
+ [
+ casted_time.unit,
+ casted_time.value,
+ ])
+ end
+
+ test("micro -> second") do
+ time = Arrow::Time.new(Arrow::TimeUnit::MICRO, 10_200_300)
+ casted_time = time.cast(Arrow::TimeUnit::SECOND)
+ assert_equal([
+ Arrow::TimeUnit::SECOND,
+ 10,
+ ],
+ [
+ casted_time.unit,
+ casted_time.value,
+ ])
+ end
+
+ test("micro -> milli") do
+ time = Arrow::Time.new(Arrow::TimeUnit::MICRO, 10_200_300)
+ casted_time = time.cast(Arrow::TimeUnit::MILLI)
+ assert_equal([
+ Arrow::TimeUnit::MILLI,
+ 10_200,
+ ],
+ [
+ casted_time.unit,
+ casted_time.value,
+ ])
+ end
+
+ test("micro -> nano") do
+ time = Arrow::Time.new(Arrow::TimeUnit::MICRO, 10_200_300)
+ casted_time = time.cast(Arrow::TimeUnit::NANO)
+ assert_equal([
+ Arrow::TimeUnit::NANO,
+ time.value * 1000,
+ ],
+ [
+ casted_time.unit,
+ casted_time.value,
+ ])
+ end
+
+ test("nano -> second") do
+ time = Arrow::Time.new(Arrow::TimeUnit::NANO, 10_200_300_400)
+ casted_time = time.cast(Arrow::TimeUnit::SECOND)
+ assert_equal([
+ Arrow::TimeUnit::SECOND,
+ 10,
+ ],
+ [
+ casted_time.unit,
+ casted_time.value,
+ ])
+ end
+
+ test("nano -> milli") do
+ time = Arrow::Time.new(Arrow::TimeUnit::NANO, 10_200_300_400)
+ casted_time = time.cast(Arrow::TimeUnit::MILLI)
+ assert_equal([
+ Arrow::TimeUnit::MILLI,
+ 10_200,
+ ],
+ [
+ casted_time.unit,
+ casted_time.value,
+ ])
+ end
+
+ test("nano -> micro") do
+ time = Arrow::Time.new(Arrow::TimeUnit::NANO, 10_200_300_400)
+ casted_time = time.cast(Arrow::TimeUnit::MICRO)
+ assert_equal([
+ Arrow::TimeUnit::MICRO,
+ 10_200_300,
+ ],
+ [
+ casted_time.unit,
+ casted_time.value,
+ ])
+ end
+ end
+
+ sub_test_case("#to_f") do
+ test("second") do
+ time = Arrow::Time.new(Arrow::TimeUnit::SECOND, 10)
+ assert_in_delta(10.0, time.to_f)
+ end
+
+ test("milli") do
+ time = Arrow::Time.new(Arrow::TimeUnit::MILLI, 10_200)
+ assert_in_delta(10.2, time.to_f)
+ end
+
+ test("micro") do
+ time = Arrow::Time.new(Arrow::TimeUnit::MICRO, 10_200_300)
+ assert_in_delta(10.2003, time.to_f)
+ end
+
+ test("nano") do
+ time = Arrow::Time.new(Arrow::TimeUnit::NANO, 10_200_300_400)
+ assert_in_delta(10.2003004, time.to_f)
+ end
+ end
+
+ sub_test_case("#positive?") do
+ test("true") do
+ time = Arrow::Time.new(Arrow::TimeUnit::SECOND, 10)
+ assert do
+ time.positive?
+ end
+ end
+
+ test("false") do
+ time = Arrow::Time.new(Arrow::TimeUnit::SECOND, -10)
+ assert do
+ not time.positive?
+ end
+ end
+ end
+
+ sub_test_case("#negative?") do
+ test("true") do
+ time = Arrow::Time.new(Arrow::TimeUnit::SECOND, -10)
+ assert do
+ time.negative?
+ end
+ end
+
+ test("false") do
+ time = Arrow::Time.new(Arrow::TimeUnit::SECOND, 10)
+ assert do
+ not time.negative?
+ end
+ end
+ end
+
+ test("#hour") do
+ time = Arrow::Time.new(Arrow::TimeUnit::SECOND,
+ (5 * 60 * 60) + (12 * 60) + 10)
+ assert_equal(5, time.hour)
+ end
+
+ test("#minute") do
+ time = Arrow::Time.new(Arrow::TimeUnit::SECOND,
+ (5 * 60 * 60) + (12 * 60) + 10)
+ assert_equal(12, time.minute)
+ end
+
+ test("#second") do
+ time = Arrow::Time.new(Arrow::TimeUnit::SECOND,
+ (5 * 60 * 60) + (12 * 60) + 10)
+ assert_equal(10, time.second)
+ end
+
+ test("#nano_second") do
+ time = Arrow::Time.new(Arrow::TimeUnit::NANO, 1234)
+ assert_equal(1234, time.nano_second)
+ end
+
+ test("#to_s") do
+ time = Arrow::Time.new(Arrow::TimeUnit::NANO,
+ -(((5 * 60 * 60) + (12 * 60) + 10) * 1_000_000_000 +
+ 1234))
+ assert_equal("-05:12:10.000001234",
+ time.to_s)
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-time32-array.rb b/src/arrow/ruby/red-arrow/test/test-time32-array.rb
new file mode 100644
index 000000000..b8bb4eb94
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-time32-array.rb
@@ -0,0 +1,81 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class Time32ArrayTest < Test::Unit::TestCase
+ sub_test_case(".new") do
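+ # The unit argument accepts an Arrow::TimeUnit value or a Symbol such
+ # as :second or :milli; values may be Integers or Arrow::Times.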
+ sub_test_case("unit") do
+ test("Arrow::TimeUnit") do
+ values = [1000 * 10, nil]
+ array = Arrow::Time32Array.new(Arrow::TimeUnit::MILLI, values)
+ assert_equal([
+ "time32[ms]",
+ [
+ Arrow::Time.new(Arrow::TimeUnit::MILLI,
+ 1000 * 10),
+ nil,
+ ],
+ ],
+ [
+ array.value_data_type.to_s,
+ array.to_a,
+ ])
+ end
+
+ test("Symbol") do
+ values = [60 * 10, nil]
+ array = Arrow::Time32Array.new(:second, values)
+ assert_equal([
+ "time32[s]",
+ [
+ Arrow::Time.new(Arrow::TimeUnit::SECOND,
+ 60 * 10),
+ nil,
+ ],
+ ],
+ [
+ array.value_data_type.to_s,
+ array.to_a,
+ ])
+ end
+ end
+
+ sub_test_case("values") do
+ test("Arrow::Time") do
+ data_type = Arrow::Time32DataType.new(:second)
+ values = [
+ Arrow::Time.new(Arrow::TimeUnit::SECOND,
+ 60 * 10),
+ nil,
+ ]
+ array = Arrow::Time32Array.new(data_type, values)
+ assert_equal(values, array.to_a)
+ end
+
+ test("Integer") do
+ data_type = Arrow::Time32DataType.new(:second)
+ values = [60 * 10, nil]
+ array = Arrow::Time32Array.new(data_type, values)
+ assert_equal([
+ Arrow::Time.new(Arrow::TimeUnit::SECOND,
+ 60 * 10),
+ nil,
+ ],
+ array.to_a)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-time32-data-type.rb b/src/arrow/ruby/red-arrow/test/test-time32-data-type.rb
new file mode 100644
index 000000000..26f17359a
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-time32-data-type.rb
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class Time32DataTypeTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ test("Arrow::TimeUnit") do
+ assert_equal("time32[ms]",
+ Arrow::Time32DataType.new(Arrow::TimeUnit::MILLI).to_s)
+ end
+
+ test("Symbol") do
+ assert_equal("time32[ms]",
+ Arrow::Time32DataType.new(:milli).to_s)
+ end
+
+ test("unit: Arrow::TimeUnit") do
+ data_type = Arrow::Time32DataType.new(unit: Arrow::TimeUnit::MILLI)
+ assert_equal("time32[ms]",
+ data_type.to_s)
+ end
+
+ test("unit: Symbol") do
+ data_type = Arrow::Time32DataType.new(unit: :milli)
+ assert_equal("time32[ms]",
+ data_type.to_s)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-time64-array.rb b/src/arrow/ruby/red-arrow/test/test-time64-array.rb
new file mode 100644
index 000000000..831af1e35
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-time64-array.rb
@@ -0,0 +1,81 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class Time64ArrayTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ sub_test_case("unit") do
+ test("Arrow::TimeUnit") do
+ values = [1000 * 10, nil]
+ array = Arrow::Time64Array.new(Arrow::TimeUnit::NANO, values)
+ assert_equal([
+ "time64[ns]",
+ [
+ Arrow::Time.new(Arrow::TimeUnit::NANO,
+ 1000 * 10),
+ nil,
+ ],
+ ],
+ [
+ array.value_data_type.to_s,
+ array.to_a,
+ ])
+ end
+
+ test("Symbol") do
+ values = [1000 * 10, nil]
+ array = Arrow::Time64Array.new(:micro, values)
+ assert_equal([
+ "time64[us]",
+ [
+ Arrow::Time.new(Arrow::TimeUnit::MICRO,
+ 1000 * 10),
+ nil,
+ ],
+ ],
+ [
+ array.value_data_type.to_s,
+ array.to_a,
+ ])
+ end
+ end
+
+ sub_test_case("values") do
+ test("Arrow::Time") do
+ data_type = Arrow::Time64DataType.new(:nano)
+ values = [
+ Arrow::Time.new(Arrow::TimeUnit::NANO,
+ 1000 * 10),
+ nil,
+ ]
+ array = Arrow::Time64Array.new(data_type, values)
+ assert_equal(values, array.to_a)
+ end
+
+ test("Integer") do
+ data_type = Arrow::Time64DataType.new(:nano)
+ values = [1000 * 10, nil]
+ array = Arrow::Time64Array.new(data_type, values)
+ assert_equal([
+ Arrow::Time.new(Arrow::TimeUnit::NANO,
+ 1000 * 10),
+ nil,
+ ],
+ array.to_a)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-time64-data-type.rb b/src/arrow/ruby/red-arrow/test/test-time64-data-type.rb
new file mode 100644
index 000000000..a5f341753
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-time64-data-type.rb
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class Time64DataTypeTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ test("Arrow::TimeUnit") do
+ assert_equal("time64[ns]",
+ Arrow::Time64DataType.new(Arrow::TimeUnit::NANO).to_s)
+ end
+
+ test("Symbol") do
+ assert_equal("time64[ns]",
+ Arrow::Time64DataType.new(:nano).to_s)
+ end
+
+ test("unit: Arrow::TimeUnit") do
+ data_type = Arrow::Time64DataType.new(unit: Arrow::TimeUnit::NANO)
+ assert_equal("time64[ns]",
+ data_type.to_s)
+ end
+
+ test("unit: Symbol") do
+ data_type = Arrow::Time64DataType.new(unit: :nano)
+ assert_equal("time64[ns]",
+ data_type.to_s)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-timestamp-array.rb b/src/arrow/ruby/red-arrow/test/test-timestamp-array.rb
new file mode 100644
index 000000000..248a2531e
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-timestamp-array.rb
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TimestampArrayTest < Test::Unit::TestCase
+ test("#[]") do
+ sec = 1513267750
+ usec = 914509
+ array = Arrow::TimestampArray.new(:micro, [sec * (10 ** 6) + usec])
+ time = Time.at(sec, usec)
+ assert_equal(time, array[0])
+ end
+
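+ # is_in returns a BooleanArray marking which elements of the
+ # receiver appear in the given values.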
+ sub_test_case("#is_in") do
+ def setup
+ values = [
+ Time.parse("2019-11-18T00:09:11"),
+ Time.parse("2019-11-18T00:09:12"),
+ Time.parse("2019-11-18T00:09:13"),
+ ]
+ @array = Arrow::TimestampArray.new(:micro, values)
+ end
+
+ test("Arrow: Array") do
+ right = [
+ Time.parse("2019-11-18T00:09:12"),
+ ]
+ assert_equal(Arrow::BooleanArray.new([false, true, false]),
+ @array.is_in(right))
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/test-timestamp-data-type.rb b/src/arrow/ruby/red-arrow/test/test-timestamp-data-type.rb
new file mode 100644
index 000000000..f8ccd3d8b
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/test-timestamp-data-type.rb
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TimestampDataTypeTest < Test::Unit::TestCase
+ sub_test_case(".new") do
+ test("Arrow::TimeUnit") do
+ assert_equal("timestamp[ms]",
+ Arrow::TimestampDataType.new(Arrow::TimeUnit::MILLI).to_s)
+ end
+
+ test("Symbol") do
+ assert_equal("timestamp[ms]",
+ Arrow::TimestampDataType.new(:milli).to_s)
+ end
+
+ test("unit: Arrow::TimeUnit") do
+ data_type = Arrow::TimestampDataType.new(unit: Arrow::TimeUnit::MILLI)
+ assert_equal("timestamp[ms]",
+ data_type.to_s)
+ end
+
+ test("unit: Symbol") do
+ data_type = Arrow::TimestampDataType.new(unit: :milli)
+ assert_equal("timestamp[ms]",
+ data_type.to_s)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/values/test-basic-arrays.rb b/src/arrow/ruby/red-arrow/test/values/test-basic-arrays.rb
new file mode 100644
index 000000000..c54c7f62d
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/values/test-basic-arrays.rb
@@ -0,0 +1,295 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ValuesBasicArraysTests
+ def test_null
+ target = build(Arrow::NullArray.new(4))
+ assert_equal([nil] * 4, target.values)
+ end
+
+ def test_boolean
+ values = [true, nil, false]
+ target = build(Arrow::BooleanArray.new(values))
+ assert_equal(values, target.values)
+ end
+
+ def test_int8
+ values = [
+ -(2 ** 7),
+ nil,
+ (2 ** 7) - 1,
+ ]
+ target = build(Arrow::Int8Array.new(values))
+ assert_equal(values, target.values)
+ end
+
+ def test_uint8
+ values = [
+ 0,
+ nil,
+ (2 ** 8) - 1,
+ ]
+ target = build(Arrow::UInt8Array.new(values))
+ assert_equal(values, target.values)
+ end
+
+ def test_int16
+ values = [
+ -(2 ** 15),
+ nil,
+ (2 ** 15) - 1,
+ ]
+ target = build(Arrow::Int16Array.new(values))
+ assert_equal(values, target.values)
+ end
+
+ def test_uint16
+ values = [
+ 0,
+ nil,
+ (2 ** 16) - 1,
+ ]
+ target = build(Arrow::UInt16Array.new(values))
+ assert_equal(values, target.values)
+ end
+
+ def test_int32
+ values = [
+ -(2 ** 31),
+ nil,
+ (2 ** 31) - 1,
+ ]
+ target = build(Arrow::Int32Array.new(values))
+ assert_equal(values, target.values)
+ end
+
+ def test_uint32
+ values = [
+ 0,
+ nil,
+ (2 ** 32) - 1,
+ ]
+ target = build(Arrow::UInt32Array.new(values))
+ assert_equal(values, target.values)
+ end
+
+ def test_int64
+ values = [
+ -(2 ** 63),
+ nil,
+ (2 ** 63) - 1,
+ ]
+ target = build(Arrow::Int64Array.new(values))
+ assert_equal(values, target.values)
+ end
+
+ def test_uint64
+ values = [
+ 0,
+ nil,
+ (2 ** 64) - 1,
+ ]
+ target = build(Arrow::UInt64Array.new(values))
+ assert_equal(values, target.values)
+ end
+
+ def test_float
+ values = [
+ -1.0,
+ nil,
+ 1.0,
+ ]
+ target = build(Arrow::FloatArray.new(values))
+ assert_equal(values, target.values)
+ end
+
+ def test_double
+ values = [
+ -1.0,
+ nil,
+ 1.0,
+ ]
+ target = build(Arrow::DoubleArray.new(values))
+ assert_equal(values, target.values)
+ end
+
+ def test_binary
+ values = [
+ "\x00".b,
+ nil,
+ "\xff".b,
+ ]
+ target = build(Arrow::BinaryArray.new(values))
+ assert_equal(values, target.values)
+ end
+
+ def test_string
+ values = [
+ "Ruby",
+ nil,
+ "\u3042", # U+3042 HIRAGANA LETTER A
+ ]
+ target = build(Arrow::StringArray.new(values))
+ assert_equal(values, target.values)
+ end
+
+ def test_date32
+ values = [
+ Date.new(1960, 1, 1),
+ nil,
+ Date.new(2017, 8, 23),
+ ]
+ target = build(Arrow::Date32Array.new(values))
+ assert_equal(values, target.values)
+ end
+
+ def test_date64
+ values = [
+ DateTime.new(1960, 1, 1, 2, 9, 30),
+ nil,
+ DateTime.new(2017, 8, 23, 14, 57, 2),
+ ]
+ target = build(Arrow::Date64Array.new(values))
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_second
+ values = [
+ Time.parse("1960-01-01T02:09:30Z"),
+ nil,
+ Time.parse("2017-08-23T14:57:02Z"),
+ ]
+ target = build(Arrow::TimestampArray.new(:second, values))
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_milli
+ values = [
+ Time.parse("1960-01-01T02:09:30.123Z"),
+ nil,
+ Time.parse("2017-08-23T14:57:02.987Z"),
+ ]
+ target = build(Arrow::TimestampArray.new(:milli, values))
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_micro
+ values = [
+ Time.parse("1960-01-01T02:09:30.123456Z"),
+ nil,
+ Time.parse("2017-08-23T14:57:02.987654Z"),
+ ]
+ target = build(Arrow::TimestampArray.new(:micro, values))
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_nano
+ values = [
+ Time.parse("1960-01-01T02:09:30.123456789Z"),
+ nil,
+ Time.parse("2017-08-23T14:57:02.987654321Z"),
+ ]
+ target = build(Arrow::TimestampArray.new(:nano, values))
+ assert_equal(values, target.values)
+ end
+
+ def test_time32_second
+ unit = Arrow::TimeUnit::SECOND
+ values = [
+ Arrow::Time.new(unit, 60 * 10), # 00:10:00
+ nil,
+ Arrow::Time.new(unit, 60 * 60 * 2 + 9), # 02:00:09
+ ]
+ target = build(Arrow::Time32Array.new(:second, values))
+ assert_equal(values, target.values)
+ end
+
+ def test_time32_milli
+ unit = Arrow::TimeUnit::MILLI
+ values = [
+ Arrow::Time.new(unit, (60 * 10) * 1000 + 123), # 00:10:00.123
+ nil,
+ Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1000 + 987), # 02:00:09.987
+ ]
+ target = build(Arrow::Time32Array.new(:milli, values))
+ assert_equal(values, target.values)
+ end
+
+ def test_time64_micro
+ unit = Arrow::TimeUnit::MICRO
+ values = [
+ # 00:10:00.123456
+ Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456),
+ nil,
+ # 02:00:09.987654
+ Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1_000_000 + 987_654),
+ ]
+ target = build(Arrow::Time64Array.new(:micro, values))
+ assert_equal(values, target.values)
+ end
+
+ def test_time64_nano
+ unit = Arrow::TimeUnit::NANO
+ values = [
+ # 00:10:00.123456789
+ Arrow::Time.new(unit, (60 * 10) * 1_000_000_000 + 123_456_789),
+ nil,
+ # 02:00:09.987654321
+ Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1_000_000_000 + 987_654_321),
+ ]
+ target = build(Arrow::Time64Array.new(:nano, values))
+ assert_equal(values, target.values)
+ end
+
+ def test_decimal128
+ values = [
+ BigDecimal("92.92"),
+ nil,
+ BigDecimal("29.29"),
+ ]
+ data_type = Arrow::Decimal128DataType.new(8, 2)
+ target = build(Arrow::Decimal128Array.new(data_type, values))
+ assert_equal(values, target.values)
+ end
+
+ def test_decimal256
+ values = [
+ BigDecimal("92.92"),
+ nil,
+ BigDecimal("29.29"),
+ ]
+ data_type = Arrow::Decimal256DataType.new(38, 2)
+ target = build(Arrow::Decimal256Array.new(data_type, values))
+ assert_equal(values, target.values)
+ end
+end
+
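+# The shared tests above run twice via the build hook: once against a
+# plain array and once against a single-chunk ChunkedArray, so both
+# code paths must return the same #values.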
+class ValuesArrayBasicArraysTest < Test::Unit::TestCase
+ include ValuesBasicArraysTests
+
+ def build(array)
+ array
+ end
+end
+
+class ValuesChunkedArrayBasicArraysTest < Test::Unit::TestCase
+ include ValuesBasicArraysTests
+
+ def build(array)
+ Arrow::ChunkedArray.new([array])
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/values/test-dense-union-array.rb b/src/arrow/ruby/red-arrow/test/values/test-dense-union-array.rb
new file mode 100644
index 000000000..465ffb9e6
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/values/test-dense-union-array.rb
@@ -0,0 +1,482 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ValuesDenseUnionArrayTests
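+ # The type argument is either a type symbol (such as :int8) or a Hash
+ # of data type options (type:, unit:, precision:, ...) that is merged
+ # into each field description.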
+ def build_data_type(type, type_codes)
+ field_description = {}
+ if type.is_a?(Hash)
+ field_description = field_description.merge(type)
+ else
+ field_description[:type] = type
+ end
+ Arrow::DenseUnionDataType.new(fields: [
+ field_description.merge(name: "0"),
+ field_description.merge(name: "1"),
+ ],
+ type_codes: type_codes)
+ end
+
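+ # Builds a dense union by hand: each child array holds only the
+ # values for its own field, while the parallel type_ids and offsets
+ # buffers record which child owns each slot and where to find it.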
+ def build_array(type, values)
+ type_codes = [0, 1]
+ data_type = build_data_type(type, type_codes)
+ type_ids = []
+ offsets = []
+ arrays = data_type.fields.collect do |field|
+ sub_schema = Arrow::Schema.new([field])
+ sub_records = []
+ values.each do |value|
+ next if value.nil?
+ next unless value.key?(field.name)
+ sub_records << [value[field.name]]
+ end
+ sub_record_batch = Arrow::RecordBatch.new(sub_schema,
+ sub_records)
+ sub_record_batch.columns[0].data
+ end
+ values.each do |value|
+ if value.key?("0")
+ type_id = type_codes[0]
+ type_ids << type_id
+ offsets << (type_ids.count(type_id) - 1)
+ elsif value.key?("1")
+ type_id = type_codes[1]
+ type_ids << type_id
+ offsets << (type_ids.count(type_id) - 1)
+ end
+ end
+ Arrow::DenseUnionArray.new(data_type,
+ Arrow::Int8Array.new(type_ids),
+ Arrow::Int32Array.new(offsets),
+ arrays)
+ end
+
+ def test_null
+ values = [
+ {"0" => nil},
+ ]
+ target = build(:null, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_boolean
+ values = [
+ {"0" => true},
+ {"1" => nil},
+ ]
+ target = build(:boolean, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_int8
+ values = [
+ {"0" => -(2 ** 7)},
+ {"1" => nil},
+ ]
+ target = build(:int8, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_uint8
+ values = [
+ {"0" => (2 ** 8) - 1},
+ {"1" => nil},
+ ]
+ target = build(:uint8, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_int16
+ values = [
+ {"0" => -(2 ** 15)},
+ {"1" => nil},
+ ]
+ target = build(:int16, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_uint16
+ values = [
+ {"0" => (2 ** 16) - 1},
+ {"1" => nil},
+ ]
+ target = build(:uint16, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_int32
+ values = [
+ {"0" => -(2 ** 31)},
+ {"1" => nil},
+ ]
+ target = build(:int32, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_uint32
+ values = [
+ {"0" => (2 ** 32) - 1},
+ {"1" => nil},
+ ]
+ target = build(:uint32, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_int64
+ values = [
+ {"0" => -(2 ** 63)},
+ {"1" => nil},
+ ]
+ target = build(:int64, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_uint64
+ values = [
+ {"0" => (2 ** 64) - 1},
+ {"1" => nil},
+ ]
+ target = build(:uint64, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_float
+ values = [
+ {"0" => -1.0},
+ {"1" => nil},
+ ]
+ target = build(:float, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_double
+ values = [
+ {"0" => -1.0},
+ {"1" => nil},
+ ]
+ target = build(:double, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_binary
+ values = [
+ {"0" => "\xff".b},
+ {"1" => nil},
+ ]
+ target = build(:binary, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_string
+ values = [
+ {"0" => "Ruby"},
+ {"1" => nil},
+ ]
+ target = build(:string, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_date32
+ values = [
+ {"0" => Date.new(1960, 1, 1)},
+ {"1" => nil},
+ ]
+ target = build(:date32, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_date64
+ values = [
+ {"0" => DateTime.new(1960, 1, 1, 2, 9, 30)},
+ {"1" => nil},
+ ]
+ target = build(:date64, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_second
+ values = [
+ {"0" => Time.parse("1960-01-01T02:09:30Z")},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :second,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_milli
+ values = [
+ {"0" => Time.parse("1960-01-01T02:09:30.123Z")},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :milli,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_micro
+ values = [
+ {"0" => Time.parse("1960-01-01T02:09:30.123456Z")},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :micro,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_nano
+ values = [
+ {"0" => Time.parse("1960-01-01T02:09:30.123456789Z")},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :nano,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_time32_second
+ unit = Arrow::TimeUnit::SECOND
+ values = [
+ # 00:10:00
+ {"0" => Arrow::Time.new(unit, 60 * 10)},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :time32,
+ unit: :second,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_time32_milli
+ unit = Arrow::TimeUnit::MILLI
+ values = [
+ # 00:10:00.123
+ {"0" => Arrow::Time.new(unit, (60 * 10) * 1000 + 123)},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :time32,
+ unit: :milli,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_time64_micro
+ unit = Arrow::TimeUnit::MICRO
+ values = [
+ # 00:10:00.123456
+ {"0" => Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456)},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :time64,
+ unit: :micro,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_time64_nano
+ unit = Arrow::TimeUnit::NANO
+ values = [
+ # 00:10:00.123456789
+ {"0" => Arrow::Time.new(unit, (60 * 10) * 1_000_000_000 + 123_456_789)},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :time64,
+ unit: :nano,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_decimal128
+ values = [
+ {"0" => BigDecimal("92.92")},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :decimal128,
+ precision: 8,
+ scale: 2,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_decimal256
+ values = [
+ {"0" => BigDecimal("92.92")},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :decimal256,
+ precision: 38,
+ scale: 2,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_list
+ values = [
+ {"0" => [true, nil, false]},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :list,
+ field: {
+ name: :sub_element,
+ type: :boolean,
+ },
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_struct
+ values = [
+ {"0" => {"sub_field" => true}},
+ {"1" => nil},
+ {"0" => {"sub_field" => nil}},
+ ]
+ target = build({
+ type: :struct,
+ fields: [
+ {
+ name: :sub_field,
+ type: :boolean,
+ },
+ ],
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_map
+ values = [
+ {"0" => {"key1" => true, "key2" => nil}},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :map,
+ key: :string,
+ item: :boolean,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_sparse_union
+ omit("Need to add support for SparseUnionArrayBuilder")
+ values = [
+ {"0" => {"field1" => true}},
+ {"1" => nil},
+ {"0" => {"field2" => nil}},
+ ]
+ target = build({
+ type: :sparse_union,
+ fields: [
+ {
+ name: :field1,
+ type: :boolean,
+ },
+ {
+ name: :field2,
+ type: :uint8,
+ },
+ ],
+ type_codes: [0, 1],
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_dense_union
+ omit("Need to add support for DenseUnionArrayBuilder")
+ values = [
+ {"0" => {"field1" => true}},
+ {"1" => nil},
+ {"0" => {"field2" => nil}},
+ ]
+ target = build({
+ type: :dense_union,
+ fields: [
+ {
+ name: :field1,
+ type: :boolean,
+ },
+ {
+ name: :field2,
+ type: :uint8,
+ },
+ ],
+ type_codes: [0, 1],
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_dictionary
+ omit("Need to add support for DictionaryArrayBuilder")
+ values = [
+ {"0" => "Ruby"},
+ {"1" => nil},
+ {"0" => "GLib"},
+ ]
+ dictionary = Arrow::StringArray.new(["GLib", "Ruby"])
+ target = build({
+ type: :dictionary,
+ index_data_type: :int8,
+ dictionary: dictionary,
+ ordered: true,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+end
+
+class ValuesArrayDenseUnionArrayTest < Test::Unit::TestCase
+ include ValuesDenseUnionArrayTests
+
+ def build(type, values)
+ build_array(type, values)
+ end
+end
+
+class ValuesChunkedArrayDenseUnionArrayTest < Test::Unit::TestCase
+ include ValuesDenseUnionArrayTests
+
+ def build(type, values)
+ Arrow::ChunkedArray.new([build_array(type, values)])
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/values/test-list-array.rb b/src/arrow/ruby/red-arrow/test/values/test-list-array.rb
new file mode 100644
index 000000000..d2905b36b
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/values/test-list-array.rb
@@ -0,0 +1,532 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ValuesListArrayTests
+ def build_data_type(type)
+ field_description = {
+ name: :element,
+ }
+ if type.is_a?(Hash)
+ field_description = field_description.merge(type)
+ else
+ field_description[:type] = type
+ end
+ Arrow::ListDataType.new(field: field_description)
+ end
+
+ def build_array(type, values)
+ Arrow::ListArray.new(build_data_type(type), values)
+ end
+
+ def test_null
+ values = [
+ [nil, nil, nil],
+ nil,
+ ]
+ target = build(:null, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_boolean
+ values = [
+ [true, nil, false],
+ nil,
+ ]
+ target = build(:boolean, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_int8
+ values = [
+ [-(2 ** 7), nil, (2 ** 7) - 1],
+ nil,
+ ]
+ target = build(:int8, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_uint8
+ values = [
+ [0, nil, (2 ** 8) - 1],
+ nil,
+ ]
+ target = build(:uint8, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_int16
+ values = [
+ [-(2 ** 15), nil, (2 ** 15) - 1],
+ nil,
+ ]
+ target = build(:int16, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_uint16
+ values = [
+ [0, nil, (2 ** 16) - 1],
+ nil,
+ ]
+ target = build(:uint16, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_int32
+ values = [
+ [-(2 ** 31), nil, (2 ** 31) - 1],
+ nil,
+ ]
+ target = build(:int32, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_uint32
+ values = [
+ [0, nil, (2 ** 32) - 1],
+ nil,
+ ]
+ target = build(:uint32, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_int64
+ values = [
+ [-(2 ** 63), nil, (2 ** 63) - 1],
+ nil,
+ ]
+ target = build(:int64, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_uint64
+ values = [
+ [0, nil, (2 ** 64) - 1],
+ nil,
+ ]
+ target = build(:uint64, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_float
+ values = [
+ [-1.0, nil, 1.0],
+ nil,
+ ]
+ target = build(:float, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_double
+ values = [
+ [-1.0, nil, 1.0],
+ nil,
+ ]
+ target = build(:double, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_binary
+ values = [
+ ["\x00".b, nil, "\xff".b],
+ nil,
+ ]
+ target = build(:binary, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_string
+ values = [
+ [
+ "Ruby",
+ nil,
+ "\u3042", # U+3042 HIRAGANA LETTER A
+ ],
+ nil,
+ ]
+ target = build(:string, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_date32
+ values = [
+ [
+ Date.new(1960, 1, 1),
+ nil,
+ Date.new(2017, 8, 23),
+ ],
+ nil,
+ ]
+ target = build(:date32, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_date64
+ values = [
+ [
+ DateTime.new(1960, 1, 1, 2, 9, 30),
+ nil,
+ DateTime.new(2017, 8, 23, 14, 57, 2),
+ ],
+ nil,
+ ]
+ target = build(:date64, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_second
+ values = [
+ [
+ Time.parse("1960-01-01T02:09:30Z"),
+ nil,
+ Time.parse("2017-08-23T14:57:02Z"),
+ ],
+ nil,
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :second,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_milli
+ values = [
+ [
+ Time.parse("1960-01-01T02:09:30.123Z"),
+ nil,
+ Time.parse("2017-08-23T14:57:02.987Z"),
+ ],
+ nil,
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :milli,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_micro
+ values = [
+ [
+ Time.parse("1960-01-01T02:09:30.123456Z"),
+ nil,
+ Time.parse("2017-08-23T14:57:02.987654Z"),
+ ],
+ nil,
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :micro,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_nano
+ values = [
+ [
+ Time.parse("1960-01-01T02:09:30.123456789Z"),
+ nil,
+ Time.parse("2017-08-23T14:57:02.987654321Z"),
+ ],
+ nil,
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :nano,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_time32_second
+ unit = Arrow::TimeUnit::SECOND
+ values = [
+ [
+ # 00:10:00
+ Arrow::Time.new(unit, 60 * 10),
+ nil,
+ # 02:00:09
+ Arrow::Time.new(unit, 60 * 60 * 2 + 9),
+ ],
+ nil,
+ ]
+ target = build({
+ type: :time32,
+ unit: :second,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_time32_milli
+ unit = Arrow::TimeUnit::MILLI
+ values = [
+ [
+ # 00:10:00.123
+ Arrow::Time.new(unit, (60 * 10) * 1000 + 123),
+ nil,
+ # 02:00:09.987
+ Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1000 + 987),
+ ],
+ nil,
+ ]
+ target = build({
+ type: :time32,
+ unit: :milli,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_time64_micro
+ unit = Arrow::TimeUnit::MICRO
+ values = [
+ [
+ # 00:10:00.123456
+ Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456),
+ nil,
+ # 02:00:09.987654
+ Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1_000_000 + 987_654),
+ ],
+ nil,
+ ]
+ target = build({
+ type: :time64,
+ unit: :micro,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_time64_nano
+ unit = Arrow::TimeUnit::NANO
+ values = [
+ [
+ # 00:10:00.123456789
+ Arrow::Time.new(unit, (60 * 10) * 1_000_000_000 + 123_456_789),
+ nil,
+ # 02:00:09.987654321
+ Arrow::Time.new(unit, (60 * 60 * 2 + 9) * 1_000_000_000 + 987_654_321),
+ ],
+ nil,
+ ]
+ target = build({
+ type: :time64,
+ unit: :nano,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_decimal128
+ values = [
+ [
+ BigDecimal("92.92"),
+ nil,
+ BigDecimal("29.29"),
+ ],
+ nil,
+ ]
+ target = build({
+ type: :decimal128,
+ precision: 8,
+ scale: 2,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_decimal256
+ values = [
+ [
+ BigDecimal("92.92"),
+ nil,
+ BigDecimal("29.29"),
+ ],
+ nil,
+ ]
+ target = build({
+ type: :decimal256,
+ precision: 38,
+ scale: 2,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_list
+ values = [
+ [
+ [
+ true,
+ nil,
+ ],
+ nil,
+ [
+ nil,
+ false,
+ ],
+ ],
+ nil,
+ ]
+ target = build({
+ type: :list,
+ field: {
+ name: :sub_element,
+ type: :boolean,
+ },
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_struct
+ values = [
+ [
+ {"field" => true},
+ nil,
+ {"field" => nil},
+ ],
+ nil,
+ ]
+ target = build({
+ type: :struct,
+ fields: [
+ {
+ name: :field,
+ type: :boolean,
+ },
+ ],
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_map
+ values = [
+ [
+ {"key1" => true, "key2" => nil},
+ nil,
+ ],
+ nil,
+ ]
+ target = build({
+ type: :map,
+ key: :string,
+ item: :boolean,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_sparse_union
+ omit("Need to add support for SparseUnionArrayBuilder")
+ values = [
+ [
+ {"field1" => true},
+ nil,
+ {"field2" => nil},
+ ],
+ nil,
+ ]
+ target = build({
+ type: :sparse_union,
+ fields: [
+ {
+ name: :field1,
+ type: :boolean,
+ },
+ {
+ name: :field2,
+ type: :uint8,
+ },
+ ],
+ type_codes: [0, 1],
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_dense_union
+ omit("Need to add support for DenseUnionArrayBuilder")
+ values = [
+ [
+ {"field1" => true},
+ nil,
+ {"field2" => nil},
+ ],
+ nil,
+ ]
+ target = build({
+ type: :dense_union,
+ fields: [
+ {
+ name: :field1,
+ type: :boolean,
+ },
+ {
+ name: :field2,
+ type: :uint8,
+ },
+ ],
+ type_codes: [0, 1],
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_dictionary
+ omit("Need to add support for DictionaryArrayBuilder")
+ values = [
+ [
+ "Ruby",
+ nil,
+ "GLib",
+ ],
+ nil,
+ ]
+ dictionary = Arrow::StringArray.new(["GLib", "Ruby"])
+ target = build({
+ type: :dictionary,
+ index_data_type: :int8,
+ dictionary: dictionary,
+ ordered: true,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+end
+
+class ValuesArrayListArrayTest < Test::Unit::TestCase
+ include ValuesListArrayTests
+
+ def build(type, values)
+ build_array(type, values)
+ end
+end
+
+class ValuesChunkedArrayListArrayTest < Test::Unit::TestCase
+ include ValuesListArrayTests
+
+ def build(type, values)
+ Arrow::ChunkedArray.new([build_array(type, values)])
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/values/test-map-array.rb b/src/arrow/ruby/red-arrow/test/values/test-map-array.rb
new file mode 100644
index 000000000..14b5bf6c3
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/values/test-map-array.rb
@@ -0,0 +1,433 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ValuesMapArrayTests
+ def build_data_type(item_type)
+ Arrow::MapDataType.new(
+ key: :string,
+ item: item_type
+ )
+ end
+
+ def build_array(item_type, values)
+ Arrow::MapArray.new(build_data_type(item_type), values)
+ end
+
+ def test_null
+ values = [
+ {"key1" => nil},
+ nil,
+ ]
+ target = build(:null, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_boolean
+ values = [
+ {"key1" => false, "key2" => nil},
+ nil,
+ ]
+ target = build(:boolean, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_int8
+ values = [
+ {"key1" => (2 ** 7) - 1, "key2" => nil},
+ nil,
+ ]
+ target = build(:int8, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_uint8
+ values = [
+ {"key1" => (2 ** 8) - 1, "key2" => nil},
+ nil,
+ ]
+ target = build(:uint8, values)
+ assert_equal(values, target.values)
+ end
+
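+ def test_int16
+ values = [
+ {"key1" => -(2 ** 15), "key2" => nil},
+ nil,
+ ]
+ target = build(:int16, values)
+ assert_equal(values, target.values)
+ end
+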
+ def test_uint16
+ values = [
+ {"key1" => (2 ** 16) - 1, "key2" => nil},
+ nil,
+ ]
+ target = build(:uint16, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_int32
+ values = [
+ {"key1" => -(2 ** 31), "key2" => nil},
+ nil,
+ ]
+ target = build(:int32, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_uint32
+ values = [
+ {"key1" => (2 ** 32) - 1, "key2" => nil},
+ nil,
+ ]
+ target = build(:uint32, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_int64
+ values = [
+ {"key1" => -(2 ** 63), "key2" => nil},
+ nil,
+ ]
+ target = build(:int64, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_uint64
+ values = [
+ {"key1" => (2 ** 64) - 1, "key2" => nil},
+ nil,
+ ]
+ target = build(:uint64, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_float
+ values = [
+ {"key1" => -1.0, "key2" => nil},
+ nil,
+ ]
+ target = build(:float, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_double
+ values = [
+ {"key1" => -1.0, "key2" => nil},
+ nil,
+ ]
+ target = build(:double, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_binary
+ values = [
+ {"key1" => "\xff".b, "key2" => nil},
+ nil,
+ ]
+ target = build(:binary, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_string
+ values = [
+ {"key1" => "Ruby", "key2" => nil},
+ nil,
+ ]
+ target = build(:string, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_date32
+ values = [
+ {"key1" => Date.new(1960, 1, 1), "key2" => nil},
+ nil,
+ ]
+ target = build(:date32, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_date64
+ values = [
+ {"key1" => DateTime.new(1960, 1, 1, 2, 9, 30), "key2" => nil},
+ nil,
+ ]
+ target = build(:date64, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_second
+ values = [
+ {"key1" => Time.parse("1960-01-01T02:09:30Z"), "key2" => nil},
+ nil,
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :second,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_milli
+ values = [
+ {"key1" => Time.parse("1960-01-01T02:09:30.123Z"), "key2" => nil},
+ nil,
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :milli,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_micro
+ values = [
+ {"key1" => Time.parse("1960-01-01T02:09:30.123456Z"), "key2" => nil},
+ nil,
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :micro,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_nano
+ values = [
+ {"key1" => Time.parse("1960-01-01T02:09:30.123456789Z"), "key2" => nil},
+ nil,
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :nano,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_time32_second
+ unit = Arrow::TimeUnit::SECOND
+ values = [
+ # 00:10:00
+ {"key1" => Arrow::Time.new(unit, 60 * 10), "key2" => nil},
+ nil,
+ ]
+ target = build({
+ type: :time32,
+ unit: :second,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_time32_milli
+ unit = Arrow::TimeUnit::MILLI
+ values = [
+ # 00:10:00.123
+ {"key1" => Arrow::Time.new(unit, (60 * 10) * 1000 + 123), "key2" => nil},
+ nil,
+ ]
+ target = build({
+ type: :time32,
+ unit: :milli,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_time64_micro
+ unit = Arrow::TimeUnit::MICRO
+ values = [
+ # 00:10:00.123456
+ {"key1" => Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456), "key2" => nil},
+ nil,
+ ]
+ target = build({
+ type: :time64,
+ unit: :micro,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_time64_nano
+ unit = Arrow::TimeUnit::NANO
+ values = [
+ # 00:10:00.123456789
+ {"key1" => Arrow::Time.new(unit, (60 * 10) * 1_000_000_000 + 123_456_789), "key2" => nil},
+ nil,
+ ]
+ target = build({
+ type: :time64,
+ unit: :nano,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_decimal128
+ values = [
+ {"key1" => BigDecimal("92.92"), "key2" => nil},
+ nil,
+ ]
+ target = build({
+ type: :decimal128,
+ precision: 8,
+ scale: 2,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_decimal256
+ values = [
+ {"key1" => BigDecimal("92.92"), "key2" => nil},
+ nil,
+ ]
+ target = build({
+ type: :decimal256,
+ precision: 38,
+ scale: 2,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_list
+ values = [
+ {"key1" => [true, nil, false], "key2" => nil},
+ nil,
+ ]
+ target = build({
+ type: :list,
+ field: {
+ name: :sub_element,
+ type: :boolean,
+ },
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_struct
+ values = [
+ {"key1" => {"field" => true}, "key2" => nil, "key3" => {"field" => nil}},
+ nil,
+ ]
+ target = build({
+ type: :struct,
+ fields: [
+ {
+ name: :field,
+ type: :boolean,
+ },
+ ],
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_map
+ values = [
+ {"key1" => {"sub_key1" => true, "sub_key2" => nil}, "key2" => nil},
+ nil,
+ ]
+ target = build({
+ type: :map,
+ key: :string,
+ item: :boolean,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_sparse_union
+ omit("Need to add support for SparseUnionArrayBuilder")
+ values = [
+ {"key1" => {"field1" => true}, "key2" => nil, "key3" => {"field2" => nil}},
+ nil,
+ ]
+ target = build({
+ type: :sparse_union,
+ fields: [
+ {
+ name: :field1,
+ type: :boolean,
+ },
+ {
+ name: :field2,
+ type: :uint8,
+ },
+ ],
+ type_codes: [0, 1],
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_dense_union
+ omit("Need to add support for DenseUnionArrayBuilder")
+ values = [
+ {"key1" => {"field1" => true}, "key2" => nil, "key3" => {"field2" => nil}},
+ nil,
+ ]
+ target = build({
+ type: :dense_union,
+ fields: [
+ {
+ name: :field1,
+ type: :boolean,
+ },
+ {
+ name: :field2,
+ type: :uint8,
+ },
+ ],
+ type_codes: [0, 1],
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_dictionary
+ omit("Need to add support for DictionaryArrayBuilder")
+ values = [
+ {"key1" => "Ruby", "key2" => nil, "key3" => "GLib"},
+ nil,
+ ]
+ dictionary = Arrow::StringArray.new(["GLib", "Ruby"])
+ target = build({
+ type: :dictionary,
+ index_data_type: :int8,
+ dictionary: dictionary,
+ ordered: true,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+end
+
+class ValuesArrayMapArrayTest < Test::Unit::TestCase
+ include ValuesMapArrayTests
+
+ def build(item_type, values)
+ build_array(item_type, values)
+ end
+end
+
+class ValuesChunkedArrayMapArrayTest < Test::Unit::TestCase
+ include ValuesMapArrayTests
+
+ def build(item_type, values)
+ Arrow::ChunkedArray.new([build_array(item_type, values)])
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/values/test-sparse-union-array.rb b/src/arrow/ruby/red-arrow/test/values/test-sparse-union-array.rb
new file mode 100644
index 000000000..909d67e61
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/values/test-sparse-union-array.rb
@@ -0,0 +1,473 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ValuesSparseUnionArrayTests
+ def build_data_type(type, type_codes)
+ field_description = {}
+ if type.is_a?(Hash)
+ field_description = field_description.merge(type)
+ else
+ field_description[:type] = type
+ end
+ Arrow::SparseUnionDataType.new(fields: [
+ field_description.merge(name: "0"),
+ field_description.merge(name: "1"),
+ ],
+ type_codes: type_codes)
+ end
+
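+ # Builds a sparse union by hand: every child array spans the full
+ # logical length (nil-padded where another field is selected), so
+ # only the type_ids buffer is needed to pick a child per slot.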
+ def build_array(type, values)
+ type_codes = [0, 1]
+ data_type = build_data_type(type, type_codes)
+ type_ids = []
+ arrays = data_type.fields.collect do |field|
+ sub_schema = Arrow::Schema.new([field])
+ sub_records = values.collect do |value|
+ [value.nil? ? nil : value[field.name]]
+ end
+ sub_record_batch = Arrow::RecordBatch.new(sub_schema,
+ sub_records)
+ sub_record_batch.columns[0].data
+ end
+ values.each do |value|
+ if value.key?("0")
+ type_ids << type_codes[0]
+ elsif value.key?("1")
+ type_ids << type_codes[1]
+ end
+ end
+ Arrow::SparseUnionArray.new(data_type,
+ Arrow::Int8Array.new(type_ids),
+ arrays)
+ end
+
+ def test_null
+ values = [
+ {"0" => nil},
+ ]
+ target = build(:null, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_boolean
+ values = [
+ {"0" => true},
+ {"1" => nil},
+ ]
+ target = build(:boolean, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_int8
+ values = [
+ {"0" => -(2 ** 7)},
+ {"1" => nil},
+ ]
+ target = build(:int8, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_uint8
+ values = [
+ {"0" => (2 ** 8) - 1},
+ {"1" => nil},
+ ]
+ target = build(:uint8, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_int16
+ values = [
+ {"0" => -(2 ** 15)},
+ {"1" => nil},
+ ]
+ target = build(:int16, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_uint16
+ values = [
+ {"0" => (2 ** 16) - 1},
+ {"1" => nil},
+ ]
+ target = build(:uint16, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_int32
+ values = [
+ {"0" => -(2 ** 31)},
+ {"1" => nil},
+ ]
+ target = build(:int32, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_uint32
+ values = [
+ {"0" => (2 ** 32) - 1},
+ {"1" => nil},
+ ]
+ target = build(:uint32, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_int64
+ values = [
+ {"0" => -(2 ** 63)},
+ {"1" => nil},
+ ]
+ target = build(:int64, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_uint64
+ values = [
+ {"0" => (2 ** 64) - 1},
+ {"1" => nil},
+ ]
+ target = build(:uint64, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_float
+ values = [
+ {"0" => -1.0},
+ {"1" => nil},
+ ]
+ target = build(:float, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_double
+ values = [
+ {"0" => -1.0},
+ {"1" => nil},
+ ]
+ target = build(:double, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_binary
+ values = [
+ {"0" => "\xff".b},
+ {"1" => nil},
+ ]
+ target = build(:binary, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_string
+ values = [
+ {"0" => "Ruby"},
+ {"1" => nil},
+ ]
+ target = build(:string, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_date32
+ values = [
+ {"0" => Date.new(1960, 1, 1)},
+ {"1" => nil},
+ ]
+ target = build(:date32, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_date64
+ values = [
+ {"0" => DateTime.new(1960, 1, 1, 2, 9, 30)},
+ {"1" => nil},
+ ]
+ target = build(:date64, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_second
+ values = [
+ {"0" => Time.parse("1960-01-01T02:09:30Z")},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :second,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_milli
+ values = [
+ {"0" => Time.parse("1960-01-01T02:09:30.123Z")},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :milli,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_micro
+ values = [
+ {"0" => Time.parse("1960-01-01T02:09:30.123456Z")},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :micro,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_nano
+ values = [
+ {"0" => Time.parse("1960-01-01T02:09:30.123456789Z")},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :nano,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_time32_second
+ unit = Arrow::TimeUnit::SECOND
+ values = [
+ # 00:10:00
+ {"0" => Arrow::Time.new(unit, 60 * 10)},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :time32,
+ unit: :second,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_time32_milli
+ unit = Arrow::TimeUnit::MILLI
+ values = [
+ # 00:10:00.123
+ {"0" => Arrow::Time.new(unit, (60 * 10) * 1000 + 123)},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :time32,
+ unit: :milli,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_time64_micro
+ unit = Arrow::TimeUnit::MICRO
+ values = [
+ # 00:10:00.123456
+ {"0" => Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456)},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :time64,
+ unit: :micro,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_time64_nano
+ unit = Arrow::TimeUnit::NANO
+ values = [
+ # 00:10:00.123456789
+ {"0" => Arrow::Time.new(unit, (60 * 10) * 1_000_000_000 + 123_456_789)},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :time64,
+ unit: :nano,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_decimal128
+ values = [
+ {"0" => BigDecimal("92.92")},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :decimal128,
+ precision: 8,
+ scale: 2,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_decimal256
+ values = [
+ {"0" => BigDecimal("92.92")},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :decimal256,
+ precision: 38,
+ scale: 2,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_list
+ values = [
+ {"0" => [true, nil, false]},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :list,
+ field: {
+ name: :sub_element,
+ type: :boolean,
+ },
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_struct
+ values = [
+ {"0" => {"sub_field" => true}},
+ {"1" => nil},
+ {"0" => {"sub_field" => nil}},
+ ]
+ target = build({
+ type: :struct,
+ fields: [
+ {
+ name: :sub_field,
+ type: :boolean,
+ },
+ ],
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_map
+ values = [
+ {"0" => {"key1" => true, "key2" => nil}},
+ {"1" => nil},
+ ]
+ target = build({
+ type: :map,
+ key: :string,
+ item: :boolean,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_sparse_union
+ omit("Need to add support for SparseUnionArrayBuilder")
+ values = [
+ {"0" => {"field1" => true}},
+ {"1" => nil},
+ {"0" => {"field2" => nil}},
+ ]
+ target = build({
+ type: :sparse_union,
+ fields: [
+ {
+ name: :field1,
+ type: :boolean,
+ },
+ {
+ name: :field2,
+ type: :uint8,
+ },
+ ],
+ type_codes: [0, 1],
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_dense_union
+ omit("Need to add support for DenseUnionArrayBuilder")
+ values = [
+ {"0" => {"field1" => true}},
+ {"1" => nil},
+ {"0" => {"field2" => nil}},
+ ]
+ target = build({
+ type: :dense_union,
+ fields: [
+ {
+ name: :field1,
+ type: :boolean,
+ },
+ {
+ name: :field2,
+ type: :uint8,
+ },
+ ],
+ type_codes: [0, 1],
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_dictionary
+ omit("Need to add support for DictionaryArrayBuilder")
+ values = [
+ {"0" => "Ruby"},
+ {"1" => nil},
+ {"0" => "GLib"},
+ ]
+ dictionary = Arrow::StringArray.new(["GLib", "Ruby"])
+ target = build({
+ type: :dictionary,
+ index_data_type: :int8,
+ dictionary: dictionary,
+ ordered: true,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+end
+
+class ValuesArraySparseUnionArrayTest < Test::Unit::TestCase
+ include ValuesSparseUnionArrayTests
+
+ def build(type, values)
+ build_array(type, values)
+ end
+end
+
+class ValuesChunkedArraySparseUnionArrayTest < Test::Unit::TestCase
+ include ValuesSparseUnionArrayTests
+
+ def build(type, values)
+ Arrow::ChunkedArray.new([build_array(type, values)])
+ end
+end
diff --git a/src/arrow/ruby/red-arrow/test/values/test-struct-array.rb b/src/arrow/ruby/red-arrow/test/values/test-struct-array.rb
new file mode 100644
index 000000000..4e3396796
--- /dev/null
+++ b/src/arrow/ruby/red-arrow/test/values/test-struct-array.rb
@@ -0,0 +1,482 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ValuesStructArrayTests
+ def build_data_type(type)
+ field_description = {
+ name: :field,
+ }
+ if type.is_a?(Hash)
+ field_description = field_description.merge(type)
+ else
+ field_description[:type] = type
+ end
+ Arrow::StructDataType.new([field_description])
+ end
+
+ def build_array(type, values)
+ Arrow::StructArray.new(build_data_type(type), values)
+ end
+
+ def test_null
+ values = [
+ {"field" => nil},
+ nil,
+ ]
+ target = build(:null, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_boolean
+ values = [
+ {"field" => true},
+ nil,
+ {"field" => nil},
+ ]
+ target = build(:boolean, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_int8
+ values = [
+ {"field" => -(2 ** 7)},
+ nil,
+ {"field" => nil},
+ ]
+ target = build(:int8, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_uint8
+ values = [
+ {"field" => (2 ** 8) - 1},
+ nil,
+ {"field" => nil},
+ ]
+ target = build(:uint8, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_int16
+ values = [
+ {"field" => -(2 ** 15)},
+ nil,
+ {"field" => nil},
+ ]
+ target = build(:int16, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_uint16
+ values = [
+ {"field" => (2 ** 16) - 1},
+ nil,
+ {"field" => nil},
+ ]
+ target = build(:uint16, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_int32
+ values = [
+ {"field" => -(2 ** 31)},
+ nil,
+ {"field" => nil},
+ ]
+ target = build(:int32, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_uint32
+ values = [
+ {"field" => (2 ** 32) - 1},
+ nil,
+ {"field" => nil},
+ ]
+ target = build(:uint32, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_int64
+ values = [
+ {"field" => -(2 ** 63)},
+ nil,
+ {"field" => nil},
+ ]
+ target = build(:int64, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_uint64
+ values = [
+ {"field" => (2 ** 64) - 1},
+ nil,
+ {"field" => nil},
+ ]
+ target = build(:uint64, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_float
+ values = [
+ {"field" => -1.0},
+ nil,
+ {"field" => nil},
+ ]
+ target = build(:float, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_double
+ values = [
+ {"field" => -1.0},
+ nil,
+ {"field" => nil},
+ ]
+ target = build(:double, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_binary
+ values = [
+ {"field" => "\xff".b},
+ nil,
+ {"field" => nil},
+ ]
+ target = build(:binary, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_string
+ values = [
+ {"field" => "Ruby"},
+ nil,
+ {"field" => nil},
+ ]
+ target = build(:string, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_date32
+ values = [
+ {"field" => Date.new(1960, 1, 1)},
+ nil,
+ {"field" => nil},
+ ]
+ target = build(:date32, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_date64
+ values = [
+ {"field" => DateTime.new(1960, 1, 1, 2, 9, 30)},
+ nil,
+ {"field" => nil},
+ ]
+ target = build(:date64, values)
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_second
+ values = [
+ {"field" => Time.parse("1960-01-01T02:09:30Z")},
+ nil,
+ {"field" => nil},
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :second,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_milli
+ values = [
+ {"field" => Time.parse("1960-01-01T02:09:30.123Z")},
+ nil,
+ {"field" => nil},
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :milli,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_micro
+ values = [
+ {"field" => Time.parse("1960-01-01T02:09:30.123456Z")},
+ nil,
+ {"field" => nil},
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :micro,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_timestamp_nano
+ values = [
+ {"field" => Time.parse("1960-01-01T02:09:30.123456789Z")},
+ nil,
+ {"field" => nil},
+ ]
+ target = build({
+ type: :timestamp,
+ unit: :nano,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_time32_second
+ unit = Arrow::TimeUnit::SECOND
+ values = [
+ # 00:10:00
+ {"field" => Arrow::Time.new(unit, 60 * 10)},
+ nil,
+ {"field" => nil},
+ ]
+ target = build({
+ type: :time32,
+ unit: :second,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_time32_milli
+ unit = Arrow::TimeUnit::MILLI
+ values = [
+ # 00:10:00.123
+ {"field" => Arrow::Time.new(unit, (60 * 10) * 1000 + 123)},
+ nil,
+ {"field" => nil},
+ ]
+ target = build({
+ type: :time32,
+ unit: :milli,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_time64_micro
+ unit = Arrow::TimeUnit::MICRO
+ values = [
+ # 00:10:00.123456
+ {"field" => Arrow::Time.new(unit, (60 * 10) * 1_000_000 + 123_456)},
+ nil,
+ {"field" => nil},
+ ]
+ target = build({
+ type: :time64,
+ unit: :micro,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_time64_nano
+ unit = Arrow::TimeUnit::NANO
+ values = [
+ # 00:10:00.123456789
+ {"field" => Arrow::Time.new(unit, (60 * 10) * 1_000_000_000 + 123_456_789)},
+ nil,
+ {"field" => nil},
+ ]
+ target = build({
+ type: :time64,
+ unit: :nano,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_decimal128
+ values = [
+ {"field" => BigDecimal("92.92")},
+ nil,
+ {"field" => nil},
+ ]
+ target = build({
+ type: :decimal128,
+ precision: 8,
+ scale: 2,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_decimal256
+ values = [
+ {"field" => BigDecimal("92.92")},
+ nil,
+ {"field" => nil},
+ ]
+ target = build({
+ type: :decimal256,
+ precision: 38,
+ scale: 2,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_list
+ values = [
+ {"field" => [true, nil, false]},
+ nil,
+ {"field" => nil},
+ ]
+ target = build({
+ type: :list,
+ field: {
+ name: :sub_element,
+ type: :boolean,
+ },
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_struct
+ values = [
+ {"field" => {"sub_field" => true}},
+ nil,
+ {"field" => nil},
+ {"field" => {"sub_field" => nil}},
+ ]
+ target = build({
+ type: :struct,
+ fields: [
+ {
+ name: :sub_field,
+ type: :boolean,
+ },
+ ],
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_map
+ values = [
+ {"field" => {"key1" => true, "key2" => nil}},
+ nil,
+ {"field" => nil},
+ ]
+ target = build({
+ type: :map,
+ key: :string,
+ item: :boolean,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_sparse_union
+ omit("Need to add support for SparseUnionArrayBuilder")
+ values = [
+ {"field" => {"field1" => true}},
+ nil,
+ {"field" => nil},
+ {"field" => {"field2" => nil}},
+ ]
+ target = build({
+ type: :sparse_union,
+ fields: [
+ {
+ name: :field1,
+ type: :boolean,
+ },
+ {
+ name: :field2,
+ type: :uint8,
+ },
+ ],
+ type_codes: [0, 1],
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_dense_union
+ omit("Need to add support for DenseUnionArrayBuilder")
+ values = [
+ {"field" => {"field1" => true}},
+ nil,
+ {"field" => nil},
+ {"field" => {"field2" => nil}},
+ ]
+ target = build({
+ type: :dense_union,
+ fields: [
+ {
+ name: :field1,
+ type: :boolean,
+ },
+ {
+ name: :field2,
+ type: :uint8,
+ },
+ ],
+ type_codes: [0, 1],
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+
+ def test_dictionary
+ omit("Need to add support for DictionaryArrayBuilder")
+ values = [
+ {"field" => "Ruby"},
+ nil,
+ {"field" => nil},
+ {"field" => "GLib"},
+ ]
+ dictionary = Arrow::StringArray.new(["GLib", "Ruby"])
+ target = build({
+ type: :dictionary,
+ index_data_type: :int8,
+ dictionary: dictionary,
+ ordered: true,
+ },
+ values)
+ assert_equal(values, target.values)
+ end
+end
+
+class ValuesArrayStructArrayTest < Test::Unit::TestCase
+ include ValuesStructArrayTests
+
+ def build(type, values)
+ build_array(type, values)
+ end
+end
+
+class ValuesChunkedArrayStructArrayTest < Test::Unit::TestCase
+ include ValuesStructArrayTests
+
+ def build(type, values)
+ Arrow::ChunkedArray.new([build_array(type, values)])
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/.gitignore b/src/arrow/ruby/red-gandiva/.gitignore
new file mode 100644
index 000000000..afd93a168
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/.gitignore
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+/Gemfile.lock
+/pkg/
diff --git a/src/arrow/ruby/red-gandiva/Gemfile b/src/arrow/ruby/red-gandiva/Gemfile
new file mode 100644
index 000000000..7c4cefcf3
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/Gemfile
@@ -0,0 +1,24 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+source "https://rubygems.org/"
+
+gemspec
+
+gem "red-arrow", path: "../red-arrow"
diff --git a/src/arrow/ruby/red-gandiva/LICENSE.txt b/src/arrow/ruby/red-gandiva/LICENSE.txt
new file mode 100644
index 000000000..d64569567
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/LICENSE.txt
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/src/arrow/ruby/red-gandiva/NOTICE.txt b/src/arrow/ruby/red-gandiva/NOTICE.txt
new file mode 100644
index 000000000..e08aeda8a
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/NOTICE.txt
@@ -0,0 +1,2 @@
+Apache Arrow
+Copyright 2016 The Apache Software Foundation
diff --git a/src/arrow/ruby/red-gandiva/README.md b/src/arrow/ruby/red-gandiva/README.md
new file mode 100644
index 000000000..ac6f88c58
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/README.md
@@ -0,0 +1,68 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Red Gandiva - Gandiva Ruby
+
+Red Gandiva provides Ruby bindings for Gandiva. It is based on GObject Introspection.
+
+Gandiva is a toolset for compiling and evaluating expressions on Arrow data.
+
+[GObject Introspection](https://wiki.gnome.org/action/show/Projects/GObjectIntrospection) is middleware for building language bindings of C libraries. It can generate language bindings automatically at runtime.
+
+Red Gandiva uses [Gandiva GLib](https://github.com/apache/arrow/tree/master/c_glib/gandiva-glib) and the [gobject-introspection gem](https://rubygems.org/gems/gobject-introspection) to generate the Ruby bindings of Gandiva.
+
+Gandiva GLib is a C wrapper for [Gandiva C++](https://github.com/apache/arrow/tree/master/cpp/gandiva). GObject Introspection can't use Gandiva C++ directly, so Gandiva GLib acts as a bridge between Gandiva C++ and GObject Introspection.
+
+The gobject-introspection gem provides Ruby bindings for GObject Introspection. Red Gandiva uses GObject Introspection through this gem.
+
+## Install
+
+Install Gandiva GLib before installing Red Gandiva. See the [Apache Arrow install document](https://arrow.apache.org/install/) for details.
+
+Then install Red Gandiva:
+
+```text
+% gem install red-gandiva
+```
+
+## Usage
+
+```ruby
+require "gandiva"
+
+table = Arrow::Table.new(:field1 => Arrow::Int32Array.new([1, 2, 3, 4]),
+ :field2 => Arrow::Int32Array.new([11, 13, 15, 17]))
+schema = table.schema
+
+expression1 = schema.build_expression do |record|
+ record.field1 + record.field2
+end
+
+expression2 = schema.build_expression do |record, context|
+ context.if(record.field1 > record.field2)
+ .then(record.field1 / record.field2)
+ .else(record.field1)
+end
+
+projector = Gandiva::Projector.new(schema, [expression1, expression2])
+table.each_record_batch do |record_batch|
+ outputs = projector.evaluate(record_batch)
+  puts outputs.collect(&:values)
+end
+```
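+
+Each element of `outputs` corresponds to one expression passed to the projector and holds the evaluated values for the record batch.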
diff --git a/src/arrow/ruby/red-gandiva/Rakefile b/src/arrow/ruby/red-gandiva/Rakefile
new file mode 100644
index 000000000..579b946d4
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/Rakefile
@@ -0,0 +1,41 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "rubygems"
+require "bundler/gem_helper"
+
+base_dir = __dir__
+
+helper = Bundler::GemHelper.new(base_dir)
+helper.install
+
+release_task = Rake::Task["release"]
+release_task.prerequisites.replace(["build", "release:rubygem_push"])
+
+desc "Run tests"
+task :test do
+ cd(base_dir) do
+ cd("dependency-check") do
+ ruby("-S", "rake")
+ end
+ ruby("test/run-test.rb")
+ end
+end
+
+task default: :test
diff --git a/src/arrow/ruby/red-gandiva/dependency-check/Rakefile b/src/arrow/ruby/red-gandiva/dependency-check/Rakefile
new file mode 100644
index 000000000..6ec86fab0
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/dependency-check/Rakefile
@@ -0,0 +1,47 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "pkg-config"
+require "native-package-installer"
+require_relative "../lib/gandiva/version"
+
+case RUBY_PLATFORM
+when /mingw|mswin/
+ task :default => "nothing"
+else
+ task :default => "dependency:check"
+end
+
+task :nothing do
+end
+
+namespace :dependency do
+ desc "Check dependency"
+ task :check do
+ unless PKGConfig.check_version?("gandiva-glib",
+ Gandiva::Version::MAJOR,
+ Gandiva::Version::MINOR,
+ Gandiva::Version::MICRO)
+ unless NativePackageInstaller.install(:debian => "libgandiva-glib-dev",
+ :redhat => "gandiva-glib-devel")
+ exit(false)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/lib/gandiva.rb b/src/arrow/ruby/red-gandiva/lib/gandiva.rb
new file mode 100644
index 000000000..6a47a3210
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/lib/gandiva.rb
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "arrow"
+
+require "gandiva/version"
+
+require "gandiva/loader"
+
+module Gandiva
+ class Error < StandardError
+ end
+
+ Loader.load
+end
diff --git a/src/arrow/ruby/red-gandiva/lib/gandiva/arrow-schema.rb b/src/arrow/ruby/red-gandiva/lib/gandiva/arrow-schema.rb
new file mode 100644
index 000000000..1656b4736
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/lib/gandiva/arrow-schema.rb
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+ class Schema
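+    # Convenience entry point so expressions can be built directly from
+    # a schema: schema.build_expression { |record, context| ... }
+    # (see Gandiva::ExpressionBuilder).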
+ def build_expression(&block)
+ builder = Gandiva::ExpressionBuilder.new(self)
+ builder.build(&block)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder.rb b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder.rb
new file mode 100644
index 000000000..405a1f68e
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder.rb
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Gandiva
+ class ExpressionBuilder
+ def initialize(schema)
+ @schema = schema
+ end
+
+ def build
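+      # The block receives a Record (field references) and a Context
+      # (conditional helpers) and returns a builder; the built node
+      # becomes the "result" field of the new expression.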
+ builder = yield(Record.new(@schema), Context.new)
+ node = builder.build
+ Expression.new(node,
+ Arrow::Field.new("result", node.return_type))
+ end
+ end
+end
+
+require "gandiva/expression-builder/add"
+require "gandiva/expression-builder/context"
+require "gandiva/expression-builder/divide"
+require "gandiva/expression-builder/elsif"
+require "gandiva/expression-builder/equal"
+require "gandiva/expression-builder/field"
+require "gandiva/expression-builder/greater-than"
+require "gandiva/expression-builder/if"
+require "gandiva/expression-builder/literal"
+require "gandiva/expression-builder/less-than"
+require "gandiva/expression-builder/multiply"
+require "gandiva/expression-builder/record"
+require "gandiva/expression-builder/subtract"
diff --git a/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/add.rb b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/add.rb
new file mode 100644
index 000000000..210d47d52
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/add.rb
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "gandiva/expression-builder/binary-operation"
+
+module Gandiva
+ class ExpressionBuilder
+ class Add < BinaryOperation
+ def initialize(left, right)
+ super("add", left, right)
+ end
+
+ private
+ def return_type(left_node, right_node)
+ # TODO: More clever implementation. e.g. (int64, float) -> float
+ left_return_type = left_node.return_type
+ right_return_type = right_node.return_type
+ if left_return_type.bit_width > right_return_type.bit_width
+ left_return_type
+ else
+ right_return_type
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/binary-operation.rb b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/binary-operation.rb
new file mode 100644
index 000000000..922bdc4f3
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/binary-operation.rb
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "gandiva/expression-builder/value"
+
+module Gandiva
+ class ExpressionBuilder
+ class BinaryOperation < Value
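+      # Base class for infix operations: builds both operand nodes and
+      # wraps them in a FunctionNode; subclasses supply return_type.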
+ def initialize(operator, left, right)
+ @operator = operator
+ @left = left
+ @right = right
+ end
+
+ def build
+ left_node = @left.build
+ right_node = @right.build
+ FunctionNode.new(@operator,
+ [left_node, right_node],
+ return_type(left_node, right_node))
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/context.rb b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/context.rb
new file mode 100644
index 000000000..25ceee5d0
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/context.rb
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Gandiva
+ class ExpressionBuilder
+ class Context
+ def if(condition)
+ If.new(condition)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/divide.rb b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/divide.rb
new file mode 100644
index 000000000..9888dc2a0
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/divide.rb
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "gandiva/expression-builder/binary-operation"
+
+module Gandiva
+ class ExpressionBuilder
+ class Divide < BinaryOperation
+ def initialize(left, right)
+ super("divide", left, right)
+ end
+
+ private
+ def return_type(left_node, right_node)
+ # TODO: Use float if left or right is float
+ left_node.return_type
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/elsif.rb b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/elsif.rb
new file mode 100644
index 000000000..f5fc086d9
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/elsif.rb
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "gandiva/expression-builder/if"
+
+module Gandiva
+ class ExpressionBuilder
+ class Elsif < If
+ def initialize(parent, condition)
+ @parent = parent
+ super(condition)
+ end
+
+ def build
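+        # `super` builds this branch's IfNode, which then becomes the
+        # else branch of the parent's IfNode, folding elsif chains into
+        # nested if/else nodes.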
+ elsif_node = super
+ build_if_node(@parent.condition_node,
+ @parent.then_node,
+ elsif_node)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/equal.rb b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/equal.rb
new file mode 100644
index 000000000..3e3ec2580
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/equal.rb
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "gandiva/expression-builder/binary-operation"
+
+module Gandiva
+ class ExpressionBuilder
+ class Equal < BinaryOperation
+ def initialize(left, right)
+ super("equal", left, right)
+ end
+
+ private
+ def return_type(left_node, right_node)
+ Arrow::BooleanDataType.new
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/field.rb b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/field.rb
new file mode 100644
index 000000000..916333e23
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/field.rb
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "gandiva/expression-builder/value"
+
+module Gandiva
+ class ExpressionBuilder
+ class Field < Value
+ def initialize(field)
+ @field = field
+ end
+
+ def build
+ FieldNode.new(@field)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/greater-than.rb b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/greater-than.rb
new file mode 100644
index 000000000..65d146f9e
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/greater-than.rb
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "gandiva/expression-builder/binary-operation"
+
+module Gandiva
+ class ExpressionBuilder
+ class GreaterThan < BinaryOperation
+ def initialize(left, right)
+ super("greater_than", left, right)
+ end
+
+ private
+ def return_type(left_node, right_node)
+ Arrow::BooleanDataType.new
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/if.rb b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/if.rb
new file mode 100644
index 000000000..c0a00c3a8
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/if.rb
@@ -0,0 +1,75 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Gandiva
+ class ExpressionBuilder
+ class If
+ def initialize(condition)
+ @condition = condition
+ @then = nil
+ @else = nil
+ end
+
+ def then(clause)
+ @then = clause
+ self
+ end
+
+ def else(clause)
+ @else = clause
+ self
+ end
+
+ def elsif(condition)
+ Elsif.new(self, condition)
+ end
+
+ def build
+ build_if_node(condition_node,
+ then_node,
+ else_node)
+ end
+
+ protected
+ def condition_node
+ @condition.build
+ end
+
+ def then_node
+ @then&.build
+ end
+
+ def else_node
+ @else&.build
+ end
+
+ private
+ def build_if_node(condition_node, then_node, else_node)
+ if then_node and else_node
+ # TODO: Validate then_node.return_type == else_node.return_type
+ return_type = then_node.return_type
+ else
+ return_type = (then_node || else_node).return_type
+ end
+ IfNode.new(condition_node,
+ then_node,
+ else_node,
+ return_type)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/less-than.rb b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/less-than.rb
new file mode 100644
index 000000000..93d19abd1
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/less-than.rb
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "gandiva/expression-builder/binary-operation"
+
+module Gandiva
+ class ExpressionBuilder
+ class LessThan < BinaryOperation
+ def initialize(left, right)
+ super("less_than", left, right)
+ end
+
+ private
+ def return_type(left_node, right_node)
+ Arrow::BooleanDataType.new
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/literal.rb b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/literal.rb
new file mode 100644
index 000000000..da2de8273
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/literal.rb
@@ -0,0 +1,65 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Gandiva
+ class ExpressionBuilder
+ class Literal
+ class << self
+ def resolve(value)
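+          # Wrap a Ruby value in the narrowest literal node that can
+          # represent it; returns nil for unsupported types so callers
+          # can fall back to the value itself.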
+ case value
+ when true, false
+ new(BooleanLiteralNode, value)
+ when Integer
+ if value < -(2 ** 31)
+ new(Int64LiteralNode, value)
+ elsif value < -(2 ** 15)
+ new(Int32LiteralNode, value)
+ elsif value < -(2 ** 7)
+ new(Int16LiteralNode, value)
+ elsif value < 0
+ new(Int8LiteralNode, value)
+            elsif value < (2 ** 8)
+              new(UInt8LiteralNode, value)
+            elsif value < (2 ** 16)
+              new(UInt16LiteralNode, value)
+            elsif value < (2 ** 32)
+              new(UInt32LiteralNode, value)
+ else
+ new(UInt64LiteralNode, value)
+ end
+ when Float
+ new(DoubleLiteralNode, value)
+ when String
+ new(StringLiteralNode, value)
+ else
+ nil
+ end
+ end
+ end
+
+ attr_reader :value
+ def initialize(node_class, value)
+ @node_class = node_class
+ @value = value
+ end
+
+ def build
+ @node_class.new(value)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/multiply.rb b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/multiply.rb
new file mode 100644
index 000000000..55c57a55d
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/multiply.rb
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "gandiva/expression-builder/binary-operation"
+
+module Gandiva
+ class ExpressionBuilder
+ class Multiply < BinaryOperation
+ def initialize(left, right)
+ super("multiply", left, right)
+ end
+
+ private
+ def return_type(left_node, right_node)
+ # TODO: Use larger type
+ right_node.return_type
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/record.rb b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/record.rb
new file mode 100644
index 000000000..a8cd124fc
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/record.rb
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Gandiva
+ class ExpressionBuilder
+ class Record
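+      # Exposes schema fields as methods so blocks can write
+      # `record.field1 + record.field2`; unknown names fall through
+      # to the default method_missing behavior.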
+ def initialize(schema)
+ @schema = schema
+ end
+
+ def respond_to_missing?(name, include_private)
+ return true if @schema[name]
+ super
+ end
+
+ def method_missing(name, *args)
+ return super unless args.empty?
+ self[name] || super
+ end
+
+ def [](name)
+ field = @schema[name]
+ if field
+ Field.new(field)
+ else
+ nil
+ end
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/subtract.rb b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/subtract.rb
new file mode 100644
index 000000000..cc3810b72
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/subtract.rb
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "gandiva/expression-builder/binary-operation"
+
+module Gandiva
+ class ExpressionBuilder
+ class Subtract < BinaryOperation
+ def initialize(left, right)
+ super("subtract", left, right)
+ end
+
+ private
+ def return_type(left_node, right_node)
+ # TODO: Use larger type
+ right_node.return_type
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/value.rb b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/value.rb
new file mode 100644
index 000000000..366e08871
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/lib/gandiva/expression-builder/value.rb
@@ -0,0 +1,55 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Gandiva
+ class ExpressionBuilder
+ class Value
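+      # Operator overloads build Gandiva expression nodes; non-builder
+      # operands (Integer, Float, String, booleans) are wrapped as
+      # literal nodes first.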
+ def +(right)
+ Add.new(self, resolve(right))
+ end
+
+ def -(right)
+ Subtract.new(self, resolve(right))
+ end
+
+ def *(right)
+ Multiply.new(self, resolve(right))
+ end
+
+ def /(right)
+ Divide.new(self, resolve(right))
+ end
+
+ def >(right)
+ GreaterThan.new(self, resolve(right))
+ end
+
+ def <(right)
+ LessThan.new(self, resolve(right))
+ end
+
+ def ==(right)
+ Equal.new(self, resolve(right))
+ end
+
+ private
+ def resolve(value)
+        Literal.resolve(value) || value
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/lib/gandiva/loader.rb b/src/arrow/ruby/red-gandiva/lib/gandiva/loader.rb
new file mode 100644
index 000000000..2d8c8a713
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/lib/gandiva/loader.rb
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Gandiva
+ class Loader < GObjectIntrospection::Loader
+ class << self
+ def load
+ super("Gandiva", Gandiva)
+ end
+ end
+
+ private
+ def load_method_info(info, klass, method_name)
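+      # GObject Introspection exposes the boolean getter as `value?`;
+      # rename it to `value` for consistency with other literal nodes.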
+ case klass.name
+ when "Gandiva::BooleanLiteralNode"
+ case method_name
+ when "value?"
+ method_name = "value"
+ end
+ super(info, klass, method_name)
+ else
+ super
+ end
+ end
+
+ def post_load(repository, namespace)
+ require_libraries
+ end
+
+ def require_libraries
+ require "gandiva/arrow-schema"
+ require "gandiva/expression-builder"
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/lib/gandiva/version.rb b/src/arrow/ruby/red-gandiva/lib/gandiva/version.rb
new file mode 100644
index 000000000..c78f165ec
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/lib/gandiva/version.rb
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Gandiva
+ VERSION = "6.0.1"
+
+ module Version
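+    # Split "MAJOR.MINOR.MICRO(-TAG)" into numeric components and an
+    # optional pre-release TAG (nil for release versions).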
+ numbers, TAG = VERSION.split("-")
+ MAJOR, MINOR, MICRO = numbers.split(".").collect(&:to_i)
+ STRING = VERSION
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/red-gandiva.gemspec b/src/arrow/ruby/red-gandiva/red-gandiva.gemspec
new file mode 100644
index 000000000..ec4db8913
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/red-gandiva.gemspec
@@ -0,0 +1,49 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require_relative "lib/gandiva/version"
+
+Gem::Specification.new do |spec|
+ spec.name = "red-gandiva"
+ version_components = [
+ Gandiva::Version::MAJOR.to_s,
+ Gandiva::Version::MINOR.to_s,
+ Gandiva::Version::MICRO.to_s,
+ Gandiva::Version::TAG,
+ ]
+ spec.version = version_components.compact.join(".")
+ spec.homepage = "https://arrow.apache.org/"
+ spec.authors = ["Apache Arrow Developers"]
+ spec.email = ["dev@arrow.apache.org"]
+
+  spec.summary = "Red Gandiva provides Ruby bindings for Gandiva"
+ spec.description = "Gandiva is a toolset for compiling and evaluating expressions on Arrow data."
+ spec.license = "Apache-2.0"
+ spec.files = ["README.md", "Rakefile", "Gemfile", "#{spec.name}.gemspec"]
+ spec.files += ["LICENSE.txt", "NOTICE.txt"]
+ spec.files += Dir.glob("lib/**/*.rb")
+ spec.test_files += Dir.glob("test/**/*")
+ spec.extensions = ["dependency-check/Rakefile"]
+
+ spec.add_runtime_dependency("red-arrow", "= #{spec.version}")
+
+ spec.add_development_dependency("bundler")
+ spec.add_development_dependency("rake")
+ spec.add_development_dependency("test-unit")
+end
diff --git a/src/arrow/ruby/red-gandiva/test/expression-builder/test-add.rb b/src/arrow/ruby/red-gandiva/test/expression-builder/test-add.rb
new file mode 100644
index 000000000..d703c4902
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/test/expression-builder/test-add.rb
@@ -0,0 +1,54 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestExpressionBuilderAdd < Test::Unit::TestCase
+ def setup
+ @table = Arrow::Table.new(int32_field: Arrow::Int32Array.new([1]),
+ int64_field: Arrow::Int64Array.new([2]))
+ @schema = @table.schema
+ end
+
+ def build
+ record = Gandiva::ExpressionBuilder::Record.new(@schema)
+ builder = yield(record)
+ builder.build
+ end
+
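+ # 2 ** 63 does not fit in int64, so Gandiva treats the literal (and the
+ # result type) as uint64.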
+ test("literal") do
+ node = build do |record|
+ record.int32_field + (2 ** 63)
+ end
+ assert_equal("uint64 add((int32) int32_field, (const uint64) #{2 ** 63})",
+ node.to_s)
+ end
+
+ test("int32 + int64") do
+ node = build do |record|
+ record.int32_field + record.int64_field
+ end
+ assert_equal("int64 add((int32) int32_field, (int64) int64_field)",
+ node.to_s)
+ end
+
+ test("int64 + int32") do
+ node = build do |record|
+ record.int64_field + record.int32_field
+ end
+ assert_equal("int64 add((int64) int64_field, (int32) int32_field)",
+ node.to_s)
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/test/expression-builder/test-record.rb b/src/arrow/ruby/red-gandiva/test/expression-builder/test-record.rb
new file mode 100644
index 000000000..83a1831ba
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/test/expression-builder/test-record.rb
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestExpressionBuilderRecord < Test::Unit::TestCase
+ def setup
+ @table = Arrow::Table.new(field: Arrow::Int32Array.new([1, 13, 3, 17]))
+ @schema = @table.schema
+ end
+
+ def build
+ record = Gandiva::ExpressionBuilder::Record.new(@schema)
+ builder = yield(record)
+ builder.build
+ end
+
+ test("name") do
+ node = build do |record|
+ record.field
+ end
+ assert_equal("(int32) field",
+ node.to_s)
+ end
+
+ test("#[]") do
+ node = build do |record|
+ record[:field]
+ end
+ assert_equal("(int32) field",
+ node.to_s)
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/test/helper.rb b/src/arrow/ruby/red-gandiva/test/helper.rb
new file mode 100644
index 000000000..9c291f7ae
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/test/helper.rb
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "gandiva"
+
+require "test-unit"
diff --git a/src/arrow/ruby/red-gandiva/test/run-test.rb b/src/arrow/ruby/red-gandiva/test/run-test.rb
new file mode 100755
index 000000000..48d2c49e1
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/test/run-test.rb
@@ -0,0 +1,50 @@
+#!/usr/bin/env ruby
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+$VERBOSE = true
+
+require "pathname"
+
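+# On Windows (RubyInstaller), register the directories listed in
+# ARROW_DLL_PATH so the Arrow DLLs can be found at require time.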
+(ENV["ARROW_DLL_PATH"] || "").split(File::PATH_SEPARATOR).each do |path|
+ RubyInstaller::Runtime.add_dll_directory(path)
+end
+
+base_dir = Pathname.new(__dir__).parent.expand_path
+arrow_base_dir = base_dir.parent + "red-arrow"
+
+lib_dir = base_dir + "lib"
+test_dir = base_dir + "test"
+
+arrow_lib_dir = arrow_base_dir + "lib"
+arrow_ext_dir = arrow_base_dir + "ext" + "arrow"
+
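+# Use a separately built red-arrow when BUILD_DIR is set; otherwise fall
+# back to the in-tree extension directory.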
+build_dir = ENV["BUILD_DIR"]
+if build_dir
+ arrow_build_dir = Pathname.new(build_dir) + "red-arrow"
+else
+ arrow_build_dir = arrow_ext_dir
+end
+
+$LOAD_PATH.unshift(arrow_build_dir.to_s)
+$LOAD_PATH.unshift(arrow_lib_dir.to_s)
+$LOAD_PATH.unshift(lib_dir.to_s)
+
+require_relative "helper"
+
+exit(Test::Unit::AutoRunner.run(true, test_dir.to_s))
diff --git a/src/arrow/ruby/red-gandiva/test/test-boolean-literal-node.rb b/src/arrow/ruby/red-gandiva/test/test-boolean-literal-node.rb
new file mode 100644
index 000000000..d79f72994
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/test/test-boolean-literal-node.rb
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestBooleanLiteralNode < Test::Unit::TestCase
+ def test_value
+ value = true
+ literal_node = Gandiva::BooleanLiteralNode.new(value)
+ assert_equal(value, literal_node.value)
+ end
+end
diff --git a/src/arrow/ruby/red-gandiva/test/test-projector.rb b/src/arrow/ruby/red-gandiva/test/test-projector.rb
new file mode 100644
index 000000000..d618b248a
--- /dev/null
+++ b/src/arrow/ruby/red-gandiva/test/test-projector.rb
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestProjector < Test::Unit::TestCase
+ def test_evaluate
+ table = Arrow::Table.new(:field1 => Arrow::Int32Array.new([1, 13, 3, 17]),
+ :field2 => Arrow::Int32Array.new([11, 2, 15, 17]),
+ :field3 => Arrow::Int32Array.new([1, 10, 2, 2]))
+ schema = table.schema
+
+ expression1 = schema.build_expression do |record|
+ record.field1 + record.field2
+ end
+
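+ # Conditional expression: field1 + field2 * field3 when field1 > field2,
+ # field1 - field2 / field3 (integer division) when they are equal,
+ # and field2 otherwise.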
+ expression2 = schema.build_expression do |record, context|
+ context.if(record.field1 > record.field2)
+ .then(record.field1 + record.field2 * record.field3)
+ .elsif(record.field1 == record.field2)
+ .then(record.field1 - record.field2 / record.field3)
+ .else(record.field2)
+ end
+
+ projector = Gandiva::Projector.new(schema,
+ [expression1, expression2])
+
+ table.each_record_batch do |record_batch|
+ outputs = projector.evaluate(record_batch)
+ assert_equal([
+ [12, 15, 18, 34],
+ [11, 33, 15, 9]
+ ],
+ outputs.collect(&:values))
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-parquet/.gitignore b/src/arrow/ruby/red-parquet/.gitignore
new file mode 100644
index 000000000..afd93a168
--- /dev/null
+++ b/src/arrow/ruby/red-parquet/.gitignore
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+/Gemfile.lock
+/pkg/
diff --git a/src/arrow/ruby/red-parquet/Gemfile b/src/arrow/ruby/red-parquet/Gemfile
new file mode 100644
index 000000000..7c4cefcf3
--- /dev/null
+++ b/src/arrow/ruby/red-parquet/Gemfile
@@ -0,0 +1,24 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+source "https://rubygems.org/"
+
+gemspec
+
+gem "red-arrow", path: "../red-arrow"
diff --git a/src/arrow/ruby/red-parquet/LICENSE.txt b/src/arrow/ruby/red-parquet/LICENSE.txt
new file mode 100644
index 000000000..d64569567
--- /dev/null
+++ b/src/arrow/ruby/red-parquet/LICENSE.txt
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/src/arrow/ruby/red-parquet/NOTICE.txt b/src/arrow/ruby/red-parquet/NOTICE.txt
new file mode 100644
index 000000000..e08aeda8a
--- /dev/null
+++ b/src/arrow/ruby/red-parquet/NOTICE.txt
@@ -0,0 +1,2 @@
+Apache Arrow
+Copyright 2016 The Apache Software Foundation
diff --git a/src/arrow/ruby/red-parquet/README.md b/src/arrow/ruby/red-parquet/README.md
new file mode 100644
index 000000000..ff919c537
--- /dev/null
+++ b/src/arrow/ruby/red-parquet/README.md
@@ -0,0 +1,52 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Red Parquet - Apache Parquet Ruby
+
+Red Parquet provides Ruby bindings for Apache Parquet, built on GObject Introspection.
+
+[Apache Parquet](https://parquet.apache.org/) is a columnar storage format.
+
+[GObject Introspection](https://wiki.gnome.org/action/show/Projects/GObjectIntrospection) is middleware for creating language bindings of C libraries. It can generate language bindings automatically at runtime.
+
+Red Parquet uses [Apache Parquet GLib](https://github.com/apache/arrow/tree/master/c_glib/parquet-glib) and the [gobject-introspection gem](https://rubygems.org/gems/gobject-introspection) to generate Ruby bindings for Apache Parquet.
+
+Apache Parquet GLib is a C wrapper for [Apache Parquet C++](https://github.com/apache/arrow/tree/master/cpp/src/parquet). GObject Introspection can't consume Apache Parquet C++ directly, so Apache Parquet GLib acts as the bridge between the two.
+
+The gobject-introspection gem provides Ruby bindings for GObject Introspection; Red Parquet uses GObject Introspection through it.
+
+## Install
+
+Install Apache Parquet GLib before installing Red Parquet. See the [Apache Arrow install document](https://arrow.apache.org/install/) for details.
+
+Once Apache Parquet GLib is installed, install Red Parquet:
+
+```text
+% gem install red-parquet
+```
+
+## Usage
+
+```ruby
+require "parquet"
+
+table = Arrow::Table.load("/dev/shm/data.parquet")
+# Process data in table
+table.save("/dev/shm/data-processed.parquet")
+```
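+
+Options passed to `Arrow::Table#save` are forwarded to Parquet's `WriterProperties` (see `lib/parquet/arrow-table-savable.rb` in this patch). A minimal sketch, using only the options exercised by the tests in this patch (`compression` and per-column `dictionary`):
+
+```ruby
+require "parquet"
+
+table = Arrow::Table.load("/dev/shm/data.parquet")
+# zstd-compress every column; disable dictionary encoding for "label" only.
+table.save("/dev/shm/data-compressed.parquet",
+           compression: :zstd,
+           dictionary: [["label", false]])
+```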
diff --git a/src/arrow/ruby/red-parquet/Rakefile b/src/arrow/ruby/red-parquet/Rakefile
new file mode 100644
index 000000000..579b946d4
--- /dev/null
+++ b/src/arrow/ruby/red-parquet/Rakefile
@@ -0,0 +1,41 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "rubygems"
+require "bundler/gem_helper"
+
+base_dir = File.join(__dir__)
+
+helper = Bundler::GemHelper.new(base_dir)
+helper.install
+
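+# Trim the default release steps to building the gem and pushing it to
+# RubyGems; no git tagging happens here.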
+release_task = Rake::Task["release"]
+release_task.prerequisites.replace(["build", "release:rubygem_push"])
+
+desc "Run tests"
+task :test do
+ cd(base_dir) do
+ cd("dependency-check") do
+ ruby("-S", "rake")
+ end
+ ruby("test/run-test.rb")
+ end
+end
+
+task default: :test
diff --git a/src/arrow/ruby/red-parquet/dependency-check/Rakefile b/src/arrow/ruby/red-parquet/dependency-check/Rakefile
new file mode 100644
index 000000000..58420eea4
--- /dev/null
+++ b/src/arrow/ruby/red-parquet/dependency-check/Rakefile
@@ -0,0 +1,47 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "pkg-config"
+require "native-package-installer"
+require_relative "../lib/parquet/version"
+
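+# Dependency checks are skipped on Windows. Elsewhere, verify via pkg-config
+# that parquet-glib matches this gem's version, installing the system package
+# if it is missing.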
+case RUBY_PLATFORM
+when /mingw|mswin/
+ task :default => "nothing"
+else
+ task :default => "dependency:check"
+end
+
+task :nothing do
+end
+
+namespace :dependency do
+ desc "Check dependency"
+ task :check do
+ unless PKGConfig.check_version?("parquet-glib",
+ Parquet::Version::MAJOR,
+ Parquet::Version::MINOR,
+ Parquet::Version::MICRO)
+ unless NativePackageInstaller.install(:debian => "libparquet-glib-dev",
+ :redhat => "parquet-glib-devel")
+ exit(false)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-parquet/lib/parquet.rb b/src/arrow/ruby/red-parquet/lib/parquet.rb
new file mode 100644
index 000000000..81ae7d3ae
--- /dev/null
+++ b/src/arrow/ruby/red-parquet/lib/parquet.rb
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "arrow"
+
+require "parquet/version"
+
+require "parquet/loader"
+
+module Parquet
+ class Error < StandardError
+ end
+
+ Loader.load
+end
diff --git a/src/arrow/ruby/red-parquet/lib/parquet/arrow-table-loadable.rb b/src/arrow/ruby/red-parquet/lib/parquet/arrow-table-loadable.rb
new file mode 100644
index 000000000..e3aa1ce0a
--- /dev/null
+++ b/src/arrow/ruby/red-parquet/lib/parquet/arrow-table-loadable.rb
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Parquet
+ module ArrowTableLoadable
+ private
+ def load_as_parquet
+ input = open_input_stream
+ reader = Parquet::ArrowFileReader.new(input)
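+ # Multi-threaded reads are on unless the caller passes use_threads: false.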
+ reader.use_threads = (@options[:use_threads] != false)
+ table = reader.read_table
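+ # Keep the input stream referenced so it is not GC'ed (and closed) while
+ # the table still points into its data.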
+ table.instance_variable_set(:@input, input)
+ table
+ end
+ end
+end
+
+module Arrow
+ class TableLoader
+ include Parquet::ArrowTableLoadable
+ end
+end
diff --git a/src/arrow/ruby/red-parquet/lib/parquet/arrow-table-savable.rb b/src/arrow/ruby/red-parquet/lib/parquet/arrow-table-savable.rb
new file mode 100644
index 000000000..70c597527
--- /dev/null
+++ b/src/arrow/ruby/red-parquet/lib/parquet/arrow-table-savable.rb
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Parquet
+ module ArrowTableSavable
+ private
+ def save_as_parquet
+ properties = WriterProperties.new
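+ # Forward each save option to the matching WriterProperties setter
+ # ("set_<key>"); Hash/Array values are applied per column path,
+ # scalar values globally.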
+ @options.each do |key, value|
+ next if value.nil?
+ set_method_name = "set_#{key}"
+ next unless properties.respond_to?(set_method_name)
+ case value
+ when ::Array, ::Hash
+ value.each do |path, v|
+ properties.__send__(set_method_name, v, path)
+ end
+ else
+ properties.__send__(set_method_name, value)
+ end
+ end
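+ # Without an explicit chunk_size, write the whole table as one row group.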
+ chunk_size = @options[:chunk_size] || @table.n_rows
+ open_raw_output_stream do |output|
+ ArrowFileWriter.open(@table.schema,
+ output,
+ properties) do |writer|
+ writer.write_table(@table, chunk_size)
+ end
+ end
+ end
+ end
+end
+
+module Arrow
+ class TableSaver
+ include Parquet::ArrowTableSavable
+ end
+end
diff --git a/src/arrow/ruby/red-parquet/lib/parquet/loader.rb b/src/arrow/ruby/red-parquet/lib/parquet/loader.rb
new file mode 100644
index 000000000..5e25872ff
--- /dev/null
+++ b/src/arrow/ruby/red-parquet/lib/parquet/loader.rb
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Parquet
+ class Loader < GObjectIntrospection::Loader
+ class << self
+ def load
+ super("Parquet", Parquet)
+ end
+ end
+
+ private
+ def post_load(repository, namespace)
+ require_libraries
+ end
+
+ def require_libraries
+ require "parquet/arrow-table-loadable"
+ require "parquet/arrow-table-savable"
+ require "parquet/writer-properties"
+ end
+
+ def load_object_info(info)
+ super
+
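+ # Classes that respond to #close get the block form (auto-close when
+ # the block returns), like File.open.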
+ klass = @base_module.const_get(rubyish_class_name(info))
+ if klass.method_defined?(:close)
+ klass.extend(Arrow::BlockClosable)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-parquet/lib/parquet/version.rb b/src/arrow/ruby/red-parquet/lib/parquet/version.rb
new file mode 100644
index 000000000..8c9b41a36
--- /dev/null
+++ b/src/arrow/ruby/red-parquet/lib/parquet/version.rb
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Parquet
+ VERSION = "6.0.1"
+
+ module Version
+ numbers, TAG = VERSION.split("-")
+ MAJOR, MINOR, MICRO = numbers.split(".").collect(&:to_i)
+ STRING = VERSION
+ end
+end
diff --git a/src/arrow/ruby/red-parquet/lib/parquet/writer-properties.rb b/src/arrow/ruby/red-parquet/lib/parquet/writer-properties.rb
new file mode 100644
index 000000000..5881471b4
--- /dev/null
+++ b/src/arrow/ruby/red-parquet/lib/parquet/writer-properties.rb
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Parquet
+ class WriterProperties
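+ # Adapter so ArrowTableSavable's generic "set_<key>" dispatch can toggle
+ # dictionary encoding; path is nil when toggling all columns at once.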
+ def set_dictionary(enable, path=nil)
+ if enable
+ enable_dictionary(path)
+ else
+ disable_dictionary(path)
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-parquet/red-parquet.gemspec b/src/arrow/ruby/red-parquet/red-parquet.gemspec
new file mode 100644
index 000000000..dffafed19
--- /dev/null
+++ b/src/arrow/ruby/red-parquet/red-parquet.gemspec
@@ -0,0 +1,49 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require_relative "lib/parquet/version"
+
+Gem::Specification.new do |spec|
+ spec.name = "red-parquet"
+ version_components = [
+ Parquet::Version::MAJOR.to_s,
+ Parquet::Version::MINOR.to_s,
+ Parquet::Version::MICRO.to_s,
+ Parquet::Version::TAG,
+ ]
+ spec.version = version_components.compact.join(".")
+ spec.homepage = "https://arrow.apache.org/"
+ spec.authors = ["Apache Arrow Developers"]
+ spec.email = ["dev@arrow.apache.org"]
+
+ spec.summary = "Red Parquet provides Ruby bindings for Apache Parquet"
+ spec.description = "Apache Parquet is a columnar storage format."
+ spec.license = "Apache-2.0"
+ spec.files = ["README.md", "Rakefile", "Gemfile", "#{spec.name}.gemspec"]
+ spec.files += ["LICENSE.txt", "NOTICE.txt"]
+ spec.files += Dir.glob("lib/**/*.rb")
+ spec.test_files += Dir.glob("test/**/*")
+ spec.extensions = ["dependency-check/Rakefile"]
+
+ spec.add_runtime_dependency("red-arrow", "= #{spec.version}")
+
+ spec.add_development_dependency("bundler")
+ spec.add_development_dependency("rake")
+ spec.add_development_dependency("test-unit")
+end
diff --git a/src/arrow/ruby/red-parquet/test/helper.rb b/src/arrow/ruby/red-parquet/test/helper.rb
new file mode 100644
index 000000000..169d1df42
--- /dev/null
+++ b/src/arrow/ruby/red-parquet/test/helper.rb
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "parquet"
+
+require "tempfile"
+
+require "test-unit"
diff --git a/src/arrow/ruby/red-parquet/test/run-test.rb b/src/arrow/ruby/red-parquet/test/run-test.rb
new file mode 100755
index 000000000..48d2c49e1
--- /dev/null
+++ b/src/arrow/ruby/red-parquet/test/run-test.rb
@@ -0,0 +1,50 @@
+#!/usr/bin/env ruby
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+$VERBOSE = true
+
+require "pathname"
+
+(ENV["ARROW_DLL_PATH"] || "").split(File::PATH_SEPARATOR).each do |path|
+ RubyInstaller::Runtime.add_dll_directory(path)
+end
+
+base_dir = Pathname.new(__dir__).parent.expand_path
+arrow_base_dir = base_dir.parent + "red-arrow"
+
+lib_dir = base_dir + "lib"
+test_dir = base_dir + "test"
+
+arrow_lib_dir = arrow_base_dir + "lib"
+arrow_ext_dir = arrow_base_dir + "ext" + "arrow"
+
+build_dir = ENV["BUILD_DIR"]
+if build_dir
+ arrow_build_dir = Pathname.new(build_dir) + "red-arrow"
+else
+ arrow_build_dir = arrow_ext_dir
+end
+
+$LOAD_PATH.unshift(arrow_build_dir.to_s)
+$LOAD_PATH.unshift(arrow_lib_dir.to_s)
+$LOAD_PATH.unshift(lib_dir.to_s)
+
+require_relative "helper"
+
+exit(Test::Unit::AutoRunner.run(true, test_dir.to_s))
diff --git a/src/arrow/ruby/red-parquet/test/test-arrow-table.rb b/src/arrow/ruby/red-parquet/test/test-arrow-table.rb
new file mode 100644
index 000000000..1ea2669e3
--- /dev/null
+++ b/src/arrow/ruby/red-parquet/test/test-arrow-table.rb
@@ -0,0 +1,99 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestArrowTable < Test::Unit::TestCase
+ def setup
+ @count_field = Arrow::Field.new("count", :uint8)
+ @visible_field = Arrow::Field.new("visible", :boolean)
+ @label_field = Arrow::Field.new("label", :string)
+ schema = Arrow::Schema.new([@count_field, @visible_field, @label_field])
+ count_arrays = [
+ Arrow::UInt8Array.new([1, 2]),
+ Arrow::UInt8Array.new([4, 8, 16]),
+ Arrow::UInt8Array.new([32, 64]),
+ Arrow::UInt8Array.new([128]),
+ ]
+ visible_arrays = [
+ Arrow::BooleanArray.new([true, false, nil]),
+ Arrow::BooleanArray.new([true]),
+ Arrow::BooleanArray.new([true, false]),
+ Arrow::BooleanArray.new([nil]),
+ Arrow::BooleanArray.new([nil]),
+ ]
+ label_arrays = [
+ Arrow::StringArray.new(["a"]),
+ Arrow::StringArray.new(["b", "c"]),
+ Arrow::StringArray.new(["d", nil, nil]),
+ Arrow::StringArray.new(["e", "f"]),
+ ]
+ @count_array = Arrow::ChunkedArray.new(count_arrays)
+ @visible_array = Arrow::ChunkedArray.new(visible_arrays)
+ @label_array = Arrow::ChunkedArray.new(label_arrays)
+ @table = Arrow::Table.new(schema,
+ [@count_array, @visible_array, @label_array])
+
+ @output = Tempfile.open(["red-parquet", ".parquet"])
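+ # test-unit runs the test body via the block yielded from setup, so the
+ # tempfile is cleaned up even when a test fails.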
+ begin
+ yield(@output)
+ ensure
+ @output.close!
+ end
+ end
+
+ def test_save_load_path
+ @table.save(@output.path)
+ assert do
+ @table.equal_metadata(Arrow::Table.load(@output.path), false)
+ end
+ end
+
+ def test_save_load_buffer
+ buffer = Arrow::ResizableBuffer.new(1024)
+ @table.save(buffer, format: :parquet)
+ assert do
+ @table.equal_metadata(Arrow::Table.load(buffer, format: :parquet), false)
+ end
+ end
+
+ def test_save_load_compression
+ @table.save(@output.path, compression: :zstd)
+ assert do
+ @table.equal_metadata(Arrow::Table.load(@output.path), false)
+ end
+ end
+
+ def test_save_load_compression_path
+ @table.save(@output.path, compression: {"count" => :zstd})
+ assert do
+ @table.equal_metadata(Arrow::Table.load(@output.path), false)
+ end
+ end
+
+ def test_save_load_dictionary
+ @table.save(@output.path, dictionary: false)
+ assert do
+ @table.equal_metadata(Arrow::Table.load(@output.path), false)
+ end
+ end
+
+ def test_save_load_dictionary_path
+ @table.save(@output.path, dictionary: [["label", false]])
+ assert do
+ @table.equal_metadata(Arrow::Table.load(@output.path), false)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-plasma/.gitignore b/src/arrow/ruby/red-plasma/.gitignore
new file mode 100644
index 000000000..afd93a168
--- /dev/null
+++ b/src/arrow/ruby/red-plasma/.gitignore
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+/Gemfile.lock
+/pkg/
diff --git a/src/arrow/ruby/red-plasma/Gemfile b/src/arrow/ruby/red-plasma/Gemfile
new file mode 100644
index 000000000..7c4cefcf3
--- /dev/null
+++ b/src/arrow/ruby/red-plasma/Gemfile
@@ -0,0 +1,24 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+source "https://rubygems.org/"
+
+gemspec
+
+gem "red-arrow", path: "../red-arrow"
diff --git a/src/arrow/ruby/red-plasma/LICENSE.txt b/src/arrow/ruby/red-plasma/LICENSE.txt
new file mode 100644
index 000000000..d64569567
--- /dev/null
+++ b/src/arrow/ruby/red-plasma/LICENSE.txt
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/src/arrow/ruby/red-plasma/NOTICE.txt b/src/arrow/ruby/red-plasma/NOTICE.txt
new file mode 100644
index 000000000..e08aeda8a
--- /dev/null
+++ b/src/arrow/ruby/red-plasma/NOTICE.txt
@@ -0,0 +1,2 @@
+Apache Arrow
+Copyright 2016 The Apache Software Foundation
diff --git a/src/arrow/ruby/red-plasma/README.md b/src/arrow/ruby/red-plasma/README.md
new file mode 100644
index 000000000..9fb8fe794
--- /dev/null
+++ b/src/arrow/ruby/red-plasma/README.md
@@ -0,0 +1,58 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Red Plasma - Plasma Ruby
+
+Red Plasma provides the Ruby bindings of Plasma. It is based on GObject Introspection.
+
+Plasma is an in-memory object store and cache for big data.
+
+[GObject Introspection](https://wiki.gnome.org/action/show/Projects/GObjectIntrospection) is middleware for creating language bindings of C libraries. It can generate language bindings automatically at runtime.
+
+Red Plasma uses [Plasma GLib](https://github.com/apache/arrow/tree/master/c_glib/plasma-glib) and the [gobject-introspection gem](https://rubygems.org/gems/gobject-introspection) to generate its Ruby bindings of Plasma.
+
+Plasma GLib is a C wrapper for [Plasma C++](https://github.com/apache/arrow/tree/master/cpp/plasma). GObject Introspection can't use Plasma C++ directly, so Plasma GLib serves as the bridge between Plasma C++ and GObject Introspection.
+
+The gobject-introspection gem provides the Ruby bindings of GObject Introspection; Red Plasma uses GObject Introspection through this gem.
+
+## Install
+
+Install Plasma GLib before installing Red Plasma. See the [Apache Arrow install document](https://arrow.apache.org/install/) for details.
+
+Then install Red Plasma:
+
+```console
+% gem install red-plasma
+```
+
+## Usage
+
+Start the Plasma store:
+
+```console
+plasma-store-server -m 1000000000 -s /tmp/plasma
+```
+
+Create a Plasma client:
+
+```ruby
+require "plasma"
+
+client = Plasma::Client.new("/tmp/plasma")
+```
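+
+Storing and retrieving an object (a minimal sketch adapted from the
+bundled test suite; `-1` is the timeout value the tests pass to
+`refer_object`):
+
+```ruby
+require "plasma"
+
+client = Plasma::Client.new("/tmp/plasma")
+
+# Create an object, fill its data buffer and seal it so that other
+# clients can see it.
+id = Plasma::ObjectID.new("Hello")
+data = "World"
+object = client.create(id, data.bytesize, nil)
+object.data.set_data(0, data)
+object.seal
+
+# Read the sealed object back.
+object = client.refer_object(id, -1)
+puts object.data.data.to_s # => "World"
+```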
diff --git a/src/arrow/ruby/red-plasma/Rakefile b/src/arrow/ruby/red-plasma/Rakefile
new file mode 100644
index 000000000..579b946d4
--- /dev/null
+++ b/src/arrow/ruby/red-plasma/Rakefile
@@ -0,0 +1,41 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "rubygems"
+require "bundler/gem_helper"
+
+base_dir = File.join(__dir__)
+
+helper = Bundler::GemHelper.new(base_dir)
+helper.install
+
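+# Bundler's stock "release" task also requires a clean tree and tags and
+# pushes to source control; restrict it here to building the gem and
+# pushing it to RubyGems.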
+release_task = Rake::Task["release"]
+release_task.prerequisites.replace(["build", "release:rubygem_push"])
+
+desc "Run tests"
+task :test do
+ cd(base_dir) do
+ cd("dependency-check") do
+ ruby("-S", "rake")
+ end
+ ruby("test/run-test.rb")
+ end
+end
+
+task default: :test
diff --git a/src/arrow/ruby/red-plasma/dependency-check/Rakefile b/src/arrow/ruby/red-plasma/dependency-check/Rakefile
new file mode 100644
index 000000000..6792596d4
--- /dev/null
+++ b/src/arrow/ruby/red-plasma/dependency-check/Rakefile
@@ -0,0 +1,47 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "pkg-config"
+require "native-package-installer"
+require_relative "../lib/plasma/version"
+
+case RUBY_PLATFORM
+when /mingw|mswin/
+ task :default => "nothing"
+else
+ task :default => "dependency:check"
+end
+
+task :nothing do
+end
+
+namespace :dependency do
+ desc "Check dependency"
+ task :check do
+ unless PKGConfig.check_version?("plasma-glib",
+ Plasma::Version::MAJOR,
+ Plasma::Version::MINOR,
+ Plasma::Version::MICRO)
+ unless NativePackageInstaller.install(:debian => "libplasma-glib-dev",
+ :redhat => "plasma-glib-devel")
+ exit(false)
+ end
+ end
+ end
+end
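+
+# The version check above is roughly what this pkg-config invocation
+# does from the command line (assuming plasma-glib is discoverable):
+#
+#   pkg-config --atleast-version=6.0.1 plasma-glib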
diff --git a/src/arrow/ruby/red-plasma/lib/plasma.rb b/src/arrow/ruby/red-plasma/lib/plasma.rb
new file mode 100644
index 000000000..c8b4aa872
--- /dev/null
+++ b/src/arrow/ruby/red-plasma/lib/plasma.rb
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "arrow"
+
+require "plasma/version"
+
+require "plasma/loader"
+
+module Plasma
+ class Error < StandardError
+ end
+
+ Loader.load
+end
diff --git a/src/arrow/ruby/red-plasma/lib/plasma/client.rb b/src/arrow/ruby/red-plasma/lib/plasma/client.rb
new file mode 100644
index 000000000..d32ded6ff
--- /dev/null
+++ b/src/arrow/ruby/red-plasma/lib/plasma/client.rb
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Plasma
+ class Client
+ alias_method :initialize_raw, :initialize
+ private :initialize_raw
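+    # Wrap the binding-generated initializer so that Pathname-like
+    # objects are accepted for the socket path, and a plain Hash can be
+    # passed instead of a ClientOptions object, e.g. (as in the tests):
+    #
+    #   Plasma::Client.new(Pathname("/tmp/plasma"), n_retries: 1)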
+ def initialize(socket_path, options=nil)
+ socket_path = socket_path.to_path if socket_path.respond_to?(:to_path)
+ if options
+ options_raw = options
+ options = ClientOptions.new
+ options_raw.each do |key, value|
+ setter = "#{key}="
+ options.__send__(setter, value) if options.respond_to?(setter)
+ end
+ end
+ initialize_raw(socket_path, options)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-plasma/lib/plasma/loader.rb b/src/arrow/ruby/red-plasma/lib/plasma/loader.rb
new file mode 100644
index 000000000..f9125a6f6
--- /dev/null
+++ b/src/arrow/ruby/red-plasma/lib/plasma/loader.rb
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Plasma
+ class Loader < GObjectIntrospection::Loader
+ class << self
+ def load
+ super("Plasma", Plasma)
+ end
+ end
+
+ private
+ def post_load(repository, namespace)
+ require_libraries
+ end
+
+ def require_libraries
+ require "plasma/client"
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-plasma/lib/plasma/version.rb b/src/arrow/ruby/red-plasma/lib/plasma/version.rb
new file mode 100644
index 000000000..36ead51cb
--- /dev/null
+++ b/src/arrow/ruby/red-plasma/lib/plasma/version.rb
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Plasma
+ VERSION = "6.0.1"
+
+ module Version
+ numbers, TAG = VERSION.split("-")
+ MAJOR, MINOR, MICRO = numbers.split(".").collect(&:to_i)
+ STRING = VERSION
+ end
+end
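+
+# For VERSION = "6.0.1" this yields MAJOR = 6, MINOR = 0, MICRO = 1 and
+# TAG = nil; a pre-release string such as "7.0.0-rc1" (hypothetical)
+# would yield TAG = "rc1".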
diff --git a/src/arrow/ruby/red-plasma/red-plasma.gemspec b/src/arrow/ruby/red-plasma/red-plasma.gemspec
new file mode 100644
index 000000000..67e189a3c
--- /dev/null
+++ b/src/arrow/ruby/red-plasma/red-plasma.gemspec
@@ -0,0 +1,49 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require_relative "lib/plasma/version"
+
+Gem::Specification.new do |spec|
+ spec.name = "red-plasma"
+ version_components = [
+ Plasma::Version::MAJOR.to_s,
+ Plasma::Version::MINOR.to_s,
+ Plasma::Version::MICRO.to_s,
+ Plasma::Version::TAG,
+ ]
+ spec.version = version_components.compact.join(".")
+ spec.homepage = "https://arrow.apache.org/"
+ spec.authors = ["Apache Arrow Developers"]
+ spec.email = ["dev@arrow.apache.org"]
+
+  spec.summary = "Red Plasma provides the Ruby bindings of Plasma"
+ spec.description = "Plasma is an in-memory object store and cache for big data."
+ spec.license = "Apache-2.0"
+ spec.files = ["README.md", "Rakefile", "Gemfile", "#{spec.name}.gemspec"]
+ spec.files += ["LICENSE.txt", "NOTICE.txt"]
+ spec.files += Dir.glob("lib/**/*.rb")
+ spec.test_files += Dir.glob("test/**/*")
+ spec.extensions = ["dependency-check/Rakefile"]
+
+ spec.add_runtime_dependency("red-arrow", "= #{spec.version}")
+
+ spec.add_development_dependency("bundler")
+ spec.add_development_dependency("rake")
+ spec.add_development_dependency("test-unit")
+end
diff --git a/src/arrow/ruby/red-plasma/test/helper.rb b/src/arrow/ruby/red-plasma/test/helper.rb
new file mode 100644
index 000000000..02c545f53
--- /dev/null
+++ b/src/arrow/ruby/red-plasma/test/helper.rb
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "plasma"
+
+require "tempfile"
+
+require "test-unit"
+
+require_relative "helper/omittable"
+require_relative "helper/plasma-store"
diff --git a/src/arrow/ruby/red-plasma/test/helper/omittable.rb b/src/arrow/ruby/red-plasma/test/helper/omittable.rb
new file mode 100644
index 000000000..a1c0334b6
--- /dev/null
+++ b/src/arrow/ruby/red-plasma/test/helper/omittable.rb
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Helper
+ module Omittable
+ def require_gi_bindings(major, minor, micro)
+ return if GLib.check_binding_version?(major, minor, micro)
+ message =
+ "Require gobject-introspection #{major}.#{minor}.#{micro} or later: " +
+ GLib::BINDING_VERSION.join(".")
+ omit(message)
+ end
+
+ def require_gi(major, minor, micro)
+ return if GObjectIntrospection::Version.or_later?(major, minor, micro)
+ message =
+ "Require GObject Introspection #{major}.#{minor}.#{micro} or later: " +
+ GObjectIntrospection::Version::STRING
+ omit(message)
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-plasma/test/helper/plasma-store.rb b/src/arrow/ruby/red-plasma/test/helper/plasma-store.rb
new file mode 100644
index 000000000..dcf1f47ae
--- /dev/null
+++ b/src/arrow/ruby/red-plasma/test/helper/plasma-store.rb
@@ -0,0 +1,57 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Helper
+ class PlasmaStore
+ def initialize(options={})
+ @path = `pkg-config --variable=plasma_store_server plasma`.chomp
+ @memory_size = options[:memory_size] || 1024 * 1024
+ @socket_file = Tempfile.new(["plasma-store", ".sock"])
+ @socket_file.close
+ @pid = nil
+ FileUtils.rm_f(socket_path)
+ end
+
+ def socket_path
+ @socket_file.path
+ end
+
+ def start
+ @pid = spawn(@path,
+ "-m", @memory_size.to_s,
+ "-s", socket_path)
+ until File.exist?(socket_path)
+ if Process.waitpid(@pid, Process::WNOHANG)
+ raise "Failed to run plasma-store-server: #{@path}"
+ end
+ end
+ end
+
+ def stop
+ return if @pid.nil?
+ Process.kill(:TERM, @pid)
+ timeout = 1
+ limit = Time.now + timeout
+ while Time.now < limit
+ return if Process.waitpid(@pid, Process::WNOHANG)
+ sleep(0.1)
+ end
+ Process.kill(:KILL, @pid)
+ Process.waitpid(@pid)
+ end
+ end
+end
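+
+# Typical lifecycle, as exercised by the client tests:
+#
+#   store = Helper::PlasmaStore.new(memory_size: 1024 * 1024)
+#   store.start
+#   client = Plasma::Client.new(store.socket_path)
+#   # ... exercise the client ...
+#   store.stop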
diff --git a/src/arrow/ruby/red-plasma/test/run-test.rb b/src/arrow/ruby/red-plasma/test/run-test.rb
new file mode 100755
index 000000000..48d2c49e1
--- /dev/null
+++ b/src/arrow/ruby/red-plasma/test/run-test.rb
@@ -0,0 +1,50 @@
+#!/usr/bin/env ruby
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+$VERBOSE = true
+
+require "pathname"
+
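+# On Windows with RubyInstaller, directories listed in ARROW_DLL_PATH
+# must be registered before the Arrow DLLs can be loaded.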
+(ENV["ARROW_DLL_PATH"] || "").split(File::PATH_SEPARATOR).each do |path|
+ RubyInstaller::Runtime.add_dll_directory(path)
+end
+
+base_dir = Pathname.new(__dir__).parent.expand_path
+arrow_base_dir = base_dir.parent + "red-arrow"
+
+lib_dir = base_dir + "lib"
+test_dir = base_dir + "test"
+
+arrow_lib_dir = arrow_base_dir + "lib"
+arrow_ext_dir = arrow_base_dir + "ext" + "arrow"
+
+build_dir = ENV["BUILD_DIR"]
+if build_dir
+ arrow_build_dir = Pathname.new(build_dir) + "red-arrow"
+else
+ arrow_build_dir = arrow_ext_dir
+end
+
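+# Prefer the in-tree red-arrow sources (and its built extension, or the
+# out-of-tree build when BUILD_DIR is set) over any installed gem.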
+$LOAD_PATH.unshift(arrow_build_dir.to_s)
+$LOAD_PATH.unshift(arrow_lib_dir.to_s)
+$LOAD_PATH.unshift(lib_dir.to_s)
+
+require_relative "helper"
+
+exit(Test::Unit::AutoRunner.run(true, test_dir.to_s))
diff --git a/src/arrow/ruby/red-plasma/test/test-plasma-client.rb b/src/arrow/ruby/red-plasma/test/test-plasma-client.rb
new file mode 100644
index 000000000..d6182976c
--- /dev/null
+++ b/src/arrow/ruby/red-plasma/test/test-plasma-client.rb
@@ -0,0 +1,53 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestPlasmaClient < Test::Unit::TestCase
+ include Helper::Omittable
+
+ def setup
+ @store = nil
+ require_gi_bindings(3, 3, 9)
+ @store = Helper::PlasmaStore.new
+ @store.start
+ @id = Plasma::ObjectID.new("Hello")
+ @data = "World"
+ end
+
+ def teardown
+ @store.stop if @store
+ end
+
+ def test_new_pathname
+ client = Plasma::Client.new(Pathname(@store.socket_path))
+ object = client.create(@id, @data.bytesize, nil)
+ object.data.set_data(0, @data)
+ object.seal
+
+ object = client.refer_object(@id, -1)
+ assert_equal(@data, object.data.data.to_s)
+ end
+
+ def test_new_options
+ client = Plasma::Client.new(@store.socket_path, n_retries: 1)
+ object = client.create(@id, @data.bytesize, nil)
+ object.data.set_data(0, @data)
+ object.seal
+
+ object = client.refer_object(@id, -1)
+ assert_equal(@data, object.data.data.to_s)
+ end
+end