author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
commit     e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree       64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/docs
parent     Initial commit. (diff)
download   ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
           ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip

Adding upstream version 18.2.2. (upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/docs')
-rw-r--r--  src/arrow/docs/.gitignore | 19
-rw-r--r--  src/arrow/docs/Makefile | 248
-rw-r--r--  src/arrow/docs/README.md | 30
-rw-r--r--  src/arrow/docs/environment.yml | 25
-rw-r--r--  src/arrow/docs/make.bat | 52
-rw-r--r--  src/arrow/docs/requirements.txt | 5
-rw-r--r--  src/arrow/docs/source/_static/arrow.png | bin 0 -> 21636 bytes
-rw-r--r--  src/arrow/docs/source/_static/favicon.ico | bin 0 -> 15086 bytes
-rw-r--r--  src/arrow/docs/source/_static/theme_overrides.css | 126
-rw-r--r--  src/arrow/docs/source/_static/versions.json | 26
-rw-r--r--  src/arrow/docs/source/_templates/docs-sidebar.html | 25
-rw-r--r--  src/arrow/docs/source/_templates/layout.html | 5
-rw-r--r--  src/arrow/docs/source/_templates/version-switcher.html | 60
-rw-r--r--  src/arrow/docs/source/c_glib/index.rst | 21
-rw-r--r--  src/arrow/docs/source/conf.py | 464
-rw-r--r--  src/arrow/docs/source/cpp/api.rst | 42
-rw-r--r--  src/arrow/docs/source/cpp/api/array.rst | 80
-rw-r--r--  src/arrow/docs/source/cpp/api/builder.rst | 56
-rw-r--r--  src/arrow/docs/source/cpp/api/c_abi.rst | 48
-rw-r--r--  src/arrow/docs/source/cpp/api/compute.rst | 56
-rw-r--r--  src/arrow/docs/source/cpp/api/cuda.rst | 74
-rw-r--r--  src/arrow/docs/source/cpp/api/dataset.rst | 71
-rw-r--r--  src/arrow/docs/source/cpp/api/datatype.rst | 102
-rw-r--r--  src/arrow/docs/source/cpp/api/filesystem.rst | 64
-rw-r--r--  src/arrow/docs/source/cpp/api/flight.rst | 202
-rw-r--r--  src/arrow/docs/source/cpp/api/formats.rst | 109
-rw-r--r--  src/arrow/docs/source/cpp/api/io.rst | 95
-rw-r--r--  src/arrow/docs/source/cpp/api/ipc.rst | 90
-rw-r--r--  src/arrow/docs/source/cpp/api/memory.rst | 124
-rw-r--r--  src/arrow/docs/source/cpp/api/scalar.rst | 38
-rw-r--r--  src/arrow/docs/source/cpp/api/support.rst | 57
-rw-r--r--  src/arrow/docs/source/cpp/api/table.rst | 45
-rw-r--r--  src/arrow/docs/source/cpp/api/tensor.rst | 57
-rw-r--r--  src/arrow/docs/source/cpp/api/utilities.rst | 52
-rw-r--r--  src/arrow/docs/source/cpp/arrays.rst | 225
-rw-r--r--  src/arrow/docs/source/cpp/build_system.rst | 136
-rw-r--r--  src/arrow/docs/source/cpp/compute.rst | 1606
-rw-r--r--  src/arrow/docs/source/cpp/conventions.rst | 107
-rw-r--r--  src/arrow/docs/source/cpp/csv.rst | 220
-rw-r--r--  src/arrow/docs/source/cpp/dataset.rst | 417
-rw-r--r--  src/arrow/docs/source/cpp/datatypes.rst | 68
-rw-r--r--  src/arrow/docs/source/cpp/examples/cmake_minimal_build.rst | 28
-rw-r--r--  src/arrow/docs/source/cpp/examples/compute_and_write_example.rst | 28
-rw-r--r--  src/arrow/docs/source/cpp/examples/dataset_documentation_example.rst | 27
-rw-r--r--  src/arrow/docs/source/cpp/examples/index.rst | 28
-rw-r--r--  src/arrow/docs/source/cpp/examples/row_columnar_conversion.rst | 27
-rw-r--r--  src/arrow/docs/source/cpp/examples/tuple_range_conversion.rst | 106
-rw-r--r--  src/arrow/docs/source/cpp/flight.rst | 119
-rw-r--r--  src/arrow/docs/source/cpp/getting_started.rst | 41
-rw-r--r--  src/arrow/docs/source/cpp/index.rst | 32
-rw-r--r--  src/arrow/docs/source/cpp/io.rst | 87
-rw-r--r--  src/arrow/docs/source/cpp/ipc.rst | 75
-rw-r--r--  src/arrow/docs/source/cpp/json.rst | 128
-rw-r--r--  src/arrow/docs/source/cpp/memory.rst | 203
-rw-r--r--  src/arrow/docs/source/cpp/overview.rst | 97
-rw-r--r--  src/arrow/docs/source/cpp/parquet.rst | 432
-rw-r--r--  src/arrow/docs/source/cpp/simple_graph.svg | 139
-rw-r--r--  src/arrow/docs/source/cpp/streaming_execution.rst | 307
-rw-r--r--  src/arrow/docs/source/cpp/tables.rst | 83
-rw-r--r--  src/arrow/docs/source/developers/archery.rst | 87
-rw-r--r--  src/arrow/docs/source/developers/benchmarks.rst | 179
-rw-r--r--  src/arrow/docs/source/developers/computeir.rst | 59
-rw-r--r--  src/arrow/docs/source/developers/contributing.rst | 362
-rw-r--r--  src/arrow/docs/source/developers/cpp/building.rst | 510
-rw-r--r--  src/arrow/docs/source/developers/cpp/conventions.rst | 90
-rw-r--r--  src/arrow/docs/source/developers/cpp/development.rst | 294
-rw-r--r--  src/arrow/docs/source/developers/cpp/fuzzing.rst | 99
-rw-r--r--  src/arrow/docs/source/developers/cpp/index.rst | 31
-rw-r--r--  src/arrow/docs/source/developers/cpp/windows.rst | 412
-rw-r--r--  src/arrow/docs/source/developers/crossbow.rst | 258
-rw-r--r--  src/arrow/docs/source/developers/docker.rst | 226
-rw-r--r--  src/arrow/docs/source/developers/documentation.rst | 103
-rw-r--r--  src/arrow/docs/source/developers/experimental_repos.rst | 65
-rw-r--r--  src/arrow/docs/source/developers/python.rst | 565
-rw-r--r--  src/arrow/docs/source/example.gz | bin 0 -> 41 bytes
-rw-r--r--  src/arrow/docs/source/format/Arrow.graffle | bin 0 -> 4142 bytes
-rw-r--r--  src/arrow/docs/source/format/Arrow.png | bin 0 -> 112671 bytes
-rw-r--r--  src/arrow/docs/source/format/CDataInterface.rst | 948
-rw-r--r--  src/arrow/docs/source/format/CStreamInterface.rst | 218
-rw-r--r--  src/arrow/docs/source/format/Columnar.rst | 1221
-rw-r--r--  src/arrow/docs/source/format/Flight.rst | 152
-rw-r--r--  src/arrow/docs/source/format/Guidelines.rst | 24
-rw-r--r--  src/arrow/docs/source/format/IPC.rst | 24
-rw-r--r--  src/arrow/docs/source/format/Integration.rst | 398
-rw-r--r--  src/arrow/docs/source/format/Layout.rst | 24
-rw-r--r--  src/arrow/docs/source/format/Metadata.rst | 24
-rw-r--r--  src/arrow/docs/source/format/Other.rst | 63
-rw-r--r--  src/arrow/docs/source/format/README.md | 24
-rw-r--r--  src/arrow/docs/source/format/Versioning.rst | 70
-rw-r--r--  src/arrow/docs/source/format/integration_json_examples/simple.json | 98
-rw-r--r--  src/arrow/docs/source/format/integration_json_examples/struct.json | 201
-rw-r--r--  src/arrow/docs/source/index.rst | 96
-rw-r--r--  src/arrow/docs/source/java/algorithm.rst | 92
-rw-r--r--  src/arrow/docs/source/java/index.rst | 31
-rw-r--r--  src/arrow/docs/source/java/ipc.rst | 187
-rw-r--r--  src/arrow/docs/source/java/reference/index.rst | 21
-rw-r--r--  src/arrow/docs/source/java/vector.rst | 288
-rw-r--r--  src/arrow/docs/source/java/vector_schema_root.rst | 74
-rw-r--r--  src/arrow/docs/source/js/index.rst | 21
-rw-r--r--  src/arrow/docs/source/python/api.rst | 40
-rw-r--r--  src/arrow/docs/source/python/api/arrays.rst | 127
-rw-r--r--  src/arrow/docs/source/python/api/compute.rst | 498
-rw-r--r--  src/arrow/docs/source/python/api/cuda.rst | 62
-rw-r--r--  src/arrow/docs/source/python/api/dataset.rst | 64
-rw-r--r--  src/arrow/docs/source/python/api/datatypes.rst | 165
-rw-r--r--  src/arrow/docs/source/python/api/files.rst | 65
-rw-r--r--  src/arrow/docs/source/python/api/filesystems.rst | 53
-rw-r--r--  src/arrow/docs/source/python/api/flight.rst | 91
-rw-r--r--  src/arrow/docs/source/python/api/formats.rst | 101
-rw-r--r--  src/arrow/docs/source/python/api/ipc.rst | 69
-rw-r--r--  src/arrow/docs/source/python/api/memory.rst | 73
-rw-r--r--  src/arrow/docs/source/python/api/misc.rst | 40
-rw-r--r--  src/arrow/docs/source/python/api/plasma.rst | 33
-rw-r--r--  src/arrow/docs/source/python/api/tables.rst | 55
-rw-r--r--  src/arrow/docs/source/python/benchmarks.rst | 56
-rw-r--r--  src/arrow/docs/source/python/compute.rst | 69
-rw-r--r--  src/arrow/docs/source/python/csv.rst | 170
-rw-r--r--  src/arrow/docs/source/python/cuda.rst | 159
-rw-r--r--  src/arrow/docs/source/python/data.rst | 434
-rw-r--r--  src/arrow/docs/source/python/dataset.rst | 626
-rw-r--r--  src/arrow/docs/source/python/extending.rst | 483
-rw-r--r--  src/arrow/docs/source/python/extending_types.rst | 324
-rw-r--r--  src/arrow/docs/source/python/feather.rst | 109
-rw-r--r--  src/arrow/docs/source/python/filesystems.rst | 305
-rw-r--r--  src/arrow/docs/source/python/filesystems_deprecated.rst | 95
-rw-r--r--  src/arrow/docs/source/python/getstarted.rst | 145
-rw-r--r--  src/arrow/docs/source/python/getting_involved.rst | 35
-rw-r--r--  src/arrow/docs/source/python/index.rst | 62
-rw-r--r--  src/arrow/docs/source/python/install.rst | 90
-rw-r--r--  src/arrow/docs/source/python/ipc.rst | 385
-rw-r--r--  src/arrow/docs/source/python/json.rst | 117
-rw-r--r--  src/arrow/docs/source/python/memory.rst | 298
-rw-r--r--  src/arrow/docs/source/python/numpy.rst | 75
-rw-r--r--  src/arrow/docs/source/python/pandas.rst | 480
-rw-r--r--  src/arrow/docs/source/python/parquet.rst | 597
-rw-r--r--  src/arrow/docs/source/python/plasma.rst | 462
-rw-r--r--  src/arrow/docs/source/python/timestamps.rst | 198
-rw-r--r--  src/arrow/docs/source/r/index.rst | 21
-rw-r--r--  src/arrow/docs/source/status.rst | 239
139 files changed, 22543 insertions, 0 deletions
diff --git a/src/arrow/docs/.gitignore b/src/arrow/docs/.gitignore
new file mode 100644
index 000000000..d2e9f6ccc
--- /dev/null
+++ b/src/arrow/docs/.gitignore
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+_build
+source/python/generated
diff --git a/src/arrow/docs/Makefile b/src/arrow/docs/Makefile
new file mode 100644
index 000000000..fdff066a3
--- /dev/null
+++ b/src/arrow/docs/Makefile
@@ -0,0 +1,248 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+
+# Do not fail the build if there are warnings
+# SPHINXOPTS = -j8 -W
+SPHINXOPTS = -j8
+
+SPHINXBUILD = sphinx-build
+PAPER =
+BUILDDIR = _build
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
+
+.PHONY: help
+help:
+ @echo "Please use \`make <target>' where <target> is one of"
+ @echo " html to make standalone HTML files"
+ @echo " dirhtml to make HTML files named index.html in directories"
+ @echo " singlehtml to make a single large HTML file"
+ @echo " pickle to make pickle files"
+ @echo " json to make JSON files"
+ @echo " htmlhelp to make HTML files and an HTML help project"
+ @echo " qthelp to make HTML files and a qthelp project"
+ @echo " applehelp to make an Apple Help Book"
+ @echo " devhelp to make HTML files and a Devhelp project"
+ @echo " epub to make an epub"
+ @echo " epub3 to make an epub3"
+ @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+ @echo " latexpdf to make LaTeX files and run them through pdflatex"
+ @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+ @echo " text to make text files"
+ @echo " man to make manual pages"
+ @echo " texinfo to make Texinfo files"
+ @echo " info to make Texinfo files and run them through makeinfo"
+ @echo " gettext to make PO message catalogs"
+ @echo " changes to make an overview of all changed/added/deprecated items"
+ @echo " xml to make Docutils-native XML files"
+ @echo " pseudoxml to make pseudoxml-XML files for display purposes"
+ @echo " linkcheck to check all external links for integrity"
+ @echo " doctest to run all doctests embedded in the documentation (if enabled)"
+ @echo " coverage to run coverage check of the documentation (if enabled)"
+ @echo " dummy to check syntax errors of document sources"
+
+.PHONY: clean
+clean:
+ rm -rf $(BUILDDIR)/*
+ rm -rf source/python/generated/*
+
+.PHONY: html
+html:
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+.PHONY: dirhtml
+dirhtml:
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+.PHONY: singlehtml
+singlehtml:
+ $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+ @echo
+ @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+.PHONY: pickle
+pickle:
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+ @echo
+ @echo "Build finished; now you can process the pickle files."
+
+.PHONY: json
+json:
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+ @echo
+ @echo "Build finished; now you can process the JSON files."
+
+.PHONY: htmlhelp
+htmlhelp:
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+ @echo
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+.PHONY: qthelp
+qthelp:
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+ @echo
+ @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pyarrow.qhcp"
+ @echo "To view the help file:"
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pyarrow.qhc"
+
+.PHONY: applehelp
+applehelp:
+ $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
+ @echo
+ @echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
+ @echo "N.B. You won't be able to view it unless you put it in" \
+ "~/Library/Documentation/Help or install it in your application" \
+ "bundle."
+
+.PHONY: devhelp
+devhelp:
+ $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+ @echo
+ @echo "Build finished."
+ @echo "To view the help file:"
+ @echo "# mkdir -p $$HOME/.local/share/devhelp/pyarrow"
+ @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pyarrow"
+ @echo "# devhelp"
+
+.PHONY: epub
+epub:
+ $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+ @echo
+ @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+.PHONY: epub3
+epub3:
+ $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3
+ @echo
+ @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3."
+
+.PHONY: latex
+latex:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo
+ @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \
+ "(use \`make latexpdf' here to do that automatically)."
+
+.PHONY: latexpdf
+latexpdf:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through pdflatex..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+.PHONY: latexpdfja
+latexpdfja:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through platex and dvipdfmx..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+.PHONY: text
+text:
+ $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+ @echo
+ @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+.PHONY: man
+man:
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+ @echo
+ @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+.PHONY: texinfo
+texinfo:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo
+ @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+ @echo "Run \`make' in that directory to run these through makeinfo" \
+ "(use \`make info' here to do that automatically)."
+
+.PHONY: info
+info:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo "Running Texinfo files through makeinfo..."
+ $(MAKE) -C $(BUILDDIR)/texinfo info
+ @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+.PHONY: gettext
+gettext:
+ $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+ @echo
+ @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+.PHONY: changes
+changes:
+ $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+ @echo
+ @echo "The overview file is in $(BUILDDIR)/changes."
+
+.PHONY: linkcheck
+linkcheck:
+ $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+ @echo
+ @echo "Link check complete; look for any errors in the above output " \
+ "or in $(BUILDDIR)/linkcheck/output.txt."
+
+.PHONY: doctest
+doctest:
+ $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+ @echo "Testing of doctests in the sources finished, look at the " \
+ "results in $(BUILDDIR)/doctest/output.txt."
+
+.PHONY: coverage
+coverage:
+ $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
+ @echo "Testing of coverage in the sources finished, look at the " \
+ "results in $(BUILDDIR)/coverage/python.txt."
+
+.PHONY: xml
+xml:
+ $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+ @echo
+ @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+.PHONY: pseudoxml
+pseudoxml:
+ $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+ @echo
+ @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
+
+.PHONY: dummy
+dummy:
+ $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy
+ @echo
+ @echo "Build finished. Dummy builder generates no files."
diff --git a/src/arrow/docs/README.md b/src/arrow/docs/README.md
new file mode 100644
index 000000000..213042641
--- /dev/null
+++ b/src/arrow/docs/README.md
@@ -0,0 +1,30 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Apache Arrow Documentation
+
+This directory contains source files for building the main project
+documentation. This includes the [Arrow columnar format specification][2].
+
+Instructions for building the documentation site are found in
+[docs/source/developers/documentation.rst][1]. The build depends on the API
+documentation for some of the project subcomponents.
+
+[1]: https://github.com/apache/arrow/blob/master/docs/source/developers/documentation.rst
+[2]: https://github.com/apache/arrow/tree/master/docs/source/format
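As the README notes, the Sphinx build consumes API documentation generated for some subcomponents; for the C++ docs this is the Doxygen XML that conf.py hands to breathe (``breathe_projects = {"arrow_cpp": "../../cpp/apidoc/xml"}``). A minimal sketch of checking that prerequisite before building, assuming the script lives in src/arrow/docs and the sibling cpp/apidoc layout implied by conf.py::

    # Sketch: verify the Doxygen XML consumed by breathe exists before running Sphinx.
    # The relative path mirrors breathe_projects in source/conf.py; adjust it if your
    # checkout is laid out differently.
    from pathlib import Path

    docs_dir = Path(__file__).resolve().parent          # assumed: src/arrow/docs
    doxygen_index = (docs_dir / ".." / "cpp" / "apidoc" / "xml" / "index.xml").resolve()

    if not doxygen_index.exists():
        raise SystemExit("Doxygen XML not found; run Doxygen for cpp/apidoc first.")
    print("breathe input present:", doxygen_index)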
diff --git a/src/arrow/docs/environment.yml b/src/arrow/docs/environment.yml
new file mode 100644
index 000000000..8d1fe9bfb
--- /dev/null
+++ b/src/arrow/docs/environment.yml
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+channels:
+- defaults
+- conda-forge
+dependencies:
+- arrow-cpp
+- parquet-cpp
+- pyarrow
+- numpydoc
diff --git a/src/arrow/docs/make.bat b/src/arrow/docs/make.bat
new file mode 100644
index 000000000..36f2086c2
--- /dev/null
+++ b/src/arrow/docs/make.bat
@@ -0,0 +1,52 @@
+@rem Licensed to the Apache Software Foundation (ASF) under one
+@rem or more contributor license agreements. See the NOTICE file
+@rem distributed with this work for additional information
+@rem regarding copyright ownership. The ASF licenses this file
+@rem to you under the Apache License, Version 2.0 (the
+@rem "License"); you may not use this file except in compliance
+@rem with the License. You may obtain a copy of the License at
+@rem
+@rem http://www.apache.org/licenses/LICENSE-2.0
+@rem
+@rem Unless required by applicable law or agreed to in writing,
+@rem software distributed under the License is distributed on an
+@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+@rem KIND, either express or implied. See the License for the
+@rem specific language governing permissions and limitations
+@rem under the License.
+
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=_build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.http://sphinx-doc.org/
+ exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+
+:end
+popd
diff --git a/src/arrow/docs/requirements.txt b/src/arrow/docs/requirements.txt
new file mode 100644
index 000000000..0dbca6922
--- /dev/null
+++ b/src/arrow/docs/requirements.txt
@@ -0,0 +1,5 @@
+breathe
+ipython
+numpydoc
+sphinx==2.4.4
+pydata-sphinx-theme
diff --git a/src/arrow/docs/source/_static/arrow.png b/src/arrow/docs/source/_static/arrow.png
new file mode 100644
index 000000000..72104b075
--- /dev/null
+++ b/src/arrow/docs/source/_static/arrow.png
Binary files differ
diff --git a/src/arrow/docs/source/_static/favicon.ico b/src/arrow/docs/source/_static/favicon.ico
new file mode 100644
index 000000000..33a554a8a
--- /dev/null
+++ b/src/arrow/docs/source/_static/favicon.ico
Binary files differ
diff --git a/src/arrow/docs/source/_static/theme_overrides.css b/src/arrow/docs/source/_static/theme_overrides.css
new file mode 100644
index 000000000..d7d0bdfdb
--- /dev/null
+++ b/src/arrow/docs/source/_static/theme_overrides.css
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+/* Customizing with theme CSS variables */
+
+:root {
+ --pst-color-active-navigation: 215, 70, 51;
+ --pst-color-link-hover: 215, 70, 51;
+ --pst-color-headerlink: 215, 70, 51;
+ /* Use normal text color (like h3, ..) instead of primary color */
+ --pst-color-h1: var(--color-text-base);
+ --pst-color-h2: var(--color-text-base);
+ /* Use softer blue from bootstrap's default info color */
+ --pst-color-info: 23, 162, 184;
+ --pst-header-height: 0px;
+}
+
+code {
+ color: rgb(215, 70, 51);
+}
+
+.footer {
+ text-align: center;
+}
+
+/* Ensure the logo is properly displayed */
+
+.navbar-brand {
+ height: auto;
+ width: auto;
+}
+
+a.navbar-brand img {
+ height: auto;
+ width: auto;
+ max-height: 15vh;
+ max-width: 100%;
+}
+
+
+/* This is the bootstrap CSS style for "table-striped". Since the theme does
+not yet provide an easy way to configure this globally, it is easier to simply
+include this snippet here than to update each table in all rst files to
+add ":class: table-striped" */
+
+.table tbody tr:nth-of-type(odd) {
+ background-color: rgba(0, 0, 0, 0.05);
+}
+
+/* Improve the vertical spacing in the C++ API docs
+(ideally this should be upstreamed to the pydata-sphinx-theme) */
+
+dl.cpp dd p {
+ margin-bottom:.4rem;
+}
+
+dl.cpp.enumerator {
+ margin-bottom: 0.2rem;
+}
+
+p.breathe-sectiondef-title {
+ margin-top: 1rem;
+}
+
+/* Limit the max height of the sidebar navigation section. Our customized
+template puts more content above the navigation (i.e. a larger logo), so
+without decreasing the max-height the navigation would overlap with the
+footer.
+Details: min(15vh, 110px) for the logo size, 8rem for the search box etc. */
+
+@media (min-width:720px) {
+ @supports (position:-webkit-sticky) or (position:sticky) {
+ .bd-links {
+ max-height: calc(100vh - min(15vh, 110px) - 8rem)
+ }
+ }
+}
+
+/* Styling to get the version dropdown and search box side-by-side on wide screens */
+
+#version-search-wrapper {
+ overflow: hidden;
+ width: inherit;
+ display: flex;
+ flex-wrap: wrap;
+ justify-content: left;
+ align-items: center;
+}
+
+#version-button {
+ padding-left: 0.5rem;
+ padding-right: 1rem;
+}
+
+#search-box {
+ flex: 1 0 12em;
+}
+
+/* Fix table text wrapping in RTD theme,
+ * see https://rackerlabs.github.io/docs-rackspace/tools/rtd-tables.html
+ */
+
+@media screen {
+ table.docutils td {
+ /* !important prevents the common CSS stylesheets from overriding
+ this as on RTD they are loaded after this stylesheet */
+ white-space: normal !important;
+ }
+}
diff --git a/src/arrow/docs/source/_static/versions.json b/src/arrow/docs/source/_static/versions.json
new file mode 100644
index 000000000..d364cfe27
--- /dev/null
+++ b/src/arrow/docs/source/_static/versions.json
@@ -0,0 +1,26 @@
+[
+ {
+ "name": "6.0 (stable)",
+ "version": ""
+ },
+ {
+ "name": "5.0",
+ "version": "5.0/"
+ },
+ {
+ "name": "4.0",
+ "version": "4.0/"
+ },
+ {
+ "name": "3.0",
+ "version": "3.0/"
+ },
+ {
+ "name": "2.0",
+ "version": "2.0/"
+ },
+ {
+ "name": "1.0",
+ "version": "1.0/"
+ }
+]
\ No newline at end of file
diff --git a/src/arrow/docs/source/_templates/docs-sidebar.html b/src/arrow/docs/source/_templates/docs-sidebar.html
new file mode 100644
index 000000000..fde4435df
--- /dev/null
+++ b/src/arrow/docs/source/_templates/docs-sidebar.html
@@ -0,0 +1,25 @@
+
+<a class="navbar-brand" href="{{ pathto(master_doc) }}">
+ <img src="{{ pathto('_static/' + logo, 1) }}" class="logo" alt="logo">
+</a>
+
+<div id="version-search-wrapper">
+
+{% include "version-switcher.html" %}
+
+<form id="search-box" class="bd-search d-flex align-items-center" action="{{ pathto('search') }}" method="get">
+ <i class="icon fas fa-search"></i>
+ <input type="search" class="form-control" name="q" id="search-input" placeholder="{{ theme_search_bar_text }}" aria-label="{{ theme_search_bar_text }}" autocomplete="off" >
+</form>
+
+</div>
+
+<nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
+ <div class="bd-toc-item active">
+ {% if "python/api" in pagename or "python/generated" in pagename %}
+ {{ generate_nav_html("sidebar", startdepth=0, maxdepth=3, collapse=False, includehidden=True, titles_only=True) }}
+ {% else %}
+ {{ generate_nav_html("sidebar", startdepth=0, maxdepth=4, collapse=False, includehidden=True, titles_only=True) }}
+ {% endif %}
+ </div>
+</nav>
diff --git a/src/arrow/docs/source/_templates/layout.html b/src/arrow/docs/source/_templates/layout.html
new file mode 100644
index 000000000..a9d0f30bc
--- /dev/null
+++ b/src/arrow/docs/source/_templates/layout.html
@@ -0,0 +1,5 @@
+{% extends "pydata_sphinx_theme/layout.html" %}
+
+{# Silence the navbar #}
+{% block docs_navbar %}
+{% endblock %}
diff --git a/src/arrow/docs/source/_templates/version-switcher.html b/src/arrow/docs/source/_templates/version-switcher.html
new file mode 100644
index 000000000..24a8c15ac
--- /dev/null
+++ b/src/arrow/docs/source/_templates/version-switcher.html
@@ -0,0 +1,60 @@
+<div id="version-button" class="dropdown">
+ <button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
+ {{ release }}
+ <span class="caret"></span>
+ </button>
+ <div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
+ <!-- dropdown will be populated by javascript on page load -->
+ </div>
+</div>
+
+<script type="text/javascript">
+// Function to construct the target URL from the JSON components
+function buildURL(entry) {
+ var template = "{{ switcher_template_url }}"; // supplied by jinja
+ template = template.replace("{version}", entry.version);
+ return template;
+}
+
+// Function to check if corresponding page path exists in other version of docs
+// and, if so, go there instead of the homepage of the other docs version
+function checkPageExistsAndRedirect(event) {
+ const currentFilePath = "{{ pagename }}.html",
+ otherDocsHomepage = event.target.getAttribute("href");
+ let tryUrl = `${otherDocsHomepage}${currentFilePath}`;
+ $.ajax({
+ type: 'HEAD',
+ url: tryUrl,
+ // if the page exists, go there
+ success: function() {
+ location.href = tryUrl;
+ }
+ }).fail(function() {
+ location.href = otherDocsHomepage;
+ });
+ return false;
+}
+
+// Function to populate the version switcher
+(function () {
+ // get JSON config
+ $.getJSON("{{ switcher_json_url }}", function(data, textStatus, jqXHR) {
+ // create the nodes first (before AJAX calls) to ensure the order is
+ // correct (for now, links will go to doc version homepage)
+ $.each(data, function(index, entry) {
+ // if no custom name specified (e.g., "latest"), use version string
+ if (!("name" in entry)) {
+ entry.name = entry.version;
+ }
+ // construct the appropriate URL, and add it to the dropdown
+ entry.url = buildURL(entry);
+ const node = document.createElement("a");
+ node.setAttribute("class", "list-group-item list-group-item-action py-1");
+ node.setAttribute("href", `${entry.url}`);
+ node.textContent = `${entry.name}`;
+ node.onclick = checkPageExistsAndRedirect;
+ $("#version_switcher").append(node);
+ });
+ });
+})();
+</script>
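To make the data flow concrete: each entry of ``_static/versions.json`` is combined with ``switcher_template_url`` from conf.py exactly as ``buildURL`` does above, and the click handler first tries the same page in the other version before falling back to that version's homepage. A small Python sketch of the same URL construction, using one sample entry (the page name is an assumption for illustration)::

    # Sketch mirroring buildURL() and the candidate URL tried by
    # checkPageExistsAndRedirect(), for a single versions.json entry.
    template = "https://arrow.apache.org/docs/{version}"    # switcher_template_url in conf.py
    entry = {"name": "5.0", "version": "5.0/"}               # from _static/versions.json
    pagename = "python/install"                              # assumed current page

    homepage = template.replace("{version}", entry["version"])
    candidate = f"{homepage}{pagename}.html"                 # tried first via HEAD request

    print(homepage)    # https://arrow.apache.org/docs/5.0/
    print(candidate)   # https://arrow.apache.org/docs/5.0/python/install.html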
diff --git a/src/arrow/docs/source/c_glib/index.rst b/src/arrow/docs/source/c_glib/index.rst
new file mode 100644
index 000000000..56db23f2a
--- /dev/null
+++ b/src/arrow/docs/source/c_glib/index.rst
@@ -0,0 +1,21 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+C/GLib docs
+===========
+
+Stub page for the C/GLib docs; the actual source is located in the c_glib/doc/ subdirectory.
diff --git a/src/arrow/docs/source/conf.py b/src/arrow/docs/source/conf.py
new file mode 100644
index 000000000..150cd4181
--- /dev/null
+++ b/src/arrow/docs/source/conf.py
@@ -0,0 +1,464 @@
+# -*- coding: utf-8 -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, as shown here.
+#
+
+import datetime
+import os
+import sys
+import warnings
+from unittest import mock
+
+import pyarrow
+
+
+sys.path.extend([
+ os.path.join(os.path.dirname(__file__),
+ '..', '../..')
+
+])
+
+# Suppresses all warnings printed when sphinx is traversing the code (e.g.
+# deprecation warnings)
+warnings.filterwarnings("ignore", category=FutureWarning, message=".*pyarrow.*")
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+ 'sphinx.ext.autodoc',
+ 'sphinx.ext.autosummary',
+ 'sphinx.ext.doctest',
+ 'sphinx.ext.ifconfig',
+ 'sphinx.ext.mathjax',
+ 'sphinx.ext.viewcode',
+ 'sphinx.ext.napoleon',
+ 'IPython.sphinxext.ipython_directive',
+ 'IPython.sphinxext.ipython_console_highlighting',
+ 'breathe'
+]
+
+# Show members for classes in .. autosummary
+autodoc_default_options = {
+ 'members': None,
+ 'undoc-members': None,
+ 'show-inheritance': None,
+ 'inherited-members': None
+}
+
+# Breathe configuration
+breathe_projects = {"arrow_cpp": "../../cpp/apidoc/xml"}
+breathe_default_project = "arrow_cpp"
+
+# Overridden conditionally below
+autodoc_mock_imports = []
+
+# ipython directive options
+ipython_mplbackend = ''
+
+# numpydoc configuration
+napoleon_use_rtype = False
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffixes as a list of strings:
+#
+
+source_suffix = ['.rst']
+
+autosummary_generate = True
+
+# The encoding of source files.
+#
+# source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'Apache Arrow'
+copyright = f'2016-{datetime.datetime.now().year} Apache Software Foundation'
+author = u'Apache Software Foundation'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = os.environ.get('ARROW_DOCS_VERSION',
+ pyarrow.__version__)
+# The full version, including alpha/beta/rc tags.
+release = os.environ.get('ARROW_DOCS_VERSION',
+ pyarrow.__version__)
+
+if "+" in release:
+ release = release.split(".dev")[0] + " (dev)"
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#
+# today = ''
+#
+# Else, today_fmt is used as the format for a strftime call.
+#
+# today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# These patterns also affect html_static_path and html_extra_path
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#
+# default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#
+# add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#
+# add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#
+# show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+# modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+# keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'pydata_sphinx_theme'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#
+html_theme_options = {
+ "show_toc_level": 2,
+ "google_analytics_id": "UA-107500873-1",
+}
+
+html_context = {
+ "switcher_json_url": "/docs/_static/versions.json",
+ "switcher_template_url": "https://arrow.apache.org/docs/{version}",
+ # for local testing
+ # "switcher_template_url": "http://0.0.0.0:8000/docs/{version}",
+}
+
+# Add any paths that contain custom themes here, relative to this directory.
+# html_theme_path = []
+
+# The name for this set of Sphinx documents.
+# "<project> v<release> documentation" by default.
+#
+html_title = u'Apache Arrow v{}'.format(version)
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+#
+# html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#
+html_logo = "_static/arrow.png"
+
+# The name of an image file (relative to this directory) to use as a favicon of
+# the docs. This file should be a Windows icon file (.ico) being 16x16 or
+# 32x32 pixels large.
+#
+html_favicon = "_static/favicon.ico"
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Custom fixes to the RTD theme
+html_css_files = ['theme_overrides.css']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#
+# html_extra_path = []
+
+# If not None, a 'Last updated on:' timestamp is inserted at every page
+# bottom, using the given strftime format.
+# The empty string is equivalent to '%b %d, %Y'.
+#
+# html_last_updated_fmt = None
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#
+# html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#
+html_sidebars = {
+# '**': ['sidebar-logo.html', 'sidebar-search-bs.html', 'sidebar-nav-bs.html'],
+ '**': ['docs-sidebar.html'],
+}
+
+# The base URL which points to the root of the HTML documentation,
+# used for canonical url
+html_baseurl = "https://arrow.apache.org/docs/"
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#
+# html_additional_pages = {}
+
+# If false, no module index is generated.
+#
+# html_domain_indices = True
+
+# If false, no index is generated.
+#
+# html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#
+# html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#
+# html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#
+# html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#
+# html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+#
+# html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+# html_file_suffix = None
+
+# Language to be used for generating the HTML full-text search index.
+# Sphinx supports the following languages:
+# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
+# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh'
+#
+# html_search_language = 'en'
+
+# A dictionary with options for the search language support, empty by default.
+# 'ja' uses this config value.
+# 'zh' users can customize the `jieba` dictionary path.
+#
+# html_search_options = {'type': 'default'}
+
+# The name of a javascript file (relative to the configuration directory) that
+# implements a search results scorer. If empty, the default will be used.
+#
+# html_search_scorer = 'scorer.js'
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'arrowdoc'
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+ # The paper size ('letterpaper' or 'a4paper').
+ #
+ # 'papersize': 'letterpaper',
+
+ # The font size ('10pt', '11pt' or '12pt').
+ #
+ # 'pointsize': '10pt',
+
+ # Additional stuff for the LaTeX preamble.
+ #
+ # 'preamble': '',
+
+ # Latex figure (float) alignment
+ #
+ # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+# author, documentclass [howto, manual, or own class]).
+latex_documents = [
+ (master_doc, 'arrow.tex', u'Apache Arrow Documentation',
+ u'Apache Arrow Team', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#
+# latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#
+# latex_use_parts = False
+
+# If true, show page references after internal links.
+#
+# latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#
+# latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#
+# latex_appendices = []
+
+# If false, will not define \strong, \code, \titleref, \crossref ... but only
+# \sphinxstrong, ..., \sphinxtitleref, ... This helps avoid clashes with
+# user-added packages.
+#
+# latex_keep_old_macro_names = True
+
+# If false, no module index is generated.
+#
+# latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+ (master_doc, 'arrow', u'Apache Arrow Documentation',
+ [author], 1)
+]
+
+# If true, show URL addresses after external links.
+#
+# man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ (master_doc, 'arrow', u'Apache Arrow Documentation',
+ author, 'Apache Arrow', 'One line description of project.',
+ 'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#
+# texinfo_appendices = []
+
+# If false, no module index is generated.
+#
+# texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#
+# texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#
+# texinfo_no_detailmenu = False
+
+
+# -- Customization --------------------------------------------------------
+
+# Conditional API doc generation
+
+# Sphinx has two features for conditional inclusion:
+# - The "only" directive
+# https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html#including-content-based-on-tags
+# - The "ifconfig" extension
+# https://www.sphinx-doc.org/en/master/usage/extensions/ifconfig.html
+#
+# Both have issues, but "ifconfig" seems to work in this setting.
+
+try:
+ import pyarrow.cuda
+ cuda_enabled = True
+except ImportError:
+ cuda_enabled = False
+ # Mock pyarrow.cuda to avoid autodoc warnings.
+ # XXX I can't get autodoc_mock_imports to work, so mock manually instead
+ # (https://github.com/sphinx-doc/sphinx/issues/2174#issuecomment-453177550)
+ pyarrow.cuda = sys.modules['pyarrow.cuda'] = mock.Mock()
+
+try:
+ import pyarrow.flight
+ flight_enabled = True
+except ImportError:
+ flight_enabled = False
+ pyarrow.flight = sys.modules['pyarrow.flight'] = mock.Mock()
+
+
+def setup(app):
+ # Use a config value to indicate whether CUDA API docs can be generated.
+ # This will also rebuild appropriately when the value changes.
+ app.add_config_value('cuda_enabled', cuda_enabled, 'env')
+ app.add_config_value('flight_enabled', flight_enabled, 'env')
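One small detail in the configuration above: ``version`` and ``release`` come from the ``ARROW_DOCS_VERSION`` environment variable, falling back to ``pyarrow.__version__``, and a development snapshot is collapsed to a ``(dev)`` label. A quick sketch of that normalization with a made-up version string::

    # Sketch of the release normalization above, using a hypothetical dev version.
    release = "7.0.0.dev123+g0123abc"     # made-up example of pyarrow.__version__

    if "+" in release:                    # only local/dev builds carry a "+" suffix
        release = release.split(".dev")[0] + " (dev)"

    print(release)                        # -> "7.0.0 (dev)"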
diff --git a/src/arrow/docs/source/cpp/api.rst b/src/arrow/docs/source/cpp/api.rst
new file mode 100644
index 000000000..3df16a178
--- /dev/null
+++ b/src/arrow/docs/source/cpp/api.rst
@@ -0,0 +1,42 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+*************
+API Reference
+*************
+
+.. toctree::
+ :maxdepth: 3
+
+ api/support
+ api/memory
+ api/datatype
+ api/array
+ api/scalar
+ api/builder
+ api/table
+ api/c_abi
+ api/compute
+ api/tensor
+ api/utilities
+ api/io
+ api/ipc
+ api/formats
+ api/cuda
+ api/flight
+ api/filesystem
+ api/dataset
diff --git a/src/arrow/docs/source/cpp/api/array.rst b/src/arrow/docs/source/cpp/api/array.rst
new file mode 100644
index 000000000..7f4e71158
--- /dev/null
+++ b/src/arrow/docs/source/cpp/api/array.rst
@@ -0,0 +1,80 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+======
+Arrays
+======
+
+.. doxygenclass:: arrow::ArrayData
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::Array
+ :project: arrow_cpp
+ :members:
+
+Concrete array subclasses
+=========================
+
+Primitive and temporal
+----------------------
+
+.. doxygenclass:: arrow::NullArray
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::BooleanArray
+ :project: arrow_cpp
+ :members:
+
+.. doxygengroup:: numeric-arrays
+ :content-only:
+ :members:
+
+Binary-like
+-----------
+
+.. doxygengroup:: binary-arrays
+ :content-only:
+ :members:
+
+Nested
+------
+
+.. doxygengroup:: nested-arrays
+ :content-only:
+ :members:
+
+Dictionary-encoded
+------------------
+
+.. doxygenclass:: arrow::DictionaryArray
+ :members:
+
+Extension arrays
+----------------
+
+.. doxygenclass:: arrow::ExtensionArray
+ :members:
+
+
+Chunked Arrays
+==============
+
+.. doxygenclass:: arrow::ChunkedArray
+ :project: arrow_cpp
+ :members:
diff --git a/src/arrow/docs/source/cpp/api/builder.rst b/src/arrow/docs/source/cpp/api/builder.rst
new file mode 100644
index 000000000..9e6540aa5
--- /dev/null
+++ b/src/arrow/docs/source/cpp/api/builder.rst
@@ -0,0 +1,56 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+==============
+Array Builders
+==============
+
+.. doxygenclass:: arrow::ArrayBuilder
+ :members:
+
+Concrete builder subclasses
+===========================
+
+.. doxygenclass:: arrow::NullBuilder
+ :members:
+
+.. doxygenclass:: arrow::BooleanBuilder
+ :members:
+
+.. doxygenclass:: arrow::NumericBuilder
+ :members:
+
+.. doxygenclass:: arrow::BinaryBuilder
+ :members:
+
+.. doxygenclass:: arrow::StringBuilder
+ :members:
+
+.. doxygenclass:: arrow::FixedSizeBinaryBuilder
+ :members:
+
+.. doxygenclass:: arrow::Decimal128Builder
+ :members:
+
+.. doxygenclass:: arrow::ListBuilder
+ :members:
+
+.. doxygenclass:: arrow::StructBuilder
+ :members:
+
+.. doxygenclass:: arrow::DictionaryBuilder
+ :members:
diff --git a/src/arrow/docs/source/cpp/api/c_abi.rst b/src/arrow/docs/source/cpp/api/c_abi.rst
new file mode 100644
index 000000000..4e451c3ec
--- /dev/null
+++ b/src/arrow/docs/source/cpp/api/c_abi.rst
@@ -0,0 +1,48 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+============
+C Interfaces
+============
+
+.. seealso::
+ The :ref:`C data interface <c-data-interface>` and
+ :ref:`C stream interface <c-stream-interface>` specifications.
+
+ABI Structures
+==============
+
+.. doxygenstruct:: ArrowSchema
+ :project: arrow_cpp
+
+.. doxygenstruct:: ArrowArray
+ :project: arrow_cpp
+
+.. doxygenstruct:: ArrowArrayStream
+ :project: arrow_cpp
+
+C Data Interface
+================
+
+.. doxygengroup:: c-data-interface
+ :content-only:
+
+C Stream Interface
+==================
+
+.. doxygengroup:: c-stream-interface
+ :content-only:
diff --git a/src/arrow/docs/source/cpp/api/compute.rst b/src/arrow/docs/source/cpp/api/compute.rst
new file mode 100644
index 000000000..3b0a89f83
--- /dev/null
+++ b/src/arrow/docs/source/cpp/api/compute.rst
@@ -0,0 +1,56 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Compute Functions
+=================
+
+Datum class
+-----------
+
+.. doxygenclass:: arrow::Datum
+ :members:
+
+Abstract Function classes
+-------------------------
+
+.. doxygengroup:: compute-functions
+ :content-only:
+ :members:
+
+Function registry
+-----------------
+
+.. doxygenclass:: arrow::compute::FunctionRegistry
+ :members:
+
+.. doxygenfunction:: arrow::compute::GetFunctionRegistry
+
+Convenience functions
+---------------------
+
+.. doxygengroup:: compute-call-function
+ :content-only:
+
+Concrete options classes
+------------------------
+
+.. doxygengroup:: compute-concrete-options
+ :content-only:
+ :members:
+ :undoc-members:
+
+.. TODO: List concrete function invocation shortcuts?
diff --git a/src/arrow/docs/source/cpp/api/cuda.rst b/src/arrow/docs/source/cpp/api/cuda.rst
new file mode 100644
index 000000000..caeb5be31
--- /dev/null
+++ b/src/arrow/docs/source/cpp/api/cuda.rst
@@ -0,0 +1,74 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+============
+CUDA support
+============
+
+Contexts
+========
+
+.. doxygenclass:: arrow::cuda::CudaDeviceManager
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::cuda::CudaContext
+ :project: arrow_cpp
+ :members:
+
+Devices
+=======
+
+.. doxygenclass:: arrow::cuda::CudaDevice
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::cuda::CudaMemoryManager
+ :project: arrow_cpp
+ :members:
+
+Buffers
+=======
+
+.. doxygenclass:: arrow::cuda::CudaBuffer
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::cuda::CudaHostBuffer
+ :project: arrow_cpp
+ :members:
+
+Memory Input / Output
+=====================
+
+.. doxygenclass:: arrow::cuda::CudaBufferReader
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::cuda::CudaBufferWriter
+ :project: arrow_cpp
+ :members:
+
+IPC
+===
+
+.. doxygenclass:: arrow::cuda::CudaIpcMemHandle
+ :project: arrow_cpp
+ :members:
+
+.. doxygengroup:: cuda-ipc-functions
+ :content-only:
diff --git a/src/arrow/docs/source/cpp/api/dataset.rst b/src/arrow/docs/source/cpp/api/dataset.rst
new file mode 100644
index 000000000..3f0df8a45
--- /dev/null
+++ b/src/arrow/docs/source/cpp/api/dataset.rst
@@ -0,0 +1,71 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+=======
+Dataset
+=======
+
+Interface
+=========
+
+.. doxygenclass:: arrow::dataset::Fragment
+ :members:
+
+.. doxygenclass:: arrow::dataset::Dataset
+ :members:
+
+Partitioning
+============
+
+.. doxygengroup:: dataset-partitioning
+ :content-only:
+ :members:
+
+Dataset discovery/factories
+===========================
+
+.. doxygengroup:: dataset-discovery
+ :content-only:
+ :members:
+
+Scanning
+========
+
+.. doxygengroup:: dataset-scanning
+ :content-only:
+ :members:
+
+Concrete implementations
+========================
+
+.. doxygengroup:: dataset-implementations
+ :content-only:
+ :members:
+
+File System Datasets
+--------------------
+
+.. doxygengroup:: dataset-filesystem
+ :content-only:
+ :members:
+
+File Formats
+------------
+
+.. doxygengroup:: dataset-file-formats
+ :content-only:
+ :members:
diff --git a/src/arrow/docs/source/cpp/api/datatype.rst b/src/arrow/docs/source/cpp/api/datatype.rst
new file mode 100644
index 000000000..2cbe1cf4d
--- /dev/null
+++ b/src/arrow/docs/source/cpp/api/datatype.rst
@@ -0,0 +1,102 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+==========
+Data Types
+==========
+
+.. doxygenenum:: arrow::Type::type
+
+.. doxygenclass:: arrow::DataType
+ :members:
+
+.. _api-type-factories:
+
+Factory functions
+=================
+
+These functions are recommended for creating data types. They may return
+new objects or existing singletons, depending on the type requested.
+
+.. doxygengroup:: type-factories
+ :project: arrow_cpp
+ :content-only:
+
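+For illustration, a minimal sketch of calling a few of these factories
+(all of them part of the factory group documented above)::
+
+   std::shared_ptr<arrow::DataType> int_type = arrow::int32();
+   std::shared_ptr<arrow::DataType> str_type = arrow::utf8();
+   // Parametric types take their parameters as factory arguments
+   std::shared_ptr<arrow::DataType> list_type = arrow::list(arrow::int32());
+   std::shared_ptr<arrow::DataType> ts_type =
+       arrow::timestamp(arrow::TimeUnit::MILLI);
+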
+Concrete type subclasses
+========================
+
+Primitive
+---------
+
+.. doxygenclass:: arrow::NullType
+ :members:
+
+.. doxygenclass:: arrow::BooleanType
+ :members:
+
+.. doxygengroup:: numeric-datatypes
+ :content-only:
+ :members:
+
+Temporal
+--------
+
+.. doxygenenum:: arrow::TimeUnit::type
+
+.. doxygengroup:: temporal-datatypes
+ :content-only:
+ :members:
+
+Binary-like
+-----------
+
+.. doxygengroup:: binary-datatypes
+ :content-only:
+ :members:
+
+Nested
+------
+
+.. doxygengroup:: nested-datatypes
+ :content-only:
+ :members:
+
+Dictionary-encoded
+------------------
+
+.. doxygenclass:: arrow::DictionaryType
+ :members:
+
+Extension types
+---------------
+
+.. doxygenclass:: arrow::ExtensionType
+ :members:
+
+
+Fields and Schemas
+==================
+
+.. doxygengroup:: schema-factories
+ :project: arrow_cpp
+ :content-only:
+
+.. doxygenclass:: arrow::Field
+ :members:
+
+.. doxygenclass:: arrow::Schema
+ :members:
diff --git a/src/arrow/docs/source/cpp/api/filesystem.rst b/src/arrow/docs/source/cpp/api/filesystem.rst
new file mode 100644
index 000000000..02fff9a6c
--- /dev/null
+++ b/src/arrow/docs/source/cpp/api/filesystem.rst
@@ -0,0 +1,64 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+===========
+Filesystems
+===========
+
+Interface
+=========
+
+.. doxygenenum:: arrow::fs::FileType
+
+.. doxygenstruct:: arrow::fs::FileInfo
+ :members:
+
+.. doxygenstruct:: arrow::fs::FileSelector
+ :members:
+
+.. doxygenclass:: arrow::fs::FileSystem
+ :members:
+
+High-level factory function
+===========================
+
+.. doxygengroup:: filesystem-factories
+ :content-only:
+
+Concrete implementations
+========================
+
+.. doxygenclass:: arrow::fs::SubTreeFileSystem
+ :members:
+
+.. doxygenstruct:: arrow::fs::LocalFileSystemOptions
+ :members:
+
+.. doxygenclass:: arrow::fs::LocalFileSystem
+ :members:
+
+.. doxygenstruct:: arrow::fs::S3Options
+ :members:
+
+.. doxygenclass:: arrow::fs::S3FileSystem
+ :members:
+
+.. doxygenstruct:: arrow::fs::HdfsOptions
+ :members:
+
+.. doxygenclass:: arrow::fs::HadoopFileSystem
+ :members:
diff --git a/src/arrow/docs/source/cpp/api/flight.rst b/src/arrow/docs/source/cpp/api/flight.rst
new file mode 100644
index 000000000..7cefd66ef
--- /dev/null
+++ b/src/arrow/docs/source/cpp/api/flight.rst
@@ -0,0 +1,202 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+================
+Arrow Flight RPC
+================
+
+.. note:: Flight is currently unstable. APIs are subject to change,
+ though we don't expect drastic changes.
+
+Common Types
+============
+
+.. doxygenstruct:: arrow::flight::Action
+ :project: arrow_cpp
+ :members:
+
+.. doxygenstruct:: arrow::flight::ActionType
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::AddCallHeaders
+ :project: arrow_cpp
+ :members:
+
+.. doxygenstruct:: arrow::flight::CallInfo
+ :project: arrow_cpp
+ :members:
+
+.. doxygenstruct:: arrow::flight::Criteria
+ :project: arrow_cpp
+ :members:
+
+.. doxygenstruct:: arrow::flight::FlightDescriptor
+ :project: arrow_cpp
+ :members:
+
+.. doxygenstruct:: arrow::flight::FlightEndpoint
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::FlightInfo
+ :project: arrow_cpp
+ :members:
+
+.. doxygenstruct:: arrow::flight::FlightPayload
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::FlightListing
+ :project: arrow_cpp
+ :members:
+
+.. doxygenenum:: arrow::flight::FlightMethod
+ :project: arrow_cpp
+
+.. doxygenstruct:: arrow::flight::Location
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::MetadataRecordBatchReader
+ :project: arrow_cpp
+ :members:
+
+.. doxygenstruct:: arrow::flight::Result
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::ResultStream
+ :project: arrow_cpp
+ :members:
+
+.. doxygenstruct:: arrow::flight::Ticket
+ :project: arrow_cpp
+ :members:
+
+Clients
+=======
+
+.. doxygenclass:: arrow::flight::FlightClient
+ :project: arrow_cpp
+ :members:
+
+.. doxygenstruct:: arrow::flight::FlightClientOptions
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::FlightCallOptions
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::ClientAuthHandler
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::ClientMiddleware
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::ClientMiddlewareFactory
+ :project: arrow_cpp
+ :members:
+
+.. doxygentypedef:: arrow::flight::TimeoutDuration
+ :project: arrow_cpp
+
+.. doxygenclass:: arrow::flight::FlightStreamReader
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::FlightStreamWriter
+ :project: arrow_cpp
+ :members:
+
+Servers
+=======
+
+.. doxygenclass:: arrow::flight::FlightServerBase
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::FlightServerOptions
+ :project: arrow_cpp
+ :members:
+
+.. doxygenstruct:: arrow::flight::CertKeyPair
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::FlightDataStream
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::FlightMessageReader
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::FlightMetadataWriter
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::RecordBatchStream
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::ServerAuthHandler
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::ServerCallContext
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::ServerMiddleware
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::ServerMiddlewareFactory
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::SimpleFlightListing
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::flight::SimpleResultStream
+ :project: arrow_cpp
+ :members:
+
+Error Handling
+==============
+
+Error handling uses the normal :class:`arrow::Status` class, combined
+with a custom :class:`arrow::StatusDetail` object for Flight-specific
+error codes.
+
+.. doxygenenum:: arrow::flight::FlightStatusCode
+ :project: arrow_cpp
+
+.. doxygenclass:: arrow::flight::FlightStatusDetail
+ :project: arrow_cpp
+ :members:
+
+.. doxygenfunction:: arrow::flight::MakeFlightError
+ :project: arrow_cpp
diff --git a/src/arrow/docs/source/cpp/api/formats.rst b/src/arrow/docs/source/cpp/api/formats.rst
new file mode 100644
index 000000000..2f6b24802
--- /dev/null
+++ b/src/arrow/docs/source/cpp/api/formats.rst
@@ -0,0 +1,109 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+============
+File Formats
+============
+
+.. _cpp-api-csv:
+
+CSV
+===
+
+.. doxygenstruct:: arrow::csv::ConvertOptions
+ :members:
+
+.. doxygenstruct:: arrow::csv::ParseOptions
+ :members:
+
+.. doxygenstruct:: arrow::csv::ReadOptions
+ :members:
+
+.. doxygenstruct:: arrow::csv::WriteOptions
+ :members:
+
+.. doxygenclass:: arrow::csv::TableReader
+ :members:
+
+.. doxygenfunction:: arrow::csv::MakeCSVWriter(io::OutputStream *, const std::shared_ptr<Schema>&, const WriteOptions&)
+
+.. doxygenfunction:: arrow::csv::MakeCSVWriter(std::shared_ptr<io::OutputStream>, const std::shared_ptr<Schema>&, const WriteOptions&)
+
+.. doxygenfunction:: arrow::csv::WriteCSV(const RecordBatch&, const WriteOptions&, arrow::io::OutputStream *)
+
+.. doxygenfunction:: arrow::csv::WriteCSV(const Table&, const WriteOptions&, arrow::io::OutputStream *)
+
+.. _cpp-api-json:
+
+Line-separated JSON
+===================
+
+.. doxygenenum:: arrow::json::UnexpectedFieldBehavior
+
+.. doxygenstruct:: arrow::json::ReadOptions
+ :members:
+
+.. doxygenstruct:: arrow::json::ParseOptions
+ :members:
+
+.. doxygenclass:: arrow::json::TableReader
+ :members:
+
+.. _cpp-api-parquet:
+
+Parquet reader
+==============
+
+.. doxygenclass:: parquet::ReaderProperties
+ :members:
+
+.. doxygenclass:: parquet::ArrowReaderProperties
+ :members:
+
+.. doxygenclass:: parquet::ParquetFileReader
+ :members:
+
+.. doxygenclass:: parquet::arrow::FileReader
+ :members:
+
+.. doxygenclass:: parquet::arrow::FileReaderBuilder
+ :members:
+
+.. doxygengroup:: parquet-arrow-reader-factories
+ :content-only:
+
+.. doxygenclass:: parquet::StreamReader
+ :members:
+
+Parquet writer
+==============
+
+.. doxygenclass:: parquet::WriterProperties
+ :members:
+
+.. doxygenclass:: parquet::ArrowWriterProperties
+ :members:
+
+.. doxygenclass:: parquet::arrow::FileWriter
+ :members:
+
+.. doxygenfunction:: parquet::arrow::WriteTable
+
+.. doxygenclass:: parquet::StreamWriter
+ :members:
+
+.. TODO ORC
diff --git a/src/arrow/docs/source/cpp/api/io.rst b/src/arrow/docs/source/cpp/api/io.rst
new file mode 100644
index 000000000..735136a0d
--- /dev/null
+++ b/src/arrow/docs/source/cpp/api/io.rst
@@ -0,0 +1,95 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+==============
+Input / output
+==============
+
+Interfaces
+==========
+
+.. doxygenclass:: arrow::io::FileInterface
+ :members:
+
+.. doxygenclass:: arrow::io::Readable
+ :members:
+
+.. doxygenclass:: arrow::io::Seekable
+ :members:
+
+.. doxygenclass:: arrow::io::Writable
+ :members:
+
+.. doxygenclass:: arrow::io::InputStream
+ :members:
+
+.. doxygenclass:: arrow::io::RandomAccessFile
+ :members:
+
+.. doxygenclass:: arrow::io::OutputStream
+ :members:
+
+.. doxygenclass:: arrow::io::ReadWriteFileInterface
+ :members:
+
+Concrete implementations
+========================
+
+In-memory streams
+-----------------
+
+.. doxygenclass:: arrow::io::BufferReader
+ :members:
+
+.. doxygenclass:: arrow::io::MockOutputStream
+ :members:
+
+.. doxygenclass:: arrow::io::BufferOutputStream
+ :members:
+
+.. doxygenclass:: arrow::io::FixedSizeBufferWriter
+ :members:
+
+Local files
+-----------
+
+.. doxygenclass:: arrow::io::ReadableFile
+ :members:
+
+.. doxygenclass:: arrow::io::FileOutputStream
+ :members:
+
+.. doxygenclass:: arrow::io::MemoryMappedFile
+ :members:
+
+Buffering input / output wrappers
+---------------------------------
+
+.. doxygenclass:: arrow::io::BufferedInputStream
+ :members:
+
+.. doxygenclass:: arrow::io::BufferedOutputStream
+ :members:
+
+Compressed input / output wrappers
+----------------------------------
+
+.. doxygenclass:: arrow::io::CompressedInputStream
+ :members:
+
+.. doxygenclass:: arrow::io::CompressedOutputStream
+ :members:
diff --git a/src/arrow/docs/source/cpp/api/ipc.rst b/src/arrow/docs/source/cpp/api/ipc.rst
new file mode 100644
index 000000000..6822b986a
--- /dev/null
+++ b/src/arrow/docs/source/cpp/api/ipc.rst
@@ -0,0 +1,90 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+=========
+Arrow IPC
+=========
+
+IPC options
+===========
+
+.. doxygenstruct:: arrow::ipc::IpcReadOptions
+ :members:
+
+.. doxygenstruct:: arrow::ipc::IpcWriteOptions
+ :members:
+
+Reading IPC streams and files
+=============================
+
+Blocking API
+------------
+
+Use either of these two classes, depending on which IPC format you want
+to read. The file format requires a random-access file, while the stream
+format only requires a sequential input stream.
+
+.. doxygenclass:: arrow::ipc::RecordBatchStreamReader
+ :members:
+
+.. doxygenclass:: arrow::ipc::RecordBatchFileReader
+ :members:
+
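+For illustration, a minimal sketch of reading the stream format (assuming
+``input`` is a ``std::shared_ptr<arrow::io::InputStream>`` you have already
+opened, and that this code runs inside a function returning
+:class:`arrow::Status`)::
+
+   ARROW_ASSIGN_OR_RAISE(auto reader,
+                         arrow::ipc::RecordBatchStreamReader::Open(input));
+   std::shared_ptr<arrow::RecordBatch> batch;
+   while (true) {
+     ARROW_RETURN_NOT_OK(reader->ReadNext(&batch));
+     if (batch == nullptr) break;  // end of stream reached
+     // ... process batch
+   }
+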
+Event-driven API
+----------------
+
+To read an IPC stream in event-driven fashion, you must implement a
+:class:`~arrow::ipc::Listener` subclass that you will pass to
+:class:`~arrow::ipc::StreamDecoder`.
+
+.. doxygenclass:: arrow::ipc::Listener
+ :members:
+
+.. doxygenclass:: arrow::ipc::StreamDecoder
+ :members:
+
+Statistics
+----------
+
+.. doxygenstruct:: arrow::ipc::ReadStats
+ :members:
+
+Writing IPC streams and files
+=============================
+
+Blocking API
+------------
+
+The IPC stream format is only optionally terminated, whereas the IPC file format
+must include a terminating footer. Thus a writer of the IPC file format must be
+explicitly finalized with :func:`~arrow::ipc::RecordBatchWriter::Close()` or the resulting
+file will be corrupt.
+
+.. doxygengroup:: record-batch-writer-factories
+ :content-only:
+
+.. doxygenclass:: arrow::ipc::RecordBatchWriter
+ :members:
+
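+For illustration, a minimal sketch of writing a table in the file format
+(assuming ``sink`` is a ``std::shared_ptr<arrow::io::OutputStream>`` and
+``table`` a ``std::shared_ptr<arrow::Table>`` you already have, inside a
+function returning :class:`arrow::Status`)::
+
+   ARROW_ASSIGN_OR_RAISE(auto writer,
+                         arrow::ipc::MakeFileWriter(sink, table->schema()));
+   ARROW_RETURN_NOT_OK(writer->WriteTable(*table));
+   // Close() writes the terminating footer; without it the file is corrupt
+   ARROW_RETURN_NOT_OK(writer->Close());
+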
+Statistics
+----------
+
+.. doxygenstruct:: arrow::ipc::WriteStats
+ :members:
diff --git a/src/arrow/docs/source/cpp/api/memory.rst b/src/arrow/docs/source/cpp/api/memory.rst
new file mode 100644
index 000000000..807a4e2f7
--- /dev/null
+++ b/src/arrow/docs/source/cpp/api/memory.rst
@@ -0,0 +1,124 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Memory (management)
+===================
+
+Devices
+-------
+
+.. doxygenclass:: arrow::Device
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::CPUDevice
+ :project: arrow_cpp
+ :members:
+
+.. doxygenfunction:: arrow::default_cpu_memory_manager
+ :project: arrow_cpp
+
+Memory Managers
+---------------
+
+.. doxygenclass:: arrow::MemoryManager
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::CPUMemoryManager
+ :project: arrow_cpp
+ :members:
+
+Buffers
+-------
+
+.. doxygenclass:: arrow::Buffer
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::MutableBuffer
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::ResizableBuffer
+ :project: arrow_cpp
+ :members:
+
+Memory Pools
+------------
+
+.. doxygenfunction:: arrow::default_memory_pool
+ :project: arrow_cpp
+
+.. doxygenfunction:: arrow::jemalloc_memory_pool
+ :project: arrow_cpp
+
+.. doxygenfunction:: arrow::mimalloc_memory_pool
+ :project: arrow_cpp
+
+.. doxygenfunction:: arrow::system_memory_pool
+ :project: arrow_cpp
+
+.. doxygenclass:: arrow::MemoryPool
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::LoggingMemoryPool
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::ProxyMemoryPool
+ :project: arrow_cpp
+ :members:
+
+Allocation Functions
+--------------------
+
+These functions allocate a buffer from a particular memory pool.
+
+.. doxygengroup:: buffer-allocation-functions
+ :project: arrow_cpp
+ :content-only:
+
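+For illustration, a minimal sketch allocating from the default memory pool
+(using ``ARROW_ASSIGN_OR_RAISE`` inside a function returning
+``arrow::Status``)::
+
+   ARROW_ASSIGN_OR_RAISE(std::unique_ptr<arrow::Buffer> buffer,
+                         arrow::AllocateBuffer(4096));
+   // The allocated memory is owned by the buffer and released with it
+   uint8_t* data = buffer->mutable_data();
+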
+Slicing
+-------
+
+.. doxygengroup:: buffer-slicing-functions
+ :project: arrow_cpp
+ :content-only:
+
+Buffer Builders
+---------------
+
+.. doxygenclass:: arrow::BufferBuilder
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::TypedBufferBuilder
+ :project: arrow_cpp
+ :members:
+
+STL Integration
+---------------
+
+.. doxygenclass:: arrow::stl::allocator
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::stl::STLMemoryPool
+ :project: arrow_cpp
+ :members:
diff --git a/src/arrow/docs/source/cpp/api/scalar.rst b/src/arrow/docs/source/cpp/api/scalar.rst
new file mode 100644
index 000000000..391c9d57b
--- /dev/null
+++ b/src/arrow/docs/source/cpp/api/scalar.rst
@@ -0,0 +1,38 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+=======
+Scalars
+=======
+
+.. doxygenstruct:: arrow::Scalar
+ :project: arrow_cpp
+ :members:
+
+Factory functions
+=================
+
+.. doxygengroup:: scalar-factories
+ :content-only:
+
+Concrete scalar subclasses
+==========================
+
+.. doxygengroup:: concrete-scalar-classes
+ :content-only:
+ :members:
+ :undoc-members:
diff --git a/src/arrow/docs/source/cpp/api/support.rst b/src/arrow/docs/source/cpp/api/support.rst
new file mode 100644
index 000000000..c3310e5d8
--- /dev/null
+++ b/src/arrow/docs/source/cpp/api/support.rst
@@ -0,0 +1,57 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+===================
+Programming Support
+===================
+
+General information
+-------------------
+
+.. doxygenfunction:: arrow::GetBuildInfo
+ :project: arrow_cpp
+
+.. doxygenstruct:: arrow::BuildInfo
+ :project: arrow_cpp
+ :members:
+
+Error return and reporting
+--------------------------
+
+.. doxygenclass:: arrow::Status
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::StatusDetail
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::Result
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: parquet::ParquetException
+ :project: arrow_cpp
+ :members:
+
+.. doxygendefine:: ARROW_RETURN_NOT_OK
+
+.. doxygendefine:: ARROW_ASSIGN_OR_RAISE
+
+.. doxygendefine:: PARQUET_THROW_NOT_OK
+
+.. doxygendefine:: PARQUET_ASSIGN_OR_THROW
diff --git a/src/arrow/docs/source/cpp/api/table.rst b/src/arrow/docs/source/cpp/api/table.rst
new file mode 100644
index 000000000..53e2d72e6
--- /dev/null
+++ b/src/arrow/docs/source/cpp/api/table.rst
@@ -0,0 +1,45 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+========================
+Two-dimensional Datasets
+========================
+
+Record Batches
+==============
+
+.. doxygenclass:: arrow::RecordBatch
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::RecordBatchReader
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::TableBatchReader
+ :project: arrow_cpp
+ :members:
+
+Tables
+======
+
+.. doxygenclass:: arrow::Table
+ :project: arrow_cpp
+ :members:
+
+.. doxygenfunction:: arrow::ConcatenateTables
+ :project: arrow_cpp
diff --git a/src/arrow/docs/source/cpp/api/tensor.rst b/src/arrow/docs/source/cpp/api/tensor.rst
new file mode 100644
index 000000000..1d51786db
--- /dev/null
+++ b/src/arrow/docs/source/cpp/api/tensor.rst
@@ -0,0 +1,57 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+=======
+Tensors
+=======
+
+Dense Tensors
+=============
+
+.. doxygenclass:: arrow::Tensor
+ :members:
+
+.. doxygenclass:: arrow::NumericTensor
+ :members:
+
+Sparse Tensors
+==============
+
+.. doxygenenum:: arrow::SparseTensorFormat::type
+
+.. doxygenclass:: arrow::SparseIndex
+ :members:
+
+.. doxygenclass:: arrow::SparseCOOIndex
+ :members:
+
+.. doxygenclass:: arrow::SparseCSRIndex
+ :members:
+
+.. doxygenclass:: arrow::SparseTensor
+ :members:
+
+.. doxygenclass:: arrow::SparseTensorImpl
+ :members:
+
+.. doxygentypedef:: arrow::SparseCOOTensor
+
+.. doxygentypedef:: arrow::SparseCSCMatrix
+
+.. doxygentypedef:: arrow::SparseCSFTensor
+
+.. doxygentypedef:: arrow::SparseCSRMatrix
diff --git a/src/arrow/docs/source/cpp/api/utilities.rst b/src/arrow/docs/source/cpp/api/utilities.rst
new file mode 100644
index 000000000..87c5a3bbe
--- /dev/null
+++ b/src/arrow/docs/source/cpp/api/utilities.rst
@@ -0,0 +1,52 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+=========
+Utilities
+=========
+
+Decimal Numbers
+===============
+
+.. doxygenclass:: arrow::Decimal128
+ :project: arrow_cpp
+ :members:
+
+Abstract Sequences
+==================
+
+.. doxygenclass:: arrow::Iterator
+ :project: arrow_cpp
+ :members:
+
+.. doxygenclass:: arrow::VectorIterator
+ :project: arrow_cpp
+ :members:
+
+Compression
+===========
+
+.. doxygenenum:: arrow::Compression::type
+
+.. doxygenclass:: arrow::util::Codec
+ :members:
+
+.. doxygenclass:: arrow::util::Compressor
+ :members:
+
+.. doxygenclass:: arrow::util::Decompressor
+ :members:
diff --git a/src/arrow/docs/source/cpp/arrays.rst b/src/arrow/docs/source/cpp/arrays.rst
new file mode 100644
index 000000000..ff76e9d02
--- /dev/null
+++ b/src/arrow/docs/source/cpp/arrays.rst
@@ -0,0 +1,225 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+======
+Arrays
+======
+
+.. seealso::
+ :doc:`Array API reference <api/array>`
+
+The central type in Arrow is the class :class:`arrow::Array`. An array
+represents a known-length sequence of values all having the same type.
+Internally, those values are represented by one or several buffers, the
+number and meaning of which depend on the array's data type, as documented
+in :ref:`the Arrow data layout specification <format_layout>`.
+
+Those buffers consist of the value data itself and an optional bitmap buffer
+that indicates which array entries are null values. The bitmap buffer
+can be entirely omitted if the array is known to have zero null values.
+
+There are concrete subclasses of :class:`arrow::Array` for each data type
+that help you access individual values of the array.
+
+Building an array
+=================
+
+Available strategies
+--------------------
+
+As Arrow objects are immutable, they cannot be populated directly in the way
+a ``std::vector`` can. Instead, several strategies can be used:
+
+* if the data already exists in memory with the right layout, you can wrap
+  said memory inside :class:`arrow::Buffer` instances and then construct
+  a :class:`arrow::ArrayData` describing the array;
+
+ .. seealso:: :ref:`cpp_memory_management`
+
+* otherwise, the :class:`arrow::ArrayBuilder` base class and its concrete
+ subclasses help building up array data incrementally, without having to
+ deal with details of the Arrow format yourself.
+
+Using ArrayBuilder and its subclasses
+-------------------------------------
+
+To build an ``Int64`` Arrow array, we can use the :class:`arrow::Int64Builder`
+class. In the following example, we build an array of the range 1 to 8 where
+the element that should hold the value 4 is nulled::
+
+ arrow::Int64Builder builder;
+ builder.Append(1);
+ builder.Append(2);
+ builder.Append(3);
+ builder.AppendNull();
+ builder.Append(5);
+ builder.Append(6);
+ builder.Append(7);
+ builder.Append(8);
+
+ auto maybe_array = builder.Finish();
+ if (!maybe_array.ok()) {
+ // ... do something on array building failure
+ }
+ std::shared_ptr<arrow::Array> array = *maybe_array;
+
+The resulting Array (which can be cast to the concrete :class:`arrow::Int64Array`
+subclass if you want to access its values) then consists of two
+:class:`arrow::Buffer`\s.
+The first buffer holds the null bitmap, which consists here of a single byte with
+the bits ``1|1|1|1|0|1|1|1``. As we use `least-significant bit (LSB) numbering`_,
+this indicates that the fourth entry in the array is null. The second
+buffer is simply an ``int64_t`` array containing all the above values.
+As the fourth entry is null, the value at that position in the buffer is
+undefined.
+
+Here is how you could access the concrete array's contents::
+
+ // Cast the Array to its actual type to access its data
+ auto int64_array = std::static_pointer_cast<arrow::Int64Array>(array);
+
+ // Get the pointer to the null bitmap.
+ const uint8_t* null_bitmap = int64_array->null_bitmap_data();
+
+ // Get the pointer to the actual data
+ const int64_t* data = int64_array->raw_values();
+
+ // Alternatively, given an array index, query its null bit and value directly
+ int64_t index = 2;
+ if (!int64_array->IsNull(index)) {
+ int64_t value = int64_array->Value(index);
+ }
+
+.. note::
+ :class:`arrow::Int64Array` (respectively :class:`arrow::Int64Builder`) is
+ just a ``typedef``, provided for convenience, of ``arrow::NumericArray<Int64Type>``
+ (respectively ``arrow::NumericBuilder<Int64Type>``).
+
+.. _least-significant bit (LSB) numbering: https://en.wikipedia.org/wiki/Bit_numbering
+
+Performance
+-----------
+
+While it is possible to build an array value-by-value as in the example above,
+to attain the highest performance it is recommended to use the bulk appending
+methods (usually named ``AppendValues``) in the concrete :class:`arrow::ArrayBuilder`
+subclasses.
+
+If you know the number of elements in advance, it is also recommended to
+presize the working area by calling the :func:`~arrow::ArrayBuilder::Resize`
+or :func:`~arrow::ArrayBuilder::Reserve` methods.
+
+Here is how one could rewrite the above example to take advantage of those
+APIs::
+
+ arrow::Int64Builder builder;
+ // Make place for 8 values in total
+ builder.Reserve(8);
+ // Bulk append the given values (with a null in 4th place as indicated by the
+ // validity vector)
+ std::vector<bool> validity = {true, true, true, false, true, true, true, true};
+ std::vector<int64_t> values = {1, 2, 3, 0, 5, 6, 7, 8};
+ builder.AppendValues(values, validity);
+
+ auto maybe_array = builder.Finish();
+
+If you still must append values one by one, some concrete builder subclasses
+have methods marked "Unsafe" that assume the working area has been correctly
+presized, and offer higher performance in exchange::
+
+ arrow::Int64Builder builder;
+ // Make place for 8 values in total
+ builder.Reserve(8);
+ builder.UnsafeAppend(1);
+ builder.UnsafeAppend(2);
+ builder.UnsafeAppend(3);
+ builder.UnsafeAppendNull();
+ builder.UnsafeAppend(5);
+ builder.UnsafeAppend(6);
+ builder.UnsafeAppend(7);
+ builder.UnsafeAppend(8);
+
+ auto maybe_array = builder.Finish();
+
+Size Limitations and Recommendations
+====================================
+
+Some array types are structurally limited to 32-bit sizes. This is the case
+at least for list arrays (which can hold up to 2^31 elements), and for string
+and binary arrays (which can hold up to 2GB of binary data). Some other array
+types can hold up to 2^63 elements in the C++ implementation, but other Arrow
+implementations can have a 32-bit size limitation for those array types as well.
+
+For these reasons, it is recommended that huge data be chunked in subsets of
+more reasonable size.
+
+Chunked Arrays
+==============
+
+A :class:`arrow::ChunkedArray` is, like an array, a logical sequence of values;
+but unlike a simple array, a chunked array does not require the entire sequence
+to be physically contiguous in memory. Also, the constituents of a chunked array
+need not have the same size, but they must all have the same data type.
+
+A chunked array is constructed by aggregating any number of arrays. Here we'll
+build a chunked array with the same logical values as in the example above,
+but in two separate chunks::
+
+ std::vector<std::shared_ptr<arrow::Array>> chunks;
+ std::shared_ptr<arrow::Array> array;
+
+ // Build first chunk
+ arrow::Int64Builder builder;
+ builder.Append(1);
+ builder.Append(2);
+ builder.Append(3);
+ if (!builder.Finish(&array).ok()) {
+ // ... do something on array building failure
+ }
+ chunks.push_back(std::move(array));
+
+ // Build second chunk
+ builder.Reset();
+ builder.AppendNull();
+ builder.Append(5);
+ builder.Append(6);
+ builder.Append(7);
+ builder.Append(8);
+ if (!builder.Finish(&array).ok()) {
+ // ... do something on array building failure
+ }
+ chunks.push_back(std::move(array));
+
+ auto chunked_array = std::make_shared<arrow::ChunkedArray>(std::move(chunks));
+
+ assert(chunked_array->num_chunks() == 2);
+ // Logical length in number of values
+ assert(chunked_array->length() == 8);
+ assert(chunked_array->null_count() == 1);
+
+Slicing
+=======
+
+As with physical memory buffers, it is possible to make zero-copy slices
+of arrays and chunked arrays, to obtain an array or chunked array referring
+to some logical subsequence of the data. This is done by calling the
+:func:`arrow::Array::Slice` and :func:`arrow::ChunkedArray::Slice` methods,
+respectively.
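+
+For example, continuing with the ``array`` built above, here is a minimal
+sketch slicing out a zero-copy view of four of its values::
+
+   // View of the values at positions 2 to 5 (offset 2, length 4); no data is copied
+   std::shared_ptr<arrow::Array> middle = array->Slice(2, 4);
+   assert(middle->length() == 4);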
+
diff --git a/src/arrow/docs/source/cpp/build_system.rst b/src/arrow/docs/source/cpp/build_system.rst
new file mode 100644
index 000000000..c0d05e9da
--- /dev/null
+++ b/src/arrow/docs/source/cpp/build_system.rst
@@ -0,0 +1,136 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+===================================
+Using Arrow C++ in your own project
+===================================
+
+This section assumes you already have the Arrow C++ libraries on your
+system, either after installing them using a package manager or after
+:ref:`building them yourself <building-arrow-cpp>`.
+
+The recommended way to integrate the Arrow C++ libraries in your own
+C++ project is to use CMake's `find_package
+<https://cmake.org/cmake/help/latest/command/find_package.html>`_
+function for locating and integrating dependencies. If you don't use
+CMake as a build system, you can use `pkg-config
+<https://www.freedesktop.org/wiki/Software/pkg-config/>`_ to find the
+installed Arrow C++ libraries.
+
+CMake
+=====
+
+Basic usage
+-----------
+
+This minimal ``CMakeLists.txt`` file compiles a ``my_example.cc`` source
+file into an executable linked with the Arrow C++ shared library:
+
+.. code-block:: cmake
+
+ project(MyExample)
+
+ find_package(Arrow REQUIRED)
+
+ add_executable(my_example my_example.cc)
+ target_link_libraries(my_example PRIVATE arrow_shared)
+
+Available variables and targets
+-------------------------------
+
+The directive ``find_package(Arrow REQUIRED)`` asks CMake to find an Arrow
+C++ installation on your system. When it returns, it will have set a few
+CMake variables:
+
+* ``${Arrow_FOUND}`` is true if the Arrow C++ libraries have been found
+* ``${ARROW_VERSION}`` contains the Arrow version string
+* ``${ARROW_FULL_SO_VERSION}`` contains the Arrow DLL version string
+
+In addition, it will have created some targets that you can link against
+(note these are plain strings, not variables):
+
+* ``arrow_shared`` links to the Arrow shared libraries
+* ``arrow_static`` links to the Arrow static libraries
+
+In most cases, it is recommended to use the Arrow shared libraries.
+
+.. note::
+ CMake is case-sensitive. The names and variables listed above have to be
+ spelt exactly that way!
+
+.. seealso::
+ A Docker-based :doc:`minimal build example <examples/cmake_minimal_build>`.
+
+pkg-config
+==========
+
+Basic usage
+-----------
+
+You can get suitable build flags by running the following command:
+
+.. code-block:: shell
+
+ pkg-config --cflags --libs arrow
+
+If you want to link against the Arrow C++ static library, you need to add
+the ``--static`` option:
+
+.. code-block:: shell
+
+ pkg-config --cflags --libs --static arrow
+
+This minimal ``Makefile`` file compiles a ``my_example.cc`` source
+file into an executable linked with the Arrow C++ shared library:
+
+.. code-block:: makefile
+
+ my_example: my_example.cc
+ $(CXX) -o $@ $(CXXFLAGS) $< $$(pkg-config --cflags --libs arrow)
+
+Many build systems support pkg-config. For example:
+
+ * `GNU Autotools <https://people.freedesktop.org/~dbn/pkg-config-guide.html#using>`_
+ * `CMake <https://cmake.org/cmake/help/latest/module/FindPkgConfig.html>`_
+ (But you should use ``find_package(Arrow)`` instead.)
+ * `Meson <https://mesonbuild.com/Reference-manual.html#dependency>`_
+
+Available packages
+------------------
+
+Arrow C++ provides a pkg-config package for each module. Here are all the
+available packages:
+
+ * ``arrow-csv``
+ * ``arrow-cuda``
+ * ``arrow-dataset``
+ * ``arrow-filesystem``
+ * ``arrow-flight-testing``
+ * ``arrow-flight``
+ * ``arrow-json``
+ * ``arrow-orc``
+ * ``arrow-python-flight``
+ * ``arrow-python``
+ * ``arrow-tensorflow``
+ * ``arrow-testing``
+ * ``arrow``
+ * ``gandiva``
+ * ``parquet``
+ * ``plasma``
diff --git a/src/arrow/docs/source/cpp/compute.rst b/src/arrow/docs/source/cpp/compute.rst
new file mode 100644
index 000000000..dd5696020
--- /dev/null
+++ b/src/arrow/docs/source/cpp/compute.rst
@@ -0,0 +1,1606 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+.. cpp:namespace:: arrow::compute
+
+=================
+Compute Functions
+=================
+
+The generic Compute API
+=======================
+
+.. TODO: describe API and how to invoke compute functions
+
+Functions and function registry
+-------------------------------
+
+Functions represent compute operations over inputs of possibly varying
+types. Internally, a function is implemented by one or several
+"kernels", depending on the concrete input types (for example, a function
+adding values from two inputs can have different kernels depending on
+whether the inputs are integral or floating-point).
+
+Functions are stored in a global :class:`FunctionRegistry` where
+they can be looked up by name.
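+
+For illustration, a minimal sketch of such a lookup (assuming a default build
+where the ``add`` function is registered)::
+
+   arrow::compute::FunctionRegistry* registry =
+       arrow::compute::GetFunctionRegistry();
+   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::compute::Function> add_func,
+                         registry->GetFunction("add"));
+   // The function can then be invoked through the generic Compute API below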
+
+Input shapes
+------------
+
+Computation inputs are represented as a general :class:`Datum` class,
+which is a tagged union of several shapes of data such as :class:`Scalar`,
+:class:`Array` and :class:`ChunkedArray`. Many compute functions support
+both array (chunked or not) and scalar inputs, but some will mandate one
+or the other. For example, ``sort_indices`` requires its first and only
+input to be an array.
+
+.. _invoking-compute-functions:
+
+Invoking functions
+------------------
+
+Compute functions can be invoked by name using
+:func:`arrow::compute::CallFunction`::
+
+ std::shared_ptr<arrow::Array> numbers_array = ...;
+ std::shared_ptr<arrow::Scalar> increment = ...;
+ arrow::Datum incremented_datum;
+
+ ARROW_ASSIGN_OR_RAISE(incremented_datum,
+ arrow::compute::CallFunction("add", {numbers_array, increment}));
+ std::shared_ptr<Array> incremented_array = std::move(incremented_datum).array();
+
+(note this example uses implicit conversion from ``std::shared_ptr<Array>``
+to ``Datum``)
+
+Many compute functions are also available directly as concrete APIs, here
+:func:`arrow::compute::Add`::
+
+ std::shared_ptr<arrow::Array> numbers_array = ...;
+ std::shared_ptr<arrow::Scalar> increment = ...;
+ arrow::Datum incremented_datum;
+
+ ARROW_ASSIGN_OR_RAISE(incremented_datum,
+ arrow::compute::Add(numbers_array, increment));
+ std::shared_ptr<Array> incremented_array = std::move(incremented_datum).array();
+
+Some functions accept or require an options structure that determines the
+exact semantics of the function::
+
+ ScalarAggregateOptions scalar_aggregate_options;
+ scalar_aggregate_options.skip_nulls = false;
+
+ std::shared_ptr<arrow::Array> array = ...;
+ arrow::Datum min_max;
+
+ ARROW_ASSIGN_OR_RAISE(min_max,
+ arrow::compute::CallFunction("min_max", {array},
+ &scalar_aggregate_options));
+
+ // Unpack struct scalar result (a two-field {"min", "max"} scalar)
+ std::shared_ptr<arrow::Scalar> min_value, max_value;
+ min_value = min_max.scalar_as<arrow::StructScalar>().value[0];
+ max_value = min_max.scalar_as<arrow::StructScalar>().value[1];
+
+.. seealso::
+ :doc:`Compute API reference <api/compute>`
+
+Implicit casts
+==============
+
+Functions may require conversion of their arguments before execution if a
+kernel does not match the argument types precisely. For example, comparison
+of dictionary-encoded arrays is not directly supported by any kernel, but an
+implicit cast can be made allowing comparison against the decoded array.
+
+Each function may define implicit cast behaviour as appropriate. For example,
+comparison and arithmetic kernels require identically typed arguments, and
+support execution against differing numeric types by promoting their arguments
+to a numeric type which can accommodate any value from either input.
+
+.. _common-numeric-type:
+
+Common numeric type
+-------------------
+
+The common numeric type of a set of input numeric types is the smallest numeric
+type which can accommodate any value of any input. If any input is a floating
+point type the common numeric type is the widest floating point type among the
+inputs. Otherwise the common numeric type is integral and is signed if any input
+is signed. For example:
+
++-------------------+----------------------+------------------------------------------------+
+| Input types | Common numeric type | Notes |
++===================+======================+================================================+
+| int32, int32 | int32 | |
++-------------------+----------------------+------------------------------------------------+
+| int16, int32 | int32 | Max width is 32, promote LHS to int32 |
++-------------------+----------------------+------------------------------------------------+
+| uint16, int32 | int32 | One input signed, override unsigned |
++-------------------+----------------------+------------------------------------------------+
+| uint32, int32 | int64 | Widen to accommodate range of uint32 |
++-------------------+----------------------+------------------------------------------------+
+| uint16, uint32 | uint32 | All inputs unsigned, maintain unsigned |
++-------------------+----------------------+------------------------------------------------+
+| int16, uint32 | int64 | |
++-------------------+----------------------+------------------------------------------------+
+| uint64, int16 | int64 | int64 cannot accommodate all uint64 values |
++-------------------+----------------------+------------------------------------------------+
+| float32, int32 | float32 | Promote RHS to float32 |
++-------------------+----------------------+------------------------------------------------+
+| float32, float64 | float64 | |
++-------------------+----------------------+------------------------------------------------+
+| float32, int64 | float32 | int64 is wider, still promotes to float32 |
++-------------------+----------------------+------------------------------------------------+
+
+In particular, note that comparing a ``uint64`` column to an ``int16`` column
+may emit an error if one of the ``uint64`` values cannot be expressed as the
+common type ``int64`` (for example, ``2 ** 63``).
+
+.. _compute-function-list:
+
+Available functions
+===================
+
+Type categories
+---------------
+
+To avoid exhaustively listing supported types, the tables below use a number
+of general type categories:
+
+* "Numeric": Integer types (Int8, etc.) and Floating-point types (Float32,
+ Float64, sometimes Float16). Some functions also accept Decimal128 and
+ Decimal256 input.
+
+* "Temporal": Date types (Date32, Date64), Time types (Time32, Time64),
+ Timestamp, Duration, Interval.
+
+* "Binary-like": Binary, LargeBinary, sometimes also FixedSizeBinary.
+
+* "String-like": String, LargeString.
+
+* "List-like": List, LargeList, sometimes also FixedSizeList.
+
+* "Nested": List-likes (including FixedSizeList), Struct, Union, and
+ related types like Map.
+
+If you are unsure whether a function supports a concrete input type, we
+recommend you try it out. Unsupported input types return a ``TypeError``
+:class:`Status`.
+
+Aggregations
+------------
+
+Scalar aggregations operate on a (chunked) array or scalar value and reduce
+the input to a single output value.
+
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| Function name | Arity | Input types | Output type | Options class | Notes |
++====================+=======+==================+========================+==================================+=======+
+| all | Unary | Boolean | Scalar Boolean | :struct:`ScalarAggregateOptions` | \(1) |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| any | Unary | Boolean | Scalar Boolean | :struct:`ScalarAggregateOptions` | \(1) |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| approximate_median | Unary | Numeric | Scalar Float64 | :struct:`ScalarAggregateOptions` | |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| count | Unary | Any | Scalar Int64 | :struct:`CountOptions` | \(2) |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| count_distinct | Unary | Non-nested types | Scalar Int64 | :struct:`CountOptions` | \(2) |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| index | Unary | Any | Scalar Int64 | :struct:`IndexOptions` | |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| max | Unary | Non-nested types | Scalar Input type | :struct:`ScalarAggregateOptions` | |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| mean | Unary | Numeric | Scalar Decimal/Float64 | :struct:`ScalarAggregateOptions` | |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| min | Unary | Non-nested types | Scalar Input type | :struct:`ScalarAggregateOptions` | |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| min_max | Unary | Non-nested types | Scalar Struct | :struct:`ScalarAggregateOptions` | \(3) |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| mode | Unary | Numeric | Struct | :struct:`ModeOptions` | \(4) |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| product | Unary | Numeric | Scalar Numeric | :struct:`ScalarAggregateOptions` | \(5) |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| quantile | Unary | Numeric | Scalar Numeric | :struct:`QuantileOptions` | \(6) |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| stddev | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| sum | Unary | Numeric | Scalar Numeric | :struct:`ScalarAggregateOptions` | \(5) |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| tdigest | Unary | Numeric | Float64 | :struct:`TDigestOptions` | \(7) |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+| variance | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | |
++--------------------+-------+------------------+------------------------+----------------------------------+-------+
+
+* \(1) If null values are taken into account (by setting the
+  :struct:`ScalarAggregateOptions` parameter ``skip_nulls`` to false), then
+  `Kleene logic`_ is applied. The ``min_count`` option is not respected.
+
+* \(2) CountMode controls whether only non-null values are counted (the
+ default), only null values are counted, or all values are counted.
+
+* \(3) Output is a ``{"min": input type, "max": input type}`` Struct.
+
+ Of the interval types, only the month interval is supported, as the day-time
+ and month-day-nano types are not sortable.
+
+* \(4) Output is an array of ``{"mode": input type, "count": Int64}`` Struct.
+  It contains the *N* most common elements in the input, in descending
+  order, where *N* is given in :member:`ModeOptions::n`.
+  If two values have the same count, the smallest one comes first.
+  Note that the output can have fewer than *N* elements if the input has
+  fewer than *N* distinct values.
+
+* \(5) Output is Int64, UInt64, Float64, or Decimal128/256, depending on the
+ input type.
+
+* \(6) Output is Float64 or input type, depending on QuantileOptions.
+
+* \(7) tdigest/t-digest computes approximate quantiles, and so only needs a
+ fixed amount of memory. See the `reference implementation
+ <https://github.com/tdunning/t-digest>`_ for details.
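+
+As an illustration, the following sketch invokes ``min_max`` through the
+generic ``arrow::compute::CallFunction`` API with explicit
+:struct:`ScalarAggregateOptions`; the wrapper function name and the input
+values are illustrative only.
+
+.. code-block:: cpp
+
+   #include <iostream>
+
+   #include "arrow/api.h"
+   #include "arrow/compute/api.h"
+
+   arrow::Status MinMaxExample() {
+     arrow::Int32Builder builder;
+     ARROW_RETURN_NOT_OK(builder.AppendValues({5, 1, 8}));
+     ARROW_RETURN_NOT_OK(builder.AppendNull());
+     ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
+
+     // skip_nulls=true (the default) ignores the null element; min_count=1
+     // requires at least one valid value, otherwise the result is null.
+     arrow::compute::ScalarAggregateOptions options(/*skip_nulls=*/true,
+                                                    /*min_count=*/1);
+     ARROW_ASSIGN_OR_RAISE(
+         arrow::Datum result,
+         arrow::compute::CallFunction("min_max", {array}, &options));
+     // The result is a Struct scalar: {"min": 1, "max": 8}.
+     std::cout << result.scalar()->ToString() << std::endl;
+     return arrow::Status::OK();
+   }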
+
+Grouped Aggregations ("group by")
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Grouped aggregations are not directly invokable, but are used as part of a
+SQL-style "group by" operation. Like scalar aggregations, grouped aggregations
+reduce multiple input values to a single output value. Instead of aggregating
+all values of the input, however, grouped aggregations partition the input
+values on some set of "key" columns, then aggregate each group individually,
+emitting one output value per input group.
+
+As an example, for the following table:
+
++------------------+-----------------+
+| Column ``key`` | Column ``x`` |
++==================+=================+
+| "a" | 2 |
++------------------+-----------------+
+| "a" | 5 |
++------------------+-----------------+
+| "b" | null |
++------------------+-----------------+
+| "b" | null |
++------------------+-----------------+
+| null | null |
++------------------+-----------------+
+| null | 9 |
++------------------+-----------------+
+
+we can compute a sum of the column ``x``, grouped on the column ``key``.
+This gives us three groups, with the following results. Note that null is
+treated as a distinct key value.
+
++------------------+-----------------------+
+| Column ``key`` | Column ``sum(x)`` |
++==================+=======================+
+| "a" | 7 |
++------------------+-----------------------+
+| "b" | null |
++------------------+-----------------------+
+| null | 9 |
++------------------+-----------------------+
+
+The supported aggregation functions are as follows. All function names are
+prefixed with ``hash_``, which differentiates them from their scalar
+equivalents above and reflects how they are implemented internally.
+
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| Function name | Arity | Input types | Output type | Options class | Notes |
++=========================+=======+====================================+========================+==================================+=======+
+| hash_all | Unary | Boolean | Boolean | :struct:`ScalarAggregateOptions` | \(1) |
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| hash_any | Unary | Boolean | Boolean | :struct:`ScalarAggregateOptions` | \(1) |
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| hash_approximate_median | Unary | Numeric | Float64 | :struct:`ScalarAggregateOptions` | |
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| hash_count | Unary | Any | Int64 | :struct:`CountOptions` | \(2) |
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| hash_count_distinct | Unary | Any | Int64 | :struct:`CountOptions` | \(2) |
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| hash_distinct | Unary | Any | Input type | :struct:`CountOptions` | \(2) |
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| hash_max | Unary | Non-nested, non-binary/string-like | Input type | :struct:`ScalarAggregateOptions` | |
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| hash_mean | Unary | Numeric | Decimal/Float64 | :struct:`ScalarAggregateOptions` | |
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| hash_min | Unary | Non-nested, non-binary/string-like | Input type | :struct:`ScalarAggregateOptions` | |
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| hash_min_max | Unary | Non-nested, non-binary/string-like | Struct | :struct:`ScalarAggregateOptions` | \(3) |
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| hash_product | Unary | Numeric | Numeric | :struct:`ScalarAggregateOptions` | \(4) |
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| hash_stddev | Unary | Numeric | Float64 | :struct:`VarianceOptions` | |
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| hash_sum | Unary | Numeric | Numeric | :struct:`ScalarAggregateOptions` | \(4) |
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| hash_tdigest | Unary | Numeric | FixedSizeList[Float64] | :struct:`TDigestOptions` | \(5) |
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+| hash_variance | Unary | Numeric | Float64 | :struct:`VarianceOptions` | |
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+
+
+* \(1) If null values are taken into account, by setting
+  :member:`ScalarAggregateOptions::skip_nulls` to false, then `Kleene logic`_
+  is applied. The ``min_count`` option is not respected.
+
+* \(2) CountMode controls whether only non-null values are counted
+ (the default), only null values are counted, or all values are
+ counted. For hash_distinct, it instead controls whether null values
+ are emitted. This never affects the grouping keys, only group values
+ (i.e. you may get a group where the key is null).
+
+* \(3) Output is a ``{"min": input type, "max": input type}`` Struct array.
+
+ Of the interval types, only the month interval is supported, as the day-time
+ and month-day-nano types are not sortable.
+
+* \(4) Output is Int64, UInt64, Float64, or Decimal128/256, depending on the
+ input type.
+
+* \(5) T-digest computes approximate quantiles, and so only needs a
+ fixed amount of memory. See the `reference implementation
+ <https://github.com/tdunning/t-digest>`_ for details.
+
+Element-wise ("scalar") functions
+---------------------------------
+
+All element-wise functions accept both arrays and scalars as input. The
+semantics for unary functions are as follows:
+
+* scalar inputs produce a scalar output
+* array inputs produce an array output
+
+Binary functions have the following semantics (sometimes called
+"broadcasting" in other systems such as NumPy); a minimal example follows
+the list:
+
+* ``(scalar, scalar)`` inputs produce a scalar output
+* ``(array, array)`` inputs produce an array output (and both inputs must
+ be of the same length)
+* ``(scalar, array)`` and ``(array, scalar)`` produce an array output.
+ The scalar input is handled as if it were an array of the same length N
+ as the other input, with the same value repeated N times.
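+
+A minimal sketch of the ``(array, scalar)`` case, using the ``add`` function
+described in the next section (the input values are illustrative):
+
+.. code-block:: cpp
+
+   #include <iostream>
+
+   #include "arrow/api.h"
+   #include "arrow/compute/api.h"
+
+   arrow::Status BroadcastExample() {
+     arrow::Int64Builder builder;
+     ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 3}));
+     ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
+     std::shared_ptr<arrow::Scalar> increment = arrow::MakeScalar(int64_t(10));
+
+     // (array, scalar) -> array: the scalar behaves like an array of the
+     // same length with the value 10 repeated.
+     ARROW_ASSIGN_OR_RAISE(
+         arrow::Datum sum,
+         arrow::compute::CallFunction("add", {array, increment}));
+     std::cout << sum.make_array()->ToString() << std::endl;  // [11, 12, 13]
+     return arrow::Status::OK();
+   }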
+
+Arithmetic functions
+~~~~~~~~~~~~~~~~~~~~
+
+These functions expect inputs of numeric type and apply a given arithmetic
+operation to each element gathered from the input(s). If any of the
+input elements is null, the corresponding output element is null.
+For binary functions, input(s) will be cast to the
+:ref:`common numeric type <common-numeric-type>`
+(and dictionary decoded, if applicable) before the operation is applied.
+
+The default variant of these functions does not detect overflow (the result
+then typically wraps around). Most functions are also available in an
+overflow-checking variant, suffixed ``_checked``, which returns
+an ``Invalid`` :class:`Status` when overflow is detected.
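+
+For instance, a small sketch contrasting the wrapping and checking variants
+of ``add``, with Int8 inputs chosen to overflow:
+
+.. code-block:: cpp
+
+   #include <iostream>
+
+   #include "arrow/api.h"
+   #include "arrow/compute/api.h"
+
+   arrow::Status OverflowExample() {
+     std::shared_ptr<arrow::Scalar> a = arrow::MakeScalar(int8_t(120));
+     std::shared_ptr<arrow::Scalar> b = arrow::MakeScalar(int8_t(10));
+
+     // The default variant wraps around silently: 120 + 10 -> -126.
+     ARROW_ASSIGN_OR_RAISE(arrow::Datum wrapped,
+                           arrow::compute::CallFunction("add", {a, b}));
+     std::cout << wrapped.scalar()->ToString() << std::endl;
+
+     // The "_checked" variant returns an Invalid status instead.
+     arrow::Result<arrow::Datum> checked =
+         arrow::compute::CallFunction("add_checked", {a, b});
+     if (!checked.ok()) {
+       std::cout << checked.status().ToString() << std::endl;
+     }
+     return arrow::Status::OK();
+   }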
+
+For functions which support decimal inputs (currently ``add``, ``subtract``,
+``multiply``, and ``divide``, as well as their checked variants), decimals of different
+precisions/scales will be promoted appropriately. Mixed decimal and
+floating-point arguments will cast all arguments to floating-point, while mixed
+decimal and integer arguments will cast all arguments to decimals.
+
++------------------+--------+----------------+----------------------+-------+
+| Function name | Arity | Input types | Output type | Notes |
++==================+========+================+======================+=======+
+| abs | Unary | Numeric | Numeric | |
++------------------+--------+----------------+----------------------+-------+
+| abs_checked | Unary | Numeric | Numeric | |
++------------------+--------+----------------+----------------------+-------+
+| add | Binary | Numeric | Numeric | \(1) |
++------------------+--------+----------------+----------------------+-------+
+| add_checked | Binary | Numeric | Numeric | \(1) |
++------------------+--------+----------------+----------------------+-------+
+| divide | Binary | Numeric | Numeric | \(1) |
++------------------+--------+----------------+----------------------+-------+
+| divide_checked | Binary | Numeric | Numeric | \(1) |
++------------------+--------+----------------+----------------------+-------+
+| multiply | Binary | Numeric | Numeric | \(1) |
++------------------+--------+----------------+----------------------+-------+
+| multiply_checked | Binary | Numeric | Numeric | \(1) |
++------------------+--------+----------------+----------------------+-------+
+| negate | Unary | Numeric | Numeric | |
++------------------+--------+----------------+----------------------+-------+
+| negate_checked | Unary | Signed Numeric | Signed Numeric | |
++------------------+--------+----------------+----------------------+-------+
+| power | Binary | Numeric | Numeric | |
++------------------+--------+----------------+----------------------+-------+
+| power_checked | Binary | Numeric | Numeric | |
++------------------+--------+----------------+----------------------+-------+
+| sign | Unary | Numeric | Int8/Float32/Float64 | \(2) |
++------------------+--------+----------------+----------------------+-------+
+| subtract | Binary | Numeric | Numeric | \(1) |
++------------------+--------+----------------+----------------------+-------+
+| subtract_checked | Binary | Numeric | Numeric | \(1) |
++------------------+--------+----------------+----------------------+-------+
+
+* \(1) Precision and scale of computed DECIMAL results
+
+ +------------+---------------------------------------------+
+ | Operation | Result precision and scale |
+ +============+=============================================+
+ | | add | | scale = max(s1, s2) |
+ | | subtract | | precision = max(p1-s1, p2-s2) + 1 + scale |
+ +------------+---------------------------------------------+
+ | multiply | | scale = s1 + s2 |
+ | | | precision = p1 + p2 + 1 |
+ +------------+---------------------------------------------+
+ | divide | | scale = max(4, s1 + p2 - s2 + 1) |
+ | | | precision = p1 - s1 + s2 + scale |
+ +------------+---------------------------------------------+
+
+  These rules are compatible with Redshift's decimal promotion rules. All
+  decimal digits are preserved for ``add``, ``subtract`` and ``multiply``
+  operations. The result precision of ``divide`` is at least the sum of the
+  precisions of both operands, with enough scale kept. An error is returned
+  if the result precision is beyond the decimal value range.
+
+* \(2) Output is -1 or 1 for nonzero inputs and 0 for zero input.
+  NaN values return NaN. Integral values return signedness as Int8 and
+  floating-point values return it with the same type as the input values.
+
+Bit-wise functions
+~~~~~~~~~~~~~~~~~~
+
++--------------------------+------------+--------------------+---------------------+
+| Function name | Arity | Input types | Output type |
++==========================+============+====================+=====================+
+| bit_wise_and | Binary | Numeric | Numeric |
++--------------------------+------------+--------------------+---------------------+
+| bit_wise_not | Unary | Numeric | Numeric |
++--------------------------+------------+--------------------+---------------------+
+| bit_wise_or | Binary | Numeric | Numeric |
++--------------------------+------------+--------------------+---------------------+
+| bit_wise_xor | Binary | Numeric | Numeric |
++--------------------------+------------+--------------------+---------------------+
+| shift_left | Binary | Numeric | Numeric |
++--------------------------+------------+--------------------+---------------------+
+| shift_left_checked | Binary | Numeric | Numeric (1) |
++--------------------------+------------+--------------------+---------------------+
+| shift_right | Binary | Numeric | Numeric |
++--------------------------+------------+--------------------+---------------------+
+| shift_right_checked | Binary | Numeric | Numeric (1) |
++--------------------------+------------+--------------------+---------------------+
+
+* \(1) An error is emitted if the shift amount (i.e. the second input) is
+  out of bounds for the data type. However, an overflow when shifting the
+  first input is not an error (truncated bits are silently discarded).
+
+Rounding functions
+~~~~~~~~~~~~~~~~~~
+
+Rounding functions displace numeric inputs to an approximate value with a simpler
+representation based on the rounding criterion.
+
++-------------------+------------+-------------+-------------------------+----------------------------------+--------+
+| Function name | Arity | Input types | Output type | Options class | Notes |
++===================+============+=============+=========================+==================================+========+
+| ceil | Unary | Numeric | Float32/Float64/Decimal | | |
++-------------------+------------+-------------+-------------------------+----------------------------------+--------+
+| floor | Unary | Numeric | Float32/Float64/Decimal | | |
++-------------------+------------+-------------+-------------------------+----------------------------------+--------+
+| round | Unary | Numeric | Float32/Float64/Decimal | :struct:`RoundOptions` | (1)(2) |
++-------------------+------------+-------------+-------------------------+----------------------------------+--------+
+| round_to_multiple | Unary | Numeric | Float32/Float64/Decimal | :struct:`RoundToMultipleOptions` | (1)(3) |
++-------------------+------------+-------------+-------------------------+----------------------------------+--------+
+| trunc | Unary | Numeric | Float32/Float64/Decimal | | |
++-------------------+------------+-------------+-------------------------+----------------------------------+--------+
+
+* \(1) The output value is 64-bit floating-point for integral inputs, while
+  floating-point and decimal inputs retain their input type. By default,
+  rounding functions displace a value to the nearest integer using
+  HALF_TO_EVEN to resolve ties. Options are available to control the rounding
+  criterion. Both ``round`` and ``round_to_multiple`` have the ``round_mode``
+  option to set the rounding mode.
+* \(2) Round to a number of digits where the ``ndigits`` option of
+ :struct:`RoundOptions` specifies the rounding precision in terms of number
+ of digits. A negative value corresponds to digits in the non-fractional
+ part. For example, -2 corresponds to rounding to the nearest multiple of
+ 100 (zeroing the ones and tens digits). Default value of ``ndigits`` is 0
+ which rounds to the nearest integer.
+* \(3) Round to a multiple where the ``multiple`` option of
+ :struct:`RoundToMultipleOptions` specifies the rounding scale. The rounding
+ multiple has to be a positive value. For example, 100 corresponds to
+ rounding to the nearest multiple of 100 (zeroing the ones and tens digits).
+ Default value of ``multiple`` is 1 which rounds to the nearest integer.
+
+For ``round`` and ``round_to_multiple``, the following rounding modes are available.
+Tie-breaking modes are prefixed with HALF and round non-ties to the nearest integer.
+The example values are given for default values of ``ndigits`` and ``multiple``.
+
++-----------------------+--------------------------------------------------------------+---------------------------+
+| ``round_mode`` | Operation performed | Example values |
++=======================+==============================================================+===========================+
+| DOWN | Round to nearest integer less than or equal in magnitude; | 3.2 -> 3, 3.7 -> 3, |
+| | also known as ``floor(x)`` | -3.2 -> -4, -3.7 -> -4 |
++-----------------------+--------------------------------------------------------------+---------------------------+
+| UP | Round to nearest integer greater than or equal in magnitude; | 3.2 -> 4, 3.7 -> 4, |
+| | also known as ``ceil(x)`` | -3.2 -> -3, -3.7 -> -3 |
++-----------------------+--------------------------------------------------------------+---------------------------+
+| TOWARDS_ZERO | Get the integral part without fractional digits; | 3.2 -> 3, 3.7 -> 3, |
+| | also known as ``trunc(x)`` | -3.2 -> -3, -3.7 -> -3 |
++-----------------------+--------------------------------------------------------------+---------------------------+
+| TOWARDS_INFINITY | Round negative values with ``DOWN`` rule, | 3.2 -> 4, 3.7 -> 4, |
+| | round positive values with ``UP`` rule | -3.2 -> -4, -3.7 -> -4 |
++-----------------------+--------------------------------------------------------------+---------------------------+
+| HALF_DOWN | Round ties with ``DOWN`` rule | 3.5 -> 3, 4.5 -> 4, |
+| | | -3.5 -> -4, -4.5 -> -5 |
++-----------------------+--------------------------------------------------------------+---------------------------+
+| HALF_UP | Round ties with ``UP`` rule | 3.5 -> 4, 4.5 -> 5, |
+| | | -3.5 -> -3, -4.5 -> -4 |
++-----------------------+--------------------------------------------------------------+---------------------------+
+| HALF_TOWARDS_ZERO | Round ties with ``TOWARDS_ZERO`` rule | 3.5 -> 3, 4.5 -> 4, |
+| | | -3.5 -> -3, -4.5 -> -4 |
++-----------------------+--------------------------------------------------------------+---------------------------+
+| HALF_TOWARDS_INFINITY | Round ties with ``TOWARDS_INFINITY`` rule | 3.5 -> 4, 4.5 -> 5, |
+| | | -3.5 -> -4, -4.5 -> -5 |
++-----------------------+--------------------------------------------------------------+---------------------------+
+| HALF_TO_EVEN | Round ties to nearest even integer | 3.5 -> 4, 4.5 -> 4, |
+| | | -3.5 -> -4, -4.5 -> -4 |
++-----------------------+--------------------------------------------------------------+---------------------------+
+| HALF_TO_ODD | Round ties to nearest odd integer | 3.5 -> 3, 4.5 -> 5, |
+| | | -3.5 -> -3, -4.5 -> -5 |
++-----------------------+--------------------------------------------------------------+---------------------------+
+
+The following table gives examples of how ``ndigits`` (for the ``round``
+function) and ``multiple`` (for ``round_to_multiple``) influence the
+operation performed, respectively.
+
++--------------------+-------------------+---------------------------+
+| Round ``multiple`` | Round ``ndigits`` | Operation performed |
++====================+===================+===========================+
+| 1 | 0 | Round to integer |
++--------------------+-------------------+---------------------------+
+| 0.001 | 3 | Round to 3 decimal places |
++--------------------+-------------------+---------------------------+
+| 10 | -1 | Round to multiple of 10 |
++--------------------+-------------------+---------------------------+
+| 2 | NA | Round to multiple of 2 |
++--------------------+-------------------+---------------------------+
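+
+As a minimal sketch (with illustrative values), ``round`` can be invoked with
+explicit :struct:`RoundOptions`:
+
+.. code-block:: cpp
+
+   #include <iostream>
+
+   #include "arrow/api.h"
+   #include "arrow/compute/api.h"
+
+   arrow::Status RoundExample() {
+     arrow::DoubleBuilder builder;
+     ARROW_RETURN_NOT_OK(builder.AppendValues({3.14159, 2.5, -2.5}));
+     ARROW_ASSIGN_OR_RAISE(auto values, builder.Finish());
+
+     // Round to the nearest integer, breaking ties away from zero.
+     arrow::compute::RoundOptions options(
+         /*ndigits=*/0, arrow::compute::RoundMode::HALF_TOWARDS_INFINITY);
+     ARROW_ASSIGN_OR_RAISE(
+         arrow::Datum rounded,
+         arrow::compute::CallFunction("round", {values}, &options));
+     std::cout << rounded.make_array()->ToString() << std::endl;  // [3, 3, -3]
+     return arrow::Status::OK();
+   }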
+
+Logarithmic functions
+~~~~~~~~~~~~~~~~~~~~~
+
+Logarithmic functions are also supported, and also offer ``_checked``
+variants that check for domain errors if needed.
+
++--------------------------+------------+--------------------+---------------------+
+| Function name | Arity | Input types | Output type |
++==========================+============+====================+=====================+
+| ln | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| ln_checked | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| log10 | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| log10_checked | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| log1p | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| log1p_checked | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| log2 | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| log2_checked | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| logb | Binary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| logb_checked | Binary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+
+Trigonometric functions
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Trigonometric functions are also supported, and also offer ``_checked``
+variants that check for domain errors if needed.
+
++--------------------------+------------+--------------------+---------------------+
+| Function name | Arity | Input types | Output type |
++==========================+============+====================+=====================+
+| acos | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| acos_checked | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| asin | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| asin_checked | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| atan | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| atan2 | Binary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| cos | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| cos_checked | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| sin | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| sin_checked | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| tan | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| tan_checked | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+
+Comparisons
+~~~~~~~~~~~
+
+These functions expect two inputs of numeric type (in which case they will be
+cast to the :ref:`common numeric type <common-numeric-type>` before comparison),
+or two inputs of Binary- or String-like types, or two inputs of Temporal types.
+If any input is dictionary encoded it will be expanded for the purposes of
+comparison. If any of the input elements in a pair is null, the corresponding
+output element is null. Decimal arguments will be promoted in the same way as
+for ``add`` and ``subtract``.
+
++----------------+------------+---------------------------------------------+---------------------+
+| Function names | Arity | Input types | Output type |
++================+============+=============================================+=====================+
+| equal | Binary | Numeric, Temporal, Binary- and String-like | Boolean |
++----------------+------------+---------------------------------------------+---------------------+
+| greater | Binary | Numeric, Temporal, Binary- and String-like | Boolean |
++----------------+------------+---------------------------------------------+---------------------+
+| greater_equal | Binary | Numeric, Temporal, Binary- and String-like | Boolean |
++----------------+------------+---------------------------------------------+---------------------+
+| less | Binary | Numeric, Temporal, Binary- and String-like | Boolean |
++----------------+------------+---------------------------------------------+---------------------+
+| less_equal | Binary | Numeric, Temporal, Binary- and String-like | Boolean |
++----------------+------------+---------------------------------------------+---------------------+
+| not_equal | Binary | Numeric, Temporal, Binary- and String-like | Boolean |
++----------------+------------+---------------------------------------------+---------------------+
+
+These functions take any number of inputs of numeric type (in which case they
+will be cast to the :ref:`common numeric type <common-numeric-type>` before
+comparison) or of temporal types. If any input is dictionary encoded it will be
+expanded for the purposes of comparison.
+
++------------------+------------+---------------------------------------------+---------------------+---------------------------------------+-------+
+| Function names | Arity | Input types | Output type | Options class | Notes |
++==================+============+=============================================+=====================+=======================================+=======+
+| max_element_wise | Varargs | Numeric and Temporal | Numeric or Temporal | :struct:`ElementWiseAggregateOptions` | \(1) |
++------------------+------------+---------------------------------------------+---------------------+---------------------------------------+-------+
+| min_element_wise | Varargs | Numeric and Temporal | Numeric or Temporal | :struct:`ElementWiseAggregateOptions` | \(1) |
++------------------+------------+---------------------------------------------+---------------------+---------------------------------------+-------+
+
+* \(1) By default, nulls are skipped (but the kernel can be configured to propagate nulls).
+ For floating point values, NaN will be taken over null but not over any other value.
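+
+A minimal sketch of the varargs form, with illustrative values:
+
+.. code-block:: cpp
+
+   #include <iostream>
+
+   #include "arrow/api.h"
+   #include "arrow/compute/api.h"
+
+   arrow::Status MaxElementWiseExample() {
+     arrow::Int32Builder b1, b2;
+     ARROW_RETURN_NOT_OK(b1.AppendValues({1, 5, 3}));
+     ARROW_RETURN_NOT_OK(b2.AppendValues({4, 2, 6}));
+     ARROW_ASSIGN_OR_RAISE(auto x, b1.Finish());
+     ARROW_ASSIGN_OR_RAISE(auto y, b2.Finish());
+
+     // skip_nulls=true (the default): a null in one input does not null out
+     // the corresponding output element.
+     arrow::compute::ElementWiseAggregateOptions options(/*skip_nulls=*/true);
+     ARROW_ASSIGN_OR_RAISE(
+         arrow::Datum result,
+         arrow::compute::CallFunction("max_element_wise", {x, y}, &options));
+     std::cout << result.make_array()->ToString() << std::endl;  // [4, 5, 6]
+     return arrow::Status::OK();
+   }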
+
+Logical functions
+~~~~~~~~~~~~~~~~~~
+
+The normal behaviour for these functions is to emit a null if any of the
+inputs is null (similar to the semantics of ``NaN`` in floating-point
+computations).
+
+Some of them are also available in a `Kleene logic`_ variant (suffixed
+``_kleene``) where null is taken to mean "undefined". This is the
+interpretation of null used in SQL systems as well as R and Julia,
+for example.
+
+For the Kleene logic variants, therefore:
+
+* "true AND null", "null AND true" give "null" (the result is undefined)
+* "true OR null", "null OR true" give "true"
+* "false AND null", "null AND false" give "false"
+* "false OR null", "null OR false" give "null" (the result is undefined)
+
++--------------------------+------------+--------------------+---------------------+
+| Function name | Arity | Input types | Output type |
++==========================+============+====================+=====================+
+| and | Binary | Boolean | Boolean |
++--------------------------+------------+--------------------+---------------------+
+| and_kleene | Binary | Boolean | Boolean |
++--------------------------+------------+--------------------+---------------------+
+| and_not | Binary | Boolean | Boolean |
++--------------------------+------------+--------------------+---------------------+
+| and_not_kleene | Binary | Boolean | Boolean |
++--------------------------+------------+--------------------+---------------------+
+| invert | Unary | Boolean | Boolean |
++--------------------------+------------+--------------------+---------------------+
+| or | Binary | Boolean | Boolean |
++--------------------------+------------+--------------------+---------------------+
+| or_kleene | Binary | Boolean | Boolean |
++--------------------------+------------+--------------------+---------------------+
+| xor | Binary | Boolean | Boolean |
++--------------------------+------------+--------------------+---------------------+
+
+.. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic#Kleene_and_Priest_logics
+
+String predicates
+~~~~~~~~~~~~~~~~~
+
+These functions classify the input string elements according to their character
+contents. An empty string element emits false in the output. For ASCII
+variants of the functions (prefixed ``ascii_``), a string element with non-ASCII
+characters emits false in the output.
+
+The first set of functions operates on a character-per-character basis,
+and emits true in the output if the input contains only characters of a
+given class:
+
++--------------------+-------+-------------+-------------+-------------------------+-------+
+| Function name | Arity | Input types | Output type | Matched character class | Notes |
++====================+=======+=============+=============+=========================+=======+
+| ascii_is_alnum | Unary | String-like | Boolean | Alphanumeric ASCII | |
++--------------------+-------+-------------+-------------+-------------------------+-------+
+| ascii_is_alpha | Unary | String-like | Boolean | Alphabetic ASCII | |
++--------------------+-------+-------------+-------------+-------------------------+-------+
+| ascii_is_decimal | Unary | String-like | Boolean | Decimal ASCII | \(1) |
++--------------------+-------+-------------+-------------+-------------------------+-------+
+| ascii_is_lower | Unary | String-like | Boolean | Lowercase ASCII | \(2) |
++--------------------+-------+-------------+-------------+-------------------------+-------+
+| ascii_is_printable | Unary | String-like | Boolean | Printable ASCII | |
++--------------------+-------+-------------+-------------+-------------------------+-------+
+| ascii_is_space | Unary | String-like | Boolean | Whitespace ASCII | |
++--------------------+-------+-------------+-------------+-------------------------+-------+
+| ascii_is_upper | Unary | String-like | Boolean | Uppercase ASCII | \(2) |
++--------------------+-------+-------------+-------------+-------------------------+-------+
+| utf8_is_alnum | Unary | String-like | Boolean | Alphanumeric Unicode | |
++--------------------+-------+-------------+-------------+-------------------------+-------+
+| utf8_is_alpha | Unary | String-like | Boolean | Alphabetic Unicode | |
++--------------------+-------+-------------+-------------+-------------------------+-------+
+| utf8_is_decimal | Unary | String-like | Boolean | Decimal Unicode | |
++--------------------+-------+-------------+-------------+-------------------------+-------+
+| utf8_is_digit | Unary | String-like | Boolean | Unicode digit | \(3) |
++--------------------+-------+-------------+-------------+-------------------------+-------+
+| utf8_is_lower | Unary | String-like | Boolean | Lowercase Unicode | \(2) |
++--------------------+-------+-------------+-------------+-------------------------+-------+
+| utf8_is_numeric | Unary | String-like | Boolean | Numeric Unicode | \(4) |
++--------------------+-------+-------------+-------------+-------------------------+-------+
+| utf8_is_printable | Unary | String-like | Boolean | Printable Unicode | |
++--------------------+-------+-------------+-------------+-------------------------+-------+
+| utf8_is_space | Unary | String-like | Boolean | Whitespace Unicode | |
++--------------------+-------+-------------+-------------+-------------------------+-------+
+| utf8_is_upper | Unary | String-like | Boolean | Uppercase Unicode | \(2) |
++--------------------+-------+-------------+-------------+-------------------------+-------+
+
+* \(1) Also matches all numeric ASCII characters and all ASCII digits.
+
+* \(2) Non-cased characters, such as punctuation, do not match.
+
+* \(3) This is currently the same as ``utf8_is_decimal``.
+
+* \(4) Unlike ``utf8_is_decimal``, non-decimal numeric characters also match.
+
+The second set of functions also consider the character order in a string
+element:
+
++--------------------------+------------+--------------------+---------------------+---------+
+| Function name | Arity | Input types | Output type | Notes |
++==========================+============+====================+=====================+=========+
+| ascii_is_title | Unary | String-like | Boolean | \(1) |
++--------------------------+------------+--------------------+---------------------+---------+
+| utf8_is_title | Unary | String-like | Boolean | \(1) |
++--------------------------+------------+--------------------+---------------------+---------+
+
+* \(1) Output is true iff the input string element is title-cased, i.e. each
+  word starts with an uppercase character, followed by lowercase characters.
+ Word boundaries are defined by non-cased characters.
+
+The third set of functions examines string elements on a byte-per-byte basis:
+
++--------------------------+------------+--------------------+---------------------+---------+
+| Function name | Arity | Input types | Output type | Notes |
++==========================+============+====================+=====================+=========+
+| string_is_ascii | Unary | String-like | Boolean | \(1) |
++--------------------------+------------+--------------------+---------------------+---------+
+
+* \(1) Output is true iff the input string element contains only ASCII characters,
+ i.e. only bytes in [0, 127].
+
+String transforms
+~~~~~~~~~~~~~~~~~
+
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| Function name | Arity | Input types | Output type | Options class | Notes |
++=========================+=======+========================+========================+===================================+=======+
+| ascii_capitalize | Unary | String-like | String-like | | \(1) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| ascii_lower | Unary | String-like | String-like | | \(1) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| ascii_reverse | Unary | String-like | String-like | | \(2) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| ascii_swapcase | Unary | String-like | String-like | | \(1) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| ascii_title | Unary | String-like | String-like | | \(1) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| ascii_upper | Unary | String-like | String-like | | \(1) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| binary_length | Unary | Binary- or String-like | Int32 or Int64 | | \(3) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| binary_replace_slice    | Unary | Binary- or String-like | Binary- or String-like | :struct:`ReplaceSliceOptions`     | \(4)  |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| replace_substring | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(5) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| replace_substring_regex | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(6) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| utf8_capitalize | Unary | String-like | String-like | | \(8) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| utf8_length | Unary | String-like | Int32 or Int64 | | \(7) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| utf8_lower | Unary | String-like | String-like | | \(8) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| utf8_replace_slice | Unary | String-like | String-like | :struct:`ReplaceSliceOptions` | \(4) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| utf8_reverse | Unary | String-like | String-like | | \(9) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| utf8_swapcase | Unary | String-like | String-like | | \(8) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| utf8_title | Unary | String-like | String-like | | \(8) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| utf8_upper | Unary | String-like | String-like | | \(8) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+
+* \(1) Each ASCII character in the input is converted to lowercase or
+ uppercase. Non-ASCII characters are left untouched.
+
+* \(2) ASCII input is reversed to the output. If non-ASCII characters
+ are present, ``Invalid`` :class:`Status` will be returned.
+
+* \(3) Output is the physical length in bytes of each input element. Output
+ type is Int32 for Binary / String, Int64 for LargeBinary / LargeString.
+
+* \(4) Replace the slice of each string from :member:`ReplaceSliceOptions::start`
+  (inclusive) to :member:`ReplaceSliceOptions::stop` (exclusive) by
+  :member:`ReplaceSliceOptions::replacement`. The binary kernel measures the
+  slice in bytes, while the UTF8 kernel measures the slice in codeunits.
+
+* \(5) Replace non-overlapping substrings that match to
+ :member:`ReplaceSubstringOptions::pattern` by
+ :member:`ReplaceSubstringOptions::replacement`. If
+ :member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the
+ maximum number of replacements made, counting from the left.
+
+* \(6) Replace non-overlapping substrings that match to the regular expression
+ :member:`ReplaceSubstringOptions::pattern` by
+ :member:`ReplaceSubstringOptions::replacement`, using the Google RE2 library. If
+ :member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the
+ maximum number of replacements made, counting from the left. Note that if the
+ pattern contains groups, backreferencing can be used.
+
+* \(7) Output is the number of characters (not bytes) of each input element.
+ Output type is Int32 for String, Int64 for LargeString.
+
+* \(8) Each UTF8-encoded character in the input is converted to lowercase or
+ uppercase.
+
+* \(9) Each UTF8-encoded code unit is written in reverse order to the output.
+ If the input is not valid UTF8, then the output is undefined (but the size of output
+ buffers will be preserved).
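+
+As an illustrative sketch, ``replace_substring`` can be invoked with explicit
+:struct:`ReplaceSubstringOptions` (the strings used here are arbitrary
+examples):
+
+.. code-block:: cpp
+
+   #include <iostream>
+
+   #include "arrow/api.h"
+   #include "arrow/compute/api.h"
+
+   arrow::Status ReplaceExample() {
+     arrow::StringBuilder builder;
+     ARROW_RETURN_NOT_OK(builder.AppendValues({"foo bar", "bar bar"}));
+     ARROW_ASSIGN_OR_RAISE(auto strings, builder.Finish());
+
+     // Replace at most one occurrence of "bar" per element, counting from
+     // the left.
+     arrow::compute::ReplaceSubstringOptions options(/*pattern=*/"bar",
+                                                     /*replacement=*/"baz",
+                                                     /*max_replacements=*/1);
+     ARROW_ASSIGN_OR_RAISE(
+         arrow::Datum replaced,
+         arrow::compute::CallFunction("replace_substring", {strings}, &options));
+     std::cout << replaced.make_array()->ToString() << std::endl;
+     // ["foo baz", "baz bar"]
+     return arrow::Status::OK();
+   }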
+
+String padding
+~~~~~~~~~~~~~~
+
+These functions append/prepend a given padding byte (ASCII) or codepoint (UTF8) in
+order to center (center), right-align (lpad), or left-align (rpad) a string.
+
++--------------------------+------------+-------------------------+---------------------+----------------------------------------+
+| Function name | Arity | Input types | Output type | Options class |
++==========================+============+=========================+=====================+========================================+
+| ascii_center | Unary | String-like | String-like | :struct:`PadOptions` |
++--------------------------+------------+-------------------------+---------------------+----------------------------------------+
+| ascii_lpad | Unary | String-like | String-like | :struct:`PadOptions` |
++--------------------------+------------+-------------------------+---------------------+----------------------------------------+
+| ascii_rpad | Unary | String-like | String-like | :struct:`PadOptions` |
++--------------------------+------------+-------------------------+---------------------+----------------------------------------+
+| utf8_center | Unary | String-like | String-like | :struct:`PadOptions` |
++--------------------------+------------+-------------------------+---------------------+----------------------------------------+
+| utf8_lpad | Unary | String-like | String-like | :struct:`PadOptions` |
++--------------------------+------------+-------------------------+---------------------+----------------------------------------+
+| utf8_rpad | Unary | String-like | String-like | :struct:`PadOptions` |
++--------------------------+------------+-------------------------+---------------------+----------------------------------------+
+
+String trimming
+~~~~~~~~~~~~~~~
+
+These functions trim off characters on both sides (trim), or the left (ltrim) or right side (rtrim).
+
++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+
+| Function name | Arity | Input types | Output type | Options class | Notes |
++==========================+============+=========================+=====================+========================================+=========+
+| ascii_ltrim | Unary | String-like | String-like | :struct:`TrimOptions` | \(1) |
++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+
+| ascii_ltrim_whitespace | Unary | String-like | String-like | | \(2) |
++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+
+| ascii_rtrim | Unary | String-like | String-like | :struct:`TrimOptions` | \(1) |
++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+
+| ascii_rtrim_whitespace | Unary | String-like | String-like | | \(2) |
++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+
+| ascii_trim | Unary | String-like | String-like | :struct:`TrimOptions` | \(1) |
++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+
+| ascii_trim_whitespace | Unary | String-like | String-like | | \(2) |
++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+
+| utf8_ltrim | Unary | String-like | String-like | :struct:`TrimOptions` | \(3) |
++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+
+| utf8_ltrim_whitespace | Unary | String-like | String-like | | \(4) |
++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+
+| utf8_rtrim | Unary | String-like | String-like | :struct:`TrimOptions` | \(3) |
++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+
+| utf8_rtrim_whitespace | Unary | String-like | String-like | | \(4) |
++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+
+| utf8_trim | Unary | String-like | String-like | :struct:`TrimOptions` | \(3) |
++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+
+| utf8_trim_whitespace | Unary | String-like | String-like | | \(4) |
++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+
+
+* \(1) Only characters specified in :member:`TrimOptions::characters` will be
+  trimmed off. Both the input string and the ``characters`` argument are
+  interpreted as ASCII characters.
+
+* \(2) Only trim off ASCII whitespace characters (``'\t'``, ``'\n'``, ``'\v'``,
+ ``'\f'``, ``'\r'`` and ``' '``).
+
+* \(3) Only characters specified in :member:`TrimOptions::characters` will be
+ trimmed off.
+
+* \(4) Only trim off Unicode whitespace characters.
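+
+A minimal sketch of ``utf8_trim`` with :struct:`TrimOptions` (the trimmed
+characters and input strings are illustrative):
+
+.. code-block:: cpp
+
+   #include <iostream>
+
+   #include "arrow/api.h"
+   #include "arrow/compute/api.h"
+
+   arrow::Status TrimExample() {
+     arrow::StringBuilder builder;
+     ARROW_RETURN_NOT_OK(builder.AppendValues({"--hello--", "-world-"}));
+     ARROW_ASSIGN_OR_RAISE(auto strings, builder.Finish());
+
+     // Trim any leading or trailing '-' characters.
+     arrow::compute::TrimOptions options("-");
+     ARROW_ASSIGN_OR_RAISE(
+         arrow::Datum trimmed,
+         arrow::compute::CallFunction("utf8_trim", {strings}, &options));
+     std::cout << trimmed.make_array()->ToString() << std::endl;
+     // ["hello", "world"]
+     return arrow::Status::OK();
+   }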
+
+String splitting
+~~~~~~~~~~~~~~~~
+
+These functions split strings into lists of strings. All kernels can optionally
+be configured with a ``max_splits`` and a ``reverse`` parameter, where
+``max_splits == -1`` means no limit (the default). When ``reverse`` is true,
+the splitting is done starting from the end of the string; this is only relevant
+when a positive ``max_splits`` is given.
+
++--------------------------+------------+-------------------------+-------------------+----------------------------------+---------+
+| Function name | Arity | Input types | Output type | Options class | Notes |
++==========================+============+=========================+===================+==================================+=========+
+| ascii_split_whitespace | Unary | String-like | List-like | :struct:`SplitOptions` | \(1) |
++--------------------------+------------+-------------------------+-------------------+----------------------------------+---------+
+| split_pattern | Unary | String-like | List-like | :struct:`SplitPatternOptions` | \(2) |
++--------------------------+------------+-------------------------+-------------------+----------------------------------+---------+
+| split_pattern_regex | Unary | String-like | List-like | :struct:`SplitPatternOptions` | \(3) |
++--------------------------+------------+-------------------------+-------------------+----------------------------------+---------+
+| utf8_split_whitespace | Unary | String-like | List-like | :struct:`SplitOptions` | \(4) |
++--------------------------+------------+-------------------------+-------------------+----------------------------------+---------+
+
+* \(1) A non-zero length sequence of ASCII-defined whitespace bytes
+  (``'\t'``, ``'\n'``, ``'\v'``, ``'\f'``, ``'\r'`` and ``' '``) is seen
+  as a separator.
+
+* \(2) The string is split when an exact pattern is found (the pattern itself
+ is not included in the output).
+
+* \(3) The string is split when a regex match is found (the matched
+ substring itself is not included in the output).
+
+* \(4) A non-zero length sequence of Unicode-defined whitespace codepoints
+  is seen as a separator.
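+
+A minimal sketch of ``split_pattern`` with :struct:`SplitPatternOptions`
+(illustrative input strings):
+
+.. code-block:: cpp
+
+   #include <iostream>
+
+   #include "arrow/api.h"
+   #include "arrow/compute/api.h"
+
+   arrow::Status SplitExample() {
+     arrow::StringBuilder builder;
+     ARROW_RETURN_NOT_OK(builder.AppendValues({"a,b,c", "d,e"}));
+     ARROW_ASSIGN_OR_RAISE(auto strings, builder.Finish());
+
+     // Split on ",", with no limit on the number of splits.
+     arrow::compute::SplitPatternOptions options(/*pattern=*/",",
+                                                 /*max_splits=*/-1,
+                                                 /*reverse=*/false);
+     ARROW_ASSIGN_OR_RAISE(
+         arrow::Datum parts,
+         arrow::compute::CallFunction("split_pattern", {strings}, &options));
+     // The output is a List<String> array: [["a", "b", "c"], ["d", "e"]].
+     std::cout << parts.make_array()->ToString() << std::endl;
+     return arrow::Status::OK();
+   }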
+
+String component extraction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
++---------------+-------+-------------+-------------+-------------------------------+-------+
+| Function name | Arity | Input types | Output type | Options class | Notes |
++===============+=======+=============+=============+===============================+=======+
+| extract_regex | Unary | String-like | Struct | :struct:`ExtractRegexOptions` | \(1) |
++---------------+-------+-------------+-------------+-------------------------------+-------+
+
+* \(1) Extract substrings defined by a regular expression using the Google RE2
+ library. The output struct field names refer to the named capture groups,
+ e.g. 'letter' and 'digit' for the regular expression
+ ``(?P<letter>[ab])(?P<digit>\\d)``.
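+
+A minimal sketch using the pattern from the note above (the input strings are
+illustrative):
+
+.. code-block:: cpp
+
+   #include <iostream>
+
+   #include "arrow/api.h"
+   #include "arrow/compute/api.h"
+
+   arrow::Status ExtractExample() {
+     arrow::StringBuilder builder;
+     ARROW_RETURN_NOT_OK(builder.AppendValues({"a1", "b2"}));
+     ARROW_ASSIGN_OR_RAISE(auto strings, builder.Finish());
+
+     arrow::compute::ExtractRegexOptions options("(?P<letter>[ab])(?P<digit>\\d)");
+     ARROW_ASSIGN_OR_RAISE(
+         arrow::Datum extracted,
+         arrow::compute::CallFunction("extract_regex", {strings}, &options));
+     // The output is a Struct array with fields "letter" and "digit".
+     std::cout << extracted.make_array()->ToString() << std::endl;
+     return arrow::Status::OK();
+   }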
+
+String joining
+~~~~~~~~~~~~~~
+
+These functions do the inverse of string splitting.
+
++--------------------------+-----------+-----------------------+----------------+-------------------+-----------------------+---------+
+| Function name | Arity | Input type 1 | Input type 2 | Output type | Options class | Notes |
++==========================+===========+=======================+================+===================+=======================+=========+
+| binary_join | Binary | List of string-like | String-like | String-like | | \(1) |
++--------------------------+-----------+-----------------------+----------------+-------------------+-----------------------+---------+
+| binary_join_element_wise | Varargs | String-like (varargs) | String-like | String-like | :struct:`JoinOptions` | \(2) |
++--------------------------+-----------+-----------------------+----------------+-------------------+-----------------------+---------+
+
+* \(1) The first input must be an array, while the second can be a scalar or array.
+ Each list of values in the first input is joined using each second input
+ as separator. If any input list is null or contains a null, the corresponding
+ output will be null.
+
+* \(2) All arguments are concatenated element-wise, with the last argument treated
+ as the separator (scalars are recycled in either case). Null separators emit
+ null. If any other argument is null, by default the corresponding output will be
+ null, but it can instead either be skipped or replaced with a given string.
+
+String Slicing
+~~~~~~~~~~~~~~
+
+This function transforms each sequence of the array to a subsequence, according
+to start and stop indices, and a non-zero step (defaulting to 1). The slicing
+semantics follow Python's: the start index is inclusive and the stop index
+exclusive; if the step is negative, the sequence is followed in reverse order.
+
++--------------------------+------------+----------------+-----------------+--------------------------+---------+
+| Function name | Arity | Input types | Output type | Options class | Notes |
++==========================+============+================+=================+==========================+=========+
+| utf8_slice_codeunits | Unary | String-like | String-like | :struct:`SliceOptions` | \(1) |
++--------------------------+------------+----------------+-----------------+--------------------------+---------+
+
+* \(1) Slice string into a substring defined by (``start``, ``stop``, ``step``)
+ as given by :struct:`SliceOptions` where ``start`` and ``stop`` are measured
+ in codeunits. Null inputs emit null.
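+
+A minimal sketch of ``utf8_slice_codeunits`` with :struct:`SliceOptions`
+(illustrative inputs):
+
+.. code-block:: cpp
+
+   #include <iostream>
+
+   #include "arrow/api.h"
+   #include "arrow/compute/api.h"
+
+   arrow::Status SliceExample() {
+     arrow::StringBuilder builder;
+     ARROW_RETURN_NOT_OK(builder.AppendValues({"hello", "world"}));
+     ARROW_ASSIGN_OR_RAISE(auto strings, builder.Finish());
+
+     // Keep code units 1 (inclusive) through 4 (exclusive), with step 1.
+     arrow::compute::SliceOptions options(/*start=*/1, /*stop=*/4, /*step=*/1);
+     ARROW_ASSIGN_OR_RAISE(
+         arrow::Datum sliced,
+         arrow::compute::CallFunction("utf8_slice_codeunits", {strings},
+                                      &options));
+     std::cout << sliced.make_array()->ToString() << std::endl;
+     // ["ell", "orl"]
+     return arrow::Status::OK();
+   }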
+
+Containment tests
+~~~~~~~~~~~~~~~~~
+
++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+
+| Function name | Arity | Input types | Output type | Options class | Notes |
++=======================+=======+===================================+================+=================================+=======+
+| count_substring | Unary | String-like | Int32 or Int64 | :struct:`MatchSubstringOptions` | \(1) |
++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+
+| count_substring_regex | Unary | String-like | Int32 or Int64 | :struct:`MatchSubstringOptions` | \(1) |
++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+
+| ends_with | Unary | String-like | Boolean | :struct:`MatchSubstringOptions` | \(2) |
++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+
+| find_substring | Unary | Binary- and String-like | Int32 or Int64 | :struct:`MatchSubstringOptions` | \(3) |
++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+
+| find_substring_regex | Unary | Binary- and String-like | Int32 or Int64 | :struct:`MatchSubstringOptions` | \(3) |
++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+
+| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 | :struct:`SetLookupOptions` | \(4) |
+| | | Binary- and String-like | | | |
++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+
+| is_in | Unary | Boolean, Null, Numeric, Temporal, | Boolean | :struct:`SetLookupOptions` | \(5) |
+| | | Binary- and String-like | | | |
++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+
+| match_like | Unary | String-like | Boolean | :struct:`MatchSubstringOptions` | \(6) |
++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+
+| match_substring | Unary | String-like | Boolean | :struct:`MatchSubstringOptions` | \(7) |
++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+
+| match_substring_regex | Unary | String-like | Boolean | :struct:`MatchSubstringOptions` | \(8) |
++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+
+| starts_with | Unary | String-like | Boolean | :struct:`MatchSubstringOptions` | \(2) |
++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+
+
+* \(1) Output is the number of occurrences of
+ :member:`MatchSubstringOptions::pattern` in the corresponding input
+ string. Output type is Int32 for Binary/String, Int64
+ for LargeBinary/LargeString.
+
+* \(2) Output is true iff :member:`MatchSubstringOptions::pattern`
+ is a suffix/prefix of the corresponding input.
+
+* \(3) Output is the index of the first occurrence of
+ :member:`MatchSubstringOptions::pattern` in the corresponding input
+ string, otherwise -1. Output type is Int32 for Binary/String, Int64
+ for LargeBinary/LargeString.
+
+* \(4) Output is the index of the corresponding input element in
+ :member:`SetLookupOptions::value_set`, if found there. Otherwise,
+ output is null.
+
+* \(5) Output is true iff the corresponding input element is equal to one
+ of the elements in :member:`SetLookupOptions::value_set`.
+
+* \(6) Output is true iff the SQL-style LIKE pattern
+ :member:`MatchSubstringOptions::pattern` fully matches the
+ corresponding input element. That is, ``%`` will match any number of
+ characters, ``_`` will match exactly one character, and any other
+ character matches itself. To match a literal percent sign or
+ underscore, precede the character with a backslash.
+
+* \(7) Output is true iff :member:`MatchSubstringOptions::pattern`
+ is a substring of the corresponding input element.
+
+* \(8) Output is true iff :member:`MatchSubstringOptions::pattern`
+ matches the corresponding input element at any position.
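+
+A minimal sketch of ``is_in`` with :struct:`SetLookupOptions` (illustrative
+values):
+
+.. code-block:: cpp
+
+   #include <iostream>
+
+   #include "arrow/api.h"
+   #include "arrow/compute/api.h"
+
+   arrow::Status IsInExample() {
+     arrow::Int32Builder values_builder, set_builder;
+     ARROW_RETURN_NOT_OK(values_builder.AppendValues({1, 2, 3, 4}));
+     ARROW_RETURN_NOT_OK(set_builder.AppendValues({2, 4}));
+     ARROW_ASSIGN_OR_RAISE(auto values, values_builder.Finish());
+     ARROW_ASSIGN_OR_RAISE(auto value_set, set_builder.Finish());
+
+     // Each input element is looked up in value_set; skip_nulls controls how
+     // nulls in the value set are matched.
+     arrow::compute::SetLookupOptions options(value_set, /*skip_nulls=*/false);
+     ARROW_ASSIGN_OR_RAISE(
+         arrow::Datum mask,
+         arrow::compute::CallFunction("is_in", {values}, &options));
+     std::cout << mask.make_array()->ToString() << std::endl;
+     // [false, true, false, true]
+     return arrow::Status::OK();
+   }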
+
+Categorizations
+~~~~~~~~~~~~~~~
+
++-------------------+------------+---------------------+---------------------+------------------------+---------+
+| Function name | Arity | Input types | Output type | Options class | Notes |
++===================+============+=====================+=====================+========================+=========+
+| is_finite | Unary | Float, Double | Boolean | | \(1) |
++-------------------+------------+---------------------+---------------------+------------------------+---------+
+| is_inf | Unary | Float, Double | Boolean | | \(2) |
++-------------------+------------+---------------------+---------------------+------------------------+---------+
+| is_nan | Unary | Float, Double | Boolean | | \(3) |
++-------------------+------------+---------------------+---------------------+------------------------+---------+
+| is_null | Unary | Any | Boolean | :struct:`NullOptions` | \(4) |
++-------------------+------------+---------------------+---------------------+------------------------+---------+
+| is_valid | Unary | Any | Boolean | | \(5) |
++-------------------+------------+---------------------+---------------------+------------------------+---------+
+
+* \(1) Output is true iff the corresponding input element is finite (neither Infinity,
+ -Infinity, nor NaN).
+
+* \(2) Output is true iff the corresponding input element is Infinity/-Infinity.
+
+* \(3) Output is true iff the corresponding input element is NaN.
+
+* \(4) Output is true iff the corresponding input element is null. NaN values
+ can also be considered null by setting :member:`NullOptions::nan_is_null`.
+
+* \(5) Output is true iff the corresponding input element is non-null.
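+
+As a small illustration, here is a sketch of asking ``is_null`` to treat NaN
+values as nulls, assuming a Double array ``values`` built elsewhere::
+
+  std::shared_ptr<arrow::Array> values = ...;
+  arrow::compute::NullOptions options;
+  options.nan_is_null = true;  // also report NaN values as null
+  arrow::Result<arrow::Datum> maybe_mask =
+      arrow::compute::CallFunction("is_null", {values}, &options);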
+
+.. _cpp-compute-scalar-selections:
+
+Selecting / multiplexing
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+For each "row" of input values, these functions emit one of the input values,
+depending on a condition.
+
++------------------+------------+---------------------------------------------------+---------------------+---------+
+| Function name | Arity | Input types | Output type | Notes |
++==================+============+===================================================+=====================+=========+
+| case_when | Varargs | Struct of Boolean (Arg 0), Any (rest) | Input type | \(1) |
++------------------+------------+---------------------------------------------------+---------------------+---------+
+| choose | Varargs | Integral (Arg 0), Fixed-width/Binary-like (rest) | Input type | \(2) |
++------------------+------------+---------------------------------------------------+---------------------+---------+
+| coalesce | Varargs | Any | Input type | \(3) |
++------------------+------------+---------------------------------------------------+---------------------+---------+
+| if_else | Ternary | Boolean (Arg 0), Any (rest) | Input type | \(4) |
++------------------+------------+---------------------------------------------------+---------------------+---------+
+
+* \(1) This function acts like a SQL "case when" statement or switch-case. The
+ input is a "condition" value, which is a struct of Booleans, followed by the
+ values for each "branch". There must be either exactly one value argument for
+ each child of the condition struct, or one more value argument than children
+ (in which case we have an "else" or "default" value). The output is of the
+ same type as the value inputs; each row will be the corresponding value from
+ the first value datum for which the corresponding Boolean is true, or the
+ corresponding value from the "default" input, or null otherwise.
+
+ Note that currently, while all types are supported, dictionaries will be
+ unpacked.
+
+* \(2) The first input must be an integral type. The rest of the arguments can be
+ any type, but must all be the same type or promotable to a common type. Each
+ value of the first input (the 'index') is used as a zero-based index into the
+ remaining arguments (i.e. index 0 is the second argument, index 1 is the third
+ argument, etc.), and the value of the output for that row will be the
+ corresponding value of the selected input at that row. If the index is null,
+ then the output will also be null.
+
+* \(3) Each row of the output will be the corresponding value of the first
+ input which is non-null for that row, otherwise null.
+
+* \(4) First input must be a Boolean scalar or array. Second and third inputs
+  can be scalars or arrays and must be of the same type. Output is an array
+  (or scalar if all inputs are scalar) of the same type as the second/third
+  input. Nulls in the first input are propagated to the output; otherwise,
+  each output null comes from whichever of the second or third input is
+  selected by the corresponding first input value.
+
+ Also see: :ref:`replace_with_mask <cpp-compute-vector-structural-transforms>`.
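+
+A minimal sketch of ``if_else``, assuming a Boolean array ``cond`` and two
+Int64 arrays ``left`` and ``right`` of the same length built elsewhere::
+
+  std::shared_ptr<arrow::Array> cond = ...;   // Boolean
+  std::shared_ptr<arrow::Array> left = ...;   // Int64
+  std::shared_ptr<arrow::Array> right = ...;  // Int64
+  // For each row, pick the value from `left` where `cond` is true,
+  // otherwise the value from `right`
+  arrow::Result<arrow::Datum> maybe_result =
+      arrow::compute::CallFunction("if_else", {cond, left, right});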
+
+Structural transforms
+~~~~~~~~~~~~~~~~~~~~~
+
++---------------------+------------+-------------+------------------+------------------------------+--------+
+| Function name | Arity | Input types | Output type | Options class | Notes |
++=====================+============+=============+==================+==============================+========+
+| list_value_length | Unary | List-like | Int32 or Int64 | | \(1) |
++---------------------+------------+-------------+------------------+------------------------------+--------+
+| make_struct | Varargs | Any | Struct | :struct:`MakeStructOptions` | \(2) |
++---------------------+------------+-------------+------------------+------------------------------+--------+
+
+* \(1) Each output element is the length of the corresponding input element
+ (null if input is null). Output type is Int32 for List and FixedSizeList,
+ Int64 for LargeList.
+
+* \(2) The output struct's field types are the types of its arguments. The
+ field names are specified using an instance of :struct:`MakeStructOptions`.
+ The output shape will be scalar if all inputs are scalar, otherwise any
+ scalars will be broadcast to arrays.
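+
+As an example, here is a sketch of combining two columns into a Struct column
+with ``make_struct``, naming the fields through :struct:`MakeStructOptions`
+(the input arrays and field names are illustrative)::
+
+  std::shared_ptr<arrow::Array> ids = ...;
+  std::shared_ptr<arrow::Array> names = ...;
+  arrow::compute::MakeStructOptions options({"id", "name"});
+  arrow::Result<arrow::Datum> maybe_struct =
+      arrow::compute::CallFunction("make_struct", {ids, names}, &options);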
+
+Conversions
+~~~~~~~~~~~
+
+A general conversion function named ``cast`` is provided which accepts a large
+number of input and output types. The type to cast to can be passed in a
+:struct:`CastOptions` instance. As an alternative, the same service is
+provided by a concrete function :func:`~arrow::compute::Cast`.
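+
+For example, here is a sketch of casting an Int32 array to Int64 using the
+concrete :func:`~arrow::compute::Cast` function, assuming an input array
+obtained elsewhere::
+
+  std::shared_ptr<arrow::Array> int32_array = ...;
+  arrow::Result<arrow::Datum> maybe_casted =
+      arrow::compute::Cast(int32_array, arrow::int64());
+  if (maybe_casted.ok()) {
+    std::shared_ptr<arrow::Array> int64_array = maybe_casted->make_array();
+  }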
+
++-----------------+------------+--------------------+------------------+------------------------------+-------+
+| Function name | Arity | Input types | Output type | Options class | Notes |
++=================+============+====================+==================+==============================+=======+
+| cast | Unary | Many | Variable | :struct:`CastOptions` | |
++-----------------+------------+--------------------+------------------+------------------------------+-------+
+| strftime | Unary | Temporal | String | :struct:`StrftimeOptions` | \(1) |
++-----------------+------------+--------------------+------------------+------------------------------+-------+
+| strptime | Unary | String-like | Timestamp | :struct:`StrptimeOptions` | |
++-----------------+------------+--------------------+------------------+------------------------------+-------+
+
+The conversions available with ``cast`` are listed below. In all cases, a
+null input value is converted into a null output value.
+
+* \(1) The output precision of the ``%S`` (seconds) flag depends on the input
+  timestamp precision. Timestamps with second precision are represented as
+  integers, while timestamps with millisecond, microsecond and nanosecond
+  precision are represented as fixed-point numbers with 3, 6 and 9 decimal
+  places respectively. To obtain integer seconds, cast to a timestamp with
+  second resolution. The decimal point character follows the given locale.
+ See `detailed formatting documentation`_ for descriptions of other flags.
+
+.. _detailed formatting documentation: https://howardhinnant.github.io/date/date.html#to_stream_formatting
+
+**Truth value extraction**
+
++-----------------------------+------------------------------------+--------------+
+| Input type | Output type | Notes |
++=============================+====================================+==============+
+| Binary- and String-like | Boolean | \(1) |
++-----------------------------+------------------------------------+--------------+
+| Numeric | Boolean | \(2) |
++-----------------------------+------------------------------------+--------------+
+
+* \(1) Output is true iff the corresponding input value has non-zero length.
+
+* \(2) Output is true iff the corresponding input value is non-zero.
+
+**Same-kind conversion**
+
++-----------------------------+------------------------------------+--------------+
+| Input type | Output type | Notes |
++=============================+====================================+==============+
+| Int32 | 32-bit Temporal | \(1) |
++-----------------------------+------------------------------------+--------------+
+| Int64 | 64-bit Temporal | \(1) |
++-----------------------------+------------------------------------+--------------+
+| (Large)Binary | (Large)String | \(2) |
++-----------------------------+------------------------------------+--------------+
+| (Large)String | (Large)Binary | \(3) |
++-----------------------------+------------------------------------+--------------+
+| Numeric | Numeric | \(4) \(5) |
++-----------------------------+------------------------------------+--------------+
+| 32-bit Temporal | Int32 | \(1) |
++-----------------------------+------------------------------------+--------------+
+| 64-bit Temporal | Int64 | \(1) |
++-----------------------------+------------------------------------+--------------+
+| Temporal | Temporal | \(4) \(5) |
++-----------------------------+------------------------------------+--------------+
+
+* \(1) No-operation cast: the raw values are kept identical, only
+ the type is changed.
+
+* \(2) Validates the contents if :member:`CastOptions::allow_invalid_utf8`
+ is false.
+
+* \(3) No-operation cast: only the type is changed.
+
+* \(4) Overflow and truncation checks are enabled depending on
+ the given :struct:`CastOptions`.
+
+* \(5) Not all such casts have been implemented.
+
+**String representations**
+
++-----------------------------+------------------------------------+---------+
+| Input type | Output type | Notes |
++=============================+====================================+=========+
+| Boolean | String-like | |
++-----------------------------+------------------------------------+---------+
+| Numeric | String-like | |
++-----------------------------+------------------------------------+---------+
+
+**Generic conversions**
+
++-----------------------------+------------------------------------+---------+
+| Input type | Output type | Notes |
++=============================+====================================+=========+
+| Dictionary | Dictionary value type | \(1) |
++-----------------------------+------------------------------------+---------+
+| Extension | Extension storage type | |
++-----------------------------+------------------------------------+---------+
+| List-like | List-like | \(2) |
++-----------------------------+------------------------------------+---------+
+| Null | Any | |
++-----------------------------+------------------------------------+---------+
+
+* \(1) The dictionary indices are unchanged, the dictionary values are
+ cast from the input value type to the output value type (if a conversion
+ is available).
+
+* \(2) The list offsets are unchanged, the list values are cast from the
+ input value type to the output value type (if a conversion is
+ available).
+
+
+Temporal component extraction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+These functions extract datetime components (year, month, day, etc.) from temporal types.
+For timestamp inputs with a non-empty timezone, localized timestamp components are returned.
+
++--------------------+------------+-------------------+---------------+----------------------------+-------+
+| Function name | Arity | Input types | Output type | Options class | Notes |
++====================+============+===================+===============+============================+=======+
+| day | Unary | Temporal | Int64 | | |
++--------------------+------------+-------------------+---------------+----------------------------+-------+
+| day_of_week | Unary | Temporal | Int64 | :struct:`DayOfWeekOptions` | \(1) |
++--------------------+------------+-------------------+---------------+----------------------------+-------+
+| day_of_year | Unary | Temporal | Int64 | | |
++--------------------+------------+-------------------+---------------+----------------------------+-------+
+| hour | Unary | Timestamp, Time | Int64 | | |
++--------------------+------------+-------------------+---------------+----------------------------+-------+
+| iso_week | Unary | Temporal | Int64 | | \(2) |
++--------------------+------------+-------------------+---------------+----------------------------+-------+
+| iso_year | Unary | Temporal | Int64 | | \(2) |
++--------------------+------------+-------------------+---------------+----------------------------+-------+
+| iso_calendar | Unary | Temporal | Struct | | \(3) |
++--------------------+------------+-------------------+---------------+----------------------------+-------+
+| microsecond | Unary | Timestamp, Time | Int64 | | |
++--------------------+------------+-------------------+---------------+----------------------------+-------+
+| millisecond | Unary | Timestamp, Time | Int64 | | |
++--------------------+------------+-------------------+---------------+----------------------------+-------+
+| minute | Unary | Timestamp, Time | Int64 | | |
++--------------------+------------+-------------------+---------------+----------------------------+-------+
+| month | Unary | Temporal | Int64 | | |
++--------------------+------------+-------------------+---------------+----------------------------+-------+
+| nanosecond | Unary | Timestamp, Time | Int64 | | |
++--------------------+------------+-------------------+---------------+----------------------------+-------+
+| quarter | Unary | Temporal | Int64 | | |
++--------------------+------------+-------------------+---------------+----------------------------+-------+
+| second | Unary | Timestamp, Time | Int64 | | |
++--------------------+------------+-------------------+---------------+----------------------------+-------+
+| subsecond | Unary | Timestamp, Time | Double | | |
++--------------------+------------+-------------------+---------------+----------------------------+-------+
+| us_week | Unary | Temporal | Int64 | | \(4) |
++--------------------+------------+-------------------+---------------+----------------------------+-------+
+| week | Unary | Timestamp | Int64 | :struct:`WeekOptions` | \(5) |
++--------------------+------------+-------------------+---------------+----------------------------+-------+
+| year | Unary | Temporal | Int64 | | |
++--------------------+------------+-------------------+---------------+----------------------------+-------+
+
+* \(1) Outputs the number of the day of the week. By default, the week begins on
+  Monday (represented by 0) and ends on Sunday (represented by 6). Day numbering
+  starts with 0 or 1 depending on the :member:`DayOfWeekOptions::count_from_zero`
+  parameter. :member:`DayOfWeekOptions::week_start` sets the starting day of the
+  week using the ISO convention (Monday=1, Sunday=7); it is not affected by
+  :member:`DayOfWeekOptions::count_from_zero`.
+
+* \(2) The first ISO week has the majority (4 or more) of its days in January. The
+  ISO year starts with the first ISO week. ISO weeks start on Monday.
+ See `ISO 8601 week date definition`_ for more details.
+
+* \(3) Output is a ``{"iso_year": output type, "iso_week": output type, "iso_day_of_week": output type}`` Struct.
+
+* \(4) The first US week has the majority (4 or more) of its days in January. The
+  US year starts with the first US week. US weeks start on Sunday.
+
+* \(5) Returns the week number, with several configurable parameters.
+  If :member:`WeekOptions::week_starts_monday` is true, weeks start on Monday;
+  otherwise they start on Sunday.
+  If :member:`WeekOptions::count_from_zero` is true, dates from the current year that
+  fall into the last ISO week of the previous year are numbered as week 0; otherwise
+  they are numbered as week 52 or 53.
+  If :member:`WeekOptions::first_week_is_fully_in_year` is true, the first week
+  (week 1) must fall fully within January; otherwise a week that begins on
+  December 29, 30 or 31 is considered the first week of the new year.
+
+.. _ISO 8601 week date definition: https://en.wikipedia.org/wiki/ISO_week_date#First_week
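+
+For instance, here is a sketch of extracting the day of the week with
+one-based numbering, assuming a ``timestamps`` array built elsewhere::
+
+  std::shared_ptr<arrow::Array> timestamps = ...;
+  arrow::compute::DayOfWeekOptions options;
+  options.count_from_zero = false;  // number days starting from 1
+  options.week_start = 1;           // ISO convention: week starts on Monday
+  arrow::Result<arrow::Datum> maybe_day =
+      arrow::compute::CallFunction("day_of_week", {timestamps}, &options);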
+
+Temporal difference
+~~~~~~~~~~~~~~~~~~~
+
+These functions compute the difference between two timestamps in the
+specified unit. The difference is determined by the number of
+boundaries crossed, not the span of time. For example, the difference
+in days between 23:59:59 on one day and 00:00:01 on the next day is
+one day (since midnight was crossed), not zero days (even though less
+than 24 hours elapsed). Additionally, if the timestamp has a defined
+timezone, the difference is calculated in the local timezone. For
+instance, the difference in years between "2019-12-31 18:00:00-0500"
+and "2019-12-31 23:00:00-0500" is zero years, because the local year
+is the same, even though the UTC years would be different.
+
++---------------------------------+------------+-------------------+-----------------------+----------------------------+
+| Function name | Arity | Input types | Output type | Options class |
++=================================+============+===================+=======================+============================+
+| day_time_interval_between | Binary | Temporal | DayTime interval | |
++---------------------------------+------------+-------------------+-----------------------+----------------------------+
+| days_between | Binary | Timestamp, Date | Int64 | |
++---------------------------------+------------+-------------------+-----------------------+----------------------------+
+| hours_between | Binary | Temporal | Int64 | |
++---------------------------------+------------+-------------------+-----------------------+----------------------------+
+| microseconds_between | Binary | Temporal | Int64 | |
++---------------------------------+------------+-------------------+-----------------------+----------------------------+
+| milliseconds_between | Binary | Temporal | Int64 | |
++---------------------------------+------------+-------------------+-----------------------+----------------------------+
+| minutes_between | Binary | Temporal | Int64 | |
++---------------------------------+------------+-------------------+-----------------------+----------------------------+
+| month_day_nano_interval_between | Binary | Temporal | MonthDayNano interval | |
++---------------------------------+------------+-------------------+-----------------------+----------------------------+
+| month_interval_between | Binary | Timestamp, Date | Month interval | |
++---------------------------------+------------+-------------------+-----------------------+----------------------------+
+| nanoseconds_between | Binary | Temporal | Int64 | |
++---------------------------------+------------+-------------------+-----------------------+----------------------------+
+| quarters_between | Binary | Timestamp, Date | Int64 | |
++---------------------------------+------------+-------------------+-----------------------+----------------------------+
+| seconds_between | Binary | Temporal | Int64 | |
++---------------------------------+------------+-------------------+-----------------------+----------------------------+
+| weeks_between | Binary | Timestamp, Date | Int64 | :struct:`DayOfWeekOptions` |
++---------------------------------+------------+-------------------+-----------------------+----------------------------+
+| years_between | Binary | Timestamp, Date | Int64 | |
++---------------------------------+------------+-------------------+-----------------------+----------------------------+
+
+Timezone handling
+~~~~~~~~~~~~~~~~~
+
+This function is meant to be used when an external system produces
+"timezone-naive" timestamps which need to be converted to "timezone-aware"
+timestamps (see for example the `definition
+<https://docs.python.org/3/library/datetime.html#aware-and-naive-objects>`__
+in the Python documentation).
+
+Input timestamps are assumed to be relative to the timezone given in
+:member:`AssumeTimezoneOptions::timezone`. They are converted to
+UTC-relative timestamps with the timezone metadata set to the above value.
+An error is returned if the timestamps already have the timezone metadata set.
+
++--------------------+------------+-------------------+---------------+----------------------------------+-------+
+| Function name | Arity | Input types | Output type | Options class | Notes |
++====================+============+===================+===============+==================================+=======+
+| assume_timezone | Unary | Timestamp | Timestamp | :struct:`AssumeTimezoneOptions` | \(1) |
++--------------------+------------+-------------------+---------------+----------------------------------+-------+
+
+* \(1) In addition to the timezone value, :struct:`AssumeTimezoneOptions`
+ allows choosing the behaviour when a timestamp is ambiguous or nonexistent
+ in the given timezone (because of DST shifts).
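+
+A sketch of marking naive timestamps as belonging to a given timezone,
+assuming :struct:`AssumeTimezoneOptions` can be constructed from the timezone
+name and an input array built elsewhere::
+
+  std::shared_ptr<arrow::Array> naive_timestamps = ...;
+  arrow::compute::AssumeTimezoneOptions options("Europe/Paris");
+  arrow::Result<arrow::Datum> maybe_aware =
+      arrow::compute::CallFunction("assume_timezone", {naive_timestamps},
+                                   &options);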
+
+
+Array-wise ("vector") functions
+-------------------------------
+
+Associative transforms
+~~~~~~~~~~~~~~~~~~~~~~
+
++-------------------+-------+-----------------------------------+-------------+-------+
+| Function name | Arity | Input types | Output type | Notes |
++===================+=======+===================================+=============+=======+
+| dictionary_encode | Unary | Boolean, Null, Numeric, | Dictionary | \(1) |
+| | | Temporal, Binary- and String-like | | |
++-------------------+-------+-----------------------------------+-------------+-------+
+| unique | Unary | Boolean, Null, Numeric, | Input type | \(2) |
+| | | Temporal, Binary- and String-like | | |
++-------------------+-------+-----------------------------------+-------------+-------+
+| value_counts | Unary | Boolean, Null, Numeric, | Input type | \(3) |
+| | | Temporal, Binary- and String-like | | |
++-------------------+-------+-----------------------------------+-------------+-------+
+
+* \(1) Output is ``Dictionary(Int32, input type)``.
+
+* \(2) Duplicates are removed from the output while the original order is
+ maintained.
+
+* \(3) Output is a ``{"values": input type, "counts": Int64}`` Struct.
+ Each output element corresponds to a unique value in the input, along
+ with the number of times this value has appeared.
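+
+For example, here is a sketch of computing the distinct values of an array
+along with their counts, assuming an input array built elsewhere::
+
+  std::shared_ptr<arrow::Array> values = ...;
+  arrow::Result<arrow::Datum> maybe_counts =
+      arrow::compute::CallFunction("value_counts", {values});
+  // On success, the result is a Struct with "values" and "counts" fields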
+
+Selections
+~~~~~~~~~~
+
+These functions select and return a subset of their input.
+
++---------------+--------+--------------+--------------+--------------+-------------------------+-----------+
+| Function name | Arity | Input type 1 | Input type 2 | Output type | Options class | Notes |
++===============+========+==============+==============+==============+=========================+===========+
+| array_filter | Binary | Any | Boolean | Input type 1 | :struct:`FilterOptions` | \(1) \(3) |
++---------------+--------+--------------+--------------+--------------+-------------------------+-----------+
+| array_take    | Binary | Any          | Integer      | Input type 1 | :struct:`TakeOptions`   | \(1) \(4) |
++---------------+--------+--------------+--------------+--------------+-------------------------+-----------+
+| drop_null | Unary | Any | - | Input type 1 | | \(1) \(2) |
++---------------+--------+--------------+--------------+--------------+-------------------------+-----------+
+| filter | Binary | Any | Boolean | Input type 1 | :struct:`FilterOptions` | \(1) \(3) |
++---------------+--------+--------------+--------------+--------------+-------------------------+-----------+
+| take | Binary | Any | Integer | Input type 1 | :struct:`TakeOptions` | \(1) \(4) |
++---------------+--------+--------------+--------------+--------------+-------------------------+-----------+
+
+* \(1) Sparse unions are unsupported.
+
+* \(2) Each element in the input is appended to the output iff it is non-null.
+ If the input is a record batch or table, any null value in a column drops
+ the entire row.
+
+* \(3) Each element in input 1 (the values) is appended to the output iff
+ the corresponding element in input 2 (the filter) is true. How
+ nulls in the filter are handled can be configured using FilterOptions.
+
+* \(4) For each element *i* in input 2 (the indices), the *i*'th element
+ in input 1 (the values) is appended to the output.
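+
+A sketch of ``filter`` and ``take``, assuming the value, mask and index arrays
+have been built elsewhere::
+
+  std::shared_ptr<arrow::Array> values = ...;
+  std::shared_ptr<arrow::Array> mask = ...;     // Boolean, same length
+  std::shared_ptr<arrow::Array> indices = ...;  // Integer
+
+  // Keep only the rows where `mask` is true
+  arrow::Result<arrow::Datum> maybe_filtered =
+      arrow::compute::CallFunction("filter", {values, mask});
+  // Gather the rows at the given indices
+  arrow::Result<arrow::Datum> maybe_taken =
+      arrow::compute::CallFunction("take", {values, indices});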
+
+Sorts and partitions
+~~~~~~~~~~~~~~~~~~~~
+
+By default, in these functions, nulls are considered greater than any other value
+(they will be sorted or partitioned at the end of the array). Floating-point
+NaN values are considered greater than any other non-null value, but smaller
+than nulls. This behaviour can be changed using the ``null_placement`` setting
+in the respective option classes.
+
+.. note::
+ Binary- and String-like inputs are ordered lexicographically as bytestrings,
+ even for String types.
+
++-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
+| Function name | Arity | Input types | Output type | Options class | Notes |
++=======================+============+=========================================================+===================+================================+================+
+| array_sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`ArraySortOptions` | \(1) \(2) |
++-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
+| partition_nth_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`PartitionNthOptions` | \(3) |
++-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
+| select_k_unstable | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SelectKOptions` | \(4) \(5) |
++-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
+| sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SortOptions` | \(1) \(4) |
++-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+
+
+* \(1) The output is an array of indices into the input, that define a
+ stable sort of the input.
+
+* \(2) The input must be an array. The default order is ascending.
+
+* \(3) The output is an array of indices into the input array, that define
+ a partial non-stable sort such that the *N*'th index points to the *N*'th
+ element in sorted order, and all indices before the *N*'th point to
+  elements less than or equal to elements at or after the *N*'th (similar to
+ :func:`std::nth_element`). *N* is given in
+ :member:`PartitionNthOptions::pivot`.
+
+* \(4) The input can be an array, chunked array, record batch or
+ table. If the input is a record batch or table, one or more sort
+ keys must be specified.
+
+* \(5) The output is an array of indices into the input, that define a
+ non-stable sort of the input.
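+
+For example, here is a sketch of sorting an array in descending order through
+its indices, assuming an input array built elsewhere::
+
+  std::shared_ptr<arrow::Array> values = ...;
+  arrow::compute::ArraySortOptions options;
+  options.order = arrow::compute::SortOrder::Descending;
+  arrow::Result<arrow::Datum> maybe_indices =
+      arrow::compute::CallFunction("array_sort_indices", {values}, &options);
+  // On success, the UInt64 indices can be passed to "take" to materialize
+  // the sorted array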
+
+.. _cpp-compute-vector-structural-transforms:
+
+Structural transforms
+~~~~~~~~~~~~~~~~~~~~~
+
++---------------------+------------+-------------------------------------+------------------+--------+
+| Function name | Arity | Input types | Output type | Notes |
++=====================+============+=====================================+==================+========+
+| list_element | Binary | List-like (Arg 0), Integral (Arg 1) | List value type | \(1) |
++---------------------+------------+-------------------------------------+------------------+--------+
+| list_flatten | Unary | List-like | List value type | \(2) |
++---------------------+------------+-------------------------------------+------------------+--------+
+| list_parent_indices | Unary | List-like | Int32 or Int64 | \(3) |
++---------------------+------------+-------------------------------------+------------------+--------+
+
+* \(1) Output is an array of the same length as the input list array. The
+ output values are the values at the specified index of each child list.
+
+* \(2) The top level of nesting is removed: all values in the list child array,
+ including nulls, are appended to the output. However, nulls in the parent
+ list array are discarded.
+
+* \(3) For each value in the list child array, the index at which it is found
+ in the list array is appended to the output. Nulls in the parent list array
+ are discarded. Output type is Int32 for List and FixedSizeList, Int64 for
+ LargeList.
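+
+A sketch of flattening a list array and recovering the parent index of each
+flattened value, assuming a list array built elsewhere::
+
+  std::shared_ptr<arrow::Array> list_array = ...;
+  arrow::Result<arrow::Datum> maybe_flat =
+      arrow::compute::CallFunction("list_flatten", {list_array});
+  arrow::Result<arrow::Datum> maybe_parents =
+      arrow::compute::CallFunction("list_parent_indices", {list_array});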
+
+These functions create a copy of the first input with some elements
+replaced, based on the remaining inputs.
+
++--------------------------+------------+-----------------------+--------------+--------------+--------------+-------+
+| Function name | Arity | Input type 1 | Input type 2 | Input type 3 | Output type | Notes |
++==========================+============+=======================+==============+==============+==============+=======+
+| replace_with_mask | Ternary | Fixed-width or binary | Boolean | Input type 1 | Input type 1 | \(1) |
++--------------------------+------------+-----------------------+--------------+--------------+--------------+-------+
+
+* \(1) Each element in input 1 for which the corresponding Boolean in input 2
+ is true is replaced with the next value from input 3. A null in input 2
+ results in a corresponding null in the output.
+
+ Also see: :ref:`if_else <cpp-compute-scalar-selections>`.
diff --git a/src/arrow/docs/source/cpp/conventions.rst b/src/arrow/docs/source/cpp/conventions.rst
new file mode 100644
index 000000000..218d028ee
--- /dev/null
+++ b/src/arrow/docs/source/cpp/conventions.rst
@@ -0,0 +1,107 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+.. cpp:namespace:: arrow
+
+Conventions
+===========
+
+The Arrow C++ API follows a few simple guidelines. As with many rules,
+there may be exceptions.
+
+Language version
+----------------
+
+Arrow is C++11-compatible. A few backports are used for newer functionality,
+for example the :class:`std::string_view` class.
+
+Namespacing
+-----------
+
+The entire Arrow API (except macros) is namespaced inside the ``arrow`` namespace,
+and nested namespaces thereof.
+
+Safe pointers
+-------------
+
+Arrow objects are usually passed and stored using safe pointers -- most of
+the time :class:`std::shared_ptr` but sometimes also :class:`std::unique_ptr`.
+
+Immutability
+------------
+
+Many Arrow objects are immutable: once constructed, their logical properties
+cannot change anymore. This makes it possible to use them in multi-threaded
+scenarios without requiring tedious and error-prone synchronization.
+
+There are obvious exceptions to this, such as IO objects or mutable data buffers.
+
+Error reporting
+---------------
+
+Most APIs indicate a successful or erroneous outcome by returning a
+:class:`arrow::Status` instance. Arrow doesn't throw exceptions of its
+own, but third-party exceptions might propagate through, especially
+:class:`std::bad_alloc` (but Arrow doesn't use the standard allocators for
+large data).
+
+When an API can return either an error code or a successful value, it usually
+does so by returning the template class
+:class:`arrow::Result <template\<class T\> arrow::Result>`. However,
+some APIs (usually deprecated) return :class:`arrow::Status` and pass the
+result value as an out-pointer parameter.
+
+Here is an example of checking the outcome of an operation::
+
+ const int64_t buffer_size = 4096;
+
+  auto maybe_buffer = arrow::AllocateBuffer(buffer_size);
+  if (!maybe_buffer.ok()) {
+    // ... handle allocation error
+  } else {
+    std::unique_ptr<arrow::Buffer>& buffer = *maybe_buffer;
+ // ... use allocated buffer
+ }
+
+If the caller function itself returns a :class:`arrow::Result` or
+:class:`arrow::Status` and wants to propagate any non-successful outcome, two
+convenience macros are available:
+
+* :c:macro:`ARROW_RETURN_NOT_OK` takes a :class:`arrow::Status` parameter
+ and returns it if not successful.
+
+* :c:macro:`ARROW_ASSIGN_OR_RAISE` takes a :class:`arrow::Result` parameter,
+  assigns its result to an *lvalue* if successful, or returns the corresponding
+ :class:`arrow::Status` on error.
+
+For example::
+
+ arrow::Status DoSomething() {
+ const int64_t buffer_size = 4096;
+ std::shared_ptr<arrow::Buffer> buffer;
+ ARROW_ASSIGN_OR_RAISE(buffer, arrow::AllocateBuffer(buffer_size));
+ // ... allocation successful, do something with buffer below
+
+ // return success at the end
+    return arrow::Status::OK();
+ }
+
+.. seealso::
+ :doc:`API reference for error reporting <api/support>`
diff --git a/src/arrow/docs/source/cpp/csv.rst b/src/arrow/docs/source/cpp/csv.rst
new file mode 100644
index 000000000..42b5af67d
--- /dev/null
+++ b/src/arrow/docs/source/cpp/csv.rst
@@ -0,0 +1,220 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+.. cpp:namespace:: arrow::csv
+
+=============================
+Reading and Writing CSV files
+=============================
+
+Arrow provides a fast CSV reader allowing ingestion of external data
+as Arrow tables.
+
+.. seealso::
+ :ref:`CSV reader/writer API reference <cpp-api-csv>`.
+
+Basic usage
+===========
+
+A CSV file is read from a :class:`~arrow::io::InputStream`.
+
+.. code-block:: cpp
+
+ #include "arrow/csv/api.h"
+
+ {
+ // ...
+ arrow::io::IOContext io_context = arrow::io::default_io_context();
+ std::shared_ptr<arrow::io::InputStream> input = ...;
+
+ auto read_options = arrow::csv::ReadOptions::Defaults();
+ auto parse_options = arrow::csv::ParseOptions::Defaults();
+ auto convert_options = arrow::csv::ConvertOptions::Defaults();
+
+ // Instantiate TableReader from input stream and options
+ auto maybe_reader =
+ arrow::csv::TableReader::Make(io_context,
+ input,
+ read_options,
+ parse_options,
+ convert_options);
+ if (!maybe_reader.ok()) {
+ // Handle TableReader instantiation error...
+ }
+ std::shared_ptr<arrow::csv::TableReader> reader = *maybe_reader;
+
+ // Read table from CSV file
+ auto maybe_table = reader->Read();
+ if (!maybe_table.ok()) {
+ // Handle CSV read error
+ // (for example a CSV syntax error or failed type conversion)
+ }
+ std::shared_ptr<arrow::Table> table = *maybe_table;
+ }
+
+A CSV file is written to a :class:`~arrow::io::OutputStream`.
+
+.. code-block:: cpp
+
+ #include <arrow/csv/api.h>
+ {
+ // Oneshot write
+ // ...
+ std::shared_ptr<arrow::io::OutputStream> output = ...;
+ auto write_options = arrow::csv::WriteOptions::Defaults();
+    if (!arrow::csv::WriteCSV(table, write_options, output.get()).ok()) {
+ // Handle writer error...
+ }
+ }
+ {
+ // Write incrementally
+ // ...
+ std::shared_ptr<arrow::io::OutputStream> output = ...;
+ auto write_options = arrow::csv::WriteOptions::Defaults();
+ auto maybe_writer = arrow::csv::MakeCSVWriter(output, schema, write_options);
+ if (!maybe_writer.ok()) {
+ // Handle writer instantiation error...
+ }
+ std::shared_ptr<arrow::ipc::RecordBatchWriter> writer = *maybe_writer;
+
+ // Write batches...
+ if (!writer->WriteRecordBatch(*batch).ok()) {
+ // Handle write error...
+ }
+
+ if (!writer->Close().ok()) {
+ // Handle close error...
+ }
+ if (!output->Close().ok()) {
+ // Handle file close error...
+ }
+ }
+
+.. note:: The writer does not yet support all Arrow types.
+
+Column names
+============
+
+There are three possible ways to infer column names from the CSV file:
+
+* By default, the column names are read from the first row in the CSV file
+* If :member:`ReadOptions::column_names` is set, it forces the column
+ names in the table to these values (the first row in the CSV file is
+ read as data)
+* If :member:`ReadOptions::autogenerate_column_names` is true, column names
+ will be autogenerated with the pattern "f0", "f1"... (the first row in the
+ CSV file is read as data)
+
+Column selection
+================
+
+By default, Arrow reads all columns in the CSV file. You can narrow the
+selection of columns with the :member:`ConvertOptions::include_columns`
+option. If some columns in :member:`ConvertOptions::include_columns`
+are missing from the CSV file, an error will be emitted unless
+:member:`ConvertOptions::include_missing_columns` is true, in which case
+the missing columns are assumed to contain all-null values.
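+
+For example, here is a sketch of restricting the read to two (illustrative)
+columns while tolerating their absence from the CSV file:
+
+.. code-block:: cpp
+
+   auto convert_options = arrow::csv::ConvertOptions::Defaults();
+   convert_options.include_columns = {"name", "age"};
+   // Produce all-null columns instead of erroring out when a column is missing
+   convert_options.include_missing_columns = true;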
+
+Interaction with column names
+-----------------------------
+
+If both :member:`ReadOptions::column_names` and
+:member:`ConvertOptions::include_columns` are specified,
+the :member:`ReadOptions::column_names` are assumed to map to CSV columns,
+and :member:`ConvertOptions::include_columns` is a subset of those column
+names that will be part of the Arrow Table.
+
+Data types
+==========
+
+By default, the CSV reader infers the most appropriate data type for each
+column. Type inference considers the following data types, in order:
+
+* Null
+* Int64
+* Boolean
+* Date32
+* Time32 (with seconds unit)
+* Timestamp (with seconds unit)
+* Timestamp (with nanoseconds unit)
+* Float64
+* Dictionary<String> (if :member:`ConvertOptions::auto_dict_encode` is true)
+* Dictionary<Binary> (if :member:`ConvertOptions::auto_dict_encode` is true)
+* String
+* Binary
+
+It is possible to override type inference for select columns by setting
+the :member:`ConvertOptions::column_types` option. Explicit data types
+can be chosen from the following list:
+
+* Null
+* All Integer types
+* Float32 and Float64
+* Decimal128
+* Boolean
+* Date32 and Date64
+* Time32 and Time64
+* Timestamp
+* Binary and Large Binary
+* String and Large String (with optional UTF8 input validation)
+* Fixed-Size Binary
+* Dictionary with index type Int32 and value type one of the following:
+ Binary, String, LargeBinary, LargeString, Int32, UInt32, Int64, UInt64,
+ Float32, Float64, Decimal128
+
+Other data types do not support conversion from CSV values and will error out.
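+
+For example, here is a sketch of overriding the types of two (illustrative)
+columns instead of relying on inference:
+
+.. code-block:: cpp
+
+   auto convert_options = arrow::csv::ConvertOptions::Defaults();
+   // Parse "total" as 64-bit floats and "created_at" as second-resolution timestamps
+   convert_options.column_types["total"] = arrow::float64();
+   convert_options.column_types["created_at"] =
+       arrow::timestamp(arrow::TimeUnit::SECOND);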
+
+Dictionary inference
+--------------------
+
+If type inference is enabled and :member:`ConvertOptions::auto_dict_encode`
+is true, the CSV reader first tries to convert string-like columns to a
+dictionary-encoded string-like array. It switches to a plain string-like
+array when the threshold in :member:`ConvertOptions::auto_dict_max_cardinality`
+is reached.
+
+Nulls
+-----
+
+Null values are recognized from the spellings stored in
+:member:`ConvertOptions::null_values`. The :func:`ConvertOptions::Defaults`
+factory method will initialize a number of conventional null spellings such
+as ``N/A``.
+
+Character encoding
+------------------
+
+CSV files are expected to be encoded in UTF8. However, non-UTF8 data
+is accepted for Binary columns.
+
+Write Options
+=============
+
+The format of written CSV files can be customized via :class:`~arrow::csv::WriteOptions`.
+Currently few options are available; more will be added in future releases.
+
+Performance
+===========
+
+By default, the CSV reader will parallelize reads in order to exploit all
+CPU cores on your machine. You can change this setting in
+:member:`ReadOptions::use_threads`. A reasonable expectation is at least
+100 MB/s per core on a performant desktop or laptop computer (measured in
+source CSV bytes, not target Arrow data bytes).
diff --git a/src/arrow/docs/source/cpp/dataset.rst b/src/arrow/docs/source/cpp/dataset.rst
new file mode 100644
index 000000000..e7161a458
--- /dev/null
+++ b/src/arrow/docs/source/cpp/dataset.rst
@@ -0,0 +1,417 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+================
+Tabular Datasets
+================
+
+.. seealso::
+ :doc:`Dataset API reference <api/dataset>`
+
+.. warning::
+
+ The ``arrow::dataset`` namespace is experimental, and a stable API
+ is not yet guaranteed.
+
+The Arrow Datasets library provides functionality to efficiently work with
+tabular, potentially larger than memory, and multi-file datasets. This includes:
+
+* A unified interface that supports different sources and file formats
+ (currently, Parquet, ORC, Feather / Arrow IPC, and CSV files) and different
+ file systems (local, cloud).
+* Discovery of sources (crawling directories, handling partitioned datasets with
+ various partitioning schemes, basic schema normalization, ...)
+* Optimized reading with predicate pushdown (filtering rows), projection
+ (selecting and deriving columns), and optionally parallel reading.
+
+The goal is to expand support to other file formats and data sources
+(e.g. database connections) in the future.
+
+Reading Datasets
+----------------
+
+For the examples below, let's create a small dataset consisting
+of a directory with two parquet files:
+
+.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc
+ :language: cpp
+ :start-after: (Doc section: Reading Datasets)
+ :end-before: (Doc section: Reading Datasets)
+ :linenos:
+ :lineno-match:
+
+(See the full example at bottom: :ref:`cpp-dataset-full-example`.)
+
+Dataset discovery
+~~~~~~~~~~~~~~~~~
+
+A :class:`arrow::dataset::Dataset` object can be created using the various
+:class:`arrow::dataset::DatasetFactory` objects. Here, we'll use the
+:class:`arrow::dataset::FileSystemDatasetFactory`, which can create a dataset
+given a base directory path:
+
+.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc
+ :language: cpp
+ :start-after: (Doc section: Dataset discovery)
+ :end-before: (Doc section: Dataset discovery)
+ :emphasize-lines: 6-11
+ :linenos:
+ :lineno-match:
+
+We're also passing the filesystem to use and the file format to use for reading.
+This lets us choose between (for example) reading local files or files in Amazon
+S3, or between Parquet and CSV.
+
+In addition to searching a base directory, we can list file paths manually.
+
+Creating a :class:`arrow::dataset::Dataset` does not begin reading the data
+itself. It only crawls the directory to find all the files (if needed), which can
+be retrieved with :func:`arrow::dataset::FileSystemDataset::files`:
+
+.. code-block:: cpp
+
+ // Print out the files crawled (only for FileSystemDataset)
+ for (const auto& filename : dataset->files()) {
+ std::cout << filename << std::endl;
+ }
+
+…and infers the dataset's schema (by default from the first file):
+
+.. code-block:: cpp
+
+ std::cout << dataset->schema()->ToString() << std::endl;
+
+Using the :func:`arrow::dataset::Dataset::NewScan` method, we can build a
+:class:`arrow::dataset::Scanner` and read the dataset (or a portion of it) into
+a :class:`arrow::Table` with the :func:`arrow::dataset::Scanner::ToTable`
+method:
+
+.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc
+ :language: cpp
+ :start-after: (Doc section: Dataset discovery)
+ :end-before: (Doc section: Dataset discovery)
+ :emphasize-lines: 16-19
+ :linenos:
+ :lineno-match:
+
+.. TODO: iterative loading not documented pending API changes
+.. note:: Depending on the size of your dataset, this can require a lot of
+ memory; see :ref:`cpp-dataset-filtering-data` below on
+ filtering/projecting.
+
+Reading different file formats
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The above examples use Parquet files on local disk, but the Dataset API
+provides a consistent interface across multiple file formats and filesystems.
+(See :ref:`cpp-dataset-cloud-storage` for more information on the latter.)
+Currently, Parquet, ORC, Feather / Arrow IPC, and CSV file formats are
+supported; more formats are planned in the future.
+
+If we save the table as Feather files instead of Parquet files:
+
+.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc
+ :language: cpp
+ :start-after: (Doc section: Reading different file formats)
+ :end-before: (Doc section: Reading different file formats)
+ :linenos:
+ :lineno-match:
+
+…then we can read the Feather file by passing an :class:`arrow::dataset::IpcFileFormat`:
+
+.. code-block:: cpp
+
+   auto format = std::make_shared<ds::IpcFileFormat>();
+ // ...
+ auto factory = ds::FileSystemDatasetFactory::Make(filesystem, selector, format, options)
+ .ValueOrDie();
+
+Customizing file formats
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+:class:`arrow::dataset::FileFormat` objects have properties that control how
+files are read. For example::
+
+ auto format = std::make_shared<ds::ParquetFileFormat>();
+ format->reader_options.dict_columns.insert("a");
+
+Will configure column ``"a"`` to be dictionary-encoded when read. Similarly,
+setting :member:`arrow::dataset::CsvFileFormat::parse_options` lets us change
+things like reading comma-separated or tab-separated data.
+
+Additionally, passing an :class:`arrow::dataset::FragmentScanOptions` to
+:func:`arrow::dataset::ScannerBuilder::FragmentScanOptions` offers fine-grained
+control over data scanning. For example, for CSV files, we can change what values
+are converted into Boolean true and false at scan time.
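+
+As a sketch of the latter (assuming the CSV-specific
+:class:`arrow::dataset::CsvFragmentScanOptions` subclass with a
+``convert_options`` member, and a ``scanner_builder`` obtained from a CSV
+dataset)::
+
+  auto csv_scan_options = std::make_shared<ds::CsvFragmentScanOptions>();
+  // Treat "Y"/"N" as Boolean true/false when scanning
+  csv_scan_options->convert_options.true_values = {"Y"};
+  csv_scan_options->convert_options.false_values = {"N"};
+  // Attach the options to the scanner being built (returns a Status)
+  auto status = scanner_builder->FragmentScanOptions(csv_scan_options);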
+
+.. _cpp-dataset-filtering-data:
+
+Filtering data
+--------------
+
+So far, we've been reading the entire dataset, but if we need only a subset of the
+data, this can waste time or memory reading data we don't need. The
+:class:`arrow::dataset::Scanner` offers control over what data to read.
+
+In this snippet, we use :func:`arrow::dataset::ScannerBuilder::Project` to select
+which columns to read:
+
+.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc
+ :language: cpp
+ :start-after: (Doc section: Filtering data)
+ :end-before: (Doc section: Filtering data)
+ :emphasize-lines: 16
+ :linenos:
+ :lineno-match:
+
+Some formats, such as Parquet, can reduce I/O costs here by reading only the
+specified columns from the filesystem.
+
+A filter can be provided with :func:`arrow::dataset::ScannerBuilder::Filter`, so
+that rows which do not match the filter predicate will not be included in the
+returned table. Again, some formats, such as Parquet, can use this filter to
+reduce the amount of I/O needed.
+
+.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc
+ :language: cpp
+ :start-after: (Doc section: Filtering data)
+ :end-before: (Doc section: Filtering data)
+ :emphasize-lines: 17
+ :linenos:
+ :lineno-match:
+
+.. TODO Expressions not documented pending renamespacing
+
+Projecting columns
+------------------
+
+In addition to selecting columns, :func:`arrow::dataset::ScannerBuilder::Project`
+can also be used for more complex projections, such as renaming columns, casting
+them to other types, and even deriving new columns based on evaluating
+expressions.
+
+In this case, we pass a vector of expressions used to construct column values
+and a vector of names for the columns:
+
+.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc
+ :language: cpp
+ :start-after: (Doc section: Projecting columns)
+ :end-before: (Doc section: Projecting columns)
+ :emphasize-lines: 18-28
+ :linenos:
+ :lineno-match:
+
+This also determines the column selection; only the given columns will be
+present in the resulting table. If you want to include a derived column in
+*addition* to the existing columns, you can build up the expressions from the
+dataset schema:
+
+.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc
+ :language: cpp
+ :start-after: (Doc section: Projecting columns #2)
+ :end-before: (Doc section: Projecting columns #2)
+ :emphasize-lines: 17-27
+ :linenos:
+ :lineno-match:
+
+.. note:: When combining filters and projections, Arrow will determine all
+ necessary columns to read. For instance, if you filter on a column that
+ isn't ultimately selected, Arrow will still read the column to evaluate
+ the filter.
+
+Reading and writing partitioned data
+------------------------------------
+
+So far, we've been working with datasets consisting of flat directories with
+files. Oftentimes, a dataset will have one or more columns that are frequently
+filtered on. Instead of having to read and then filter the data, by organizing the
+files into a nested directory structure, we can define a partitioned dataset,
+where sub-directory names hold information about which subset of the data is
+stored in that directory. Then, we can more efficiently filter data by using that
+information to avoid loading files that don't match the filter.
+
+For example, a dataset partitioned by year and month may have the following layout:
+
+.. code-block:: text
+
+ dataset_name/
+ year=2007/
+ month=01/
+ data0.parquet
+ data1.parquet
+ ...
+ month=02/
+ data0.parquet
+ data1.parquet
+ ...
+ month=03/
+ ...
+ year=2008/
+ month=01/
+ ...
+ ...
+
+The above partitioning scheme uses "/key=value/" directory names, as found in
+Apache Hive. Under this convention, the file at
+``dataset_name/year=2007/month=01/data0.parquet`` contains only data for which
+``year == 2007`` and ``month == 01``.
+
+Let's create a small partitioned dataset. For this, we'll use Arrow's dataset
+writing functionality.
+
+.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc
+ :language: cpp
+ :start-after: (Doc section: Reading and writing partitioned data)
+ :end-before: (Doc section: Reading and writing partitioned data)
+ :emphasize-lines: 25-42
+ :linenos:
+ :lineno-match:
+
+The above created a directory with two subdirectories ("part=a" and "part=b"),
+and the Parquet files written in those directories no longer include the "part"
+column.
+
+Reading this dataset, we now specify that the dataset should use a Hive-like
+partitioning scheme:
+
+.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc
+ :language: cpp
+ :start-after: (Doc section: Reading and writing partitioned data #2)
+ :end-before: (Doc section: Reading and writing partitioned data #2)
+ :emphasize-lines: 7,9-11
+ :linenos:
+ :lineno-match:
+
+Although the partition fields are not included in the actual Parquet files,
+they will be added back to the resulting table when scanning this dataset:
+
+.. code-block:: text
+
+ $ ./debug/dataset_documentation_example file:///tmp parquet_hive partitioned
+ Found fragment: /tmp/parquet_dataset/part=a/part0.parquet
+ Partition expression: (part == "a")
+ Found fragment: /tmp/parquet_dataset/part=b/part1.parquet
+ Partition expression: (part == "b")
+ Read 20 rows
+ a: int64
+ -- field metadata --
+ PARQUET:field_id: '1'
+ b: double
+ -- field metadata --
+ PARQUET:field_id: '2'
+ c: int64
+ -- field metadata --
+ PARQUET:field_id: '3'
+ part: string
+ ----
+ # snip...
+
+We can now filter on the partition keys, which avoids loading files
+altogether if they do not match the filter:
+
+.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc
+ :language: cpp
+ :start-after: (Doc section: Reading and writing partitioned data #3)
+ :end-before: (Doc section: Reading and writing partitioned data #3)
+ :emphasize-lines: 15-18
+ :linenos:
+ :lineno-match:
+
+Different partitioning schemes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The above example uses a Hive-like directory scheme, such as "/year=2009/month=11/day=15".
+We specified this by passing the Hive partitioning factory. In this case, the types of
+the partition keys are inferred from the file paths.
+
+It is also possible to directly construct the partitioning and explicitly define
+the schema of the partition keys. For example:
+
+.. code-block:: cpp
+
+ auto part = std::make_shared<ds::HivePartitioning>(arrow::schema({
+ arrow::field("year", arrow::int16()),
+ arrow::field("month", arrow::int8()),
+ arrow::field("day", arrow::int32())
+ }));
+
+Arrow supports another partitioning scheme, "directory partitioning", where the
+segments in the file path represent the values of the partition keys without
+including the name (the field names are implicit in the segment's index). For
+example, given field names "year", "month", and "day", one path might be
+"/2019/11/15".
+
+Since the names are not included in the file paths, these must be specified
+when constructing a directory partitioning:
+
+.. code-block:: cpp
+
+ auto part = ds::DirectoryPartitioning::MakeFactory({"year", "month", "day"});
+
+Directory partitioning also supports providing a full schema rather than inferring
+types from file paths.
+
+Reading from other data sources
+-------------------------------
+
+Reading in-memory data
+~~~~~~~~~~~~~~~~~~~~~~
+
+If you already have data in memory that you'd like to use with the Datasets API
+(e.g. to filter/project data, or to write it out to a filesystem), you can wrap it
+in an :class:`arrow::dataset::InMemoryDataset`:
+
+.. code-block:: cpp
+
+ auto table = arrow::Table::FromRecordBatches(...);
+ auto dataset = std::make_shared<arrow::dataset::InMemoryDataset>(std::move(table));
+   // Scan the dataset, filter it, etc.
+ auto scanner_builder = dataset->NewScan();
+
+In the example, we used an InMemoryDataset to write our sample data to local
+disk, which was then used in the rest of the example:
+
+.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc
+ :language: cpp
+ :start-after: Reading and writing partitioned data
+ :end-before: Reading and writing partitioned data
+ :emphasize-lines: 24-28
+ :linenos:
+ :lineno-match:
+
+.. _cpp-dataset-cloud-storage:
+
+Reading from cloud storage
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In addition to local files, Arrow Datasets also support reading from cloud
+storage systems, such as Amazon S3, by passing a different filesystem.
+
+See the :ref:`filesystem <cpp-filesystems>` docs for more details on the available
+filesystems.
+
+.. _cpp-dataset-full-example:
+
+Full Example
+------------
+
+.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc
+ :language: cpp
+ :linenos:
diff --git a/src/arrow/docs/source/cpp/datatypes.rst b/src/arrow/docs/source/cpp/datatypes.rst
new file mode 100644
index 000000000..9149420a4
--- /dev/null
+++ b/src/arrow/docs/source/cpp/datatypes.rst
@@ -0,0 +1,68 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+Data Types
+==========
+
+.. seealso::
+ :doc:`Datatype API reference <api/datatype>`.
+
+Data types govern how physical data is interpreted. Their :ref:`specification
+<format_columnar>` allows binary interoperability between different Arrow
+implementations, including from different programming languages and runtimes
+(for example it is possible to access the same data, without copying, from
+both Python and Java using the :py:mod:`pyarrow.jvm` bridge module).
+
+Information about a data type in C++ can be represented in three ways:
+
+1. Using a :class:`arrow::DataType` instance (e.g. as a function argument)
+2. Using a :class:`arrow::DataType` concrete subclass (e.g. as a template
+ parameter)
+3. Using a :type:`arrow::Type::type` enum value (e.g. as the condition of
+ a switch statement)
+
+The first form (using a :class:`arrow::DataType` instance) is the most idiomatic
+and flexible. Runtime-parametric types can only be fully represented with
+a DataType instance. For example, a :class:`arrow::TimestampType` needs to be
+constructed at runtime with a :type:`arrow::TimeUnit::type` parameter; a
+:class:`arrow::Decimal128Type` with *scale* and *precision* parameters;
+a :class:`arrow::ListType` with a full child type (itself a
+:class:`arrow::DataType` instance).
+
+The two other forms can be used where performance is critical, in order to
+avoid paying the price of dynamic typing and polymorphism. However, some
+amount of runtime switching can still be required for parametric types.
+It is not possible to reify all possible types at compile time, since Arrow
+data types allow arbitrary nesting.
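+
+For example, code that needs to branch on the kind of type at runtime can
+switch on the enum form; this is only a minimal sketch and the handled cases
+are purely illustrative::
+
+   void Process(const arrow::DataType& type) {
+     switch (type.id()) {
+       case arrow::Type::INT16:
+         // Handle 16-bit integers...
+         break;
+       case arrow::Type::TIMESTAMP: {
+         // Parametric type: the time unit still has to be read from the
+         // concrete subclass at runtime.
+         const auto& ts_type = static_cast<const arrow::TimestampType&>(type);
+         // ... use ts_type.unit()
+         break;
+       }
+       default:
+         // Handle the remaining types...
+         break;
+     }
+   }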
+
+Creating data types
+-------------------
+
+To instantiate data types, it is recommended to call the provided
+:ref:`factory functions <api-type-factories>`::
+
+ std::shared_ptr<arrow::DataType> type;
+
+ // A 16-bit integer type
+ type = arrow::int16();
+ // A 64-bit timestamp type (with microsecond granularity)
+ type = arrow::timestamp(arrow::TimeUnit::MICRO);
+ // A list type of single-precision floating-point values
+ type = arrow::list(arrow::float32());
diff --git a/src/arrow/docs/source/cpp/examples/cmake_minimal_build.rst b/src/arrow/docs/source/cpp/examples/cmake_minimal_build.rst
new file mode 100644
index 000000000..f135de830
--- /dev/null
+++ b/src/arrow/docs/source/cpp/examples/cmake_minimal_build.rst
@@ -0,0 +1,28 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+Minimal build using CMake
+==========================
+
+The folder ``cpp/examples/minimal_build/`` located inside the source tree
+contains a Docker-based example of building and using Arrow from a
+third-party project, using CMake. The
+`README <https://github.com/apache/arrow/tree/master/cpp/examples/minimal_build/README.md>`_
+file in that folder has more information.
diff --git a/src/arrow/docs/source/cpp/examples/compute_and_write_example.rst b/src/arrow/docs/source/cpp/examples/compute_and_write_example.rst
new file mode 100644
index 000000000..096b97b83
--- /dev/null
+++ b/src/arrow/docs/source/cpp/examples/compute_and_write_example.rst
@@ -0,0 +1,28 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+Compute and Write CSV Example
+=============================
+
+The file ``cpp/examples/arrow/compute_and_write_csv_example.cc`` located inside
+the source tree contains an example that creates a table of two numerical
+columns, compares the magnitudes of the entries in the columns, and writes the
+column entries together with their comparisons out to a CSV file. The code in
+the example is documented.
diff --git a/src/arrow/docs/source/cpp/examples/dataset_documentation_example.rst b/src/arrow/docs/source/cpp/examples/dataset_documentation_example.rst
new file mode 100644
index 000000000..2bc993f24
--- /dev/null
+++ b/src/arrow/docs/source/cpp/examples/dataset_documentation_example.rst
@@ -0,0 +1,27 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+Arrow Datasets example
+=========================
+
+The file ``cpp/examples/arrow/dataset_documentation_example.cc``
+located inside the source tree contains an example of using Arrow
+Datasets to read, write, select, and filter data. :doc:`../dataset`
+has a full walkthrough of the example.
diff --git a/src/arrow/docs/source/cpp/examples/index.rst b/src/arrow/docs/source/cpp/examples/index.rst
new file mode 100644
index 000000000..bc5bd497c
--- /dev/null
+++ b/src/arrow/docs/source/cpp/examples/index.rst
@@ -0,0 +1,28 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Examples
+========
+
+.. toctree::
+ :maxdepth: 1
+
+ cmake_minimal_build
+ compute_and_write_example
+ dataset_documentation_example
+ row_columnar_conversion
+ std::tuple-like ranges to Arrow <tuple_range_conversion>
diff --git a/src/arrow/docs/source/cpp/examples/row_columnar_conversion.rst b/src/arrow/docs/source/cpp/examples/row_columnar_conversion.rst
new file mode 100644
index 000000000..3f45864c2
--- /dev/null
+++ b/src/arrow/docs/source/cpp/examples/row_columnar_conversion.rst
@@ -0,0 +1,27 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+Row to columnar conversion
+==========================
+
+The following example converts an array of structs to a :class:`arrow::Table`
+instance, and then converts it back to the original array of structs.
+
+.. literalinclude:: ../../../../cpp/examples/arrow/row_wise_conversion_example.cc
diff --git a/src/arrow/docs/source/cpp/examples/tuple_range_conversion.rst b/src/arrow/docs/source/cpp/examples/tuple_range_conversion.rst
new file mode 100644
index 000000000..64ba23782
--- /dev/null
+++ b/src/arrow/docs/source/cpp/examples/tuple_range_conversion.rst
@@ -0,0 +1,106 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+Conversion of ranges of ``std::tuple``-like objects to ``Table`` instances
+===========================================================================
+
+While the above example shows a fairly manual approach to row-to-columnar
+conversion, Arrow also provides some template logic to convert ranges of
+``std::tuple<..>``-like objects to tables.
+
+In the simplest case, you only need to provide the input data and the column
+names; the type conversion is then inferred at compile time.
+
+.. code::
+
+   std::vector<std::tuple<double, std::string>> rows = ..
+   // Column names for the resulting table, one per tuple element
+   // (these particular names are just an example).
+   std::vector<std::string> names = {"value", "name"};
+ std::shared_ptr<Table> table;
+
+ if (!arrow::stl::TableFromTupleRange(
+ arrow::default_memory_pool(),
+ rows, names, &table).ok()
+ ) {
+ // Error handling code should go here.
+ }
+
+In reverse, you can use ``TupleRangeFromTable`` to fill an already
+pre-allocated range with the data from a ``Table`` instance.
+
+.. code::
+
+ // An important aspect here is that the table columns need to be in the
+ // same order as the columns will later appear in the tuple. As the tuple
+ // is unnamed, matching is done on positions.
+ std::shared_ptr<Table> table = ..
+
+ // The range needs to be pre-allocated to the respective amount of rows.
+ // This allows us to pass in an arbitrary range object, not only
+ // `std::vector`.
+ std::vector<std::tuple<double, std::string>> rows(2);
+ if (!arrow::stl::TupleRangeFromTable(*table, &rows).ok()) {
+ // Error handling code should go here.
+ }
+
+Arrow itself already supports some C(++) data types for this conversion. If you
+want to support additional data types, you need to implement a specialization
+of ``arrow::stl::ConversionTraits<T>`` and the more general
+``arrow::CTypeTraits<T>``.
+
+
+.. code::
+
+ namespace arrow {
+
+ template<>
+ struct CTypeTraits<boost::posix_time::ptime> {
+ using ArrowType = ::arrow::TimestampType;
+
+ static std::shared_ptr<::arrow::DataType> type_singleton() {
+ return ::arrow::timestamp(::arrow::TimeUnit::MICRO);
+ }
+ };
+
+ }
+
+ namespace arrow { namespace stl {
+
+ template <>
+ struct ConversionTraits<boost::posix_time::ptime> : public CTypeTraits<boost::posix_time::ptime> {
+ constexpr static bool nullable = false;
+
+ // This is the specialization to load a scalar value into an Arrow builder.
+ static Status AppendRow(
+ typename TypeTraits<TimestampType>::BuilderType& builder,
+ boost::posix_time::ptime cell) {
+ boost::posix_time::ptime const epoch({1970, 1, 1}, {0, 0, 0, 0});
+ return builder.Append((cell - epoch).total_microseconds());
+ }
+
+ // Specify how we can fill the tuple from the values stored in the Arrow
+ // array.
+ static boost::posix_time::ptime GetEntry(
+ const TimestampArray& array, size_t j) {
+       // Reuse the same epoch as in AppendRow above; the stored values
+       // are microseconds since that epoch.
+       boost::posix_time::ptime const epoch({1970, 1, 1}, {0, 0, 0, 0});
+       return epoch + boost::posix_time::microseconds(array.Value(j));
+ }
+ };
+
+ }}
+
diff --git a/src/arrow/docs/source/cpp/flight.rst b/src/arrow/docs/source/cpp/flight.rst
new file mode 100644
index 000000000..c1d2e43b9
--- /dev/null
+++ b/src/arrow/docs/source/cpp/flight.rst
@@ -0,0 +1,119 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+================
+Arrow Flight RPC
+================
+
+Arrow Flight is an RPC framework for efficient transfer of Arrow data
+over the network. See :doc:`../format/Flight` for full details on
+the protocol, or :doc:`./api/flight` for API docs.
+
+Writing a Flight Service
+========================
+
+Servers are subclasses of :class:`arrow::flight::FlightServerBase`. To
+implement individual RPCs, override the RPC methods on this class.
+
+.. code-block:: cpp
+
+ class MyFlightServer : public FlightServerBase {
+ Status ListFlights(const ServerCallContext& context, const Criteria* criteria,
+ std::unique_ptr<FlightListing>* listings) override {
+ std::vector<FlightInfo> flights = ...;
+ *listings = std::unique_ptr<FlightListing>(new SimpleFlightListing(flights));
+ return Status::OK();
+ }
+ };
+
+Each RPC method always takes a
+:class:`arrow::flight::ServerCallContext` for common parameters and
+returns a :class:`arrow::Status` to indicate success or
+failure. Flight-specific error codes can be returned via
+:func:`arrow::flight::MakeFlightError`.
+
+RPC methods that return a value in addition to a status will use an
+out parameter, as shown above. Often, there are helper classes
+providing basic implementations of these out parameters. For instance,
+above, :class:`arrow::flight::SimpleFlightListing` uses a vector of
+:class:`arrow::flight::FlightInfo` objects as the result of a
+``ListFlights`` RPC.
+
+To start a server, create a :class:`arrow::flight::Location` to
+specify where to listen, and call
+:func:`arrow::flight::FlightServerBase::Init`. This will start the
+server, but won't block the rest of the program. Use
+:func:`arrow::flight::FlightServerBase::SetShutdownOnSignals` to
+enable stopping the server if an interrupt signal is received, then
+call :func:`arrow::flight::FlightServerBase::Serve` to block until the
+server stops.
+
+.. code-block:: cpp
+
+  // Create an instance of a server subclass (e.g. MyFlightServer above)
+  std::unique_ptr<arrow::flight::FlightServerBase> server(new MyFlightServer());
+ arrow::flight::Location location;
+ // Listen to all interfaces on a free port
+ ARROW_CHECK_OK(arrow::flight::Location::ForGrpcTcp("0.0.0.0", 0, &location));
+ arrow::flight::FlightServerOptions options(location);
+
+ // Start the server
+ ARROW_CHECK_OK(server->Init(options));
+ // Exit with a clean error code (0) on SIGTERM
+ ARROW_CHECK_OK(server->SetShutdownOnSignals({SIGTERM}));
+
+ std::cout << "Server listening on localhost:" << server->port() << std::endl;
+ ARROW_CHECK_OK(server->Serve());
+
+
+Enabling TLS and Authentication
+-------------------------------
+
+TLS can be enabled by providing a certificate and key pair to
+:func:`FlightServerBase::Init
+<arrow::flight::FlightServerBase::Init>`. Additionally, use
+:func:`Location::ForGrpcTls <arrow::flight::Location::ForGrpcTls>` to
+construct the :class:`arrow::flight::Location` to listen on.
+
+Similarly, authentication can be enabled by providing an
+implementation of :class:`ServerAuthHandler
+<arrow::flight::ServerAuthHandler>`. Authentication consists of two
+parts: on initial client connection, the server and client
+authentication implementations can perform any negotiation needed;
+then, on each RPC thereafter, the client provides a token. The server
+authentication handler validates the token and provides the identity
+of the client. This identity can be obtained from the
+:class:`arrow::flight::ServerCallContext`.
+
+Using the Flight Client
+=======================
+
+To connect to a Flight service, create an instance of
+:class:`arrow::flight::FlightClient` by calling :func:`Connect
+<arrow::flight::FlightClient::Connect>`. This takes a Location and
+returns the client through an out parameter. To authenticate, call
+:func:`Authenticate <arrow::flight::FlightClient::Authenticate>` with
+the desired client authentication implementation.
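+
+For example, a minimal sketch (assuming a server reachable on ``localhost``
+at ``port``, as in the server example above):
+
+.. code-block:: cpp
+
+  arrow::flight::Location location;
+  // "localhost" and `port` are placeholders for the actual server address
+  ARROW_CHECK_OK(
+      arrow::flight::Location::ForGrpcTcp("localhost", port, &location));
+
+  std::unique_ptr<arrow::flight::FlightClient> client;
+  ARROW_CHECK_OK(arrow::flight::FlightClient::Connect(location, &client));
+  // The client can now issue RPCs such as ListFlights or DoGet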
+
+Each RPC method returns :class:`arrow::Status` to indicate the
+success/failure of the request. Any other return values are specified
+through out parameters. They also take an optional :class:`options
+<arrow::flight::FlightCallOptions>` parameter that allows specifying a
+timeout for the call.
diff --git a/src/arrow/docs/source/cpp/getting_started.rst b/src/arrow/docs/source/cpp/getting_started.rst
new file mode 100644
index 000000000..36ea4803f
--- /dev/null
+++ b/src/arrow/docs/source/cpp/getting_started.rst
@@ -0,0 +1,41 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+User Guide
+==========
+
+.. toctree::
+
+ overview
+ conventions
+ build_system
+ memory
+ arrays
+ datatypes
+ tables
+ compute
+ streaming_execution
+ io
+ ipc
+ parquet
+ csv
+ json
+ dataset
+ flight
diff --git a/src/arrow/docs/source/cpp/index.rst b/src/arrow/docs/source/cpp/index.rst
new file mode 100644
index 000000000..b3f6e4c82
--- /dev/null
+++ b/src/arrow/docs/source/cpp/index.rst
@@ -0,0 +1,32 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+C++ Implementation
+==================
+
+.. toctree::
+ :maxdepth: 2
+
+ getting_started
+ Examples <examples/index>
+ api
+
+.. TODO add "topics" chapter
+.. - nested arrays
+.. - dictionary encoding
+
+.. TODO add "building" or "development" chapter
diff --git a/src/arrow/docs/source/cpp/io.rst b/src/arrow/docs/source/cpp/io.rst
new file mode 100644
index 000000000..6e1d261c0
--- /dev/null
+++ b/src/arrow/docs/source/cpp/io.rst
@@ -0,0 +1,87 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+.. cpp:namespace:: arrow::io
+
+==============================
+Input / output and filesystems
+==============================
+
+Arrow provides a range of C++ interfaces abstracting the concrete details
+of input / output operations. They operate on streams of untyped binary data.
+Those abstractions are used for various purposes such as reading CSV or
+Parquet data, transmitting IPC streams, and more.
+
+.. seealso::
+ :doc:`API reference for input/output facilities <api/io>`.
+
+Reading binary data
+===================
+
+Interfaces for reading binary data come in two flavours:
+
+* Sequential reading: the :class:`InputStream` interface provides
+ ``Read`` methods; it is recommended to ``Read`` to a ``Buffer`` as it
+ may in some cases avoid a memory copy.
+
+* Random access reading: the :class:`RandomAccessFile` interface
+ provides additional facilities for positioning and, most importantly,
+ the ``ReadAt`` methods which allow parallel reading from multiple threads.
+
+Concrete implementations are available for :class:`in-memory reads <BufferReader>`,
+:class:`unbuffered file reads <ReadableFile>`,
+:class:`memory-mapped file reads <MemoryMappedFile>`,
+:class:`buffered reads <BufferedInputStream>`
+and :class:`compressed reads <CompressedInputStream>`.
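+
+For example, a minimal sketch of random access reads from a local file
+(the file name is only illustrative)::
+
+   auto maybe_file = arrow::io::ReadableFile::Open("data.bin");
+   if (!maybe_file.ok()) {
+     // Handle the error opening the file...
+   }
+   std::shared_ptr<arrow::io::RandomAccessFile> file = *maybe_file;
+
+   // Read 64 bytes starting at offset 128 into a freshly allocated buffer
+   arrow::Result<std::shared_ptr<arrow::Buffer>> maybe_buffer = file->ReadAt(128, 64);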
+
+Writing binary data
+===================
+
+Writing binary data is mostly done through the :class:`OutputStream`
+interface.
+
+Concrete implementations are available for :class:`in-memory writes <BufferOutputStream>`,
+:class:`unbuffered file writes <FileOutputStream>`,
+:class:`memory-mapped file writes <MemoryMappedFile>`,
+:class:`buffered writes <BufferedOutputStream>`
+and :class:`compressed writes <CompressedOutputStream>`.
+
+.. cpp:namespace:: arrow::fs
+
+.. _cpp-filesystems:
+
+Filesystems
+===========
+
+The :class:`filesystem interface <FileSystem>` allows abstracted access over
+various data storage backends such as the local filesystem or an S3 bucket.
+It provides input and output streams as well as directory operations.
+
+The filesystem interface exposes a simplified view of the underlying data
+storage. Data paths are represented as *abstract paths*, which are
+``/``-separated, even on Windows, and shouldn't include special path
+components such as ``.`` and ``..``. Symbolic links, if supported by the
+underlying storage, are automatically dereferenced. Only basic
+:class:`metadata <FileInfo>` about file entries, such as the file size
+and modification time, is made available.
+
+Concrete implementations are available for
+:class:`local filesystem access <LocalFileSystem>`,
+:class:`HDFS <HadoopFileSystem>` and
+:class:`Amazon S3-compatible storage <S3FileSystem>`.
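+
+For example, a minimal sketch of listing a directory through the generic
+interface (the path is only illustrative)::
+
+   arrow::fs::LocalFileSystem fs;
+
+   arrow::fs::FileSelector selector;
+   selector.base_dir = "/tmp/dataset";
+
+   auto maybe_infos = fs.GetFileInfo(selector);
+   if (!maybe_infos.ok()) {
+     // Handle the error listing the directory...
+   }
+   for (const auto& info : *maybe_infos) {
+     // Inspect info.path(), info.size(), ...
+   }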
diff --git a/src/arrow/docs/source/cpp/ipc.rst b/src/arrow/docs/source/cpp/ipc.rst
new file mode 100644
index 000000000..ce4175bca
--- /dev/null
+++ b/src/arrow/docs/source/cpp/ipc.rst
@@ -0,0 +1,75 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+.. cpp:namespace:: arrow::ipc
+
+========================================
+Reading and writing the Arrow IPC format
+========================================
+
+.. seealso::
+ :ref:`Arrow IPC format specification <format-ipc>`.
+
+ :doc:`API reference for IPC readers and writers <api/ipc>`.
+
+Arrow C++ provides readers and writers for the Arrow IPC format which wrap
+lower level input/output, handled through the :doc:`IO interfaces <io>`.
+For reading, there is also an event-driven API that enables feeding
+arbitrary data into the IPC decoding layer asynchronously.
+
+Reading IPC streams and files
+=============================
+
+Synchronous reading
+-------------------
+
+In most cases, it is convenient to use the :class:`RecordBatchStreamReader`
+or :class:`RecordBatchFileReader` class, depending on which variant of the IPC
+format you want to read. The former requires a :class:`~arrow::io::InputStream`
+source, while the latter requires a :class:`~arrow::io::RandomAccessFile`.
+
+Reading Arrow IPC data is inherently zero-copy if the source allows it.
+For example, a :class:`~arrow::io::BufferReader` or :class:`~arrow::io::MemoryMappedFile`
+can typically be zero-copy. Exceptions are when the data must be transformed
+on the fly, e.g. when buffer compression has been enabled on the IPC stream
+or file.
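+
+For example, a minimal sketch of reading a stream (assuming ``input`` is an
+:class:`~arrow::io::InputStream` positioned at the start of IPC stream data)::
+
+   auto maybe_reader = arrow::ipc::RecordBatchStreamReader::Open(input);
+   if (!maybe_reader.ok()) {
+     // Handle the error opening the IPC stream...
+   }
+   std::shared_ptr<arrow::RecordBatchReader> reader = *maybe_reader;
+
+   std::shared_ptr<arrow::RecordBatch> batch;
+   while (reader->ReadNext(&batch).ok() && batch != nullptr) {
+     // Process each record batch...
+   }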
+
+Event-driven reading
+--------------------
+
+When it is necessary to process the IPC format without blocking (for example
+to integrate Arrow with an event loop), or if data is coming from an unusual
+source, use the event-driven :class:`StreamDecoder`. You will need to define
+a subclass of :class:`Listener` and implement the virtual methods for the
+desired events (for example, implement :func:`Listener::OnRecordBatchDecoded`
+to be notified of each incoming :class:`RecordBatch`).
+
+Writing IPC streams and files
+=============================
+
+Use one of the factory functions, :func:`MakeStreamWriter` or
+:func:`MakeFileWriter`, to obtain a :class:`RecordBatchWriter` instance for
+the given IPC format variant.
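+
+For example, a minimal sketch of writing a stream (assuming ``sink`` is an
+:class:`~arrow::io::OutputStream`, and ``batch`` a record batch matching
+``schema``)::
+
+   auto maybe_writer = arrow::ipc::MakeStreamWriter(sink, schema);
+   if (!maybe_writer.ok()) {
+     // Handle the error creating the IPC writer...
+   }
+   std::shared_ptr<arrow::ipc::RecordBatchWriter> writer = *maybe_writer;
+
+   ARROW_CHECK_OK(writer->WriteRecordBatch(*batch));
+   ARROW_CHECK_OK(writer->Close());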
+
+Configuring
+===========
+
+Various aspects of reading and writing the IPC format can be configured
+using the :class:`IpcReadOptions` and :class:`IpcWriteOptions` classes,
+respectively.
diff --git a/src/arrow/docs/source/cpp/json.rst b/src/arrow/docs/source/cpp/json.rst
new file mode 100644
index 000000000..cdb742e6c
--- /dev/null
+++ b/src/arrow/docs/source/cpp/json.rst
@@ -0,0 +1,128 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+.. cpp:namespace:: arrow::json
+
+==================
+Reading JSON files
+==================
+
+Arrow allows reading line-separated JSON files as Arrow tables. Each
+independent JSON object in the input file is converted to a row in
+the target Arrow table.
+
+.. seealso::
+ :ref:`JSON reader API reference <cpp-api-json>`.
+
+Basic usage
+===========
+
+A JSON file is read from a :class:`~arrow::io::InputStream`.
+
+.. code-block:: cpp
+
+ #include "arrow/json/api.h"
+
+ {
+ // ...
+ arrow::Status st;
+      arrow::MemoryPool* pool = arrow::default_memory_pool();
+ std::shared_ptr<arrow::io::InputStream> input = ...;
+
+ auto read_options = arrow::json::ReadOptions::Defaults();
+ auto parse_options = arrow::json::ParseOptions::Defaults();
+
+ // Instantiate TableReader from input stream and options
+ std::shared_ptr<arrow::json::TableReader> reader;
+ st = arrow::json::TableReader::Make(pool, input, read_options,
+ parse_options, &reader);
+ if (!st.ok()) {
+ // Handle TableReader instantiation error...
+ }
+
+ std::shared_ptr<arrow::Table> table;
+ // Read table from JSON file
+ st = reader->Read(&table);
+ if (!st.ok()) {
+ // Handle JSON read error
+ // (for example a JSON syntax error or failed type conversion)
+ }
+ }
+
+Data types
+==========
+
+Since JSON values are typed, the possible Arrow data types on output
+depend on the input value types. Top-level JSON values should always be
+objects. The fields of top-level objects are taken to represent columns
+in the Arrow data. For each name/value pair in a JSON object, there are
+two possible modes of deciding the output data type:
+
+* if the name is in :class:`ParseOptions::explicit_schema`,
+ conversion of the JSON value to the corresponding Arrow data type is
+ attempted;
+
+* otherwise, the Arrow data type is determined via type inference on
+ the JSON value, trying out a number of Arrow data types in order.
+
+The following tables show the possible combinations for each of those
+two modes.
+
+.. table:: Explicit conversions from JSON to Arrow
+ :align: center
+
+ +-----------------+----------------------------------------------------+
+ | JSON value type | Allowed Arrow data types |
+ +=================+====================================================+
+ | Null | Any (including Null) |
+ +-----------------+----------------------------------------------------+
+ | Number | All Integer types, Float32, Float64, |
+ | | Date32, Date64, Time32, Time64 |
+ +-----------------+----------------------------------------------------+
+ | Boolean | Boolean |
+ +-----------------+----------------------------------------------------+
+ | String | Binary, LargeBinary, String, LargeString, |
+ | | Timestamp |
+ +-----------------+----------------------------------------------------+
+ | Array | List |
+ +-----------------+----------------------------------------------------+
+ | Object (nested) | Struct |
+ +-----------------+----------------------------------------------------+
+
+.. table:: Implicit type inference from JSON to Arrow
+ :align: center
+
+ +-----------------+----------------------------------------------------+
+ | JSON value type | Inferred Arrow data types (in order) |
+ +=================+====================================================+
+ | Null | Null, any other |
+ +-----------------+----------------------------------------------------+
+ | Number | Int64, Float64 |
+ | | |
+ +-----------------+----------------------------------------------------+
+ | Boolean | Boolean |
+ +-----------------+----------------------------------------------------+
+ | String | Timestamp (with seconds unit), String |
+ | | |
+ +-----------------+----------------------------------------------------+
+ | Array | List |
+ +-----------------+----------------------------------------------------+
+ | Object (nested) | Struct |
+ +-----------------+----------------------------------------------------+
diff --git a/src/arrow/docs/source/cpp/memory.rst b/src/arrow/docs/source/cpp/memory.rst
new file mode 100644
index 000000000..ff8ffb044
--- /dev/null
+++ b/src/arrow/docs/source/cpp/memory.rst
@@ -0,0 +1,203 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+.. _cpp_memory_management:
+
+=================
+Memory Management
+=================
+
+.. seealso::
+ :doc:`Memory management API reference <api/memory>`
+
+Buffers
+=======
+
+To avoid passing around raw data pointers with varying and non-obvious
+lifetime rules, Arrow provides a generic abstraction called :class:`arrow::Buffer`.
+A Buffer encapsulates a pointer and data size, and generally also ties its
+lifetime to that of an underlying provider (in other words, a Buffer should
+*always* point to valid memory until its destruction). Buffers are untyped:
+they simply denote a physical memory area regardless of its intended meaning
+or interpretation.
+
+Buffers may be allocated by Arrow itself, or by third-party routines.
+For example, it is possible to pass the data of a Python bytestring as an
+Arrow buffer, keeping the Python object alive as necessary.
+
+In addition, buffers come in various flavours: mutable or not, resizable or
+not. Generally, you will hold a mutable buffer when building up a piece
+of data, then it will be frozen as an immutable container such as an
+:doc:`array <arrays>`.
+
+.. note::
+ Some buffers may point to non-CPU memory, such as GPU-backed memory
+ provided by a CUDA context. If you're writing a GPU-aware application,
+ you will need to be careful not to interpret a GPU memory pointer as
+ a CPU-reachable pointer, or vice-versa.
+
+Accessing Buffer Memory
+-----------------------
+
+Buffers provide fast access to the underlying memory using the
+:func:`~arrow::Buffer::size` and :func:`~arrow::Buffer::data` accessors
+(or :func:`~arrow::Buffer::mutable_data` for writable access to a mutable
+buffer).
+
+Slicing
+-------
+
+It is possible to make zero-copy slices of buffers, to obtain a buffer
+referring to some contiguous subset of the underlying data. This is done
+by calling the :func:`arrow::SliceBuffer` and :func:`arrow::SliceMutableBuffer`
+functions.
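+
+For example, a minimal sketch (assuming ``buffer`` is an existing
+:class:`arrow::Buffer`)::
+
+   // Zero-copy view of 16 bytes of `buffer`, starting at offset 8
+   std::shared_ptr<arrow::Buffer> slice = arrow::SliceBuffer(buffer, 8, 16);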
+
+Allocating a Buffer
+-------------------
+
+You can allocate a buffer yourself by calling one of the
+:func:`arrow::AllocateBuffer` or :func:`arrow::AllocateResizableBuffer`
+overloads::
+
+ arrow::Result<std::unique_ptr<Buffer>> maybe_buffer = arrow::AllocateBuffer(4096);
+ if (!maybe_buffer.ok()) {
+ // ... handle allocation error
+ }
+
+ std::shared_ptr<arrow::Buffer> buffer = *std::move(maybe_buffer);
+ uint8_t* buffer_data = buffer->mutable_data();
+ memcpy(buffer_data, "hello world", 11);
+
+Allocating a buffer this way ensures it is 64-byte aligned and padded
+as recommended by the :doc:`Arrow memory specification <../format/Layout>`.
+
+Building a Buffer
+-----------------
+
+You can also allocate *and* build a Buffer incrementally, using the
+:class:`arrow::BufferBuilder` API::
+
+ BufferBuilder builder;
+ builder.Resize(11); // reserve enough space for 11 bytes
+ builder.Append("hello ", 6);
+ builder.Append("world", 5);
+
+ auto maybe_buffer = builder.Finish();
+ if (!maybe_buffer.ok()) {
+ // ... handle buffer allocation error
+ }
+ std::shared_ptr<arrow::Buffer> buffer = *maybe_buffer;
+
+If a Buffer is meant to contain values of a given fixed-width type (for
+example the 32-bit offsets of a List array), it can be more convenient to
+use the template :class:`arrow::TypedBufferBuilder` API::
+
+ TypedBufferBuilder<int32_t> builder;
+ builder.Reserve(2); // reserve enough space for two int32_t values
+ builder.Append(0x12345678);
+   builder.Append(-0x76543210);
+
+ auto maybe_buffer = builder.Finish();
+ if (!maybe_buffer.ok()) {
+ // ... handle buffer allocation error
+ }
+ std::shared_ptr<arrow::Buffer> buffer = *maybe_buffer;
+
+Memory Pools
+============
+
+When allocating a Buffer using the Arrow C++ API, the buffer's underlying
+memory is allocated by a :class:`arrow::MemoryPool` instance. Usually this
+will be the process-wide *default memory pool*, but many Arrow APIs allow
+you to pass another MemoryPool instance for their internal allocations.
+
+Memory pools are used for large long-lived data such as array buffers.
+Other data, such as small C++ objects and temporary workspaces, usually
+goes through the regular C++ allocators.
+
+Default Memory Pool
+-------------------
+
+The default memory pool depends on how Arrow C++ was compiled:
+
+- if enabled at compile time, a `jemalloc <http://jemalloc.net/>`_ heap;
+- otherwise, if enabled at compile time, a
+ `mimalloc <https://github.com/microsoft/mimalloc>`_ heap;
+- otherwise, the C library ``malloc`` heap.
+
+Overriding the Default Memory Pool
+----------------------------------
+
+One can override the above selection algorithm by setting the
+``ARROW_DEFAULT_MEMORY_POOL`` environment variable to one of the following
+values: ``jemalloc``, ``mimalloc`` or ``system``. This variable is inspected
+once when Arrow C++ is loaded in memory (for example when the Arrow C++ DLL
+is loaded).
+
+STL Integration
+---------------
+
+If you wish to use an Arrow memory pool to allocate the data of STL containers,
+you can do so using the :class:`arrow::stl::allocator` wrapper.
+
+Conversely, you can also use an STL allocator to allocate Arrow memory,
+using the :class:`arrow::stl::STLMemoryPool` class. However, this may be less
+performant, as STL allocators don't provide a resizing operation.
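+
+For example, a minimal sketch of a vector whose storage is allocated from the
+default Arrow memory pool::
+
+   arrow::stl::allocator<int64_t> alloc(arrow::default_memory_pool());
+   std::vector<int64_t, arrow::stl::allocator<int64_t>> values(alloc);
+   values.push_back(42);  // allocated through the Arrow memory pool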
+
+Devices
+=======
+
+Many Arrow applications only access host (CPU) memory. However, in some cases
+it is desirable to handle on-device memory (such as on-board memory on a GPU)
+as well as host memory.
+
+Arrow represents the CPU and other devices using the
+:class:`arrow::Device` abstraction. The associated class
+:class:`arrow::MemoryManager` specifies how to allocate memory on a given
+device. Each device has a default memory manager, but additional instances
+may be constructed (for example, wrapping a custom :class:`arrow::MemoryPool`
+on the CPU).
+
+Device-Agnostic Programming
+---------------------------
+
+If you receive a Buffer from third-party code, you can query whether it is
+CPU-readable by calling its :func:`~arrow::Buffer::is_cpu` method.
+
+You can also view the Buffer on a given device, in a generic way, by calling
+:func:`arrow::Buffer::View` or :func:`arrow::Buffer::ViewOrCopy`. This will
+be a no-operation if the source and destination devices are identical.
+Otherwise, a device-dependent mechanism will attempt to construct a memory
+address for the destination device that gives access to the buffer contents.
+Actual device-to-device transfer may happen lazily, when reading the buffer
+contents.
+
+Similarly, if you want to do I/O on a buffer without assuming a CPU-readable
+buffer, you can call :func:`arrow::Buffer::GetReader` and
+:func:`arrow::Buffer::GetWriter`.
+
+For example, to get an on-CPU view or copy of an arbitrary buffer, you can
+simply do::
+
+ std::shared_ptr<arrow::Buffer> arbitrary_buffer = ... ;
+ std::shared_ptr<arrow::Buffer> cpu_buffer = arrow::Buffer::ViewOrCopy(
+ arbitrary_buffer, arrow::default_cpu_memory_manager());
diff --git a/src/arrow/docs/source/cpp/overview.rst b/src/arrow/docs/source/cpp/overview.rst
new file mode 100644
index 000000000..ccebdba45
--- /dev/null
+++ b/src/arrow/docs/source/cpp/overview.rst
@@ -0,0 +1,97 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+High-Level Overview
+===================
+
+The Arrow C++ library consists of several parts, each of which serves
+a specific purpose.
+
+The physical layer
+------------------
+
+**Memory management** abstractions provide a uniform API over memory that
+may be allocated through various means, such as heap allocation, the memory
+mapping of a file or a static memory area. In particular, the **buffer**
+abstraction represents a contiguous area of physical data.
+
+The one-dimensional layer
+-------------------------
+
+**Data types** govern the *logical* interpretation of *physical* data.
+Many operations in Arrow are parameterized, at compile time or at runtime,
+by a data type.
+
+**Arrays** assemble one or several buffers with a data type, allowing them
+to be viewed as a logically contiguous sequence of values (possibly nested).
+
+**Chunked arrays** are a generalization of arrays, combining several arrays
+of the same type into a longer logical sequence of values.
+
+The two-dimensional layer
+-------------------------
+
+**Schemas** describe a logical collection of several pieces of data,
+each with a distinct name and type, and optional metadata.
+
+**Tables** are collections of chunked arrays conforming to a schema. They
+are the most capable dataset-providing abstraction in Arrow.
+
+**Record batches** are collections of contiguous arrays, described
+by a schema. They allow incremental construction or serialization of tables.
+
+The compute layer
+-----------------
+
+**Datums** are flexible dataset references, able to hold for example an array or table
+reference.
+
+**Kernels** are specialized computation functions running in a loop over a
+given set of datums representing input and output parameters to the functions.
+
+The IO layer
+------------
+
+**Streams** allow untyped sequential or seekable access over external data
+of various kinds (for example compressed or memory-mapped).
+
+The Inter-Process Communication (IPC) layer
+-------------------------------------------
+
+A **messaging format** allows interchange of Arrow data between processes, using
+as few copies as possible.
+
+The file formats layer
+----------------------
+
+Reading and writing Arrow data from/to various file formats is possible, for
+example **Parquet**, **CSV**, **Orc** or the Arrow-specific **Feather** format.
+
+The devices layer
+-----------------
+
+Basic **CUDA** integration is provided, allowing Arrow data backed by
+GPU-allocated memory to be described.
+
+The filesystem layer
+--------------------
+
+A filesystem abstraction allows reading and writing data from different storage
+backends, such as the local filesystem or an S3 bucket.
diff --git a/src/arrow/docs/source/cpp/parquet.rst b/src/arrow/docs/source/cpp/parquet.rst
new file mode 100644
index 000000000..88ea4e5b6
--- /dev/null
+++ b/src/arrow/docs/source/cpp/parquet.rst
@@ -0,0 +1,432 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+.. cpp:namespace:: parquet
+
+=================================
+Reading and writing Parquet files
+=================================
+
+.. seealso::
+ :ref:`Parquet reader and writer API reference <cpp-api-parquet>`.
+
+The `Parquet format <https://parquet.apache.org/documentation/latest/>`__
+is a space-efficient columnar storage format for complex data. The Parquet
+C++ implementation is part of the Apache Arrow project and benefits
+from tight integration with the Arrow C++ classes and facilities.
+
+Supported Parquet features
+==========================
+
+The Parquet format has many features, and Parquet C++ supports a subset of them.
+
+Page types
+----------
+
++-------------------+---------+
+| Page type | Notes |
++===================+=========+
+| DATA_PAGE | |
++-------------------+---------+
+| DATA_PAGE_V2 | |
++-------------------+---------+
+| DICTIONARY_PAGE | |
++-------------------+---------+
+
+*Unsupported page type:* INDEX_PAGE. When reading a Parquet file, pages of
+this type are ignored.
+
+Compression
+-----------
+
++-------------------+---------+
+| Compression codec | Notes |
++===================+=========+
+| SNAPPY | |
++-------------------+---------+
+| GZIP | |
++-------------------+---------+
+| BROTLI | |
++-------------------+---------+
+| LZ4 | \(1) |
++-------------------+---------+
+| ZSTD | |
++-------------------+---------+
+
+* \(1) On the read side, Parquet C++ is able to decompress both the regular
+ LZ4 block format and the ad-hoc Hadoop LZ4 format used by the
+ `reference Parquet implementation <https://github.com/apache/parquet-mr>`__.
+ On the write side, Parquet C++ always generates the ad-hoc Hadoop LZ4 format.
+
+*Unsupported compression codec:* LZO.
+
+Encodings
+---------
+
++--------------------------+---------+
+| Encoding | Notes |
++==========================+=========+
+| PLAIN | |
++--------------------------+---------+
+| PLAIN_DICTIONARY | |
++--------------------------+---------+
+| BIT_PACKED | |
++--------------------------+---------+
+| RLE | \(1) |
++--------------------------+---------+
+| RLE_DICTIONARY | \(2) |
++--------------------------+---------+
+| BYTE_STREAM_SPLIT | |
++--------------------------+---------+
+
+* \(1) Only supported for encoding definition and repetition levels, not values.
+
+* \(2) On the write path, RLE_DICTIONARY is only enabled if Parquet format version
+ 2.4 or greater is selected in :func:`WriterProperties::version`.
+
+*Unsupported encodings:* DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY,
+DELTA_BYTE_ARRAY.
+
+Types
+-----
+
+Physical types
+~~~~~~~~~~~~~~
+
++--------------------------+-------------------------+------------+
+| Physical type | Mapped Arrow type | Notes |
++==========================+=========================+============+
+| BOOLEAN | Boolean | |
++--------------------------+-------------------------+------------+
+| INT32 | Int32 / other | \(1) |
++--------------------------+-------------------------+------------+
+| INT64 | Int64 / other | \(1) |
++--------------------------+-------------------------+------------+
+| INT96 | Timestamp (nanoseconds) | \(2) |
++--------------------------+-------------------------+------------+
+| FLOAT | Float32 | |
++--------------------------+-------------------------+------------+
+| DOUBLE | Float64 | |
++--------------------------+-------------------------+------------+
+| BYTE_ARRAY | Binary / other | \(1) \(3) |
++--------------------------+-------------------------+------------+
+| FIXED_LENGTH_BYTE_ARRAY | FixedSizeBinary / other | \(1) |
++--------------------------+-------------------------+------------+
+
+* \(1) Can be mapped to other Arrow types, depending on the logical type
+ (see below).
+
+* \(2) On the write side, :func:`ArrowWriterProperties::support_deprecated_int96_timestamps`
+ must be enabled.
+
+* \(3) On the write side, an Arrow LargeBinary can also be mapped to BYTE_ARRAY.
+
+Logical types
+~~~~~~~~~~~~~
+
+Specific logical types can override the default Arrow type mapping for a given
+physical type.
+
++-------------------+-----------------------------+----------------------------+---------+
+| Logical type | Physical type | Mapped Arrow type | Notes |
++===================+=============================+============================+=========+
+| NULL | Any | Null | \(1) |
++-------------------+-----------------------------+----------------------------+---------+
+| INT | INT32 | Int8 / UInt8 / Int16 / | |
+| | | UInt16 / Int32 / UInt32 | |
++-------------------+-----------------------------+----------------------------+---------+
+| INT | INT64 | Int64 / UInt64 | |
++-------------------+-----------------------------+----------------------------+---------+
+| DECIMAL | INT32 / INT64 / BYTE_ARRAY | Decimal128 / Decimal256 | \(2) |
+| | / FIXED_LENGTH_BYTE_ARRAY | | |
++-------------------+-----------------------------+----------------------------+---------+
+| DATE | INT32 | Date32 | \(3) |
++-------------------+-----------------------------+----------------------------+---------+
+| TIME | INT32 | Time32 (milliseconds) | |
++-------------------+-----------------------------+----------------------------+---------+
+| TIME | INT64 | Time64 (micro- or | |
+| | | nanoseconds) | |
++-------------------+-----------------------------+----------------------------+---------+
+| TIMESTAMP | INT64 | Timestamp (milli-, micro- | |
+| | | or nanoseconds) | |
++-------------------+-----------------------------+----------------------------+---------+
+| STRING | BYTE_ARRAY | Utf8 | \(4) |
++-------------------+-----------------------------+----------------------------+---------+
+| LIST | Any | List | \(5) |
++-------------------+-----------------------------+----------------------------+---------+
+| MAP | Any | Map | \(6) |
++-------------------+-----------------------------+----------------------------+---------+
+
+* \(1) On the write side, the Parquet physical type INT32 is generated.
+
+* \(2) On the write side, a FIXED_LENGTH_BYTE_ARRAY is always emitted.
+
+* \(3) On the write side, an Arrow Date64 is also mapped to a Parquet DATE INT32.
+
+* \(4) On the write side, an Arrow LargeUtf8 is also mapped to a Parquet STRING.
+
+* \(5) On the write side, an Arrow LargeList or FixedSizedList is also mapped to
+ a Parquet LIST.
+
+* \(6) On the read side, a key with multiple values does not get deduplicated,
+ in contradiction with the
+ `Parquet specification <https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps>`__.
+
+*Unsupported logical types:* JSON, BSON, UUID. If such a type is encountered
+when reading a Parquet file, the default physical type mapping is used (for
+example, a Parquet JSON column may be read as Arrow Binary or FixedSizeBinary).
+
+Converted types
+~~~~~~~~~~~~~~~
+
+While converted types are deprecated in the Parquet format (they are superseded
+by logical types), they are recognized and emitted by the Parquet C++
+implementation so as to maximize compatibility with other Parquet
+implementations.
+
+Special cases
+~~~~~~~~~~~~~
+
+An Arrow Extension type is written out as its storage type. It can still
+be recreated at read time using Parquet metadata (see "Roundtripping Arrow
+types" below).
+
+An Arrow Dictionary type is written out as its value type. It can still
+be recreated at read time using Parquet metadata (see "Roundtripping Arrow
+types" below).
+
+Roundtripping Arrow types
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+While there is no bijection between Arrow types and Parquet types, it is
+possible to serialize the Arrow schema as part of the Parquet file metadata.
+This is enabled using :func:`ArrowWriterProperties::store_schema`.
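+
+A minimal sketch of enabling this when writing (assuming ``table``, ``pool``
+and ``outfile`` as in the writing examples below; the chunk size is only
+illustrative):
+
+.. code-block:: cpp
+
+   std::shared_ptr<parquet::ArrowWriterProperties> arrow_props =
+       parquet::ArrowWriterProperties::Builder().store_schema()->build();
+
+   PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(
+       table, pool, outfile, /*chunk_size=*/65536,
+       parquet::default_writer_properties(), arrow_props));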
+
+On the read path, the serialized schema will be automatically recognized
+and will recreate the original Arrow data, converting the Parquet data as
+required (for example, a LargeList will be recreated from the Parquet LIST
+type).
+
+As an example, when serializing an Arrow LargeList to Parquet:
+
+* The data is written out as a Parquet LIST
+
+* When read back, the Parquet LIST data is decoded as an Arrow LargeList if
+ :func:`ArrowWriterProperties::store_schema` was enabled when writing the file;
+ otherwise, it is decoded as an Arrow List.
+
+Serialization details
+"""""""""""""""""""""
+
+The Arrow schema is serialized as a :ref:`Arrow IPC <format-ipc>` schema message,
+then base64-encoded and stored under the ``ARROW:schema`` metadata key in
+the Parquet file metadata.
+
+Limitations
+~~~~~~~~~~~
+
+Writing or reading back FixedSizedList data with null entries is not supported.
+
+Encryption
+----------
+
+Parquet C++ implements all features specified in the
+`encryption specification <https://github.com/apache/parquet-format/blob/master/Encryption.md>`__,
+except for encryption of column index and bloom filter modules.
+
+More specifically, Parquet C++ supports:
+
+* AES_GCM_V1 and AES_GCM_CTR_V1 encryption algorithms.
+* AAD suffix for Footer, ColumnMetaData, Data Page, Dictionary Page,
+ Data PageHeader, Dictionary PageHeader module types. Other module types
+ (ColumnIndex, OffsetIndex, BloomFilter Header, BloomFilter Bitset) are not
+ supported.
+* EncryptionWithFooterKey and EncryptionWithColumnKey modes.
+* Encrypted Footer and Plaintext Footer modes.
+
+
+Reading Parquet files
+=====================
+
+The :class:`arrow::FileReader` class reads data for an entire
+file or row group into an :class:`::arrow::Table`.
+
+The :class:`StreamReader` and :class:`StreamWriter` classes allow data to be
+read and written using a C++ input/output streams approach, field by field
+and row by row. This approach is offered for ease of use and type-safety. It
+is of course also useful when data must be streamed as files are read and
+written incrementally.
+
+Please note that the performance of the :class:`StreamReader` and
+:class:`StreamWriter` classes will not be as good as that of the
+:class:`arrow::FileReader` and :func:`arrow::WriteTable` APIs, due to the
+type checking and the fact that column values are processed one at a time.
+
+FileReader
+----------
+
+The Parquet :class:`arrow::FileReader` requires a
+:class:`::arrow::io::RandomAccessFile` instance representing the input
+file.
+
+.. code-block:: cpp
+
+   #include "parquet/arrow/reader.h"
+
+ {
+ // ...
+ arrow::Status st;
+      arrow::MemoryPool* pool = arrow::default_memory_pool();
+ std::shared_ptr<arrow::io::RandomAccessFile> input = ...;
+
+ // Open Parquet file reader
+ std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
+ st = parquet::arrow::OpenFile(input, pool, &arrow_reader);
+ if (!st.ok()) {
+ // Handle error instantiating file reader...
+ }
+
+ // Read entire file as a single Arrow table
+ std::shared_ptr<arrow::Table> table;
+ st = arrow_reader->ReadTable(&table);
+ if (!st.ok()) {
+ // Handle error reading Parquet data...
+ }
+ }
+
+Finer-grained options are available through the
+:class:`arrow::FileReaderBuilder` helper class.
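+
+For example, a minimal sketch (reusing the ``input`` file and memory ``pool``
+from the example above):
+
+.. code-block:: cpp
+
+   parquet::arrow::FileReaderBuilder builder;
+   PARQUET_THROW_NOT_OK(builder.Open(input));
+
+   std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
+   PARQUET_THROW_NOT_OK(builder.memory_pool(pool)
+                            ->properties(parquet::default_arrow_reader_properties())
+                            ->Build(&arrow_reader));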
+
+.. TODO write section about performance and memory efficiency
+
+StreamReader
+------------
+
+The :class:`StreamReader` allows for Parquet files to be read using
+standard C++ input operators, which ensures type-safety.
+
+Please note that types must match the schema exactly, i.e. if the
+schema field is an unsigned 16-bit integer then you must supply a
+``uint16_t`` value.
+
+Exceptions are used to signal errors. A :class:`ParquetException` is
+thrown in the following circumstances:
+
+* Attempt to read field by supplying the incorrect type.
+
+* Attempt to read beyond end of row.
+
+* Attempt to read beyond end of file.
+
+.. code-block:: cpp
+
+ #include "arrow/io/file.h"
+ #include "parquet/stream_reader.h"
+
+ {
+ std::shared_ptr<arrow::io::ReadableFile> infile;
+
+ PARQUET_ASSIGN_OR_THROW(
+ infile,
+ arrow::io::ReadableFile::Open("test.parquet"));
+
+      parquet::StreamReader stream{parquet::ParquetFileReader::Open(infile)};
+
+      std::string article;
+      float price;
+      uint32_t quantity;
+
+      while ( !stream.eof() )
+      {
+         stream >> article >> price >> quantity >> parquet::EndRow;
+         // ...
+      }
+ }
+
+Writing Parquet files
+=====================
+
+WriteTable
+----------
+
+The :func:`arrow::WriteTable` function writes an entire
+:class:`::arrow::Table` to an output file.
+
+.. code-block:: cpp
+
+ #include "parquet/arrow/writer.h"
+
+ {
+ std::shared_ptr<arrow::io::FileOutputStream> outfile;
+ PARQUET_ASSIGN_OR_THROW(
+ outfile,
+ arrow::io::FileOutputStream::Open("test.parquet"));
+
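+      // The last argument to WriteTable is the chunk size: the maximum
+      // number of rows written per Parquet row group (3 is unrealistically
+      // small and only used here for illustration).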
+ PARQUET_THROW_NOT_OK(
+ parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3));
+ }
+
+StreamWriter
+------------
+
+The :class:`StreamWriter` allows for Parquet files to be written using
+standard C++ output operators. This type-safe approach also ensures
+that rows are written without omitting fields and allows for new row
+groups to be created automatically (after a certain volume of data) or
+explicitly by using the :type:`EndRowGroup` stream modifier.
+
+Exceptions are used to signal errors. A :class:`ParquetException` is
+thrown in the following circumstances:
+
+* Attempt to write a field using an incorrect type.
+
+* Attempt to write too many fields in a row.
+
+* Attempt to skip a required field.
+
+.. code-block:: cpp
+
+ #include "arrow/io/file.h"
+ #include "parquet/stream_writer.h"
+
+ {
+ std::shared_ptr<arrow::io::FileOutputStream> outfile;
+
+ PARQUET_ASSIGN_OR_THROW(
+ outfile,
+ arrow::io::FileOutputStream::Open("test.parquet"));
+
+ parquet::WriterProperties::Builder builder;
+ std::shared_ptr<parquet::schema::GroupNode> schema;
+
+ // Set up builder with required compression type etc.
+ // Define schema.
+ // ...
+
+ parquet::StreamWriter os{
+ parquet::ParquetFileWriter::Open(outfile, schema, builder.build())};
+
+ // Loop over some data structure which provides the required
+ // fields to be written and write each row.
+ for (const auto& a : getArticles())
+ {
+ os << a.name() << a.price() << a.quantity() << parquet::EndRow;
+ }
+ }
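+
+The example above relies on automatic row group creation. As a sketch of
+explicit control (reusing the ``os`` writer from the example), a new row
+group can be started at any point with the :type:`EndRowGroup` modifier, and
+the automatic threshold can be adjusted with ``StreamWriter::SetMaxRowGroupSize``
+(the ``NeedNewRowGroup()`` helper below is a hypothetical, application-specific
+condition):
+
+.. code-block:: cpp
+
+   // Limit the amount of buffered data (in bytes) before a row group
+   // is closed automatically.
+   os.SetMaxRowGroupSize(64 * 1024 * 1024);
+
+   for (const auto& a : getArticles())
+   {
+      os << a.name() << a.price() << a.quantity() << parquet::EndRow;
+
+      // Hypothetical application-specific condition.
+      if (NeedNewRowGroup(a))
+      {
+         os << parquet::EndRowGroup;
+      }
+   }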
diff --git a/src/arrow/docs/source/cpp/simple_graph.svg b/src/arrow/docs/source/cpp/simple_graph.svg
new file mode 100644
index 000000000..d87507224
--- /dev/null
+++ b/src/arrow/docs/source/cpp/simple_graph.svg
@@ -0,0 +1,139 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<svg width="320pt" height="404pt"
+ viewBox="0.00 0.00 388.02 404.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 400)">
+<title>G</title>
+<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-400 384.0173,-400 384.0173,4 -4,4"/>
+<!-- scan lineitem -->
+<g id="node1" class="node">
+<title>scan lineitem</title>
+<ellipse fill="none" stroke="#000000" cx="62.2569" cy="-378" rx="62.0148" ry="18"/>
+<text text-anchor="middle" x="62.2569" y="-373.8" font-family="Times,serif" font-size="14.00" fill="#000000">scan lineitem</text>
+</g>
+<!-- filter -->
+<g id="node2" class="node">
+<title>filter</title>
+<ellipse fill="none" stroke="#000000" cx="86.2569" cy="-306" rx="29.6089" ry="18"/>
+<text text-anchor="middle" x="86.2569" y="-301.8" font-family="Times,serif" font-size="14.00" fill="#000000">filter</text>
+</g>
+<!-- scan lineitem&#45;&gt;filter -->
+<g id="edge1" class="edge">
+<title>scan lineitem&#45;&gt;filter</title>
+<path fill="none" stroke="#000000" d="M68.3132,-359.8314C70.9767,-351.8406 74.163,-342.2819 77.1065,-333.4514"/>
+<polygon fill="#000000" stroke="#000000" points="80.4439,-334.5071 80.2858,-323.9134 73.8031,-332.2934 80.4439,-334.5071"/>
+</g>
+<!-- join -->
+<g id="node3" class="node">
+<title>join</title>
+<ellipse fill="none" stroke="#000000" cx="184.2569" cy="-234" rx="27" ry="18"/>
+<text text-anchor="middle" x="184.2569" y="-229.8" font-family="Times,serif" font-size="14.00" fill="#000000">join</text>
+</g>
+<!-- filter&#45;&gt;join -->
+<g id="edge2" class="edge">
+<title>filter&#45;&gt;join</title>
+<path fill="none" stroke="#000000" d="M105.6186,-291.7751C120.5341,-280.8168 141.3184,-265.5467 157.7735,-253.4572"/>
+<polygon fill="#000000" stroke="#000000" points="159.9433,-256.2062 165.9299,-247.4648 155.7988,-250.565 159.9433,-256.2062"/>
+</g>
+<!-- join again -->
+<g id="node4" class="node">
+<title>join again</title>
+<ellipse fill="none" stroke="#000000" cx="231.2569" cy="-162" rx="49.2784" ry="18"/>
+<text text-anchor="middle" x="231.2569" y="-157.8" font-family="Times,serif" font-size="14.00" fill="#000000">join again</text>
+</g>
+<!-- join&#45;&gt;join again -->
+<g id="edge3" class="edge">
+<title>join&#45;&gt;join again</title>
+<path fill="none" stroke="#000000" d="M195.1578,-217.3008C200.8051,-208.6496 207.8305,-197.8873 214.1788,-188.1623"/>
+<polygon fill="#000000" stroke="#000000" points="217.224,-189.9002 219.7594,-179.6132 211.3623,-186.0738 217.224,-189.9002"/>
+</g>
+<!-- filter again -->
+<g id="node9" class="node">
+<title>filter again</title>
+<ellipse fill="none" stroke="#000000" cx="231.2569" cy="-90" rx="53.2645" ry="18"/>
+<text text-anchor="middle" x="231.2569" y="-85.8" font-family="Times,serif" font-size="14.00" fill="#000000">filter again</text>
+</g>
+<!-- join again&#45;&gt;filter again -->
+<g id="edge8" class="edge">
+<title>join again&#45;&gt;filter again</title>
+<path fill="none" stroke="#000000" d="M231.2569,-143.8314C231.2569,-136.131 231.2569,-126.9743 231.2569,-118.4166"/>
+<polygon fill="#000000" stroke="#000000" points="234.757,-118.4132 231.2569,-108.4133 227.757,-118.4133 234.757,-118.4132"/>
+</g>
+<!-- scan orders -->
+<g id="node5" class="node">
+<title>scan orders</title>
+<ellipse fill="none" stroke="#000000" cx="197.2569" cy="-378" rx="54.9752" ry="18"/>
+<text text-anchor="middle" x="197.2569" y="-373.8" font-family="Times,serif" font-size="14.00" fill="#000000">scan orders</text>
+</g>
+<!-- project -->
+<g id="node6" class="node">
+<title>project</title>
+<ellipse fill="none" stroke="#000000" cx="184.2569" cy="-306" rx="37.6986" ry="18"/>
+<text text-anchor="middle" x="184.2569" y="-301.8" font-family="Times,serif" font-size="14.00" fill="#000000">project</text>
+</g>
+<!-- scan orders&#45;&gt;project -->
+<g id="edge4" class="edge">
+<title>scan orders&#45;&gt;project</title>
+<path fill="none" stroke="#000000" d="M193.9765,-359.8314C192.5861,-352.131 190.9329,-342.9743 189.3877,-334.4166"/>
+<polygon fill="#000000" stroke="#000000" points="192.8028,-333.6322 187.5816,-324.4133 185.9142,-334.8761 192.8028,-333.6322"/>
+</g>
+<!-- project&#45;&gt;join -->
+<g id="edge5" class="edge">
+<title>project&#45;&gt;join</title>
+<path fill="none" stroke="#000000" d="M184.2569,-287.8314C184.2569,-280.131 184.2569,-270.9743 184.2569,-262.4166"/>
+<polygon fill="#000000" stroke="#000000" points="187.757,-262.4132 184.2569,-252.4133 180.757,-262.4133 187.757,-262.4132"/>
+</g>
+<!-- scan customers -->
+<g id="node7" class="node">
+<title>scan customers</title>
+<ellipse fill="none" stroke="#000000" cx="310.2569" cy="-306" rx="69.5216" ry="18"/>
+<text text-anchor="middle" x="310.2569" y="-301.8" font-family="Times,serif" font-size="14.00" fill="#000000">scan customers</text>
+</g>
+<!-- aggregate -->
+<g id="node8" class="node">
+<title>aggregate</title>
+<ellipse fill="none" stroke="#000000" cx="294.2569" cy="-234" rx="48.6346" ry="18"/>
+<text text-anchor="middle" x="294.2569" y="-229.8" font-family="Times,serif" font-size="14.00" fill="#000000">aggregate</text>
+</g>
+<!-- scan customers&#45;&gt;aggregate -->
+<g id="edge6" class="edge">
+<title>scan customers&#45;&gt;aggregate</title>
+<path fill="none" stroke="#000000" d="M306.2195,-287.8314C304.5083,-280.131 302.4735,-270.9743 300.5717,-262.4166"/>
+<polygon fill="#000000" stroke="#000000" points="303.9348,-261.4159 298.3488,-252.4133 297.1015,-262.9344 303.9348,-261.4159"/>
+</g>
+<!-- aggregate&#45;&gt;join again -->
+<g id="edge7" class="edge">
+<title>aggregate&#45;&gt;join again</title>
+<path fill="none" stroke="#000000" d="M279.0064,-216.5708C271.1906,-207.6385 261.5369,-196.6056 252.9595,-186.8029"/>
+<polygon fill="#000000" stroke="#000000" points="255.5861,-184.4897 246.367,-179.2687 250.3181,-189.0993 255.5861,-184.4897"/>
+</g>
+<!-- write to disk -->
+<g id="node10" class="node">
+<title>write to disk</title>
+<ellipse fill="none" stroke="#000000" cx="231.2569" cy="-18" rx="59.1276" ry="18"/>
+<text text-anchor="middle" x="231.2569" y="-13.8" font-family="Times,serif" font-size="14.00" fill="#000000">write to disk</text>
+</g>
+<!-- filter again&#45;&gt;write to disk -->
+<g id="edge9" class="edge">
+<title>filter again&#45;&gt;write to disk</title>
+<path fill="none" stroke="#000000" d="M231.2569,-71.8314C231.2569,-64.131 231.2569,-54.9743 231.2569,-46.4166"/>
+<polygon fill="#000000" stroke="#000000" points="234.757,-46.4132 231.2569,-36.4133 227.757,-46.4133 234.757,-46.4132"/>
+</g>
+</g>
+</svg>
diff --git a/src/arrow/docs/source/cpp/streaming_execution.rst b/src/arrow/docs/source/cpp/streaming_execution.rst
new file mode 100644
index 000000000..a3406265b
--- /dev/null
+++ b/src/arrow/docs/source/cpp/streaming_execution.rst
@@ -0,0 +1,307 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+.. cpp:namespace:: arrow::compute
+
+==========================
+Streaming execution engine
+==========================
+
+.. warning::
+
+ The streaming execution engine is experimental, and a stable API
+ is not yet guaranteed.
+
+Motivation
+----------
+
+For many complex computations, successive direct :ref:`invocation of
+compute functions <invoking-compute-functions>` is not feasible
+in either memory or computation time. Doing so causes all intermediate
+data to be fully materialized. To facilitate arbitrarily large inputs
+and more efficient resource usage, Arrow also provides a streaming query
+engine with which computations can be formulated and executed.
+
+.. image:: simple_graph.svg
+ :alt: An example graph of a streaming execution workflow.
+
+:class:`ExecNode` is provided to reify the graph of operations in a query.
+Batches of data (:struct:`ExecBatch`) flow along edges of the graph from
+node to node. Structuring the API around streams of batches allows the
+working set for each node to be tuned for optimal performance independent
+of any other nodes in the graph. Each :class:`ExecNode` processes batches
+as they are pushed to it along an edge of the graph by upstream nodes
+(its inputs), and pushes batches along an edge of the graph to downstream
+nodes (its outputs) as they are finalized.
+
+.. seealso::
+
+ `SHAIKHHA, A., DASHTI, M., & KOCH, C.
+ (2018). Push versus pull-based loop fusion in query engines.
+ Journal of Functional Programming, 28.
+ <https://doi.org/10.1017/s0956796818000102>`_
+
+Overview
+--------
+
+:class:`ExecNode`
+ Each node in the graph is an implementation of the :class:`ExecNode` interface.
+
+:class:`ExecPlan`
+ A set of :class:`ExecNode` is contained and (to an extent) coordinated by an
+ :class:`ExecPlan`.
+
+:class:`ExecFactoryRegistry`
+ Instances of :class:`ExecNode` are constructed by factory functions held
+ in a :class:`ExecFactoryRegistry`.
+
+:class:`ExecNodeOptions`
+  Heterogeneous parameters for factories of :class:`ExecNode` are bundled in an
+  :class:`ExecNodeOptions`.
+
+:struct:`Declaration`
+ ``dplyr``-inspired helper for efficient construction of an :class:`ExecPlan`.
+
+:struct:`ExecBatch`
+ A lightweight container for a single chunk of data in the Arrow format. In
+ contrast to :class:`RecordBatch`, :struct:`ExecBatch` is intended for use
+ exclusively in a streaming execution context (for example, it doesn't have a
+ corresponding Python binding). Furthermore columns which happen to have a
+ constant value may be represented by a :class:`Scalar` instead of an
+ :class:`Array`. In addition, :struct:`ExecBatch` may carry
+ execution-relevant properties including a guaranteed-true-filter
+ for :class:`Expression` simplification.
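+
+As a small sketch, an :struct:`ExecBatch` can be assembled directly from a
+vector of ``Datum`` and an explicit length (``some_array`` is assumed to be a
+``std::shared_ptr<arrow::Array>`` of length 8 built elsewhere)::
+
+  // Two columns: a regular array, and a constant column represented by a
+  // Scalar (here wrapped in a Datum built from a plain integer).
+  ExecBatch batch({Datum(some_array), Datum(int64_t(42))}, /*length=*/8);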
+
+
+An example :class:`ExecNode` implementation which simply passes all input batches
+through unchanged::
+
+ class PassthruNode : public ExecNode {
+ public:
+ // InputReceived is the main entry point for ExecNodes. It is invoked
+ // by an input of this node to push a batch here for processing.
+ void InputReceived(ExecNode* input, ExecBatch batch) override {
+ // Since this is a passthru node we simply push the batch to our
+ // only output here.
+ outputs_[0]->InputReceived(this, batch);
+ }
+
+ // ErrorReceived is called by an input of this node to report an error.
+ // ExecNodes should always forward errors to their outputs unless they
+ // are able to fully handle the error (this is rare).
+ void ErrorReceived(ExecNode* input, Status error) override {
+ outputs_[0]->ErrorReceived(this, error);
+ }
+
+ // InputFinished is used to signal how many batches will ultimately arrive.
+ // It may be called with any ordering relative to InputReceived/ErrorReceived.
+ void InputFinished(ExecNode* input, int total_batches) override {
+ outputs_[0]->InputFinished(this, total_batches);
+ }
+
+ // ExecNodes may request that their inputs throttle production of batches
+ // until they are ready for more, or stop production if no further batches
+ // are required. These signals should typically be forwarded to the inputs
+ // of the ExecNode.
+ void ResumeProducing(ExecNode* output) override { inputs_[0]->ResumeProducing(this); }
+ void PauseProducing(ExecNode* output) override { inputs_[0]->PauseProducing(this); }
+ void StopProducing(ExecNode* output) override { inputs_[0]->StopProducing(this); }
+
+ // An ExecNode has a single output schema to which all its batches conform.
+ using ExecNode::output_schema;
+
+ // ExecNodes carry basic introspection for debugging purposes
+ const char* kind_name() const override { return "PassthruNode"; }
+ using ExecNode::label;
+ using ExecNode::SetLabel;
+ using ExecNode::ToString;
+
+ // An ExecNode holds references to its inputs and outputs, so it is possible
+ // to walk the graph of execution if necessary.
+ using ExecNode::inputs;
+ using ExecNode::outputs;
+
+ // StartProducing() and StopProducing() are invoked by an ExecPlan to
+ // coordinate the graph-wide execution state. These do not need to be
+ // forwarded to inputs or outputs.
+ Status StartProducing() override { return Status::OK(); }
+ void StopProducing() override {}
+ Future<> finished() override { return inputs_[0]->finished(); }
+ };
+
+Note that each method which is associated with an edge of the graph must be invoked
+with an ``ExecNode*`` to identify the node which invoked it. For example, in an
+:class:`ExecNode` which implements ``JOIN`` this tagging might be used to differentiate
+between batches from the left or right inputs.
+``InputReceived``, ``ErrorReceived``, ``InputFinished`` may only be invoked by
+the inputs of a node, while ``ResumeProducing``, ``PauseProducing``, ``StopProducing``
+may only be invoked by outputs of a node.
+
+:class:`ExecPlan` contains the associated instances of :class:`ExecNode`
+and is used to start and stop execution of all nodes and for querying/awaiting
+their completion::
+
+ // construct an ExecPlan first to hold your nodes
+ ARROW_ASSIGN_OR_RAISE(auto plan, ExecPlan::Make(default_exec_context()));
+
+ // ... add nodes to your ExecPlan
+
+ // start all nodes in the graph
+ ARROW_RETURN_NOT_OK(plan->StartProducing());
+
+ SetUserCancellationCallback([plan] {
+ // stop all nodes in the graph
+ plan->StopProducing();
+ });
+
+ // Complete will be marked finished when all nodes have run to completion
+ // or acknowledged a StopProducing() signal. The ExecPlan should be kept
+ // alive until this future is marked finished.
+ Future<> complete = plan->finished();
+
+
+Constructing ``ExecPlan`` objects
+---------------------------------
+
+.. warning::
+
+  The following will be superseded by construction from Compute IR; see ARROW-14074.
+
+None of the concrete implementations of :class:`ExecNode` are exposed
+in headers, so they can't be constructed directly outside the
+translation unit where they are defined. Instead, factories to
+create them are provided in an extensible registry. This structure
+provides a number of benefits:
+
+- It enforces consistent construction.
+- It decouples implementations from consumers of the interface
+  (for example: we have two classes for scalar and grouped aggregate, and the
+  single factory can choose which to construct by checking whether grouping
+  keys are provided).
+- It expedites integration with out-of-library extensions. For example,
+  "scan" nodes are implemented in the separate ``libarrow_dataset.so`` library.
+- Since the classes are not referenceable outside the translation unit in which
+  they are defined, compilers can optimize more aggressively.
+
+Factories of :class:`ExecNode` can be retrieved by name from the registry.
+The default registry is available through
+:func:`arrow::compute::default_exec_factory_registry()`
+and can be queried for the built-in factories::
+
+ // get the factory for "filter" nodes:
+ ARROW_ASSIGN_OR_RAISE(auto make_filter,
+ default_exec_factory_registry()->GetFactory("filter"));
+
+ // factories take three arguments:
+  ARROW_ASSIGN_OR_RAISE(ExecNode* filter_node, make_filter(
+ // the ExecPlan which should own this node
+ plan.get(),
+
+ // nodes which will send batches to this node (inputs)
+ {scan_node},
+
+ // parameters unique to "filter" nodes
+ FilterNodeOptions{filter_expression}));
+
+ // alternative shorthand:
+  ARROW_ASSIGN_OR_RAISE(filter_node, MakeExecNode("filter",
+    plan.get(), {scan_node}, FilterNodeOptions{filter_expression}));
+
+Factories can also be added to the default registry as long as they are
+convertible to ``std::function<Result<ExecNode*>(
+ExecPlan*, std::vector<ExecNode*>, const ExecNodeOptions&)>``.
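+
+As a sketch (assuming the default registry's ``AddFactory`` method and a
+placeholder node name), a custom factory could be registered like this::
+
+  // Register a factory under the name "my_passthru". The lambda matches the
+  // factory signature above; a real factory would construct an ExecNode owned
+  // by `plan` instead of returning an error.
+  ARROW_RETURN_NOT_OK(default_exec_factory_registry()->AddFactory(
+      "my_passthru",
+      [](ExecPlan* plan, std::vector<ExecNode*> inputs,
+         const ExecNodeOptions& options) -> Result<ExecNode*> {
+        return Status::NotImplemented("construct your ExecNode here");
+      }));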
+
+To build an :class:`ExecPlan` representing a simple pipeline which
+reads from a :class:`RecordBatchReader` then filters, projects, and
+writes to disk::
+
+ std::shared_ptr<RecordBatchReader> reader = GetStreamOfBatches();
+ ExecNode* source_node = *MakeExecNode("source", plan.get(), {},
+ SourceNodeOptions::FromReader(
+ reader,
+ GetCpuThreadPool()));
+
+ ExecNode* filter_node = *MakeExecNode("filter", plan.get(), {source_node},
+ FilterNodeOptions{
+ greater(field_ref("score"), literal(3))
+ });
+
+ ExecNode* project_node = *MakeExecNode("project", plan.get(), {filter_node},
+ ProjectNodeOptions{
+ {add(field_ref("score"), literal(1))},
+ {"score + 1"}
+ });
+
+ arrow::dataset::internal::Initialize();
+ MakeExecNode("write", plan.get(), {project_node},
+ WriteNodeOptions{/*base_dir=*/"/dat", /*...*/});
+
+:struct:`Declaration` is a `dplyr <https://dplyr.tidyverse.org>`_-inspired
+helper which further decreases the boilerplate associated with populating
+an :class:`ExecPlan` from C++::
+
+ arrow::dataset::internal::Initialize();
+
+ std::shared_ptr<RecordBatchReader> reader = GetStreamOfBatches();
+ ASSERT_OK(Declaration::Sequence(
+ {
+ {"source", SourceNodeOptions::FromReader(
+ reader,
+ GetCpuThreadPool())},
+ {"filter", FilterNodeOptions{
+ greater(field_ref("score"), literal(3))}},
+ {"project", ProjectNodeOptions{
+ {add(field_ref("score"), literal(1))},
+ {"score + 1"}}},
+ {"write", WriteNodeOptions{/*base_dir=*/"/dat", /*...*/}},
+ })
+ .AddToPlan(plan.get()));
+
+Note that a source node can wrap anything which resembles a stream of batches.
+For example, `PR#11032 <https://github.com/apache/arrow/pull/11032>`_ adds
+support for use of a `DuckDB <https://duckdb.org>`_ query as a source node.
+Similarly, a sink node can wrap anything which absorbs a stream of batches.
+In the example above, we're writing completed
+batches to disk. However, we can also collect them in memory into a :class:`Table`
+or forward them to a :class:`RecordBatchReader` as an out-of-graph stream.
+This flexibility allows an :class:`ExecPlan` to be used as streaming middleware
+between any endpoints which support Arrow formatted batches.
+
+An :class:`arrow::dataset::Dataset` can also be wrapped as a source node which
+pushes all the dataset's batches into an :class:`ExecPlan`. This factory is added
+to the default registry with the name ``"scan"`` by calling
+``arrow::dataset::internal::Initialize()``::
+
+ arrow::dataset::internal::Initialize();
+
+ std::shared_ptr<Dataset> dataset = GetDataset();
+
+ ASSERT_OK(Declaration::Sequence(
+ {
+ {"scan", ScanNodeOptions{dataset,
+ /* push down predicate, projection, ... */}},
+ {"filter", FilterNodeOptions{/* ... */}},
+ // ...
+ })
+ .AddToPlan(plan.get()));
+
+Datasets may be scanned multiple times; just make multiple scan
+nodes from that dataset. (Useful for a self-join, for example.)
+Note that producing two scan nodes like this will perform all
+reads and decodes twice.
diff --git a/src/arrow/docs/source/cpp/tables.rst b/src/arrow/docs/source/cpp/tables.rst
new file mode 100644
index 000000000..ea9198771
--- /dev/null
+++ b/src/arrow/docs/source/cpp/tables.rst
@@ -0,0 +1,83 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+============
+Tabular Data
+============
+
+.. seealso::
+ :doc:`Table and RecordBatch API reference <api/table>`.
+
+While arrays and chunked arrays represent a one-dimensional sequence of
+homogeneous values, data often comes in the form of two-dimensional sets of
+heterogeneous data (such as database tables, CSV files...). Arrow provides
+several abstractions to handle such data conveniently and efficiently.
+
+Fields
+======
+
+Fields are used to denote the particular columns of a table (and also
+the particular members of a nested data type such as :class:`arrow::StructType`).
+A field, i.e. an instance of :class:`arrow::Field`, holds together a data
+type, a field name and some optional metadata.
+
+The recommended way to create a field is to call the :func:`arrow::field`
+factory function.
+
+Schemas
+=======
+
+A schema describes the overall structure of a two-dimensional dataset such
+as a table. It holds a sequence of fields together with some optional
+schema-wide metadata (in addition to per-field metadata). The recommended
+way to create a schema is to call one of the :func:`arrow::schema` factory
+function overloads::
+
+ // Create a schema describing datasets with two columns:
+ // a int32 column "A" and a utf8-encoded string column "B"
+ std::shared_ptr<arrow::Field> field_a, field_b;
+ std::shared_ptr<arrow::Schema> schema;
+
+ field_a = arrow::field("A", arrow::int32());
+ field_b = arrow::field("B", arrow::utf8());
+ schema = arrow::schema({field_a, field_b});
+
+Tables
+======
+
+A :class:`arrow::Table` is a two-dimensional dataset with chunked arrays for
+columns, together with a schema providing field names. Also, each chunked
+column must have the same logical length in number of elements (although each
+column can be chunked in a different way).
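+
+As a minimal sketch, a table can be assembled from the schema defined above
+and one chunked array per column (the arrays ``a1``, ``a2`` and ``b1`` are
+assumed to have been built elsewhere, e.g. with array builders)::
+
+   // Column "A" consists of two chunks, column "B" of a single chunk;
+   // both columns must have the same total length.
+   std::shared_ptr<arrow::Array> a1, a2, b1;  // assumed built elsewhere
+
+   auto column_a = std::make_shared<arrow::ChunkedArray>(
+       arrow::ArrayVector{a1, a2});
+   auto column_b = std::make_shared<arrow::ChunkedArray>(
+       arrow::ArrayVector{b1});
+
+   std::shared_ptr<arrow::Table> table =
+       arrow::Table::Make(schema, {column_a, column_b});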
+
+Record Batches
+==============
+
+A :class:`arrow::RecordBatch` is a two-dimensional dataset of a number of
+contiguous arrays, each the same length. Like a table, a record batch also
+has a schema which must match its arrays' datatypes.
+
+Record batches are a convenient unit of work for various serialization
+and computation functions, possibly incremental.
+
+A table can be streamed as an arbitrary number of record batches using
+a :class:`arrow::TableBatchReader`. Conversely, a logical sequence of
+record batches can be assembled to form a table using one of the
+:func:`arrow::Table::FromRecordBatches` factory function overloads.
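+
+As a sketch, streaming the ``table`` from the previous example as record
+batches could look like this (it assumes the enclosing function returns
+``arrow::Status``)::
+
+   arrow::TableBatchReader batch_reader(*table);
+
+   std::shared_ptr<arrow::RecordBatch> batch;
+   while (true) {
+     ARROW_RETURN_NOT_OK(batch_reader.ReadNext(&batch));
+     if (batch == nullptr) {
+       break;  // End of stream
+     }
+     // Process `batch` ...
+   }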
diff --git a/src/arrow/docs/source/developers/archery.rst b/src/arrow/docs/source/developers/archery.rst
new file mode 100644
index 000000000..a587975d6
--- /dev/null
+++ b/src/arrow/docs/source/developers/archery.rst
@@ -0,0 +1,87 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _archery:
+
+Daily Development using Archery
+===============================
+
+To ease some of the daily development tasks, we developed a Python utility
+called Archery.
+
+Installation
+------------
+
+Archery requires Python 3.6 or later. It is recommended to install Archery in
+*editable* mode with the ``-e`` flag, so that the installation is automatically
+updated when you pull the Arrow repository. After cloning the Arrow repository,
+install Archery from the top-level directory with:
+
+.. code:: bash
+
+ pip install -e dev/archery[all]
+
+Usage
+-----
+
+You can inspect Archery usage by passing the ``--help`` flag:
+
+.. code:: bash
+
+ $ archery --help
+ Usage: archery [OPTIONS] COMMAND [ARGS]...
+
+ Apache Arrow developer utilities.
+
+ See sub-commands help with `archery <cmd> --help`.
+
+ Options:
+ --debug Increase logging with debugging output.
+ --pdb Invoke pdb on uncaught exception.
+ -q, --quiet Silence executed commands.
+ --help Show this message and exit.
+
+ Commands:
+ benchmark Arrow benchmarking.
+ build Initialize an Arrow C++ build
+ crossbow Schedule packaging tasks or nightly builds on CI services.
+ docker Interact with docker-compose based builds.
+ integration Execute protocol and Flight integration tests
+ linking Quick and dirty utilities for checking library linkage.
+ lint Check Arrow source tree for errors
+ numpydoc Lint python docstring with NumpyDoc
+ release Release releated commands.
+ trigger-bot
+
+Archery exposes independent subcommands, each of which provides dedicated
+help output, for example:
+
+.. code:: bash
+
+ $ archery docker --help
+ Usage: archery docker [OPTIONS] COMMAND [ARGS]...
+
+ Interact with docker-compose based builds.
+
+ Options:
+ --src <arrow_src> Specify Arrow source directory.
+ --help Show this message and exit.
+
+ Commands:
+ images List the available docker-compose images.
+ push Push the generated docker-compose image.
+ run Execute docker-compose builds.
diff --git a/src/arrow/docs/source/developers/benchmarks.rst b/src/arrow/docs/source/developers/benchmarks.rst
new file mode 100644
index 000000000..22eb5159d
--- /dev/null
+++ b/src/arrow/docs/source/developers/benchmarks.rst
@@ -0,0 +1,179 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _benchmarks:
+
+==========
+Benchmarks
+==========
+
+Setup
+=====
+
+First install the :ref:`Archery <archery>` utility to run the benchmark suite.
+
+Running the benchmark suite
+===========================
+
+The benchmark suites can be run with the ``benchmark run`` sub-command.
+
+.. code-block:: shell
+
+ # Run benchmarks in the current git workspace
+ archery benchmark run
+ # Storing the results in a file
+ archery benchmark run --output=run.json
+
+Sometimes it is necessary to pass custom CMake flags, e.g.:
+
+.. code-block:: shell
+
+   export CC=clang-8 CXX=clang++-8
+ archery benchmark run --cmake-extras="-DARROW_SIMD_LEVEL=NONE"
+
+Additionally, an existing CMake build directory may be specified:
+
+.. code-block:: shell
+
+ archery benchmark run $HOME/arrow/cpp/release-build
+
+Comparison
+==========
+
+One goal with benchmarking is to detect performance regressions. To this end,
+``archery`` implements a benchmark comparison facility via the ``benchmark
+diff`` sub-command.
+
+In the default invocation, it will compare the current source (known as the
+current workspace in git) with the local master branch:
+
+.. code-block:: shell
+
+ archery --quiet benchmark diff --benchmark-filter=FloatParsing
+ -----------------------------------------------------------------------------------
+ Non-regressions: (1)
+ -----------------------------------------------------------------------------------
+ benchmark baseline contender change % counters
+ FloatParsing<FloatType> 105.983M items/sec 105.983M items/sec 0.0 {}
+
+ ------------------------------------------------------------------------------------
+ Regressions: (1)
+ ------------------------------------------------------------------------------------
+ benchmark baseline contender change % counters
+ FloatParsing<DoubleType> 209.941M items/sec 109.941M items/sec -47.632 {}
+
+For more information, invoke the ``archery benchmark diff --help`` command for
+multiple examples of invocation.
+
+Iterating efficiently
+~~~~~~~~~~~~~~~~~~~~~
+
+Iterating with benchmark development can be a tedious process due to long
+build time and long run times. Multiple tricks can be used with
+``archery benchmark diff`` to reduce this overhead.
+
+First, the benchmark command supports comparing existing
+build directories. This can be paired with the ``--preserve`` flag to
+avoid rebuilding the sources from scratch.
+
+.. code-block:: shell
+
+   # The first invocation clones and checks out the sources in a temporary
+   # directory. The directory is preserved thanks to --preserve.
+ archery benchmark diff --preserve
+
+ # Modify C++ sources
+
+ # Re-run benchmark in the previously created build directory.
+ archery benchmark diff /tmp/arrow-bench*/{WORKSPACE,master}/build
+
+Second, a benchmark run's results can be saved in a JSON file. This avoids not
+only rebuilding the sources, but also re-executing the (sometimes) heavy
+benchmarks. This technique can be used as a poor man's cache.
+
+.. code-block:: shell
+
+ # Run the benchmarks on a given commit and save the result
+ archery benchmark run --output=run-head-1.json HEAD~1
+ # Compare the previous captured result with HEAD
+ archery benchmark diff HEAD run-head-1.json
+
+Third, the benchmark command supports filtering suites (``--suite-filter``)
+and benchmarks (``--benchmark-filter``); both options support regular
+expressions.
+
+.. code-block:: shell
+
+ # Taking over a previous run, but only filtering for benchmarks matching
+ # `Kernel` and suite matching `compute-aggregate`.
+ archery benchmark diff \
+ --suite-filter=compute-aggregate --benchmark-filter=Kernel \
+ /tmp/arrow-bench*/{WORKSPACE,master}/build
+
+Instead of rerunning benchmarks on comparison, a JSON file (generated by
+``archery benchmark run``) may be specified for the contender and/or the
+baseline.
+
+.. code-block:: shell
+
+ archery benchmark run --output=baseline.json $HOME/arrow/cpp/release-build
+ git checkout some-feature
+ archery benchmark run --output=contender.json $HOME/arrow/cpp/release-build
+ archery benchmark diff contender.json baseline.json
+
+Regression detection
+====================
+
+Writing a benchmark
+~~~~~~~~~~~~~~~~~~~
+
+1. By default, the benchmark command only keeps benchmarks whose name matches
+   the regular expression ``^Regression``, so that not all benchmarks are run.
+   Thus, if you want your benchmark to be checked for regressions
+   automatically, its name must match (see the sketch after this list).
+
+2. The benchmark command runs with the ``--benchmark_repetitions=K`` option
+   for statistical significance. Thus, a benchmark should not override the
+   repetitions in the (C++) benchmark's arguments definition.
+
+3. Due to #2, a benchmark should run sufficiently fast. Often, when the input
+   does not fit in the CPU caches (L2/L3), the benchmark will be memory bound
+   instead of CPU bound. In this case, the input can be downsized.
+
+4. By default, Google's benchmark library uses the cputime metric, which is
+   the sum of the runtime spent on the CPU by all threads of the process, as
+   opposed to realtime, which is the wall clock time (end_time - start_time).
+   In a single-threaded scenario, cputime is preferable since it is less
+   affected by context switching. In a multi-threaded scenario, cputime gives
+   incorrect results since it is inflated by the number of threads and can be
+   far off from realtime. Thus, if the benchmark is multi-threaded, it might
+   be better to use ``UseRealTime()``; see this
+   `example <https://github.com/apache/arrow/blob/a9582ea6ab2db055656809a2c579165fe6a811ba/cpp/src/arrow/io/memory-benchmark.cc#L223-L227>`_.
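+
+A minimal sketch of a benchmark following these conventions (the benchmark
+name and body are illustrative only, not taken from the Arrow tree):
+
+.. code-block:: cpp
+
+   #include <cstdint>
+   #include <vector>
+
+   #include "benchmark/benchmark.h"
+
+   // The "Regression" prefix ensures the benchmark is selected by default
+   // when `archery benchmark diff` looks for regressions (see point 1).
+   static void RegressionExampleSum(benchmark::State& state) {
+     // Keep the input small enough to stay CPU bound (see point 3).
+     std::vector<int64_t> values(state.range(0), 1);
+     for (auto _ : state) {
+       int64_t sum = 0;
+       for (int64_t v : values) sum += v;
+       benchmark::DoNotOptimize(sum);
+     }
+     state.SetItemsProcessed(state.iterations() * values.size());
+   }
+
+   // No Repetitions() override (see point 2) and no UseRealTime(): this
+   // benchmark is single-threaded, so the default cputime metric is fine.
+   BENCHMARK(RegressionExampleSum)->Arg(1 << 16);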
+
+Scripting
+=========
+
+``archery`` is written as a Python library with a command line frontend. The
+library can be imported to automate some tasks.
+
+Some invocations of the command line interface can be quite verbose due to
+build output. This can be controlled with the ``--quiet`` option, or the
+results can be written to a file with ``--output=<file>``, e.g.
+
+.. code-block:: shell
+
+ archery benchmark diff --benchmark-filter=Kernel --output=compare.json ...
diff --git a/src/arrow/docs/source/developers/computeir.rst b/src/arrow/docs/source/developers/computeir.rst
new file mode 100644
index 000000000..9ebe1d5af
--- /dev/null
+++ b/src/arrow/docs/source/developers/computeir.rst
@@ -0,0 +1,59 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+**********************************************
+Arrow Compute IR (Intermediate Representation)
+**********************************************
+
+In the same way that the Arrow format provides a powerful tool
+for communicating data, Compute IR is intended to provide a
+consistent format for representing analytical operations against
+that data. As an arrow-native expression of computation it includes
+information such as explicit types and schemas and arrow formatted
+literal data. It is also optimized for low runtime overhead in both
+serialization and deserialization.
+
+Built-in definitions are included to enable representation of
+relational algebraic operations: the contents of a "logical query plan".
+Compute IR also has first-class support for representing operations
+which are not members of a minimal relational algebra, including
+implementation and optimization details: the contents of a "physical
+query plan". This approach is taken in emulation of `MLIR`_ (Multi-Level
+Intermediate Representation), a system which has had strong successes in
+spaces of comparable complexity to representation of analytic operations.
+To borrow terms from that project, there are two mutations of interest:
+
+* Replacement of representations with semantically equivalent representations
+  which will yield better performance for consumers: an optimization pass.
+* Replacement of abstract or generic representations with more specific
+  and potentially consumer-specific representations: a lowering pass.
+  This modification corresponds to the translation of a logical plan
+  to a physical plan.
+
+Allowing representation of physical plans (and plans which are between
+logical and physical) in Compute IR enables systems to define incremental
+optimization and lowering passes which operate on and produce valid
+Compute IR. This in turn enables communication, manipulation, and inspection
+at every stage of lowering/optimization by the same tools
+used for logical-plan-equivalent-IR. This is especially useful for systems
+where such passes may depend on information only available on every node
+of a distributed consumer (for example statistics unique to that node's
+local data) or may not be universal to all backends in a heterogeneous
+consumer (for example, which optimizations nodes are capable of for
+non-equi joins).
+
+.. _MLIR: https://mlir.llvm.org
diff --git a/src/arrow/docs/source/developers/contributing.rst b/src/arrow/docs/source/developers/contributing.rst
new file mode 100644
index 000000000..9b81a6ff1
--- /dev/null
+++ b/src/arrow/docs/source/developers/contributing.rst
@@ -0,0 +1,362 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _contributing:
+
+****************************
+Contributing to Apache Arrow
+****************************
+
+Thanks for your interest in the Apache Arrow project. Arrow is a large project
+and may seem overwhelming when you're first getting involved.
+Contributing code is great, but that's probably not the first place to start.
+There are lots of ways to make valuable contributions to the project and
+community.
+
+This page provides some orientation for how to get involved. It also offers
+some recommendations on how to get best results when engaging with the
+community.
+
+Code of Conduct
+===============
+
+All participation in the Apache Arrow project is governed by the ASF's
+`Code of Conduct <https://www.apache.org/foundation/policies/conduct.html>`_.
+
+Join the mailing lists
+======================
+
+A good first step to getting involved in the Arrow project is to join the
+mailing lists and participate in discussions where you can.
+Projects in The Apache Software Foundation ("the ASF") use public, archived
+mailing lists to create a public record of each project's development
+activities and decision-making process.
+While lacking the immediacy of chat or other forms of communication,
+the mailing lists give participants the opportunity to slow down and be
+thoughtful in their responses, and they help developers who are spread across
+many timezones to participate more equally.
+
+See `the community page <https://arrow.apache.org/community/>`_ for links to
+subscribe to the mailing lists and to view archives.
+
+Report bugs and propose features
+================================
+
+Using the software and sharing your experience is a very helpful contribution
+itself. Those who actively develop Arrow need feedback from users on what
+works and what doesn't. Alerting us to unexpected behavior and missing features,
+even if you can't solve the problems yourself, helps us understand and prioritize
+work to improve the libraries.
+
+We use `JIRA <https://issues.apache.org/jira/projects/ARROW/issues>`_
+to manage our development "todo" list and to maintain changelogs for releases.
+In addition, the project's `Confluence site <https://cwiki.apache.org/confluence/display/ARROW>`_
+has some useful higher-level views of the JIRA issues.
+
+To create a JIRA issue, you'll need to have an account on the ASF JIRA, which
+you can `sign yourself up for <https://issues.apache.org/jira/secure/Signup!default.jspa>`_.
+The JIRA server hosts bugs and issues for multiple Apache projects. The JIRA
+project name for Arrow is "ARROW".
+
+You don't need any special permissions on JIRA to be able to create issues.
+Once you are more involved in the project and want to do more on JIRA, such as
+assign yourself an issue, you will need "Contributor" permissions on the
+Apache Arrow JIRA. To get this role, ask on the mailing list for a project
+maintainer's help.
+
+Tips for using JIRA
++++++++++++++++++++
+
+Before you create a new issue, we recommend you first
+`search <https://issues.apache.org/jira/issues/?jql=project%20%3D%20ARROW%20AND%20resolution%20%3D%20Unresolved>`_
+among existing Arrow issues.
+
+When reporting a new issue, follow these conventions to help make sure the
+right people see it:
+
+* Use the **Component** field to indicate the area of the project that your
+ issue pertains to (for example "Python" or "C++").
+* Also prefix the issue title with the component name in brackets, for example
+ ``[Python] issue name`` ; this helps when navigating lists of open issues,
+ and it also makes our changelogs more readable. Most prefixes are exactly the
+ same as the **Component** name, with the following exceptions:
+
+ * **Component:** Continuous Integration — **Summary prefix:** [CI]
+ * **Component:** Developer Tools — **Summary prefix:** [Dev]
+ * **Component:** Documentation — **Summary prefix:** [Docs]
+
+* If you're reporting something that used to work in a previous version
+ but doesn't work in the current release, you can add the "Affects version"
+ field. For feature requests and other proposals, "Affects version" isn't
+ appropriate.
+
+Project maintainers may later tweak formatting and labels to help improve their
+visibility. They may add a "Fix version" to indicate that they're considering
+it for inclusion in the next release, though adding that tag is not a
+commitment that it will be done in the next release.
+
+Tips for successful bug reports
++++++++++++++++++++++++++++++++
+
+No one likes having bugs in their software, and in an ideal world, all bugs
+would get fixed as soon as they were reported. However, time and attention are
+finite, especially in an open-source project where most contributors are
+participating in their spare time. All contributors in Apache projects are
+volunteers and act as individuals, even if they are contributing to the project
+as part of their job responsibilities.
+
+In order for your bug to get prompt
+attention, there are things you can do to make it easier for contributors to
+reproduce and fix it.
+When you're reporting a bug, please help us understand the issue by providing,
+to the best of your ability,
+
+* Clear, minimal steps to reproduce the issue, with as few non-Arrow
+ dependencies as possible. If there's a problem on reading a file, try to
+ provide as small of an example file as possible, or code to create one.
+ If your bug report says "it crashes trying to read my file, but I can't
+ share it with you," it's really hard for us to debug.
+* Any relevant operating system, language, and library version information
+* If it isn't obvious, clearly state the expected behavior and what actually
+ happened.
+
+If a developer can't reproduce the problem with a failing unit test, they won't
+be able to confirm that the issue has been identified, and they won't know when
+it has been fixed.
+Try to anticipate the questions you might be asked by someone working to
+understand the issue and provide those supporting details up front.
+
+Other resources:
+
+* `Mozilla's bug-reporting guidelines <https://developer.mozilla.org/en-US/docs/Mozilla/QA/Bug_writing_guidelines>`_
+* `Reprex do's and don'ts <https://reprex.tidyverse.org/articles/reprex-dos-and-donts.html>`_
+
+Improve documentation
+=====================
+
+A great way to contribute to the project is to improve documentation. If you
+found some docs to be incomplete or inaccurate, share your hard-earned knowledge
+with the rest of the community.
+
+Documentation improvements are also a great way to gain some experience with
+our submission and review process, discussed below, without requiring a lot
+of local development environment setup. In fact, many documentation-only changes
+can be made directly in the GitHub web interface by clicking the "edit" button.
+This will handle making a fork and a pull request for you.
+
+Contribute code
+===============
+
+Code contributions, or "patches", are delivered in the form of GitHub pull
+requests against the `github.com/apache/arrow
+<https://github.com/apache/arrow>`_ repository.
+
+Before starting
++++++++++++++++
+
+You'll first need to select a JIRA issue to work on. Perhaps you're working on
+one you reported yourself. Otherwise, if you're looking for something,
+`search <https://issues.apache.org/jira/issues/?jql=project%20%3D%20ARROW%20AND%20resolution%20%3D%20Unresolved>`_
+the open issues. Anything that's not in the "In Progress" state is fair game,
+even if it is "Assigned" to someone, particularly if it has not been
+recently updated. When in doubt, comment on the issue asking if the assignee
+minds if you try to put together a pull request; interpret no response to mean
+that you're free to proceed.
+
+Please do ask questions, either on the JIRA itself or on the dev mailing list,
+if you have doubts about where to begin or what approach to take.
+This is particularly a good idea if this is your first code contribution,
+so you can get some sense of what the core developers in this part of the
+project think a good solution looks like. For best results, ask specific,
+direct questions, such as:
+
+* Do you think $PROPOSED_APPROACH is the right one?
+* In which file(s) should I be looking to make changes?
+* Is there anything related in the codebase I can look at to learn?
+
+If you ask these questions and do not get an answer, it is OK to ask again.
+
+Pull request and review
++++++++++++++++++++++++
+
+To contribute a patch:
+
+* Submit the patch as a GitHub pull request against the master branch. For a
+ tutorial, see the GitHub guides on `forking a repo <https://help.github.com/en/articles/fork-a-repo>`_
+ and `sending a pull request <https://help.github.com/en/articles/creating-a-pull-request-from-a-fork>`_.
+ So that your pull request syncs with the JIRA issue, prefix your pull request
+ name with the JIRA issue id (ex:
+ `ARROW-767: [C++] Filesystem abstraction <https://github.com/apache/arrow/pull/4225>`_).
+* Give the pull request a clear, brief description: when the pull request is
+ merged, this will be retained in the extended commit message.
+* Make sure that your code passes the unit tests. You can find instructions how
+ to run the unit tests for each Arrow component in its respective README file.
+
+Core developers and others with a stake in the part of the project your change
+affects will review, request changes, and hopefully indicate their approval
+in the end. To make the review process smooth for everyone, try to
+
+* Break your work into small, single-purpose patches if possible. It’s much
+ harder to merge in a large change with a lot of disjoint features, and
+ particularly if you're new to the project, smaller changes are much easier
+ for maintainers to accept.
+* Add new unit tests for your code.
+* Follow the style guides for the part(s) of the project you're modifying.
+ Some languages (C++ and Python, for example) run a lint check in
+ continuous integration. For all languages, see their respective developer
+ documentation and READMEs for style guidance. In general, try to make it look
+ as if the codebase has a single author, and emulate any conventions you see,
+ whether or not they are officially documented or checked.
+
+When tests are passing and the pull request has been approved by the interested
+parties, a `committer <https://arrow.apache.org/committers/>`_
+will merge the pull request. This is done with a
+command-line utility that does a squash merge, so all of your commits will be
+registered as a single commit to the master branch; this simplifies the
+connection between JIRA issues and commits, makes it easier to bisect
+history to identify where changes were introduced, and helps us be able to
+cherry-pick individual patches onto a maintenance branch.
+
+A side effect of this way of
+merging is that your pull request will appear in the GitHub interface to have
+been "closed without merge". Do not be alarmed: if you look at the bottom, you
+will see a message that says ``@user closed this in $COMMIT``. In the commit
+message of that commit, the merge tool adds the pull request description, a
+link back to the pull request, and attribution to the contributor and any
+co-authors.
+
+Local git conventions
++++++++++++++++++++++
+
+If you are tracking the Arrow source repository locally, here are some tips
+for using ``git``.
+
+All Arrow contributors work off of their personal fork of ``apache/arrow``
+and submit pull requests "upstream". Once you've cloned your fork of Arrow,
+be sure to::
+
+ $ git remote add upstream https://github.com/apache/arrow
+
+to set the "upstream" repository.
+
+You are encouraged to develop on branches, rather than your own "master" branch,
+and it helps to keep your fork's master branch synced with ``upstream/master``.
+
+To start a new branch, pull the latest from upstream first::
+
+ $ git fetch upstream
+ $ git checkout master
+ $ git pull --ff-only upstream master
+ $ git checkout -b $BRANCH
+
+It does not matter what you call your branch. Some people like to use the JIRA
+number as the branch name, while others use descriptive names.
+
+Once you have a branch going, you should sync with ``upstream/master``
+regularly, as many commits are merged to master every day.
+It is recommended to use ``git rebase`` rather than ``git merge``.
+To sync your local copy of a branch, you may do the following::
+
+ $ git pull upstream $BRANCH --rebase
+
+This will rebase your local commits on top of the tip of ``upstream/$BRANCH``. In case
+there are conflicts, and your local commit history has multiple commits, you may
+simplify the conflict resolution process by squashing your local commits into a single
+commit. Preserving the commit history isn't as important because when your
+feature branch is merged upstream, a squash happens automatically. If you choose this
+route, you can abort the rebase with::
+
+ $ git rebase --abort
+
+Following which, the local commits can be squashed interactively by running::
+
+ $ git rebase --interactive ORIG_HEAD~n
+
+Where ``n`` is the number of commits you have in your local branch. After the squash,
+you can try the merge again, and this time conflict resolution should be relatively
+straightforward.
+
+If you set the following in your repo's ``.git/config``, the ``--rebase`` option can be
+omitted from the ``git pull`` command, as it is implied by default. ::
+
+ [pull]
+ rebase = true
+
+Once you have an updated local copy, you can push to your remote repo. Note, since your
+remote repo still holds the old history, you would need to do a force push. ::
+
+    $ git push --force origin $BRANCH
+
+*Note about force pushing to a branch that is being reviewed:* if you want reviewers to
+look at your updates, please ensure you comment on the PR on GitHub as simply force
+pushing does not trigger a notification in the GitHub user interface.
+
+Also, once you have a pull request up, be sure you pull from ``origin``
+before rebasing and force-pushing. Arrow maintainers can push commits directly
+to your branch, which they sometimes do to help move a pull request along.
+In addition, the GitHub PR "suggestion" feature can also add commits to
+your branch, so it is possible that your local copy of your branch is missing
+some additions.
+
+.. include:: experimental_repos.rst
+
+Guidance for specific features
+==============================
+
+From time to time the community has discussions on specific types of features
+and improvements that they expect to support. This section outlines decisions
+that have been made in this regard.
+
+Endianness
+++++++++++
+
+The Arrow format allows setting endianness. Due to the popularity of
+little-endian architectures, most implementations assume little-endian by
+default. There has been some effort to support big-endian platforms as well.
+Based on a `mailing-list discussion
+<https://mail-archives.apache.org/mod_mbox/arrow-dev/202009.mbox/%3cCAK7Z5T--HHhr9Dy43PYhD6m-XoU4qoGwQVLwZsG-kOxXjPTyZA@mail.gmail.com%3e>`__,
+the requirements for a new platform are:
+
+1. A robust (non-flaky, returning results in a reasonable time) Continuous
+ Integration setup.
+2. Benchmarks for performance critical parts of the code to demonstrate
+ no regression.
+
+Furthermore, for big-endian support, there are two levels that an
+implementation can support:
+
+1. Native endianness (all Arrow communication happens with processes of the
+ same endianness). This includes ancillary functionality such as reading
+ and writing various file formats, such as Parquet.
+2. Cross endian support (implementations will do byte reordering when
+ appropriate for :ref:`IPC <format-ipc>` and :ref:`Flight <flight-rpc>`
+ messages).
+
+The decision on what level to support is based on maintainers' preferences for
+complexity and technical risk. In general all implementations should be open
+to native endianness support (provided the CI and performance requirements
+are met). Cross endianness support is a question for individual maintainers.
+
+The current implementations aiming for cross endian support are:
+
+1. C++
+
+Implementations that do not intend to implement cross endian support:
+
+1. Java
+
+For other libraries, consensus should be gathered on the mailing-list before
+submitting PRs.
diff --git a/src/arrow/docs/source/developers/cpp/building.rst b/src/arrow/docs/source/developers/cpp/building.rst
new file mode 100644
index 000000000..6b18c7312
--- /dev/null
+++ b/src/arrow/docs/source/developers/cpp/building.rst
@@ -0,0 +1,510 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _building-arrow-cpp:
+
+==================
+Building Arrow C++
+==================
+
+System setup
+============
+
+Arrow uses CMake as a build configuration system. We recommend building
+out-of-source. If you are not familiar with this terminology:
+
+* **In-source build**: ``cmake`` is invoked directly from the ``cpp``
+ directory. This can be inflexible when you wish to maintain multiple build
+ environments (e.g. one for debug builds and another for release builds)
+* **Out-of-source build**: ``cmake`` is invoked from another directory,
+ creating an isolated build environment that does not interact with any other
+ build environment. For example, you could create ``cpp/build-debug`` and
+ invoke ``cmake $CMAKE_ARGS ..`` from this directory
+
+Building requires:
+
+* A C++11-enabled compiler. On Linux, gcc 4.8 and higher should be
+ sufficient. For Windows, at least Visual Studio 2017 is required.
+* CMake 3.5 or higher
+* On Linux and macOS, either ``make`` or ``ninja`` build utilities
+
+On Ubuntu/Debian you can install the requirements with:
+
+.. code-block:: shell
+
+ sudo apt-get install \
+ build-essential \
+ cmake
+
+On Alpine Linux:
+
+.. code-block:: shell
+
+ apk add autoconf \
+ bash \
+ cmake \
+ g++ \
+ gcc \
+ make
+
+On Fedora Linux:
+
+.. code-block:: shell
+
+ sudo dnf install \
+ cmake \
+ gcc \
+ gcc-c++ \
+ make
+
+On macOS, you can use `Homebrew <https://brew.sh/>`_:
+
+.. code-block:: shell
+
+ git clone https://github.com/apache/arrow.git
+ cd arrow
+ brew update && brew bundle --file=cpp/Brewfile
+
+With `vcpkg <https://github.com/Microsoft/vcpkg>`_:
+
+.. code-block:: shell
+
+ git clone https://github.com/apache/arrow.git
+ cd arrow
+ vcpkg install \
+ --x-manifest-root cpp \
+ --feature-flags=versions \
+ --clean-after-build
+
+On MSYS2:
+
+.. code-block:: shell
+
+ pacman --sync --refresh --noconfirm \
+ ccache \
+ git \
+ mingw-w64-${MSYSTEM_CARCH}-boost \
+ mingw-w64-${MSYSTEM_CARCH}-brotli \
+ mingw-w64-${MSYSTEM_CARCH}-cmake \
+ mingw-w64-${MSYSTEM_CARCH}-gcc \
+ mingw-w64-${MSYSTEM_CARCH}-gflags \
+ mingw-w64-${MSYSTEM_CARCH}-glog \
+ mingw-w64-${MSYSTEM_CARCH}-gtest \
+ mingw-w64-${MSYSTEM_CARCH}-lz4 \
+ mingw-w64-${MSYSTEM_CARCH}-protobuf \
+ mingw-w64-${MSYSTEM_CARCH}-python3-numpy \
+ mingw-w64-${MSYSTEM_CARCH}-rapidjson \
+ mingw-w64-${MSYSTEM_CARCH}-snappy \
+ mingw-w64-${MSYSTEM_CARCH}-thrift \
+ mingw-w64-${MSYSTEM_CARCH}-zlib \
+ mingw-w64-${MSYSTEM_CARCH}-zstd
+
+Building
+========
+
+The build system uses ``CMAKE_BUILD_TYPE=release`` by default, so if this
+argument is omitted then a release build will be produced.
+
+.. note::
+
+   You need additional options to build on Windows. See
+ :ref:`developers-cpp-windows` for details.
+
+Minimal release build:
+
+.. code-block:: shell
+
+ git clone https://github.com/apache/arrow.git
+ cd arrow/cpp
+ mkdir release
+ cd release
+ cmake ..
+ make
+
+Minimal debug build with unit tests:
+
+.. code-block:: shell
+
+ git clone https://github.com/apache/arrow.git
+ cd arrow
+ git submodule update --init --recursive
+ export ARROW_TEST_DATA=$PWD/testing/data
+ cd cpp
+ mkdir debug
+ cd debug
+ cmake -DCMAKE_BUILD_TYPE=Debug -DARROW_BUILD_TESTS=ON ..
+ make unittest
+
+The unit tests are not built by default. After building, one can also invoke
+the unit tests using the ``ctest`` tool provided by CMake (note that the
+``test`` target depends on ``python`` being available).
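+
+For example, assuming the debug build above has completed, the tests can be run
+from within the build directory with:
+
+.. code-block:: shell
+
+   ctest --output-on-failure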
+
+On some Linux distributions, running the test suite might require setting an
+explicit locale. If you see any locale-related errors, try setting the
+environment variable (which requires the `locales` package or equivalent):
+
+.. code-block:: shell
+
+ export LC_ALL="en_US.UTF-8"
+
+Faster builds with Ninja
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Many contributors use the `Ninja build system <https://ninja-build.org/>`_ to
+get faster builds. It especially speeds up incremental builds. To use
+``ninja``, pass ``-GNinja`` when calling ``cmake`` and then use the ``ninja``
+command instead of ``make``.
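+
+For example, the minimal release build above could be repeated with Ninja as
+follows (the build directory name is arbitrary):
+
+.. code-block:: shell
+
+   cd arrow/cpp
+   mkdir ninja-build
+   cd ninja-build
+   cmake -GNinja ..
+   ninja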
+
+Optional Components
+~~~~~~~~~~~~~~~~~~~
+
+By default, the C++ build system creates a fairly minimal build. We have
+several optional system components which you can opt into building by passing
+boolean flags to ``cmake``.
+
+* ``-DARROW_BUILD_UTILITIES=ON`` : Build Arrow commandline utilities
+* ``-DARROW_COMPUTE=ON``: Computational kernel functions and other support
+* ``-DARROW_CSV=ON``: CSV reader module
+* ``-DARROW_CUDA=ON``: CUDA integration for GPU development. Depends on NVIDIA
+ CUDA toolkit. The CUDA toolchain used to build the library can be customized
+ by using the ``$CUDA_HOME`` environment variable.
+* ``-DARROW_DATASET=ON``: Dataset API, implies the Filesystem API
+* ``-DARROW_FILESYSTEM=ON``: Filesystem API for accessing local and remote
+ filesystems
+* ``-DARROW_FLIGHT=ON``: Arrow Flight RPC system, which depends at least on
+ gRPC
+* ``-DARROW_GANDIVA=ON``: Gandiva expression compiler, depends on LLVM,
+ Protocol Buffers, and re2
+* ``-DARROW_GANDIVA_JAVA=ON``: Gandiva JNI bindings for Java
+* ``-DARROW_GCS=ON``: Build Arrow with GCS support (requires the GCloud SDK for C++)
+* ``-DARROW_HDFS=ON``: Arrow integration with libhdfs for accessing the Hadoop
+ Filesystem
+* ``-DARROW_HIVESERVER2=ON``: Client library for HiveServer2 database protocol
+* ``-DARROW_JEMALLOC=ON``: Build the Arrow jemalloc-based allocator, on by default
+* ``-DARROW_JSON=ON``: JSON reader module
+* ``-DARROW_MIMALLOC=ON``: Build the Arrow mimalloc-based allocator
+* ``-DARROW_ORC=ON``: Arrow integration with Apache ORC
+* ``-DARROW_PARQUET=ON``: Apache Parquet libraries and Arrow integration
+* ``-DARROW_PLASMA=ON``: Plasma Shared Memory Object Store
+* ``-DARROW_PLASMA_JAVA_CLIENT=ON``: Build Java client for Plasma
+* ``-DARROW_PYTHON=ON``: Arrow Python C++ integration library (required for
+ building pyarrow). This library must be built against the same Python version
+ for which you are building pyarrow. NumPy must also be installed. Enabling
+ this option also enables ``ARROW_COMPUTE``, ``ARROW_CSV``, ``ARROW_DATASET``,
+ ``ARROW_FILESYSTEM``, ``ARROW_HDFS``, and ``ARROW_JSON``.
+* ``-DARROW_S3=ON``: Support for Amazon S3-compatible filesystems
+* ``-DARROW_WITH_RE2=ON``: Build with support for regular expressions using the re2
+ library, on by default and used when ``ARROW_COMPUTE`` or ``ARROW_GANDIVA`` is ``ON``
+* ``-DARROW_WITH_UTF8PROC=ON``: Build with support for Unicode properties using
+ the utf8proc library, on by default and used when ``ARROW_COMPUTE`` or ``ARROW_GANDIVA``
+ is ``ON``
+* ``-DARROW_TENSORFLOW=ON``: Build Arrow with TensorFlow support enabled
+
+Compression options available in Arrow are:
+
+* ``-DARROW_WITH_BROTLI=ON``: Build support for Brotli compression
+* ``-DARROW_WITH_BZ2=ON``: Build support for BZ2 compression
+* ``-DARROW_WITH_LZ4=ON``: Build support for lz4 compression
+* ``-DARROW_WITH_SNAPPY=ON``: Build support for Snappy compression
+* ``-DARROW_WITH_ZLIB=ON``: Build support for zlib (gzip) compression
+* ``-DARROW_WITH_ZSTD=ON``: Build support for ZSTD compression
+
+Some features of the core Arrow shared library can be switched off for improved
+build times if they are not required for your application:
+
+* ``-DARROW_IPC=ON``: build the IPC extensions
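+
+As an illustration, a build enabling a few of the optional components listed
+above (the exact set depends on your use case) could be configured with:
+
+.. code-block:: shell
+
+   cmake .. -DARROW_COMPUTE=ON \
+         -DARROW_CSV=ON \
+         -DARROW_PARQUET=ON \
+         -DARROW_WITH_SNAPPY=ON
+   make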
+
+Optional Targets
+~~~~~~~~~~~~~~~~
+
+For development builds, you will often want to enable additional targets in
+order to exercise your changes, using the following ``cmake`` options.
+
+* ``-DARROW_BUILD_BENCHMARKS=ON``: Build executable benchmarks.
+* ``-DARROW_BUILD_EXAMPLES=ON``: Build examples of using the Arrow C++ API.
+* ``-DARROW_BUILD_INTEGRATION=ON``: Build additional executables that are
+ used to exercise protocol interoperability between the different Arrow
+ implementations.
+* ``-DARROW_BUILD_UTILITIES=ON``: Build executable utilities.
+* ``-DARROW_BUILD_TESTS=ON``: Build executable unit tests.
+* ``-DARROW_ENABLE_TIMING_TESTS=ON``: If building unit tests, enable those
+ unit tests that rely on wall-clock timing (this flag is disabled on CI
+ because it can make test results flaky).
+* ``-DARROW_FUZZING=ON``: Build fuzz targets and related executables.
+
+Optional Checks
+~~~~~~~~~~~~~~~
+
+The following special checks are available as well. They instrument the
+generated code in various ways so as to detect select classes of problems
+at runtime (for example when executing unit tests).
+
+* ``-DARROW_USE_ASAN=ON``: Enable Address Sanitizer to check for memory leaks,
+ buffer overflows or other kinds of memory management issues.
+* ``-DARROW_USE_TSAN=ON``: Enable Thread Sanitizer to check for races in
+ multi-threaded code.
+* ``-DARROW_USE_UBSAN=ON``: Enable Undefined Behavior Sanitizer to check for
+ situations which trigger C++ undefined behavior.
+
+Some of those options are mutually incompatible, so you may have to build
+several times with different options if you want to exercise all of them.
+
+CMake version requirements
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+While we support CMake 3.5 and higher, some features require a newer version of
+CMake:
+
+* Building the benchmarks requires 3.6 or higher
+* Building zstd from source requires 3.7 or higher
+* Building Gandiva JNI bindings requires 3.11 or higher
+
+LLVM and Clang Tools
+~~~~~~~~~~~~~~~~~~~~
+
+We are currently using LLVM 8 for library builds and for other developer tools
+such as code formatting with ``clang-format``. LLVM can be installed via most
+modern package managers (apt, yum, conda, Homebrew, vcpkg, chocolatey).
+
+.. _cpp-build-dependency-management:
+
+Build Dependency Management
+===========================
+
+The build system supports a number of third-party dependencies:
+
+ * ``AWSSDK``: for S3 support, requires system cURL and can use the
+ ``BUNDLED`` method described below
+ * ``benchmark``: Google benchmark, for testing
+ * ``Boost``: for cross-platform support
+ * ``Brotli``: for data compression
+ * ``BZip2``: for data compression
+ * ``c-ares``: a dependency of gRPC
+ * ``gflags``: for command line utilities (formerly Googleflags)
+ * ``GLOG``: for logging
+ * ``google_cloud_cpp_storage``: for Google Cloud Storage support, requires
+ system cURL and can use the ``BUNDLED`` method described below
+ * ``gRPC``: for remote procedure calls
+ * ``GTest``: Googletest, for testing
+ * ``LLVM``: a dependency of Gandiva
+ * ``Lz4``: for data compression
+ * ``ORC``: for Apache ORC format support
+ * ``re2``: for compute kernels and Gandiva, a dependency of gRPC
+ * ``Protobuf``: Google Protocol Buffers, for data serialization
+ * ``RapidJSON``: for data serialization
+ * ``Snappy``: for data compression
+ * ``Thrift``: Apache Thrift, for data serialization
+ * ``utf8proc``: for compute kernels
+ * ``ZLIB``: for data compression
+ * ``zstd``: for data compression
+
+The CMake option ``ARROW_DEPENDENCY_SOURCE`` is a global option that instructs
+the build system how to resolve each dependency. There are a few options:
+
+* ``AUTO``: Try to find the package in the system default locations and build
+  from source if not found
+* ``BUNDLED``: Build the dependency automatically from source
+* ``SYSTEM``: Find the dependency in system paths using CMake's built-in
+  ``find_package`` function, or using ``pkg-config`` for packages that do not
+  have this feature
+* ``CONDA``: Use ``$CONDA_PREFIX`` as an alternative ``SYSTEM`` path
+* ``VCPKG``: Find dependencies installed by vcpkg, and if not found, run
+  ``vcpkg install`` to install them
+* ``BREW``: Use Homebrew default paths as an alternative ``SYSTEM`` path
+
+The default method is ``AUTO`` unless you are developing within an active conda
+environment (detected by presence of the ``$CONDA_PREFIX`` environment
+variable), in which case it is ``CONDA``.
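+
+For example, to build every dependency from source regardless of what is
+installed on the system, you could configure with:
+
+.. code-block:: shell
+
+   cmake .. -DARROW_DEPENDENCY_SOURCE=BUNDLED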
+
+Individual Dependency Resolution
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+While ``-DARROW_DEPENDENCY_SOURCE=$SOURCE`` sets a global default for all
+packages, the resolution strategy can be overridden for individual packages by
+setting ``-D$PACKAGE_NAME_SOURCE=..``. For example, to build Protocol Buffers
+from source, set
+
+.. code-block:: shell
+
+ -DProtobuf_SOURCE=BUNDLED
+
+This variable is unfortunately case-sensitive; the name used for each package
+is listed above, but the most up-to-date listing can be found in
+`cpp/cmake_modules/ThirdpartyToolchain.cmake <https://github.com/apache/arrow/blob/master/cpp/cmake_modules/ThirdpartyToolchain.cmake>`_.
+
+Bundled Dependency Versions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When using the ``BUNDLED`` method to build a dependency from source, the
+version number from ``cpp/thirdparty/versions.txt`` is used. There is also a
+dependency source downloader script (see below), which can be used to set up
+offline builds.
+
+When using ``BUNDLED`` for dependency resolution (and if you use either the
+jemalloc or mimalloc allocators, which are recommended), statically linking the
+Arrow libraries in a third party project is more complex. See below for
+instructions about how to configure your build system in this case.
+
+Boost-related Options
+~~~~~~~~~~~~~~~~~~~~~
+
+We depend on some Boost C++ libraries for cross-platform support. In most cases,
+the Boost version available in your package manager may be new enough, and the
+build system will find it automatically. If you have Boost installed in a
+non-standard location, you can specify it by passing
+``-DBOOST_ROOT=$MY_BOOST_ROOT`` or setting the ``BOOST_ROOT`` environment
+variable.
+
+Offline Builds
+~~~~~~~~~~~~~~
+
+If you do not use the above variables to direct the Arrow build system to
+preinstalled dependencies, they will be built automatically by the Arrow build
+system. The source archive for each dependency will be downloaded via the
+internet, which can cause issues in environments with limited access to the
+internet.
+
+To enable offline builds, you can download the source artifacts yourself and
+use environment variables of the form ``ARROW_$LIBRARY_URL`` to direct the
+build system to read from a local file rather than accessing the internet.
+
+To make this easier for you, we have prepared a script
+``thirdparty/download_dependencies.sh`` which will download the correct version
+of each dependency to a directory of your choosing. It will print a list of
+bash-style environment variable statements at the end to use for your build
+script.
+
+.. code-block:: shell
+
+ # Download tarballs into $HOME/arrow-thirdparty
+ $ ./thirdparty/download_dependencies.sh $HOME/arrow-thirdparty
+
+You can then invoke CMake to create the build directory, and it will use the
+declared environment variables pointing to the downloaded archives instead of
+downloading them again for each new build directory.
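+
+As a sketch, after running the download script you would export the variables
+it printed and then configure as usual (the archive name below is only a
+placeholder for whatever the script reports):
+
+.. code-block:: shell
+
+   # One of the variables printed by the download script (illustrative value)
+   export ARROW_BOOST_URL=$HOME/arrow-thirdparty/<boost-archive>.tar.gz
+   mkdir offline-build
+   cd offline-build
+   cmake ..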
+
+Statically Linking
+~~~~~~~~~~~~~~~~~~
+
+When ``-DARROW_BUILD_STATIC=ON``, all build dependencies built as static
+libraries by the Arrow build system will be merged together to create a static
+library ``arrow_bundled_dependencies``. In UNIX-like environments (Linux, macOS,
+MinGW), this is called ``libarrow_bundled_dependencies.a`` and on Windows with
+Visual Studio ``arrow_bundled_dependencies.lib``. This "dependency bundle"
+library is installed in the same place as the other Arrow static libraries.
+
+If you are using CMake, the bundled dependencies will automatically be included
+when linking if you use the ``arrow_static`` CMake target. In other build
+systems, you may need to explicitly link to the dependency bundle. We created
+an `example CMake-based build configuration
+<https://github.com/apache/arrow/tree/master/cpp/examples/minimal_build>`_ to
+show you a working example.
+
+On Linux and macOS, if your application does not link to the ``pthread``
+library already, you must include ``-pthread`` in your linker setup. In CMake
+this can be accomplished with the ``Threads`` built-in package:
+
+.. code-block:: cmake
+
+ set(THREADS_PREFER_PTHREAD_FLAG ON)
+ find_package(Threads REQUIRED)
+ target_link_libraries(my_target PRIVATE Threads::Threads)
+
+Extra debugging help
+~~~~~~~~~~~~~~~~~~~~
+
+If you use the CMake option ``-DARROW_EXTRA_ERROR_CONTEXT=ON`` it will compile
+the libraries with extra debugging information on error checks inside the
+``RETURN_NOT_OK`` macro. In unit tests with ``ASSERT_OK``, this will yield error
+outputs like:
+
+.. code-block:: shell
+
+ ../src/arrow/ipc/ipc-read-write-test.cc:609: Failure
+ Failed
+ ../src/arrow/ipc/metadata-internal.cc:508 code: TypeToFlatbuffer(fbb, *field.type(), &children, &layout, &type_enum, dictionary_memo, &type_offset)
+ ../src/arrow/ipc/metadata-internal.cc:598 code: FieldToFlatbuffer(fbb, *schema.field(i), dictionary_memo, &offset)
+ ../src/arrow/ipc/metadata-internal.cc:651 code: SchemaToFlatbuffer(fbb, schema, dictionary_memo, &fb_schema)
+ ../src/arrow/ipc/writer.cc:697 code: WriteSchemaMessage(schema_, dictionary_memo_, &schema_fb)
+ ../src/arrow/ipc/writer.cc:730 code: WriteSchema()
+ ../src/arrow/ipc/writer.cc:755 code: schema_writer.Write(&dictionaries_)
+ ../src/arrow/ipc/writer.cc:778 code: CheckStarted()
+ ../src/arrow/ipc/ipc-read-write-test.cc:574 code: writer->WriteRecordBatch(batch)
+ NotImplemented: Unable to convert type: decimal(19, 4)
+
+Deprecations and API Changes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We use the compiler definition ``ARROW_NO_DEPRECATED_API`` to disable APIs that
+have been deprecated. It is a good practice to compile third party applications
+with this flag to proactively catch and account for API changes.
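+
+For example, a third-party project configured with CMake could inject the
+definition through its compiler flags (a sketch; adapt this to your own build
+system):
+
+.. code-block:: shell
+
+   cmake .. -DCMAKE_CXX_FLAGS="-DARROW_NO_DEPRECATED_API"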
+
+Modular Build Targets
+~~~~~~~~~~~~~~~~~~~~~
+
+Since there are several major parts of the C++ project, we have provided
+modular CMake targets for building each library component, group of unit tests
+and benchmarks, and their dependencies:
+
+* ``make arrow`` for Arrow core libraries
+* ``make parquet`` for Parquet libraries
+* ``make gandiva`` for Gandiva (LLVM expression compiler) libraries
+* ``make plasma`` for Plasma libraries, server
+
+.. note::
+ If you have selected Ninja as CMake generator, replace ``make arrow`` with
+ ``ninja arrow``, and so on.
+
+To build the unit tests or benchmarks, add ``-tests`` or ``-benchmarks``
+to the target name. So ``make arrow-tests`` will build the Arrow core unit
+tests. Using the ``-all`` target, e.g. ``parquet-all``, will build everything.
+
+If you wish to only build and install one or more project subcomponents, we
+have provided the CMake option ``ARROW_OPTIONAL_INSTALL`` to only install
+targets that have been built. For example, if you only wish to build the
+Parquet libraries, its tests, and its dependencies, you can run:
+
+.. code-block:: shell
+
+ cmake .. -DARROW_PARQUET=ON \
+ -DARROW_OPTIONAL_INSTALL=ON \
+ -DARROW_BUILD_TESTS=ON
+ make parquet
+ make install
+
+If you omit an explicit target when invoking ``make``, all targets will be
+built.
+
+Debugging with Xcode on macOS
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Xcode is the IDE provided with macOS and can be used to develop and debug Arrow
+by generating an Xcode project:
+
+.. code-block:: shell
+
+ cd cpp
+ mkdir xcode-build
+ cd xcode-build
+ cmake .. -G Xcode -DARROW_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=DEBUG
+ open arrow.xcodeproj
+
+This will generate a project and open it in the Xcode app. As an alternative,
+the command ``xcodebuild`` will perform a command-line build using the
+generated project. It is recommended to use the "Automatically Create Schemes"
+option when first launching the project. Selecting an auto-generated scheme
+will allow you to build and run a unittest with breakpoints enabled.
diff --git a/src/arrow/docs/source/developers/cpp/conventions.rst b/src/arrow/docs/source/developers/cpp/conventions.rst
new file mode 100644
index 000000000..9db15fbcf
--- /dev/null
+++ b/src/arrow/docs/source/developers/cpp/conventions.rst
@@ -0,0 +1,90 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. highlight:: cpp
+
+===========
+Conventions
+===========
+
+This section provides some information about some of the abstractions and
+development approaches we use to solve problems common to many parts of the C++
+project.
+
+File Naming
+===========
+
+C++ source and header files should use underscores for word separation, not hyphens.
+Compiled executables, however, will automatically use hyphens (such that
+e.g. ``src/arrow/scalar_test.cc`` will be compiled into ``arrow-scalar-test``).
+
+C++ header files use the ``.h`` extension. Any header file name not
+containing ``internal`` is considered to be a public header, and will be
+automatically installed by the build.
+
+Comments and Docstrings
+=======================
+
+Regular comments start with ``//``.
+
+Doxygen docstrings start with ``///``, and Doxygen directives start with ``\``,
+like this::
+
+ /// \brief Allocate a fixed size mutable buffer from a memory pool, zero its padding.
+ ///
+ /// \param[in] size size of buffer to allocate
+ /// \param[in] pool a memory pool
+ ARROW_EXPORT
+ Result<std::unique_ptr<Buffer>> AllocateBuffer(const int64_t size,
+ MemoryPool* pool = NULLPTR);
+
+The summary line of a docstring uses the infinitive, not the indicative
+(for example, "Allocate a buffer" rather than "Allocates a buffer").
+
+Memory Pools
+============
+
+We provide a default memory pool with ``arrow::default_memory_pool()``.
+
+Error Handling and Exceptions
+=============================
+
+For error handling, we return ``arrow::Status`` values instead of throwing C++
+exceptions. Since the Arrow C++ libraries are intended to be useful as a
+component in larger C++ projects, using ``Status`` objects can help with good
+code hygiene by making explicit when a function is expected to be able to fail.
+
+A more recent option is to return an ``arrow::Result<T>`` object that can
+represent either a successful result with a ``T`` value, or an error result
+with a ``Status`` value.
+
+For expressing internal invariants and "cannot fail" errors, we use ``DCHECK`` macros
+defined in ``arrow/util/logging.h``. These checks are disabled in release builds
+and are intended to catch internal development errors, particularly when
+refactoring. These macros are not to be included in any public header files.
+
+Since we do not use exceptions, we avoid doing expensive work in object
+constructors. Objects that are expensive to construct may often have private
+constructors, with public static factory methods that return ``Status`` or
+``Result<T>``.
+
+There are a number of object constructors, like ``arrow::Schema`` and
+``arrow::RecordBatch``, where larger STL container objects like ``std::vector`` may
+be created. While it is possible for ``std::bad_alloc`` to be thrown in these
+constructors, the circumstances in which that would happen are somewhat esoteric,
+and it is likely that an application would have encountered other, more serious
+problems before ``std::bad_alloc`` is thrown in a constructor.
diff --git a/src/arrow/docs/source/developers/cpp/development.rst b/src/arrow/docs/source/developers/cpp/development.rst
new file mode 100644
index 000000000..4098f1c4e
--- /dev/null
+++ b/src/arrow/docs/source/developers/cpp/development.rst
@@ -0,0 +1,294 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+======================
+Development Guidelines
+======================
+
+This section provides information for developers who wish to contribute to the
+C++ codebase.
+
+.. note::
+
+ Since most of the project's developers work on Linux or macOS, not all
+ features or developer tools are uniformly supported on Windows. If you are
+ on Windows, have a look at :ref:`developers-cpp-windows`.
+
+Compiler warning levels
+=======================
+
+The ``BUILD_WARNING_LEVEL`` CMake option switches between sets of predetermined
+compiler warning levels that we use for code tidiness. For release builds, the
+default warning level is ``PRODUCTION``, while for debug builds the default is
+``CHECKIN``.
+
+When using ``CHECKIN`` for debug builds, ``-Werror`` is added when using gcc
+and clang, causing build failures for any warning, and ``/WX`` is set with MSVC
+having the same effect.
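+
+For example, to opt into the stricter warning level regardless of the build
+type, pass the option explicitly when configuring::
+
+   $ cmake .. -DBUILD_WARNING_LEVEL=CHECKIN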
+
+Running unit tests
+==================
+
+The ``-DARROW_BUILD_TESTS=ON`` CMake option enables building of unit test
+executables. You can then either run them individually, by launching the
+desired executable, or run them all at once by launching the ``ctest``
+executable (which is part of the CMake suite).
+
+A possible invocation is something like::
+
+ $ ctest -j16 --output-on-failure
+
+where the ``-j16`` option runs up to 16 tests in parallel, taking advantage
+of multiple CPU cores and hardware threads.
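+
+You can also launch a single test executable directly, which is convenient when
+iterating on one component. For example, assuming a debug build directory named
+``build/debug`` and following the executable naming convention (a file such as
+``scalar_test.cc`` is compiled into ``arrow-scalar-test``)::
+
+   $ ./build/debug/arrow-scalar-test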
+
+Running benchmarks
+==================
+
+The ``-DARROW_BUILD_BENCHMARKS=ON`` CMake option enables building of benchmark
+executables. You can then run benchmarks individually by launching the
+corresponding executable from the command line, e.g.::
+
+ $ ./build/release/arrow-builder-benchmark
+
+.. note::
+ For meaningful benchmark numbers, it is very strongly recommended to build
+ in ``Release`` mode, so as to enable compiler optimizations.
+
+Code Style, Linting, and CI
+===========================
+
+This project follows `Google's C++ Style Guide
+<https://google.github.io/styleguide/cppguide.html>`_ with minor exceptions:
+
+* We relax the line length restriction to 90 characters.
+* We use the ``NULLPTR`` macro in header files (instead of ``nullptr``) defined
+ in ``src/arrow/util/macros.h`` to support building C++/CLI (ARROW-1134)
+* We relax the guide's rules regarding structs. For public headers we should
+ use struct only for objects that are principally simple data containers where
+ it is OK to expose all the internal members and any methods are primarily
+ conveniences. For private headers the rules are relaxed further and structs
+ can be used where convenient for types that do not need access control even
+ though they may not be simple data containers.
+
+Our continuous integration builds on GitHub Actions run the unit test
+suites on a variety of platforms and configurations, including using
+Address Sanitizer and Undefined Behavior Sanitizer to check for various
+patterns of misbehaviour such as memory leaks. In addition, the
+codebase is subjected to a number of code style and code cleanliness checks.
+
+In order to have a passing CI build, your modified git branch must pass the
+following checks:
+
+* C++ builds with the project's active version of ``clang`` without
+ compiler warnings with ``-DBUILD_WARNING_LEVEL=CHECKIN``. Note that
+ there are classes of warnings (such as ``-Wdocumentation``, see more
+ on this below) that are not caught by ``gcc``.
+* CMake files pass style checks; failures can be fixed by running
+ ``archery lint --cmake-format --fix``. This requires Python
+ 3 and `cmake_format <https://github.com/cheshirekow/cmake_format>`_ (note:
+ this currently does not work on Windows)
+* Passes various C++ (and others) style checks, checked with the ``lint``
+ subcommand to :ref:`Archery <archery>`. This can also be fixed locally
+ by running ``archery lint --cpplint --fix``.
+
+In order to account for variations in the behavior of ``clang-format`` between
+major versions of LLVM, we pin the version of ``clang-format`` used (currently
+LLVM 8).
+
+Depending on how you installed clang-format, the build system may not be able
+to find it. You can provide an explicit path to your LLVM installation (or the
+root path for the clang tools) with the environment variable
+``$CLANG_TOOLS_PATH`` or by passing ``-DClangTools_PATH=$PATH_TO_CLANG_TOOLS`` when
+invoking CMake.
+
+To make linting more reproducible for everyone, we provide a ``docker-compose``
+target that is executable from the root of the repository:
+
+.. code-block:: shell
+
+ docker-compose run ubuntu-lint
+
+Cleaning includes with include-what-you-use (IWYU)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We occasionally use Google's `include-what-you-use
+<https://github.com/include-what-you-use/include-what-you-use>`_ tool, also
+known as IWYU, to remove unnecessary imports.
+
+To begin using IWYU, you must first build it by following the instructions in
+the project's documentation. Once the ``include-what-you-use`` executable is in
+your ``$PATH``, you must run CMake with ``-DCMAKE_EXPORT_COMPILE_COMMANDS=ON``
+in a new out-of-source CMake build directory like so:
+
+.. code-block:: shell
+
+ mkdir -p $ARROW_ROOT/cpp/iwyu
+ cd $ARROW_ROOT/cpp/iwyu
+ cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+ -DARROW_PYTHON=ON \
+ -DARROW_PARQUET=ON \
+ -DARROW_FLIGHT=ON \
+ -DARROW_PLASMA=ON \
+ -DARROW_GANDIVA=ON \
+ -DARROW_BUILD_BENCHMARKS=ON \
+ -DARROW_BUILD_BENCHMARKS_REFERENCE=ON \
+ -DARROW_BUILD_TESTS=ON \
+ -DARROW_BUILD_UTILITIES=ON \
+ -DARROW_S3=ON \
+ -DARROW_WITH_BROTLI=ON \
+ -DARROW_WITH_BZ2=ON \
+ -DARROW_WITH_LZ4=ON \
+ -DARROW_WITH_SNAPPY=ON \
+ -DARROW_WITH_ZLIB=ON \
+ -DARROW_WITH_ZSTD=ON ..
+
+In order for IWYU to run on the desired component in the codebase, it must be
+enabled by the CMake configuration flags. Once this is done, you can run IWYU
+on the whole codebase by running a helper ``iwyu.sh`` script:
+
+.. code-block:: shell
+
+ IWYU_SH=$ARROW_ROOT/cpp/build-support/iwyu/iwyu.sh
+ ./$IWYU_SH
+
+Since this is very time-consuming, you can check a subset of files matching
+some string pattern with the special "match" option:
+
+.. code-block:: shell
+
+ ./$IWYU_SH match $PATTERN
+
+For example, if you wanted to do IWYU checks on all files in
+``src/arrow/array``, you could run
+
+.. code-block:: shell
+
+ ./$IWYU_SH match arrow/array
+
+Checking for ABI and API stability
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To build ABI compliance reports, you need to install the two tools
+``abi-dumper`` and ``abi-compliance-checker``.
+
+Build Arrow C++ in Debug mode; alternatively, you can use ``-Og``, which also
+builds with the necessary symbols but includes a bit of code optimization.
+Once the build has finished, you can generate ABI reports using:
+
+.. code-block:: shell
+
+ abi-dumper -lver 9 debug/libarrow.so -o ABI-9.dump
+
+The above version number is freely selectable. As we want to compare versions,
+you should now ``git checkout`` the version you want to compare it to and re-run
+the above command using a different version number. Once both reports are
+generated, you can build a comparison report using
+
+.. code-block:: shell
+
+   abi-compliance-checker -l libarrow -d1 ABI-9.dump -d2 ABI-10.dump
+
+The report is then generated in ``compat_reports/libarrow`` as an HTML file.
+
+API Documentation
+=================
+
+We use Doxygen style comments (``///``) in header files for comments
+that we wish to show up in API documentation for classes and
+functions.
+
+When using ``clang`` and building with
+``-DBUILD_WARNING_LEVEL=CHECKIN``, the ``-Wdocumentation`` flag is
+used which checks for some common documentation inconsistencies, like
+documenting some, but not all function parameters with ``\param``. See
+the `LLVM documentation warnings section
+<https://releases.llvm.org/7.0.1/tools/clang/docs/DiagnosticsReference.html#wdocumentation>`_
+for more about this.
+
+While we publish the API documentation as part of the main Sphinx-based
+documentation site, you can also build the C++ API documentation anytime using
+Doxygen. Run the following command from the ``cpp/apidoc`` directory:
+
+.. code-block:: shell
+
+ doxygen Doxyfile
+
+This requires `Doxygen <https://www.doxygen.org>`_ to be installed.
+
+Apache Parquet Development
+==========================
+
+To build the C++ libraries for Apache Parquet, add the flag
+``-DARROW_PARQUET=ON`` when invoking CMake.
+To build Apache Parquet with encryption support, add the flag
+``-DPARQUET_REQUIRE_ENCRYPTION=ON`` when invoking CMake. The Parquet libraries and unit tests
+can be built with the ``parquet`` make target:
+
+.. code-block:: shell
+
+ make parquet
+
+On Linux and macOS if you do not have Apache Thrift installed on your system,
+or you are building with ``-DThrift_SOURCE=BUNDLED``, you must install
+``bison`` and ``flex`` packages. On Windows we handle these build dependencies
+automatically when building Thrift from source.
+
+Running ``ctest -L unittest`` will run all built C++ unit tests, while ``ctest -L
+parquet`` will run only the Parquet unit tests. The unit tests depend on an
+environment variable ``PARQUET_TEST_DATA`` that depends on a git submodule to the
+repository https://github.com/apache/parquet-testing:
+
+.. code-block:: shell
+
+ git submodule update --init
+ export PARQUET_TEST_DATA=$ARROW_ROOT/cpp/submodules/parquet-testing/data
+
+Here ``$ARROW_ROOT`` is the absolute path to the Arrow codebase.
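+
+Putting these steps together, a minimal Parquet development loop might look
+like this sketch:
+
+.. code-block:: shell
+
+   cmake .. -DARROW_PARQUET=ON -DARROW_BUILD_TESTS=ON
+   make parquet
+   export PARQUET_TEST_DATA=$ARROW_ROOT/cpp/submodules/parquet-testing/data
+   ctest -L parquet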
+
+Arrow Flight RPC
+================
+
+In addition to the Arrow dependencies, Flight requires:
+
+* gRPC (>= 1.14, roughly)
+* Protobuf (>= 3.6, earlier versions may work)
+* c-ares (used by gRPC)
+
+By default, Arrow will try to download and build these dependencies
+when building Flight.
+
+The optional ``flight`` libraries and tests can be built by passing
+``-DARROW_FLIGHT=ON``.
+
+.. code-block:: shell
+
+ cmake .. -DARROW_FLIGHT=ON -DARROW_BUILD_TESTS=ON
+ make
+
+You can also use existing installations of the extra dependencies.
+When building, set the environment variables ``gRPC_ROOT`` and/or
+``Protobuf_ROOT`` and/or ``c-ares_ROOT``.
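+
+For example, a sketch using preinstalled gRPC and Protobuf (the paths are
+placeholders for wherever the libraries are installed on your system):
+
+.. code-block:: shell
+
+   export gRPC_ROOT=/path/to/grpc
+   export Protobuf_ROOT=/path/to/protobuf
+   cmake .. -DARROW_FLIGHT=ON -DARROW_BUILD_TESTS=ON
+   make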
+
+We are developing against recent versions of gRPC. The
+``grpc-cpp`` package available from https://conda-forge.org/ is one reliable
+way to obtain gRPC in a cross-platform way. You may try using system libraries
+for gRPC and Protobuf, but these are likely to be too old. On macOS, you can
+try `Homebrew <https://brew.sh/>`_:
+
+.. code-block:: shell
+
+ brew install grpc
diff --git a/src/arrow/docs/source/developers/cpp/fuzzing.rst b/src/arrow/docs/source/developers/cpp/fuzzing.rst
new file mode 100644
index 000000000..41398a13d
--- /dev/null
+++ b/src/arrow/docs/source/developers/cpp/fuzzing.rst
@@ -0,0 +1,99 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+=================
+Fuzzing Arrow C++
+=================
+
+To make the handling of invalid input more robust, we have enabled
+fuzz testing on several parts of the Arrow C++ feature set, currently:
+
+* the IPC stream format
+* the IPC file format
+* the Parquet file format
+
+We welcome any contribution to expand the scope of fuzz testing and cover
+areas ingesting potentially invalid or malicious data.
+
+Fuzz Targets and Utilities
+==========================
+
+By passing the ``-DARROW_FUZZING=ON`` CMake option, you will build
+the fuzz targets corresponding to the aforementioned Arrow features, as well
+as additional related utilities.
+
+Generating the seed corpus
+--------------------------
+
+Fuzzing essentially explores the domain space by randomly mutating previously
+tested inputs, without having any high-level understanding of the area being
+fuzz-tested. However, the domain space is so huge that this strategy alone
+may fail to actually produce any "interesting" inputs.
+
+To guide the process, it is therefore important to provide a *seed corpus*
+of valid (or invalid, but remarkable) inputs from which the fuzzing
+infrastructure can derive new inputs for testing. A script is provided
+to automate that task. Assuming the fuzzing executables can be found in
+``build/debug``, the seed corpus can be generated as follows:
+
+.. code-block:: shell
+
+ $ ./build-support/fuzzing/generate_corpuses.sh build/debug
+
+Continuous fuzzing infrastructure
+=================================
+
+The process of fuzz testing is computationally intensive and therefore
+benefits from dedicated computing facilities. Arrow C++ is exercised by
+the `OSS-Fuzz`_ continuous fuzzing infrastructure operated by Google.
+
+Issues found by OSS-Fuzz are reported and made available to a limited set of
+`core developers <https://github.com/google/oss-fuzz/blob/master/projects/arrow/project.yaml>`_.
+If you are an Arrow core developer and want to be added to that list, you can
+ask on the :ref:`mailing-list <contributing>`.
+
+.. _OSS-Fuzz: https://google.github.io/oss-fuzz/
+
+Reproducing locally
+===================
+
+When a crash is found by fuzzing, it is often useful to download the data
+used to produce the crash, and use it to reproduce the crash so as to debug
+and investigate.
+
+Assuming you are in a subdirectory inside ``cpp``, the following command
+would allow you to build the fuzz targets with debug information and the
+various sanitizer checks enabled.
+
+.. code-block:: shell
+
+ $ cmake .. -GNinja \
+ -DCMAKE_BUILD_TYPE=Debug \
+ -DARROW_USE_ASAN=on \
+ -DARROW_USE_UBSAN=on \
+ -DARROW_FUZZING=on
+
+Then, assuming you have downloaded the crashing data file (let's call it
+``testcase-arrow-ipc-file-fuzz-123465``), you can reproduce the crash
+by running the affected fuzz target on that file:
+
+.. code-block:: shell
+
+ $ build/debug/arrow-ipc-file-fuzz testcase-arrow-ipc-file-fuzz-123465
+
+(you may want to run that command under a debugger so as to inspect the
+program state more closely)
diff --git a/src/arrow/docs/source/developers/cpp/index.rst b/src/arrow/docs/source/developers/cpp/index.rst
new file mode 100644
index 000000000..36c9778be
--- /dev/null
+++ b/src/arrow/docs/source/developers/cpp/index.rst
@@ -0,0 +1,31 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _cpp-development:
+
+***************
+C++ Development
+***************
+
+.. toctree::
+ :maxdepth: 2
+
+ building
+ development
+ windows
+ conventions
+ fuzzing
diff --git a/src/arrow/docs/source/developers/cpp/windows.rst b/src/arrow/docs/source/developers/cpp/windows.rst
new file mode 100644
index 000000000..ee5a613bc
--- /dev/null
+++ b/src/arrow/docs/source/developers/cpp/windows.rst
@@ -0,0 +1,412 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _developers-cpp-windows:
+
+=====================
+Developing on Windows
+=====================
+
+As on Linux and macOS, we have worked to enable builds to work "out of the box"
+with CMake for a reasonably large subset of the project.
+
+.. _windows-system-setup:
+
+System Setup
+============
+
+Microsoft provides the free Visual Studio Community edition. When doing
+development in the shell, you must initialize the development environment
+each time you open the shell.
+
+For Visual Studio 2017, execute the following batch script:
+
+.. code-block:: shell
+
+ "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\Tools\VsDevCmd.bat" -arch=amd64
+
+For Visual Studio 2019, the script is:
+
+.. code-block:: shell
+
+ "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\Tools\VsDevCmd.bat" -arch=amd64
+
+One can configure a console emulator like `cmder <https://cmder.net/>`_ to
+automatically launch this when starting a new development console.
+
+Using conda-forge for build dependencies
+========================================
+
+`Miniconda <https://conda.io/miniconda.html>`_ is a minimal Python distribution
+including the `conda <https://conda.io>`_ package manager. Some members of the
+Apache Arrow community participate in the maintenance of `conda-forge
+<https://conda-forge.org/>`_, a community-maintained cross-platform package
+repository for conda.
+
+To use ``conda-forge`` for your C++ build dependencies on Windows, first
+download and install a 64-bit distribution from the `Miniconda homepage
+<https://conda.io/miniconda.html>`_
+
+To configure ``conda`` to use the ``conda-forge`` channel by default, launch a
+command prompt (``cmd.exe``), run the initialization command shown
+:ref:`above<windows-system-setup>` (``vcvarsall.bat`` or ``VsDevCmd.bat``), then
+run the command:
+
+.. code-block:: shell
+
+ conda config --add channels conda-forge
+
+Now, you can bootstrap a build environment (call from the root directory of the
+Arrow codebase):
+
+.. code-block:: shell
+
+ conda create -y -n arrow-dev --file=ci\conda_env_cpp.txt
+
+Then "activate" this conda environment with:
+
+.. code-block:: shell
+
+ activate arrow-dev
+
+If the environment has been activated, the Arrow build system will
+automatically see the ``%CONDA_PREFIX%`` environment variable and use that for
+resolving the build dependencies. This is equivalent to setting
+
+.. code-block:: shell
+
+ -DARROW_DEPENDENCY_SOURCE=SYSTEM ^
+ -DARROW_PACKAGE_PREFIX=%CONDA_PREFIX%\Library
+
+To use the Visual Studio IDE with this conda environment activated, launch it by
+running the command ``devenv`` from the same command prompt.
+
+Note that dependencies installed as conda packages are built in release mode and
+cannot link with debug builds. If you intend to use ``-DCMAKE_BUILD_TYPE=debug``
+then you must build the packages from source.
+``-DCMAKE_BUILD_TYPE=relwithdebinfo`` is also available, which produces a build
+that can both be linked with release libraries and be debugged.
+
+.. note::
+
+ If you run into any problems using conda packages for dependencies, a very
+ common problem is mixing packages from the ``defaults`` channel with those
+ from ``conda-forge``. You can examine the installed packages in your
+ environment (and their origin) with ``conda list``
+
+Using vcpkg for build dependencies
+========================================
+
+`vcpkg <https://github.com/microsoft/vcpkg>`_ is an open source package manager
+from Microsoft. It hosts community-contributed ports of C and C++ packages and
+their dependencies. Arrow includes a manifest file `cpp/vcpkg.json
+<https://github.com/apache/arrow/blob/master/cpp/vcpkg.json>`_ that specifies
+which vcpkg packages are required to build the C++ library.
+
+To use vcpkg for C++ build dependencies on Windows, first
+`install <https://docs.microsoft.com/en-us/cpp/build/install-vcpkg>`_ and
+`integrate <https://docs.microsoft.com/en-us/cpp/build/integrate-vcpkg>`_
+vcpkg. Then change working directory in ``cmd.exe`` to the root directory
+of Arrow and run the command:
+
+.. code-block:: shell
+
+ vcpkg install ^
+ --triplet x64-windows ^
+ --x-manifest-root cpp ^
+ --feature-flags=versions ^
+ --clean-after-build
+
+On Windows, vcpkg builds dynamic link libraries by default. Use the triplet
+``x64-windows-static`` to build static libraries. vcpkg downloads source
+packages and compiles them locally, so installing dependencies with vcpkg is
+more time-consuming than with conda.
+
+Then in your ``cmake`` command, to use dependencies installed by vcpkg, set:
+
+.. code-block:: shell
+
+ -DARROW_DEPENDENCY_SOURCE=VCPKG
+
+You can optionally set other variables to override the default CMake
+configurations for vcpkg, including:
+
+* ``-DCMAKE_TOOLCHAIN_FILE``: by default, the CMake scripts automatically find
+ the location of the vcpkg CMake toolchain file ``vcpkg.cmake``; use this to
+ instead specify its location
+* ``-DVCPKG_TARGET_TRIPLET``: by default, the CMake scripts attempt to infer the
+ vcpkg
+ `triplet <https://github.com/microsoft/vcpkg/blob/master/docs/users/triplets.md>`_;
+ use this to instead specify the triplet
+* ``-DARROW_DEPENDENCY_USE_SHARED``: default is ``ON``; set to ``OFF`` for
+ static libraries
+* ``-DVCPKG_MANIFEST_MODE``: default is ``ON``; set to ``OFF`` to ignore the
+ ``vcpkg.json`` manifest file and only look for vcpkg packages that are
+ already installed under the directory where vcpkg is installed
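+
+Putting these options together, a configuration that uses the vcpkg packages
+installed by the command above might look like the following sketch:
+
+.. code-block:: shell
+
+   cmake .. -G "Visual Studio 16 2019" -A x64 ^
+         -DARROW_DEPENDENCY_SOURCE=VCPKG ^
+         -DVCPKG_TARGET_TRIPLET=x64-windows ^
+         -DARROW_BUILD_TESTS=ON
+   cmake --build . --config Release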
+
+
+Building using Visual Studio (MSVC) Solution Files
+==================================================
+
+Change working directory in ``cmd.exe`` to the root directory of Arrow and do
+an out of source build by generating a MSVC solution:
+
+.. code-block:: shell
+
+ cd cpp
+ mkdir build
+ cd build
+ cmake .. -G "Visual Studio 15 2017" -A x64 ^
+ -DARROW_BUILD_TESTS=ON
+ cmake --build . --config Release
+
+For newer versions of Visual Studio, specify the generator
+``Visual Studio 16 2019`` or see ``cmake --help`` for available
+generators.
+
+Building with Ninja and clcache
+===============================
+
+The `Ninja <https://ninja-build.org/>`_ build system offers better build
+parallelization, and the optional `clcache
+<https://github.com/frerich/clcache/>`_ compiler cache keeps track of
+past compilations to avoid running them over and over again (in a way similar
+to the Unix-specific ``ccache``).
+
+Newer versions of Visual Studio include Ninja. To see if your Visual Studio
+includes Ninja, run the initialization command shown
+:ref:`above<windows-system-setup>` (``vcvarsall.bat`` or ``VsDevCmd.bat``), then
+run ``ninja --version``.
+
+If Ninja is not included in your version of Visual Studio, and you are using
+conda, activate your conda environment and install Ninja and clcache:
+
+.. code-block:: shell
+
+ activate arrow-dev
+ conda install -c conda-forge ninja
+ pip install git+https://github.com/frerich/clcache.git
+
+If you are not using conda,
+`install Ninja from another source <https://github.com/ninja-build/ninja/wiki/Pre-built-Ninja-packages>`_
+and optionally
+`install clcache from another source <https://github.com/frerich/clcache/wiki/Installation>`_
+.
+
+After installation is complete, change working directory in ``cmd.exe`` to the root directory of Arrow and
+do an out of source build by generating Ninja files:
+
+.. code-block:: shell
+
+ cd cpp
+ mkdir build
+ cd build
+ cmake -G "Ninja" ^
+ -DCMAKE_C_COMPILER=clcache ^
+ -DCMAKE_CXX_COMPILER=clcache ^
+ -DARROW_BUILD_TESTS=ON ^
+ -DGTest_SOURCE=BUNDLED ..
+ cmake --build . --config Release
+
+Setting ``CMAKE_C_COMPILER`` and ``CMAKE_CXX_COMPILER`` in the command line
+of ``cmake`` is the preferred method of using ``clcache``. Alternatively, you
+can set ``CC`` and ``CXX`` environment variables before calling ``cmake``:
+
+.. code-block:: shell
+
+ ...
+ set CC=clcache
+ set CXX=clcache
+ cmake -G "Ninja" ^
+ ...
+
+
+
+Building with NMake
+===================
+
+Change working directory in ``cmd.exe`` to the root directory of Arrow and
+do an out of source build using ``nmake``:
+
+.. code-block:: shell
+
+ cd cpp
+ mkdir build
+ cd build
+ cmake -G "NMake Makefiles" ..
+ nmake
+
+Building on MSYS2
+=================
+
+You can build on MSYS2 terminal, ``cmd.exe`` or PowerShell terminal.
+
+On MSYS2 terminal:
+
+.. code-block:: shell
+
+ cd cpp
+ mkdir build
+ cd build
+ cmake -G "MSYS Makefiles" ..
+ make
+
+On ``cmd.exe`` or PowerShell terminal, you can use the following batch
+file:
+
+.. code-block:: batch
+
+ setlocal
+
+ REM For 64bit
+ set MINGW_PACKAGE_PREFIX=mingw-w64-x86_64
+ set MINGW_PREFIX=c:\msys64\mingw64
+ set MSYSTEM=MINGW64
+
+ set PATH=%MINGW_PREFIX%\bin;c:\msys64\usr\bin;%PATH%
+
+ rmdir /S /Q cpp\build
+ mkdir cpp\build
+ pushd cpp\build
+ cmake -G "MSYS Makefiles" .. || exit /B
+ make || exit /B
+ popd
+
+Debug builds
+============
+
+To build a Debug version of Arrow, you should have pre-installed a Debug
+version of Boost. It's recommended to configure ``cmake`` with the following
+variables for a Debug build:
+
+* ``-DARROW_BOOST_USE_SHARED=OFF``: enables static linking with boost debug
+ libs and simplifies run-time loading of 3rd parties
+* ``-DBOOST_ROOT``: sets the root directory of boost libs. (Optional)
+* ``-DBOOST_LIBRARYDIR``: sets the directory with boost lib files. (Optional)
+
+The command line to build Arrow in Debug mode will look something like this:
+
+.. code-block:: shell
+
+ cd cpp
+ mkdir build
+ cd build
+ cmake .. -G "Visual Studio 15 2017" -A x64 ^
+ -DARROW_BOOST_USE_SHARED=OFF ^
+ -DCMAKE_BUILD_TYPE=Debug ^
+ -DBOOST_ROOT=C:/local/boost_1_63_0 ^
+ -DBOOST_LIBRARYDIR=C:/local/boost_1_63_0/lib64-msvc-14.0
+ cmake --build . --config Debug
+
+Windows dependency resolution issues
+====================================
+
+Because Windows uses ``.lib`` files for both static and dynamic linking of
+dependencies, the static library may sometimes be named differently, such as
+``%PACKAGE%_static.lib``, to distinguish it. If you are statically
+linking some dependencies, we provide some options:
+
+* ``-DBROTLI_MSVC_STATIC_LIB_SUFFIX=%BROTLI_SUFFIX%``
+* ``-DSNAPPY_MSVC_STATIC_LIB_SUFFIX=%SNAPPY_SUFFIX%``
+* ``-DLZ4_MSVC_STATIC_LIB_SUFFIX=%LZ4_SUFFIX%``
+* ``-DZSTD_MSVC_STATIC_LIB_SUFFIX=%ZSTD_SUFFIX%``
+
+To get the latest build instructions, you can reference `ci/appveyor-cpp-build.bat
+<https://github.com/apache/arrow/blob/master/ci/appveyor-cpp-build.bat>`_,
+which is used by automated Appveyor builds.
+
+Statically linking to Arrow on Windows
+======================================
+
+The Arrow headers on Windows static library builds (enabled by the CMake
+option ``ARROW_BUILD_STATIC``) use the preprocessor macro ``ARROW_STATIC`` to
+suppress dllimport/dllexport marking of symbols. Projects that statically link
+against Arrow on Windows additionally need this definition. The Unix builds do
+not use the macro.
+
+Replicating Appveyor Builds
+===========================
+
+For people who are more familiar with Linux development but need to replicate a
+failing AppVeyor build, here are some rough notes on replicating the
+``Static_Crt_Build`` job (``make unittest`` will probably still fail, but many
+unit tests can be built with their individual make targets).
+
+1. Microsoft offers trial VMs for `Windows with Microsoft Visual Studio
+ <https://developer.microsoft.com/en-us/windows/downloads/virtual-machines>`_.
+ Download and install a version.
+2. Run the VM and install `Git <https://git-scm.com/>`_, `CMake
+ <https://cmake.org/>`_, and Miniconda or Anaconda (these instructions assume
+ Anaconda). Also install the `"Build Tools for Visual Studio"
+ <https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2019>`_.
+ Make sure to select the C++ toolchain in the installer wizard, and reboot
+ after installation.
+3. Download `pre-built Boost debug binaries
+ <https://sourceforge.net/projects/boost/files/boost-binaries/>`_ and install
+ it.
+
+ Run this from an Anaconda/Miniconda command prompt (*not* PowerShell prompt),
+ and make sure to run "vcvarsall.bat x64" first. The location of vcvarsall.bat
+   will depend on your installation; it may be under a different path than the one
+   commonly indicated,
+ e.g. "``C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvarsall.bat``"
+ with the 2019 build tools.
+
+.. code-block:: shell
+
+ cd $EXTRACT_BOOST_DIRECTORY
+ .\bootstrap.bat
+ @rem This is for static libraries needed for static_crt_build in appveyor
+ .\b2 link=static --with-filesystem --with-regex --with-system install
+ @rem this should put libraries and headers in c:\Boost
+
+4. Activate anaconda/miniconda:
+
+.. code-block:: shell
+
+ @rem this might differ for miniconda
+ C:\Users\User\Anaconda3\Scripts\activate
+
+5. Clone and change directories to the arrow source code (you might need to
+ install git).
+6. Setup environment variables:
+
+.. code-block:: shell
+
+ @rem Change the build type based on which appveyor job you want.
+ SET JOB=Static_Crt_Build
+ SET GENERATOR=Ninja
+ SET APPVEYOR_BUILD_WORKER_IMAGE=Visual Studio 2017
+ SET USE_CLCACHE=false
+ SET ARROW_BUILD_GANDIVA=OFF
+ SET ARROW_LLVM_VERSION=8.0.*
+ SET PYTHON=3.6
+ SET ARCH=64
+ SET PATH=C:\Users\User\Anaconda3;C:\Users\User\Anaconda3\Scripts;C:\Users\User\Anaconda3\Library\bin;%PATH%
+ SET BOOST_LIBRARYDIR=C:\Boost\lib
+ SET BOOST_ROOT=C:\Boost
+
+7. Run appveyor scripts:
+
+.. code-block:: shell
+
+ conda install -c conda-forge --file .\ci\conda_env_cpp.txt
+ .\ci\appveyor-cpp-setup.bat
+   @rem this might fail but at this point most unit tests should be buildable by their individual targets
+ @rem see next line for example.
+ .\ci\appveyor-cpp-build.bat
+ @rem you can also just invoke cmake directly with the desired options
+ cmake --build . --config Release --target arrow-compute-hash-test
diff --git a/src/arrow/docs/source/developers/crossbow.rst b/src/arrow/docs/source/developers/crossbow.rst
new file mode 100644
index 000000000..cb49a2446
--- /dev/null
+++ b/src/arrow/docs/source/developers/crossbow.rst
@@ -0,0 +1,258 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Packaging and Testing with Crossbow
+===================================
+
+The content of the ``arrow/dev/tasks`` directory aims to automate the process of
+Arrow packaging and integration testing.
+
+Packages:
+ - C++ and Python `conda-forge packages`_ for Linux, Mac and Windows
+ - Python `Wheels`_ for Linux, Mac and Windows
+ - C++ and GLib `Linux packages`_ for multiple distributions
+ - Java for Gandiva
+
+Integration tests:
+ - Various docker tests
+ - Pandas
+ - Dask
+ - Turbodbc
+ - HDFS
+ - Spark
+
+Architecture
+------------
+
+Executors
+~~~~~~~~~
+
+Individual jobs are executed on public CI services, currently:
+
+- Linux: TravisCI, CircleCI, Azure Pipelines
+- Mac: TravisCI, Azure Pipelines
+- Windows: AppVeyor, Azure Pipelines
+
+Queue
+~~~~~
+
+Because of the nature of how the CI services work, the scheduling of
+jobs happens through an additional git repository, which acts like a job
+queue for the tasks. Anyone can host a ``queue`` repository, which is usually
+called ``crossbow``.
+
+A job is a git commit on a particular git branch, containing only the required
+configuration file to run the requested build (like ``.travis.yml``,
+``appveyor.yml`` or ``azure-pipelines.yml``).
+
+Scheduler
+~~~~~~~~~
+
+Crossbow handles version generation, task rendering and
+submission. The tasks are defined in ``tasks.yml``.
+
+Install
+-------
+
+The following guide depends on GitHub, but theoretically any git
+server can be used.
+
+If you are not using the `ursacomputing/crossbow <https://github.com/ursacomputing/crossbow>`_
+repository, you will need to complete the first two steps, otherwise proceed
+to step 3:
+
+1. `Create the queue repository`_
+
+2. Enable `TravisCI`_, `Appveyor`_, `Azure Pipelines`_ and `CircleCI`_
+   integrations for the newly created queue repository.
+
+ - turn off Travis’ `auto cancellation`_ feature on branches
+
+3. Clone either ursacomputing/crossbow if you are using that, or the newly
+ created repository next to the arrow repository:
+
+   By default the scripts look for ``crossbow`` next to the arrow repository, but
+   this can be configured through command line arguments.
+
+ .. code:: bash
+
+ git clone https://github.com/<user>/crossbow crossbow
+
+ **Important note:** Crossbow only supports GitHub token based
+   authentication. Although it overwrites repository URLs provided with the SSH
+   protocol, it's advisable to use HTTPS repository URLs.
+
+4. `Create a Personal Access Token`_ with ``repo`` and ``workflow`` permissions (other
+ permissions are not needed)
+
+5. Locally export the token as an environment variable:
+
+ .. code:: bash
+
+ export CROSSBOW_GITHUB_TOKEN=<token>
+
+ or pass as an argument to the CLI script ``--github-token``
+
+6. Export the previously created GitHub token on the CI services:
+
+   Use the ``CROSSBOW_GITHUB_TOKEN`` encrypted environment variable. You can
+   set it at the following URLs, where ``ghuser`` is the GitHub
+ username and ``ghrepo`` is the GitHub repository name (typically
+ ``crossbow``):
+
+ - TravisCI: ``https://travis-ci.org/<ghuser>/<ghrepo>/settings``
+ - Appveyor:
+ ``https://ci.appveyor.com/project/<ghuser>/<ghrepo>/settings/environment``
+ - CircleCI:
+ ``https://circleci.com/gh/<ghuser>/<ghrepo>/edit#env-vars``
+
+ On Appveyor check the ``skip branches without appveyor.yml`` checkbox
+   on the web UI under the crossbow repository’s settings.
+
+7. Install Python (minimum supported version is 3.6):
+
+ Miniconda is preferred, see installation instructions:
+ https://conda.io/docs/user-guide/install/index.html
+
+8. Install the archery toolset containing crossbow itself:
+
+ .. code:: bash
+
+ pip install -e "arrow/dev/archery[crossbow]"
+
+9. Try running it:
+
+ .. code:: bash
+
+ $ archery crossbow --help
+
+Usage
+-----
+
+The script does the following:
+
+1. Detects the current repository, thus supports forks. The following
+ snippet will build kszucs’s fork instead of the upstream apache/arrow
+ repository.
+
+ .. code:: bash
+
+ $ git clone https://github.com/kszucs/arrow
+ $ git clone https://github.com/kszucs/crossbow
+
+ $ cd arrow/dev/tasks
+ $ archery crossbow submit --help # show the available options
+ $ archery crossbow submit conda-win conda-linux conda-osx
+
+2. Gets the HEAD commit of the currently checked out branch and
+ generates the version number based on `setuptools_scm`_. So to build
+   a particular branch, check it out before running the script:
+
+ .. code:: bash
+
+ git checkout ARROW-<ticket number>
+ archery crossbow submit --dry-run conda-linux conda-osx
+
+ Note that the arrow branch must be pushed beforehand, because the
+ script will clone the selected branch.
+
+3. Reads and renders the required build configurations with the
+ parameters substituted.
+
+4. Creates a branch per task, prefixed with the job id. For example, to
+   build conda recipes on Linux it will create a new branch:
+ ``crossbow@build-<id>-conda-linux``.
+
+5. Pushes the modified branches to GitHub which triggers the builds. For
+ authentication it uses GitHub OAuth tokens described in the install
+ section.
+
+Query the build status
+~~~~~~~~~~~~~~~~~~~~~~
+
+The build id (which has a corresponding branch in the queue repository) is
+returned by the ``submit`` command.
+
+.. code:: bash
+
+ archery crossbow status <build id / branch name>
+
+Download the build artifacts
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code:: bash
+
+ archery crossbow artifacts <build id / branch name>
+
+Examples
+~~~~~~~~
+
+The submit command accepts a list of task names and/or a list of task-group
+names to select which tasks to build.
+
+Run multiple builds:
+
+.. code:: bash
+
+ $ archery crossbow submit debian-stretch conda-linux-gcc-py37-r40
+ Repository: https://github.com/kszucs/arrow@tasks
+ Commit SHA: 810a718836bb3a8cefc053055600bdcc440e6702
+ Version: 0.9.1.dev48+g810a7188.d20180414
+ Pushed branches:
+ - debian-stretch
+ - conda-linux-gcc-py37-r40
+
+Just render without applying or committing the changes:
+
+.. code:: bash
+
+ $ archery crossbow submit --dry-run task_name
+
+Run only ``conda`` package builds and a Linux one:
+
+.. code:: bash
+
+ $ archery crossbow submit --group conda centos-7
+
+Run ``wheel`` builds:
+
+.. code:: bash
+
+ $ archery crossbow submit --group wheel
+
+There are multiple task groups in ``tasks.yml``, such as docker, integration
+and cpp-python, for running docker-based tests.
+
+``archery crossbow submit`` supports multiple options and arguments; for more
+information, see its help page:
+
+.. code:: bash
+
+ $ archery crossbow submit --help
+
+
+.. _conda-forge packages: conda-recipes
+.. _Wheels: python-wheels
+.. _Linux packages: linux-packages
+.. _Create the queue repository: https://help.github.com/articles/creating-a-new-repository
+.. _TravisCI: https://travis-ci.org/getting_started
+.. _Appveyor: https://www.appveyor.com/docs/
+.. _CircleCI: https://circleci.com/docs/2.0/getting-started/
+.. _Azure Pipelines: https://docs.microsoft.com/en-us/azure/devops/pipelines/get-started/pipelines-sign-up
+.. _auto cancellation: https://docs.travis-ci.com/user/customizing-the-build/#Building-only-the-latest-commit
+.. _Create a Personal Access Token: https://help.github.com/articles/creating-a-personal-access-token-for-the-command-line/
+.. _setuptools_scm: https://pypi.python.org/pypi/setuptools_scm
diff --git a/src/arrow/docs/source/developers/docker.rst b/src/arrow/docs/source/developers/docker.rst
new file mode 100644
index 000000000..36b468752
--- /dev/null
+++ b/src/arrow/docs/source/developers/docker.rst
@@ -0,0 +1,226 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _docker-builds:
+
+Running Docker Builds
+=====================
+
+Most of our Linux-based Continuous Integration tasks are decoupled from public
+CI services using `Docker <https://docs.docker.com/>`_ and
+`docker-compose <https://docs.docker.com/compose/>`_. Keeping the CI configuration
+minimal makes local reproducibility possible.
+
+Usage
+-----
+
+There are multiple ways to execute the docker-based builds.
+The recommended way is to use the :ref:`Archery <archery>` tool:
+
+Examples
+~~~~~~~~
+
+**List the available images:**
+
+.. code:: bash
+
+ archery docker images
+
+**Execute a build:**
+
+.. code:: bash
+
+ archery docker run conda-python
+
+Archery calls the following docker-compose commands:
+
+.. code:: bash
+
+ docker-compose pull --ignore-pull-failures conda-cpp
+ docker-compose pull --ignore-pull-failures conda-python
+ docker-compose build conda-cpp
+ docker-compose build conda-python
+ docker-compose run --rm conda-python
+
+**Show the docker-compose commands instead of executing them:**
+
+.. code:: bash
+
+ archery docker run --dry-run conda-python
+
+**To disable the image pulling:**
+
+.. code:: bash
+
+ archery docker run --no-cache conda-python
+
+Which translates to:
+
+.. code:: bash
+
+ docker-compose build --no-cache conda-cpp
+ docker-compose build --no-cache conda-python
+ docker-compose run --rm conda-python
+
+**To disable the cache only for the leaf image:**
+
+This is useful to force building the development version of a dependency.
+In the example below, the command builds the
+``conda-cpp > conda-python > conda-python-pandas`` branch of the image tree,
+where the leaf image is ``conda-python-pandas``.
+
+.. code:: bash
+
+ PANDAS=master archery docker run --no-leaf-cache conda-python-pandas
+
+Which translates to:
+
+.. code:: bash
+
+ export PANDAS=master
+ docker-compose pull --ignore-pull-failures conda-cpp
+ docker-compose pull --ignore-pull-failures conda-python
+ docker-compose build conda-cpp
+ docker-compose build conda-python
+ docker-compose build --no-cache conda-python-pandas
+ docker-compose run --rm conda-python-pandas
+
+Note that it doesn't pull the conda-python-pandas image and disables the cache
+when building it.
+
+``PANDAS`` is a build parameter (see `Docker Build Parameters`_ below); the
+defaults are in the .env file.
+
+**To entirely skip building the image:**
+
+The layer-caching mechanism of docker-compose can be less reliable than
+docker's, depending on the version, the ``cache_from`` build entry, and the
+backend used (docker-py, docker-cli, docker-cli and buildkit). This can lead to
+different layer hashes - even when executing the same build command
+repeatedly - eventually causing cache misses and full image rebuilds.
+
+*If the image has been already built but the cache doesn't work properly*, it
+can be useful to skip the build phases:
+
+.. code:: bash
+
+ # first run ensures that the image is built
+ archery docker run conda-python
+
+  # if the second run tries to build the image again and none of the files
+ # referenced in the relevant dockerfile have changed, then it indicates a
+ # cache miss caused by the issue described above
+ archery docker run conda-python
+
+ # since the image is properly built with the first command, there is no
+ # need to rebuild it, so manually disable the pull and build phases to
+  # save some time
+ archery docker run --no-pull --no-build conda-python
+
+**Pass environment variables to the container:**
+
+Most of the build scripts used within the containers can be configured through
+environment variables. Pass them using ``--env`` or ``-e`` CLI options -
+similar to the ``docker run`` and ``docker-compose run`` interface.
+
+.. code:: bash
+
+ archery docker run --env CMAKE_BUILD_TYPE=release ubuntu-cpp
+
+For the available environment variables in the C++ builds see the
+``ci/scripts/cpp_build.sh`` script.
+
+**Run the image with a custom command:**
+
+Custom docker commands may be passed as the second argument to
+``archery docker run``.
+
+The following example starts an interactive ``bash`` session in the container
+- useful for debugging the build interactively:
+
+.. code:: bash
+
+ archery docker run ubuntu-cpp bash
+
+Docker Volume Caches
+~~~~~~~~~~~~~~~~~~~~
+
+Most of the compose containers have specific directories mounted from the host
+to reuse ``ccache`` and ``maven`` artifacts. These docker volumes are placed
+in the ``.docker`` directory.
+
+In order to clean up the cache simply delete one or more directories (or the
+whole ``.docker`` directory).
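+
+For example, from the repository root (this removes all cached volumes):
+
+.. code:: bash
+
+   rm -rf .docker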
+
+
+Development
+-----------
+
+The docker-compose configuration is tuned towards reusable development
+containers using hierarchical images. For example, multiple language bindings
+depend on the C++ implementation, so instead of redefining the
+C++ environment in multiple Dockerfiles, we can reuse the exact same base C++
+image when building the GLib, Ruby, R and Python bindings.
+This reduces duplication and streamlines maintenance, but makes the
+docker-compose configuration more complicated.
+
+Docker Build Parameters
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The build-time parameters are pushed down to the Dockerfiles to make
+image building more flexible. These parameters are usually called docker
+build args, but we pass these values as environment variables to
+docker-compose.yml. The build parameters are extensively used for:
+
+- defining the docker registry used for caching
+- platform architectures
+- operating systems and versions
+- defining various versions of dependencies
+
+The default parameter values are stored in the top level .env file.
+For detailed examples see the docker-compose.yml.
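+
+As a sketch of the pattern (the parameter name is illustrative; see the .env
+file for the actual ones), a default can be overridden for a single run by
+setting the corresponding environment variable:
+
+.. code:: bash
+
+   # override the default Python version defined in .env for this run only
+   PYTHON=3.9 archery docker run conda-python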
+
+Build Scripts
+~~~~~~~~~~~~~
+
+The scripts maintained under the ci/scripts directory should be kept
+parametrizable but reasonably minimal, so that they clearly encapsulate the
+tasks they are responsible for. For example:
+
+- ``cpp_build.sh``: build the C++ implementation without running the tests.
+- ``cpp_test.sh``: execute the C++ tests.
+- ``python_build.sh``: build the Python bindings without running the tests.
+- ``python_test.sh``: execute the python tests.
+- ``docs_build.sh``: build the Sphinx documentation.
+- ``integration_dask.sh``: execute the dask integration tests.
+- ``integration_pandas.sh``: execute the pandas integration tests.
+- ``install_minio.sh``: install minio server for multiple platforms.
+- ``install_conda.sh``: install miniconda for multiple platforms.
+- ``install_gcs_testbench.sh``: install the GCS testbench for multiple platforms.
+
+The parametrization (like the C++ CMake options) is achieved via environment
+variables with useful defaults to keep the build configurations declarative.
+
+A good example is the ``cpp_build.sh`` build script, which forwards environment
+variables as CMake options so that the same script can be invoked in various
+configurations without modification. For examples, see how the
+environment variables are passed in docker-compose.yml's C++ images.
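+
+As a sketch of the pattern (the option name and default shown here are
+illustrative, not an excerpt of the actual script):
+
+.. code:: bash
+
+   # An environment variable with a default is forwarded as a CMake option, so
+   # docker-compose.yml can toggle features per image without editing the script.
+   cmake -DARROW_PARQUET=${ARROW_PARQUET:-ON} ..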
+
+Adding New Images
+~~~~~~~~~~~~~~~~~
+
+See the inline comments available in the docker-compose.yml file.
diff --git a/src/arrow/docs/source/developers/documentation.rst b/src/arrow/docs/source/developers/documentation.rst
new file mode 100644
index 000000000..813cc9cbd
--- /dev/null
+++ b/src/arrow/docs/source/developers/documentation.rst
@@ -0,0 +1,103 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _building-docs:
+
+Building the Documentation
+==========================
+
+Prerequisites
+-------------
+
+The documentation build process uses `Doxygen <http://www.doxygen.nl/>`_ and
+`Sphinx <http://www.sphinx-doc.org/>`_ along with a few extensions.
+
+If you're using Conda, the required software can be installed in a single line:
+
+.. code-block:: shell
+
+ conda install -c conda-forge --file=ci/conda_env_sphinx.txt
+
+Otherwise, you'll first need to install `Doxygen <http://www.doxygen.nl/>`_
+yourself (for example from your distribution's official repositories, if
+using Linux). Then you can install the Python-based requirements with the
+following command:
+
+.. code-block:: shell
+
+ pip install -r docs/requirements.txt
+
+Building
+--------
+
+.. note::
+
+ If you are building the documentation on Windows, not all sections
+ may build properly.
+
+These two steps are mandatory and must be executed in order.
+
+#. Process the C++ API using Doxygen
+
+ .. code-block:: shell
+
+ pushd cpp/apidoc
+ doxygen
+ popd
+
+#. Build the complete documentation using Sphinx.
+
+ .. note::
+
+      This step requires that the pyarrow library is installed
+      in your Python environment. One way to accomplish
+ this is to follow the build instructions at :ref:`python-development`
+ and then run ``python setup.py install`` in arrow/python
+ (it is best to do this in a dedicated conda/virtual environment).
+
+ .. code-block:: shell
+
+ pushd docs
+ make html
+ popd
+
+.. note::
+
+ Note that building the documentation may fail if your build of pyarrow is
+ not sufficiently comprehensive. Portions of the Python API documentation
+ will also not build without CUDA support having been built.
+
+After these steps are completed, the documentation is rendered in HTML
+format in ``docs/_build/html``. In particular, you can point your browser
+at ``docs/_build/html/index.html`` to read the docs and review any changes
+you made.
+
+Building with Docker
+--------------------
+
+You can use :ref:`Archery <archery>` to build the documentation within a
+Docker container.
+
+.. code-block:: shell
+
+ archery docker run ubuntu-docs
+
+The final output is located under ``docs/_build/html``.
+
+.. seealso::
+
+ :ref:`docker-builds`.
diff --git a/src/arrow/docs/source/developers/experimental_repos.rst b/src/arrow/docs/source/developers/experimental_repos.rst
new file mode 100644
index 000000000..f13adba2b
--- /dev/null
+++ b/src/arrow/docs/source/developers/experimental_repos.rst
@@ -0,0 +1,65 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Experimental repositories
+=========================
+
+Apache Arrow has an explicit policy for developing experimental repositories
+in the context of
+`rules for revolutionaries <https://grep.codeconsult.ch/2020/04/07/rules-for-revolutionaries-2000-edition/>`_.
+
+The main motivation for this policy is to offer a lightweight mechanism to
+conduct experimental work, with the necessary creative freedom, within the ASF
+and the Apache Arrow governance model. This policy allows committers to work on
+new repositories, as they offer many important tools to manage them (e.g. GitHub
+issues, “watch”, and “GitHub stars” to measure overall interest).
+
+Process
++++++++
+
+* A committer *may* initiate experimental work by creating a separate git
+  repository within the Apache Arrow project (e.g. via `selfserve <https://selfserve.apache.org/>`_)
+ and announcing it on the mailing list, together with its goals, and a link to the
+ newly created repository.
+* The committer *must* initiate an email thread with the sole purpose of
+ presenting updates to the community about the status of the repo.
+* There *must not* be official releases from the repository.
+* Any decision to make the experimental repo official in any way, whether by merging or migrating, *must* be discussed and voted on in the mailing list.
+* The committer is responsible for managing issues, documentation, CI of the repository,
+ including licensing checks.
+* The committer decides when the repository is archived.
+
+Repository management
++++++++++++++++++++++
+
+* The repository *must* be under ``apache/``
+* The repository’s name *must* be prefixed by ``arrow-experimental-``
+* The committer has full permissions over the repository (within what is possible in the ASF)
+* Push / merge permissions *must only* be granted to Apache Arrow committers
+
+Development process
++++++++++++++++++++
+
+* The repository must follow the ASF requirements about 3rd party code.
+* The committer decides how to manage issues, PRs, etc.
+
+Divergences
++++++++++++
+
+* If any of the “must” requirements above fails to materialize and no
+  corrective measure is taken by the committer upon request, the PMC *should*
+  take ownership and decide what to do.
diff --git a/src/arrow/docs/source/developers/python.rst b/src/arrow/docs/source/developers/python.rst
new file mode 100644
index 000000000..3795512ef
--- /dev/null
+++ b/src/arrow/docs/source/developers/python.rst
@@ -0,0 +1,565 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow
+.. _python-development:
+
+==================
+Python Development
+==================
+
+This page provides general Python development guidelines and source build
+instructions for all platforms.
+
+Coding Style
+============
+
+We follow a PEP8-like coding style similar to that of the `pandas project
+<https://github.com/pandas-dev/pandas>`_. To check style issues, use the
+:ref:`Archery <archery>` subcommand ``lint``:
+
+.. code-block:: shell
+
+ pip install -e arrow/dev/archery[lint]
+
+.. code-block:: shell
+
+ archery lint --python
+
+Some of the issues can be automatically fixed by passing the ``--fix`` option:
+
+.. code-block:: shell
+
+ archery lint --python --fix
+
+Unit Testing
+============
+
+We are using `pytest <https://docs.pytest.org/en/latest/>`_ to develop our unit
+test suite. After building the project (see below) you can run its unit tests
+like so:
+
+.. code-block:: shell
+
+ pytest pyarrow
+
+Package requirements to run the unit tests are found in
+``requirements-test.txt`` and can be installed if needed with ``pip install -r
+requirements-test.txt``.
+
+The project has a number of custom command line options for its test
+suite. Some tests are disabled by default, for example. To see all the options,
+run
+
+.. code-block:: shell
+
+ pytest pyarrow --help
+
+and look for the "custom options" section.
+
+Test Groups
+-----------
+
+We have many tests that are grouped together using pytest marks. Some of these
+are disabled by default. To enable a test group, pass ``--$GROUP_NAME``,
+e.g. ``--parquet``. To disable a test group, prepend ``disable``, so
+``--disable-parquet`` for example. To run **only** the unit tests for a
+particular group, prepend ``only-`` instead, for example ``--only-parquet``.
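+
+For example, to run only the Parquet tests:
+
+.. code-block:: shell
+
+   pytest pyarrow --only-parquet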
+
+The test groups currently include:
+
+* ``gandiva``: tests for Gandiva expression compiler (uses LLVM)
+* ``hdfs``: tests that use libhdfs or libhdfs3 to access the Hadoop filesystem
+* ``hypothesis``: tests that use the ``hypothesis`` module for generating
+ random test cases. Note that ``--hypothesis`` doesn't work due to a quirk
+ with pytest, so you have to pass ``--enable-hypothesis``
+* ``large_memory``: Test requiring a large amount of system RAM
+* ``orc``: Apache ORC tests
+* ``parquet``: Apache Parquet tests
+* ``plasma``: Plasma Object Store tests
+* ``s3``: Tests for Amazon S3
+* ``tensorflow``: Tests that involve TensorFlow
+* ``flight``: Flight RPC tests
+
+Benchmarking
+------------
+
+For running the benchmarks, see :ref:`python-benchmarks`.
+
+Building on Linux and macOS
+===========================
+
+System Requirements
+-------------------
+
+On macOS, any modern Xcode (6.4 or higher; the current version is 10) is
+sufficient.
+
+On Linux, for this guide, we require a minimum of gcc 4.8, or clang 3.7 or
+higher. You can check your version by running
+
+.. code-block:: shell
+
+ $ gcc --version
+
+If the system compiler is older than gcc 4.8, it can be set to a newer version
+using the ``$CC`` and ``$CXX`` environment variables:
+
+.. code-block:: shell
+
+ export CC=gcc-4.8
+ export CXX=g++-4.8
+
+Environment Setup and Build
+---------------------------
+
+First, let's clone the Arrow git repository:
+
+.. code-block:: shell
+
+ mkdir repos
+ cd repos
+ git clone https://github.com/apache/arrow.git
+
+You should now see
+
+.. code-block:: shell
+
+ $ ls -l
+ total 8
+ drwxrwxr-x 12 wesm wesm 4096 Apr 15 19:19 arrow/
+
+Pull in the test data and set up the environment variables:
+
+.. code-block:: shell
+
+ pushd arrow
+ git submodule init
+ git submodule update
+ export PARQUET_TEST_DATA="${PWD}/cpp/submodules/parquet-testing/data"
+ export ARROW_TEST_DATA="${PWD}/testing/data"
+ popd
+
+Using Conda
+~~~~~~~~~~~
+
+.. note::
+
+ Using conda to build Arrow on macOS is complicated by the
+ fact that the `conda-forge compilers require an older macOS SDK <https://stackoverflow.com/a/55798942>`_.
+ Conda offers some `installation instructions <https://docs.conda.io/projects/conda-build/en/latest/resources/compiler-tools.html#macos-sdk>`_;
+ the alternative would be to use :ref:`Homebrew <python-homebrew>` and
+ ``pip`` instead.
+
+Let's create a conda environment with all the C++ build and Python dependencies
+from conda-forge, targeting development for Python 3.7:
+
+On Linux and macOS:
+
+.. code-block:: shell
+
+ conda create -y -n pyarrow-dev -c conda-forge \
+ --file arrow/ci/conda_env_unix.txt \
+ --file arrow/ci/conda_env_cpp.txt \
+ --file arrow/ci/conda_env_python.txt \
+ --file arrow/ci/conda_env_gandiva.txt \
+ compilers \
+ python=3.7 \
+ pandas
+
+As of January 2019, the ``compilers`` package is needed on many Linux
+distributions to use packages from conda-forge.
+
+With this out of the way, you can now activate the conda environment:
+
+.. code-block:: shell
+
+ conda activate pyarrow-dev
+
+For Windows, see the `Building on Windows`_ section below.
+
+We need to set some environment variables to let Arrow's build system know
+about our build toolchain:
+
+.. code-block:: shell
+
+ export ARROW_HOME=$CONDA_PREFIX
+
+Using pip
+~~~~~~~~~
+
+.. warning::
+
+ If you installed Python using the Anaconda distribution or `Miniconda
+ <https://conda.io/miniconda.html>`_, you cannot currently use ``virtualenv``
+ to manage your development. Please follow the conda-based development
+ instructions instead.
+
+.. _python-homebrew:
+
+On macOS, use Homebrew to install all dependencies required for
+building Arrow C++:
+
+.. code-block:: shell
+
+ brew update && brew bundle --file=arrow/cpp/Brewfile
+
+See :ref:`here <cpp-build-dependency-management>` for a list of dependencies you
+may need.
+
+On Debian/Ubuntu, you need the following minimal set of dependencies. All other
+dependencies will be automatically built by Arrow's third-party toolchain.
+
+.. code-block:: shell
+
+ $ sudo apt-get install libjemalloc-dev libboost-dev \
+ libboost-filesystem-dev \
+ libboost-system-dev \
+ libboost-regex-dev \
+ python-dev \
+ autoconf \
+ flex \
+ bison
+
+If you are building Arrow for Python 3, install ``python3-dev`` instead of ``python-dev``.
+
+On Arch Linux, you can get these dependencies via pacman.
+
+.. code-block:: shell
+
+ $ sudo pacman -S jemalloc boost
+
+Now, let's create a Python virtualenv with all Python dependencies in the same
+folder as the repositories and a target installation folder:
+
+.. code-block:: shell
+
+ virtualenv pyarrow
+ source ./pyarrow/bin/activate
+ pip install -r arrow/python/requirements-build.txt \
+ -r arrow/python/requirements-test.txt
+
+ # This is the folder where we will install the Arrow libraries during
+ # development
+ mkdir dist
+
+If your cmake version is too old on Linux, you could get a newer one via
+``pip install cmake``.
+
+We need to set some environment variables to let Arrow's build system know
+about our build toolchain:
+
+.. code-block:: shell
+
+ export ARROW_HOME=$(pwd)/dist
+ export LD_LIBRARY_PATH=$(pwd)/dist/lib:$LD_LIBRARY_PATH
+
+Build and test
+--------------
+
+Now build and install the Arrow C++ libraries:
+
+.. code-block:: shell
+
+ mkdir arrow/cpp/build
+ pushd arrow/cpp/build
+
+ cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
+ -DCMAKE_INSTALL_LIBDIR=lib \
+ -DARROW_WITH_BZ2=ON \
+ -DARROW_WITH_ZLIB=ON \
+ -DARROW_WITH_ZSTD=ON \
+ -DARROW_WITH_LZ4=ON \
+ -DARROW_WITH_SNAPPY=ON \
+ -DARROW_WITH_BROTLI=ON \
+ -DARROW_PARQUET=ON \
+ -DARROW_PYTHON=ON \
+ -DARROW_BUILD_TESTS=ON \
+ ..
+ make -j4
+ make install
+ popd
+
+There are a number of optional components that can be switched ON by
+adding flags with ``ON``:
+
+* ``ARROW_FLIGHT``: RPC framework
+* ``ARROW_GANDIVA``: LLVM-based expression compiler
+* ``ARROW_ORC``: Support for Apache ORC file format
+* ``ARROW_PARQUET``: Support for Apache Parquet file format
+* ``ARROW_PLASMA``: Shared memory object store
+
+Anything set to ``ON`` above can also be turned off. Note that some compression
+libraries are needed for Parquet support.
+
+If multiple versions of Python are installed in your environment, you may have
+to pass additional parameters to cmake so that it can find the right
+executable, headers and libraries. For example, specifying
+``-DPython3_EXECUTABLE=$VIRTUAL_ENV/bin/python`` (assuming that you're in a
+virtualenv) enables cmake to choose the Python executable which you are using.
+
+.. note::
+
+ On Linux systems with support for building on multiple architectures,
+ ``make`` may install libraries in the ``lib64`` directory by default. For
+ this reason we recommend passing ``-DCMAKE_INSTALL_LIBDIR=lib`` because the
+   Python build scripts assume the library directory is ``lib``.
+
+.. note::
+
+ If you have conda installed but are not using it to manage dependencies,
+ and you have trouble building the C++ library, you may need to set
+ ``-DARROW_DEPENDENCY_SOURCE=AUTO`` or some other value (described
+ :ref:`here <cpp-build-dependency-management>`)
+ to explicitly tell CMake not to use conda.
+
+.. note::
+
+ With older versions of ``cmake`` (<3.15) you might need to pass ``-DPYTHON_EXECUTABLE``
+   instead of ``-DPython3_EXECUTABLE``. See `cmake documentation <https://cmake.org/cmake/help/latest/module/FindPython3.html#artifacts-specification>`_
+ for more details.
+
+For any other C++ build challenges, see :ref:`cpp-development`.
+
+Now, build pyarrow:
+
+.. code-block:: shell
+
+ pushd arrow/python
+ export PYARROW_WITH_PARQUET=1
+ python setup.py build_ext --inplace
+ popd
+
+If you did not build one of the optional components, set the corresponding
+``PYARROW_WITH_$COMPONENT`` environment variable to 0.
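+
+For example, if the C++ libraries were built without Flight support (the
+component name here just illustrates the pattern):
+
+.. code-block:: shell
+
+   export PYARROW_WITH_FLIGHT=0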
+
+Now you are ready to install test dependencies and run `Unit Testing`_, as
+described above.
+
+To build a self-contained wheel (including the Arrow and Parquet C++
+libraries), one can set ``--bundle-arrow-cpp``:
+
+.. code-block:: shell
+
+ pip install wheel # if not installed
+ python setup.py build_ext --build-type=$ARROW_BUILD_TYPE \
+ --bundle-arrow-cpp bdist_wheel
+
+Docker examples
+~~~~~~~~~~~~~~~
+
+If you are having difficulty building the Python library from source, take a
+look at the ``python/examples/minimal_build`` directory which illustrates a
+complete build and test from source both with the conda and pip/virtualenv
+build methods.
+
+Building with CUDA support
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The :mod:`pyarrow.cuda` module offers support for using Arrow platform
+components with Nvidia's CUDA-enabled GPU devices. To build with this support,
+pass ``-DARROW_CUDA=ON`` when building the C++ libraries, and set the following
+environment variable when building pyarrow:
+
+.. code-block:: shell
+
+ export PYARROW_WITH_CUDA=1
+
+Debugging
+---------
+
+Since pyarrow depends on the Arrow C++ libraries, debugging can
+frequently involve crossing between Python and C++ shared libraries.
+
+Using gdb on Linux
+~~~~~~~~~~~~~~~~~~
+
+To debug the C++ libraries with gdb while running the Python unit
+tests, first start pytest with gdb:
+
+.. code-block:: shell
+
+ gdb --args python -m pytest pyarrow/tests/test_to_run.py -k $TEST_TO_MATCH
+
+To set a breakpoint, use the same gdb syntax that you would when
+debugging a C++ unittest, for example:
+
+.. code-block:: shell
+
+ (gdb) b src/arrow/python/arrow_to_pandas.cc:1874
+ No source file named src/arrow/python/arrow_to_pandas.cc.
+ Make breakpoint pending on future shared library load? (y or [n]) y
+ Breakpoint 1 (src/arrow/python/arrow_to_pandas.cc:1874) pending.
+
+Building on Windows
+===================
+
+Building on Windows requires one of the following compilers to be installed:
+
+- `Build Tools for Visual Studio 2017 <https://download.visualstudio.microsoft.com/download/pr/3e542575-929e-4297-b6c6-bef34d0ee648/639c868e1219c651793aff537a1d3b77/vs_buildtools.exe>`_
+- Visual Studio 2017
+
+During the setup of the Build Tools, ensure at least one Windows SDK is selected.
+
+Visual Studio 2019 and its build tools are currently not supported.
+
+We bootstrap a conda environment similar to above, but skipping some of the
+Linux/macOS-only packages:
+
+First, starting from fresh clones of Apache Arrow:
+
+.. code-block:: shell
+
+ git clone https://github.com/apache/arrow.git
+
+.. code-block:: shell
+
+ conda create -y -n pyarrow-dev -c conda-forge ^
+ --file arrow\ci\conda_env_cpp.txt ^
+ --file arrow\ci\conda_env_python.txt ^
+ --file arrow\ci\conda_env_gandiva.txt ^
+ python=3.7
+ conda activate pyarrow-dev
+
+Now, we build and install Arrow C++ libraries.
+
+We set a number of environment variables:
+
+- set the path of the installation directory of the Arrow C++ libraries as
+  ``ARROW_HOME``
+- add the path of the installed DLL libraries to ``PATH``
+- choose the compiler to be used
+
+.. code-block:: shell
+
+ set ARROW_HOME=%cd%\arrow-dist
+ set PATH=%ARROW_HOME%\bin;%PATH%
+ set PYARROW_CMAKE_GENERATOR=Visual Studio 15 2017 Win64
+
+Let's configure, build and install the Arrow C++ libraries:
+
+.. code-block:: shell
+
+ mkdir arrow\cpp\build
+ pushd arrow\cpp\build
+ cmake -G "%PYARROW_CMAKE_GENERATOR%" ^
+ -DCMAKE_INSTALL_PREFIX=%ARROW_HOME% ^
+ -DCMAKE_UNITY_BUILD=ON ^
+ -DARROW_CXXFLAGS="/WX /MP" ^
+ -DARROW_WITH_LZ4=on ^
+ -DARROW_WITH_SNAPPY=on ^
+ -DARROW_WITH_ZLIB=on ^
+ -DARROW_WITH_ZSTD=on ^
+ -DARROW_PARQUET=on ^
+ -DARROW_PYTHON=on ^
+ ..
+ cmake --build . --target INSTALL --config Release
+ popd
+
+Now, we can build pyarrow:
+
+.. code-block:: shell
+
+ pushd arrow\python
+ set PYARROW_WITH_PARQUET=1
+ python setup.py build_ext --inplace
+ popd
+
+.. note::
+
+   For building pyarrow, the environment variables defined above also need to
+   be set. Remember this if you want to re-build ``pyarrow`` after your initial build.
+
+Then run the unit tests with:
+
+.. code-block:: shell
+
+ pushd arrow\python
+ py.test pyarrow -v
+ popd
+
+.. note::
+
+ With the above instructions the Arrow C++ libraries are not bundled with
+ the Python extension. This is recommended for development as it allows the
+ C++ libraries to be re-built separately.
+
+   As a consequence, however, ``python setup.py install`` will also not install
+   the Arrow C++ libraries. Therefore, to use ``pyarrow`` in Python, ``PATH``
+   must contain the directory with the Arrow .dll-files.
+
+ If you want to bundle the Arrow C++ libraries with ``pyarrow`` add
+ ``--bundle-arrow-cpp`` as build parameter:
+
+ ``python setup.py build_ext --bundle-arrow-cpp``
+
+ Important: If you combine ``--bundle-arrow-cpp`` with ``--inplace`` the
+ Arrow C++ libraries get copied to the python source tree and are not cleared
+ by ``python setup.py clean``. They remain in place and will take precedence
+ over any later Arrow C++ libraries contained in ``PATH``. This can lead to
+ incompatibilities when ``pyarrow`` is later built without
+ ``--bundle-arrow-cpp``.
+
+Running C++ unit tests for Python integration
+---------------------------------------------
+
+Running C++ unit tests should not be necessary for most developers. If you do
+want to run them, you need to pass ``-DARROW_BUILD_TESTS=ON`` during
+configuration of the Arrow C++ library build:
+
+.. code-block:: shell
+
+ mkdir arrow\cpp\build
+ pushd arrow\cpp\build
+ cmake -G "%PYARROW_CMAKE_GENERATOR%" ^
+ -DCMAKE_INSTALL_PREFIX=%ARROW_HOME% ^
+ -DARROW_CXXFLAGS="/WX /MP" ^
+ -DARROW_PARQUET=on ^
+ -DARROW_PYTHON=on ^
+ -DARROW_BUILD_TESTS=ON ^
+ ..
+ cmake --build . --target INSTALL --config Release
+ popd
+
+
+Getting ``arrow-python-test.exe`` (C++ unit tests for Python integration) to
+run is a bit tricky because your ``%PYTHONHOME%`` must be configured to point
+to the active conda environment:
+
+.. code-block:: shell
+
+ set PYTHONHOME=%CONDA_PREFIX%
+ pushd arrow\cpp\build\release\Release
+ arrow-python-test.exe
+ popd
+
+To run all tests of the Arrow C++ library, you can also run ``ctest``:
+
+.. code-block:: shell
+
+ set PYTHONHOME=%CONDA_PREFIX%
+ pushd arrow\cpp\build
+ ctest
+ popd
+
+Windows Caveats
+---------------
+
+Some components are not supported yet on Windows:
+
+* Flight RPC
+* Plasma
diff --git a/src/arrow/docs/source/example.gz b/src/arrow/docs/source/example.gz
new file mode 100644
index 000000000..4fc60405c
--- /dev/null
+++ b/src/arrow/docs/source/example.gz
Binary files differ
diff --git a/src/arrow/docs/source/format/Arrow.graffle b/src/arrow/docs/source/format/Arrow.graffle
new file mode 100644
index 000000000..f4eead922
--- /dev/null
+++ b/src/arrow/docs/source/format/Arrow.graffle
Binary files differ
diff --git a/src/arrow/docs/source/format/Arrow.png b/src/arrow/docs/source/format/Arrow.png
new file mode 100644
index 000000000..1b09aa2d8
--- /dev/null
+++ b/src/arrow/docs/source/format/Arrow.png
Binary files differ
diff --git a/src/arrow/docs/source/format/CDataInterface.rst b/src/arrow/docs/source/format/CDataInterface.rst
new file mode 100644
index 000000000..20446411a
--- /dev/null
+++ b/src/arrow/docs/source/format/CDataInterface.rst
@@ -0,0 +1,948 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _c-data-interface:
+
+==========================
+The Arrow C data interface
+==========================
+
+Rationale
+=========
+
+Apache Arrow is designed to be a universal in-memory format for the representation
+of tabular ("columnar") data. However, some projects may face a difficult
+choice between either depending on a fast-evolving project such as the
+Arrow C++ library, or having to reimplement adapters for data interchange,
+which may require significant, redundant development effort.
+
+The Arrow C data interface defines a very small, stable set of C definitions
+that can be easily *copied* in any project's source code and used for columnar
+data interchange in the Arrow format. For non-C/C++ languages and runtimes,
+it should be almost as easy to translate the C definitions into the
+corresponding C FFI declarations.
+
+Applications and libraries can therefore work with Arrow memory without
+necessarily using Arrow libraries or reinventing the wheel. Developers can
+choose between tight integration
+with the Arrow *software project* (benefitting from the growing array of
+facilities exposed by e.g. the C++ or Java implementations of Apache Arrow,
+but with the cost of a dependency) or minimal integration with the Arrow
+*format* only.
+
+Goals
+-----
+
+* Expose an ABI-stable interface.
+* Make it easy for third-party projects to implement support for it (including
+  partial support where sufficient), with little initial investment.
+* Allow zero-copy sharing of Arrow data between independent runtimes
+ and components running in the same process.
+* Match the Arrow array concepts closely to avoid the development of
+ yet another marshalling layer.
+* Avoid the need for one-to-one adaptation layers such as the limited
+ JPype-based bridge between Java and Python.
+* Enable integration without an explicit dependency (either at compile-time
+ or runtime) on the Arrow software project.
+
+Ideally, the Arrow C data interface can become a low-level *lingua franca*
+for sharing columnar data at runtime and establish Arrow as the universal
+building block in the columnar processing ecosystem.
+
+Non-goals
+---------
+
+* Expose a C API mimicking operations available in higher-level runtimes
+ (such as C++, Java...).
+* Data sharing between distinct processes or storage persistence.
+
+
+Comparison with the Arrow IPC format
+------------------------------------
+
+Pros of the C data interface vs. the IPC format:
+
+* No dependency on Flatbuffers.
+* No buffer reassembly (data is already exposed in logical Arrow format).
+* Zero-copy by design.
+* Easy to reimplement from scratch.
+* Minimal C definition that can be easily copied into other codebases.
+* Resource lifetime management through a custom release callback.
+
+Pros of the IPC format vs. the data interface:
+
+* Works across processes and machines.
+* Allows data storage and persistence.
+* Being a streamable format, the IPC format has room for composing more features
+ (such as integrity checks, compression...).
+* Does not require explicit C data access.
+
+Data type description -- format strings
+=======================================
+
+A data type is described using a format string. The format string only
+encodes information about the top-level type; for nested types, child types
+are described separately. Also, metadata is encoded in a separate string.
+
+The format strings are designed to be easily parsable, even from a language
+such as C. The most common primitive formats have one-character format
+strings:
+
++-----------------+--------------------------+------------+
+| Format string | Arrow data type | Notes |
++=================+==========================+============+
+| ``n`` | null | |
++-----------------+--------------------------+------------+
+| ``b`` | boolean | |
++-----------------+--------------------------+------------+
+| ``c`` | int8 | |
++-----------------+--------------------------+------------+
+| ``C`` | uint8 | |
++-----------------+--------------------------+------------+
+| ``s`` | int16 | |
++-----------------+--------------------------+------------+
+| ``S`` | uint16 | |
++-----------------+--------------------------+------------+
+| ``i`` | int32 | |
++-----------------+--------------------------+------------+
+| ``I`` | uint32 | |
++-----------------+--------------------------+------------+
+| ``l`` | int64 | |
++-----------------+--------------------------+------------+
+| ``L`` | uint64 | |
++-----------------+--------------------------+------------+
+| ``e`` | float16 | |
++-----------------+--------------------------+------------+
+| ``f`` | float32 | |
++-----------------+--------------------------+------------+
+| ``g`` | float64 | |
++-----------------+--------------------------+------------+
+
++-----------------+---------------------------------------------------+------------+
+| Format string | Arrow data type | Notes |
++=================+===================================================+============+
+| ``z`` | binary | |
++-----------------+---------------------------------------------------+------------+
+| ``Z`` | large binary | |
++-----------------+---------------------------------------------------+------------+
+| ``u`` | utf-8 string | |
++-----------------+---------------------------------------------------+------------+
+| ``U`` | large utf-8 string | |
++-----------------+---------------------------------------------------+------------+
+| ``d:19,10`` | decimal128 [precision 19, scale 10] | |
++-----------------+---------------------------------------------------+------------+
+| ``d:19,10,NNN`` | decimal bitwidth = NNN [precision 19, scale 10] | |
++-----------------+---------------------------------------------------+------------+
+| ``w:42`` | fixed-width binary [42 bytes] | |
++-----------------+---------------------------------------------------+------------+
+
+Temporal types have multi-character format strings starting with ``t``:
+
++-----------------+---------------------------------------------------+------------+
+| Format string | Arrow data type | Notes |
++=================+===================================================+============+
+| ``tdD`` | date32 [days] | |
++-----------------+---------------------------------------------------+------------+
+| ``tdm`` | date64 [milliseconds] | |
++-----------------+---------------------------------------------------+------------+
+| ``tts`` | time32 [seconds] | |
++-----------------+---------------------------------------------------+------------+
+| ``ttm`` | time32 [milliseconds] | |
++-----------------+---------------------------------------------------+------------+
+| ``ttu`` | time64 [microseconds] | |
++-----------------+---------------------------------------------------+------------+
+| ``ttn`` | time64 [nanoseconds] | |
++-----------------+---------------------------------------------------+------------+
+| ``tss:...`` | timestamp [seconds] with timezone "..." | \(1) |
++-----------------+---------------------------------------------------+------------+
+| ``tsm:...`` | timestamp [milliseconds] with timezone "..." | \(1) |
++-----------------+---------------------------------------------------+------------+
+| ``tsu:...`` | timestamp [microseconds] with timezone "..." | \(1) |
++-----------------+---------------------------------------------------+------------+
+| ``tsn:...`` | timestamp [nanoseconds] with timezone "..." | \(1) |
++-----------------+---------------------------------------------------+------------+
+| ``tDs`` | duration [seconds] | |
++-----------------+---------------------------------------------------+------------+
+| ``tDm`` | duration [milliseconds] | |
++-----------------+---------------------------------------------------+------------+
+| ``tDu`` | duration [microseconds] | |
++-----------------+---------------------------------------------------+------------+
+| ``tDn`` | duration [nanoseconds] | |
++-----------------+---------------------------------------------------+------------+
+| ``tiM`` | interval [months] | |
++-----------------+---------------------------------------------------+------------+
+| ``tiD`` | interval [days, time] | |
++-----------------+---------------------------------------------------+------------+
+| ``tin`` | interval [month, day, nanoseconds] | |
++-----------------+---------------------------------------------------+------------+
+
+
+Dictionary-encoded types do not have a specific format string. Instead, the
+format string of the base array represents the dictionary index type, and the
+value type can be read from the dependent dictionary array (see below
+"Dictionary-encoded arrays").
+
+Nested types have multiple-character format strings starting with ``+``. The
+names and types of child fields are read from the child arrays.
+
++------------------------+---------------------------------------------------+------------+
+| Format string | Arrow data type | Notes |
++========================+===================================================+============+
+| ``+l`` | list | |
++------------------------+---------------------------------------------------+------------+
+| ``+L`` | large list | |
++------------------------+---------------------------------------------------+------------+
+| ``+w:123`` | fixed-sized list [123 items] | |
++------------------------+---------------------------------------------------+------------+
+| ``+s`` | struct | |
++------------------------+---------------------------------------------------+------------+
+| ``+m`` | map | \(2) |
++------------------------+---------------------------------------------------+------------+
+| ``+ud:I,J,...`` | dense union with type ids I,J... | |
++------------------------+---------------------------------------------------+------------+
+| ``+us:I,J,...`` | sparse union with type ids I,J... | |
++------------------------+---------------------------------------------------+------------+
+
+Notes:
+
+(1)
+ The timezone string is appended as-is after the colon character ``:``, without
+ any quotes. If the timezone is empty, the colon ``:`` must still be included.
+
+(2)
+ As specified in the Arrow columnar format, the map type has a single child type
+ named ``entries``, itself a 2-child struct type of ``(key, value)``.
+
+Examples
+--------
+
+* A dictionary-encoded ``decimal128(precision = 12, scale = 5)`` array
+ with ``int16`` indices has format string ``s``, and its dependent dictionary
+ array has format string ``d:12,5``.
+* A ``list<uint64>`` array has format string ``+l``, and its single child
+ has format string ``L``.
+* A ``struct<ints: int32, floats: float32>`` has format string ``+s``; its two
+ children have names ``ints`` and ``floats``, and format strings ``i`` and
+ ``f`` respectively.
+* A ``map<string, float64>`` array has format string ``+m``; its single child
+ has name ``entries`` and format string ``+s``; its two grandchildren have names
+ ``key`` and ``value``, and format strings ``u`` and ``g`` respectively.
+* A ``sparse_union<ints: int32, floats: float32>`` with type ids ``4, 5``
+ has format string ``+us:4,5``; its two children have names ``ints`` and
+ ``floats``, and format strings ``i`` and ``f`` respectively.
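+
+As an illustration of how easily these format strings can be parsed, the
+following C sketch (an illustrative helper, not part of the interface itself)
+recognizes the one-character primitive formats from the first table above:
+
+.. code-block:: c
+
+   #include <string.h>
+
+   /* Return a readable name for the most common one-character primitive
+      format strings, or NULL for anything else. */
+   static const char* primitive_format_name(const char* format) {
+     if (format == NULL || strlen(format) != 1) return NULL;
+     switch (format[0]) {
+       case 'n': return "null";
+       case 'b': return "boolean";
+       case 'c': return "int8";
+       case 'C': return "uint8";
+       case 's': return "int16";
+       case 'S': return "uint16";
+       case 'i': return "int32";
+       case 'I': return "uint32";
+       case 'l': return "int64";
+       case 'L': return "uint64";
+       case 'e': return "float16";
+       case 'f': return "float32";
+       case 'g': return "float64";
+       default:  return NULL;
+     }
+   }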
+
+
+Structure definitions
+=====================
+
+The following free-standing definitions are enough to support the Arrow
+C data interface in your project. Like the rest of the Arrow project, they
+are available under the Apache License 2.0.
+
+.. code-block:: c
+
+ #define ARROW_FLAG_DICTIONARY_ORDERED 1
+ #define ARROW_FLAG_NULLABLE 2
+ #define ARROW_FLAG_MAP_KEYS_SORTED 4
+
+ struct ArrowSchema {
+ // Array type description
+ const char* format;
+ const char* name;
+ const char* metadata;
+ int64_t flags;
+ int64_t n_children;
+ struct ArrowSchema** children;
+ struct ArrowSchema* dictionary;
+
+ // Release callback
+ void (*release)(struct ArrowSchema*);
+ // Opaque producer-specific data
+ void* private_data;
+ };
+
+ struct ArrowArray {
+ // Array data description
+ int64_t length;
+ int64_t null_count;
+ int64_t offset;
+ int64_t n_buffers;
+ int64_t n_children;
+ const void** buffers;
+ struct ArrowArray** children;
+ struct ArrowArray* dictionary;
+
+ // Release callback
+ void (*release)(struct ArrowArray*);
+ // Opaque producer-specific data
+ void* private_data;
+ };
+
+The ArrowSchema structure
+-------------------------
+
+The ``ArrowSchema`` structure describes the type and metadata of an exported
+array or record batch. It has the following fields:
+
+.. c:member:: const char* ArrowSchema.format
+
+ Mandatory. A null-terminated, UTF8-encoded string describing
+ the data type. If the data type is nested, child types are not
+ encoded here but in the :c:member:`ArrowSchema.children` structures.
+
+ Consumers MAY decide not to support all data types, but they
+ should document this limitation.
+
+.. c:member:: const char* ArrowSchema.name
+
+ Optional. A null-terminated, UTF8-encoded string of the field
+ or array name. This is mainly used to reconstruct child fields
+ of nested types.
+
+ Producers MAY decide not to provide this information, and consumers
+ MAY decide to ignore it. If omitted, MAY be NULL or an empty string.
+
+.. c:member:: const char* ArrowSchema.metadata
+
+ Optional. A binary string describing the type's metadata.
+ If the data type is nested, child types are not encoded here but
+ in the :c:member:`ArrowSchema.children` structures.
+
+ This string is not null-terminated but follows a specific format::
+
+ int32: number of key/value pairs (noted N below)
+ int32: byte length of key 0
+ key 0 (not null-terminated)
+ int32: byte length of value 0
+ value 0 (not null-terminated)
+ ...
+ int32: byte length of key N - 1
+ key N - 1 (not null-terminated)
+ int32: byte length of value N - 1
+ value N - 1 (not null-terminated)
+
+ Integers are stored in native endianness. For example, the metadata
+ ``[('key1', 'value1')]`` is encoded on a little-endian machine as::
+
+ \x01\x00\x00\x00\x04\x00\x00\x00key1\x06\x00\x00\x00value1
+
+ On a big-endian machine, the same example would be encoded as::
+
+ \x00\x00\x00\x01\x00\x00\x00\x04key1\x00\x00\x00\x06value1
+
+ If omitted, this field MUST be NULL (not an empty string).
+
+ Consumers MAY choose to ignore this information.
+
+.. c:member:: int64_t ArrowSchema.flags
+
+ Optional. A bitfield of flags enriching the type description.
+ Its value is computed by OR'ing together the flag values.
+ The following flags are available:
+
+ * ``ARROW_FLAG_NULLABLE``: whether this field is semantically nullable
+ (regardless of whether it actually has null values).
+ * ``ARROW_FLAG_DICTIONARY_ORDERED``: for dictionary-encoded types,
+ whether the ordering of dictionary indices is semantically meaningful.
+ * ``ARROW_FLAG_MAP_KEYS_SORTED``: for map types, whether the keys within
+ each map value are sorted.
+
+ If omitted, MUST be 0.
+
+ Consumers MAY choose to ignore some or all of the flags. Even then,
+ they SHOULD keep this value around so as to propagate its information
+ to their own consumers.
+
+.. c:member:: int64_t ArrowSchema.n_children
+
+ Mandatory. The number of children this type has.
+
+.. c:member:: ArrowSchema** ArrowSchema.children
+
+ Optional. A C array of pointers to each child type of this type.
+ There must be :c:member:`ArrowSchema.n_children` pointers.
+
+ MAY be NULL only if :c:member:`ArrowSchema.n_children` is 0.
+
+.. c:member:: ArrowSchema* ArrowSchema.dictionary
+
+ Optional. A pointer to the type of dictionary values.
+
+ MUST be present if the ArrowSchema represents a dictionary-encoded type.
+ MUST be NULL otherwise.
+
+.. c:member:: void (*ArrowSchema.release)(struct ArrowSchema*)
+
+ Mandatory. A pointer to a producer-provided release callback.
+
+ See below for memory management and release callback semantics.
+
+.. c:member:: void* ArrowSchema.private_data
+
+ Optional. An opaque pointer to producer-provided private data.
+
+ Consumers MUST not process this member. Lifetime of this member
+ is handled by the producer, and especially by the release callback.
+
+
+The ArrowArray structure
+------------------------
+
+The ``ArrowArray`` describes the data of an exported array or record batch.
+For the ``ArrowArray`` structure to be interpreted, the array type
+or record batch schema must already be known. This is either done by
+convention -- for example a producer API that always produces the same data
+type -- or by passing an ``ArrowSchema`` on the side.
+
+It has the following fields:
+
+.. c:member:: int64_t ArrowArray.length
+
+ Mandatory. The logical length of the array (i.e. its number of items).
+
+.. c:member:: int64_t ArrowArray.null_count
+
+ Mandatory. The number of null items in the array. MAY be -1 if not
+ yet computed.
+
+.. c:member:: int64_t ArrowArray.offset
+
+ Mandatory. The logical offset inside the array (i.e. the number of items
+ from the physical start of the buffers). MUST be 0 or positive.
+
+ Producers MAY specify that they will only produce 0-offset arrays to
+ ease implementation of consumer code.
+ Consumers MAY decide not to support non-0-offset arrays, but they
+ should document this limitation.
+
+.. c:member:: int64_t ArrowArray.n_buffers
+
+ Mandatory. The number of physical buffers backing this array. The
+ number of buffers is a function of the data type, as described in the
+ :ref:`Columnar format specification <format_columnar>`.
+
+ Buffers of children arrays are not included.
+
+.. c:member:: const void** ArrowArray.buffers
+
+ Mandatory. A C array of pointers to the start of each physical buffer
+ backing this array. Each `void*` pointer is the physical start of
+ a contiguous buffer. There must be :c:member:`ArrowArray.n_buffers` pointers.
+
+ The producer MUST ensure that each contiguous buffer is large enough to
+ represent `length + offset` values encoded according to the
+ :ref:`Columnar format specification <format_columnar>`.
+
+ It is recommended, but not required, that the memory addresses of the
+ buffers be aligned at least according to the type of primitive data that
+ they contain. Consumers MAY decide not to support unaligned memory.
+
+ The pointer to the null bitmap buffer, if the data type specifies one,
+ MAY be NULL only if :c:member:`ArrowArray.null_count` is 0.
+
+ Buffers of children arrays are not included.
+
+.. c:member:: int64_t ArrowArray.n_children
+
+ Mandatory. The number of children this array has. The number of children
+ is a function of the data type, as described in the
+ :ref:`Columnar format specification <format_columnar>`.
+
+.. c:member:: ArrowArray** ArrowArray.children
+
+ Optional. A C array of pointers to each child array of this array.
+ There must be :c:member:`ArrowArray.n_children` pointers.
+
+ MAY be NULL only if :c:member:`ArrowArray.n_children` is 0.
+
+.. c:member:: ArrowArray* ArrowArray.dictionary
+
+ Optional. A pointer to the underlying array of dictionary values.
+
+ MUST be present if the ArrowArray represents a dictionary-encoded array.
+ MUST be NULL otherwise.
+
+.. c:member:: void (*ArrowArray.release)(struct ArrowArray*)
+
+ Mandatory. A pointer to a producer-provided release callback.
+
+ See below for memory management and release callback semantics.
+
+.. c:member:: void* ArrowArray.private_data
+
+ Optional. An opaque pointer to producer-provided private data.
+
+ Consumers MUST not process this member. Lifetime of this member
+ is handled by the producer, and especially by the release callback.
+
+
+Dictionary-encoded arrays
+-------------------------
+
+For dictionary-encoded arrays, the :c:member:`ArrowSchema.format` string
+encodes the *index* type. The dictionary *value* type can be read
+from the :c:member:`ArrowSchema.dictionary` structure.
+
+The same holds for :c:member:`ArrowArray` structure: while the parent
+structure points to the index data, the :c:member:`ArrowArray.dictionary`
+points to the dictionary values array.
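+
+For example, a dictionary-encoded type with ``int32`` indices and ``utf8``
+values could be exported with the parent :c:member:`ArrowSchema.format` set
+to ``"i"`` (the index type) and :c:member:`ArrowSchema.dictionary` pointing
+to an ``ArrowSchema`` whose format is ``"u"`` (the value type).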
+
+Extension arrays
+----------------
+
+For extension arrays, the :c:member:`ArrowSchema.format` string encodes the
+*storage* type. Information about the extension type is encoded in the
+:c:member:`ArrowSchema.metadata` string, similarly to the
+:ref:`IPC format <format_metadata_extension_types>`. Specifically, the
+metadata key ``ARROW:extension:name`` encodes the extension type name,
+and the metadata key ``ARROW:extension:metadata`` encodes the
+implementation-specific serialization of the extension type (for
+parameterized extension types). The base64 encoding of metadata values
+ensures that any possible serialization is representable.
+
+The ``ArrowArray`` structure exported from an extension array simply points
+to the storage data of the extension array.
+
+Memory management
+-----------------
+
+The ``ArrowSchema`` and ``ArrowArray`` structures follow the same conventions
+for memory management. The term *"base structure"* below refers to the
+``ArrowSchema`` or ``ArrowArray`` that is passed between producer and consumer
+-- not any child structure thereof.
+
+Member allocation
+'''''''''''''''''
+
+It is intended for the base structure to be stack- or heap-allocated by the
+consumer. In this case, the producer API should take a pointer to the
+consumer-allocated structure.
+
+However, any data pointed to by the struct MUST be allocated and maintained
+by the producer. This includes the format and metadata strings, the arrays
+of buffer and children pointers, etc.
+
+Therefore, the consumer MUST not try to interfere with the producer's
+handling of these members' lifetime. The only way the consumer influences
+data lifetime is by calling the base structure's ``release`` callback.
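+
+A minimal sketch of this calling pattern (``produce_next_batch`` is a
+hypothetical producer API, not part of this specification):
+
+.. code-block:: c
+
+   void consume_one_batch(void) {
+     struct ArrowArray array;      // consumer-allocated, e.g. on the stack
+     produce_next_batch(&array);   // hypothetical producer API fills all members
+     // ... consume the exported data ...
+     if (array.release != NULL) {
+       array.release(&array);      // hand lifetime handling back to the producer
+     }
+   }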
+
+.. _c-data-interface-released:
+
+Released structure
+''''''''''''''''''
+
+A released structure is indicated by setting its ``release`` callback to NULL.
+Before reading and interpreting a structure's data, consumers SHOULD check
+for a NULL release callback and treat it accordingly (probably by erroring
+out).
+
+Release callback semantics -- for consumers
+'''''''''''''''''''''''''''''''''''''''''''
+
+Consumers MUST call a base structure's release callback when they won't be using
+it anymore, but they MUST not call any of its children's release callbacks
+(including the optional dictionary). The producer is responsible for releasing
+the children.
+
+In any case, a consumer MUST not try to access the base structure anymore
+after calling its release callback -- including any associated data such
+as its children.
+
+Release callback semantics -- for producers
+'''''''''''''''''''''''''''''''''''''''''''
+
+If producers need additional information for lifetime handling (for
+example, a C++ producer may want to use ``shared_ptr`` for array and
+buffer lifetime), they MUST use the ``private_data`` member to locate the
+required bookkeeping information.
+
+The release callback MUST not assume that the structure will be located
+at the same memory location as when it was originally produced. The consumer
+is free to move the structure around (see "Moving an array").
+
+The release callback MUST walk all children structures (including the optional
+dictionary) and call their own release callbacks.
+
+The release callback MUST free any data area directly owned by the structure
+(such as the buffers and children members).
+
+The release callback MUST mark the structure as released, by setting
+its ``release`` member to NULL.
+
+Below is a good starting point for implementing a release callback, where the
+TODO area must be filled with producer-specific deallocation code:
+
+.. code-block:: c
+
+ static void ReleaseExportedArray(struct ArrowArray* array) {
+ // This should not be called on an already released array
+ assert(array->release != NULL);
+
+ // Release children
+ for (int64_t i = 0; i < array->n_children; ++i) {
+ struct ArrowArray* child = array->children[i];
+ if (child->release != NULL) {
+ child->release(child);
+ assert(child->release == NULL);
+ }
+ }
+
+ // Release dictionary
+ struct ArrowArray* dict = array->dictionary;
+ if (dict != NULL && dict->release != NULL) {
+ dict->release(dict);
+ assert(dict->release == NULL);
+ }
+
+ // TODO here: release and/or deallocate all data directly owned by
+ // the ArrowArray struct, such as the private_data.
+
+ // Mark array released
+ array->release = NULL;
+ }
+
+
+Moving an array
+'''''''''''''''
+
+The consumer can *move* the ``ArrowArray`` structure by bitwise copying or
+shallow member-wise copying. Then it MUST mark the source structure released
+(see "released structure" above for how to do it) but *without* calling the
+release callback. This ensures that only one live copy of the struct is
+active at any given time and that lifetime is correctly communicated to
+the producer.
+
+As usual, the release callback will be called on the destination structure
+when it is not needed anymore.
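+
+A hedged sketch of such a move in plain C (assuming the consumer controls
+both memory locations):
+
+.. code-block:: c
+
+   void move_array(struct ArrowArray* src, struct ArrowArray* dst) {
+     // Shallow copy: the producer's release callback travels with the data
+     *dst = *src;
+     // Mark the source released WITHOUT calling its release callback
+     src->release = NULL;
+   }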
+
+Moving child arrays
+~~~~~~~~~~~~~~~~~~~
+
+It is also possible to move one or several child arrays, but the parent
+``ArrowArray`` structure MUST be released immediately afterwards, as it
+won't point to valid child arrays anymore.
+
+The main use case for this is to keep alive only a subset of child arrays
+(for example if you are only interested in certain columns of the data),
+while releasing the others.
+
+.. note::
+
+ For moving to work correctly, the ``ArrowArray`` structure has to be
+ trivially relocatable. Therefore, pointer members inside the ``ArrowArray``
+ structure (including ``private_data``) MUST not point inside the structure
+ itself. Also, external pointers to the structure MUST not be separately
+ stored by the producer. Instead, the producer MUST use the ``private_data``
+ member so as to remember any necessary bookkeeping information.
+
+Record batches
+--------------
+
+A record batch can be trivially considered as an equivalent struct array with
+additional top-level metadata.
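+
+For example, a record batch with columns ``ints: int32`` and ``strs: utf8``
+could be exported as a struct array whose ``ArrowSchema`` has format ``"+s"``
+and two children with formats ``"i"`` and ``"u"``, while the corresponding
+``ArrowArray`` carries the column data in its two child arrays.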
+
+Example use case
+================
+
+A C++ database engine wants to provide the option to deliver results in Arrow
+format, but without imposing themselves a dependency on the Arrow software
+libraries. With the Arrow C data interface, the engine can let the caller pass
+a pointer to a ``ArrowArray`` structure, and fill it with the next chunk of
+results.
+
+It can do so without including the Arrow C++ headers or linking with the
+Arrow DLLs. Furthermore, the database engine's C API can benefit other
+runtimes and libraries that know about the Arrow C data interface,
+through e.g. a C FFI layer.
+
+C producer examples
+===================
+
+Exporting a simple ``int32`` array
+----------------------------------
+
+Export a non-nullable ``int32`` type with empty metadata. In this case,
+all ``ArrowSchema`` members point to statically-allocated data, so the
+release callback is trivial.
+
+.. code-block:: c
+
+ static void release_int32_type(struct ArrowSchema* schema) {
+ // Mark released
+ schema->release = NULL;
+ }
+
+ void export_int32_type(struct ArrowSchema* schema) {
+ *schema = (struct ArrowSchema) {
+ // Type description
+ .format = "i",
+ .name = "",
+ .metadata = NULL,
+ .flags = 0,
+ .n_children = 0,
+ .children = NULL,
+ .dictionary = NULL,
+ // Bookkeeping
+ .release = &release_int32_type
+ };
+ }
+
+Export a C-malloc()ed array of the same type as an Arrow array, transferring
+ownership to the consumer through the release callback:
+
+.. code-block:: c
+
+ static void release_int32_array(struct ArrowArray* array) {
+ assert(array->n_buffers == 2);
+ // Free the buffers and the buffers array
+ free((void *) array->buffers[1]);
+ free(array->buffers);
+ // Mark released
+ array->release = NULL;
+ }
+
+ void export_int32_array(const int32_t* data, int64_t nitems,
+ struct ArrowArray* array) {
+ // Initialize primitive fields
+ *array = (struct ArrowArray) {
+ // Data description
+ .length = nitems,
+ .offset = 0,
+ .null_count = 0,
+ .n_buffers = 2,
+ .n_children = 0,
+ .children = NULL,
+ .dictionary = NULL,
+ // Bookkeeping
+ .release = &release_int32_array
+ };
+ // Allocate list of buffers
+ array->buffers = (const void**) malloc(sizeof(void*) * array->n_buffers);
+ assert(array->buffers != NULL);
+ array->buffers[0] = NULL; // no nulls, null bitmap can be omitted
+ array->buffers[1] = data;
+ }
+
+Exporting a ``struct<float32, utf8>`` array
+-------------------------------------------
+
+Export the array type as an ``ArrowSchema`` with C-malloc()ed children:
+
+.. code-block:: c
+
+ static void release_malloced_type(struct ArrowSchema* schema) {
+ int i;
+ for (i = 0; i < schema->n_children; ++i) {
+ struct ArrowSchema* child = schema->children[i];
+ if (child->release != NULL) {
+ child->release(child);
+ }
+ }
+ free(schema->children);
+ // Mark released
+ schema->release = NULL;
+ }
+
+ void export_float32_utf8_type(struct ArrowSchema* schema) {
+ struct ArrowSchema* child;
+
+ //
+ // Initialize parent type
+ //
+ *schema = (struct ArrowSchema) {
+ // Type description
+ .format = "+s",
+ .name = "",
+ .metadata = NULL,
+ .flags = 0,
+ .n_children = 2,
+ .dictionary = NULL,
+ // Bookkeeping
+ .release = &release_malloced_type
+ };
+ // Allocate list of children types
+ schema->children = malloc(sizeof(struct ArrowSchema*) * schema->n_children);
+
+ //
+ // Initialize child type #0
+ //
+ child = schema->children[0] = malloc(sizeof(struct ArrowSchema));
+ *child = (struct ArrowSchema) {
+ // Type description
+ .format = "f",
+ .name = "floats",
+ .metadata = NULL,
+ .flags = ARROW_FLAG_NULLABLE,
+ .n_children = 0,
+ .dictionary = NULL,
+ .children = NULL,
+ // Bookkeeping
+ .release = &release_malloced_type
+ };
+
+ //
+ // Initialize child type #1
+ //
+ child = schema->children[1] = malloc(sizeof(struct ArrowSchema));
+ *child = (struct ArrowSchema) {
+ // Type description
+ .format = "u",
+ .name = "strings",
+ .metadata = NULL,
+ .flags = ARROW_FLAG_NULLABLE,
+ .n_children = 0,
+ .dictionary = NULL,
+ .children = NULL,
+ // Bookkeeping
+ .release = &release_malloced_type
+ };
+ }
+
+Export C-malloc()ed arrays in Arrow-compatible layout as an Arrow struct array,
+transferring ownership to the consumer:
+
+.. code-block:: c
+
+ static void release_malloced_array(struct ArrowArray* array) {
+ int i;
+ // Free children
+ for (i = 0; i < array->n_children; ++i) {
+ struct ArrowArray* child = array->children[i];
+ if (child->release != NULL) {
+ child->release(child);
+ }
+ }
+ free(array->children);
+ // Free buffers
+ for (i = 0; i < array->n_buffers; ++i) {
+ free((void *) array->buffers[i]);
+ }
+ free(array->buffers);
+ // Mark released
+ array->release = NULL;
+ }
+
+ void export_float32_utf8_array(
+ int64_t nitems,
+ const uint8_t* float32_nulls, const float* float32_data,
+ const uint8_t* utf8_nulls, const int32_t* utf8_offsets, const uint8_t* utf8_data,
+ struct ArrowArray* array) {
+ struct ArrowArray* child;
+
+ //
+ // Initialize parent array
+ //
+ *array = (struct ArrowArray) {
+ // Data description
+ .length = nitems,
+ .offset = 0,
+ .null_count = 0,
+ .n_buffers = 1,
+ .n_children = 2,
+ .dictionary = NULL,
+ // Bookkeeping
+ .release = &release_malloced_array
+ };
+ // Allocate list of parent buffers
+ array->buffers = malloc(sizeof(void*) * array->n_buffers);
+ array->buffers[0] = NULL; // no nulls, null bitmap can be omitted
+ // Allocate list of children arrays
+ array->children = malloc(sizeof(struct ArrowArray*) * array->n_children);
+
+ //
+ // Initialize child array #0
+ //
+ child = array->children[0] = malloc(sizeof(struct ArrowArray));
+ *child = (struct ArrowArray) {
+ // Data description
+ .length = nitems,
+ .offset = 0,
+ .null_count = -1,
+ .n_buffers = 2,
+ .n_children = 0,
+ .dictionary = NULL,
+ .children = NULL,
+ // Bookkeeping
+ .release = &release_malloced_array
+ };
+ child->buffers = malloc(sizeof(void*) * child->n_buffers);
+ child->buffers[0] = float32_nulls;
+ child->buffers[1] = float32_data;
+
+ //
+ // Initialize child array #1
+ //
+ child = array->children[1] = malloc(sizeof(struct ArrowArray));
+ *child = (struct ArrowArray) {
+ // Data description
+ .length = nitems,
+ .offset = 0,
+ .null_count = -1,
+ .n_buffers = 3,
+ .n_children = 0,
+ .dictionary = NULL,
+ .children = NULL,
+ // Bookkeeping
+ .release = &release_malloced_array
+ };
+ child->buffers = malloc(sizeof(void*) * child->n_buffers);
+ child->buffers[0] = utf8_nulls;
+ child->buffers[1] = utf8_offsets;
+ child->buffers[2] = utf8_data;
+ }
+
+
+Why two distinct structures?
+============================
+
+In many cases, the same type or schema description applies to multiple,
+possibly short, batches of data. To avoid paying the cost of exporting
+and importing the type description for each batch, the ``ArrowSchema``
+can be passed once, separately, at the beginning of the conversation between
+producer and consumer.
+
+In yet other cases, the data type is fixed by the producer API, and may not
+need to be communicated at all.
+
+However, if a producer is focused on one-shot exchange of data, it can
+communicate the ``ArrowSchema`` and ``ArrowArray`` structures in the same
+API call.
+
+Updating this specification
+===========================
+
+Once this specification is supported in an official Arrow release, the C
+ABI is frozen. This means the ``ArrowSchema`` and ``ArrowArray`` structure
+definitions should not change in any way -- including adding new members.
+
+Backwards-compatible changes are allowed, for example new
+:c:member:`ArrowSchema.flags` values or expanded possibilities for
+the :c:member:`ArrowSchema.format` string.
+
+Any incompatible changes should be part of a new specification, for example
+"Arrow C data interface v2".
+
+Inspiration
+===========
+
+The Arrow C data interface is inspired by the `Python buffer protocol`_,
+which has proven immensely successful in allowing various Python libraries
+to exchange numerical data with no knowledge of each other and near-zero
+adaptation cost.
+
+
+.. _Python buffer protocol: https://www.python.org/dev/peps/pep-3118/
diff --git a/src/arrow/docs/source/format/CStreamInterface.rst b/src/arrow/docs/source/format/CStreamInterface.rst
new file mode 100644
index 000000000..b8ccce355
--- /dev/null
+++ b/src/arrow/docs/source/format/CStreamInterface.rst
@@ -0,0 +1,218 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. highlight:: c
+
+.. _c-stream-interface:
+
+============================
+The Arrow C stream interface
+============================
+
+.. warning::
+ This interface is experimental and may evolve based on feedback from
+ early users. ABI stability is not guaranteed yet. Feel free to
+ `contact us <https://arrow.apache.org/community/>`__.
+
+The C stream interface builds on the structures defined in the
+:ref:`C data interface <c-data-interface>` and combines them into a higher-level
+specification so as to ease the communication of streaming data within a single
+process.
+
+Semantics
+=========
+
+An Arrow C stream exposes a streaming source of data chunks, each with the
+same schema. Chunks are obtained by calling a blocking pull-style iteration
+function.
+
+Structure definition
+====================
+
+The C stream interface is defined by a single ``struct`` definition::
+
+ struct ArrowArrayStream {
+ // Callbacks providing stream functionality
+ int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out);
+ int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out);
+ const char* (*get_last_error)(struct ArrowArrayStream*);
+
+ // Release callback
+ void (*release)(struct ArrowArrayStream*);
+
+ // Opaque producer-specific data
+ void* private_data;
+ };
+
+The ArrowArrayStream structure
+------------------------------
+
+The ``ArrowArrayStream`` provides the required callbacks to interact with a
+streaming source of Arrow arrays. It has the following fields:
+
+.. c:member:: int (*ArrowArrayStream.get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out)
+
+ *Mandatory.* This callback allows the consumer to query the schema of
+ the chunks of data in the stream. The schema is the same for all
+ data chunks.
+
+ This callback must NOT be called on a released ``ArrowArrayStream``.
+
+ *Return value:* 0 on success, a non-zero
+ :ref:`error code <c-stream-interface-error-codes>` otherwise.
+
+.. c:member:: int (*ArrowArrayStream.get_next)(struct ArrowArrayStream*, struct ArrowArray* out)
+
+ *Mandatory.* This callback allows the consumer to get the next chunk
+ of data in the stream.
+
+ This callback must NOT be called on a released ``ArrowArrayStream``.
+
+ *Return value:* 0 on success, a non-zero
+ :ref:`error code <c-stream-interface-error-codes>` otherwise.
+
+ On success, the consumer must check whether the ``ArrowArray`` is
+ marked :ref:`released <c-data-interface-released>`. If the
+ ``ArrowArray`` is released, then the end of stream has been reached.
+ Otherwise, the ``ArrowArray`` contains a valid data chunk.
+
+.. c:member:: const char* (*ArrowArrayStream.get_last_error)(struct ArrowArrayStream*)
+
+ *Mandatory.* This callback allows the consumer to get a textual description
+ of the last error.
+
+ This callback must ONLY be called if the last operation on the
+ ``ArrowArrayStream`` returned an error. It must NOT be called on a
+ released ``ArrowArrayStream``.
+
+ *Return value:* a pointer to a NULL-terminated character string (UTF8-encoded).
+ NULL can also be returned if no detailed description is available.
+
+ The returned pointer is only guaranteed to be valid until the next call of
+ one of the stream's callbacks. The character string it points to should
+ be copied to consumer-managed storage if it is intended to survive longer.
+
+.. c:member:: void (*ArrowArrayStream.release)(struct ArrowArrayStream*)
+
+ *Mandatory.* A pointer to a producer-provided release callback.
+
+.. c:member:: void* ArrowArrayStream.private_data
+
+ *Optional.* An opaque pointer to producer-provided private data.
+
+ Consumers MUST not process this member. Lifetime of this member
+ is handled by the producer, and especially by the release callback.
+
+
+.. _c-stream-interface-error-codes:
+
+Error codes
+-----------
+
+The ``get_schema`` and ``get_next`` callbacks may return an error in the form
+of a non-zero integer code. Such error codes should be interpreted like
+``errno`` numbers (as defined by the local platform). Note that the symbolic
+forms of these constants are stable from platform to platform, but their numeric
+values are platform-specific.
+
+In particular, it is recommended to recognize the following values:
+
+* ``EINVAL``: for a parameter or input validation error
+* ``ENOMEM``: for a memory allocation failure (out of memory)
+* ``EIO``: for a generic input/output error
+
+.. seealso::
+ `Standard POSIX error codes <https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/errno.h.html>`__.
+
+ `Error codes recognized by the Windows C runtime library
+ <https://docs.microsoft.com/en-us/cpp/c-runtime-library/errno-doserrno-sys-errlist-and-sys-nerr>`__.
+
+Result lifetimes
+----------------
+
+The data returned by the ``get_schema`` and ``get_next`` callbacks must be
+released independently. Their lifetimes are not tied to that of the
+``ArrowArrayStream``.
+
+Stream lifetime
+---------------
+
+Lifetime of the C stream is managed using a release callback with similar
+usage as in the :ref:`C data interface <c-data-interface-released>`.
+
+
+C consumer example
+==================
+
+Let's say a particular database provides the following C API to execute
+a SQL query and return the result set as an Arrow C stream::
+
+ void MyDB_Query(const char* query, struct ArrowArrayStream* result_set);
+
+Then a consumer could use the following code to iterate over the results::
+
+ static void handle_error(int errcode, struct ArrowArrayStream* stream) {
+ // Print stream error
+ const char* errdesc = stream->get_last_error(stream);
+ if (errdesc != NULL) {
+ fputs(errdesc, stderr);
+ } else {
+ fputs(strerror(errcode), stderr);
+ }
+ // Release stream and abort
+ stream->release(stream);
+ exit(1);
+ }
+
+ void run_query() {
+ struct ArrowArrayStream stream;
+ struct ArrowSchema schema;
+ struct ArrowArray chunk;
+ int errcode;
+
+ MyDB_Query("SELECT * FROM my_table", &stream);
+
+ // Query result set schema
+ errcode = stream.get_schema(&stream, &schema);
+ if (errcode != 0) {
+ handle_error(errcode, &stream);
+ }
+
+ int64_t num_rows = 0;
+
+ // Iterate over results: loop until error or end of stream
+ while ((errcode = stream.get_next(&stream, &chunk)) == 0 &&
+ chunk.release != NULL) {
+ // Do something with chunk...
+ fprintf(stderr, "Result chunk: got %lld rows\n", chunk.length);
+ num_rows += chunk.length;
+
+ // Release chunk
+ chunk.release(&chunk);
+ }
+
+ // Was it an error?
+ if (errcode != 0) {
+ handle_error(errcode, &stream);
+ }
+
+ fprintf(stderr, "Result stream ended: total %lld rows\n", num_rows);
+
+ // Release schema and stream
+ schema.release(&schema);
+ stream.release(&stream);
+ }
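+
+For completeness, here is a hedged sketch of a minimal producer exposing a
+single ``int32`` chunk through this interface. It reuses the
+``export_int32_type`` and ``export_int32_array`` sketches from the
+:ref:`C data interface <c-data-interface>` examples; all names are
+illustrative only::
+
+ #include <stdint.h>
+ #include <stdlib.h>
+
+ struct MyStreamState {
+   const int32_t* data;
+   int64_t nitems;
+   int delivered;   // single-chunk stream for simplicity
+ };
+
+ static int my_get_schema(struct ArrowArrayStream* stream, struct ArrowSchema* out) {
+   export_int32_type(out);   // see the C data interface examples
+   return 0;
+ }
+
+ static int my_get_next(struct ArrowArrayStream* stream, struct ArrowArray* out) {
+   struct MyStreamState* state = (struct MyStreamState*) stream->private_data;
+   if (state->delivered) {
+     // End of stream: return a released ArrowArray
+     *out = (struct ArrowArray) {.release = NULL};
+     return 0;
+   }
+   export_int32_array(state->data, state->nitems, out);
+   state->delivered = 1;
+   return 0;
+ }
+
+ static const char* my_get_last_error(struct ArrowArrayStream* stream) {
+   return NULL;   // no detailed error information in this sketch
+ }
+
+ static void my_release(struct ArrowArrayStream* stream) {
+   free(stream->private_data);
+   stream->release = NULL;
+ }
+
+ void make_int32_stream(const int32_t* data, int64_t nitems,
+                        struct ArrowArrayStream* out) {
+   struct MyStreamState* state = malloc(sizeof(struct MyStreamState));
+   state->data = data;
+   state->nitems = nitems;
+   state->delivered = 0;
+   *out = (struct ArrowArrayStream) {
+     .get_schema = &my_get_schema,
+     .get_next = &my_get_next,
+     .get_last_error = &my_get_last_error,
+     .release = &my_release,
+     .private_data = state
+   };
+ }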
diff --git a/src/arrow/docs/source/format/Columnar.rst b/src/arrow/docs/source/format/Columnar.rst
new file mode 100644
index 000000000..85261e7d9
--- /dev/null
+++ b/src/arrow/docs/source/format/Columnar.rst
@@ -0,0 +1,1221 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _format_columnar:
+
+*********************
+Arrow Columnar Format
+*********************
+
+*Version: 1.0*
+
+The "Arrow Columnar Format" includes a language-agnostic in-memory
+data structure specification, metadata serialization, and a protocol
+for serialization and generic data transport.
+
+This document is intended to provide adequate detail to create a new
+implementation of the columnar format without the aid of an existing
+implementation. We utilize Google's `Flatbuffers`_ project for
+metadata serialization, so it will be necessary to refer to the
+project's `Flatbuffers protocol definition files`_
+while reading this document.
+
+The columnar format has some key features:
+
+* Data adjacency for sequential access (scans)
+* O(1) (constant-time) random access
+* SIMD and vectorization-friendly
+* Relocatable without "pointer swizzling", allowing for true zero-copy
+ access in shared memory
+
+The Arrow columnar format provides analytical performance and data
+locality guarantees in exchange for comparatively more expensive
+mutation operations. This document is concerned only with in-memory
+data representation and serialization details; issues such as
+coordinating mutation of data structures are left to be handled by
+implementations.
+
+Terminology
+===========
+
+Since different projects have used different words to describe various
+concepts, here is a small glossary to help disambiguate.
+
+* **Array** or **Vector**: a sequence of values with known length all
+ having the same type. These terms are used interchangeably in
+ different Arrow implementations, but we use "array" in this
+ document.
+* **Slot**: a single logical value in an array of some particular data type
+* **Buffer** or **Contiguous memory region**: a sequential virtual
+ address space with a given length. Any byte can be reached via a
+ single pointer offset less than the region's length.
+* **Physical Layout**: The underlying memory layout for an array
+ without taking into account any value semantics. For example, a
+ 32-bit signed integer array and 32-bit floating point array have the
+ same layout.
+* **Parent** and **child arrays**: names to express relationships
+ between physical value arrays in a nested type structure. For
+ example, a ``List<T>``-type parent array has a T-type array as its
+ child (see more on lists below).
+* **Primitive type**: a data type having no child types. This includes
+ such types as fixed bit-width, variable-size binary, and null types.
+* **Nested type**: a data type whose full structure depends on one or
+ more other child types. Two fully-specified nested types are equal
+ if and only if their child types are equal. For example, ``List<U>``
+ is distinct from ``List<V>`` iff U and V are different types.
+* **Logical type**: An application-facing semantic value type that is
+ implemented using some physical layout. For example, Decimal
+ values are stored as 16 bytes in a fixed-size binary
+ layout. Similarly, strings can be stored as ``List<1-byte>``. A
+ timestamp may be stored as 64-bit fixed-size layout.
+
+.. _format_layout:
+
+Physical Memory Layout
+======================
+
+Arrays are defined by a few pieces of metadata and data:
+
+* A logical data type.
+* A sequence of buffers.
+* A length as a 64-bit signed integer. Implementations are permitted
+ to be limited to 32-bit lengths; see more on this below.
+* A null count as a 64-bit signed integer.
+* An optional **dictionary**, for dictionary-encoded arrays.
+
+Nested arrays additionally have a sequence of one or more sets of
+these items, called the **child arrays**.
+
+Each logical data type has a well-defined physical layout. Here are
+the different physical layouts defined by Arrow:
+
+* **Primitive (fixed-size)**: a sequence of values each having the
+ same byte or bit width
+* **Variable-size Binary**: a sequence of values each having a variable
+ byte length. Two variants of this layout are supported using 32-bit
+ and 64-bit length encoding.
+* **Fixed-size List**: a nested layout where each value has the same
+ number of elements taken from a child data type.
+* **Variable-size List**: a nested layout where each value is a
+ variable-length sequence of values taken from a child data type. Two
+ variants of this layout are supported using 32-bit and 64-bit length
+ encoding.
+* **Struct**: a nested layout consisting of a collection of named
+ child **fields** each having the same length but possibly different
+ types.
+* **Sparse** and **Dense Union**: a nested layout representing a
+ sequence of values, each of which can have type chosen from a
+ collection of child array types.
+* **Null**: a sequence of all null values, having null logical type
+
+The Arrow columnar memory layout only applies to *data* and not
+*metadata*. Implementations are free to represent metadata in-memory
+in whichever form is convenient for them. We handle metadata
+**serialization** in an implementation-independent way using
+`Flatbuffers`_, detailed below.
+
+Buffer Alignment and Padding
+----------------------------
+
+Implementations are recommended to allocate memory on aligned
+addresses (multiple of 8- or 64-bytes) and pad (overallocate) to a
+length that is a multiple of 8 or 64 bytes. When serializing Arrow
+data for interprocess communication, these alignment and padding
+requirements are enforced. If possible, we suggest that you prefer
+using 64-byte alignment and padding. Unless otherwise noted, padded
+bytes do not need to have a specific value.
+
+The alignment requirement follows best practices for optimized memory
+access:
+
+* Elements in numeric arrays will be guaranteed to be retrieved via aligned access.
+* On some architectures alignment can help limit partially used cache lines.
+
+The recommendation for 64 byte alignment comes from the `Intel
+performance guide`_ that recommends alignment of memory to match SIMD
+register width. The specific padding length was chosen because it
+matches the largest SIMD instruction registers available on widely
+deployed x86 architecture (Intel AVX-512).
+
+The recommended padding of 64 bytes allows for using `SIMD`_
+instructions consistently in loops without additional conditional
+checks. This should allow for simpler, efficient and CPU
+cache-friendly code. In other words, we can load the entire 64-byte
+buffer into a 512-bit wide SIMD register and get data-level
+parallelism on all the columnar values packed into the 64-byte
+buffer. Guaranteed padding can also allow certain compilers to
+generate more optimized code directly (e.g. One can safely use Intel's
+``-qopt-assume-safe-padding``).
+
+Array lengths
+-------------
+
+Array lengths are represented in the Arrow metadata as a 64-bit signed
+integer. An implementation of Arrow is considered valid even if it only
+supports lengths up to the maximum 32-bit signed integer, though. If using
+Arrow in a multi-language environment, we recommend limiting lengths to
+2 :sup:`31` - 1 elements or less. Larger data sets can be represented using
+multiple array chunks.
+
+Null count
+----------
+
+The number of null value slots is a property of the physical array and
+considered part of the data structure. The null count is represented
+in the Arrow metadata as a 64-bit signed integer, as it may be as
+large as the array length.
+
+Validity bitmaps
+----------------
+
+Any value in an array may be semantically null, whether primitive or nested
+type.
+
+All array types, with the exception of union types (more on these later),
+utilize a dedicated memory buffer, known as the validity (or "null") bitmap, to
+encode the nullness or non-nullness of each value slot. The validity bitmap
+must be large enough to have at least 1 bit for each array slot.
+
+Whether any array slot is valid (non-null) is encoded in the respective bits of
+this bitmap. A 1 (set bit) for index ``j`` indicates that the value is not null,
+while a 0 (bit not set) indicates that it is null. Bitmaps are to be
+initialized to be all unset at allocation time (this includes padding): ::
+
+ is_valid[j] -> bitmap[j / 8] & (1 << (j % 8))
+
+We use `least-significant bit (LSB) numbering`_ (also known as
+bit-endianness). This means that within a group of 8 bits, we read
+right-to-left: ::
+
+ values = [0, 1, null, 2, null, 3]
+
+ bitmap
+ j mod 8   7  6  5  4  3  2  1  0
+           0  0  1  0  1  0  1  1
+
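+Read as a single byte, the example bitmap above is ``0b00101011`` (``0x2B``).
+In C-like terms, marking slot ``j`` valid or null can be written as
+(a non-normative sketch)::
+
+   bitmap[j / 8] |= 1 << (j % 8);     /* mark slot j valid */
+   bitmap[j / 8] &= ~(1 << (j % 8));  /* mark slot j null */
+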
+Arrays having a 0 null count may choose to not allocate the validity
+bitmap. Implementations may choose to always allocate one anyway as a
+matter of convenience, but this should be noted when memory is being
+shared.
+
+Nested type arrays except for union types have their own validity bitmap and
+null count regardless of the null count and valid bits of their child arrays.
+
+Array slots which are null are not required to have a particular
+value; any "masked" memory can have any value and need not be zeroed,
+though implementations frequently choose to zero memory for null
+values.
+
+Fixed-size Primitive Layout
+---------------------------
+
+A primitive value array represents an array of values each having the
+same physical slot width typically measured in bytes, though the spec
+also provides for bit-packed types (e.g. boolean values encoded in
+bits).
+
+Internally, the array contains a contiguous memory buffer whose total
+size is at least as large as the slot width multiplied by the array
+length. For bit-packed types, the size is rounded up to the nearest
+byte.
+
+The associated validity bitmap is contiguously allocated (as described
+above) but does not need to be adjacent in memory to the values
+buffer.
+
+**Example Layout: Int32 Array**
+
+For example a primitive array of int32s: ::
+
+ [1, null, 2, 4, 8]
+
+Would look like: ::
+
+ * Length: 5, Null count: 1
+ * Validity bitmap buffer:
+
+ |Byte 0 (validity bitmap) | Bytes 1-63 |
+ |-------------------------|-----------------------|
+ | 00011101 | 0 (padding) |
+
+ * Value Buffer:
+
+ |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-63 |
+ |------------|-------------|-------------|-------------|-------------|-------------|
+ | 1 | unspecified | 2 | 4 | 8 | unspecified |
+
+**Example Layout: Non-null int32 Array**
+
+``[1, 2, 3, 4, 8]`` has two possible layouts: ::
+
+ * Length: 5, Null count: 0
+ * Validity bitmap buffer:
+
+ | Byte 0 (validity bitmap) | Bytes 1-63 |
+ |--------------------------|-----------------------|
+ | 00011111 | 0 (padding) |
+
+ * Value Buffer:
+
+ |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | bytes 12-15 | bytes 16-19 | Bytes 20-63 |
+ |------------|-------------|-------------|-------------|-------------|-------------|
+ | 1 | 2 | 3 | 4 | 8 | unspecified |
+
+or with the bitmap elided: ::
+
+ * Length 5, Null count: 0
+ * Validity bitmap buffer: Not required
+ * Value Buffer:
+
+ |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | bytes 12-15 | bytes 16-19 | Bytes 20-63 |
+ |------------|-------------|-------------|-------------|-------------|-------------|
+ | 1 | 2 | 3 | 4 | 8 | unspecified |
+
+Variable-size Binary Layout
+---------------------------
+
+Each value in this layout consists of 0 or more bytes. While primitive
+arrays have a single values buffer, variable-size binary have an
+**offsets** buffer and **data** buffer.
+
+The offsets buffer contains `length + 1` signed integers (either
+32-bit or 64-bit, depending on the logical type), which encode the
+start position of each slot in the data buffer. The length of the
+value in each slot is computed using the difference between the offset
+at that slot's index and the subsequent offset. For example, the
+position and length of slot j is computed as:
+
+::
+
+ slot_position = offsets[j]
+ slot_length = offsets[j + 1] - offsets[j] // (for 0 <= j < length)
+
+It should be noted that a null value may have a positive slot length.
+That is, a null value may occupy a **non-empty** memory space in the data
+buffer. When this is true, the content of the corresponding memory space
+is undefined.
+
+Generally the first value in the offsets array is 0, and the last slot
+is the length of the values array. When serializing this layout, we
+recommend normalizing the offsets to start at 0.
+
+Variable-size List Layout
+-------------------------
+
+List is a nested type which is semantically similar to variable-size
+binary. It is defined by two buffers, a validity bitmap and an offsets
+buffer, and a child array. The offsets are the same as in the
+variable-size binary case, and both 32-bit and 64-bit signed integer
+offsets are supported options for the offsets. Rather than referencing
+an additional data buffer, instead these offsets reference the child
+array.
+
+Similar to the layout of variable-size binary, a null value may
+correspond to a **non-empty** segment in the child array. When this is
+true, the content of the corresponding segment can be arbitrary.
+
+A list type is specified like ``List<T>``, where ``T`` is any type
+(primitive or nested). In these examples we use 32-bit offsets where
+the 64-bit offset version would be denoted by ``LargeList<T>``.
+
+**Example Layout: ``List<Int8>`` Array**
+
+We illustrate an example of ``List<Int8>`` with length 4 having values::
+
+ [[12, -7, 25], null, [0, -127, 127, 50], []]
+
+will have the following representation: ::
+
+ * Length: 4, Null count: 1
+ * Validity bitmap buffer:
+
+ | Byte 0 (validity bitmap) | Bytes 1-63 |
+ |--------------------------|-----------------------|
+ | 00001101 | 0 (padding) |
+
+ * Offsets buffer (int32)
+
+ | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-63 |
+ |------------|-------------|-------------|-------------|-------------|-------------|
+ | 0 | 3 | 3 | 7 | 7 | unspecified |
+
+ * Values array (Int8array):
+ * Length: 7, Null count: 0
+ * Validity bitmap buffer: Not required
+ * Values buffer (int8)
+
+ | Bytes 0-6 | Bytes 7-63 |
+ |------------------------------|-------------|
+ | 12, -7, 25, 0, -127, 127, 50 | unspecified |
+
+**Example Layout: ``List<List<Int8>>``**
+
+``[[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], [[9, 10]]]``
+
+will be represented as follows: ::
+
+ * Length 3
+ * Nulls count: 0
+ * Validity bitmap buffer: Not required
+ * Offsets buffer (int32)
+
+ | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 |
+ |------------|------------|------------|-------------|-------------|
+ | 0 | 2 | 5 | 6 | unspecified |
+
+ * Values array (`List<Int8>`)
+ * Length: 6, Null count: 1
+ * Validity bitmap buffer:
+
+ | Byte 0 (validity bitmap) | Bytes 1-63 |
+ |--------------------------|-------------|
+ | 00110111 | 0 (padding) |
+
+ * Offsets buffer (int32)
+
+ | Bytes 0-27 | Bytes 28-63 |
+ |----------------------|-------------|
+ | 0, 2, 4, 7, 7, 8, 10 | unspecified |
+
+ * Values array (Int8):
+ * Length: 10, Null count: 0
+ * Validity bitmap buffer: Not required
+
+ | Bytes 0-9 | Bytes 10-63 |
+ |-------------------------------|-------------|
+ | 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 | unspecified |
+
+Fixed-Size List Layout
+----------------------
+
+Fixed-Size List is a nested type in which each array slot contains a
+fixed-size sequence of values all having the same type.
+
+A fixed size list type is specified like ``FixedSizeList<T>[N]``,
+where ``T`` is any type (primitive or nested) and ``N`` is a 32-bit
+signed integer representing the length of the lists.
+
+A fixed size list array is represented by a values array, which is a
+child array of type T. T may also be a nested type. The value in slot
+``j`` of a fixed size list array is stored in an ``N``-long slice of
+the values array, starting at an offset of ``j * N``.
+
+**Example Layout: ``FixedSizeList<byte>[4]`` Array**
+
+Here we illustrate ``FixedSizeList<byte>[4]``.
+
+For an array of length 4 with respective values: ::
+
+ [[192, 168, 0, 12], null, [192, 168, 0, 25], [192, 168, 0, 1]]
+
+will have the following representation: ::
+
+ * Length: 4, Null count: 1
+ * Validity bitmap buffer:
+
+ | Byte 0 (validity bitmap) | Bytes 1-63 |
+ |--------------------------|-----------------------|
+ | 00001101 | 0 (padding) |
+
+ * Values array (byte array):
+ * Length: 16, Null count: 0
+ * validity bitmap buffer: Not required
+
+ | Bytes 0-3 | Bytes 4-7 | Bytes 8-15 |
+ |-----------------|-------------|---------------------------------|
+ | 192, 168, 0, 12 | unspecified | 192, 168, 0, 25, 192, 168, 0, 1 |
+
+
+Struct Layout
+-------------
+
+A struct is a nested type parameterized by an ordered sequence of
+types (which can all be distinct), called its fields. Each field must
+have a UTF8-encoded name, and these field names are part of the type
+metadata.
+
+A struct array does not have any additional allocated physical storage
+for its values. A struct array must still have an allocated validity
+bitmap, if it has one or more null values.
+
+Physically, a struct array has one child array for each field. The
+child arrays are independent and need not be adjacent to each other in
+memory.
+
+For example, the struct (field names shown here as strings for illustration
+purposes)::
+
+ Struct <
+ name: VarBinary
+ age: Int32
+ >
+
+has two child arrays, one ``VarBinary`` array (using variable-size binary
+layout) and one 4-byte primitive value array having ``Int32`` logical
+type.
+
+**Example Layout: ``Struct<VarBinary, Int32>``**
+
+The layout for ``[{'joe', 1}, {null, 2}, null, {'mark', 4}]`` would be: ::
+
+ * Length: 4, Null count: 1
+ * Validity bitmap buffer:
+
+ |Byte 0 (validity bitmap) | Bytes 1-63 |
+ |-------------------------|-----------------------|
+ | 00001011 | 0 (padding) |
+
+ * Children arrays:
+ * field-0 array (`VarBinary`):
+ * Length: 4, Null count: 2
+ * Validity bitmap buffer:
+
+ | Byte 0 (validity bitmap) | Bytes 1-63 |
+ |--------------------------|-----------------------|
+ | 00001001 | 0 (padding) |
+
+ * Offsets buffer:
+
+ | Bytes 0-19 |
+ |----------------|
+ | 0, 3, 3, 3, 7 |
+
+ * Values array:
+ * Length: 7, Null count: 0
+ * Validity bitmap buffer: Not required
+
+ * Value buffer:
+
+ | Bytes 0-6 |
+ |----------------|
+ | joemark |
+
+ * field-1 array (int32 array):
+ * Length: 4, Null count: 1
+ * Validity bitmap buffer:
+
+ | Byte 0 (validity bitmap) | Bytes 1-63 |
+ |--------------------------|-----------------------|
+ | 00001011 | 0 (padding) |
+
+ * Value Buffer:
+
+ |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 |
+ |------------|-------------|-------------|-------------|-------------|
+ | 1 | 2 | unspecified | 4 | unspecified |
+
+While a struct does not have physical storage for each of its semantic
+slots (i.e. each scalar C-like struct), an entire struct slot can be
+set to null via the validity bitmap. Any of the child field arrays can
+have null values according to their respective independent validity
+bitmaps. This implies that for a particular struct slot the validity
+bitmap for the struct array might indicate a null slot when one or
+more of its child arrays has a non-null value in their corresponding
+slot. When reading the struct array, the parent validity bitmap takes
+priority. This is illustrated in the example above: the child arrays
+have valid entries for the null struct but are 'hidden' from the
+consumer by the parent array's validity bitmap. However, when treated
+independently, the corresponding values of the child arrays will be
+non-null.
+
+Union Layout
+------------
+
+A union is defined by an ordered sequence of types; each slot in the
+union can have a value chosen from these types. The types are named
+like a struct's fields, and the names are part of the type metadata.
+
+Unlike other data types, unions do not have their own validity bitmap. Instead,
+the nullness of each slot is determined exclusively by the child arrays which
+are composed to create the union.
+
+We define two distinct union types, "dense" and "sparse", that are
+optimized for different use cases.
+
+Dense Union
+~~~~~~~~~~~
+
+Dense union represents a mixed-type array with 5 bytes of overhead for
+each value. Its physical layout is as follows:
+
+* One child array for each type
+* Types buffer: A buffer of 8-bit signed integers. Each type in the
+ union has a corresponding type id whose values are found in this
+ buffer. A union with more than 127 possible types can be modeled as
+ a union of unions.
+* Offsets buffer: A buffer of signed int32 values indicating the
+ relative offset into the respective child array for the type in a
+ given slot. The respective offsets for each child value array must
+ be in order / increasing.
+
+Critically, the dense union allows for minimal overhead in the ubiquitous
+union-of-structs with non-overlapping-fields use case (``Union<s1: Struct1, s2:
+Struct2, s3: Struct3, ...>``)
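+
+For instance, in the example layout below, slot 3 has type id 1 and offset 0,
+so its value is read from the ``i: int32`` child array at position 0 (the
+value 5).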
+
+**Example Layout: Dense union**
+
+An example layout for logical union of: ``Union<f: float, i: int32>``
+having the values: ``[{f=1.2}, null, {f=3.4}, {i=5}]``
+
+::
+
+ * Length: 4, Null count: 0
+ * Types buffer:
+
+ |Byte 0 | Byte 1 | Byte 2 | Byte 3 | Bytes 4-63 |
+ |---------|-------------|----------|----------|-------------|
+ | 0 | 0 | 0 | 1 | unspecified |
+
+ * Offset buffer:
+
+ |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 |
+ |----------|-------------|------------|-------------|-------------|
+ | 0 | 1 | 2 | 0 | unspecified |
+
+ * Children arrays:
+ * Field-0 array (f: float):
+ * Length: 3, Null count: 1
+ * Validity bitmap buffer: 00000101
+
+ * Value Buffer:
+
+ | Bytes 0-11 | Bytes 12-63 |
+ |----------------|-------------|
+ | 1.2, null, 3.4 | unspecified |
+
+
+ * Field-1 array (i: int32):
+ * Length: 1, Null count: 0
+ * Validity bitmap buffer: Not required
+
+ * Value Buffer:
+
+ | Bytes 0-3 | Bytes 4-63 |
+ |-----------|-------------|
+ | 5 | unspecified |
+
+Sparse Union
+~~~~~~~~~~~~
+
+A sparse union has the same structure as a dense union, with the omission of
+the offsets array. In this case, the child arrays are each equal in length to
+the length of the union.
+
+While a sparse union may use significantly more space compared with a
+dense union, it has some advantages that may be desirable in certain
+use cases:
+
+* A sparse union is more amenable to vectorized expression evaluation in some use cases.
+* Equal-length arrays can be interpreted as a union by only defining the types array.
+
+**Example layout: ``SparseUnion<u0: Int32, u1: Float, u2: VarBinary>``**
+
+For the union array: ::
+
+ [{u0=5}, {u1=1.2}, {u2='joe'}, {u1=3.4}, {u0=4}, {u2='mark'}]
+
+will have the following layout: ::
+
+ * Length: 6, Null count: 0
+ * Types buffer:
+
+ | Byte 0 | Byte 1 | Byte 2 | Byte 3 | Byte 4 | Byte 5 | Bytes 6-63 |
+ |------------|-------------|-------------|-------------|-------------|--------------|-----------------------|
+ | 0 | 1 | 2 | 1 | 0 | 2 | unspecified (padding) |
+
+ * Children arrays:
+
+ * u0 (Int32):
+ * Length: 6, Null count: 4
+ * Validity bitmap buffer:
+
+ |Byte 0 (validity bitmap) | Bytes 1-63 |
+ |-------------------------|-----------------------|
+ |00010001 | 0 (padding) |
+
+ * Value buffer:
+
+ |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-63 |
+ |------------|-------------|-------------|-------------|-------------|--------------|-----------------------|
+ | 5 | unspecified | unspecified | unspecified | 4 | unspecified | unspecified (padding) |
+
+ * u1 (float):
+ * Length: 6, Null count: 4
+ * Validity bitmap buffer:
+
+ |Byte 0 (validity bitmap) | Bytes 1-63 |
+ |-------------------------|-----------------------|
+ | 00001010 | 0 (padding) |
+
+ * Value buffer:
+
+ |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-63 |
+ |-------------|-------------|-------------|-------------|-------------|--------------|-----------------------|
+ | unspecified | 1.2 | unspecified | 3.4 | unspecified | unspecified | unspecified (padding) |
+
+ * u2 (`VarBinary`)
+ * Length: 6, Null count: 4
+ * Validity bitmap buffer:
+
+ | Byte 0 (validity bitmap) | Bytes 1-63 |
+ |--------------------------|-----------------------|
+ | 00100100 | 0 (padding) |
+
+ * Offsets buffer (int32)
+
+ | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-27 | Bytes 28-63 |
+ |------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------|
+ | 0 | 0 | 0 | 3 | 3 | 3 | 7 | unspecified |
+
+ * Values array (VarBinary):
+ * Length: 7, Null count: 0
+ * Validity bitmap buffer: Not required
+
+ | Bytes 0-6 | Bytes 7-63 |
+ |------------|-----------------------|
+ | joemark | unspecified (padding) |
+
+Only the slot in the array corresponding to the type index is considered. All
+"unselected" values are ignored and could be any semantically correct array
+value.
+
+Null Layout
+-----------
+
+We provide a simplified memory-efficient layout for the Null data type
+where all values are null. In this case no memory buffers are
+allocated.
+
+.. _dictionary-encoded-layout:
+
+Dictionary-encoded Layout
+-------------------------
+
+Dictionary encoding is a data representation technique to represent
+values by integers referencing a **dictionary** usually consisting of
+unique values. It can be effective when you have data with many
+repeated values.
+
+Any array can be dictionary-encoded. The dictionary is stored as an optional
+property of an array. When a field is dictionary encoded, the values are
+represented by an array of non-negative integers representing the index of the
+value in the dictionary. The memory layout for a dictionary-encoded array is
+the same as that of a primitive integer layout. The dictionary is handled as a
+separate columnar array with its own respective layout.
+
+As an example, you could have the following data: ::
+
+ type: VarBinary
+
+ ['foo', 'bar', 'foo', 'bar', null, 'baz']
+
+In dictionary-encoded form, this could appear as:
+
+::
+
+ data VarBinary (dictionary-encoded)
+ index_type: Int32
+ values: [0, 1, 0, 1, null, 2]
+
+ dictionary
+ type: VarBinary
+ values: ['foo', 'bar', 'baz']
+
+Note that a dictionary is permitted to contain duplicate values or
+nulls:
+
+::
+
+ data VarBinary (dictionary-encoded)
+ index_type: Int32
+ values: [0, 1, 3, 1, 4, 2]
+
+ dictionary
+ type: VarBinary
+ values: ['foo', 'bar', 'baz', 'foo', null]
+
+The null count of such arrays is dictated only by the validity bitmap
+of its indices, irrespective of any null values in the dictionary.
+
+Since unsigned integers can be more difficult to work with in some cases
+(e.g. in the JVM), we recommend preferring signed integers over unsigned
+integers for representing dictionary indices. Additionally, we recommend
+avoiding using 64-bit unsigned integer indices unless they are required by an
+application.
+
+We discuss dictionary encoding as it relates to serialization further
+below.
+
+Buffer Listing for Each Layout
+------------------------------
+
+For the avoidance of ambiguity, we list here the order and type of memory
+buffers for each layout.
+
+.. csv-table:: Buffer Layouts
+ :header: "Layout Type", "Buffer 0", "Buffer 1", "Buffer 2"
+ :widths: 30, 20, 20, 20
+
+ "Primitive",validity,data,
+ "Variable Binary",validity,offsets,data
+ "List",validity,offsets,
+ "Fixed-size List",validity,,
+ "Struct",validity,,
+ "Sparse Union",type ids,,
+ "Dense Union",type ids,offsets,
+ "Null",,,
+ "Dictionary-encoded",validity,data (indices),
+
+Logical Types
+=============
+
+The `Schema.fbs`_ defines built-in logical types supported by the
+Arrow columnar format. Each logical type uses one of the above
+physical layouts. Nested logical types may have different physical
+layouts depending on the particular realization of the type.
+
+We do not go into detail about the logical types definitions in this
+document as we consider `Schema.fbs`_ to be authoritative.
+
+.. _format-ipc:
+
+Serialization and Interprocess Communication (IPC)
+==================================================
+
+The primitive unit of serialized data in the columnar format is the
+"record batch". Semantically, a record batch is an ordered collection
+of arrays, known as its **fields**, each having the same length as one
+another but potentially different data types. A record batch's field
+names and types collectively form the batch's **schema**.
+
+In this section we define a protocol for serializing record batches
+into a stream of binary payloads and reconstructing record batches
+from these payloads without need for memory copying.
+
+The columnar IPC protocol utilizes a one-way stream of binary messages
+of these types:
+
+* Schema
+* RecordBatch
+* DictionaryBatch
+
+We specify a so-called *encapsulated IPC message* format which
+includes a serialized Flatbuffer type along with an optional message
+body. We define this message format before describing how to serialize
+each constituent IPC message type.
+
+Encapsulated message format
+---------------------------
+
+For simple streaming and file-based serialization, we define an
+"encapsulated" message format for interprocess communication. Such
+messages can be "deserialized" into in-memory Arrow array objects by
+examining only the message metadata without any need to copy or move
+any of the actual data.
+
+The encapsulated binary message format is as follows:
+
+* A 32-bit continuation indicator. The value ``0xFFFFFFFF`` indicates
+  a valid message. This component was introduced in version 0.15.0, in
+  part to address the 8-byte alignment requirement of Flatbuffers.
+* A 32-bit little-endian length prefix indicating the metadata size
+* The message metadata, using the ``Message`` type defined in
+  `Message.fbs`_
+* Padding bytes to an 8-byte boundary
+* The message body, whose length must be a multiple of 8 bytes
+
+Schematically, we have: ::
+
+ <continuation: 0xFFFFFFFF>
+ <metadata_size: int32>
+ <metadata_flatbuffer: bytes>
+ <padding>
+ <message body>
+
+The complete serialized message must be a multiple of 8 bytes so that messages
+can be relocated between streams. Otherwise the amount of padding between the
+metadata and the message body could be non-deterministic.
+
+The ``metadata_size`` includes the size of the ``Message`` plus
+padding. The ``metadata_flatbuffer`` contains a serialized ``Message``
+Flatbuffer value, which internally includes:
+
+* A version number
+* A particular message value (one of ``Schema``, ``RecordBatch``, or
+ ``DictionaryBatch``)
+* The size of the message body
+* A ``custom_metadata`` field for any application-supplied metadata
+
+When read from an input stream, generally the ``Message`` metadata is
+initially parsed and validated to obtain the body size. Then the body
+can be read.
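+
+The following minimal Python sketch shows how a reader might consume
+this framing. The ``parse_body_size`` argument is a hypothetical
+helper standing in for decoding the ``Message`` Flatbuffer to obtain
+the body length:
+
+.. code-block:: python
+
+   import struct
+
+   CONTINUATION = b'\xff\xff\xff\xff'
+
+   def read_encapsulated_message(stream, parse_body_size):
+       # 4-byte continuation indicator followed by a 4-byte little-endian
+       # metadata length (which already accounts for padding to 8 bytes).
+       if stream.read(4) != CONTINUATION:
+           raise ValueError("expected continuation indicator 0xFFFFFFFF")
+       (metadata_size,) = struct.unpack('<i', stream.read(4))
+       if metadata_size == 0:
+           return None  # end-of-stream marker
+       metadata = stream.read(metadata_size)  # Message Flatbuffer plus padding
+       body = stream.read(parse_body_size(metadata))
+       return metadata, body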
+
+Schema message
+--------------
+
+The Flatbuffers file `Schema.fbs`_ contains the definitions for all
+built-in logical data types and the ``Schema`` metadata type which
+represents the schema of a given record batch. A schema consists of
+an ordered sequence of fields, each having a name and type. A
+serialized ``Schema`` does not contain any data buffers, only type
+metadata.
+
+The ``Field`` Flatbuffers type contains the metadata for a single
+array. This includes:
+
+* The field's name
+* The field's logical type
+* Whether the field is semantically nullable. While this has no
+ bearing on the array's physical layout, many systems distinguish
+ nullable and non-nullable fields and we want to allow them to
+ preserve this metadata to enable faithful schema round trips.
+* A collection of child ``Field`` values, for nested types
+* A ``dictionary`` property indicating whether the field is
+ dictionary-encoded or not. If it is, a dictionary "id" is assigned
+ to allow matching a subsequent dictionary IPC message with the
+ appropriate field.
+
+We additionally provide both schema-level and field-level
+``custom_metadata`` attributes allowing for systems to insert their
+own application defined metadata to customize behavior.
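+
+For illustration only (using pyarrow, with arbitrary field names and
+metadata), a schema exercising these attributes could be built as
+follows:
+
+.. code-block:: python
+
+   import pyarrow as pa
+
+   fields = [
+       # name, logical type, and nullability
+       pa.field('id', pa.int64(), nullable=False),
+       # a dictionary-encoded field; the writer assigns the dictionary id
+       pa.field('category', pa.dictionary(pa.int32(), pa.utf8())),
+       # field-level custom_metadata
+       pa.field('price', pa.float64(), metadata={'unit': 'USD'}),
+   ]
+   # schema-level custom_metadata
+   schema = pa.schema(fields, metadata={'source': 'example'})
+   print(schema)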
+
+RecordBatch message
+-------------------
+
+A RecordBatch message contains the actual data buffers corresponding
+to the physical memory layout determined by a schema. The metadata for
+this message provides the location and size of each buffer, permitting
+Array data structures to be reconstructed using pointer arithmetic and
+thus no memory copying.
+
+The serialized form of the record batch is the following:
+
+* The ``data header``, defined as the ``RecordBatch`` type in
+ `Message.fbs`_.
+* The ``body``, a flat sequence of memory buffers written end-to-end
+ with appropriate padding to ensure a minimum of 8-byte alignment
+
+The data header contains the following:
+
+* The length and null count for each flattened field in the record
+ batch
+* The memory offset and length of each constituent ``Buffer`` in the
+ record batch's body
+
+Fields and buffers are flattened by a pre-order depth-first traversal
+of the fields in the record batch. For example, let's consider the
+schema ::
+
+ col1: Struct<a: Int32, b: List<item: Int64>, c: Float64>
+ col2: Utf8
+
+The flattened version of this is: ::
+
+ FieldNode 0: Struct name='col1'
+ FieldNode 1: Int32 name='a'
+ FieldNode 2: List name='b'
+ FieldNode 3: Int64 name='item'
+ FieldNode 4: Float64 name='c'
+ FieldNode 5: Utf8 name='col2'
+
+For the buffers produced, we would have the following (refer to the
+table above): ::
+
+ buffer 0: field 0 validity
+ buffer 1: field 1 validity
+ buffer 2: field 1 values
+ buffer 3: field 2 validity
+ buffer 4: field 2 offsets
+ buffer 5: field 3 validity
+ buffer 6: field 3 values
+ buffer 7: field 4 validity
+ buffer 8: field 4 values
+ buffer 9: field 5 validity
+ buffer 10: field 5 offsets
+ buffer 11: field 5 data
+
+The ``Buffer`` Flatbuffers value describes the location and size of a
+piece of memory. Generally these are interpreted relative to the
+**encapsulated message format** defined above.
+
+The ``size`` field of ``Buffer`` is not required to account for padding
+bytes. Since this metadata can be used to communicate in-memory pointer
+addresses between libraries, it is recommended to set ``size`` to the actual
+memory size rather than the padded size.
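+
+As a rough, non-normative check of the flattening described above,
+pyarrow exposes the physical buffers of each column (including its
+children) in the same pre-order fashion. Implementations may return
+``None`` in place of an omitted buffer (for example a validity bitmap
+when there are no nulls), so the listing will not always match the
+table exactly:
+
+.. code-block:: python
+
+   import pyarrow as pa
+
+   struct_type = pa.struct([('a', pa.int32()),
+                            ('b', pa.list_(pa.int64())),
+                            ('c', pa.float64())])
+   col1 = pa.array([{'a': 1, 'b': [1, 2], 'c': 0.5}, None], type=struct_type)
+   col2 = pa.array(['x', None])
+
+   batch = pa.record_batch([col1, col2], names=['col1', 'col2'])
+
+   # Buffers of each column, flattened parent-first.
+   for name, column in zip(batch.schema.names, batch.columns):
+       print(name, column.buffers())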
+
+Byte Order (`Endianness`_)
+---------------------------
+
+The Arrow format is little endian by default.
+
+Serialized Schema metadata has an endianness field indicating the
+endianness of the RecordBatches it describes. Typically this is the
+endianness of the system where the RecordBatch was generated. The main
+use case is exchanging RecordBatches between systems with the same
+endianness. Initially, implementations return an error when trying to
+read a Schema with an endianness that does not match the underlying
+system. The reference implementation is focused on little endian and
+provides tests for it. Eventually we may provide automatic conversion
+via byte swapping.
+
+IPC Streaming Format
+--------------------
+
+We provide a streaming protocol or "format" for record batches. It is
+presented as a sequence of encapsulated messages, each of which
+follows the format above. The schema comes first in the stream, and it
+is the same for all of the record batches that follow. If any fields
+in the schema are dictionary-encoded, one or more ``DictionaryBatch``
+messages will be included. ``DictionaryBatch`` and ``RecordBatch``
+messages may be interleaved, but before any dictionary key is used in
+a ``RecordBatch`` it should be defined in a ``DictionaryBatch``. ::
+
+ <SCHEMA>
+ <DICTIONARY 0>
+ ...
+ <DICTIONARY k - 1>
+ <RECORD BATCH 0>
+ ...
+ <DICTIONARY x DELTA>
+ ...
+ <DICTIONARY y DELTA>
+ ...
+ <RECORD BATCH n - 1>
+ <EOS [optional]: 0xFFFFFFFF 0x00000000>
+
+.. note:: An edge-case for interleaved dictionary and record batches occurs
+ when the record batches contain dictionary encoded arrays that are
+ completely null. In this case, the dictionary for the encoded column might
+ appear after the first record batch.
+
+When a stream reader implementation is reading a stream, after each
+message it may read the next 8 bytes to determine both whether the
+stream continues and the size of the message metadata that follows.
+Once the message flatbuffer is read, you can then read the message
+body.
+
+The stream writer can signal end-of-stream (EOS) either by writing 8 bytes
+containing the 4-byte continuation indicator (``0xFFFFFFFF``) followed by 0
+metadata length (``0x00000000``) or closing the stream interface. We
+recommend the ".arrows" file extension for the streaming format although
+in many cases these streams will not ever be stored as files.
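+
+As an illustrative sketch, the streaming format can be produced and
+consumed with pyarrow as follows (other implementations offer
+analogous stream writer and reader classes):
+
+.. code-block:: python
+
+   import pyarrow as pa
+   import pyarrow.ipc as ipc
+
+   batch = pa.record_batch([pa.array([1, 2, 3]),
+                            pa.array(['a', 'b', 'c'])],
+                           names=['x', 'y'])
+
+   # Write a Schema message, then the record batches, then EOS on close.
+   sink = pa.BufferOutputStream()
+   with ipc.new_stream(sink, batch.schema) as writer:
+       for _ in range(5):
+           writer.write_batch(batch)
+   buf = sink.getvalue()
+
+   # Read the stream back; the schema comes first and applies to all batches.
+   reader = ipc.open_stream(buf)
+   table = reader.read_all()
+   print(table.num_rows)  # 15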
+
+IPC File Format
+---------------
+
+We define a "file format" supporting random access that is an extension of
+the stream format. The file starts and ends with a magic string ``ARROW1``
+(plus padding). What follows in the file is identical to the stream format.
+At the end of the file, we write a *footer* containing a redundant copy of
+the schema (which is a part of the streaming format) plus memory offsets and
+sizes for each of the data blocks in the file. This enables random access to
+any record batch in the file. See `File.fbs`_ for the precise details of the
+file footer.
+
+Schematically we have: ::
+
+ <magic number "ARROW1">
+ <empty padding bytes [to 8 byte boundary]>
+ <STREAMING FORMAT with EOS>
+ <FOOTER>
+ <FOOTER SIZE: int32>
+ <magic number "ARROW1">
+
+In the file format, there is no requirement that dictionary keys
+should be defined in a ``DictionaryBatch`` before they are used in a
+``RecordBatch``, as long as the keys are defined somewhere in the
+file. Furthermore, it is invalid to have more than one **non-delta**
+dictionary batch per dictionary ID (i.e. dictionary replacement is not
+supported). Delta dictionaries are applied in the order they appear in
+the file footer. We recommend the ".arrow" extension for files created with
+this format.
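+
+A sketch of writing and then randomly accessing record batches with
+the file format, again using pyarrow for illustration:
+
+.. code-block:: python
+
+   import pyarrow as pa
+   import pyarrow.ipc as ipc
+
+   batch = pa.record_batch([pa.array([1, 2, 3])], names=['x'])
+
+   sink = pa.BufferOutputStream()
+   writer = ipc.new_file(sink, batch.schema)
+   for _ in range(10):
+       writer.write_batch(batch)
+   writer.close()          # writes the footer and the trailing magic number
+   buf = sink.getvalue()
+
+   reader = ipc.open_file(buf)
+   print(reader.num_record_batches)   # 10
+   middle = reader.get_batch(5)       # random access via the footer offsets
+   print(middle.num_rows)             # 3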
+
+Dictionary Messages
+-------------------
+
+Dictionaries are written in the stream and file formats as a sequence of record
+batches, each having a single field. The complete semantic schema for a
+sequence of record batches, therefore, consists of the schema along with all of
+the dictionaries. Because the dictionary types are found in the schema, it is
+necessary to read the schema first in order to determine the dictionary types,
+so that the dictionaries can be properly interpreted: ::
+
+ table DictionaryBatch {
+ id: long;
+ data: RecordBatch;
+ isDelta: boolean = false;
+ }
+
+The dictionary ``id`` in the message metadata can be referenced one or more times
+in the schema, so that dictionaries can even be used for multiple fields. See
+the :ref:`dictionary-encoded-layout` section for more about the semantics of
+dictionary-encoded data.
+
+The dictionary ``isDelta`` flag allows existing dictionaries to be
+expanded for future record batch materializations. A dictionary batch
+with ``isDelta`` set indicates that its vector should be concatenated
+with those of any previous batches with the same ``id``. In a stream
+which encodes one column containing the list of strings ``["A", "B",
+"C", "B", "D", "C", "E", "A"]``, an encoding using a delta dictionary
+batch could take the form: ::
+
+ <SCHEMA>
+ <DICTIONARY 0>
+ (0) "A"
+ (1) "B"
+ (2) "C"
+
+ <RECORD BATCH 0>
+ 0
+ 1
+ 2
+ 1
+
+ <DICTIONARY 0 DELTA>
+ (3) "D"
+ (4) "E"
+
+ <RECORD BATCH 1>
+ 3
+ 2
+ 4
+ 0
+ EOS
+
+Alternatively, if ``isDelta`` is set to false, then the dictionary
+replaces the existing dictionary for the same ID. Using the same
+example as above, an alternate encoding could be: ::
+
+
+ <SCHEMA>
+ <DICTIONARY 0>
+ (0) "A"
+ (1) "B"
+ (2) "C"
+
+ <RECORD BATCH 0>
+ 0
+ 1
+ 2
+ 1
+
+ <DICTIONARY 0>
+ (0) "A"
+ (1) "C"
+ (2) "D"
+ (3) "E"
+
+ <RECORD BATCH 1>
+ 2
+ 1
+ 3
+ 0
+ EOS
+
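+When a dictionary-encoded column is written with an IPC stream or file
+writer, the required ``DictionaryBatch`` messages are emitted for the
+reader automatically. A brief pyarrow sketch (delta and replacement
+dictionaries depend on writer options and are not shown here):
+
+.. code-block:: python
+
+   import pyarrow as pa
+   import pyarrow.ipc as ipc
+
+   values = pa.array(["A", "B", "C", "B"]).dictionary_encode()
+   batch = pa.record_batch([values], names=['letters'])
+
+   sink = pa.BufferOutputStream()
+   with ipc.new_stream(sink, batch.schema) as writer:
+       writer.write_batch(batch)
+
+   # The stream now contains: Schema, DictionaryBatch, RecordBatch, EOS.
+   table = ipc.open_stream(sink.getvalue()).read_all()
+   print(table.column('letters'))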
+
+Custom Application Metadata
+---------------------------
+
+We provide a ``custom_metadata`` field at three levels (``Field``,
+``Schema``, and ``Message``) as a mechanism for developers to pass
+application-specific metadata in Arrow protocol messages.
+
+The colon symbol ``:`` is to be used as a namespace separator. It can
+be used multiple times in a key.
+
+The ``ARROW`` pattern is a reserved namespace for internal Arrow use
+in the ``custom_metadata`` fields. For example,
+``ARROW:extension:name``.
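+
+For example, namespaced keys can be attached at the field and schema
+level with pyarrow (an illustrative sketch; the key names are
+hypothetical):
+
+.. code-block:: python
+
+   import pyarrow as pa
+
+   field = pa.field('location', pa.utf8()).with_metadata(
+       {'myorg:geo:crs': 'EPSG:4326'})          # ':' as namespace separator
+   schema = pa.schema([field]).with_metadata(
+       {'myorg:pipeline:version': '1.2.3'})
+
+   print(schema.metadata)
+   print(schema.field('location').metadata)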
+
+.. _format_metadata_extension_types:
+
+Extension Types
+---------------
+
+User-defined "extension" types can be defined by setting certain
+``KeyValue`` pairs in ``custom_metadata`` in the ``Field`` metadata
+structure. These extension keys are:
+
+* ``'ARROW:extension:name'`` for the string name identifying the
+ custom data type. We recommend that you use a "namespace"-style
+ prefix for extension type names to minimize the possibility of
+ conflicts with multiple Arrow readers and writers in the same
+ application. For example, use ``myorg.name_of_type`` instead of
+ simply ``name_of_type``
+* ``'ARROW:extension:metadata'`` for a serialized representation
+ of the ``ExtensionType`` necessary to reconstruct the custom type
+
+This extension metadata can annotate any of the built-in Arrow logical
+types. The intent is that an implementation that does not support an
+extension type can still handle the underlying data. For example, a
+16-byte UUID value could be embedded in ``FixedSizeBinary(16)``, and
+implementations that do not have this extension type can still work
+with the underlying binary values and pass along the
+``custom_metadata`` in subsequent Arrow protocol messages.
+
+Extension types may or may not use the
+``'ARROW:extension:metadata'`` field. Let's consider some example
+extension types:
+
+* ``uuid`` represented as ``FixedSizeBinary(16)`` with empty metadata
+* ``latitude-longitude`` represented as ``struct<latitude: double,
+ longitude: double>``, and empty metadata
+* ``tensor`` (multidimensional array) stored as ``Binary`` values and
+ having serialized metadata indicating the data type and shape of
+ each value. This could be JSON like ``{'type': 'int8', 'shape': [4,
+ 5]}`` for a 4x5 cell tensor.
+* ``trading-time`` represented as ``Timestamp`` with serialized
+ metadata indicating the market trading calendar the data corresponds
+ to
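+
+As a sketch of the first example above, a ``uuid`` extension type can
+be defined and registered in pyarrow roughly as follows. The
+``myorg.uuid`` name is a hypothetical, namespaced choice:
+
+.. code-block:: python
+
+   import pyarrow as pa
+
+   class UuidType(pa.ExtensionType):
+       """A 16-byte UUID stored as FixedSizeBinary(16)."""
+
+       def __init__(self):
+           super().__init__(pa.binary(16), 'myorg.uuid')
+
+       def __arrow_ext_serialize__(self):
+           # Becomes the 'ARROW:extension:metadata' value; empty here.
+           return b''
+
+       @classmethod
+       def __arrow_ext_deserialize__(cls, storage_type, serialized):
+           return cls()
+
+   pa.register_extension_type(UuidType())
+
+   storage = pa.array([b'0' * 16, b'1' * 16], type=pa.binary(16))
+   uuids = pa.ExtensionArray.from_storage(UuidType(), storage)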
+
+Implementation guidelines
+=========================
+
+An execution engine (or framework, or UDF executor, or storage engine,
+etc.) can implement only a subset of the Arrow spec and/or extend it
+given the following constraints:
+
+Implementing a subset of the spec
+---------------------------------
+
+* **If only producing (and not consuming) Arrow vectors**: Any subset
+  of the vector spec and the corresponding metadata can be implemented.
+* **If consuming and producing vectors**: There is a minimal subset of
+  vectors to be supported. Production of a subset of vectors and
+  their corresponding metadata is always fine. Consumption of vectors
+  should at least convert the unsupported input vectors to the
+  supported subset (for example Timestamp.millis to Timestamp.micros
+  or Int32 to Int64).
+
+Extensibility
+-------------
+
+An execution engine implementor can also extend their memory
+representation with their own vectors internally as long as they are
+never exposed. Before sending data to another system expecting Arrow
+data, these custom vectors should be converted to a type that exists
+in the Arrow spec.
+
+.. _Flatbuffers: http://github.com/google/flatbuffers
+.. _Flatbuffers protocol definition files: https://github.com/apache/arrow/tree/master/format
+.. _Schema.fbs: https://github.com/apache/arrow/blob/master/format/Schema.fbs
+.. _Message.fbs: https://github.com/apache/arrow/blob/master/format/Message.fbs
+.. _File.fbs: https://github.com/apache/arrow/blob/master/format/File.fbs
+.. _least-significant bit (LSB) numbering: https://en.wikipedia.org/wiki/Bit_numbering
+.. _Intel performance guide: https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors
+.. _Endianness: https://en.wikipedia.org/wiki/Endianness
+.. _SIMD: https://software.intel.com/en-us/cpp-compiler-developer-guide-and-reference-introduction-to-the-simd-data-layout-templates
+.. _Parquet: https://parquet.apache.org/documentation/latest/
diff --git a/src/arrow/docs/source/format/Flight.rst b/src/arrow/docs/source/format/Flight.rst
new file mode 100644
index 000000000..c79c56386
--- /dev/null
+++ b/src/arrow/docs/source/format/Flight.rst
@@ -0,0 +1,152 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _flight-rpc:
+
+Arrow Flight RPC
+================
+
+Arrow Flight is an RPC framework for high-performance data services
+based on Arrow data, and is built on top of gRPC_ and the :doc:`IPC
+format <IPC>`.
+
+Flight is organized around streams of Arrow record batches that are
+either downloaded from or uploaded to another service. A set of
+metadata methods offers discovery and introspection of streams, as
+well as the ability to implement application-specific methods.
+
+Methods and message wire formats are defined by Protobuf, enabling
+interoperability with clients that may support gRPC and Arrow
+separately, but not Flight. However, Flight implementations include
+further optimizations to avoid overhead in usage of Protobuf (mostly
+around avoiding excessive memory copies).
+
+.. _gRPC: https://grpc.io/
+
+RPC Methods
+-----------
+
+Flight defines a set of RPC methods for uploading/downloading data,
+retrieving metadata about a data stream, listing available data
+streams, and for implementing application-specific RPC methods. A
+Flight service implements some subset of these methods, while a Flight
+client can call any of these methods. Thus, one Flight client can
+connect to any Flight service and perform basic operations.
+
+Data streams are identified by descriptors, which are either a path or
+an arbitrary binary command. A client that wishes to download the data
+would:
+
+#. Construct or acquire a ``FlightDescriptor`` for the data set they
+ are interested in. A client may know what descriptor they want
+ already, or they may use methods like ``ListFlights`` to discover
+ them.
+#. Call ``GetFlightInfo(FlightDescriptor)`` to get a ``FlightInfo``
+ message containing details on where the data is located (as well as
+ other metadata, like the schema and possibly an estimate of the
+ dataset size).
+
+ Flight does not require that data live on the same server as
+ metadata: this call may list other servers to connect to. The
+ ``FlightInfo`` message includes a ``Ticket``, an opaque binary
+ token that the server uses to identify the exact data set being
+ requested.
+#. Connect to other servers (if needed).
+#. Call ``DoGet(Ticket)`` to get back a stream of Arrow record
+ batches.
+
+To upload data, a client would:
+
+#. Construct or acquire a ``FlightDescriptor``, as before.
+#. Call ``DoPut(FlightData)`` and upload a stream of Arrow record
+ batches. They would also include the ``FlightDescriptor`` with the
+ first message.
+
+See `Protocol Buffer Definitions`_ for full details on the methods and
+messages involved.
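+
+The download sequence above might look as follows with the pyarrow
+Flight client (a sketch only; the location and path are placeholders):
+
+.. code-block:: python
+
+   import pyarrow.flight as flight
+
+   client = flight.connect('grpc://localhost:8815')   # placeholder location
+
+   # 1. Construct a descriptor for the data set of interest.
+   descriptor = flight.FlightDescriptor.for_path('example.parquet')
+
+   # 2. Ask the service where the data lives and how to fetch it.
+   info = client.get_flight_info(descriptor)
+
+   # 3./4. For each endpoint, connect if needed and fetch the stream.
+   tables = []
+   for endpoint in info.endpoints:
+       # An empty locations list means the data is served by this server.
+       data_client = (flight.connect(endpoint.locations[0])
+                      if endpoint.locations else client)
+       tables.append(data_client.do_get(endpoint.ticket).read_all())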
+
+Authentication
+--------------
+
+Flight supports application-implemented authentication
+methods. Authentication, if enabled, has two phases: at connection
+time, the client and server can exchange any number of messages. Then,
+the client can provide a token alongside each call, and the server can
+validate that token.
+
+Applications may use any part of this; for instance, they may ignore
+the initial handshake and send an externally acquired token on each
+call, or they may establish trust during the handshake and not
+validate a token for each call. (Note that the latter is not secure if
+you choose to deploy a layer 7 load balancer, as is common with gRPC.)
+
+Error Handling
+--------------
+
+Arrow Flight defines its own set of error codes. The implementation
+differs between languages (e.g. in C++, Unimplemented is a general
+Arrow error status while it's a Flight-specific exception in Java),
+but the following set is exposed:
+
++----------------+-------------------------------------------+
+|Error Code |Description |
++================+===========================================+
+|UNKNOWN |An unknown error. The default if no other |
+| |error applies. |
++----------------+-------------------------------------------+
+|INTERNAL |An error internal to the service |
+| |implementation occurred. |
++----------------+-------------------------------------------+
+|INVALID_ARGUMENT|The client passed an invalid argument to |
+| |the RPC. |
++----------------+-------------------------------------------+
+|TIMED_OUT |The operation exceeded a timeout or |
+| |deadline. |
++----------------+-------------------------------------------+
+|NOT_FOUND |The requested resource (action, data |
+| |stream) was not found. |
++----------------+-------------------------------------------+
+|ALREADY_EXISTS |The resource already exists. |
++----------------+-------------------------------------------+
+|CANCELLED |The operation was cancelled (either by the |
+| |client or the server). |
++----------------+-------------------------------------------+
+|UNAUTHENTICATED |The client is not authenticated. |
++----------------+-------------------------------------------+
+|UNAUTHORIZED |The client is authenticated, but does not |
+| |have permissions for the requested |
+| |operation. |
++----------------+-------------------------------------------+
+|UNIMPLEMENTED |The RPC is not implemented. |
++----------------+-------------------------------------------+
+|UNAVAILABLE |The server is not available. May be emitted|
+| |by the client for connectivity reasons. |
++----------------+-------------------------------------------+
+
+
+External Resources
+------------------
+
+- https://arrow.apache.org/blog/2018/10/09/0.11.0-release/
+- https://www.slideshare.net/JacquesNadeau5/apache-arrow-flight-overview
+
+Protocol Buffer Definitions
+---------------------------
+
+.. literalinclude:: ../../../format/Flight.proto
+ :language: protobuf
+ :linenos:
diff --git a/src/arrow/docs/source/format/Guidelines.rst b/src/arrow/docs/source/format/Guidelines.rst
new file mode 100644
index 000000000..40624521a
--- /dev/null
+++ b/src/arrow/docs/source/format/Guidelines.rst
@@ -0,0 +1,24 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+:orphan:
+
+Implementation Guidelines
+=========================
+
+The contents of this document have relocated to the main :ref:`Columnar
+Specification <format_columnar>` page.
diff --git a/src/arrow/docs/source/format/IPC.rst b/src/arrow/docs/source/format/IPC.rst
new file mode 100644
index 000000000..65b47f7d7
--- /dev/null
+++ b/src/arrow/docs/source/format/IPC.rst
@@ -0,0 +1,24 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+:orphan:
+
+IPC
+===
+
+The contents of this document have relocated to the main :ref:`Columnar
+Specification <format_columnar>` page.
diff --git a/src/arrow/docs/source/format/Integration.rst b/src/arrow/docs/source/format/Integration.rst
new file mode 100644
index 000000000..22d595e99
--- /dev/null
+++ b/src/arrow/docs/source/format/Integration.rst
@@ -0,0 +1,398 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _format_integration_testing:
+
+Integration Testing
+===================
+
+Our strategy for integration testing between Arrow implementations is:
+
+* Test datasets are specified in a custom human-readable, JSON-based format
+ designed exclusively for Arrow's integration tests
+* Each implementation provides a testing executable capable of converting
+ between the JSON and the binary Arrow file representation
+* The test executable is also capable of validating the contents of a binary
+ file against a corresponding JSON file
+
+Running integration tests
+-------------------------
+
+The integration test data generator and runner are implemented inside
+the :ref:`Archery <archery>` utility.
+
+The integration tests are run using the ``archery integration`` command.
+
+.. code-block:: shell
+
+ archery integration --help
+
+In order to run integration tests, you'll first need to build each component
+you want to include. See the respective developer docs for C++, Java, etc.
+for instructions on building those.
+
+Some languages may require additional build options to enable integration
+testing. For C++, for example, you need to add ``-DARROW_BUILD_INTEGRATION=ON``
+to your cmake command.
+
+Depending on which components you have built, you can enable and add them to
+the archery test run. For example, if you only have the C++ project built, run:
+
+.. code-block:: shell
+
+ archery integration --with-cpp=1
+
+
+For Java, it may look like:
+
+.. code-block:: shell
+
+ VERSION=0.11.0-SNAPSHOT
+ export ARROW_JAVA_INTEGRATION_JAR=$JAVA_DIR/tools/target/arrow-tools-$VERSION-jar-with-dependencies.jar
+ archery integration --with-cpp=1 --with-java=1
+
+To run all tests, including Flight integration tests, do:
+
+.. code-block:: shell
+
+ archery integration --with-all --run-flight
+
+Note that we run these tests in continuous integration, and the CI job uses
+docker-compose. You may also run the docker-compose job locally, or at least
+refer to it if you have questions about how to build other languages or enable
+certain tests.
+
+See :ref:`docker-builds` for more information about the project's
+``docker-compose`` configuration.
+
+JSON test data format
+---------------------
+
+A JSON representation of Arrow columnar data is provided for
+cross-language integration testing purposes.
+This representation is `not canonical <https://lists.apache.org/thread.html/6947fb7666a0f9cc27d9677d2dad0fb5990f9063b7cf3d80af5e270f%40%3Cdev.arrow.apache.org%3E>`_
+but it provides a human-readable way of verifying language implementations.
+
+See `here <https://github.com/apache/arrow/tree/master/docs/source/format/integration_json_examples>`_
+for some examples of this JSON data.
+
+.. can we check in more examples, e.g. from the generated_*.json test files?
+
+The high-level structure of a JSON integration test file is as follows:
+
+**Data file** ::
+
+ {
+ "schema": /*Schema*/,
+ "batches": [ /*RecordBatch*/ ],
+ "dictionaries": [ /*DictionaryBatch*/ ],
+ }
+
+All files contain ``schema`` and ``batches``, while ``dictionaries`` is only
+present if there are dictionary type fields in the schema.
+
+**Schema** ::
+
+ {
+ "fields" : [
+ /* Field */
+ ],
+ "metadata" : /* Metadata */
+ }
+
+**Field** ::
+
+ {
+ "name" : "name_of_the_field",
+ "nullable" : /* boolean */,
+ "type" : /* Type */,
+ "children" : [ /* Field */ ],
+ "dictionary": {
+ "id": /* integer */,
+ "indexType": /* Type */,
+ "isOrdered": /* boolean */
+ },
+ "metadata" : /* Metadata */
+ }
+
+The ``dictionary`` attribute is present if and only if the ``Field`` corresponds to a
+dictionary type, and its ``id`` maps onto a column in the ``DictionaryBatch``. In this
+case the ``type`` attribute describes the value type of the dictionary.
+
+For primitive types, ``children`` is an empty array.
+
+**Metadata** ::
+
+ null |
+ [ {
+ "key": /* string */,
+ "value": /* string */
+ } ]
+
+A key-value mapping of custom metadata. It may be omitted or null, in which case it is
+considered equivalent to ``[]`` (no metadata). Duplicated keys are not forbidden here.
+
+**Type**: ::
+
+ {
+ "name" : "null|struct|list|largelist|fixedsizelist|union|int|floatingpoint|utf8|largeutf8|binary|largebinary|fixedsizebinary|bool|decimal|date|time|timestamp|interval|duration|map"
+ }
+
+A ``Type`` will have other fields as defined in
+`Schema.fbs <https://github.com/apache/arrow/tree/master/format/Schema.fbs>`_
+depending on its name.
+
+Int: ::
+
+ {
+ "name" : "int",
+ "bitWidth" : /* integer */,
+ "isSigned" : /* boolean */
+ }
+
+FloatingPoint: ::
+
+ {
+ "name" : "floatingpoint",
+ "precision" : "HALF|SINGLE|DOUBLE"
+ }
+
+FixedSizeBinary: ::
+
+ {
+ "name" : "fixedsizebinary",
+ "byteWidth" : /* byte width */
+ }
+
+Decimal: ::
+
+ {
+ "name" : "decimal",
+ "precision" : /* integer */,
+ "scale" : /* integer */
+ }
+
+Timestamp: ::
+
+ {
+ "name" : "timestamp",
+ "unit" : "$TIME_UNIT",
+ "timezone": "$timezone"
+ }
+
+``$TIME_UNIT`` is one of ``"SECOND|MILLISECOND|MICROSECOND|NANOSECOND"``
+
+"timezone" is an optional string.
+
+Duration: ::
+
+ {
+ "name" : "duration",
+ "unit" : "$TIME_UNIT"
+ }
+
+Date: ::
+
+ {
+ "name" : "date",
+ "unit" : "DAY|MILLISECOND"
+ }
+
+Time: ::
+
+ {
+ "name" : "time",
+ "unit" : "$TIME_UNIT",
+ "bitWidth": /* integer: 32 or 64 */
+ }
+
+Interval: ::
+
+ {
+ "name" : "interval",
+ "unit" : "YEAR_MONTH|DAY_TIME"
+ }
+
+Union: ::
+
+ {
+ "name" : "union",
+ "mode" : "SPARSE|DENSE",
+ "typeIds" : [ /* integer */ ]
+ }
+
+The ``typeIds`` field in ``Union`` lists the codes used to denote which
+member of the union is active in each array slot. Note that in general
+these discriminants are not identical to the index of the corresponding
+child array.
+
+List: ::
+
+ {
+ "name": "list"
+ }
+
+The type that the list is a "list of" will be included in the ``Field``'s
+"children" member, as a single ``Field`` there. For example, for a list of
+``int32``, ::
+
+ {
+ "name": "list_nullable",
+ "type": {
+ "name": "list"
+ },
+ "nullable": true,
+ "children": [
+ {
+ "name": "item",
+ "type": {
+ "name": "int",
+ "isSigned": true,
+ "bitWidth": 32
+ },
+ "nullable": true,
+ "children": []
+ }
+ ]
+ }
+
+FixedSizeList: ::
+
+ {
+ "name": "fixedsizelist",
+ "listSize": /* integer */
+ }
+
+This type likewise comes with a length-1 "children" array.
+
+Struct: ::
+
+ {
+ "name": "struct"
+ }
+
+The ``Field``'s "children" contains an array of ``Fields`` with meaningful
+names and types.
+
+Map: ::
+
+ {
+ "name": "map",
+ "keysSorted": /* boolean */
+ }
+
+The ``Field``'s "children" contains a single ``struct`` field, which itself
+contains 2 children, named "key" and "value".
+
+Null: ::
+
+ {
+ "name": "null"
+ }
+
+Extension types are, as in the IPC format, represented as their underlying
+storage type plus some dedicated field metadata to reconstruct the extension
+type. For example, assuming a "uuid" extension type backed by a
+FixedSizeBinary(16) storage, here is how a "uuid" field would be represented::
+
+ {
+ "name" : "name_of_the_field",
+ "nullable" : /* boolean */,
+ "type" : {
+ "name" : "fixedsizebinary",
+ "byteWidth" : 16
+ },
+ "children" : [],
+ "metadata" : [
+ {"key": "ARROW:extension:name", "value": "uuid"},
+ {"key": "ARROW:extension:metadata", "value": "uuid-serialized"}
+ ]
+ }
+
+**RecordBatch**::
+
+ {
+ "count": /* integer number of rows */,
+ "columns": [ /* FieldData */ ]
+ }
+
+**DictionaryBatch**::
+
+ {
+ "id": /* integer */,
+ "data": [ /* RecordBatch */ ]
+ }
+
+**FieldData**::
+
+ {
+ "name": "field_name",
+ "count": "field_length",
+ "$BUFFER_TYPE": /* BufferData */
+ ...
+ "$BUFFER_TYPE": /* BufferData */
+ "children": [ /* FieldData */ ]
+ }
+
+The "name" member of a ``Field`` in the ``Schema`` corresponds to the "name"
+of a ``FieldData`` contained in the "columns" of a ``RecordBatch``.
+For nested types (list, struct, etc.), ``Field``'s "children" each have a
+"name" that corresponds to the "name" of a ``FieldData`` inside the
+"children" of that ``FieldData``.
+For ``FieldData`` inside of a ``DictionaryBatch``, the "name" field does not
+correspond to anything.
+
+Here ``$BUFFER_TYPE`` is one of ``VALIDITY``, ``OFFSET`` (for
+variable-length types, such as strings and lists), ``TYPE_ID`` (for unions),
+or ``DATA``.
+
+``BufferData`` is encoded based on the type of buffer:
+
+* ``VALIDITY``: a JSON array of 1 (valid) and 0 (null). Data for non-nullable
+ ``Field`` still has a ``VALIDITY`` array, even though all values are 1.
+* ``OFFSET``: a JSON array of integers for 32-bit offsets or
+ string-formatted integers for 64-bit offsets
+* ``TYPE_ID``: a JSON array of integers
+* ``DATA``: a JSON array of encoded values
+
+The value encoding for ``DATA`` is different depending on the logical
+type:
+
+* For boolean type: an array of 1 (true) and 0 (false).
+* For integer-based types (including timestamps): an array of JSON numbers.
+* For 64-bit integers: an array of integers formatted as JSON strings,
+ so as to avoid loss of precision.
+* For floating point types: an array of JSON numbers. Values are limited
+ to 3 decimal places to avoid loss of precision.
+* For binary types, an array of uppercase hex-encoded strings, so as
+ to represent arbitrary binary data.
+* For UTF-8 string types, an array of JSON strings.
+
+For "list" and "largelist" types, ``BufferData`` has ``VALIDITY`` and
+``OFFSET``, and the rest of the data is inside "children". These child
+``FieldData`` contain all of the same attributes as non-child data, so in
+the example of a list of ``int32``, the child data has ``VALIDITY`` and
+``DATA``.
+
+For "fixedsizelist", there is no ``OFFSET`` member because the offsets are
+implied by the field's "listSize".
+
+Note that the "count" for these child data may not match the parent "count".
+For example, if a ``RecordBatch`` has 7 rows and contains a ``FixedSizeList``
+of ``listSize`` 4, then the data inside the "children" of that ``FieldData``
+will have count 28.
+
+For "null" type, ``BufferData`` does not contain any buffers.
diff --git a/src/arrow/docs/source/format/Layout.rst b/src/arrow/docs/source/format/Layout.rst
new file mode 100644
index 000000000..4568f31c5
--- /dev/null
+++ b/src/arrow/docs/source/format/Layout.rst
@@ -0,0 +1,24 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+:orphan:
+
+Physical Memory Layout
+======================
+
+The contents of this document have relocated to the main :ref:`Columnar
+Specification <format_columnar>` page.
diff --git a/src/arrow/docs/source/format/Metadata.rst b/src/arrow/docs/source/format/Metadata.rst
new file mode 100644
index 000000000..55045abb0
--- /dev/null
+++ b/src/arrow/docs/source/format/Metadata.rst
@@ -0,0 +1,24 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+:orphan:
+
+Format Metadata
+===============
+
+The contents of this document have relocated to the main :ref:`Columnar
+Specification <format_columnar>` page.
diff --git a/src/arrow/docs/source/format/Other.rst b/src/arrow/docs/source/format/Other.rst
new file mode 100644
index 000000000..9504998d6
--- /dev/null
+++ b/src/arrow/docs/source/format/Other.rst
@@ -0,0 +1,63 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Other Data Structures
+=====================
+
+Our Flatbuffers protocol files have metadata for some other data
+structures defined to allow other kinds of applications to take
+advantage of common interprocess communication machinery. These data
+structures are not considered to be part of the columnar format.
+
+An Arrow columnar implementation is not required to implement these
+types.
+
+Tensor (Multi-dimensional Array)
+--------------------------------
+
+The ``Tensor`` message type provides a way to write a
+multidimensional array of fixed-size values (such as a NumPy ndarray).
+
+When writing a standalone encapsulated tensor message, we use the
+encapsulated IPC format defined in the :ref:`Columnar Specification
+<format_columnar>`, but additionally align the starting offset of the
+tensor body to be a multiple of 64 bytes: ::
+
+ <metadata prefix and metadata>
+ <PADDING>
+ <tensor body>
+
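+For example, pyarrow can write and read a standalone tensor message
+with the required alignment (a sketch using an in-memory sink):
+
+.. code-block:: python
+
+   import numpy as np
+   import pyarrow as pa
+   import pyarrow.ipc as ipc
+
+   tensor = pa.Tensor.from_numpy(np.arange(20, dtype=np.int32).reshape(4, 5))
+
+   sink = pa.BufferOutputStream()
+   ipc.write_tensor(tensor, sink)        # metadata, padding, then the body
+   buf = sink.getvalue()
+
+   restored = ipc.read_tensor(pa.BufferReader(buf))
+   print(restored.shape)                 # (4, 5)
+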
+Sparse Tensor
+-------------
+
+``SparseTensor`` represents a multidimensional array whose elements
+are generally almost all zeros.
+
+When writing a standalone encapsulated sparse tensor message, we use
+the encapsulated IPC format defined in the :ref:`Columnar Specification
+<format_columnar>`, but additionally align the starting offsets of the
+sparse index and the sparse tensor body (if writing to a shared memory
+region) to be multiples of 64 bytes: ::
+
+ <metadata prefix and metadata>
+ <PADDING>
+ <sparse index>
+ <PADDING>
+ <sparse tensor body>
+
+The contents of the sparse tensor index depend on what kind of sparse
+format is used.
diff --git a/src/arrow/docs/source/format/README.md b/src/arrow/docs/source/format/README.md
new file mode 100644
index 000000000..68a2d72b5
--- /dev/null
+++ b/src/arrow/docs/source/format/README.md
@@ -0,0 +1,24 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Apache Arrow Format Documentation
+
+These documents go together with the Flatbuffers and Protocol Buffers
+protocol definition files to provide the detail necessary to
+build a new Arrow implementation. \ No newline at end of file
diff --git a/src/arrow/docs/source/format/Versioning.rst b/src/arrow/docs/source/format/Versioning.rst
new file mode 100644
index 000000000..b70656987
--- /dev/null
+++ b/src/arrow/docs/source/format/Versioning.rst
@@ -0,0 +1,70 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Format Versioning and Stability
+===============================
+
+Starting with version 1.0.0, Apache Arrow utilizes
+**two versions** to describe each release of the project:
+the **Format Version** and the **Library Version**. Each Library
+Version has a corresponding Format Version, and multiple versions of
+the library may have the same format version. For example, library
+versions 2.0.0 and 3.0.0 may both track format version 1.0.0.
+
+For library versions prior to 1.0.0, major releases may contain API
+changes. From 1.0.0 onward, we follow `Semantic Versioning
+<https://semver.org/>`_ with regards to communicating API changes. We
+expect most releases to be major library releases.
+
+Backward Compatibility
+----------------------
+
+A newer versioned client library will be able to read any data and
+metadata produced by an older client library.
+
+So long as the **major** format version is not changed, a newer
+library is backward compatible with an older library.
+
+Forward Compatibility
+---------------------
+
+An older client library must be able to either read data generated
+from a new client library or detect that it cannot properly read the
+data.
+
+An increase in the **minor** version of the format version, such as
+1.0.0 to 1.1.0, indicates that 1.1.0 contains new features not
+available in 1.0.0. So long as these features are not used (such as a
+new logical data type), forward compatibility is preserved.
+
+Long-Term Stability
+-------------------
+
+A change in the format major version (e.g. from 1.0.0 to 2.0.0)
+indicates a disruption to these compatibility guarantees in some way.
+We **do not expect** this to be a frequent occurrence.
+This would be an exceptional
+event and, should this come to pass, we would exercise caution in
+ensuring that production applications are not harmed.
+
+Pre-1.0.0 Versions
+------------------
+
+We made no forward or backward compatibility guarantees for
+versions prior to 1.0.0. However, we made every effort to ensure
+that new clients can read serialized data produced by library version
+0.8.0 and onward.
diff --git a/src/arrow/docs/source/format/integration_json_examples/simple.json b/src/arrow/docs/source/format/integration_json_examples/simple.json
new file mode 100644
index 000000000..663472919
--- /dev/null
+++ b/src/arrow/docs/source/format/integration_json_examples/simple.json
@@ -0,0 +1,98 @@
+{
+ "schema": {
+ "fields": [
+ {
+ "name": "foo",
+ "type": {"name": "int", "isSigned": true, "bitWidth": 32},
+ "nullable": true,
+ "children": []
+ },
+ {
+ "name": "bar",
+ "type": {"name": "floatingpoint", "precision": "DOUBLE"},
+ "nullable": true,
+ "children": []
+ },
+ {
+ "name": "baz",
+ "type": {"name": "utf8"},
+ "nullable": true,
+ "children": []
+ }
+ ]
+ },
+ "batches": [
+ {
+ "count": 5,
+ "columns": [
+ {
+ "name": "foo",
+ "count": 5,
+ "VALIDITY": [1, 0, 1, 1, 1],
+ "DATA": [1, 2, 3, 4, 5]
+ },
+ {
+ "name": "bar",
+ "count": 5,
+ "VALIDITY": [1, 0, 0, 1, 1],
+ "DATA": [1.0, 2.0, 3.0, 4.0, 5.0]
+ },
+ {
+ "name": "baz",
+ "count": 5,
+ "VALIDITY": [1, 0, 0, 1, 1],
+ "OFFSET": [0, 2, 2, 2, 5, 9],
+ "DATA": ["aa", "", "", "bbb", "cccc"]
+ }
+ ]
+ },
+ {
+ "count": 5,
+ "columns": [
+ {
+ "name": "foo",
+ "count": 5,
+ "VALIDITY": [1, 1, 1, 1, 1],
+ "DATA": [1, 2, 3, 4, 5]
+ },
+ {
+ "name": "bar",
+ "count": 5,
+ "VALIDITY": [1, 1, 1, 1, 1],
+ "DATA": [1.0, 2.0, 3.0, 4.0, 5.0]
+ },
+ {
+ "name": "baz",
+ "count": 5,
+ "VALIDITY": [1, 1, 1, 1, 1],
+ "OFFSET": [0, 2, 3, 4, 7, 11],
+ "DATA": ["aa", "b", "c", "ddd", "eeee"]
+ }
+ ]
+ },
+ {
+ "count": 5,
+ "columns": [
+ {
+ "name": "foo",
+ "count": 5,
+ "VALIDITY": [0, 0, 0, 0, 0],
+ "DATA": [1, 2, 3, 4, 5]
+ },
+ {
+ "name": "bar",
+ "count": 5,
+ "VALIDITY": [0, 0, 0, 0, 0],
+ "DATA": [1.0, 2.0, 3.0, 4.0, 5.0]
+ },
+ {
+ "name": "baz",
+ "count": 5,
+ "VALIDITY": [0, 0, 0, 0, 0],
+ "OFFSET": [0, 0, 0, 0, 0, 0],
+ "DATA": ["", "", "", "", ""]
+ }
+ ]
+ }
+ ]
+}
diff --git a/src/arrow/docs/source/format/integration_json_examples/struct.json b/src/arrow/docs/source/format/integration_json_examples/struct.json
new file mode 100644
index 000000000..4e6cc774e
--- /dev/null
+++ b/src/arrow/docs/source/format/integration_json_examples/struct.json
@@ -0,0 +1,201 @@
+{
+ "schema": {
+ "fields": [
+ {
+ "name": "struct_nullable",
+ "type": {
+ "name": "struct"
+ },
+ "nullable": true,
+ "children": [
+ {
+ "name": "f1",
+ "type": {
+ "name": "int",
+ "isSigned": true,
+ "bitWidth": 32
+ },
+ "nullable": true,
+ "children": []
+ },
+ {
+ "name": "f2",
+ "type": {
+ "name": "utf8"
+ },
+ "nullable": true,
+ "children": []
+ }
+ ]
+ }
+ ]
+ },
+ "batches": [
+ {
+ "count": 7,
+ "columns": [
+ {
+ "name": "struct_nullable",
+ "count": 7,
+ "VALIDITY": [
+ 0,
+ 1,
+ 1,
+ 1,
+ 0,
+ 1,
+ 0
+ ],
+ "children": [
+ {
+ "name": "f1",
+ "count": 7,
+ "VALIDITY": [
+ 1,
+ 0,
+ 1,
+ 1,
+ 1,
+ 0,
+ 0
+ ],
+ "DATA": [
+ 1402032511,
+ 290876774,
+ 137773603,
+ 410361374,
+ 1959836418,
+ 1995074679,
+ -163525262
+ ]
+ },
+ {
+ "name": "f2",
+ "count": 7,
+ "VALIDITY": [
+ 0,
+ 1,
+ 1,
+ 1,
+ 0,
+ 1,
+ 0
+ ],
+ "OFFSET": [
+ 0,
+ 0,
+ 7,
+ 14,
+ 21,
+ 21,
+ 28,
+ 28
+ ],
+ "DATA": [
+ "",
+ "MhRNxD4",
+ "3F9HBxK",
+ "aVd88fp",
+ "",
+ "3loZrRf",
+ ""
+ ]
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "count": 10,
+ "columns": [
+ {
+ "name": "struct_nullable",
+ "count": 10,
+ "VALIDITY": [
+ 0,
+ 1,
+ 1,
+ 0,
+ 1,
+ 0,
+ 0,
+ 1,
+ 1,
+ 1
+ ],
+ "children": [
+ {
+ "name": "f1",
+ "count": 10,
+ "VALIDITY": [
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 1,
+ 0,
+ 0,
+ 0
+ ],
+ "DATA": [
+ -2041500147,
+ 1715692943,
+ -35444996,
+ 1425496657,
+ 112765084,
+ 1760754983,
+ 413888857,
+ 2039738337,
+ -1924327700,
+ 670528518
+ ]
+ },
+ {
+ "name": "f2",
+ "count": 10,
+ "VALIDITY": [
+ 1,
+ 0,
+ 0,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 0
+ ],
+ "OFFSET": [
+ 0,
+ 7,
+ 7,
+ 7,
+ 14,
+ 21,
+ 28,
+ 35,
+ 42,
+ 49,
+ 49
+ ],
+ "DATA": [
+ "AS5oARE",
+ "",
+ "",
+ "JGdagcX",
+ "78SLiRw",
+ "vbGf7OY",
+ "5uh5fTs",
+ "0ilsf82",
+ "LjS9MbU",
+ ""
+ ]
+ }
+ ]
+ }
+ ]
+ }
+ ]
+}
diff --git a/src/arrow/docs/source/index.rst b/src/arrow/docs/source/index.rst
new file mode 100644
index 000000000..90d6ac09b
--- /dev/null
+++ b/src/arrow/docs/source/index.rst
@@ -0,0 +1,96 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Apache Arrow
+============
+
+Apache Arrow is a development platform for in-memory analytics. It contains a
+set of technologies that enable big data systems to process and move data
+fast. It specifies a standardized language-independent columnar memory format
+for flat and hierarchical data, organized for efficient analytic operations on
+modern hardware.
+
+The project is developing a multi-language collection of libraries for solving
+systems problems related to in-memory analytical data processing. This includes
+such topics as:
+
+* Zero-copy shared memory and RPC-based data movement
+* Reading and writing file formats (like CSV, Apache ORC, and Apache Parquet)
+* In-memory analytics and query processing
+
+**To learn how to use Arrow refer to the documentation specific to your
+target environment.**
+
+.. _toc.usage:
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Supported Environments
+
+ C/GLib <c_glib/index>
+ C++ <cpp/index>
+ C# <https://github.com/apache/arrow/blob/master/csharp/README.md>
+ Go <https://godoc.org/github.com/apache/arrow/go/arrow>
+ Java <java/index>
+ JavaScript <js/index>
+ Julia <https://github.com/apache/arrow/blob/master/julia/Arrow/README.md>
+ MATLAB <https://github.com/apache/arrow/blob/master/matlab/README.md>
+ Python <python/index>
+ R <r/index>
+ Ruby <https://github.com/apache/arrow/blob/master/ruby/README.md>
+ Rust <https://docs.rs/crate/arrow/>
+ status
+
+.. _toc.cookbook:
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Cookbooks
+
+ C++ <https://arrow.apache.org/cookbook/cpp/>
+ Python <https://arrow.apache.org/cookbook/py/>
+ R <https://arrow.apache.org/cookbook/r/>
+
+.. _toc.columnar:
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Specifications and Protocols
+
+ format/Versioning
+ format/Columnar
+ format/Flight
+ format/Integration
+ format/CDataInterface
+ format/CStreamInterface
+ format/Other
+
+.. _toc.development:
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Development
+
+ developers/contributing
+ developers/cpp/index
+ developers/python
+ developers/archery
+ developers/crossbow
+ developers/docker
+ developers/benchmarks
+ developers/documentation
+ developers/computeir
diff --git a/src/arrow/docs/source/java/algorithm.rst b/src/arrow/docs/source/java/algorithm.rst
new file mode 100644
index 000000000..f838398af
--- /dev/null
+++ b/src/arrow/docs/source/java/algorithm.rst
@@ -0,0 +1,92 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Java Algorithms
+===============
+
+Arrow's Java library provides algorithms for some commonly-used
+functionalities. The algorithms are provided in the ``org.apache.arrow.algorithm``
+package of the ``algorithm`` module.
+
+Comparing Vector Elements
+-------------------------
+
+Comparing vector elements is the basis for many algorithms. Vector
+elements can be compared in one of two ways:
+
+1. **Equality comparison**: there are two possible results for this type of comparison: ``equal`` and ``unequal``.
+Currently, this type of comparison is supported through the ``org.apache.arrow.vector.compare.VectorValueEqualizer``
+interface.
+
+2. **Ordering comparison**: there are three possible results for this type of comparison: ``less than``, ``equal to``
+and ``greater than``. This comparison is supported by the abstract class ``org.apache.arrow.algorithm.sort.VectorValueComparator``.
+
+We provide default implementations to compare vector elements. However, users can also define ways
+for customized comparisons.
+
+Vector Element Search
+---------------------
+
+A search algorithm tries to find a particular value in a vector. When successful, a vector index is
+returned; otherwise, a ``-1`` is returned. The following search algorithms are provided:
+
+1. **Linear search**: this algorithm simply traverses the vector from the beginning, until a match is
+found, or the end of the vector is reached. So it takes ``O(n)`` time, where ``n`` is the number of elements
+in the vector. This algorithm is implemented in ``org.apache.arrow.algorithm.search.VectorSearcher#linearSearch``.
+
+2. **Binary search**: this represents a more efficient search algorithm, as it runs in ``O(log(n))`` time.
+However, it is only applicable to sorted vectors. To get a sorted vector,
+one can use one of our sorting algorithms, which will be discussed in the next section. This algorithm
+is implemented in ``org.apache.arrow.algorithm.search.VectorSearcher#binarySearch``.
+
+3. **Parallel search**: when the vector is large, it takes a long time to traverse the elements to search
+for a value. To make this process faster, one can split the vector into multiple partitions, and perform the
+search for each partition in parallel. This is supported by ``org.apache.arrow.algorithm.search.ParallelSearcher``.
+
+4. **Range search**: for many scenarios, there can be multiple matching values in the vector.
+If the vector is sorted, the matching values reside in a contiguous region in the vector. The
+range search algorithm tries to find the upper/lower bound of the region in ``O(log(n))`` time.
+An implementation is provided in ``org.apache.arrow.algorithm.search.VectorRangeSearcher``.
+
+Vector Sorting
+--------------
+
+Given a vector, a sorting algorithm turns it into a sorted one. The sorting criteria must
+be specified by some ordering comparison operation. The sorting algorithms can be
+classified into the following categories:
+
+1. **In-place sorter**: an in-place sorter performs the sorting by manipulating the original
+vector, without creating any new vector. So it just returns the original vector after the sorting operations.
+Currently, we have ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter`` for in-place
+sorting in ``O(nlog(n))`` time. As the name suggests, it only supports fixed width vectors.
+
+2. **Out-of-place sorter**: an out-of-place sorter does not mutate the original vector. Instead,
+it copies vector elements to a new vector in sorted order, and returns the new vector.
+We have ``org.apache.arrow.algorithm.sort.FixedWidthOutOfPlaceVectorSorter``
+and ``org.apache.arrow.algorithm.sort.VariableWidthOutOfPlaceVectorSorter``
+for fixed width and variable width vectors, respectively. Both algorithms run in ``O(nlog(n))`` time.
+
+3. **Index sorter**: this sorter does not actually sort the vector. Instead, it returns an integer
+vector whose values are the indices of the vector elements in sorted order. With the index vector, one can
+easily construct a sorted vector. In addition, some other tasks can be easily achieved, like finding the ``k``-th
+smallest value in the vector. Index sorting is supported by ``org.apache.arrow.algorithm.sort.IndexSorter``,
+which runs in ``O(nlog(n))`` time. It is applicable to vectors of any type.
+
+Other Algorithms
+----------------
+
+Other algorithms include vector deduplication, dictionary encoding, etc., in the ``algorithm`` module.
diff --git a/src/arrow/docs/source/java/index.rst b/src/arrow/docs/source/java/index.rst
new file mode 100644
index 000000000..65a7a3a4f
--- /dev/null
+++ b/src/arrow/docs/source/java/index.rst
@@ -0,0 +1,31 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Java Implementation
+===================
+
+This is the documentation of the Java API of Apache Arrow. For more details
+on the Arrow format and other language bindings see the :doc:`parent documentation <../index>`.
+
+.. toctree::
+ :maxdepth: 2
+
+ vector
+ vector_schema_root
+ ipc
+ algorithm
+ Reference (javadoc) <reference/index>
diff --git a/src/arrow/docs/source/java/ipc.rst b/src/arrow/docs/source/java/ipc.rst
new file mode 100644
index 000000000..7cab480c4
--- /dev/null
+++ b/src/arrow/docs/source/java/ipc.rst
@@ -0,0 +1,187 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+===========================
+Reading/Writing IPC formats
+===========================
+Arrow defines two types of binary formats for serializing record batches:
+
+* **Streaming format**: for sending an arbitrary number of record
+ batches. The format must be processed from start to end, and does not support
+ random access
+
+* **File or Random Access format**: for serializing a fixed number of record
+ batches. It supports random access, and thus is very useful when used with
+ memory maps
+
+Writing and Reading Streaming Format
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+First, let's populate a :class:`VectorSchemaRoot` with a small batch of records
+
+.. code-block:: Java
+
+ BitVector bitVector = new BitVector("boolean", allocator);
+ VarCharVector varCharVector = new VarCharVector("varchar", allocator);
+ for (int i = 0; i < 10; i++) {
+ bitVector.setSafe(i, i % 2 == 0 ? 0 : 1);
+ varCharVector.setSafe(i, ("test" + i).getBytes(StandardCharsets.UTF_8));
+ }
+ bitVector.setValueCount(10);
+ varCharVector.setValueCount(10);
+
+ List<Field> fields = Arrays.asList(bitVector.getField(), varCharVector.getField());
+ List<FieldVector> vectors = Arrays.asList(bitVector, varCharVector);
+ VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors);
+
+Now, we can begin writing a stream containing some number of these batches. For this we use :class:`ArrowStreamWriter`
+(the ``DictionaryProvider``, used for any dictionary-encoded vectors, is optional and may be null)::
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ ArrowStreamWriter writer = new ArrowStreamWriter(root, /*DictionaryProvider=*/null, Channels.newChannel(out));
+
+
+Here we used an in-memory stream, but this could have been a socket or some other IO stream. Then we can do
+
+.. code-block:: Java
+
+ writer.start();
+ // write the first batch
+ writer.writeBatch();
+
+ // write another four batches.
+ for (int i = 0; i < 4; i++) {
+ // populate VectorSchemaRoot data and write the second batch
+ BitVector childVector1 = (BitVector)root.getVector(0);
+ VarCharVector childVector2 = (VarCharVector)root.getVector(1);
+ childVector1.reset();
+ childVector2.reset();
+ ... do some populate work here, could be different for each batch
+ writer.writeBatch();
+ }
+
+ // end
+ writer.end();
+
+Note that, since the :class:`VectorSchemaRoot` in the writer is a container through which batches flow
+as part of a pipeline, we need to populate its data before each call to ``writeBatch``, so that each
+new batch overwrites the previous one.
+
+Now the :class:`ByteArrayOutputStream` contains the complete stream, which holds 5 record batches.
+We can read such a stream with :class:`ArrowStreamReader`. Note that the :class:`VectorSchemaRoot` within the
+reader will be loaded with new values on every call to ``loadNextBatch()``
+
+.. code-block:: Java
+
+ try (ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(out.toByteArray()), allocator)) {
+ Schema schema = reader.getVectorSchemaRoot().getSchema();
+ for (int i = 0; i < 5; i++) {
+ // This will be loaded with new values on every call to loadNextBatch
+ VectorSchemaRoot readBatch = reader.getVectorSchemaRoot();
+ reader.loadNextBatch();
+ ... do something with readBatch
+ }
+
+ }
+
+Here is also a simple example with dictionary-encoded vectors
+
+.. code-block:: Java
+
+ DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider();
+ // create dictionary and provider
+ final VarCharVector dictVector = new VarCharVector("dict", allocator);
+ dictVector.allocateNewSafe();
+ dictVector.setSafe(0, "aa".getBytes());
+ dictVector.setSafe(1, "bb".getBytes());
+ dictVector.setSafe(2, "cc".getBytes());
+ dictVector.setValueCount(3);
+
+ Dictionary dictionary =
+ new Dictionary(dictVector, new DictionaryEncoding(1L, false, /*indexType=*/null));
+ provider.put(dictionary);
+
+ // create vector and encode it
+ final VarCharVector vector = new VarCharVector("vector", allocator);
+ vector.allocateNewSafe();
+ vector.setSafe(0, "bb".getBytes());
+ vector.setSafe(1, "bb".getBytes());
+ vector.setSafe(2, "cc".getBytes());
+ vector.setSafe(3, "aa".getBytes());
+ vector.setValueCount(4);
+
+ // get the encoded vector
+ IntVector encodedVector = (IntVector) DictionaryEncoder.encode(vector, dictionary);
+
+ // create VectorSchemaRoot
+ List<Field> fields = Arrays.asList(encodedVector.getField());
+ List<FieldVector> vectors = Arrays.asList(encodedVector);
+ VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors);
+
+ // write data
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ ArrowStreamWriter writer = new ArrowStreamWriter(root, provider, Channels.newChannel(out));
+ writer.start();
+ writer.writeBatch();
+ writer.end();
+
+ // read data
+ try (ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(out.toByteArray()), allocator)) {
+ reader.loadNextBatch();
+ VectorSchemaRoot readRoot = reader.getVectorSchemaRoot();
+ // get the encoded vector
+ IntVector intVector = (IntVector) readRoot.getVector(0);
+
+ // get dictionaries and decode the vector
+ Map<Long, Dictionary> dictionaryMap = reader.getDictionaryVectors();
+ long dictionaryId = intVector.getField().getDictionary().getId();
+ VarCharVector varCharVector =
+ (VarCharVector) DictionaryEncoder.decode(intVector, dictionaryMap.get(dictionaryId));
+
+ }
+
+Writing and Reading Random Access Files
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The :class:`ArrowFileWriter` has the same API as :class:`ArrowStreamWriter`
+
+.. code-block:: Java
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ ArrowFileWriter writer = new ArrowFileWriter(root, null, Channels.newChannel(out));
+ writer.start();
+ // write the first batch
+ writer.writeBatch();
+ // write another four batches.
+ for (int i = 0; i < 4; i++) {
+ ... do populate work
+ writer.writeBatch();
+ }
+ writer.end();
+
+The difference between :class:`ArrowFileReader` and :class:`ArrowStreamReader` is that the input source
+must have a ``seek`` method for random access. Because we have access to the entire payload, we know the
+number of record batches in the file, and can read any of them at random
+
+.. code-block:: Java
+
+ try (ArrowFileReader reader = new ArrowFileReader(
+ new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator)) {
+
+    // read the fourth batch (block index 3)
+ ArrowBlock block = reader.getRecordBlocks().get(3);
+ reader.loadRecordBatch(block);
+ VectorSchemaRoot readBatch = reader.getVectorSchemaRoot();
+ }
diff --git a/src/arrow/docs/source/java/reference/index.rst b/src/arrow/docs/source/java/reference/index.rst
new file mode 100644
index 000000000..523ac0c7f
--- /dev/null
+++ b/src/arrow/docs/source/java/reference/index.rst
@@ -0,0 +1,21 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Java Reference (javadoc)
+========================
+
+Stub page for the Java reference docs; actual source is located in the java/ directory.
diff --git a/src/arrow/docs/source/java/vector.rst b/src/arrow/docs/source/java/vector.rst
new file mode 100644
index 000000000..ece07d0a7
--- /dev/null
+++ b/src/arrow/docs/source/java/vector.rst
@@ -0,0 +1,288 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+===========
+ValueVector
+===========
+
+The :class:`ValueVector` interface (called Array in the C++ implementation and in
+:doc:`the specification <../format/Columnar>`) is an abstraction used to store a
+sequence of values of the same type in an individual column. Internally, those values are
+represented by one or several buffers, the number and meaning of which depend on the vector’s data type.
+
+There are concrete subclasses of :class:`ValueVector` for each primitive data type
+and nested type described in the specification. A few names differ from the type names
+used in the specification; for example, the vector for 64-bit integers is called ``BigIntVector``.
+
+It is important that a vector is allocated before attempting to read or write to it.
+Users of :class:`ValueVector` should strive to follow this order of operations:
+create > allocate > mutate > set value count > access > clear (or allocate again to start the process over).
+We will go through a concrete example to demonstrate each operation in the next section.
+
+Vector Life Cycle
+=================
+
+As discussed above, each vector goes through several steps in its life cycle,
+and each step is triggered by a vector operation. In particular, we have the following vector operations:
+
+1. **Vector creation**: we create a new vector object, for example by calling its constructor.
+The following code creates a new ``IntVector``:
+
+.. code-block:: Java
+
+ RootAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+ ...
+ IntVector vector = new IntVector("int vector", allocator);
+
+By now, a vector object is created. However, no underlying memory has been allocated, so we need the
+following step.
+
+2. **Vector allocation**: in this step, we allocate memory for the vector. For most vectors, we
+have two options: 1) if we know the maximum vector capacity, we can specify it by calling the
+``allocateNew(int)`` method; 2) otherwise, we should call the ``allocateNew()`` method, and a default
+capacity will be allocated for it. For our running example, we assume that the vector capacity never
+exceeds 10:
+
+.. code-block:: Java
+
+ vector.allocateNew(10);
+
+3. **Vector mutation**: now we can populate the vector with the values we desire. For all vectors, we can populate
+vector values through vector writers (an example will be given in the next section). For primitive types,
+we can also mutate the vector through the set methods. There are two classes of set methods: 1) if we are
+sure the vector has enough capacity, we can call the ``set(index, value)`` method; 2) if we are not sure
+about the vector capacity, we should call the ``setSafe(index, value)`` method, which will automatically
+take care of vector reallocation if the capacity is not sufficient. For our running example, we know the
+vector has enough capacity, so we can call
+
+.. code-block:: Java
+
+ vector.set(/*index*/5, /*value*/25);
+
+4. **Set value count**: for this step, we set the value count of the vector by calling the
+``setValueCount(int)`` method:
+
+.. code-block:: Java
+
+ vector.setValueCount(10);
+
+After this step, the vector enters an immutable state. In other words, we should no longer mutate it.
+(Unless we reuse the vector by allocating it again. This will be discussed shortly.)
+
+5. **Vector access**: it is time to access vector values. Similarly, we have two options to access values:
+1) get methods and 2) vector reader. Vector reader works for all types of vectors, while get methods are
+only available for primitive vectors. A concrete example for vector reader will be given in the next section.
+Below is an example of vector access by get method:
+
+.. code-block:: Java
+
+ int value = vector.get(5); // value == 25
+
+6. **Vector clear**: when we are done with the vector, we should clear it to release its memory. This is done by
+calling the ``close()`` method:
+
+.. code-block:: Java
+
+ vector.close();
+
+Some points to note about the steps above:
+
+* The steps are not necessarily performed in a linear sequence. Instead, they can be in a loop. For example,
+ when a vector enters the access step, we can also go back to the vector mutation step, and then set value
+ count, access vector, and so on.
+
+* We should try to make sure the above steps are carried out in order. Otherwise, the vector
+  may be in an undefined state, and some unexpected behavior may occur. However, this restriction
+  is not strict: it is possible to violate the order above and still get
+  correct results.
+
+* When mutating vector values through set methods, we should prefer ``set(index, value)`` methods to
+ ``setSafe(index, value)`` methods whenever possible, to avoid unnecessary performance overhead of handling
+ vector capacity.
+
+* All vectors implement the ``AutoCloseable`` interface, so they must be closed explicitly when they are
+  no longer used, to avoid resource leaks. To make sure of this, it is recommended to place vector-related
+  operations in a try-with-resources block.
+
+* For fixed width vectors (e.g. IntVector), we can set values at different indices in arbitrary orders.
+ For variable width vectors (e.g. VarCharVector), however, we must set values in non-decreasing order of the
+ indices. Otherwise, the values after the set position will become invalid. For example, suppose we use the
+ following statements to populate a variable width vector:
+
+.. code-block:: Java
+
+ VarCharVector vector = new VarCharVector("vector", allocator);
+ vector.allocateNew();
+   vector.setSafe(0, "zero".getBytes(StandardCharsets.UTF_8));
+   vector.setSafe(1, "one".getBytes(StandardCharsets.UTF_8));
+   ...
+   vector.setSafe(9, "nine".getBytes(StandardCharsets.UTF_8));
+
+Then we set the value at position 5 again:
+
+.. code-block:: Java
+
+   vector.setSafe(5, "5".getBytes(StandardCharsets.UTF_8));
+
+After that, the values at positions 6, 7, 8, and 9 of the vector will become invalid.
+
+Building ValueVector
+====================
+
+Note that the current implementation doesn't enforce the rule that Arrow objects are immutable.
+:class:`ValueVector` instances can be created directly with the ``new`` keyword, and there are
+set/setSafe APIs as well as concrete subclasses of ``FieldWriter`` for populating values.
+
+For example, the code below shows how to build a :class:`BigIntVector`. In this case, we populate
+positions 0 to 7, and the element that should hold the fourth value is set to null
+
+.. code-block:: Java
+
+ try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+ BigIntVector vector = new BigIntVector("vector", allocator)) {
+ vector.allocateNew(8);
+ vector.set(0, 1);
+ vector.set(1, 2);
+ vector.set(2, 3);
+ vector.setNull(3);
+ vector.set(4, 5);
+ vector.set(5, 6);
+ vector.set(6, 7);
+ vector.set(7, 8);
+     vector.setValueCount(8); // this finalizes the vector by convention.
+ ...
+ }
+
+The :class:`BigIntVector` holds two ArrowBufs. The first buffer holds the null bitmap, which here
+consists of a single byte with the bits 1|1|1|1|0|1|1|1 (a bit is 1 if the corresponding value is non-null).
+The second buffer contains all the above values. As the fourth entry is null, the value at that position
+in the buffer is undefined. Note that, compared with the ``set`` API, the ``setSafe`` API checks the value
+capacity before setting values and reallocates buffers if necessary.
+
+Here is how to build a vector using a writer
+
+.. code-block:: Java
+
+ try (BigIntVector vector = new BigIntVector("vector", allocator);
+ BigIntWriter writer = new BigIntWriterImpl(vector)) {
+ writer.setPosition(0);
+ writer.writeBigInt(1);
+ writer.setPosition(1);
+ writer.writeBigInt(2);
+ writer.setPosition(2);
+ writer.writeBigInt(3);
+     // writer.setPosition(3) is not called, which means the fourth value is null.
+ writer.setPosition(4);
+ writer.writeBigInt(5);
+ writer.setPosition(5);
+ writer.writeBigInt(6);
+ writer.setPosition(6);
+ writer.writeBigInt(7);
+ writer.setPosition(7);
+ writer.writeBigInt(8);
+ }
+
+There is a get API, and there are concrete subclasses of :class:`FieldReader` for accessing vector values.
+Note that the writer/reader approach is not as efficient as direct access
+
+.. code-block:: Java
+
+ // access via get API
+ for (int i = 0; i < vector.getValueCount(); i++) {
+ if (!vector.isNull(i)) {
+ System.out.println(vector.get(i));
+ }
+ }
+
+ // access via reader
+ BigIntReader reader = vector.getReader();
+ for (int i = 0; i < vector.getValueCount(); i++) {
+ reader.setPosition(i);
+ if (reader.isSet()) {
+ System.out.println(reader.readLong());
+ }
+ }
+
+Building ListVector
+===================
+
+A :class:`ListVector` is a vector that holds a list of values for each index. Working with one, you need to go through the same steps as mentioned above (create > allocate > mutate > set value count > access > clear), but the details of how you accomplish this are slightly different since you need to both create the vector and set the list of values for each index.
+
+For example, the code below shows how to build a :class:`ListVector` of ints using the writer :class:`UnionListWriter`. We build a vector from 0 to 9 where each index contains a list with values [[0, 0, 0, 0, 0], [0, 1, 2, 3, 4], [0, 2, 4, 6, 8], …, [0, 9, 18, 27, 36]]. List values can be added in any order, so writing a list such as [3, 1, 2] would be just as valid.
+
+.. code-block:: Java
+
+ try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+ ListVector listVector = ListVector.empty("vector", allocator)) {
+ UnionListWriter writer = listVector.getWriter();
+ for (int i = 0; i < 10; i++) {
+ writer.startList();
+ writer.setPosition(i);
+ for (int j = 0; j < 5; j++) {
+ writer.writeInt(j * i);
+ }
+ writer.setValueCount(5);
+ writer.endList();
+ }
+ listVector.setValueCount(10);
+ }
+
+:class:`ListVector` values can be accessed either through the get API or through the reader class :class:`UnionListReader`. To read all the values, first enumerate through the indexes, and then enumerate through the inner list values.
+
+.. code-block:: Java
+
+ // access via get API
+ for (int i = 0; i < listVector.getValueCount(); i++) {
+ if (!listVector.isNull(i)) {
+ ArrayList<Integer> elements = (ArrayList<Integer>) listVector.getObject(i);
+ for (Integer element : elements) {
+ System.out.println(element);
+ }
+ }
+ }
+
+ // access via reader
+ UnionListReader reader = listVector.getReader();
+ for (int i = 0; i < listVector.getValueCount(); i++) {
+ reader.setPosition(i);
+ while (reader.next()) {
+ IntReader intReader = reader.reader();
+ if (intReader.isSet()) {
+ System.out.println(intReader.readInteger());
+ }
+ }
+ }
+
+Slicing
+=======
+
+Similar to the C++ implementation, it is possible to make zero-copy slices of vectors, obtaining a vector
+that refers to some logical sub-sequence of the data, through a :class:`TransferPair`
+
+.. code-block:: Java
+
+ IntVector vector = new IntVector("intVector", allocator);
+ for (int i = 0; i < 10; i++) {
+ vector.setSafe(i, i);
+ }
+ vector.setValueCount(10);
+
+ TransferPair tp = vector.getTransferPair(allocator);
+ tp.splitAndTransfer(0, 5);
+ IntVector sliced = (IntVector) tp.getTo();
+   // In this case, the vector values are [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] and the sliced values are [0, 1, 2, 3, 4].
diff --git a/src/arrow/docs/source/java/vector_schema_root.rst b/src/arrow/docs/source/java/vector_schema_root.rst
new file mode 100644
index 000000000..7f787d9d5
--- /dev/null
+++ b/src/arrow/docs/source/java/vector_schema_root.rst
@@ -0,0 +1,74 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+================
+VectorSchemaRoot
+================
+A :class:`VectorSchemaRoot` is a container that can hold batches; batches flow through a :class:`VectorSchemaRoot`
+as part of a pipeline. Note that this is different from other implementations (in C++ and Python,
+a :class:`RecordBatch` is a collection of equal-length vector instances that is created anew for each batch).
+
+The recommended usage of :class:`VectorSchemaRoot` is to create a single :class:`VectorSchemaRoot`
+based on the known schema and to populate data over and over into that same root for a stream
+of batches, rather than creating a new :class:`VectorSchemaRoot` instance each time
+(see `Flight <https://github.com/apache/arrow/tree/master/java/flight/src/main/java/org/apache/arrow/flight>`_ or
+``ArrowFileWriter`` for a better understanding). Thus at any one point a VectorSchemaRoot may have data or
+may have no data (say it was transferred downstream or not yet populated).
+
+
+Here is an example of building a :class:`VectorSchemaRoot`
+
+.. code-block:: Java
+
+ BitVector bitVector = new BitVector("boolean", allocator);
+ VarCharVector varCharVector = new VarCharVector("varchar", allocator);
+ bitVector.allocateNew();
+ varCharVector.allocateNew();
+ for (int i = 0; i < 10; i++) {
+ bitVector.setSafe(i, i % 2 == 0 ? 0 : 1);
+ varCharVector.setSafe(i, ("test" + i).getBytes(StandardCharsets.UTF_8));
+ }
+ bitVector.setValueCount(10);
+ varCharVector.setValueCount(10);
+
+ List<Field> fields = Arrays.asList(bitVector.getField(), varCharVector.getField());
+ List<FieldVector> vectors = Arrays.asList(bitVector, varCharVector);
+ VectorSchemaRoot vectorSchemaRoot = new VectorSchemaRoot(fields, vectors);
+
+The vectors within a :class:`VectorSchemaRoot` can be loaded/unloaded via :class:`VectorLoader` and :class:`VectorUnloader`.
+:class:`VectorLoader` and :class:`VectorUnloader` handle the conversion between :class:`VectorSchemaRoot` and
+:class:`ArrowRecordBatch` (the representation of a RecordBatch :doc:`IPC <../format/IPC>` message). Examples are shown below
+
+.. code-block:: Java
+
+ // create a VectorSchemaRoot root1 and convert its data into recordBatch
+ VectorSchemaRoot root1 = new VectorSchemaRoot(fields, vectors);
+ VectorUnloader unloader = new VectorUnloader(root1);
+ ArrowRecordBatch recordBatch = unloader.getRecordBatch();
+
+ // create a VectorSchemaRoot root2 and load the recordBatch
+ VectorSchemaRoot root2 = VectorSchemaRoot.create(root1.getSchema(), allocator);
+ VectorLoader loader = new VectorLoader(root2);
+ loader.load(recordBatch);
+
+A new :class:`VectorSchemaRoot` could be sliced from an existing instance with zero-copy
+
+.. code-block:: Java
+
+    // 0 indicates the start index (inclusive) and 5 indicates the length of the slice.
+ VectorSchemaRoot newRoot = vectorSchemaRoot.slice(0, 5);
+
diff --git a/src/arrow/docs/source/js/index.rst b/src/arrow/docs/source/js/index.rst
new file mode 100644
index 000000000..77813c137
--- /dev/null
+++ b/src/arrow/docs/source/js/index.rst
@@ -0,0 +1,21 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+JavaScript docs
+===============
+
+Stub page for the JavaScript docs; actual source is located in js/ sub-directory.
diff --git a/src/arrow/docs/source/python/api.rst b/src/arrow/docs/source/python/api.rst
new file mode 100644
index 000000000..12cf4e068
--- /dev/null
+++ b/src/arrow/docs/source/python/api.rst
@@ -0,0 +1,40 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _api:
+
+*************
+API Reference
+*************
+
+.. toctree::
+ :maxdepth: 2
+
+ api/datatypes
+ api/arrays
+ api/memory
+ api/compute
+ api/files
+ api/tables
+ api/ipc
+ api/flight
+ api/formats
+ api/filesystems
+ api/dataset
+ api/plasma
+ api/cuda
+ api/misc
diff --git a/src/arrow/docs/source/python/api/arrays.rst b/src/arrow/docs/source/python/api/arrays.rst
new file mode 100644
index 000000000..dbc4c0bd1
--- /dev/null
+++ b/src/arrow/docs/source/python/api/arrays.rst
@@ -0,0 +1,127 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _api.array:
+.. currentmodule:: pyarrow
+
+Arrays and Scalars
+==================
+
+Factory Functions
+-----------------
+
+These functions create new Arrow arrays:
+
+.. autosummary::
+ :toctree: ../generated/
+
+ array
+ nulls
+
+Array Types
+-----------
+
+An array's Python class depends on its data type. Concrete array classes
+may expose data type-specific methods or properties.
+
+.. autosummary::
+ :toctree: ../generated/
+
+ Array
+ BooleanArray
+ FloatingPointArray
+ IntegerArray
+ Int8Array
+ Int16Array
+ Int32Array
+ Int64Array
+ NullArray
+ NumericArray
+ UInt8Array
+ UInt16Array
+ UInt32Array
+ UInt64Array
+ BinaryArray
+ StringArray
+ FixedSizeBinaryArray
+ LargeBinaryArray
+ LargeStringArray
+ Time32Array
+ Time64Array
+ Date32Array
+ Date64Array
+ TimestampArray
+ DurationArray
+ MonthDayNanoIntervalArray
+ Decimal128Array
+ DictionaryArray
+ ListArray
+ FixedSizeListArray
+ LargeListArray
+ StructArray
+ UnionArray
+ ExtensionArray
+
+.. _api.scalar:
+
+Scalars
+-------
+
+This function constructs a new Arrow scalar:
+
+.. autosummary::
+ :toctree: ../generated/
+
+ scalar
+
+A scalar's Python class depends on its data type. Concrete scalar
+classes may expose data type-specific methods or properties.
+
+.. autosummary::
+ :toctree: ../generated/
+
+ NA
+ Scalar
+ BooleanScalar
+ Int8Scalar
+ Int16Scalar
+ Int32Scalar
+ Int64Scalar
+ UInt8Scalar
+ UInt16Scalar
+ UInt32Scalar
+ UInt64Scalar
+ FloatScalar
+ DoubleScalar
+ BinaryScalar
+ StringScalar
+ FixedSizeBinaryScalar
+ LargeBinaryScalar
+ LargeStringScalar
+ Time32Scalar
+ Time64Scalar
+ Date32Scalar
+ Date64Scalar
+ TimestampScalar
+ DurationScalar
+ MonthDayNanoIntervalScalar
+ Decimal128Scalar
+ DictionaryScalar
+ ListScalar
+ LargeListScalar
+ StructScalar
+ UnionScalar
diff --git a/src/arrow/docs/source/python/api/compute.rst b/src/arrow/docs/source/python/api/compute.rst
new file mode 100644
index 000000000..521182f8a
--- /dev/null
+++ b/src/arrow/docs/source/python/api/compute.rst
@@ -0,0 +1,498 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _api.compute:
+.. currentmodule:: pyarrow.compute
+
+Compute Functions
+=================
+
+Aggregations
+------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ all
+ any
+ approximate_median
+ count
+ count_distinct
+ index
+ max
+ mean
+ min
+ min_max
+ mode
+ product
+ quantile
+ stddev
+ sum
+ tdigest
+ variance
+
+Grouped Aggregations
+--------------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ hash_all
+ hash_any
+ hash_approximate_median
+ hash_count
+ hash_count_distinct
+ hash_distinct
+ hash_max
+ hash_mean
+ hash_min
+ hash_min_max
+ hash_product
+ hash_stddev
+ hash_sum
+ hash_tdigest
+ hash_variance
+
+Arithmetic Functions
+--------------------
+
+By default these functions do not detect overflow. Most functions are also
+available in an overflow-checking variant, suffixed ``_checked``, which
+throws an ``ArrowInvalid`` exception when overflow is detected.
+
+.. autosummary::
+ :toctree: ../generated/
+
+ abs
+ abs_checked
+ add
+ add_checked
+ divide
+ divide_checked
+ multiply
+ multiply_checked
+ negate
+ negate_checked
+ power
+ power_checked
+ sign
+ subtract
+ subtract_checked
+
+Bit-wise Functions
+------------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ bit_wise_and
+ bit_wise_not
+ bit_wise_or
+ bit_wise_xor
+ shift_left
+ shift_left_checked
+ shift_right
+ shift_right_checked
+
+Rounding Functions
+------------------
+
+Rounding functions displace numeric inputs to an approximate value with a simpler
+representation based on the rounding criterion.
+
+.. autosummary::
+ :toctree: ../generated/
+
+ ceil
+ floor
+ round
+ round_to_multiple
+ trunc
+
+Logarithmic Functions
+---------------------
+
+Logarithmic functions are also supported, and also offer ``_checked``
+variants which detect domain errors.
+
+.. autosummary::
+ :toctree: ../generated/
+
+ ln
+ ln_checked
+ log10
+ log10_checked
+ log1p
+ log1p_checked
+ log2
+ log2_checked
+ logb
+ logb_checked
+
+Trigonometric Functions
+-----------------------
+
+Trigonometric functions are also supported, and also offer ``_checked``
+variants which detect domain errors where appropriate.
+
+.. autosummary::
+ :toctree: ../generated/
+
+ acos
+ acos_checked
+ asin
+ asin_checked
+ atan
+ atan2
+ cos
+ cos_checked
+ sin
+ sin_checked
+ tan
+ tan_checked
+
+Comparisons
+-----------
+
+These functions expect two inputs of the same type. If one of the inputs is ``null``,
+they return ``null``.
+
+.. autosummary::
+ :toctree: ../generated/
+
+ equal
+ greater
+ greater_equal
+ less
+ less_equal
+ not_equal
+
+These functions take any number of arguments of a numeric or temporal type.
+
+.. autosummary::
+ :toctree: ../generated/
+
+ max_element_wise
+ min_element_wise
+
+Logical Functions
+-----------------
+
+These functions normally emit a null when one of the inputs is null. However, Kleene
+logic variants are provided (suffixed ``_kleene``). See the User Guide for details.
+
+.. autosummary::
+ :toctree: ../generated/
+
+ and_
+ and_kleene
+ and_not
+ and_not_kleene
+ invert
+ or_
+ or_kleene
+ xor
+
+String Predicates
+-----------------
+
+In these functions an empty string emits false in the output. For ASCII
+variants (prefixed ``ascii_``) a string element with non-ASCII characters
+emits false in the output.
+
+The first set of functions emits true if the input contains only
+characters of a given class.
+
+.. autosummary::
+ :toctree: ../generated/
+
+ ascii_is_alnum
+ ascii_is_alpha
+ ascii_is_decimal
+ ascii_is_lower
+ ascii_is_printable
+ ascii_is_space
+ ascii_is_upper
+ utf8_is_alnum
+ utf8_is_alpha
+ utf8_is_decimal
+ utf8_is_digit
+ utf8_is_lower
+ utf8_is_numeric
+ utf8_is_printable
+ utf8_is_space
+ utf8_is_upper
+
+The second set of functions also considers the order of characters
+in the string element.
+
+.. autosummary::
+ :toctree: ../generated/
+
+ ascii_is_title
+ utf8_is_title
+
+The third set of functions examines string elements on
+a byte-by-byte basis.
+
+.. autosummary::
+ :toctree: ../generated/
+
+ string_is_ascii
+
+String Transforms
+-----------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ ascii_capitalize
+ ascii_lower
+ ascii_reverse
+ ascii_swapcase
+ ascii_title
+ ascii_upper
+ binary_length
+ binary_replace_slice
+ replace_substring
+ replace_substring_regex
+ utf8_capitalize
+ utf8_length
+ utf8_lower
+ utf8_replace_slice
+ utf8_reverse
+ utf8_swapcase
+ utf8_title
+ utf8_upper
+
+String Padding
+--------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ ascii_center
+ ascii_lpad
+ ascii_rpad
+ utf8_center
+ utf8_lpad
+ utf8_rpad
+
+String Trimming
+---------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ ascii_ltrim
+ ascii_ltrim_whitespace
+ ascii_rtrim
+ ascii_rtrim_whitespace
+ ascii_trim
+ ascii_trim_whitespace
+ utf8_ltrim
+ utf8_ltrim_whitespace
+ utf8_rtrim
+ utf8_rtrim_whitespace
+ utf8_trim
+ utf8_trim_whitespace
+
+String Splitting
+----------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ ascii_split_whitespace
+ split_pattern
+ split_pattern_regex
+ utf8_split_whitespace
+
+String Component Extraction
+---------------------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ extract_regex
+
+String Joining
+--------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ binary_join
+ binary_join_element_wise
+
+String Slicing
+--------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ utf8_slice_codeunits
+
+Containment Tests
+-----------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ count_substring
+ count_substring_regex
+ ends_with
+ find_substring
+ find_substring_regex
+ index_in
+ is_in
+ match_like
+ match_substring
+ match_substring_regex
+ starts_with
+
+Categorizations
+---------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ is_finite
+ is_inf
+ is_nan
+ is_null
+ is_valid
+
+Selecting / Multiplexing
+------------------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ case_when
+ choose
+ coalesce
+ if_else
+
+Conversions
+-----------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ cast
+ strftime
+ strptime
+
+Temporal Component Extraction
+-----------------------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ day
+ day_of_week
+ day_of_year
+ hour
+ iso_week
+ iso_year
+ iso_calendar
+ microsecond
+ millisecond
+ minute
+ month
+ nanosecond
+ quarter
+ second
+ subsecond
+ us_week
+ week
+ year
+
+Temporal Difference
+-------------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ day_time_interval_between
+ days_between
+ hours_between
+ microseconds_between
+ milliseconds_between
+ minutes_between
+ month_day_nano_interval_between
+ month_interval_between
+ nanoseconds_between
+ quarters_between
+ seconds_between
+ weeks_between
+ years_between
+
+Timezone Handling
+-----------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ assume_timezone
+
+Associative Transforms
+----------------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ dictionary_encode
+ unique
+ value_counts
+
+Selections
+----------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ array_filter
+ array_take
+ drop_null
+ filter
+ take
+
+Sorts and Partitions
+--------------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ array_sort_indices
+ partition_nth_indices
+ select_k_unstable
+ sort_indices
+
+Structural Transforms
+---------------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ list_element
+ list_flatten
+ list_parent_indices
+ list_value_length
+ make_struct
+ replace_with_mask
diff --git a/src/arrow/docs/source/python/api/cuda.rst b/src/arrow/docs/source/python/api/cuda.rst
new file mode 100644
index 000000000..364f03240
--- /dev/null
+++ b/src/arrow/docs/source/python/api/cuda.rst
@@ -0,0 +1,62 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow.cuda
+
+CUDA Integration
+================
+
+.. ifconfig:: not cuda_enabled
+
+ .. error::
+ This documentation was built without CUDA enabled. The CUDA
+ API docs are not available.
+
+.. NOTE We still generate those API docs (with empty docstrings)
+.. when CUDA is disabled and `pyarrow.cuda` mocked (see conf.py).
+.. Otherwise we'd get autodoc warnings, see https://github.com/sphinx-doc/sphinx/issues/4770
+
+CUDA Contexts
+-------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ Context
+
+CUDA Buffers
+------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ CudaBuffer
+ new_host_buffer
+ HostBuffer
+ BufferReader
+ BufferWriter
+
+Serialization and IPC
+---------------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ serialize_record_batch
+ read_record_batch
+ read_message
+ IpcMemHandle
diff --git a/src/arrow/docs/source/python/api/dataset.rst b/src/arrow/docs/source/python/api/dataset.rst
new file mode 100644
index 000000000..9718006ab
--- /dev/null
+++ b/src/arrow/docs/source/python/api/dataset.rst
@@ -0,0 +1,64 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow.dataset
+
+.. _api.dataset:
+
+Dataset
+=======
+
+.. warning::
+
+ The ``pyarrow.dataset`` module is experimental (specifically the classes),
+ and a stable API is not yet guaranteed.
+
+Factory functions
+-----------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ dataset
+ parquet_dataset
+ partitioning
+ field
+ scalar
+ write_dataset
+
+Classes
+-------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ FileFormat
+ ParquetFileFormat
+ ORCFileFormat
+ IpcFileFormat
+ CsvFileFormat
+ Partitioning
+ PartitioningFactory
+ DirectoryPartitioning
+ HivePartitioning
+ Dataset
+ FileSystemDataset
+ FileSystemFactoryOptions
+ FileSystemDatasetFactory
+ UnionDataset
+ Scanner
+ Expression
diff --git a/src/arrow/docs/source/python/api/datatypes.rst b/src/arrow/docs/source/python/api/datatypes.rst
new file mode 100644
index 000000000..48a254a00
--- /dev/null
+++ b/src/arrow/docs/source/python/api/datatypes.rst
@@ -0,0 +1,165 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _api.types:
+.. currentmodule:: pyarrow
+
+Data Types and Schemas
+======================
+
+Factory Functions
+-----------------
+
+These should be used to create Arrow data types and schemas.
+
+.. autosummary::
+ :toctree: ../generated/
+
+ null
+ bool_
+ int8
+ int16
+ int32
+ int64
+ uint8
+ uint16
+ uint32
+ uint64
+ float16
+ float32
+ float64
+ time32
+ time64
+ timestamp
+ date32
+ date64
+ duration
+ month_day_nano_interval
+ binary
+ string
+ utf8
+ large_binary
+ large_string
+ large_utf8
+ decimal128
+ list_
+ large_list
+ map_
+ struct
+ dictionary
+ field
+ schema
+ from_numpy_dtype
+
+Utility Functions
+-----------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ unify_schemas
+
+.. _api.type_classes:
+.. currentmodule:: pyarrow
+
+Type Classes
+------------
+
+Do not instantiate these classes directly. Instead, call one of the factory
+functions above.
+
+.. autosummary::
+ :toctree: ../generated/
+
+ DataType
+ DictionaryType
+ ListType
+ MapType
+ StructType
+ UnionType
+ TimestampType
+ Time32Type
+ Time64Type
+ FixedSizeBinaryType
+ Decimal128Type
+ Field
+ Schema
+
+Specific classes and functions for extension types.
+
+.. autosummary::
+ :toctree: ../generated/
+
+ ExtensionType
+ PyExtensionType
+ register_extension_type
+ unregister_extension_type
+
+
+.. _api.types.checking:
+.. currentmodule:: pyarrow.types
+
+Type Checking
+-------------
+
+These functions are predicates to check whether a :class:`DataType` instance
+represents a given data type (such as ``int32``) or general category
+(such as "is a signed integer").
+
+.. autosummary::
+ :toctree: ../generated/
+
+ is_boolean
+ is_integer
+ is_signed_integer
+ is_unsigned_integer
+ is_int8
+ is_int16
+ is_int32
+ is_int64
+ is_uint8
+ is_uint16
+ is_uint32
+ is_uint64
+ is_floating
+ is_float16
+ is_float32
+ is_float64
+ is_decimal
+ is_list
+ is_large_list
+ is_struct
+ is_union
+ is_nested
+ is_temporal
+ is_timestamp
+ is_date
+ is_date32
+ is_date64
+ is_time
+ is_time32
+ is_time64
+ is_null
+ is_binary
+ is_unicode
+ is_string
+ is_large_binary
+ is_large_unicode
+ is_large_string
+ is_fixed_size_binary
+ is_map
+ is_dictionary
diff --git a/src/arrow/docs/source/python/api/files.rst b/src/arrow/docs/source/python/api/files.rst
new file mode 100644
index 000000000..106dfde8a
--- /dev/null
+++ b/src/arrow/docs/source/python/api/files.rst
@@ -0,0 +1,65 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow
+
+Streams and File Access
+=======================
+
+.. _api.io:
+
+Factory Functions
+-----------------
+
+These factory functions are the recommended way to create an Arrow stream.
+They accept various kinds of sources, such as in-memory buffers or on-disk files.
+
+.. autosummary::
+ :toctree: ../generated/
+
+ input_stream
+ output_stream
+ memory_map
+ create_memory_map
+
+Stream Classes
+--------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ NativeFile
+ OSFile
+ PythonFile
+ BufferReader
+ BufferOutputStream
+ FixedSizeBufferWriter
+ MemoryMappedFile
+ CompressedInputStream
+ CompressedOutputStream
+
+File Systems
+------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ hdfs.connect
+ LocalFileSystem
+
+.. class:: HadoopFileSystem
+ :noindex:
diff --git a/src/arrow/docs/source/python/api/filesystems.rst b/src/arrow/docs/source/python/api/filesystems.rst
new file mode 100644
index 000000000..3e2ac29ee
--- /dev/null
+++ b/src/arrow/docs/source/python/api/filesystems.rst
@@ -0,0 +1,53 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow.fs
+
+Filesystems
+===========
+
+.. _api.fs:
+
+Interface
+---------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ FileInfo
+ FileSelector
+ FileSystem
+
+Concrete Subclasses
+-------------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ LocalFileSystem
+ S3FileSystem
+ HadoopFileSystem
+ SubTreeFileSystem
+
+To define filesystems with behavior implemented in Python:
+
+.. autosummary::
+ :toctree: ../generated/
+
+ PyFileSystem
+ FileSystemHandler
+ FSSpecHandler
diff --git a/src/arrow/docs/source/python/api/flight.rst b/src/arrow/docs/source/python/api/flight.rst
new file mode 100644
index 000000000..0cfbb6b4b
--- /dev/null
+++ b/src/arrow/docs/source/python/api/flight.rst
@@ -0,0 +1,91 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow.flight
+
+Arrow Flight
+============
+
+.. ifconfig:: not flight_enabled
+
+ .. error::
+ This documentation was built without Flight enabled. The Flight
+ API docs are not available.
+
+.. NOTE We still generate those API docs (with empty docstrings)
+.. when Flight is disabled and `pyarrow.flight` mocked (see conf.py).
+.. Otherwise we'd get autodoc warnings, see https://github.com/sphinx-doc/sphinx/issues/4770
+
+.. warning:: Flight is currently unstable. APIs are subject to change,
+ though we don't expect drastic changes.
+
+Common Types
+------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ Action
+ ActionType
+ DescriptorType
+ FlightDescriptor
+ FlightEndpoint
+ FlightInfo
+ Location
+ Ticket
+ Result
+
+Flight Client
+-------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ FlightCallOptions
+ FlightClient
+ ClientMiddlewareFactory
+ ClientMiddleware
+
+Flight Server
+-------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ FlightServerBase
+ GeneratorStream
+ RecordBatchStream
+ ServerMiddlewareFactory
+ ServerMiddleware
+
+Authentication
+--------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ ClientAuthHandler
+ ServerAuthHandler
+
+Middleware
+----------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ FlightMethod
+ CallInfo
diff --git a/src/arrow/docs/source/python/api/formats.rst b/src/arrow/docs/source/python/api/formats.rst
new file mode 100644
index 000000000..fdc28040a
--- /dev/null
+++ b/src/arrow/docs/source/python/api/formats.rst
@@ -0,0 +1,101 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Tabular File Formats
+====================
+
+.. _api.csv:
+
+CSV Files
+---------
+
+.. currentmodule:: pyarrow.csv
+
+.. autosummary::
+ :toctree: ../generated/
+
+ ConvertOptions
+ CSVStreamingReader
+ CSVWriter
+ ISO8601
+ ParseOptions
+ ReadOptions
+ WriteOptions
+ open_csv
+ read_csv
+ write_csv
+
+.. _api.feather:
+
+Feather Files
+-------------
+
+.. currentmodule:: pyarrow.feather
+
+.. autosummary::
+ :toctree: ../generated/
+
+ read_feather
+ read_table
+ write_feather
+
+.. _api.json:
+
+JSON Files
+----------
+
+.. currentmodule:: pyarrow.json
+
+.. autosummary::
+ :toctree: ../generated/
+
+ ReadOptions
+ ParseOptions
+ read_json
+
+.. _api.parquet:
+
+Parquet Files
+-------------
+
+.. currentmodule:: pyarrow.parquet
+
+.. autosummary::
+ :toctree: ../generated/
+
+ ParquetDataset
+ ParquetFile
+ ParquetWriter
+ read_table
+ read_metadata
+ read_pandas
+ read_schema
+ write_metadata
+ write_table
+ write_to_dataset
+
+.. _api.orc:
+
+ORC Files
+---------
+
+.. currentmodule:: pyarrow.orc
+
+.. autosummary::
+ :toctree: ../generated/
+
+ ORCFile
diff --git a/src/arrow/docs/source/python/api/ipc.rst b/src/arrow/docs/source/python/api/ipc.rst
new file mode 100644
index 000000000..83ff53de7
--- /dev/null
+++ b/src/arrow/docs/source/python/api/ipc.rst
@@ -0,0 +1,69 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow
+
+.. _api.ipc:
+
+Serialization and IPC
+=====================
+
+Inter-Process Communication
+---------------------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ ipc.new_file
+ ipc.open_file
+ ipc.new_stream
+ ipc.open_stream
+ ipc.read_message
+ ipc.read_record_batch
+ ipc.get_record_batch_size
+ ipc.read_tensor
+ ipc.write_tensor
+ ipc.get_tensor_size
+ ipc.IpcWriteOptions
+ ipc.Message
+ ipc.MessageReader
+ ipc.RecordBatchFileReader
+ ipc.RecordBatchFileWriter
+ ipc.RecordBatchStreamReader
+ ipc.RecordBatchStreamWriter
+
+Serialization
+-------------
+
+.. warning::
+
+ The serialization functionality is deprecated in pyarrow 2.0, and will
+ be removed in a future version. Use the standard library ``pickle`` or
+ the IPC functionality of pyarrow (see :ref:`ipc`).
+
+
+.. autosummary::
+ :toctree: ../generated/
+
+ serialize
+ serialize_to
+ deserialize
+ deserialize_components
+ deserialize_from
+ read_serialized
+ SerializedPyObject
+ SerializationContext
diff --git a/src/arrow/docs/source/python/api/memory.rst b/src/arrow/docs/source/python/api/memory.rst
new file mode 100644
index 000000000..f4382ba23
--- /dev/null
+++ b/src/arrow/docs/source/python/api/memory.rst
@@ -0,0 +1,73 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow
+
+.. _api.memory:
+
+Buffers and Memory
+==================
+
+In-Memory Buffers
+-----------------
+
+Factory Functions
+~~~~~~~~~~~~~~~~~
+
+.. autosummary::
+ :toctree: ../generated/
+
+ allocate_buffer
+ py_buffer
+ foreign_buffer
+
+Classes
+~~~~~~~
+
+.. autosummary::
+ :toctree: ../generated/
+
+ Buffer
+ ResizableBuffer
+
+Miscellaneous
+~~~~~~~~~~~~~
+
+.. autosummary::
+ :toctree: ../generated/
+
+ Codec
+ compress
+ decompress
+
+.. _api.memory_pool:
+
+Memory Pools
+------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ MemoryPool
+ default_memory_pool
+ jemalloc_memory_pool
+ mimalloc_memory_pool
+ system_memory_pool
+ jemalloc_set_decay_ms
+ set_memory_pool
+ log_memory_allocations
+ total_allocated_bytes
diff --git a/src/arrow/docs/source/python/api/misc.rst b/src/arrow/docs/source/python/api/misc.rst
new file mode 100644
index 000000000..c13b80620
--- /dev/null
+++ b/src/arrow/docs/source/python/api/misc.rst
@@ -0,0 +1,40 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow
+
+Miscellaneous
+=============
+
+Multi-Threading
+---------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ cpu_count
+ set_cpu_count
+
+Using with C extensions
+-----------------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ get_include
+ get_libraries
+ get_library_dirs
diff --git a/src/arrow/docs/source/python/api/plasma.rst b/src/arrow/docs/source/python/api/plasma.rst
new file mode 100644
index 000000000..8df9e4e21
--- /dev/null
+++ b/src/arrow/docs/source/python/api/plasma.rst
@@ -0,0 +1,33 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow.plasma
+
+.. _api.plasma:
+
+Plasma In-Memory Object Store
+=============================
+
+Classes
+-------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ ObjectID
+ PlasmaClient
+ PlasmaBuffer
diff --git a/src/arrow/docs/source/python/api/tables.rst b/src/arrow/docs/source/python/api/tables.rst
new file mode 100644
index 000000000..6e7a3b6e1
--- /dev/null
+++ b/src/arrow/docs/source/python/api/tables.rst
@@ -0,0 +1,55 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow
+
+.. _api.table:
+
+Tables and Tensors
+==================
+
+Factory Functions
+-----------------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ chunked_array
+ concat_arrays
+ concat_tables
+ record_batch
+ table
+
+Classes
+-------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ ChunkedArray
+ RecordBatch
+ Table
+
+.. _api.tensor:
+
+Tensors
+-------
+
+.. autosummary::
+ :toctree: ../generated/
+
+ Tensor
diff --git a/src/arrow/docs/source/python/benchmarks.rst b/src/arrow/docs/source/python/benchmarks.rst
new file mode 100644
index 000000000..aee83b778
--- /dev/null
+++ b/src/arrow/docs/source/python/benchmarks.rst
@@ -0,0 +1,56 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _python-benchmarks:
+
+Benchmarks
+==========
+
+The ``pyarrow`` package comes with a suite of benchmarks meant to
+run with `ASV`_. You'll need to install the ``asv`` package first
+(``pip install asv`` or ``conda install -c conda-forge asv``).
+
+Running the benchmarks
+----------------------
+
+To run the benchmarks for a locally-built Arrow, run ``asv dev`` or
+``asv run --python=same``.
+
+We use conda environments as part of running the benchmarks. To use the ``asv``
+setup, you must set the ``$CONDA_HOME`` environment variable to point to the
+root of your conda installation.
+
+Running for arbitrary Git revisions
+-----------------------------------
+
+ASV allows storing results and generating graphs of the benchmarks over
+the project's evolution. You need the latest development version of ASV:
+
+.. code::
+
+ pip install git+https://github.com/airspeed-velocity/asv
+
+Now you should be ready to run ``asv run`` or whatever other command
+suits your needs. Note that this can take quite a while, as Arrow needs to
+be rebuilt for each Git revision you're benchmarking.
+
+Compatibility
+-------------
+
+We only expect the benchmarking setup to work on a Unix-like system with bash.
+
+.. _asv: https://asv.readthedocs.org/
diff --git a/src/arrow/docs/source/python/compute.rst b/src/arrow/docs/source/python/compute.rst
new file mode 100644
index 000000000..133520de9
--- /dev/null
+++ b/src/arrow/docs/source/python/compute.rst
@@ -0,0 +1,69 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow.compute
+.. _compute:
+
+=================
+Compute Functions
+=================
+
+Arrow supports logical compute operations over inputs of possibly
+varying types. Many compute functions support both array (chunked or not)
+and scalar inputs, but some accept only one or the other. For example,
+``sort_indices`` requires its first and only input to be an array.
+
+Below are a few simple examples:
+
+ >>> import pyarrow as pa
+ >>> import pyarrow.compute as pc
+ >>> a = pa.array([1, 1, 2, 3])
+ >>> pc.sum(a)
+ <pyarrow.Int64Scalar: 7>
+ >>> b = pa.array([4, 1, 2, 8])
+ >>> pc.equal(a, b)
+ <pyarrow.lib.BooleanArray object at 0x7f686e4eef30>
+ [
+ false,
+ true,
+ true,
+ false
+ ]
+ >>> x, y = pa.scalar(7.8), pa.scalar(9.3)
+ >>> pc.multiply(x, y)
+ <pyarrow.DoubleScalar: 72.54>
+
+These functions can do more than just element-by-element operations.
+Here is an example of sorting a table:
+
+ >>> import pyarrow as pa
+ >>> import pyarrow.compute as pc
+ >>> t = pa.table({'x':[1,2,3],'y':[3,2,1]})
+ >>> i = pc.sort_indices(t, sort_keys=[('y', 'ascending')])
+ >>> i
+ <pyarrow.lib.UInt64Array object at 0x7fcee5df75e8>
+ [
+ 2,
+ 1,
+ 0
+ ]
+
+
+
+.. seealso::
+
+ :ref:`Available compute functions (C++ documentation) <compute-function-list>`.
diff --git a/src/arrow/docs/source/python/csv.rst b/src/arrow/docs/source/python/csv.rst
new file mode 100644
index 000000000..1724c63f4
--- /dev/null
+++ b/src/arrow/docs/source/python/csv.rst
@@ -0,0 +1,170 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow.csv
+.. _csv:
+
+Reading and Writing CSV files
+=============================
+
+Arrow supports reading and writing columnar data from/to CSV files.
+The features currently offered are the following:
+
+* multi-threaded or single-threaded reading
+* automatic decompression of input files (based on the filename extension,
+ such as ``my_data.csv.gz``)
+* fetching column names from the first row in the CSV file
+* column-wise type inference and conversion to one of ``null``, ``int64``,
+ ``float64``, ``date32``, ``time32[s]``, ``timestamp[s]``, ``timestamp[ns]``,
+ ``string`` or ``binary`` data
+* opportunistic dictionary encoding of ``string`` and ``binary`` columns
+ (disabled by default)
+* detecting various spellings of null values such as ``NaN`` or ``#N/A``
+* writing CSV files with options to configure the exact output format
+
+Usage
+-----
+
+CSV reading and writing functionality is available through the
+:mod:`pyarrow.csv` module. In many cases, you will simply call the
+:func:`read_csv` function with the file path you want to read from::
+
+ >>> from pyarrow import csv
+ >>> fn = 'tips.csv.gz'
+ >>> table = csv.read_csv(fn)
+ >>> table
+ pyarrow.Table
+ total_bill: double
+ tip: double
+ sex: string
+ smoker: string
+ day: string
+ time: string
+ size: int64
+ >>> len(table)
+ 244
+ >>> df = table.to_pandas()
+ >>> df.head()
+ total_bill tip sex smoker day time size
+ 0 16.99 1.01 Female No Sun Dinner 2
+ 1 10.34 1.66 Male No Sun Dinner 3
+ 2 21.01 3.50 Male No Sun Dinner 3
+ 3 23.68 3.31 Male No Sun Dinner 2
+ 4 24.59 3.61 Female No Sun Dinner 4
+
+To write CSV files, just call :func:`write_csv` with a
+:class:`pyarrow.RecordBatch` or :class:`pyarrow.Table` and a path or
+file-like object::
+
+ >>> import pyarrow as pa
+ >>> import pyarrow.csv as csv
+ >>> csv.write_csv(table, "tips.csv")
+ >>> with pa.CompressedOutputStream("tips.csv.gz", "gzip") as out:
+ ... csv.write_csv(table, out)
+
+.. note:: The writer does not yet support all Arrow types.
+
+Customized parsing
+------------------
+
+To alter the default parsing settings in case of reading CSV files with an
+unusual structure, you should create a :class:`ParseOptions` instance
+and pass it to :func:`read_csv`.
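+
+For example, a sketch of reading a semicolon-delimited file (the delimiter and
+the ``data.csv`` file name are placeholders for your own data)::
+
+   import pyarrow.csv as csv
+
+   parse_options = csv.ParseOptions(delimiter=';')
+   table = csv.read_csv('data.csv', parse_options=parse_options)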
+
+Customized conversion
+---------------------
+
+To alter how CSV data is converted to Arrow types and data, you should create
+a :class:`ConvertOptions` instance and pass it to :func:`read_csv`::
+
+ import pyarrow as pa
+ import pyarrow.csv as csv
+
+ table = csv.read_csv('tips.csv.gz', convert_options=pa.csv.ConvertOptions(
+ column_types={
+ 'total_bill': pa.decimal128(precision=10, scale=2),
+ 'tip': pa.decimal128(precision=10, scale=2),
+ }
+ ))
+
+
+Incremental reading
+-------------------
+
+For memory-constrained environments, it is also possible to read a CSV file
+one batch at a time, using :func:`open_csv`.
+
+There are a few caveats:
+
+1. For now, the incremental reader is always single-threaded (regardless of
+ :attr:`ReadOptions.use_threads`)
+
+2. Type inference is done on the first block and types are frozen afterwards;
+ to make sure the right data types are inferred, either set
+ :attr:`ReadOptions.block_size` to a large enough value, or use
+ :attr:`ConvertOptions.column_types` to set the desired data types explicitly.
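+
+As a minimal sketch (reusing the ``tips.csv.gz`` file from the example above)::
+
+   import pyarrow.csv as csv
+
+   reader = csv.open_csv('tips.csv.gz')
+   for batch in reader:
+       # each item is a pyarrow.RecordBatch decoded from the next block
+       print(batch.num_rows)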
+
+Character encoding
+------------------
+
+By default, CSV files are expected to be encoded in UTF8. Non-UTF8 data
+is accepted for ``binary`` columns. The encoding can be changed using
+the :class:`ReadOptions` class.
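+
+For example, a sketch of reading a Latin-1 encoded file (the file name is a
+placeholder)::
+
+   import pyarrow.csv as csv
+
+   read_options = csv.ReadOptions(encoding='latin-1')
+   table = csv.read_csv('data_latin1.csv', read_options=read_options)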
+
+Customized writing
+------------------
+
+To alter the default write settings in case of writing CSV files with
+different conventions, you can create a :class:`WriteOptions` instance and
+pass it to :func:`write_csv`::
+
+ >>> import pyarrow as pa
+ >>> import pyarrow.csv as csv
+ >>> # Omit the header row (include_header=True is the default)
+ >>> options = csv.WriteOptions(include_header=False)
+ >>> csv.write_csv(table, "data.csv", options)
+
+Incremental writing
+-------------------
+
+To write CSV files one batch at a time, create a :class:`CSVWriter`. This
+requires the output (a path or file-like object), the schema of the data to
+be written, and optionally write options as described above::
+
+ >>> import pyarrow as pa
+ >>> import pyarrow.csv as csv
+   >>> with csv.CSVWriter("data.csv", table.schema) as writer:
+   ...     writer.write_table(table)
+
+Performance
+-----------
+
+Due to the structure of CSV files, one cannot expect the same levels of
+performance as when reading dedicated binary formats like
+:ref:`Parquet <Parquet>`. Nevertheless, Arrow strives to reduce the
+overhead of reading CSV files. A reasonable expectation is at least
+100 MB/s per core on a performant desktop or laptop computer (measured
+in source CSV bytes, not target Arrow data bytes).
+
+Performance options can be controlled through the :class:`ReadOptions` class.
+Multi-threaded reading is the default for highest performance, distributing
+the workload efficiently over all available cores.
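+
+For example, a sketch of forcing single-threaded reads (the ``tips.csv.gz``
+file is the one used in the examples above)::
+
+   import pyarrow.csv as csv
+
+   read_options = csv.ReadOptions(use_threads=False)
+   table = csv.read_csv('tips.csv.gz', read_options=read_options)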
+
+.. note::
+ The number of concurrent threads is automatically inferred by Arrow.
+ You can inspect and change it using the :func:`~pyarrow.cpu_count()`
+ and :func:`~pyarrow.set_cpu_count()` functions, respectively.
diff --git a/src/arrow/docs/source/python/cuda.rst b/src/arrow/docs/source/python/cuda.rst
new file mode 100644
index 000000000..b0150c1c5
--- /dev/null
+++ b/src/arrow/docs/source/python/cuda.rst
@@ -0,0 +1,159 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow.cuda
+
+CUDA Integration
+================
+
+Arrow is not limited to CPU buffers (located in the computer's main memory,
+also named "host memory"). It also has provisions for accessing buffers
+located on a CUDA-capable GPU device (in "device memory").
+
+.. note::
+ This functionality is optional and must have been enabled at build time.
+ If this is not done by your package manager, you might have to build Arrow
+ yourself.
+
+CUDA Contexts
+-------------
+
+A CUDA context represents access to a particular CUDA-capable device.
+For example, here is how to create a CUDA context accessing CUDA device number 0::
+
+ >>> from pyarrow import cuda
+ >>> ctx = cuda.Context(0)
+ >>>
+
+CUDA Buffers
+------------
+
+A CUDA buffer can be created by copying data from host memory to the memory
+of a CUDA device, using the :meth:`Context.buffer_from_data` method.
+The source data can be any Python buffer-like object, including Arrow buffers::
+
+ >>> import numpy as np
+ >>> arr = np.arange(4, dtype=np.int32)
+ >>> arr.nbytes
+ 16
+ >>> cuda_buf = ctx.buffer_from_data(arr)
+ >>> type(cuda_buf)
+ pyarrow._cuda.CudaBuffer
+ >>> cuda_buf.size # The buffer's size in bytes
+ 16
+ >>> cuda_buf.address # The buffer's address in device memory
+ 30088364544
+ >>> cuda_buf.context.device_number
+ 0
+
+Conversely, you can copy a CUDA buffer back to host memory, getting a regular
+CPU buffer::
+
+ >>> buf = cuda_buf.copy_to_host()
+ >>> type(buf)
+ pyarrow.lib.Buffer
+ >>> np.frombuffer(buf, dtype=np.int32)
+ array([0, 1, 2, 3], dtype=int32)
+
+.. warning::
+ Many Arrow functions expect a CPU buffer but will not check the buffer's
+ actual type. You will get a crash if you pass a CUDA buffer to such a
+ function::
+
+ >>> pa.py_buffer(b"x" * 16).equals(cuda_buf)
+ Segmentation fault
+
+Numba Integration
+-----------------
+
+There is not much you can do directly with Arrow CUDA buffers from Python,
+but they support interoperation with `Numba <https://numba.pydata.org/>`_,
+a JIT compiler which can turn Python code into optimized CUDA kernels.
+
+Arrow to Numba
+~~~~~~~~~~~~~~
+
+First let's define a Numba CUDA kernel operating on an ``int32`` array. Here,
+we will simply increment each array element (assuming the array is writable)::
+
+ import numba.cuda
+
+ @numba.cuda.jit
+ def increment_by_one(an_array):
+ pos = numba.cuda.grid(1)
+ if pos < an_array.size:
+ an_array[pos] += 1
+
+Then we need to wrap our CUDA buffer into a Numba "device array" with the right
+array metadata (shape, strides and datatype). This is necessary so that Numba
+can identify the array's characteristics and compile the kernel with the
+appropriate type declarations.
+
+In this case the metadata can simply be taken from the original NumPy array.
+Note that the GPU data isn't copied, just pointed to::
+
+ >>> from numba.cuda.cudadrv.devicearray import DeviceNDArray
+ >>> device_arr = DeviceNDArray(arr.shape, arr.strides, arr.dtype, gpu_data=cuda_buf.to_numba())
+
+(ideally we could have defined an Arrow array in CPU memory, copied it to CUDA
+memory without losing type information, and then invoked the Numba kernel on it
+without constructing the DeviceNDArray by hand; this is not yet possible)
+
+Finally we can run the Numba CUDA kernel on the Numba device array (here
+with a 16x16 grid size)::
+
+ >>> increment_by_one[16, 16](device_arr)
+
+And the results can be checked by copying back the CUDA buffer to CPU memory::
+
+ >>> np.frombuffer(cuda_buf.copy_to_host(), dtype=np.int32)
+ array([1, 2, 3, 4], dtype=int32)
+
+Numba to Arrow
+~~~~~~~~~~~~~~
+
+Conversely, a Numba-created device array can be viewed as an Arrow CUDA buffer,
+using the :meth:`CudaBuffer.from_numba` factory method.
+
+For the sake of example, let's first create a Numba device array::
+
+ >>> arr = np.arange(10, 14, dtype=np.int32)
+ >>> arr
+ array([10, 11, 12, 13], dtype=int32)
+ >>> device_arr = numba.cuda.to_device(arr)
+
+Then we can create a CUDA buffer pointing to the device array's memory.
+We don't need to pass a CUDA context explicitly this time: the appropriate
+CUDA context is automatically retrieved and adapted from the Numba object.
+
+::
+
+ >>> cuda_buf = cuda.CudaBuffer.from_numba(device_arr.gpu_data)
+ >>> cuda_buf.size
+ 16
+ >>> cuda_buf.address
+ 30088364032
+ >>> cuda_buf.context.device_number
+ 0
+
+Of course, we can copy the CUDA buffer back to host memory::
+
+ >>> np.frombuffer(cuda_buf.copy_to_host(), dtype=np.int32)
+ array([10, 11, 12, 13], dtype=int32)
+
+.. seealso::
+ Documentation for Numba's `CUDA support <https://numba.pydata.org/numba-doc/latest/cuda/index.html>`_.
diff --git a/src/arrow/docs/source/python/data.rst b/src/arrow/docs/source/python/data.rst
new file mode 100644
index 000000000..b8a90039f
--- /dev/null
+++ b/src/arrow/docs/source/python/data.rst
@@ -0,0 +1,434 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow
+.. _data:
+
+Data Types and In-Memory Data Model
+===================================
+
+Apache Arrow defines columnar array data structures by composing type metadata
+with memory buffers, like the ones explained in the documentation on
+:ref:`Memory and IO <io>`. These data structures are exposed in Python through
+a series of interrelated classes:
+
+* **Type Metadata**: Instances of ``pyarrow.DataType``, which describe a logical
+ array type
+* **Schemas**: Instances of ``pyarrow.Schema``, which describe a named
+ collection of types. These can be thought of as the column types in a
+ table-like object.
+* **Arrays**: Instances of ``pyarrow.Array``, which are atomic, contiguous
+ columnar data structures composed from Arrow Buffer objects
+* **Record Batches**: Instances of ``pyarrow.RecordBatch``, which are a
+ collection of Array objects with a particular Schema
+* **Tables**: Instances of ``pyarrow.Table``, a logical table data structure in
+ which each column consists of one or more ``pyarrow.Array`` objects of the
+ same type.
+
+We will examine these in the sections below in a series of examples.
+
+.. _data.types:
+
+Type Metadata
+-------------
+
+Apache Arrow defines language agnostic column-oriented data structures for
+array data. These include:
+
+* **Fixed-length primitive types**: numbers, booleans, date and times, fixed
+  size binary, decimals, and other values that fit into a given number of bits
+* **Variable-length primitive types**: binary, string
+* **Nested types**: list, struct, and union
+* **Dictionary type**: An encoded categorical type (more on this later)
+
+Each logical data type in Arrow has a corresponding factory function for
+creating an instance of that type object in Python:
+
+.. ipython:: python
+
+ import pyarrow as pa
+ t1 = pa.int32()
+ t2 = pa.string()
+ t3 = pa.binary()
+ t4 = pa.binary(10)
+ t5 = pa.timestamp('ms')
+
+ t1
+ print(t1)
+ print(t4)
+ print(t5)
+
+We use the name **logical type** because the **physical** storage may be the
+same for one or more types. For example, ``int64``, ``float64``, and
+``timestamp[ms]`` all occupy 64 bits per value.
+
+These objects are `metadata`; they are used for describing the data in arrays,
+schemas, and record batches. In Python, they can be used in functions where the
+input data (e.g. Python objects) may be coerced to more than one Arrow type.
+
+The :class:`~pyarrow.Field` type is a type plus a name and optional
+user-defined metadata:
+
+.. ipython:: python
+
+ f0 = pa.field('int32_field', t1)
+ f0
+ f0.name
+ f0.type
+
+Arrow supports **nested value types** like list, struct, and union. When
+creating these, you must pass types or fields to indicate the data types of the
+types' children. For example, we can define a list of int32 values with:
+
+.. ipython:: python
+
+ t6 = pa.list_(t1)
+ t6
+
+A `struct` is a collection of named fields:
+
+.. ipython:: python
+
+ fields = [
+ pa.field('s0', t1),
+ pa.field('s1', t2),
+ pa.field('s2', t4),
+ pa.field('s3', t6),
+ ]
+
+ t7 = pa.struct(fields)
+ print(t7)
+
+For convenience, you can pass ``(name, type)`` tuples directly instead of
+:class:`~pyarrow.Field` instances:
+
+.. ipython:: python
+
+ t8 = pa.struct([('s0', t1), ('s1', t2), ('s2', t4), ('s3', t6)])
+ print(t8)
+ t8 == t7
+
+
+See :ref:`Data Types API <api.types>` for a full listing of data type
+functions.
+
+.. _data.schema:
+
+Schemas
+-------
+
+The :class:`~pyarrow.Schema` type is similar to the ``struct`` array type; it
+defines the column names and types in a record batch or table data
+structure. The :func:`pyarrow.schema` factory function makes new Schema objects in
+Python:
+
+.. ipython:: python
+
+ my_schema = pa.schema([('field0', t1),
+ ('field1', t2),
+ ('field2', t4),
+ ('field3', t6)])
+ my_schema
+
+In some applications, you may not create schemas directly, only using the ones
+that are embedded in :ref:`IPC messages <ipc>`.
+
+.. _data.array:
+
+Arrays
+------
+
+For each data type, there is an accompanying array data structure for holding
+memory buffers that define a single contiguous chunk of columnar array
+data. When you are using PyArrow, this data may come from IPC tools, though it
+can also be created from various types of Python sequences (lists, NumPy
+arrays, pandas data).
+
+A simple way to create arrays is with ``pyarrow.array``, which is similar to
+the ``numpy.array`` function. By default PyArrow will infer the data type
+for you:
+
+.. ipython:: python
+
+ arr = pa.array([1, 2, None, 3])
+ arr
+
+But you may also pass a specific data type to override type inference:
+
+.. ipython:: python
+
+ pa.array([1, 2], type=pa.uint16())
+
+The array's ``type`` attribute is the corresponding piece of type metadata:
+
+.. ipython:: python
+
+ arr.type
+
+Each in-memory array has a known length and null count (which will be 0 if
+there are no null values):
+
+.. ipython:: python
+
+ len(arr)
+ arr.null_count
+
+Scalar values can be selected with normal indexing. ``pyarrow.array`` converts
+``None`` values to Arrow nulls; we return the special ``pyarrow.NA`` value for
+nulls:
+
+.. ipython:: python
+
+ arr[0]
+ arr[2]
+
+Arrow data is immutable, so values can be selected but not assigned.
+
+Arrays can be sliced without copying:
+
+.. ipython:: python
+
+ arr[1:3]
+
+None values and NAN handling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+As mentioned in the above section, the Python object ``None`` is always
+converted to an Arrow null element on the conversion to ``pyarrow.Array``. A
+float NaN value, represented by either the Python object ``float('nan')`` or
+``numpy.nan``, is normally converted to a *valid* float value during the
+conversion. If an integer input containing ``np.nan`` is supplied to
+``pyarrow.array``, a ``ValueError`` is raised.
+
+For better compatibility with Pandas, we support interpreting NaN values as
+null elements. This is enabled automatically for all ``from_pandas`` functions
+and can be enabled for the other conversion functions by passing
+``from_pandas=True`` as a function parameter.
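+
+For example, a quick sketch of the difference:
+
+.. ipython:: python
+
+   import numpy as np
+   # NaN stays a (valid) float value by default ...
+   pa.array([1.0, np.nan])
+   # ... but is interpreted as a null element with from_pandas=True
+   pa.array([1.0, np.nan], from_pandas=True)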
+
+List arrays
+~~~~~~~~~~~
+
+``pyarrow.array`` is able to infer the type of simple nested data structures
+like lists:
+
+.. ipython:: python
+
+ nested_arr = pa.array([[], None, [1, 2], [None, 1]])
+ print(nested_arr.type)
+
+Struct arrays
+~~~~~~~~~~~~~
+
+For other kinds of nested arrays, such as struct arrays, you currently need
+to pass the type explicitly. Struct arrays can be initialized from a
+sequence of Python dicts or tuples:
+
+.. ipython:: python
+
+ ty = pa.struct([('x', pa.int8()),
+ ('y', pa.bool_())])
+ pa.array([{'x': 1, 'y': True}, {'x': 2, 'y': False}], type=ty)
+ pa.array([(3, True), (4, False)], type=ty)
+
+When initializing a struct array, nulls are allowed both at the struct
+level and at the individual field level. If initializing from a sequence
+of Python dicts, a missing dict key is handled as a null value:
+
+.. ipython:: python
+
+ pa.array([{'x': 1}, None, {'y': None}], type=ty)
+
+You can also construct a struct array from existing arrays for each of the
+struct's components. In this case, data storage will be shared with the
+individual arrays, and no copy is involved:
+
+.. ipython:: python
+
+ xs = pa.array([5, 6, 7], type=pa.int16())
+ ys = pa.array([False, True, True])
+ arr = pa.StructArray.from_arrays((xs, ys), names=('x', 'y'))
+ arr.type
+ arr
+
+Union arrays
+~~~~~~~~~~~~
+
+The union type represents a nested array type where each value can be one
+(and only one) of a set of possible types. There are two possible
+storage types for union arrays: sparse and dense.
+
+In a sparse union array, each of the child arrays has the same length
+as the resulting union array. They are accompanied by an ``int8`` "types"
+array that tells, for each value, from which child array it must be
+selected:
+
+.. ipython:: python
+
+ xs = pa.array([5, 6, 7])
+ ys = pa.array([False, False, True])
+ types = pa.array([0, 1, 1], type=pa.int8())
+ union_arr = pa.UnionArray.from_sparse(types, [xs, ys])
+ union_arr.type
+ union_arr
+
+In a dense union array, you also pass, in addition to the ``int8`` "types"
+array, an ``int32`` "offsets" array that tells, for each value, at
+which offset in the selected child array it can be found:
+
+.. ipython:: python
+
+ xs = pa.array([5, 6, 7])
+ ys = pa.array([False, True])
+ types = pa.array([0, 1, 1, 0, 0], type=pa.int8())
+ offsets = pa.array([0, 0, 1, 1, 2], type=pa.int32())
+ union_arr = pa.UnionArray.from_dense(types, offsets, [xs, ys])
+ union_arr.type
+ union_arr
+
+.. _data.dictionary:
+
+Dictionary Arrays
+~~~~~~~~~~~~~~~~~
+
+The **Dictionary** type in PyArrow is a special array type that is similar to a
+factor in R or a ``pandas.Categorical``. It enables one or more record batches
+in a file or stream to transmit integer *indices* referencing a shared
+**dictionary** containing the distinct values in the logical array. This is
+commonly used with strings to save memory and improve performance.
+
+The way that dictionaries are handled in the Apache Arrow format and the way
+they appear in C++ and Python is slightly different. We define a special
+:class:`~.DictionaryArray` type with a corresponding dictionary type. Let's
+consider an example:
+
+.. ipython:: python
+
+ indices = pa.array([0, 1, 0, 1, 2, 0, None, 2])
+ dictionary = pa.array(['foo', 'bar', 'baz'])
+
+ dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
+ dict_array
+
+Here we have:
+
+.. ipython:: python
+
+ print(dict_array.type)
+ dict_array.indices
+ dict_array.dictionary
+
+When using :class:`~.DictionaryArray` with pandas, the analogue is
+``pandas.Categorical`` (more on this later):
+
+.. ipython:: python
+
+ dict_array.to_pandas()
+
+.. _data.record_batch:
+
+Record Batches
+--------------
+
+A **Record Batch** in Apache Arrow is a collection of equal-length array
+instances. Let's consider a collection of arrays:
+
+.. ipython:: python
+
+ data = [
+ pa.array([1, 2, 3, 4]),
+ pa.array(['foo', 'bar', 'baz', None]),
+ pa.array([True, None, False, True])
+ ]
+
+A record batch can be created from this list of arrays using
+``RecordBatch.from_arrays``:
+
+.. ipython:: python
+
+ batch = pa.RecordBatch.from_arrays(data, ['f0', 'f1', 'f2'])
+ batch.num_columns
+ batch.num_rows
+ batch.schema
+
+ batch[1]
+
+A record batch can be sliced without copying memory like an array:
+
+.. ipython:: python
+
+ batch2 = batch.slice(1, 3)
+ batch2[1]
+
+.. _data.table:
+
+Tables
+------
+
+The PyArrow :class:`~.Table` type is not part of the Apache Arrow
+specification, but is rather a tool to help with wrangling multiple record
+batches and array pieces as a single logical dataset. As a relevant example, we
+may receive multiple small record batches in a socket stream, then need to
+concatenate them into contiguous memory for use in NumPy or pandas. The Table
+object makes this efficient without requiring additional memory copying.
+
+Considering the record batch we created above, we can create a Table containing
+one or more copies of the batch using ``Table.from_batches``:
+
+.. ipython:: python
+
+ batches = [batch] * 5
+ table = pa.Table.from_batches(batches)
+ table
+ table.num_rows
+
+The table's columns are instances of :class:`~.ChunkedArray`, which is a
+container for one or more arrays of the same type.
+
+.. ipython:: python
+
+ c = table[0]
+ c
+ c.num_chunks
+ c.chunk(0)
+
+As you'll see in the :ref:`pandas section <pandas_interop>`, we can convert
+these objects to contiguous NumPy arrays for use in pandas:
+
+.. ipython:: python
+
+ c.to_pandas()
+
+Multiple tables can also be concatenated together to form a single table using
+``pyarrow.concat_tables``, if the schemas are equal:
+
+.. ipython:: python
+
+ tables = [table] * 2
+ table_all = pa.concat_tables(tables)
+ table_all.num_rows
+ c = table_all[0]
+ c.num_chunks
+
+This is similar to ``Table.from_batches``, but uses tables as input instead of
+record batches. Record batches can be made into tables, but not the other way
+around, so if your data is already in table form, then use
+``pyarrow.concat_tables``.
+
+Custom Schema and Field Metadata
+--------------------------------
+
+TODO
diff --git a/src/arrow/docs/source/python/dataset.rst b/src/arrow/docs/source/python/dataset.rst
new file mode 100644
index 000000000..e2d8c900b
--- /dev/null
+++ b/src/arrow/docs/source/python/dataset.rst
@@ -0,0 +1,626 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow.dataset
+
+.. _dataset:
+
+Tabular Datasets
+================
+
+.. warning::
+
+ The ``pyarrow.dataset`` module is experimental (specifically the classes),
+ and a stable API is not yet guaranteed.
+
+The ``pyarrow.dataset`` module provides functionality to efficiently work with
+tabular, potentially larger than memory, and multi-file datasets. This includes:
+
+* A unified interface that supports different sources and file formats
+ (Parquet, ORC, Feather / Arrow IPC, and CSV files) and different file systems
+ (local, cloud).
+* Discovery of sources (crawling directories, handling directory-based
+  partitioned datasets, basic schema normalization, ...)
+* Optimized reading with predicate pushdown (filtering rows), projection
+ (selecting and deriving columns), and optionally parallel reading.
+
+Currently, only Parquet, ORC, Feather / Arrow IPC, and CSV files are
+supported. The goal is to expand this in the future to other file formats and
+data sources (e.g. database connections).
+
+For those familiar with the existing :class:`pyarrow.parquet.ParquetDataset` for
+reading Parquet datasets: ``pyarrow.dataset``'s goal is similar but not specific
+to the Parquet format and not tied to Python: the same datasets API is exposed
+in the R bindings for Arrow. In addition, ``pyarrow.dataset`` boasts improved
+performance and new features (e.g. filtering within files rather than only on
+partition keys).
+
+
+Reading Datasets
+----------------
+
+.. TODO Full blown example with NYC taxi data to show off, afterwards explain all parts:
+
+For the examples below, let's create a small dataset consisting
+of a directory with two parquet files:
+
+.. ipython:: python
+
+ import tempfile
+ import pathlib
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+ import numpy as np
+
+ base = pathlib.Path(tempfile.gettempdir())
+ (base / "parquet_dataset").mkdir(exist_ok=True)
+
+ # creating an Arrow Table
+ table = pa.table({'a': range(10), 'b': np.random.randn(10), 'c': [1, 2] * 5})
+
+ # writing it into two parquet files
+ pq.write_table(table.slice(0, 5), base / "parquet_dataset/data1.parquet")
+ pq.write_table(table.slice(5, 10), base / "parquet_dataset/data2.parquet")
+
+Dataset discovery
+~~~~~~~~~~~~~~~~~
+
+A :class:`Dataset` object can be created with the :func:`dataset` function. We
+can pass it the path to the directory containing the data files:
+
+.. ipython:: python
+
+ import pyarrow.dataset as ds
+ dataset = ds.dataset(base / "parquet_dataset", format="parquet")
+ dataset
+
+In addition to searching a base directory, :func:`dataset` accepts a path to a
+single file or a list of file paths.
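+
+For example, a sketch of passing the two files from above explicitly instead of
+the directory:
+
+.. code-block:: python
+
+   dataset = ds.dataset(
+       [str(base / "parquet_dataset" / "data1.parquet"),
+        str(base / "parquet_dataset" / "data2.parquet")],
+       format="parquet")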
+
+Creating a :class:`Dataset` object does not begin reading the data itself. If
+needed, it only crawls the directory to find all the files:
+
+.. ipython:: python
+
+ dataset.files
+
+... and infers the dataset's schema (by default from the first file):
+
+.. ipython:: python
+
+ print(dataset.schema.to_string(show_field_metadata=False))
+
+Using the :meth:`Dataset.to_table` method we can read the dataset (or a portion
+of it) into a pyarrow Table (note that depending on the size of your dataset
+this can require a lot of memory, see below on filtering / iterative loading):
+
+.. ipython:: python
+
+ dataset.to_table()
+ # converting to pandas to see the contents of the scanned table
+ dataset.to_table().to_pandas()
+
+Reading different file formats
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The above examples use Parquet files as dataset sources but the Dataset API
+provides a consistent interface across multiple file formats and filesystems.
+Currently, Parquet, ORC, Feather / Arrow IPC, and CSV file formats are
+supported; more formats are planned in the future.
+
+If we save the table as Feather files instead of Parquet files:
+
+.. ipython:: python
+
+ import pyarrow.feather as feather
+
+ feather.write_feather(table, base / "data.feather")
+
+…then we can read the Feather file using the same functions, but specifying
+``format="feather"``:
+
+.. ipython:: python
+
+ dataset = ds.dataset(base / "data.feather", format="feather")
+ dataset.to_table().to_pandas().head()
+
+Customizing file formats
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+The format name as a string, like::
+
+ ds.dataset(..., format="parquet")
+
+is shorthand for a default-constructed :class:`ParquetFileFormat`::
+
+ ds.dataset(..., format=ds.ParquetFileFormat())
+
+The :class:`FileFormat` objects can be customized using keywords. For example::
+
+ parquet_format = ds.ParquetFileFormat(read_options={'dictionary_columns': ['a']})
+ ds.dataset(..., format=parquet_format)
+
+This will configure column ``"a"`` to be dictionary-encoded on scan.
+
+Filtering data
+--------------
+
+To avoid reading all data when only needing a subset, the ``columns`` and
+``filter`` keywords can be used.
+
+The ``columns`` keyword can be used to only read the specified columns:
+
+.. ipython:: python
+
+ dataset = ds.dataset(base / "parquet_dataset", format="parquet")
+ dataset.to_table(columns=['a', 'b']).to_pandas()
+
+With the ``filter`` keyword, rows which do not match the filter predicate will
+not be included in the returned table. The keyword expects a boolean
+:class:`Expression` referencing at least one of the columns:
+
+.. ipython:: python
+
+ dataset.to_table(filter=ds.field('a') >= 7).to_pandas()
+ dataset.to_table(filter=ds.field('c') == 2).to_pandas()
+
+The easiest way to construct those :class:`Expression` objects is by using the
+:func:`field` helper function. Any column - not just partition columns - can be
+referenced using the :func:`field` function (which creates a
+:class:`FieldExpression`). Operator overloads are provided to compose filters
+including the comparisons (equal, larger/less than, etc), set membership
+testing, and boolean combinations (``&``, ``|``, ``~``):
+
+.. ipython:: python
+
+ ds.field('a') != 3
+ ds.field('a').isin([1, 2, 3])
+ (ds.field('a') > ds.field('b')) & (ds.field('b') > 1)
+
+Note that :class:`Expression` objects can **not** be combined by python logical
+operators ``and``, ``or`` and ``not``.
+
+Projecting columns
+------------------
+
+The ``columns`` keyword can be used to read a subset of the columns of the
+dataset by passing it a list of column names. The keyword can also be used
+for more complex projections in combination with expressions.
+
+In this case, we pass it a dictionary with the keys being the resulting
+column names and the values the expression that is used to construct the column
+values:
+
+.. ipython:: python
+
+ projection = {
+ "a_renamed": ds.field("a"),
+ "b_as_float32": ds.field("b").cast("float32"),
+ "c_1": ds.field("c") == 1,
+ }
+ dataset.to_table(columns=projection).to_pandas().head()
+
+The dictionary also determines the column selection (only the keys in the
+dictionary will be present as columns in the resulting table). If you want
+to include a derived column in *addition* to the existing columns, you can
+build up the dictionary from the dataset schema:
+
+.. ipython:: python
+
+ projection = {col: ds.field(col) for col in dataset.schema.names}
+ projection.update({"b_large": ds.field("b") > 1})
+ dataset.to_table(columns=projection).to_pandas().head()
+
+
+Reading partitioned data
+------------------------
+
+Above, a dataset consisting of a flat directory with files was shown. However, a
+dataset can exploit a nested directory structure defining a partitioned dataset,
+where the sub-directory names hold information about which subset of the data is
+stored in that directory.
+
+For example, a dataset partitioned by year and month may look like this on disk:
+
+.. code-block:: text
+
+ dataset_name/
+ year=2007/
+ month=01/
+ data0.parquet
+ data1.parquet
+ ...
+ month=02/
+ data0.parquet
+ data1.parquet
+ ...
+ month=03/
+ ...
+ year=2008/
+ month=01/
+ ...
+ ...
+
+The above partitioning scheme is using "/key=value/" directory names, as found
+in Apache Hive.
+
+Let's create a small partitioned dataset. The :func:`~pyarrow.parquet.write_to_dataset`
+function can write such hive-like partitioned datasets.
+
+.. ipython:: python
+
+ table = pa.table({'a': range(10), 'b': np.random.randn(10), 'c': [1, 2] * 5,
+ 'part': ['a'] * 5 + ['b'] * 5})
+ pq.write_to_dataset(table, str(base / "parquet_dataset_partitioned"),
+ partition_cols=['part'])
+
+The above created a directory with two subdirectories ("part=a" and "part=b"),
+and the Parquet files written in those directories no longer include the "part"
+column.
+
+Reading this dataset with :func:`dataset`, we now specify that the dataset
+should use a hive-like partitioning scheme with the ``partitioning`` keyword:
+
+.. ipython:: python
+
+ dataset = ds.dataset(str(base / "parquet_dataset_partitioned"), format="parquet",
+ partitioning="hive")
+ dataset.files
+
+Although the partition fields are not included in the actual Parquet files,
+they will be added back to the resulting table when scanning this dataset:
+
+.. ipython:: python
+
+ dataset.to_table().to_pandas().head(3)
+
+We can now filter on the partition keys, which avoids loading files
+altogether if they do not match the filter:
+
+.. ipython:: python
+
+ dataset.to_table(filter=ds.field("part") == "b").to_pandas()
+
+
+Different partitioning schemes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The above example uses a hive-like directory scheme, such as "/year=2009/month=11/day=15".
+We specified this by passing the ``partitioning="hive"`` keyword. In this case,
+the types of the partition keys are inferred from the file paths.
+
+It is also possible to explicitly define the schema of the partition keys
+using the :func:`partitioning` function. For example:
+
+.. code-block:: python
+
+ part = ds.partitioning(
+ pa.schema([("year", pa.int16()), ("month", pa.int8()), ("day", pa.int32())]),
+ flavor="hive"
+ )
+ dataset = ds.dataset(..., partitioning=part)
+
+"Directory partitioning" is also supported, where the segments in the file path
+represent the values of the partition keys without including the names (the
+field names are implicit in the segment's index). For example, given field names
+"year", "month", and "day", one path might be "/2019/11/15".
+
+Since the names are not included in the file paths, these must be specified
+when constructing a directory partitioning:
+
+.. code-block:: python
+
+ part = ds.partitioning(field_names=["year", "month", "day"])
+
+Directory partitioning also supports providing a full schema rather than inferring
+types from file paths.
+
+
+Reading from cloud storage
+--------------------------
+
+In addition to local files, pyarrow also supports reading from cloud storage.
+Currently, :class:`HDFS <pyarrow.fs.HadoopFileSystem>` and
+:class:`Amazon S3-compatible storage <pyarrow.fs.S3FileSystem>` are supported.
+
+When passing a file URI, the file system will be inferred. For example,
+specifying a S3 path:
+
+.. code-block:: python
+
+ dataset = ds.dataset("s3://ursa-labs-taxi-data/", partitioning=["year", "month"])
+
+Typically, you will want to customize the connection parameters, and then
+a file system object can be created and passed to the ``filesystem`` keyword:
+
+.. code-block:: python
+
+ from pyarrow import fs
+
+ s3 = fs.S3FileSystem(region="us-east-2")
+ dataset = ds.dataset("ursa-labs-taxi-data/", filesystem=s3,
+ partitioning=["year", "month"])
+
+The currently available classes are :class:`~pyarrow.fs.S3FileSystem` and
+:class:`~pyarrow.fs.HadoopFileSystem`. See the :ref:`filesystem` docs for more
+details.
+
+
+Reading from Minio
+------------------
+
+In addition to cloud storage, pyarrow also supports reading from a
+`MinIO <https://github.com/minio/minio>`_ object storage instance emulating S3
+APIs. Paired with `toxiproxy <https://github.com/shopify/toxiproxy>`_, this is
+useful for testing or benchmarking.
+
+.. code-block:: python
+
+ from pyarrow import fs
+
+ # By default, MinIO will listen for unencrypted HTTP traffic.
+ minio = fs.S3FileSystem(scheme="http", endpoint="localhost:9000")
+ dataset = ds.dataset("ursa-labs-taxi-data/", filesystem=minio,
+ partitioning=["year", "month"])
+
+
+Working with Parquet Datasets
+-----------------------------
+
+While the Datasets API provides a unified interface to different file formats,
+some specific methods exist for Parquet Datasets.
+
+Some processing frameworks such as Dask (optionally) use a ``_metadata`` file
+with partitioned datasets which includes information about the schema and the
+row group metadata of the full dataset. Using such a file makes creating a
+Parquet Dataset more efficient, since it avoids inferring the schema and
+crawling the directories for all Parquet files (this is especially helpful
+for filesystems where accessing files is expensive). The
+:func:`parquet_dataset` function allows us to create a Dataset from a partitioned
+dataset with a ``_metadata`` file:
+
+.. code-block:: python
+
+ dataset = ds.parquet_dataset("/path/to/dir/_metadata")
+
+By default, the constructed :class:`Dataset` object for Parquet datasets maps
+each fragment to a single Parquet file. If you want fragments mapping to each
+row group of a Parquet file, you can use the ``split_by_row_group()`` method of
+the fragments:
+
+.. code-block:: python
+
+ fragments = list(dataset.get_fragments())
+ fragments[0].split_by_row_group()
+
+This method returns a list of new Fragments mapping to each row group of
+the original Fragment (Parquet file). Both ``get_fragments()`` and
+``split_by_row_group()`` accept an optional filter expression to get a
+filtered list of fragments.
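+
+For example, a sketch of selecting fragments for a single partition first and
+then splitting them by row group (assuming a dataset partitioned on a ``year``
+field):
+
+.. code-block:: python
+
+   fragments = list(dataset.get_fragments(filter=ds.field("year") == 2019))
+   row_group_fragments = [rg for f in fragments for rg in f.split_by_row_group()]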
+
+
+Manual specification of the Dataset
+-----------------------------------
+
+The :func:`dataset` function allows easy creation of a Dataset viewing a directory,
+crawling all subdirectories for files and partitioning information. However
+sometimes discovery is not required and the dataset's files and partitions
+are already known (for example, when this information is stored in metadata).
+In this case it is possible to create a Dataset explicitly without any
+automatic discovery or inference.
+
+For the example here, we are going to use a dataset where the file names contain
+additional partitioning information:
+
+.. ipython:: python
+
+ # creating a dummy dataset: directory with two files
+ table = pa.table({'col1': range(3), 'col2': np.random.randn(3)})
+ (base / "parquet_dataset_manual").mkdir(exist_ok=True)
+ pq.write_table(table, base / "parquet_dataset_manual" / "data_2018.parquet")
+ pq.write_table(table, base / "parquet_dataset_manual" / "data_2019.parquet")
+
+To create a Dataset from a list of files, we need to specify the paths, schema,
+format, filesystem, and partition expressions manually:
+
+.. ipython:: python
+
+ from pyarrow import fs
+
+ schema = pa.schema([("year", pa.int64()), ("col1", pa.int64()), ("col2", pa.float64())])
+
+ dataset = ds.FileSystemDataset.from_paths(
+ ["data_2018.parquet", "data_2019.parquet"], schema=schema, format=ds.ParquetFileFormat(),
+ filesystem=fs.SubTreeFileSystem(str(base / "parquet_dataset_manual"), fs.LocalFileSystem()),
+ partitions=[ds.field('year') == 2018, ds.field('year') == 2019])
+
+Since we specified the "partition expressions" for our files, this information
+is materialized as columns when reading the data and can be used for filtering:
+
+.. ipython:: python
+
+ dataset.to_table().to_pandas()
+ dataset.to_table(filter=ds.field('year') == 2019).to_pandas()
+
+Another benefit of manually listing the files is that the order of the files
+controls the order of the data. When performing an ordered read (or a read to
+a table) then the rows returned will match the order of the files given. This
+only applies when the dataset is constructed with a list of files. There
+are no order guarantees given when the files are instead discovered by scanning
+a directory.
+
+Iterative (out of core or streaming) reads
+------------------------------------------
+
+The previous examples have demonstrated how to read the data into a table using :func:`~Dataset.to_table`. This is
+useful if the dataset is small or there is only a small amount of data that needs to
+be read. The dataset API contains additional methods to read and process large amounts
+of data in a streaming fashion.
+
+The easiest way to do this is to use the method :meth:`Dataset.to_batches`. This
+method returns an iterator of record batches. For example, we can use this method to
+calculate the average of a column without loading the entire column into memory:
+
+.. ipython:: python
+
+ import pyarrow.compute as pc
+
+ col2_sum = 0
+ count = 0
+ for batch in dataset.to_batches(columns=["col2"], filter=~ds.field("col2").is_null()):
+ col2_sum += pc.sum(batch.column("col2")).as_py()
+ count += batch.num_rows
+ mean_a = col2_sum/count
+
+Customizing the batch size
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+An iterative read of a dataset is often called a "scan" of the dataset and pyarrow
+uses an object called a :class:`Scanner` to do this. A Scanner is created for you
+automatically by the :func:`~Dataset.to_table` and :func:`~Dataset.to_batches` methods of the dataset.
+Any arguments you pass to these methods will be passed on to the Scanner constructor.
+
+One of those parameters is the ``batch_size``. This controls the maximum size of the
+batches returned by the scanner. Batches can still be smaller than the ``batch_size``
+if the dataset consists of small files or those files themselves consist of small
+row groups. For example, a parquet file with 10,000 rows per row group will yield
+batches with, at most, 10,000 rows unless the ``batch_size`` is set to a smaller value.
+
+The default batch size is one million rows and this is typically a good default but
+you may want to customize it if you are reading a large number of columns.
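+
+For example, a sketch of requesting smaller batches when scanning a wide dataset:
+
+.. code-block:: python
+
+   # cap each record batch at 64Ki rows instead of the default one million
+   for batch in dataset.to_batches(batch_size=64 * 1024):
+       process(batch)  # `process` is a placeholder for your own logic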
+
+Writing Datasets
+----------------
+
+The dataset API also simplifies writing data to a dataset using
+:func:`write_dataset`. This can be useful when you want to partition your data
+or you need to write a large amount of data. A basic dataset write is similar
+to writing a table except that you specify a directory instead of a filename.
+
+.. ipython:: python
+
+ base = pathlib.Path(tempfile.gettempdir())
+ dataset_root = base / "sample_dataset"
+ dataset_root.mkdir(exist_ok=True)
+
+ table = pa.table({"a": range(10), "b": np.random.randn(10), "c": [1, 2] * 5})
+ ds.write_dataset(table, dataset_root, format="parquet")
+
+The above example will create a single file named part-0.parquet in our sample_dataset
+directory.
+
+.. warning::
+
+ If you run the example again it will replace the existing part-0.parquet file.
+ Appending files to an existing dataset requires specifying a new
+ ``basename_template`` for each call to ``ds.write_dataset``
+ to avoid overwrite.
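+
+For example, a sketch of a follow-up write that keeps the existing files by
+using a different (hypothetical) ``basename_template``:
+
+.. code-block:: python
+
+   ds.write_dataset(table, dataset_root, format="parquet",
+                    basename_template="part-second-{i}.parquet")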
+
+Writing partitioned data
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+A partitioning object can be used to specify how your output data should be partitioned.
+This uses the same kind of partitioning objects we used for reading datasets. To write
+our above data out to a partitioned directory we only need to specify how we want the
+dataset to be partitioned. For example:
+
+.. ipython:: python
+
+ part = ds.partitioning(
+ pa.schema([("c", pa.int16())]), flavor="hive"
+ )
+ ds.write_dataset(table, dataset_root, format="parquet", partitioning=part)
+
+This will create two files. Half our data will be in the dataset_root/c=1 directory and
+the other half will be in the dataset_root/c=2 directory.
+
+Writing large amounts of data
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The above examples wrote data from a table. If you are writing a large amount of data
+you may not be able to load everything into a single in-memory table. Fortunately, the
+:func:`write_dataset` function also accepts an iterable of record batches. This makes it really
+simple, for example, to repartition a large dataset without loading the entire dataset
+into memory:
+
+.. ipython:: python
+
+ old_part = ds.partitioning(
+ pa.schema([("c", pa.int16())]), flavor="hive"
+ )
+ new_part = ds.partitioning(
+ pa.schema([("c", pa.int16())]), flavor=None
+ )
+ input_dataset = ds.dataset(dataset_root, partitioning=old_part)
+ new_root = base / "repartitioned_dataset"
+ # A scanner can act as an iterator of record batches but you could also receive
+ # data from the network (e.g. via flight), from your own scanning, or from any
+ # other method that yields record batches. In addition, you can pass a dataset
+ # into write_dataset directly but this method is useful if you want to customize
+ # the scanner (e.g. to filter the input dataset or set a maximum batch size)
+ scanner = input_dataset.scanner(use_async=True)
+
+ ds.write_dataset(scanner, new_root, format="parquet", partitioning=new_part)
+
+After the above example runs, our data will be in the repartitioned_dataset/1 and
+repartitioned_dataset/2 directories. In this simple example we are not changing the
+structure of the data (only the directory naming scheme) but you could also use this
+mechanism to change which columns are used to partition the dataset. This is useful
+when you expect to query your data in specific ways and you can use partitioning to
+reduce the amount of data you need to read.
+
+Customizing & inspecting written files
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By default the dataset API will create files named "part-i.format" where "i" is an integer
+generated during the write and "format" is the file format specified in the write_dataset
+call. For simple datasets it may be possible to know which files will be created, but for
+larger or partitioned datasets it is not so easy. The ``file_visitor`` keyword can be used
+to supply a visitor that will be called as each file is created:
+
+.. ipython:: python
+
+ def file_visitor(written_file):
+ print(f"path={written_file.path}")
+ print(f"metadata={written_file.metadata}")
+
+.. ipython:: python
+
+ ds.write_dataset(table, base / "dataset_visited", format="parquet", partitioning=part,
+ file_visitor=file_visitor)
+
+This allows you to collect the filenames that belong to the dataset and store them elsewhere,
+which can be useful when you want to avoid scanning directories the next time you need to read
+the data. It can also be used to generate the ``_metadata`` index file used by other tools such
+as Dask or Spark to create an index of the dataset.
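+
+For example, a sketch of collecting the written paths into a list (the
+``dataset_collected`` directory name is only illustrative):
+
+.. code-block:: python
+
+    written_paths = []
+
+    def collecting_visitor(written_file):
+        written_paths.append(written_file.path)
+
+    ds.write_dataset(table, base / "dataset_collected", format="parquet",
+                     partitioning=part, file_visitor=collecting_visitor)
+    # written_paths now lists every file created by this write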
+
+Configuring format-specific parameters during a write
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In addition to the common options shared by all formats there are also format-specific options
+that are unique to a particular format. For example, to allow truncated timestamps while writing
+Parquet files:
+
+.. ipython:: python
+
+ dataset_root = base / "sample_dataset2"
+ dataset_root.mkdir(exist_ok=True)
+
+ parquet_format = ds.ParquetFileFormat()
+ write_options = parquet_format.make_write_options(allow_truncated_timestamps=True)
+ ds.write_dataset(table, dataset_root, format="parquet", partitioning=part,
+ file_options=write_options)
diff --git a/src/arrow/docs/source/python/extending.rst b/src/arrow/docs/source/python/extending.rst
new file mode 100644
index 000000000..5e00e7905
--- /dev/null
+++ b/src/arrow/docs/source/python/extending.rst
@@ -0,0 +1,483 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow
+.. cpp:namespace:: arrow
+
+.. _extending:
+
+Using pyarrow from C++ and Cython Code
+======================================
+
+pyarrow provides both a Cython and C++ API, allowing your own native code
+to interact with pyarrow objects.
+
+C++ API
+-------
+
+.. default-domain:: cpp
+
+The Arrow C++ header files are bundled with a pyarrow installation.
+To get the absolute path to this directory (like ``numpy.get_include()``), use:
+
+.. code-block:: python
+
+ import pyarrow as pa
+ pa.get_include()
+
+Assuming the path above is on your compiler's include path, the pyarrow API
+can be included using the following directive:
+
+.. code-block:: cpp
+
+ #include <arrow/python/pyarrow.h>
+
+This will not include other parts of the Arrow API, which you will need
+to include yourself (for example ``arrow/api.h``).
+
+When building C extensions that use the Arrow C++ libraries, you must add
+appropriate linker flags. We have provided functions ``pyarrow.get_libraries``
+and ``pyarrow.get_library_dirs`` which return a list of library names and
+likely library install locations (if you installed pyarrow with pip or
+conda). These must be included when declaring your C extensions with
+setuptools (see below).
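+
+A quick way to inspect what these helpers return for your installation (the exact
+values depend on how and where pyarrow was installed) is:
+
+.. code-block:: python
+
+    import pyarrow as pa
+
+    print(pa.get_include())       # directory containing the bundled Arrow headers
+    print(pa.get_libraries())     # library names to pass to the linker
+    print(pa.get_library_dirs())  # directories that likely contain those libraries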
+
+Initializing the API
+~~~~~~~~~~~~~~~~~~~~
+
+.. function:: int import_pyarrow()
+
+ Initialize inner pointers of the pyarrow API. On success, 0 is
+ returned. Otherwise, -1 is returned and a Python exception is set.
+
+ It is mandatory to call this function before calling any other function
+ in the pyarrow C++ API. Failing to do so will likely lead to crashes.
+
+Wrapping and Unwrapping
+~~~~~~~~~~~~~~~~~~~~~~~
+
+pyarrow provides the following functions to go back and forth between
+Python wrappers (as exposed by the pyarrow Python API) and the underlying
+C++ objects.
+
+.. function:: bool arrow::py::is_array(PyObject* obj)
+
+ Return whether *obj* wraps an Arrow C++ :class:`Array` pointer;
+ in other words, whether *obj* is a :py:class:`pyarrow.Array` instance.
+
+.. function:: bool arrow::py::is_batch(PyObject* obj)
+
+ Return whether *obj* wraps an Arrow C++ :class:`RecordBatch` pointer;
+ in other words, whether *obj* is a :py:class:`pyarrow.RecordBatch` instance.
+
+.. function:: bool arrow::py::is_buffer(PyObject* obj)
+
+ Return whether *obj* wraps an Arrow C++ :class:`Buffer` pointer;
+ in other words, whether *obj* is a :py:class:`pyarrow.Buffer` instance.
+
+.. function:: bool arrow::py::is_data_type(PyObject* obj)
+
+ Return whether *obj* wraps an Arrow C++ :class:`DataType` pointer;
+ in other words, whether *obj* is a :py:class:`pyarrow.DataType` instance.
+
+.. function:: bool arrow::py::is_field(PyObject* obj)
+
+ Return whether *obj* wraps an Arrow C++ :class:`Field` pointer;
+ in other words, whether *obj* is a :py:class:`pyarrow.Field` instance.
+
+.. function:: bool arrow::py::is_scalar(PyObject* obj)
+
+ Return whether *obj* wraps an Arrow C++ :class:`Scalar` pointer;
+ in other words, whether *obj* is a :py:class:`pyarrow.Scalar` instance.
+
+.. function:: bool arrow::py::is_schema(PyObject* obj)
+
+ Return whether *obj* wraps an Arrow C++ :class:`Schema` pointer;
+ in other words, whether *obj* is a :py:class:`pyarrow.Schema` instance.
+
+.. function:: bool arrow::py::is_table(PyObject* obj)
+
+ Return whether *obj* wraps an Arrow C++ :class:`Table` pointer;
+ in other words, whether *obj* is a :py:class:`pyarrow.Table` instance.
+
+.. function:: bool arrow::py::is_tensor(PyObject* obj)
+
+ Return whether *obj* wraps an Arrow C++ :class:`Tensor` pointer;
+ in other words, whether *obj* is a :py:class:`pyarrow.Tensor` instance.
+
+.. function:: bool arrow::py::is_sparse_coo_tensor(PyObject* obj)
+
+ Return whether *obj* wraps an Arrow C++ :type:`SparseCOOTensor` pointer;
+ in other words, whether *obj* is a :py:class:`pyarrow.SparseCOOTensor` instance.
+
+.. function:: bool arrow::py::is_sparse_csc_matrix(PyObject* obj)
+
+ Return whether *obj* wraps an Arrow C++ :type:`SparseCSCMatrix` pointer;
+ in other words, whether *obj* is a :py:class:`pyarrow.SparseCSCMatrix` instance.
+
+.. function:: bool arrow::py::is_sparse_csf_tensor(PyObject* obj)
+
+ Return whether *obj* wraps an Arrow C++ :type:`SparseCSFTensor` pointer;
+ in other words, whether *obj* is a :py:class:`pyarrow.SparseCSFTensor` instance.
+
+.. function:: bool arrow::py::is_sparse_csr_matrix(PyObject* obj)
+
+ Return whether *obj* wraps an Arrow C++ :type:`SparseCSRMatrix` pointer;
+ in other words, whether *obj* is a :py:class:`pyarrow.SparseCSRMatrix` instance.
+
+
+The following functions expect a pyarrow object, unwrap the underlying
+Arrow C++ API pointer, and return it as a :class:`Result` object. An error
+may be returned if the input object doesn't have the expected type.
+
+.. function:: Result<std::shared_ptr<Array>> arrow::py::unwrap_array(PyObject* obj)
+
+ Unwrap and return the Arrow C++ :class:`Array` pointer from *obj*.
+
+.. function:: Result<std::shared_ptr<RecordBatch>> arrow::py::unwrap_batch(PyObject* obj)
+
+ Unwrap and return the Arrow C++ :class:`RecordBatch` pointer from *obj*.
+
+.. function:: Result<std::shared_ptr<Buffer>> arrow::py::unwrap_buffer(PyObject* obj)
+
+ Unwrap and return the Arrow C++ :class:`Buffer` pointer from *obj*.
+
+.. function:: Result<std::shared_ptr<DataType>> arrow::py::unwrap_data_type(PyObject* obj)
+
+ Unwrap and return the Arrow C++ :class:`DataType` pointer from *obj*.
+
+.. function:: Result<std::shared_ptr<Field>> arrow::py::unwrap_field(PyObject* obj)
+
+ Unwrap and return the Arrow C++ :class:`Field` pointer from *obj*.
+
+.. function:: Result<std::shared_ptr<Scalar>> arrow::py::unwrap_scalar(PyObject* obj)
+
+ Unwrap and return the Arrow C++ :class:`Scalar` pointer from *obj*.
+
+.. function:: Result<std::shared_ptr<Schema>> arrow::py::unwrap_schema(PyObject* obj)
+
+ Unwrap and return the Arrow C++ :class:`Schema` pointer from *obj*.
+
+.. function:: Result<std::shared_ptr<Table>> arrow::py::unwrap_table(PyObject* obj)
+
+ Unwrap and return the Arrow C++ :class:`Table` pointer from *obj*.
+
+.. function:: Result<std::shared_ptr<Tensor>> arrow::py::unwrap_tensor(PyObject* obj)
+
+ Unwrap and return the Arrow C++ :class:`Tensor` pointer from *obj*.
+
+.. function:: Result<std::shared_ptr<SparseCOOTensor>> arrow::py::unwrap_sparse_coo_tensor(PyObject* obj)
+
+ Unwrap and return the Arrow C++ :type:`SparseCOOTensor` pointer from *obj*.
+
+.. function:: Result<std::shared_ptr<SparseCSCMatrix>> arrow::py::unwrap_sparse_csc_matrix(PyObject* obj)
+
+ Unwrap and return the Arrow C++ :type:`SparseCSCMatrix` pointer from *obj*.
+
+.. function:: Result<std::shared_ptr<SparseCSFTensor>> arrow::py::unwrap_sparse_csf_tensor(PyObject* obj)
+
+ Unwrap and return the Arrow C++ :type:`SparseCSFTensor` pointer from *obj*.
+
+.. function:: Result<std::shared_ptr<SparseCSRMatrix>> arrow::py::unwrap_sparse_csr_matrix(PyObject* obj)
+
+ Unwrap and return the Arrow C++ :type:`SparseCSRMatrix` pointer from *obj*.
+
+
+The following functions take an Arrow C++ API pointer and wrap it in a
+pyarrow object of the corresponding type. A new reference is returned.
+On error, NULL is returned and a Python exception is set.
+
+.. function:: PyObject* arrow::py::wrap_array(const std::shared_ptr<Array>& array)
+
+ Wrap the Arrow C++ *array* in a :py:class:`pyarrow.Array` instance.
+
+.. function:: PyObject* arrow::py::wrap_batch(const std::shared_ptr<RecordBatch>& batch)
+
+ Wrap the Arrow C++ record *batch* in a :py:class:`pyarrow.RecordBatch` instance.
+
+.. function:: PyObject* arrow::py::wrap_buffer(const std::shared_ptr<Buffer>& buffer)
+
+ Wrap the Arrow C++ *buffer* in a :py:class:`pyarrow.Buffer` instance.
+
+.. function:: PyObject* arrow::py::wrap_data_type(const std::shared_ptr<DataType>& data_type)
+
+ Wrap the Arrow C++ *data_type* in a :py:class:`pyarrow.DataType` instance.
+
+.. function:: PyObject* arrow::py::wrap_field(const std::shared_ptr<Field>& field)
+
+ Wrap the Arrow C++ *field* in a :py:class:`pyarrow.Field` instance.
+
+.. function:: PyObject* arrow::py::wrap_scalar(const std::shared_ptr<Scalar>& scalar)
+
+ Wrap the Arrow C++ *scalar* in a :py:class:`pyarrow.Scalar` instance.
+
+.. function:: PyObject* arrow::py::wrap_schema(const std::shared_ptr<Schema>& schema)
+
+ Wrap the Arrow C++ *schema* in a :py:class:`pyarrow.Schema` instance.
+
+.. function:: PyObject* arrow::py::wrap_table(const std::shared_ptr<Table>& table)
+
+ Wrap the Arrow C++ *table* in a :py:class:`pyarrow.Table` instance.
+
+.. function:: PyObject* arrow::py::wrap_tensor(const std::shared_ptr<Tensor>& tensor)
+
+ Wrap the Arrow C++ *tensor* in a :py:class:`pyarrow.Tensor` instance.
+
+.. function:: PyObject* arrow::py::wrap_sparse_coo_tensor(const std::shared_ptr<SparseCOOTensor>& sparse_tensor)
+
+ Wrap the Arrow C++ *sparse_tensor* in a :py:class:`pyarrow.SparseCOOTensor` instance.
+
+.. function:: PyObject* arrow::py::wrap_sparse_csc_matrix(const std::shared_ptr<SparseCSCMatrix>& sparse_tensor)
+
+ Wrap the Arrow C++ *sparse_tensor* in a :py:class:`pyarrow.SparseCSCMatrix` instance.
+
+.. function:: PyObject* arrow::py::wrap_sparse_csf_tensor(const std::shared_ptr<SparseCSFTensor>& sparse_tensor)
+
+ Wrap the Arrow C++ *sparse_tensor* in a :py:class:`pyarrow.SparseCSFTensor` instance.
+
+.. function:: PyObject* arrow::py::wrap_sparse_csr_matrix(const std::shared_ptr<SparseCSRMatrix>& sparse_tensor)
+
+ Wrap the Arrow C++ *sparse_tensor* in a :py:class:`pyarrow.SparseCSRMatrix` instance.
+
+
+Cython API
+----------
+
+.. default-domain:: py
+
+The Cython API more or less mirrors the C++ API, but the calling convention
+can be different as required by Cython. In Cython, you don't need to
+initialize the API as that will be handled automatically by the ``cimport``
+directive.
+
+.. note::
+   Classes from the Arrow C++ API are renamed when exposed in Cython, to
+   avoid name clashes with the corresponding Python classes. For example,
+ C++ Arrow arrays have the ``CArray`` type and ``Array`` is the
+ corresponding Python wrapper class.
+
+Wrapping and Unwrapping
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The following functions expect a pyarrow object, unwrap the underlying
+Arrow C++ API pointer, and return it. NULL is returned (without setting
+an exception) if the input is not of the right type.
+
+.. function:: pyarrow_unwrap_array(obj) -> shared_ptr[CArray]
+
+ Unwrap the Arrow C++ :cpp:class:`Array` pointer from *obj*.
+
+.. function:: pyarrow_unwrap_batch(obj) -> shared_ptr[CRecordBatch]
+
+ Unwrap the Arrow C++ :cpp:class:`RecordBatch` pointer from *obj*.
+
+.. function:: pyarrow_unwrap_buffer(obj) -> shared_ptr[CBuffer]
+
+ Unwrap the Arrow C++ :cpp:class:`Buffer` pointer from *obj*.
+
+.. function:: pyarrow_unwrap_data_type(obj) -> shared_ptr[CDataType]
+
+   Unwrap the Arrow C++ :cpp:class:`DataType` pointer from *obj*.
+
+.. function:: pyarrow_unwrap_field(obj) -> shared_ptr[CField]
+
+ Unwrap the Arrow C++ :cpp:class:`Field` pointer from *obj*.
+
+.. function:: pyarrow_unwrap_scalar(obj) -> shared_ptr[CScalar]
+
+ Unwrap the Arrow C++ :cpp:class:`Scalar` pointer from *obj*.
+
+.. function:: pyarrow_unwrap_schema(obj) -> shared_ptr[CSchema]
+
+ Unwrap the Arrow C++ :cpp:class:`Schema` pointer from *obj*.
+
+.. function:: pyarrow_unwrap_table(obj) -> shared_ptr[CTable]
+
+ Unwrap the Arrow C++ :cpp:class:`Table` pointer from *obj*.
+
+.. function:: pyarrow_unwrap_tensor(obj) -> shared_ptr[CTensor]
+
+ Unwrap the Arrow C++ :cpp:class:`Tensor` pointer from *obj*.
+
+.. function:: pyarrow_unwrap_sparse_coo_tensor(obj) -> shared_ptr[CSparseCOOTensor]
+
+ Unwrap the Arrow C++ :cpp:type:`SparseCOOTensor` pointer from *obj*.
+
+.. function:: pyarrow_unwrap_sparse_csc_matrix(obj) -> shared_ptr[CSparseCSCMatrix]
+
+ Unwrap the Arrow C++ :cpp:type:`SparseCSCMatrix` pointer from *obj*.
+
+.. function:: pyarrow_unwrap_sparse_csf_tensor(obj) -> shared_ptr[CSparseCSFTensor]
+
+ Unwrap the Arrow C++ :cpp:type:`SparseCSFTensor` pointer from *obj*.
+
+.. function:: pyarrow_unwrap_sparse_csr_matrix(obj) -> shared_ptr[CSparseCSRMatrix]
+
+ Unwrap the Arrow C++ :cpp:type:`SparseCSRMatrix` pointer from *obj*.
+
+
+The following functions take an Arrow C++ API pointer and wrap it in a
+pyarrow object of the corresponding type. An exception is raised on error.
+
+.. function:: pyarrow_wrap_array(const shared_ptr[CArray]& array) -> object
+
+ Wrap the Arrow C++ *array* in a Python :class:`pyarrow.Array` instance.
+
+.. function:: pyarrow_wrap_batch(const shared_ptr[CRecordBatch]& batch) -> object
+
+ Wrap the Arrow C++ record *batch* in a Python :class:`pyarrow.RecordBatch` instance.
+
+.. function:: pyarrow_wrap_buffer(const shared_ptr[CBuffer]& buffer) -> object
+
+ Wrap the Arrow C++ *buffer* in a Python :class:`pyarrow.Buffer` instance.
+
+.. function:: pyarrow_wrap_data_type(const shared_ptr[CDataType]& data_type) -> object
+
+ Wrap the Arrow C++ *data_type* in a Python :class:`pyarrow.DataType` instance.
+
+.. function:: pyarrow_wrap_field(const shared_ptr[CField]& field) -> object
+
+ Wrap the Arrow C++ *field* in a Python :class:`pyarrow.Field` instance.
+
+.. function:: pyarrow_wrap_resizable_buffer(const shared_ptr[CResizableBuffer]& buffer) -> object
+
+ Wrap the Arrow C++ resizable *buffer* in a Python :class:`pyarrow.ResizableBuffer` instance.
+
+.. function:: pyarrow_wrap_scalar(const shared_ptr[CScalar]& scalar) -> object
+
+ Wrap the Arrow C++ *scalar* in a Python :class:`pyarrow.Scalar` instance.
+
+.. function:: pyarrow_wrap_schema(const shared_ptr[CSchema]& schema) -> object
+
+ Wrap the Arrow C++ *schema* in a Python :class:`pyarrow.Schema` instance.
+
+.. function:: pyarrow_wrap_table(const shared_ptr[CTable]& table) -> object
+
+ Wrap the Arrow C++ *table* in a Python :class:`pyarrow.Table` instance.
+
+.. function:: pyarrow_wrap_tensor(const shared_ptr[CTensor]& tensor) -> object
+
+ Wrap the Arrow C++ *tensor* in a Python :class:`pyarrow.Tensor` instance.
+
+.. function:: pyarrow_wrap_sparse_coo_tensor(const shared_ptr[CSparseCOOTensor]& sparse_tensor) -> object
+
+ Wrap the Arrow C++ *COO sparse tensor* in a Python :class:`pyarrow.SparseCOOTensor` instance.
+
+.. function:: pyarrow_wrap_sparse_csc_matrix(const shared_ptr[CSparseCSCMatrix]& sparse_tensor) -> object
+
+ Wrap the Arrow C++ *CSC sparse tensor* in a Python :class:`pyarrow.SparseCSCMatrix` instance.
+
+.. function:: pyarrow_wrap_sparse_csf_tensor(const shared_ptr[CSparseCSFTensor]& sparse_tensor) -> object
+
+   Wrap the Arrow C++ *CSF sparse tensor* in a Python :class:`pyarrow.SparseCSFTensor` instance.
+
+.. function:: pyarrow_wrap_sparse_csr_matrix(const shared_ptr[CSparseCSRMatrix]& sparse_tensor) -> object
+
+ Wrap the Arrow C++ *CSR sparse tensor* in a Python :class:`pyarrow.SparseCSRMatrix` instance.
+
+
+Example
+~~~~~~~
+
+The following Cython module shows how to unwrap a Python object and call
+the underlying C++ object's API.
+
+.. code-block:: cython
+
+ # distutils: language=c++
+
+ from pyarrow.lib cimport *
+
+
+ def get_array_length(obj):
+ # Just an example function accessing both the pyarrow Cython API
+ # and the Arrow C++ API
+ cdef shared_ptr[CArray] arr = pyarrow_unwrap_array(obj)
+ if arr.get() == NULL:
+ raise TypeError("not an array")
+ return arr.get().length()
+
+To build this module, you will need a slightly customized ``setup.py`` file
+(this is assuming the file above is named ``example.pyx``):
+
+.. code-block:: python
+
+ from setuptools import setup
+ from Cython.Build import cythonize
+
+ import os
+ import numpy as np
+ import pyarrow as pa
+
+
+ ext_modules = cythonize("example.pyx")
+
+ for ext in ext_modules:
+ # The Numpy C headers are currently required
+ ext.include_dirs.append(np.get_include())
+ ext.include_dirs.append(pa.get_include())
+ ext.libraries.extend(pa.get_libraries())
+ ext.library_dirs.extend(pa.get_library_dirs())
+
+ if os.name == 'posix':
+ ext.extra_compile_args.append('-std=c++11')
+
+ # Try uncommenting the following line on Linux
+ # if you get weird linker errors or runtime crashes
+ # ext.define_macros.append(("_GLIBCXX_USE_CXX11_ABI", "0"))
+
+
+ setup(ext_modules=ext_modules)
+
+
+Compile the extension:
+
+.. code-block:: bash
+
+ python setup.py build_ext --inplace
+
+Building Extensions against PyPI Wheels
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The Python wheels have the Arrow C++ libraries bundled in the top level
+``pyarrow/`` install directory. On Linux and macOS, these libraries have an ABI
+tag like ``libarrow.so.17`` which means that linking with ``-larrow`` using the
+linker path provided by ``pyarrow.get_library_dirs()`` will not work right out
+of the box. To fix this, you must run ``pyarrow.create_library_symlinks()``
+once as a user with write access to the directory where pyarrow is
+installed. This function will attempt to create symlinks like
+``pyarrow/libarrow.so``. For example:
+
+.. code-block:: bash
+
+ pip install pyarrow
+ python -c "import pyarrow; pyarrow.create_library_symlinks()"
+
+Toolchain Compatibility (Linux)
+"""""""""""""""""""""""""""""""
+
+The Python wheels for Linux are built using the
+`PyPA manylinux images <https://quay.io/organization/pypa>`_ which use
+the CentOS ``devtoolset-8`` or ``devtoolset-9`` depending on which manylinux
+wheel version (2010 or 2014) is being used. In addition to the other notes
+above, if you are compiling C++ using these shared libraries, you will need
+to make sure you use a compatible toolchain as well or you might see a
+segfault at runtime.
+
+Also, if you encounter errors when linking or loading the library, consider
+setting the ``_GLIBCXX_USE_CXX11_ABI`` preprocessor macro to ``0``
+(for example by adding ``-D_GLIBCXX_USE_CXX11_ABI=0`` to ``CFLAGS``).
diff --git a/src/arrow/docs/source/python/extending_types.rst b/src/arrow/docs/source/python/extending_types.rst
new file mode 100644
index 000000000..689724a4a
--- /dev/null
+++ b/src/arrow/docs/source/python/extending_types.rst
@@ -0,0 +1,324 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow
+.. _extending_types:
+
+Extending pyarrow
+=================
+
+Controlling conversion to pyarrow.Array with the ``__arrow_array__`` protocol
+-----------------------------------------------------------------------------
+
+The :func:`pyarrow.array` function has built-in support for Python sequences,
+numpy arrays and pandas 1D objects (Series, Index, Categorical, ..) to convert
+those to Arrow arrays. This can be extended for other array-like objects
+by implementing the ``__arrow_array__`` method (similar to numpy's ``__array__``
+protocol).
+
+For example, to support conversion of your duck array class to an Arrow array,
+define the ``__arrow_array__`` method to return an Arrow array::
+
+ class MyDuckArray:
+
+ ...
+
+ def __arrow_array__(self, type=None):
+ # convert the underlying array values to a pyarrow Array
+ import pyarrow
+ return pyarrow.array(..., type=type)
+
+The ``__arrow_array__`` method takes an optional ``type`` keyword which is passed
+through from :func:`pyarrow.array`. The method is allowed to return either
+a :class:`~pyarrow.Array` or a :class:`~pyarrow.ChunkedArray`.
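+
+As a minimal, self-contained sketch (the ``DiceRolls`` class is purely
+hypothetical), any object implementing this protocol can then be passed
+directly to :func:`pyarrow.array`::
+
+    import numpy as np
+    import pyarrow as pa
+
+    class DiceRolls:
+        """A hypothetical array-like class wrapping a numpy array."""
+
+        def __init__(self, values):
+            self._values = np.asarray(values, dtype="int64")
+
+        def __arrow_array__(self, type=None):
+            # pyarrow.array() calls this instead of iterating over the object
+            return pa.array(self._values, type=type)
+
+    arr = pa.array(DiceRolls([1, 3, 6]))
+    # arr is now a pyarrow.Array of type int64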
+
+
+Defining extension types ("user-defined types")
+-----------------------------------------------
+
+Arrow has the notion of extension types in the metadata specification as a
+way to extend the built-in types. This is done by annotating any of the
+built-in Arrow logical types (the "storage type") with a custom type name and
+optional serialized representation ("ARROW:extension:name" and
+"ARROW:extension:metadata" keys in the Field’s custom_metadata of an IPC
+message).
+See the :ref:`format_metadata_extension_types` section of the metadata
+specification for more details.
+
+Pyarrow allows you to define such extension types from Python.
+
+There are currently two ways:
+
+* Subclassing :class:`PyExtensionType`: the (de)serialization is based on pickle.
+ This is a good option for an extension type that is only used from Python.
+* Subclassing :class:`ExtensionType`: this allows you to give a custom
+  Python-independent name and serialized metadata, which can potentially be
+  recognized by other (non-Python) Arrow implementations such as PySpark.
+
+For example, we could define a custom UUID type for 128-bit numbers which can
+be represented as a ``FixedSizeBinary`` type with 16 bytes.
+Using the first approach, we create a ``UuidType`` subclass, and implement the
+``__reduce__`` method to ensure the class can be properly pickled::
+
+ class UuidType(pa.PyExtensionType):
+
+ def __init__(self):
+ pa.PyExtensionType.__init__(self, pa.binary(16))
+
+ def __reduce__(self):
+ return UuidType, ()
+
+This can now be used to create arrays and tables holding the extension type::
+
+ >>> uuid_type = UuidType()
+ >>> uuid_type.extension_name
+ 'arrow.py_extension_type'
+ >>> uuid_type.storage_type
+ FixedSizeBinaryType(fixed_size_binary[16])
+
+ >>> import uuid
+ >>> storage_array = pa.array([uuid.uuid4().bytes for _ in range(4)], pa.binary(16))
+ >>> arr = pa.ExtensionArray.from_storage(uuid_type, storage_array)
+ >>> arr
+ <pyarrow.lib.ExtensionArray object at 0x7f75c2f300a0>
+ [
+ A6861959108644B797664AEEE686B682,
+ 718747F48E5F4058A7261E2B6B228BE8,
+ 7FE201227D624D96A5CD8639DEF2A68B,
+ C6CA8C7F95744BFD9462A40B3F57A86C
+ ]
+
+This array can be included in RecordBatches, sent over IPC and received in
+another Python process. The custom UUID type will be preserved there, as long
+as the definition of the class is available (the type can be unpickled).
+
+For example, creating a RecordBatch and writing it to a stream using the
+IPC protocol::
+
+ >>> batch = pa.RecordBatch.from_arrays([arr], ["ext"])
+ >>> sink = pa.BufferOutputStream()
+ >>> with pa.RecordBatchStreamWriter(sink, batch.schema) as writer:
+ ... writer.write_batch(batch)
+ >>> buf = sink.getvalue()
+
+and then reading it back yields the proper type::
+
+ >>> with pa.ipc.open_stream(buf) as reader:
+ ... result = reader.read_all()
+ >>> result.column('ext').type
+ UuidType(extension<arrow.py_extension_type>)
+
+We can define the same type using the other option::
+
+ class UuidType(pa.ExtensionType):
+
+ def __init__(self):
+ pa.ExtensionType.__init__(self, pa.binary(16), "my_package.uuid")
+
+ def __arrow_ext_serialize__(self):
+ # since we don't have a parameterized type, we don't need extra
+ # metadata to be deserialized
+ return b''
+
+ @classmethod
+ def __arrow_ext_deserialize__(self, storage_type, serialized):
+ # return an instance of this subclass given the serialized
+ # metadata.
+ return UuidType()
+
+This is a slightly longer implementation (you need to implement the special
+methods ``__arrow_ext_serialize__`` and ``__arrow_ext_deserialize__``), and the
+extension type needs to be registered to be received through IPC (using
+:func:`register_extension_type`), but it now has
+a unique name::
+
+ >>> uuid_type = UuidType()
+ >>> uuid_type.extension_name
+ 'my_package.uuid'
+
+ >>> pa.register_extension_type(uuid_type)
+
+The receiving application doesn't need to be Python but can still recognize
+the extension type as a "uuid" type, if it has implemented its own extension
+type to receive it.
+If the type is not registered in the receiving application, it will fall back
+to the storage type.
+
+Parameterized extension type
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The above example used a fixed storage type with no further metadata. But
+more flexible, parameterized extension types are also possible.
+
+The example given here implements an extension type for the `pandas "period"
+data type <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-span-representation>`__,
+representing time spans (e.g., a frequency of a day, a month, a quarter, etc.).
+It is stored as an int64 array which is interpreted as the number of time spans
+of the given frequency since 1970.
+
+::
+
+ class PeriodType(pa.ExtensionType):
+
+ def __init__(self, freq):
+ # attributes need to be set first before calling
+ # super init (as that calls serialize)
+ self._freq = freq
+ pa.ExtensionType.__init__(self, pa.int64(), 'my_package.period')
+
+ @property
+ def freq(self):
+ return self._freq
+
+ def __arrow_ext_serialize__(self):
+ return "freq={}".format(self.freq).encode()
+
+ @classmethod
+ def __arrow_ext_deserialize__(cls, storage_type, serialized):
+ # return an instance of this subclass given the serialized
+ # metadata.
+ serialized = serialized.decode()
+ assert serialized.startswith("freq=")
+ freq = serialized.split('=')[1]
+ return PeriodType(freq)
+
+Here, we make sure to store all the information that is needed to reconstruct
+the instance (in the ``__arrow_ext_deserialize__`` class method) in the
+serialized metadata: in this case, the frequency string.
+
+Note that, once created, the data type instance is considered immutable. If,
+in the example above, the ``freq`` parameter were to change after instantiation,
+the reconstruction of the type instance after IPC would be incorrect.
+For this reason, the ``freq`` parameter is stored in a private
+attribute with a public read-only property to access it.
+
+Parameterized extension types are also possible with the pickle-based approach
+of subclassing :class:`PyExtensionType`. The equivalent example for the period
+data type from above would look like::
+
+ class PeriodType(pa.PyExtensionType):
+
+ def __init__(self, freq):
+ self._freq = freq
+ pa.PyExtensionType.__init__(self, pa.int64())
+
+ @property
+ def freq(self):
+ return self._freq
+
+ def __reduce__(self):
+ return PeriodType, (self.freq,)
+
+The storage type also does not need to be fixed; it can be parameterized as well.
+
+Custom extension array class
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By default, all arrays with an extension type are constructed or deserialized into
+a built-in :class:`ExtensionArray` object. Nevertheless, you may want to subclass
+:class:`ExtensionArray` in order to add some custom logic specific to the extension
+type. Arrow allows you to do so by adding a special method ``__arrow_ext_class__`` to the
+definition of the extension type.
+
+For instance, let us consider the example from the `Numpy Quickstart <https://docs.scipy.org/doc/numpy-1.13.0/user/quickstart.html>`_ of points in 3D space.
+We can store these as a fixed-size list, where we wish to be able to extract
+the data as a 2-D Numpy array ``(N, 3)`` without any copy::
+
+ class Point3DArray(pa.ExtensionArray):
+ def to_numpy_array(self):
+ return self.storage.flatten().to_numpy().reshape((-1, 3))
+
+
+ class Point3DType(pa.PyExtensionType):
+ def __init__(self):
+ pa.PyExtensionType.__init__(self, pa.list_(pa.float32(), 3))
+
+ def __reduce__(self):
+ return Point3DType, ()
+
+ def __arrow_ext_class__(self):
+ return Point3DArray
+
+Arrays built using this extension type now have the expected custom array class::
+
+ >>> storage = pa.array([[1, 2, 3], [4, 5, 6]], pa.list_(pa.float32(), 3))
+ >>> arr = pa.ExtensionArray.from_storage(Point3DType(), storage)
+ >>> arr
+ <__main__.Point3DArray object at 0x7f40dea80670>
+ [
+ [
+ 1,
+ 2,
+ 3
+ ],
+ [
+ 4,
+ 5,
+ 6
+ ]
+ ]
+
+The additional methods in the extension class are then available to the user::
+
+ >>> arr.to_numpy_array()
+ array([[1., 2., 3.],
+ [4., 5., 6.]], dtype=float32)
+
+
+This array can be sent over IPC, received in another Python process, and the custom
+extension array class will be preserved (as long as the definitions of the classes above
+are available).
+
+The same ``__arrow_ext_class__`` specialization can be used with custom types defined
+by subclassing :class:`ExtensionType`.
+
+
+Conversion to pandas
+~~~~~~~~~~~~~~~~~~~~
+
+The conversion to pandas (in :meth:`Table.to_pandas`) of columns with an
+extension type can be controlled when there is a corresponding
+`pandas extension array <https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extension-types>`__
+for your extension type.
+
+For this, the :meth:`ExtensionType.to_pandas_dtype` method needs to be
+implemented, and should return a ``pandas.api.extensions.ExtensionDtype``
+subclass instance.
+
+Using the pandas period type from above as example, this would look like::
+
+ class PeriodType(pa.ExtensionType):
+ ...
+
+ def to_pandas_dtype(self):
+ import pandas as pd
+ return pd.PeriodDtype(freq=self.freq)
+
+Secondly, the pandas ``ExtensionDtype`` in turn needs to implement the
+``__from_arrow__`` method: a method that, given a pyarrow Array
+or ChunkedArray of the extension type, can construct the corresponding
+pandas ``ExtensionArray``. This method should have the following signature::
+
+
+ class MyExtensionDtype(pd.api.extensions.ExtensionDtype):
+ ...
+
+ def __from_arrow__(self, array: pyarrow.Array/ChunkedArray) -> pandas.ExtensionArray:
+ ...
+
+This way, you can control the conversion of a pyarrow ``Array`` of your pyarrow
+extension type to a pandas ``ExtensionArray`` that can be stored in a DataFrame.
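+
+As a rough sketch (``MyExtensionArray`` and ``MyExtensionDtype`` are hypothetical,
+and a real implementation may be able to avoid the intermediate copy), such a
+method could convert the Arrow data chunk by chunk and hand the result to the
+pandas extension array class::
+
+    import numpy as np
+    import pandas as pd
+    import pyarrow as pa
+
+    class MyExtensionDtype(pd.api.extensions.ExtensionDtype):
+        ...
+
+        def __from_arrow__(self, array):
+            # "array" is a pyarrow.Array or pyarrow.ChunkedArray of the
+            # extension type; convert it chunk by chunk to a numpy array
+            # (an empty ChunkedArray is not handled in this sketch)
+            chunks = array.chunks if isinstance(array, pa.ChunkedArray) else [array]
+            values = np.concatenate(
+                [chunk.to_numpy(zero_copy_only=False) for chunk in chunks]
+            )
+            # hand the plain values to the (hypothetical) pandas extension
+            # array class for this dtype
+            return MyExtensionArray._from_sequence(values, dtype=self)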
diff --git a/src/arrow/docs/source/python/feather.rst b/src/arrow/docs/source/python/feather.rst
new file mode 100644
index 000000000..026ea987a
--- /dev/null
+++ b/src/arrow/docs/source/python/feather.rst
@@ -0,0 +1,109 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow
+
+.. _feather:
+
+Feather File Format
+===================
+
+Feather is a portable file format for storing Arrow tables or data frames (from
+languages like Python or R) that utilizes the :ref:`Arrow IPC format <ipc>`
+internally. Feather was created early in the Arrow project as a proof of
+concept for fast, language-agnostic data frame storage for Python (pandas) and
+R. There are two file format versions for Feather:
+
+* Version 2 (V2), the default version, which is exactly represented as the
+ Arrow IPC file format on disk. V2 files support storing all Arrow data types
+ as well as compression with LZ4 or ZSTD. V2 was first made available in
+ Apache Arrow 0.17.0.
+* Version 1 (V1), a legacy version available starting in 2016, replaced by
+ V2. V1 files are distinct from Arrow IPC files and lack many features, such
+ as the ability to store all Arrow data types. V1 files also lack compression
+ support. We intend to maintain read support for V1 for the foreseeable
+ future.
+
+The ``pyarrow.feather`` module contains the read and write functions for the
+format. :func:`~pyarrow.feather.write_feather` accepts either a
+:class:`~pyarrow.Table` or ``pandas.DataFrame`` object:
+
+.. code-block:: python
+
+ import pyarrow.feather as feather
+ feather.write_feather(df, '/path/to/file')
+
+:func:`~pyarrow.feather.read_feather` reads a Feather file as a
+``pandas.DataFrame``. :func:`~pyarrow.feather.read_table` reads a Feather file
+as a :class:`~pyarrow.Table`. Internally, :func:`~pyarrow.feather.read_feather`
+simply calls :func:`~pyarrow.feather.read_table` and the result is converted to
+pandas:
+
+.. code-block:: python
+
+ # Result is pandas.DataFrame
+ read_df = feather.read_feather('/path/to/file')
+
+ # Result is pyarrow.Table
+ read_arrow = feather.read_table('/path/to/file')
+
+These functions can read and write with file-paths or file-like objects. For
+example:
+
+.. code-block:: python
+
+ with open('/path/to/file', 'wb') as f:
+ feather.write_feather(df, f)
+
+ with open('/path/to/file', 'rb') as f:
+ read_df = feather.read_feather(f)
+
+A file input to ``read_feather`` must support seeking.
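+
+For instance, an in-memory ``io.BytesIO`` buffer is seekable and works (a small
+illustrative sketch):
+
+.. code-block:: python
+
+    import io
+
+    import pandas as pd
+    import pyarrow.feather as feather
+
+    df = pd.DataFrame({"a": [1, 2, 3]})
+
+    buf = io.BytesIO()
+    feather.write_feather(df, buf)
+    buf.seek(0)  # rewind so the reader can seek from the start
+    read_df = feather.read_feather(buf)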
+
+Using Compression
+-----------------
+
+As of Apache Arrow version 0.17.0, Feather V2 files (the default version)
+support two fast compression libraries, LZ4 (using the frame format) and
+ZSTD. LZ4 is used by default if it is available (which it should be if you
+obtained pyarrow through a normal package manager):
+
+.. code-block:: python
+
+ # Uses LZ4 by default
+ feather.write_feather(df, file_path)
+
+ # Use LZ4 explicitly
+ feather.write_feather(df, file_path, compression='lz4')
+
+ # Use ZSTD
+ feather.write_feather(df, file_path, compression='zstd')
+
+ # Do not compress
+ feather.write_feather(df, file_path, compression='uncompressed')
+
+Note that the default LZ4 compression generally yields much smaller files
+without sacrificing much read or write performance. In some instances,
+LZ4-compressed files may even be faster to read and write than uncompressed
+files due to reduced disk IO requirements.
+
+Writing Version 1 (V1) Files
+----------------------------
+
+For compatibility with libraries without support for Version 2 files, you can
+write the version 1 format by passing ``version=1`` to ``write_feather``. We
+intend to maintain read support for V1 for the foreseeable future.
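+
+For example:
+
+.. code-block:: python
+
+    # Write the legacy Feather V1 format (no compression support)
+    feather.write_feather(df, '/path/to/file', version=1)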
diff --git a/src/arrow/docs/source/python/filesystems.rst b/src/arrow/docs/source/python/filesystems.rst
new file mode 100644
index 000000000..1ddb4dfa2
--- /dev/null
+++ b/src/arrow/docs/source/python/filesystems.rst
@@ -0,0 +1,305 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _filesystem:
+
+.. currentmodule:: pyarrow.fs
+
+Filesystem Interface
+====================
+
+PyArrow comes with an abstract filesystem interface, as well as concrete
+implementations for various storage types.
+
+The filesystem interface provides input and output streams as well as
+directory operations. A simplified view of the underlying data
+storage is exposed. Data paths are represented as *abstract paths*, which
+are ``/``-separated, even on Windows, and shouldn't include special path
+components such as ``.`` and ``..``. Symbolic links, if supported by the
+underlying storage, are automatically dereferenced. Only basic
+:class:`metadata <FileInfo>` about file entries, such as the file size
+and modification time, is made available.
+
+The core interface is represented by the base class :class:`FileSystem`.
+
+PyArrow natively implements the following filesystem subclasses:
+
+* :ref:`filesystem-localfs` (:class:`LocalFileSystem`)
+* :ref:`filesystem-s3` (:class:`S3FileSystem`)
+* :ref:`filesystem-hdfs` (:class:`HadoopFileSystem`)
+
+It is also possible to use your own fsspec-compliant filesystem with PyArrow functionality, as described in the section :ref:`filesystem-fsspec`.
+
+
+.. _filesystem-usage:
+
+Usage
+-----
+
+Instantiating a filesystem
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A FileSystem object can be created with one of the constructors (check the
+respective constructor's documentation for the available options)::
+
+ >>> from pyarrow import fs
+ >>> local = fs.LocalFileSystem()
+
+or alternatively inferred from a URI::
+
+ >>> s3, path = fs.FileSystem.from_uri("s3://my-bucket")
+ >>> s3
+ <pyarrow._s3fs.S3FileSystem at 0x7f6760cbf4f0>
+ >>> path
+ 'my-bucket'
+
+
+Reading and writing files
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Several of the IO-related functions in PyArrow accept either a URI (and infer
+the filesystem) or an explicit ``filesystem`` argument to specify the filesystem
+to read or write from. For example, the :meth:`pyarrow.parquet.read_table`
+function can be used in the following ways::
+
+ import pyarrow.parquet as pq
+
+ # using a URI -> filesystem is inferred
+ pq.read_table("s3://my-bucket/data.parquet")
+ # using a path and filesystem
+ s3 = fs.S3FileSystem(..)
+ pq.read_table("my-bucket/data.parquet", filesystem=s3)
+
+The filesystem interface further allows you to open files for reading (input) or
+writing (output) directly, which can be combined with functions that work with
+file-like objects. For example::
+
+ import pyarrow as pa
+
+ local = fs.LocalFileSystem()
+
+ with local.open_output_stream("test.arrow") as file:
+ with pa.RecordBatchFileWriter(file, table.schema) as writer:
+ writer.write_table(table)
+
+
+Listing files
+~~~~~~~~~~~~~
+
+Inspecting the directories and files on a filesystem can be done with the
+:meth:`FileSystem.get_file_info` method. To list the contents of a directory,
+use the :class:`FileSelector` object to specify the selection::
+
+ >>> local.get_file_info(fs.FileSelector("dataset/", recursive=True))
+ [<FileInfo for 'dataset/part=B': type=FileType.Directory>,
+ <FileInfo for 'dataset/part=B/data0.parquet': type=FileType.File, size=1564>,
+ <FileInfo for 'dataset/part=A': type=FileType.Directory>,
+ <FileInfo for 'dataset/part=A/data0.parquet': type=FileType.File, size=1564>]
+
+This returns a list of :class:`FileInfo` objects, containing information about
+the type (file or directory), the size, the date last modified, etc.
+
+You can also get this information for a single explicit path (or list of
+paths)::
+
+ >>> local.get_file_info('test.arrow')
+ <FileInfo for 'test.arrow': type=FileType.File, size=3250>
+
+ >>> local.get_file_info('non_existent')
+ <FileInfo for 'non_existent': type=FileType.NotFound>
+
+
+.. _filesystem-localfs:
+
+Local FS
+--------
+
+The :class:`LocalFileSystem` allows you to access files on the local machine.
+
+An example of how to write data to disk and read it back::
+
+ >>> from pyarrow import fs
+ >>> local = fs.LocalFileSystem()
+ >>> with local.open_output_stream('/tmp/pyarrowtest.dat') as stream:
+ stream.write(b'data')
+ 4
+ >>> with local.open_input_stream('/tmp/pyarrowtest.dat') as stream:
+ print(stream.readall())
+ b'data'
+
+
+.. _filesystem-s3:
+
+S3
+--
+
+PyArrow natively implements an S3 filesystem for S3-compatible storage.
+
+The :class:`S3FileSystem` constructor has several options to configure the S3
+connection (e.g. credentials, the region, an endpoint override, etc.). In
+addition, the constructor will also inspect configured S3 credentials as
+supported by AWS (for example the ``AWS_ACCESS_KEY_ID`` and
+``AWS_SECRET_ACCESS_KEY`` environment variables).
+
+An example of how to read contents from an S3 bucket::
+
+ >>> from pyarrow import fs
+ >>> s3 = fs.S3FileSystem(region='eu-west-3')
+
+ # List all contents in a bucket, recursively
+ >>> s3.get_file_info(fs.FileSelector('my-test-bucket', recursive=True))
+ [<FileInfo for 'my-test-bucket/File1': type=FileType.File, size=10>,
+ <FileInfo for 'my-test-bucket/File5': type=FileType.File, size=10>,
+ <FileInfo for 'my-test-bucket/Dir1': type=FileType.Directory>,
+ <FileInfo for 'my-test-bucket/Dir2': type=FileType.Directory>,
+ <FileInfo for 'my-test-bucket/EmptyDir': type=FileType.Directory>,
+ <FileInfo for 'my-test-bucket/Dir1/File2': type=FileType.File, size=11>,
+ <FileInfo for 'my-test-bucket/Dir1/Subdir': type=FileType.Directory>,
+ <FileInfo for 'my-test-bucket/Dir2/Subdir': type=FileType.Directory>,
+ <FileInfo for 'my-test-bucket/Dir2/Subdir/File3': type=FileType.File, size=10>]
+
+ # Open a file for reading and download its contents
+ >>> f = s3.open_input_stream('my-test-bucket/Dir1/File2')
+ >>> f.readall()
+ b'some data'
+
+.. seealso::
+
+ See the `AWS docs <https://docs.aws.amazon.com/sdk-for-cpp/v1/developer-guide/credentials.html>`__
+ for the different ways to configure the AWS credentials.
+
+
+.. _filesystem-hdfs:
+
+Hadoop Distributed File System (HDFS)
+-------------------------------------
+
+PyArrow comes with bindings to the Hadoop File System (based on C++ bindings
+using ``libhdfs``, a JNI-based interface to the Java Hadoop client). You connect
+using the :class:`HadoopFileSystem` constructor:
+
+.. code-block:: python
+
+ from pyarrow import fs
+ hdfs = fs.HadoopFileSystem(host, port, user=user, kerb_ticket=ticket_cache_path)
+
+The ``libhdfs`` library is loaded **at runtime** (rather than at link / library
+load time, since the library may not be in your ``LD_LIBRARY_PATH``), and relies on
+some environment variables.
+
+* ``HADOOP_HOME``: the root of your installed Hadoop distribution. Often has
+  ``lib/native/libhdfs.so``.
+
+* ``JAVA_HOME``: the location of your Java SDK installation.
+
+* ``ARROW_LIBHDFS_DIR`` (optional): explicit location of ``libhdfs.so`` if it is
+ installed somewhere other than ``$HADOOP_HOME/lib/native``.
+
+* ``CLASSPATH``: must contain the Hadoop jars. You can set these using:
+
+ .. code-block:: shell
+
+ export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob`
+ # or on Windows
+ %HADOOP_HOME%/bin/hadoop classpath --glob > %CLASSPATH%
+
+ In contrast to the legacy HDFS filesystem with ``pa.hdfs.connect``, setting
+ ``CLASSPATH`` is not optional (pyarrow will not attempt to infer it).
+
+.. _filesystem-fsspec:
+
+Using fsspec-compatible filesystems with Arrow
+----------------------------------------------
+
+The filesystems mentioned above are natively supported by Arrow C++ / PyArrow.
+The Python ecosystem, however, also has several filesystem packages. Those
+packages following the `fsspec`_ interface can be used in PyArrow as well.
+
+Functions accepting a filesystem object will also accept an fsspec subclass.
+For example::
+
+ # creating an fsspec-based filesystem object for Google Cloud Storage
+ import gcsfs
+ fs = gcsfs.GCSFileSystem(project='my-google-project')
+
+ # using this to read a partitioned dataset
+ import pyarrow.dataset as ds
+ ds.dataset("data/", filesystem=fs)
+
+Similarly for Azure Blob Storage::
+
+ import adlfs
+ # ... load your credentials and configure the filesystem
+ fs = adlfs.AzureBlobFileSystem(account_name=account_name, account_key=account_key)
+
+ import pyarrow.dataset as ds
+ ds.dataset("mycontainer/data/", filesystem=fs)
+
+Under the hood, the fsspec filesystem object is wrapped into a python-based
+PyArrow filesystem (:class:`PyFileSystem`) using :class:`FSSpecHandler`.
+You can also manually do this to get an object with the PyArrow FileSystem
+interface::
+
+ from pyarrow.fs import PyFileSystem, FSSpecHandler
+ pa_fs = PyFileSystem(FSSpecHandler(fs))
+
+Then all the functionalities of :class:`FileSystem` are accessible::
+
+ # write data
+ with pa_fs.open_output_stream('mycontainer/pyarrowtest.dat') as stream:
+ stream.write(b'data')
+
+ # read data
+ with pa_fs.open_input_stream('mycontainer/pyarrowtest.dat') as stream:
+ print(stream.readall())
+ #b'data'
+
+ # read a partitioned dataset
+ ds.dataset("data/", filesystem=pa_fs)
+
+
+Using Arrow filesystems with fsspec
+-----------------------------------
+
+The Arrow FileSystem interface has a limited, developer-oriented API surface.
+This is sufficient for basic interactions and for using this with
+Arrow's IO functionality. On the other hand, the `fsspec`_ interface provides
+a very large API with many helper methods. If you want to use those, or if you
+need to interact with a package that expects fsspec-compatible filesystem
+objects, you can wrap an Arrow FileSystem object with fsspec.
+
+Starting with ``fsspec`` version 2021.09, the ``ArrowFSWrapper`` can be used
+for this::
+
+ >>> from pyarrow import fs
+ >>> local = fs.LocalFileSystem()
+ >>> from fsspec.implementations.arrow import ArrowFSWrapper
+ >>> local_fsspec = ArrowFSWrapper(local)
+
+The resulting object now has an fsspec-compatible interface, while being backed
+by the Arrow FileSystem under the hood.
+Example usage to create a directory and file, and list the content::
+
+ >>> local_fsspec.mkdir("./test")
+ >>> local_fsspec.touch("./test/file.txt")
+ >>> local_fsspec.ls("./test/")
+ ['./test/file.txt']
+
+For more information, see the `fsspec`_ documentation.
+
+
+.. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/
diff --git a/src/arrow/docs/source/python/filesystems_deprecated.rst b/src/arrow/docs/source/python/filesystems_deprecated.rst
new file mode 100644
index 000000000..04887e977
--- /dev/null
+++ b/src/arrow/docs/source/python/filesystems_deprecated.rst
@@ -0,0 +1,95 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Filesystem Interface (legacy)
+=============================
+
+.. warning::
+ This section documents the deprecated filesystem layer. You should
+ use the :ref:`new filesystem layer <filesystem>` instead.
+
+.. _hdfs:
+
+Hadoop File System (HDFS)
+-------------------------
+
+PyArrow comes with bindings to a C++-based interface to the Hadoop File
+System. You connect like so:
+
+.. code-block:: python
+
+ import pyarrow as pa
+ fs = pa.hdfs.connect(host, port, user=user, kerb_ticket=ticket_cache_path)
+ with fs.open(path, 'rb') as f:
+ # Do something with f
+
+By default, ``pyarrow.hdfs.HadoopFileSystem`` uses libhdfs, a JNI-based
+interface to the Java Hadoop client. This library is loaded **at runtime**
+(rather than at link / library load time, since the library may not be in your
+``LD_LIBRARY_PATH``), and relies on some environment variables.
+
+* ``HADOOP_HOME``: the root of your installed Hadoop distribution. Often has
+  ``lib/native/libhdfs.so``.
+
+* ``JAVA_HOME``: the location of your Java SDK installation.
+
+* ``ARROW_LIBHDFS_DIR`` (optional): explicit location of ``libhdfs.so`` if it is
+ installed somewhere other than ``$HADOOP_HOME/lib/native``.
+
+* ``CLASSPATH``: must contain the Hadoop jars. You can set these using:
+
+.. code-block:: shell
+
+ export CLASSPATH=`$HADOOP_HOME/bin/hdfs classpath --glob`
+
+If ``CLASSPATH`` is not set, then it will be set automatically if the
+``hadoop`` executable is in your system path, or if ``HADOOP_HOME`` is set.
+
+You can also use libhdfs3, a third-party C++ library for HDFS from Pivotal Labs:
+
+.. code-block:: python
+
+ fs = pa.hdfs.connect(host, port, user=user, kerb_ticket=ticket_cache_path,
+ driver='libhdfs3')
+
+HDFS API
+~~~~~~~~
+
+.. currentmodule:: pyarrow
+
+.. autosummary::
+ :toctree: generated/
+
+ hdfs.connect
+ HadoopFileSystem.cat
+ HadoopFileSystem.chmod
+ HadoopFileSystem.chown
+ HadoopFileSystem.delete
+ HadoopFileSystem.df
+ HadoopFileSystem.disk_usage
+ HadoopFileSystem.download
+ HadoopFileSystem.exists
+ HadoopFileSystem.get_capacity
+ HadoopFileSystem.get_space_used
+ HadoopFileSystem.info
+ HadoopFileSystem.ls
+ HadoopFileSystem.mkdir
+ HadoopFileSystem.open
+ HadoopFileSystem.rename
+ HadoopFileSystem.rm
+ HadoopFileSystem.upload
+ HdfsFile
diff --git a/src/arrow/docs/source/python/getstarted.rst b/src/arrow/docs/source/python/getstarted.rst
new file mode 100644
index 000000000..36e4707ad
--- /dev/null
+++ b/src/arrow/docs/source/python/getstarted.rst
@@ -0,0 +1,145 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _getstarted:
+
+Getting Started
+===============
+
+Arrow manages data in arrays (:class:`pyarrow.Array`), which can be
+grouped in tables (:class:`pyarrow.Table`) to represent the columns of
+tabular data.
+
+Arrow also provides support for various formats to get that tabular
+data in and out of disk and across networks. The most commonly used formats are
+Parquet (:ref:`parquet`) and the IPC format (:ref:`ipc`).
+
+Creating Arrays and Tables
+--------------------------
+
+Arrays in Arrow are collections of data of uniform type. This allows
+Arrow to use the best-performing implementation to store the data and
+perform computations on it. So each array is meant to have both data and
+a type
+
+.. ipython:: python
+
+ import pyarrow as pa
+
+ days = pa.array([1, 12, 17, 23, 28], type=pa.int8())
+
+Multiple arrays can be combined into tables to form the columns of
+tabular data, with each array attached to a column name
+
+.. ipython:: python
+
+ months = pa.array([1, 3, 5, 7, 1], type=pa.int8())
+ years = pa.array([1990, 2000, 1995, 2000, 1995], type=pa.int16())
+
+ birthdays_table = pa.table([days, months, years],
+ names=["days", "months", "years"])
+
+ birthdays_table
+
+See :ref:`data` for more details.
+
+Saving and Loading Tables
+-------------------------
+
+Once you have tabular data, Arrow provides out-of-the-box
+features to save and restore that data in common formats
+like Parquet:
+
+.. ipython:: python
+
+ import pyarrow.parquet as pq
+
+ pq.write_table(birthdays_table, 'birthdays.parquet')
+
+Once you have your data on disk, loading it back is a single function call,
+and Arrow is heavily optimized for memory and speed so loading
+data will be as quick as possible
+
+.. ipython:: python
+
+ reloaded_birthdays = pq.read_table('birthdays.parquet')
+
+ reloaded_birthdays
+
+Saving and loading data back in Arrow is usually done through the
+:ref:`Parquet <parquet>`, :ref:`IPC <ipc>` (:ref:`feather`),
+:ref:`CSV <csv>` or :ref:`Line-Delimited JSON <json>` formats.
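+
+For example, the same table could also be saved to and loaded back from CSV
+(shown here as an illustrative, non-executed snippet):
+
+.. code-block:: python
+
+    from pyarrow import csv
+
+    csv.write_csv(birthdays_table, 'birthdays.csv')
+    reloaded_csv = csv.read_csv('birthdays.csv')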
+
+Performing Computations
+-----------------------
+
+Arrow ships with a number of compute functions that can be applied
+to its arrays and tables, making it possible to apply
+transformations to the data
+
+.. ipython:: python
+
+ import pyarrow.compute as pc
+
+ pc.value_counts(birthdays_table["years"])
+
+See :ref:`compute` for a list of available compute functions and
+how to use them.
+
+Working with large data
+-----------------------
+
+Arrow also provides the :mod:`pyarrow.dataset` API to work with
+large data, which handles partitioning your data into
+smaller chunks for you
+
+.. ipython:: python
+
+ import pyarrow.dataset as ds
+
+ ds.write_dataset(birthdays_table, "savedir", format="parquet",
+ partitioning=ds.partitioning(
+ pa.schema([birthdays_table.schema.field("years")])
+ ))
+
+Loading back the partitioned dataset will detect the chunks
+
+.. ipython:: python
+
+ birthdays_dataset = ds.dataset("savedir", format="parquet", partitioning=["years"])
+
+ birthdays_dataset.files
+
+and will lazily load chunks of data only when iterating over them
+
+.. ipython:: python
+
+ import datetime
+
+ current_year = datetime.datetime.utcnow().year
+ for table_chunk in birthdays_dataset.to_batches():
+ print("AGES", pc.subtract(current_year, table_chunk["years"]))
+
+For further details on how to work with big datasets, how to filter them,
+how to project them, etc., refer to the :ref:`dataset` documentation.
+
+Continuing from here
+----------------------
+
+For digging further into Arrow, you might want to read the
+:doc:`PyArrow Documentation <./index>` itself or the
+`Arrow Python Cookbook <https://arrow.apache.org/cookbook/py/>`_
diff --git a/src/arrow/docs/source/python/getting_involved.rst b/src/arrow/docs/source/python/getting_involved.rst
new file mode 100644
index 000000000..7159bdfb0
--- /dev/null
+++ b/src/arrow/docs/source/python/getting_involved.rst
@@ -0,0 +1,35 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Getting Involved
+================
+
+Right now the primary audience for Apache Arrow is developers of data
+systems; most people will use Apache Arrow indirectly through systems that use
+it for internal data handling and interoperating with other Arrow-enabled
+systems.
+
+Even if you do not plan to contribute to Apache Arrow itself or Arrow
+integrations in other projects, we'd be happy to have you involved:
+
+ * Join the mailing list: send an email to
+ `dev-subscribe@arrow.apache.org <mailto:dev-subscribe@arrow.apache.org>`_.
+ Share your ideas and use cases for the project or read through the
+ `Archive <http://mail-archives.apache.org/mod_mbox/arrow-dev/>`_.
+ * Follow our activity on `JIRA <https://issues.apache.org/jira/browse/ARROW>`_
+ * Learn the `Format / Specification
+ <https://github.com/apache/arrow/tree/master/format>`_
diff --git a/src/arrow/docs/source/python/index.rst b/src/arrow/docs/source/python/index.rst
new file mode 100644
index 000000000..0ffa40545
--- /dev/null
+++ b/src/arrow/docs/source/python/index.rst
@@ -0,0 +1,62 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+PyArrow - Apache Arrow Python bindings
+======================================
+
+This is the documentation of the Python API of Apache Arrow.
+
+Apache Arrow is a development platform for in-memory analytics.
+It contains a set of technologies that enable big data systems to store, process and move data fast.
+
+See the :doc:`parent documentation <../index>` for additional details on
+the Arrow Project itself, on the Arrow format and the other language bindings.
+
+The Arrow Python bindings (also named "PyArrow") have first-class integration
+with NumPy, pandas, and built-in Python objects. They are based on the C++
+implementation of Arrow.
+
+Here we will detail the usage of the Python API for Arrow and the leaf
+libraries that add additional functionality such as reading Apache Parquet
+files into Arrow structures.
+
+.. toctree::
+ :maxdepth: 2
+
+ install
+ getstarted
+ data
+ compute
+ memory
+ ipc
+ filesystems
+ filesystems_deprecated
+ plasma
+ numpy
+ pandas
+ timestamps
+ csv
+ feather
+ json
+ parquet
+ dataset
+ cuda
+ extending_types
+ extending
+ api
+ getting_involved
+ benchmarks
diff --git a/src/arrow/docs/source/python/install.rst b/src/arrow/docs/source/python/install.rst
new file mode 100644
index 000000000..3c23d8a0f
--- /dev/null
+++ b/src/arrow/docs/source/python/install.rst
@@ -0,0 +1,90 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Installing PyArrow
+==================
+
+System Compatibility
+--------------------
+
+PyArrow is regularly built and tested on Windows, macOS and various Linux
+distributions (including Ubuntu 16.04, Ubuntu 18.04). We strongly recommend
+using a 64-bit system.
+
+Python Compatibility
+--------------------
+
+PyArrow is currently compatible with Python 3.6, 3.7, 3.8, and 3.9.
+
+Using Conda
+-----------
+
+Install the latest version of PyArrow from
+`conda-forge <https://conda-forge.org/>`_ using `Conda <https://conda.io>`_:
+
+.. code-block:: bash
+
+ conda install -c conda-forge pyarrow
+
+Using Pip
+---------
+
+Install the latest version from `PyPI <https://pypi.org/>`_ (Windows, Linux,
+and macOS):
+
+.. code-block:: bash
+
+ pip install pyarrow
+
+If you encounter any import issues with the pip wheels on Windows, you may
+need to install the `Visual C++ Redistributable for Visual Studio 2015
+<https://www.microsoft.com/en-us/download/details.aspx?id=48145>`_.
+
+.. warning::
+ On Linux, you will need pip >= 19.0 to detect the prebuilt binary packages.
+
+Installing from source
+----------------------
+
+See :ref:`python-development`.
+
+Installing Nightly Packages
+---------------------------
+
+.. warning::
+ These packages are not official releases. Use them at your own risk.
+
+PyArrow has nightly wheels and conda packages for testing purposes.
+
+These may be suitable for downstream libraries in their continuous integration
+setup to maintain compatibility with the upcoming PyArrow features,
+deprecations and/or feature removals.
+
+Install the development version of PyArrow from `arrow-nightlies
+<https://anaconda.org/arrow-nightlies/pyarrow>`_ conda channel:
+
+.. code-block:: bash
+
+ conda install -c arrow-nightlies pyarrow
+
+Install the development version from an `alternative PyPI
+<https://gemfury.com/arrow-nightlies>`_ index:
+
+.. code-block:: bash
+
+ pip install --extra-index-url https://pypi.fury.io/arrow-nightlies/ \
+ --prefer-binary --pre pyarrow
diff --git a/src/arrow/docs/source/python/ipc.rst b/src/arrow/docs/source/python/ipc.rst
new file mode 100644
index 000000000..0ba557b64
--- /dev/null
+++ b/src/arrow/docs/source/python/ipc.rst
@@ -0,0 +1,385 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow
+
+.. _ipc:
+
+Streaming, Serialization, and IPC
+=================================
+
+Writing and Reading Streams
+---------------------------
+
+Arrow defines two types of binary formats for serializing record batches:
+
+* **Streaming format**: for sending an arbitrary length sequence of record
+ batches. The format must be processed from start to end, and does not support
+ random access
+
+* **File or Random Access format**: for serializing a fixed number of record
+ batches. Supports random access, and thus is very useful when used with
+ memory maps
+
+To follow this section, make sure to first read the section on :ref:`Memory and
+IO <io>`.
+
+Using streams
+~~~~~~~~~~~~~
+
+First, let's create a small record batch:
+
+.. ipython:: python
+
+ import pyarrow as pa
+
+ data = [
+ pa.array([1, 2, 3, 4]),
+ pa.array(['foo', 'bar', 'baz', None]),
+ pa.array([True, None, False, True])
+ ]
+
+ batch = pa.record_batch(data, names=['f0', 'f1', 'f2'])
+ batch.num_rows
+ batch.num_columns
+
+Now, we can begin writing a stream containing some number of these batches. For
+this we use :class:`~pyarrow.RecordBatchStreamWriter`, which can write to a
+writeable ``NativeFile`` object or a writeable Python object. For convenience,
+this one can be created with :func:`~pyarrow.ipc.new_stream`:
+
+.. ipython:: python
+
+ sink = pa.BufferOutputStream()
+
+ with pa.ipc.new_stream(sink, batch.schema) as writer:
+ for i in range(5):
+ writer.write_batch(batch)
+
+Here we used an in-memory Arrow buffer stream (``sink``),
+but this could have been a socket or some other IO sink.
+
+When creating the ``StreamWriter``, we pass the schema, since the schema
+(column names and types) must be the same for all of the batches sent in this
+particular stream. Now we can do:
+
+.. ipython:: python
+
+ buf = sink.getvalue()
+ buf.size
+
+Now ``buf`` contains the complete stream as an in-memory byte buffer. We can
+read such a stream with :class:`~pyarrow.RecordBatchStreamReader` or the
+convenience function ``pyarrow.ipc.open_stream``:
+
+.. ipython:: python
+
+ with pa.ipc.open_stream(buf) as reader:
+ schema = reader.schema
+ batches = [b for b in reader]
+
+ schema
+ len(batches)
+
+We can check the returned batches are the same as the original input:
+
+.. ipython:: python
+
+ batches[0].equals(batch)
+
+An important point is that if the input source supports zero-copy reads
+(e.g. like a memory map, or ``pyarrow.BufferReader``), then the returned
+batches are also zero-copy and do not allocate any new memory on read.
+
+Writing and Reading Random Access Files
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The :class:`~pyarrow.RecordBatchFileWriter` has the same API as
+:class:`~pyarrow.RecordBatchStreamWriter`. You can create one with
+:func:`~pyarrow.ipc.new_file`:
+
+.. ipython:: python
+
+ sink = pa.BufferOutputStream()
+
+ with pa.ipc.new_file(sink, batch.schema) as writer:
+ for i in range(10):
+ writer.write_batch(batch)
+
+ buf = sink.getvalue()
+ buf.size
+
+The difference between :class:`~pyarrow.RecordBatchFileReader` and
+:class:`~pyarrow.RecordBatchStreamReader` is that the input source must have a
+``seek`` method for random access. The stream reader only requires read
+operations. We can also use the :func:`~pyarrow.ipc.open_file` method to open a file:
+
+.. ipython:: python
+
+ with pa.ipc.open_file(buf) as reader:
+ num_record_batches = reader.num_record_batches
+ b = reader.get_batch(3)
+
+Because we have access to the entire payload, we know the number of record
+batches in the file, and can read any at random.
+
+.. ipython:: python
+
+ num_record_batches
+ b.equals(batch)
+
+Reading from Stream and File Format for pandas
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The stream and file reader classes have a special ``read_pandas`` method to
+simplify reading multiple record batches and converting them to a single
+DataFrame output:
+
+.. ipython:: python
+
+ with pa.ipc.open_file(buf) as reader:
+ df = reader.read_pandas()
+
+ df[:5]
+
+Efficiently Writing and Reading Arrow Data
+------------------------------------------
+
+Being optimized for zero copy and memory-mapped data, Arrow allows you to easily
+read and write arrays while consuming the minimum amount of resident memory.
+
+When writing and reading raw Arrow data, we can use the Arrow File Format
+or the Arrow Streaming Format.
+
+To dump an array to a file, you can use :meth:`~pyarrow.ipc.new_file`,
+which will provide a new :class:`~pyarrow.ipc.RecordBatchFileWriter` instance
+that can be used to write batches of data to that file.
+
+For example, to write an array of 10M integers, we could write it in 1000 chunks
+of 10000 entries each:
+
+.. ipython:: python
+
+ BATCH_SIZE = 10000
+ NUM_BATCHES = 1000
+
+ schema = pa.schema([pa.field('nums', pa.int32())])
+
+ with pa.OSFile('bigfile.arrow', 'wb') as sink:
+ with pa.ipc.new_file(sink, schema) as writer:
+ for row in range(NUM_BATCHES):
+ batch = pa.record_batch([pa.array(range(BATCH_SIZE), type=pa.int32())], schema)
+ writer.write(batch)
+
+Record batches support multiple columns, so in practice we always write the
+equivalent of a :class:`~pyarrow.Table`.
+
+Writing in batches is effective because, in theory, we only need to keep in memory
+the current batch we are writing. But when reading back, we can be even more effective
+by directly mapping the data from disk and avoiding any new memory allocation on read.
+
+Under normal conditions, reading back our file will consume a few hundred megabytes
+of memory:
+
+.. ipython:: python
+
+ with pa.OSFile('bigfile.arrow', 'rb') as source:
+ loaded_array = pa.ipc.open_file(source).read_all()
+
+ print("LEN:", len(loaded_array))
+ print("RSS: {}MB".format(pa.total_allocated_bytes() >> 20))
+
+To read big data from disk more efficiently, we can memory-map the file, so that
+Arrow can directly reference the data mapped from disk and avoid having to
+allocate its own memory.
+In that case the operating system will be able to page in the mapped memory
+lazily and page it out without any write-back cost when under pressure,
+making it easier to read arrays bigger than the total available memory.
+
+.. ipython:: python
+
+ with pa.memory_map('bigfile.arrow', 'rb') as source:
+ loaded_array = pa.ipc.open_file(source).read_all()
+ print("LEN:", len(loaded_array))
+ print("RSS: {}MB".format(pa.total_allocated_bytes() >> 20))
+
+.. note::
+
+ Other high level APIs like :meth:`~pyarrow.parquet.read_table` also provide a
+ ``memory_map`` option. But in those cases, the memory mapping can't help with
+ reducing resident memory consumption. See :ref:`parquet_mmap` for details.
+
+Arbitrary Object Serialization
+------------------------------
+
+.. warning::
+
+ The custom serialization functionality is deprecated in pyarrow 2.0, and
+ will be removed in a future version.
+
+ While the serialization functions in this section utilize the Arrow stream
+ protocol internally, they do not produce data that is compatible with the
+ above ``ipc.open_file`` and ``ipc.open_stream`` functions.
+
+ For arbitrary objects, you can use the standard library ``pickle``
+ functionality instead. For pyarrow objects, you can use the IPC
+ serialization format through the ``pyarrow.ipc`` module, as explained
+ above.
+
+ PyArrow serialization was originally meant to provide a higher-performance
+ alternative to ``pickle`` thanks to zero-copy semantics. However,
+ ``pickle`` protocol 5 gained support for zero-copy using out-of-band
+ buffers, and can be used instead for similar benefits.
+
+In ``pyarrow`` we are able to serialize and deserialize many kinds of Python
+objects. As an example, consider a dictionary containing NumPy arrays:
+
+.. ipython:: python
+
+ import numpy as np
+
+ data = {
+ i: np.random.randn(500, 500)
+ for i in range(100)
+ }
+
+We use the ``pyarrow.serialize`` function to convert this data to a byte
+buffer:
+
+.. ipython:: python
+ :okwarning:
+
+ buf = pa.serialize(data).to_buffer()
+ type(buf)
+ buf.size
+
+``pyarrow.serialize`` creates an intermediate object which can be converted to
+a buffer (the ``to_buffer`` method) or written directly to an output stream.
+
+``pyarrow.deserialize`` converts a buffer-like object back to the original
+Python object:
+
+.. ipython:: python
+ :okwarning:
+
+ restored_data = pa.deserialize(buf)
+ restored_data[0]
+
+
+Serializing Custom Data Types
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If an unrecognized data type is encountered when serializing an object,
+``pyarrow`` will fall back on using ``pickle`` for converting that type to a
+byte string. There may be a more efficient way, though.
+
+Consider a class with two members, one of which is a NumPy array:
+
+.. code-block:: python
+
+ class MyData:
+ def __init__(self, name, data):
+ self.name = name
+ self.data = data
+
+We write functions to convert this to and from a dictionary with simpler types:
+
+.. code-block:: python
+
+ def _serialize_MyData(val):
+ return {'name': val.name, 'data': val.data}
+
+    def _deserialize_MyData(data):
+        return MyData(data['name'], data['data'])
+
+Then, we must register these functions in a ``SerializationContext`` so that
+``MyData`` can be recognized:
+
+.. code-block:: python
+
+ context = pa.SerializationContext()
+ context.register_type(MyData, 'MyData',
+ custom_serializer=_serialize_MyData,
+ custom_deserializer=_deserialize_MyData)
+
+Lastly, we use this context as an additional argument to ``pyarrow.serialize``:
+
+.. code-block:: python
+
+ buf = pa.serialize(val, context=context).to_buffer()
+ restored_val = pa.deserialize(buf, context=context)
+
+The ``SerializationContext`` also has convenience methods ``serialize`` and
+``deserialize``, so these are equivalent statements:
+
+.. code-block:: python
+
+ buf = context.serialize(val).to_buffer()
+ restored_val = context.deserialize(buf)
+
+Component-based Serialization
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+For serializing Python objects containing some number of NumPy arrays, Arrow
+buffers, or other data types, it may be desirable to transport their serialized
+representation without having to produce an intermediate copy using the
+``to_buffer`` method. To motivate this, suppose we have a list of NumPy arrays:
+
+.. ipython:: python
+
+ import numpy as np
+ data = [np.random.randn(10, 10) for i in range(5)]
+
+The call ``pa.serialize(data)`` does not copy the memory inside each of these
+NumPy arrays. This serialized representation can then be decomposed into a
+dictionary containing a sequence of ``pyarrow.Buffer`` objects containing
+metadata for each array and references to the memory inside the arrays. To do
+this, use the ``to_components`` method:
+
+.. ipython:: python
+ :okwarning:
+
+ serialized = pa.serialize(data)
+ components = serialized.to_components()
+
+The particular details of the output of ``to_components`` are not too
+important. The objects in the ``'data'`` field are ``pyarrow.Buffer`` objects,
+which are zero-copy convertible to Python ``memoryview`` objects:
+
+.. ipython:: python
+
+ memoryview(components['data'][0])
+
+A memoryview can be converted back to an Arrow ``Buffer`` with
+``pyarrow.py_buffer``:
+
+.. ipython:: python
+
+ mv = memoryview(components['data'][0])
+ buf = pa.py_buffer(mv)
+
+An object can be reconstructed from its component-based representation using
+``deserialize_components``:
+
+.. ipython:: python
+ :okwarning:
+
+ restored_data = pa.deserialize_components(components)
+ restored_data[0]
+
+``deserialize_components`` is also available as a method on
+``SerializationContext`` objects.
diff --git a/src/arrow/docs/source/python/json.rst b/src/arrow/docs/source/python/json.rst
new file mode 100644
index 000000000..99ecbc19a
--- /dev/null
+++ b/src/arrow/docs/source/python/json.rst
@@ -0,0 +1,117 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow.json
+.. _json:
+
+Reading JSON files
+==================
+
+Arrow supports reading columnar data from line-delimited JSON files.
+In this context, a JSON file consists of multiple JSON objects, one per line,
+representing individual data rows. For example, this file represents
+two rows of data with four columns "a", "b", "c", "d":
+
+.. code-block:: json
+
+ {"a": 1, "b": 2.0, "c": "foo", "d": false}
+ {"a": 4, "b": -5.5, "c": null, "d": true}
+
+The features currently offered are the following:
+
+* multi-threaded or single-threaded reading
+* automatic decompression of input files (based on the filename extension,
+ such as ``my_data.json.gz``)
+* sophisticated type inference (see below)
+
+.. note::
+ Currently only the line-delimited JSON format is supported.
+
+
+Usage
+-----
+
+JSON reading functionality is available through the :mod:`pyarrow.json` module.
+In many cases, you will simply call the :func:`read_json` function
+with the file path you want to read from::
+
+ >>> from pyarrow import json
+ >>> fn = 'my_data.json'
+ >>> table = json.read_json(fn)
+ >>> table
+ pyarrow.Table
+ a: int64
+ b: double
+ c: string
+ d: bool
+ >>> table.to_pandas()
+ a b c d
+ 0 1 2.0 foo False
+ 1 4 -5.5 None True
+
+
+Automatic Type Inference
+------------------------
+
+Arrow :ref:`data types <data.types>` are inferred from the JSON types and
+values of each column:
+
+* JSON null values convert to the ``null`` type, but can fall back to any
+ other type.
+* JSON booleans convert to ``bool_``.
+* JSON numbers convert to ``int64``, falling back to ``float64`` if a
+ non-integer is encountered.
+* JSON strings of the kind "YYYY-MM-DD" and "YYYY-MM-DD hh:mm:ss" convert
+ to ``timestamp[s]``, falling back to ``utf8`` if a conversion error occurs.
+* JSON arrays convert to a ``list`` type, and inference proceeds recursively
+ on the JSON arrays' values.
+* Nested JSON objects convert to a ``struct`` type, and inference proceeds
+ recursively on the JSON objects' values.
+
+Thus, reading this JSON file:
+
+.. code-block:: json
+
+ {"a": [1, 2], "b": {"c": true, "d": "1991-02-03"}}
+ {"a": [3, 4, 5], "b": {"c": false, "d": "2019-04-01"}}
+
+returns the following data::
+
+ >>> table = json.read_json("my_data.json")
+ >>> table
+ pyarrow.Table
+ a: list<item: int64>
+ child 0, item: int64
+ b: struct<c: bool, d: timestamp[s]>
+ child 0, c: bool
+ child 1, d: timestamp[s]
+ >>> table.to_pandas()
+ a b
+ 0 [1, 2] {'c': True, 'd': 1991-02-03 00:00:00}
+ 1 [3, 4, 5] {'c': False, 'd': 2019-04-01 00:00:00}
+
+
+Customized parsing
+------------------
+
+To alter the default parsing settings in case of reading JSON files with an
+unusual structure, you should create a :class:`ParseOptions` instance
+and pass it to :func:`read_json`. For example, you can pass an explicit
+:ref:`schema <data.schema>` in order to bypass automatic type inference.
+
+Similarly, you can choose performance settings by passing a
+:class:`ReadOptions` instance to :func:`read_json`.
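+
+As a sketch of how these options fit together (the file name and schema
+below are illustrative and match the earlier example):
+
+.. code-block:: python
+
+    import pyarrow as pa
+    from pyarrow import json
+
+    # Bypass type inference with an explicit schema and tune the block size
+    parse_options = json.ParseOptions(
+        explicit_schema=pa.schema([('a', pa.int64()),
+                                   ('b', pa.float64()),
+                                   ('c', pa.string()),
+                                   ('d', pa.bool_())]))
+    read_options = json.ReadOptions(use_threads=True, block_size=1 << 20)
+
+    table = json.read_json('my_data.json',
+                           read_options=read_options,
+                           parse_options=parse_options)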
diff --git a/src/arrow/docs/source/python/memory.rst b/src/arrow/docs/source/python/memory.rst
new file mode 100644
index 000000000..4febc668c
--- /dev/null
+++ b/src/arrow/docs/source/python/memory.rst
@@ -0,0 +1,298 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow
+.. highlight:: python
+
+.. _io:
+
+========================
+Memory and IO Interfaces
+========================
+
+This section will introduce you to the major concepts in PyArrow's memory
+management and IO systems:
+
+* Buffers
+* Memory pools
+* File-like and stream-like objects
+
+Referencing and Allocating Memory
+=================================
+
+pyarrow.Buffer
+--------------
+
+The :class:`Buffer` object wraps the C++ :cpp:class:`arrow::Buffer` type
+which is the primary tool for memory management in Apache Arrow in C++. It permits
+higher-level array classes to safely interact with memory which they may or may
+not own. ``arrow::Buffer`` can be zero-copy sliced to permit Buffers to cheaply
+reference other Buffers, while preserving memory lifetime and clean
+parent-child relationships.
+
+There are many implementations of ``arrow::Buffer``, but they all provide a
+standard interface: a data pointer and length. This is similar to Python's
+built-in `buffer protocol` and ``memoryview`` objects.
+
+A :class:`Buffer` can be created from any Python object implementing
+the buffer protocol by calling the :func:`py_buffer` function. Let's consider
+a bytes object:
+
+.. ipython:: python
+
+ import pyarrow as pa
+
+ data = b'abcdefghijklmnopqrstuvwxyz'
+ buf = pa.py_buffer(data)
+ buf
+ buf.size
+
+Creating a Buffer in this way does not allocate any memory; it is a zero-copy
+view on the memory exported from the ``data`` bytes object.
+
+External memory, in the form of a raw pointer and size, can also be
+referenced using the :func:`foreign_buffer` function.
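+
+As a brief sketch, assuming the memory is owned by a NumPy array, the
+``base`` argument keeps the owner alive for the buffer's lifetime:
+
+.. code-block:: python
+
+    import numpy as np
+
+    arr = np.arange(16, dtype=np.uint8)
+    # Reference arr's memory without copying it
+    fbuf = pa.foreign_buffer(arr.ctypes.data, arr.nbytes, base=arr)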
+
+Buffers can be used in circumstances where a Python buffer or memoryview is
+required, and such conversions are zero-copy:
+
+.. ipython:: python
+
+ memoryview(buf)
+
+The Buffer's :meth:`~Buffer.to_pybytes` method converts the Buffer's data to a
+Python bytestring (thus making a copy of the data):
+
+.. ipython:: python
+
+ buf.to_pybytes()
+
+Memory Pools
+------------
+
+All memory allocations and deallocations (like ``malloc`` and ``free`` in C)
+are tracked in an instance of :class:`MemoryPool`. This means that we can
+then precisely track the amount of memory that has been allocated:
+
+.. ipython:: python
+
+ pa.total_allocated_bytes()
+
+Let's allocate a resizable :class:`Buffer` from the default pool:
+
+.. ipython:: python
+
+ buf = pa.allocate_buffer(1024, resizable=True)
+ pa.total_allocated_bytes()
+ buf.resize(2048)
+ pa.total_allocated_bytes()
+
+The default allocator requests memory in a minimum increment of 64 bytes. If
+the buffer is garbage-collected, all of the memory is freed:
+
+.. ipython:: python
+
+ buf = None
+ pa.total_allocated_bytes()
+
+Besides the default built-in memory pool, there may be additional memory pools
+to choose from (such as `mimalloc <https://github.com/microsoft/mimalloc>`_),
+depending on how Arrow was built. One can get the backend
+name for a memory pool::
+
+ >>> pa.default_memory_pool().backend_name
+ 'jemalloc'
+
+.. seealso::
+ :ref:`API documentation for memory pools <api.memory_pool>`.
+
+.. seealso::
+ On-GPU buffers using Arrow's optional :doc:`CUDA integration <cuda>`.
+
+
+Input and Output
+================
+
+.. _io.native_file:
+
+The Arrow C++ libraries have several abstract interfaces for different kinds of
+IO objects:
+
+* Read-only streams
+* Read-only files supporting random access
+* Write-only streams
+* Write-only files supporting random access
+* File supporting reads, writes, and random access
+
+In the interest of making these objects behave more like Python's built-in
+``file`` objects, we have defined a :class:`~pyarrow.NativeFile` base class
+which implements the same API as regular Python file objects.
+
+:class:`~pyarrow.NativeFile` has some important features which make it
+preferable to using Python files with PyArrow where possible:
+
+* Other Arrow classes can access the internal C++ IO objects natively, and do
+ not need to acquire the Python GIL
+* Native C++ IO may be able to do zero-copy IO, such as with memory maps
+
+There are several kinds of :class:`~pyarrow.NativeFile` options available:
+
+* :class:`~pyarrow.OSFile`, a native file that uses your operating system's
+ file descriptors
+* :class:`~pyarrow.MemoryMappedFile`, for reading (zero-copy) and writing with
+ memory maps
+* :class:`~pyarrow.BufferReader`, for reading :class:`~pyarrow.Buffer` objects
+ as a file
+* :class:`~pyarrow.BufferOutputStream`, for writing data in-memory, producing a
+ Buffer at the end
+* :class:`~pyarrow.FixedSizeBufferWriter`, for writing data into an already
+ allocated Buffer
+* :class:`~pyarrow.HdfsFile`, for reading and writing data to the Hadoop Filesystem
+* :class:`~pyarrow.PythonFile`, for interfacing with Python file objects in C++
+* :class:`~pyarrow.CompressedInputStream` and
+ :class:`~pyarrow.CompressedOutputStream`, for on-the-fly compression or
+ decompression to/from another stream
+
+There are also high-level APIs to make instantiating common kinds of streams
+easier.
+
+High-Level API
+--------------
+
+Input Streams
+~~~~~~~~~~~~~
+
+The :func:`~pyarrow.input_stream` function allows creating a readable
+:class:`~pyarrow.NativeFile` from various kinds of sources.
+
+* If passed a :class:`~pyarrow.Buffer` or a ``memoryview`` object, a
+ :class:`~pyarrow.BufferReader` will be returned:
+
+ .. ipython:: python
+
+ buf = memoryview(b"some data")
+ stream = pa.input_stream(buf)
+ stream.read(4)
+
+* If passed a string or file path, it will open the given file on disk
+ for reading, creating a :class:`~pyarrow.OSFile`. Optionally, the file
+ can be compressed: if its filename ends with a recognized extension
+ such as ``.gz``, its contents will automatically be decompressed on
+ reading.
+
+ .. ipython:: python
+
+ import gzip
+ with gzip.open('example.gz', 'wb') as f:
+ f.write(b'some data\n' * 3)
+
+ stream = pa.input_stream('example.gz')
+ stream.read()
+
+* If passed a Python file object, it will be wrapped in a :class:`PythonFile`
+ such that the Arrow C++ libraries can read data from it (at the expense
+ of a slight overhead).
+
+Output Streams
+~~~~~~~~~~~~~~
+
+:func:`~pyarrow.output_stream` is the equivalent function for output streams
+and allows creating a writable :class:`~pyarrow.NativeFile`. It has the same
+features as explained above for :func:`~pyarrow.input_stream`, such as being
+able to write to buffers or do on-the-fly compression.
+
+.. ipython:: python
+
+ with pa.output_stream('example1.dat') as stream:
+ stream.write(b'some data')
+
+ f = open('example1.dat', 'rb')
+ f.read()
+
+
+On-Disk and Memory Mapped Files
+-------------------------------
+
+PyArrow includes two ways to interact with data on disk: standard operating
+system-level file APIs, and memory-mapped files. In regular Python we can
+write:
+
+.. ipython:: python
+
+ with open('example2.dat', 'wb') as f:
+ f.write(b'some example data')
+
+Using pyarrow's :class:`~pyarrow.OSFile` class, you can write:
+
+.. ipython:: python
+
+ with pa.OSFile('example3.dat', 'wb') as f:
+ f.write(b'some example data')
+
+For reading files, you can use :class:`~pyarrow.OSFile` or
+:class:`~pyarrow.MemoryMappedFile`. The difference between these is that
+:class:`~pyarrow.OSFile` allocates new memory on each read, like Python file
+objects. In reads from memory maps, the library constructs a buffer referencing
+the mapped memory without any memory allocation or copying:
+
+.. ipython:: python
+
+ file_obj = pa.OSFile('example2.dat')
+ mmap = pa.memory_map('example3.dat')
+ file_obj.read(4)
+ mmap.read(4)
+
+The ``read`` method implements the standard Python file ``read`` API. To read
+into Arrow Buffer objects, use ``read_buffer``:
+
+.. ipython:: python
+
+ mmap.seek(0)
+ buf = mmap.read_buffer(4)
+ print(buf)
+ buf.to_pybytes()
+
+Many tools in PyArrow, particularly the Apache Parquet interface and the file and
+stream messaging tools, are more efficient when used with these ``NativeFile``
+types than with normal Python file objects.
+
+.. ipython:: python
+ :suppress:
+
+ buf = mmap = file_obj = None
+ !rm example.dat
+ !rm example2.dat
+
+In-Memory Reading and Writing
+-----------------------------
+
+To assist with serialization and deserialization of in-memory data, we have
+file interfaces that can read and write to Arrow Buffers.
+
+.. ipython:: python
+
+ writer = pa.BufferOutputStream()
+ writer.write(b'hello, friends')
+
+ buf = writer.getvalue()
+ buf
+ buf.size
+ reader = pa.BufferReader(buf)
+ reader.seek(7)
+ reader.read(7)
+
+These have similar semantics to Python's built-in ``io.BytesIO``.
diff --git a/src/arrow/docs/source/python/numpy.rst b/src/arrow/docs/source/python/numpy.rst
new file mode 100644
index 000000000..870f9cb73
--- /dev/null
+++ b/src/arrow/docs/source/python/numpy.rst
@@ -0,0 +1,75 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _numpy_interop:
+
+NumPy Integration
+=================
+
+PyArrow allows converting back and forth from
+`NumPy <https://www.numpy.org/>`_ arrays to Arrow :ref:`Arrays <data.array>`.
+
+NumPy to Arrow
+--------------
+
+To convert a NumPy array to Arrow, one can simply call the :func:`pyarrow.array`
+factory function.
+
+.. code-block:: pycon
+
+ >>> import numpy as np
+ >>> import pyarrow as pa
+ >>> data = np.arange(10, dtype='int16')
+ >>> arr = pa.array(data)
+ >>> arr
+ <pyarrow.lib.Int16Array object at 0x7fb1d1e6ae58>
+ [
+ 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 8,
+ 9
+ ]
+
+Converting from NumPy supports a wide range of input dtypes, including
+structured dtypes or strings.
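+
+For instance, a minimal sketch converting a NumPy array of strings:
+
+.. code-block:: pycon
+
+   >>> strings = np.array(['foo', 'bar', 'baz'])
+   >>> pa.array(strings).type
+   DataType(string)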
+
+Arrow to NumPy
+--------------
+
+In the reverse direction, it is possible to produce a view of an Arrow Array
+for use with NumPy using the :meth:`~pyarrow.Array.to_numpy` method.
+This is limited to primitive types for which NumPy has the same physical
+representation as Arrow, and assuming the Arrow data has no nulls.
+
+.. code-block:: pycon
+
+ >>> import numpy as np
+ >>> import pyarrow as pa
+ >>> arr = pa.array([4, 5, 6], type=pa.int32())
+ >>> view = arr.to_numpy()
+ >>> view
+ array([4, 5, 6], dtype=int32)
+
+For more complex data types, you have to use the :meth:`~pyarrow.Array.to_pandas`
+method (which will construct a NumPy array with pandas semantics for, e.g.,
+the representation of null values).
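+
+For example, an array containing nulls cannot be viewed zero-copy, but
+``to_pandas`` handles it by falling back to a float representation:
+
+.. code-block:: pycon
+
+   >>> arr = pa.array([1, None, 3], type=pa.int64())
+   >>> arr.to_pandas()
+   0    1.0
+   1    NaN
+   2    3.0
+   dtype: float64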
diff --git a/src/arrow/docs/source/python/pandas.rst b/src/arrow/docs/source/python/pandas.rst
new file mode 100644
index 000000000..aa030cfff
--- /dev/null
+++ b/src/arrow/docs/source/python/pandas.rst
@@ -0,0 +1,480 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _pandas_interop:
+
+Pandas Integration
+==================
+
+To interface with `pandas <https://pandas.pydata.org/>`_, PyArrow provides
+various conversion routines to consume pandas structures and convert back
+to them.
+
+.. note::
+ While pandas uses NumPy as a backend, it has enough peculiarities
+ (such as a different type system, and support for null values) that this
+ is a separate topic from :ref:`numpy_interop`.
+
+To follow examples in this document, make sure to run:
+
+.. ipython:: python
+
+ import pandas as pd
+ import pyarrow as pa
+
+DataFrames
+----------
+
+The equivalent to a pandas DataFrame in Arrow is a :ref:`Table <data.table>`.
+Both consist of a set of named columns of equal length. While pandas only
+supports flat columns, the Table also provides nested columns, so it can
+represent more data than a DataFrame. A full conversion is therefore not always possible.
+
+Conversion from a Table to a DataFrame is done by calling
+:meth:`pyarrow.Table.to_pandas`. The inverse is then achieved by using
+:meth:`pyarrow.Table.from_pandas`.
+
+.. code-block:: python
+
+ import pyarrow as pa
+ import pandas as pd
+
+ df = pd.DataFrame({"a": [1, 2, 3]})
+ # Convert from pandas to Arrow
+ table = pa.Table.from_pandas(df)
+ # Convert back to pandas
+ df_new = table.to_pandas()
+
+ # Infer Arrow schema from pandas
+ schema = pa.Schema.from_pandas(df)
+
+By default ``pyarrow`` tries to preserve and restore the ``.index``
+data as accurately as possible. See the section below for more about
+this, and how to disable this logic.
+
+Series
+------
+
+In Arrow, the most similar structure to a pandas Series is an Array.
+It is a vector that contains data of the same type in contiguous (linear) memory. You can
+convert a pandas Series to an Arrow Array using :meth:`pyarrow.Array.from_pandas`.
+As Arrow Arrays are always nullable, you can supply an optional mask using
+the ``mask`` parameter to mark all null entries.
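+
+A minimal sketch of this conversion, using an explicit mask where ``True``
+marks an entry as null:
+
+.. code-block:: python
+
+    import numpy as np
+
+    s = pd.Series([1, 2, 3])
+    mask = np.array([False, True, False])
+    arr = pa.Array.from_pandas(s, mask=mask)
+    # arr is now [1, null, 3]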
+
+Handling pandas Indexes
+-----------------------
+
+Methods like :meth:`pyarrow.Table.from_pandas` have a
+``preserve_index`` option which defines how to preserve (store) or not
+to preserve (to not store) the data in the ``index`` member of the
+corresponding pandas object. This data is tracked using schema-level
+metadata in the internal ``arrow::Schema`` object.
+
+The default of ``preserve_index`` is ``None``, which behaves as
+follows:
+
+* ``RangeIndex`` is stored as metadata-only, not requiring any extra
+ storage.
+* Other index types are stored as one or more physical data columns in
+ the resulting :class:`Table`
+
+To not store the index at all pass ``preserve_index=False``. Since
+storing a ``RangeIndex`` can cause issues in some limited scenarios
+(such as storing multiple DataFrame objects in a Parquet file), to
+force all index data to be serialized in the resulting table, pass
+``preserve_index=True``.
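+
+A short sketch of the three behaviors:
+
+.. code-block:: python
+
+    df = pd.DataFrame({"a": [1, 2, 3]})
+
+    pa.Table.from_pandas(df)                        # RangeIndex stored as metadata only
+    pa.Table.from_pandas(df, preserve_index=False)  # index not stored at all
+    pa.Table.from_pandas(df, preserve_index=True)   # index stored as a physical column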
+
+Type differences
+----------------
+
+With the current design of pandas and Arrow, it is not possible to convert all
+column types unmodified. One of the main issues here is that pandas has no
+support for nullable columns of arbitrary type. Also ``datetime64`` is currently
+fixed to nanosecond resolution. On the other hand, Arrow might still be missing
+support for some types.
+
+pandas -> Arrow Conversion
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
++------------------------+--------------------------+
+| Source Type (pandas) | Destination Type (Arrow) |
++========================+==========================+
+| ``bool`` | ``BOOL`` |
++------------------------+--------------------------+
+| ``(u)int{8,16,32,64}`` | ``(U)INT{8,16,32,64}`` |
++------------------------+--------------------------+
+| ``float32`` | ``FLOAT`` |
++------------------------+--------------------------+
+| ``float64`` | ``DOUBLE`` |
++------------------------+--------------------------+
+| ``str`` / ``unicode`` | ``STRING`` |
++------------------------+--------------------------+
+| ``pd.Categorical`` | ``DICTIONARY`` |
++------------------------+--------------------------+
+| ``pd.Timestamp`` | ``TIMESTAMP(unit=ns)`` |
++------------------------+--------------------------+
+| ``datetime.date`` | ``DATE`` |
++------------------------+--------------------------+
+
+Arrow -> pandas Conversion
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
++-------------------------------------+--------------------------------------------------------+
+| Source Type (Arrow) | Destination Type (pandas) |
++=====================================+========================================================+
+| ``BOOL`` | ``bool`` |
++-------------------------------------+--------------------------------------------------------+
+| ``BOOL`` *with nulls* | ``object`` (with values ``True``, ``False``, ``None``) |
++-------------------------------------+--------------------------------------------------------+
+| ``(U)INT{8,16,32,64}`` | ``(u)int{8,16,32,64}`` |
++-------------------------------------+--------------------------------------------------------+
+| ``(U)INT{8,16,32,64}`` *with nulls* | ``float64`` |
++-------------------------------------+--------------------------------------------------------+
+| ``FLOAT`` | ``float32`` |
++-------------------------------------+--------------------------------------------------------+
+| ``DOUBLE`` | ``float64`` |
++-------------------------------------+--------------------------------------------------------+
+| ``STRING`` | ``str`` |
++-------------------------------------+--------------------------------------------------------+
+| ``DICTIONARY`` | ``pd.Categorical`` |
++-------------------------------------+--------------------------------------------------------+
+| ``TIMESTAMP(unit=*)`` | ``pd.Timestamp`` (``np.datetime64[ns]``) |
++-------------------------------------+--------------------------------------------------------+
+| ``DATE``                            | ``object`` (with ``datetime.date`` objects)           |
++-------------------------------------+--------------------------------------------------------+
+
+Categorical types
+~~~~~~~~~~~~~~~~~
+
+`Pandas categorical <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`_
+columns are converted to :ref:`Arrow dictionary arrays <data.dictionary>`,
+a special array type optimized to handle a repeated and limited
+number of possible values.
+
+.. ipython:: python
+
+ df = pd.DataFrame({"cat": pd.Categorical(["a", "b", "c", "a", "b", "c"])})
+ df.cat.dtype.categories
+ df
+
+ table = pa.Table.from_pandas(df)
+ table
+
+We can inspect the :class:`~.ChunkedArray` of the created table and see the
+same categories of the Pandas DataFrame.
+
+.. ipython:: python
+
+ column = table[0]
+ chunk = column.chunk(0)
+ chunk.dictionary
+ chunk.indices
+
+Datetime (Timestamp) types
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+`Pandas Timestamps <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html>`_
+use the ``datetime64[ns]`` type in Pandas and are converted to an Arrow
+:class:`~.TimestampArray`.
+
+.. ipython:: python
+
+ df = pd.DataFrame({"datetime": pd.date_range("2020-01-01T00:00:00Z", freq="H", periods=3)})
+ df.dtypes
+ df
+
+ table = pa.Table.from_pandas(df)
+ table
+
+In this example the Pandas Timestamp is time zone aware
+(``UTC`` in this case), and this information is used to create the Arrow
+:class:`~.TimestampArray`.
+
+Date types
+~~~~~~~~~~
+
+While dates can be handled using the ``datetime64[ns]`` type in
+pandas, some systems work with object arrays of Python's built-in
+``datetime.date`` object:
+
+.. ipython:: python
+
+ from datetime import date
+ s = pd.Series([date(2018, 12, 31), None, date(2000, 1, 1)])
+ s
+
+When converting to an Arrow array, the ``date32`` type will be used by
+default:
+
+.. ipython:: python
+
+ arr = pa.array(s)
+ arr.type
+ arr[0]
+
+To use the 64-bit ``date64``, specify this explicitly:
+
+.. ipython:: python
+
+ arr = pa.array(s, type='date64')
+ arr.type
+
+When converting back with ``to_pandas``, object arrays of
+``datetime.date`` objects are returned:
+
+.. ipython:: python
+
+ arr.to_pandas()
+
+If you want to use NumPy's ``datetime64`` dtype instead, pass
+``date_as_object=False``:
+
+.. ipython:: python
+
+ s2 = pd.Series(arr.to_pandas(date_as_object=False))
+ s2.dtype
+
+.. warning::
+
+ As of Arrow ``0.13`` the parameter ``date_as_object`` is ``True``
+ by default. Older versions must pass ``date_as_object=True`` to
+ obtain this behavior
+
+Time types
+~~~~~~~~~~
+
+Built-in ``datetime.time`` objects inside Pandas data structures will be
+converted to an Arrow ``time64`` type and :class:`~.Time64Array`.
+
+.. ipython:: python
+
+ from datetime import time
+ s = pd.Series([time(1, 1, 1), time(2, 2, 2)])
+ s
+
+ arr = pa.array(s)
+ arr.type
+ arr
+
+When converting to pandas, arrays of ``datetime.time`` objects are returned:
+
+.. ipython:: python
+
+ arr.to_pandas()
+
+Nullable types
+--------------
+
+In Arrow all data types are nullable, meaning they support storing missing
+values. In pandas, however, not all data types have support for missing data.
+Most notably, the default integer data types do not, and will get cast
+to float when missing values are introduced. Therefore, when an Arrow array
+or table gets converted to pandas, integer columns will become float when
+missing values are present:
+
+.. code-block:: python
+
+ >>> arr = pa.array([1, 2, None])
+ >>> arr
+ <pyarrow.lib.Int64Array object at 0x7f07d467c640>
+ [
+ 1,
+ 2,
+ null
+ ]
+ >>> arr.to_pandas()
+ 0 1.0
+ 1 2.0
+ 2 NaN
+ dtype: float64
+
+Pandas has experimental nullable data types
+(https://pandas.pydata.org/docs/user_guide/integer_na.html). Arrow supports
+round-trip conversion for those:
+
+.. code-block:: python
+
+ >>> df = pd.DataFrame({'a': pd.Series([1, 2, None], dtype="Int64")})
+ >>> df
+ a
+ 0 1
+ 1 2
+ 2 <NA>
+
+ >>> table = pa.table(df)
+ >>> table
+ pyarrow.Table
+ a: int64
+ ----
+ a: [[1,2,null]]
+
+ >>> table.to_pandas()
+ a
+ 0 1
+ 1 2
+ 2 <NA>
+
+ >>> table.to_pandas().dtypes
+ a Int64
+ dtype: object
+
+This roundtrip conversion works because metadata about the original pandas
+DataFrame gets stored in the Arrow table. However, if you have Arrow data (or
+e.g. a Parquet file) not originating from a pandas DataFrame with nullable
+data types, the default conversion to pandas will not use those nullable
+dtypes.
+
+The :meth:`pyarrow.Table.to_pandas` method has a ``types_mapper`` keyword
+that can be used to override the default data type used for the resulting
+pandas DataFrame. This way, you can instruct Arrow to create a pandas
+DataFrame using nullable dtypes.
+
+.. code-block:: python
+
+ >>> table = pa.table({"a": [1, 2, None]})
+ >>> table.to_pandas()
+ a
+ 0 1.0
+ 1 2.0
+ 2 NaN
+ >>> table.to_pandas(types_mapper={pa.int64(): pd.Int64Dtype()}.get)
+ a
+ 0 1
+ 1 2
+ 2 <NA>
+
+The ``types_mapper`` keyword expects a function that will return the pandas
+data type to use given a pyarrow data type. By using the ``dict.get`` method,
+we can create such a function using a dictionary.
+
+If you want to use all nullable dtypes currently supported by pandas, this
+dictionary becomes:
+
+.. code-block:: python
+
+ dtype_mapping = {
+ pa.int8(): pd.Int8Dtype(),
+ pa.int16(): pd.Int16Dtype(),
+ pa.int32(): pd.Int32Dtype(),
+ pa.int64(): pd.Int64Dtype(),
+ pa.uint8(): pd.UInt8Dtype(),
+ pa.uint16(): pd.UInt16Dtype(),
+ pa.uint32(): pd.UInt32Dtype(),
+ pa.uint64(): pd.UInt64Dtype(),
+ pa.bool_(): pd.BooleanDtype(),
+ pa.float32(): pd.Float32Dtype(),
+ pa.float64(): pd.Float64Dtype(),
+ pa.string(): pd.StringDtype(),
+ }
+
+ df = table.to_pandas(types_mapper=dtype_mapping.get)
+
+
+When using the pandas API for reading Parquet files (``pd.read_parquet(..)``),
+this can also be achieved by passing ``use_nullable_dtypes``:
+
+.. code-block:: python
+
+ df = pd.read_parquet(path, use_nullable_dtypes=True)
+
+
+Memory Usage and Zero Copy
+--------------------------
+
+When converting from Arrow data structures to pandas objects using various
+``to_pandas`` methods, one must occasionally be mindful of issues related to
+performance and memory usage.
+
+Since pandas's internal data representation is generally different from the
+Arrow columnar format, zero copy conversions (where no memory allocation or
+computation is required) are only possible in certain limited cases.
+
+In the worst case scenario, calling ``to_pandas`` will result in two versions
+of the data in memory, one for Arrow and one for pandas, yielding approximately
+twice the memory footprint. We have implemented some mitigations for this case,
+particularly when creating large ``DataFrame`` objects, that we describe below.
+
+Zero Copy Series Conversions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Zero copy conversions from ``Array`` or ``ChunkedArray`` to NumPy arrays or
+pandas Series are possible in certain narrow cases:
+
+* The Arrow data is stored in an integer (signed or unsigned ``int8`` through
+ ``int64``) or floating point type (``float16`` through ``float64``). This
+ includes many numeric types as well as timestamps.
+* The Arrow data has no null values (since these are represented using bitmaps
+ which are not supported by pandas).
+* For ``ChunkedArray``, the data consists of a single chunk,
+ i.e. ``arr.num_chunks == 1``. Multiple chunks will always require a copy
+ because of pandas's contiguousness requirement.
+
+In these scenarios, ``to_pandas`` or ``to_numpy`` will be zero copy. In all
+other scenarios, a copy will be required.
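+
+A sketch illustrating both cases with ``Array.to_numpy``, which by default
+refuses to perform a silent copy:
+
+.. code-block:: python
+
+    arr = pa.array([1, 2, 3], type=pa.int64())
+    arr.to_numpy()                  # zero copy: primitive type, no nulls
+
+    arr_with_nulls = pa.array([1, None, 3])
+    # to_numpy() would raise here because a copy is required;
+    # request the copy explicitly instead:
+    arr_with_nulls.to_numpy(zero_copy_only=False)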
+
+Reducing Memory Use in ``Table.to_pandas``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+As of this writing, pandas applies a data management strategy called
+"consolidation" to collect like-typed DataFrame columns in two-dimensional
+NumPy arrays, referred to internally as "blocks". We have gone to great effort
+to construct the precise "consolidated" blocks so that pandas will not perform
+any further allocation or copies after we hand off the data to
+``pandas.DataFrame``. The obvious downside of this consolidation strategy is
+that it forces a "memory doubling".
+
+To try to limit the potential effects of "memory doubling" during
+``Table.to_pandas``, we provide a couple of options:
+
+* ``split_blocks=True``: when enabled, ``Table.to_pandas`` produces one internal
+ DataFrame "block" for each column, skipping the "consolidation" step. Note
+ that many pandas operations will trigger consolidation anyway, but the peak
+ memory use may be less than the worst case scenario of a full memory
+ doubling. As a result of this option, we are able to do zero copy conversions
+ of columns in the same cases where we can do zero copy with ``Array`` and
+ ``ChunkedArray``.
+* ``self_destruct=True``: this destroys the internal Arrow memory buffers in
+  each column of the ``Table`` object as they are converted to the pandas-compatible
+ representation, potentially releasing memory to the operating system as soon
+ as a column is converted. Note that this renders the calling ``Table`` object
+ unsafe for further use, and any further methods called will cause your Python
+ process to crash.
+
+Used together, the call
+
+.. code-block:: python
+
+ df = table.to_pandas(split_blocks=True, self_destruct=True)
+ del table # not necessary, but a good practice
+
+will yield significantly lower memory usage in some scenarios. Without these
+options, ``to_pandas`` will always double memory.
+
+Note that ``self_destruct=True`` is not guaranteed to save memory. Since the
+conversion happens column by column, memory is also freed column by column. But
+if multiple columns share an underlying buffer, then no memory will be freed
+until all of those columns are converted. In particular, due to implementation
+details, data that comes from IPC or Flight is prone to this, as memory will be
+laid out as follows::
+
+ Record Batch 0: Allocation 0: array 0 chunk 0, array 1 chunk 0, ...
+ Record Batch 1: Allocation 1: array 0 chunk 1, array 1 chunk 1, ...
+ ...
+
+In this case, no memory can be freed until the entire table is converted, even
+with ``self_destruct=True``.
diff --git a/src/arrow/docs/source/python/parquet.rst b/src/arrow/docs/source/python/parquet.rst
new file mode 100644
index 000000000..82461ec5d
--- /dev/null
+++ b/src/arrow/docs/source/python/parquet.rst
@@ -0,0 +1,597 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow
+.. _parquet:
+
+Reading and Writing the Apache Parquet Format
+=============================================
+
+The `Apache Parquet <http://parquet.apache.org/>`_ project provides a
+standardized open-source columnar storage format for use in data analysis
+systems. It was created originally for use in `Apache Hadoop
+<http://hadoop.apache.org/>`_ with systems like `Apache Drill
+<http://drill.apache.org>`_, `Apache Hive <http://hive.apache.org>`_, `Apache
+Impala (incubating) <http://impala.apache.org>`_, and `Apache Spark
+<http://spark.apache.org>`_ adopting it as a shared standard for high
+performance data IO.
+
+Apache Arrow is an ideal in-memory transport layer for data that is being read
+or written with Parquet files. We have been concurrently developing the `C++
+implementation of Apache Parquet <http://github.com/apache/parquet-cpp>`_,
+which includes a native, multithreaded C++ adapter to and from in-memory Arrow
+data. PyArrow includes Python bindings to this code, which thus enables reading
+and writing Parquet files with pandas as well.
+
+Obtaining pyarrow with Parquet Support
+--------------------------------------
+
+If you installed ``pyarrow`` with pip or conda, it should be built with Parquet
+support bundled:
+
+.. ipython:: python
+
+ import pyarrow.parquet as pq
+
+If you are building ``pyarrow`` from source, you must use
+``-DARROW_PARQUET=ON`` when compiling the C++ libraries and enable the Parquet
+extensions when building ``pyarrow``. See the :ref:`Python Development
+<python-development>` page for more details.
+
+Reading and Writing Single Files
+--------------------------------
+
+The functions :func:`~.parquet.read_table` and :func:`~.parquet.write_table`
+read and write the :ref:`pyarrow.Table <data.table>` object, respectively.
+
+Let's look at a simple table:
+
+.. ipython:: python
+
+ import numpy as np
+ import pandas as pd
+ import pyarrow as pa
+
+ df = pd.DataFrame({'one': [-1, np.nan, 2.5],
+ 'two': ['foo', 'bar', 'baz'],
+ 'three': [True, False, True]},
+ index=list('abc'))
+ table = pa.Table.from_pandas(df)
+
+We write this to Parquet format with ``write_table``:
+
+.. ipython:: python
+
+ import pyarrow.parquet as pq
+ pq.write_table(table, 'example.parquet')
+
+This creates a single Parquet file. In practice, a Parquet dataset may consist
+of many files in many directories. We can read a single file back with
+``read_table``:
+
+.. ipython:: python
+
+ table2 = pq.read_table('example.parquet')
+ table2.to_pandas()
+
+You can pass a subset of columns to read, which can be much faster than reading
+the whole file (due to the columnar layout):
+
+.. ipython:: python
+
+ pq.read_table('example.parquet', columns=['one', 'three'])
+
+When reading a subset of columns from a file that used a Pandas dataframe as the
+source, we use ``read_pandas`` to maintain any additional index column data:
+
+.. ipython:: python
+
+ pq.read_pandas('example.parquet', columns=['two']).to_pandas()
+
+We need not use a string to specify the origin of the file. It can be any of:
+
+* A file path as a string
+* A :ref:`NativeFile <io.native_file>` from PyArrow
+* A Python file object
+
+In general, a Python file object will have the worst read performance, while a
+string file path or an instance of :class:`~.NativeFile` (especially memory
+maps) will perform the best.
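+
+As a hedged sketch (the file name ``example.parquet`` comes from the examples
+above), the same file can be read from each kind of source:
+
+.. code-block:: python
+
+    import pyarrow as pa
+    import pyarrow.parquet as pq
+
+    # A string file path
+    table = pq.read_table('example.parquet')
+
+    # A NativeFile, here a memory map, which typically reads fastest
+    with pa.memory_map('example.parquet') as source:
+        table = pq.read_table(source)
+
+    # A plain Python file object, usually the slowest option
+    with open('example.parquet', 'rb') as source:
+        table = pq.read_table(source)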
+
+.. _parquet_mmap:
+
+Reading Parquet and Memory Mapping
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Because Parquet data needs to be decoded from the Parquet format
+and compression, it can't be directly mapped from disk.
+Thus the ``memory_map`` option might perform better on some systems
+but won't help much with resident memory consumption.
+
+.. code-block:: python
+
+ >>> pq_array = pa.parquet.read_table("area1.parquet", memory_map=True)
+ >>> print("RSS: {}MB".format(pa.total_allocated_bytes() >> 20))
+ RSS: 4299MB
+
+ >>> pq_array = pa.parquet.read_table("area1.parquet", memory_map=False)
+ >>> print("RSS: {}MB".format(pa.total_allocated_bytes() >> 20))
+ RSS: 4299MB
+
+If you need to deal with Parquet data bigger than memory, the
+:ref:`dataset` API and partitioning are probably what you are looking for.
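+
+As a hedged sketch of that approach (``'dataset_name/'`` is an illustrative
+directory of Parquet files), the dataset API can stream record batches instead
+of materializing the whole table in memory:
+
+.. code-block:: python
+
+    import pyarrow.dataset as ds
+
+    # Scan the dataset lazily and process one RecordBatch at a time.
+    dataset = ds.dataset('dataset_name/', format='parquet')
+    for batch in dataset.to_batches(columns=['one']):
+        pass  # process each batch incrementally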
+
+Parquet file writing options
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:func:`~pyarrow.parquet.write_table()` has a number of options to
+control various settings when writing a Parquet file.
+
+* ``version``, the Parquet format version to use. ``'1.0'`` ensures
+ compatibility with older readers, while ``'2.4'`` and greater values
+ enable more Parquet types and encodings.
+* ``data_page_size``, to control the approximate size of encoded data
+ pages within a column chunk. This currently defaults to 1MB.
+* ``flavor``, to set compatibility options particular to a Parquet
+ consumer like ``'spark'`` for Apache Spark.
+
+See the :func:`~pyarrow.parquet.write_table()` docstring for more details.
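+
+As an illustrative, non-authoritative sketch combining these options (the
+output file name is made up for this example):
+
+.. code-block:: python
+
+    import pyarrow.parquet as pq
+
+    # Write with an explicit format version, a smaller data page size and
+    # Spark-oriented compatibility settings.
+    pq.write_table(table, 'example_options.parquet',
+                   version='2.4',
+                   data_page_size=512 * 1024,
+                   flavor='spark')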
+
+There are some additional data type handling-specific options
+described below.
+
+Omitting the DataFrame index
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When using ``pa.Table.from_pandas`` to convert to an Arrow table, by default
+one or more special columns are added to keep track of the index (row
+labels). Storing the index takes extra space, so if your index is not valuable,
+you may choose to omit it by passing ``preserve_index=False``.
+
+.. ipython:: python
+
+ df = pd.DataFrame({'one': [-1, np.nan, 2.5],
+ 'two': ['foo', 'bar', 'baz'],
+ 'three': [True, False, True]},
+ index=list('abc'))
+ df
+ table = pa.Table.from_pandas(df, preserve_index=False)
+
+Then we have:
+
+.. ipython:: python
+
+ pq.write_table(table, 'example_noindex.parquet')
+ t = pq.read_table('example_noindex.parquet')
+ t.to_pandas()
+
+Here you see the index did not survive the round trip.
+
+Finer-grained Reading and Writing
+---------------------------------
+
+``read_table`` uses the :class:`~.ParquetFile` class, which has other features:
+
+.. ipython:: python
+
+ parquet_file = pq.ParquetFile('example.parquet')
+ parquet_file.metadata
+ parquet_file.schema
+
+As described in the `Apache Parquet format
+<https://github.com/apache/parquet-format>`_ specification, a Parquet file consists of
+multiple row groups. ``read_table`` will read all of the row groups and
+concatenate them into a single table. You can read individual row groups with
+``read_row_group``:
+
+.. ipython:: python
+
+ parquet_file.num_row_groups
+ parquet_file.read_row_group(0)
+
+We can similarly write a Parquet file with multiple row groups by using
+``ParquetWriter``:
+
+.. ipython:: python
+
+ with pq.ParquetWriter('example2.parquet', table.schema) as writer:
+ for i in range(3):
+ writer.write_table(table)
+
+ pf2 = pq.ParquetFile('example2.parquet')
+ pf2.num_row_groups
+
+Inspecting the Parquet File Metadata
+------------------------------------
+
+The ``FileMetaData`` of a Parquet file can be accessed through
+:class:`~.ParquetFile` as shown above:
+
+.. ipython:: python
+
+ parquet_file = pq.ParquetFile('example.parquet')
+ metadata = parquet_file.metadata
+
+or can also be read directly using :func:`~parquet.read_metadata`:
+
+.. ipython:: python
+
+ metadata = pq.read_metadata('example.parquet')
+ metadata
+
+The returned ``FileMetaData`` object allows you to inspect the
+`Parquet file metadata <https://github.com/apache/parquet-format#metadata>`__,
+such as the row groups and column chunk metadata and statistics:
+
+.. ipython:: python
+
+ metadata.row_group(0)
+ metadata.row_group(0).column(0)
+
+.. ipython:: python
+ :suppress:
+
+ !rm example.parquet
+ !rm example_noindex.parquet
+ !rm example2.parquet
+
+Data Type Handling
+------------------
+
+Reading types as DictionaryArray
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``read_dictionary`` option in ``read_table`` and ``ParquetDataset`` will
+cause columns to be read as ``DictionaryArray``, which will become
+``pandas.Categorical`` when converted to pandas. This option is only valid for
+string and binary column types, and it can yield significantly lower memory use
+and improved performance for columns with many repeated string values.
+
+.. code-block:: python
+
+ pq.read_table(table, where, read_dictionary=['binary_c0', 'stringb_c2'])
+
+Storing timestamps
+~~~~~~~~~~~~~~~~~~
+
+Some Parquet readers may only support timestamps stored in millisecond
+(``'ms'``) or microsecond (``'us'``) resolution. Since pandas uses nanoseconds
+to represent timestamps, this can occasionally be a nuisance. By default
+(when writing version 1.0 Parquet files), the nanoseconds will be cast to
+microseconds ('us').
+
+In addition, we provide the ``coerce_timestamps`` option to allow you to select
+the desired resolution:
+
+.. code-block:: python
+
+ pq.write_table(table, where, coerce_timestamps='ms')
+
+If a cast to a lower resolution value may result in a loss of data, by default
+an exception will be raised. This can be suppressed by passing
+``allow_truncated_timestamps=True``:
+
+.. code-block:: python
+
+ pq.write_table(table, where, coerce_timestamps='ms',
+ allow_truncated_timestamps=True)
+
+Timestamps with nanoseconds can be stored without casting when using the
+more recent Parquet format version 2.6:
+
+.. code-block:: python
+
+ pq.write_table(table, where, version='2.6')
+
+However, many Parquet readers do not yet support this newer format version, and
+therefore the default is to write version 1.0 files. When compatibility across
+different processing frameworks is required, it is recommended to use the
+default version 1.0.
+
+Older Parquet implementations, including some older versions of Apache
+Impala and Apache Spark, use ``INT96``-based storage of
+timestamps, but this is now deprecated. To write timestamps in
+this format, set the ``use_deprecated_int96_timestamps`` option to
+``True`` in ``write_table``.
+
+.. code-block:: python
+
+ pq.write_table(table, where, use_deprecated_int96_timestamps=True)
+
+Compression, Encoding, and File Compatibility
+---------------------------------------------
+
+The most commonly used Parquet implementations use dictionary encoding when
+writing files; if the dictionaries grow too large, then they "fall back" to
+plain encoding. Whether dictionary encoding is used can be toggled using the
+``use_dictionary`` option:
+
+.. code-block:: python
+
+ pq.write_table(table, where, use_dictionary=False)
+
+The data pages within a column in a row group can be compressed after the
+encoding passes (dictionary, RLE encoding). In PyArrow we use Snappy
+compression by default, but Brotli, Gzip, and uncompressed are also supported:
+
+.. code-block:: python
+
+ pq.write_table(table, where, compression='snappy')
+ pq.write_table(table, where, compression='gzip')
+ pq.write_table(table, where, compression='brotli')
+ pq.write_table(table, where, compression='none')
+
+Snappy generally results in better performance, while Gzip may yield smaller
+files.
+
+These settings can also be set on a per-column basis:
+
+.. code-block:: python
+
+ pq.write_table(table, where, compression={'foo': 'snappy', 'bar': 'gzip'},
+ use_dictionary=['foo', 'bar'])
+
+Partitioned Datasets (Multiple Files)
+------------------------------------------------
+
+Multiple Parquet files constitute a Parquet *dataset*. Such a dataset can be
+presented in a number of ways:
+
+* A list of Parquet absolute file paths
+* A directory name containing nested directories defining a partitioned dataset
+
+A dataset partitioned by year and month may look like this on disk:
+
+.. code-block:: text
+
+ dataset_name/
+ year=2007/
+ month=01/
+ 0.parq
+ 1.parq
+ ...
+ month=02/
+ 0.parq
+ 1.parq
+ ...
+ month=03/
+ ...
+ year=2008/
+ month=01/
+ ...
+ ...
+
+Writing to Partitioned Datasets
+-------------------------------
+
+You can write a partitioned dataset for any ``pyarrow`` file system that is a
+file-store (e.g. local, HDFS, S3). The default behaviour when no filesystem is
+provided is to use the local filesystem.
+
+.. code-block:: python
+
+ # Local dataset write
+ pq.write_to_dataset(table, root_path='dataset_name',
+ partition_cols=['one', 'two'])
+
+The root path in this case specifies the parent directory to which data will be
+saved. The partition columns are the column names by which to partition the
+dataset. Columns are partitioned in the order they are given. The partition
+splits are determined by the unique values in the partition columns.
+
+To use another filesystem, you only need to add the ``filesystem`` parameter;
+the individual table writes are wrapped using ``with`` statements, so the
+``pq.write_to_dataset`` function does not need to be.
+
+.. code-block:: python
+
+ # Remote file-system example
+ from pyarrow.fs import HadoopFileSystem
+ fs = HadoopFileSystem(host, port, user=user, kerb_ticket=ticket_cache_path)
+ pq.write_to_dataset(table, root_path='dataset_name',
+ partition_cols=['one', 'two'], filesystem=fs)
+
+Compatibility Note: if you use ``pq.write_to_dataset`` to create a table that
+will then be used by Hive, partition column values must be compatible with
+the allowed character set of the Hive version you are running.
+
+Writing ``_metadata`` and ``_common_metadata`` files
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some processing frameworks such as Spark or Dask (optionally) use ``_metadata``
+and ``_common_metadata`` files with partitioned datasets.
+
+Those files include information about the schema of the full dataset (for
+``_common_metadata``) and potentially all row group metadata of all files in the
+partitioned dataset as well (for ``_metadata``). The actual files are
+metadata-only Parquet files. Note this is not a Parquet standard, but a
+convention set in practice by those frameworks.
+
+Using those files can make creating a Parquet dataset more efficient, since
+the stored schema and file paths of all row groups can be used, instead of
+inferring the schema and crawling the directories for all Parquet files
+(this is especially the case for filesystems where accessing files
+is expensive).
+
+The :func:`~pyarrow.parquet.write_to_dataset` function does not automatically
+write such metadata files, but you can use it to gather the metadata and
+combine and write them manually:
+
+.. code-block:: python
+
+ # Write a dataset and collect metadata information of all written files
+ metadata_collector = []
+ pq.write_to_dataset(table, root_path, metadata_collector=metadata_collector)
+
+ # Write the ``_common_metadata`` parquet file without row groups statistics
+ pq.write_metadata(table.schema, root_path / '_common_metadata')
+
+ # Write the ``_metadata`` parquet file with row groups statistics of all files
+ pq.write_metadata(
+ table.schema, root_path / '_metadata',
+ metadata_collector=metadata_collector
+ )
+
+When not using the :func:`~pyarrow.parquet.write_to_dataset` function, but
+writing the individual files of the partitioned dataset using
+:func:`~pyarrow.parquet.write_table` or :class:`~pyarrow.parquet.ParquetWriter`,
+the ``metadata_collector`` keyword can also be used to collect the FileMetaData
+of the written files. In this case, you need to ensure that you set the file path
+contained in the row group metadata yourself before combining the metadata, and
+the schemas of all different files and collected FileMetaData objects should be
+the same:
+
+.. code-block:: python
+
+ metadata_collector = []
+ pq.write_table(
+ table1, root_path / "year=2017/data1.parquet",
+ metadata_collector=metadata_collector
+ )
+
+ # set the file path relative to the root of the partitioned dataset
+ metadata_collector[-1].set_file_path("year=2017/data1.parquet")
+
+ # combine and write the metadata
+ metadata = metadata_collector[0]
+ for _meta in metadata_collector[1:]:
+ metadata.append_row_groups(_meta)
+ metadata.write_metadata_file(root_path / "_metadata")
+
+ # or use pq.write_metadata to combine and write in a single step
+ pq.write_metadata(
+ table1.schema, root_path / "_metadata",
+ metadata_collector=metadata_collector
+ )
+
+Reading from Partitioned Datasets
+------------------------------------------------
+
+The :class:`~.ParquetDataset` class accepts either a directory name or a list
+of file paths, and can discover and infer some common partition structures,
+such as those produced by Hive:
+
+.. code-block:: python
+
+ dataset = pq.ParquetDataset('dataset_name/')
+ table = dataset.read()
+
+You can also use the convenience function ``read_table`` exposed by
+``pyarrow.parquet`` that avoids the need for an additional Dataset object
+creation step.
+
+.. code-block:: python
+
+ table = pq.read_table('dataset_name')
+
+Note: the partition columns in the original table will have their types
+converted to Arrow dictionary types (pandas categorical) on load. Ordering of
+partition columns is not preserved through the save/load process. If reading
+from a remote filesystem into a pandas dataframe, you may need to run
+``sort_index`` to maintain row ordering (as long as the ``preserve_index``
+option was enabled on write).
+
+.. note::
+
+ The ParquetDataset is being reimplemented based on the new generic Dataset
+ API (see the :ref:`dataset` docs for an overview). This is not yet the
+ default, but can already be enabled by passing the ``use_legacy_dataset=False``
+ keyword to :class:`ParquetDataset` or :func:`read_table`::
+
+ pq.ParquetDataset('dataset_name/', use_legacy_dataset=False)
+
+ Enabling this gives the following new features:
+
+ - Filtering on all columns (using row group statistics) instead of only on
+ the partition keys.
+ - More fine-grained partitioning: support for a directory partitioning scheme
+ in addition to the Hive-like partitioning (e.g. "/2019/11/15/" instead of
+ "/year=2019/month=11/day=15/"), and the ability to specify a schema for
+ the partition keys.
+ - General performance improvement and bug fixes.
+
+ It also has the following changes in behaviour:
+
+ - The partition keys need to be explicitly included in the ``columns``
+ keyword when you want to include them in the result while reading a
+ subset of the columns
+
+ This new implementation is already enabled in ``read_table``, and in the
+ future, this will be turned on by default for ``ParquetDataset``. The new
+ implementation does not yet cover all existing ParquetDataset features (e.g.
+ specifying the ``metadata``, or the ``pieces`` property API). Feedback is
+ very welcome.
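+
+As a hedged sketch of the column filtering mentioned in the note above (the
+column name ``'one'`` and the dataset path are illustrative), predicates can
+be pushed down to row group statistics:
+
+.. code-block:: python
+
+    import pyarrow.parquet as pq
+
+    # Only row groups whose statistics can contain matching rows are read.
+    table = pq.read_table('dataset_name', filters=[('one', '>', 0)])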
+
+
+Using with Spark
+----------------
+
+Spark places some constraints on the types of Parquet files it will read. The
+option ``flavor='spark'`` will set these options automatically and also
+sanitize field characters unsupported by Spark SQL.
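+
+A minimal, hedged example (the output file name is illustrative):
+
+.. code-block:: python
+
+    import pyarrow.parquet as pq
+
+    # Applies Spark-compatible settings and sanitizes field names.
+    pq.write_table(table, 'example_spark.parquet', flavor='spark')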
+
+Multithreaded Reads
+-------------------
+
+Each of the reading functions uses multi-threading by default to read
+columns in parallel. Depending on the speed of IO
+and how expensive it is to decode the columns in a particular file
+(particularly with GZIP compression), this can yield significantly higher data
+throughput.
+
+This can be disabled by specifying ``use_threads=False``.
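+
+For example (reading the illustrative file written earlier on a single thread):
+
+.. code-block:: python
+
+    import pyarrow.parquet as pq
+
+    # Decode all columns serially instead of in parallel.
+    table = pq.read_table('example.parquet', use_threads=False)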
+
+.. note::
+ The number of threads to use concurrently is automatically inferred by Arrow
+ and can be inspected using the :func:`~pyarrow.cpu_count()` function.
+
+Reading from cloud storage
+--------------------------
+
+In addition to local files, pyarrow supports other filesystems, such as cloud
+filesystems, through the ``filesystem`` keyword:
+
+.. code-block:: python
+
+ from pyarrow import fs
+
+ s3 = fs.S3FileSystem(region="us-east-2")
+ table = pq.read_table("bucket/object/key/prefix", filesystem=s3)
+
+Currently, :class:`HDFS <pyarrow.fs.HadoopFileSystem>` and
+:class:`Amazon S3-compatible storage <pyarrow.fs.S3FileSystem>` are
+supported. See the :ref:`filesystem` docs for more details. For those
+built-in filesystems, the filesystem can also be inferred from the file path,
+if specified as a URI:
+
+.. code-block:: python
+
+ table = pq.read_table("s3://bucket/object/key/prefix")
+
+Other filesystems can still be supported if there is an
+`fsspec <https://filesystem-spec.readthedocs.io/en/latest/>`__-compatible
+implementation available. See :ref:`filesystem-fsspec` for more details.
+One example is Azure Blob storage, which can be interfaced through the
+`adlfs <https://github.com/dask/adlfs>`__ package.
+
+.. code-block:: python
+
+ from adlfs import AzureBlobFileSystem
+
+ abfs = AzureBlobFileSystem(account_name="XXXX", account_key="XXXX", container_name="XXXX")
+ table = pq.read_table("file.parquet", filesystem=abfs)
diff --git a/src/arrow/docs/source/python/plasma.rst b/src/arrow/docs/source/python/plasma.rst
new file mode 100644
index 000000000..51c7b6eaf
--- /dev/null
+++ b/src/arrow/docs/source/python/plasma.rst
@@ -0,0 +1,462 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow
+.. _plasma:
+
+The Plasma In-Memory Object Store
+=================================
+
+.. note::
+
+   At present, Plasma is only supported for use on Linux and macOS.
+
+The Plasma API
+--------------
+
+Starting the Plasma store
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can start the Plasma store by issuing a terminal command similar to the
+following:
+
+.. code-block:: bash
+
+ plasma_store -m 1000000000 -s /tmp/plasma
+
+The ``-m`` flag specifies the size of the store in bytes, and the ``-s`` flag
+specifies the socket that the store will listen at. Thus, the above command
+allows the Plasma store to use up to 1GB of memory, and sets the socket to
+``/tmp/plasma``.
+
+Leave the current terminal window open for as long as the Plasma store should
+keep running. Messages, such as those about clients connecting or
+disconnecting, may occasionally be printed to the screen. To stop running the
+Plasma store, you can press ``Ctrl-C`` in the terminal.
+
+Creating a Plasma client
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+To start a Plasma client from Python, call ``plasma.connect`` using the same
+socket name:
+
+.. code-block:: python
+
+ import pyarrow.plasma as plasma
+ client = plasma.connect("/tmp/plasma")
+
+If the following error occurs when running the above Python code, it means
+that either the given socket is incorrect or the ``plasma_store`` process is
+not currently running. Check to see if the Plasma store is still running.
+
+.. code-block:: shell
+
+ >>> client = plasma.connect("/tmp/plasma")
+ Connection to socket failed for pathname /tmp/plasma
+ Could not connect to socket /tmp/plasma
+
+
+Object IDs
+^^^^^^^^^^
+
+Each object in the Plasma store should be associated with a unique ID. The
+Object ID then serves as a key that any client can use to retrieve that object
+from the Plasma store. You can form an ``ObjectID`` object from a byte string of
+length 20.
+
+.. code-block:: shell
+
+ # Create an ObjectID.
+ >>> id = plasma.ObjectID(20 * b"a")
+
+ # The character "a" is encoded as 61 in hex.
+ >>> id
+ ObjectID(6161616161616161616161616161616161616161)
+
+The random generation of Object IDs is often good enough to ensure unique IDs.
+You can easily create a helper function that randomly generates object IDs as
+follows:
+
+.. code-block:: python
+
+ import numpy as np
+
+ def random_object_id():
+ return plasma.ObjectID(np.random.bytes(20))
+
+Putting and Getting Python Objects
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Plasma supports two APIs for creating and accessing objects: a high-level
+API that allows storing and retrieving Python objects, and a low-level
+API that allows creating, writing and sealing buffers and operating on
+the binary data directly. In this section we describe the high-level API.
+
+This is how you can put and get a Python object:
+
+.. code-block:: python
+
+ # Create a python object.
+ object_id = client.put("hello, world")
+
+ # Get the object.
+ client.get(object_id)
+
+This works with all Python objects supported by the Arrow Python object
+serialization.
+
+You can also get multiple objects at the same time (which can be more
+efficient since it avoids IPC round trips):
+
+.. code-block:: python
+
+ # Create multiple python objects.
+ object_id1 = client.put(1)
+ object_id2 = client.put(2)
+ object_id3 = client.put(3)
+
+ # Get the objects.
+ client.get([object_id1, object_id2, object_id3])
+
+Furthermore, it is possible to provide a timeout for the get call. If the
+object is not available within the timeout, the special object
+`pyarrow.ObjectNotAvailable` will be returned.
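+
+A hedged sketch (the object ID below is made up and never put into the store,
+so the call times out):
+
+.. code-block:: python
+
+    # Wait at most 100 milliseconds for an object that does not exist.
+    missing_id = plasma.ObjectID(20 * b"z")
+    result = client.get(missing_id, timeout_ms=100)
+    # result is pyarrow.plasma.ObjectNotAvailable after the timeout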
+
+Creating an Object Buffer
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Objects are created in Plasma in two stages. First, they are **created**, which
+allocates a buffer for the object. At this point, the client can write to the
+buffer and construct the object within the allocated buffer.
+
+To create an object for Plasma, you need to create an object ID, as well as
+give the object's maximum size in bytes.
+
+.. code-block:: python
+
+ # Create an object buffer.
+ object_id = plasma.ObjectID(20 * b"a")
+ object_size = 1000
+ buffer = memoryview(client.create(object_id, object_size))
+
+ # Write to the buffer.
+ for i in range(1000):
+ buffer[i] = i % 128
+
+When the client is done, the client **seals** the buffer, making the object
+immutable, and making it available to other Plasma clients.
+
+.. code-block:: python
+
+ # Seal the object. This makes the object immutable and available to other clients.
+ client.seal(object_id)
+
+
+Getting an Object Buffer
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+After an object has been sealed, any client who knows the object ID can get
+the object buffer.
+
+.. code-block:: python
+
+ # Create a different client. Note that this second client could be
+ # created in the same or in a separate, concurrent Python session.
+ client2 = plasma.connect("/tmp/plasma")
+
+ # Get the object in the second client. This blocks until the object has been sealed.
+ object_id2 = plasma.ObjectID(20 * b"a")
+ [buffer2] = client2.get_buffers([object_id])
+
+If the object has not been sealed yet, then the call to ``client.get_buffers``
+will block until the object has been sealed by the client constructing the
+object. Using the ``timeout_ms`` argument to ``get_buffers``, you can specify
+a timeout in milliseconds, after which control is returned to the caller.
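+
+For example (a hedged sketch using a made-up object ID that is never created):
+
+.. code-block:: python
+
+    # Entries for objects that are still unsealed or unknown come back as
+    # None once the timeout expires.
+    unknown_id = plasma.ObjectID(20 * b"b")
+    [maybe_buffer] = client2.get_buffers([unknown_id], timeout_ms=100)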
+
+.. code-block:: shell
+
+ >>> buffer
+ <memory at 0x7fdbdc96e708>
+ >>> buffer[1]
+ 1
+ >>> buffer2
+ <plasma.plasma.PlasmaBuffer object at 0x7fdbf2770e88>
+ >>> view2 = memoryview(buffer2)
+ >>> view2[1]
+ 1
+ >>> view2[129]
+ 1
+ >>> bytes(buffer[1:4])
+ b'\x01\x02\x03'
+ >>> bytes(view2[1:4])
+ b'\x01\x02\x03'
+
+
+Listing objects in the store
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The objects in the store can be listed in the following way (note that
+this functionality is currently experimental and the concrete representation
+of the object info might change in the future):
+
+.. code-block:: python
+
+ import pyarrow.plasma as plasma
+ import time
+
+ client = plasma.connect("/tmp/plasma")
+
+ client.put("hello, world")
+ # Sleep a little so we get different creation times
+ time.sleep(2)
+ client.put("another object")
+ # Create an object that is not sealed yet
+ object_id = plasma.ObjectID.from_random()
+ client.create(object_id, 100)
+ print(client.list())
+
+ >>> {ObjectID(4cba8f80c54c6d265b46c2cdfcee6e32348b12be): {'construct_duration': 0,
+ >>> 'create_time': 1535223642,
+ >>> 'data_size': 460,
+ >>> 'metadata_size': 0,
+ >>> 'ref_count': 0,
+ >>> 'state': 'sealed'},
+ >>> ObjectID(a7598230b0c26464c9d9c99ae14773ee81485428): {'construct_duration': 0,
+ >>> 'create_time': 1535223644,
+ >>> 'data_size': 460,
+ >>> 'metadata_size': 0,
+ >>> 'ref_count': 0,
+ >>> 'state': 'sealed'},
+ >>> ObjectID(e603ab0c92098ebf08f90bfcea33ff98f6476870): {'construct_duration': -1,
+ >>> 'create_time': 1535223644,
+ >>> 'data_size': 100,
+ >>> 'metadata_size': 0,
+ >>> 'ref_count': 1,
+ >>> 'state': 'created'}}
+
+
+Using Arrow and Pandas with Plasma
+----------------------------------
+
+Storing Arrow Objects in Plasma
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To store an Arrow object in Plasma, we must first **create** the object and then
+**seal** it. However, Arrow objects such as ``Tensors`` may be more complicated
+to write than simple binary data.
+
+To create the object in Plasma, you still need an ``ObjectID`` and a size to
+pass in. To find out the size of your Arrow object, you can use pyarrow
+APIs such as ``pyarrow.ipc.get_tensor_size``.
+
+.. code-block:: python
+
+ import numpy as np
+ import pyarrow as pa
+
+ # Create a pyarrow.Tensor object from a numpy random 2-dimensional array
+ data = np.random.randn(10, 4)
+ tensor = pa.Tensor.from_numpy(data)
+
+ # Create the object in Plasma
+ object_id = plasma.ObjectID(np.random.bytes(20))
+ data_size = pa.ipc.get_tensor_size(tensor)
+ buf = client.create(object_id, data_size)
+
+To write the Arrow ``Tensor`` object into the buffer, wrap the buffer returned
+by Plasma in a ``pyarrow.FixedSizeBufferWriter`` object. A
+``pyarrow.FixedSizeBufferWriter`` is an output stream suitable for Arrow's
+``pyarrow.ipc.write_tensor``:
+
+.. code-block:: python
+
+ # Write the tensor into the Plasma-allocated buffer
+ stream = pa.FixedSizeBufferWriter(buf)
+ pa.ipc.write_tensor(tensor, stream) # Writes tensor's 552 bytes to Plasma stream
+
+To finish storing the Arrow object in Plasma, call ``seal``:
+
+.. code-block:: python
+
+ # Seal the Plasma object
+ client.seal(object_id)
+
+Getting Arrow Objects from Plasma
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To read the object, first retrieve it as a ``PlasmaBuffer`` using its object ID.
+
+.. code-block:: python
+
+ # Get the arrow object by ObjectID.
+ [buf2] = client.get_buffers([object_id])
+
+To convert the ``PlasmaBuffer`` back into an Arrow ``Tensor``, first create a
+pyarrow ``BufferReader`` object from it. You can then pass the ``BufferReader``
+into ``pyarrow.ipc.read_tensor`` to reconstruct the Arrow ``Tensor`` object:
+
+.. code-block:: python
+
+ # Reconstruct the Arrow tensor object.
+ reader = pa.BufferReader(buf2)
+ tensor2 = pa.ipc.read_tensor(reader)
+
+Finally, you can use the Arrow ``Tensor``'s ``to_numpy`` method to convert it
+back into numpy data:
+
+.. code-block:: python
+
+ # Convert back to numpy
+ array = tensor2.to_numpy()
+
+Storing Pandas DataFrames in Plasma
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Storing a Pandas ``DataFrame`` still follows the **create** then **seal**
+process of storing an object in the Plasma store. However, one cannot directly
+write the ``DataFrame`` to Plasma with Pandas alone; Plasma also needs to know
+the size of the ``DataFrame`` in order to allocate a buffer for it.
+
+See :ref:`pandas_interop` for more information on using Arrow with Pandas.
+
+You can create the pyarrow equivalent of a Pandas ``DataFrame`` by using
+``pyarrow.RecordBatch.from_pandas`` to convert it to a ``RecordBatch``.
+
+.. code-block:: python
+
+ import pyarrow as pa
+ import pandas as pd
+
+ # Create a Pandas DataFrame
+ d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
+ 'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
+ df = pd.DataFrame(d)
+
+ # Convert the Pandas DataFrame into a PyArrow RecordBatch
+ record_batch = pa.RecordBatch.from_pandas(df)
+
+Creating the Plasma object requires an ``ObjectID`` and the size of the
+data. Now that we have converted the Pandas ``DataFrame`` into a PyArrow
+``RecordBatch``, use the ``MockOutputStream`` to determine the
+size of the Plasma object.
+
+.. code-block:: python
+
+ # Create the Plasma object from the PyArrow RecordBatch. Most of the work here
+ # is done to determine the size of buffer to request from the object store.
+ object_id = plasma.ObjectID(np.random.bytes(20))
+ mock_sink = pa.MockOutputStream()
+ with pa.RecordBatchStreamWriter(mock_sink, record_batch.schema) as stream_writer:
+ stream_writer.write_batch(record_batch)
+ data_size = mock_sink.size()
+ buf = client.create(object_id, data_size)
+
+The DataFrame can now be written to the buffer as follows.
+
+.. code-block:: python
+
+ # Write the PyArrow RecordBatch to Plasma
+ stream = pa.FixedSizeBufferWriter(buf)
+ with pa.RecordBatchStreamWriter(stream, record_batch.schema) as stream_writer:
+ stream_writer.write_batch(record_batch)
+
+Finally, seal the finished object for use by all clients:
+
+.. code-block:: python
+
+ # Seal the Plasma object
+ client.seal(object_id)
+
+Getting Pandas DataFrames from Plasma
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Since we store the Pandas DataFrame as a PyArrow ``RecordBatch`` object,
+to get the object back from the Plasma store, we follow similar steps
+to those specified in `Getting Arrow Objects from Plasma`_.
+
+We first have to convert the ``PlasmaBuffer`` returned from
+``client.get_buffers`` into an Arrow ``BufferReader`` object.
+
+.. code-block:: python
+
+ # Fetch the Plasma object
+ [data] = client.get_buffers([object_id]) # Get PlasmaBuffer from ObjectID
+ buffer = pa.BufferReader(data)
+
+From the ``BufferReader``, we can create a specific ``RecordBatchStreamReader``
+in Arrow to reconstruct the stored PyArrow ``RecordBatch`` object.
+
+.. code-block:: python
+
+ # Convert object back into an Arrow RecordBatch
+ reader = pa.RecordBatchStreamReader(buffer)
+ record_batch = reader.read_next_batch()
+
+The last step is to convert the PyArrow ``RecordBatch`` object back into
+the original Pandas ``DataFrame`` structure.
+
+.. code-block:: python
+
+ # Convert back into Pandas
+ result = record_batch.to_pandas()
+
+Using Plasma with Huge Pages
+----------------------------
+
+On Linux it is possible to use the Plasma store with huge pages for increased
+throughput. You first need to create a file system and activate huge pages with
+
+.. code-block:: shell
+
+ sudo mkdir -p /mnt/hugepages
+ gid=`id -g`
+ uid=`id -u`
+ sudo mount -t hugetlbfs -o uid=$uid -o gid=$gid none /mnt/hugepages
+ sudo bash -c "echo $gid > /proc/sys/vm/hugetlb_shm_group"
+ sudo bash -c "echo 20000 > /proc/sys/vm/nr_hugepages"
+
+Note that you only need root access to create the file system, not for
+running the object store. You can then start the Plasma store with the ``-d``
+flag for the mount point of the huge page file system and the ``-h`` flag
+which indicates that huge pages are activated:
+
+.. code-block:: shell
+
+ plasma_store -s /tmp/plasma -m 10000000000 -d /mnt/hugepages -h
+
+You can test this with the following script:
+
+.. code-block:: python
+
+ import numpy as np
+ import pyarrow as pa
+ import pyarrow.plasma as plasma
+ import time
+
+ client = plasma.connect("/tmp/plasma")
+
+ data = np.random.randn(100000000)
+ tensor = pa.Tensor.from_numpy(data)
+
+ object_id = plasma.ObjectID(np.random.bytes(20))
+ buf = client.create(object_id, pa.ipc.get_tensor_size(tensor))
+
+ stream = pa.FixedSizeBufferWriter(buf)
+ stream.set_memcopy_threads(4)
+ a = time.time()
+ pa.ipc.write_tensor(tensor, stream)
+ print("Writing took ", time.time() - a)
diff --git a/src/arrow/docs/source/python/timestamps.rst b/src/arrow/docs/source/python/timestamps.rst
new file mode 100644
index 000000000..fb4da5cc0
--- /dev/null
+++ b/src/arrow/docs/source/python/timestamps.rst
@@ -0,0 +1,198 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+**********
+Timestamps
+**********
+
+Arrow/Pandas Timestamps
+=======================
+
+Arrow timestamps are stored as a 64-bit integer with column metadata to
+associate a time unit (e.g. milliseconds, microseconds, or nanoseconds), and an
+optional time zone. Pandas (`Timestamp`) uses a 64-bit integer representing
+nanoseconds and an optional time zone.
+Python/Pandas timestamp types without an associated time zone are referred to
+as "Time Zone Naive". Python/Pandas timestamp types with an associated time
+zone are referred to as "Time Zone Aware".
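+
+As a small illustration of the Arrow side (the units and time zone below are
+arbitrary choices for this example):
+
+.. code-block:: python
+
+    import pyarrow as pa
+
+    # A time-zone-naive and a time-zone-aware Arrow timestamp type.
+    pa.timestamp('ms')            # timestamp[ms]
+    pa.timestamp('us', tz='UTC')  # timestamp[us, tz=UTC]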
+
+
+Timestamp Conversions
+=====================
+
+Pandas/Arrow ⇄ Spark
+--------------------
+
+Spark stores timestamps as 64-bit integers representing microseconds since
+the UNIX epoch. It does not store any metadata about time zones with its
+timestamps.
+
+Spark interprets timestamps with the *session local time zone* (i.e.
+``spark.sql.session.timeZone``). If that time zone is undefined, Spark turns to
+the default system time zone. For simplicity's sake, in the examples below the
+session local time zone is always defined.
+
+This implies a few things when round-tripping timestamps:
+
+#. Time zone information is lost (all timestamps that result from
+   converting from Spark to Arrow/Pandas are "time zone naive").
+#. Timestamps are truncated to microseconds.
+#. The session time zone might have unintuitive impacts on
+ translation of timestamp values.
+
+Pandas to Spark (through Apache Arrow)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following cases assume the Spark configuration
+``spark.sql.execution.arrow.enabled`` is set to ``"true"``.
+
+::
+
+ >>> pdf = pd.DataFrame({'naive': [datetime(2019, 1, 1, 0)],
+ ... 'aware': [Timestamp(year=2019, month=1, day=1,
+ ... nanosecond=500, tz=timezone(timedelta(hours=-8)))]})
+ >>> pdf
+        naive                               aware
+    0 2019-01-01 2019-01-01 00:00:00.000000500-08:00
+
+ >>> spark.conf.set("spark.sql.session.timeZone", "UTC")
+ >>> utc_df = sqlContext.createDataFrame(pdf)
+    >>> utc_df.show()
+ +-------------------+-------------------+
+ | naive| aware|
+ +-------------------+-------------------+
+ |2019-01-01 00:00:00|2019-01-01 08:00:00|
+ +-------------------+-------------------+
+
+Note that conversion of the aware timestamp is shifted to reflect the time
+assuming UTC (it represents the same instant in time). For naive
+timestamps, Spark treats them as being in the system local
+time zone and converts them to UTC. Recall that internally, the schema
+for Spark DataFrames does not store any time zone information with
+timestamps.
+
+Now if the session time zone is set to US Pacific Time (PST) we don't
+see any shift in the display of the aware time zone (it
+still represents the same instant in time):
+
+::
+
+ >>> spark.conf.set("spark.sql.session.timeZone", "US/Pacific")
+ >>> pst_df = sqlContext.createDataFrame(pdf)
+ >>> pst_df.show()
+ +-------------------+-------------------+
+ | naive| aware|
+ +-------------------+-------------------+
+ |2019-01-01 00:00:00|2019-01-01 00:00:00|
+ +-------------------+-------------------+
+
+Looking again at ``utc_df.show()``, we see one of the impacts of the session
+time zone. The naive timestamp was initially converted assuming UTC, so the
+instant it reflects is actually earlier than the naive timestamp from the
+data frame converted with the PST session time zone:
+
+::
+
+ >>> utc_df.show()
+ +-------------------+-------------------+
+ | naive| aware|
+ +-------------------+-------------------+
+ |2018-12-31 16:00:00|2019-01-01 00:00:00|
+ +-------------------+-------------------+
+
+Spark to Pandas
+~~~~~~~~~~~~~~~
+
+We can observe what happens when converting back to Arrow/Pandas. Assuming the
+session time zone is still PST:
+
+::
+
+ >>> pst_df.show()
+ +-------------------+-------------------+
+ | naive| aware|
+ +-------------------+-------------------+
+ |2019-01-01 00:00:00|2019-01-01 00:00:00|
+ +-------------------+-------------------+
+
+
+ >>> pst_df.toPandas()
+ naive aware
+ 0 2019-01-01 2019-01-01
+ >>> pst_df.toPandas().info()
+ <class 'pandas.core.frame.DataFrame'>
+ RangeIndex: 1 entries, 0 to 0
+ Data columns (total 2 columns):
+ naive 1 non-null datetime64[ns]
+ aware 1 non-null datetime64[ns]
+ dtypes: datetime64[ns](2)
+ memory usage: 96.0 bytes
+
+Notice that, in addition to being a "time zone naive" timestamp, the 'aware'
+value will now differ when converting to an epoch offset. Spark does the conversion
+by first converting to the session time zone (or system local time zone if
+the session time zone isn't set) and then localizing to remove the time zone
+information. This results in the timestamp being 8 hours before the original
+time:
+
+::
+
+ >>> pst_df.toPandas()['aware'][0]
+ Timestamp('2019-01-01 00:00:00')
+ >>> pdf['aware'][0]
+ Timestamp('2019-01-01 00:00:00.000000500-0800', tz='UTC-08:00')
+ >>> (pst_df.toPandas()['aware'][0].timestamp()-pdf['aware'][0].timestamp())/3600
+ -8.0
+
+The same type of conversion happens with the data frame converted while
+the session time zone was UTC. In this case both naive and aware
+represent different instants in time (the difference in the naive instant is
+due to the session time zone changing between creating the two data frames):
+
+::
+
+ >>> utc_df.show()
+ +-------------------+-------------------+
+ | naive| aware|
+ +-------------------+-------------------+
+ |2018-12-31 16:00:00|2019-01-01 00:00:00|
+ +-------------------+-------------------+
+
+ >>> utc_df.toPandas()
+ naive aware
+ 0 2018-12-31 16:00:00 2019-01-01
+
+Note that the surprising shift for aware doesn't happen
+when the session time zone is UTC (but the timestamps
+still become "time zone naive"):
+
+::
+
+ >>> spark.conf.set("spark.sql.session.timeZone", "UTC")
+ >>> pst_df.show()
+ +-------------------+-------------------+
+ | naive| aware|
+ +-------------------+-------------------+
+ |2019-01-01 08:00:00|2019-01-01 08:00:00|
+ +-------------------+-------------------+
+
+ >>> pst_df.toPandas()['aware'][0]
+ Timestamp('2019-01-01 08:00:00')
+ >>> pdf['aware'][0]
+ Timestamp('2019-01-01 00:00:00.000000500-0800', tz='UTC-08:00')
+ >>> (pst_df.toPandas()['aware'][0].timestamp()-pdf['aware'][0].timestamp())/3600
+ 0.0
diff --git a/src/arrow/docs/source/r/index.rst b/src/arrow/docs/source/r/index.rst
new file mode 100644
index 000000000..b799544bb
--- /dev/null
+++ b/src/arrow/docs/source/r/index.rst
@@ -0,0 +1,21 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+R docs
+======
+
+Stub page for the R docs; actual source is located in r/ sub-directory.
diff --git a/src/arrow/docs/source/status.rst b/src/arrow/docs/source/status.rst
new file mode 100644
index 000000000..8e3e998df
--- /dev/null
+++ b/src/arrow/docs/source/status.rst
@@ -0,0 +1,239 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+=====================
+Implementation Status
+=====================
+
+The following tables summarize the features available in the various official
+Arrow libraries. Unless otherwise stated, the Python, R, Ruby and C/GLib
+libraries follow the C++ Arrow library.
+
+Data Types
+==========
+
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Data type | C++ | Java | Go | JavaScript | C# | Rust | Julia |
+| (primitive) | | | | | | | |
++===================+=======+=======+=======+============+=======+=======+=======+
+| Null | ✓ | ✓ | ✓ | | | ✓ | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Boolean | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Int8/16/32/64 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| UInt8/16/32/64 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Float16 | | | ✓ | | | | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Float32/64 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Decimal128 | ✓ | ✓ | ✓ | | ✓ | ✓ | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Decimal256 | ✓ | ✓ | | | ✓ | | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Date32/64 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Time32/64 | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Timestamp | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Duration | ✓ | ✓ | ✓ | | | ✓ | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Interval | ✓ | ✓ | ✓ | | | ✓ | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Fixed Size Binary | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Binary | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Large Binary | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Utf8 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Large Utf8 | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Data type | C++ | Java | Go | JavaScript | C# | Rust | Julia |
+| (nested) | | | | | | | |
++===================+=======+=======+=======+============+=======+=======+=======+
+| Fixed Size List | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| List | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Large List | ✓ | ✓ | | | | ✓ | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Struct | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Map | ✓ | ✓ | ✓ | ✓ | | | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Dense Union | ✓ | ✓ | | | | | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Sparse Union | ✓ | ✓ | | | | | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Data type | C++ | Java | Go | JavaScript | C# | Rust | Julia |
+| (special) | | | | | | | |
++===================+=======+=======+=======+============+=======+=======+=======+
+| Dictionary | ✓ | ✓ (1) | | ✓ (1) | | ✓ (1) | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+| Extension | ✓ | ✓ | ✓ | | | | ✓ |
++-------------------+-------+-------+-------+------------+-------+-------+-------+
+
+Notes:
+
+* \(1) Nested dictionaries not supported
+
+.. seealso::
+ The :ref:`format_columnar` specification.
+
+
+IPC Format
+==========
+
++-----------------------------+-------+-------+-------+------------+-------+-------+-------+
+| IPC Feature | C++ | Java | Go | JavaScript | C# | Rust | Julia |
+| | | | | | | | |
++=============================+=======+=======+=======+============+=======+=======+=======+
+| Arrow stream format | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
++-----------------------------+-------+-------+-------+------------+-------+-------+-------+
+| Arrow file format | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
++-----------------------------+-------+-------+-------+------------+-------+-------+-------+
+| Record batches | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
++-----------------------------+-------+-------+-------+------------+-------+-------+-------+
+| Dictionaries | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ |
++-----------------------------+-------+-------+-------+------------+-------+-------+-------+
+| Replacement dictionaries | ✓ | ✓ | | | | | ✓ |
++-----------------------------+-------+-------+-------+------------+-------+-------+-------+
+| Delta dictionaries | ✓ (1) | | | | | | ✓ |
++-----------------------------+-------+-------+-------+------------+-------+-------+-------+
+| Tensors | ✓ | | | | | | |
++-----------------------------+-------+-------+-------+------------+-------+-------+-------+
+| Sparse tensors | ✓ | | | | | | |
++-----------------------------+-------+-------+-------+------------+-------+-------+-------+
+| Buffer compression | ✓ | ✓ (3) | ✓ | | | | ✓ |
++-----------------------------+-------+-------+-------+------------+-------+-------+-------+
+| Endianness conversion | ✓ (2) | | | | | | |
++-----------------------------+-------+-------+-------+------------+-------+-------+-------+
+| Custom schema metadata | ✓ | ✓ | ✓ | | | ✓ | ✓ |
++-----------------------------+-------+-------+-------+------------+-------+-------+-------+
+
+Notes:
+
+* \(1) Delta dictionaries not supported on nested dictionaries
+
+* \(2) Data with non-native endianness can be byte-swapped automatically when reading.
+
+* \(3) The LZ4 codec is currently quite inefficient. ARROW-11901 tracks improving its performance.
+
+.. seealso::
+ The :ref:`format-ipc` specification.
+
+
+Flight RPC
+==========
+
++-----------------------------+-------+-------+-------+------------+-------+-------+-------+
+| Flight RPC Feature | C++ | Java | Go | JavaScript | C# | Rust | Julia |
+| | | | | | | | |
++=============================+=======+=======+=======+============+=======+=======+=======+
+| gRPC transport | ✓ | ✓ | ✓ | | ✓ (1) | | |
++-----------------------------+-------+-------+-------+------------+-------+-------+-------+
+| gRPC + TLS transport | ✓ | ✓ | ✓ | | ✓ | | |
++-----------------------------+-------+-------+-------+------------+-------+-------+-------+
+| RPC error codes | ✓ | ✓ | ✓ | | ✓ | | |
++-----------------------------+-------+-------+-------+------------+-------+-------+-------+
+| Authentication handlers | ✓ | ✓ | ✓ | | ✓ (2) | | |
++-----------------------------+-------+-------+-------+------------+-------+-------+-------+
+| Custom client middleware | ✓ | ✓ | ✓ | | | | |
++-----------------------------+-------+-------+-------+------------+-------+-------+-------+
+| Custom server middleware | ✓ | ✓ | ✓ | | | | |
++-----------------------------+-------+-------+-------+------------+-------+-------+-------+
+
+Notes:
+
+* \(1) No support for handshake or DoExchange.
+* \(2) Support using AspNetCore authentication handlers.
+
+.. seealso::
+ The :ref:`flight-rpc` specification.
+
+
+C Data Interface
+================
+
++-----------------------------+-----+--------+---+------+----+------+--------+------+
+| Feature | C++ | Python | R | Rust | Go | Java | C/GLib | Ruby |
+| | | | | | | | | |
++=============================+=====+========+===+======+====+======+========+======+
+| Schema export | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
++-----------------------------+-----+--------+---+------+----+------+--------+------+
+| Array export | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
++-----------------------------+-----+--------+---+------+----+------+--------+------+
+| Schema import | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
++-----------------------------+-----+--------+---+------+----+------+--------+------+
+| Array import | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
++-----------------------------+-----+--------+---+------+----+------+--------+------+
+
+.. seealso::
+ The :ref:`C Data Interface <c-data-interface>` specification.
+
+
+C Stream Interface (experimental)
+=================================
+
++-----------------------------+-----+--------+----+--------+------+
+| Feature | C++ | Python | Go | C/GLib | Ruby |
+| | | | | | |
++=============================+=====+========+====+========+======+
+| Stream export | ✓ | ✓ | | ✓ | ✓ |
++-----------------------------+-----+--------+----+--------+------+
+| Stream import | ✓ | ✓ | ✓ | ✓ | ✓ |
++-----------------------------+-----+--------+----+--------+------+
+
+.. seealso::
+ The :ref:`C Stream Interface <c-stream-interface>` specification.
+
+
+Third-Party Data Formats
+========================
+
++-----------------------------+---------+---------+-------+------------+-------+---------+-------+
+| Format | C++ | Java | Go | JavaScript | C# | Rust | Julia |
+| | | | | | | | |
++=============================+=========+=========+=======+============+=======+=========+=======+
+| Avro | | R | | | | | |
++-----------------------------+---------+---------+-------+------------+-------+---------+-------+
+| CSV | R | | R/W | | | R/W | R/W |
++-----------------------------+---------+---------+-------+------------+-------+---------+-------+
+| ORC | R/W | R (2) | | | | | |
++-----------------------------+---------+---------+-------+------------+-------+---------+-------+
+| Parquet | R/W | R (3) | | | | R/W (1) | |
++-----------------------------+---------+---------+-------+------------+-------+---------+-------+
+
+Notes:
+
+* *R* = Read supported
+
+* *W* = Write supported
+
+* \(1) Nested read/write not supported.
+
+* \(2) Through JNI bindings. (Provided by ``org.apache.arrow.orc:arrow-orc``)
+
+* \(3) Through JNI bindings to Arrow C++ Datasets. (Provided by ``org.apache.arrow:arrow-dataset``)