| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
| commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
| tree | 64f88b554b444a49f656b6c656111a145cbbaa28 | /src/arrow/docs |
| parent | Initial commit. (diff) | |
| download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip | |
Adding upstream version 18.2.2. (tag: upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/docs')
139 files changed, 22543 insertions, 0 deletions
diff --git a/src/arrow/docs/.gitignore b/src/arrow/docs/.gitignore new file mode 100644 index 000000000..d2e9f6ccc --- /dev/null +++ b/src/arrow/docs/.gitignore @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +_build +source/python/generated diff --git a/src/arrow/docs/Makefile b/src/arrow/docs/Makefile new file mode 100644 index 000000000..fdff066a3 --- /dev/null +++ b/src/arrow/docs/Makefile @@ -0,0 +1,248 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. + +# Do not fail the build if there are warnings +# SPHINXOPTS = -j8 -W +SPHINXOPTS = -j8 + +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# Internal variables. 
+PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source + +.PHONY: help +help: + @echo "Please use \`make <target>' where <target> is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " epub3 to make an epub3" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" + @echo " dummy to check syntax errors of document sources" + +.PHONY: clean +clean: + rm -rf $(BUILDDIR)/* + rm -rf source/python/generated/* + +.PHONY: html +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +.PHONY: dirhtml +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +.PHONY: singlehtml +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +.PHONY: pickle +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +.PHONY: json +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +.PHONY: htmlhelp +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +.PHONY: qthelp +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pyarrow.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pyarrow.qhc" + +.PHONY: applehelp +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. 
The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + +.PHONY: devhelp +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/pyarrow" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pyarrow" + @echo "# devhelp" + +.PHONY: epub +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +.PHONY: epub3 +epub3: + $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 + @echo + @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." + +.PHONY: latex +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +.PHONY: latexpdf +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: latexpdfja +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: text +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +.PHONY: man +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +.PHONY: texinfo +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +.PHONY: info +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +.PHONY: gettext +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +.PHONY: changes +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +.PHONY: linkcheck +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +.PHONY: doctest +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +.PHONY: coverage +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." 
+ +.PHONY: xml +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +.PHONY: pseudoxml +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." + +.PHONY: dummy +dummy: + $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy + @echo + @echo "Build finished. Dummy builder generates no files." diff --git a/src/arrow/docs/README.md b/src/arrow/docs/README.md new file mode 100644 index 000000000..213042641 --- /dev/null +++ b/src/arrow/docs/README.md @@ -0,0 +1,30 @@ +<!--- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +# Apache Arrow Documentation + +This directory contains source files for building the main project +documentation. This includes the [Arrow columnar format specification][2]. + +Instructions for building the documentation site are found in +[docs/source/developers/documentation.rst][1]. The build depends on the API +documentation for some of the project subcomponents. + +[1]: https://github.com/apache/arrow/blob/master/docs/source/developers/documentation.rst +[2]: https://github.com/apache/arrow/tree/master/docs/source/format diff --git a/src/arrow/docs/environment.yml b/src/arrow/docs/environment.yml new file mode 100644 index 000000000..8d1fe9bfb --- /dev/null +++ b/src/arrow/docs/environment.yml @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +channels: +- defaults +- conda-forge +dependencies: +- arrow-cpp +- parquet-cpp +- pyarrow +- numpydoc diff --git a/src/arrow/docs/make.bat b/src/arrow/docs/make.bat new file mode 100644 index 000000000..36f2086c2 --- /dev/null +++ b/src/arrow/docs/make.bat @@ -0,0 +1,52 @@ +@rem Licensed to the Apache Software Foundation (ASF) under one +@rem or more contributor license agreements. See the NOTICE file +@rem distributed with this work for additional information +@rem regarding copyright ownership. 
The ASF licenses this file +@rem to you under the Apache License, Version 2.0 (the +@rem "License"); you may not use this file except in compliance +@rem with the License. You may obtain a copy of the License at +@rem +@rem http://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, +@rem software distributed under the License is distributed on an +@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +@rem KIND, either express or implied. See the License for the +@rem specific language governing permissions and limitations +@rem under the License. + +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/src/arrow/docs/requirements.txt b/src/arrow/docs/requirements.txt new file mode 100644 index 000000000..0dbca6922 --- /dev/null +++ b/src/arrow/docs/requirements.txt @@ -0,0 +1,5 @@ +breathe +ipython +numpydoc +sphinx==2.4.4 +pydata-sphinx-theme diff --git a/src/arrow/docs/source/_static/arrow.png b/src/arrow/docs/source/_static/arrow.png Binary files differnew file mode 100644 index 000000000..72104b075 --- /dev/null +++ b/src/arrow/docs/source/_static/arrow.png diff --git a/src/arrow/docs/source/_static/favicon.ico b/src/arrow/docs/source/_static/favicon.ico Binary files differnew file mode 100644 index 000000000..33a554a8a --- /dev/null +++ b/src/arrow/docs/source/_static/favicon.ico diff --git a/src/arrow/docs/source/_static/theme_overrides.css b/src/arrow/docs/source/_static/theme_overrides.css new file mode 100644 index 000000000..d7d0bdfdb --- /dev/null +++ b/src/arrow/docs/source/_static/theme_overrides.css @@ -0,0 +1,126 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + +/* Customizing with theme CSS variables */ + +:root { + --pst-color-active-navigation: 215, 70, 51; + --pst-color-link-hover: 215, 70, 51; + --pst-color-headerlink: 215, 70, 51; + /* Use normal text color (like h3, ..) 
instead of primary color */ + --pst-color-h1: var(--color-text-base); + --pst-color-h2: var(--color-text-base); + /* Use softer blue from bootstrap's default info color */ + --pst-color-info: 23, 162, 184; + --pst-header-height: 0px; +} + +code { + color: rgb(215, 70, 51); +} + +.footer { + text-align: center; +} + +/* Ensure the logo is properly displayed */ + +.navbar-brand { + height: auto; + width: auto; +} + +a.navbar-brand img { + height: auto; + width: auto; + max-height: 15vh; + max-width: 100%; +} + + +/* This is the bootstrap CSS style for "table-striped". Since the theme does +not yet provide an easy way to configure this globaly, it easier to simply +include this snippet here than updating each table in all rst files to +add ":class: table-striped" */ + +.table tbody tr:nth-of-type(odd) { + background-color: rgba(0, 0, 0, 0.05); +} + +/* Iprove the vertical spacing in the C++ API docs +(ideally this should be upstreamed to the pydata-sphinx-theme */ + +dl.cpp dd p { + margin-bottom:.4rem; +} + +dl.cpp.enumerator { + margin-bottom: 0.2rem; +} + +p.breathe-sectiondef-title { + margin-top: 1rem; +} + +/* Limit the max height of the sidebar navigation section. Because in our +custimized template, there is more content above the navigation, i.e. +larger logo: if we don't decrease the max-height, it will overlap with +the footer. +Details: min(15vh, 110px) for the logo size, 8rem for search box etc*/ + +@media (min-width:720px) { + @supports (position:-webkit-sticky) or (position:sticky) { + .bd-links { + max-height: calc(100vh - min(15vh, 110px) - 8rem) + } + } +} + +/* Styling to get the version dropdown and search box side-by-side on wide screens */ + +#version-search-wrapper { + overflow: hidden; + width: inherit; + display: flex; + flex-wrap: wrap; + justify-content: left; + align-items: center; +} + +#version-button { + padding-left: 0.5rem; + padding-right: 1rem; +} + +#search-box { + flex: 1 0 12em; +} + +/* Fix table text wrapping in RTD theme, + * see https://rackerlabs.github.io/docs-rackspace/tools/rtd-tables.html + */ + +@media screen { + table.docutils td { + /* !important prevents the common CSS stylesheets from overriding + this as on RTD they are loaded after this stylesheet */ + white-space: normal !important; + } +} diff --git a/src/arrow/docs/source/_static/versions.json b/src/arrow/docs/source/_static/versions.json new file mode 100644 index 000000000..d364cfe27 --- /dev/null +++ b/src/arrow/docs/source/_static/versions.json @@ -0,0 +1,26 @@ +[ + { + "name": "6.0 (stable)", + "version": "" + }, + { + "name": "5.0", + "version": "5.0/" + }, + { + "name": "4.0", + "version": "4.0/" + }, + { + "name": "3.0", + "version": "3.0/" + }, + { + "name": "2.0", + "version": "2.0/" + }, + { + "name": "1.0", + "version": "1.0/" + } +]
\ No newline at end of file diff --git a/src/arrow/docs/source/_templates/docs-sidebar.html b/src/arrow/docs/source/_templates/docs-sidebar.html new file mode 100644 index 000000000..fde4435df --- /dev/null +++ b/src/arrow/docs/source/_templates/docs-sidebar.html @@ -0,0 +1,25 @@ + +<a class="navbar-brand" href="{{ pathto(master_doc) }}"> + <img src="{{ pathto('_static/' + logo, 1) }}" class="logo" alt="logo"> +</a> + +<div id="version-search-wrapper"> + +{% include "version-switcher.html" %} + +<form id="search-box" class="bd-search d-flex align-items-center" action="{{ pathto('search') }}" method="get"> + <i class="icon fas fa-search"></i> + <input type="search" class="form-control" name="q" id="search-input" placeholder="{{ theme_search_bar_text }}" aria-label="{{ theme_search_bar_text }}" autocomplete="off" > +</form> + +</div> + +<nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation"> + <div class="bd-toc-item active"> + {% if "python/api" in pagename or "python/generated" in pagename %} + {{ generate_nav_html("sidebar", startdepth=0, maxdepth=3, collapse=False, includehidden=True, titles_only=True) }} + {% else %} + {{ generate_nav_html("sidebar", startdepth=0, maxdepth=4, collapse=False, includehidden=True, titles_only=True) }} + {% endif %} + </div> +</nav> diff --git a/src/arrow/docs/source/_templates/layout.html b/src/arrow/docs/source/_templates/layout.html new file mode 100644 index 000000000..a9d0f30bc --- /dev/null +++ b/src/arrow/docs/source/_templates/layout.html @@ -0,0 +1,5 @@ +{% extends "pydata_sphinx_theme/layout.html" %} + +{# Silence the navbar #} +{% block docs_navbar %} +{% endblock %} diff --git a/src/arrow/docs/source/_templates/version-switcher.html b/src/arrow/docs/source/_templates/version-switcher.html new file mode 100644 index 000000000..24a8c15ac --- /dev/null +++ b/src/arrow/docs/source/_templates/version-switcher.html @@ -0,0 +1,60 @@ +<div id="version-button" class="dropdown"> + <button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown"> + {{ release }} + <span class="caret"></span> + </button> + <div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button"> + <!-- dropdown will be populated by javascript on page load --> + </div> +</div> + +<script type="text/javascript"> +// Function to construct the target URL from the JSON components +function buildURL(entry) { + var template = "{{ switcher_template_url }}"; // supplied by jinja + template = template.replace("{version}", entry.version); + return template; +} + +// Function to check if corresponding page path exists in other version of docs +// and, if so, go there instead of the homepage of the other docs version +function checkPageExistsAndRedirect(event) { + const currentFilePath = "{{ pagename }}.html", + otherDocsHomepage = event.target.getAttribute("href"); + let tryUrl = `${otherDocsHomepage}${currentFilePath}`; + $.ajax({ + type: 'HEAD', + url: tryUrl, + // if the page exists, go there + success: function() { + location.href = tryUrl; + } + }).fail(function() { + location.href = otherDocsHomepage; + }); + return false; +} + +// Function to populate the version switcher +(function () { + // get JSON config + $.getJSON("{{ switcher_json_url }}", function(data, textStatus, jqXHR) { + // create the nodes first (before AJAX calls) to ensure the order is + // correct (for now, links will go to doc version homepage) + $.each(data, function(index, entry) { + // if 
no custom name specified (e.g., "latest"), use version string + if (!("name" in entry)) { + entry.name = entry.version; + } + // construct the appropriate URL, and add it to the dropdown + entry.url = buildURL(entry); + const node = document.createElement("a"); + node.setAttribute("class", "list-group-item list-group-item-action py-1"); + node.setAttribute("href", `${entry.url}`); + node.textContent = `${entry.name}`; + node.onclick = checkPageExistsAndRedirect; + $("#version_switcher").append(node); + }); + }); +})(); +</script> diff --git a/src/arrow/docs/source/c_glib/index.rst b/src/arrow/docs/source/c_glib/index.rst new file mode 100644 index 000000000..56db23f2a --- /dev/null +++ b/src/arrow/docs/source/c_glib/index.rst @@ -0,0 +1,21 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +C/GLib docs +=========== + +Stub page for the C/GLib docs; actual source is located in c_glib/doc/ sub-directory. diff --git a/src/arrow/docs/source/conf.py b/src/arrow/docs/source/conf.py new file mode 100644 index 000000000..150cd4181 --- /dev/null +++ b/src/arrow/docs/source/conf.py @@ -0,0 +1,464 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# + +import datetime +import os +import sys +import warnings +from unittest import mock + +import pyarrow + + +sys.path.extend([ + os.path.join(os.path.dirname(__file__), + '..', '../..') + +]) + +# Suppresses all warnings printed when sphinx is traversing the code (e.g. 
+# deprecation warnings) +warnings.filterwarnings("ignore", category=FutureWarning, message=".*pyarrow.*") + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.doctest', + 'sphinx.ext.ifconfig', + 'sphinx.ext.mathjax', + 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon', + 'IPython.sphinxext.ipython_directive', + 'IPython.sphinxext.ipython_console_highlighting', + 'breathe' +] + +# Show members for classes in .. autosummary +autodoc_default_options = { + 'members': None, + 'undoc-members': None, + 'show-inheritance': None, + 'inherited-members': None +} + +# Breathe configuration +breathe_projects = {"arrow_cpp": "../../cpp/apidoc/xml"} +breathe_default_project = "arrow_cpp" + +# Overriden conditionally below +autodoc_mock_imports = [] + +# ipython directive options +ipython_mplbackend = '' + +# numpydoc configuration +napoleon_use_rtype = False + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# + +source_suffix = ['.rst'] + +autosummary_generate = True + +# The encoding of source files. +# +# source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'Apache Arrow' +copyright = f'2016-{datetime.datetime.now().year} Apache Software Foundation' +author = u'Apache Software Foundation' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = os.environ.get('ARROW_DOCS_VERSION', + pyarrow.__version__) +# The full version, including alpha/beta/rc tags. +release = os.environ.get('ARROW_DOCS_VERSION', + pyarrow.__version__) + +if "+" in release: + release = release.split(".dev")[0] + " (dev)" + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +# +# today = '' +# +# Else, today_fmt is used as the format for a strftime call. +# +# today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +# +# default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +# +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +# +# add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. 
+# +# show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +# modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +# keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'pydata_sphinx_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +html_theme_options = { + "show_toc_level": 2, + "google_analytics_id": "UA-107500873-1", +} + +html_context = { + "switcher_json_url": "/docs/_static/versions.json", + "switcher_template_url": "https://arrow.apache.org/docs/{version}", + # for local testing + # "switcher_template_url": "http://0.0.0.0:8000/docs/{version}", +} + +# Add any paths that contain custom themes here, relative to this directory. +# html_theme_path = [] + +# The name for this set of Sphinx documents. +# "<project> v<release> documentation" by default. +# +html_title = u'Apache Arrow v{}'.format(version) + +# A shorter title for the navigation bar. Default is the same as html_title. +# +# html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +# +html_logo = "_static/arrow.png" + +# The name of an image file (relative to this directory) to use as a favicon of +# the docs. This file should be a Windows icon file (.ico) being 16x16 or +# 32x32 pixels large. +# +html_favicon = "_static/favicon.ico" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom fixes to the RTD theme +html_css_files = ['theme_overrides.css'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +# +# html_extra_path = [] + +# If not None, a 'Last updated on:' timestamp is inserted at every page +# bottom, using the given strftime format. +# The empty string is equivalent to '%b %d, %Y'. +# +# html_last_updated_fmt = None + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +# +# html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +# +html_sidebars = { +# '**': ['sidebar-logo.html', 'sidebar-search-bs.html', 'sidebar-nav-bs.html'], + '**': ['docs-sidebar.html'], +} + +# The base URL which points to the root of the HTML documentation, +# used for canonical url +html_baseurl = "https://arrow.apache.org/docs/" + +# Additional templates that should be rendered to pages, maps page names to +# template names. +# +# html_additional_pages = {} + +# If false, no module index is generated. +# +# html_domain_indices = True + +# If false, no index is generated. +# +# html_use_index = True + +# If true, the index is split into individual pages for each letter. 
+# +# html_split_index = False + +# If true, links to the reST sources are added to the pages. +# +# html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +# +# html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +# +# html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +# +# html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +# html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' +# +# html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# 'ja' uses this config value. +# 'zh' user can custom change `jieba` dictionary path. +# +# html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +# +# html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'arrowdoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'arrow.tex', u'Apache Arrow Documentation', + u'Apache Arrow Team', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# +# latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# +# latex_use_parts = False + +# If true, show page references after internal links. +# +# latex_show_pagerefs = False + +# If true, show URL addresses after external links. +# +# latex_show_urls = False + +# Documents to append as an appendix to all manuals. +# +# latex_appendices = [] + +# It false, will not define \strong, \code, itleref, \crossref ... but only +# \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added +# packages. +# +# latex_keep_old_macro_names = True + +# If false, no module index is generated. +# +# latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'arrow', u'Apache Arrow Documentation', + [author], 1) +] + +# If true, show URL addresses after external links. +# +# man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. 
List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'arrow', u'Apache Arrow Documentation', + author, 'Apache Arrow', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +# +# texinfo_appendices = [] + +# If false, no module index is generated. +# +# texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +# +# texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +# +# texinfo_no_detailmenu = False + + +# -- Customization -------------------------------------------------------- + +# Conditional API doc generation + +# Sphinx has two features for conditional inclusion: +# - The "only" directive +# https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html#including-content-based-on-tags +# - The "ifconfig" extension +# https://www.sphinx-doc.org/en/master/usage/extensions/ifconfig.html +# +# Both have issues, but "ifconfig" seems to work in this setting. + +try: + import pyarrow.cuda + cuda_enabled = True +except ImportError: + cuda_enabled = False + # Mock pyarrow.cuda to avoid autodoc warnings. + # XXX I can't get autodoc_mock_imports to work, so mock manually instead + # (https://github.com/sphinx-doc/sphinx/issues/2174#issuecomment-453177550) + pyarrow.cuda = sys.modules['pyarrow.cuda'] = mock.Mock() + +try: + import pyarrow.flight + flight_enabled = True +except ImportError: + flight_enabled = False + pyarrow.flight = sys.modules['pyarrow.flight'] = mock.Mock() + + +def setup(app): + # Use a config value to indicate whether CUDA API docs can be generated. + # This will also rebuild appropriately when the value changes. + app.add_config_value('cuda_enabled', cuda_enabled, 'env') + app.add_config_value('flight_enabled', flight_enabled, 'env') diff --git a/src/arrow/docs/source/cpp/api.rst b/src/arrow/docs/source/cpp/api.rst new file mode 100644 index 000000000..3df16a178 --- /dev/null +++ b/src/arrow/docs/source/cpp/api.rst @@ -0,0 +1,42 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +************* +API Reference +************* + +.. toctree:: + :maxdepth: 3 + + api/support + api/memory + api/datatype + api/array + api/scalar + api/builder + api/table + api/c_abi + api/compute + api/tensor + api/utilities + api/io + api/ipc + api/formats + api/cuda + api/flight + api/filesystem + api/dataset diff --git a/src/arrow/docs/source/cpp/api/array.rst b/src/arrow/docs/source/cpp/api/array.rst new file mode 100644 index 000000000..7f4e71158 --- /dev/null +++ b/src/arrow/docs/source/cpp/api/array.rst @@ -0,0 +1,80 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. 
or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +====== +Arrays +====== + +.. doxygenclass:: arrow::ArrayData + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::Array + :project: arrow_cpp + :members: + +Concrete array subclasses +========================= + +Primitive and temporal +---------------------- + +.. doxygenclass:: arrow::NullArray + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::BooleanArray + :project: arrow_cpp + :members: + +.. doxygengroup:: numeric-arrays + :content-only: + :members: + +Binary-like +----------- + +.. doxygengroup:: binary-arrays + :content-only: + :members: + +Nested +------ + +.. doxygengroup:: nested-arrays + :content-only: + :members: + +Dictionary-encoded +------------------ + +.. doxygenclass:: arrow::DictionaryArray + :members: + +Extension arrays +---------------- + +.. doxygenclass:: arrow::ExtensionArray + :members: + + +Chunked Arrays +============== + +.. doxygenclass:: arrow::ChunkedArray + :project: arrow_cpp + :members: diff --git a/src/arrow/docs/source/cpp/api/builder.rst b/src/arrow/docs/source/cpp/api/builder.rst new file mode 100644 index 000000000..9e6540aa5 --- /dev/null +++ b/src/arrow/docs/source/cpp/api/builder.rst @@ -0,0 +1,56 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +============== +Array Builders +============== + +.. doxygenclass:: arrow::ArrayBuilder + :members: + +Concrete builder subclasses +=========================== + +.. doxygenclass:: arrow::NullBuilder + :members: + +.. doxygenclass:: arrow::BooleanBuilder + :members: + +.. doxygenclass:: arrow::NumericBuilder + :members: + +.. doxygenclass:: arrow::BinaryBuilder + :members: + +.. doxygenclass:: arrow::StringBuilder + :members: + +.. doxygenclass:: arrow::FixedSizeBinaryBuilder + :members: + +.. doxygenclass:: arrow::Decimal128Builder + :members: + +.. doxygenclass:: arrow::ListBuilder + :members: + +.. doxygenclass:: arrow::StructBuilder + :members: + +.. 
doxygenclass:: arrow::DictionaryBuilder + :members: diff --git a/src/arrow/docs/source/cpp/api/c_abi.rst b/src/arrow/docs/source/cpp/api/c_abi.rst new file mode 100644 index 000000000..4e451c3ec --- /dev/null +++ b/src/arrow/docs/source/cpp/api/c_abi.rst @@ -0,0 +1,48 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +============ +C Interfaces +============ + +.. seealso:: + The :ref:`C data interface <c-data-interface>` and + :ref:`C stream interface <c-stream-interface>` specifications. + +ABI Structures +============== + +.. doxygenstruct:: ArrowSchema + :project: arrow_cpp + +.. doxygenstruct:: ArrowArray + :project: arrow_cpp + +.. doxygenstruct:: ArrowArrayStream + :project: arrow_cpp + +C Data Interface +================ + +.. doxygengroup:: c-data-interface + :content-only: + +C Stream Interface +================== + +.. doxygengroup:: c-stream-interface + :content-only: diff --git a/src/arrow/docs/source/cpp/api/compute.rst b/src/arrow/docs/source/cpp/api/compute.rst new file mode 100644 index 000000000..3b0a89f83 --- /dev/null +++ b/src/arrow/docs/source/cpp/api/compute.rst @@ -0,0 +1,56 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Compute Functions +================= + +Datum class +----------- + +.. doxygenclass:: arrow::Datum + :members: + +Abstract Function classes +------------------------- + +.. doxygengroup:: compute-functions + :content-only: + :members: + +Function registry +----------------- + +.. doxygenclass:: arrow::compute::FunctionRegistry + :members: + +.. doxygenfunction:: arrow::compute::GetFunctionRegistry + +Convenience functions +--------------------- + +.. doxygengroup:: compute-call-function + :content-only: + +Concrete options classes +------------------------ + +.. doxygengroup:: compute-concrete-options + :content-only: + :members: + :undoc-members: + +.. TODO: List concrete function invocation shortcuts? 
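
The compute.rst page added just above documents arrow::Datum, the compute FunctionRegistry, and the CallFunction convenience helpers. As a rough illustration of that API (not part of this patch, and assuming a standard Arrow C++ installation providing arrow/api.h and arrow/compute/api.h), a minimal sketch of invoking a registered kernel could look like this:

```cpp
#include <arrow/api.h>
#include <arrow/compute/api.h>

#include <iostream>

int main() {
  // Build a small Int32 array to feed into a compute kernel.
  arrow::Int32Builder builder;
  if (!builder.AppendValues({1, 2, 3, 4}).ok()) return 1;
  std::shared_ptr<arrow::Array> array;
  if (!builder.Finish(&array).ok()) return 1;

  // CallFunction resolves "sum" in the default FunctionRegistry and
  // returns a Result<Datum>; for this kernel the Datum wraps a scalar.
  auto maybe_sum = arrow::compute::CallFunction("sum", {array});
  if (maybe_sum.ok()) {
    std::cout << maybe_sum->scalar()->ToString() << std::endl;  // prints 10
  }
  return 0;
}
```

This mirrors the Datum, registry, and CallFunction pieces that the page lists via its doxygen directives; the "sum" kernel is simply one example of a registered compute function.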
diff --git a/src/arrow/docs/source/cpp/api/cuda.rst b/src/arrow/docs/source/cpp/api/cuda.rst new file mode 100644 index 000000000..caeb5be31 --- /dev/null +++ b/src/arrow/docs/source/cpp/api/cuda.rst @@ -0,0 +1,74 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +============ +CUDA support +============ + +Contexts +======== + +.. doxygenclass:: arrow::cuda::CudaDeviceManager + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::cuda::CudaContext + :project: arrow_cpp + :members: + +Devices +======= + +.. doxygenclass:: arrow::cuda::CudaDevice + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::cuda::CudaMemoryManager + :project: arrow_cpp + :members: + +Buffers +======= + +.. doxygenclass:: arrow::cuda::CudaBuffer + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::cuda::CudaHostBuffer + :project: arrow_cpp + :members: + +Memory Input / Output +===================== + +.. doxygenclass:: arrow::cuda::CudaBufferReader + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::cuda::CudaBufferWriter + :project: arrow_cpp + :members: + +IPC +=== + +.. doxygenclass:: arrow::cuda::CudaIpcMemHandle + :project: arrow_cpp + :members: + +.. doxygengroup:: cuda-ipc-functions + :content-only: diff --git a/src/arrow/docs/source/cpp/api/dataset.rst b/src/arrow/docs/source/cpp/api/dataset.rst new file mode 100644 index 000000000..3f0df8a45 --- /dev/null +++ b/src/arrow/docs/source/cpp/api/dataset.rst @@ -0,0 +1,71 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +======= +Dataset +======= + +Interface +========= + +.. doxygenclass:: arrow::dataset::Fragment + :members: + +.. doxygenclass:: arrow::dataset::Dataset + :members: + +Partitioning +============ + +.. doxygengroup:: dataset-partitioning + :content-only: + :members: + +Dataset discovery/factories +=========================== + +.. doxygengroup:: dataset-discovery + :content-only: + :members: + +Scanning +======== + +.. 
doxygengroup:: dataset-scanning + :content-only: + :members: + +Concrete implementations +======================== + +.. doxygengroup:: dataset-implementations + :content-only: + :members: + +File System Datasets +-------------------- + +.. doxygengroup:: dataset-filesystem + :content-only: + :members: + +File Formats +------------ + +.. doxygengroup:: dataset-file-formats + :content-only: + :members: diff --git a/src/arrow/docs/source/cpp/api/datatype.rst b/src/arrow/docs/source/cpp/api/datatype.rst new file mode 100644 index 000000000..2cbe1cf4d --- /dev/null +++ b/src/arrow/docs/source/cpp/api/datatype.rst @@ -0,0 +1,102 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +========== +Data Types +========== + +.. doxygenenum:: arrow::Type::type + +.. doxygenclass:: arrow::DataType + :members: + +.. _api-type-factories: + +Factory functions +================= + +These functions are recommended for creating data types. They may return +new objects or existing singletons, depending on the type requested. + +.. doxygengroup:: type-factories + :project: arrow_cpp + :content-only: + +Concrete type subclasses +======================== + +Primitive +--------- + +.. doxygenclass:: arrow::NullType + :members: + +.. doxygenclass:: arrow::BooleanType + :members: + +.. doxygengroup:: numeric-datatypes + :content-only: + :members: + +Temporal +-------- + +.. doxygenenum:: arrow::TimeUnit::type + +.. doxygengroup:: temporal-datatypes + :content-only: + :members: + +Binary-like +----------- + +.. doxygengroup:: binary-datatypes + :content-only: + :members: + +Nested +------ + +.. doxygengroup:: nested-datatypes + :content-only: + :members: + +Dictionary-encoded +------------------ + +.. doxygenclass:: arrow::DictionaryType + :members: + +Extension types +--------------- + +.. doxygenclass:: arrow::ExtensionType + :members: + + +Fields and Schemas +================== + +.. doxygengroup:: schema-factories + :project: arrow_cpp + :content-only: + +.. doxygenclass:: arrow::Field + :members: + +.. doxygenclass:: arrow::Schema + :members: diff --git a/src/arrow/docs/source/cpp/api/filesystem.rst b/src/arrow/docs/source/cpp/api/filesystem.rst new file mode 100644 index 000000000..02fff9a6c --- /dev/null +++ b/src/arrow/docs/source/cpp/api/filesystem.rst @@ -0,0 +1,64 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. 
Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +=========== +Filesystems +=========== + +Interface +========= + +.. doxygenenum:: arrow::fs::FileType + +.. doxygenstruct:: arrow::fs::FileInfo + :members: + +.. doxygenstruct:: arrow::fs::FileSelector + :members: + +.. doxygenclass:: arrow::fs::FileSystem + :members: + +High-level factory function +=========================== + +.. doxygengroup:: filesystem-factories + :content-only: + +Concrete implementations +======================== + +.. doxygenclass:: arrow::fs::SubTreeFileSystem + :members: + +.. doxygenstruct:: arrow::fs::LocalFileSystemOptions + :members: + +.. doxygenclass:: arrow::fs::LocalFileSystem + :members: + +.. doxygenstruct:: arrow::fs::S3Options + :members: + +.. doxygenclass:: arrow::fs::S3FileSystem + :members: + +.. doxygenstruct:: arrow::fs::HdfsOptions + :members: + +.. doxygenclass:: arrow::fs::HadoopFileSystem + :members: diff --git a/src/arrow/docs/source/cpp/api/flight.rst b/src/arrow/docs/source/cpp/api/flight.rst new file mode 100644 index 000000000..7cefd66ef --- /dev/null +++ b/src/arrow/docs/source/cpp/api/flight.rst @@ -0,0 +1,202 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +================ +Arrow Flight RPC +================ + +.. note:: Flight is currently unstable. APIs are subject to change, + though we don't expect drastic changes. + +Common Types +============ + +.. doxygenstruct:: arrow::flight::Action + :project: arrow_cpp + :members: + +.. doxygenstruct:: arrow::flight::ActionType + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::AddCallHeaders + :project: arrow_cpp + :members: + +.. doxygenstruct:: arrow::flight::CallInfo + :project: arrow_cpp + :members: + +.. doxygenstruct:: arrow::flight::Criteria + :project: arrow_cpp + :members: + +.. doxygenstruct:: arrow::flight::FlightDescriptor + :project: arrow_cpp + :members: + +.. doxygenstruct:: arrow::flight::FlightEndpoint + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::FlightInfo + :project: arrow_cpp + :members: + +.. doxygenstruct:: arrow::flight::FlightPayload + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::FlightListing + :project: arrow_cpp + :members: + +.. doxygenenum:: arrow::flight::FlightMethod + :project: arrow_cpp + +.. doxygenstruct:: arrow::flight::Location + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::MetadataRecordBatchReader + :project: arrow_cpp + :members: + +.. 
doxygenstruct:: arrow::flight::Result + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::ResultStream + :project: arrow_cpp + :members: + +.. doxygenstruct:: arrow::flight::Ticket + :project: arrow_cpp + :members: + +Clients +======= + +.. doxygenclass:: arrow::flight::FlightClient + :project: arrow_cpp + :members: + +.. doxygenstruct:: arrow::flight::FlightClientOptions + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::FlightCallOptions + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::ClientAuthHandler + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::ClientMiddleware + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::ClientMiddlewareFactory + :project: arrow_cpp + :members: + +.. doxygentypedef:: arrow::flight::TimeoutDuration + :project: arrow_cpp + +.. doxygenclass:: arrow::flight::FlightStreamReader + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::FlightStreamWriter + :project: arrow_cpp + :members: + +Servers +======= + +.. doxygenclass:: arrow::flight::FlightServerBase + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::FlightServerOptions + :project: arrow_cpp + :members: + +.. doxygenstruct:: arrow::flight::CertKeyPair + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::FlightDataStream + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::FlightMessageReader + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::FlightMetadataWriter + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::RecordBatchStream + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::ServerAuthHandler + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::ServerCallContext + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::ServerMiddleware + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::ServerMiddlewareFactory + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::SimpleFlightListing + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::flight::SimpleResultStream + :project: arrow_cpp + :members: + +Error Handling +============== + +Error handling uses the normal :class:`arrow::Status` class, combined +with a custom :class:`arrow::StatusDetail` object for Flight-specific +error codes. + +.. doxygenenum:: arrow::flight::FlightStatusCode + :project: arrow_cpp + +.. doxygenclass:: arrow::flight::FlightStatusDetail + :project: arrow_cpp + :members: + +.. doxygenfunction:: arrow::flight::MakeFlightError + :project: arrow_cpp diff --git a/src/arrow/docs/source/cpp/api/formats.rst b/src/arrow/docs/source/cpp/api/formats.rst new file mode 100644 index 000000000..2f6b24802 --- /dev/null +++ b/src/arrow/docs/source/cpp/api/formats.rst @@ -0,0 +1,109 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. 
KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +============ +File Formats +============ + +.. _cpp-api-csv: + +CSV +=== + +.. doxygenstruct:: arrow::csv::ConvertOptions + :members: + +.. doxygenstruct:: arrow::csv::ParseOptions + :members: + +.. doxygenstruct:: arrow::csv::ReadOptions + :members: + +.. doxygenstruct:: arrow::csv::WriteOptions + :members: + +.. doxygenclass:: arrow::csv::TableReader + :members: + +.. doxygenfunction:: arrow::csv::MakeCSVWriter(io::OutputStream *, const std::shared_ptr<Schema>&, const WriteOptions&) + +.. doxygenfunction:: arrow::csv::MakeCSVWriter(std::shared_ptr<io::OutputStream>, const std::shared_ptr<Schema>&, const WriteOptions&) + +.. doxygenfunction:: arrow::csv::WriteCSV(const RecordBatch&, const WriteOptions&, arrow::io::OutputStream *) + +.. doxygenfunction:: arrow::csv::WriteCSV(const Table&, const WriteOptions&, arrow::io::OutputStream *) + +.. _cpp-api-json: + +Line-separated JSON +=================== + +.. doxygenenum:: arrow::json::UnexpectedFieldBehavior + +.. doxygenstruct:: arrow::json::ReadOptions + :members: + +.. doxygenstruct:: arrow::json::ParseOptions + :members: + +.. doxygenclass:: arrow::json::TableReader + :members: + +.. _cpp-api-parquet: + +Parquet reader +============== + +.. doxygenclass:: parquet::ReaderProperties + :members: + +.. doxygenclass:: parquet::ArrowReaderProperties + :members: + +.. doxygenclass:: parquet::ParquetFileReader + :members: + +.. doxygenclass:: parquet::arrow::FileReader + :members: + +.. doxygenclass:: parquet::arrow::FileReaderBuilder + :members: + +.. doxygengroup:: parquet-arrow-reader-factories + :content-only: + +.. doxygenclass:: parquet::StreamReader + :members: + +Parquet writer +============== + +.. doxygenclass:: parquet::WriterProperties + :members: + +.. doxygenclass:: parquet::ArrowWriterProperties + :members: + +.. doxygenclass:: parquet::arrow::FileWriter + :members: + +.. doxygenfunction:: parquet::arrow::WriteTable + +.. doxygenclass:: parquet::StreamWriter + :members: + +.. TODO ORC diff --git a/src/arrow/docs/source/cpp/api/io.rst b/src/arrow/docs/source/cpp/api/io.rst new file mode 100644 index 000000000..735136a0d --- /dev/null +++ b/src/arrow/docs/source/cpp/api/io.rst @@ -0,0 +1,95 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +============== +Input / output +============== + +Interfaces +========== + +.. doxygenclass:: arrow::io::FileInterface + :members: + +.. doxygenclass:: arrow::io::Readable + :members: + +.. doxygenclass:: arrow::io::Seekable + :members: + +.. doxygenclass:: arrow::io::Writable + :members: + +.. doxygenclass:: arrow::io::InputStream + :members: + +.. doxygenclass:: arrow::io::RandomAccessFile + :members: + +.. 
doxygenclass:: arrow::io::OutputStream + :members: + +.. doxygenclass:: arrow::io::ReadWriteFileInterface + :members: + +Concrete implementations +======================== + +In-memory streams +----------------- + +.. doxygenclass:: arrow::io::BufferReader + :members: + +.. doxygenclass:: arrow::io::MockOutputStream + :members: + +.. doxygenclass:: arrow::io::BufferOutputStream + :members: + +.. doxygenclass:: arrow::io::FixedSizeBufferWriter + :members: + +Local files +----------- + +.. doxygenclass:: arrow::io::ReadableFile + :members: + +.. doxygenclass:: arrow::io::FileOutputStream + :members: + +.. doxygenclass:: arrow::io::MemoryMappedFile + :members: + +Buffering input / output wrappers +--------------------------------- + +.. doxygenclass:: arrow::io::BufferedInputStream + :members: + +.. doxygenclass:: arrow::io::BufferedOutputStream + :members: + +Compressed input / output wrappers +---------------------------------- + +.. doxygenclass:: arrow::io::CompressedInputStream + :members: + +.. doxygenclass:: arrow::io::CompressedOutputStream + :members: diff --git a/src/arrow/docs/source/cpp/api/ipc.rst b/src/arrow/docs/source/cpp/api/ipc.rst new file mode 100644 index 000000000..6822b986a --- /dev/null +++ b/src/arrow/docs/source/cpp/api/ipc.rst @@ -0,0 +1,90 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +========= +Arrow IPC +========= + +IPC options +=========== + +.. doxygenstruct:: arrow::ipc::IpcReadOptions + :members: + +.. doxygenstruct:: arrow::ipc::IpcWriteOptions + :members: + +Reading IPC streams and files +============================= + +Blocking API +------------ + +Use either of these two classes, depending on which IPC format you want +to read. The file format requires a random-access file, while the stream +format only requires a sequential input stream. + +.. doxygenclass:: arrow::ipc::RecordBatchStreamReader + :members: + +.. doxygenclass:: arrow::ipc::RecordBatchFileReader + :members: + +Event-driven API +---------------- + +To read an IPC stream in event-driven fashion, you must implement a +:class:`~arrow::ipc::Listener` subclass that you will pass to +:class:`~arrow::ipc::StreamDecoder`. + +.. doxygenclass:: arrow::ipc::Listener + :members: + +.. doxygenclass:: arrow::ipc::StreamDecoder + :members: + +Statistics +---------- + +.. doxygenstruct:: arrow::ipc::ReadStats + :members: + +Writing IPC streams and files +============================= + +Blocking API +------------ + +The IPC stream format is only optionally terminated, whereas the IPC file format +must include a terminating footer. Thus a writer of the IPC file format must be +explicitly finalized with :func:`~arrow::ipc::RecordBatchWriter::Close()` or the resulting +file will be corrupt. + +.. 
doxygengroup:: record-batch-writer-factories + :content-only: + +.. doxygenclass:: arrow::ipc::RecordBatchWriter + :members: + +Statistics +---------- + +.. doxygenstruct:: arrow::ipc::WriteStats + :members: diff --git a/src/arrow/docs/source/cpp/api/memory.rst b/src/arrow/docs/source/cpp/api/memory.rst new file mode 100644 index 000000000..807a4e2f7 --- /dev/null +++ b/src/arrow/docs/source/cpp/api/memory.rst @@ -0,0 +1,124 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Memory (management) +=================== + +Devices +------- + +.. doxygenclass:: arrow::Device + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::CPUDevice + :project: arrow_cpp + :members: + +.. doxygenfunction:: arrow::default_cpu_memory_manager + :project: arrow_cpp + +Memory Managers +--------------- + +.. doxygenclass:: arrow::MemoryManager + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::CPUMemoryManager + :project: arrow_cpp + :members: + +Buffers +------- + +.. doxygenclass:: arrow::Buffer + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::MutableBuffer + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::ResizableBuffer + :project: arrow_cpp + :members: + +Memory Pools +------------ + +.. doxygenfunction:: arrow::default_memory_pool + :project: arrow_cpp + +.. doxygenfunction:: arrow::jemalloc_memory_pool + :project: arrow_cpp + +.. doxygenfunction:: arrow::mimalloc_memory_pool + :project: arrow_cpp + +.. doxygenfunction:: arrow::system_memory_pool + :project: arrow_cpp + +.. doxygenclass:: arrow::MemoryPool + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::LoggingMemoryPool + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::ProxyMemoryPool + :project: arrow_cpp + :members: + +Allocation Functions +-------------------- + +These functions allocate a buffer from a particular memory pool. + +.. doxygengroup:: buffer-allocation-functions + :project: arrow_cpp + :content-only: + +Slicing +------- + +.. doxygengroup:: buffer-slicing-functions + :project: arrow_cpp + :content-only: + +Buffer Builders +--------------- + +.. doxygenclass:: arrow::BufferBuilder + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::TypedBufferBuilder + :project: arrow_cpp + :members: + +STL Integration +--------------- + +.. doxygenclass:: arrow::stl::allocator + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::stl::STLMemoryPool + :project: arrow_cpp + :members: diff --git a/src/arrow/docs/source/cpp/api/scalar.rst b/src/arrow/docs/source/cpp/api/scalar.rst new file mode 100644 index 000000000..391c9d57b --- /dev/null +++ b/src/arrow/docs/source/cpp/api/scalar.rst @@ -0,0 +1,38 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. 
See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +======= +Scalars +======= + +.. doxygenstruct:: arrow::Scalar + :project: arrow_cpp + :members: + +Factory functions +================= + +.. doxygengroup:: scalar-factories + :content-only: + +Concrete scalar subclasses +========================== + +.. doxygengroup:: concrete-scalar-classes + :content-only: + :members: + :undoc-members: diff --git a/src/arrow/docs/source/cpp/api/support.rst b/src/arrow/docs/source/cpp/api/support.rst new file mode 100644 index 000000000..c3310e5d8 --- /dev/null +++ b/src/arrow/docs/source/cpp/api/support.rst @@ -0,0 +1,57 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +=================== +Programming Support +=================== + +General information +------------------- + +.. doxygenfunction:: arrow::GetBuildInfo + :project: arrow_cpp + +.. doxygenstruct:: arrow::BuildInfo + :project: arrow_cpp + :members: + +Error return and reporting +-------------------------- + +.. doxygenclass:: arrow::Status + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::StatusDetail + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::Result + :project: arrow_cpp + :members: + +.. doxygenclass:: parquet::ParquetException + :project: arrow_cpp + :members: + +.. doxygendefine:: ARROW_RETURN_NOT_OK + +.. doxygendefine:: ARROW_ASSIGN_OR_RAISE + +.. doxygendefine:: PARQUET_THROW_NOT_OK + +.. doxygendefine:: PARQUET_ASSIGN_OR_THROW diff --git a/src/arrow/docs/source/cpp/api/table.rst b/src/arrow/docs/source/cpp/api/table.rst new file mode 100644 index 000000000..53e2d72e6 --- /dev/null +++ b/src/arrow/docs/source/cpp/api/table.rst @@ -0,0 +1,45 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. 
Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +======================== +Two-dimensional Datasets +======================== + +Record Batches +============== + +.. doxygenclass:: arrow::RecordBatch + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::RecordBatchReader + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::TableBatchReader + :project: arrow_cpp + :members: + +Tables +====== + +.. doxygenclass:: arrow::Table + :project: arrow_cpp + :members: + +.. doxygenfunction:: arrow::ConcatenateTables + :project: arrow_cpp diff --git a/src/arrow/docs/source/cpp/api/tensor.rst b/src/arrow/docs/source/cpp/api/tensor.rst new file mode 100644 index 000000000..1d51786db --- /dev/null +++ b/src/arrow/docs/source/cpp/api/tensor.rst @@ -0,0 +1,57 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +======= +Tensors +======= + +Dense Tensors +============= + +.. doxygenclass:: arrow::Tensor + :members: + +.. doxygenclass:: arrow::NumericTensor + :members: + +Sparse Tensors +============== + +.. doxygenenum:: arrow::SparseTensorFormat::type + +.. doxygenclass:: arrow::SparseIndex + :members: + +.. doxygenclass:: arrow::SparseCOOIndex + :members: + +.. doxygenclass:: arrow::SparseCSRIndex + :members: + +.. doxygenclass:: arrow::SparseTensor + :members: + +.. doxygenclass:: arrow::SparseTensorImpl + :members: + +.. doxygentypedef:: arrow::SparseCOOTensor + +.. doxygentypedef:: arrow::SparseCSCMatrix + +.. doxygentypedef:: arrow::SparseCSFTensor + +.. doxygentypedef:: arrow::SparseCSRMatrix diff --git a/src/arrow/docs/source/cpp/api/utilities.rst b/src/arrow/docs/source/cpp/api/utilities.rst new file mode 100644 index 000000000..87c5a3bbe --- /dev/null +++ b/src/arrow/docs/source/cpp/api/utilities.rst @@ -0,0 +1,52 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. 
specific language governing permissions and limitations
+.. under the License.
+
+=========
+Utilities
+=========
+
+Decimal Numbers
+===============
+
+.. doxygenclass:: arrow::Decimal128
+   :project: arrow_cpp
+   :members:
+
+Abstract Sequences
+==================
+
+.. doxygenclass:: arrow::Iterator
+   :project: arrow_cpp
+   :members:
+
+.. doxygenclass:: arrow::VectorIterator
+   :project: arrow_cpp
+   :members:
+
+Compression
+===========
+
+.. doxygenenum:: arrow::Compression::type
+
+.. doxygenclass:: arrow::util::Codec
+   :members:
+
+.. doxygenclass:: arrow::util::Compressor
+   :members:
+
+.. doxygenclass:: arrow::util::Decompressor
+   :members:
diff --git a/src/arrow/docs/source/cpp/arrays.rst b/src/arrow/docs/source/cpp/arrays.rst
new file mode 100644
index 000000000..ff76e9d02
--- /dev/null
+++ b/src/arrow/docs/source/cpp/arrays.rst
@@ -0,0 +1,225 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+======
+Arrays
+======
+
+.. seealso::
+   :doc:`Array API reference <api/array>`
+
+The central type in Arrow is the class :class:`arrow::Array`. An array
+represents a known-length sequence of values all having the same type.
+Internally, those values are represented by one or several buffers, the
+number and meaning of which depend on the array's data type, as documented
+in :ref:`the Arrow data layout specification <format_layout>`.
+
+Those buffers consist of the value data itself and an optional bitmap buffer
+that indicates which array entries are null values. The bitmap buffer
+can be entirely omitted if the array is known to have zero null values.
+
+There are concrete subclasses of :class:`arrow::Array` for each data type
+that help you access individual values of the array.
+
+Building an array
+=================
+
+Available strategies
+--------------------
+
+As Arrow objects are immutable, they cannot be populated directly like for
+example a ``std::vector``. Instead, several strategies can be used:
+
+* if the data already exists in memory with the right layout, you can wrap
+  said memory inside :class:`arrow::Buffer` instances and then construct
+  a :class:`arrow::ArrayData` describing the array;
+
+  .. seealso:: :ref:`cpp_memory_management`
+
+* otherwise, the :class:`arrow::ArrayBuilder` base class and its concrete
+  subclasses help building up array data incrementally, without having to
+  deal with details of the Arrow format yourself.
+
+Using ArrayBuilder and its subclasses
+-------------------------------------
+
+To build an ``Int64`` Arrow array, we can use the :class:`arrow::Int64Builder`
+class. In the following example, we build an array of the range 1 to 8 where
+the element that should hold the value 4 is nulled::
+
+   arrow::Int64Builder builder;
+   builder.Append(1);
+   builder.Append(2);
+   builder.Append(3);
+   builder.AppendNull();
+   builder.Append(5);
+   builder.Append(6);
+   builder.Append(7);
+   builder.Append(8);
+
+   auto maybe_array = builder.Finish();
+   if (!maybe_array.ok()) {
+     // ... do something on array building failure
+   }
+   std::shared_ptr<arrow::Array> array = *maybe_array;
+
+The resulting Array (which can be cast to the concrete :class:`arrow::Int64Array`
+subclass if you want to access its values) then consists of two
+:class:`arrow::Buffer`\s.
+The first buffer holds the null bitmap, which consists here of a single byte with
+the bits ``1|1|1|1|0|1|1|1``. As we use `least-significant bit (LSB) numbering`_,
+this indicates that the fourth entry in the array is null. The second
+buffer is simply an ``int64_t`` array containing all the above values.
+As the fourth entry is null, the value at that position in the buffer is
+undefined.
+
+Here is how you could access the concrete array's contents::
+
+   // Cast the Array to its actual type to access its data
+   auto int64_array = std::static_pointer_cast<arrow::Int64Array>(array);
+
+   // Get the pointer to the null bitmap.
+   const uint8_t* null_bitmap = int64_array->null_bitmap_data();
+
+   // Get the pointer to the actual data
+   const int64_t* data = int64_array->raw_values();
+
+   // Alternatively, given an array index, query its null bit and value directly
+   int64_t index = 2;
+   if (!int64_array->IsNull(index)) {
+     int64_t value = int64_array->Value(index);
+   }
+
+.. note::
+   :class:`arrow::Int64Array` (respectively :class:`arrow::Int64Builder`) is
+   just a ``typedef``, provided for convenience, of ``arrow::NumericArray<Int64Type>``
+   (respectively ``arrow::NumericBuilder<Int64Type>``).
+
+.. _least-significant bit (LSB) numbering: https://en.wikipedia.org/wiki/Bit_numbering
+
+Performance
+-----------
+
+While it is possible to build an array value-by-value as in the example above,
+to attain highest performance it is recommended to use the bulk appending
+methods (usually named ``AppendValues``) in the concrete :class:`arrow::ArrayBuilder`
+subclasses.
+
+If you know the number of elements in advance, it is also recommended to
+presize the working area by calling the :func:`~arrow::ArrayBuilder::Resize`
+or :func:`~arrow::ArrayBuilder::Reserve` methods.
+ +Here is how one could rewrite the above example to take advantage of those +APIs:: + + arrow::Int64Builder builder; + // Make place for 8 values in total + builder.Reserve(8); + // Bulk append the given values (with a null in 4th place as indicated by the + // validity vector) + std::vector<bool> validity = {true, true, true, false, true, true, true, true}; + std::vector<int64_t> values = {1, 2, 3, 0, 5, 6, 7, 8}; + builder.AppendValues(values, validity); + + auto maybe_array = builder.Finish(); + +If you still must append values one by one, some concrete builder subclasses +have methods marked "Unsafe" that assume the working area has been correctly +presized, and offer higher performance in exchange:: + + arrow::Int64Builder builder; + // Make place for 8 values in total + builder.Reserve(8); + builder.UnsafeAppend(1); + builder.UnsafeAppend(2); + builder.UnsafeAppend(3); + builder.UnsafeAppendNull(); + builder.UnsafeAppend(5); + builder.UnsafeAppend(6); + builder.UnsafeAppend(7); + builder.UnsafeAppend(8); + + auto maybe_array = builder.Finish(); + +Size Limitations and Recommendations +==================================== + +Some array types are structurally limited to 32-bit sizes. This is the case +for list arrays (which can hold up to 2^31 elements), string arrays and binary +arrays (which can hold up to 2GB of binary data), at least. Some other array +types can hold up to 2^63 elements in the C++ implementation, but other Arrow +implementations can have a 32-bit size limitation for those array types as well. + +For these reasons, it is recommended that huge data be chunked in subsets of +more reasonable size. + +Chunked Arrays +============== + +A :class:`arrow::ChunkedArray` is, like an array, a logical sequence of values; +but unlike a simple array, a chunked array does not require the entire sequence +to be physically contiguous in memory. Also, the constituents of a chunked array +need not have the same size, but they must all have the same data type. + +A chunked array is constructed by aggregating any number of arrays. Here we'll +build a chunked array with the same logical values as in the example above, +but in two separate chunks:: + + std::vector<std::shared_ptr<arrow::Array>> chunks; + std::shared_ptr<arrow::Array> array; + + // Build first chunk + arrow::Int64Builder builder; + builder.Append(1); + builder.Append(2); + builder.Append(3); + if (!builder.Finish(&array).ok()) { + // ... do something on array building failure + } + chunks.push_back(std::move(array)); + + // Build second chunk + builder.Reset(); + builder.AppendNull(); + builder.Append(5); + builder.Append(6); + builder.Append(7); + builder.Append(8); + if (!builder.Finish(&array).ok()) { + // ... do something on array building failure + } + chunks.push_back(std::move(array)); + + auto chunked_array = std::make_shared<arrow::ChunkedArray>(std::move(chunks)); + + assert(chunked_array->num_chunks() == 2); + // Logical length in number of values + assert(chunked_array->length() == 8); + assert(chunked_array->null_count() == 1); + +Slicing +======= + +Like for physical memory buffers, it is possible to make zero-copy slices +of arrays and chunked arrays, to obtain an array or chunked array referring +to some logical subsequence of the data. This is done by calling the +:func:`arrow::Array::Slice` and :func:`arrow::ChunkedArray::Slice` methods, +respectively. 
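+
+For instance, here is a minimal sketch of slicing, assuming ``array`` and
+``chunked_array`` are the eight-value array and chunked array built in the
+examples above::
+
+   // Zero-copy view of three values starting at offset 2
+   std::shared_ptr<arrow::Array> array_slice = array->Slice(2, 3);
+
+   // A slice of a chunked array may span chunk boundaries
+   std::shared_ptr<arrow::ChunkedArray> chunked_slice = chunked_array->Slice(2, 5);
+
+   assert(array_slice->length() == 3);
+   assert(chunked_slice->length() == 5);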
+ diff --git a/src/arrow/docs/source/cpp/build_system.rst b/src/arrow/docs/source/cpp/build_system.rst new file mode 100644 index 000000000..c0d05e9da --- /dev/null +++ b/src/arrow/docs/source/cpp/build_system.rst @@ -0,0 +1,136 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +=================================== +Using Arrow C++ in your own project +=================================== + +This section assumes you already have the Arrow C++ libraries on your +system, either after installing them using a package manager or after +:ref:`building them yourself <building-arrow-cpp>`. + +The recommended way to integrate the Arrow C++ libraries in your own +C++ project is to use CMake's `find_package +<https://cmake.org/cmake/help/latest/command/find_package.html>`_ +function for locating and integrating dependencies. If you don't use +CMake as a build system, you can use `pkg-config +<https://www.freedesktop.org/wiki/Software/pkg-config/>`_ to find +installed the Arrow C++ libraries. + +CMake +===== + +Basic usage +----------- + +This minimal ``CMakeLists.txt`` file compiles a ``my_example.cc`` source +file into an executable linked with the Arrow C++ shared library: + +.. code-block:: cmake + + project(MyExample) + + find_package(Arrow REQUIRED) + + add_executable(my_example my_example.cc) + target_link_libraries(my_example PRIVATE arrow_shared) + +Available variables and targets +------------------------------- + +The directive ``find_package(Arrow REQUIRED)`` asks CMake to find an Arrow +C++ installation on your system. When it returns, it will have set a few +CMake variables: + +* ``${Arrow_FOUND}`` is true if the Arrow C++ libraries have been found +* ``${ARROW_VERSION}`` contains the Arrow version string +* ``${ARROW_FULL_SO_VERSION}`` contains the Arrow DLL version string + +In addition, it will have created some targets that you can link against +(note these are plain strings, not variables): + +* ``arrow_shared`` links to the Arrow shared libraries +* ``arrow_static`` links to the Arrow static libraries + +In most cases, it is recommended to use the Arrow shared libraries. + +.. note:: + CMake is case-sensitive. The names and variables listed above have to be + spelt exactly that way! + +.. seealso:: + A Docker-based :doc:`minimal build example <examples/cmake_minimal_build>`. + +pkg-config +========== + +Basic usage +----------- + +You can get suitable build flags by the following command line: + +.. code-block:: shell + + pkg-config --cflags --libs arrow + +If you want to link the Arrow C++ static library, you need to add +``--static`` option: + +.. 
code-block:: shell + + pkg-config --cflags --libs --static arrow + +This minimal ``Makefile`` file compiles a ``my_example.cc`` source +file into an executable linked with the Arrow C++ shared library: + +.. code-block:: makefile + + my_example: my_example.cc + $(CXX) -o $@ $(CXXFLAGS) $< $$(pkg-config --cflags --libs arrow) + +Many build systems support pkg-config. For example: + + * `GNU Autotools <https://people.freedesktop.org/~dbn/pkg-config-guide.html#using>`_ + * `CMake <https://cmake.org/cmake/help/latest/module/FindPkgConfig.html>`_ + (But you should use ``find_package(Arrow)`` instead.) + * `Meson <https://mesonbuild.com/Reference-manual.html#dependency>`_ + +Available packages +------------------ + +The Arrow C++ provides a pkg-config package for each module. Here are +all available packages: + + * ``arrow-csv`` + * ``arrow-cuda`` + * ``arrow-dataset`` + * ``arrow-filesystem`` + * ``arrow-flight-testing`` + * ``arrow-flight`` + * ``arrow-json`` + * ``arrow-orc`` + * ``arrow-python-flight`` + * ``arrow-python`` + * ``arrow-tensorflow`` + * ``arrow-testing`` + * ``arrow`` + * ``gandiva`` + * ``parquet`` + * ``plasma`` diff --git a/src/arrow/docs/source/cpp/compute.rst b/src/arrow/docs/source/cpp/compute.rst new file mode 100644 index 000000000..dd5696020 --- /dev/null +++ b/src/arrow/docs/source/cpp/compute.rst @@ -0,0 +1,1606 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp +.. cpp:namespace:: arrow::compute + +================= +Compute Functions +================= + +The generic Compute API +======================= + +.. TODO: describe API and how to invoke compute functions + +Functions and function registry +------------------------------- + +Functions represent compute operations over inputs of possibly varying +types. Internally, a function is implemented by one or several +"kernels", depending on the concrete input types (for example, a function +adding values from two inputs can have different kernels depending on +whether the inputs are integral or floating-point). + +Functions are stored in a global :class:`FunctionRegistry` where +they can be looked up by name. + +Input shapes +------------ + +Computation inputs are represented as a general :class:`Datum` class, +which is a tagged union of several shapes of data such as :class:`Scalar`, +:class:`Array` and :class:`ChunkedArray`. Many compute functions support +both array (chunked or not) and scalar inputs, however some will mandate +either. For example, while ``sort_indices`` requires its first and only +input to be an array. + +.. 
_invoking-compute-functions: + +Invoking functions +------------------ + +Compute functions can be invoked by name using +:func:`arrow::compute::CallFunction`:: + + std::shared_ptr<arrow::Array> numbers_array = ...; + std::shared_ptr<arrow::Scalar> increment = ...; + arrow::Datum incremented_datum; + + ARROW_ASSIGN_OR_RAISE(incremented_datum, + arrow::compute::CallFunction("add", {numbers_array, increment})); + std::shared_ptr<Array> incremented_array = std::move(incremented_datum).array(); + +(note this example uses implicit conversion from ``std::shared_ptr<Array>`` +to ``Datum``) + +Many compute functions are also available directly as concrete APIs, here +:func:`arrow::compute::Add`:: + + std::shared_ptr<arrow::Array> numbers_array = ...; + std::shared_ptr<arrow::Scalar> increment = ...; + arrow::Datum incremented_datum; + + ARROW_ASSIGN_OR_RAISE(incremented_datum, + arrow::compute::Add(numbers_array, increment)); + std::shared_ptr<Array> incremented_array = std::move(incremented_datum).array(); + +Some functions accept or require an options structure that determines the +exact semantics of the function:: + + ScalarAggregateOptions scalar_aggregate_options; + scalar_aggregate_options.skip_nulls = false; + + std::shared_ptr<arrow::Array> array = ...; + arrow::Datum min_max; + + ARROW_ASSIGN_OR_RAISE(min_max, + arrow::compute::CallFunction("min_max", {array}, + &scalar_aggregate_options)); + + // Unpack struct scalar result (a two-field {"min", "max"} scalar) + std::shared_ptr<arrow::Scalar> min_value, max_value; + min_value = min_max.scalar_as<arrow::StructScalar>().value[0]; + max_value = min_max.scalar_as<arrow::StructScalar>().value[1]; + +.. seealso:: + :doc:`Compute API reference <api/compute>` + +Implicit casts +============== + +Functions may require conversion of their arguments before execution if a +kernel does not match the argument types precisely. For example comparison +of dictionary encoded arrays is not directly supported by any kernel, but an +implicit cast can be made allowing comparison against the decoded array. + +Each function may define implicit cast behaviour as appropriate. For example +comparison and arithmetic kernels require identically typed arguments, and +support execution against differing numeric types by promoting their arguments +to numeric type which can accommodate any value from either input. + +.. _common-numeric-type: + +Common numeric type +------------------- + +The common numeric type of a set of input numeric types is the smallest numeric +type which can accommodate any value of any input. If any input is a floating +point type the common numeric type is the widest floating point type among the +inputs. Otherwise the common numeric type is integral and is signed if any input +is signed. 
For example: + ++-------------------+----------------------+------------------------------------------------+ +| Input types | Common numeric type | Notes | ++===================+======================+================================================+ +| int32, int32 | int32 | | ++-------------------+----------------------+------------------------------------------------+ +| int16, int32 | int32 | Max width is 32, promote LHS to int32 | ++-------------------+----------------------+------------------------------------------------+ +| uint16, int32 | int32 | One input signed, override unsigned | ++-------------------+----------------------+------------------------------------------------+ +| uint32, int32 | int64 | Widen to accommodate range of uint32 | ++-------------------+----------------------+------------------------------------------------+ +| uint16, uint32 | uint32 | All inputs unsigned, maintain unsigned | ++-------------------+----------------------+------------------------------------------------+ +| int16, uint32 | int64 | | ++-------------------+----------------------+------------------------------------------------+ +| uint64, int16 | int64 | int64 cannot accommodate all uint64 values | ++-------------------+----------------------+------------------------------------------------+ +| float32, int32 | float32 | Promote RHS to float32 | ++-------------------+----------------------+------------------------------------------------+ +| float32, float64 | float64 | | ++-------------------+----------------------+------------------------------------------------+ +| float32, int64 | float32 | int64 is wider, still promotes to float32 | ++-------------------+----------------------+------------------------------------------------+ + +In particulary, note that comparing a ``uint64`` column to an ``int16`` column +may emit an error if one of the ``uint64`` values cannot be expressed as the +common type ``int64`` (for example, ``2 ** 63``). + +.. _compute-function-list: + +Available functions +=================== + +Type categories +--------------- + +To avoid exhaustively listing supported types, the tables below use a number +of general type categories: + +* "Numeric": Integer types (Int8, etc.) and Floating-point types (Float32, + Float64, sometimes Float16). Some functions also accept Decimal128 and + Decimal256 input. + +* "Temporal": Date types (Date32, Date64), Time types (Time32, Time64), + Timestamp, Duration, Interval. + +* "Binary-like": Binary, LargeBinary, sometimes also FixedSizeBinary. + +* "String-like": String, LargeString. + +* "List-like": List, LargeList, sometimes also FixedSizeList. + +* "Nested": List-likes (including FixedSizeList), Struct, Union, and + related types like Map. + +If you are unsure whether a function supports a concrete input type, we +recommend you try it out. Unsupported input types return a ``TypeError`` +:class:`Status`. + +Aggregations +------------ + +Scalar aggregations operate on a (chunked) array or scalar value and reduce +the input to a single output value. 
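+
+For instance, a minimal sketch of invoking one of the aggregations listed in
+the table below, through the generic :func:`arrow::compute::CallFunction` API
+shown earlier (the ``array`` variable is assumed to be an existing numeric
+array)::
+
+   std::shared_ptr<arrow::Array> array = ...;
+   arrow::Datum sum_datum;
+
+   ARROW_ASSIGN_OR_RAISE(sum_datum,
+                         arrow::compute::CallFunction("sum", {array}));
+   // The aggregation reduces the whole input to a single scalar value
+   std::shared_ptr<arrow::Scalar> sum_value = sum_datum.scalar();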
+ ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++====================+=======+==================+========================+==================================+=======+ +| all | Unary | Boolean | Scalar Boolean | :struct:`ScalarAggregateOptions` | \(1) | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| any | Unary | Boolean | Scalar Boolean | :struct:`ScalarAggregateOptions` | \(1) | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| approximate_median | Unary | Numeric | Scalar Float64 | :struct:`ScalarAggregateOptions` | | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| count | Unary | Any | Scalar Int64 | :struct:`CountOptions` | \(2) | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| count_distinct | Unary | Non-nested types | Scalar Int64 | :struct:`CountOptions` | \(2) | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| index | Unary | Any | Scalar Int64 | :struct:`IndexOptions` | | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| max | Unary | Non-nested types | Scalar Input type | :struct:`ScalarAggregateOptions` | | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| mean | Unary | Numeric | Scalar Decimal/Float64 | :struct:`ScalarAggregateOptions` | | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| min | Unary | Non-nested types | Scalar Input type | :struct:`ScalarAggregateOptions` | | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| min_max | Unary | Non-nested types | Scalar Struct | :struct:`ScalarAggregateOptions` | \(3) | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| mode | Unary | Numeric | Struct | :struct:`ModeOptions` | \(4) | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| product | Unary | Numeric | Scalar Numeric | :struct:`ScalarAggregateOptions` | \(5) | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| quantile | Unary | Numeric | Scalar Numeric | :struct:`QuantileOptions` | \(6) | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| stddev | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| sum | Unary | Numeric | Scalar Numeric | :struct:`ScalarAggregateOptions` | \(5) | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| tdigest | Unary | Numeric | Float64 | :struct:`TDigestOptions` | \(7) | 
++--------------------+-------+------------------+------------------------+----------------------------------+-------+ +| variance | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | | ++--------------------+-------+------------------+------------------------+----------------------------------+-------+ + +* \(1) If null values are taken into account, by setting the + ScalarAggregateOptions parameter skip_nulls = false, then `Kleene logic`_ + logic is applied. The min_count option is not respected. + +* \(2) CountMode controls whether only non-null values are counted (the + default), only null values are counted, or all values are counted. + +* \(3) Output is a ``{"min": input type, "max": input type}`` Struct. + + Of the interval types, only the month interval is supported, as the day-time + and month-day-nano types are not sortable. + +* \(4) Output is an array of ``{"mode": input type, "count": Int64}`` Struct. + It contains the *N* most common elements in the input, in descending + order, where *N* is given in :member:`ModeOptions::n`. + If two values have the same count, the smallest one comes first. + Note that the output can have less than *N* elements if the input has + less than *N* distinct values. + +* \(5) Output is Int64, UInt64, Float64, or Decimal128/256, depending on the + input type. + +* \(6) Output is Float64 or input type, depending on QuantileOptions. + +* \(7) tdigest/t-digest computes approximate quantiles, and so only needs a + fixed amount of memory. See the `reference implementation + <https://github.com/tdunning/t-digest>`_ for details. + +Grouped Aggregations ("group by") +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Grouped aggregations are not directly invokable, but are used as part of a +SQL-style "group by" operation. Like scalar aggregations, grouped aggregations +reduce multiple input values to a single output value. Instead of aggregating +all values of the input, however, grouped aggregations partition the input +values on some set of "key" columns, then aggregate each group individually, +emitting one output value per input group. + +As an example, for the following table: + ++------------------+-----------------+ +| Column ``key`` | Column ``x`` | ++==================+=================+ +| "a" | 2 | ++------------------+-----------------+ +| "a" | 5 | ++------------------+-----------------+ +| "b" | null | ++------------------+-----------------+ +| "b" | null | ++------------------+-----------------+ +| null | null | ++------------------+-----------------+ +| null | 9 | ++------------------+-----------------+ + +we can compute a sum of the column ``x``, grouped on the column ``key``. +This gives us three groups, with the following results. Note that null is +treated as a distinct key value. + ++------------------+-----------------------+ +| Column ``key`` | Column ``sum(x)`` | ++==================+=======================+ +| "a" | 7 | ++------------------+-----------------------+ +| "b" | null | ++------------------+-----------------------+ +| null | 9 | ++------------------+-----------------------+ + +The supported aggregation functions are as follows. All function names are +prefixed with ``hash_``, which differentiates them from their scalar +equivalents above and reflects how they are implemented internally. 
+ ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++=========================+=======+====================================+========================+==================================+=======+ +| hash_all | Unary | Boolean | Boolean | :struct:`ScalarAggregateOptions` | \(1) | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_any | Unary | Boolean | Boolean | :struct:`ScalarAggregateOptions` | \(1) | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_approximate_median | Unary | Numeric | Float64 | :struct:`ScalarAggregateOptions` | | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_count | Unary | Any | Int64 | :struct:`CountOptions` | \(2) | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_count_distinct | Unary | Any | Int64 | :struct:`CountOptions` | \(2) | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_distinct | Unary | Any | Input type | :struct:`CountOptions` | \(2) | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_max | Unary | Non-nested, non-binary/string-like | Input type | :struct:`ScalarAggregateOptions` | | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_mean | Unary | Numeric | Decimal/Float64 | :struct:`ScalarAggregateOptions` | | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_min | Unary | Non-nested, non-binary/string-like | Input type | :struct:`ScalarAggregateOptions` | | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_min_max | Unary | Non-nested, non-binary/string-like | Struct | :struct:`ScalarAggregateOptions` | \(3) | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_product | Unary | Numeric | Numeric | :struct:`ScalarAggregateOptions` | \(4) | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_stddev | Unary | Numeric | Float64 | :struct:`VarianceOptions` | | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_sum | Unary | Numeric | Numeric | :struct:`ScalarAggregateOptions` | \(4) | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_tdigest | Unary | Numeric | FixedSizeList[Float64] | :struct:`TDigestOptions` | \(5) | 
++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ +| hash_variance | Unary | Numeric | Float64 | :struct:`VarianceOptions` | | ++-------------------------+-------+------------------------------------+------------------------+----------------------------------+-------+ + +* \(1) If null values are taken into account, by setting the + :member:`ScalarAggregateOptions::skip_nulls` to false, then `Kleene logic`_ + logic is applied. The min_count option is not respected. + +* \(2) CountMode controls whether only non-null values are counted + (the default), only null values are counted, or all values are + counted. For hash_distinct, it instead controls whether null values + are emitted. This never affects the grouping keys, only group values + (i.e. you may get a group where the key is null). + +* \(3) Output is a ``{"min": input type, "max": input type}`` Struct array. + + Of the interval types, only the month interval is supported, as the day-time + and month-day-nano types are not sortable. + +* \(4) Output is Int64, UInt64, Float64, or Decimal128/256, depending on the + input type. + +* \(5) T-digest computes approximate quantiles, and so only needs a + fixed amount of memory. See the `reference implementation + <https://github.com/tdunning/t-digest>`_ for details. + +Element-wise ("scalar") functions +--------------------------------- + +All element-wise functions accept both arrays and scalars as input. The +semantics for unary functions are as follow: + +* scalar inputs produce a scalar output +* array inputs produce an array output + +Binary functions have the following semantics (which is sometimes called +"broadcasting" in other systems such as NumPy): + +* ``(scalar, scalar)`` inputs produce a scalar output +* ``(array, array)`` inputs produce an array output (and both inputs must + be of the same length) +* ``(scalar, array)`` and ``(array, scalar)`` produce an array output. + The scalar input is handled as if it were an array of the same length N + as the other input, with the same value repeated N times. + +Arithmetic functions +~~~~~~~~~~~~~~~~~~~~ + +These functions expect inputs of numeric type and apply a given arithmetic +operation to each element(s) gathered from the input(s). If any of the +input element(s) is null, the corresponding output element is null. +For binary functions, input(s) will be cast to the +:ref:`common numeric type <common-numeric-type>` +(and dictionary decoded, if applicable) before the operation is applied. + +The default variant of these functions does not detect overflow (the result +then typically wraps around). Most functions are also available in an +overflow-checking variant, suffixed ``_checked``, which returns +an ``Invalid`` :class:`Status` when overflow is detected. + +For functions which support decimal inputs (currently ``add``, ``subtract``, +``multiply``, and ``divide`` and their checked variants), decimals of different +precisions/scales will be promoted appropriately. Mixed decimal and +floating-point arguments will cast all arguments to floating-point, while mixed +decimal and integer arguments will cast all arguments to decimals. 
+ ++------------------+--------+----------------+----------------------+-------+ +| Function name | Arity | Input types | Output type | Notes | ++==================+========+================+======================+=======+ +| abs | Unary | Numeric | Numeric | | ++------------------+--------+----------------+----------------------+-------+ +| abs_checked | Unary | Numeric | Numeric | | ++------------------+--------+----------------+----------------------+-------+ +| add | Binary | Numeric | Numeric | \(1) | ++------------------+--------+----------------+----------------------+-------+ +| add_checked | Binary | Numeric | Numeric | \(1) | ++------------------+--------+----------------+----------------------+-------+ +| divide | Binary | Numeric | Numeric | \(1) | ++------------------+--------+----------------+----------------------+-------+ +| divide_checked | Binary | Numeric | Numeric | \(1) | ++------------------+--------+----------------+----------------------+-------+ +| multiply | Binary | Numeric | Numeric | \(1) | ++------------------+--------+----------------+----------------------+-------+ +| multiply_checked | Binary | Numeric | Numeric | \(1) | ++------------------+--------+----------------+----------------------+-------+ +| negate | Unary | Numeric | Numeric | | ++------------------+--------+----------------+----------------------+-------+ +| negate_checked | Unary | Signed Numeric | Signed Numeric | | ++------------------+--------+----------------+----------------------+-------+ +| power | Binary | Numeric | Numeric | | ++------------------+--------+----------------+----------------------+-------+ +| power_checked | Binary | Numeric | Numeric | | ++------------------+--------+----------------+----------------------+-------+ +| sign | Unary | Numeric | Int8/Float32/Float64 | \(2) | ++------------------+--------+----------------+----------------------+-------+ +| subtract | Binary | Numeric | Numeric | \(1) | ++------------------+--------+----------------+----------------------+-------+ +| subtract_checked | Binary | Numeric | Numeric | \(1) | ++------------------+--------+----------------+----------------------+-------+ + +* \(1) Precision and scale of computed DECIMAL results + + +------------+---------------------------------------------+ + | Operation | Result precision and scale | + +============+=============================================+ + | | add | | scale = max(s1, s2) | + | | subtract | | precision = max(p1-s1, p2-s2) + 1 + scale | + +------------+---------------------------------------------+ + | multiply | | scale = s1 + s2 | + | | | precision = p1 + p2 + 1 | + +------------+---------------------------------------------+ + | divide | | scale = max(4, s1 + p2 - s2 + 1) | + | | | precision = p1 - s1 + s2 + scale | + +------------+---------------------------------------------+ + + It's compatible with Redshift's decimal promotion rules. All decimal digits + are preserved for `add`, `subtract` and `multiply` operations. The result + precision of `divide` is at least the sum of precisions of both operands with + enough scale kept. Error is returned if the result precision is beyond the + decimal value range. + +* \(2) Output is any of (-1,1) for nonzero inputs and 0 for zero input. + NaN values return NaN. Integral values return signedness as Int8 and + floating-point values return it with the same type as the input values. 
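+
+As a quick illustration of the ``_checked`` variants described above, here is
+a minimal sketch, assuming ``a`` and ``b`` are existing Int8 arrays whose
+element-wise sum overflows the Int8 range::
+
+   std::shared_ptr<arrow::Array> a = ...;  // e.g. contains 127
+   std::shared_ptr<arrow::Array> b = ...;  // e.g. contains 1
+
+   // "add" silently wraps around on overflow...
+   arrow::Result<arrow::Datum> wrapped =
+       arrow::compute::CallFunction("add", {a, b});
+
+   // ...while "add_checked" reports the overflow as an Invalid Status
+   arrow::Result<arrow::Datum> checked =
+       arrow::compute::CallFunction("add_checked", {a, b});
+   if (!checked.ok()) {
+     // ... handle the overflow error
+   }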
+
+Bit-wise functions
+~~~~~~~~~~~~~~~~~~
+
++--------------------------+------------+--------------------+---------------------+
+| Function name | Arity | Input types | Output type |
++==========================+============+====================+=====================+
+| bit_wise_and | Binary | Numeric | Numeric |
++--------------------------+------------+--------------------+---------------------+
+| bit_wise_not | Unary | Numeric | Numeric |
++--------------------------+------------+--------------------+---------------------+
+| bit_wise_or | Binary | Numeric | Numeric |
++--------------------------+------------+--------------------+---------------------+
+| bit_wise_xor | Binary | Numeric | Numeric |
++--------------------------+------------+--------------------+---------------------+
+| shift_left | Binary | Numeric | Numeric |
++--------------------------+------------+--------------------+---------------------+
+| shift_left_checked | Binary | Numeric | Numeric (1) |
++--------------------------+------------+--------------------+---------------------+
+| shift_right | Binary | Numeric | Numeric |
++--------------------------+------------+--------------------+---------------------+
+| shift_right_checked | Binary | Numeric | Numeric (1) |
++--------------------------+------------+--------------------+---------------------+
+
+* \(1) An error is emitted if the shift amount (i.e. the second input) is
+ out of bounds for the data type. However, an overflow when shifting the
+ first input is not an error (truncated bits are silently discarded).
+
+Rounding functions
+~~~~~~~~~~~~~~~~~~
+
+Rounding functions displace numeric inputs to an approximate value with a simpler
+representation based on the rounding criterion.
+
++-------------------+------------+-------------+-------------------------+----------------------------------+--------+
+| Function name | Arity | Input types | Output type | Options class | Notes |
++===================+============+=============+=========================+==================================+========+
+| ceil | Unary | Numeric | Float32/Float64/Decimal | | |
++-------------------+------------+-------------+-------------------------+----------------------------------+--------+
+| floor | Unary | Numeric | Float32/Float64/Decimal | | |
++-------------------+------------+-------------+-------------------------+----------------------------------+--------+
+| round | Unary | Numeric | Float32/Float64/Decimal | :struct:`RoundOptions` | (1)(2) |
++-------------------+------------+-------------+-------------------------+----------------------------------+--------+
+| round_to_multiple | Unary | Numeric | Float32/Float64/Decimal | :struct:`RoundToMultipleOptions` | (1)(3) |
++-------------------+------------+-------------+-------------------------+----------------------------------+--------+
+| trunc | Unary | Numeric | Float32/Float64/Decimal | | |
++-------------------+------------+-------------+-------------------------+----------------------------------+--------+
+
+* \(1) Output value is a 64-bit floating-point number for integral inputs and
+ retains the same type for floating-point and decimal inputs. By default,
+ rounding functions displace a value to the nearest integer using
+ HALF_TO_EVEN to resolve ties. Options are available to control the rounding
+ criterion. Both ``round`` and ``round_to_multiple`` have the ``round_mode``
+ option to set the rounding mode.
+* \(2) Round to a number of digits where the ``ndigits`` option of + :struct:`RoundOptions` specifies the rounding precision in terms of number + of digits. A negative value corresponds to digits in the non-fractional + part. For example, -2 corresponds to rounding to the nearest multiple of + 100 (zeroing the ones and tens digits). Default value of ``ndigits`` is 0 + which rounds to the nearest integer. +* \(3) Round to a multiple where the ``multiple`` option of + :struct:`RoundToMultipleOptions` specifies the rounding scale. The rounding + multiple has to be a positive value. For example, 100 corresponds to + rounding to the nearest multiple of 100 (zeroing the ones and tens digits). + Default value of ``multiple`` is 1 which rounds to the nearest integer. + +For ``round`` and ``round_to_multiple``, the following rounding modes are available. +Tie-breaking modes are prefixed with HALF and round non-ties to the nearest integer. +The example values are given for default values of ``ndigits`` and ``multiple``. + ++-----------------------+--------------------------------------------------------------+---------------------------+ +| ``round_mode`` | Operation performed | Example values | ++=======================+==============================================================+===========================+ +| DOWN | Round to nearest integer less than or equal in magnitude; | 3.2 -> 3, 3.7 -> 3, | +| | also known as ``floor(x)`` | -3.2 -> -4, -3.7 -> -4 | ++-----------------------+--------------------------------------------------------------+---------------------------+ +| UP | Round to nearest integer greater than or equal in magnitude; | 3.2 -> 4, 3.7 -> 4, | +| | also known as ``ceil(x)`` | -3.2 -> -3, -3.7 -> -3 | ++-----------------------+--------------------------------------------------------------+---------------------------+ +| TOWARDS_ZERO | Get the integral part without fractional digits; | 3.2 -> 3, 3.7 -> 3, | +| | also known as ``trunc(x)`` | -3.2 -> -3, -3.7 -> -3 | ++-----------------------+--------------------------------------------------------------+---------------------------+ +| TOWARDS_INFINITY | Round negative values with ``DOWN`` rule, | 3.2 -> 4, 3.7 -> 4, | +| | round positive values with ``UP`` rule | -3.2 -> -4, -3.7 -> -4 | ++-----------------------+--------------------------------------------------------------+---------------------------+ +| HALF_DOWN | Round ties with ``DOWN`` rule | 3.5 -> 3, 4.5 -> 4, | +| | | -3.5 -> -4, -4.5 -> -5 | ++-----------------------+--------------------------------------------------------------+---------------------------+ +| HALF_UP | Round ties with ``UP`` rule | 3.5 -> 4, 4.5 -> 5, | +| | | -3.5 -> -3, -4.5 -> -4 | ++-----------------------+--------------------------------------------------------------+---------------------------+ +| HALF_TOWARDS_ZERO | Round ties with ``TOWARDS_ZERO`` rule | 3.5 -> 3, 4.5 -> 4, | +| | | -3.5 -> -3, -4.5 -> -4 | ++-----------------------+--------------------------------------------------------------+---------------------------+ +| HALF_TOWARDS_INFINITY | Round ties with ``TOWARDS_INFINITY`` rule | 3.5 -> 4, 4.5 -> 5, | +| | | -3.5 -> -4, -4.5 -> -5 | ++-----------------------+--------------------------------------------------------------+---------------------------+ +| HALF_TO_EVEN | Round ties to nearest even integer | 3.5 -> 4, 4.5 -> 4, | +| | | -3.5 -> -4, -4.5 -> -4 | ++-----------------------+--------------------------------------------------------------+---------------------------+ +| 
HALF_TO_ODD | Round ties to nearest odd integer | 3.5 -> 3, 4.5 -> 5, |
+| | | -3.5 -> -3, -4.5 -> -5 |
++-----------------------+--------------------------------------------------------------+---------------------------+
+
+The following table gives examples of how ``ndigits`` (for the ``round``
+function) and ``multiple`` (for ``round_to_multiple``) influence the operation
+performed, respectively.
+
++--------------------+-------------------+---------------------------+
+| Round ``multiple`` | Round ``ndigits`` | Operation performed |
++====================+===================+===========================+
+| 1 | 0 | Round to integer |
++--------------------+-------------------+---------------------------+
+| 0.001 | 3 | Round to 3 decimal places |
++--------------------+-------------------+---------------------------+
+| 10 | -1 | Round to multiple of 10 |
++--------------------+-------------------+---------------------------+
+| 2 | NA | Round to multiple of 2 |
++--------------------+-------------------+---------------------------+
+
+Logarithmic functions
+~~~~~~~~~~~~~~~~~~~~~
+
+Logarithmic functions are also supported, and offer ``_checked``
+variants that check for domain errors if needed.
+
++--------------------------+------------+--------------------+---------------------+
+| Function name | Arity | Input types | Output type |
++==========================+============+====================+=====================+
+| ln | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| ln_checked | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| log10 | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| log10_checked | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| log1p | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| log1p_checked | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| log2 | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| log2_checked | Unary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| logb | Binary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+| logb_checked | Binary | Float32/Float64 | Float32/Float64 |
++--------------------------+------------+--------------------+---------------------+
+
+Trigonometric functions
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Trigonometric functions are also supported, and offer ``_checked``
+variants that check for domain errors if needed.
+ ++--------------------------+------------+--------------------+---------------------+ +| Function name | Arity | Input types | Output type | ++==========================+============+====================+=====================+ +| acos | Unary | Float32/Float64 | Float32/Float64 | ++--------------------------+------------+--------------------+---------------------+ +| acos_checked | Unary | Float32/Float64 | Float32/Float64 | ++--------------------------+------------+--------------------+---------------------+ +| asin | Unary | Float32/Float64 | Float32/Float64 | ++--------------------------+------------+--------------------+---------------------+ +| asin_checked | Unary | Float32/Float64 | Float32/Float64 | ++--------------------------+------------+--------------------+---------------------+ +| atan | Unary | Float32/Float64 | Float32/Float64 | ++--------------------------+------------+--------------------+---------------------+ +| atan2 | Binary | Float32/Float64 | Float32/Float64 | ++--------------------------+------------+--------------------+---------------------+ +| cos | Unary | Float32/Float64 | Float32/Float64 | ++--------------------------+------------+--------------------+---------------------+ +| cos_checked | Unary | Float32/Float64 | Float32/Float64 | ++--------------------------+------------+--------------------+---------------------+ +| sin | Unary | Float32/Float64 | Float32/Float64 | ++--------------------------+------------+--------------------+---------------------+ +| sin_checked | Unary | Float32/Float64 | Float32/Float64 | ++--------------------------+------------+--------------------+---------------------+ +| tan | Unary | Float32/Float64 | Float32/Float64 | ++--------------------------+------------+--------------------+---------------------+ +| tan_checked | Unary | Float32/Float64 | Float32/Float64 | ++--------------------------+------------+--------------------+---------------------+ + +Comparisons +~~~~~~~~~~~ + +These functions expect two inputs of numeric type (in which case they will be +cast to the :ref:`common numeric type <common-numeric-type>` before comparison), +or two inputs of Binary- or String-like types, or two inputs of Temporal types. +If any input is dictionary encoded it will be expanded for the purposes of +comparison. If any of the input elements in a pair is null, the corresponding +output element is null. Decimal arguments will be promoted in the same way as +for ``add`` and ``subtract``. 
+ ++----------------+------------+---------------------------------------------+---------------------+ +| Function names | Arity | Input types | Output type | ++================+============+=============================================+=====================+ +| equal | Binary | Numeric, Temporal, Binary- and String-like | Boolean | ++----------------+------------+---------------------------------------------+---------------------+ +| greater | Binary | Numeric, Temporal, Binary- and String-like | Boolean | ++----------------+------------+---------------------------------------------+---------------------+ +| greater_equal | Binary | Numeric, Temporal, Binary- and String-like | Boolean | ++----------------+------------+---------------------------------------------+---------------------+ +| less | Binary | Numeric, Temporal, Binary- and String-like | Boolean | ++----------------+------------+---------------------------------------------+---------------------+ +| less_equal | Binary | Numeric, Temporal, Binary- and String-like | Boolean | ++----------------+------------+---------------------------------------------+---------------------+ +| not_equal | Binary | Numeric, Temporal, Binary- and String-like | Boolean | ++----------------+------------+---------------------------------------------+---------------------+ + +These functions take any number of inputs of numeric type (in which case they +will be cast to the :ref:`common numeric type <common-numeric-type>` before +comparison) or of temporal types. If any input is dictionary encoded it will be +expanded for the purposes of comparison. + ++------------------+------------+---------------------------------------------+---------------------+---------------------------------------+-------+ +| Function names | Arity | Input types | Output type | Options class | Notes | ++==================+============+=============================================+=====================+=======================================+=======+ +| max_element_wise | Varargs | Numeric and Temporal | Numeric or Temporal | :struct:`ElementWiseAggregateOptions` | \(1) | ++------------------+------------+---------------------------------------------+---------------------+---------------------------------------+-------+ +| min_element_wise | Varargs | Numeric and Temporal | Numeric or Temporal | :struct:`ElementWiseAggregateOptions` | \(1) | ++------------------+------------+---------------------------------------------+---------------------+---------------------------------------+-------+ + +* \(1) By default, nulls are skipped (but the kernel can be configured to propagate nulls). + For floating point values, NaN will be taken over null but not over any other value. + +Logical functions +~~~~~~~~~~~~~~~~~~ + +The normal behaviour for these functions is to emit a null if any of the +inputs is null (similar to the semantics of ``NaN`` in floating-point +computations). + +Some of them are also available in a `Kleene logic`_ variant (suffixed +``_kleene``) where null is taken to mean "undefined". This is the +interpretation of null used in SQL systems as well as R and Julia, +for example. 
+ +For the Kleene logic variants, therefore: + +* "true AND null", "null AND true" give "null" (the result is undefined) +* "true OR null", "null OR true" give "true" +* "false AND null", "null AND false" give "false" +* "false OR null", "null OR false" give "null" (the result is undefined) + ++--------------------------+------------+--------------------+---------------------+ +| Function name | Arity | Input types | Output type | ++==========================+============+====================+=====================+ +| and | Binary | Boolean | Boolean | ++--------------------------+------------+--------------------+---------------------+ +| and_kleene | Binary | Boolean | Boolean | ++--------------------------+------------+--------------------+---------------------+ +| and_not | Binary | Boolean | Boolean | ++--------------------------+------------+--------------------+---------------------+ +| and_not_kleene | Binary | Boolean | Boolean | ++--------------------------+------------+--------------------+---------------------+ +| invert | Unary | Boolean | Boolean | ++--------------------------+------------+--------------------+---------------------+ +| or | Binary | Boolean | Boolean | ++--------------------------+------------+--------------------+---------------------+ +| or_kleene | Binary | Boolean | Boolean | ++--------------------------+------------+--------------------+---------------------+ +| xor | Binary | Boolean | Boolean | ++--------------------------+------------+--------------------+---------------------+ + +.. _Kleene logic: https://en.wikipedia.org/wiki/Three-valued_logic#Kleene_and_Priest_logics + +String predicates +~~~~~~~~~~~~~~~~~ + +These functions classify the input string elements according to their character +contents. An empty string element emits false in the output. For ASCII +variants of the functions (prefixed ``ascii_``), a string element with non-ASCII +characters emits false in the output. 
+ +The first set of functions operates on a character-per-character basis, +and emit true in the output if the input contains only characters of a +given class: + ++--------------------+-------+-------------+-------------+-------------------------+-------+ +| Function name | Arity | Input types | Output type | Matched character class | Notes | ++====================+=======+=============+=============+=========================+=======+ +| ascii_is_alnum | Unary | String-like | Boolean | Alphanumeric ASCII | | ++--------------------+-------+-------------+-------------+-------------------------+-------+ +| ascii_is_alpha | Unary | String-like | Boolean | Alphabetic ASCII | | ++--------------------+-------+-------------+-------------+-------------------------+-------+ +| ascii_is_decimal | Unary | String-like | Boolean | Decimal ASCII | \(1) | ++--------------------+-------+-------------+-------------+-------------------------+-------+ +| ascii_is_lower | Unary | String-like | Boolean | Lowercase ASCII | \(2) | ++--------------------+-------+-------------+-------------+-------------------------+-------+ +| ascii_is_printable | Unary | String-like | Boolean | Printable ASCII | | ++--------------------+-------+-------------+-------------+-------------------------+-------+ +| ascii_is_space | Unary | String-like | Boolean | Whitespace ASCII | | ++--------------------+-------+-------------+-------------+-------------------------+-------+ +| ascii_is_upper | Unary | String-like | Boolean | Uppercase ASCII | \(2) | ++--------------------+-------+-------------+-------------+-------------------------+-------+ +| utf8_is_alnum | Unary | String-like | Boolean | Alphanumeric Unicode | | ++--------------------+-------+-------------+-------------+-------------------------+-------+ +| utf8_is_alpha | Unary | String-like | Boolean | Alphabetic Unicode | | ++--------------------+-------+-------------+-------------+-------------------------+-------+ +| utf8_is_decimal | Unary | String-like | Boolean | Decimal Unicode | | ++--------------------+-------+-------------+-------------+-------------------------+-------+ +| utf8_is_digit | Unary | String-like | Boolean | Unicode digit | \(3) | ++--------------------+-------+-------------+-------------+-------------------------+-------+ +| utf8_is_lower | Unary | String-like | Boolean | Lowercase Unicode | \(2) | ++--------------------+-------+-------------+-------------+-------------------------+-------+ +| utf8_is_numeric | Unary | String-like | Boolean | Numeric Unicode | \(4) | ++--------------------+-------+-------------+-------------+-------------------------+-------+ +| utf8_is_printable | Unary | String-like | Boolean | Printable Unicode | | ++--------------------+-------+-------------+-------------+-------------------------+-------+ +| utf8_is_space | Unary | String-like | Boolean | Whitespace Unicode | | ++--------------------+-------+-------------+-------------+-------------------------+-------+ +| utf8_is_upper | Unary | String-like | Boolean | Uppercase Unicode | \(2) | ++--------------------+-------+-------------+-------------+-------------------------+-------+ + +* \(1) Also matches all numeric ASCII characters and all ASCII digits. + +* \(2) Non-cased characters, such as punctuation, do not match. + +* \(3) This is currently the same as ``utf8_is_decimal``. + +* \(4) Unlike ``utf8_is_decimal``, non-decimal numeric characters also match. 
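+
+As a brief sketch of these predicates (assuming an installed Arrow C++ library
+and the conventional ``arrow/api.h`` and ``arrow/compute/api.h`` headers; the
+helper name ``PredicateExample`` is arbitrary), the ASCII and Unicode variants
+can disagree on non-ASCII input, while the empty string emits false for both:
+
+.. code-block:: cpp
+
+   #include <iostream>
+   #include <memory>
+
+   #include <arrow/api.h>
+   #include <arrow/compute/api.h>
+
+   arrow::Status PredicateExample() {
+     // "Árvore" contains a non-ASCII character, "" is the empty string.
+     arrow::StringBuilder builder;
+     ARROW_RETURN_NOT_OK(builder.AppendValues({"arrow", "Árvore", ""}));
+     std::shared_ptr<arrow::Array> strings;
+     ARROW_RETURN_NOT_OK(builder.Finish(&strings));
+
+     // ASCII variant: false for "Árvore" (non-ASCII character) and for "".
+     ARROW_ASSIGN_OR_RAISE(
+         arrow::Datum ascii_result,
+         arrow::compute::CallFunction("ascii_is_alpha", {strings}));
+     // Unicode-aware variant: true for "Árvore", still false for "".
+     ARROW_ASSIGN_OR_RAISE(
+         arrow::Datum utf8_result,
+         arrow::compute::CallFunction("utf8_is_alpha", {strings}));
+
+     std::cout << ascii_result.make_array()->ToString() << std::endl;
+     std::cout << utf8_result.make_array()->ToString() << std::endl;
+     return arrow::Status::OK();
+   }
+
+   int main() { return PredicateExample().ok() ? 0 : 1; }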
+ +The second set of functions also consider the character order in a string +element: + ++--------------------------+------------+--------------------+---------------------+---------+ +| Function name | Arity | Input types | Output type | Notes | ++==========================+============+====================+=====================+=========+ +| ascii_is_title | Unary | String-like | Boolean | \(1) | ++--------------------------+------------+--------------------+---------------------+---------+ +| utf8_is_title | Unary | String-like | Boolean | \(1) | ++--------------------------+------------+--------------------+---------------------+---------+ + +* \(1) Output is true iff the input string element is title-cased, i.e. any + word starts with an uppercase character, followed by lowercase characters. + Word boundaries are defined by non-cased characters. + +The third set of functions examines string elements on a byte-per-byte basis: + ++--------------------------+------------+--------------------+---------------------+---------+ +| Function name | Arity | Input types | Output type | Notes | ++==========================+============+====================+=====================+=========+ +| string_is_ascii | Unary | String-like | Boolean | \(1) | ++--------------------------+------------+--------------------+---------------------+---------+ + +* \(1) Output is true iff the input string element contains only ASCII characters, + i.e. only bytes in [0, 127]. + +String transforms +~~~~~~~~~~~~~~~~~ + ++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++=========================+=======+========================+========================+===================================+=======+ +| ascii_capitalize | Unary | String-like | String-like | | \(1) | ++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ +| ascii_lower | Unary | String-like | String-like | | \(1) | ++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ +| ascii_reverse | Unary | String-like | String-like | | \(2) | ++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ +| ascii_swapcase | Unary | String-like | String-like | | \(1) | ++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ +| ascii_title | Unary | String-like | String-like | | \(1) | ++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ +| ascii_upper | Unary | String-like | String-like | | \(1) | ++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ +| binary_length | Unary | Binary- or String-like | Int32 or Int64 | | \(3) | ++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ +| binary_replace_slice | Unary | String-like | Binary- or String-like | :struct:`ReplaceSliceOptions` | \(4) | ++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ +| replace_substring | Unary | String-like | String-like | 
:struct:`ReplaceSubstringOptions` | \(5) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| replace_substring_regex | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(6) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| utf8_capitalize | Unary | String-like | String-like | | \(8) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| utf8_length | Unary | String-like | Int32 or Int64 | | \(7) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| utf8_lower | Unary | String-like | String-like | | \(8) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| utf8_replace_slice | Unary | String-like | String-like | :struct:`ReplaceSliceOptions` | \(4) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| utf8_reverse | Unary | String-like | String-like | | \(9) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| utf8_swapcase | Unary | String-like | String-like | | \(8) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| utf8_title | Unary | String-like | String-like | | \(8) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| utf8_upper | Unary | String-like | String-like | | \(8) |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+
+* \(1) Each ASCII character in the input is converted to lowercase or
+ uppercase. Non-ASCII characters are left untouched.
+
+* \(2) ASCII input is reversed to the output. If non-ASCII characters
+ are present, ``Invalid`` :class:`Status` will be returned.
+
+* \(3) Output is the physical length in bytes of each input element. Output
+ type is Int32 for Binary / String, Int64 for LargeBinary / LargeString.
+
+* \(4) Replace the slice of the string from :member:`ReplaceSliceOptions::start`
+ (inclusive) to :member:`ReplaceSliceOptions::stop` (exclusive) by
+ :member:`ReplaceSliceOptions::replacement`. The binary kernel measures the
+ slice in bytes, while the UTF8 kernel measures the slice in codeunits.
+
+* \(5) Replace non-overlapping substrings that match
+ :member:`ReplaceSubstringOptions::pattern` by
+ :member:`ReplaceSubstringOptions::replacement`. If
+ :member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the
+ maximum number of replacements made, counting from the left.
+
+* \(6) Replace non-overlapping substrings that match the regular expression
+ :member:`ReplaceSubstringOptions::pattern` by
+ :member:`ReplaceSubstringOptions::replacement`, using the Google RE2 library. If
+ :member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the
+ maximum number of replacements made, counting from the left. Note that if the
+ pattern contains groups, backreferencing can be used.
+
+* \(7) Output is the number of characters (not bytes) of each input element.
+ Output type is Int32 for String, Int64 for LargeString. + +* \(8) Each UTF8-encoded character in the input is converted to lowercase or + uppercase. + +* \(9) Each UTF8-encoded code unit is written in reverse order to the output. + If the input is not valid UTF8, then the output is undefined (but the size of output + buffers will be preserved). + +String padding +~~~~~~~~~~~~~~ + +These functions append/prepend a given padding byte (ASCII) or codepoint (UTF8) in +order to center (center), right-align (lpad), or left-align (rpad) a string. + ++--------------------------+------------+-------------------------+---------------------+----------------------------------------+ +| Function name | Arity | Input types | Output type | Options class | ++==========================+============+=========================+=====================+========================================+ +| ascii_center | Unary | String-like | String-like | :struct:`PadOptions` | ++--------------------------+------------+-------------------------+---------------------+----------------------------------------+ +| ascii_lpad | Unary | String-like | String-like | :struct:`PadOptions` | ++--------------------------+------------+-------------------------+---------------------+----------------------------------------+ +| ascii_rpad | Unary | String-like | String-like | :struct:`PadOptions` | ++--------------------------+------------+-------------------------+---------------------+----------------------------------------+ +| utf8_center | Unary | String-like | String-like | :struct:`PadOptions` | ++--------------------------+------------+-------------------------+---------------------+----------------------------------------+ +| utf8_lpad | Unary | String-like | String-like | :struct:`PadOptions` | ++--------------------------+------------+-------------------------+---------------------+----------------------------------------+ +| utf8_rpad | Unary | String-like | String-like | :struct:`PadOptions` | ++--------------------------+------------+-------------------------+---------------------+----------------------------------------+ + +String trimming +~~~~~~~~~~~~~~~ + +These functions trim off characters on both sides (trim), or the left (ltrim) or right side (rtrim). 
+ ++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++==========================+============+=========================+=====================+========================================+=========+ +| ascii_ltrim | Unary | String-like | String-like | :struct:`TrimOptions` | \(1) | ++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+ +| ascii_ltrim_whitespace | Unary | String-like | String-like | | \(2) | ++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+ +| ascii_rtrim | Unary | String-like | String-like | :struct:`TrimOptions` | \(1) | ++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+ +| ascii_rtrim_whitespace | Unary | String-like | String-like | | \(2) | ++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+ +| ascii_trim | Unary | String-like | String-like | :struct:`TrimOptions` | \(1) | ++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+ +| ascii_trim_whitespace | Unary | String-like | String-like | | \(2) | ++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+ +| utf8_ltrim | Unary | String-like | String-like | :struct:`TrimOptions` | \(3) | ++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+ +| utf8_ltrim_whitespace | Unary | String-like | String-like | | \(4) | ++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+ +| utf8_rtrim | Unary | String-like | String-like | :struct:`TrimOptions` | \(3) | ++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+ +| utf8_rtrim_whitespace | Unary | String-like | String-like | | \(4) | ++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+ +| utf8_trim | Unary | String-like | String-like | :struct:`TrimOptions` | \(3) | ++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+ +| utf8_trim_whitespace | Unary | String-like | String-like | | \(4) | ++--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+ + +* \(1) Only characters specified in :member:`TrimOptions::characters` will be + trimmed off. Both the input string and the `characters` argument are + interpreted as ASCII characters. + +* \(2) Only trim off ASCII whitespace characters (``'\t'``, ``'\n'``, ``'\v'``, + ``'\f'``, ``'\r'`` and ``' '``). + +* \(3) Only characters specified in :member:`TrimOptions::characters` will be + trimmed off. + +* \(4) Only trim off Unicode whitespace characters. + +String splitting +~~~~~~~~~~~~~~~~ + +These functions split strings into lists of strings. 
All kernels can optionally +be configured with a ``max_splits`` and a ``reverse`` parameter, where +``max_splits == -1`` means no limit (the default). When ``reverse`` is true, +the splitting is done starting from the end of the string; this is only relevant +when a positive ``max_splits`` is given. + ++--------------------------+------------+-------------------------+-------------------+----------------------------------+---------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++==========================+============+=========================+===================+==================================+=========+ +| ascii_split_whitespace | Unary | String-like | List-like | :struct:`SplitOptions` | \(1) | ++--------------------------+------------+-------------------------+-------------------+----------------------------------+---------+ +| split_pattern | Unary | String-like | List-like | :struct:`SplitPatternOptions` | \(2) | ++--------------------------+------------+-------------------------+-------------------+----------------------------------+---------+ +| split_pattern_regex | Unary | String-like | List-like | :struct:`SplitPatternOptions` | \(3) | ++--------------------------+------------+-------------------------+-------------------+----------------------------------+---------+ +| utf8_split_whitespace | Unary | String-like | List-like | :struct:`SplitOptions` | \(4) | ++--------------------------+------------+-------------------------+-------------------+----------------------------------+---------+ + +* \(1) A non-zero length sequence of ASCII defined whitespace bytes + (``'\t'``, ``'\n'``, ``'\v'``, ``'\f'``, ``'\r'`` and ``' '``) is seen + as separator. + +* \(2) The string is split when an exact pattern is found (the pattern itself + is not included in the output). + +* \(3) The string is split when a regex match is found (the matched + substring itself is not included in the output). + +* \(4) A non-zero length sequence of Unicode defined whitespace codepoints + is seen as separator. + +String component extraction +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ++---------------+-------+-------------+-------------+-------------------------------+-------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++===============+=======+=============+=============+===============================+=======+ +| extract_regex | Unary | String-like | Struct | :struct:`ExtractRegexOptions` | \(1) | ++---------------+-------+-------------+-------------+-------------------------------+-------+ + +* \(1) Extract substrings defined by a regular expression using the Google RE2 + library. The output struct field names refer to the named capture groups, + e.g. 'letter' and 'digit' for the regular expression + ``(?P<letter>[ab])(?P<digit>\\d)``. + +String joining +~~~~~~~~~~~~~~ + +These functions do the inverse of string splitting. 
+ ++--------------------------+-----------+-----------------------+----------------+-------------------+-----------------------+---------+ +| Function name | Arity | Input type 1 | Input type 2 | Output type | Options class | Notes | ++==========================+===========+=======================+================+===================+=======================+=========+ +| binary_join | Binary | List of string-like | String-like | String-like | | \(1) | ++--------------------------+-----------+-----------------------+----------------+-------------------+-----------------------+---------+ +| binary_join_element_wise | Varargs | String-like (varargs) | String-like | String-like | :struct:`JoinOptions` | \(2) | ++--------------------------+-----------+-----------------------+----------------+-------------------+-----------------------+---------+ + +* \(1) The first input must be an array, while the second can be a scalar or array. + Each list of values in the first input is joined using each second input + as separator. If any input list is null or contains a null, the corresponding + output will be null. + +* \(2) All arguments are concatenated element-wise, with the last argument treated + as the separator (scalars are recycled in either case). Null separators emit + null. If any other argument is null, by default the corresponding output will be + null, but it can instead either be skipped or replaced with a given string. + +String Slicing +~~~~~~~~~~~~~~ + +This function transforms each sequence of the array to a subsequence, according +to start and stop indices, and a non-zero step (defaulting to 1). Slicing +semantics follow Python slicing semantics: the start index is inclusive, +the stop index exclusive; if the step is negative, the sequence is followed +in reverse order. + ++--------------------------+------------+----------------+-----------------+--------------------------+---------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++==========================+============+================+=================+==========================+=========+ +| utf8_slice_codeunits | Unary | String-like | String-like | :struct:`SliceOptions` | \(1) | ++--------------------------+------------+----------------+-----------------+--------------------------+---------+ + +* \(1) Slice string into a substring defined by (``start``, ``stop``, ``step``) + as given by :struct:`SliceOptions` where ``start`` and ``stop`` are measured + in codeunits. Null inputs emit null. 
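+
+A minimal sketch of codeunit slicing (assuming an installed Arrow C++ library,
+the conventional ``arrow/api.h`` and ``arrow/compute/api.h`` headers, and the
+:struct:`SliceOptions` constructor taking ``start`` and ``stop``; the helper
+name ``SliceExample`` is arbitrary):
+
+.. code-block:: cpp
+
+   #include <iostream>
+   #include <memory>
+
+   #include <arrow/api.h>
+   #include <arrow/compute/api.h>
+
+   arrow::Status SliceExample() {
+     arrow::StringBuilder builder;
+     ARROW_RETURN_NOT_OK(builder.AppendValues({"hello", "world", "no"}));
+     std::shared_ptr<arrow::Array> strings;
+     ARROW_RETURN_NOT_OK(builder.Finish(&strings));
+
+     // Keep codeunits [0, 3) of each element: "hel", "wor", "no".
+     arrow::compute::SliceOptions options(/*start=*/0, /*stop=*/3);
+     ARROW_ASSIGN_OR_RAISE(
+         arrow::Datum sliced,
+         arrow::compute::CallFunction("utf8_slice_codeunits", {strings}, &options));
+     std::cout << sliced.make_array()->ToString() << std::endl;
+     return arrow::Status::OK();
+   }
+
+   int main() { return SliceExample().ok() ? 0 : 1; }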
+ +Containment tests +~~~~~~~~~~~~~~~~~ + ++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++=======================+=======+===================================+================+=================================+=======+ +| count_substring | Unary | String-like | Int32 or Int64 | :struct:`MatchSubstringOptions` | \(1) | ++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+ +| count_substring_regex | Unary | String-like | Int32 or Int64 | :struct:`MatchSubstringOptions` | \(1) | ++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+ +| ends_with | Unary | String-like | Boolean | :struct:`MatchSubstringOptions` | \(2) | ++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+ +| find_substring | Unary | Binary- and String-like | Int32 or Int64 | :struct:`MatchSubstringOptions` | \(3) | ++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+ +| find_substring_regex | Unary | Binary- and String-like | Int32 or Int64 | :struct:`MatchSubstringOptions` | \(3) | ++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+ +| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 | :struct:`SetLookupOptions` | \(4) | +| | | Binary- and String-like | | | | ++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+ +| is_in | Unary | Boolean, Null, Numeric, Temporal, | Boolean | :struct:`SetLookupOptions` | \(5) | +| | | Binary- and String-like | | | | ++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+ +| match_like | Unary | String-like | Boolean | :struct:`MatchSubstringOptions` | \(6) | ++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+ +| match_substring | Unary | String-like | Boolean | :struct:`MatchSubstringOptions` | \(7) | ++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+ +| match_substring_regex | Unary | String-like | Boolean | :struct:`MatchSubstringOptions` | \(8) | ++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+ +| starts_with | Unary | String-like | Boolean | :struct:`MatchSubstringOptions` | \(2) | ++-----------------------+-------+-----------------------------------+----------------+---------------------------------+-------+ + +* \(1) Output is the number of occurrences of + :member:`MatchSubstringOptions::pattern` in the corresponding input + string. Output type is Int32 for Binary/String, Int64 + for LargeBinary/LargeString. + +* \(2) Output is true iff :member:`MatchSubstringOptions::pattern` + is a suffix/prefix of the corresponding input. + +* \(3) Output is the index of the first occurrence of + :member:`MatchSubstringOptions::pattern` in the corresponding input + string, otherwise -1. Output type is Int32 for Binary/String, Int64 + for LargeBinary/LargeString. 
+ +* \(4) Output is the index of the corresponding input element in + :member:`SetLookupOptions::value_set`, if found there. Otherwise, + output is null. + +* \(5) Output is true iff the corresponding input element is equal to one + of the elements in :member:`SetLookupOptions::value_set`. + +* \(6) Output is true iff the SQL-style LIKE pattern + :member:`MatchSubstringOptions::pattern` fully matches the + corresponding input element. That is, ``%`` will match any number of + characters, ``_`` will match exactly one character, and any other + character matches itself. To match a literal percent sign or + underscore, precede the character with a backslash. + +* \(7) Output is true iff :member:`MatchSubstringOptions::pattern` + is a substring of the corresponding input element. + +* \(8) Output is true iff :member:`MatchSubstringOptions::pattern` + matches the corresponding input element at any position. + +Categorizations +~~~~~~~~~~~~~~~ + ++-------------------+------------+---------------------+---------------------+------------------------+---------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++===================+============+=====================+=====================+========================+=========+ +| is_finite | Unary | Float, Double | Boolean | | \(1) | ++-------------------+------------+---------------------+---------------------+------------------------+---------+ +| is_inf | Unary | Float, Double | Boolean | | \(2) | ++-------------------+------------+---------------------+---------------------+------------------------+---------+ +| is_nan | Unary | Float, Double | Boolean | | \(3) | ++-------------------+------------+---------------------+---------------------+------------------------+---------+ +| is_null | Unary | Any | Boolean | :struct:`NullOptions` | \(4) | ++-------------------+------------+---------------------+---------------------+------------------------+---------+ +| is_valid | Unary | Any | Boolean | | \(5) | ++-------------------+------------+---------------------+---------------------+------------------------+---------+ + +* \(1) Output is true iff the corresponding input element is finite (neither Infinity, + -Infinity, nor NaN). + +* \(2) Output is true iff the corresponding input element is Infinity/-Infinity. + +* \(3) Output is true iff the corresponding input element is NaN. + +* \(4) Output is true iff the corresponding input element is null. NaN values + can also be considered null by setting :member:`NullOptions::nan_is_null`. + +* \(5) Output is true iff the corresponding input element is non-null. + +.. _cpp-compute-scalar-selections: + +Selecting / multiplexing +~~~~~~~~~~~~~~~~~~~~~~~~ + +For each "row" of input values, these functions emit one of the input values, +depending on a condition. 
+
++------------------+------------+---------------------------------------------------+---------------------+---------+
+| Function name | Arity | Input types | Output type | Notes |
++==================+============+===================================================+=====================+=========+
+| case_when | Varargs | Struct of Boolean (Arg 0), Any (rest) | Input type | \(1) |
++------------------+------------+---------------------------------------------------+---------------------+---------+
+| choose | Varargs | Integral (Arg 0), Fixed-width/Binary-like (rest) | Input type | \(2) |
++------------------+------------+---------------------------------------------------+---------------------+---------+
+| coalesce | Varargs | Any | Input type | \(3) |
++------------------+------------+---------------------------------------------------+---------------------+---------+
+| if_else | Ternary | Boolean (Arg 0), Any (rest) | Input type | \(4) |
++------------------+------------+---------------------------------------------------+---------------------+---------+
+
+* \(1) This function acts like a SQL "case when" statement or switch-case. The
+ input is a "condition" value, which is a struct of Booleans, followed by the
+ values for each "branch". There must be either exactly one value argument for
+ each child of the condition struct, or one more value argument than children
+ (in which case we have an "else" or "default" value). The output is of the
+ same type as the value inputs; each row will be the corresponding value from
+ the first value datum for which the corresponding Boolean is true, or the
+ corresponding value from the "default" input, or null otherwise.
+
+ Note that currently, while all types are supported, dictionaries will be
+ unpacked.
+
+* \(2) The first input must be an integral type. The rest of the arguments can be
+ any type, but must all be the same type or promotable to a common type. Each
+ value of the first input (the 'index') is used as a zero-based index into the
+ remaining arguments (i.e. index 0 is the second argument, index 1 is the third
+ argument, etc.), and the value of the output for that row will be the
+ corresponding value of the selected input at that row. If the index is null,
+ then the output will also be null.
+
+* \(3) Each row of the output will be the corresponding value of the first
+ input which is non-null for that row, otherwise null.
+
+* \(4) The first input must be a Boolean scalar or array. The second and third
+ inputs can be scalars or arrays and must be of the same type. Output is an
+ array (or scalar if all inputs are scalar) of the same type as the
+ second/third input. If nulls are present in the first input, they are
+ propagated to the output; otherwise, output nulls are determined by the
+ selected input values. A usage sketch is given below.
+
+ Also see: :ref:`replace_with_mask <cpp-compute-vector-structural-transforms>`.
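+
+For example, a minimal ``if_else`` sketch (assuming an installed Arrow C++
+library and the conventional ``arrow/api.h`` and ``arrow/compute/api.h``
+headers; the helper name ``IfElseExample`` is arbitrary), where the third
+input is a scalar and gets broadcast against the arrays:
+
+.. code-block:: cpp
+
+   #include <iostream>
+   #include <memory>
+   #include <vector>
+
+   #include <arrow/api.h>
+   #include <arrow/compute/api.h>
+
+   arrow::Status IfElseExample() {
+     // Condition: take the second input where true, the third where false.
+     arrow::BooleanBuilder cond_builder;
+     ARROW_RETURN_NOT_OK(cond_builder.AppendValues(std::vector<bool>{true, false, true}));
+     std::shared_ptr<arrow::Array> cond;
+     ARROW_RETURN_NOT_OK(cond_builder.Finish(&cond));
+
+     arrow::Int32Builder left_builder;
+     ARROW_RETURN_NOT_OK(left_builder.AppendValues({1, 2, 3}));
+     std::shared_ptr<arrow::Array> left;
+     ARROW_RETURN_NOT_OK(left_builder.Finish(&left));
+
+     // Scalar fallback, broadcast to the length of the arrays.
+     std::shared_ptr<arrow::Scalar> fallback = arrow::MakeScalar(int32_t(0));
+
+     // Expected output: [1, 0, 3]
+     ARROW_ASSIGN_OR_RAISE(
+         arrow::Datum out,
+         arrow::compute::CallFunction("if_else", {cond, left, fallback}));
+     std::cout << out.make_array()->ToString() << std::endl;
+     return arrow::Status::OK();
+   }
+
+   int main() { return IfElseExample().ok() ? 0 : 1; }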
+ +Structural transforms +~~~~~~~~~~~~~~~~~~~~~ + ++---------------------+------------+-------------+------------------+------------------------------+--------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++=====================+============+=============+==================+==============================+========+ +| list_value_length | Unary | List-like | Int32 or Int64 | | \(1) | ++---------------------+------------+-------------+------------------+------------------------------+--------+ +| make_struct | Varargs | Any | Struct | :struct:`MakeStructOptions` | \(2) | ++---------------------+------------+-------------+------------------+------------------------------+--------+ + +* \(1) Each output element is the length of the corresponding input element + (null if input is null). Output type is Int32 for List and FixedSizeList, + Int64 for LargeList. + +* \(2) The output struct's field types are the types of its arguments. The + field names are specified using an instance of :struct:`MakeStructOptions`. + The output shape will be scalar if all inputs are scalar, otherwise any + scalars will be broadcast to arrays. + +Conversions +~~~~~~~~~~~ + +A general conversion function named ``cast`` is provided which accepts a large +number of input and output types. The type to cast to can be passed in a +:struct:`CastOptions` instance. As an alternative, the same service is +provided by a concrete function :func:`~arrow::compute::Cast`. + ++-----------------+------------+--------------------+------------------+------------------------------+-------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++=================+============+====================+==================+==============================+=======+ +| cast | Unary | Many | Variable | :struct:`CastOptions` | | ++-----------------+------------+--------------------+------------------+------------------------------+-------+ +| strftime | Unary | Temporal | String | :struct:`StrftimeOptions` | \(1) | ++-----------------+------------+--------------------+------------------+------------------------------+-------+ +| strptime | Unary | String-like | Timestamp | :struct:`StrptimeOptions` | | ++-----------------+------------+--------------------+------------------+------------------------------+-------+ + +The conversions available with ``cast`` are listed below. In all cases, a +null input value is converted into a null output value. + +* \(1) Output precision of ``%S`` (seconds) flag depends on the input timestamp + precision. Timestamps with second precision are represented as integers while + milliseconds, microsecond and nanoseconds are represented as fixed floating + point numbers with 3, 6 and 9 decimal places respectively. To obtain integer + seconds, cast to timestamp with second resolution. + The character for the decimal point is localized according to the locale. + See `detailed formatting documentation`_ for descriptions of other flags. + +.. 
_detailed formatting documentation: https://howardhinnant.github.io/date/date.html#to_stream_formatting + +**Truth value extraction** + ++-----------------------------+------------------------------------+--------------+ +| Input type | Output type | Notes | ++=============================+====================================+==============+ +| Binary- and String-like | Boolean | \(1) | ++-----------------------------+------------------------------------+--------------+ +| Numeric | Boolean | \(2) | ++-----------------------------+------------------------------------+--------------+ + +* \(1) Output is true iff the corresponding input value has non-zero length. + +* \(2) Output is true iff the corresponding input value is non-zero. + +**Same-kind conversion** + ++-----------------------------+------------------------------------+--------------+ +| Input type | Output type | Notes | ++=============================+====================================+==============+ +| Int32 | 32-bit Temporal | \(1) | ++-----------------------------+------------------------------------+--------------+ +| Int64 | 64-bit Temporal | \(1) | ++-----------------------------+------------------------------------+--------------+ +| (Large)Binary | (Large)String | \(2) | ++-----------------------------+------------------------------------+--------------+ +| (Large)String | (Large)Binary | \(3) | ++-----------------------------+------------------------------------+--------------+ +| Numeric | Numeric | \(4) \(5) | ++-----------------------------+------------------------------------+--------------+ +| 32-bit Temporal | Int32 | \(1) | ++-----------------------------+------------------------------------+--------------+ +| 64-bit Temporal | Int64 | \(1) | ++-----------------------------+------------------------------------+--------------+ +| Temporal | Temporal | \(4) \(5) | ++-----------------------------+------------------------------------+--------------+ + +* \(1) No-operation cast: the raw values are kept identical, only + the type is changed. + +* \(2) Validates the contents if :member:`CastOptions::allow_invalid_utf8` + is false. + +* \(3) No-operation cast: only the type is changed. + +* \(4) Overflow and truncation checks are enabled depending on + the given :struct:`CastOptions`. + +* \(5) Not all such casts have been implemented. 
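+
+As an illustration of note (4), a minimal sketch (assuming an installed Arrow
+C++ library, the conventional ``arrow/api.h`` and ``arrow/compute/api.h``
+headers, and the ``CastOptions::Safe``/``CastOptions::Unsafe`` factory helpers;
+the helper name ``CastExample`` is arbitrary):
+
+.. code-block:: cpp
+
+   #include <iostream>
+   #include <memory>
+
+   #include <arrow/api.h>
+   #include <arrow/compute/api.h>
+
+   arrow::Status CastExample() {
+     arrow::DoubleBuilder builder;
+     ARROW_RETURN_NOT_OK(builder.AppendValues({1.0, 2.5, 3.0}));
+     std::shared_ptr<arrow::Array> doubles;
+     ARROW_RETURN_NOT_OK(builder.Finish(&doubles));
+
+     // Safe options (the default): truncating 2.5 to an integer is an error.
+     auto safe = arrow::compute::Cast(doubles, arrow::int32(),
+                                      arrow::compute::CastOptions::Safe());
+     std::cout << safe.status().ToString() << std::endl;
+
+     // Unsafe options: truncation checks are disabled, 2.5 becomes 2.
+     ARROW_ASSIGN_OR_RAISE(
+         arrow::Datum truncated,
+         arrow::compute::Cast(doubles, arrow::int32(),
+                              arrow::compute::CastOptions::Unsafe()));
+     std::cout << truncated.make_array()->ToString() << std::endl;
+     return arrow::Status::OK();
+   }
+
+   int main() { return CastExample().ok() ? 0 : 1; }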
+ +**String representations** + ++-----------------------------+------------------------------------+---------+ +| Input type | Output type | Notes | ++=============================+====================================+=========+ +| Boolean | String-like | | ++-----------------------------+------------------------------------+---------+ +| Numeric | String-like | | ++-----------------------------+------------------------------------+---------+ + +**Generic conversions** + ++-----------------------------+------------------------------------+---------+ +| Input type | Output type | Notes | ++=============================+====================================+=========+ +| Dictionary | Dictionary value type | \(1) | ++-----------------------------+------------------------------------+---------+ +| Extension | Extension storage type | | ++-----------------------------+------------------------------------+---------+ +| List-like | List-like | \(2) | ++-----------------------------+------------------------------------+---------+ +| Null | Any | | ++-----------------------------+------------------------------------+---------+ + +* \(1) The dictionary indices are unchanged, the dictionary values are + cast from the input value type to the output value type (if a conversion + is available). + +* \(2) The list offsets are unchanged, the list values are cast from the + input value type to the output value type (if a conversion is + available). + + +Temporal component extraction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +These functions extract datetime components (year, month, day, etc) from temporal types. +For timestamps inputs with non-empty timezone, localized timestamp components will be returned. + ++--------------------+------------+-------------------+---------------+----------------------------+-------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++====================+============+===================+===============+============================+=======+ +| day | Unary | Temporal | Int64 | | | ++--------------------+------------+-------------------+---------------+----------------------------+-------+ +| day_of_week | Unary | Temporal | Int64 | :struct:`DayOfWeekOptions` | \(1) | ++--------------------+------------+-------------------+---------------+----------------------------+-------+ +| day_of_year | Unary | Temporal | Int64 | | | ++--------------------+------------+-------------------+---------------+----------------------------+-------+ +| hour | Unary | Timestamp, Time | Int64 | | | ++--------------------+------------+-------------------+---------------+----------------------------+-------+ +| iso_week | Unary | Temporal | Int64 | | \(2) | ++--------------------+------------+-------------------+---------------+----------------------------+-------+ +| iso_year | Unary | Temporal | Int64 | | \(2) | ++--------------------+------------+-------------------+---------------+----------------------------+-------+ +| iso_calendar | Unary | Temporal | Struct | | \(3) | ++--------------------+------------+-------------------+---------------+----------------------------+-------+ +| microsecond | Unary | Timestamp, Time | Int64 | | | ++--------------------+------------+-------------------+---------------+----------------------------+-------+ +| millisecond | Unary | Timestamp, Time | Int64 | | | ++--------------------+------------+-------------------+---------------+----------------------------+-------+ +| minute | Unary | Timestamp, Time | Int64 | | | 
++--------------------+------------+-------------------+---------------+----------------------------+-------+ +| month | Unary | Temporal | Int64 | | | ++--------------------+------------+-------------------+---------------+----------------------------+-------+ +| nanosecond | Unary | Timestamp, Time | Int64 | | | ++--------------------+------------+-------------------+---------------+----------------------------+-------+ +| quarter | Unary | Temporal | Int64 | | | ++--------------------+------------+-------------------+---------------+----------------------------+-------+ +| second | Unary | Timestamp, Time | Int64 | | | ++--------------------+------------+-------------------+---------------+----------------------------+-------+ +| subsecond | Unary | Timestamp, Time | Double | | | ++--------------------+------------+-------------------+---------------+----------------------------+-------+ +| us_week | Unary | Temporal | Int64 | | \(4) | ++--------------------+------------+-------------------+---------------+----------------------------+-------+ +| week | Unary | Timestamp | Int64 | :struct:`WeekOptions` | \(5) | ++--------------------+------------+-------------------+---------------+----------------------------+-------+ +| year | Unary | Temporal | Int64 | | | ++--------------------+------------+-------------------+---------------+----------------------------+-------+ + +* \(1) Outputs the number of the day of the week. By default, the week begins on Monday + (represented by 0) and ends on Sunday (represented by 6). Whether day numbering starts at 0 or 1 is controlled by the + :member:`DayOfWeekOptions::count_from_zero` parameter. :member:`DayOfWeekOptions::week_start` can be + used to set the starting day of the week using the ISO convention (Monday=1, Sunday=7). + The :member:`DayOfWeekOptions::week_start` parameter is not affected by :member:`DayOfWeekOptions::count_from_zero`. + +* \(2) The first ISO week has the majority (4 or more) of its days in January. The ISO year + starts with the first ISO week. ISO weeks start on Monday. + See `ISO 8601 week date definition`_ for more details. + +* \(3) Output is a ``{"iso_year": output type, "iso_week": output type, "iso_day_of_week": output type}`` Struct. + +* \(4) The first US week has the majority (4 or more) of its days in January. The US year + starts with the first US week. US weeks start on Sunday. + +* \(5) Returns the week number, with behaviour controlled by several parameters. + If :member:`WeekOptions::week_starts_monday` is true, the week starts on Monday; otherwise it starts on Sunday. + If :member:`WeekOptions::count_from_zero` is true, dates from the current year that fall into the last ISO week + of the previous year are numbered as week 0; otherwise they are numbered as week 52 or 53. + If :member:`WeekOptions::first_week_is_fully_in_year` is true, the first week (week 1) must lie entirely in January; + otherwise, a week that begins on December 29, 30, or 31 is considered the first week of the new year. + +.. _ISO 8601 week date definition: https://en.wikipedia.org/wiki/ISO_week_date#First_week + +Temporal difference +~~~~~~~~~~~~~~~~~~~ + +These functions compute the difference between two timestamps in the +specified unit. The difference is determined by the number of +boundaries crossed, not the span of time. For example, the difference +in days between 23:59:59 on one day and 00:00:01 on the next day is +one day (since midnight was crossed), not zero days (even though less +than 24 hours elapsed).
Additionally, if the timestamp has a defined +timezone, the difference is calculated in the local timezone. For +instance, the difference in years between "2019-12-31 18:00:00-0500" +and "2019-12-31 23:00:00-0500" is zero years, because the local year +is the same, even though the UTC years would be different. + ++---------------------------------+------------+-------------------+-----------------------+----------------------------+ +| Function name | Arity | Input types | Output type | Options class | ++=================================+============+===================+=======================+============================+ +| day_time_interval_between | Binary | Temporal | DayTime interval | | ++---------------------------------+------------+-------------------+-----------------------+----------------------------+ +| days_between | Binary | Timestamp, Date | Int64 | | ++---------------------------------+------------+-------------------+-----------------------+----------------------------+ +| hours_between | Binary | Temporal | Int64 | | ++---------------------------------+------------+-------------------+-----------------------+----------------------------+ +| microseconds_between | Binary | Temporal | Int64 | | ++---------------------------------+------------+-------------------+-----------------------+----------------------------+ +| milliseconds_between | Binary | Temporal | Int64 | | ++---------------------------------+------------+-------------------+-----------------------+----------------------------+ +| minutes_between | Binary | Temporal | Int64 | | ++---------------------------------+------------+-------------------+-----------------------+----------------------------+ +| month_day_nano_interval_between | Binary | Temporal | MonthDayNano interval | | ++---------------------------------+------------+-------------------+-----------------------+----------------------------+ +| month_interval_between | Binary | Timestamp, Date | Month interval | | ++---------------------------------+------------+-------------------+-----------------------+----------------------------+ +| nanoseconds_between | Binary | Temporal | Int64 | | ++---------------------------------+------------+-------------------+-----------------------+----------------------------+ +| quarters_between | Binary | Timestamp, Date | Int64 | | ++---------------------------------+------------+-------------------+-----------------------+----------------------------+ +| seconds_between | Binary | Temporal | Int64 | | ++---------------------------------+------------+-------------------+-----------------------+----------------------------+ +| weeks_between | Binary | Timestamp, Date | Int64 | :struct:`DayOfWeekOptions` | ++---------------------------------+------------+-------------------+-----------------------+----------------------------+ +| years_between | Binary | Timestamp, Date | Int64 | | ++---------------------------------+------------+-------------------+-----------------------+----------------------------+ + +Timezone handling +~~~~~~~~~~~~~~~~~ + +This function is meant to be used when an external system produces +"timezone-naive" timestamps which need to be converted to "timezone-aware" +timestamps (see for example the `definition +<https://docs.python.org/3/library/datetime.html#aware-and-naive-objects>`__ +in the Python documentation). + +Input timestamps are assumed to be relative to the timezone given in +:member:`AssumeTimezoneOptions::timezone`. 
They are converted to +UTC-relative timestamps with the timezone metadata set to the above value. +An error is returned if the timestamps already have the timezone metadata set. + ++--------------------+------------+-------------------+---------------+----------------------------------+-------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++====================+============+===================+===============+==================================+=======+ +| assume_timezone | Unary | Timestamp | Timestamp | :struct:`AssumeTimezoneOptions` | \(1) | ++--------------------+------------+-------------------+---------------+----------------------------------+-------+ + +* \(1) In addition to the timezone value, :struct:`AssumeTimezoneOptions` + allows choosing the behaviour when a timestamp is ambiguous or nonexistent + in the given timezone (because of DST shifts). + + +Array-wise ("vector") functions +------------------------------- + +Associative transforms +~~~~~~~~~~~~~~~~~~~~~~ + ++-------------------+-------+-----------------------------------+-------------+-------+ +| Function name | Arity | Input types | Output type | Notes | ++===================+=======+===================================+=============+=======+ +| dictionary_encode | Unary | Boolean, Null, Numeric, | Dictionary | \(1) | +| | | Temporal, Binary- and String-like | | | ++-------------------+-------+-----------------------------------+-------------+-------+ +| unique | Unary | Boolean, Null, Numeric, | Input type | \(2) | +| | | Temporal, Binary- and String-like | | | ++-------------------+-------+-----------------------------------+-------------+-------+ +| value_counts | Unary | Boolean, Null, Numeric, | Input type | \(3) | +| | | Temporal, Binary- and String-like | | | ++-------------------+-------+-----------------------------------+-------------+-------+ + +* \(1) Output is ``Dictionary(Int32, input type)``. + +* \(2) Duplicates are removed from the output while the original order is + maintained. + +* \(3) Output is a ``{"values": input type, "counts": Int64}`` Struct. + Each output element corresponds to a unique value in the input, along + with the number of times this value has appeared. + +Selections +~~~~~~~~~~ + +These functions select and return a subset of their input. 
+ ++---------------+--------+--------------+--------------+--------------+-------------------------+-----------+ +| Function name | Arity | Input type 1 | Input type 2 | Output type | Options class | Notes | ++===============+========+==============+==============+==============+=========================+===========+ +| array_filter | Binary | Any | Boolean | Input type 1 | :struct:`FilterOptions` | \(1) \(3) | ++---------------+--------+--------------+--------------+--------------+-------------------------+-----------+ +| array_take | Binary | Any | Integer | Input type 1 | :struct:`TakeOptions` | \(1) \(4) | ++---------------+--------+--------------+--------------+--------------+-------------------------+-----------+ +| drop_null | Unary | Any | - | Input type 1 | | \(1) \(2) | ++---------------+--------+--------------+--------------+--------------+-------------------------+-----------+ +| filter | Binary | Any | Boolean | Input type 1 | :struct:`FilterOptions` | \(1) \(3) | ++---------------+--------+--------------+--------------+--------------+-------------------------+-----------+ +| take | Binary | Any | Integer | Input type 1 | :struct:`TakeOptions` | \(1) \(4) | ++---------------+--------+--------------+--------------+--------------+-------------------------+-----------+ + +* \(1) Sparse unions are unsupported. + +* \(2) Each element in the input is appended to the output iff it is non-null. + If the input is a record batch or table, any null value in a column drops + the entire row. + +* \(3) Each element in input 1 (the values) is appended to the output iff + the corresponding element in input 2 (the filter) is true. How + nulls in the filter are handled can be configured using FilterOptions. + +* \(4) For each element *i* in input 2 (the indices), the *i*'th element + in input 1 (the values) is appended to the output. + +Sorts and partitions +~~~~~~~~~~~~~~~~~~~~ + +By default, in these functions, nulls are considered greater than any other value +(they will be sorted or partitioned at the end of the array). Floating-point +NaN values are considered greater than any other non-null value, but smaller +than nulls. This behaviour can be changed using the ``null_placement`` setting +in the respective option classes. + +.. note:: + Binary- and String-like inputs are ordered lexicographically as bytestrings, + even for String types.
+ ++-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++=======================+============+=========================================================+===================+================================+================+ +| array_sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`ArraySortOptions` | \(1) \(2) | ++-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ +| partition_nth_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`PartitionNthOptions` | \(3) | ++-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ +| select_k_unstable | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SelectKOptions` | \(4) \(5) | ++-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ +| sort_indices | Unary | Boolean, Numeric, Temporal, Binary- and String-like | UInt64 | :struct:`SortOptions` | \(1) \(4) | ++-----------------------+------------+---------------------------------------------------------+-------------------+--------------------------------+----------------+ + +* \(1) The output is an array of indices into the input, that define a + stable sort of the input. + +* \(2) The input must be an array. The default order is ascending. + +* \(3) The output is an array of indices into the input array, that define + a partial non-stable sort such that the *N*'th index points to the *N*'th + element in sorted order, and all indices before the *N*'th point to + elements less or equal to elements at or after the *N*'th (similar to + :func:`std::nth_element`). *N* is given in + :member:`PartitionNthOptions::pivot`. + +* \(4) The input can be an array, chunked array, record batch or + table. If the input is a record batch or table, one or more sort + keys must be specified. + +* \(5) The output is an array of indices into the input, that define a + non-stable sort of the input. + +.. _cpp-compute-vector-structural-transforms: + +Structural transforms +~~~~~~~~~~~~~~~~~~~~~ + ++---------------------+------------+-------------------------------------+------------------+--------+ +| Function name | Arity | Input types | Output type | Notes | ++=====================+============+=====================================+==================+========+ +| list_element | Binary | List-like (Arg 0), Integral (Arg 1) | List value type | \(1) | ++---------------------+------------+-------------------------------------+------------------+--------+ +| list_flatten | Unary | List-like | List value type | \(2) | ++---------------------+------------+-------------------------------------+------------------+--------+ +| list_parent_indices | Unary | List-like | Int32 or Int64 | \(3) | ++---------------------+------------+-------------------------------------+------------------+--------+ + +* \(1) Output is an array of the same length as the input list array. The + output values are the values at the specified index of each child list. 
+ +* \(2) The top level of nesting is removed: all values in the list child array, + including nulls, are appended to the output. However, nulls in the parent + list array are discarded. + +* \(3) For each value in the list child array, the index at which it is found + in the list array is appended to the output. Nulls in the parent list array + are discarded. Output type is Int32 for List and FixedSizeList, Int64 for + LargeList. + +These functions create a copy of the first input with some elements +replaced, based on the remaining inputs. + ++--------------------------+------------+-----------------------+--------------+--------------+--------------+-------+ +| Function name | Arity | Input type 1 | Input type 2 | Input type 3 | Output type | Notes | ++==========================+============+=======================+==============+==============+==============+=======+ +| replace_with_mask | Ternary | Fixed-width or binary | Boolean | Input type 1 | Input type 1 | \(1) | ++--------------------------+------------+-----------------------+--------------+--------------+--------------+-------+ + +* \(1) Each element in input 1 for which the corresponding Boolean in input 2 + is true is replaced with the next value from input 3. A null in input 2 + results in a corresponding null in the output. + + Also see: :ref:`if_else <cpp-compute-scalar-selections>`. diff --git a/src/arrow/docs/source/cpp/conventions.rst b/src/arrow/docs/source/cpp/conventions.rst new file mode 100644 index 000000000..218d028ee --- /dev/null +++ b/src/arrow/docs/source/cpp/conventions.rst @@ -0,0 +1,107 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +.. cpp:namespace:: arrow + +Conventions +=========== + +The Arrow C++ API follows a few simple guidelines. As with many rules, +there may be exceptions. + +Language version +---------------- + +Arrow is C++11-compatible. A few backports are used for newer functionality, +for example the :class:`std::string_view` class. + +Namespacing +----------- + +All the Arrow API (except macros) is namespaced inside a ``arrow`` namespace, +and nested namespaces thereof. + +Safe pointers +------------- + +Arrow objects are usually passed and stored using safe pointers -- most of +the time :class:`std::shared_ptr` but sometimes also :class:`std::unique_ptr`. + +Immutability +------------ + +Many Arrow objects are immutable: once constructed, their logical properties +cannot change anymore. This makes it possible to use them in multi-threaded +scenarios without requiring tedious and error-prone synchronization. + +There are obvious exceptions to this, such as IO objects or mutable data buffers. 
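+
+As a brief illustration of the two points above, the following sketch builds an
+array once and then shares the immutable result (error handling uses the macros
+described in the next section)::
+
+   #include "arrow/api.h"
+
+   arrow::Status ShareImmutableArray() {
+     arrow::Int32Builder builder;
+     ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 3}));
+     std::shared_ptr<arrow::Array> array;
+     ARROW_RETURN_NOT_OK(builder.Finish(&array));
+     // The finished array is immutable: copies of this shared_ptr can be
+     // handed to other threads and read concurrently without locking.
+     std::shared_ptr<arrow::Array> alias = array;  // shares, does not copy data
+     return arrow::Status::OK();
+   }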
+ +Error reporting +--------------- + +Most APIs indicate a successful or erroneous outcome by returning a +:class:`arrow::Status` instance. Arrow doesn't throw exceptions of its +own, but third-party exceptions might propagate through, especially +:class:`std::bad_alloc` (but Arrow doesn't use the standard allocators for +large data). + +When an API can return either an error code or a successful value, it usually +does so by returning the template class +:class:`arrow::Result <template\<class T\> arrow::Result>`. However, +some APIs (usually deprecated) return :class:`arrow::Status` and pass the +result value as an out-pointer parameter. + +Here is an example of checking the outcome of an operation:: + + const int64_t buffer_size = 4096; + + auto maybe_buffer = arrow::AllocateBuffer(buffer_size); + if (!maybe_buffer.ok()) { + // ... handle error + } else { + std::shared_ptr<arrow::Buffer> buffer = std::move(*maybe_buffer); + // ... use allocated buffer + } + +If the caller function itself returns a :class:`arrow::Result` or +:class:`arrow::Status` and wants to propagate any non-successful outcome, two +convenience macros are available: + +* :c:macro:`ARROW_RETURN_NOT_OK` takes a :class:`arrow::Status` parameter + and returns it if not successful. + +* :c:macro:`ARROW_ASSIGN_OR_RAISE` takes a :class:`arrow::Result` parameter, + assigns its result to an *lvalue* if successful, or returns the corresponding + :class:`arrow::Status` on error. + +For example:: + + arrow::Status DoSomething() { + const int64_t buffer_size = 4096; + std::shared_ptr<arrow::Buffer> buffer; + ARROW_ASSIGN_OR_RAISE(buffer, arrow::AllocateBuffer(buffer_size)); + // ... allocation successful, do something with buffer below + + // return success at the end + return arrow::Status::OK(); + } + +.. seealso:: + :doc:`API reference for error reporting <api/support>` diff --git a/src/arrow/docs/source/cpp/csv.rst b/src/arrow/docs/source/cpp/csv.rst new file mode 100644 index 000000000..42b5af67d --- /dev/null +++ b/src/arrow/docs/source/cpp/csv.rst @@ -0,0 +1,220 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +.. cpp:namespace:: arrow::csv + +============================= +Reading and Writing CSV files +============================= + +Arrow provides a fast CSV reader allowing ingestion of external data +as Arrow tables. + +.. seealso:: + :ref:`CSV reader/writer API reference <cpp-api-csv>`. + +Basic usage +=========== + +A CSV file is read from a :class:`~arrow::io::InputStream`. + +.. code-block:: cpp + + #include "arrow/csv/api.h" + + { + // ...
+ arrow::io::IOContext io_context = arrow::io::default_io_context(); + std::shared_ptr<arrow::io::InputStream> input = ...; + + auto read_options = arrow::csv::ReadOptions::Defaults(); + auto parse_options = arrow::csv::ParseOptions::Defaults(); + auto convert_options = arrow::csv::ConvertOptions::Defaults(); + + // Instantiate TableReader from input stream and options + auto maybe_reader = + arrow::csv::TableReader::Make(io_context, + input, + read_options, + parse_options, + convert_options); + if (!maybe_reader.ok()) { + // Handle TableReader instantiation error... + } + std::shared_ptr<arrow::csv::TableReader> reader = *maybe_reader; + + // Read table from CSV file + auto maybe_table = reader->Read(); + if (!maybe_table.ok()) { + // Handle CSV read error + // (for example a CSV syntax error or failed type conversion) + } + std::shared_ptr<arrow::Table> table = *maybe_table; + } + +A CSV file is written to a :class:`~arrow::io::OutputStream`. + +.. code-block:: cpp + + #include <arrow/csv/api.h> + { + // Oneshot write + // ... + std::shared_ptr<arrow::io::OutputStream> output = ...; + auto write_options = arrow::csv::WriteOptions::Defaults(); + if (!WriteCSV(table, write_options, output.get()).ok()) { + // Handle writer error... + } + } + { + // Write incrementally + // ... + std::shared_ptr<arrow::io::OutputStream> output = ...; + auto write_options = arrow::csv::WriteOptions::Defaults(); + auto maybe_writer = arrow::csv::MakeCSVWriter(output, schema, write_options); + if (!maybe_writer.ok()) { + // Handle writer instantiation error... + } + std::shared_ptr<arrow::ipc::RecordBatchWriter> writer = *maybe_writer; + + // Write batches... + if (!writer->WriteRecordBatch(*batch).ok()) { + // Handle write error... + } + + if (!writer->Close().ok()) { + // Handle close error... + } + if (!output->Close().ok()) { + // Handle file close error... + } + } + +.. note:: The writer does not yet support all Arrow types. + +Column names +============ + +There are three possible ways to infer column names from the CSV file: + +* By default, the column names are read from the first row in the CSV file +* If :member:`ReadOptions::column_names` is set, it forces the column + names in the table to these values (the first row in the CSV file is + read as data) +* If :member:`ReadOptions::autogenerate_column_names` is true, column names + will be autogenerated with the pattern "f0", "f1"... (the first row in the + CSV file is read as data) + +Column selection +================ + +By default, Arrow reads all columns in the CSV file. You can narrow the +selection of columns with the :member:`ConvertOptions::include_columns` +option. If some columns in :member:`ConvertOptions::include_columns` +are missing from the CSV file, an error will be emitted unless +:member:`ConvertOptions::include_missing_columns` is true, in which case +the missing columns are assumed to contain all-null values. + +Interaction with column names +----------------------------- + +If both :member:`ReadOptions::column_names` and +:member:`ConvertOptions::include_columns` are specified, +the :member:`ReadOptions::column_names` are assumed to map to CSV columns, +and :member:`ConvertOptions::include_columns` is a subset of those column +names that will be part of the Arrow Table. + +Data types +========== + +By default, the CSV reader infers the most appropriate data type for each +column.
Type inference considers the following data types, in order: + +* Null +* Int64 +* Boolean +* Date32 +* Time32 (with seconds unit) +* Timestamp (with seconds unit) +* Timestamp (with nanoseconds unit) +* Float64 +* Dictionary<String> (if :member:`ConvertOptions::auto_dict_encode` is true) +* Dictionary<Binary> (if :member:`ConvertOptions::auto_dict_encode` is true) +* String +* Binary + +It is possible to override type inference for select columns by setting +the :member:`ConvertOptions::column_types` option. Explicit data types +can be chosen from the following list: + +* Null +* All Integer types +* Float32 and Float64 +* Decimal128 +* Boolean +* Date32 and Date64 +* Time32 and Time64 +* Timestamp +* Binary and Large Binary +* String and Large String (with optional UTF8 input validation) +* Fixed-Size Binary +* Dictionary with index type Int32 and value type one of the following: + Binary, String, LargeBinary, LargeString, Int32, UInt32, Int64, UInt64, + Float32, Float64, Decimal128 + +Other data types do not support conversion from CSV values and will error out. + +Dictionary inference +-------------------- + +If type inference is enabled and :member:`ConvertOptions::auto_dict_encode` +is true, the CSV reader first tries to convert string-like columns to a +dictionary-encoded string-like array. It switches to a plain string-like +array when the threshold in :member:`ConvertOptions::auto_dict_max_cardinality` +is reached. + +Nulls +----- + +Null values are recognized from the spellings stored in +:member:`ConvertOptions::null_values`. The :func:`ConvertOptions::Defaults` +factory method will initialize a number of conventional null spellings such +as ``N/A``. + +Character encoding +------------------ + +CSV files are expected to be encoded in UTF8. However, non-UTF8 data +is accepted for Binary columns. + +Write Options +============= + +The format of written CSV files can be customized via :class:`~arrow::csv::WriteOptions`. +Currently few options are available; more will be added in future releases. + +Performance +=========== + +By default, the CSV reader will parallelize reads in order to exploit all +CPU cores on your machine. You can change this setting in +:member:`ReadOptions::use_threads`. A reasonable expectation is at least +100 MB/s per core on a performant desktop or laptop computer (measured in +source CSV bytes, not target Arrow data bytes). diff --git a/src/arrow/docs/source/cpp/dataset.rst b/src/arrow/docs/source/cpp/dataset.rst new file mode 100644 index 000000000..e7161a458 --- /dev/null +++ b/src/arrow/docs/source/cpp/dataset.rst @@ -0,0 +1,417 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +================ +Tabular Datasets +================ + +.. 
seealso:: + :doc:`Dataset API reference <api/dataset>` + +.. warning:: + + The ``arrow::dataset`` namespace is experimental, and a stable API + is not yet guaranteed. + +The Arrow Datasets library provides functionality to efficiently work with +tabular, potentially larger than memory, and multi-file datasets. This includes: + +* A unified interface that supports different sources and file formats + (currently, Parquet, ORC, Feather / Arrow IPC, and CSV files) and different + file systems (local, cloud). +* Discovery of sources (crawling directories, handling partitioned datasets with + various partitioning schemes, basic schema normalization, ...) +* Optimized reading with predicate pushdown (filtering rows), projection + (selecting and deriving columns), and optionally parallel reading. + +The goal is to expand support to other file formats and data sources +(e.g. database connections) in the future. + +Reading Datasets +---------------- + +For the examples below, let's create a small dataset consisting +of a directory with two parquet files: + +.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc + :language: cpp + :start-after: (Doc section: Reading Datasets) + :end-before: (Doc section: Reading Datasets) + :linenos: + :lineno-match: + +(See the full example at bottom: :ref:`cpp-dataset-full-example`.) + +Dataset discovery +~~~~~~~~~~~~~~~~~ + +A :class:`arrow::dataset::Dataset` object can be created using the various +:class:`arrow::dataset::DatasetFactory` objects. Here, we'll use the +:class:`arrow::dataset::FileSystemDatasetFactory`, which can create a dataset +given a base directory path: + +.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc + :language: cpp + :start-after: (Doc section: Dataset discovery) + :end-before: (Doc section: Dataset discovery) + :emphasize-lines: 6-11 + :linenos: + :lineno-match: + +We're also passing the filesystem to use and the file format to use for reading. +This lets us choose between (for example) reading local files or files in Amazon +S3, or between Parquet and CSV. + +In addition to searching a base directory, we can list file paths manually. + +Creating a :class:`arrow::dataset::Dataset` does not begin reading the data +itself. It only crawls the directory to find all the files (if needed), which can +be retrieved with :func:`arrow::dataset::FileSystemDataset::files`: + +.. code-block:: cpp + + // Print out the files crawled (only for FileSystemDataset) + for (const auto& filename : dataset->files()) { + std::cout << filename << std::endl; + } + +…and infers the dataset's schema (by default from the first file): + +.. code-block:: cpp + + std::cout << dataset->schema()->ToString() << std::endl; + +Using the :func:`arrow::dataset::Dataset::NewScan` method, we can build a +:class:`arrow::dataset::Scanner` and read the dataset (or a portion of it) into +a :class:`arrow::Table` with the :func:`arrow::dataset::Scanner::ToTable` +method: + +.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc + :language: cpp + :start-after: (Doc section: Dataset discovery) + :end-before: (Doc section: Dataset discovery) + :emphasize-lines: 16-19 + :linenos: + :lineno-match: + +.. TODO: iterative loading not documented pending API changes +.. note:: Depending on the size of your dataset, this can require a lot of + memory; see :ref:`cpp-dataset-filtering-data` below on + filtering/projecting. 
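+
+As mentioned above, instead of crawling a base directory we can also build a
+dataset from an explicit list of file paths. A sketch (the paths are
+hypothetical placeholders, ``filesystem`` is the one created above, and the
+overload of ``FileSystemDatasetFactory::Make`` taking a vector of paths is
+assumed):
+
+.. code-block:: cpp
+
+   std::vector<std::string> paths = {"/tmp/parquet_dataset/data1.parquet",
+                                     "/tmp/parquet_dataset/data2.parquet"};
+   auto format = std::make_shared<ds::ParquetFileFormat>();
+   ds::FileSystemFactoryOptions options;
+   auto factory =
+       ds::FileSystemDatasetFactory::Make(filesystem, paths, format, options)
+           .ValueOrDie();
+   auto dataset = factory->Finish().ValueOrDie();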
+ +Reading different file formats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The above examples use Parquet files on local disk, but the Dataset API +provides a consistent interface across multiple file formats and filesystems. +(See :ref:`cpp-dataset-cloud-storage` for more information on the latter.) +Currently, Parquet, ORC, Feather / Arrow IPC, and CSV file formats are +supported; more formats are planned in the future. + +If we save the table as Feather files instead of Parquet files: + +.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc + :language: cpp + :start-after: (Doc section: Reading different file formats) + :end-before: (Doc section: Reading different file formats) + :linenos: + :lineno-match: + +…then we can read the Feather file by passing an :class:`arrow::dataset::IpcFileFormat`: + +.. code-block:: cpp + + auto format = std::make_shared<ds::IpcFileFormat>(); + // ... + auto factory = ds::FileSystemDatasetFactory::Make(filesystem, selector, format, options) + .ValueOrDie(); + +Customizing file formats +~~~~~~~~~~~~~~~~~~~~~~~~ + +:class:`arrow::dataset::FileFormat` objects have properties that control how +files are read. For example:: + + auto format = std::make_shared<ds::ParquetFileFormat>(); + format->reader_options.dict_columns.insert("a"); + +This will configure column ``"a"`` to be dictionary-encoded when read. Similarly, +setting :member:`arrow::dataset::CsvFileFormat::parse_options` lets us change +things like reading comma-separated or tab-separated data. + +Additionally, passing an :class:`arrow::dataset::FragmentScanOptions` to +:func:`arrow::dataset::ScannerBuilder::FragmentScanOptions` offers fine-grained +control over data scanning. For example, for CSV files, we can change what values +are converted into Boolean true and false at scan time. + +.. _cpp-dataset-filtering-data: + +Filtering data +-------------- + +So far, we've been reading the entire dataset, but if we need only a subset of the +data, this can waste time or memory reading data we don't need. The +:class:`arrow::dataset::Scanner` offers control over what data to read. + +In this snippet, we use :func:`arrow::dataset::ScannerBuilder::Project` to select +which columns to read: + +.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc + :language: cpp + :start-after: (Doc section: Filtering data) + :end-before: (Doc section: Filtering data) + :emphasize-lines: 16 + :linenos: + :lineno-match: + +Some formats, such as Parquet, can reduce I/O costs here by reading only the +specified columns from the filesystem. + +A filter can be provided with :func:`arrow::dataset::ScannerBuilder::Filter`, so +that rows which do not match the filter predicate will not be included in the +returned table. Again, some formats, such as Parquet, can use this filter to +reduce the amount of I/O needed. + +.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc + :language: cpp + :start-after: (Doc section: Filtering data) + :end-before: (Doc section: Filtering data) + :emphasize-lines: 17 + :linenos: + :lineno-match: + +.. TODO Expressions not documented pending renamespacing + +Projecting columns +------------------ + +In addition to selecting columns, :func:`arrow::dataset::ScannerBuilder::Project` +can also be used for more complex projections, such as renaming columns, casting +them to other types, and even deriving new columns based on evaluating +expressions.
+ +In this case, we pass a vector of expressions used to construct column values +and a vector of names for the columns: + +.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc + :language: cpp + :start-after: (Doc section: Projecting columns) + :end-before: (Doc section: Projecting columns) + :emphasize-lines: 18-28 + :linenos: + :lineno-match: + +This also determines the column selection; only the given columns will be +present in the resulting table. If you want to include a derived column in +*addition* to the existing columns, you can build up the expressions from the +dataset schema: + +.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc + :language: cpp + :start-after: (Doc section: Projecting columns #2) + :end-before: (Doc section: Projecting columns #2) + :emphasize-lines: 17-27 + :linenos: + :lineno-match: + +.. note:: When combining filters and projections, Arrow will determine all + necessary columns to read. For instance, if you filter on a column that + isn't ultimately selected, Arrow will still read the column to evaluate + the filter. + +Reading and writing partitioned data +------------------------------------ + +So far, we've been working with datasets consisting of flat directories with +files. Oftentimes, a dataset will have one or more columns that are frequently +filtered on. Instead of having to read and then filter the data, by organizing the +files into a nested directory structure, we can define a partitioned dataset, +where sub-directory names hold information about which subset of the data is +stored in that directory. Then, we can more efficiently filter data by using that +information to avoid loading files that don't match the filter. + +For example, a dataset partitioned by year and month may have the following layout: + +.. code-block:: text + + dataset_name/ + year=2007/ + month=01/ + data0.parquet + data1.parquet + ... + month=02/ + data0.parquet + data1.parquet + ... + month=03/ + ... + year=2008/ + month=01/ + ... + ... + +The above partitioning scheme is using "/key=value/" directory names, as found in +Apache Hive. Under this convention, the file at +``dataset_name/year=2007/month=01/data0.parquet`` contains only data for which +``year == 2007`` and ``month == 01``. + +Let's create a small partitioned dataset. For this, we'll use Arrow's dataset +writing functionality. + +.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc + :language: cpp + :start-after: (Doc section: Reading and writing partitioned data) + :end-before: (Doc section: Reading and writing partitioned data) + :emphasize-lines: 25-42 + :linenos: + :lineno-match: + +The above created a directory with two subdirectories ("part=a" and "part=b"), +and the Parquet files written in those directories no longer include the "part" +column. + +Reading this dataset, we now specify that the dataset should use a Hive-like +partitioning scheme: + +.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc + :language: cpp + :start-after: (Doc section: Reading and writing partitioned data #2) + :end-before: (Doc section: Reading and writing partitioned data #2) + :emphasize-lines: 7,9-11 + :linenos: + :lineno-match: + +Although the partition fields are not included in the actual Parquet files, +they will be added back to the resulting table when scanning this dataset: + +.. 
code-block:: text + + $ ./debug/dataset_documentation_example file:///tmp parquet_hive partitioned + Found fragment: /tmp/parquet_dataset/part=a/part0.parquet + Partition expression: (part == "a") + Found fragment: /tmp/parquet_dataset/part=b/part1.parquet + Partition expression: (part == "b") + Read 20 rows + a: int64 + -- field metadata -- + PARQUET:field_id: '1' + b: double + -- field metadata -- + PARQUET:field_id: '2' + c: int64 + -- field metadata -- + PARQUET:field_id: '3' + part: string + ---- + # snip... + +We can now filter on the partition keys, which avoids loading files +altogether if they do not match the filter: + +.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc + :language: cpp + :start-after: (Doc section: Reading and writing partitioned data #3) + :end-before: (Doc section: Reading and writing partitioned data #3) + :emphasize-lines: 15-18 + :linenos: + :lineno-match: + +Different partitioning schemes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The above example uses a Hive-like directory scheme, such as "/year=2009/month=11/day=15". +We specified this by passing the Hive partitioning factory. In this case, the types of +the partition keys are inferred from the file paths. + +It is also possible to directly construct the partitioning and explicitly define +the schema of the partition keys. For example: + +.. code-block:: cpp + + auto part = std::make_shared<ds::HivePartitioning>(arrow::schema({ + arrow::field("year", arrow::int16()), + arrow::field("month", arrow::int8()), + arrow::field("day", arrow::int32()) + })); + +Arrow supports another partitioning scheme, "directory partitioning", where the +segments in the file path represent the values of the partition keys without +including the name (the field names are implicit in the segment's index). For +example, given field names "year", "month", and "day", one path might be +"/2019/11/15". + +Since the names are not included in the file paths, these must be specified +when constructing a directory partitioning: + +.. code-block:: cpp + + auto part = ds::DirectoryPartitioning::MakeFactory({"year", "month", "day"}); + +Directory partitioning also supports providing a full schema rather than inferring +types from file paths. + +Reading from other data sources +------------------------------- + +Reading in-memory data +~~~~~~~~~~~~~~~~~~~~~~ + +If you already have data in memory that you'd like to use with the Datasets API +(e.g. to filter/project data, or to write it out to a filesystem), you can wrap it +in an :class:`arrow::dataset::InMemoryDataset`: + +.. code-block:: cpp + + auto table = arrow::Table::FromRecordBatches(...); + auto dataset = std::make_shared<arrow::dataset::InMemoryDataset>(std::move(table)); + // Scan the dataset, filter, it, etc. + auto scanner_builder = dataset->NewScan(); + +In the example, we used the InMemoryDataset to write our example data to local +disk which was used in the rest of the example: + +.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc + :language: cpp + :start-after: Reading and writing partitioned data + :end-before: Reading and writing partitioned data + :emphasize-lines: 24-28 + :linenos: + :lineno-match: + +.. _cpp-dataset-cloud-storage: + +Reading from cloud storage +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In addition to local files, Arrow Datasets also support reading from cloud +storage systems, such as Amazon S3, by passing a different filesystem. 
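+
+For example, a rough sketch of discovering the same kind of dataset from an S3
+bucket (the bucket name and region are hypothetical placeholders, Arrow must be
+built with S3 support, and ``format`` and ``options`` are as in the earlier
+examples):
+
+.. code-block:: cpp
+
+   #include "arrow/filesystem/api.h"
+
+   // Initialize S3 support once per process
+   if (!arrow::fs::InitializeS3(arrow::fs::S3GlobalOptions{}).ok()) {
+     // Handle S3 initialization error...
+   }
+
+   auto s3_options = arrow::fs::S3Options::Defaults();
+   s3_options.region = "us-east-1";
+   std::shared_ptr<arrow::fs::FileSystem> filesystem =
+       arrow::fs::S3FileSystem::Make(s3_options).ValueOrDie();
+
+   arrow::fs::FileSelector selector;
+   selector.base_dir = "my-bucket/parquet_dataset";
+   selector.recursive = true;
+
+   auto factory =
+       ds::FileSystemDatasetFactory::Make(filesystem, selector, format, options)
+           .ValueOrDie();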
+ +See the :ref:`filesystem <cpp-filesystems>` docs for more details on the available +filesystems. + +.. _cpp-dataset-full-example: + +Full Example +------------ + +.. literalinclude:: ../../../cpp/examples/arrow/dataset_documentation_example.cc + :language: cpp + :linenos: diff --git a/src/arrow/docs/source/cpp/datatypes.rst b/src/arrow/docs/source/cpp/datatypes.rst new file mode 100644 index 000000000..9149420a4 --- /dev/null +++ b/src/arrow/docs/source/cpp/datatypes.rst @@ -0,0 +1,68 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +Data Types +========== + +.. seealso:: + :doc:`Datatype API reference <api/datatype>`. + +Data types govern how physical data is interpreted. Their :ref:`specification +<format_columnar>` allows binary interoperability between different Arrow +implementations, including from different programming languages and runtimes +(for example it is possible to access the same data, without copying, from +both Python and Java using the :py:mod:`pyarrow.jvm` bridge module). + +Information about a data type in C++ can be represented in three ways: + +1. Using a :class:`arrow::DataType` instance (e.g. as a function argument) +2. Using a :class:`arrow::DataType` concrete subclass (e.g. as a template + parameter) +3. Using a :type:`arrow::Type::type` enum value (e.g. as the condition of + a switch statement) + +The first form (using a :class:`arrow::DataType` instance) is the most idiomatic +and flexible. Runtime-parametric types can only be fully represented with +a DataType instance. For example, a :class:`arrow::TimestampType` needs to be +constructed at runtime with a :type:`arrow::TimeUnit::type` parameter; a +:class:`arrow::Decimal128Type` with *scale* and *precision* parameters; +a :class:`arrow::ListType` with a full child type (itself a +:class:`arrow::DataType` instance). + +The two other forms can be used where performance is critical, in order to +avoid paying the price of dynamic typing and polymorphism. However, some +amount of runtime switching can still be required for parametric types. +It is not possible to reify all possible types at compile time, since Arrow +data types allows arbitrary nesting. 
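+
+As an illustration, a small sketch of dispatching on the :type:`arrow::Type::type`
+enum at runtime (the helper function and its return strings are hypothetical)::
+
+   #include <string>
+
+   #include "arrow/api.h"
+
+   std::string DescribeType(const arrow::DataType& type) {
+     switch (type.id()) {
+       case arrow::Type::INT32:
+         return "32-bit integer";
+       case arrow::Type::TIMESTAMP: {
+         // Parametric types still need the DataType instance for their details
+         const auto& ts_type = static_cast<const arrow::TimestampType&>(type);
+         return "timestamp with unit " +
+                std::to_string(static_cast<int>(ts_type.unit()));
+       }
+       default:
+         return type.ToString();
+     }
+   }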
+ +Creating data types +------------------- + +To instantiate data types, it is recommended to call the provided +:ref:`factory functions <api-type-factories>`:: + + std::shared_ptr<arrow::DataType> type; + + // A 16-bit integer type + type = arrow::int16(); + // A 64-bit timestamp type (with microsecond granularity) + type = arrow::timestamp(arrow::TimeUnit::MICRO); + // A list type of single-precision floating-point values + type = arrow::list(arrow::float32()); diff --git a/src/arrow/docs/source/cpp/examples/cmake_minimal_build.rst b/src/arrow/docs/source/cpp/examples/cmake_minimal_build.rst new file mode 100644 index 000000000..f135de830 --- /dev/null +++ b/src/arrow/docs/source/cpp/examples/cmake_minimal_build.rst @@ -0,0 +1,28 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +Minimal build using CMake +========================== + +The folder ``cpp/examples/minimal_build/`` located inside the source tree +contains a Docker-based example of building and using Arrow from a +third-party project, using CMake. The +`README <https://github.com/apache/arrow/tree/master/cpp/examples/minimal_build/README.md>`_ +file in that folder has more information. diff --git a/src/arrow/docs/source/cpp/examples/compute_and_write_example.rst b/src/arrow/docs/source/cpp/examples/compute_and_write_example.rst new file mode 100644 index 000000000..096b97b83 --- /dev/null +++ b/src/arrow/docs/source/cpp/examples/compute_and_write_example.rst @@ -0,0 +1,28 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +Compute and Write CSV Example +============================= + +The file ``cpp/examples/arrow/compute_and_write_csv_example.cc`` located inside +the source tree contains an example of creating a table of two numerical columns +and then comparing the magnitudes of the entries in the columns and writing out to +a CSV file with the column entries and their comparisons. The code in the example +is documented.
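+
+As a rough, hypothetical sketch of the same idea (not the example file itself;
+the column names ``a`` and ``b``, the ``"greater"`` compute function, and the
+output path are illustrative assumptions)::
+
+   #include "arrow/api.h"
+   #include "arrow/compute/api.h"
+   #include "arrow/csv/api.h"
+   #include "arrow/io/api.h"
+
+   arrow::Status CompareAndWrite(const std::shared_ptr<arrow::Table>& table) {
+     namespace cp = arrow::compute;
+     // Element-wise comparison of the two numeric columns
+     ARROW_ASSIGN_OR_RAISE(arrow::Datum a_gt_b,
+                           cp::CallFunction("greater", {table->GetColumnByName("a"),
+                                                        table->GetColumnByName("b")}));
+     // Append the comparison result as a new Boolean column
+     ARROW_ASSIGN_OR_RAISE(auto augmented,
+                           table->AddColumn(table->num_columns(),
+                                            arrow::field("a_gt_b", arrow::boolean()),
+                                            a_gt_b.chunked_array()));
+     // Write the augmented table out as CSV
+     ARROW_ASSIGN_OR_RAISE(auto output,
+                           arrow::io::FileOutputStream::Open("comparison.csv"));
+     return arrow::csv::WriteCSV(*augmented, arrow::csv::WriteOptions::Defaults(),
+                                 output.get());
+   }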
diff --git a/src/arrow/docs/source/cpp/examples/dataset_documentation_example.rst b/src/arrow/docs/source/cpp/examples/dataset_documentation_example.rst new file mode 100644 index 000000000..2bc993f24 --- /dev/null +++ b/src/arrow/docs/source/cpp/examples/dataset_documentation_example.rst @@ -0,0 +1,27 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +Arrow Datasets example +========================= + +The file ``cpp/examples/arrow/dataset_documentation_example.cc`` +located inside the source tree contains an example of using Arrow +Datasets to read, write, select, and filter data. :doc:`../dataset` +has a full walkthrough of the example. diff --git a/src/arrow/docs/source/cpp/examples/index.rst b/src/arrow/docs/source/cpp/examples/index.rst new file mode 100644 index 000000000..bc5bd497c --- /dev/null +++ b/src/arrow/docs/source/cpp/examples/index.rst @@ -0,0 +1,28 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Examples +======== + +.. toctree:: + :maxdepth: 1 + + cmake_minimal_build + compute_and_write_example + dataset_documentation_example + row_columnar_conversion + std::tuple-like ranges to Arrow <tuple_range_conversion> diff --git a/src/arrow/docs/source/cpp/examples/row_columnar_conversion.rst b/src/arrow/docs/source/cpp/examples/row_columnar_conversion.rst new file mode 100644 index 000000000..3f45864c2 --- /dev/null +++ b/src/arrow/docs/source/cpp/examples/row_columnar_conversion.rst @@ -0,0 +1,27 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. 
software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +Row to columnar conversion +========================== + +The following example converts an array of structs to a :class:`arrow::Table` +instance, and then converts it back to the original array of structs. + +.. literalinclude:: ../../../../cpp/examples/arrow/row_wise_conversion_example.cc diff --git a/src/arrow/docs/source/cpp/examples/tuple_range_conversion.rst b/src/arrow/docs/source/cpp/examples/tuple_range_conversion.rst new file mode 100644 index 000000000..64ba23782 --- /dev/null +++ b/src/arrow/docs/source/cpp/examples/tuple_range_conversion.rst @@ -0,0 +1,106 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +Conversion of range of ``std::tuple``-like to ``Table`` instances +================================================================= + +While the above example shows a quite manual approach of a row to columnar +conversion, Arrow also provides some template logic to convert ranges of +``std::tuple<..>``-like objects to tables. + +In the most simple case, you only need to provide the input data and the +type conversion is then inferred at compile time. + +.. code:: + + std::vector<std::tuple<double, std::string>> rows = .. + std::shared_ptr<Table> table; + + if (!arrow::stl::TableFromTupleRange( + arrow::default_memory_pool(), + rows, names, &table).ok() + ) { + // Error handling code should go here. + } + +In reverse, you can use ``TupleRangeFromTable`` to fill an already +pre-allocated range with the data from a ``Table`` instance. + +.. code:: + + // An important aspect here is that the table columns need to be in the + // same order as the columns will later appear in the tuple. As the tuple + // is unnamed, matching is done on positions. + std::shared_ptr<Table> table = .. + + // The range needs to be pre-allocated to the respective amount of rows. + // This allows us to pass in an arbitrary range object, not only + // `std::vector`. + std::vector<std::tuple<double, std::string>> rows(2); + if (!arrow::stl::TupleRangeFromTable(*table, &rows).ok()) { + // Error handling code should go here. + } + +Arrow itself already supports some C(++) data types for this conversion. If you +want to support additional data types, you need to implement a specialization +of ``arrow::stl::ConversionTraits<T>`` and the more general +``arrow::CTypeTraits<T>``. + + +.. 
code:: + + namespace arrow { + + template<> + struct CTypeTraits<boost::posix_time::ptime> { + using ArrowType = ::arrow::TimestampType; + + static std::shared_ptr<::arrow::DataType> type_singleton() { + return ::arrow::timestamp(::arrow::TimeUnit::MICRO); + } + }; + + } + + namespace arrow { namespace stl { + + template <> + struct ConversionTraits<boost::posix_time::ptime> : public CTypeTraits<boost::posix_time::ptime> { + constexpr static bool nullable = false; + + // This is the specialization to load a scalar value into an Arrow builder. + static Status AppendRow( + typename TypeTraits<TimestampType>::BuilderType& builder, + boost::posix_time::ptime cell) { + boost::posix_time::ptime const epoch({1970, 1, 1}, {0, 0, 0, 0}); + return builder.Append((cell - epoch).total_microseconds()); + } + + // Specify how we can fill the tuple from the values stored in the Arrow + // array. + static boost::posix_time::ptime GetEntry( + const TimestampArray& array, size_t j) { + return psapp::arrow::internal::timestamp_epoch + + boost::posix_time::time_duration(0, 0, 0, array.Value(j)); + } + }; + + }} + diff --git a/src/arrow/docs/source/cpp/flight.rst b/src/arrow/docs/source/cpp/flight.rst new file mode 100644 index 000000000..c1d2e43b9 --- /dev/null +++ b/src/arrow/docs/source/cpp/flight.rst @@ -0,0 +1,119 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +================ +Arrow Flight RPC +================ + +Arrow Flight is an RPC framework for efficient transfer of Flight data +over the network. See :doc:`../format/Flight` for full details on +the protocol, or :doc:`./api/flight` for API docs. + +Writing a Flight Service +======================== + +Servers are subclasses of :class:`arrow::flight::FlightServerBase`. To +implement individual RPCs, override the RPC methods on this class. + +.. code-block:: cpp + + class MyFlightServer : public FlightServerBase { + Status ListFlights(const ServerCallContext& context, const Criteria* criteria, + std::unique_ptr<FlightListing>* listings) override { + std::vector<FlightInfo> flights = ...; + *listings = std::unique_ptr<FlightListing>(new SimpleFlightListing(flights)); + return Status::OK(); + } + }; + +Each RPC method always takes a +:class:`arrow::flight::ServerCallContext` for common parameters and +returns a :class:`arrow::Status` to indicate success or +failure. Flight-specific error codes can be returned via +:func:`arrow::flight::MakeFlightError`. + +RPC methods that return a value in addition to a status will use an +out parameter, as shown above. Often, there are helper classes +providing basic implementations of these out parameters. 
For instance, +above, :class:`arrow::flight::SimpleFlightListing` uses a vector of +:class:`arrow::flight::FlightInfo` objects as the result of a +``ListFlights`` RPC. + +To start a server, create a :class:`arrow::flight::Location` to +specify where to listen, and call +:func:`arrow::flight::FlightServerBase::Init`. This will start the +server, but won't block the rest of the program. Use +:func:`arrow::flight::FlightServerBase::SetShutdownOnSignals` to +enable stopping the server if an interrupt signal is received, then +call :func:`arrow::flight::FlightServerBase::Serve` to block until the +server stops. + +.. code-block:: cpp + + std::unique_ptr<arrow::flight::FlightServerBase> server; + // Initialize server + arrow::flight::Location location; + // Listen to all interfaces on a free port + ARROW_CHECK_OK(arrow::flight::Location::ForGrpcTcp("0.0.0.0", 0, &location)); + arrow::flight::FlightServerOptions options(location); + + // Start the server + ARROW_CHECK_OK(server->Init(options)); + // Exit with a clean error code (0) on SIGTERM + ARROW_CHECK_OK(server->SetShutdownOnSignals({SIGTERM})); + + std::cout << "Server listening on localhost:" << server->port() << std::endl; + ARROW_CHECK_OK(server->Serve()); + + +Enabling TLS and Authentication +------------------------------- + +TLS can be enabled by providing a certificate and key pair to +:func:`FlightServerBase::Init +<arrow::flight::FlightServerBase::Init>`. Additionally, use +:func:`Location::ForGrpcTls <arrow::flight::Location::ForGrpcTls>` to +construct the :class:`arrow::flight::Location` to listen on. + +Similarly, authentication can be enabled by providing an +implementation of :class:`ServerAuthHandler +<arrow::flight::ServerAuthHandler>`. Authentication consists of two +parts: on initial client connection, the server and client +authentication implementations can perform any negotiation needed; +then, on each RPC thereafter, the client provides a token. The server +authentication handler validates the token and provides the identity +of the client. This identity can be obtained from the +:class:`arrow::flight::ServerCallContext`. + +Using the Flight Client +======================= + +To connect to a Flight service, create an instance of +:class:`arrow::flight::FlightClient` by calling :func:`Connect +<arrow::flight::FlightClient::Connect>`. This takes a Location and +returns the client through an out parameter. To authenticate, call +:func:`Authenticate <arrow::flight::FlightClient::Authenticate>` with +the desired client authentication implementation. + +Each RPC method returns :class:`arrow::Status` to indicate the +success/failure of the request. Any other return values are specified +through out parameters. They also take an optional :class:`options +<arrow::flight::FlightCallOptions>` parameter that allows specifying a +timeout for the call. diff --git a/src/arrow/docs/source/cpp/getting_started.rst b/src/arrow/docs/source/cpp/getting_started.rst new file mode 100644 index 000000000..36ea4803f --- /dev/null +++ b/src/arrow/docs/source/cpp/getting_started.rst @@ -0,0 +1,41 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. 
http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +User Guide +========== + +.. toctree:: + + overview + conventions + build_system + memory + arrays + datatypes + tables + compute + streaming_execution + io + ipc + parquet + csv + json + dataset + flight diff --git a/src/arrow/docs/source/cpp/index.rst b/src/arrow/docs/source/cpp/index.rst new file mode 100644 index 000000000..b3f6e4c82 --- /dev/null +++ b/src/arrow/docs/source/cpp/index.rst @@ -0,0 +1,32 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +C++ Implementation +================== + +.. toctree:: + :maxdepth: 2 + + getting_started + Examples <examples/index> + api + +.. TODO add "topics" chapter +.. - nested arrays +.. - dictionary encoding + +.. TODO add "building" or "development" chapter diff --git a/src/arrow/docs/source/cpp/io.rst b/src/arrow/docs/source/cpp/io.rst new file mode 100644 index 000000000..6e1d261c0 --- /dev/null +++ b/src/arrow/docs/source/cpp/io.rst @@ -0,0 +1,87 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp +.. cpp:namespace:: arrow::io + +============================== +Input / output and filesystems +============================== + +Arrow provides a range of C++ interfaces abstracting the concrete details +of input / output operations. They operate on streams of untyped binary data. +Those abstractions are used for various purposes such as reading CSV or +Parquet data, transmitting IPC streams, and more. + +.. seealso:: + :doc:`API reference for input/output facilities <api/io>`. 
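+
+As a brief illustration of these abstractions, the sketch below opens a local
+file and reads from it both sequentially and at an arbitrary offset. This is a
+minimal, hypothetical example (the file name ``data.bin`` and the helper
+function are made up for illustration)::
+
+   #include "arrow/buffer.h"
+   #include "arrow/io/file.h"
+   #include "arrow/result.h"
+   #include "arrow/status.h"
+
+   arrow::Status ReadSomeBytes() {
+     // Open the file as a RandomAccessFile (which is also an InputStream).
+     ARROW_ASSIGN_OR_RAISE(auto file, arrow::io::ReadableFile::Open("data.bin"));
+
+     // Sequential read: up to 64 bytes are returned in a freshly allocated Buffer.
+     ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Buffer> head, file->Read(64));
+
+     // Random access read: 16 bytes starting at offset 128; ReadAt calls
+     // can safely be issued from multiple threads.
+     ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Buffer> chunk, file->ReadAt(128, 16));
+
+     return file->Close();
+   }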
+ +Reading binary data +=================== + +Interfaces for reading binary data come in two flavours: + +* Sequential reading: the :class:`InputStream` interface provides + ``Read`` methods; it is recommended to ``Read`` to a ``Buffer`` as it + may in some cases avoid a memory copy. + +* Random access reading: the :class:`RandomAccessFile` interface + provides additional facilities for positioning and, most importantly, + the ``ReadAt`` methods which allow parallel reading from multiple threads. + +Concrete implementations are available for :class:`in-memory reads <BufferReader>`, +:class:`unbuffered file reads <ReadableFile>`, +:class:`memory-mapped file reads <MemoryMappedFile>`, +:class:`buffered reads <BufferedInputStream>`, +:class:`compressed reads <CompressedInputStream>`. + +Writing binary data +=================== + +Writing binary data is mostly done through the :class:`OutputStream` +interface. + +Concrete implementations are available for :class:`in-memory writes <BufferOutputStream>`, +:class:`unbuffered file writes <FileOutputStream>`, +:class:`memory-mapped file writes <MemoryMappedFile>`, +:class:`buffered writes <BufferedOutputStream>`, +:class:`compressed writes <CompressedOutputStream>`. + +.. cpp:namespace:: arrow::fs + +.. _cpp-filesystems: + +Filesystems +=========== + +The :class:`filesystem interface <FileSystem>` allows abstracted access over +various data storage backends such as the local filesystem or a S3 bucket. +It provides input and output streams as well as directory operations. + +The filesystem interface exposes a simplified view of the underlying data +storage. Data paths are represented as *abstract paths*, which are +``/``-separated, even on Windows, and shouldn't include special path +components such as ``.`` and ``..``. Symbolic links, if supported by the +underlying storage, are automatically dereferenced. Only basic +:class:`metadata <FileStats>` about file entries, such as the file size +and modification time, is made available. + +Concrete implementations are available for +:class:`local filesystem access <LocalFileSystem>`, +:class:`HDFS <HadoopFileSystem>` and +:class:`Amazon S3-compatible storage <S3FileSystem>`. diff --git a/src/arrow/docs/source/cpp/ipc.rst b/src/arrow/docs/source/cpp/ipc.rst new file mode 100644 index 000000000..ce4175bca --- /dev/null +++ b/src/arrow/docs/source/cpp/ipc.rst @@ -0,0 +1,75 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp +.. cpp:namespace:: arrow::ipc + +======================================== +Reading and writing the Arrow IPC format +======================================== + +.. seealso:: + :ref:`Arrow IPC format specification <format-ipc>`. + + :doc:`API reference for IPC readers and writers <api/ipc>`. 
+ +Arrow C++ provides readers and writers for the Arrow IPC format which wrap +lower level input/output, handled through the :doc:`IO interfaces <io>`. +For reading, there is also an event-driven API that enables feeding +arbitrary data into the IPC decoding layer asynchronously. + +Reading IPC streams and files +============================= + +Synchronous reading +------------------- + +For most cases, it is most convenient to use the :class:`RecordBatchStreamReader` +or :class:`RecordBatchFileReader` class, depending on which variant of the IPC +format you want to read. The former requires a :class:`~arrow::io::InputStream` +source, while the latter requires a :class:`~arrow::io::RandomAccessFile`. + +Reading Arrow IPC data is inherently zero-copy if the source allows it. +For example, a :class:`~arrow::io::BufferReader` or :class:`~arrow::io::MemoryMappedFile` +can typically be zero-copy. Exceptions are when the data must be transformed +on the fly, e.g. when buffer compression has been enabled on the IPC stream +or file. + +Event-driven reading +-------------------- + +When it is necessary to process the IPC format without blocking (for example +to integrate Arrow with an event loop), or if data is coming from an unusual +source, use the event-driven :class:`StreamDecoder`. You will need to define +a subclass of :class:`Listener` and implement the virtual methods for the +desired events (for example, implement :func:`Listener::OnRecordBatchDecoded` +to be notified of each incoming :class:`RecordBatch`). + +Writing IPC streams and files +============================= + +Use one of the factory functions, :func:`MakeStreamWriter` or +:func:`MakeFileWriter`, to obtain a :class:`RecordBatchWriter` instance for +the given IPC format variant. + +Configuring +=========== + +Various aspects of reading and writing the IPC format can be configured +using the :class:`IpcReadOptions` and :class:`IpcWriteOptions` classes, +respectively. diff --git a/src/arrow/docs/source/cpp/json.rst b/src/arrow/docs/source/cpp/json.rst new file mode 100644 index 000000000..cdb742e6c --- /dev/null +++ b/src/arrow/docs/source/cpp/json.rst @@ -0,0 +1,128 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +.. cpp:namespace:: arrow::json + +================== +Reading JSON files +================== + +Arrow allows reading line-separated JSON files as Arrow tables. Each +independent JSON object in the input file is converted to a row in +the target Arrow table. + +.. seealso:: + :ref:`JSON reader API reference <cpp-api-json>`. + +Basic usage +=========== + +A JSON file is read from a :class:`~arrow::io::InputStream`. + +.. code-block:: cpp + + #include "arrow/json/api.h" + + { + // ... 
+ arrow::Status st; + arrow::MemoryPool* pool = default_memory_pool(); + std::shared_ptr<arrow::io::InputStream> input = ...; + + auto read_options = arrow::json::ReadOptions::Defaults(); + auto parse_options = arrow::json::ParseOptions::Defaults(); + + // Instantiate TableReader from input stream and options + std::shared_ptr<arrow::json::TableReader> reader; + st = arrow::json::TableReader::Make(pool, input, read_options, + parse_options, &reader); + if (!st.ok()) { + // Handle TableReader instantiation error... + } + + std::shared_ptr<arrow::Table> table; + // Read table from JSON file + st = reader->Read(&table); + if (!st.ok()) { + // Handle JSON read error + // (for example a JSON syntax error or failed type conversion) + } + } + +Data types +========== + +Since JSON values are typed, the possible Arrow data types on output +depend on the input value types. Top-level JSON values should always be +objects. The fields of top-level objects are taken to represent columns +in the Arrow data. For each name/value pair in a JSON object, there are +two possible modes of deciding the output data type: + +* if the name is in :class:`ConvertOptions::explicit_schema`, + conversion of the JSON value to the corresponding Arrow data type is + attempted; + +* otherwise, the Arrow data type is determined via type inference on + the JSON value, trying out a number of Arrow data types in order. + +The following tables show the possible combinations for each of those +two modes. + +.. table:: Explicit conversions from JSON to Arrow + :align: center + + +-----------------+----------------------------------------------------+ + | JSON value type | Allowed Arrow data types | + +=================+====================================================+ + | Null | Any (including Null) | + +-----------------+----------------------------------------------------+ + | Number | All Integer types, Float32, Float64, | + | | Date32, Date64, Time32, Time64 | + +-----------------+----------------------------------------------------+ + | Boolean | Boolean | + +-----------------+----------------------------------------------------+ + | String | Binary, LargeBinary, String, LargeString, | + | | Timestamp | + +-----------------+----------------------------------------------------+ + | Array | List | + +-----------------+----------------------------------------------------+ + | Object (nested) | Struct | + +-----------------+----------------------------------------------------+ + +.. 
table:: Implicit type inference from JSON to Arrow + :align: center + + +-----------------+----------------------------------------------------+ + | JSON value type | Inferred Arrow data types (in order) | + +=================+====================================================+ + | Null | Null, any other | + +-----------------+----------------------------------------------------+ + | Number | Int64, Float64 | + | | | + +-----------------+----------------------------------------------------+ + | Boolean | Boolean | + +-----------------+----------------------------------------------------+ + | String | Timestamp (with seconds unit), String | + | | | + +-----------------+----------------------------------------------------+ + | Array | List | + +-----------------+----------------------------------------------------+ + | Object (nested) | Struct | + +-----------------+----------------------------------------------------+ diff --git a/src/arrow/docs/source/cpp/memory.rst b/src/arrow/docs/source/cpp/memory.rst new file mode 100644 index 000000000..ff8ffb044 --- /dev/null +++ b/src/arrow/docs/source/cpp/memory.rst @@ -0,0 +1,203 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp + +.. _cpp_memory_management: + +================= +Memory Management +================= + +.. seealso:: + :doc:`Memory management API reference <api/memory>` + +Buffers +======= + +To avoid passing around raw data pointers with varying and non-obvious +lifetime rules, Arrow provides a generic abstraction called :class:`arrow::Buffer`. +A Buffer encapsulates a pointer and data size, and generally also ties its +lifetime to that of an underlying provider (in other words, a Buffer should +*always* point to valid memory till its destruction). Buffers are untyped: +they simply denote a physical memory area regardless of its intended meaning +or interpretation. + +Buffers may be allocated by Arrow itself , or by third-party routines. +For example, it is possible to pass the data of a Python bytestring as a Arrow +buffer, keeping the Python object alive as necessary. + +In addition, buffers come in various flavours: mutable or not, resizable or +not. Generally, you will hold a mutable buffer when building up a piece +of data, then it will be frozen as an immutable container such as an +:doc:`array <arrays>`. + +.. note:: + Some buffers may point to non-CPU memory, such as GPU-backed memory + provided by a CUDA context. If you're writing a GPU-aware application, + you will need to be careful not to interpret a GPU memory pointer as + a CPU-reachable pointer, or vice-versa. 
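+
+As a brief sketch of the above (the variable names are illustrative), existing
+memory can be wrapped in a non-owning Buffer, in which case the caller must
+keep the wrapped data alive for as long as the buffer is in use::
+
+   #include <vector>
+
+   #include "arrow/buffer.h"
+
+   std::vector<int32_t> values = {1, 2, 3, 4};
+
+   // Wrap the vector's data: no copy is made and the buffer does not
+   // own the memory.
+   std::shared_ptr<arrow::Buffer> view = arrow::Buffer::Wrap(values);
+
+   // Alternatively, create an owning buffer from a std::string.
+   std::shared_ptr<arrow::Buffer> owned = arrow::Buffer::FromString("hello world");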
+ +Accessing Buffer Memory +----------------------- + +Buffers provide fast access to the underlying memory using the +:func:`~arrow::Buffer::size` and :func:`~arrow::Buffer::data` accessors +(or :func:`~arrow::Buffer::mutable_data` for writable access to a mutable +buffer). + +Slicing +------- + +It is possible to make zero-copy slices of buffers, to obtain a buffer +referring to some contiguous subset of the underlying data. This is done +by calling the :func:`arrow::SliceBuffer` and :func:`arrow::SliceMutableBuffer` +functions. + +Allocating a Buffer +------------------- + +You can allocate a buffer yourself by calling one of the +:func:`arrow::AllocateBuffer` or :func:`arrow::AllocateResizableBuffer` +overloads:: + + arrow::Result<std::unique_ptr<Buffer>> maybe_buffer = arrow::AllocateBuffer(4096); + if (!maybe_buffer.ok()) { + // ... handle allocation error + } + + std::shared_ptr<arrow::Buffer> buffer = *std::move(maybe_buffer); + uint8_t* buffer_data = buffer->mutable_data(); + memcpy(buffer_data, "hello world", 11); + +Allocating a buffer this way ensures it is 64-bytes aligned and padded +as recommended by the :doc:`Arrow memory specification <../format/Layout>`. + +Building a Buffer +----------------- + +You can also allocate *and* build a Buffer incrementally, using the +:class:`arrow::BufferBuilder` API:: + + BufferBuilder builder; + builder.Resize(11); // reserve enough space for 11 bytes + builder.Append("hello ", 6); + builder.Append("world", 5); + + auto maybe_buffer = builder.Finish(); + if (!maybe_buffer.ok()) { + // ... handle buffer allocation error + } + std::shared_ptr<arrow::Buffer> buffer = *maybe_buffer; + +If a Buffer is meant to contain values of a given fixed-width type (for +example the 32-bit offsets of a List array), it can be more convenient to +use the template :class:`arrow::TypedBufferBuilder` API:: + + TypedBufferBuilder<int32_t> builder; + builder.Reserve(2); // reserve enough space for two int32_t values + builder.Append(0x12345678); + builder.Append(-0x765643210); + + auto maybe_buffer = builder.Finish(); + if (!maybe_buffer.ok()) { + // ... handle buffer allocation error + } + std::shared_ptr<arrow::Buffer> buffer = *maybe_buffer; + +Memory Pools +============ + +When allocating a Buffer using the Arrow C++ API, the buffer's underlying +memory is allocated by a :class:`arrow::MemoryPool` instance. Usually this +will be the process-wide *default memory pool*, but many Arrow APIs allow +you to pass another MemoryPool instance for their internal allocations. + +Memory pools are used for large long-lived data such as array buffers. +Other data, such as small C++ objects and temporary workspaces, usually +goes through the regular C++ allocators. + +Default Memory Pool +------------------- + +The default memory pool depends on how Arrow C++ was compiled: + +- if enabled at compile time, a `jemalloc <http://jemalloc.net/>`_ heap; +- otherwise, if enabled at compile time, a + `mimalloc <https://github.com/microsoft/mimalloc>`_ heap; +- otherwise, the C library ``malloc`` heap. + +Overriding the Default Memory Pool +---------------------------------- + +One can override the above selection algorithm by setting the +``ARROW_DEFAULT_MEMORY_POOL`` environment variable to one of the following +values: ``jemalloc``, ``mimalloc`` or ``system``. This variable is inspected +once when Arrow C++ is loaded in memory (for example when the Arrow C++ DLL +is loaded). 
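+
+Whichever pool ends up being selected, its usage can be inspected at runtime.
+The following is a small sketch that allocates from the default pool and prints
+basic statistics (the exact numbers depend on the backend in use)::
+
+   #include <iostream>
+
+   #include "arrow/buffer.h"
+   #include "arrow/memory_pool.h"
+
+   arrow::MemoryPool* pool = arrow::default_memory_pool();
+   std::cout << "backend: " << pool->backend_name() << std::endl;
+
+   // Most allocation APIs accept an explicit pool argument.
+   auto maybe_buffer = arrow::AllocateBuffer(4096, pool);
+
+   std::cout << "allocated: " << pool->bytes_allocated() << " bytes, "
+             << "peak: " << pool->max_memory() << " bytes" << std::endl;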
+
+STL Integration
+---------------
+
+If you wish to use an Arrow memory pool to allocate the data of STL containers,
+you can do so using the :class:`arrow::stl::allocator` wrapper.
+
+Conversely, you can also use an STL allocator to allocate Arrow memory,
+using the :class:`arrow::stl::STLMemoryPool` class. However, this may be less
+performant, as STL allocators don't provide a resizing operation.
+
+Devices
+=======
+
+Many Arrow applications only access host (CPU) memory. However, in some cases
+it is desirable to handle on-device memory (such as on-board memory on a GPU)
+as well as host memory.
+
+Arrow represents the CPU and other devices using the
+:class:`arrow::Device` abstraction. The associated class
+:class:`arrow::MemoryManager` specifies how to allocate memory on a given
+device. Each device has a default memory manager, but additional instances
+may be constructed (for example, wrapping a custom
+:class:`arrow::MemoryPool` on the CPU).
+
+Device-Agnostic Programming
+---------------------------
+
+If you receive a Buffer from third-party code, you can query whether it is
+CPU-readable by calling its :func:`~arrow::Buffer::is_cpu` method.
+
+You can also view the Buffer on a given device, in a generic way, by calling
+:func:`arrow::Buffer::View` or :func:`arrow::Buffer::ViewOrCopy`. This will
+be a no-op if the source and destination devices are identical.
+Otherwise, a device-dependent mechanism will attempt to construct a memory
+address for the destination device that gives access to the buffer contents.
+Actual device-to-device transfer may happen lazily, when reading the buffer
+contents.
+
+Similarly, if you want to do I/O on a buffer without assuming a CPU-readable
+buffer, you can call :func:`arrow::Buffer::GetReader` and
+:func:`arrow::Buffer::GetWriter`.
+
+For example, to get an on-CPU view or copy of an arbitrary buffer, you can
+simply do::
+
+   std::shared_ptr<arrow::Buffer> arbitrary_buffer = ... ;
+   std::shared_ptr<arrow::Buffer> cpu_buffer = arrow::Buffer::ViewOrCopy(
+       arbitrary_buffer, arrow::default_cpu_memory_manager());
diff --git a/src/arrow/docs/source/cpp/overview.rst b/src/arrow/docs/source/cpp/overview.rst
new file mode 100644
index 000000000..ccebdba45
--- /dev/null
+++ b/src/arrow/docs/source/cpp/overview.rst
@@ -0,0 +1,97 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+High-Level Overview
+===================
+
+The Arrow C++ library is made up of several parts, each of which serves
+a specific purpose.
+
+The physical layer
+------------------
+
+**Memory management** abstractions provide a uniform API over memory that
+may be allocated through various means, such as heap allocation, the memory
+mapping of a file or a static memory area. In particular, the **buffer**
+abstraction represents a contiguous area of physical data.
+
+The one-dimensional layer
+-------------------------
+
+**Data types** govern the *logical* interpretation of *physical* data.
+Many operations in Arrow are parameterized, at compile time or at runtime,
+by a data type.
+
+**Arrays** assemble one or several buffers with a data type, allowing them
+to be viewed as a logical contiguous sequence of values (possibly nested).
+
+**Chunked arrays** are a generalization of arrays, combining several
+same-type arrays into a longer logical sequence of values.
+
+The two-dimensional layer
+-------------------------
+
+**Schemas** describe a logical collection of several pieces of data,
+each with a distinct name and type, and optional metadata.
+
+**Tables** are collections of chunked arrays conforming to a schema. They
+are the most capable dataset-providing abstraction in Arrow.
+
+**Record batches** are collections of contiguous arrays, described
+by a schema. They allow incremental construction or serialization of tables.
+
+The compute layer
+-----------------
+
+**Datums** are flexible dataset references, able to hold, for example, an
+array or table reference.
+
+**Kernels** are specialized computation functions running in a loop over a
+given set of datums representing input and output parameters to the functions.
+
+The IO layer
+------------
+
+**Streams** allow untyped sequential or seekable access over external data
+of various kinds (for example compressed or memory-mapped).
+
+The Inter-Process Communication (IPC) layer
+-------------------------------------------
+
+A **messaging format** allows interchange of Arrow data between processes, using
+as few copies as possible.
+
+The file formats layer
+----------------------
+
+Reading and writing Arrow data from/to various file formats is possible, for
+example **Parquet**, **CSV**, **ORC** or the Arrow-specific **Feather** format.
+
+The devices layer
+-----------------
+
+Basic **CUDA** integration is provided, allowing Arrow data backed by
+GPU-allocated memory to be described.
+
+The filesystem layer
+--------------------
+
+A filesystem abstraction allows reading and writing data from different storage
+backends, such as the local filesystem or an S3 bucket.
diff --git a/src/arrow/docs/source/cpp/parquet.rst b/src/arrow/docs/source/cpp/parquet.rst
new file mode 100644
index 000000000..88ea4e5b6
--- /dev/null
+++ b/src/arrow/docs/source/cpp/parquet.rst
@@ -0,0 +1,432 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+..
default-domain:: cpp +.. highlight:: cpp + +.. cpp:namespace:: parquet + +================================= +Reading and writing Parquet files +================================= + +.. seealso:: + :ref:`Parquet reader and writer API reference <cpp-api-parquet>`. + +The `Parquet format <https://parquet.apache.org/documentation/latest/>`__ +is a space-efficient columnar storage format for complex data. The Parquet +C++ implementation is part of the Apache Arrow project and benefits +from tight integration with the Arrow C++ classes and facilities. + +Supported Parquet features +========================== + +The Parquet format has many features, and Parquet C++ supports a subset of them. + +Page types +---------- + ++-------------------+---------+ +| Page type | Notes | ++===================+=========+ +| DATA_PAGE | | ++-------------------+---------+ +| DATA_PAGE_V2 | | ++-------------------+---------+ +| DICTIONARY_PAGE | | ++-------------------+---------+ + +*Unsupported page type:* INDEX_PAGE. When reading a Parquet file, pages of +this type are ignored. + +Compression +----------- + ++-------------------+---------+ +| Compression codec | Notes | ++===================+=========+ +| SNAPPY | | ++-------------------+---------+ +| GZIP | | ++-------------------+---------+ +| BROTLI | | ++-------------------+---------+ +| LZ4 | \(1) | ++-------------------+---------+ +| ZSTD | | ++-------------------+---------+ + +* \(1) On the read side, Parquet C++ is able to decompress both the regular + LZ4 block format and the ad-hoc Hadoop LZ4 format used by the + `reference Parquet implementation <https://github.com/apache/parquet-mr>`__. + On the write side, Parquet C++ always generates the ad-hoc Hadoop LZ4 format. + +*Unsupported compression codec:* LZO. + +Encodings +--------- + ++--------------------------+---------+ +| Encoding | Notes | ++==========================+=========+ +| PLAIN | | ++--------------------------+---------+ +| PLAIN_DICTIONARY | | ++--------------------------+---------+ +| BIT_PACKED | | ++--------------------------+---------+ +| RLE | \(1) | ++--------------------------+---------+ +| RLE_DICTIONARY | \(2) | ++--------------------------+---------+ +| BYTE_STREAM_SPLIT | | ++--------------------------+---------+ + +* \(1) Only supported for encoding definition and repetition levels, not values. + +* \(2) On the write path, RLE_DICTIONARY is only enabled if Parquet format version + 2.4 or greater is selected in :func:`WriterProperties::version`. + +*Unsupported encodings:* DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY, +DELTA_BYTE_ARRAY. 
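+
+The compression codec and the encodings used on the write path are chosen
+through :class:`WriterProperties`. The following is a brief sketch; the
+specific choices of ZSTD compression and format version 2.4, as well as the
+exact enumerator names, are only illustrative:
+
+.. code-block:: cpp
+
+   #include "parquet/properties.h"
+
+   parquet::WriterProperties::Builder builder;
+   builder.compression(parquet::Compression::ZSTD)
+       ->version(parquet::ParquetVersion::PARQUET_2_4)  // enables RLE_DICTIONARY
+       ->enable_dictionary();
+   std::shared_ptr<parquet::WriterProperties> props = builder.build();
+
+The resulting properties object is then passed to the writer APIs described
+below.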
+ +Types +----- + +Physical types +~~~~~~~~~~~~~~ + ++--------------------------+-------------------------+------------+ +| Physical type | Mapped Arrow type | Notes | ++==========================+=========================+============+ +| BOOLEAN | Boolean | | ++--------------------------+-------------------------+------------+ +| INT32 | Int32 / other | \(1) | ++--------------------------+-------------------------+------------+ +| INT64 | Int64 / other | \(1) | ++--------------------------+-------------------------+------------+ +| INT96 | Timestamp (nanoseconds) | \(2) | ++--------------------------+-------------------------+------------+ +| FLOAT | Float32 | | ++--------------------------+-------------------------+------------+ +| DOUBLE | Float64 | | ++--------------------------+-------------------------+------------+ +| BYTE_ARRAY | Binary / other | \(1) \(3) | ++--------------------------+-------------------------+------------+ +| FIXED_LENGTH_BYTE_ARRAY | FixedSizeBinary / other | \(1) | ++--------------------------+-------------------------+------------+ + +* \(1) Can be mapped to other Arrow types, depending on the logical type + (see below). + +* \(2) On the write side, :func:`ArrowWriterProperties::support_deprecated_int96_timestamps` + must be enabled. + +* \(3) On the write side, an Arrow LargeBinary can also mapped to BYTE_ARRAY. + +Logical types +~~~~~~~~~~~~~ + +Specific logical types can override the default Arrow type mapping for a given +physical type. + ++-------------------+-----------------------------+----------------------------+---------+ +| Logical type | Physical type | Mapped Arrow type | Notes | ++===================+=============================+============================+=========+ +| NULL | Any | Null | \(1) | ++-------------------+-----------------------------+----------------------------+---------+ +| INT | INT32 | Int8 / UInt8 / Int16 / | | +| | | UInt16 / Int32 / UInt32 | | ++-------------------+-----------------------------+----------------------------+---------+ +| INT | INT64 | Int64 / UInt64 | | ++-------------------+-----------------------------+----------------------------+---------+ +| DECIMAL | INT32 / INT64 / BYTE_ARRAY | Decimal128 / Decimal256 | \(2) | +| | / FIXED_LENGTH_BYTE_ARRAY | | | ++-------------------+-----------------------------+----------------------------+---------+ +| DATE | INT32 | Date32 | \(3) | ++-------------------+-----------------------------+----------------------------+---------+ +| TIME | INT32 | Time32 (milliseconds) | | ++-------------------+-----------------------------+----------------------------+---------+ +| TIME | INT64 | Time64 (micro- or | | +| | | nanoseconds) | | ++-------------------+-----------------------------+----------------------------+---------+ +| TIMESTAMP | INT64 | Timestamp (milli-, micro- | | +| | | or nanoseconds) | | ++-------------------+-----------------------------+----------------------------+---------+ +| STRING | BYTE_ARRAY | Utf8 | \(4) | ++-------------------+-----------------------------+----------------------------+---------+ +| LIST | Any | List | \(5) | ++-------------------+-----------------------------+----------------------------+---------+ +| MAP | Any | Map | \(6) | ++-------------------+-----------------------------+----------------------------+---------+ + +* \(1) On the write side, the Parquet physical type INT32 is generated. + +* \(2) On the write side, a FIXED_LENGTH_BYTE_ARRAY is always emitted. 
+ +* \(3) On the write side, an Arrow Date64 is also mapped to a Parquet DATE INT32. + +* \(4) On the write side, an Arrow LargeUtf8 is also mapped to a Parquet STRING. + +* \(5) On the write side, an Arrow LargeList or FixedSizedList is also mapped to + a Parquet LIST. + +* \(6) On the read side, a key with multiple values does not get deduplicated, + in contradiction with the + `Parquet specification <https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps>`__. + +*Unsupported logical types:* JSON, BSON, UUID. If such a type is encountered +when reading a Parquet file, the default physical type mapping is used (for +example, a Parquet JSON column may be read as Arrow Binary or FixedSizeBinary). + +Converted types +~~~~~~~~~~~~~~~ + +While converted types are deprecated in the Parquet format (they are superceded +by logical types), they are recognized and emitted by the Parquet C++ +implementation so as to maximize compatibility with other Parquet +implementations. + +Special cases +~~~~~~~~~~~~~ + +An Arrow Extension type is written out as its storage type. It can still +be recreated at read time using Parquet metadata (see "Roundtripping Arrow +types" below). + +An Arrow Dictionary type is written out as its value type. It can still +be recreated at read time using Parquet metadata (see "Roundtripping Arrow +types" below). + +Roundtripping Arrow types +~~~~~~~~~~~~~~~~~~~~~~~~~ + +While there is no bijection between Arrow types and Parquet types, it is +possible to serialize the Arrow schema as part of the Parquet file metadata. +This is enabled using :func:`ArrowWriterProperties::store_schema`. + +On the read path, the serialized schema will be automatically recognized +and will recreate the original Arrow data, converting the Parquet data as +required (for example, a LargeList will be recreated from the Parquet LIST +type). + +As an example, when serializing an Arrow LargeList to Parquet: + +* The data is written out as a Parquet LIST + +* When read back, the Parquet LIST data is decoded as an Arrow LargeList if + :func:`ArrowWriterProperties::store_schema` was enabled when writing the file; + otherwise, it is decoded as an Arrow List. + +Serialization details +""""""""""""""""""""" + +The Arrow schema is serialized as a :ref:`Arrow IPC <format-ipc>` schema message, +then base64-encoded and stored under the ``ARROW:schema`` metadata key in +the Parquet file metadata. + +Limitations +~~~~~~~~~~~ + +Writing or reading back FixedSizedList data with null entries is not supported. + +Encryption +---------- + +Parquet C++ implements all features specified in the +`encryption specification <https://github.com/apache/parquet-format/blob/master/Encryption.md>`__, +except for encryption of column index and bloom filter modules. + +More specifically, Parquet C++ supports: + +* AES_GCM_V1 and AES_GCM_CTR_V1 encryption algorithms. +* AAD suffix for Footer, ColumnMetaData, Data Page, Dictionary Page, + Data PageHeader, Dictionary PageHeader module types. Other module types + (ColumnIndex, OffsetIndex, BloomFilter Header, BloomFilter Bitset) are not + supported. +* EncryptionWithFooterKey and EncryptionWithColumnKey modes. +* Encrypted Footer and Plaintext Footer modes. + + +Reading Parquet files +===================== + +The :class:`arrow::FileReader` class reads data for an entire +file or row group into an :class:`::arrow::Table`. 
+ +The :class:`StreamReader` and :class:`StreamWriter` classes allow for +data to be written using a C++ input/output streams approach to +read/write fields column by column and row by row. This approach is +offered for ease of use and type-safety. It is of course also useful +when data must be streamed as files are read and written +incrementally. + +Please note that the performance of the :class:`StreamReader` and +:class:`StreamWriter` classes will not be as good due to the type +checking and the fact that column values are processed one at a time. + +FileReader +---------- + +The Parquet :class:`arrow::FileReader` requires a +:class:`::arrow::io::RandomAccessFile` instance representing the input +file. + +.. code-block:: cpp + + #include "arrow/parquet/arrow/reader.h" + + { + // ... + arrow::Status st; + arrow::MemoryPool* pool = default_memory_pool(); + std::shared_ptr<arrow::io::RandomAccessFile> input = ...; + + // Open Parquet file reader + std::unique_ptr<parquet::arrow::FileReader> arrow_reader; + st = parquet::arrow::OpenFile(input, pool, &arrow_reader); + if (!st.ok()) { + // Handle error instantiating file reader... + } + + // Read entire file as a single Arrow table + std::shared_ptr<arrow::Table> table; + st = arrow_reader->ReadTable(&table); + if (!st.ok()) { + // Handle error reading Parquet data... + } + } + +Finer-grained options are available through the +:class:`arrow::FileReaderBuilder` helper class. + +.. TODO write section about performance and memory efficiency + +StreamReader +------------ + +The :class:`StreamReader` allows for Parquet files to be read using +standard C++ input operators which ensures type-safety. + +Please note that types must match the schema exactly i.e. if the +schema field is an unsigned 16-bit integer then you must supply a +uint16_t type. + +Exceptions are used to signal errors. A :class:`ParquetException` is +thrown in the following circumstances: + +* Attempt to read field by supplying the incorrect type. + +* Attempt to read beyond end of row. + +* Attempt to read beyond end of file. + +.. code-block:: cpp + + #include "arrow/io/file.h" + #include "parquet/stream_reader.h" + + { + std::shared_ptr<arrow::io::ReadableFile> infile; + + PARQUET_ASSIGN_OR_THROW( + infile, + arrow::io::ReadableFile::Open("test.parquet")); + + parquet::StreamReader os{parquet::ParquetFileReader::Open(infile)}; + + std::string article; + float price; + uint32_t quantity; + + while ( !os.eof() ) + { + os >> article >> price >> quantity >> parquet::EndRow; + // ... + } + } + +Writing Parquet files +===================== + +WriteTable +---------- + +The :func:`arrow::WriteTable` function writes an entire +:class:`::arrow::Table` to an output file. + +.. code-block:: cpp + + #include "parquet/arrow/writer.h" + + { + std::shared_ptr<arrow::io::FileOutputStream> outfile; + PARQUET_ASSIGN_OR_THROW( + outfile, + arrow::io::FileOutputStream::Open("test.parquet")); + + PARQUET_THROW_NOT_OK( + parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3)); + } + +StreamWriter +------------ + +The :class:`StreamWriter` allows for Parquet files to be written using +standard C++ output operators. This type-safe approach also ensures +that rows are written without omitting fields and allows for new row +groups to be created automatically (after certain volume of data) or +explicitly by using the :type:`EndRowGroup` stream modifier. + +Exceptions are used to signal errors. 
A :class:`ParquetException` is +thrown in the following circumstances: + +* Attempt to write a field using an incorrect type. + +* Attempt to write too many fields in a row. + +* Attempt to skip a required field. + +.. code-block:: cpp + + #include "arrow/io/file.h" + #include "parquet/stream_writer.h" + + { + std::shared_ptr<arrow::io::FileOutputStream> outfile; + + PARQUET_ASSIGN_OR_THROW( + outfile, + arrow::io::FileOutputStream::Open("test.parquet")); + + parquet::WriterProperties::Builder builder; + std::shared_ptr<parquet::schema::GroupNode> schema; + + // Set up builder with required compression type etc. + // Define schema. + // ... + + parquet::StreamWriter os{ + parquet::ParquetFileWriter::Open(outfile, schema, builder.build())}; + + // Loop over some data structure which provides the required + // fields to be written and write each row. + for (const auto& a : getArticles()) + { + os << a.name() << a.price() << a.quantity() << parquet::EndRow; + } + } diff --git a/src/arrow/docs/source/cpp/simple_graph.svg b/src/arrow/docs/source/cpp/simple_graph.svg new file mode 100644 index 000000000..d87507224 --- /dev/null +++ b/src/arrow/docs/source/cpp/simple_graph.svg @@ -0,0 +1,139 @@ +<!--- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+--> +<svg width="320pt" height="404pt" + viewBox="0.00 0.00 388.02 404.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"> +<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 400)"> +<title>G</title> +<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-400 384.0173,-400 384.0173,4 -4,4"/> +<!-- scan lineitem --> +<g id="node1" class="node"> +<title>scan lineitem</title> +<ellipse fill="none" stroke="#000000" cx="62.2569" cy="-378" rx="62.0148" ry="18"/> +<text text-anchor="middle" x="62.2569" y="-373.8" font-family="Times,serif" font-size="14.00" fill="#000000">scan lineitem</text> +</g> +<!-- filter --> +<g id="node2" class="node"> +<title>filter</title> +<ellipse fill="none" stroke="#000000" cx="86.2569" cy="-306" rx="29.6089" ry="18"/> +<text text-anchor="middle" x="86.2569" y="-301.8" font-family="Times,serif" font-size="14.00" fill="#000000">filter</text> +</g> +<!-- scan lineitem->filter --> +<g id="edge1" class="edge"> +<title>scan lineitem->filter</title> +<path fill="none" stroke="#000000" d="M68.3132,-359.8314C70.9767,-351.8406 74.163,-342.2819 77.1065,-333.4514"/> +<polygon fill="#000000" stroke="#000000" points="80.4439,-334.5071 80.2858,-323.9134 73.8031,-332.2934 80.4439,-334.5071"/> +</g> +<!-- join --> +<g id="node3" class="node"> +<title>join</title> +<ellipse fill="none" stroke="#000000" cx="184.2569" cy="-234" rx="27" ry="18"/> +<text text-anchor="middle" x="184.2569" y="-229.8" font-family="Times,serif" font-size="14.00" fill="#000000">join</text> +</g> +<!-- filter->join --> +<g id="edge2" class="edge"> +<title>filter->join</title> +<path fill="none" stroke="#000000" d="M105.6186,-291.7751C120.5341,-280.8168 141.3184,-265.5467 157.7735,-253.4572"/> +<polygon fill="#000000" stroke="#000000" points="159.9433,-256.2062 165.9299,-247.4648 155.7988,-250.565 159.9433,-256.2062"/> +</g> +<!-- join again --> +<g id="node4" class="node"> +<title>join again</title> +<ellipse fill="none" stroke="#000000" cx="231.2569" cy="-162" rx="49.2784" ry="18"/> +<text text-anchor="middle" x="231.2569" y="-157.8" font-family="Times,serif" font-size="14.00" fill="#000000">join again</text> +</g> +<!-- join->join again --> +<g id="edge3" class="edge"> +<title>join->join again</title> +<path fill="none" stroke="#000000" d="M195.1578,-217.3008C200.8051,-208.6496 207.8305,-197.8873 214.1788,-188.1623"/> +<polygon fill="#000000" stroke="#000000" points="217.224,-189.9002 219.7594,-179.6132 211.3623,-186.0738 217.224,-189.9002"/> +</g> +<!-- filter again --> +<g id="node9" class="node"> +<title>filter again</title> +<ellipse fill="none" stroke="#000000" cx="231.2569" cy="-90" rx="53.2645" ry="18"/> +<text text-anchor="middle" x="231.2569" y="-85.8" font-family="Times,serif" font-size="14.00" fill="#000000">filter again</text> +</g> +<!-- join again->filter again --> +<g id="edge8" class="edge"> +<title>join again->filter again</title> +<path fill="none" stroke="#000000" d="M231.2569,-143.8314C231.2569,-136.131 231.2569,-126.9743 231.2569,-118.4166"/> +<polygon fill="#000000" stroke="#000000" points="234.757,-118.4132 231.2569,-108.4133 227.757,-118.4133 234.757,-118.4132"/> +</g> +<!-- scan orders --> +<g id="node5" class="node"> +<title>scan orders</title> +<ellipse fill="none" stroke="#000000" cx="197.2569" cy="-378" rx="54.9752" ry="18"/> +<text text-anchor="middle" x="197.2569" y="-373.8" font-family="Times,serif" font-size="14.00" fill="#000000">scan orders</text> +</g> +<!-- project --> +<g id="node6" class="node"> 
+<title>project</title> +<ellipse fill="none" stroke="#000000" cx="184.2569" cy="-306" rx="37.6986" ry="18"/> +<text text-anchor="middle" x="184.2569" y="-301.8" font-family="Times,serif" font-size="14.00" fill="#000000">project</text> +</g> +<!-- scan orders->project --> +<g id="edge4" class="edge"> +<title>scan orders->project</title> +<path fill="none" stroke="#000000" d="M193.9765,-359.8314C192.5861,-352.131 190.9329,-342.9743 189.3877,-334.4166"/> +<polygon fill="#000000" stroke="#000000" points="192.8028,-333.6322 187.5816,-324.4133 185.9142,-334.8761 192.8028,-333.6322"/> +</g> +<!-- project->join --> +<g id="edge5" class="edge"> +<title>project->join</title> +<path fill="none" stroke="#000000" d="M184.2569,-287.8314C184.2569,-280.131 184.2569,-270.9743 184.2569,-262.4166"/> +<polygon fill="#000000" stroke="#000000" points="187.757,-262.4132 184.2569,-252.4133 180.757,-262.4133 187.757,-262.4132"/> +</g> +<!-- scan customers --> +<g id="node7" class="node"> +<title>scan customers</title> +<ellipse fill="none" stroke="#000000" cx="310.2569" cy="-306" rx="69.5216" ry="18"/> +<text text-anchor="middle" x="310.2569" y="-301.8" font-family="Times,serif" font-size="14.00" fill="#000000">scan customers</text> +</g> +<!-- aggregate --> +<g id="node8" class="node"> +<title>aggregate</title> +<ellipse fill="none" stroke="#000000" cx="294.2569" cy="-234" rx="48.6346" ry="18"/> +<text text-anchor="middle" x="294.2569" y="-229.8" font-family="Times,serif" font-size="14.00" fill="#000000">aggregate</text> +</g> +<!-- scan customers->aggregate --> +<g id="edge6" class="edge"> +<title>scan customers->aggregate</title> +<path fill="none" stroke="#000000" d="M306.2195,-287.8314C304.5083,-280.131 302.4735,-270.9743 300.5717,-262.4166"/> +<polygon fill="#000000" stroke="#000000" points="303.9348,-261.4159 298.3488,-252.4133 297.1015,-262.9344 303.9348,-261.4159"/> +</g> +<!-- aggregate->join again --> +<g id="edge7" class="edge"> +<title>aggregate->join again</title> +<path fill="none" stroke="#000000" d="M279.0064,-216.5708C271.1906,-207.6385 261.5369,-196.6056 252.9595,-186.8029"/> +<polygon fill="#000000" stroke="#000000" points="255.5861,-184.4897 246.367,-179.2687 250.3181,-189.0993 255.5861,-184.4897"/> +</g> +<!-- write to disk --> +<g id="node10" class="node"> +<title>write to disk</title> +<ellipse fill="none" stroke="#000000" cx="231.2569" cy="-18" rx="59.1276" ry="18"/> +<text text-anchor="middle" x="231.2569" y="-13.8" font-family="Times,serif" font-size="14.00" fill="#000000">write to disk</text> +</g> +<!-- filter again->write to disk --> +<g id="edge9" class="edge"> +<title>filter again->write to disk</title> +<path fill="none" stroke="#000000" d="M231.2569,-71.8314C231.2569,-64.131 231.2569,-54.9743 231.2569,-46.4166"/> +<polygon fill="#000000" stroke="#000000" points="234.757,-46.4132 231.2569,-36.4133 227.757,-46.4133 234.757,-46.4132"/> +</g> +</g> +</svg> diff --git a/src/arrow/docs/source/cpp/streaming_execution.rst b/src/arrow/docs/source/cpp/streaming_execution.rst new file mode 100644 index 000000000..a3406265b --- /dev/null +++ b/src/arrow/docs/source/cpp/streaming_execution.rst @@ -0,0 +1,307 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. 
You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. default-domain:: cpp +.. highlight:: cpp +.. cpp:namespace:: arrow::compute + +========================== +Streaming execution engine +========================== + +.. warning:: + + The streaming execution engine is experimental, and a stable API + is not yet guaranteed. + +Motivation +---------- + +For many complex computations, successive direct :ref:`invocation of +compute functions <invoking-compute-functions>` is not feasible +in either memory or computation time. Doing so causes all intermediate +data to be fully materialized. To facilitate arbitrarily large inputs +and more efficient resource usage, Arrow also provides a streaming query +engine with which computations can be formulated and executed. + +.. image:: simple_graph.svg + :alt: An example graph of a streaming execution workflow. + +:class:`ExecNode` is provided to reify the graph of operations in a query. +Batches of data (:struct:`ExecBatch`) flow along edges of the graph from +node to node. Structuring the API around streams of batches allows the +working set for each node to be tuned for optimal performance independent +of any other nodes in the graph. Each :class:`ExecNode` processes batches +as they are pushed to it along an edge of the graph by upstream nodes +(its inputs), and pushes batches along an edge of the graph to downstream +nodes (its outputs) as they are finalized. + +..seealso:: + + `SHAIKHHA, A., DASHTI, M., & KOCH, C. + (2018). Push versus pull-based loop fusion in query engines. + Journal of Functional Programming, 28. + <https://doi.org/10.1017/s0956796818000102>`_ + +Overview +-------- + +:class:`ExecNode` + Each node in the graph is an implementation of the :class:`ExecNode` interface. + +:class:`ExecPlan` + A set of :class:`ExecNode` is contained and (to an extent) coordinated by an + :class:`ExecPlan`. + +:class:`ExecFactoryRegistry` + Instances of :class:`ExecNode` are constructed by factory functions held + in a :class:`ExecFactoryRegistry`. + +:class:`ExecNodeOptions` + Heterogenous parameters for factories of :class:`ExecNode` are bundled in an + :class:`ExecNodeOptions`. + +:struct:`Declaration` + ``dplyr``-inspired helper for efficient construction of an :class:`ExecPlan`. + +:struct:`ExecBatch` + A lightweight container for a single chunk of data in the Arrow format. In + contrast to :class:`RecordBatch`, :struct:`ExecBatch` is intended for use + exclusively in a streaming execution context (for example, it doesn't have a + corresponding Python binding). Furthermore columns which happen to have a + constant value may be represented by a :class:`Scalar` instead of an + :class:`Array`. In addition, :struct:`ExecBatch` may carry + execution-relevant properties including a guaranteed-true-filter + for :class:`Expression` simplification. + + +An example :class:`ExecNode` implementation which simply passes all input batches +through unchanged:: + + class PassthruNode : public ExecNode { + public: + // InputReceived is the main entry point for ExecNodes. It is invoked + // by an input of this node to push a batch here for processing. 
+ void InputReceived(ExecNode* input, ExecBatch batch) override { + // Since this is a passthru node we simply push the batch to our + // only output here. + outputs_[0]->InputReceived(this, batch); + } + + // ErrorReceived is called by an input of this node to report an error. + // ExecNodes should always forward errors to their outputs unless they + // are able to fully handle the error (this is rare). + void ErrorReceived(ExecNode* input, Status error) override { + outputs_[0]->ErrorReceived(this, error); + } + + // InputFinished is used to signal how many batches will ultimately arrive. + // It may be called with any ordering relative to InputReceived/ErrorReceived. + void InputFinished(ExecNode* input, int total_batches) override { + outputs_[0]->InputFinished(this, total_batches); + } + + // ExecNodes may request that their inputs throttle production of batches + // until they are ready for more, or stop production if no further batches + // are required. These signals should typically be forwarded to the inputs + // of the ExecNode. + void ResumeProducing(ExecNode* output) override { inputs_[0]->ResumeProducing(this); } + void PauseProducing(ExecNode* output) override { inputs_[0]->PauseProducing(this); } + void StopProducing(ExecNode* output) override { inputs_[0]->StopProducing(this); } + + // An ExecNode has a single output schema to which all its batches conform. + using ExecNode::output_schema; + + // ExecNodes carry basic introspection for debugging purposes + const char* kind_name() const override { return "PassthruNode"; } + using ExecNode::label; + using ExecNode::SetLabel; + using ExecNode::ToString; + + // An ExecNode holds references to its inputs and outputs, so it is possible + // to walk the graph of execution if necessary. + using ExecNode::inputs; + using ExecNode::outputs; + + // StartProducing() and StopProducing() are invoked by an ExecPlan to + // coordinate the graph-wide execution state. These do not need to be + // forwarded to inputs or outputs. + Status StartProducing() override { return Status::OK(); } + void StopProducing() override {} + Future<> finished() override { return inputs_[0]->finished(); } + }; + +Note that each method which is associated with an edge of the graph must be invoked +with an ``ExecNode*`` to identify the node which invoked it. For example, in an +:class:`ExecNode` which implements ``JOIN`` this tagging might be used to differentiate +between batches from the left or right inputs. +``InputReceived``, ``ErrorReceived``, ``InputFinished`` may only be invoked by +the inputs of a node, while ``ResumeProducing``, ``PauseProducing``, ``StopProducing`` +may only be invoked by outputs of a node. + +:class:`ExecPlan` contains the associated instances of :class:`ExecNode` +and is used to start and stop execution of all nodes and for querying/awaiting +their completion:: + + // construct an ExecPlan first to hold your nodes + ARROW_ASSIGN_OR_RAISE(auto plan, ExecPlan::Make(default_exec_context())); + + // ... add nodes to your ExecPlan + + // start all nodes in the graph + ARROW_RETURN_NOT_OK(plan->StartProducing()); + + SetUserCancellationCallback([plan] { + // stop all nodes in the graph + plan->StopProducing(); + }); + + // Complete will be marked finished when all nodes have run to completion + // or acknowledged a StopProducing() signal. The ExecPlan should be kept + // alive until this future is marked finished. + Future<> complete = plan->finished(); + + +Constructing ``ExecPlan`` objects +--------------------------------- + +.. 
warning::
+
+    The following will be superseded by construction from Compute IR, see ARROW-14074.
+
+None of the concrete implementations of :class:`ExecNode` are exposed
+in headers, so they can't be constructed directly outside the
+translation unit where they are defined. Instead, factories to
+create them are provided in an extensible registry. This structure
+provides a number of benefits:
+
+- It enforces consistent construction.
+- It decouples implementations from consumers of the interface
+  (for example, scalar and grouped aggregation are implemented by two
+  different classes; the single factory can choose which to construct by
+  checking whether grouping keys are provided).
+- It expedites integration with out-of-library extensions. For example,
+  "scan" nodes are implemented in the separate ``libarrow_dataset.so`` library.
+- Since the class is not referenceable outside the translation unit in which it
+  is defined, compilers can optimize more aggressively.
+
+Factories of :class:`ExecNode` can be retrieved by name from the registry.
+The default registry is available through
+:func:`arrow::compute::default_exec_factory_registry()`
+and can be queried for the built-in factories::
+
+    // get the factory for "filter" nodes:
+    ARROW_ASSIGN_OR_RAISE(auto make_filter,
+                          default_exec_factory_registry()->GetFactory("filter"));
+
+    // factories take three arguments:
+    ARROW_ASSIGN_OR_RAISE(ExecNode* filter_node, *make_filter(
+        // the ExecPlan which should own this node
+        plan.get(),
+
+        // nodes which will send batches to this node (inputs)
+        {scan_node},
+
+        // parameters unique to "filter" nodes
+        FilterNodeOptions{filter_expression}));
+
+    // alternative shorthand:
+    ARROW_ASSIGN_OR_RAISE(filter_node, MakeExecNode("filter",
+        plan.get(), {scan_node}, FilterNodeOptions{filter_expression}));
+
+Factories can also be added to the default registry as long as they are
+convertible to ``std::function<Result<ExecNode*>(
+ExecPlan*, std::vector<ExecNode*>, const ExecNodeOptions&)>``.
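+
+For instance, a factory for the ``PassthruNode`` shown earlier could be
+registered roughly as follows. This is only a sketch: the ``AddFactory`` and
+``EmplaceNode`` calls and the ``PassthruNode`` constructor arguments are
+assumptions here; consult the ``ExecFactoryRegistry`` and ``ExecPlan`` headers
+for the exact interfaces::
+
+    // Hypothetical registration of a "passthru" factory. The factory name,
+    // AddFactory, EmplaceNode, and the node's constructor signature are
+    // assumptions for illustration only.
+    Status RegisterPassthruNode() {
+      return default_exec_factory_registry()->AddFactory(
+          "passthru",
+          [](ExecPlan* plan, std::vector<ExecNode*> inputs,
+             const ExecNodeOptions&) -> Result<ExecNode*> {
+            // A passthru node forwards batches from exactly one input.
+            if (inputs.size() != 1) {
+              return Status::Invalid("passthru requires exactly one input");
+            }
+            // Hand ownership of the new node to the plan.
+            return plan->EmplaceNode<PassthruNode>(plan, std::move(inputs));
+          });
+    }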
+ +To build an :class:`ExecPlan` representing a simple pipeline which +reads from a :class:`RecordBatchReader` then filters, projects, and +writes to disk:: + + std::shared_ptr<RecordBatchReader> reader = GetStreamOfBatches(); + ExecNode* source_node = *MakeExecNode("source", plan.get(), {}, + SourceNodeOptions::FromReader( + reader, + GetCpuThreadPool())); + + ExecNode* filter_node = *MakeExecNode("filter", plan.get(), {source_node}, + FilterNodeOptions{ + greater(field_ref("score"), literal(3)) + }); + + ExecNode* project_node = *MakeExecNode("project", plan.get(), {filter_node}, + ProjectNodeOptions{ + {add(field_ref("score"), literal(1))}, + {"score + 1"} + }); + + arrow::dataset::internal::Initialize(); + MakeExecNode("write", plan.get(), {project_node}, + WriteNodeOptions{/*base_dir=*/"/dat", /*...*/}); + +:struct:`Declaration` is a `dplyr <https://dplyr.tidyverse.org>`_-inspired +helper which further decreases the boilerplate associated with populating +an :class:`ExecPlan` from C++:: + + arrow::dataset::internal::Initialize(); + + std::shared_ptr<RecordBatchReader> reader = GetStreamOfBatches(); + ASSERT_OK(Declaration::Sequence( + { + {"source", SourceNodeOptions::FromReader( + reader, + GetCpuThreadPool())}, + {"filter", FilterNodeOptions{ + greater(field_ref("score"), literal(3))}}, + {"project", ProjectNodeOptions{ + {add(field_ref("score"), literal(1))}, + {"score + 1"}}}, + {"write", WriteNodeOptions{/*base_dir=*/"/dat", /*...*/}}, + }) + .AddToPlan(plan.get())); + +Note that a source node can wrap anything which resembles a stream of batches. +For example, `PR#11032 <https://github.com/apache/arrow/pull/11032>`_ adds +support for use of a `DuckDB <https://duckdb.org>`_ query as a source node. +Similarly, a sink node can wrap anything which absorbs a stream of batches. +In the example above we're writing completed +batches to disk. However we can also collect these in memory into a :class:`Table` +or forward them to a :class:`RecordBatchReader` as an out-of-graph stream. +This flexibility allows an :class:`ExecPlan` to be used as streaming middleware +between any endpoints which support Arrow formatted batches. + +An :class:`arrow::dataset::Dataset` can also be wrapped as a source node which +pushes all the dataset's batches into an :class:`ExecPlan`. This factory is added +to the default registry with the name ``"scan"`` by calling +``arrow::dataset::internal::Initialize()``:: + + arrow::dataset::internal::Initialize(); + + std::shared_ptr<Dataset> dataset = GetDataset(); + + ASSERT_OK(Declaration::Sequence( + { + {"scan", ScanNodeOptions{dataset, + /* push down predicate, projection, ... */}}, + {"filter", FilterNodeOptions{/* ... */}}, + // ... + }) + .AddToPlan(plan.get())); + +Datasets may be scanned multiple times; just make multiple scan +nodes from that dataset. (Useful for a self-join, for example.) +Note that producing two scan nodes like this will perform all +reads and decodes twice. diff --git a/src/arrow/docs/source/cpp/tables.rst b/src/arrow/docs/source/cpp/tables.rst new file mode 100644 index 000000000..ea9198771 --- /dev/null +++ b/src/arrow/docs/source/cpp/tables.rst @@ -0,0 +1,83 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. 
with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. default-domain:: cpp
+.. highlight:: cpp
+
+============
+Tabular Data
+============
+
+.. seealso::
+   :doc:`Table and RecordBatch API reference <api/table>`.
+
+While arrays and chunked arrays represent a one-dimensional sequence of
+homogeneous values, data often comes in the form of two-dimensional sets of
+heterogeneous data (such as database tables, CSV files...). Arrow provides
+several abstractions to handle such data conveniently and efficiently.
+
+Fields
+======
+
+Fields are used to denote the particular columns of a table (and also
+the particular members of a nested data type such as :class:`arrow::StructType`).
+A field, i.e. an instance of :class:`arrow::Field`, holds together a data
+type, a field name and some optional metadata.
+
+The recommended way to create a field is to call the :func:`arrow::field`
+factory function.
+
+Schemas
+=======
+
+A schema describes the overall structure of a two-dimensional dataset such
+as a table. It holds a sequence of fields together with some optional
+schema-wide metadata (in addition to per-field metadata). The recommended
+way to create a schema is to call one of the :func:`arrow::schema` factory
+function overloads::
+
+    // Create a schema describing datasets with two columns:
+    // an int32 column "A" and a utf8-encoded string column "B"
+    std::shared_ptr<arrow::Field> field_a, field_b;
+    std::shared_ptr<arrow::Schema> schema;
+
+    field_a = arrow::field("A", arrow::int32());
+    field_b = arrow::field("B", arrow::utf8());
+    schema = arrow::schema({field_a, field_b});
+
+Tables
+======
+
+A :class:`arrow::Table` is a two-dimensional dataset with chunked arrays for
+columns, together with a schema providing field names. Also, each chunked
+column must have the same logical length in number of elements (although each
+column can be chunked in a different way).
+
+Record Batches
+==============
+
+A :class:`arrow::RecordBatch` is a two-dimensional dataset of a number of
+contiguous arrays, each the same length. Like a table, a record batch also
+has a schema which must match its arrays' datatypes.
+
+Record batches are a convenient unit of work for various serialization
+and computation functions, possibly incremental.
+
+A table can be streamed as an arbitrary number of record batches using
+a :class:`arrow::TableBatchReader`. Conversely, a logical sequence of
+record batches can be assembled to form a table using one of the
+:func:`arrow::Table::FromRecordBatches` factory function overloads.
diff --git a/src/arrow/docs/source/developers/archery.rst b/src/arrow/docs/source/developers/archery.rst
new file mode 100644
index 000000000..a587975d6
--- /dev/null
+++ b/src/arrow/docs/source/developers/archery.rst
@@ -0,0 +1,87 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License.
You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _archery: + +Daily Development using Archery +=============================== + +To ease some of the daily development tasks, we developed a Python-written +utility called Archery. + +Installation +------------ + +Archery requires Python 3.6 or later. It is recommended to install archery in +*editable* mode with the ``-e`` flag to automatically update the installation +when pulling the Arrow repository. After cloning the Arrow repository, from +the top level directory install Archery by using the command + +.. code:: bash + + pip install -e dev/archery[all] + +Usage +----- + +You can inspect Archery usage by passing the ``--help`` flag: + +.. code:: bash + + $ archery --help + Usage: archery [OPTIONS] COMMAND [ARGS]... + + Apache Arrow developer utilities. + + See sub-commands help with `archery <cmd> --help`. + + Options: + --debug Increase logging with debugging output. + --pdb Invoke pdb on uncaught exception. + -q, --quiet Silence executed commands. + --help Show this message and exit. + + Commands: + benchmark Arrow benchmarking. + build Initialize an Arrow C++ build + crossbow Schedule packaging tasks or nightly builds on CI services. + docker Interact with docker-compose based builds. + integration Execute protocol and Flight integration tests + linking Quick and dirty utilities for checking library linkage. + lint Check Arrow source tree for errors + numpydoc Lint python docstring with NumpyDoc + release Release releated commands. + trigger-bot + +Archery exposes independent subcommands, each of which provides dedicated +help output, for example: + +.. code:: bash + + $ archery docker --help + Usage: archery docker [OPTIONS] COMMAND [ARGS]... + + Interact with docker-compose based builds. + + Options: + --src <arrow_src> Specify Arrow source directory. + --help Show this message and exit. + + Commands: + images List the available docker-compose images. + push Push the generated docker-compose image. + run Execute docker-compose builds. diff --git a/src/arrow/docs/source/developers/benchmarks.rst b/src/arrow/docs/source/developers/benchmarks.rst new file mode 100644 index 000000000..22eb5159d --- /dev/null +++ b/src/arrow/docs/source/developers/benchmarks.rst @@ -0,0 +1,179 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. 
_benchmarks:
+
+==========
+Benchmarks
+==========
+
+Setup
+=====
+
+First install the :ref:`Archery <archery>` utility to run the benchmark suite.
+
+Running the benchmark suite
+===========================
+
+The benchmark suites can be run with the ``benchmark run`` sub-command.
+
+.. code-block:: shell
+
+    # Run benchmarks in the current git workspace
+    archery benchmark run
+    # Storing the results in a file
+    archery benchmark run --output=run.json
+
+Sometimes, it is required to pass custom CMake flags, e.g.
+
+.. code-block:: shell
+
+    export CC=clang-8 CXX=clang++8
+    archery benchmark run --cmake-extras="-DARROW_SIMD_LEVEL=NONE"
+
+Additionally, a full CMake build directory may be specified.
+
+.. code-block:: shell
+
+    archery benchmark run $HOME/arrow/cpp/release-build
+
+Comparison
+==========
+
+One goal with benchmarking is to detect performance regressions. To this end,
+``archery`` implements a benchmark comparison facility via the ``benchmark
+diff`` sub-command.
+
+In the default invocation, it will compare the current source (known as the
+current workspace in git) with the local master branch:
+
+.. code-block:: shell
+
+    archery --quiet benchmark diff --benchmark-filter=FloatParsing
+    -----------------------------------------------------------------------------------
+    Non-regressions: (1)
+    -----------------------------------------------------------------------------------
+                  benchmark            baseline           contender  change %  counters
+    FloatParsing<FloatType>  105.983M items/sec  105.983M items/sec       0.0        {}
+
+    ------------------------------------------------------------------------------------
+    Regressions: (1)
+    ------------------------------------------------------------------------------------
+                   benchmark            baseline           contender  change %  counters
+    FloatParsing<DoubleType>  209.941M items/sec  109.941M items/sec   -47.632        {}
+
+For more information and multiple examples of invocation, see
+``archery benchmark diff --help``.
+
+Iterating efficiently
+~~~~~~~~~~~~~~~~~~~~~
+
+Iterating during benchmark development can be a tedious process due to long
+build and run times. Multiple tricks can be used with
+``archery benchmark diff`` to reduce this overhead.
+
+First, the benchmark command supports comparing existing
+build directories. This can be paired with the ``--preserve`` flag to
+avoid rebuilding sources from scratch.
+
+.. code-block:: shell
+
+    # The first invocation clones and checks out in a temporary directory. The
+    # directory is preserved with --preserve
+    archery benchmark diff --preserve
+
+    # Modify C++ sources
+
+    # Re-run benchmark in the previously created build directory.
+    archery benchmark diff /tmp/arrow-bench*/{WORKSPACE,master}/build
+
+Second, a benchmark run's result can be saved in a JSON file. This avoids not
+only rebuilding the sources but also executing the (sometimes) heavy
+benchmarks. This technique can be used as a poor man's cache.
+
+.. code-block:: shell
+
+    # Run the benchmarks on a given commit and save the result
+    archery benchmark run --output=run-head-1.json HEAD~1
+    # Compare the previous captured result with HEAD
+    archery benchmark diff HEAD run-head-1.json
+
+Third, the benchmark command supports filtering suites (``--suite-filter``)
+and benchmarks (``--benchmark-filter``); both options support regular
+expressions.
+
+.. code-block:: shell
+
+    # Taking over a previous run, but only filtering for benchmarks matching
+    # `Kernel` and suite matching `compute-aggregate`.
+    archery benchmark diff \
+        --suite-filter=compute-aggregate --benchmark-filter=Kernel \
+        /tmp/arrow-bench*/{WORKSPACE,master}/build
+
+Instead of rerunning benchmarks on comparison, a JSON file (generated by
+``archery benchmark run``) may be specified for the contender and/or the
+baseline.
+
+.. code-block:: shell
+
+    archery benchmark run --output=baseline.json $HOME/arrow/cpp/release-build
+    git checkout some-feature
+    archery benchmark run --output=contender.json $HOME/arrow/cpp/release-build
+    archery benchmark diff contender.json baseline.json
+
+Regression detection
+====================
+
+Writing a benchmark
+~~~~~~~~~~~~~~~~~~~
+
+1. The benchmark command will filter (by default) benchmarks with the regular
+   expression ``^Regression``. This way, not all benchmarks are run by default.
+   Thus, if you want your benchmark to be verified for regressions
+   automatically, the name must match.
+
+2. The benchmark command will run with the ``--benchmark_repetitions=K``
+   option for statistical significance. Thus, a benchmark should not override
+   the repetitions in the (C++) benchmark's arguments definition.
+
+3. Due to #2, a benchmark should run sufficiently fast. Often, when the input
+   does not fit in memory (L2/L3), the benchmark will be memory bound instead
+   of CPU bound. In this case, the input can be downsized.
+
+4. By default, Google's benchmark library uses the cputime metric, which is
+   the sum of the runtime spent on the CPU by all threads of the process, as
+   opposed to realtime, which is the wall clock time (the difference between
+   end time and start time). In a single-threaded model, cputime is preferable
+   since it is less affected by context switching. In a multi-threaded
+   scenario, cputime gives misleading results since it is inflated by the
+   number of threads and can be far off from realtime. Thus, if the benchmark
+   is multi-threaded, it might be better to use ``UseRealTime()``, see this
+   `example <https://github.com/apache/arrow/blob/a9582ea6ab2db055656809a2c579165fe6a811ba/cpp/src/arrow/io/memory-benchmark.cc#L223-L227>`_.
+
+Scripting
+=========
+
+``archery`` is written as a Python library with a command line frontend. The
+library can be imported to automate some tasks.
+
+Some invocations of the command line interface can be quite verbose due to
+build output. This can be controlled or avoided with the ``--quiet`` option,
+or the ``--output=<file>`` option can be used, e.g.
+
+.. code-block:: shell
+
+    archery benchmark diff --benchmark-filter=Kernel --output=compare.json ...
diff --git a/src/arrow/docs/source/developers/computeir.rst b/src/arrow/docs/source/developers/computeir.rst
new file mode 100644
index 000000000..9ebe1d5af
--- /dev/null
+++ b/src/arrow/docs/source/developers/computeir.rst
@@ -0,0 +1,59 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+..
specific language governing permissions and limitations +.. under the License. + +********************************************** +Arrow Compute IR (Intermediate Representation) +********************************************** + +In the same way that the Arrow format provides a powerful tool +for communicating data, Compute IR is intended to provide a +consistent format for representing analytical operations against +that data. As an arrow-native expression of computation it includes +information such as explicit types and schemas and arrow formatted +literal data. It is also optimized for low runtime overhead in both +serialization and deserialization. + +Built-in definitions are included to enable representation of +relational algebraic operations- the contents of a "logical query plan". +Compute IR also has first class support for representing operations +which are not members of a minimal relational algebra, including +implementation and optimization details- the contents of a "physical +query plan". This approach is taken in emulation of `MLIR`_ (Multi-Level +Intermediate Representation), a system which has had strong successes in +spaces of comparable complexity to representation of analytic operations. +To borrow terms from that project, there are two mutations of interest: + +* Replacement of representations with semantically equivalent representations + which will yield better performance for consumers- an optimization pass. +* Replacement of abstract or generic representations with more specific + and potentially consumer-specific representations- a lowering pass. + This modification corresponds to the translation of a logical plan + to a physical plan. + +Allowing representation of physical plans (and plans which are between +logical and physical) in Compute IR enables systems to define incremental +optimization and lowering passes which operate on and produce valid +Compute IR. This in turn enables communication, manipulation, and inspection +at every stage of lowering/optimization by the same tools +used for logical-plan-equivalent-IR. This is especially useful for systems +where such passes may depend on information only available on every node +of a distributed consumer (for example statistics unique to that node's +local data) or may not be universal to all backends in a heterogeneous +consumer (for example which optimizations nodes are capable of for +non equi joins). + +.. _MLIR: https://mlir.llvm.org diff --git a/src/arrow/docs/source/developers/contributing.rst b/src/arrow/docs/source/developers/contributing.rst new file mode 100644 index 000000000..9b81a6ff1 --- /dev/null +++ b/src/arrow/docs/source/developers/contributing.rst @@ -0,0 +1,362 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. 
_contributing: + +**************************** +Contributing to Apache Arrow +**************************** + +Thanks for your interest in the Apache Arrow project. Arrow is a large project +and may seem overwhelming when you're first getting involved. +Contributing code is great, but that's probably not the first place to start. +There are lots of ways to make valuable contributions to the project and +community. + +This page provides some orientation for how to get involved. It also offers +some recommendations on how to get best results when engaging with the +community. + +Code of Conduct +=============== + +All participation in the Apache Arrow project is governed by the ASF's +`Code of Conduct <https://www.apache.org/foundation/policies/conduct.html>`_. + +Join the mailing lists +====================== + +A good first step to getting involved in the Arrow project is to join the +mailing lists and participate in discussions where you can. +Projects in The Apache Software Foundation ("the ASF") use public, archived +mailing lists to create a public record of each project's development +activities and decision-making process. +While lacking the immediacy of chat or other forms of communication, +the mailing lists give participants the opportunity to slow down and be +thoughtful in their responses, and they help developers who are spread across +many timezones to participate more equally. + +See `the community page <https://arrow.apache.org/community/>`_ for links to +subscribe to the mailing lists and to view archives. + +Report bugs and propose features +================================ + +Using the software and sharing your experience is a very helpful contribution +itself. Those who actively develop Arrow need feedback from users on what +works and what doesn't. Alerting us to unexpected behavior and missing features, +even if you can't solve the problems yourself, help us understand and prioritize +work to improve the libraries. + +We use `JIRA <https://issues.apache.org/jira/projects/ARROW/issues>`_ +to manage our development "todo" list and to maintain changelogs for releases. +In addition, the project's `Confluence site <https://cwiki.apache.org/confluence/display/ARROW>`_ +has some useful higher-level views of the JIRA issues. + +To create a JIRA issue, you'll need to have an account on the ASF JIRA, which +you can `sign yourself up for <https://issues.apache.org/jira/secure/Signup!default.jspa>`_. +The JIRA server hosts bugs and issues for multiple Apache projects. The JIRA +project name for Arrow is "ARROW". + +You don't need any special permissions on JIRA to be able to create issues. +Once you are more involved in the project and want to do more on JIRA, such as +assign yourself an issue, you will need "Contributor" permissions on the +Apache Arrow JIRA. To get this role, ask on the mailing list for a project +maintainer's help. + +Tips for using JIRA ++++++++++++++++++++ + +Before you create a new issue, we recommend you first +`search <https://issues.apache.org/jira/issues/?jql=project%20%3D%20ARROW%20AND%20resolution%20%3D%20Unresolved>`_ +among existing Arrow issues. + +When reporting a new issue, follow these conventions to help make sure the +right people see it: + +* Use the **Component** field to indicate the area of the project that your + issue pertains to (for example "Python" or "C++"). 
+* Also prefix the issue title with the component name in brackets, for example + ``[Python] issue name`` ; this helps when navigating lists of open issues, + and it also makes our changelogs more readable. Most prefixes are exactly the + same as the **Component** name, with the following exceptions: + + * **Component:** Continuous Integration — **Summary prefix:** [CI] + * **Component:** Developer Tools — **Summary prefix:** [Dev] + * **Component:** Documentation — **Summary prefix:** [Docs] + +* If you're reporting something that used to work in a previous version + but doesn't work in the current release, you can add the "Affects version" + field. For feature requests and other proposals, "Affects version" isn't + appropriate. + +Project maintainers may later tweak formatting and labels to help improve their +visibility. They may add a "Fix version" to indicate that they're considering +it for inclusion in the next release, though adding that tag is not a +commitment that it will be done in the next release. + +Tips for successful bug reports ++++++++++++++++++++++++++++++++ + +No one likes having bugs in their software, and in an ideal world, all bugs +would get fixed as soon as they were reported. However, time and attention are +finite, especially in an open-source project where most contributors are +participating in their spare time. All contributors in Apache projects are +volunteers and act as individuals, even if they are contributing to the project +as part of their job responsibilities. + +In order for your bug to get prompt +attention, there are things you can do to make it easier for contributors to +reproduce and fix it. +When you're reporting a bug, please help us understand the issue by providing, +to the best of your ability, + +* Clear, minimal steps to reproduce the issue, with as few non-Arrow + dependencies as possible. If there's a problem on reading a file, try to + provide as small of an example file as possible, or code to create one. + If your bug report says "it crashes trying to read my file, but I can't + share it with you," it's really hard for us to debug. +* Any relevant operating system, language, and library version information +* If it isn't obvious, clearly state the expected behavior and what actually + happened. + +If a developer can't get a failing unit test, they won't be able to know that +the issue has been identified, and they won't know when it has been fixed. +Try to anticipate the questions you might be asked by someone working to +understand the issue and provide those supporting details up front. + +Other resources: + +* `Mozilla's bug-reporting guidelines <https://developer.mozilla.org/en-US/docs/Mozilla/QA/Bug_writing_guidelines>`_ +* `Reprex do's and don'ts <https://reprex.tidyverse.org/articles/reprex-dos-and-donts.html>`_ + +Improve documentation +===================== + +A great way to contribute to the project is to improve documentation. If you +found some docs to be incomplete or inaccurate, share your hard-earned knowledge +with the rest of the community. + +Documentation improvements are also a great way to gain some experience with +our submission and review process, discussed below, without requiring a lot +of local development environment setup. In fact, many documentation-only changes +can be made directly in the GitHub web interface by clicking the "edit" button. +This will handle making a fork and a pull request for you. 
+ +Contribute code +=============== + +Code contributions, or "patches", are delivered in the form of GitHub pull +requests against the `github.com/apache/arrow +<https://github.com/apache/arrow>`_ repository. + +Before starting ++++++++++++++++ + +You'll first need to select a JIRA issue to work on. Perhaps you're working on +one you reported yourself. Otherwise, if you're looking for something, +`search <https://issues.apache.org/jira/issues/?jql=project%20%3D%20ARROW%20AND%20resolution%20%3D%20Unresolved>`_ +the open issues. Anything that's not in the "In Progress" state is fair game, +even if it is "Assigned" to someone, particularly if it has not been +recently updated. When in doubt, comment on the issue asking if they mind +if you try to put together a pull request; interpret no response to mean that +you're free to proceed. + +Please do ask questions, either on the JIRA itself or on the dev mailing list, +if you have doubts about where to begin or what approach to take. +This is particularly a good idea if this is your first code contribution, +so you can get some sense of what the core developers in this part of the +project think a good solution looks like. For best results, ask specific, +direct questions, such as: + +* Do you think $PROPOSED_APPROACH is the right one? +* In which file(s) should I be looking to make changes? +* Is there anything related in the codebase I can look at to learn? + +If you ask these questions and do not get an answer, it is OK to ask again. + +Pull request and review ++++++++++++++++++++++++ + +To contribute a patch: + +* Submit the patch as a GitHub pull request against the master branch. For a + tutorial, see the GitHub guides on `forking a repo <https://help.github.com/en/articles/fork-a-repo>`_ + and `sending a pull request <https://help.github.com/en/articles/creating-a-pull-request-from-a-fork>`_. + So that your pull request syncs with the JIRA issue, prefix your pull request + name with the JIRA issue id (ex: + `ARROW-767: [C++] Filesystem abstraction <https://github.com/apache/arrow/pull/4225>`_). +* Give the pull request a clear, brief description: when the pull request is + merged, this will be retained in the extended commit message. +* Make sure that your code passes the unit tests. You can find instructions how + to run the unit tests for each Arrow component in its respective README file. + +Core developers and others with a stake in the part of the project your change +affects will review, request changes, and hopefully indicate their approval +in the end. To make the review process smooth for everyone, try to + +* Break your work into small, single-purpose patches if possible. It’s much + harder to merge in a large change with a lot of disjoint features, and + particularly if you're new to the project, smaller changes are much easier + for maintainers to accept. +* Add new unit tests for your code. +* Follow the style guides for the part(s) of the project you're modifying. + Some languages (C++ and Python, for example) run a lint check in + continuous integration. For all languages, see their respective developer + documentation and READMEs for style guidance. In general, try to make it look + as if the codebase has a single author, and emulate any conventions you see, + whether or not they are officially documented or checked. + +When tests are passing and the pull request has been approved by the interested +parties, a `committer <https://arrow.apache.org/committers/>`_ +will merge the pull request. 
This is done with a +command-line utility that does a squash merge, so all of your commits will be +registered as a single commit to the master branch; this simplifies the +connection between JIRA issues and commits, makes it easier to bisect +history to identify where changes were introduced, and helps us be able to +cherry-pick individual patches onto a maintenance branch. + +A side effect of this way of +merging is that your pull request will appear in the GitHub interface to have +been "closed without merge". Do not be alarmed: if you look at the bottom, you +will see a message that says ``@user closed this in $COMMIT``. In the commit +message of that commit, the merge tool adds the pull request description, a +link back to the pull request, and attribution to the contributor and any +co-authors. + +Local git conventions ++++++++++++++++++++++ + +If you are tracking the Arrow source repository locally, here are some tips +for using ``git``. + +All Arrow contributors work off of their personal fork of ``apache/arrow`` +and submit pull requests "upstream". Once you've cloned your fork of Arrow, +be sure to:: + + $ git remote add upstream https://github.com/apache/arrow + +to set the "upstream" repository. + +You are encouraged to develop on branches, rather than your own "master" branch, +and it helps to keep your fork's master branch synced with ``upstream/master``. + +To start a new branch, pull the latest from upstream first:: + + $ git fetch upstream + $ git checkout master + $ git pull --ff-only upstream master + $ git checkout -b $BRANCH + +It does not matter what you call your branch. Some people like to use the JIRA +number as branch name, others use descriptive names. + +Once you have a branch going, you should sync with ``upstream/master`` +regularly, as many commits are merged to master every day. +It is recommended to use ``git rebase`` rather than ``git merge``. +To sync your local copy of a branch, you may do the following:: + + $ git pull upstream $BRANCH --rebase + +This will rebase your local commits on top of the tip of ``upstream/$BRANCH``. In case +there are conflicts, and your local commit history has multiple commits, you may +simplify the conflict resolution process by squashing your local commits into a single +commit. Preserving the commit history isn't as important because when your +feature branch is merged upstream, a squash happens automatically. If you choose this +route, you can abort the rebase with:: + + $ git rebase --abort + +Following which, the local commits can be squashed interactively by running:: + + $ git rebase --interactive ORIG_HEAD~n + +Where ``n`` is the number of commits you have in your local branch. After the squash, +you can try the merge again, and this time conflict resolution should be relatively +straightforward. + +If you set the following in your repo's ``.git/config``, the ``--rebase`` option can be +omitted from the ``git pull`` command, as it is implied by default. :: + + [pull] + rebase = true + +Once you have an updated local copy, you can push to your remote repo. Note, since your +remote repo still holds the old history, you would need to do a force push. :: + + $ git push --force origin branch + +*Note about force pushing to a branch that is being reviewed:* if you want reviewers to +look at your updates, please ensure you comment on the PR on GitHub as simply force +pushing does not trigger a notification in the GitHub user interface. 
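+
+For instance, refreshing a branch that already has an open pull request might
+look like the following sketch (substitute your own branch name; this simply
+combines the steps described above)::
+
+    $ git checkout $BRANCH
+    $ git pull --rebase origin $BRANCH     # pick up commits added to the PR on GitHub
+    $ git pull --rebase upstream master    # rebase onto the latest upstream master
+    $ git push --force origin $BRANCH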
+ +Also, once you have a pull request up, be sure you pull from ``origin`` +before rebasing and force-pushing. Arrow maintainers can push commits directly +to your branch, which they sometimes do to help move a pull request along. +In addition, the GitHub PR "suggestion" feature can also add commits to +your branch, so it is possible that your local copy of your branch is missing +some additions. + +.. include:: experimental_repos.rst + +Guidance for specific features +============================== + +From time to time the community has discussions on specific types of features +and improvements that they expect to support. This section outlines decisions +that have been made in this regard. + +Endianness +++++++++++ + +The Arrow format allows setting endianness. Due to the popularity of +little endian architectures most of implementation assume little endian by +default. There has been some effort to support big endian platforms as well. +Based on a `mailing-list discussion +<https://mail-archives.apache.org/mod_mbox/arrow-dev/202009.mbox/%3cCAK7Z5T--HHhr9Dy43PYhD6m-XoU4qoGwQVLwZsG-kOxXjPTyZA@mail.gmail.com%3e>`__, +the requirements for a new platform are: + +1. A robust (non-flaky, returning results in a reasonable time) Continuous + Integration setup. +2. Benchmarks for performance critical parts of the code to demonstrate + no regression. + +Furthermore, for big-endian support, there are two levels that an +implementation can support: + +1. Native endianness (all Arrow communication happens with processes of the + same endianness). This includes ancillary functionality such as reading + and writing various file formats, such as Parquet. +2. Cross endian support (implementations will do byte reordering when + appropriate for :ref:`IPC <format-ipc>` and :ref:`Flight <flight-rpc>` + messages). + +The decision on what level to support is based on maintainers' preferences for +complexity and technical risk. In general all implementations should be open +to native endianness support (provided the CI and performance requirements +are met). Cross endianness support is a question for individual maintainers. + +The current implementations aiming for cross endian support are: + +1. C++ + +Implementations that do not intend to implement cross endian support: + +1. Java + +For other libraries, a discussion to gather consensus on the mailing-list +should be had before submitting PRs. diff --git a/src/arrow/docs/source/developers/cpp/building.rst b/src/arrow/docs/source/developers/cpp/building.rst new file mode 100644 index 000000000..6b18c7312 --- /dev/null +++ b/src/arrow/docs/source/developers/cpp/building.rst @@ -0,0 +1,510 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. 
_building-arrow-cpp: + +================== +Building Arrow C++ +================== + +System setup +============ + +Arrow uses CMake as a build configuration system. We recommend building +out-of-source. If you are not familiar with this terminology: + +* **In-source build**: ``cmake`` is invoked directly from the ``cpp`` + directory. This can be inflexible when you wish to maintain multiple build + environments (e.g. one for debug builds and another for release builds) +* **Out-of-source build**: ``cmake`` is invoked from another directory, + creating an isolated build environment that does not interact with any other + build environment. For example, you could create ``cpp/build-debug`` and + invoke ``cmake $CMAKE_ARGS ..`` from this directory + +Building requires: + +* A C++11-enabled compiler. On Linux, gcc 4.8 and higher should be + sufficient. For Windows, at least Visual Studio 2017 is required. +* CMake 3.5 or higher +* On Linux and macOS, either ``make`` or ``ninja`` build utilities + +On Ubuntu/Debian you can install the requirements with: + +.. code-block:: shell + + sudo apt-get install \ + build-essential \ + cmake + +On Alpine Linux: + +.. code-block:: shell + + apk add autoconf \ + bash \ + cmake \ + g++ \ + gcc \ + make + +On Fedora Linux: + +.. code-block:: shell + + sudo dnf install \ + cmake \ + gcc \ + gcc-c++ \ + make + +On macOS, you can use `Homebrew <https://brew.sh/>`_: + +.. code-block:: shell + + git clone https://github.com/apache/arrow.git + cd arrow + brew update && brew bundle --file=cpp/Brewfile + +With `vcpkg <https://github.com/Microsoft/vcpkg>`_: + +.. code-block:: shell + + git clone https://github.com/apache/arrow.git + cd arrow + vcpkg install \ + --x-manifest-root cpp \ + --feature-flags=versions \ + --clean-after-build + +On MSYS2: + +.. code-block:: shell + + pacman --sync --refresh --noconfirm \ + ccache \ + git \ + mingw-w64-${MSYSTEM_CARCH}-boost \ + mingw-w64-${MSYSTEM_CARCH}-brotli \ + mingw-w64-${MSYSTEM_CARCH}-cmake \ + mingw-w64-${MSYSTEM_CARCH}-gcc \ + mingw-w64-${MSYSTEM_CARCH}-gflags \ + mingw-w64-${MSYSTEM_CARCH}-glog \ + mingw-w64-${MSYSTEM_CARCH}-gtest \ + mingw-w64-${MSYSTEM_CARCH}-lz4 \ + mingw-w64-${MSYSTEM_CARCH}-protobuf \ + mingw-w64-${MSYSTEM_CARCH}-python3-numpy \ + mingw-w64-${MSYSTEM_CARCH}-rapidjson \ + mingw-w64-${MSYSTEM_CARCH}-snappy \ + mingw-w64-${MSYSTEM_CARCH}-thrift \ + mingw-w64-${MSYSTEM_CARCH}-zlib \ + mingw-w64-${MSYSTEM_CARCH}-zstd + +Building +======== + +The build system uses ``CMAKE_BUILD_TYPE=release`` by default, so if this +argument is omitted then a release build will be produced. + +.. note:: + + You need to more options to build on Windows. See + :ref:`developers-cpp-windows` for details. + +Minimal release build: + +.. code-block:: shell + + git clone https://github.com/apache/arrow.git + cd arrow/cpp + mkdir release + cd release + cmake .. + make + +Minimal debug build with unit tests: + +.. code-block:: shell + + git clone https://github.com/apache/arrow.git + cd arrow + git submodule update --init --recursive + export ARROW_TEST_DATA=$PWD/testing/data + cd cpp + mkdir debug + cd debug + cmake -DCMAKE_BUILD_TYPE=Debug -DARROW_BUILD_TESTS=ON .. + make unittest + +The unit tests are not built by default. After building, one can also invoke +the unit tests using the ``ctest`` tool provided by CMake (note that ``test`` +depends on ``python`` being available). + +On some Linux distributions, running the test suite might require setting an +explicit locale. 
If you see any locale-related errors, try setting the +environment variable (which requires the `locales` package or equivalent): + +.. code-block:: shell + + export LC_ALL="en_US.UTF-8" + +Faster builds with Ninja +~~~~~~~~~~~~~~~~~~~~~~~~ + +Many contributors use the `Ninja build system <https://ninja-build.org/>`_ to +get faster builds. It especially speeds up incremental builds. To use +``ninja``, pass ``-GNinja`` when calling ``cmake`` and then use the ``ninja`` +command instead of ``make``. + +Optional Components +~~~~~~~~~~~~~~~~~~~ + +By default, the C++ build system creates a fairly minimal build. We have +several optional system components which you can opt into building by passing +boolean flags to ``cmake``. + +* ``-DARROW_BUILD_UTILITIES=ON`` : Build Arrow commandline utilities +* ``-DARROW_COMPUTE=ON``: Computational kernel functions and other support +* ``-DARROW_CSV=ON``: CSV reader module +* ``-DARROW_CUDA=ON``: CUDA integration for GPU development. Depends on NVIDIA + CUDA toolkit. The CUDA toolchain used to build the library can be customized + by using the ``$CUDA_HOME`` environment variable. +* ``-DARROW_DATASET=ON``: Dataset API, implies the Filesystem API +* ``-DARROW_FILESYSTEM=ON``: Filesystem API for accessing local and remote + filesystems +* ``-DARROW_FLIGHT=ON``: Arrow Flight RPC system, which depends at least on + gRPC +* ``-DARROW_GANDIVA=ON``: Gandiva expression compiler, depends on LLVM, + Protocol Buffers, and re2 +* ``-DARROW_GANDIVA_JAVA=ON``: Gandiva JNI bindings for Java +* ``-DARROW_GCS=ON``: Build Arrow with GCS support (requires the GCloud SDK for C++) +* ``-DARROW_HDFS=ON``: Arrow integration with libhdfs for accessing the Hadoop + Filesystem +* ``-DARROW_HIVESERVER2=ON``: Client library for HiveServer2 database protocol +* ``-DARROW_JEMALLOC=ON``: Build the Arrow jemalloc-based allocator, on by default +* ``-DARROW_JSON=ON``: JSON reader module +* ``-DARROW_MIMALLOC=ON``: Build the Arrow mimalloc-based allocator +* ``-DARROW_ORC=ON``: Arrow integration with Apache ORC +* ``-DARROW_PARQUET=ON``: Apache Parquet libraries and Arrow integration +* ``-DARROW_PLASMA=ON``: Plasma Shared Memory Object Store +* ``-DARROW_PLASMA_JAVA_CLIENT=ON``: Build Java client for Plasma +* ``-DARROW_PYTHON=ON``: Arrow Python C++ integration library (required for + building pyarrow). This library must be built against the same Python version + for which you are building pyarrow. NumPy must also be installed. Enabling + this option also enables ``ARROW_COMPUTE``, ``ARROW_CSV``, ``ARROW_DATASET``, + ``ARROW_FILESYSTEM``, ``ARROW_HDFS``, and ``ARROW_JSON``. 
+* ``-DARROW_S3=ON``: Support for Amazon S3-compatible filesystems
+* ``-DARROW_WITH_RE2=ON``: Build with support for regular expressions using the re2
+  library, on by default and used when ``ARROW_COMPUTE`` or ``ARROW_GANDIVA`` is ``ON``
+* ``-DARROW_WITH_UTF8PROC=ON``: Build with support for Unicode properties using
+  the utf8proc library, on by default and used when ``ARROW_COMPUTE`` or ``ARROW_GANDIVA``
+  is ``ON``
+* ``-DARROW_TENSORFLOW=ON``: Build Arrow with TensorFlow support enabled
+
+Compression options available in Arrow are:
+
+* ``-DARROW_WITH_BROTLI=ON``: Build support for Brotli compression
+* ``-DARROW_WITH_BZ2=ON``: Build support for BZ2 compression
+* ``-DARROW_WITH_LZ4=ON``: Build support for lz4 compression
+* ``-DARROW_WITH_SNAPPY=ON``: Build support for Snappy compression
+* ``-DARROW_WITH_ZLIB=ON``: Build support for zlib (gzip) compression
+* ``-DARROW_WITH_ZSTD=ON``: Build support for ZSTD compression
+
+Some features of the core Arrow shared library can be switched off for improved
+build times if they are not required for your application:
+
+* ``-DARROW_IPC=ON``: build the IPC extensions
+
+Optional Targets
+~~~~~~~~~~~~~~~~
+
+For development builds, you will often want to enable additional targets in
+order to exercise your changes, using the following ``cmake`` options:
+
+* ``-DARROW_BUILD_BENCHMARKS=ON``: Build executable benchmarks.
+* ``-DARROW_BUILD_EXAMPLES=ON``: Build examples of using the Arrow C++ API.
+* ``-DARROW_BUILD_INTEGRATION=ON``: Build additional executables that are
+  used to exercise protocol interoperability between the different Arrow
+  implementations.
+* ``-DARROW_BUILD_UTILITIES=ON``: Build executable utilities.
+* ``-DARROW_BUILD_TESTS=ON``: Build executable unit tests.
+* ``-DARROW_ENABLE_TIMING_TESTS=ON``: If building unit tests, enable those
+  unit tests that rely on wall-clock timing (this flag is disabled on CI
+  because it can make test results flaky).
+* ``-DARROW_FUZZING=ON``: Build fuzz targets and related executables.
+
+Optional Checks
+~~~~~~~~~~~~~~~
+
+The following special checks are available as well. They instrument the
+generated code in various ways so as to detect select classes of problems
+at runtime (for example when executing unit tests).
+
+* ``-DARROW_USE_ASAN=ON``: Enable Address Sanitizer to check for memory leaks,
+  buffer overflows or other kinds of memory management issues.
+* ``-DARROW_USE_TSAN=ON``: Enable Thread Sanitizer to check for races in
+  multi-threaded code.
+* ``-DARROW_USE_UBSAN=ON``: Enable Undefined Behavior Sanitizer to check for
+  situations which trigger C++ undefined behavior.
+
+Some of those options are mutually incompatible, so you may have to build
+several times with different options if you want to exercise all of them.
+
+CMake version requirements
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+While we support CMake 3.5 and higher, some features require a newer version of
+CMake:
+
+* Building the benchmarks requires 3.6 or higher
+* Building zstd from source requires 3.7 or higher
+* Building Gandiva JNI bindings requires 3.11 or higher
+
+LLVM and Clang Tools
+~~~~~~~~~~~~~~~~~~~~
+
+We are currently using LLVM 8 for library builds and for other developer tools
+such as code formatting with ``clang-format``. LLVM can be installed via most
+modern package managers (apt, yum, conda, Homebrew, vcpkg, chocolatey).
+
+..
_cpp-build-dependency-management: + +Build Dependency Management +=========================== + +The build system supports a number of third-party dependencies + + * ``AWSSDK``: for S3 support, requires system cURL and can use the + ``BUNDLED`` method described below + * ``benchmark``: Google benchmark, for testing + * ``Boost``: for cross-platform support + * ``Brotli``: for data compression + * ``BZip2``: for data compression + * ``c-ares``: a dependency of gRPC + * ``gflags``: for command line utilities (formerly Googleflags) + * ``GLOG``: for logging + * ``google_cloud_cpp_storage``: for Google Cloud Storage support, requires + system cURL and can use the ``BUNDLED`` method described below + * ``gRPC``: for remote procedure calls + * ``GTest``: Googletest, for testing + * ``LLVM``: a dependency of Gandiva + * ``Lz4``: for data compression + * ``ORC``: for Apache ORC format support + * ``re2``: for compute kernels and Gandiva, a dependency of gRPC + * ``Protobuf``: Google Protocol Buffers, for data serialization + * ``RapidJSON``: for data serialization + * ``Snappy``: for data compression + * ``Thrift``: Apache Thrift, for data serialization + * ``utf8proc``: for compute kernels + * ``ZLIB``: for data compression + * ``zstd``: for data compression + +The CMake option ``ARROW_DEPENDENCY_SOURCE`` is a global option that instructs +the build system how to resolve each dependency. There are a few options: + +* ``AUTO``: Try to find package in the system default locations and build from + source if not found +* ``BUNDLED``: Building the dependency automatically from source +* ``SYSTEM``: Finding the dependency in system paths using CMake's built-in + ``find_package`` function, or using ``pkg-config`` for packages that do not + have this feature +* ``CONDA``: Use ``$CONDA_PREFIX`` as alternative ``SYSTEM`` PATH +* ``VCPKG``: Find dependencies installed by vcpkg, and if not found, run + ``vcpkg install`` to install them +* ``BREW``: Use Homebrew default paths as an alternative ``SYSTEM`` path + +The default method is ``AUTO`` unless you are developing within an active conda +environment (detected by presence of the ``$CONDA_PREFIX`` environment +variable), in which case it is ``CONDA``. + +Individual Dependency Resolution +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +While ``-DARROW_DEPENDENCY_SOURCE=$SOURCE`` sets a global default for all +packages, the resolution strategy can be overridden for individual packages by +setting ``-D$PACKAGE_NAME_SOURCE=..``. For example, to build Protocol Buffers +from source, set + +.. code-block:: shell + + -DProtobuf_SOURCE=BUNDLED + +This variable is unfortunately case-sensitive; the name used for each package +is listed above, but the most up-to-date listing can be found in +`cpp/cmake_modules/ThirdpartyToolchain.cmake <https://github.com/apache/arrow/blob/master/cpp/cmake_modules/ThirdpartyToolchain.cmake>`_. + +Bundled Dependency Versions +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When using the ``BUNDLED`` method to build a dependency from source, the +version number from ``cpp/thirdparty/versions.txt`` is used. There is also a +dependency source downloader script (see below), which can be used to set up +offline builds. + +When using ``BUNDLED`` for dependency resolution (and if you use either the +jemalloc or mimalloc allocators, which are recommended), statically linking the +Arrow libraries in a third party project is more complex. See below for +instructions about how to configure your build system in this case. 
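+
+As a concrete example of the options described in this section, a
+configuration that prefers system packages but builds a couple of
+dependencies from source might look like the following sketch (the package
+selection here is only illustrative; names come from the case-sensitive list
+above):
+
+.. code-block:: shell
+
+   cmake .. -GNinja \
+         -DARROW_DEPENDENCY_SOURCE=SYSTEM \
+         -DProtobuf_SOURCE=BUNDLED \
+         -Dzstd_SOURCE=BUNDLED \
+         -DARROW_WITH_ZSTD=ON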
+ +Boost-related Options +~~~~~~~~~~~~~~~~~~~~~ + +We depend on some Boost C++ libraries for cross-platform support. In most cases, +the Boost version available in your package manager may be new enough, and the +build system will find it automatically. If you have Boost installed in a +non-standard location, you can specify it by passing +``-DBOOST_ROOT=$MY_BOOST_ROOT`` or setting the ``BOOST_ROOT`` environment +variable. + +Offline Builds +~~~~~~~~~~~~~~ + +If you do not use the above variables to direct the Arrow build system to +preinstalled dependencies, they will be built automatically by the Arrow build +system. The source archive for each dependency will be downloaded via the +internet, which can cause issues in environments with limited access to the +internet. + +To enable offline builds, you can download the source artifacts yourself and +use environment variables of the form ``ARROW_$LIBRARY_URL`` to direct the +build system to read from a local file rather than accessing the internet. + +To make this easier for you, we have prepared a script +``thirdparty/download_dependencies.sh`` which will download the correct version +of each dependency to a directory of your choosing. It will print a list of +bash-style environment variable statements at the end to use for your build +script. + +.. code-block:: shell + + # Download tarballs into $HOME/arrow-thirdparty + $ ./thirdparty/download_dependencies.sh $HOME/arrow-thirdparty + +You can then invoke CMake to create the build directory and it will use the +declared environment variable pointing to downloaded archives instead of +downloading them (one for each build dir!). + +Statically Linking +~~~~~~~~~~~~~~~~~~ + +When ``-DARROW_BUILD_STATIC=ON``, all build dependencies built as static +libraries by the Arrow build system will be merged together to create a static +library ``arrow_bundled_dependencies``. In UNIX-like environments (Linux, macOS, +MinGW), this is called ``libarrow_bundled_dependencies.a`` and on Windows with +Visual Studio ``arrow_bundled_dependencies.lib``. This "dependency bundle" +library is installed in the same place as the other Arrow static libraries. + +If you are using CMake, the bundled dependencies will automatically be included +when linking if you use the ``arrow_static`` CMake target. In other build +systems, you may need to explicitly link to the dependency bundle. We created +an `example CMake-based build configuration +<https://github.com/apache/arrow/tree/master/cpp/examples/minimal_build>`_ to +show you a working example. + +On Linux and macOS, if your application does not link to the ``pthread`` +library already, you must include ``-pthread`` in your linker setup. In CMake +this can be accomplished with the ``Threads`` built-in package: + +.. code-block:: cmake + + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) + target_link_libraries(my_target PRIVATE Threads::Threads) + +Extra debugging help +~~~~~~~~~~~~~~~~~~~~ + +If you use the CMake option ``-DARROW_EXTRA_ERROR_CONTEXT=ON`` it will compile +the libraries with extra debugging information on error checks inside the +``RETURN_NOT_OK`` macro. In unit tests with ``ASSERT_OK``, this will yield error +outputs like: + +.. 
code-block:: shell
+
+   ../src/arrow/ipc/ipc-read-write-test.cc:609: Failure
+   Failed
+   ../src/arrow/ipc/metadata-internal.cc:508 code: TypeToFlatbuffer(fbb, *field.type(), &children, &layout, &type_enum, dictionary_memo, &type_offset)
+   ../src/arrow/ipc/metadata-internal.cc:598 code: FieldToFlatbuffer(fbb, *schema.field(i), dictionary_memo, &offset)
+   ../src/arrow/ipc/metadata-internal.cc:651 code: SchemaToFlatbuffer(fbb, schema, dictionary_memo, &fb_schema)
+   ../src/arrow/ipc/writer.cc:697 code: WriteSchemaMessage(schema_, dictionary_memo_, &schema_fb)
+   ../src/arrow/ipc/writer.cc:730 code: WriteSchema()
+   ../src/arrow/ipc/writer.cc:755 code: schema_writer.Write(&dictionaries_)
+   ../src/arrow/ipc/writer.cc:778 code: CheckStarted()
+   ../src/arrow/ipc/ipc-read-write-test.cc:574 code: writer->WriteRecordBatch(batch)
+   NotImplemented: Unable to convert type: decimal(19, 4)
+
+Deprecations and API Changes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We use the compiler definition ``ARROW_NO_DEPRECATED_API`` to disable APIs that
+have been deprecated. It is a good practice to compile third party applications
+with this flag to proactively catch and account for API changes.
+
+Modular Build Targets
+~~~~~~~~~~~~~~~~~~~~~
+
+Since there are several major parts of the C++ project, we have provided
+modular CMake targets for building each library component, group of unit tests
+and benchmarks, and their dependencies:
+
+* ``make arrow`` for Arrow core libraries
+* ``make parquet`` for Parquet libraries
+* ``make gandiva`` for Gandiva (LLVM expression compiler) libraries
+* ``make plasma`` for Plasma libraries and server
+
+.. note::
+   If you have selected Ninja as CMake generator, replace ``make arrow`` with
+   ``ninja arrow``, and so on.
+
+To build the unit tests or benchmarks, add ``-tests`` or ``-benchmarks``
+to the target name. So ``make arrow-tests`` will build the Arrow core unit
+tests. Using the ``-all`` target, e.g. ``parquet-all``, will build everything.
+
+If you wish to only build and install one or more project subcomponents, we
+have provided the CMake option ``ARROW_OPTIONAL_INSTALL`` to only install
+targets that have been built. For example, if you only wish to build the
+Parquet libraries, its tests, and its dependencies, you can run:
+
+.. code-block:: shell
+
+   cmake .. -DARROW_PARQUET=ON \
+         -DARROW_OPTIONAL_INSTALL=ON \
+         -DARROW_BUILD_TESTS=ON
+   make parquet
+   make install
+
+If you omit an explicit target when invoking ``make``, all targets will be
+built.
+
+Debugging with Xcode on macOS
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Xcode is the IDE provided with macOS and can be used to develop and debug Arrow
+by generating an Xcode project:
+
+.. code-block:: shell
+
+   cd cpp
+   mkdir xcode-build
+   cd xcode-build
+   cmake .. -G Xcode -DARROW_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=DEBUG
+   open arrow.xcodeproj
+
+This will generate a project and open it in the Xcode app. As an alternative,
+the command ``xcodebuild`` will perform a command-line build using the
+generated project. It is recommended to use the "Automatically Create Schemes"
+option when first launching the project. Selecting an auto-generated scheme
+will allow you to build and run a unit test with breakpoints enabled.
diff --git a/src/arrow/docs/source/developers/cpp/conventions.rst b/src/arrow/docs/source/developers/cpp/conventions.rst
new file mode 100644
index 000000000..9db15fbcf
--- /dev/null
+++ b/src/arrow/docs/source/developers/cpp/conventions.rst
@@ -0,0 +1,90 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+..
diff --git a/src/arrow/docs/source/developers/cpp/conventions.rst b/src/arrow/docs/source/developers/cpp/conventions.rst
new file mode 100644
index 000000000..9db15fbcf
--- /dev/null
+++ b/src/arrow/docs/source/developers/cpp/conventions.rst
@@ -0,0 +1,90 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. highlight:: cpp
+
+===========
+Conventions
+===========
+
+This section describes the abstractions and development approaches we use to
+solve problems common to many parts of the C++ project.
+
+File Naming
+===========
+
+C++ source and header files should use underscores for word separation, not hyphens.
+Compiled executables, however, will automatically use hyphens (such that
+e.g. ``src/arrow/scalar_test.cc`` will be compiled into ``arrow-scalar-test``).
+
+C++ header files use the ``.h`` extension. Any header file name not
+containing ``internal`` is considered to be a public header, and will be
+automatically installed by the build.
+
+Comments and Docstrings
+=======================
+
+Regular comments start with ``//``.
+
+Doxygen docstrings start with ``///``, and Doxygen directives start with ``\``,
+like this::
+
+  /// \brief Allocate a fixed size mutable buffer from a memory pool, zero its padding.
+  ///
+  /// \param[in] size size of buffer to allocate
+  /// \param[in] pool a memory pool
+  ARROW_EXPORT
+  Result<std::unique_ptr<Buffer>> AllocateBuffer(const int64_t size,
+                                                 MemoryPool* pool = NULLPTR);
+
+The summary line of a docstring uses the infinitive, not the indicative
+(for example, "Allocate a buffer" rather than "Allocates a buffer").
+
+Memory Pools
+============
+
+We provide a default memory pool with ``arrow::default_memory_pool()``.
+
+Error Handling and Exceptions
+=============================
+
+For error handling, we return ``arrow::Status`` values instead of throwing C++
+exceptions. Since the Arrow C++ libraries are intended to be useful as a
+component in larger C++ projects, using ``Status`` objects can help with good
+code hygiene by making explicit when a function is expected to be able to fail.
+
+A more recent option is to return an ``arrow::Result<T>`` object that can
+represent either a successful result with a ``T`` value, or an error result
+with a ``Status`` value.
+
+For expressing internal invariants and "cannot fail" errors, we use ``DCHECK`` macros
+defined in ``arrow/util/logging.h``. These checks are disabled in release builds
+and are intended to catch internal development errors, particularly when
+refactoring. These macros are not to be included in any public header files.
+
+Since we do not use exceptions, we avoid doing expensive work in object
+constructors. Objects that are expensive to construct may often have private
+constructors, with public static factory methods that return ``Status`` or
+``Result<T>``.
+
+There are a number of object constructors, such as those of ``arrow::Schema``
+and ``arrow::RecordBatch``, where larger STL container objects like
+``std::vector`` may be created.
+While it is possible for ``std::bad_alloc`` to be thrown in these
+constructors, the circumstances under which that would happen are somewhat
+esoteric, and it is likely that an application would have encountered other,
+more serious problems before ``std::bad_alloc`` is thrown in a constructor.
diff --git a/src/arrow/docs/source/developers/cpp/development.rst b/src/arrow/docs/source/developers/cpp/development.rst
new file mode 100644
index 000000000..4098f1c4e
--- /dev/null
+++ b/src/arrow/docs/source/developers/cpp/development.rst
@@ -0,0 +1,294 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+======================
+Development Guidelines
+======================
+
+This section provides information for developers who wish to contribute to the
+C++ codebase.
+
+.. note::
+
+   Since most of the project's developers work on Linux or macOS, not all
+   features or developer tools are uniformly supported on Windows. If you are
+   on Windows, have a look at :ref:`developers-cpp-windows`.
+
+Compiler warning levels
+=======================
+
+The ``BUILD_WARNING_LEVEL`` CMake option switches between sets of predetermined
+compiler warning levels that we use for code tidiness. For release builds, the
+default warning level is ``PRODUCTION``, while for debug builds the default is
+``CHECKIN``.
+
+When using ``CHECKIN`` for debug builds, ``-Werror`` is added when compiling
+with gcc and clang, causing the build to fail on any warning; with MSVC,
+``/WX`` is set to the same effect.
+
+Running unit tests
+==================
+
+The ``-DARROW_BUILD_TESTS=ON`` CMake option enables building of unit test
+executables. You can then either run them individually, by launching the
+desired executable, or run them all at once by launching the ``ctest``
+executable (which is part of the CMake suite).
+
+A possible invocation is something like::
+
+   $ ctest -j16 --output-on-failure
+
+where the ``-j16`` option runs up to 16 tests in parallel, taking advantage
+of multiple CPU cores and hardware threads.
+
+Running benchmarks
+==================
+
+The ``-DARROW_BUILD_BENCHMARKS=ON`` CMake option enables building of benchmark
+executables. You can then run benchmarks individually by launching the
+corresponding executable from the command line, e.g.::
+
+   $ ./build/release/arrow-builder-benchmark
+
+.. note::
+   For meaningful benchmark numbers, it is very strongly recommended to build
+   in ``Release`` mode, so as to enable compiler optimizations.
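+
+The benchmark executables are based on Google Benchmark, so the usual Google
+Benchmark command-line flags (such as ``--benchmark_filter``) should be
+accepted. As a sketch (the filter pattern below is purely illustrative, not
+the name of an actual benchmark):
+
+.. code-block:: shell
+
+   # Run only benchmarks whose name matches a regular expression, and repeat
+   # each of them a few times for more stable numbers.
+   $ ./build/release/arrow-builder-benchmark \
+       --benchmark_filter=BuildPrimitiveArray \
+       --benchmark_repetitions=3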
+
+Code Style, Linting, and CI
+===========================
+
+This project follows `Google's C++ Style Guide
+<https://google.github.io/styleguide/cppguide.html>`_ with minor exceptions:
+
+* We relax the line length restriction to 90 characters.
+* We use the ``NULLPTR`` macro in header files (instead of ``nullptr``) defined
+  in ``src/arrow/util/macros.h`` to support building C++/CLI (ARROW-1134)
+* We relax the guide's rules regarding structs. For public headers we should
+  use struct only for objects that are principally simple data containers where
+  it is OK to expose all the internal members and any methods are primarily
+  conveniences. For private headers the rules are relaxed further and structs
+  can be used where convenient for types that do not need access control even
+  though they may not be simple data containers.
+
+Our continuous integration builds on GitHub Actions run the unit test
+suites on a variety of platforms and configurations, including using
+Address Sanitizer and Undefined Behavior Sanitizer to check for various
+patterns of misbehaviour such as memory leaks. In addition, the
+codebase is subjected to a number of code style and code cleanliness checks.
+
+In order to have a passing CI build, your modified git branch must pass the
+following checks:
+
+* C++ builds with the project's active version of ``clang`` without
+  compiler warnings with ``-DBUILD_WARNING_LEVEL=CHECKIN``. Note that
+  there are classes of warnings (such as ``-Wdocumentation``, see more
+  on this below) that are not caught by ``gcc``.
+* CMake files pass style checks; these can be fixed by running
+  ``archery lint --cmake-format --fix``. This requires Python
+  3 and `cmake_format <https://github.com/cheshirekow/cmake_format>`_ (note:
+  this currently does not work on Windows)
+* Passes various C++ (and other) style checks, checked with the ``lint``
+  subcommand to :ref:`Archery <archery>`. This can also be fixed locally
+  by running ``archery lint --cpplint --fix``.
+
+In order to account for variations in the behavior of ``clang-format`` between
+major versions of LLVM, we pin the version of ``clang-format`` used (currently
+LLVM 8).
+
+Depending on how you installed clang-format, the build system may not be able
+to find it. You can provide an explicit path to your LLVM installation (or the
+root path for the clang tools) with the environment variable
+``$CLANG_TOOLS_PATH`` or by passing ``-DClangTools_PATH=$PATH_TO_CLANG_TOOLS`` when
+invoking CMake.
+
+To make linting more reproducible for everyone, we provide a ``docker-compose``
+target that is executable from the root of the repository:
+
+.. code-block:: shell
+
+   docker-compose run ubuntu-lint
+
+Cleaning includes with include-what-you-use (IWYU)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We occasionally use Google's `include-what-you-use
+<https://github.com/include-what-you-use/include-what-you-use>`_ tool, also
+known as IWYU, to remove unnecessary imports.
+
+To begin using IWYU, you must first build it by following the instructions in
+the project's documentation. Once the ``include-what-you-use`` executable is in
+your ``$PATH``, you must run CMake with ``-DCMAKE_EXPORT_COMPILE_COMMANDS=ON``
+in a new out-of-source CMake build directory like so:
+
+.. 
code-block:: shell + + mkdir -p $ARROW_ROOT/cpp/iwyu + cd $ARROW_ROOT/cpp/iwyu + cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ + -DARROW_PYTHON=ON \ + -DARROW_PARQUET=ON \ + -DARROW_FLIGHT=ON \ + -DARROW_PLASMA=ON \ + -DARROW_GANDIVA=ON \ + -DARROW_BUILD_BENCHMARKS=ON \ + -DARROW_BUILD_BENCHMARKS_REFERENCE=ON \ + -DARROW_BUILD_TESTS=ON \ + -DARROW_BUILD_UTILITIES=ON \ + -DARROW_S3=ON \ + -DARROW_WITH_BROTLI=ON \ + -DARROW_WITH_BZ2=ON \ + -DARROW_WITH_LZ4=ON \ + -DARROW_WITH_SNAPPY=ON \ + -DARROW_WITH_ZLIB=ON \ + -DARROW_WITH_ZSTD=ON .. + +In order for IWYU to run on the desired component in the codebase, it must be +enabled by the CMake configuration flags. Once this is done, you can run IWYU +on the whole codebase by running a helper ``iwyu.sh`` script: + +.. code-block:: shell + + IWYU_SH=$ARROW_ROOT/cpp/build-support/iwyu/iwyu.sh + ./$IWYU_SH + +Since this is very time consuming, you can check a subset of files matching +some string pattern with the special "match" option + +.. code-block:: shell + + ./$IWYU_SH match $PATTERN + +For example, if you wanted to do IWYU checks on all files in +``src/arrow/array``, you could run + +.. code-block:: shell + + ./$IWYU_SH match arrow/array + +Checking for ABI and API stability +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To build ABI compliance reports, you need to install the two tools +``abi-dumper`` and ``abi-compliance-checker``. + +Build Arrow C++ in Debug mode, alternatively you could use ``-Og`` which also +builds with the necessary symbols but includes a bit of code optimization. +Once the build has finished, you can generate ABI reports using: + +.. code-block:: shell + + abi-dumper -lver 9 debug/libarrow.so -o ABI-9.dump + +The above version number is freely selectable. As we want to compare versions, +you should now ``git checkout`` the version you want to compare it to and re-run +the above command using a different version number. Once both reports are +generated, you can build a comparison report using + +.. code-block:: shell + + abi-compliance-checker -l libarrow -d1 ABI-PY-9.dump -d2 ABI-PY-10.dump + +The report is then generated in ``compat_reports/libarrow`` as a HTML. + +API Documentation +================= + +We use Doxygen style comments (``///``) in header files for comments +that we wish to show up in API documentation for classes and +functions. + +When using ``clang`` and building with +``-DBUILD_WARNING_LEVEL=CHECKIN``, the ``-Wdocumentation`` flag is +used which checks for some common documentation inconsistencies, like +documenting some, but not all function parameters with ``\param``. See +the `LLVM documentation warnings section +<https://releases.llvm.org/7.0.1/tools/clang/docs/DiagnosticsReference.html#wdocumentation>`_ +for more about this. + +While we publish the API documentation as part of the main Sphinx-based +documentation site, you can also build the C++ API documentation anytime using +Doxygen. Run the following command from the ``cpp/apidoc`` directory: + +.. code-block:: shell + + doxygen Doxyfile + +This requires `Doxygen <https://www.doxygen.org>`_ to be installed. + +Apache Parquet Development +========================== + +To build the C++ libraries for Apache Parquet, add the flag +``-DARROW_PARQUET=ON`` when invoking CMake. +To build Apache Parquet with encryption support, add the flag +``-DPARQUET_REQUIRE_ENCRYPTION=ON`` when invoking CMake. The Parquet libraries and unit tests +can be built with the ``parquet`` make target: + +.. 
code-block:: shell
+
+   make parquet
+
+On Linux and macOS, if you do not have Apache Thrift installed on your system,
+or you are building with ``-DThrift_SOURCE=BUNDLED``, you must install the
+``bison`` and ``flex`` packages. On Windows we handle these build dependencies
+automatically when building Thrift from source.
+
+Running ``ctest -L unittest`` will run all built C++ unit tests, while ``ctest -L
+parquet`` will run only the Parquet unit tests. The unit tests rely on an
+environment variable ``PARQUET_TEST_DATA`` that points to data provided by a git
+submodule of the repository https://github.com/apache/parquet-testing:
+
+.. code-block:: shell
+
+   git submodule update --init
+   export PARQUET_TEST_DATA=$ARROW_ROOT/cpp/submodules/parquet-testing/data
+
+Here ``$ARROW_ROOT`` is the absolute path to the Arrow codebase.
+
+Arrow Flight RPC
+================
+
+In addition to the Arrow dependencies, Flight requires:
+
+* gRPC (>= 1.14, roughly)
+* Protobuf (>= 3.6, earlier versions may work)
+* c-ares (used by gRPC)
+
+By default, Arrow will try to download and build these dependencies
+when building Flight.
+
+The optional ``flight`` libraries and tests can be built by passing
+``-DARROW_FLIGHT=ON``.
+
+.. code-block:: shell
+
+   cmake .. -DARROW_FLIGHT=ON -DARROW_BUILD_TESTS=ON
+   make
+
+You can also use existing installations of the extra dependencies.
+When building, set the environment variables ``gRPC_ROOT`` and/or
+``Protobuf_ROOT`` and/or ``c-ares_ROOT``.
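+
+As a rough sketch, pointing the build at preinstalled Flight dependencies might
+look like the following (the installation prefixes are placeholders, not
+defaults shipped by any package):
+
+.. code-block:: shell
+
+   # Prefixes below are illustrative; point them at your actual installations.
+   export gRPC_ROOT=/opt/grpc
+   export Protobuf_ROOT=/opt/protobuf
+   cmake .. -DARROW_FLIGHT=ON -DARROW_BUILD_TESTS=ON
+   make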
+
+We are developing against recent versions of gRPC. The ``grpc-cpp`` package
+available from https://conda-forge.org/ is one reliable way to obtain gRPC in a
+cross-platform way. You may try using system libraries for gRPC and Protobuf,
+but these are likely to be too old. On macOS, you can
+try `Homebrew <https://brew.sh/>`_:
+
+.. code-block:: shell
+
+   brew install grpc
diff --git a/src/arrow/docs/source/developers/cpp/fuzzing.rst b/src/arrow/docs/source/developers/cpp/fuzzing.rst
new file mode 100644
index 000000000..41398a13d
--- /dev/null
+++ b/src/arrow/docs/source/developers/cpp/fuzzing.rst
@@ -0,0 +1,99 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+=================
+Fuzzing Arrow C++
+=================
+
+To make the handling of invalid input more robust, we have enabled
+fuzz testing on several parts of the Arrow C++ feature set, currently:
+
+* the IPC stream format
+* the IPC file format
+* the Parquet file format
+
+We welcome any contribution to expand the scope of fuzz testing and cover
+additional areas that ingest potentially invalid or malicious data.
+
+Fuzz Targets and Utilities
+==========================
+
+By passing the ``-DARROW_FUZZING=ON`` CMake option, you will build
+the fuzz targets corresponding to the aforementioned Arrow features, as well
+as additional related utilities.
+
+Generating the seed corpus
+--------------------------
+
+Fuzzing essentially explores the domain space by randomly mutating previously
+tested inputs, without having any high-level understanding of the area being
+fuzz-tested. However, the domain space is so huge that this strategy alone
+may fail to actually produce any "interesting" inputs.
+
+To guide the process, it is therefore important to provide a *seed corpus*
+of valid (or invalid, but remarkable) inputs from which the fuzzing
+infrastructure can derive new inputs for testing. A script is provided
+to automate that task. Assuming the fuzzing executables can be found in
+``build/debug``, the seed corpus can be generated as follows:
+
+.. code-block:: shell
+
+   $ ./build-support/fuzzing/generate_corpuses.sh build/debug
+
+Continuous fuzzing infrastructure
+=================================
+
+The process of fuzz testing is computationally intensive and therefore
+benefits from dedicated computing facilities. Arrow C++ is exercised by
+the `OSS-Fuzz`_ continuous fuzzing infrastructure operated by Google.
+
+Issues found by OSS-Fuzz are reported to, and visible to, a limited set of
+`core developers <https://github.com/google/oss-fuzz/blob/master/projects/arrow/project.yaml>`_.
+If you are an Arrow core developer and want to be added to that list, you can
+ask on the :ref:`mailing-list <contributing>`.
+
+.. _OSS-Fuzz: https://google.github.io/oss-fuzz/
+
+Reproducing locally
+===================
+
+When a crash is found by fuzzing, it is often useful to download the data
+used to produce the crash, and use it to reproduce the crash so as to debug
+and investigate.
+
+Assuming you are in a subdirectory inside ``cpp``, the following command
+would allow you to build the fuzz targets with debug information and the
+various sanitizer checks enabled.
+
+.. code-block:: shell
+
+   $ cmake .. -GNinja \
+       -DCMAKE_BUILD_TYPE=Debug \
+       -DARROW_USE_ASAN=on \
+       -DARROW_USE_UBSAN=on \
+       -DARROW_FUZZING=on
+
+Then, assuming you have downloaded the crashing data file (let's call it
+``testcase-arrow-ipc-file-fuzz-123465``), you can reproduce the crash
+by running the affected fuzz target on that file:
+
+.. code-block:: shell
+
+   $ build/debug/arrow-ipc-file-fuzz testcase-arrow-ipc-file-fuzz-123465
+
+(you may want to run that command under a debugger so as to inspect the
+program state more closely)
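+
+For instance, a minimal sketch of doing that under ``gdb`` (any debugger will
+do; the crash file name is the hypothetical one from above):
+
+.. code-block:: shell
+
+   # Run the fuzz target under gdb and show a backtrace at the crash site
+   $ gdb --args build/debug/arrow-ipc-file-fuzz testcase-arrow-ipc-file-fuzz-123465
+   (gdb) run
+   (gdb) backtrace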
diff --git a/src/arrow/docs/source/developers/cpp/index.rst b/src/arrow/docs/source/developers/cpp/index.rst
new file mode 100644
index 000000000..36c9778be
--- /dev/null
+++ b/src/arrow/docs/source/developers/cpp/index.rst
@@ -0,0 +1,31 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _cpp-development:
+
+***************
+C++ Development
+***************
+
+.. toctree::
+   :maxdepth: 2
+
+   building
+   development
+   windows
+   conventions
+   fuzzing
diff --git a/src/arrow/docs/source/developers/cpp/windows.rst b/src/arrow/docs/source/developers/cpp/windows.rst
new file mode 100644
index 000000000..ee5a613bc
--- /dev/null
+++ b/src/arrow/docs/source/developers/cpp/windows.rst
@@ -0,0 +1,412 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _developers-cpp-windows:
+
+=====================
+Developing on Windows
+=====================
+
+As with Linux and macOS, we have worked to enable builds to work "out of the box"
+with CMake for a reasonably large subset of the project.
+
+.. _windows-system-setup:
+
+System Setup
+============
+
+Microsoft provides the free Visual Studio Community edition. When doing
+development in the shell, you must initialize the development environment
+each time you open the shell.
+
+For Visual Studio 2017, execute the following batch script:
+
+.. code-block:: shell
+
+   "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\Tools\VsDevCmd.bat" -arch=amd64
+
+For Visual Studio 2019, the script is:
+
+.. code-block:: shell
+
+   "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\Tools\VsDevCmd.bat" -arch=amd64
+
+One can configure a console emulator like `cmder <https://cmder.net/>`_ to
+automatically launch this when starting a new development console.
+
+Using conda-forge for build dependencies
+========================================
+
+`Miniconda <https://conda.io/miniconda.html>`_ is a minimal Python distribution
+including the `conda <https://conda.io>`_ package manager. Some members of the
+Apache Arrow community participate in the maintenance of `conda-forge
+<https://conda-forge.org/>`_, a community-maintained cross-platform package
+repository for conda.
+
+To use ``conda-forge`` for your C++ build dependencies on Windows, first
+download and install a 64-bit distribution from the `Miniconda homepage
+<https://conda.io/miniconda.html>`_.
+
+To configure ``conda`` to use the ``conda-forge`` channel by default, launch a
+command prompt (``cmd.exe``), run the initialization command shown
+:ref:`above<windows-system-setup>` (``vcvarsall.bat`` or ``VsDevCmd.bat``), then
+run the command:
+
+.. code-block:: shell
+
+   conda config --add channels conda-forge
+
+Now, you can bootstrap a build environment (call from the root directory of the
+Arrow codebase):
+
+.. code-block:: shell
+
+   conda create -y -n arrow-dev --file=ci\conda_env_cpp.txt
+
+Then "activate" this conda environment with:
+
+.. 
code-block:: shell + + activate arrow-dev + +If the environment has been activated, the Arrow build system will +automatically see the ``%CONDA_PREFIX%`` environment variable and use that for +resolving the build dependencies. This is equivalent to setting + +.. code-block:: shell + + -DARROW_DEPENDENCY_SOURCE=SYSTEM ^ + -DARROW_PACKAGE_PREFIX=%CONDA_PREFIX%\Library + +To use the Visual Studio IDE with this conda environment activated, launch it by +running the command ``devenv`` from the same command prompt. + +Note that dependencies installed as conda packages are built in release mode and +cannot link with debug builds. If you intend to use ``-DCMAKE_BUILD_TYPE=debug`` +then you must build the packages from source. +``-DCMAKE_BUILD_TYPE=relwithdebinfo`` is also available, which produces a build +that can both be linked with release libraries and be debugged. + +.. note:: + + If you run into any problems using conda packages for dependencies, a very + common problem is mixing packages from the ``defaults`` channel with those + from ``conda-forge``. You can examine the installed packages in your + environment (and their origin) with ``conda list`` + +Using vcpkg for build dependencies +======================================== + +`vcpkg <https://github.com/microsoft/vcpkg>`_ is an open source package manager +from Microsoft. It hosts community-contributed ports of C and C++ packages and +their dependencies. Arrow includes a manifest file `cpp/vcpkg.json +<https://github.com/apache/arrow/blob/master/cpp/vcpkg.json>`_ that specifies +which vcpkg packages are required to build the C++ library. + +To use vcpkg for C++ build dependencies on Windows, first +`install <https://docs.microsoft.com/en-us/cpp/build/install-vcpkg>`_ and +`integrate <https://docs.microsoft.com/en-us/cpp/build/integrate-vcpkg>`_ +vcpkg. Then change working directory in ``cmd.exe`` to the root directory +of Arrow and run the command: + +.. code-block:: shell + + vcpkg install ^ + --triplet x64-windows ^ + --x-manifest-root cpp ^ + --feature-flags=versions ^ + --clean-after-build + +On Windows, vcpkg builds dynamic link libraries by default. Use the triplet +``x64-windows-static`` to build static libraries. vcpkg downloads source +packages and compiles them locally, so installing dependencies with vcpkg is +more time-consuming than with conda. + +Then in your ``cmake`` command, to use dependencies installed by vcpkg, set: + +.. 
code-block:: shell + + -DARROW_DEPENDENCY_SOURCE=VCPKG + +You can optionally set other variables to override the default CMake +configurations for vcpkg, including: + +* ``-DCMAKE_TOOLCHAIN_FILE``: by default, the CMake scripts automatically find + the location of the vcpkg CMake toolchain file ``vcpkg.cmake``; use this to + instead specify its location +* ``-DVCPKG_TARGET_TRIPLET``: by default, the CMake scripts attempt to infer the + vcpkg + `triplet <https://github.com/microsoft/vcpkg/blob/master/docs/users/triplets.md>`_; + use this to instead specify the triplet +* ``-DARROW_DEPENDENCY_USE_SHARED``: default is ``ON``; set to ``OFF`` for + static libraries +* ``-DVCPKG_MANIFEST_MODE``: default is ``ON``; set to ``OFF`` to ignore the + ``vcpkg.json`` manifest file and only look for vcpkg packages that are + already installed under the directory where vcpkg is installed + + +Building using Visual Studio (MSVC) Solution Files +================================================== + +Change working directory in ``cmd.exe`` to the root directory of Arrow and do +an out of source build by generating a MSVC solution: + +.. code-block:: shell + + cd cpp + mkdir build + cd build + cmake .. -G "Visual Studio 15 2017" -A x64 ^ + -DARROW_BUILD_TESTS=ON + cmake --build . --config Release + +For newer versions of Visual Studio, specify the generator +``Visual Studio 16 2019`` or see ``cmake --help`` for available +generators. + +Building with Ninja and clcache +=============================== + +The `Ninja <https://ninja-build.org/>`_ build system offers better build +parallelization, and the optional `clcache +<https://github.com/frerich/clcache/>`_ compiler cache keeps track of +past compilations to avoid running them over and over again (in a way similar +to the Unix-specific ``ccache``). + +Newer versions of Visual Studio include Ninja. To see if your Visual Studio +includes Ninja, run the initialization command shown +:ref:`above<windows-system-setup>` (``vcvarsall.bat`` or ``VsDevCmd.bat``), then +run ``ninja --version``. + +If Ninja is not included in your version of Visual Studio, and you are using +conda, activate your conda environment and install Ninja and clcache: + +.. code-block:: shell + + activate arrow-dev + conda install -c conda-forge ninja + pip install git+https://github.com/frerich/clcache.git + +If you are not using conda, +`install Ninja from another source <https://github.com/ninja-build/ninja/wiki/Pre-built-Ninja-packages>`_ +and optionally +`install clcache from another source <https://github.com/frerich/clcache/wiki/Installation>`_ +. + +After installation is complete, change working directory in ``cmd.exe`` to the root directory of Arrow and +do an out of source build by generating Ninja files: + +.. code-block:: shell + + cd cpp + mkdir build + cd build + cmake -G "Ninja" ^ + -DCMAKE_C_COMPILER=clcache ^ + -DCMAKE_CXX_COMPILER=clcache ^ + -DARROW_BUILD_TESTS=ON ^ + -DGTest_SOURCE=BUNDLED .. + cmake --build . --config Release + +Setting ``CMAKE_C_COMPILER`` and ``CMAKE_CXX_COMPILER`` in the command line +of ``cmake`` is the preferred method of using ``clcache``. Alternatively, you +can set ``CC`` and ``CXX`` environment variables before calling ``cmake``: + +.. code-block:: shell + + ... + set CC=clcache + set CXX=clcache + cmake -G "Ninja" ^ + ... + + + +Building with NMake +=================== + +Change working directory in ``cmd.exe`` to the root directory of Arrow and +do an out of source build using ``nmake``: + +.. 
code-block:: shell
+
+   cd cpp
+   mkdir build
+   cd build
+   cmake -G "NMake Makefiles" ..
+   nmake
+
+Building on MSYS2
+=================
+
+You can build from an MSYS2 terminal, ``cmd.exe`` or a PowerShell terminal.
+
+From an MSYS2 terminal:
+
+.. code-block:: shell
+
+   cd cpp
+   mkdir build
+   cd build
+   cmake -G "MSYS Makefiles" ..
+   make
+
+From ``cmd.exe`` or a PowerShell terminal, you can use the following batch
+file:
+
+.. code-block:: batch
+
+   setlocal
+
+   REM For 64bit
+   set MINGW_PACKAGE_PREFIX=mingw-w64-x86_64
+   set MINGW_PREFIX=c:\msys64\mingw64
+   set MSYSTEM=MINGW64
+
+   set PATH=%MINGW_PREFIX%\bin;c:\msys64\usr\bin;%PATH%
+
+   rmdir /S /Q cpp\build
+   mkdir cpp\build
+   pushd cpp\build
+   cmake -G "MSYS Makefiles" .. || exit /B
+   make || exit /B
+   popd
+
+Debug builds
+============
+
+To build a Debug version of Arrow, you should have pre-installed a Debug
+version of Boost. It's recommended to configure ``cmake`` with the following
+variables for a Debug build:
+
+* ``-DARROW_BOOST_USE_SHARED=OFF``: enables static linking with boost debug
+  libs and simplifies run-time loading of 3rd parties
+* ``-DBOOST_ROOT``: sets the root directory of boost libs. (Optional)
+* ``-DBOOST_LIBRARYDIR``: sets the directory with boost lib files. (Optional)
+
+The command line to build Arrow in Debug mode will look something like this:
+
+.. code-block:: shell
+
+   cd cpp
+   mkdir build
+   cd build
+   cmake .. -G "Visual Studio 15 2017" -A x64 ^
+         -DARROW_BOOST_USE_SHARED=OFF ^
+         -DCMAKE_BUILD_TYPE=Debug ^
+         -DBOOST_ROOT=C:/local/boost_1_63_0 ^
+         -DBOOST_LIBRARYDIR=C:/local/boost_1_63_0/lib64-msvc-14.0
+   cmake --build . --config Debug
+
+Windows dependency resolution issues
+====================================
+
+Because Windows uses ``.lib`` files for both static and dynamic linking of
+dependencies, the static library may sometimes be named something different,
+such as ``%PACKAGE%_static.lib``, to distinguish it. If you are statically
+linking some dependencies, we provide the following options:
+
+* ``-DBROTLI_MSVC_STATIC_LIB_SUFFIX=%BROTLI_SUFFIX%``
+* ``-DSNAPPY_MSVC_STATIC_LIB_SUFFIX=%SNAPPY_SUFFIX%``
+* ``-DLZ4_MSVC_STATIC_LIB_SUFFIX=%LZ4_SUFFIX%``
+* ``-DZSTD_MSVC_STATIC_LIB_SUFFIX=%ZSTD_SUFFIX%``
+
+To get the latest build instructions, you can reference `ci/appveyor-cpp-build.bat
+<https://github.com/apache/arrow/blob/master/ci/appveyor-cpp-build.bat>`_,
+which is used by automated Appveyor builds.
+
+Statically linking to Arrow on Windows
+======================================
+
+The Arrow headers on Windows static library builds (enabled by the CMake
+option ``ARROW_BUILD_STATIC``) use the preprocessor macro ``ARROW_STATIC`` to
+suppress dllimport/dllexport marking of symbols. Projects that statically link
+against Arrow on Windows additionally need this definition. The Unix builds do
+not use the macro.
+
+Replicating Appveyor Builds
+===========================
+
+For developers more familiar with Linux who need to replicate a failing
+AppVeyor build, here are some rough notes on replicating the
+``Static_Crt_Build`` job (``make unittest`` will probably still fail, but many
+unit tests can be built with their individual make targets).
+
+1. Microsoft offers trial VMs for `Windows with Microsoft Visual Studio
+   <https://developer.microsoft.com/en-us/windows/downloads/virtual-machines>`_.
+   Download and install a version.
+2. Run the VM and install `Git <https://git-scm.com/>`_, `CMake
+   <https://cmake.org/>`_, and Miniconda or Anaconda (these instructions assume
+   Anaconda).
+   Also install the `"Build Tools for Visual Studio"
+   <https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2019>`_.
+   Make sure to select the C++ toolchain in the installer wizard, and reboot
+   after installation.
+3. Download `pre-built Boost debug binaries
+   <https://sourceforge.net/projects/boost/files/boost-binaries/>`_ and install
+   them.
+
+   Run this from an Anaconda/Miniconda command prompt (*not* a PowerShell prompt),
+   and make sure to run "vcvarsall.bat x64" first. The location of vcvarsall.bat
+   depends on your installation; it may be under a different path than commonly
+   indicated,
+   e.g. "``C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvarsall.bat``"
+   with the 2019 build tools.
+
+.. code-block:: shell
+
+   cd $EXTRACT_BOOST_DIRECTORY
+   .\bootstrap.bat
+   @rem This is for static libraries needed for static_crt_build in appveyor
+   .\b2 link=static --with-filesystem --with-regex --with-system install
+   @rem this should put libraries and headers in c:\Boost
+
+4. Activate anaconda/miniconda:
+
+.. code-block:: shell
+
+   @rem this might differ for miniconda
+   C:\Users\User\Anaconda3\Scripts\activate
+
+5. Clone the Arrow source code and change into its directory.
+6. Set up environment variables:
+
+.. code-block:: shell
+
+   @rem Change the build type based on which appveyor job you want.
+   SET JOB=Static_Crt_Build
+   SET GENERATOR=Ninja
+   SET APPVEYOR_BUILD_WORKER_IMAGE=Visual Studio 2017
+   SET USE_CLCACHE=false
+   SET ARROW_BUILD_GANDIVA=OFF
+   SET ARROW_LLVM_VERSION=8.0.*
+   SET PYTHON=3.6
+   SET ARCH=64
+   SET PATH=C:\Users\User\Anaconda3;C:\Users\User\Anaconda3\Scripts;C:\Users\User\Anaconda3\Library\bin;%PATH%
+   SET BOOST_LIBRARYDIR=C:\Boost\lib
+   SET BOOST_ROOT=C:\Boost
+
+7. Run the AppVeyor scripts:
+
+.. code-block:: shell
+
+   conda install -c conda-forge --file .\ci\conda_env_cpp.txt
+   .\ci\appveyor-cpp-setup.bat
+   @rem this might fail, but at this point most unit tests should be buildable
+   @rem by their individual targets; see the last line for an example.
+   .\ci\appveyor-cpp-build.bat
+   @rem you can also just invoke cmake directly with the desired options
+   cmake --build . --config Release --target arrow-compute-hash-test
diff --git a/src/arrow/docs/source/developers/crossbow.rst b/src/arrow/docs/source/developers/crossbow.rst
new file mode 100644
index 000000000..cb49a2446
--- /dev/null
+++ b/src/arrow/docs/source/developers/crossbow.rst
@@ -0,0 +1,258 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Packaging and Testing with Crossbow
+===================================
+
+The content of the ``arrow/dev/tasks`` directory aims to automate the process of
+Arrow packaging and integration testing.
+
+Packages:
+ - C++ and Python `conda-forge packages`_ for Linux, Mac and Windows
+ - Python `Wheels`_ for Linux, Mac and Windows
+ - C++ and GLib `Linux packages`_ for multiple distributions
+ - Java for Gandiva
+
+Integration tests:
+ - Various docker tests
+ - Pandas
+ - Dask
+ - Turbodbc
+ - HDFS
+ - Spark
+
+Architecture
+------------
+
+Executors
+~~~~~~~~~
+
+Individual jobs are executed on public CI services, currently:
+
+- Linux: TravisCI, CircleCI, Azure Pipelines
+- Mac: TravisCI, Azure Pipelines
+- Windows: AppVeyor, Azure Pipelines
+
+Queue
+~~~~~
+
+Because of the nature of how the CI services work, the scheduling of
+jobs happens through an additional git repository, which acts like a job
+queue for the tasks. Anyone can host a ``queue`` repository, which is usually
+called ``crossbow``.
+
+A job is a git commit on a particular git branch, containing only the required
+configuration file to run the requested build (like ``.travis.yml``,
+``appveyor.yml`` or ``azure-pipelines.yml``).
+
+Scheduler
+~~~~~~~~~
+
+Crossbow handles version generation, task rendering and
+submission. The tasks are defined in ``tasks.yml``.
+
+Install
+-------
+
+The following guide depends on GitHub, but theoretically any git
+server can be used.
+
+If you are not using the `ursacomputing/crossbow <https://github.com/ursacomputing/crossbow>`_
+repository, you will need to complete the first two steps, otherwise proceed
+to step 3:
+
+1. `Create the queue repository`_
+
+2. Enable `TravisCI`_, `Appveyor`_, `Azure Pipelines`_ and `CircleCI`_
+   integrations for the newly created queue repository.
+
+   - turn off Travis’ `auto cancellation`_ feature on branches
+
+3. Clone either ursacomputing/crossbow if you are using that, or the newly
+   created repository next to the arrow repository:
+
+   By default the script looks for ``crossbow`` next to the arrow repository, but
+   this can be configured through command line arguments.
+
+   .. code:: bash
+
+      git clone https://github.com/<user>/crossbow crossbow
+
+   **Important note:** Crossbow only supports GitHub token based
+   authentication. Although it overwrites the repository urls provided with ssh
+   protocol, it's advisable to use the HTTPS repository URLs.
+
+4. `Create a Personal Access Token`_ with ``repo`` and ``workflow`` permissions (other
+   permissions are not needed)
+
+5. Locally export the token as an environment variable:
+
+   .. code:: bash
+
+      export CROSSBOW_GITHUB_TOKEN=<token>
+
+   or pass it as an argument to the CLI script via ``--github-token``
+
+6. Export the previously created GitHub token on both CI services:
+
+   Use the ``CROSSBOW_GITHUB_TOKEN`` encrypted environment variable. You can
+   set it at the following URLs, where ``ghuser`` is the GitHub
+   username and ``ghrepo`` is the GitHub repository name (typically
+   ``crossbow``):
+
+   - TravisCI: ``https://travis-ci.org/<ghuser>/<ghrepo>/settings``
+   - Appveyor:
+     ``https://ci.appveyor.com/project/<ghuser>/<ghrepo>/settings/environment``
+   - CircleCI:
+     ``https://circleci.com/gh/<ghuser>/<ghrepo>/edit#env-vars``
+
+   On Appveyor, check the ``skip branches without appveyor.yml`` checkbox
+   on the web UI under the crossbow repository’s settings.
+
+7. Install Python (minimum supported version is 3.6):
+
+   Miniconda is preferred; see the installation instructions:
+   https://conda.io/docs/user-guide/install/index.html
+
+8. Install the archery toolset containing crossbow itself:
+
+   .. code:: bash
+
+      pip install -e "arrow/dev/archery[crossbow]"
+
+9. Try running it:
+
+   .. 
code:: bash + + $ archery crossbow --help + +Usage +----- + +The script does the following: + +1. Detects the current repository, thus supports forks. The following + snippet will build kszucs’s fork instead of the upstream apache/arrow + repository. + + .. code:: bash + + $ git clone https://github.com/kszucs/arrow + $ git clone https://github.com/kszucs/crossbow + + $ cd arrow/dev/tasks + $ archery crossbow submit --help # show the available options + $ archery crossbow submit conda-win conda-linux conda-osx + +2. Gets the HEAD commit of the currently checked out branch and + generates the version number based on `setuptools_scm`_. So to build + a particular branch check out before running the script: + + .. code:: bash + + git checkout ARROW-<ticket number> + archery crossbow submit --dry-run conda-linux conda-osx + + Note that the arrow branch must be pushed beforehand, because the + script will clone the selected branch. + +3. Reads and renders the required build configurations with the + parameters substituted. + +4. Create a branch per task, prefixed with the job id. For example to + build conda recipes on linux it will create a new branch: + ``crossbow@build-<id>-conda-linux``. + +5. Pushes the modified branches to GitHub which triggers the builds. For + authentication it uses GitHub OAuth tokens described in the install + section. + +Query the build status +~~~~~~~~~~~~~~~~~~~~~~ + +Build id (which has a corresponding branch in the queue repository) is returned +by the ``submit`` command. + +.. code:: bash + + archery crossbow status <build id / branch name> + +Download the build artifacts +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: bash + + archery crossbow artifacts <build id / branch name> + +Examples +~~~~~~~~ + +Submit command accepts a list of task names and/or a list of task-group names +to select which tasks to build. + +Run multiple builds: + +.. code:: bash + + $ archery crossbow submit debian-stretch conda-linux-gcc-py37-r40 + Repository: https://github.com/kszucs/arrow@tasks + Commit SHA: 810a718836bb3a8cefc053055600bdcc440e6702 + Version: 0.9.1.dev48+g810a7188.d20180414 + Pushed branches: + - debian-stretch + - conda-linux-gcc-py37-r40 + +Just render without applying or committing the changes: + +.. code:: bash + + $ archery crossbow submit --dry-run task_name + +Run only ``conda`` package builds and a Linux one: + +.. code:: bash + + $ archery crossbow submit --group conda centos-7 + +Run ``wheel`` builds: + +.. code:: bash + + $ archery crossbow submit --group wheel + +There are multiple task groups in the ``tasks.yml`` like docker, integration +and cpp-python for running docker based tests. + +``archery crossbow submit`` supports multiple options and arguments, for more +see its help page: + +.. code:: bash + + $ archery crossbow submit --help + + +.. _conda-forge packages: conda-recipes +.. _Wheels: python-wheels +.. _Linux packages: linux-packages +.. _Create the queue repository: https://help.github.com/articles/creating-a-new-repository +.. _TravisCI: https://travis-ci.org/getting_started +.. _Appveyor: https://www.appveyor.com/docs/ +.. _CircleCI: https://circleci.com/docs/2.0/getting-started/ +.. _Azure Pipelines: https://docs.microsoft.com/en-us/azure/devops/pipelines/get-started/pipelines-sign-up +.. _auto cancellation: https://docs.travis-ci.com/user/customizing-the-build/#Building-only-the-latest-commit +.. _Create a Personal Access Token: https://help.github.com/articles/creating-a-personal-access-token-for-the-command-line/ +.. 
_setuptools_scm: https://pypi.python.org/pypi/setuptools_scm diff --git a/src/arrow/docs/source/developers/docker.rst b/src/arrow/docs/source/developers/docker.rst new file mode 100644 index 000000000..36b468752 --- /dev/null +++ b/src/arrow/docs/source/developers/docker.rst @@ -0,0 +1,226 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _docker-builds: + +Running Docker Builds +===================== + +Most of our Linux based Continuous Integration tasks are decoupled from public +CI services using `Docker <https://docs.docker.com/>`_ and +`docker-compose <https://docs.docker.com/compose/>`_. Keeping the CI configuration +minimal makes local reproducibility possible. + +Usage +----- + +There are multiple ways to execute the docker based builds. +The recommended way is to use the :ref:`Archery <archery>` tool: + +Examples +~~~~~~~~ + +**List the available images:** + +.. code:: bash + + archery docker images + +**Execute a build:** + +.. code:: bash + + archery docker run conda-python + +Archery calls the following docker-compose commands: + +.. code:: bash + + docker-compose pull --ignore-pull-failures conda-cpp + docker-compose pull --ignore-pull-failures conda-python + docker-compose build conda-cpp + docker-compose build conda-python + docker-compose run --rm conda-python + +**Show the docker-compose commands instead of executing them:** + +.. code:: bash + + archery docker run --dry-run conda-python + +**To disable the image pulling:** + +.. code:: bash + + archery docker run --no-cache conda-python + +Which translates to: + +.. code:: bash + + docker-compose build --no-cache conda-cpp + docker-compose build --no-cache conda-python + docker-compose run --rm conda-python + +**To disable the cache only for the leaf image:** + +Useful to force building the development version of a dependency. +In case of the example below the command builds the +``conda-cpp > conda-python > conda-python-pandas`` branch of the image tree +where the leaf image is ``conda-python-pandas``. + +.. code:: bash + + PANDAS=master archery docker run --no-leaf-cache conda-python-pandas + +Which translates to: + +.. code:: bash + + export PANDAS=master + docker-compose pull --ignore-pull-failures conda-cpp + docker-compose pull --ignore-pull-failures conda-python + docker-compose build conda-cpp + docker-compose build conda-python + docker-compose build --no-cache conda-python-pandas + docker-compose run --rm conda-python-pandas + +Note that it doesn't pull the conda-python-pandas image and disable the cache +when building it. + +``PANDAS`` is a `build parameter <Docker Build Parameters>`_, see the +defaults in the .env file. 
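+
+Other build parameters can be overridden in the same way. A sketch, assuming
+the parameter below is defined in ``.env`` (check that file for the parameters
+that actually exist and their defaults):
+
+.. code:: bash
+
+   # Build and run the C++ image against a different Ubuntu version
+   UBUNTU=20.04 archery docker run ubuntu-cpp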
+
+**To entirely skip building the image:**
+
+The layer-caching mechanism of docker-compose can be less reliable than
+docker's, depending on the version, the ``cache_from`` build entry, and the
+backend used (docker-py, docker-cli, docker-cli with buildkit). This can lead
+to different layer hashes, even when executing the same build command
+repeatedly, eventually causing cache misses and full image rebuilds.
+
+*If the image has already been built but the cache doesn't work properly*, it
+can be useful to skip the build phases:
+
+.. code:: bash
+
+   # first run ensures that the image is built
+   archery docker run conda-python
+
+   # if the second run tries to build the image again and none of the files
+   # referenced in the relevant dockerfile have changed, then it indicates a
+   # cache miss caused by the issue described above
+   archery docker run conda-python
+
+   # since the image is properly built with the first command, there is no
+   # need to rebuild it, so manually disable the pull and build phases to
+   # save some time
+   archery docker run --no-pull --no-build conda-python
+
+**Pass environment variables to the container:**
+
+Most of the build scripts used within the containers can be configured through
+environment variables. Pass them using the ``--env`` or ``-e`` CLI options,
+similar to the ``docker run`` and ``docker-compose run`` interfaces.
+
+.. code:: bash
+
+   archery docker run --env CMAKE_BUILD_TYPE=release ubuntu-cpp
+
+For the available environment variables in the C++ builds, see the
+``ci/scripts/cpp_build.sh`` script.
+
+**Run the image with a custom command:**
+
+Custom docker commands may be passed as the second argument to
+``archery docker run``.
+
+The following example starts an interactive ``bash`` session in the container,
+which is useful for debugging the build interactively:
+
+.. code:: bash
+
+   archery docker run ubuntu-cpp bash
+
+Docker Volume Caches
+~~~~~~~~~~~~~~~~~~~~
+
+Most of the compose containers have specific directories mounted from the host
+to reuse ``ccache`` and ``maven`` artifacts. These docker volumes are placed
+in the ``.docker`` directory.
+
+In order to clean up the cache, simply delete one or more of these directories
+(or the whole ``.docker`` directory).
+
+
+Development
+-----------
+
+The docker-compose configuration is tuned towards reusable development
+containers using hierarchical images. For example, multiple language bindings
+are dependent on the C++ implementation, so instead of redefining the
+C++ environment in multiple Dockerfiles, we can reuse the exact same base C++
+image when building GLib, Ruby, R and Python bindings.
+This reduces duplication and streamlines maintenance, but makes the
+docker-compose configuration more complicated.
+
+Docker Build Parameters
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The build-time parameters are pushed down to the Dockerfiles to make the
+image building more flexible. These parameters are usually called docker
+build args, but we pass these values as environment variables to
+docker-compose.yml. The build parameters are extensively used for:
+
+- defining the docker registry used for caching
+- platform architectures
+- operating systems and versions
+- defining various versions of dependencies
+
+The default parameter values are stored in the top level .env file.
+For detailed examples see the docker-compose.yml.
+
+Build Scripts
+~~~~~~~~~~~~~
+
+The scripts maintained under the ci/scripts directory should be kept
+parametrizable but reasonably minimal, so that each clearly encapsulates
+the task it is responsible for.
+For example:
+
+- ``cpp_build.sh``: build the C++ implementation without running the tests.
+- ``cpp_test.sh``: execute the C++ tests.
+- ``python_build.sh``: build the Python bindings without running the tests.
+- ``python_test.sh``: execute the Python tests.
+- ``docs_build.sh``: build the Sphinx documentation.
+- ``integration_dask.sh``: execute the Dask integration tests.
+- ``integration_pandas.sh``: execute the pandas integration tests.
+- ``install_minio.sh``: install the MinIO server for multiple platforms.
+- ``install_conda.sh``: install Miniconda for multiple platforms.
+- ``install_gcs_testbench.sh``: install the GCS testbench for multiple platforms.
+
+The parametrization (like the C++ CMake options) is achieved via environment
+variables with useful defaults to keep the build configurations declarative.
+
+A good example is the ``cpp_build.sh`` build script, which forwards environment
+variables as CMake options, so the same script can be invoked in various
+configurations without needing to change it. For examples, see how the
+environment variables are passed to the C++ images in docker-compose.yml.
+
+Adding New Images
+~~~~~~~~~~~~~~~~~
+
+See the inline comments available in the docker-compose.yml file.
diff --git a/src/arrow/docs/source/developers/documentation.rst b/src/arrow/docs/source/developers/documentation.rst
new file mode 100644
index 000000000..813cc9cbd
--- /dev/null
+++ b/src/arrow/docs/source/developers/documentation.rst
@@ -0,0 +1,103 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _building-docs:
+
+Building the Documentation
+==========================
+
+Prerequisites
+-------------
+
+The documentation build process uses `Doxygen <http://www.doxygen.nl/>`_ and
+`Sphinx <http://www.sphinx-doc.org/>`_ along with a few extensions.
+
+If you're using Conda, the required software can be installed in a single line:
+
+.. code-block:: shell
+
+   conda install -c conda-forge --file=ci/conda_env_sphinx.txt
+
+Otherwise, you'll first need to install `Doxygen <http://www.doxygen.nl/>`_
+yourself (for example from your distribution's official repositories, if
+using Linux). Then you can install the Python-based requirements with the
+following command:
+
+.. code-block:: shell
+
+   pip install -r docs/requirements.txt
+
+Building
+--------
+
+.. note::
+
+   If you are building the documentation on Windows, not all sections
+   may build properly.
+
+These two steps are mandatory and must be executed in order.
+
+#. Process the C++ API using Doxygen
+
+   .. code-block:: shell
+
+      pushd cpp/apidoc
+      doxygen
+      popd
+
+#. Build the complete documentation using Sphinx.
+
+   .. note::
+
+      This step requires that the pyarrow library is installed
+      in your Python environment.
One way to accomplish + this is to follow the build instructions at :ref:`python-development` + and then run ``python setup.py install`` in arrow/python + (it is best to do this in a dedicated conda/virtual environment). + + .. code-block:: shell + + pushd docs + make html + popd + +.. note:: + + Note that building the documentation may fail if your build of pyarrow is + not sufficiently comprehensive. Portions of the Python API documentation + will also not build without CUDA support having been built. + +After these steps are completed, the documentation is rendered in HTML +format in ``docs/_build/html``. In particular, you can point your browser +at ``docs/_build/html/index.html`` to read the docs and review any changes +you made. + +Building with Docker +-------------------- + +You can use :ref:`Archery <archery>` to build the documentation within a +Docker container. + +.. code-block:: shell + + archery docker run ubuntu-docs + +The final output is located under ``docs/_build/html``. + +.. seealso:: + + :ref:`docker-builds`. diff --git a/src/arrow/docs/source/developers/experimental_repos.rst b/src/arrow/docs/source/developers/experimental_repos.rst new file mode 100644 index 000000000..f13adba2b --- /dev/null +++ b/src/arrow/docs/source/developers/experimental_repos.rst @@ -0,0 +1,65 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Experimental repositories +========================= + +Apache Arrow has an explicit policy over developing experimental repositories +in the context of +`rules for revolutionaries <https://grep.codeconsult.ch/2020/04/07/rules-for-revolutionaries-2000-edition/>`_. + +The main motivation for this policy is to offer a lightweight mechanism to +conduct experimental work, with the necessary creative freedom, within the ASF +and the Apache Arrow governance model. This policy allows committers to work on +new repositories, as they offer many important tools to manage it (e.g. github +issues, “watch”, “github stars” to measure overall interest). + +Process ++++++++ + +* A committer *may* initiate experimental work by creating a separate git + repository within the Apache Arrow (e.g. via `selfserve <https://selfserve.apache.org/>`_) + and announcing it on the mailing list, together with its goals, and a link to the + newly created repository. +* The committer *must* initiate an email thread with the sole purpose of + presenting updates to the community about the status of the repo. +* There *must not* be official releases from the repository. +* Any decision to make the experimental repo official in any way, whether by merging or migrating, *must* be discussed and voted on in the mailing list. +* The committer is responsible for managing issues, documentation, CI of the repository, + including licensing checks. 
+* The committer decides when the repository is archived. + +Repository management ++++++++++++++++++++++ + +* The repository *must* be under ``apache/`` +* The repository’s name *must* be prefixed by ``arrow-experimental-`` +* The committer has full permissions over the repository (within possible in ASF) +* Push / merge permissions *must only* be granted to Apache Arrow committers + +Development process ++++++++++++++++++++ + +* The repository must follow the ASF requirements about 3rd party code. +* The committer decides how to manage issues, PRs, etc. + +Divergences ++++++++++++ + +* If any of the “must” above fails to materialize and no correction measure + is taken by the committer upon request, the PMC *should* take ownership + and decide what to do. diff --git a/src/arrow/docs/source/developers/python.rst b/src/arrow/docs/source/developers/python.rst new file mode 100644 index 000000000..3795512ef --- /dev/null +++ b/src/arrow/docs/source/developers/python.rst @@ -0,0 +1,565 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow +.. _python-development: + +================== +Python Development +================== + +This page provides general Python development guidelines and source build +instructions for all platforms. + +Coding Style +============ + +We follow a similar PEP8-like coding style to the `pandas project +<https://github.com/pandas-dev/pandas>`_. To check style issues, use the +:ref:`Archery <archery>` subcommand ``lint``: + +.. code-block:: shell + + pip install -e arrow/dev/archery[lint] + +.. code-block:: shell + + archery lint --python + +Some of the issues can be automatically fixed by passing the ``--fix`` option: + +.. code-block:: shell + + archery lint --python --fix + +Unit Testing +============ + +We are using `pytest <https://docs.pytest.org/en/latest/>`_ to develop our unit +test suite. After building the project (see below) you can run its unit tests +like so: + +.. code-block:: shell + + pytest pyarrow + +Package requirements to run the unit tests are found in +``requirements-test.txt`` and can be installed if needed with ``pip install -r +requirements-test.txt``. + +The project has a number of custom command line options for its test +suite. Some tests are disabled by default, for example. To see all the options, +run + +.. code-block:: shell + + pytest pyarrow --help + +and look for the "custom options" section. + +Test Groups +----------- + +We have many tests that are grouped together using pytest marks. Some of these +are disabled by default. To enable a test group, pass ``--$GROUP_NAME``, +e.g. ``--parquet``. To disable a test group, prepend ``disable``, so +``--disable-parquet`` for example. 
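+
+As an illustration, the Parquet test group could be enabled or disabled like
+this (exactly which tests then run depends on how pyarrow was built):
+
+.. code-block:: shell
+
+   # run the default test selection plus the Parquet test group
+   pytest pyarrow --parquet
+
+   # run the default test selection with the Parquet group explicitly disabled
+   pytest pyarrow --disable-parquet
+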
To run **only** the unit tests for a +particular group, prepend ``only-`` instead, for example ``--only-parquet``. + +The test groups currently include: + +* ``gandiva``: tests for Gandiva expression compiler (uses LLVM) +* ``hdfs``: tests that use libhdfs or libhdfs3 to access the Hadoop filesystem +* ``hypothesis``: tests that use the ``hypothesis`` module for generating + random test cases. Note that ``--hypothesis`` doesn't work due to a quirk + with pytest, so you have to pass ``--enable-hypothesis`` +* ``large_memory``: Test requiring a large amount of system RAM +* ``orc``: Apache ORC tests +* ``parquet``: Apache Parquet tests +* ``plasma``: Plasma Object Store tests +* ``s3``: Tests for Amazon S3 +* ``tensorflow``: Tests that involve TensorFlow +* ``flight``: Flight RPC tests + +Benchmarking +------------ + +For running the benchmarks, see :ref:`python-benchmarks`. + +Building on Linux and MacOS +============================= + +System Requirements +------------------- + +On macOS, any modern XCode (6.4 or higher; the current version is 10) is +sufficient. + +On Linux, for this guide, we require a minimum of gcc 4.8, or clang 3.7 or +higher. You can check your version by running + +.. code-block:: shell + + $ gcc --version + +If the system compiler is older than gcc 4.8, it can be set to a newer version +using the ``$CC`` and ``$CXX`` environment variables: + +.. code-block:: shell + + export CC=gcc-4.8 + export CXX=g++-4.8 + +Environment Setup and Build +--------------------------- + +First, let's clone the Arrow git repository: + +.. code-block:: shell + + mkdir repos + cd repos + git clone https://github.com/apache/arrow.git + +You should now see + +.. code-block:: shell + + $ ls -l + total 8 + drwxrwxr-x 12 wesm wesm 4096 Apr 15 19:19 arrow/ + +Pull in the test data and setup the environment variables: + +.. code-block:: shell + + pushd arrow + git submodule init + git submodule update + export PARQUET_TEST_DATA="${PWD}/cpp/submodules/parquet-testing/data" + export ARROW_TEST_DATA="${PWD}/testing/data" + popd + +Using Conda +~~~~~~~~~~~ + +.. note:: + + Using conda to build Arrow on macOS is complicated by the + fact that the `conda-forge compilers require an older macOS SDK <https://stackoverflow.com/a/55798942>`_. + Conda offers some `installation instructions <https://docs.conda.io/projects/conda-build/en/latest/resources/compiler-tools.html#macos-sdk>`_; + the alternative would be to use :ref:`Homebrew <python-homebrew>` and + ``pip`` instead. + +Let's create a conda environment with all the C++ build and Python dependencies +from conda-forge, targeting development for Python 3.7: + +On Linux and macOS: + +.. code-block:: shell + + conda create -y -n pyarrow-dev -c conda-forge \ + --file arrow/ci/conda_env_unix.txt \ + --file arrow/ci/conda_env_cpp.txt \ + --file arrow/ci/conda_env_python.txt \ + --file arrow/ci/conda_env_gandiva.txt \ + compilers \ + python=3.7 \ + pandas + +As of January 2019, the ``compilers`` package is needed on many Linux +distributions to use packages from conda-forge. + +With this out of the way, you can now activate the conda environment + +.. code-block:: shell + + conda activate pyarrow-dev + +For Windows, see the `Building on Windows`_ section below. + +We need to set some environment variables to let Arrow's build system know +about our build toolchain: + +.. code-block:: shell + + export ARROW_HOME=$CONDA_PREFIX + +Using pip +~~~~~~~~~ + +.. 
warning:: + + If you installed Python using the Anaconda distribution or `Miniconda + <https://conda.io/miniconda.html>`_, you cannot currently use ``virtualenv`` + to manage your development. Please follow the conda-based development + instructions instead. + +.. _python-homebrew: + +On macOS, use Homebrew to install all dependencies required for +building Arrow C++: + +.. code-block:: shell + + brew update && brew bundle --file=arrow/cpp/Brewfile + +See :ref:`here <cpp-build-dependency-management>` for a list of dependencies you +may need. + +On Debian/Ubuntu, you need the following minimal set of dependencies. All other +dependencies will be automatically built by Arrow's third-party toolchain. + +.. code-block:: shell + + $ sudo apt-get install libjemalloc-dev libboost-dev \ + libboost-filesystem-dev \ + libboost-system-dev \ + libboost-regex-dev \ + python-dev \ + autoconf \ + flex \ + bison + +If you are building Arrow for Python 3, install ``python3-dev`` instead of ``python-dev``. + +On Arch Linux, you can get these dependencies via pacman. + +.. code-block:: shell + + $ sudo pacman -S jemalloc boost + +Now, let's create a Python virtualenv with all Python dependencies in the same +folder as the repositories and a target installation folder: + +.. code-block:: shell + + virtualenv pyarrow + source ./pyarrow/bin/activate + pip install -r arrow/python/requirements-build.txt \ + -r arrow/python/requirements-test.txt + + # This is the folder where we will install the Arrow libraries during + # development + mkdir dist + +If your cmake version is too old on Linux, you could get a newer one via +``pip install cmake``. + +We need to set some environment variables to let Arrow's build system know +about our build toolchain: + +.. code-block:: shell + + export ARROW_HOME=$(pwd)/dist + export LD_LIBRARY_PATH=$(pwd)/dist/lib:$LD_LIBRARY_PATH + +Build and test +-------------- + +Now build and install the Arrow C++ libraries: + +.. code-block:: shell + + mkdir arrow/cpp/build + pushd arrow/cpp/build + + cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ + -DCMAKE_INSTALL_LIBDIR=lib \ + -DARROW_WITH_BZ2=ON \ + -DARROW_WITH_ZLIB=ON \ + -DARROW_WITH_ZSTD=ON \ + -DARROW_WITH_LZ4=ON \ + -DARROW_WITH_SNAPPY=ON \ + -DARROW_WITH_BROTLI=ON \ + -DARROW_PARQUET=ON \ + -DARROW_PYTHON=ON \ + -DARROW_BUILD_TESTS=ON \ + .. + make -j4 + make install + popd + +There are a number of optional components that can can be switched ON by +adding flags with ``ON``: + +* ``ARROW_FLIGHT``: RPC framework +* ``ARROW_GANDIVA``: LLVM-based expression compiler +* ``ARROW_ORC``: Support for Apache ORC file format +* ``ARROW_PARQUET``: Support for Apache Parquet file format +* ``ARROW_PLASMA``: Shared memory object store + +Anything set to ``ON`` above can also be turned off. Note that some compression +libraries are needed for Parquet support. + +If multiple versions of Python are installed in your environment, you may have +to pass additional parameters to cmake so that it can find the right +executable, headers and libraries. For example, specifying +``-DPython3_EXECUTABLE=$VIRTUAL_ENV/bin/python`` (assuming that you're in +virtualenv) enables cmake to choose the python executable which you are using. + +.. note:: + + On Linux systems with support for building on multiple architectures, + ``make`` may install libraries in the ``lib64`` directory by default. For + this reason we recommend passing ``-DCMAKE_INSTALL_LIBDIR=lib`` because the + Python build scripts assume the library directory is ``lib`` + +.. 
note:: + + If you have conda installed but are not using it to manage dependencies, + and you have trouble building the C++ library, you may need to set + ``-DARROW_DEPENDENCY_SOURCE=AUTO`` or some other value (described + :ref:`here <cpp-build-dependency-management>`) + to explicitly tell CMake not to use conda. + +.. note:: + + With older versions of ``cmake`` (<3.15) you might need to pass ``-DPYTHON_EXECUTABLE`` + instead of ``-DPython3_EXECUTABLE``. See `cmake documentation <https://cmake.org/cmake/help/latest/module/FindPython3.html#artifacts-specification>` + for more details. + +For any other C++ build challenges, see :ref:`cpp-development`. + +Now, build pyarrow: + +.. code-block:: shell + + pushd arrow/python + export PYARROW_WITH_PARQUET=1 + python setup.py build_ext --inplace + popd + +If you did not build one of the optional components, set the corresponding +``PYARROW_WITH_$COMPONENT`` environment variable to 0. + +Now you are ready to install test dependencies and run `Unit Testing`_, as +described above. + +To build a self-contained wheel (including the Arrow and Parquet C++ +libraries), one can set ``--bundle-arrow-cpp``: + +.. code-block:: shell + + pip install wheel # if not installed + python setup.py build_ext --build-type=$ARROW_BUILD_TYPE \ + --bundle-arrow-cpp bdist_wheel + +Docker examples +~~~~~~~~~~~~~~~ + +If you are having difficulty building the Python library from source, take a +look at the ``python/examples/minimal_build`` directory which illustrates a +complete build and test from source both with the conda and pip/virtualenv +build methods. + +Building with CUDA support +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The :mod:`pyarrow.cuda` module offers support for using Arrow platform +components with Nvidia's CUDA-enabled GPU devices. To build with this support, +pass ``-DARROW_CUDA=ON`` when building the C++ libraries, and set the following +environment variable when building pyarrow: + +.. code-block:: shell + + export PYARROW_WITH_CUDA=1 + +Debugging +--------- + +Since pyarrow depends on the Arrow C++ libraries, debugging can +frequently involve crossing between Python and C++ shared libraries. + +Using gdb on Linux +~~~~~~~~~~~~~~~~~~ + +To debug the C++ libraries with gdb while running the Python unit + test, first start pytest with gdb: + +.. code-block:: shell + + gdb --args python -m pytest pyarrow/tests/test_to_run.py -k $TEST_TO_MATCH + +To set a breakpoint, use the same gdb syntax that you would when +debugging a C++ unittest, for example: + +.. code-block:: shell + + (gdb) b src/arrow/python/arrow_to_pandas.cc:1874 + No source file named src/arrow/python/arrow_to_pandas.cc. + Make breakpoint pending on future shared library load? (y or [n]) y + Breakpoint 1 (src/arrow/python/arrow_to_pandas.cc:1874) pending. + +Building on Windows +=================== + +Building on Windows requires one of the following compilers to be installed: + +- `Build Tools for Visual Studio 2017 <https://download.visualstudio.microsoft.com/download/pr/3e542575-929e-4297-b6c6-bef34d0ee648/639c868e1219c651793aff537a1d3b77/vs_buildtools.exe>`_ +- Visual Studio 2017 + +During the setup of Build Tools ensure at least one Windows SDK is selected. + +Visual Studio 2019 and its build tools are currently not supported. + +We bootstrap a conda environment similar to above, but skipping some of the +Linux/macOS-only packages: + +First, starting from fresh clones of Apache Arrow: + +.. code-block:: shell + + git clone https://github.com/apache/arrow.git + +.. 
code-block:: shell + + conda create -y -n pyarrow-dev -c conda-forge ^ + --file arrow\ci\conda_env_cpp.txt ^ + --file arrow\ci\conda_env_python.txt ^ + --file arrow\ci\conda_env_gandiva.txt ^ + python=3.7 + conda activate pyarrow-dev + +Now, we build and install Arrow C++ libraries. + +We set a number of environment variables: + +- the path of the installation directory of the Arrow C++ libraries as + ``ARROW_HOME`` +- add the path of installed DLL libraries to ``PATH`` +- and choose the compiler to be used + +.. code-block:: shell + + set ARROW_HOME=%cd%\arrow-dist + set PATH=%ARROW_HOME%\bin;%PATH% + set PYARROW_CMAKE_GENERATOR=Visual Studio 15 2017 Win64 + +Let's configure, build and install the Arrow C++ libraries: + +.. code-block:: shell + + mkdir arrow\cpp\build + pushd arrow\cpp\build + cmake -G "%PYARROW_CMAKE_GENERATOR%" ^ + -DCMAKE_INSTALL_PREFIX=%ARROW_HOME% ^ + -DCMAKE_UNITY_BUILD=ON ^ + -DARROW_CXXFLAGS="/WX /MP" ^ + -DARROW_WITH_LZ4=on ^ + -DARROW_WITH_SNAPPY=on ^ + -DARROW_WITH_ZLIB=on ^ + -DARROW_WITH_ZSTD=on ^ + -DARROW_PARQUET=on ^ + -DARROW_PYTHON=on ^ + .. + cmake --build . --target INSTALL --config Release + popd + +Now, we can build pyarrow: + +.. code-block:: shell + + pushd arrow\python + set PYARROW_WITH_PARQUET=1 + python setup.py build_ext --inplace + popd + +.. note:: + + For building pyarrow, the above defined environment variables need to also + be set. Remember this if to want to re-build ``pyarrow`` after your initial build. + +Then run the unit tests with: + +.. code-block:: shell + + pushd arrow\python + py.test pyarrow -v + popd + +.. note:: + + With the above instructions the Arrow C++ libraries are not bundled with + the Python extension. This is recommended for development as it allows the + C++ libraries to be re-built separately. + + As a consequence however, ``python setup.py install`` will also not install + the Arrow C++ libraries. Therefore, to use ``pyarrow`` in python, ``PATH`` + must contain the directory with the Arrow .dll-files. + + If you want to bundle the Arrow C++ libraries with ``pyarrow`` add + ``--bundle-arrow-cpp`` as build parameter: + + ``python setup.py build_ext --bundle-arrow-cpp`` + + Important: If you combine ``--bundle-arrow-cpp`` with ``--inplace`` the + Arrow C++ libraries get copied to the python source tree and are not cleared + by ``python setup.py clean``. They remain in place and will take precedence + over any later Arrow C++ libraries contained in ``PATH``. This can lead to + incompatibilities when ``pyarrow`` is later built without + ``--bundle-arrow-cpp``. + +Running C++ unit tests for Python integration +--------------------------------------------- + +Running C++ unit tests should not be necessary for most developers. If you do +want to run them, you need to pass ``-DARROW_BUILD_TESTS=ON`` during +configuration of the Arrow C++ library build: + +.. code-block:: shell + + mkdir arrow\cpp\build + pushd arrow\cpp\build + cmake -G "%PYARROW_CMAKE_GENERATOR%" ^ + -DCMAKE_INSTALL_PREFIX=%ARROW_HOME% ^ + -DARROW_CXXFLAGS="/WX /MP" ^ + -DARROW_PARQUET=on ^ + -DARROW_PYTHON=on ^ + -DARROW_BUILD_TESTS=ON ^ + .. + cmake --build . --target INSTALL --config Release + popd + + +Getting ``arrow-python-test.exe`` (C++ unit tests for python integration) to +run is a bit tricky because your ``%PYTHONHOME%`` must be configured to point +to the active conda environment: + +.. 
code-block:: shell + + set PYTHONHOME=%CONDA_PREFIX% + pushd arrow\cpp\build\release\Release + arrow-python-test.exe + popd + +To run all tests of the Arrow C++ library, you can also run ``ctest``: + +.. code-block:: shell + + set PYTHONHOME=%CONDA_PREFIX% + pushd arrow\cpp\build + ctest + popd + +Windows Caveats +--------------- + +Some components are not supported yet on Windows: + +* Flight RPC +* Plasma diff --git a/src/arrow/docs/source/example.gz b/src/arrow/docs/source/example.gz Binary files differnew file mode 100644 index 000000000..4fc60405c --- /dev/null +++ b/src/arrow/docs/source/example.gz diff --git a/src/arrow/docs/source/format/Arrow.graffle b/src/arrow/docs/source/format/Arrow.graffle Binary files differnew file mode 100644 index 000000000..f4eead922 --- /dev/null +++ b/src/arrow/docs/source/format/Arrow.graffle diff --git a/src/arrow/docs/source/format/Arrow.png b/src/arrow/docs/source/format/Arrow.png Binary files differnew file mode 100644 index 000000000..1b09aa2d8 --- /dev/null +++ b/src/arrow/docs/source/format/Arrow.png diff --git a/src/arrow/docs/source/format/CDataInterface.rst b/src/arrow/docs/source/format/CDataInterface.rst new file mode 100644 index 000000000..20446411a --- /dev/null +++ b/src/arrow/docs/source/format/CDataInterface.rst @@ -0,0 +1,948 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _c-data-interface: + +========================== +The Arrow C data interface +========================== + +Rationale +========= + +Apache Arrow is designed to be a universal in-memory format for the representation +of tabular ("columnar") data. However, some projects may face a difficult +choice between either depending on a fast-evolving project such as the +Arrow C++ library, or having to reimplement adapters for data interchange, +which may require significant, redundant development effort. + +The Arrow C data interface defines a very small, stable set of C definitions +that can be easily *copied* in any project's source code and used for columnar +data interchange in the Arrow format. For non-C/C++ languages and runtimes, +it should be almost as easy to translate the C definitions into the +corresponding C FFI declarations. + +Applications and libraries can therefore work with Arrow memory without +necessarily using Arrow libraries or reinventing the wheel. Developers can +choose between tight integration +with the Arrow *software project* (benefitting from the growing array of +facilities exposed by e.g. the C++ or Java implementations of Apache Arrow, +but with the cost of a dependency) or minimal integration with the Arrow +*format* only. + +Goals +----- + +* Expose an ABI-stable interface. 
+* Make it easy for third-party projects to implement support for (including partial + support where sufficient), with little initial investment. +* Allow zero-copy sharing of Arrow data between independent runtimes + and components running in the same process. +* Match the Arrow array concepts closely to avoid the development of + yet another marshalling layer. +* Avoid the need for one-to-one adaptation layers such as the limited + JPype-based bridge between Java and Python. +* Enable integration without an explicit dependency (either at compile-time + or runtime) on the Arrow software project. + +Ideally, the Arrow C data interface can become a low-level *lingua franca* +for sharing columnar data at runtime and establish Arrow as the universal +building block in the columnar processing ecosystem. + +Non-goals +--------- + +* Expose a C API mimicking operations available in higher-level runtimes + (such as C++, Java...). +* Data sharing between distinct processes or storage persistence. + + +Comparison with the Arrow IPC format +------------------------------------ + +Pros of the C data interface vs. the IPC format: + +* No dependency on Flatbuffers. +* No buffer reassembly (data is already exposed in logical Arrow format). +* Zero-copy by design. +* Easy to reimplement from scratch. +* Minimal C definition that can be easily copied into other codebases. +* Resource lifetime management through a custom release callback. + +Pros of the IPC format vs. the data interface: + +* Works across processes and machines. +* Allows data storage and persistence. +* Being a streamable format, the IPC format has room for composing more features + (such as integrity checks, compression...). +* Does not require explicit C data access. + +Data type description -- format strings +======================================= + +A data type is described using a format string. The format string only +encodes information about the top-level type; for nested type, child types +are described separately. Also, metadata is encoded in a separate string. + +The format strings are designed to be easily parsable, even from a language +such as C. 
The most common primitive formats have one-character format +strings: + ++-----------------+--------------------------+------------+ +| Format string | Arrow data type | Notes | ++=================+==========================+============+ +| ``n`` | null | | ++-----------------+--------------------------+------------+ +| ``b`` | boolean | | ++-----------------+--------------------------+------------+ +| ``c`` | int8 | | ++-----------------+--------------------------+------------+ +| ``C`` | uint8 | | ++-----------------+--------------------------+------------+ +| ``s`` | int16 | | ++-----------------+--------------------------+------------+ +| ``S`` | uint16 | | ++-----------------+--------------------------+------------+ +| ``i`` | int32 | | ++-----------------+--------------------------+------------+ +| ``I`` | uint32 | | ++-----------------+--------------------------+------------+ +| ``l`` | int64 | | ++-----------------+--------------------------+------------+ +| ``L`` | uint64 | | ++-----------------+--------------------------+------------+ +| ``e`` | float16 | | ++-----------------+--------------------------+------------+ +| ``f`` | float32 | | ++-----------------+--------------------------+------------+ +| ``g`` | float64 | | ++-----------------+--------------------------+------------+ + ++-----------------+---------------------------------------------------+------------+ +| Format string | Arrow data type | Notes | ++=================+===================================================+============+ +| ``z`` | binary | | ++-----------------+---------------------------------------------------+------------+ +| ``Z`` | large binary | | ++-----------------+---------------------------------------------------+------------+ +| ``u`` | utf-8 string | | ++-----------------+---------------------------------------------------+------------+ +| ``U`` | large utf-8 string | | ++-----------------+---------------------------------------------------+------------+ +| ``d:19,10`` | decimal128 [precision 19, scale 10] | | ++-----------------+---------------------------------------------------+------------+ +| ``d:19,10,NNN`` | decimal bitwidth = NNN [precision 19, scale 10] | | ++-----------------+---------------------------------------------------+------------+ +| ``w:42`` | fixed-width binary [42 bytes] | | ++-----------------+---------------------------------------------------+------------+ + +Temporal types have multi-character format strings starting with ``t``: + ++-----------------+---------------------------------------------------+------------+ +| Format string | Arrow data type | Notes | ++=================+===================================================+============+ +| ``tdD`` | date32 [days] | | ++-----------------+---------------------------------------------------+------------+ +| ``tdm`` | date64 [milliseconds] | | ++-----------------+---------------------------------------------------+------------+ +| ``tts`` | time32 [seconds] | | ++-----------------+---------------------------------------------------+------------+ +| ``ttm`` | time32 [milliseconds] | | ++-----------------+---------------------------------------------------+------------+ +| ``ttu`` | time64 [microseconds] | | ++-----------------+---------------------------------------------------+------------+ +| ``ttn`` | time64 [nanoseconds] | | ++-----------------+---------------------------------------------------+------------+ +| ``tss:...`` | timestamp [seconds] with timezone "..." 
| \(1) | ++-----------------+---------------------------------------------------+------------+ +| ``tsm:...`` | timestamp [milliseconds] with timezone "..." | \(1) | ++-----------------+---------------------------------------------------+------------+ +| ``tsu:...`` | timestamp [microseconds] with timezone "..." | \(1) | ++-----------------+---------------------------------------------------+------------+ +| ``tsn:...`` | timestamp [nanoseconds] with timezone "..." | \(1) | ++-----------------+---------------------------------------------------+------------+ +| ``tDs`` | duration [seconds] | | ++-----------------+---------------------------------------------------+------------+ +| ``tDm`` | duration [milliseconds] | | ++-----------------+---------------------------------------------------+------------+ +| ``tDu`` | duration [microseconds] | | ++-----------------+---------------------------------------------------+------------+ +| ``tDn`` | duration [nanoseconds] | | ++-----------------+---------------------------------------------------+------------+ +| ``tiM`` | interval [months] | | ++-----------------+---------------------------------------------------+------------+ +| ``tiD`` | interval [days, time] | | ++-----------------+---------------------------------------------------+------------+ +| ``tin`` | interval [month, day, nanoseconds] | | ++-----------------+---------------------------------------------------+------------+ + + +Dictionary-encoded types do not have a specific format string. Instead, the +format string of the base array represents the dictionary index type, and the +value type can be read from the dependent dictionary array (see below +"Dictionary-encoded arrays"). + +Nested types have multiple-character format strings starting with ``+``. The +names and types of child fields are read from the child arrays. + ++------------------------+---------------------------------------------------+------------+ +| Format string | Arrow data type | Notes | ++========================+===================================================+============+ +| ``+l`` | list | | ++------------------------+---------------------------------------------------+------------+ +| ``+L`` | large list | | ++------------------------+---------------------------------------------------+------------+ +| ``+w:123`` | fixed-sized list [123 items] | | ++------------------------+---------------------------------------------------+------------+ +| ``+s`` | struct | | ++------------------------+---------------------------------------------------+------------+ +| ``+m`` | map | \(2) | ++------------------------+---------------------------------------------------+------------+ +| ``+ud:I,J,...`` | dense union with type ids I,J... | | ++------------------------+---------------------------------------------------+------------+ +| ``+us:I,J,...`` | sparse union with type ids I,J... | | ++------------------------+---------------------------------------------------+------------+ + +Notes: + +(1) + The timezone string is appended as-is after the colon character ``:``, without + any quotes. If the timezone is empty, the colon ``:`` must still be included. + +(2) + As specified in the Arrow columnar format, the map type has a single child type + named ``entries``, itself a 2-child struct type of ``(key, value)``. + +Examples +-------- + +* A dictionary-encoded ``decimal128(precision = 12, scale = 5)`` array + with ``int16`` indices has format string ``s``, and its dependent dictionary + array has format string ``d:12,5``. 
+* A ``list<uint64>`` array has format string ``+l``, and its single child + has format string ``L``. +* A ``struct<ints: int32, floats: float32>`` has format string ``+s``; its two + children have names ``ints`` and ``floats``, and format strings ``i`` and + ``f`` respectively. +* A ``map<string, float64>`` array has format string ``+m``; its single child + has name ``entries`` and format string ``+s``; its two grandchildren have names + ``key`` and ``value``, and format strings ``u`` and ``g`` respectively. +* A ``sparse_union<ints: int32, floats: float32>`` with type ids ``4, 5`` + has format string ``+us:4,5``; its two children have names ``ints`` and + ``floats``, and format strings ``i`` and ``f`` respectively. + + +Structure definitions +===================== + +The following free-standing definitions are enough to support the Arrow +C data interface in your project. Like the rest of the Arrow project, they +are available under the Apache License 2.0. + +.. code-block:: c + + #define ARROW_FLAG_DICTIONARY_ORDERED 1 + #define ARROW_FLAG_NULLABLE 2 + #define ARROW_FLAG_MAP_KEYS_SORTED 4 + + struct ArrowSchema { + // Array type description + const char* format; + const char* name; + const char* metadata; + int64_t flags; + int64_t n_children; + struct ArrowSchema** children; + struct ArrowSchema* dictionary; + + // Release callback + void (*release)(struct ArrowSchema*); + // Opaque producer-specific data + void* private_data; + }; + + struct ArrowArray { + // Array data description + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + + // Release callback + void (*release)(struct ArrowArray*); + // Opaque producer-specific data + void* private_data; + }; + +The ArrowSchema structure +------------------------- + +The ``ArrowSchema`` structure describes the type and metadata of an exported +array or record batch. It has the following fields: + +.. c:member:: const char* ArrowSchema.format + + Mandatory. A null-terminated, UTF8-encoded string describing + the data type. If the data type is nested, child types are not + encoded here but in the :c:member:`ArrowSchema.children` structures. + + Consumers MAY decide not to support all data types, but they + should document this limitation. + +.. c:member:: const char* ArrowSchema.name + + Optional. A null-terminated, UTF8-encoded string of the field + or array name. This is mainly used to reconstruct child fields + of nested types. + + Producers MAY decide not to provide this information, and consumers + MAY decide to ignore it. If omitted, MAY be NULL or an empty string. + +.. c:member:: const char* ArrowSchema.metadata + + Optional. A binary string describing the type's metadata. + If the data type is nested, child types are not encoded here but + in the :c:member:`ArrowSchema.children` structures. + + This string is not null-terminated but follows a specific format:: + + int32: number of key/value pairs (noted N below) + int32: byte length of key 0 + key 0 (not null-terminated) + int32: byte length of value 0 + value 0 (not null-terminated) + ... + int32: byte length of key N - 1 + key N - 1 (not null-terminated) + int32: byte length of value N - 1 + value N - 1 (not null-terminated) + + Integers are stored in native endianness. 
For example, the metadata + ``[('key1', 'value1')]`` is encoded on a little-endian machine as:: + + \x01\x00\x00\x00\x04\x00\x00\x00key1\x06\x00\x00\x00value1 + + On a big-endian machine, the same example would be encoded as:: + + \x00\x00\x00\x01\x00\x00\x00\x04key1\x00\x00\x00\x06value1 + + If omitted, this field MUST be NULL (not an empty string). + + Consumers MAY choose to ignore this information. + +.. c:member:: int64_t ArrowSchema.flags + + Optional. A bitfield of flags enriching the type description. + Its value is computed by OR'ing together the flag values. + The following flags are available: + + * ``ARROW_FLAG_NULLABLE``: whether this field is semantically nullable + (regardless of whether it actually has null values). + * ``ARROW_FLAG_DICTIONARY_ORDERED``: for dictionary-encoded types, + whether the ordering of dictionary indices is semantically meaningful. + * ``ARROW_FLAG_MAP_KEYS_SORTED``: for map types, whether the keys within + each map value are sorted. + + If omitted, MUST be 0. + + Consumers MAY choose to ignore some or all of the flags. Even then, + they SHOULD keep this value around so as to propagate its information + to their own consumers. + +.. c:member:: int64_t ArrowSchema.n_children + + Mandatory. The number of children this type has. + +.. c:member:: ArrowSchema** ArrowSchema.children + + Optional. A C array of pointers to each child type of this type. + There must be :c:member:`ArrowSchema.n_children` pointers. + + MAY be NULL only if :c:member:`ArrowSchema.n_children` is 0. + +.. c:member:: ArrowSchema* ArrowSchema.dictionary + + Optional. A pointer to the type of dictionary values. + + MUST be present if the ArrowSchema represents a dictionary-encoded type. + MUST be NULL otherwise. + +.. c:member:: void (*ArrowSchema.release)(struct ArrowSchema*) + + Mandatory. A pointer to a producer-provided release callback. + + See below for memory management and release callback semantics. + +.. c:member:: void* ArrowSchema.private_data + + Optional. An opaque pointer to producer-provided private data. + + Consumers MUST not process this member. Lifetime of this member + is handled by the producer, and especially by the release callback. + + +The ArrowArray structure +------------------------ + +The ``ArrowArray`` describes the data of an exported array or record batch. +For the ``ArrowArray`` structure to be interpreted type, the array type +or record batch schema must already be known. This is either done by +convention -- for example a producer API that always produces the same data +type -- or by passing a ``ArrowSchema`` on the side. + +It has the following fields: + +.. c:member:: int64_t ArrowArray.length + + Mandatory. The logical length of the array (i.e. its number of items). + +.. c:member:: int64_t ArrowArray.null_count + + Mandatory. The number of null items in the array. MAY be -1 if not + yet computed. + +.. c:member:: int64_t ArrowArray.offset + + Mandatory. The logical offset inside the array (i.e. the number of items + from the physical start of the buffers). MUST be 0 or positive. + + Producers MAY specify that they will only produce 0-offset arrays to + ease implementation of consumer code. + Consumers MAY decide not to support non-0-offset arrays, but they + should document this limitation. + +.. c:member:: int64_t ArrowArray.n_buffers + + Mandatory. The number of physical buffers backing this array. The + number of buffers is a function of the data type, as described in the + :ref:`Columnar format specification <format_columnar>`. 
+ + Buffers of children arrays are not included. + +.. c:member:: const void** ArrowArray.buffers + + Mandatory. A C array of pointers to the start of each physical buffer + backing this array. Each `void*` pointer is the physical start of + a contiguous buffer. There must be :c:member:`ArrowArray.n_buffers` pointers. + + The producer MUST ensure that each contiguous buffer is large enough to + represent `length + offset` values encoded according to the + :ref:`Columnar format specification <format_columnar>`. + + It is recommended, but not required, that the memory addresses of the + buffers be aligned at least according to the type of primitive data that + they contain. Consumers MAY decide not to support unaligned memory. + + The pointer to the null bitmap buffer, if the data type specifies one, + MAY be NULL only if :c:member:`ArrowArray.null_count` is 0. + + Buffers of children arrays are not included. + +.. c:member:: int64_t ArrowArray.n_children + + Mandatory. The number of children this array has. The number of children + is a function of the data type, as described in the + :ref:`Columnar format specification <format_columnar>`. + +.. c:member:: ArrowArray** ArrowArray.children + + Optional. A C array of pointers to each child array of this array. + There must be :c:member:`ArrowArray.n_children` pointers. + + MAY be NULL only if :c:member:`ArrowArray.n_children` is 0. + +.. c:member:: ArrowArray* ArrowArray.dictionary + + Optional. A pointer to the underlying array of dictionary values. + + MUST be present if the ArrowArray represents a dictionary-encoded array. + MUST be NULL otherwise. + +.. c:member:: void (*ArrowArray.release)(struct ArrowArray*) + + Mandatory. A pointer to a producer-provided release callback. + + See below for memory management and release callback semantics. + +.. c:member:: void* ArrowArray.private_data + + Optional. An opaque pointer to producer-provided private data. + + Consumers MUST not process this member. Lifetime of this member + is handled by the producer, and especially by the release callback. + + +Dictionary-encoded arrays +------------------------- + +For dictionary-encoded arrays, the :c:member:`ArrowSchema.format` string +encodes the *index* type. The dictionary *value* type can be read +from the :c:member:`ArrowSchema.dictionary` structure. + +The same holds for :c:member:`ArrowArray` structure: while the parent +structure points to the index data, the :c:member:`ArrowArray.dictionary` +points to the dictionary values array. + +Extension arrays +---------------- + +For extension arrays, the :c:member:`ArrowSchema.format` string encodes the +*storage* type. Information about the extension type is encoded in the +:c:member:`ArrowSchema.metadata` string, similarly to the +:ref:`IPC format <format_metadata_extension_types>`. Specifically, the +metadata key ``ARROW:extension:name`` encodes the extension type name, +and the metadata key ``ARROW:extension:metadata`` encodes the +implementation-specific serialization of the extension type (for +parameterized extension types). The base64 encoding of metadata values +ensures that any possible serialization is representable. + +The ``ArrowArray`` structure exported from an extension array simply points +to the storage data of the extension array. + +Memory management +----------------- + +The ``ArrowSchema`` and ``ArrowArray`` structures follow the same conventions +for memory management. 
The term *"base structure"* below refers to the +``ArrowSchema`` or ``ArrowArray`` that is passed between producer and consumer +-- not any child structure thereof. + +Member allocation +''''''''''''''''' + +It is intended for the base structure to be stack- or heap-allocated by the +consumer. In this case, the producer API should take a pointer to the +consumer-allocated structure. + +However, any data pointed to by the struct MUST be allocated and maintained +by the producer. This includes the format and metadata strings, the arrays +of buffer and children pointers, etc. + +Therefore, the consumer MUST not try to interfere with the producer's +handling of these members' lifetime. The only way the consumer influences +data lifetime is by calling the base structure's ``release`` callback. + +.. _c-data-interface-released: + +Released structure +'''''''''''''''''' + +A released structure is indicated by setting its ``release`` callback to NULL. +Before reading and interpreting a structure's data, consumers SHOULD check +for a NULL release callback and treat it accordingly (probably by erroring +out). + +Release callback semantics -- for consumers +''''''''''''''''''''''''''''''''''''''''''' + +Consumers MUST call a base structure's release callback when they won't be using +it anymore, but they MUST not call any of its children's release callbacks +(including the optional dictionary). The producer is responsible for releasing +the children. + +In any case, a consumer MUST not try to access the base structure anymore +after calling its release callback -- including any associated data such +as its children. + +Release callback semantics -- for producers +''''''''''''''''''''''''''''''''''''''''''' + +If producers need additional information for lifetime handling (for +example, a C++ producer may want to use ``shared_ptr`` for array and +buffer lifetime), they MUST use the ``private_data`` member to locate the +required bookkeeping information. + +The release callback MUST not assume that the structure will be located +at the same memory location as when it was originally produced. The consumer +is free to move the structure around (see "Moving an array"). + +The release callback MUST walk all children structures (including the optional +dictionary) and call their own release callbacks. + +The release callback MUST free any data area directly owned by the structure +(such as the buffers and children members). + +The release callback MUST mark the structure as released, by setting +its ``release`` member to NULL. + +Below is a good starting point for implementing a release callback, where the +TODO area must be filled with producer-specific deallocation code: + +.. code-block:: c + + static void ReleaseExportedArray(struct ArrowArray* array) { + // This should not be called on already released array + assert(array->format != NULL); + + // Release children + for (int64_t i = 0; i < array->n_children; ++i) { + struct ArrowArray* child = array->children[i]; + if (child->release != NULL) { + child->release(child); + assert(child->release == NULL); + } + } + + // Release dictionary + struct ArrowArray* dict = array->dictionary; + if (dict != NULL && dict->release != NULL) { + dict->release(dict); + assert(dict->release == NULL); + } + + // TODO here: release and/or deallocate all data directly owned by + // the ArrowArray struct, such as the private_data. 
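+     // For example, if private_data points to a single heap-allocated
+     // bookkeeping struct, this could simply be:
+     //   free(array->private_data);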
+ + // Mark array released + array->release = NULL; + } + + +Moving an array +''''''''''''''' + +The consumer can *move* the ``ArrowArray`` structure by bitwise copying or +shallow member-wise copying. Then it MUST mark the source structure released +(see "released structure" above for how to do it) but *without* calling the +release callback. This ensures that only one live copy of the struct is +active at any given time and that lifetime is correctly communicated to +the producer. + +As usual, the release callback will be called on the destination structure +when it is not needed anymore. + +Moving child arrays +~~~~~~~~~~~~~~~~~~~ + +It is also possible to move one or several child arrays, but the parent +``ArrowArray`` structure MUST be released immediately afterwards, as it +won't point to valid child arrays anymore. + +The main use case for this is to keep alive only a subset of child arrays +(for example if you are only interested in certain columns of the data), +while releasing the others. + +.. note:: + + For moving to work correctly, the ``ArrowArray`` structure has to be + trivially relocatable. Therefore, pointer members inside the ``ArrowArray`` + structure (including ``private_data``) MUST not point inside the structure + itself. Also, external pointers to the structure MUST not be separately + stored by the producer. Instead, the producer MUST use the ``private_data`` + member so as to remember any necessary bookkeeping information. + +Record batches +-------------- + +A record batch can be trivially considered as an equivalent struct array with +additional top-level metadata. + +Example use case +================ + +A C++ database engine wants to provide the option to deliver results in Arrow +format, but without imposing themselves a dependency on the Arrow software +libraries. With the Arrow C data interface, the engine can let the caller pass +a pointer to a ``ArrowArray`` structure, and fill it with the next chunk of +results. + +It can do so without including the Arrow C++ headers or linking with the +Arrow DLLs. Furthermore, the database engine's C API can benefit other +runtimes and libraries that know about the Arrow C data interface, +through e.g. a C FFI layer. + +C producer examples +=================== + +Exporting a simple ``int32`` array +---------------------------------- + +Export a non-nullable ``int32`` type with empty metadata. In this case, +all ``ArrowSchema`` members point to statically-allocated data, so the +release callback is trivial. + +.. code-block:: c + + static void release_int32_type(struct ArrowSchema* schema) { + // Mark released + schema->release = NULL; + } + + void export_int32_type(struct ArrowSchema* schema) { + *schema = (struct ArrowSchema) { + // Type description + .format = "i", + .name = "", + .metadata = NULL, + .flags = 0, + .n_children = 0, + .children = NULL, + .dictionary = NULL, + // Bookkeeping + .release = &release_int32_type + }; + } + +Export a C-malloc()ed array of the same type as a Arrow array, transferring +ownership to the consumer through the release callback: + +.. 
code-block:: c + + static void release_int32_array(struct ArrowArray* array) { + assert(array->n_buffers == 2); + // Free the buffers and the buffers array + free((void *) array->buffers[1]); + free(array->buffers); + // Mark released + array->release = NULL; + } + + void export_int32_array(const int32_t* data, int64_t nitems, + struct ArrowArray* array) { + // Initialize primitive fields + *array = (struct ArrowArray) { + // Data description + .length = nitems, + .offset = 0, + .null_count = 0, + .n_buffers = 2, + .n_children = 0, + .children = NULL, + .dictionary = NULL, + // Bookkeeping + .release = &release_int32_array + }; + // Allocate list of buffers + array->buffers = (const void**) malloc(sizeof(void*) * array->n_buffers); + assert(array->buffers != NULL); + array->buffers[0] = NULL; // no nulls, null bitmap can be omitted + array->buffers[1] = data; + } + +Exporting a ``struct<float32, utf8>`` array +------------------------------------------- + +Export the array type as a ``ArrowSchema`` with C-malloc()ed children: + +.. code-block:: c + + static void release_malloced_type(struct ArrowSchema* schema) { + int i; + for (i = 0; i < schema->n_children; ++i) { + struct ArrowSchema* child = schema->children[i]; + if (child->release != NULL) { + child->release(child); + } + } + free(schema->children); + // Mark released + schema->release = NULL; + } + + void export_float32_utf8_type(struct ArrowSchema* schema) { + struct ArrowSchema* child; + + // + // Initialize parent type + // + *schema = (struct ArrowSchema) { + // Type description + .format = "+s", + .name = "", + .metadata = NULL, + .flags = 0, + .n_children = 2, + .dictionary = NULL, + // Bookkeeping + .release = &release_malloced_type + }; + // Allocate list of children types + schema->children = malloc(sizeof(struct ArrowSchema*) * schema->n_children); + + // + // Initialize child type #0 + // + child = schema->children[0] = malloc(sizeof(struct ArrowSchema)); + *child = (struct ArrowSchema) { + // Type description + .format = "f", + .name = "floats", + .metadata = NULL, + .flags = ARROW_FLAG_NULLABLE, + .n_children = 0, + .dictionary = NULL, + .children = NULL, + // Bookkeeping + .release = &release_malloced_type + }; + + // + // Initialize child type #1 + // + child = schema->children[1] = malloc(sizeof(struct ArrowSchema)); + *child = (struct ArrowSchema) { + // Type description + .format = "u", + .name = "strings", + .metadata = NULL, + .flags = ARROW_FLAG_NULLABLE, + .n_children = 0, + .dictionary = NULL, + .children = NULL, + // Bookkeeping + .release = &release_malloced_type + }; + } + +Export C-malloc()ed arrays in Arrow-compatible layout as an Arrow struct array, +transferring ownership to the consumer: + +.. 
code-block:: c + + static void release_malloced_array(struct ArrowArray* array) { + int i; + // Free children + for (i = 0; i < array->n_children; ++i) { + struct ArrowArray* child = array->children[i]; + if (child->release != NULL) { + child->release(child); + } + } + free(array->children); + // Free buffers + for (i = 0; i < array->n_buffers; ++i) { + free((void *) array->buffers[i]); + } + free(array->buffers); + // Mark released + array->release = NULL; + } + + void export_float32_utf8_array( + int64_t nitems, + const uint8_t* float32_nulls, const float* float32_data, + const uint8_t* utf8_nulls, const int32_t* utf8_offsets, const uint8_t* utf8_data, + struct ArrowArray* array) { + struct ArrowArray* child; + + // + // Initialize parent array + // + *array = (struct ArrowArray) { + // Data description + .length = nitems, + .offset = 0, + .null_count = 0, + .n_buffers = 1, + .n_children = 2, + .dictionary = NULL, + // Bookkeeping + .release = &release_malloced_array + }; + // Allocate list of parent buffers + array->buffers = malloc(sizeof(void*) * array->n_buffers); + array->buffers[0] = NULL; // no nulls, null bitmap can be omitted + // Allocate list of children arrays + array->children = malloc(sizeof(struct ArrowArray*) * array->n_children); + + // + // Initialize child array #0 + // + child = array->children[0] = malloc(sizeof(struct ArrowArray)); + *child = (struct ArrowArray) { + // Data description + .length = nitems, + .offset = 0, + .null_count = -1, + .n_buffers = 2, + .n_children = 0, + .dictionary = NULL, + .children = NULL, + // Bookkeeping + .release = &release_malloced_array + }; + child->buffers = malloc(sizeof(void*) * array->n_buffers); + child->buffers[0] = float32_nulls; + child->buffers[1] = float32_data; + + // + // Initialize child array #1 + // + child = array->children[1] = malloc(sizeof(struct ArrowArray)); + *child = (struct ArrowArray) { + // Data description + .length = nitems, + .offset = 0, + .null_count = -1, + .n_buffers = 3, + .n_children = 0, + .dictionary = NULL, + .children = NULL, + // Bookkeeping + .release = &release_malloced_array + }; + child->buffers = malloc(sizeof(void*) * array->n_buffers); + child->buffers[0] = utf8_nulls; + child->buffers[1] = utf8_offsets; + child->buffers[2] = utf8_data; + } + + +Why two distinct structures? +============================ + +In many cases, the same type or schema description applies to multiple, +possibly short, batches of data. To avoid paying the cost of exporting +and importing the type description for each batch, the ``ArrowSchema`` +can be passed once, separately, at the beginning of the conversation between +producer and consumer. + +In other cases yet, the data type is fixed by the producer API, and may not +need to be communicated at all. + +However, if a producer is focused on one-shot exchange of data, it can +communicate the ``ArrowSchema`` and ``ArrowArray`` structures in the same +API call. + +Updating this specification +=========================== + +Once this specification is supported in an official Arrow release, the C +ABI is frozen. This means the ``ArrowSchema`` and ``ArrowArray`` structure +definitions should not change in any way -- including adding new members. + +Backwards-compatible changes are allowed, for example new +:c:member:`ArrowSchema.flags` values or expanded possibilities for +the :c:member:`ArrowSchema.format` string. + +Any incompatible changes should be part of a new specification, for example +"Arrow C data interface v2". 
+ +Inspiration +=========== + +The Arrow C data interface is inspired by the `Python buffer protocol`_, +which has proven immensely successful in allowing various Python libraries +exchange numerical data with no knowledge of each other and near-zero +adaptation cost. + + +.. _Python buffer protocol: https://www.python.org/dev/peps/pep-3118/ diff --git a/src/arrow/docs/source/format/CStreamInterface.rst b/src/arrow/docs/source/format/CStreamInterface.rst new file mode 100644 index 000000000..b8ccce355 --- /dev/null +++ b/src/arrow/docs/source/format/CStreamInterface.rst @@ -0,0 +1,218 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. highlight:: c + +.. _c-stream-interface: + +============================ +The Arrow C stream interface +============================ + +.. warning:: + This interface is experimental and may evolve based on feedback from + early users. ABI stability is not guaranteed yet. Feel free to + `contact us <https://arrow.apache.org/community/>`__. + +The C stream interface builds on the structures defined in the +:ref:`C data interface <c-data-interface>` and combines them into a higher-level +specification so as to ease the communication of streaming data within a single +process. + +Semantics +========= + +An Arrow C stream exposes a streaming source of data chunks, each with the +same schema. Chunks are obtained by calling a blocking pull-style iteration +function. + +Structure definition +==================== + +The C stream interface is defined by a single ``struct`` definition:: + + struct ArrowArrayStream { + // Callbacks providing stream functionality + int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out); + int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out); + const char* (*get_last_error)(struct ArrowArrayStream*); + + // Release callback + void (*release)(struct ArrowArrayStream*); + + // Opaque producer-specific data + void* private_data; + }; + +The ArrowArrayStream structure +------------------------------ + +The ``ArrowArrayStream`` provides the required callbacks to interact with a +streaming source of Arrow arrays. It has the following fields: + +.. c:member:: int (*ArrowArrayStream.get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out) + + *Mandatory.* This callback allows the consumer to query the schema of + the chunks of data in the stream. The schema is the same for all + data chunks. + + This callback must NOT be called on a released ``ArrowArrayStream``. + + *Return value:* 0 on success, a non-zero + :ref:`error code <c-stream-interface-error-codes>` otherwise. + +.. c:member:: int (*ArrowArrayStream.get_next)(struct ArrowArrayStream*, struct ArrowArray* out) + + *Mandatory.* This callback allows the consumer to get the next chunk + of data in the stream. 
+ + This callback must NOT be called on a released ``ArrowArrayStream``. + + *Return value:* 0 on success, a non-zero + :ref:`error code <c-stream-interface-error-codes>` otherwise. + + On success, the consumer must check whether the ``ArrowArray`` is + marked :ref:`released <c-data-interface-released>`. If the + ``ArrowArray`` is released, then the end of stream has been reached. + Otherwise, the ``ArrowArray`` contains a valid data chunk. + +.. c:member:: const char* (*ArrowArrayStream.get_last_error)(struct ArrowArrayStream*) + + *Mandatory.* This callback allows the consumer to get a textual description + of the last error. + + This callback must ONLY be called if the last operation on the + ``ArrowArrayStream`` returned an error. It must NOT be called on a + released ``ArrowArrayStream``. + + *Return value:* a pointer to a NULL-terminated character string (UTF8-encoded). + NULL can also be returned if no detailed description is available. + + The returned pointer is only guaranteed to be valid until the next call of + one of the stream's callbacks. The character string it points to should + be copied to consumer-managed storage if it is intended to survive longer. + +.. c:member:: void (*ArrowArrayStream.release)(struct ArrowArrayStream*) + + *Mandatory.* A pointer to a producer-provided release callback. + +.. c:member:: void* ArrowArrayStream.private_data + + *Optional.* An opaque pointer to producer-provided private data. + + Consumers MUST not process this member. Lifetime of this member + is handled by the producer, and especially by the release callback. + + +.. _c-stream-interface-error-codes: + +Error codes +----------- + +The ``get_schema`` and ``get_next`` callbacks may return an error under the form +of a non-zero integer code. Such error codes should be interpreted like +``errno`` numbers (as defined by the local platform). Note that the symbolic +forms of these constants are stable from platform to platform, but their numeric +values are platform-specific. + +In particular, it is recommended to recognize the following values: + +* ``EINVAL``: for a parameter or input validation error +* ``ENOMEM``: for a memory allocation failure (out of memory) +* ``EIO``: for a generic input/output error + +.. seealso:: + `Standard POSIX error codes <https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/errno.h.html>`__. + + `Error codes recognized by the Windows C runtime library + <https://docs.microsoft.com/en-us/cpp/c-runtime-library/errno-doserrno-sys-errlist-and-sys-nerr>`__. + +Result lifetimes +---------------- + +The data returned by the ``get_schema`` and ``get_next`` callbacks must be +released independently. Their lifetimes are not tied to that of the +``ArrowArrayStream``. + +Stream lifetime +--------------- + +Lifetime of the C stream is managed using a release callback with similar +usage as in the :ref:`C data interface <c-data-interface-released>`. 
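+
+For illustration, a stream producer's release callback could follow the same
+pattern as the array release callback shown for the C data interface. The
+``MyStreamPrivate`` struct below is a hypothetical placeholder for whatever
+bookkeeping the producer keeps in ``private_data``::
+
+   // Hypothetical bookkeeping kept by the producer in private_data
+   struct MyStreamPrivate {
+     // ... handles to the underlying data source, current position, etc. ...
+     int64_t chunks_produced;
+   };
+
+   static void MyStream_release(struct ArrowArrayStream* stream) {
+     // This should not be called on an already released stream
+     assert(stream->release != NULL);
+     // Free the producer-owned bookkeeping data
+     free(stream->private_data);
+     // Mark the stream released
+     stream->release = NULL;
+   }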
+
+
+C consumer example
+==================
+
+Let's say a particular database provides the following C API to execute
+a SQL query and return the result set as an Arrow C stream::
+
+   void MyDB_Query(const char* query, struct ArrowArrayStream* result_set);
+
+Then a consumer could use the following code to iterate over the results::
+
+   static void handle_error(int errcode, struct ArrowArrayStream* stream) {
+      // Print stream error
+      const char* errdesc = stream->get_last_error(stream);
+      if (errdesc != NULL) {
+         fputs(errdesc, stderr);
+      } else {
+         fputs(strerror(errcode), stderr);
+      }
+      // Release stream and abort
+      stream->release(stream);
+      exit(1);
+   }
+
+   void run_query() {
+      struct ArrowArrayStream stream;
+      struct ArrowSchema schema;
+      struct ArrowArray chunk;
+      int errcode;
+
+      MyDB_Query("SELECT * FROM my_table", &stream);
+
+      // Query result set schema
+      errcode = stream.get_schema(&stream, &schema);
+      if (errcode != 0) {
+         handle_error(errcode, &stream);
+      }
+
+      int64_t num_rows = 0;
+
+      // Iterate over results: loop until error or end of stream
+      while ((errcode = stream.get_next(&stream, &chunk)) == 0 &&
+             chunk.release != NULL) {
+         // Do something with chunk...
+         fprintf(stderr, "Result chunk: got %lld rows\n", (long long) chunk.length);
+         num_rows += chunk.length;
+
+         // Release chunk
+         chunk.release(&chunk);
+      }
+
+      // Was it an error?
+      if (errcode != 0) {
+         handle_error(errcode, &stream);
+      }
+
+      fprintf(stderr, "Result stream ended: total %lld rows\n", (long long) num_rows);
+
+      // Release schema and stream
+      schema.release(&schema);
+      stream.release(&stream);
+   }
diff --git a/src/arrow/docs/source/format/Columnar.rst b/src/arrow/docs/source/format/Columnar.rst
new file mode 100644
index 000000000..85261e7d9
--- /dev/null
+++ b/src/arrow/docs/source/format/Columnar.rst
@@ -0,0 +1,1221 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _format_columnar:
+
+*********************
+Arrow Columnar Format
+*********************
+
+*Version: 1.0*
+
+The "Arrow Columnar Format" includes a language-agnostic in-memory
+data structure specification, metadata serialization, and a protocol
+for serialization and generic data transport.
+
+This document is intended to provide adequate detail to create a new
+implementation of the columnar format without the aid of an existing
+implementation. We utilize Google's `Flatbuffers`_ project for
+metadata serialization, so it will be necessary to refer to the
+project's `Flatbuffers protocol definition files`_
+while reading this document.
+ +The columnar format has some key features: + +* Data adjacency for sequential access (scans) +* O(1) (constant-time) random access +* SIMD and vectorization-friendly +* Relocatable without "pointer swizzling", allowing for true zero-copy + access in shared memory + +The Arrow columnar format provides analytical performance and data +locality guarantees in exchange for comparatively more expensive +mutation operations. This document is concerned only with in-memory +data representation and serialization details; issues such as +coordinating mutation of data structures are left to be handled by +implementations. + +Terminology +=========== + +Since different projects have used different words to describe various +concepts, here is a small glossary to help disambiguate. + +* **Array** or **Vector**: a sequence of values with known length all + having the same type. These terms are used interchangeably in + different Arrow implementations, but we use "array" in this + document. +* **Slot**: a single logical value in an array of some particular data type +* **Buffer** or **Contiguous memory region**: a sequential virtual + address space with a given length. Any byte can be reached via a + single pointer offset less than the region's length. +* **Physical Layout**: The underlying memory layout for an array + without taking into account any value semantics. For example, a + 32-bit signed integer array and 32-bit floating point array have the + same layout. +* **Parent** and **child arrays**: names to express relationships + between physical value arrays in a nested type structure. For + example, a ``List<T>``-type parent array has a T-type array as its + child (see more on lists below). +* **Primitive type**: a data type having no child types. This includes + such types as fixed bit-width, variable-size binary, and null types. +* **Nested type**: a data type whose full structure depends on one or + more other child types. Two fully-specified nested types are equal + if and only if their child types are equal. For example, ``List<U>`` + is distinct from ``List<V>`` iff U and V are different types. +* **Logical type**: An application-facing semantic value type that is + implemented using some physical layout. For example, Decimal + values are stored as 16 bytes in a fixed-size binary + layout. Similarly, strings can be stored as ``List<1-byte>``. A + timestamp may be stored as 64-bit fixed-size layout. + +.. _format_layout: + +Physical Memory Layout +====================== + +Arrays are defined by a few pieces of metadata and data: + +* A logical data type. +* A sequence of buffers. +* A length as a 64-bit signed integer. Implementations are permitted + to be limited to 32-bit lengths, see more on this below. +* A null count as a 64-bit signed integer. +* An optional **dictionary**, for dictionary-encoded arrays. + +Nested arrays additionally have a sequence of one or more sets of +these items, called the **child arrays**. + +Each logical data type has a well-defined physical layout. Here are +the different physical layouts defined by Arrow: + +* **Primitive (fixed-size)**: a sequence of values each having the + same byte or bit width +* **Variable-size Binary**: a sequence of values each having a variable + byte length. Two variants of this layout are supported using 32-bit + and 64-bit length encoding. +* **Fixed-size List**: a nested layout where each value has the same + number of elements taken from a child data type. 
+* **Variable-size List**: a nested layout where each value is a + variable-length sequence of values taken from a child data type. Two + variants of this layout are supported using 32-bit and 64-bit length + encoding. +* **Struct**: a nested layout consisting of a collection of named + child **fields** each having the same length but possibly different + types. +* **Sparse** and **Dense Union**: a nested layout representing a + sequence of values, each of which can have type chosen from a + collection of child array types. +* **Null**: a sequence of all null values, having null logical type + +The Arrow columnar memory layout only applies to *data* and not +*metadata*. Implementations are free to represent metadata in-memory +in whichever form is convenient for them. We handle metadata +**serialization** in an implementation-independent way using +`Flatbuffers`_, detailed below. + +Buffer Alignment and Padding +---------------------------- + +Implementations are recommended to allocate memory on aligned +addresses (multiple of 8- or 64-bytes) and pad (overallocate) to a +length that is a multiple of 8 or 64 bytes. When serializing Arrow +data for interprocess communication, these alignment and padding +requirements are enforced. If possible, we suggest that you prefer +using 64-byte alignment and padding. Unless otherwise noted, padded +bytes do not need to have a specific value. + +The alignment requirement follows best practices for optimized memory +access: + +* Elements in numeric arrays will be guaranteed to be retrieved via aligned access. +* On some architectures alignment can help limit partially used cache lines. + +The recommendation for 64 byte alignment comes from the `Intel +performance guide`_ that recommends alignment of memory to match SIMD +register width. The specific padding length was chosen because it +matches the largest SIMD instruction registers available on widely +deployed x86 architecture (Intel AVX-512). + +The recommended padding of 64 bytes allows for using `SIMD`_ +instructions consistently in loops without additional conditional +checks. This should allow for simpler, efficient and CPU +cache-friendly code. In other words, we can load the entire 64-byte +buffer into a 512-bit wide SIMD register and get data-level +parallelism on all the columnar values packed into the 64-byte +buffer. Guaranteed padding can also allow certain compilers to +generate more optimized code directly (e.g. One can safely use Intel's +``-qopt-assume-safe-padding``). + +Array lengths +------------- + +Array lengths are represented in the Arrow metadata as a 64-bit signed +integer. An implementation of Arrow is considered valid even if it only +supports lengths up to the maximum 32-bit signed integer, though. If using +Arrow in a multi-language environment, we recommend limiting lengths to +2 :sup:`31` - 1 elements or less. Larger data sets can be represented using +multiple array chunks. + +Null count +---------- + +The number of null value slots is a property of the physical array and +considered part of the data structure. The null count is represented +in the Arrow metadata as a 64-bit signed integer, as it may be as +large as the array length. + +Validity bitmaps +---------------- + +Any value in an array may be semantically null, whether primitive or nested +type. + +All array types, with the exception of union types (more on these later), +utilize a dedicated memory buffer, known as the validity (or "null") bitmap, to +encode the nullness or non-nullness of each value slot. 
The validity bitmap +must be large enough to have at least 1 bit for each array slot. + +Whether any array slot is valid (non-null) is encoded in the respective bits of +this bitmap. A 1 (set bit) for index ``j`` indicates that the value is not null, +while a 0 (bit not set) indicates that it is null. Bitmaps are to be +initialized to be all unset at allocation time (this includes padding): :: + + is_valid[j] -> bitmap[j / 8] & (1 << (j % 8)) + +We use `least-significant bit (LSB) numbering`_ (also known as +bit-endianness). This means that within a group of 8 bits, we read +right-to-left: :: + + values = [0, 1, null, 2, null, 3] + + bitmap + j mod 8 7 6 5 4 3 2 1 0 + 0 0 1 0 1 0 1 1 + +Arrays having a 0 null count may choose to not allocate the validity +bitmap. Implementations may choose to always allocate one anyway as a +matter of convenience, but this should be noted when memory is being +shared. + +Nested type arrays except for union types have their own validity bitmap and +null count regardless of the null count and valid bits of their child arrays. + +Array slots which are null are not required to have a particular +value; any "masked" memory can have any value and need not be zeroed, +though implementations frequently choose to zero memory for null +values. + +Fixed-size Primitive Layout +--------------------------- + +A primitive value array represents an array of values each having the +same physical slot width typically measured in bytes, though the spec +also provides for bit-packed types (e.g. boolean values encoded in +bits). + +Internally, the array contains a contiguous memory buffer whose total +size is at least as large as the slot width multiplied by the array +length. For bit-packed types, the size is rounded up to the nearest +byte. + +The associated validity bitmap is contiguously allocated (as described +above) but does not need to be adjacent in memory to the values +buffer. + +**Example Layout: Int32 Array** + +For example a primitive array of int32s: :: + + [1, null, 2, 4, 8] + +Would look like: :: + + * Length: 5, Null count: 1 + * Validity bitmap buffer: + + |Byte 0 (validity bitmap) | Bytes 1-63 | + |-------------------------|-----------------------| + | 00011101 | 0 (padding) | + + * Value Buffer: + + |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-63 | + |------------|-------------|-------------|-------------|-------------|-------------| + | 1 | unspecified | 2 | 4 | 8 | unspecified | + +**Example Layout: Non-null int32 Array** + +``[1, 2, 3, 4, 8]`` has two possible layouts: :: + + * Length: 5, Null count: 0 + * Validity bitmap buffer: + + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-----------------------| + | 00011111 | 0 (padding) | + + * Value Buffer: + + |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | bytes 12-15 | bytes 16-19 | Bytes 20-63 | + |------------|-------------|-------------|-------------|-------------|-------------| + | 1 | 2 | 3 | 4 | 8 | unspecified | + +or with the bitmap elided: :: + + * Length 5, Null count: 0 + * Validity bitmap buffer: Not required + * Value Buffer: + + |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | bytes 12-15 | bytes 16-19 | Bytes 20-63 | + |------------|-------------|-------------|-------------|-------------|-------------| + | 1 | 2 | 3 | 4 | 8 | unspecified | + +Variable-size Binary Layout +--------------------------- + +Each value in this layout consists of 0 or more bytes. 
While primitive
+arrays have a single values buffer, variable-size binary arrays have an
+**offsets** buffer and a **data** buffer.
+
+The offsets buffer contains ``length + 1`` signed integers (either
+32-bit or 64-bit, depending on the logical type), which encode the
+start position of each slot in the data buffer. The length of the
+value in each slot is computed using the difference between the offset
+at that slot's index and the subsequent offset. For example, the
+position and length of slot j are computed as:
+
+::
+
+    slot_position = offsets[j]
+    slot_length = offsets[j + 1] - offsets[j]  // (for 0 <= j < length)
+
+It should be noted that a null value may have a positive slot length.
+That is, a null value may occupy a **non-empty** memory space in the data
+buffer. When this is true, the content of the corresponding memory space
+is undefined.
+
+Generally the first value in the offsets array is 0, and the last offset
+is the length of the values array. When serializing this layout, we
+recommend normalizing the offsets to start at 0.
+
+Variable-size List Layout
+-------------------------
+
+List is a nested type which is semantically similar to variable-size
+binary. It is defined by two buffers, a validity bitmap and an offsets
+buffer, and a child array. The offsets are the same as in the
+variable-size binary case, and both 32-bit and 64-bit signed integer
+offsets are supported. Rather than referencing an additional data
+buffer, these offsets reference the child array.
+
+A list type is specified like ``List<T>``, where ``T`` is any type
+(primitive or nested). In these examples we use 32-bit offsets where
+the 64-bit offset version would be denoted by ``LargeList<T>``.
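+
+To make the formulas above concrete, a consumer reading a slot of a
+variable-size binary array combines the validity bitmap check with the
+offsets lookup.  The following is a minimal sketch, not part of the
+specification; the helper names are illustrative and 32-bit offsets are
+assumed::
+
+    #include <stdint.h>
+
+    // Validity check using LSB bit numbering (see "Validity bitmaps" above).
+    // If the bitmap was elided (null count 0), this check is skipped.
+    static int is_valid(const uint8_t* validity_bitmap, int64_t j) {
+      return (validity_bitmap[j / 8] >> (j % 8)) & 1;
+    }
+
+    // Locate slot j of a variable-size binary array.  For List<T>, the same
+    // offsets arithmetic applies, but the resulting range indexes the child
+    // array rather than a data buffer.
+    static void get_binary_slot(const int32_t* offsets, const uint8_t* data,
+                                int64_t j, const uint8_t** out_data,
+                                int32_t* out_length) {
+      *out_data = data + offsets[j];
+      *out_length = offsets[j + 1] - offsets[j];
+    }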
+ +**Example Layout: ``List<Int8>`` Array** + +We illustrate an example of ``List<Int8>`` with length 4 having values:: + + [[12, -7, 25], null, [0, -127, 127, 50], []] + +will have the following representation: :: + + * Length: 4, Null count: 1 + * Validity bitmap buffer: + + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-----------------------| + | 00001101 | 0 (padding) | + + * Offsets buffer (int32) + + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-63 | + |------------|-------------|-------------|-------------|-------------|-------------| + | 0 | 3 | 3 | 7 | 7 | unspecified | + + * Values array (Int8array): + * Length: 7, Null count: 0 + * Validity bitmap buffer: Not required + * Values buffer (int8) + + | Bytes 0-6 | Bytes 7-63 | + |------------------------------|-------------| + | 12, -7, 25, 0, -127, 127, 50 | unspecified | + +**Example Layout: ``List<List<Int8>>``** + +``[[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], [[9, 10]]]`` + +will be represented as follows: :: + + * Length 3 + * Nulls count: 0 + * Validity bitmap buffer: Not required + * Offsets buffer (int32) + + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 | + |------------|------------|------------|-------------|-------------| + | 0 | 2 | 5 | 6 | unspecified | + + * Values array (`List<Int8>`) + * Length: 6, Null count: 1 + * Validity bitmap buffer: + + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-------------| + | 00110111 | 0 (padding) | + + * Offsets buffer (int32) + + | Bytes 0-27 | Bytes 28-63 | + |----------------------|-------------| + | 0, 2, 4, 7, 7, 8, 10 | unspecified | + + * Values array (Int8): + * Length: 10, Null count: 0 + * Validity bitmap buffer: Not required + + | Bytes 0-9 | Bytes 10-63 | + |-------------------------------|-------------| + | 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 | unspecified | + +Fixed-Size List Layout +---------------------- + +Fixed-Size List is a nested type in which each array slot contains a +fixed-size sequence of values all having the same type. + +A fixed size list type is specified like ``FixedSizeList<T>[N]``, +where ``T`` is any type (primitive or nested) and ``N`` is a 32-bit +signed integer representing the length of the lists. + +A fixed size list array is represented by a values array, which is a +child array of type T. T may also be a nested type. The value in slot +``j`` of a fixed size list array is stored in an ``N``-long slice of +the values array, starting at an offset of ``j * N``. + +**Example Layout: ``FixedSizeList<byte>[4]`` Array** + +Here we illustrate ``FixedSizeList<byte>[4]``. + +For an array of length 4 with respective values: :: + + [[192, 168, 0, 12], null, [192, 168, 0, 25], [192, 168, 0, 1]] + +will have the following representation: :: + + * Length: 4, Null count: 1 + * Validity bitmap buffer: + + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-----------------------| + | 00001101 | 0 (padding) | + + * Values array (byte array): + * Length: 16, Null count: 0 + * validity bitmap buffer: Not required + + | Bytes 0-3 | Bytes 4-7 | Bytes 8-15 | + |-----------------|-------------|---------------------------------| + | 192, 168, 0, 12 | unspecified | 192, 168, 0, 25, 192, 168, 0, 1 | + + +Struct Layout +------------- + +A struct is a nested type parameterized by an ordered sequence of +types (which can all be distinct), called its fields. Each field must +have a UTF8-encoded name, and these field names are part of the type +metadata. 
+ +A struct array does not have any additional allocated physical storage +for its values. A struct array must still have an allocated validity +bitmap, if it has one or more null values. + +Physically, a struct array has one child array for each field. The +child arrays are independent and need not be adjacent to each other in +memory. + +For example, the struct (field names shown here as strings for illustration +purposes):: + + Struct < + name: VarBinary + age: Int32 + > + +has two child arrays, one ``VarBinary`` array (using variable-size binary +layout) and one 4-byte primitive value array having ``Int32`` logical +type. + +**Example Layout: ``Struct<VarBinary, Int32>``** + +The layout for ``[{'joe', 1}, {null, 2}, null, {'mark', 4}]`` would be: :: + + * Length: 4, Null count: 1 + * Validity bitmap buffer: + + |Byte 0 (validity bitmap) | Bytes 1-63 | + |-------------------------|-----------------------| + | 00001011 | 0 (padding) | + + * Children arrays: + * field-0 array (`VarBinary`): + * Length: 4, Null count: 2 + * Validity bitmap buffer: + + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-----------------------| + | 00001001 | 0 (padding) | + + * Offsets buffer: + + | Bytes 0-19 | + |----------------| + | 0, 3, 3, 3, 7 | + + * Values array: + * Length: 7, Null count: 0 + * Validity bitmap buffer: Not required + + * Value buffer: + + | Bytes 0-6 | + |----------------| + | joemark | + + * field-1 array (int32 array): + * Length: 4, Null count: 1 + * Validity bitmap buffer: + + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-----------------------| + | 00001011 | 0 (padding) | + + * Value Buffer: + + |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 | + |------------|-------------|-------------|-------------|-------------| + | 1 | 2 | unspecified | 4 | unspecified | + +While a struct does not have physical storage for each of its semantic +slots (i.e. each scalar C-like struct), an entire struct slot can be +set to null via the validity bitmap. Any of the child field arrays can +have null values according to their respective independent validity +bitmaps. This implies that for a particular struct slot the validity +bitmap for the struct array might indicate a null slot when one or +more of its child arrays has a non-null value in their corresponding +slot. When reading the struct array the parent validity bitmap takes +priority. This is illustrated in the example above, the child arrays +have valid entries for the null struct but are 'hidden' from the +consumer by the parent array's validity bitmap. However, when treated +independently corresponding values of the children array will be +non-null. + +Union Layout +------------ + +A union is defined by an ordered sequence of types; each slot in the +union can have a value chosen from these types. The types are named +like a struct's fields, and the names are part of the type metadata. + +Unlike other data types, unions do not have their own validity bitmap. Instead, +the nullness of each slot is determined exclusively by the child arrays which +are composed to create the union. + +We define two distinct union types, "dense" and "sparse", that are +optimized for different use cases. + +Dense Union +~~~~~~~~~~~ + +Dense union represents a mixed-type array with 5 bytes of overhead for +each value. Its physical layout is as follows: + +* One child array for each type +* Types buffer: A buffer of 8-bit signed integers. 
Each type in the
+  union has a corresponding type id whose values are found in this
+  buffer. A union with more than 127 possible types can be modeled as
+  a union of unions.
+* Offsets buffer: A buffer of signed int32 values indicating the
+  relative offset into the respective child array for the type in a
+  given slot. The respective offsets for each child value array must
+  be in order / increasing.
+
+Critically, the dense union allows for minimal overhead in the ubiquitous
+union-of-structs with non-overlapping-fields use case (``Union<s1: Struct1, s2:
+Struct2, s3: Struct3, ...>``).
+
+**Example Layout: Dense union**
+
+An example layout for logical union of: ``Union<f: float, i: int32>``
+having the values: ``[{f=1.2}, null, {f=3.4}, {i=5}]``
+
+::
+
+    * Length: 4, Null count: 0
+    * Types buffer:
+
+      |Byte 0   | Byte 1      | Byte 2   | Byte 3   | Bytes 4-63  |
+      |---------|-------------|----------|----------|-------------|
+      | 0       | 0           | 0        | 1        | unspecified |
+
+    * Offset buffer:
+
+      |Bytes 0-3 | Bytes 4-7   | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 |
+      |----------|-------------|------------|-------------|-------------|
+      | 0        | 1           | 2          | 0           | unspecified |
+
+    * Children arrays:
+      * Field-0 array (f: float):
+        * Length: 3, Null count: 1
+        * Validity bitmap buffer: 00000101
+
+        * Value Buffer:
+
+          | Bytes 0-11     | Bytes 12-63 |
+          |----------------|-------------|
+          | 1.2, null, 3.4 | unspecified |
+
+
+      * Field-1 array (i: int32):
+        * Length: 1, Null count: 0
+        * Validity bitmap buffer: Not required
+
+        * Value Buffer:
+
+          | Bytes 0-3 | Bytes 4-63  |
+          |-----------|-------------|
+          | 5         | unspecified |
+
+Sparse Union
+~~~~~~~~~~~~
+
+A sparse union has the same structure as a dense union, with the omission of
+the offsets array. In this case, the child arrays are each equal in length to
+the length of the union.
+
+While a sparse union may use significantly more space compared with a
+dense union, it has some advantages that may be desirable in certain
+use cases:
+
+* A sparse union is more amenable to vectorized expression evaluation in some use cases.
+* Equal-length arrays can be interpreted as a union by only defining the types array.
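+
+For both union variants, reading a slot means selecting the child identified
+by the types buffer and then picking an index inside that child: the offsets
+buffer supplies the index for a dense union, while a sparse union uses the
+slot index itself.  The sketch below is illustrative only; the ``UnionArray``
+struct is hypothetical, and the declared child type ids come from the type
+metadata (they are not necessarily equal to the child's position)::
+
+    #include <stdint.h>
+
+    struct UnionArray {
+      const int8_t* type_ids;        // types buffer, one entry per slot
+      const int32_t* value_offsets;  // dense unions only; NULL for sparse
+      const int8_t* child_type_ids;  // declared type id of each child
+      int num_children;
+    };
+
+    // Find which child holds slot j, and at which index inside that child.
+    static void resolve_union_slot(const struct UnionArray* u, int64_t j,
+                                   int* out_child, int64_t* out_index) {
+      int8_t tid = u->type_ids[j];
+      *out_child = -1;
+      for (int c = 0; c < u->num_children; ++c) {
+        if (u->child_type_ids[c] == tid) {
+          *out_child = c;
+          break;
+        }
+      }
+      *out_index = (u->value_offsets != NULL) ? u->value_offsets[j]  // dense
+                                              : j;                   // sparse
+    }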
+ +**Example layout: ``SparseUnion<u0: Int32, u1: Float, u2: VarBinary>``** + +For the union array: :: + + [{u0=5}, {u1=1.2}, {u2='joe'}, {u1=3.4}, {u0=4}, {u2='mark'}] + +will have the following layout: :: + + * Length: 6, Null count: 0 + * Types buffer: + + | Byte 0 | Byte 1 | Byte 2 | Byte 3 | Byte 4 | Byte 5 | Bytes 6-63 | + |------------|-------------|-------------|-------------|-------------|--------------|-----------------------| + | 0 | 1 | 2 | 1 | 0 | 2 | unspecified (padding) | + + * Children arrays: + + * u0 (Int32): + * Length: 6, Null count: 4 + * Validity bitmap buffer: + + |Byte 0 (validity bitmap) | Bytes 1-63 | + |-------------------------|-----------------------| + |00010001 | 0 (padding) | + + * Value buffer: + + |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-63 | + |------------|-------------|-------------|-------------|-------------|--------------|-----------------------| + | 5 | unspecified | unspecified | unspecified | 4 | unspecified | unspecified (padding) | + + * u1 (float): + * Length: 6, Null count: 4 + * Validity bitmap buffer: + + |Byte 0 (validity bitmap) | Bytes 1-63 | + |-------------------------|-----------------------| + | 00001010 | 0 (padding) | + + * Value buffer: + + |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-63 | + |-------------|-------------|-------------|-------------|-------------|--------------|-----------------------| + | unspecified | 1.2 | unspecified | 3.4 | unspecified | unspecified | unspecified (padding) | + + * u2 (`VarBinary`) + * Length: 6, Null count: 4 + * Validity bitmap buffer: + + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-----------------------| + | 00100100 | 0 (padding) | + + * Offsets buffer (int32) + + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-27 | Bytes 28-63 | + |------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------| + | 0 | 0 | 0 | 3 | 3 | 3 | 7 | unspecified | + + * Values array (VarBinary): + * Length: 7, Null count: 0 + * Validity bitmap buffer: Not required + + | Bytes 0-6 | Bytes 7-63 | + |------------|-----------------------| + | joemark | unspecified (padding) | + +Only the slot in the array corresponding to the type index is considered. All +"unselected" values are ignored and could be any semantically correct array +value. + +Null Layout +----------- + +We provide a simplified memory-efficient layout for the Null data type +where all values are null. In this case no memory buffers are +allocated. + +.. _dictionary-encoded-layout: + +Dictionary-encoded Layout +------------------------- + +Dictionary encoding is a data representation technique to represent +values by integers referencing a **dictionary** usually consisting of +unique values. It can be effective when you have data with many +repeated values. + +Any array can be dictionary-encoded. The dictionary is stored as an optional +property of an array. When a field is dictionary encoded, the values are +represented by an array of non-negative integers representing the index of the +value in the dictionary. The memory layout for a dictionary-encoded array is +the same as that of a primitive integer layout. The dictionary is handled as a +separate columnar array with its own respective layout. 
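+
+Reading back a value is therefore a two-step lookup: consult the index array
+(including its validity bitmap), then fetch the referenced entry from the
+dictionary array.  A minimal sketch, assuming Int32 indices and a VarBinary
+dictionary (the struct and helper names are hypothetical)::
+
+    #include <stdint.h>
+
+    struct DictEncodedVarBinary {
+      const uint8_t* validity;      // validity bitmap of the indices array
+      const int32_t* indices;       // index type: Int32
+      const int32_t* dict_offsets;  // dictionary: VarBinary offsets buffer
+      const uint8_t* dict_data;     // dictionary: VarBinary data buffer
+    };
+
+    // Returns 1 and fills *out/*out_len if slot j is non-null, else returns 0.
+    static int get_dict_value(const struct DictEncodedVarBinary* col, int64_t j,
+                              const uint8_t** out, int32_t* out_len) {
+      if (col->validity != NULL && !((col->validity[j / 8] >> (j % 8)) & 1)) {
+        return 0;  // slot j is null
+      }
+      int32_t k = col->indices[j];
+      *out = col->dict_data + col->dict_offsets[k];
+      *out_len = col->dict_offsets[k + 1] - col->dict_offsets[k];
+      return 1;
+    }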
+ +As an example, you could have the following data: :: + + type: VarBinary + + ['foo', 'bar', 'foo', 'bar', null, 'baz'] + +In dictionary-encoded form, this could appear as: + +:: + + data VarBinary (dictionary-encoded) + index_type: Int32 + values: [0, 1, 0, 1, null, 2] + + dictionary + type: VarBinary + values: ['foo', 'bar', 'baz'] + +Note that a dictionary is permitted to contain duplicate values or +nulls: + +:: + + data VarBinary (dictionary-encoded) + index_type: Int32 + values: [0, 1, 3, 1, 4, 2] + + dictionary + type: VarBinary + values: ['foo', 'bar', 'baz', 'foo', null] + +The null count of such arrays is dictated only by the validity bitmap +of its indices, irrespective of any null values in the dictionary. + +Since unsigned integers can be more difficult to work with in some cases +(e.g. in the JVM), we recommend preferring signed integers over unsigned +integers for representing dictionary indices. Additionally, we recommend +avoiding using 64-bit unsigned integer indices unless they are required by an +application. + +We discuss dictionary encoding as it relates to serialization further +below. + +Buffer Listing for Each Layout +------------------------------ + +For the avoidance of ambiguity, we provide listing the order and type +of memory buffers for each layout. + +.. csv-table:: Buffer Layouts + :header: "Layout Type", "Buffer 0", "Buffer 1", "Buffer 2" + :widths: 30, 20, 20, 20 + + "Primitive",validity,data, + "Variable Binary",validity,offsets,data + "List",validity,offsets, + "Fixed-size List",validity,, + "Struct",validity,, + "Sparse Union",type ids,, + "Dense Union",type ids,offsets, + "Null",,, + "Dictionary-encoded",validity,data (indices), + +Logical Types +============= + +The `Schema.fbs`_ defines built-in logical types supported by the +Arrow columnar format. Each logical type uses one of the above +physical layouts. Nested logical types may have different physical +layouts depending on the particular realization of the type. + +We do not go into detail about the logical types definitions in this +document as we consider `Schema.fbs`_ to be authoritative. + +.. _format-ipc: + +Serialization and Interprocess Communication (IPC) +================================================== + +The primitive unit of serialized data in the columnar format is the +"record batch". Semantically, a record batch is an ordered collection +of arrays, known as its **fields**, each having the same length as one +another but potentially different data types. A record batch's field +names and types collectively form the batch's **schema**. + +In this section we define a protocol for serializing record batches +into a stream of binary payloads and reconstructing record batches +from these payloads without need for memory copying. + +The columnar IPC protocol utilizes a one-way stream of binary messages +of these types: + +* Schema +* RecordBatch +* DictionaryBatch + +We specify a so-called *encapsulated IPC message* format which +includes a serialized Flatbuffer type along with an optional message +body. We define this message format before describing how to serialize +each constituent IPC message type. + +Encapsulated message format +--------------------------- + +For simple streaming and file-based serialization, we define a +"encapsulated" message format for interprocess communication. Such +messages can be "deserialized" into in-memory Arrow array objects by +examining only the message metadata without any need to copy or move +any of the actual data. 
+ +The encapsulated binary message format is as follows: + +* A 32-bit continuation indicator. The value ``0xFFFFFFFF`` indicates + a valid message. This component was introduced in version 0.15.0 in + part to address the 8-byte alignment requirement of Flatbuffers +* A 32-bit little-endian length prefix indicating the metadata size +* The message metadata as using the ``Message`` type defined in + `Message.fbs`_ +* Padding bytes to an 8-byte boundary +* The message body, whose length must be a multiple of 8 bytes + +Schematically, we have: :: + + <continuation: 0xFFFFFFFF> + <metadata_size: int32> + <metadata_flatbuffer: bytes> + <padding> + <message body> + +The complete serialized message must be a multiple of 8 bytes so that messages +can be relocated between streams. Otherwise the amount of padding between the +metadata and the message body could be non-deterministic. + +The ``metadata_size`` includes the size of the ``Message`` plus +padding. The ``metadata_flatbuffer`` contains a serialized ``Message`` +Flatbuffer value, which internally includes: + +* A version number +* A particular message value (one of ``Schema``, ``RecordBatch``, or + ``DictionaryBatch``) +* The size of the message body +* A ``custom_metadata`` field for any application-supplied metadata + +When read from an input stream, generally the ``Message`` metadata is +initially parsed and validated to obtain the body size. Then the body +can be read. + +Schema message +-------------- + +The Flatbuffers files `Schema.fbs`_ contains the definitions for all +built-in logical data types and the ``Schema`` metadata type which +represents the schema of a given record batch. A schema consists of +an ordered sequence of fields, each having a name and type. A +serialized ``Schema`` does not contain any data buffers, only type +metadata. + +The ``Field`` Flatbuffers type contains the metadata for a single +array. This includes: + +* The field's name +* The field's logical type +* Whether the field is semantically nullable. While this has no + bearing on the array's physical layout, many systems distinguish + nullable and non-nullable fields and we want to allow them to + preserve this metadata to enable faithful schema round trips. +* A collection of child ``Field`` values, for nested types +* A ``dictionary`` property indicating whether the field is + dictionary-encoded or not. If it is, a dictionary "id" is assigned + to allow matching a subsequent dictionary IPC message with the + appropriate field. + +We additionally provide both schema-level and field-level +``custom_metadata`` attributes allowing for systems to insert their +own application defined metadata to customize behavior. + +RecordBatch message +------------------- + +A RecordBatch message contains the actual data buffers corresponding +to the physical memory layout determined by a schema. The metadata for +this message provides the location and size of each buffer, permitting +Array data structures to be reconstructed using pointer arithmetic and +thus no memory copying. + +The serialized form of the record batch is the following: + +* The ``data header``, defined as the ``RecordBatch`` type in + `Message.fbs`_. 
+* The ``body``, a flat sequence of memory buffers written end-to-end + with appropriate padding to ensure a minimum of 8-byte alignment + +The data header contains the following: + +* The length and null count for each flattened field in the record + batch +* The memory offset and length of each constituent ``Buffer`` in the + record batch's body + +Fields and buffers are flattened by a pre-order depth-first traversal +of the fields in the record batch. For example, let's consider the +schema :: + + col1: Struct<a: Int32, b: List<item: Int64>, c: Float64> + col2: Utf8 + +The flattened version of this is: :: + + FieldNode 0: Struct name='col1' + FieldNode 1: Int32 name='a' + FieldNode 2: List name='b' + FieldNode 3: Int64 name='item' + FieldNode 4: Float64 name='c' + FieldNode 5: Utf8 name='col2' + +For the buffers produced, we would have the following (refer to the +table above): :: + + buffer 0: field 0 validity + buffer 1: field 1 validity + buffer 2: field 1 values + buffer 3: field 2 validity + buffer 4: field 2 offsets + buffer 5: field 3 validity + buffer 6: field 3 values + buffer 7: field 4 validity + buffer 8: field 4 values + buffer 9: field 5 validity + buffer 10: field 5 offsets + buffer 11: field 5 data + +The ``Buffer`` Flatbuffers value describes the location and size of a +piece of memory. Generally these are interpreted relative to the +**encapsulated message format** defined below. + +The ``size`` field of ``Buffer`` is not required to account for padding +bytes. Since this metadata can be used to communicate in-memory pointer +addresses between libraries, it is recommended to set ``size`` to the actual +memory size rather than the padded size. + +Byte Order (`Endianness`_) +--------------------------- + +The Arrow format is little endian by default. + +Serialized Schema metadata has an endianness field indicating +endianness of RecordBatches. Typically this is the endianness of the +system where the RecordBatch was generated. The main use case is +exchanging RecordBatches between systems with the same Endianness. At +first we will return an error when trying to read a Schema with an +endianness that does not match the underlying system. The reference +implementation is focused on Little Endian and provides tests for +it. Eventually we may provide automatic conversion via byte swapping. + +IPC Streaming Format +-------------------- + +We provide a streaming protocol or "format" for record batches. It is +presented as a sequence of encapsulated messages, each of which +follows the format above. The schema comes first in the stream, and it +is the same for all of the record batches that follow. If any fields +in the schema are dictionary-encoded, one or more ``DictionaryBatch`` +messages will be included. ``DictionaryBatch`` and ``RecordBatch`` +messages may be interleaved, but before any dictionary key is used in +a ``RecordBatch`` it should be defined in a ``DictionaryBatch``. :: + + <SCHEMA> + <DICTIONARY 0> + ... + <DICTIONARY k - 1> + <RECORD BATCH 0> + ... + <DICTIONARY x DELTA> + ... + <DICTIONARY y DELTA> + ... + <RECORD BATCH n - 1> + <EOS [optional]: 0xFFFFFFFF 0x00000000> + +.. note:: An edge-case for interleaved dictionary and record batches occurs + when the record batches contain dictionary encoded arrays that are + completely null. In this case, the dictionary for the encoded column might + appear after the first record batch. 
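+
+As an illustration of the framing described under "Encapsulated message
+format", a reader can walk a stream by repeatedly reading the continuation
+indicator and the metadata length, then the metadata and body.  The sketch
+below is schematic only: it assumes a little-endian host, ``parse_body_length``
+stands in for Flatbuffers parsing of the ``Message`` metadata (which carries
+the body size), and a real reader would map or retain the body rather than
+skip it::
+
+    #include <stdint.h>
+    #include <stdio.h>
+    #include <stdlib.h>
+
+    // Hypothetical stand-in for Flatbuffers parsing of the Message metadata.
+    int64_t parse_body_length(const uint8_t* metadata, int32_t metadata_size);
+
+    static void walk_ipc_stream(FILE* in) {
+      for (;;) {
+        uint32_t continuation;
+        int32_t metadata_size;
+        if (fread(&continuation, 4, 1, in) != 1) break;   // stream ended
+        if (continuation != 0xFFFFFFFFu) break;           // not a valid message
+        if (fread(&metadata_size, 4, 1, in) != 1) break;  // little-endian prefix
+        if (metadata_size == 0) break;                    // explicit EOS marker
+
+        // The metadata length already includes padding to an 8-byte boundary.
+        uint8_t* metadata = (uint8_t*) malloc((size_t) metadata_size);
+        if (metadata == NULL ||
+            fread(metadata, 1, (size_t) metadata_size, in) != (size_t) metadata_size) {
+          free(metadata);
+          break;
+        }
+        int64_t body_length = parse_body_length(metadata, metadata_size);
+        fseek(in, (long) body_length, SEEK_CUR);  // a real reader reads the body
+        free(metadata);
+      }
+    }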
+ +When a stream reader implementation is reading a stream, after each +message, it may read the next 8 bytes to determine both if the stream +continues and the size of the message metadata that follows. Once the +message flatbuffer is read, you can then read the message body. + +The stream writer can signal end-of-stream (EOS) either by writing 8 bytes +containing the 4-byte continuation indicator (``0xFFFFFFFF``) followed by 0 +metadata length (``0x00000000``) or closing the stream interface. We +recommend the ".arrows" file extension for the streaming format although +in many cases these streams will not ever be stored as files. + +IPC File Format +--------------- + +We define a "file format" supporting random access that is an extension of +the stream format. The file starts and ends with a magic string ``ARROW1`` +(plus padding). What follows in the file is identical to the stream format. +At the end of the file, we write a *footer* containing a redundant copy of +the schema (which is a part of the streaming format) plus memory offsets and +sizes for each of the data blocks in the file. This enables random access to +any record batch in the file. See `File.fbs`_ for the precise details of the +file footer. + +Schematically we have: :: + + <magic number "ARROW1"> + <empty padding bytes [to 8 byte boundary]> + <STREAMING FORMAT with EOS> + <FOOTER> + <FOOTER SIZE: int32> + <magic number "ARROW1"> + +In the file format, there is no requirement that dictionary keys +should be defined in a ``DictionaryBatch`` before they are used in a +``RecordBatch``, as long as the keys are defined somewhere in the +file. Further more, it is invalid to have more than one **non-delta** +dictionary batch per dictionary ID (i.e. dictionary replacement is not +supported). Delta dictionaries are applied in the order they appear in +the file footer. We recommend the ".arrow" extension for files created with +this format. + +Dictionary Messages +------------------- + +Dictionaries are written in the stream and file formats as a sequence of record +batches, each having a single field. The complete semantic schema for a +sequence of record batches, therefore, consists of the schema along with all of +the dictionaries. The dictionary types are found in the schema, so it is +necessary to read the schema to first determine the dictionary types so that +the dictionaries can be properly interpreted: :: + + table DictionaryBatch { + id: long; + data: RecordBatch; + isDelta: boolean = false; + } + +The dictionary ``id`` in the message metadata can be referenced one or more times +in the schema, so that dictionaries can even be used for multiple fields. See +the :ref:`dictionary-encoded-layout` section for more about the semantics of +dictionary-encoded data. + +The dictionary ``isDelta`` flag allows existing dictionaries to be +expanded for future record batch materializations. A dictionary batch +with ``isDelta`` set indicates that its vector should be concatenated +with those of any previous batches with the same ``id``. In a stream +which encodes one column, the list of strings ``["A", "B", "C", "B", +"D", "C", "E", "A"]``, with a delta dictionary batch could take the +form: :: + + <SCHEMA> + <DICTIONARY 0> + (0) "A" + (1) "B" + (2) "C" + + <RECORD BATCH 0> + 0 + 1 + 2 + 1 + + <DICTIONARY 0 DELTA> + (3) "D" + (4) "E" + + <RECORD BATCH 1> + 3 + 2 + 4 + 0 + EOS + +Alternatively, if ``isDelta`` is set to false, then the dictionary +replaces the existing dictionary for the same ID. 
Using the same +example as above, an alternate encoding could be: :: + + + <SCHEMA> + <DICTIONARY 0> + (0) "A" + (1) "B" + (2) "C" + + <RECORD BATCH 0> + 0 + 1 + 2 + 1 + + <DICTIONARY 0> + (0) "A" + (1) "C" + (2) "D" + (3) "E" + + <RECORD BATCH 1> + 2 + 1 + 3 + 0 + EOS + + +Custom Application Metadata +--------------------------- + +We provide a ``custom_metadata`` field at three levels to provide a +mechanism for developers to pass application-specific metadata in +Arrow protocol messages. This includes ``Field``, ``Schema``, and +``Message``. + +The colon symbol ``:`` is to be used as a namespace separator. It can +be used multiple times in a key. + +The ``ARROW`` pattern is a reserved namespace for internal Arrow use +in the ``custom_metadata`` fields. For example, +``ARROW:extension:name``. + +.. _format_metadata_extension_types: + +Extension Types +--------------- + +User-defined "extension" types can be defined setting certain +``KeyValue`` pairs in ``custom_metadata`` in the ``Field`` metadata +structure. These extension keys are: + +* ``'ARROW:extension:name'`` for the string name identifying the + custom data type. We recommend that you use a "namespace"-style + prefix for extension type names to minimize the possibility of + conflicts with multiple Arrow readers and writers in the same + application. For example, use ``myorg.name_of_type`` instead of + simply ``name_of_type`` +* ``'ARROW:extension:metadata'`` for a serialized representation + of the ``ExtensionType`` necessary to reconstruct the custom type + +This extension metadata can annotate any of the built-in Arrow logical +types. The intent is that an implementation that does not support an +extension type can still handle the underlying data. For example a +16-byte UUID value could be embedded in ``FixedSizeBinary(16)``, and +implementations that do not have this extension type can still work +with the underlying binary values and pass along the +``custom_metadata`` in subsequent Arrow protocol messages. + +Extension types may or may not use the +``'ARROW:extension:metadata'`` field. Let's consider some example +extension types: + +* ``uuid`` represented as ``FixedSizeBinary(16)`` with empty metadata +* ``latitude-longitude`` represented as ``struct<latitude: double, + longitude: double>``, and empty metadata +* ``tensor`` (multidimensional array) stored as ``Binary`` values and + having serialized metadata indicating the data type and shape of + each value. This could be JSON like ``{'type': 'int8', 'shape': [4, + 5]}`` for a 4x5 cell tensor. +* ``trading-time`` represented as ``Timestamp`` with serialized + metadata indicating the market trading calendar the data corresponds + to + +Implementation guidelines +========================= + +An execution engine (or framework, or UDF executor, or storage engine, +etc) can implement only a subset of the Arrow spec and/or extend it +given the following constraints: + +Implementing a subset the spec +------------------------------ + +* **If only producing (and not consuming) arrow vectors**: Any subset + of the vector spec and the corresponding metadata can be implemented. +* **If consuming and producing vectors**: There is a minimal subset of + vectors to be supported. Production of a subset of vectors and + their corresponding metadata is always fine. Consumption of vectors + should at least convert the unsupported input vectors to the + supported subset (for example Timestamp.millis to timestamp.micros + or int32 to int64). 
+ +Extensibility +------------- + +An execution engine implementor can also extend their memory +representation with their own vectors internally as long as they are +never exposed. Before sending data to another system expecting Arrow +data, these custom vectors should be converted to a type that exist in +the Arrow spec. + +.. _Flatbuffers: http://github.com/google/flatbuffers +.. _Flatbuffers protocol definition files: https://github.com/apache/arrow/tree/master/format +.. _Schema.fbs: https://github.com/apache/arrow/blob/master/format/Schema.fbs +.. _Message.fbs: https://github.com/apache/arrow/blob/master/format/Message.fbs +.. _File.fbs: https://github.com/apache/arrow/blob/master/format/File.fbs +.. _least-significant bit (LSB) numbering: https://en.wikipedia.org/wiki/Bit_numbering +.. _Intel performance guide: https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors +.. _Endianness: https://en.wikipedia.org/wiki/Endianness +.. _SIMD: https://software.intel.com/en-us/cpp-compiler-developer-guide-and-reference-introduction-to-the-simd-data-layout-templates +.. _Parquet: https://parquet.apache.org/documentation/latest/ diff --git a/src/arrow/docs/source/format/Flight.rst b/src/arrow/docs/source/format/Flight.rst new file mode 100644 index 000000000..c79c56386 --- /dev/null +++ b/src/arrow/docs/source/format/Flight.rst @@ -0,0 +1,152 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _flight-rpc: + +Arrow Flight RPC +================ + +Arrow Flight is an RPC framework for high-performance data services +based on Arrow data, and is built on top of gRPC_ and the :doc:`IPC +format <IPC>`. + +Flight is organized around streams of Arrow record batches, being +either downloaded from or uploaded to another service. A set of +metadata methods offers discovery and introspection of streams, as +well as the ability to implement application-specific methods. + +Methods and message wire formats are defined by Protobuf, enabling +interoperability with clients that may support gRPC and Arrow +separately, but not Flight. However, Flight implementations include +further optimizations to avoid overhead in usage of Protobuf (mostly +around avoiding excessive memory copies). + +.. _gRPC: https://grpc.io/ + +RPC Methods +----------- + +Flight defines a set of RPC methods for uploading/downloading data, +retrieving metadata about a data stream, listing available data +streams, and for implementing application-specific RPC methods. A +Flight service implements some subset of these methods, while a Flight +client can call any of these methods. Thus, one Flight client can +connect to any Flight service and perform basic operations. 
+ +Data streams are identified by descriptors, which are either a path or +an arbitrary binary command. A client that wishes to download the data +would: + +#. Construct or acquire a ``FlightDescriptor`` for the data set they + are interested in. A client may know what descriptor they want + already, or they may use methods like ``ListFlights`` to discover + them. +#. Call ``GetFlightInfo(FlightDescriptor)`` to get a ``FlightInfo`` + message containing details on where the data is located (as well as + other metadata, like the schema and possibly an estimate of the + dataset size). + + Flight does not require that data live on the same server as + metadata: this call may list other servers to connect to. The + ``FlightInfo`` message includes a ``Ticket``, an opaque binary + token that the server uses to identify the exact data set being + requested. +#. Connect to other servers (if needed). +#. Call ``DoGet(Ticket)`` to get back a stream of Arrow record + batches. + +To upload data, a client would: + +#. Construct or acquire a ``FlightDescriptor``, as before. +#. Call ``DoPut(FlightData)`` and upload a stream of Arrow record + batches. They would also include the ``FlightDescriptor`` with the + first message. + +See `Protocol Buffer Definitions`_ for full details on the methods and +messages involved. + +Authentication +-------------- + +Flight supports application-implemented authentication +methods. Authentication, if enabled, has two phases: at connection +time, the client and server can exchange any number of messages. Then, +the client can provide a token alongside each call, and the server can +validate that token. + +Applications may use any part of this; for instance, they may ignore +the initial handshake and send an externally acquired token on each +call, or they may establish trust during the handshake and not +validate a token for each call. (Note that the latter is not secure if +you choose to deploy a layer 7 load balancer, as is common with gRPC.) + +Error Handling +-------------- + +Arrow Flight defines its own set of error codes. The implementation +differs between languages (e.g. in C++, Unimplemented is a general +Arrow error status while it's a Flight-specific exception in Java), +but the following set is exposed: + ++----------------+-------------------------------------------+ +|Error Code |Description | ++================+===========================================+ +|UNKNOWN |An unknown error. The default if no other | +| |error applies. | ++----------------+-------------------------------------------+ +|INTERNAL |An error internal to the service | +| |implementation occurred. | ++----------------+-------------------------------------------+ +|INVALID_ARGUMENT|The client passed an invalid argument to | +| |the RPC. | ++----------------+-------------------------------------------+ +|TIMED_OUT |The operation exceeded a timeout or | +| |deadline. | ++----------------+-------------------------------------------+ +|NOT_FOUND |The requested resource (action, data | +| |stream) was not found. | ++----------------+-------------------------------------------+ +|ALREADY_EXISTS |The resource already exists. | ++----------------+-------------------------------------------+ +|CANCELLED |The operation was cancelled (either by the | +| |client or the server). | ++----------------+-------------------------------------------+ +|UNAUTHENTICATED |The client is not authenticated. 
| ++----------------+-------------------------------------------+ +|UNAUTHORIZED |The client is authenticated, but does not | +| |have permissions for the requested | +| |operation. | ++----------------+-------------------------------------------+ +|UNIMPLEMENTED |The RPC is not implemented. | ++----------------+-------------------------------------------+ +|UNAVAILABLE |The server is not available. May be emitted| +| |by the client for connectivity reasons. | ++----------------+-------------------------------------------+ + + +External Resources +------------------ + +- https://arrow.apache.org/blog/2018/10/09/0.11.0-release/ +- https://www.slideshare.net/JacquesNadeau5/apache-arrow-flight-overview + +Protocol Buffer Definitions +--------------------------- + +.. literalinclude:: ../../../format/Flight.proto + :language: protobuf + :linenos: diff --git a/src/arrow/docs/source/format/Guidelines.rst b/src/arrow/docs/source/format/Guidelines.rst new file mode 100644 index 000000000..40624521a --- /dev/null +++ b/src/arrow/docs/source/format/Guidelines.rst @@ -0,0 +1,24 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +:orphan: + +Implementation Guidelines +========================= + +The contents of this document have relocated to the main :ref:`Columnar +Specification <format_columnar>` page. diff --git a/src/arrow/docs/source/format/IPC.rst b/src/arrow/docs/source/format/IPC.rst new file mode 100644 index 000000000..65b47f7d7 --- /dev/null +++ b/src/arrow/docs/source/format/IPC.rst @@ -0,0 +1,24 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +:orphan: + +IPC +=== + +The contents of this document have relocated to the main :ref:`Columnar +Specification <format_columnar>` page. diff --git a/src/arrow/docs/source/format/Integration.rst b/src/arrow/docs/source/format/Integration.rst new file mode 100644 index 000000000..22d595e99 --- /dev/null +++ b/src/arrow/docs/source/format/Integration.rst @@ -0,0 +1,398 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. 
or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _format_integration_testing: + +Integration Testing +=================== + +Our strategy for integration testing between Arrow implementations is: + +* Test datasets are specified in a custom human-readable, JSON-based format + designed exclusively for Arrow's integration tests +* Each implementation provides a testing executable capable of converting + between the JSON and the binary Arrow file representation +* The test executable is also capable of validating the contents of a binary + file against a corresponding JSON file + +Running integration tests +------------------------- + +The integration test data generator and runner are implemented inside +the :ref:`Archery <archery>` utility. + +The integration tests are run using the ``archery integration`` command. + +.. code-block:: shell + + archery integration --help + +In order to run integration tests, you'll first need to build each component +you want to include. See the respective developer docs for C++, Java, etc. +for instructions on building those. + +Some languages may require additional build options to enable integration +testing. For C++, for example, you need to add ``-DARROW_BUILD_INTEGRATION=ON`` +to your cmake command. + +Depending on which components you have built, you can enable and add them to +the archery test run. For example, if you only have the C++ project built, run: + +.. code-block:: shell + + archery integration --with-cpp=1 + + +For Java, it may look like: + +.. code-block:: shell + + VERSION=0.11.0-SNAPSHOT + export ARROW_JAVA_INTEGRATION_JAR=$JAVA_DIR/tools/target/arrow-tools-$VERSION-jar-with-dependencies.jar + archery integration --with-cpp=1 --with-java=1 + +To run all tests, including Flight integration tests, do: + +.. code-block:: shell + + archery integration --with-all --run-flight + +Note that we run these tests in continuous integration, and the CI job uses +docker-compose. You may also run the docker-compose job locally, or at least +refer to it if you have questions about how to build other languages or enable +certain tests. + +See :ref:`docker-builds` for more information about the project's +``docker-compose`` configuration. + +JSON test data format +--------------------- + +A JSON representation of Arrow columnar data is provided for +cross-language integration testing purposes. +This representation is `not canonical <https://lists.apache.org/thread.html/6947fb7666a0f9cc27d9677d2dad0fb5990f9063b7cf3d80af5e270f%40%3Cdev.arrow.apache.org%3E>`_ +but it provides a human-readable way of verifying language implementations. + +See `here <https://github.com/apache/arrow/tree/master/docs/source/format/integration_json_examples>`_ +for some examples of this JSON data. + +.. can we check in more examples, e.g. from the generated_*.json test files? 
+ +The high level structure of a JSON integration test files is as follows: + +**Data file** :: + + { + "schema": /*Schema*/, + "batches": [ /*RecordBatch*/ ], + "dictionaries": [ /*DictionaryBatch*/ ], + } + +All files contain ``schema`` and ``batches``, while ``dictionaries`` is only +present if there are dictionary type fields in the schema. + +**Schema** :: + + { + "fields" : [ + /* Field */ + ], + "metadata" : /* Metadata */ + } + +**Field** :: + + { + "name" : "name_of_the_field", + "nullable" : /* boolean */, + "type" : /* Type */, + "children" : [ /* Field */ ], + "dictionary": { + "id": /* integer */, + "indexType": /* Type */, + "isOrdered": /* boolean */ + }, + "metadata" : /* Metadata */ + } + +The ``dictionary`` attribute is present if and only if the ``Field`` corresponds to a +dictionary type, and its ``id`` maps onto a column in the ``DictionaryBatch``. In this +case the ``type`` attribute describes the value type of the dictionary. + +For primitive types, ``children`` is an empty array. + +**Metadata** :: + + null | + [ { + "key": /* string */, + "value": /* string */ + } ] + +A key-value mapping of custom metadata. It may be omitted or null, in which case it is +considered equivalent to ``[]`` (no metadata). Duplicated keys are not forbidden here. + +**Type**: :: + + { + "name" : "null|struct|list|largelist|fixedsizelist|union|int|floatingpoint|utf8|largeutf8|binary|largebinary|fixedsizebinary|bool|decimal|date|time|timestamp|interval|duration|map" + } + +A ``Type`` will have other fields as defined in +`Schema.fbs <https://github.com/apache/arrow/tree/master/format/Schema.fbs>`_ +depending on its name. + +Int: :: + + { + "name" : "int", + "bitWidth" : /* integer */, + "isSigned" : /* boolean */ + } + +FloatingPoint: :: + + { + "name" : "floatingpoint", + "precision" : "HALF|SINGLE|DOUBLE" + } + +FixedSizeBinary: :: + + { + "name" : "fixedsizebinary", + "byteWidth" : /* byte width */ + } + +Decimal: :: + + { + "name" : "decimal", + "precision" : /* integer */, + "scale" : /* integer */ + } + +Timestamp: :: + + { + "name" : "timestamp", + "unit" : "$TIME_UNIT", + "timezone": "$timezone" + } + +``$TIME_UNIT`` is one of ``"SECOND|MILLISECOND|MICROSECOND|NANOSECOND"`` + +"timezone" is an optional string. + +Duration: :: + + { + "name" : "duration", + "unit" : "$TIME_UNIT" + } + +Date: :: + + { + "name" : "date", + "unit" : "DAY|MILLISECOND" + } + +Time: :: + + { + "name" : "time", + "unit" : "$TIME_UNIT", + "bitWidth": /* integer: 32 or 64 */ + } + +Interval: :: + + { + "name" : "interval", + "unit" : "YEAR_MONTH|DAY_TIME" + } + +Union: :: + + { + "name" : "union", + "mode" : "SPARSE|DENSE", + "typeIds" : [ /* integer */ ] + } + +The ``typeIds`` field in ``Union`` are the codes used to denote which member of +the union is active in each array slot. Note that in general these discriminants are not identical +to the index of the corresponding child array. + +List: :: + + { + "name": "list" + } + +The type that the list is a "list of" will be included in the ``Field``'s +"children" member, as a single ``Field`` there. For example, for a list of +``int32``, :: + + { + "name": "list_nullable", + "type": { + "name": "list" + }, + "nullable": true, + "children": [ + { + "name": "item", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 32 + }, + "nullable": true, + "children": [] + } + ] + } + +FixedSizeList: :: + + { + "name": "fixedsizelist", + "listSize": /* integer */ + } + +This type likewise comes with a length-1 "children" array. 
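+
+For illustration only (the field and child names here are arbitrary, not part of the
+specification), a ``fixedsizelist`` of three nullable ``int32`` values could be
+described as, ::
+
+    {
+      "name": "fixed_list_nullable",
+      "type": {
+        "name": "fixedsizelist",
+        "listSize": 3
+      },
+      "nullable": true,
+      "children": [
+        {
+          "name": "item",
+          "type": {
+            "name": "int",
+            "isSigned": true,
+            "bitWidth": 32
+          },
+          "nullable": true,
+          "children": []
+        }
+      ]
+    }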
+ +Struct: :: + + { + "name": "struct" + } + +The ``Field``'s "children" contains an array of ``Fields`` with meaningful +names and types. + +Map: :: + + { + "name": "map", + "keysSorted": /* boolean */ + } + +The ``Field``'s "children" contains a single ``struct`` field, which itself +contains 2 children, named "key" and "value". + +Null: :: + + { + "name": "null" + } + +Extension types are, as in the IPC format, represented as their underlying +storage type plus some dedicated field metadata to reconstruct the extension +type. For example, assuming a "uuid" extension type backed by a +FixedSizeBinary(16) storage, here is how a "uuid" field would be represented:: + + { + "name" : "name_of_the_field", + "nullable" : /* boolean */, + "type" : { + "name" : "fixedsizebinary", + "byteWidth" : 16 + }, + "children" : [], + "metadata" : [ + {"key": "ARROW:extension:name", "value": "uuid"}, + {"key": "ARROW:extension:metadata", "value": "uuid-serialized"} + ] + } + +**RecordBatch**:: + + { + "count": /* integer number of rows */, + "columns": [ /* FieldData */ ] + } + +**DictionaryBatch**:: + + { + "id": /* integer */, + "data": [ /* RecordBatch */ ] + } + +**FieldData**:: + + { + "name": "field_name", + "count" "field_length", + "$BUFFER_TYPE": /* BufferData */ + ... + "$BUFFER_TYPE": /* BufferData */ + "children": [ /* FieldData */ ] + } + +The "name" member of a ``Field`` in the ``Schema`` corresponds to the "name" +of a ``FieldData`` contained in the "columns" of a ``RecordBatch``. +For nested types (list, struct, etc.), ``Field``'s "children" each have a +"name" that corresponds to the "name" of a ``FieldData`` inside the +"children" of that ``FieldData``. +For ``FieldData`` inside of a ``DictionaryBatch``, the "name" field does not +correspond to anything. + +Here ``$BUFFER_TYPE`` is one of ``VALIDITY``, ``OFFSET`` (for +variable-length types, such as strings and lists), ``TYPE_ID`` (for unions), +or ``DATA``. + +``BufferData`` is encoded based on the type of buffer: + +* ``VALIDITY``: a JSON array of 1 (valid) and 0 (null). Data for non-nullable + ``Field`` still has a ``VALIDITY`` array, even though all values are 1. +* ``OFFSET``: a JSON array of integers for 32-bit offsets or + string-formatted integers for 64-bit offsets +* ``TYPE_ID``: a JSON array of integers +* ``DATA``: a JSON array of encoded values + +The value encoding for ``DATA`` is different depending on the logical +type: + +* For boolean type: an array of 1 (true) and 0 (false). +* For integer-based types (including timestamps): an array of JSON numbers. +* For 64-bit integers: an array of integers formatted as JSON strings, + so as to avoid loss of precision. +* For floating point types: an array of JSON numbers. Values are limited + to 3 decimal places to avoid loss of precision. +* For binary types, an array of uppercase hex-encoded strings, so as + to represent arbitrary binary data. +* For UTF-8 string types, an array of JSON strings. + +For "list" and "largelist" types, ``BufferData`` has ``VALIDITY`` and +``OFFSET``, and the rest of the data is inside "children". These child +``FieldData`` contain all of the same attributes as non-child data, so in +the example of a list of ``int32``, the child data has ``VALIDITY`` and +``DATA``. + +For "fixedsizelist", there is no ``OFFSET`` member because the offsets are +implied by the field's "listSize". + +Note that the "count" for these child data may not match the parent "count". 
+For example, if a ``RecordBatch`` has 7 rows and contains a ``FixedSizeList`` +of ``listSize`` 4, then the data inside the "children" of that ``FieldData`` +will have count 28. + +For "null" type, ``BufferData`` does not contain any buffers. diff --git a/src/arrow/docs/source/format/Layout.rst b/src/arrow/docs/source/format/Layout.rst new file mode 100644 index 000000000..4568f31c5 --- /dev/null +++ b/src/arrow/docs/source/format/Layout.rst @@ -0,0 +1,24 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +:orphan: + +Physical Memory Layout +====================== + +The contents of this document have relocated to the main :ref:`Columnar +Specification <format_columnar>` page. diff --git a/src/arrow/docs/source/format/Metadata.rst b/src/arrow/docs/source/format/Metadata.rst new file mode 100644 index 000000000..55045abb0 --- /dev/null +++ b/src/arrow/docs/source/format/Metadata.rst @@ -0,0 +1,24 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +:orphan: + +Format Metadata +=============== + +The contents of this document have relocated to the main :ref:`Columnar +Specification <format_columnar>` page. diff --git a/src/arrow/docs/source/format/Other.rst b/src/arrow/docs/source/format/Other.rst new file mode 100644 index 000000000..9504998d6 --- /dev/null +++ b/src/arrow/docs/source/format/Other.rst @@ -0,0 +1,63 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. 
KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Other Data Structures +===================== + +Our Flatbuffers protocol files have metadata for some other data +structures defined to allow other kinds of applications to take +advantage of common interprocess communication machinery. These data +structures are not considered to be part of the columnar format. + +An Arrow columnar implementation is not required to implement these +types. + +Tensor (Multi-dimensional Array) +-------------------------------- + +The ``Tensor`` message types provides a way to write a +multidimensional array of fixed-size values (such as a NumPy ndarray). + +When writing a standalone encapsulated tensor message, we use the +encapsulated IPC format defined in the :ref:`Columnar Specification +<format_columnar>`, but additionally align the starting offset of the +tensor body to be a multiple of 64 bytes: :: + + <metadata prefix and metadata> + <PADDING> + <tensor body> + +Sparse Tensor +------------- + +``SparseTensor`` represents a multidimensional array whose elements +are generally almost all zeros. + +When writing a standalone encapsulated sparse tensor message, we use +the encapsulated IPC format defined in the :ref:`Columnar Specification +<format_columnar>`, but additionally align the starting offsets of the +sparse index and the sparse tensor body (if writing to a shared memory +region) to be multiples of 64 bytes: :: + + <metadata prefix and metadata> + <PADDING> + <sparse index> + <PADDING> + <sparse tensor body> + +The contents of the sparse tensor index depends on what kind of sparse +format is used. diff --git a/src/arrow/docs/source/format/README.md b/src/arrow/docs/source/format/README.md new file mode 100644 index 000000000..68a2d72b5 --- /dev/null +++ b/src/arrow/docs/source/format/README.md @@ -0,0 +1,24 @@ +<!--- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +# Apache Arrow Format Documentation + +These documents go together with the Flatbuffers and Protocol Buffers +protocol definition files to provide sufficient detail necessary to +build a new Arrow implementation.
\ No newline at end of file diff --git a/src/arrow/docs/source/format/Versioning.rst b/src/arrow/docs/source/format/Versioning.rst new file mode 100644 index 000000000..b70656987 --- /dev/null +++ b/src/arrow/docs/source/format/Versioning.rst @@ -0,0 +1,70 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Format Versioning and Stability +=============================== + +Starting with version 1.0.0, Apache Arrow utilizes +**two versions** to describe each release of the project: +the **Format Version** and the **Library Version**. Each Library +Version has a corresponding Format Version, and multiple versions of +the library may have the same format version. For example, library +versions 2.0.0 and 3.0.0 may both track format version 1.0.0. + +For library versions prior to 1.0.0, major releases may contain API +changes. From 1.0.0 onward, we follow `Semantic Versioning +<https://semver.org/>`_ with regards to communicating API changes. We +expect most releases to be major library releases. + +Backward Compatibility +---------------------- + +A newer versioned client library will be able to read any data and +metadata produced by an older client library. + +So long as the **major** format version is not changed, a newer +library is backward compatible with an older library. + +Forward Compatibility +--------------------- + +An older client library must be able to either read data generated +from a new client library or detect that it cannot properly read the +data. + +An increase in the **minor** version of the format version, such as +1.0.0 to 1.1.0, indicates that 1.1.0 contains new features not +available in 1.0.0. So long as these features are not used (such as a +new logical data type), forward compatibility is preserved. + +Long-Term Stability +------------------- + +A change in the format major version (e.g. from 1.0.0 to 2.0.0) +indicates a disruption to these compatibility guarantees in some way. +We **do not expect** this to be a frequent occurrence. +This would be an exceptional +event and, should this come to pass, we would exercise caution in +ensuring that production applications are not harmed. + +Pre-1.0.0 Versions +------------------ + +We made no forward or backward compatibility guarantees for +versions prior to 1.0.0. However, we made every effort to ensure +that new clients can read serialized data produced by library version +0.8.0 and onward. 
diff --git a/src/arrow/docs/source/format/integration_json_examples/simple.json b/src/arrow/docs/source/format/integration_json_examples/simple.json new file mode 100644 index 000000000..663472919 --- /dev/null +++ b/src/arrow/docs/source/format/integration_json_examples/simple.json @@ -0,0 +1,98 @@ +{ + "schema": { + "fields": [ + { + "name": "foo", + "type": {"name": "int", "isSigned": true, "bitWidth": 32}, + "nullable": true, + "children": [] + }, + { + "name": "bar", + "type": {"name": "floatingpoint", "precision": "DOUBLE"}, + "nullable": true, + "children": [] + }, + { + "name": "baz", + "type": {"name": "utf8"}, + "nullable": true, + "children": [] + } + ] + }, + "batches": [ + { + "count": 5, + "columns": [ + { + "name": "foo", + "count": 5, + "VALIDITY": [1, 0, 1, 1, 1], + "DATA": [1, 2, 3, 4, 5] + }, + { + "name": "bar", + "count": 5, + "VALIDITY": [1, 0, 0, 1, 1], + "DATA": [1.0, 2.0, 3.0, 4.0, 5.0] + }, + { + "name": "baz", + "count": 5, + "VALIDITY": [1, 0, 0, 1, 1], + "OFFSET": [0, 2, 2, 2, 5, 9], + "DATA": ["aa", "", "", "bbb", "cccc"] + } + ] + }, + { + "count": 5, + "columns": [ + { + "name": "foo", + "count": 5, + "VALIDITY": [1, 1, 1, 1, 1], + "DATA": [1, 2, 3, 4, 5] + }, + { + "name": "bar", + "count": 5, + "VALIDITY": [1, 1, 1, 1, 1], + "DATA": [1.0, 2.0, 3.0, 4.0, 5.0] + }, + { + "name": "baz", + "count": 5, + "VALIDITY": [1, 1, 1, 1, 1], + "OFFSET": [0, 2, 3, 4, 7, 11], + "DATA": ["aa", "b", "c", "ddd", "eeee"] + } + ] + }, + { + "count": 5, + "columns": [ + { + "name": "foo", + "count": 5, + "VALIDITY": [0, 0, 0, 0, 0], + "DATA": [1, 2, 3, 4, 5] + }, + { + "name": "bar", + "count": 5, + "VALIDITY": [0, 0, 0, 0, 0], + "DATA": [1.0, 2.0, 3.0, 4.0, 5.0] + }, + { + "name": "baz", + "count": 5, + "VALIDITY": [0, 0, 0, 0, 0], + "OFFSET": [0, 0, 0, 0, 0, 0], + "DATA": ["", "", "", "", ""] + } + ] + } + ] +} diff --git a/src/arrow/docs/source/format/integration_json_examples/struct.json b/src/arrow/docs/source/format/integration_json_examples/struct.json new file mode 100644 index 000000000..4e6cc774e --- /dev/null +++ b/src/arrow/docs/source/format/integration_json_examples/struct.json @@ -0,0 +1,201 @@ +{ + "schema": { + "fields": [ + { + "name": "struct_nullable", + "type": { + "name": "struct" + }, + "nullable": true, + "children": [ + { + "name": "f1", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 32 + }, + "nullable": true, + "children": [] + }, + { + "name": "f2", + "type": { + "name": "utf8" + }, + "nullable": true, + "children": [] + } + ] + } + ] + }, + "batches": [ + { + "count": 7, + "columns": [ + { + "name": "struct_nullable", + "count": 7, + "VALIDITY": [ + 0, + 1, + 1, + 1, + 0, + 1, + 0 + ], + "children": [ + { + "name": "f1", + "count": 7, + "VALIDITY": [ + 1, + 0, + 1, + 1, + 1, + 0, + 0 + ], + "DATA": [ + 1402032511, + 290876774, + 137773603, + 410361374, + 1959836418, + 1995074679, + -163525262 + ] + }, + { + "name": "f2", + "count": 7, + "VALIDITY": [ + 0, + 1, + 1, + 1, + 0, + 1, + 0 + ], + "OFFSET": [ + 0, + 0, + 7, + 14, + 21, + 21, + 28, + 28 + ], + "DATA": [ + "", + "MhRNxD4", + "3F9HBxK", + "aVd88fp", + "", + "3loZrRf", + "" + ] + } + ] + } + ] + }, + { + "count": 10, + "columns": [ + { + "name": "struct_nullable", + "count": 10, + "VALIDITY": [ + 0, + 1, + 1, + 0, + 1, + 0, + 0, + 1, + 1, + 1 + ], + "children": [ + { + "name": "f1", + "count": 10, + "VALIDITY": [ + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 0, + 0, + 0 + ], + "DATA": [ + -2041500147, + 1715692943, + -35444996, + 1425496657, + 112765084, + 1760754983, + 413888857, + 
2039738337, + -1924327700, + 670528518 + ] + }, + { + "name": "f2", + "count": 10, + "VALIDITY": [ + 1, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 0 + ], + "OFFSET": [ + 0, + 7, + 7, + 7, + 14, + 21, + 28, + 35, + 42, + 49, + 49 + ], + "DATA": [ + "AS5oARE", + "", + "", + "JGdagcX", + "78SLiRw", + "vbGf7OY", + "5uh5fTs", + "0ilsf82", + "LjS9MbU", + "" + ] + } + ] + } + ] + } + ] +} diff --git a/src/arrow/docs/source/index.rst b/src/arrow/docs/source/index.rst new file mode 100644 index 000000000..90d6ac09b --- /dev/null +++ b/src/arrow/docs/source/index.rst @@ -0,0 +1,96 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Apache Arrow +============ + +Apache Arrow is a development platform for in-memory analytics. It contains a +set of technologies that enable big data systems to process and move data +fast. It specifies a standardized language-independent columnar memory format +for flat and hierarchical data, organized for efficient analytic operations on +modern hardware. + +The project is developing a multi-language collection of libraries for solving +systems problems related to in-memory analytical data processing. This includes +such topics as: + +* Zero-copy shared memory and RPC-based data movement +* Reading and writing file formats (like CSV, Apache ORC, and Apache Parquet) +* In-memory analytics and query processing + +**To learn how to use Arrow refer to the documentation specific to your +target environment.** + +.. _toc.usage: + +.. toctree:: + :maxdepth: 1 + :caption: Supported Environments + + C/GLib <c_glib/index> + C++ <cpp/index> + C# <https://github.com/apache/arrow/blob/master/csharp/README.md> + Go <https://godoc.org/github.com/apache/arrow/go/arrow> + Java <java/index> + JavaScript <js/index> + Julia <https://github.com/apache/arrow/blob/master/julia/Arrow/README.md> + MATLAB <https://github.com/apache/arrow/blob/master/matlab/README.md> + Python <python/index> + R <r/index> + Ruby <https://github.com/apache/arrow/blob/master/ruby/README.md> + Rust <https://docs.rs/crate/arrow/> + status + +.. _toc.cookbook: + +.. toctree:: + :maxdepth: 1 + :caption: Cookbooks + + C++ <https://arrow.apache.org/cookbook/cpp/> + Python <https://arrow.apache.org/cookbook/py/> + R <https://arrow.apache.org/cookbook/r/> + +.. _toc.columnar: + +.. toctree:: + :maxdepth: 2 + :caption: Specifications and Protocols + + format/Versioning + format/Columnar + format/Flight + format/Integration + format/CDataInterface + format/CStreamInterface + format/Other + +.. _toc.development: + +.. 
toctree::
+   :maxdepth: 2
+   :caption: Development
+
+   developers/contributing
+   developers/cpp/index
+   developers/python
+   developers/archery
+   developers/crossbow
+   developers/docker
+   developers/benchmarks
+   developers/documentation
+   developers/computeir
diff --git a/src/arrow/docs/source/java/algorithm.rst b/src/arrow/docs/source/java/algorithm.rst
new file mode 100644
index 000000000..f838398af
--- /dev/null
+++ b/src/arrow/docs/source/java/algorithm.rst
@@ -0,0 +1,92 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Java Algorithms
+===============
+
+Arrow's Java library provides algorithms for some commonly-used
+functionalities. The algorithms are provided in the ``org.apache.arrow.algorithm``
+package of the ``algorithm`` module.
+
+Comparing Vector Elements
+-------------------------
+
+Comparing vector elements is the basis for many algorithms. Vector
+elements can be compared in one of two ways:
+
+1. **Equality comparison**: there are two possible results for this type of comparison: ``equal`` and ``unequal``.
+Currently, this type of comparison is supported through the ``org.apache.arrow.vector.compare.VectorValueEqualizer``
+interface.
+
+2. **Ordering comparison**: there are three possible results for this type of comparison: ``less than``, ``equal to``
+and ``greater than``. This comparison is supported by the abstract class ``org.apache.arrow.algorithm.sort.VectorValueComparator``.
+
+We provide default implementations to compare vector elements. However, users can also define
+customized comparisons.
+
+Vector Element Search
+---------------------
+
+A search algorithm tries to find a particular value in a vector. When successful, a vector index is
+returned; otherwise, ``-1`` is returned. The following search algorithms are provided:
+
+1. **Linear search**: this algorithm simply traverses the vector from the beginning, until a match is
+found, or the end of the vector is reached. So it takes ``O(n)`` time, where ``n`` is the number of elements
+in the vector. This algorithm is implemented in ``org.apache.arrow.algorithm.search.VectorSearcher#linearSearch``.
+
+2. **Binary search**: this is a more efficient search algorithm, as it runs in ``O(log(n))`` time.
+However, it is only applicable to sorted vectors. To get a sorted vector,
+one can use one of our sorting algorithms, which will be discussed in the next section. This algorithm
+is implemented in ``org.apache.arrow.algorithm.search.VectorSearcher#binarySearch``.
+
+3. **Parallel search**: when the vector is large, it takes a long time to traverse the elements to search
+for a value. To make this process faster, one can split the vector into multiple partitions, and perform the
+search for each partition in parallel. 
This is supported by ``org.apache.arrow.algorithm.search.ParallelSearcher``. + +4. **Range search**: for many scenarios, there can be multiple matching values in the vector. +If the vector is sorted, the matching values reside in a contiguous region in the vector. The +range search algorithm tries to find the upper/lower bound of the region in ``O(log(n))`` time. +An implementation is provided in ``org.apache.arrow.algorithm.search.VectorRangeSearcher``. + +Vector Sorting +-------------- + +Given a vector, a sorting algorithm turns it into a sorted one. The sorting criteria must +be specified by some ordering comparison operation. The sorting algorithms can be +classified into the following categories: + +1. **In-place sorter**: an in-place sorter performs the sorting by manipulating the original +vector, without creating any new vector. So it just returns the original vector after the sorting operations. +Currently, we have ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter`` for in-place +sorting in ``O(nlog(n))`` time. As the name suggests, it only supports fixed width vectors. + +2. **Out-of-place sorter**: an out-of-place sorter does not mutate the original vector. Instead, +it copies vector elements to a new vector in sorted order, and returns the new vector. +We have ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter.FixedWidthOutOfPlaceVectorSorter`` +and ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter.VariableWidthOutOfPlaceVectorSorter`` +for fixed width and variable width vectors, respectively. Both algorithms run in ``O(nlog(n))`` time. + +3. **Index sorter**: this sorter does not actually sort the vector. Instead, it returns an integer +vector, which correspond to indices of vector elements in sorted order. With the index vector, one can +easily construct a sorted vector. In addition, some other tasks can be easily achieved, like finding the ``k``th +smallest value in the vector. Index sorting is supported by ``org.apache.arrow.algorithm.sort.IndexSorter``, +which runs in ``O(nlog(n))`` time. It is applicable to vectors of any type. + +Other Algorithms +---------------- + +Other algorithms include vector deduplication, dictionary encoding, etc., in the ``algorithm`` module. diff --git a/src/arrow/docs/source/java/index.rst b/src/arrow/docs/source/java/index.rst new file mode 100644 index 000000000..65a7a3a4f --- /dev/null +++ b/src/arrow/docs/source/java/index.rst @@ -0,0 +1,31 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Java Implementation +=================== + +This is the documentation of the Java API of Apache Arrow. For more details +on the Arrow format and other language bindings see the :doc:`parent documentation <../index>`. + +.. 
toctree:: + :maxdepth: 2 + + vector + vector_schema_root + ipc + algorithm + Reference (javadoc) <reference/index> diff --git a/src/arrow/docs/source/java/ipc.rst b/src/arrow/docs/source/java/ipc.rst new file mode 100644 index 000000000..7cab480c4 --- /dev/null +++ b/src/arrow/docs/source/java/ipc.rst @@ -0,0 +1,187 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +=========================== +Reading/Writing IPC formats +=========================== +Arrow defines two types of binary formats for serializing record batches: + +* **Streaming format**: for sending an arbitrary number of record + batches. The format must be processed from start to end, and does not support + random access + +* **File or Random Access format**: for serializing a fixed number of record + batches. It supports random access, and thus is very useful when used with + memory maps + +Writing and Reading Streaming Format +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +First, let's populate a :class:`VectorSchemaRoot` with a small batch of records + +.. code-block:: Java + + BitVector bitVector = new BitVector("boolean", allocator); + VarCharVector varCharVector = new VarCharVector("varchar", allocator); + for (int i = 0; i < 10; i++) { + bitVector.setSafe(i, i % 2 == 0 ? 0 : 1); + varCharVector.setSafe(i, ("test" + i).getBytes(StandardCharsets.UTF_8)); + } + bitVector.setValueCount(10); + varCharVector.setValueCount(10); + + List<Field> fields = Arrays.asList(bitVector.getField(), varCharVector.getField()); + List<FieldVector> vectors = Arrays.asList(bitVector, varCharVector); + VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors); + +Now, we can begin writing a stream containing some number of these batches. For this we use :class:`ArrowStreamWriter` +(DictionaryProvider used for any vectors that are dictionary encoded is optional and can be null)):: + + ByteArrayOutputStream out = new ByteArrayOutputStream(); + ArrowStreamWriter writer = new ArrowStreamWriter(root, /*DictionaryProvider=*/null, Channels.newChannel(out)); + + +Here we used an in-memory stream, but this could have been a socket or some other IO stream. Then we can do + +.. code-block:: Java + + writer.start(); + // write the first batch + writer.writeBatch(); + + // write another four batches. + for (int i = 0; i < 4; i++) { + // populate VectorSchemaRoot data and write the second batch + BitVector childVector1 = (BitVector)root.getVector(0); + VarCharVector childVector2 = (VarCharVector)root.getVector(1); + childVector1.reset(); + childVector2.reset(); + ... 
do some populate work here, could be different for each batch + writer.writeBatch(); + } + + // end + writer.end(); + +Note since the :class:`VectorSchemaRoot` in writer is a container that can hold batches, batches flow through +:class:`VectorSchemaRoot` as part of a pipeline, so we need to populate data before `writeBatch` so that later batches +could overwrite previous ones. + +Now the :class:`ByteArrayOutputStream` contains the complete stream which contains 5 record batches. +We can read such a stream with :class:`ArrowStreamReader`, note that :class:`VectorSchemaRoot` within +reader will be loaded with new values on every call to :class:`loadNextBatch()` + +.. code-block:: Java + + try (ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(out.toByteArray()), allocator)) { + Schema schema = reader.getVectorSchemaRoot().getSchema(); + for (int i = 0; i < 5; i++) { + // This will be loaded with new values on every call to loadNextBatch + VectorSchemaRoot readBatch = reader.getVectorSchemaRoot(); + reader.loadNextBatch(); + ... do something with readBatch + } + + } + +Here we also give a simple example with dictionary encoded vectors + +.. code-block:: Java + + DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider(); + // create dictionary and provider + final VarCharVector dictVector = new VarCharVector("dict", allocator); + dictVector.allocateNewSafe(); + dictVector.setSafe(0, "aa".getBytes()); + dictVector.setSafe(1, "bb".getBytes()); + dictVector.setSafe(2, "cc".getBytes()); + dictVector.setValueCount(3); + + Dictionary dictionary = + new Dictionary(dictVector, new DictionaryEncoding(1L, false, /*indexType=*/null)); + provider.put(dictionary); + + // create vector and encode it + final VarCharVector vector = new VarCharVector("vector", allocator); + vector.allocateNewSafe(); + vector.setSafe(0, "bb".getBytes()); + vector.setSafe(1, "bb".getBytes()); + vector.setSafe(2, "cc".getBytes()); + vector.setSafe(3, "aa".getBytes()); + vector.setValueCount(4); + + // get the encoded vector + IntVector encodedVector = (IntVector) DictionaryEncoder.encode(vector, dictionary); + + // create VectorSchemaRoot + List<Field> fields = Arrays.asList(encodedVector.getField()); + List<FieldVector> vectors = Arrays.asList(encodedVector); + VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors); + + // write data + ByteArrayOutputStream out = new ByteArrayOutputStream(); + ArrowStreamWriter writer = new ArrowStreamWriter(root, provider, Channels.newChannel(out)); + writer.start(); + writer.writeBatch(); + writer.end(); + + // read data + try (ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(out.toByteArray()), allocator)) { + reader.loadNextBatch(); + VectorSchemaRoot readRoot = reader.getVectorSchemaRoot(); + // get the encoded vector + IntVector intVector = (IntVector) readRoot.getVector(0); + + // get dictionaries and decode the vector + Map<Long, Dictionary> dictionaryMap = reader.getDictionaryVectors(); + long dictionaryId = intVector.getField().getDictionary().getId(); + VarCharVector varCharVector = + (VarCharVector) DictionaryEncoder.decode(intVector, dictionaryMap.get(dictionaryId)); + + } + +Writing and Reading Random Access Files +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The :class:`ArrowFileWriter` has the same API as :class:`ArrowStreamWriter` + +.. 
code-block:: Java + + ByteArrayOutputStream out = new ByteArrayOutputStream(); + ArrowFileWriter writer = new ArrowFileWriter(root, null, Channels.newChannel(out)); + writer.start(); + // write the first batch + writer.writeBatch(); + // write another four batches. + for (int i = 0; i < 4; i++) { + ... do populate work + writer.writeBatch(); + } + writer.end(); + +The difference between :class:`ArrowFileReader` and :class:`ArrowStreamReader` is that the input source +must have a ``seek`` method for random access. Because we have access to the entire payload, we know the +number of record batches in the file, and can read any at random + +.. code-block:: Java + + try (ArrowFileReader reader = new ArrowFileReader( + new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator)) { + + // read the 4-th batch + ArrowBlock block = reader.getRecordBlocks().get(3); + reader.loadRecordBatch(block); + VectorSchemaRoot readBatch = reader.getVectorSchemaRoot(); + } diff --git a/src/arrow/docs/source/java/reference/index.rst b/src/arrow/docs/source/java/reference/index.rst new file mode 100644 index 000000000..523ac0c7f --- /dev/null +++ b/src/arrow/docs/source/java/reference/index.rst @@ -0,0 +1,21 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Java Reference (javadoc) +======================== + +Stub page for the Java reference docs; actual source is located in the java/ directory. diff --git a/src/arrow/docs/source/java/vector.rst b/src/arrow/docs/source/java/vector.rst new file mode 100644 index 000000000..ece07d0a7 --- /dev/null +++ b/src/arrow/docs/source/java/vector.rst @@ -0,0 +1,288 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +=========== +ValueVector +=========== + +:class:`ValueVector` interface (which called Array in C++ implementation and +the :doc:`the specification <../format/Columnar>`) is an abstraction that is used to store a +sequence of values having the same type in an individual column. 
Internally, those values are
+represented by one or several buffers, the number and meaning of which depend on the vector’s data type.
+
+There are concrete subclasses of :class:`ValueVector` for each primitive data type
+and nested type described in the specification. There are a few differences between these
+class names and the type names used in the specification; for example, the vector of
+64-bit integers is called ``BigIntVector``.
+
+It is important that a vector is allocated before attempting to read or write to it.
+Code using a :class:`ValueVector` should strive to follow this order of operations:
+create > allocate > mutate > set value count > access > clear (or allocate again to start the process over).
+We will go through a concrete example to demonstrate each operation in the next section.
+
+Vector Life Cycle
+=================
+
+As discussed above, each vector goes through several steps in its life cycle,
+and each step is triggered by a vector operation. In particular, we have the following vector operations:
+
+1. **Vector creation**: we create a new vector object by, for example, calling the vector constructor.
+The following code creates a new ``IntVector`` via the constructor:
+
+.. code-block:: Java
+
+    RootAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    ...
+    IntVector vector = new IntVector("int vector", allocator);
+
+At this point, a vector object has been created. However, no underlying memory has been allocated yet,
+so we need the following step.
+
+2. **Vector allocation**: in this step, we allocate memory for the vector. For most vectors, we
+have two options: 1) if we know the maximum vector capacity, we can specify it by calling the
+``allocateNew(int)`` method; 2) otherwise, we should call the ``allocateNew()`` method, and a default
+capacity will be allocated. For our running example, we assume that the vector capacity never
+exceeds 10:
+
+.. code-block:: Java
+
+    vector.allocateNew(10);
+
+3. **Vector mutation**: now we can populate the vector with the values we desire. For all vectors, we can populate
+vector values through vector writers (an example will be given in the next section). For primitive types,
+we can also mutate the vector through the set methods. There are two classes of set methods: 1) if we can
+be sure the vector has enough capacity, we can call the ``set(index, value)`` method; 2) if we are not sure
+about the vector capacity, we should call the ``setSafe(index, value)`` method, which automatically
+takes care of vector reallocation if the capacity is not sufficient. For our running example, we know the
+vector has enough capacity, so we can call
+
+.. code-block:: Java
+
+    vector.set(/*index*/5, /*value*/25);
+
+4. **Set value count**: for this step, we set the value count of the vector by calling the
+``setValueCount(int)`` method:
+
+.. code-block:: Java
+
+    vector.setValueCount(10);
+
+After this step, the vector enters an immutable state. In other words, we should no longer mutate it.
+(Unless we reuse the vector by allocating it again. This will be discussed shortly.)
+
+5. **Vector access**: it is time to access vector values. Similarly, we have two options to access values:
+1) get methods and 2) vector readers. Vector readers work for all types of vectors, while get methods are
+only available for primitive vectors. A concrete example of using a vector reader will be given in the next
+section. Below is an example of vector access through the get method:
+
+.. code-block:: Java
+
+    int value = vector.get(5); // value == 25
+
+6. **Vector clear**: when we are done with the vector, we should clear it to release its memory. This is done by
+calling the ``close()`` method:
+
+.. code-block:: Java
+
+    vector.close();
+
+Some points to note about the steps above:
+
+* The steps are not necessarily performed in a linear sequence. Instead, they can be in a loop. For example,
+  when a vector enters the access step, we can also go back to the vector mutation step, and then set the value
+  count, access the vector, and so on.
+
+* We should try to make sure the above steps are carried out in order. Otherwise, the vector
+  may be in an undefined state, and some unexpected behavior may occur. However, this restriction
+  is not strict. That means it is possible that we violate the order above, but still get
+  correct results.
+
+* When mutating vector values through set methods, we should prefer ``set(index, value)`` methods to
+  ``setSafe(index, value)`` methods whenever possible, to avoid the unnecessary performance overhead of handling
+  vector capacity.
+
+* All vectors implement the ``AutoCloseable`` interface. So they must be closed explicitly when they are
+  no longer used, to avoid resource leaks. To make sure of this, it is recommended to place vector related
+  operations into a try-with-resources block (a complete sketch is given at the end of this section).
+
+* For fixed width vectors (e.g. IntVector), we can set values at different indices in arbitrary order.
+  For variable width vectors (e.g. VarCharVector), however, we must set values in non-decreasing order of the
+  indices. Otherwise, the values after the set position will become invalid. For example, suppose we use the
+  following statements to populate a variable width vector:
+
+.. code-block:: Java
+
+    VarCharVector vector = new VarCharVector("vector", allocator);
+    vector.allocateNew();
+    vector.setSafe(0, "zero".getBytes(StandardCharsets.UTF_8));
+    vector.setSafe(1, "one".getBytes(StandardCharsets.UTF_8));
+    ...
+    vector.setSafe(9, "nine".getBytes(StandardCharsets.UTF_8));
+
+Then we set the value at position 5 again:
+
+.. code-block:: Java
+
+    vector.setSafe(5, "5".getBytes(StandardCharsets.UTF_8));
+
+After that, the values at positions 6, 7, 8, and 9 of the vector will become invalid.
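+
+To tie the steps together, below is a minimal sketch of the full life cycle using only the
+APIs shown above (the vector name and the values are arbitrary). The try-with-resources block
+performs the final clear step automatically:
+
+.. code-block:: Java
+
+    try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+         IntVector vector = new IntVector("int vector", allocator)) {   // create
+      vector.allocateNew(10);                                           // allocate
+      for (int i = 0; i < 10; i++) {
+        vector.set(i, i * 2);                                           // mutate
+      }
+      vector.setValueCount(10);                                         // set value count
+      for (int i = 0; i < 10; i++) {
+        if (!vector.isNull(i)) {
+          System.out.println(vector.get(i));                            // access
+        }
+      }
+    }                                                                   // clear (via close())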
+
+Building ValueVector
+====================
+
+Note that the current implementation doesn't enforce the rule that Arrow objects are immutable.
+:class:`ValueVector` instances can be created directly by using the new keyword; there are
+set/setSafe APIs and concrete subclasses of FieldWriter for populating values.
+
+For example, the code below shows how to build a :class:`BigIntVector`. In this case, we build a
+vector of the range 0 to 7 where the element that should hold the fourth value is nulled:
+
+.. code-block:: Java
+
+    try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+         BigIntVector vector = new BigIntVector("vector", allocator)) {
+      vector.allocateNew(8);
+      vector.set(0, 1);
+      vector.set(1, 2);
+      vector.set(2, 3);
+      vector.setNull(3);
+      vector.set(4, 5);
+      vector.set(5, 6);
+      vector.set(6, 7);
+      vector.set(7, 8);
+      vector.setValueCount(8); // this will finalize the vector by convention.
+      ...
+    }
+
+The :class:`BigIntVector` holds two ArrowBufs. The first buffer holds the null bitmap, which consists
+here of a single byte with the bits 1|1|1|1|0|1|1|1 (the bit is 1 if the value is non-null).
+The second buffer contains all the above values. As the fourth entry is null, the value at that position
+in the buffer is undefined. Note that, compared with the set API, the setSafe API checks the value capacity
+before setting values and reallocates buffers if necessary.
+
+Here is how to build a vector using a writer
+
+.. 
code-block:: Java + + try (BigIntVector vector = new BigIntVector("vector", allocator); + BigIntWriter writer = new BigIntWriterImpl(vector)) { + writer.setPosition(0); + writer.writeBigInt(1); + writer.setPosition(1); + writer.writeBigInt(2); + writer.setPosition(2); + writer.writeBigInt(3); + // writer.setPosition(3) is not called which means the forth value is null. + writer.setPosition(4); + writer.writeBigInt(5); + writer.setPosition(5); + writer.writeBigInt(6); + writer.setPosition(6); + writer.writeBigInt(7); + writer.setPosition(7); + writer.writeBigInt(8); + } + +There are get API and concrete subclasses of :class:`FieldReader` for accessing vector values, what needs +to be declared is that writer/reader is not as efficient as direct access + +.. code-block:: Java + + // access via get API + for (int i = 0; i < vector.getValueCount(); i++) { + if (!vector.isNull(i)) { + System.out.println(vector.get(i)); + } + } + + // access via reader + BigIntReader reader = vector.getReader(); + for (int i = 0; i < vector.getValueCount(); i++) { + reader.setPosition(i); + if (reader.isSet()) { + System.out.println(reader.readLong()); + } + } + +Building ListVector +=================== + +A :class:`ListVector` is a vector that holds a list of values for each index. Working with one you need to handle the same steps as mentioned above (create > allocate > mutate > set value count > access > clear), but the details of how you accomplish this are slightly different since you need to both create the vector and set the list of values for each index. + +For example, the code below shows how to build a :class:`ListVector` of int's using the writer :class:`UnionListWriter`. We build a vector from 0 to 9 and each index contains a list with values [[0, 0, 0, 0, 0], [0, 1, 2, 3, 4], [0, 2, 4, 6, 8], …, [0, 9, 18, 27, 36]]. List values can be added in any order so writing a list such as [3, 1, 2] would be just as valid. + +.. code-block:: Java + + try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); + ListVector listVector = ListVector.empty("vector", allocator)) { + UnionListWriter writer = listVector.getWriter(); + for (int i = 0; i < 10; i++) { + writer.startList(); + writer.setPosition(i); + for (int j = 0; j < 5; j++) { + writer.writeInt(j * i); + } + writer.setValueCount(5); + writer.endList(); + } + listVector.setValueCount(10); + } + +:class:`ListVector` values can be accessed either through the get API or through the reader class :class:`UnionListReader`. To read all the values, first enumerate through the indexes, and then enumerate through the inner list values. + +.. code-block:: Java + + // access via get API + for (int i = 0; i < listVector.getValueCount(); i++) { + if (!listVector.isNull(i)) { + ArrayList<Integer> elements = (ArrayList<Integer>) listVector.getObject(i); + for (Integer element : elements) { + System.out.println(element); + } + } + } + + // access via reader + UnionListReader reader = listVector.getReader(); + for (int i = 0; i < listVector.getValueCount(); i++) { + reader.setPosition(i); + while (reader.next()) { + IntReader intReader = reader.reader(); + if (intReader.isSet()) { + System.out.println(intReader.readInteger()); + } + } + } + +Slicing +======= + +Similar with C++ implementation, it is possible to make zero-copy slices of vectors to obtain a vector +referring to some logical sub-sequence of the data through :class:`TransferPair` + +.. 
code-block:: Java + + IntVector vector = new IntVector("intVector", allocator); + for (int i = 0; i < 10; i++) { + vector.setSafe(i, i); + } + vector.setValueCount(10); + + TransferPair tp = vector.getTransferPair(allocator); + tp.splitAndTransfer(0, 5); + IntVector sliced = (IntVector) tp.getTo(); + // In this case, the vector values are [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] and the sliceVector values are [0, 1, 2, 3, 4]. diff --git a/src/arrow/docs/source/java/vector_schema_root.rst b/src/arrow/docs/source/java/vector_schema_root.rst new file mode 100644 index 000000000..7f787d9d5 --- /dev/null +++ b/src/arrow/docs/source/java/vector_schema_root.rst @@ -0,0 +1,74 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +================ +VectorSchemaRoot +================ +A :class:`VectorSchemaRoot` is a container that can hold batches, batches flow through :class:`VectorSchemaRoot` +as part of a pipeline. Note this is different from other implementations (i.e. in C++ and Python, +a :class:`RecordBatch` is a collection of equal-length vector instances and was created each time for a new batch). + +The recommended usage for :class:`VectorSchemaRoot` is creating a single :class:`VectorSchemaRoot` +based on the known schema and populated data over and over into the same VectorSchemaRoot in a stream +of batches rather than creating a new :class:`VectorSchemaRoot` instance each time +(see `Numba <https://github.com/apache/arrow/tree/master/java/flight/src/main/java/org/apache/arrow/flight>`_ or +``ArrowFileWriter`` for better understanding). Thus at any one point a VectorSchemaRoot may have data or +may have no data (say it was transferred downstream or not yet populated). + + +Here is the example of building a :class:`VectorSchemaRoot` + +.. code-block:: Java + + BitVector bitVector = new BitVector("boolean", allocator); + VarCharVector varCharVector = new VarCharVector("varchar", allocator); + bitVector.allocateNew(); + varCharVector.allocateNew(); + for (int i = 0; i < 10; i++) { + bitVector.setSafe(i, i % 2 == 0 ? 0 : 1); + varCharVector.setSafe(i, ("test" + i).getBytes(StandardCharsets.UTF_8)); + } + bitVector.setValueCount(10); + varCharVector.setValueCount(10); + + List<Field> fields = Arrays.asList(bitVector.getField(), varCharVector.getField()); + List<FieldVector> vectors = Arrays.asList(bitVector, varCharVector); + VectorSchemaRoot vectorSchemaRoot = new VectorSchemaRoot(fields, vectors); + +The vectors within a :class:`VectorSchemaRoot` could be loaded/unloaded via :class:`VectorLoader` and :class:`VectorUnloader`. +:class:`VectorLoader` and :class:`VectorUnloader` handles converting between :class:`VectorSchemaRoot` and :class:`ArrowRecordBatch`( +representation of a RecordBatch :doc:`IPC <../format/IPC.rst>` message). Examples as below + +.. 
code-block:: Java + + // create a VectorSchemaRoot root1 and convert its data into recordBatch + VectorSchemaRoot root1 = new VectorSchemaRoot(fields, vectors); + VectorUnloader unloader = new VectorUnloader(root1); + ArrowRecordBatch recordBatch = unloader.getRecordBatch(); + + // create a VectorSchemaRoot root2 and load the recordBatch + VectorSchemaRoot root2 = VectorSchemaRoot.create(root1.getSchema(), allocator); + VectorLoader loader = new VectorLoader(root2); + loader.load(recordBatch); + +A new :class:`VectorSchemaRoot` could be sliced from an existing instance with zero-copy + +.. code-block:: Java + + // 0 indicates start index (inclusive) and 5 indicated length (exclusive). + VectorSchemaRoot newRoot = vectorSchemaRoot.slice(0, 5); + diff --git a/src/arrow/docs/source/js/index.rst b/src/arrow/docs/source/js/index.rst new file mode 100644 index 000000000..77813c137 --- /dev/null +++ b/src/arrow/docs/source/js/index.rst @@ -0,0 +1,21 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +JavaScript docs +=============== + +Stub page for the JavaScript docs; actual source is located in js/ sub-directory. diff --git a/src/arrow/docs/source/python/api.rst b/src/arrow/docs/source/python/api.rst new file mode 100644 index 000000000..12cf4e068 --- /dev/null +++ b/src/arrow/docs/source/python/api.rst @@ -0,0 +1,40 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _api: + +************* +API Reference +************* + +.. toctree:: + :maxdepth: 2 + + api/datatypes + api/arrays + api/memory + api/compute + api/files + api/tables + api/ipc + api/flight + api/formats + api/filesystems + api/dataset + api/plasma + api/cuda + api/misc diff --git a/src/arrow/docs/source/python/api/arrays.rst b/src/arrow/docs/source/python/api/arrays.rst new file mode 100644 index 000000000..dbc4c0bd1 --- /dev/null +++ b/src/arrow/docs/source/python/api/arrays.rst @@ -0,0 +1,127 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. 
distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _api.array: +.. currentmodule:: pyarrow + +Arrays and Scalars +================== + +Factory Functions +----------------- + +These functions create new Arrow arrays: + +.. autosummary:: + :toctree: ../generated/ + + array + nulls + +Array Types +----------- + +An array's Python class depends on its data type. Concrete array classes +may expose data type-specific methods or properties. + +.. autosummary:: + :toctree: ../generated/ + + Array + BooleanArray + FloatingPointArray + IntegerArray + Int8Array + Int16Array + Int32Array + Int64Array + NullArray + NumericArray + UInt8Array + UInt16Array + UInt32Array + UInt64Array + BinaryArray + StringArray + FixedSizeBinaryArray + LargeBinaryArray + LargeStringArray + Time32Array + Time64Array + Date32Array + Date64Array + TimestampArray + DurationArray + MonthDayNanoIntervalArray + Decimal128Array + DictionaryArray + ListArray + FixedSizeListArray + LargeListArray + StructArray + UnionArray + ExtensionArray + +.. _api.scalar: + +Scalars +------- + +This function constructs a new Arrow scalar: + +.. autosummary:: + :toctree: ../generated/ + + scalar + +A scalar's python class depends on its data type. Concrete scalar +classes may expose data type-specific methods or properties. + +.. autosummary:: + :toctree: ../generated/ + + NA + Scalar + BooleanScalar + Int8Scalar + Int16Scalar + Int32Scalar + Int64Scalar + UInt8Scalar + UInt16Scalar + UInt32Scalar + UInt64Scalar + FloatScalar + DoubleScalar + BinaryScalar + StringScalar + FixedSizeBinaryScalar + LargeBinaryScalar + LargeStringScalar + Time32Scalar + Time64Scalar + Date32Scalar + Date64Scalar + TimestampScalar + DurationScalar + MonthDayNanoIntervalScalar + Decimal128Scalar + DictionaryScalar + ListScalar + LargeListScalar + StructScalar + UnionScalar diff --git a/src/arrow/docs/source/python/api/compute.rst b/src/arrow/docs/source/python/api/compute.rst new file mode 100644 index 000000000..521182f8a --- /dev/null +++ b/src/arrow/docs/source/python/api/compute.rst @@ -0,0 +1,498 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _api.compute: +.. 
currentmodule:: pyarrow.compute + +Compute Functions +================= + +Aggregations +------------ + +.. autosummary:: + :toctree: ../generated/ + + all + any + approximate_median + count + count_distinct + index + max + mean + min + min_max + mode + product + quantile + stddev + sum + tdigest + variance + +Grouped Aggregations +-------------------- + +.. autosummary:: + :toctree: ../generated/ + + hash_all + hash_any + hash_approximate_median + hash_count + hash_count_distinct + hash_distinct + hash_max + hash_mean + hash_min + hash_min_max + hash_product + hash_stddev + hash_sum + hash_tdigest + hash_variance + +Arithmetic Functions +-------------------- + +By default these functions do not detect overflow. Most functions are also +available in an overflow-checking variant, suffixed ``_checked``, which +throws an ``ArrowInvalid`` exception when overflow is detected. + +.. autosummary:: + :toctree: ../generated/ + + abs + abs_checked + add + add_checked + divide + divide_checked + multiply + multiply_checked + negate + negate_checked + power + power_checked + sign + subtract + subtract_checked + +Bit-wise Functions +------------------ + +.. autosummary:: + :toctree: ../generated/ + + bit_wise_and + bit_wise_not + bit_wise_or + bit_wise_xor + shift_left + shift_left_checked + shift_right + shift_right_checked + +Rounding Functions +------------------ + +Rounding functions displace numeric inputs to an approximate value with a simpler +representation based on the rounding criterion. + +.. autosummary:: + :toctree: ../generated/ + + ceil + floor + round + round_to_multiple + trunc + +Logarithmic Functions +--------------------- + +Logarithmic functions are also supported, and also offer ``_checked`` +variants which detect domain errors. + +.. autosummary:: + :toctree: ../generated/ + + ln + ln_checked + log10 + log10_checked + log1p + log1p_checked + log2 + log2_checked + logb + logb_checked + +Trigonometric Functions +----------------------- + +Trigonometric functions are also supported, and also offer ``_checked`` +variants which detect domain errors where appropriate. + +.. autosummary:: + :toctree: ../generated/ + + acos + acos_checked + asin + asin_checked + atan + atan2 + cos + cos_checked + sin + sin_checked + tan + tan_checked + +Comparisons +----------- + +These functions expect two inputs of the same type. If one of the inputs is `null` +they return ``null``. + +.. autosummary:: + :toctree: ../generated/ + + equal + greater + greater_equal + less + less_equal + not_equal + +These functions take any number of arguments of a numeric or temporal type. + +.. autosummary:: + :toctree: ../generated/ + + max_element_wise + min_element_wise + +Logical Functions +----------------- + +These functions normally emit a null when one of the inputs is null. However, Kleene +logic variants are provided (suffixed ``_kleene``). See User Guide for details. + +.. autosummary:: + :toctree: ../generated/ + + and_ + and_kleene + and_not + and_not_kleene + invert + or_ + or_kleene + xor + +String Predicates +----------------- + +In these functions an empty string emits false in the output. For ASCII +variants (prefixed ``ascii_``) a string element with non-ASCII characters +emits false in the output. + +The first set of functions emit true if the input contains only +characters of a given class. + +.. 
autosummary:: + :toctree: ../generated/ + + ascii_is_alnum + ascii_is_alpha + ascii_is_decimal + ascii_is_lower + ascii_is_printable + ascii_is_space + ascii_is_upper + utf8_is_alnum + utf8_is_alpha + utf8_is_decimal + utf8_is_digit + utf8_is_lower + utf8_is_numeric + utf8_is_printable + utf8_is_space + utf8_is_upper + +The second set of functions also consider the order of characters +in the string element. + +.. autosummary:: + :toctree: ../generated/ + + ascii_is_title + utf8_is_title + +The third set of functions examines string elements on +a byte-by-byte basis. + +.. autosummary:: + :toctree: ../generated/ + + string_is_ascii + +String Transforms +----------------- + +.. autosummary:: + :toctree: ../generated/ + + ascii_capitalize + ascii_lower + ascii_reverse + ascii_swapcase + ascii_title + ascii_upper + binary_length + binary_replace_slice + replace_substring + replace_substring_regex + utf8_capitalize + utf8_length + utf8_lower + utf8_replace_slice + utf8_reverse + utf8_swapcase + utf8_title + utf8_upper + +String Padding +-------------- + +.. autosummary:: + :toctree: ../generated/ + + ascii_center + ascii_lpad + ascii_rpad + utf8_center + utf8_lpad + utf8_rpad + +String Trimming +--------------- + +.. autosummary:: + :toctree: ../generated/ + + ascii_ltrim + ascii_ltrim_whitespace + ascii_rtrim + ascii_rtrim_whitespace + ascii_trim + ascii_trim_whitespace + utf8_ltrim + utf8_ltrim_whitespace + utf8_rtrim + utf8_rtrim_whitespace + utf8_trim + utf8_trim_whitespace + +String Splitting +---------------- + +.. autosummary:: + :toctree: ../generated/ + + ascii_split_whitespace + split_pattern + split_pattern_regex + utf8_split_whitespace + +String Component Extraction +--------------------------- + +.. autosummary:: + :toctree: ../generated/ + + extract_regex + +String Joining +-------------- + +.. autosummary:: + :toctree: ../generated/ + + binary_join + binary_join_element_wise + +String Slicing +-------------- + +.. autosummary:: + :toctree: ../generated/ + + utf8_slice_codeunits + +Containment Tests +----------------- + +.. autosummary:: + :toctree: ../generated/ + + count_substring + count_substring_regex + ends_with + find_substring + find_substring_regex + index_in + is_in + match_like + match_substring + match_substring_regex + starts_with + +Categorizations +--------------- + +.. autosummary:: + :toctree: ../generated/ + + is_finite + is_inf + is_nan + is_null + is_valid + +Selecting / Multiplexing +------------------------ + +.. autosummary:: + :toctree: ../generated/ + + case_when + choose + coalesce + if_else + +Conversions +----------- + +.. autosummary:: + :toctree: ../generated/ + + cast + strftime + strptime + +Temporal Component Extraction +----------------------------- + +.. autosummary:: + :toctree: ../generated/ + + day + day_of_week + day_of_year + hour + iso_week + iso_year + iso_calendar + microsecond + millisecond + minute + month + nanosecond + quarter + second + subsecond + us_week + week + year + +Temporal Difference +------------------- + +.. autosummary:: + :toctree: ../generated/ + + day_time_interval_between + days_between + hours_between + microseconds_between + milliseconds_between + minutes_between + month_day_nano_interval_between + month_interval_between + nanoseconds_between + quarters_between + seconds_between + weeks_between + years_between + +Timezone Handling +----------------- + +.. autosummary:: + :toctree: ../generated/ + + assume_timezone + +Associative Transforms +---------------------- + +.. 
autosummary:: + :toctree: ../generated/ + + dictionary_encode + unique + value_counts + +Selections +---------- + +.. autosummary:: + :toctree: ../generated/ + + array_filter + array_take + drop_null + filter + take + +Sorts and Partitions +-------------------- + +.. autosummary:: + :toctree: ../generated/ + + array_sort_indices + partition_nth_indices + select_k_unstable + sort_indices + +Structural Transforms +--------------------- + +.. autosummary:: + :toctree: ../generated/ + + list_element + list_flatten + list_parent_indices + list_value_length + make_struct + replace_with_mask diff --git a/src/arrow/docs/source/python/api/cuda.rst b/src/arrow/docs/source/python/api/cuda.rst new file mode 100644 index 000000000..364f03240 --- /dev/null +++ b/src/arrow/docs/source/python/api/cuda.rst @@ -0,0 +1,62 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow.cuda + +CUDA Integration +================ + +.. ifconfig:: not cuda_enabled + + .. error:: + This documentation was built without CUDA enabled. The CUDA + API docs are not available. + +.. NOTE We still generate those API docs (with empty docstrings) +.. when CUDA is disabled and `pyarrow.cuda` mocked (see conf.py). +.. Otherwise we'd get autodoc warnings, see https://github.com/sphinx-doc/sphinx/issues/4770 + +CUDA Contexts +------------- + +.. autosummary:: + :toctree: ../generated/ + + Context + +CUDA Buffers +------------ + +.. autosummary:: + :toctree: ../generated/ + + CudaBuffer + new_host_buffer + HostBuffer + BufferReader + BufferWriter + +Serialization and IPC +--------------------- + +.. autosummary:: + :toctree: ../generated/ + + serialize_record_batch + read_record_batch + read_message + IpcMemHandle diff --git a/src/arrow/docs/source/python/api/dataset.rst b/src/arrow/docs/source/python/api/dataset.rst new file mode 100644 index 000000000..9718006ab --- /dev/null +++ b/src/arrow/docs/source/python/api/dataset.rst @@ -0,0 +1,64 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. 
currentmodule:: pyarrow.dataset + +.. _api.dataset: + +Dataset +======= + +.. warning:: + + The ``pyarrow.dataset`` module is experimental (specifically the classes), + and a stable API is not yet guaranteed. + +Factory functions +----------------- + +.. autosummary:: + :toctree: ../generated/ + + dataset + parquet_dataset + partitioning + field + scalar + write_dataset + +Classes +------- + +.. autosummary:: + :toctree: ../generated/ + + FileFormat + ParquetFileFormat + ORCFileFormat + IpcFileFormat + CsvFileFormat + Partitioning + PartitioningFactory + DirectoryPartitioning + HivePartitioning + Dataset + FileSystemDataset + FileSystemFactoryOptions + FileSystemDatasetFactory + UnionDataset + Scanner + Expression diff --git a/src/arrow/docs/source/python/api/datatypes.rst b/src/arrow/docs/source/python/api/datatypes.rst new file mode 100644 index 000000000..48a254a00 --- /dev/null +++ b/src/arrow/docs/source/python/api/datatypes.rst @@ -0,0 +1,165 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _api.types: +.. currentmodule:: pyarrow + +Data Types and Schemas +====================== + +Factory Functions +----------------- + +These should be used to create Arrow data types and schemas. + +.. autosummary:: + :toctree: ../generated/ + + null + bool_ + int8 + int16 + int32 + int64 + uint8 + uint16 + uint32 + uint64 + float16 + float32 + float64 + time32 + time64 + timestamp + date32 + date64 + duration + month_day_nano_interval + binary + string + utf8 + large_binary + large_string + large_utf8 + decimal128 + list_ + large_list + map_ + struct + dictionary + field + schema + from_numpy_dtype + +Utility Functions +----------------- + +.. autosummary:: + :toctree: ../generated/ + + unify_schemas + +.. _api.type_classes: +.. currentmodule:: pyarrow + +Type Classes +------------ + +Do not instantiate these classes directly. Instead, call one of the factory +functions above. + +.. autosummary:: + :toctree: ../generated/ + + DataType + DictionaryType + ListType + MapType + StructType + UnionType + TimestampType + Time32Type + Time64Type + FixedSizeBinaryType + Decimal128Type + Field + Schema + +Specific classes and functions for extension types. + +.. autosummary:: + :toctree: ../generated/ + + ExtensionType + PyExtensionType + register_extension_type + unregister_extension_type + + +.. _api.types.checking: +.. currentmodule:: pyarrow.types + +Type Checking +------------- + +These functions are predicates to check whether a :class:`DataType` instance +represents a given data type (such as ``int32``) or general category +(such as "is a signed integer"). + +.. 
autosummary:: + :toctree: ../generated/ + + is_boolean + is_integer + is_signed_integer + is_unsigned_integer + is_int8 + is_int16 + is_int32 + is_int64 + is_uint8 + is_uint16 + is_uint32 + is_uint64 + is_floating + is_float16 + is_float32 + is_float64 + is_decimal + is_list + is_large_list + is_struct + is_union + is_nested + is_temporal + is_timestamp + is_date + is_date32 + is_date64 + is_time + is_time32 + is_time64 + is_null + is_binary + is_unicode + is_string + is_large_binary + is_large_unicode + is_large_string + is_fixed_size_binary + is_map + is_dictionary diff --git a/src/arrow/docs/source/python/api/files.rst b/src/arrow/docs/source/python/api/files.rst new file mode 100644 index 000000000..106dfde8a --- /dev/null +++ b/src/arrow/docs/source/python/api/files.rst @@ -0,0 +1,65 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow + +Streams and File Access +======================= + +.. _api.io: + +Factory Functions +----------------- + +These factory functions are the recommended way to create a Arrow stream. +They accept various kinds of sources, such as in-memory buffers or on-disk files. + +.. autosummary:: + :toctree: ../generated/ + + input_stream + output_stream + memory_map + create_memory_map + +Stream Classes +-------------- + +.. autosummary:: + :toctree: ../generated/ + + NativeFile + OSFile + PythonFile + BufferReader + BufferOutputStream + FixedSizeBufferWriter + MemoryMappedFile + CompressedInputStream + CompressedOutputStream + +File Systems +------------ + +.. autosummary:: + :toctree: ../generated/ + + hdfs.connect + LocalFileSystem + +.. class:: HadoopFileSystem + :noindex: diff --git a/src/arrow/docs/source/python/api/filesystems.rst b/src/arrow/docs/source/python/api/filesystems.rst new file mode 100644 index 000000000..3e2ac29ee --- /dev/null +++ b/src/arrow/docs/source/python/api/filesystems.rst @@ -0,0 +1,53 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow.fs + +Filesystems +=========== + +.. 
_api.fs: + +Interface +--------- + +.. autosummary:: + :toctree: ../generated/ + + FileInfo + FileSelector + FileSystem + +Concrete Subclasses +------------------- + +.. autosummary:: + :toctree: ../generated/ + + LocalFileSystem + S3FileSystem + HadoopFileSystem + SubTreeFileSystem + +To define filesystems with behavior implemented in Python: + +.. autosummary:: + :toctree: ../generated/ + + PyFileSystem + FileSystemHandler + FSSpecHandler diff --git a/src/arrow/docs/source/python/api/flight.rst b/src/arrow/docs/source/python/api/flight.rst new file mode 100644 index 000000000..0cfbb6b4b --- /dev/null +++ b/src/arrow/docs/source/python/api/flight.rst @@ -0,0 +1,91 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow.flight + +Arrow Flight +============ + +.. ifconfig:: not flight_enabled + + .. error:: + This documentation was built without Flight enabled. The Flight + API docs are not available. + +.. NOTE We still generate those API docs (with empty docstrings) +.. when Flight is disabled and `pyarrow.flight` mocked (see conf.py). +.. Otherwise we'd get autodoc warnings, see https://github.com/sphinx-doc/sphinx/issues/4770 + +.. warning:: Flight is currently unstable. APIs are subject to change, + though we don't expect drastic changes. + +Common Types +------------ + +.. autosummary:: + :toctree: ../generated/ + + Action + ActionType + DescriptorType + FlightDescriptor + FlightEndpoint + FlightInfo + Location + Ticket + Result + +Flight Client +------------- + +.. autosummary:: + :toctree: ../generated/ + + FlightCallOptions + FlightClient + ClientMiddlewareFactory + ClientMiddleware + +Flight Server +------------- + +.. autosummary:: + :toctree: ../generated/ + + FlightServerBase + GeneratorStream + RecordBatchStream + ServerMiddlewareFactory + ServerMiddleware + +Authentication +-------------- + +.. autosummary:: + :toctree: ../generated/ + + ClientAuthHandler + ServerAuthHandler + +Middleware +---------- + +.. autosummary:: + :toctree: ../generated/ + + FlightMethod + CallInfo diff --git a/src/arrow/docs/source/python/api/formats.rst b/src/arrow/docs/source/python/api/formats.rst new file mode 100644 index 000000000..fdc28040a --- /dev/null +++ b/src/arrow/docs/source/python/api/formats.rst @@ -0,0 +1,101 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. 
Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Tabular File Formats +==================== + +.. _api.csv: + +CSV Files +--------- + +.. currentmodule:: pyarrow.csv + +.. autosummary:: + :toctree: ../generated/ + + ConvertOptions + CSVStreamingReader + CSVWriter + ISO8601 + ParseOptions + ReadOptions + WriteOptions + open_csv + read_csv + write_csv + +.. _api.feather: + +Feather Files +------------- + +.. currentmodule:: pyarrow.feather + +.. autosummary:: + :toctree: ../generated/ + + read_feather + read_table + write_feather + +.. _api.json: + +JSON Files +---------- + +.. currentmodule:: pyarrow.json + +.. autosummary:: + :toctree: ../generated/ + + ReadOptions + ParseOptions + read_json + +.. _api.parquet: + +Parquet Files +------------- + +.. currentmodule:: pyarrow.parquet + +.. autosummary:: + :toctree: ../generated/ + + ParquetDataset + ParquetFile + ParquetWriter + read_table + read_metadata + read_pandas + read_schema + write_metadata + write_table + write_to_dataset + +.. _api.orc: + +ORC Files +--------- + +.. currentmodule:: pyarrow.orc + +.. autosummary:: + :toctree: ../generated/ + + ORCFile diff --git a/src/arrow/docs/source/python/api/ipc.rst b/src/arrow/docs/source/python/api/ipc.rst new file mode 100644 index 000000000..83ff53de7 --- /dev/null +++ b/src/arrow/docs/source/python/api/ipc.rst @@ -0,0 +1,69 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow + +.. _api.ipc: + +Serialization and IPC +===================== + +Inter-Process Communication +--------------------------- + +.. autosummary:: + :toctree: ../generated/ + + ipc.new_file + ipc.open_file + ipc.new_stream + ipc.open_stream + ipc.read_message + ipc.read_record_batch + ipc.get_record_batch_size + ipc.read_tensor + ipc.write_tensor + ipc.get_tensor_size + ipc.IpcWriteOptions + ipc.Message + ipc.MessageReader + ipc.RecordBatchFileReader + ipc.RecordBatchFileWriter + ipc.RecordBatchStreamReader + ipc.RecordBatchStreamWriter + +Serialization +------------- + +.. warning:: + + The serialization functionality is deprecated in pyarrow 2.0, and will + be removed in a future version. Use the standard library ``pickle`` or + the IPC functionality of pyarrow (see :ref:`ipc`). + + +.. 
autosummary:: + :toctree: ../generated/ + + serialize + serialize_to + deserialize + deserialize_components + deserialize_from + read_serialized + SerializedPyObject + SerializationContext diff --git a/src/arrow/docs/source/python/api/memory.rst b/src/arrow/docs/source/python/api/memory.rst new file mode 100644 index 000000000..f4382ba23 --- /dev/null +++ b/src/arrow/docs/source/python/api/memory.rst @@ -0,0 +1,73 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow + +.. _api.memory: + +Buffers and Memory +================== + +In-Memory Buffers +----------------- + +Factory Functions +~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: ../generated/ + + allocate_buffer + py_buffer + foreign_buffer + +Classes +~~~~~~~ + +.. autosummary:: + :toctree: ../generated/ + + Buffer + ResizableBuffer + +Miscellaneous +~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: ../generated/ + + Codec + compress + decompress + +.. _api.memory_pool: + +Memory Pools +------------ + +.. autosummary:: + :toctree: ../generated/ + + MemoryPool + default_memory_pool + jemalloc_memory_pool + mimalloc_memory_pool + system_memory_pool + jemalloc_set_decay_ms + set_memory_pool + log_memory_allocations + total_allocated_bytes diff --git a/src/arrow/docs/source/python/api/misc.rst b/src/arrow/docs/source/python/api/misc.rst new file mode 100644 index 000000000..c13b80620 --- /dev/null +++ b/src/arrow/docs/source/python/api/misc.rst @@ -0,0 +1,40 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow + +Miscellaneous +============= + +Multi-Threading +--------------- + +.. autosummary:: + :toctree: ../generated/ + + cpu_count + set_cpu_count + +Using with C extensions +----------------------- + +.. 
autosummary:: + :toctree: ../generated/ + + get_include + get_libraries + get_library_dirs diff --git a/src/arrow/docs/source/python/api/plasma.rst b/src/arrow/docs/source/python/api/plasma.rst new file mode 100644 index 000000000..8df9e4e21 --- /dev/null +++ b/src/arrow/docs/source/python/api/plasma.rst @@ -0,0 +1,33 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow.plasma + +.. _api.plasma: + +Plasma In-Memory Object Store +============================= + +Classes +------- + +.. autosummary:: + :toctree: ../generated/ + + ObjectID + PlasmaClient + PlasmaBuffer diff --git a/src/arrow/docs/source/python/api/tables.rst b/src/arrow/docs/source/python/api/tables.rst new file mode 100644 index 000000000..6e7a3b6e1 --- /dev/null +++ b/src/arrow/docs/source/python/api/tables.rst @@ -0,0 +1,55 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow + +.. _api.table: + +Tables and Tensors +================== + +Factory Functions +----------------- + +.. autosummary:: + :toctree: ../generated/ + + chunked_array + concat_arrays + concat_tables + record_batch + table + +Classes +------- + +.. autosummary:: + :toctree: ../generated/ + + ChunkedArray + RecordBatch + Table + +.. _api.tensor: + +Tensors +------- + +.. autosummary:: + :toctree: ../generated/ + + Tensor diff --git a/src/arrow/docs/source/python/benchmarks.rst b/src/arrow/docs/source/python/benchmarks.rst new file mode 100644 index 000000000..aee83b778 --- /dev/null +++ b/src/arrow/docs/source/python/benchmarks.rst @@ -0,0 +1,56 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. 
http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _python-benchmarks: + +Benchmarks +========== + +The ``pyarrow`` package comes with a suite of benchmarks meant to +run with `ASV`_. You'll need to install the ``asv`` package first +(``pip install asv`` or ``conda install -c conda-forge asv``). + +Running the benchmarks +---------------------- + +To run the benchmarks for a locally-built Arrow, run ``asv dev`` or +``asv run --python=same``. + +We use conda environments as part of running the benchmarks. To use the ``asv`` +setup, you must set the ``$CONDA_HOME`` environment variable to point to the +root of your conda installation. + +Running for arbitrary Git revisions +----------------------------------- + +ASV allows to store results and generate graphs of the benchmarks over +the project's evolution. You need to have the latest development version of ASV: + +.. code:: + + pip install git+https://github.com/airspeed-velocity/asv + +Now you should be ready to run ``asv run`` or whatever other command +suits your needs. Note that this can be quite long, as each Arrow needs +to be rebuilt for each Git revision you're running the benchmarks for. + +Compatibility +------------- + +We only expect the benchmarking setup to work on a Unix-like system with bash. + +.. _asv: https://asv.readthedocs.org/ diff --git a/src/arrow/docs/source/python/compute.rst b/src/arrow/docs/source/python/compute.rst new file mode 100644 index 000000000..133520de9 --- /dev/null +++ b/src/arrow/docs/source/python/compute.rst @@ -0,0 +1,69 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow.compute +.. _compute: + +================= +Compute Functions +================= + +Arrow supports logical compute operations over inputs of possibly +varying types. Many compute functions support both array (chunked or not) +and scalar inputs, but some will mandate either. For example, +``sort_indices`` requires its first and only input to be an array. 
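+Array and scalar arguments can also be mixed in a single call, with the scalar
+broadcast against the array. A minimal sketch (the variable name ``nums`` is
+only illustrative)::
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> nums = pa.array([1, 2, 3])
+    >>> # the Python integer 10 is converted to an Arrow scalar and broadcast
+    >>> pc.add(nums, 10).to_pylist()
+    [11, 12, 13]
+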
+ +Below are a few simple examples: + + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> a = pa.array([1, 1, 2, 3]) + >>> pc.sum(a) + <pyarrow.Int64Scalar: 7> + >>> b = pa.array([4, 1, 2, 8]) + >>> pc.equal(a, b) + <pyarrow.lib.BooleanArray object at 0x7f686e4eef30> + [ + false, + true, + true, + false + ] + >>> x, y = pa.scalar(7.8), pa.scalar(9.3) + >>> pc.multiply(x, y) + <pyarrow.DoubleScalar: 72.54> + +These functions can do more than just element-by-element operations. +Here is an example of sorting a table: + + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> t = pa.table({'x':[1,2,3],'y':[3,2,1]}) + >>> i = pc.sort_indices(t, sort_keys=[('y', 'ascending')]) + >>> i + <pyarrow.lib.UInt64Array object at 0x7fcee5df75e8> + [ + 2, + 1, + 0 + ] + + + +.. seealso:: + + :ref:`Available compute functions (C++ documentation) <compute-function-list>`. diff --git a/src/arrow/docs/source/python/csv.rst b/src/arrow/docs/source/python/csv.rst new file mode 100644 index 000000000..1724c63f4 --- /dev/null +++ b/src/arrow/docs/source/python/csv.rst @@ -0,0 +1,170 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow.csv +.. _csv: + +Reading and Writing CSV files +============================= + +Arrow supports reading and writing columnar data from/to CSV files. +The features currently offered are the following: + +* multi-threaded or single-threaded reading +* automatic decompression of input files (based on the filename extension, + such as ``my_data.csv.gz``) +* fetching column names from the first row in the CSV file +* column-wise type inference and conversion to one of ``null``, ``int64``, + ``float64``, ``date32``, ``time32[s]``, ``timestamp[s]``, ``timestamp[ns]``, + ``string`` or ``binary`` data +* opportunistic dictionary encoding of ``string`` and ``binary`` columns + (disabled by default) +* detecting various spellings of null values such as ``NaN`` or ``#N/A`` +* writing CSV files with options to configure the exact output format + +Usage +----- + +CSV reading and writing functionality is available through the +:mod:`pyarrow.csv` module. 
In many cases, you will simply call the +:func:`read_csv` function with the file path you want to read from:: + + >>> from pyarrow import csv + >>> fn = 'tips.csv.gz' + >>> table = csv.read_csv(fn) + >>> table + pyarrow.Table + total_bill: double + tip: double + sex: string + smoker: string + day: string + time: string + size: int64 + >>> len(table) + 244 + >>> df = table.to_pandas() + >>> df.head() + total_bill tip sex smoker day time size + 0 16.99 1.01 Female No Sun Dinner 2 + 1 10.34 1.66 Male No Sun Dinner 3 + 2 21.01 3.50 Male No Sun Dinner 3 + 3 23.68 3.31 Male No Sun Dinner 2 + 4 24.59 3.61 Female No Sun Dinner 4 + +To write CSV files, just call :func:`write_csv` with a +:class:`pyarrow.RecordBatch` or :class:`pyarrow.Table` and a path or +file-like object:: + + >>> import pyarrow as pa + >>> import pyarrow.csv as csv + >>> csv.write_csv(table, "tips.csv") + >>> with pa.CompressedOutputStream("tips.csv.gz", "gzip") as out: + ... csv.write_csv(table, out) + +.. note:: The writer does not yet support all Arrow types. + +Customized parsing +------------------ + +To alter the default parsing settings in case of reading CSV files with an +unusual structure, you should create a :class:`ParseOptions` instance +and pass it to :func:`read_csv`. + +Customized conversion +--------------------- + +To alter how CSV data is converted to Arrow types and data, you should create +a :class:`ConvertOptions` instance and pass it to :func:`read_csv`:: + + import pyarrow as pa + import pyarrow.csv as csv + + table = csv.read_csv('tips.csv.gz', convert_options=pa.csv.ConvertOptions( + column_types={ + 'total_bill': pa.decimal128(precision=10, scale=2), + 'tip': pa.decimal128(precision=10, scale=2), + } + )) + + +Incremental reading +------------------- + +For memory-constrained environments, it is also possible to read a CSV file +one batch at a time, using :func:`open_csv`. + +There are a few caveats: + +1. For now, the incremental reader is always single-threaded (regardless of + :attr:`ReadOptions.use_threads`) + +2. Type inference is done on the first block and types are frozen afterwards; + to make sure the right data types are inferred, either set + :attr:`ReadOptions.block_size` to a large enough value, or use + :attr:`ConvertOptions.column_types` to set the desired data types explicitly. + +Character encoding +------------------ + +By default, CSV files are expected to be encoded in UTF8. Non-UTF8 data +is accepted for ``binary`` columns. The encoding can be changed using +the :class:`ReadOptions` class. + +Customized writing +------------------ + +To alter the default write settings in case of writing CSV files with +different conventions, you can create a :class:`WriteOptions` instance and +pass it to :func:`write_csv`:: + + >>> import pyarrow as pa + >>> import pyarrow.csv as csv + >>> # Omit the header row (include_header=True is the default) + >>> options = csv.WriteOptions(include_header=False) + >>> csv.write_csv(table, "data.csv", options) + +Incremental writing +------------------- + +To write CSV files one batch at a time, create a :class:`CSVWriter`. 
This +requires the output (a path or file-like object), the schema of the data to +be written, and optionally write options as described above:: + + >>> import pyarrow as pa + >>> import pyarrow.csv as csv + >>> with csv.CSVWriter("data.csv", table.schema) as writer: + ...     writer.write_table(table) + +Performance +----------- + +Due to the structure of CSV files, one cannot expect the same levels of +performance as when reading dedicated binary formats like +:ref:`Parquet <Parquet>`. Nevertheless, Arrow strives to reduce the +overhead of reading CSV files. A reasonable expectation is at least +100 MB/s per core on a performant desktop or laptop computer (measured +in source CSV bytes, not target Arrow data bytes). + +Performance options can be controlled through the :class:`ReadOptions` class. +Multi-threaded reading is the default for highest performance, distributing +the workload efficiently over all available cores. + +.. note:: + The number of concurrent threads is automatically inferred by Arrow. + You can inspect and change it using the :func:`~pyarrow.cpu_count()` + and :func:`~pyarrow.set_cpu_count()` functions, respectively. diff --git a/src/arrow/docs/source/python/cuda.rst b/src/arrow/docs/source/python/cuda.rst new file mode 100644 index 000000000..b0150c1c5 --- /dev/null +++ b/src/arrow/docs/source/python/cuda.rst @@ -0,0 +1,159 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow.cuda + +CUDA Integration +================ + +Arrow is not limited to CPU buffers (located in the computer's main memory, +also named "host memory"). It also has provisions for accessing buffers +located on a CUDA-capable GPU device (in "device memory"). + +.. note:: + This functionality is optional and must have been enabled at build time. + If this is not done by your package manager, you might have to build Arrow + yourself. + +CUDA Contexts +------------- + +A CUDA context represents access to a particular CUDA-capable device. +For example, the following creates a CUDA context accessing CUDA device number 0:: + + >>> from pyarrow import cuda + >>> ctx = cuda.Context(0) + +CUDA Buffers +------------ + +A CUDA buffer can be created by copying data from host memory to the memory +of a CUDA device, using the :meth:`Context.buffer_from_data` method.
+The source data can be any Python buffer-like object, including Arrow buffers:: + + >>> import numpy as np + >>> arr = np.arange(4, dtype=np.int32) + >>> arr.nbytes + 16 + >>> cuda_buf = ctx.buffer_from_data(arr) + >>> type(cuda_buf) + pyarrow._cuda.CudaBuffer + >>> cuda_buf.size # The buffer's size in bytes + 16 + >>> cuda_buf.address # The buffer's address in device memory + 30088364544 + >>> cuda_buf.context.device_number + 0 + +Conversely, you can copy a CUDA buffer back to host memory, getting a regular +CPU buffer:: + + >>> buf = cuda_buf.copy_to_host() + >>> type(buf) + pyarrow.lib.Buffer + >>> np.frombuffer(buf, dtype=np.int32) + array([0, 1, 2, 3], dtype=int32) + +.. warning:: + Many Arrow functions expect a CPU buffer but will not check the buffer's + actual type. You will get a crash if you pass a CUDA buffer to such a + function:: + + >>> pa.py_buffer(b"x" * 16).equals(cuda_buf) + Segmentation fault + +Numba Integration +----------------- + +There is not much you can do directly with Arrow CUDA buffers from Python, +but they support interoperation with `Numba <https://numba.pydata.org/>`_, +a JIT compiler which can turn Python code into optimized CUDA kernels. + +Arrow to Numba +~~~~~~~~~~~~~~ + +First let's define a Numba CUDA kernel operating on an ``int32`` array. Here, +we will simply increment each array element (assuming the array is writable):: + + import numba.cuda + + @numba.cuda.jit + def increment_by_one(an_array): + pos = numba.cuda.grid(1) + if pos < an_array.size: + an_array[pos] += 1 + +Then we need to wrap our CUDA buffer into a Numba "device array" with the right +array metadata (shape, strides and datatype). This is necessary so that Numba +can identify the array's characteristics and compile the kernel with the +appropriate type declarations. + +In this case the metadata can simply be obtained from the original NumPy array. +Note that the GPU data isn't copied, just pointed to:: + + >>> from numba.cuda.cudadrv.devicearray import DeviceNDArray + >>> device_arr = DeviceNDArray(arr.shape, arr.strides, arr.dtype, gpu_data=cuda_buf.to_numba()) + +(Ideally we could have defined an Arrow array in CPU memory, copied it to CUDA +memory without losing type information, and then invoked the Numba kernel on it +without constructing the DeviceNDArray by hand; this is not yet possible.) + +Finally we can run the Numba CUDA kernel on the Numba device array (here +with a 16x16 grid size):: + + >>> increment_by_one[16, 16](device_arr) + +And the results can be checked by copying the CUDA buffer back to CPU memory:: + + >>> np.frombuffer(cuda_buf.copy_to_host(), dtype=np.int32) + array([1, 2, 3, 4], dtype=int32) + +Numba to Arrow +~~~~~~~~~~~~~~ + +Conversely, a Numba-created device array can be viewed as an Arrow CUDA buffer, +using the :meth:`CudaBuffer.from_numba` factory method. + +For the sake of example, let's first create a Numba device array:: + + >>> arr = np.arange(10, 14, dtype=np.int32) + >>> arr + array([10, 11, 12, 13], dtype=int32) + >>> device_arr = numba.cuda.to_device(arr) + +Then we can create a CUDA buffer pointing to the device array's memory. +We don't need to pass a CUDA context explicitly this time: the appropriate +CUDA context is automatically retrieved and adapted from the Numba object.
+ +:: + + >>> cuda_buf = cuda.CudaBuffer.from_numba(device_arr.gpu_data) + >>> cuda_buf.size + 16 + >>> cuda_buf.address + 30088364032 + >>> cuda_buf.context.device_number + 0 + +Of course, we can copy the CUDA buffer back to host memory:: + + >>> np.frombuffer(cuda_buf.copy_to_host(), dtype=np.int32) + array([10, 11, 12, 13], dtype=int32) + +.. seealso:: + Documentation for Numba's `CUDA support <https://numba.pydata.org/numba-doc/latest/cuda/index.html>`_. diff --git a/src/arrow/docs/source/python/data.rst b/src/arrow/docs/source/python/data.rst new file mode 100644 index 000000000..b8a90039f --- /dev/null +++ b/src/arrow/docs/source/python/data.rst @@ -0,0 +1,434 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow +.. _data: + +Data Types and In-Memory Data Model +=================================== + +Apache Arrow defines columnar array data structures by composing type metadata +with memory buffers, like the ones explained in the documentation on +:ref:`Memory and IO <io>`. These data structures are exposed in Python through +a series of interrelated classes: + +* **Type Metadata**: Instances of ``pyarrow.DataType``, which describe a logical + array type +* **Schemas**: Instances of ``pyarrow.Schema``, which describe a named + collection of types. These can be thought of as the column types in a + table-like object. +* **Arrays**: Instances of ``pyarrow.Array``, which are atomic, contiguous + columnar data structures composed from Arrow Buffer objects +* **Record Batches**: Instances of ``pyarrow.RecordBatch``, which are a + collection of Array objects with a particular Schema +* **Tables**: Instances of ``pyarrow.Table``, a logical table data structure in + which each column consists of one or more ``pyarrow.Array`` objects of the + same type. + +We will examine these in the sections below in a series of examples. + +.. _data.types: + +Type Metadata +------------- + +Apache Arrow defines language agnostic column-oriented data structures for +array data. These include: + +* **Fixed-length primitive types**: numbers, booleans, date and times, fixed + size binary, decimals, and other values that fit into a given number +* **Variable-length primitive types**: binary, string +* **Nested types**: list, struct, and union +* **Dictionary type**: An encoded categorical type (more on this later) + +Each logical data type in Arrow has a corresponding factory function for +creating an instance of that type object in Python: + +.. ipython:: python + + import pyarrow as pa + t1 = pa.int32() + t2 = pa.string() + t3 = pa.binary() + t4 = pa.binary(10) + t5 = pa.timestamp('ms') + + t1 + print(t1) + print(t4) + print(t5) + +We use the name **logical type** because the **physical** storage may be the +same for one or more types. 
For example, ``int64``, ``float64``, and +``timestamp[ms]`` all occupy 64 bits per value. + +These objects are `metadata`; they are used for describing the data in arrays, +schemas, and record batches. In Python, they can be used in functions where the +input data (e.g. Python objects) may be coerced to more than one Arrow type. + +The :class:`~pyarrow.Field` type is a type plus a name and optional +user-defined metadata: + +.. ipython:: python + + f0 = pa.field('int32_field', t1) + f0 + f0.name + f0.type + +Arrow supports **nested value types** like list, struct, and union. When +creating these, you must pass types or fields to indicate the data types of the +types' children. For example, we can define a list of int32 values with: + +.. ipython:: python + + t6 = pa.list_(t1) + t6 + +A `struct` is a collection of named fields: + +.. ipython:: python + + fields = [ + pa.field('s0', t1), + pa.field('s1', t2), + pa.field('s2', t4), + pa.field('s3', t6), + ] + + t7 = pa.struct(fields) + print(t7) + +For convenience, you can pass ``(name, type)`` tuples directly instead of +:class:`~pyarrow.Field` instances: + +.. ipython:: python + + t8 = pa.struct([('s0', t1), ('s1', t2), ('s2', t4), ('s3', t6)]) + print(t8) + t8 == t7 + + +See :ref:`Data Types API <api.types>` for a full listing of data type +functions. + +.. _data.schema: + +Schemas +------- + +The :class:`~pyarrow.Schema` type is similar to the ``struct`` array type; it +defines the column names and types in a record batch or table data +structure. The :func:`pyarrow.schema` factory function makes new Schema objects in +Python: + +.. ipython:: python + + my_schema = pa.schema([('field0', t1), + ('field1', t2), + ('field2', t4), + ('field3', t6)]) + my_schema + +In some applications, you may not create schemas directly, only using the ones +that are embedded in :ref:`IPC messages <ipc>`. + +.. _data.array: + +Arrays +------ + +For each data type, there is an accompanying array data structure for holding +memory buffers that define a single contiguous chunk of columnar array +data. When you are using PyArrow, this data may come from IPC tools, though it +can also be created from various types of Python sequences (lists, NumPy +arrays, pandas data). + +A simple way to create arrays is with ``pyarrow.array``, which is similar to +the ``numpy.array`` function. By default PyArrow will infer the data type +for you: + +.. ipython:: python + + arr = pa.array([1, 2, None, 3]) + arr + +But you may also pass a specific data type to override type inference: + +.. ipython:: python + + pa.array([1, 2], type=pa.uint16()) + +The array's ``type`` attribute is the corresponding piece of type metadata: + +.. ipython:: python + + arr.type + +Each in-memory array has a known length and null count (which will be 0 if +there are no null values): + +.. ipython:: python + + len(arr) + arr.null_count + +Scalar values can be selected with normal indexing. ``pyarrow.array`` converts +``None`` values to Arrow nulls; we return the special ``pyarrow.NA`` value for +nulls: + +.. ipython:: python + + arr[0] + arr[2] + +Arrow data is immutable, so values can be selected but not assigned. + +Arrays can be sliced without copying: + +.. ipython:: python + + arr[1:3] + +None values and NAN handling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +As mentioned in the above section, the Python object ``None`` is always +converted to an Arrow null element on the conversion to ``pyarrow.Array``. 
The float NaN value, which can be represented by either the Python object +``float('nan')`` or ``numpy.nan``, is normally converted to a *valid* float +value during the conversion. If an integer input that contains ``np.nan`` is supplied to +``pyarrow.array``, a ``ValueError`` is raised. + +For better compatibility with pandas, we support interpreting NaN values as +null elements. This is enabled automatically for all ``from_pandas`` functions and +can be enabled for the other conversion functions by passing ``from_pandas=True`` +as a function parameter. + +List arrays +~~~~~~~~~~~ + +``pyarrow.array`` is able to infer the type of simple nested data structures +like lists: + +.. ipython:: python + + nested_arr = pa.array([[], None, [1, 2], [None, 1]]) + print(nested_arr.type) + +Struct arrays +~~~~~~~~~~~~~ + +For other kinds of nested arrays, such as struct arrays, you currently need +to pass the type explicitly. Struct arrays can be initialized from a +sequence of Python dicts or tuples: + +.. ipython:: python + + ty = pa.struct([('x', pa.int8()), + ('y', pa.bool_())]) + pa.array([{'x': 1, 'y': True}, {'x': 2, 'y': False}], type=ty) + pa.array([(3, True), (4, False)], type=ty) + +When initializing a struct array, nulls are allowed both at the struct +level and at the individual field level. If initializing from a sequence +of Python dicts, a missing dict key is handled as a null value: + +.. ipython:: python + + pa.array([{'x': 1}, None, {'y': None}], type=ty) + +You can also construct a struct array from existing arrays for each of the +struct's components. In this case, data storage will be shared with the +individual arrays, and no copy is involved: + +.. ipython:: python + + xs = pa.array([5, 6, 7], type=pa.int16()) + ys = pa.array([False, True, True]) + arr = pa.StructArray.from_arrays((xs, ys), names=('x', 'y')) + arr.type + arr + +Union arrays +~~~~~~~~~~~~ + +The union type represents a nested array type where each value can be one +(and only one) of a set of possible types. There are two possible +storage types for union arrays: sparse and dense. + +In a sparse union array, each of the child arrays has the same length +as the resulting union array. They are accompanied by an ``int8`` "types" +array that tells, for each value, from which child array it must be +selected: + +.. ipython:: python + + xs = pa.array([5, 6, 7]) + ys = pa.array([False, False, True]) + types = pa.array([0, 1, 1], type=pa.int8()) + union_arr = pa.UnionArray.from_sparse(types, [xs, ys]) + union_arr.type + union_arr + +In a dense union array, you also pass, in addition to the ``int8`` "types" +array, an ``int32`` "offsets" array that tells, for each value, at +which offset in the selected child array it can be found: + +.. ipython:: python + + xs = pa.array([5, 6, 7]) + ys = pa.array([False, True]) + types = pa.array([0, 1, 1, 0, 0], type=pa.int8()) + offsets = pa.array([0, 0, 1, 1, 2], type=pa.int32()) + union_arr = pa.UnionArray.from_dense(types, offsets, [xs, ys]) + union_arr.type + union_arr + +.. _data.dictionary: + +Dictionary Arrays +~~~~~~~~~~~~~~~~~ + +The **Dictionary** type in PyArrow is a special array type that is similar to a +factor in R or a ``pandas.Categorical``. It enables one or more record batches +in a file or stream to transmit integer *indices* referencing a shared +**dictionary** containing the distinct values in the logical array. This is +particularly common with strings, to save memory and improve performance.
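+A quick way to obtain a dictionary-encoded version of an existing array is the
+``dictionary_encode()`` method. A small sketch (the values here are arbitrary);
+it produces the same kind of :class:`~.DictionaryArray` as the explicit
+construction shown next:
+
+.. ipython:: python
+
+    # each distinct string is stored once in the dictionary; the array itself
+    # only stores small integer indices into that dictionary
+    pa.array(['foo', 'bar', 'foo', 'baz']).dictionary_encode()
+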
+ +The way that dictionaries are handled in the Apache Arrow format and the way +they appear in C++ and Python is slightly different. We define a special +:class:`~.DictionaryArray` type with a corresponding dictionary type. Let's +consider an example: + +.. ipython:: python + + indices = pa.array([0, 1, 0, 1, 2, 0, None, 2]) + dictionary = pa.array(['foo', 'bar', 'baz']) + + dict_array = pa.DictionaryArray.from_arrays(indices, dictionary) + dict_array + +Here we have: + +.. ipython:: python + + print(dict_array.type) + dict_array.indices + dict_array.dictionary + +When using :class:`~.DictionaryArray` with pandas, the analogue is +``pandas.Categorical`` (more on this later): + +.. ipython:: python + + dict_array.to_pandas() + +.. _data.record_batch: + +Record Batches +-------------- + +A **Record Batch** in Apache Arrow is a collection of equal-length array +instances. Let's consider a collection of arrays: + +.. ipython:: python + + data = [ + pa.array([1, 2, 3, 4]), + pa.array(['foo', 'bar', 'baz', None]), + pa.array([True, None, False, True]) + ] + +A record batch can be created from this list of arrays using +``RecordBatch.from_arrays``: + +.. ipython:: python + + batch = pa.RecordBatch.from_arrays(data, ['f0', 'f1', 'f2']) + batch.num_columns + batch.num_rows + batch.schema + + batch[1] + +A record batch can be sliced without copying memory like an array: + +.. ipython:: python + + batch2 = batch.slice(1, 3) + batch2[1] + +.. _data.table: + +Tables +------ + +The PyArrow :class:`~.Table` type is not part of the Apache Arrow +specification, but is rather a tool to help with wrangling multiple record +batches and array pieces as a single logical dataset. As a relevant example, we +may receive multiple small record batches in a socket stream, then need to +concatenate them into contiguous memory for use in NumPy or pandas. The Table +object makes this efficient without requiring additional memory copying. + +Considering the record batch we created above, we can create a Table containing +one or more copies of the batch using ``Table.from_batches``: + +.. ipython:: python + + batches = [batch] * 5 + table = pa.Table.from_batches(batches) + table + table.num_rows + +The table's columns are instances of :class:`~.ChunkedArray`, which is a +container for one or more arrays of the same type. + +.. ipython:: python + + c = table[0] + c + c.num_chunks + c.chunk(0) + +As you'll see in the :ref:`pandas section <pandas_interop>`, we can convert +these objects to contiguous NumPy arrays for use in pandas: + +.. ipython:: python + + c.to_pandas() + +Multiple tables can also be concatenated together to form a single table using +``pyarrow.concat_tables``, if the schemas are equal: + +.. ipython:: python + + tables = [table] * 2 + table_all = pa.concat_tables(tables) + table_all.num_rows + c = table_all[0] + c.num_chunks + +This is similar to ``Table.from_batches``, but uses tables as input instead of +record batches. Record batches can be made into tables, but not the other way +around, so if your data is already in table form, then use +``pyarrow.concat_tables``. + +Custom Schema and Field Metadata +-------------------------------- + +TODO diff --git a/src/arrow/docs/source/python/dataset.rst b/src/arrow/docs/source/python/dataset.rst new file mode 100644 index 000000000..e2d8c900b --- /dev/null +++ b/src/arrow/docs/source/python/dataset.rst @@ -0,0 +1,626 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. 
distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow.dataset + +.. _dataset: + +Tabular Datasets +================ + +.. warning:: + + The ``pyarrow.dataset`` module is experimental (specifically the classes), + and a stable API is not yet guaranteed. + +The ``pyarrow.dataset`` module provides functionality to efficiently work with +tabular, potentially larger than memory, and multi-file datasets. This includes: + +* A unified interface that supports different sources and file formats + (Parquet, ORC, Feather / Arrow IPC, and CSV files) and different file systems + (local, cloud). +* Discovery of sources (crawling directories, handle directory-based partitioned + datasets, basic schema normalization, ..) +* Optimized reading with predicate pushdown (filtering rows), projection + (selecting and deriving columns), and optionally parallel reading. + +Currently, only Parquet, ORC, Feather / Arrow IPC, and CSV files are +supported. The goal is to expand this in the future to other file formats and +data sources (e.g. database connections). + +For those familiar with the existing :class:`pyarrow.parquet.ParquetDataset` for +reading Parquet datasets: ``pyarrow.dataset``'s goal is similar but not specific +to the Parquet format and not tied to Python: the same datasets API is exposed +in the R bindings or Arrow. In addition ``pyarrow.dataset`` boasts improved +performance and new features (e.g. filtering within files rather than only on +partition keys). + + +Reading Datasets +---------------- + +.. TODO Full blown example with NYC taxi data to show off, afterwards explain all parts: + +For the examples below, let's create a small dataset consisting +of a directory with two parquet files: + +.. ipython:: python + + import tempfile + import pathlib + import pyarrow as pa + import pyarrow.parquet as pq + import numpy as np + + base = pathlib.Path(tempfile.gettempdir()) + (base / "parquet_dataset").mkdir(exist_ok=True) + + # creating an Arrow Table + table = pa.table({'a': range(10), 'b': np.random.randn(10), 'c': [1, 2] * 5}) + + # writing it into two parquet files + pq.write_table(table.slice(0, 5), base / "parquet_dataset/data1.parquet") + pq.write_table(table.slice(5, 10), base / "parquet_dataset/data2.parquet") + +Dataset discovery +~~~~~~~~~~~~~~~~~ + +A :class:`Dataset` object can be created with the :func:`dataset` function. We +can pass it the path to the directory containing the data files: + +.. ipython:: python + + import pyarrow.dataset as ds + dataset = ds.dataset(base / "parquet_dataset", format="parquet") + dataset + +In addition to searching a base directory, :func:`dataset` accepts a path to a +single file or a list of file paths. + +Creating a :class:`Dataset` object does not begin reading the data itself. If +needed, it only crawls the directory to find all the files: + +.. ipython:: python + + dataset.files + +... 
and infers the dataset's schema (by default from the first file): + +.. ipython:: python + + print(dataset.schema.to_string(show_field_metadata=False)) + +Using the :meth:`Dataset.to_table` method we can read the dataset (or a portion +of it) into a pyarrow Table (note that depending on the size of your dataset +this can require a lot of memory, see below on filtering / iterative loading): + +.. ipython:: python + + dataset.to_table() + # converting to pandas to see the contents of the scanned table + dataset.to_table().to_pandas() + +Reading different file formats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The above examples use Parquet files as dataset sources but the Dataset API +provides a consistent interface across multiple file formats and filesystems. +Currently, Parquet, ORC, Feather / Arrow IPC, and CSV file formats are +supported; more formats are planned in the future. + +If we save the table as Feather files instead of Parquet files: + +.. ipython:: python + + import pyarrow.feather as feather + + feather.write_feather(table, base / "data.feather") + +…then we can read the Feather file using the same functions, but with specifying +``format="feather"``: + +.. ipython:: python + + dataset = ds.dataset(base / "data.feather", format="feather") + dataset.to_table().to_pandas().head() + +Customizing file formats +~~~~~~~~~~~~~~~~~~~~~~~~ + +The format name as a string, like:: + + ds.dataset(..., format="parquet") + +is short hand for a default constructed :class:`ParquetFileFormat`:: + + ds.dataset(..., format=ds.ParquetFileFormat()) + +The :class:`FileFormat` objects can be customized using keywords. For example:: + + parquet_format = ds.ParquetFileFormat(read_options={'dictionary_columns': ['a']}) + ds.dataset(..., format=parquet_format) + +Will configure column ``"a"`` to be dictionary encoded on scan. + +Filtering data +-------------- + +To avoid reading all data when only needing a subset, the ``columns`` and +``filter`` keywords can be used. + +The ``columns`` keyword can be used to only read the specified columns: + +.. ipython:: python + + dataset = ds.dataset(base / "parquet_dataset", format="parquet") + dataset.to_table(columns=['a', 'b']).to_pandas() + +With the ``filter`` keyword, rows which do not match the filter predicate will +not be included in the returned table. The keyword expects a boolean +:class:`Expression` referencing at least one of the columns: + +.. ipython:: python + + dataset.to_table(filter=ds.field('a') >= 7).to_pandas() + dataset.to_table(filter=ds.field('c') == 2).to_pandas() + +The easiest way to construct those :class:`Expression` objects is by using the +:func:`field` helper function. Any column - not just partition columns - can be +referenced using the :func:`field` function (which creates a +:class:`FieldExpression`). Operator overloads are provided to compose filters +including the comparisons (equal, larger/less than, etc), set membership +testing, and boolean combinations (``&``, ``|``, ``~``): + +.. ipython:: python + + ds.field('a') != 3 + ds.field('a').isin([1, 2, 3]) + (ds.field('a') > ds.field('b')) & (ds.field('b') > 1) + +Note that :class:`Expression` objects can **not** be combined by python logical +operators ``and``, ``or`` and ``not``. + +Projecting columns +------------------ + +The ``columns`` keyword can be used to read a subset of the columns of the +dataset by passing it a list of column names. The keyword can also be used +for more complex projections in combination with expressions. 
+ +In this case, we pass it a dictionary with the keys being the resulting +column names and the values the expression that is used to construct the column +values: + +.. ipython:: python + + projection = { + "a_renamed": ds.field("a"), + "b_as_float32": ds.field("b").cast("float32"), + "c_1": ds.field("c") == 1, + } + dataset.to_table(columns=projection).to_pandas().head() + +The dictionary also determines the column selection (only the keys in the +dictionary will be present as columns in the resulting table). If you want +to include a derived column in *addition* to the existing columns, you can +build up the dictionary from the dataset schema: + +.. ipython:: python + + projection = {col: ds.field(col) for col in dataset.schema.names} + projection.update({"b_large": ds.field("b") > 1}) + dataset.to_table(columns=projection).to_pandas().head() + + +Reading partitioned data +------------------------ + +Above, a dataset consisting of a flat directory with files was shown. However, a +dataset can exploit a nested directory structure defining a partitioned dataset, +where the sub-directory names hold information about which subset of the data is +stored in that directory. + +For example, a dataset partitioned by year and month may look like on disk: + +.. code-block:: text + + dataset_name/ + year=2007/ + month=01/ + data0.parquet + data1.parquet + ... + month=02/ + data0.parquet + data1.parquet + ... + month=03/ + ... + year=2008/ + month=01/ + ... + ... + +The above partitioning scheme is using "/key=value/" directory names, as found +in Apache Hive. + +Let's create a small partitioned dataset. The :func:`~pyarrow.parquet.write_to_dataset` +function can write such hive-like partitioned datasets. + +.. ipython:: python + + table = pa.table({'a': range(10), 'b': np.random.randn(10), 'c': [1, 2] * 5, + 'part': ['a'] * 5 + ['b'] * 5}) + pq.write_to_dataset(table, str(base / "parquet_dataset_partitioned"), + partition_cols=['part']) + +The above created a directory with two subdirectories ("part=a" and "part=b"), +and the Parquet files written in those directories no longer include the "part" +column. + +Reading this dataset with :func:`dataset`, we now specify that the dataset +should use a hive-like partitioning scheme with the ``partitioning`` keyword: + +.. ipython:: python + + dataset = ds.dataset(str(base / "parquet_dataset_partitioned"), format="parquet", + partitioning="hive") + dataset.files + +Although the partition fields are not included in the actual Parquet files, +they will be added back to the resulting table when scanning this dataset: + +.. ipython:: python + + dataset.to_table().to_pandas().head(3) + +We can now filter on the partition keys, which avoids loading files +altogether if they do not match the filter: + +.. ipython:: python + + dataset.to_table(filter=ds.field("part") == "b").to_pandas() + + +Different partitioning schemes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The above example uses a hive-like directory scheme, such as "/year=2009/month=11/day=15". +We specified this passing the ``partitioning="hive"`` keyword. In this case, +the types of the partition keys are inferred from the file paths. + +It is also possible to explicitly define the schema of the partition keys +using the :func:`partitioning` function. For example: + +.. 
code-block:: python + + part = ds.partitioning( + pa.schema([("year", pa.int16()), ("month", pa.int8()), ("day", pa.int32())]), + flavor="hive" + ) + dataset = ds.dataset(..., partitioning=part) + +"Directory partitioning" is also supported, where the segments in the file path +represent the values of the partition keys without including the name (the +field name are implicit in the segment's index). For example, given field names +"year", "month", and "day", one path might be "/2019/11/15". + +Since the names are not included in the file paths, these must be specified +when constructing a directory partitioning: + +.. code-block:: python + + part = ds.partitioning(field_names=["year", "month", "day"]) + +Directory partitioning also supports providing a full schema rather than inferring +types from file paths. + + +Reading from cloud storage +-------------------------- + +In addition to local files, pyarrow also supports reading from cloud storage. +Currently, :class:`HDFS <pyarrow.fs.HadoopFileSystem>` and +:class:`Amazon S3-compatible storage <pyarrow.fs.S3FileSystem>` are supported. + +When passing a file URI, the file system will be inferred. For example, +specifying a S3 path: + +.. code-block:: python + + dataset = ds.dataset("s3://ursa-labs-taxi-data/", partitioning=["year", "month"]) + +Typically, you will want to customize the connection parameters, and then +a file system object can be created and passed to the ``filesystem`` keyword: + +.. code-block:: python + + from pyarrow import fs + + s3 = fs.S3FileSystem(region="us-east-2") + dataset = ds.dataset("ursa-labs-taxi-data/", filesystem=s3, + partitioning=["year", "month"]) + +The currently available classes are :class:`~pyarrow.fs.S3FileSystem` and +:class:`~pyarrow.fs.HadoopFileSystem`. See the :ref:`filesystem` docs for more +details. + + +Reading from Minio +------------------ + +In addition to cloud storage, pyarrow also supports reading from a +`MinIO <https://github.com/minio/minio>`_ object storage instance emulating S3 +APIs. Paired with `toxiproxy <https://github.com/shopify/toxiproxy>`_, this is +useful for testing or benchmarking. + +.. code-block:: python + + from pyarrow import fs + + # By default, MinIO will listen for unencrypted HTTP traffic. + minio = fs.S3FileSystem(scheme="http", endpoint="localhost:9000") + dataset = ds.dataset("ursa-labs-taxi-data/", filesystem=minio, + partitioning=["year", "month"]) + + +Working with Parquet Datasets +----------------------------- + +While the Datasets API provides a unified interface to different file formats, +some specific methods exist for Parquet Datasets. + +Some processing frameworks such as Dask (optionally) use a ``_metadata`` file +with partitioned datasets which includes information about the schema and the +row group metadata of the full dataset. Using such a file can give a more +efficient creation of a parquet Dataset, since it does not need to infer the +schema and crawl the directories for all Parquet files (this is especially the +case for filesystems where accessing files is expensive). The +:func:`parquet_dataset` function allows us to create a Dataset from a partitioned +dataset with a ``_metadata`` file: + +.. code-block:: python + + dataset = ds.parquet_dataset("/path/to/dir/_metadata") + +By default, the constructed :class:`Dataset` object for Parquet datasets maps +each fragment to a single Parquet file. If you want fragments mapping to each +row group of a Parquet file, you can use the ``split_by_row_group()`` method of +the fragments: + +.. 
code-block:: python + + fragments = list(dataset.get_fragments()) + fragments[0].split_by_row_group() + +This method returns a list of new Fragments mapping to each row group of +the original Fragment (Parquet file). Both ``get_fragments()`` and +``split_by_row_group()`` accept an optional filter expression to get a +filtered list of fragments. + + +Manual specification of the Dataset +----------------------------------- + +The :func:`dataset` function allows easy creation of a Dataset viewing a directory, +crawling all subdirectories for files and partitioning information. However +sometimes discovery is not required and the dataset's files and partitions +are already known (for example, when this information is stored in metadata). +In this case it is possible to create a Dataset explicitly without any +automatic discovery or inference. + +For the example here, we are going to use a dataset where the file names contain +additional partitioning information: + +.. ipython:: python + + # creating a dummy dataset: directory with two files + table = pa.table({'col1': range(3), 'col2': np.random.randn(3)}) + (base / "parquet_dataset_manual").mkdir(exist_ok=True) + pq.write_table(table, base / "parquet_dataset_manual" / "data_2018.parquet") + pq.write_table(table, base / "parquet_dataset_manual" / "data_2019.parquet") + +To create a Dataset from a list of files, we need to specify the paths, schema, +format, filesystem, and partition expressions manually: + +.. ipython:: python + + from pyarrow import fs + + schema = pa.schema([("year", pa.int64()), ("col1", pa.int64()), ("col2", pa.float64())]) + + dataset = ds.FileSystemDataset.from_paths( + ["data_2018.parquet", "data_2019.parquet"], schema=schema, format=ds.ParquetFileFormat(), + filesystem=fs.SubTreeFileSystem(str(base / "parquet_dataset_manual"), fs.LocalFileSystem()), + partitions=[ds.field('year') == 2018, ds.field('year') == 2019]) + +Since we specified the "partition expressions" for our files, this information +is materialized as columns when reading the data and can be used for filtering: + +.. ipython:: python + + dataset.to_table().to_pandas() + dataset.to_table(filter=ds.field('year') == 2019).to_pandas() + +Another benefit of manually listing the files is that the order of the files +controls the order of the data. When performing an ordered read (or a read to +a table) then the rows returned will match the order of the files given. This +only applies when the dataset is constructed with a list of files. There +are no order guarantees given when the files are instead discovered by scanning +a directory. + +Iterative (out of core or streaming) reads +------------------------------------------ + +The previous examples have demonstrated how to read the data into a table using :func:`~Dataset.to_table`. This is +useful if the dataset is small or there is only a small amount of data that needs to +be read. The dataset API contains additional methods to read and process large amounts +of data in a streaming fashion. + +The easiest way to do this is to use the method :meth:`Dataset.to_batches`. This +method returns an iterator of record batches. For example, we can use this method to +calculate the average of a column without loading the entire column into memory: + +.. 
ipython:: python + + import pyarrow.compute as pc + + col2_sum = 0 + count = 0 + for batch in dataset.to_batches(columns=["col2"], filter=~ds.field("col2").is_null()): + col2_sum += pc.sum(batch.column("col2")).as_py() + count += batch.num_rows + mean_a = col2_sum/count + +Customizing the batch size +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +An iterative read of a dataset is often called a "scan" of the dataset and pyarrow +uses an object called a :class:`Scanner` to do this. A Scanner is created for you +automatically by the :func:`~Dataset.to_table` and :func:`~Dataset.to_batches` method of the dataset. +Any arguments you pass to these methods will be passed on to the Scanner constructor. + +One of those parameters is the ``batch_size``. This controls the maximum size of the +batches returned by the scanner. Batches can still be smaller than the ``batch_size`` +if the dataset consists of small files or those files themselves consist of small +row groups. For example, a parquet file with 10,000 rows per row group will yield +batches with, at most, 10,000 rows unless the ``batch_size`` is set to a smaller value. + +The default batch size is one million rows and this is typically a good default but +you may want to customize it if you are reading a large number of columns. + +Writing Datasets +---------------- + +The dataset API also simplifies writing data to a dataset using :func:`write_dataset` . This can be useful when +you want to partition your data or you need to write a large amount of data. A +basic dataset write is similar to writing a table except that you specify a directory +instead of a filename. + +.. ipython:: python + + base = pathlib.Path(tempfile.gettempdir()) + dataset_root = base / "sample_dataset" + dataset_root.mkdir(exist_ok=True) + + table = pa.table({"a": range(10), "b": np.random.randn(10), "c": [1, 2] * 5}) + ds.write_dataset(table, dataset_root, format="parquet") + +The above example will create a single file named part-0.parquet in our sample_dataset +directory. + +.. warning:: + + If you run the example again it will replace the existing part-0.parquet file. + Appending files to an existing dataset requires specifying a new + ``basename_template`` for each call to ``ds.write_dataset`` + to avoid overwrite. + +Writing partitioned data +~~~~~~~~~~~~~~~~~~~~~~~~ + +A partitioning object can be used to specify how your output data should be partitioned. +This uses the same kind of partitioning objects we used for reading datasets. To write +our above data out to a partitioned directory we only need to specify how we want the +dataset to be partitioned. For example: + +.. ipython:: python + + part = ds.partitioning( + pa.schema([("c", pa.int16())]), flavor="hive" + ) + ds.write_dataset(table, dataset_root, format="parquet", partitioning=part) + +This will create two files. Half our data will be in the dataset_root/c=1 directory and +the other half will be in the dataset_root/c=2 directory. + +Writing large amounts of data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The above examples wrote data from a table. If you are writing a large amount of data +you may not be able to load everything into a single in-memory table. Fortunately, the +:func:`~Dataset.write_dataset` method also accepts an iterable of record batches. This makes it really +simple, for example, to repartition a large dataset without loading the entire dataset +into memory: + +.. 
ipython:: python + + old_part = ds.partitioning( + pa.schema([("c", pa.int16())]), flavor="hive" + ) + new_part = ds.partitioning( + pa.schema([("c", pa.int16())]), flavor=None + ) + input_dataset = ds.dataset(dataset_root, partitioning=old_part) + new_root = base / "repartitioned_dataset" + # A scanner can act as an iterator of record batches but you could also receive + # data from the network (e.g. via flight), from your own scanning, or from any + # other method that yields record batches. In addition, you can pass a dataset + # into write_dataset directly but this method is useful if you want to customize + # the scanner (e.g. to filter the input dataset or set a maximum batch size) + scanner = input_dataset.scanner(use_async=True) + + ds.write_dataset(scanner, new_root, format="parquet", partitioning=new_part) + +After the above example runs our data will be in dataset_root/1 and dataset_root/2 +directories. In this simple example we are not changing the structure of the data +(only the directory naming schema) but you could also use this mechnaism to change +which columns are used to partition the dataset. This is useful when you expect to +query your data in specific ways and you can utilize partitioning to reduce the +amount of data you need to read. + +Customizing & inspecting written files +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By default the dataset API will create files named "part-i.format" where "i" is a integer +generated during the write and "format" is the file format specified in the write_dataset +call. For simple datasets it may be possible to know which files will be created but for +larger or partitioned datasets it is not so easy. The ``file_visitor`` keyword can be used +to supply a visitor that will be called as each file is created: + +.. ipython:: python + + def file_visitor(written_file): + print(f"path={written_file.path}") + print(f"metadata={written_file.metadata}") + +.. ipython:: python + + ds.write_dataset(table, base / "dataset_visited", format="parquet", partitioning=part, + file_visitor=file_visitor) + +This will allow you to collect the filenames that belong to the dataset and store them elsewhere +which can be useful when you want to avoid scanning directories the next time you need to read +the data. It can also be used to generate the _metadata index file used by other tools such as +dask or spark to create an index of the dataset. + +Configuring format-specific parameters during a write +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In addition to the common options shared by all formats there are also format specific options +that are unique to a particular format. For example, to allow truncated timestamps while writing +Parquet files: + +.. ipython:: python + + dataset_root = base / "sample_dataset2" + dataset_root.mkdir(exist_ok=True) + + parquet_format = ds.ParquetFileFormat() + write_options = parquet_format.make_write_options(allow_truncated_timestamps=True) + ds.write_dataset(table, dataset_root, format="parquet", partitioning=part, + file_options=write_options) diff --git a/src/arrow/docs/source/python/extending.rst b/src/arrow/docs/source/python/extending.rst new file mode 100644 index 000000000..5e00e7905 --- /dev/null +++ b/src/arrow/docs/source/python/extending.rst @@ -0,0 +1,483 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. 
The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow +.. cpp:namespace:: arrow + +.. _extending: + +Using pyarrow from C++ and Cython Code +====================================== + +pyarrow provides both a Cython and C++ API, allowing your own native code +to interact with pyarrow objects. + +C++ API +------- + +.. default-domain:: cpp + +The Arrow C++ header files are bundled with a pyarrow installation. +To get the absolute path to this directory (like ``numpy.get_include()``), use: + +.. code-block:: python + + import pyarrow as pa + pa.get_include() + +Assuming the path above is on your compiler's include path, the pyarrow API +can be included using the following directive: + +.. code-block:: cpp + + #include <arrow/python/pyarrow.h> + +This will not include other parts of the Arrow API, which you will need +to include yourself (for example ``arrow/api.h``). + +When building C extensions that use the Arrow C++ libraries, you must add +appropriate linker flags. We have provided functions ``pyarrow.get_libraries`` +and ``pyarrow.get_library_dirs`` which return a list of library names and +likely library install locations (if you installed pyarrow with pip or +conda). These must be included when declaring your C extensions with +setuptools (see below). + +Initializing the API +~~~~~~~~~~~~~~~~~~~~ + +.. function:: int import_pyarrow() + + Initialize inner pointers of the pyarrow API. On success, 0 is + returned. Otherwise, -1 is returned and a Python exception is set. + + It is mandatory to call this function before calling any other function + in the pyarrow C++ API. Failing to do so will likely lead to crashes. + +Wrapping and Unwrapping +~~~~~~~~~~~~~~~~~~~~~~~ + +pyarrow provides the following functions to go back and forth between +Python wrappers (as exposed by the pyarrow Python API) and the underlying +C++ objects. + +.. function:: bool arrow::py::is_array(PyObject* obj) + + Return whether *obj* wraps an Arrow C++ :class:`Array` pointer; + in other words, whether *obj* is a :py:class:`pyarrow.Array` instance. + +.. function:: bool arrow::py::is_batch(PyObject* obj) + + Return whether *obj* wraps an Arrow C++ :class:`RecordBatch` pointer; + in other words, whether *obj* is a :py:class:`pyarrow.RecordBatch` instance. + +.. function:: bool arrow::py::is_buffer(PyObject* obj) + + Return whether *obj* wraps an Arrow C++ :class:`Buffer` pointer; + in other words, whether *obj* is a :py:class:`pyarrow.Buffer` instance. + +.. function:: bool arrow::py::is_data_type(PyObject* obj) + + Return whether *obj* wraps an Arrow C++ :class:`DataType` pointer; + in other words, whether *obj* is a :py:class:`pyarrow.DataType` instance. + +.. function:: bool arrow::py::is_field(PyObject* obj) + + Return whether *obj* wraps an Arrow C++ :class:`Field` pointer; + in other words, whether *obj* is a :py:class:`pyarrow.Field` instance. + +.. 
function:: bool arrow::py::is_scalar(PyObject* obj) + + Return whether *obj* wraps an Arrow C++ :class:`Scalar` pointer; + in other words, whether *obj* is a :py:class:`pyarrow.Scalar` instance. + +.. function:: bool arrow::py::is_schema(PyObject* obj) + + Return whether *obj* wraps an Arrow C++ :class:`Schema` pointer; + in other words, whether *obj* is a :py:class:`pyarrow.Schema` instance. + +.. function:: bool arrow::py::is_table(PyObject* obj) + + Return whether *obj* wraps an Arrow C++ :class:`Table` pointer; + in other words, whether *obj* is a :py:class:`pyarrow.Table` instance. + +.. function:: bool arrow::py::is_tensor(PyObject* obj) + + Return whether *obj* wraps an Arrow C++ :class:`Tensor` pointer; + in other words, whether *obj* is a :py:class:`pyarrow.Tensor` instance. + +.. function:: bool arrow::py::is_sparse_coo_tensor(PyObject* obj) + + Return whether *obj* wraps an Arrow C++ :type:`SparseCOOTensor` pointer; + in other words, whether *obj* is a :py:class:`pyarrow.SparseCOOTensor` instance. + +.. function:: bool arrow::py::is_sparse_csc_matrix(PyObject* obj) + + Return whether *obj* wraps an Arrow C++ :type:`SparseCSCMatrix` pointer; + in other words, whether *obj* is a :py:class:`pyarrow.SparseCSCMatrix` instance. + +.. function:: bool arrow::py::is_sparse_csf_tensor(PyObject* obj) + + Return whether *obj* wraps an Arrow C++ :type:`SparseCSFTensor` pointer; + in other words, whether *obj* is a :py:class:`pyarrow.SparseCSFTensor` instance. + +.. function:: bool arrow::py::is_sparse_csr_matrix(PyObject* obj) + + Return whether *obj* wraps an Arrow C++ :type:`SparseCSRMatrix` pointer; + in other words, whether *obj* is a :py:class:`pyarrow.SparseCSRMatrix` instance. + + +The following functions expect a pyarrow object, unwrap the underlying +Arrow C++ API pointer, and return it as a :class:`Result` object. An error +may be returned if the input object doesn't have the expected type. + +.. function:: Result<std::shared_ptr<Array>> arrow::py::unwrap_array(PyObject* obj) + + Unwrap and return the Arrow C++ :class:`Array` pointer from *obj*. + +.. function:: Result<std::shared_ptr<RecordBatch>> arrow::py::unwrap_batch(PyObject* obj) + + Unwrap and return the Arrow C++ :class:`RecordBatch` pointer from *obj*. + +.. function:: Result<std::shared_ptr<Buffer>> arrow::py::unwrap_buffer(PyObject* obj) + + Unwrap and return the Arrow C++ :class:`Buffer` pointer from *obj*. + +.. function:: Result<std::shared_ptr<DataType>> arrow::py::unwrap_data_type(PyObject* obj) + + Unwrap and return the Arrow C++ :class:`DataType` pointer from *obj*. + +.. function:: Result<std::shared_ptr<Field>> arrow::py::unwrap_field(PyObject* obj) + + Unwrap and return the Arrow C++ :class:`Field` pointer from *obj*. + +.. function:: Result<std::shared_ptr<Scalar>> arrow::py::unwrap_scalar(PyObject* obj) + + Unwrap and return the Arrow C++ :class:`Scalar` pointer from *obj*. + +.. function:: Result<std::shared_ptr<Schema>> arrow::py::unwrap_schema(PyObject* obj) + + Unwrap and return the Arrow C++ :class:`Schema` pointer from *obj*. + +.. function:: Result<std::shared_ptr<Table>> arrow::py::unwrap_table(PyObject* obj) + + Unwrap and return the Arrow C++ :class:`Table` pointer from *obj*. + +.. function:: Result<std::shared_ptr<Tensor>> arrow::py::unwrap_tensor(PyObject* obj) + + Unwrap and return the Arrow C++ :class:`Tensor` pointer from *obj*. + +.. 
function:: Result<std::shared_ptr<SparseCOOTensor>> arrow::py::unwrap_sparse_coo_tensor(PyObject* obj) + + Unwrap and return the Arrow C++ :type:`SparseCOOTensor` pointer from *obj*. + +.. function:: Result<std::shared_ptr<SparseCSCMatrix>> arrow::py::unwrap_sparse_csc_matrix(PyObject* obj) + + Unwrap and return the Arrow C++ :type:`SparseCSCMatrix` pointer from *obj*. + +.. function:: Result<std::shared_ptr<SparseCSFTensor>> arrow::py::unwrap_sparse_csf_tensor(PyObject* obj) + + Unwrap and return the Arrow C++ :type:`SparseCSFTensor` pointer from *obj*. + +.. function:: Result<std::shared_ptr<SparseCSRMatrix>> arrow::py::unwrap_sparse_csr_matrix(PyObject* obj) + + Unwrap and return the Arrow C++ :type:`SparseCSRMatrix` pointer from *obj*. + + +The following functions take an Arrow C++ API pointer and wrap it in a +pyarray object of the corresponding type. A new reference is returned. +On error, NULL is returned and a Python exception is set. + +.. function:: PyObject* arrow::py::wrap_array(const std::shared_ptr<Array>& array) + + Wrap the Arrow C++ *array* in a :py:class:`pyarrow.Array` instance. + +.. function:: PyObject* arrow::py::wrap_batch(const std::shared_ptr<RecordBatch>& batch) + + Wrap the Arrow C++ record *batch* in a :py:class:`pyarrow.RecordBatch` instance. + +.. function:: PyObject* arrow::py::wrap_buffer(const std::shared_ptr<Buffer>& buffer) + + Wrap the Arrow C++ *buffer* in a :py:class:`pyarrow.Buffer` instance. + +.. function:: PyObject* arrow::py::wrap_data_type(const std::shared_ptr<DataType>& data_type) + + Wrap the Arrow C++ *data_type* in a :py:class:`pyarrow.DataType` instance. + +.. function:: PyObject* arrow::py::wrap_field(const std::shared_ptr<Field>& field) + + Wrap the Arrow C++ *field* in a :py:class:`pyarrow.Field` instance. + +.. function:: PyObject* arrow::py::wrap_scalar(const std::shared_ptr<Scalar>& scalar) + + Wrap the Arrow C++ *scalar* in a :py:class:`pyarrow.Scalar` instance. + +.. function:: PyObject* arrow::py::wrap_schema(const std::shared_ptr<Schema>& schema) + + Wrap the Arrow C++ *schema* in a :py:class:`pyarrow.Schema` instance. + +.. function:: PyObject* arrow::py::wrap_table(const std::shared_ptr<Table>& table) + + Wrap the Arrow C++ *table* in a :py:class:`pyarrow.Table` instance. + +.. function:: PyObject* arrow::py::wrap_tensor(const std::shared_ptr<Tensor>& tensor) + + Wrap the Arrow C++ *tensor* in a :py:class:`pyarrow.Tensor` instance. + +.. function:: PyObject* arrow::py::wrap_sparse_coo_tensor(const std::shared_ptr<SparseCOOTensor>& sparse_tensor) + + Wrap the Arrow C++ *sparse_tensor* in a :py:class:`pyarrow.SparseCOOTensor` instance. + +.. function:: PyObject* arrow::py::wrap_sparse_csc_matrix(const std::shared_ptr<SparseCSCMatrix>& sparse_tensor) + + Wrap the Arrow C++ *sparse_tensor* in a :py:class:`pyarrow.SparseCSCMatrix` instance. + +.. function:: PyObject* arrow::py::wrap_sparse_csf_tensor(const std::shared_ptr<SparseCSFTensor>& sparse_tensor) + + Wrap the Arrow C++ *sparse_tensor* in a :py:class:`pyarrow.SparseCSFTensor` instance. + +.. function:: PyObject* arrow::py::wrap_sparse_csr_matrix(const std::shared_ptr<SparseCSRMatrix>& sparse_tensor) + + Wrap the Arrow C++ *sparse_tensor* in a :py:class:`pyarrow.SparseCSRMatrix` instance. + + +Cython API +---------- + +.. default-domain:: py + +The Cython API more or less mirrors the C++ API, but the calling convention +can be different as required by Cython. In Cython, you don't need to +initialize the API as that will be handled automatically by the ``cimport`` +directive. 
+ +.. note:: + Classes from the Arrow C++ API are renamed when exposed in Cython, to + avoid named clashes with the corresponding Python classes. For example, + C++ Arrow arrays have the ``CArray`` type and ``Array`` is the + corresponding Python wrapper class. + +Wrapping and Unwrapping +~~~~~~~~~~~~~~~~~~~~~~~ + +The following functions expect a pyarrow object, unwrap the underlying +Arrow C++ API pointer, and return it. NULL is returned (without setting +an exception) if the input is not of the right type. + +.. function:: pyarrow_unwrap_array(obj) -> shared_ptr[CArray] + + Unwrap the Arrow C++ :cpp:class:`Array` pointer from *obj*. + +.. function:: pyarrow_unwrap_batch(obj) -> shared_ptr[CRecordBatch] + + Unwrap the Arrow C++ :cpp:class:`RecordBatch` pointer from *obj*. + +.. function:: pyarrow_unwrap_buffer(obj) -> shared_ptr[CBuffer] + + Unwrap the Arrow C++ :cpp:class:`Buffer` pointer from *obj*. + +.. function:: pyarrow_unwrap_data_type(obj) -> shared_ptr[CDataType] + + Unwrap the Arrow C++ :cpp:class:`CDataType` pointer from *obj*. + +.. function:: pyarrow_unwrap_field(obj) -> shared_ptr[CField] + + Unwrap the Arrow C++ :cpp:class:`Field` pointer from *obj*. + +.. function:: pyarrow_unwrap_scalar(obj) -> shared_ptr[CScalar] + + Unwrap the Arrow C++ :cpp:class:`Scalar` pointer from *obj*. + +.. function:: pyarrow_unwrap_schema(obj) -> shared_ptr[CSchema] + + Unwrap the Arrow C++ :cpp:class:`Schema` pointer from *obj*. + +.. function:: pyarrow_unwrap_table(obj) -> shared_ptr[CTable] + + Unwrap the Arrow C++ :cpp:class:`Table` pointer from *obj*. + +.. function:: pyarrow_unwrap_tensor(obj) -> shared_ptr[CTensor] + + Unwrap the Arrow C++ :cpp:class:`Tensor` pointer from *obj*. + +.. function:: pyarrow_unwrap_sparse_coo_tensor(obj) -> shared_ptr[CSparseCOOTensor] + + Unwrap the Arrow C++ :cpp:type:`SparseCOOTensor` pointer from *obj*. + +.. function:: pyarrow_unwrap_sparse_csc_matrix(obj) -> shared_ptr[CSparseCSCMatrix] + + Unwrap the Arrow C++ :cpp:type:`SparseCSCMatrix` pointer from *obj*. + +.. function:: pyarrow_unwrap_sparse_csf_tensor(obj) -> shared_ptr[CSparseCSFTensor] + + Unwrap the Arrow C++ :cpp:type:`SparseCSFTensor` pointer from *obj*. + +.. function:: pyarrow_unwrap_sparse_csr_matrix(obj) -> shared_ptr[CSparseCSRMatrix] + + Unwrap the Arrow C++ :cpp:type:`SparseCSRMatrix` pointer from *obj*. + + +The following functions take a Arrow C++ API pointer and wrap it in a +pyarray object of the corresponding type. An exception is raised on error. + +.. function:: pyarrow_wrap_array(const shared_ptr[CArray]& array) -> object + + Wrap the Arrow C++ *array* in a Python :class:`pyarrow.Array` instance. + +.. function:: pyarrow_wrap_batch(const shared_ptr[CRecordBatch]& batch) -> object + + Wrap the Arrow C++ record *batch* in a Python :class:`pyarrow.RecordBatch` instance. + +.. function:: pyarrow_wrap_buffer(const shared_ptr[CBuffer]& buffer) -> object + + Wrap the Arrow C++ *buffer* in a Python :class:`pyarrow.Buffer` instance. + +.. function:: pyarrow_wrap_data_type(const shared_ptr[CDataType]& data_type) -> object + + Wrap the Arrow C++ *data_type* in a Python :class:`pyarrow.DataType` instance. + +.. function:: pyarrow_wrap_field(const shared_ptr[CField]& field) -> object + + Wrap the Arrow C++ *field* in a Python :class:`pyarrow.Field` instance. + +.. function:: pyarrow_wrap_resizable_buffer(const shared_ptr[CResizableBuffer]& buffer) -> object + + Wrap the Arrow C++ resizable *buffer* in a Python :class:`pyarrow.ResizableBuffer` instance. + +.. 
function:: pyarrow_wrap_scalar(const shared_ptr[CScalar]& scalar) -> object + + Wrap the Arrow C++ *scalar* in a Python :class:`pyarrow.Scalar` instance. + +.. function:: pyarrow_wrap_schema(const shared_ptr[CSchema]& schema) -> object + + Wrap the Arrow C++ *schema* in a Python :class:`pyarrow.Schema` instance. + +.. function:: pyarrow_wrap_table(const shared_ptr[CTable]& table) -> object + + Wrap the Arrow C++ *table* in a Python :class:`pyarrow.Table` instance. + +.. function:: pyarrow_wrap_tensor(const shared_ptr[CTensor]& tensor) -> object + + Wrap the Arrow C++ *tensor* in a Python :class:`pyarrow.Tensor` instance. + +.. function:: pyarrow_wrap_sparse_coo_tensor(const shared_ptr[CSparseCOOTensor]& sparse_tensor) -> object + + Wrap the Arrow C++ *COO sparse tensor* in a Python :class:`pyarrow.SparseCOOTensor` instance. + +.. function:: pyarrow_wrap_sparse_csc_matrix(const shared_ptr[CSparseCSCMatrix]& sparse_tensor) -> object + + Wrap the Arrow C++ *CSC sparse tensor* in a Python :class:`pyarrow.SparseCSCMatrix` instance. + +.. function:: pyarrow_wrap_sparse_csf_tensor(const shared_ptr[CSparseCSFTensor]& sparse_tensor) -> object + + Wrap the Arrow C++ *COO sparse tensor* in a Python :class:`pyarrow.SparseCSFTensor` instance. + +.. function:: pyarrow_wrap_sparse_csr_matrix(const shared_ptr[CSparseCSRMatrix]& sparse_tensor) -> object + + Wrap the Arrow C++ *CSR sparse tensor* in a Python :class:`pyarrow.SparseCSRMatrix` instance. + + +Example +~~~~~~~ + +The following Cython module shows how to unwrap a Python object and call +the underlying C++ object's API. + +.. code-block:: python + + # distutils: language=c++ + + from pyarrow.lib cimport * + + + def get_array_length(obj): + # Just an example function accessing both the pyarrow Cython API + # and the Arrow C++ API + cdef shared_ptr[CArray] arr = pyarrow_unwrap_array(obj) + if arr.get() == NULL: + raise TypeError("not an array") + return arr.get().length() + +To build this module, you will need a slightly customized ``setup.py`` file +(this is assuming the file above is named ``example.pyx``): + +.. code-block:: python + + from setuptools import setup + from Cython.Build import cythonize + + import os + import numpy as np + import pyarrow as pa + + + ext_modules = cythonize("example.pyx") + + for ext in ext_modules: + # The Numpy C headers are currently required + ext.include_dirs.append(np.get_include()) + ext.include_dirs.append(pa.get_include()) + ext.libraries.extend(pa.get_libraries()) + ext.library_dirs.extend(pa.get_library_dirs()) + + if os.name == 'posix': + ext.extra_compile_args.append('-std=c++11') + + # Try uncommenting the following line on Linux + # if you get weird linker errors or runtime crashes + # ext.define_macros.append(("_GLIBCXX_USE_CXX11_ABI", "0")) + + + setup(ext_modules=ext_modules) + + +Compile the extension: + +.. code-block:: bash + + python setup.py build_ext --inplace + +Building Extensions against PyPI Wheels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The Python wheels have the Arrow C++ libraries bundled in the top level +``pyarrow/`` install directory. On Linux and macOS, these libraries have an ABI +tag like ``libarrow.so.17`` which means that linking with ``-larrow`` using the +linker path provided by ``pyarrow.get_library_dirs()`` will not work right out +of the box. To fix this, you must run ``pyarrow.create_library_symlinks()`` +once as a user with write access to the directory where pyarrow is +installed. This function will attempt to create symlinks like +``pyarrow/libarrow.so``. 
For example: + +.. code-block:: bash + + pip install pyarrow + python -c "import pyarrow; pyarrow.create_library_symlinks()" + +Toolchain Compatibility (Linux) +""""""""""""""""""""""""""""""" + +The Python wheels for Linux are built using the +`PyPA manylinux images <https://quay.io/organization/pypa>`_ which use +the CentOS `devtoolset-8` or `devtoolset-9` depending on which manylinux +wheel version (2010 or 2014) is being used. In addition to the other notes +above, if you are compiling C++ using these shared libraries, you will need +to make sure you use a compatible toolchain as well or you might see a +segfault during runtime. + +Also, if you encounter errors when linking or loading the library, consider +setting the ``_GLIBCXX_USE_CXX11_ABI`` preprocessor macro to ``0`` +(for example by adding ``-D_GLIBCXX_USE_CXX11_ABI=0`` to ``CFLAGS``). diff --git a/src/arrow/docs/source/python/extending_types.rst b/src/arrow/docs/source/python/extending_types.rst new file mode 100644 index 000000000..689724a4a --- /dev/null +++ b/src/arrow/docs/source/python/extending_types.rst @@ -0,0 +1,324 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow +.. _extending_types: + +Extending pyarrow +================= + +Controlling conversion to pyarrow.Array with the ``__arrow_array__`` protocol +----------------------------------------------------------------------------- + +The :func:`pyarrow.array` function has built-in support for Python sequences, +numpy arrays and pandas 1D objects (Series, Index, Categorical, ..) to convert +those to Arrow arrays. This can be extended for other array-like objects +by implementing the ``__arrow_array__`` method (similar to numpy's ``__array__`` +protocol). + +For example, to support conversion of your duck array class to an Arrow array, +define the ``__arrow_array__`` method to return an Arrow array:: + + class MyDuckArray: + + ... + + def __arrow_array__(self, type=None): + # convert the underlying array values to a pyarrow Array + import pyarrow + return pyarrow.array(..., type=type) + +The ``__arrow_array__`` method takes an optional `type` keyword which is passed +through from :func:`pyarrow.array`. The method is allowed to return either +a :class:`~pyarrow.Array` or a :class:`~pyarrow.ChunkedArray`. + + +Defining extension types ("user-defined types") +----------------------------------------------- + +Arrow has the notion of extension types in the metadata specification as a +possibility to extend the built-in types. This is done by annotating any of the +built-in Arrow logical types (the "storage type") with a custom type name and +optional serialized representation ("ARROW:extension:name" and +"ARROW:extension:metadata" keys in the Field’s custom_metadata of an IPC +message). 
+See the :ref:`format_metadata_extension_types` section of the metadata +specification for more details. + +Pyarrow allows you to define such extension types from Python. + +There are currently two ways: + +* Subclassing :class:`PyExtensionType`: the (de)serialization is based on pickle. + This is a good option for an extension type that is only used from Python. +* Subclassing :class:`ExtensionType`: this allows to give a custom + Python-independent name and serialized metadata, that can potentially be + recognized by other (non-Python) Arrow implementations such as PySpark. + +For example, we could define a custom UUID type for 128-bit numbers which can +be represented as ``FixedSizeBinary`` type with 16 bytes. +Using the first approach, we create a ``UuidType`` subclass, and implement the +``__reduce__`` method to ensure the class can be properly pickled:: + + class UuidType(pa.PyExtensionType): + + def __init__(self): + pa.PyExtensionType.__init__(self, pa.binary(16)) + + def __reduce__(self): + return UuidType, () + +This can now be used to create arrays and tables holding the extension type:: + + >>> uuid_type = UuidType() + >>> uuid_type.extension_name + 'arrow.py_extension_type' + >>> uuid_type.storage_type + FixedSizeBinaryType(fixed_size_binary[16]) + + >>> import uuid + >>> storage_array = pa.array([uuid.uuid4().bytes for _ in range(4)], pa.binary(16)) + >>> arr = pa.ExtensionArray.from_storage(uuid_type, storage_array) + >>> arr + <pyarrow.lib.ExtensionArray object at 0x7f75c2f300a0> + [ + A6861959108644B797664AEEE686B682, + 718747F48E5F4058A7261E2B6B228BE8, + 7FE201227D624D96A5CD8639DEF2A68B, + C6CA8C7F95744BFD9462A40B3F57A86C + ] + +This array can be included in RecordBatches, sent over IPC and received in +another Python process. The custom UUID type will be preserved there, as long +as the definition of the class is available (the type can be unpickled). + +For example, creating a RecordBatch and writing it to a stream using the +IPC protocol:: + + >>> batch = pa.RecordBatch.from_arrays([arr], ["ext"]) + >>> sink = pa.BufferOutputStream() + >>> with pa.RecordBatchStreamWriter(sink, batch.schema) as writer: + ... writer.write_batch(batch) + >>> buf = sink.getvalue() + +and then reading it back yields the proper type:: + + >>> with pa.ipc.open_stream(buf) as reader: + ... result = reader.read_all() + >>> result.column('ext').type + UuidType(extension<arrow.py_extension_type>) + +We can define the same type using the other option:: + + class UuidType(pa.ExtensionType): + + def __init__(self): + pa.ExtensionType.__init__(self, pa.binary(16), "my_package.uuid") + + def __arrow_ext_serialize__(self): + # since we don't have a parameterized type, we don't need extra + # metadata to be deserialized + return b'' + + @classmethod + def __arrow_ext_deserialize__(self, storage_type, serialized): + # return an instance of this subclass given the serialized + # metadata. + return UuidType() + +This is a slightly longer implementation (you need to implement the special +methods ``__arrow_ext_serialize__`` and ``__arrow_ext_deserialize__``), and the +extension type needs to be registered to be received through IPC (using +:func:`register_extension_type`), but it has +now a unique name:: + + >>> uuid_type = UuidType() + >>> uuid_type.extension_name + 'my_package.uuid' + + >>> pa.register_extension_type(uuid_type) + +The receiving application doesn't need to be Python but can still recognize +the extension type as a "uuid" type, if it has implemented its own extension +type to receive it. 
+If the type is not registered in the receiving application, it will fall back +to the storage type. + +Parameterized extension type +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The above example used a fixed storage type with no further metadata. But +more flexible, parameterized extension types are also possible. + +The example given here implements an extension type for the `pandas "period" +data type <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-span-representation>`__, +representing time spans (e.g., a frequency of a day, a month, a quarter, etc). +It is stored as an int64 array which is interpreted as the number of time spans +of the given frequency since 1970. + +:: + + class PeriodType(pa.ExtensionType): + + def __init__(self, freq): + # attributes need to be set first before calling + # super init (as that calls serialize) + self._freq = freq + pa.ExtensionType.__init__(self, pa.int64(), 'my_package.period') + + @property + def freq(self): + return self._freq + + def __arrow_ext_serialize__(self): + return "freq={}".format(self.freq).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + # return an instance of this subclass given the serialized + # metadata. + serialized = serialized.decode() + assert serialized.startswith("freq=") + freq = serialized.split('=')[1] + return PeriodType(freq) + +Here, we ensure to store all information in the serialized metadata that is +needed to reconstruct the instance (in the ``__arrow_ext_deserialize__`` class +method), in this case the frequency string. + +Note that, once created, the data type instance is considered immutable. If, +in the example above, the ``freq`` parameter would change after instantiation, +the reconstruction of the type instance after IPC will be incorrect. +In the example above, the ``freq`` parameter is therefore stored in a private +attribute with a public read-only property to access it. + +Parameterized extension types are also possible using the pickle-based type +subclassing :class:`PyExtensionType`. The equivalent example for the period +data type from above would look like:: + + class PeriodType(pa.PyExtensionType): + + def __init__(self, freq): + self._freq = freq + pa.PyExtensionType.__init__(self, pa.int64()) + + @property + def freq(self): + return self._freq + + def __reduce__(self): + return PeriodType, (self.freq,) + +Also the storage type does not need to be fixed but can be parameterized. + +Custom extension array class +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By default, all arrays with an extension type are constructed or deserialized into +a built-in :class:`ExtensionArray` object. Nevertheless, one could want to subclass +:class:`ExtensionArray` in order to add some custom logic specific to the extension +type. Arrow allows to do so by adding a special method ``__arrow_ext_class__`` to the +definition of the extension type. + +For instance, let us consider the example from the `Numpy Quickstart <https://docs.scipy.org/doc/numpy-1.13.0/user/quickstart.html>`_ of points in 3D space. 
+We can store these as a fixed-size list, where we wish to be able to extract +the data as a 2-D Numpy array ``(N, 3)`` without any copy:: + + class Point3DArray(pa.ExtensionArray): + def to_numpy_array(self): + return self.storage.flatten().to_numpy().reshape((-1, 3)) + + + class Point3DType(pa.PyExtensionType): + def __init__(self): + pa.PyExtensionType.__init__(self, pa.list_(pa.float32(), 3)) + + def __reduce__(self): + return Point3DType, () + + def __arrow_ext_class__(self): + return Point3DArray + +Arrays built using this extension type now have the expected custom array class:: + + >>> storage = pa.array([[1, 2, 3], [4, 5, 6]], pa.list_(pa.float32(), 3)) + >>> arr = pa.ExtensionArray.from_storage(Point3DType(), storage) + >>> arr + <__main__.Point3DArray object at 0x7f40dea80670> + [ + [ + 1, + 2, + 3 + ], + [ + 4, + 5, + 6 + ] + ] + +The additional methods in the extension class are then available to the user:: + + >>> arr.to_numpy_array() + array([[1., 2., 3.], + [4., 5., 6.]], dtype=float32) + + +This array can be sent over IPC, received in another Python process, and the custom +extension array class will be preserved (as long as the definitions of the classes above +are available). + +The same ``__arrow_ext_class__`` specialization can be used with custom types defined +by subclassing :class:`ExtensionType`. + + +Conversion to pandas +~~~~~~~~~~~~~~~~~~~~ + +The conversion to pandas (in :meth:`Table.to_pandas`) of columns with an +extension type can controlled in case there is a corresponding +`pandas extension array <https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extension-types>`__ +for your extension type. + +For this, the :meth:`ExtensionType.to_pandas_dtype` method needs to be +implemented, and should return a ``pandas.api.extensions.ExtensionDtype`` +subclass instance. + +Using the pandas period type from above as example, this would look like:: + + class PeriodType(pa.ExtensionType): + ... + + def to_pandas_dtype(self): + import pandas as pd + return pd.PeriodDtype(freq=self.freq) + +Secondly, the pandas ``ExtensionDtype`` on its turn needs to have the +``__from_arrow__`` method implemented: a method that given a pyarrow Array +or ChunkedArray of the extension type can construct the corresponding +pandas ``ExtensionArray``. This method should have the following signature:: + + + class MyExtensionDtype(pd.api.extensions.ExtensionDtype): + ... + + def __from_arrow__(self, array: pyarrow.Array/ChunkedArray) -> pandas.ExtensionArray: + ... + +This way, you can control the conversion of a pyarrow ``Array`` of your pyarrow +extension type to a pandas ``ExtensionArray`` that can be stored in a DataFrame. diff --git a/src/arrow/docs/source/python/feather.rst b/src/arrow/docs/source/python/feather.rst new file mode 100644 index 000000000..026ea987a --- /dev/null +++ b/src/arrow/docs/source/python/feather.rst @@ -0,0 +1,109 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. 
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow + +.. _feather: + +Feather File Format +=================== + +Feather is a portable file format for storing Arrow tables or data frames (from +languages like Python or R) that utilizes the :ref:`Arrow IPC format <ipc>` +internally. Feather was created early in the Arrow project as a proof of +concept for fast, language-agnostic data frame storage for Python (pandas) and +R. There are two file format versions for Feather: + +* Version 2 (V2), the default version, which is exactly represented as the + Arrow IPC file format on disk. V2 files support storing all Arrow data types + as well as compression with LZ4 or ZSTD. V2 was first made available in + Apache Arrow 0.17.0. +* Version 1 (V1), a legacy version available starting in 2016, replaced by + V2. V1 files are distinct from Arrow IPC files and lack many features, such + as the ability to store all Arrow data types. V1 files also lack compression + support. We intend to maintain read support for V1 for the foreseeable + future. + +The ``pyarrow.feather`` module contains the read and write functions for the +format. :func:`~pyarrow.feather.write_feather` accepts either a +:class:`~pyarrow.Table` or ``pandas.DataFrame`` object: + +.. code-block:: python + + import pyarrow.feather as feather + feather.write_feather(df, '/path/to/file') + +:func:`~pyarrow.feather.read_feather` reads a Feather file as a +``pandas.DataFrame``. :func:`~pyarrow.feather.read_table` reads a Feather file +as a :class:`~pyarrow.Table`. Internally, :func:`~pyarrow.feather.read_feather` +simply calls :func:`~pyarrow.feather.read_table` and the result is converted to +pandas: + +.. code-block:: python + + # Result is pandas.DataFrame + read_df = feather.read_feather('/path/to/file') + + # Result is pyarrow.Table + read_arrow = feather.read_table('/path/to/file') + +These functions can read and write with file-paths or file-like objects. For +example: + +.. code-block:: python + + with open('/path/to/file', 'wb') as f: + feather.write_feather(df, f) + + with open('/path/to/file', 'rb') as f: + read_df = feather.read_feather(f) + +A file input to ``read_feather`` must support seeking. + +Using Compression +----------------- + +As of Apache Arrow version 0.17.0, Feather V2 files (the default version) +support two fast compression libraries, LZ4 (using the frame format) and +ZSTD. LZ4 is used by default if it is available (which it should be if you +obtained pyarrow through a normal package manager): + +.. code-block:: python + + # Uses LZ4 by default + feather.write_feather(df, file_path) + + # Use LZ4 explicitly + feather.write_feather(df, file_path, compression='lz4') + + # Use ZSTD + feather.write_feather(df, file_path, compression='zstd') + + # Do not compress + feather.write_feather(df, file_path, compression='uncompressed') + +Note that the default LZ4 compression generally yields much smaller files +without sacrificing much read or write performance. In some instances, +LZ4-compressed files may be faster to read and write than uncompressed due to +reduced disk IO requirements. + +Writing Version 1 (V1) Files +---------------------------- + +For compatibility with libraries without support for Version 2 files, you can +write the version 1 format by passing ``version=1`` to ``write_feather``. 
We +intend to maintain read support for V1 for the foreseeable future. diff --git a/src/arrow/docs/source/python/filesystems.rst b/src/arrow/docs/source/python/filesystems.rst new file mode 100644 index 000000000..1ddb4dfa2 --- /dev/null +++ b/src/arrow/docs/source/python/filesystems.rst @@ -0,0 +1,305 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _filesystem: + +.. currentmodule:: pyarrow.fs + +Filesystem Interface +==================== + +PyArrow comes with an abstract filesystem interface, as well as concrete +implementations for various storage types. + +The filesystem interface provides input and output streams as well as +directory operations. A simplified view of the underlying data +storage is exposed. Data paths are represented as *abstract paths*, which +are ``/``-separated, even on Windows, and shouldn't include special path +components such as ``.`` and ``..``. Symbolic links, if supported by the +underlying storage, are automatically dereferenced. Only basic +:class:`metadata <FileInfo>` about file entries, such as the file size +and modification time, is made available. + +The core interface is represented by the base class :class:`FileSystem`. + +Pyarrow implements natively the following filesystem subclasses: + +* :ref:`filesystem-localfs` (:class:`LocalFileSystem`) +* :ref:`filesystem-s3` (:class:`S3FileSystem`) +* :ref:`filesystem-hdfs` (:class:`HadoopFileSystem`) + +It is also possible to use your own fsspec-compliant filesystem with pyarrow functionalities as described in the section :ref:`filesystem-fsspec`. + + +.. _filesystem-usage: + +Usage +----- + +Instantiating a filesystem +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A FileSystem object can be created with one of the constructors (and check the +respective constructor for its options):: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + +or alternatively inferred from a URI:: + + >>> s3, path = fs.FileSystem.from_uri("s3://my-bucket") + >>> s3 + <pyarrow._s3fs.S3FileSystem at 0x7f6760cbf4f0> + >>> path + 'my-bucket' + + +Reading and writing files +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Several of the IO-related functions in PyArrow accept either a URI (and infer +the filesystem) or an explicit ``filesystem`` argument to specify the filesystem +to read or write from. For example, the :meth:`pyarrow.parquet.read_table` +function can be used in the following ways:: + + import pyarrow.parquet as pq + + # using a URI -> filesystem is inferred + pq.read_table("s3://my-bucket/data.parquet") + # using a path and filesystem + s3 = fs.S3FileSystem(..) 
+ pq.read_table("my-bucket/data.parquet", filesystem=s3) + +The filesystem interface further allows to open files for reading (input) or +writing (output) directly, which can be combined with functions that work with +file-like objects. For example:: + + import pyarrow as pa + + local = fs.LocalFileSystem() + + with local.open_output_stream("test.arrow") as file: + with pa.RecordBatchFileWriter(file, table.schema) as writer: + writer.write_table(table) + + +Listing files +~~~~~~~~~~~~~ + +Inspecting the directories and files on a filesystem can be done with the +:meth:`FileSystem.get_file_info` method. To list the contents of a directory, +use the :class:`FileSelector` object to specify the selection:: + + >>> local.get_file_info(fs.FileSelector("dataset/", recursive=True)) + [<FileInfo for 'dataset/part=B': type=FileType.Directory>, + <FileInfo for 'dataset/part=B/data0.parquet': type=FileType.File, size=1564>, + <FileInfo for 'dataset/part=A': type=FileType.Directory>, + <FileInfo for 'dataset/part=A/data0.parquet': type=FileType.File, size=1564>] + +This returns a list of :class:`FileInfo` objects, containing information about +the type (file or directory), the size, the date last modified, etc. + +You can also get this information for a single explicit path (or list of +paths):: + + >>> local.get_file_info('test.arrow') + <FileInfo for 'test.arrow': type=FileType.File, size=3250> + + >>> local.get_file_info('non_existent') + <FileInfo for 'non_existent': type=FileType.NotFound> + + +.. _filesystem-localfs: + +Local FS +-------- + +The :class:`LocalFileSystem` allows you to access files on the local machine. + +Example how to write to disk and read it back:: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + >>> with local.open_output_stream('/tmp/pyarrowtest.dat') as stream: + stream.write(b'data') + 4 + >>> with local.open_input_stream('/tmp/pyarrowtest.dat') as stream: + print(stream.readall()) + b'data' + + +.. _filesystem-s3: + +S3 +-- + +PyArrow implements natively a S3 filesystem for S3 compatible storage. + +The :class:`S3FileSystem` constructor has several options to configure the S3 +connection (e.g. credentials, the region, an endpoint override, etc). In +addition, the constructor will also inspect configured S3 credentials as +supported by AWS (for example the ``AWS_ACCESS_KEY_ID`` and +``AWS_SECRET_ACCESS_KEY`` environment variables). + +Example how you can read contents from a S3 bucket:: + + >>> from pyarrow import fs + >>> s3 = fs.S3FileSystem(region='eu-west-3') + + # List all contents in a bucket, recursively + >>> s3.get_file_info(fs.FileSelector('my-test-bucket', recursive=True)) + [<FileInfo for 'my-test-bucket/File1': type=FileType.File, size=10>, + <FileInfo for 'my-test-bucket/File5': type=FileType.File, size=10>, + <FileInfo for 'my-test-bucket/Dir1': type=FileType.Directory>, + <FileInfo for 'my-test-bucket/Dir2': type=FileType.Directory>, + <FileInfo for 'my-test-bucket/EmptyDir': type=FileType.Directory>, + <FileInfo for 'my-test-bucket/Dir1/File2': type=FileType.File, size=11>, + <FileInfo for 'my-test-bucket/Dir1/Subdir': type=FileType.Directory>, + <FileInfo for 'my-test-bucket/Dir2/Subdir': type=FileType.Directory>, + <FileInfo for 'my-test-bucket/Dir2/Subdir/File3': type=FileType.File, size=10>] + + # Open a file for reading and download its contents + >>> f = s3.open_input_stream('my-test-bucket/Dir1/File2') + >>> f.readall() + b'some data' + +.. 
seealso:: + + See the `AWS docs <https://docs.aws.amazon.com/sdk-for-cpp/v1/developer-guide/credentials.html>`__ + for the different ways to configure the AWS credentials. + + +.. _filesystem-hdfs: + +Hadoop Distributed File System (HDFS) +------------------------------------- + +PyArrow comes with bindings to the Hadoop File System (based on C++ bindings +using ``libhdfs``, a JNI-based interface to the Java Hadoop client). You connect +using the :class:`HadoopFileSystem` constructor: + +.. code-block:: python + + from pyarrow import fs + hdfs = fs.HadoopFileSystem(host, port, user=user, kerb_ticket=ticket_cache_path) + +The ``libhdfs`` library is loaded **at runtime** (rather than at link / library +load time, since the library may not be in your LD_LIBRARY_PATH), and relies on +some environment variables. + +* ``HADOOP_HOME``: the root of your installed Hadoop distribution. Often has + `lib/native/libhdfs.so`. + +* ``JAVA_HOME``: the location of your Java SDK installation. + +* ``ARROW_LIBHDFS_DIR`` (optional): explicit location of ``libhdfs.so`` if it is + installed somewhere other than ``$HADOOP_HOME/lib/native``. + +* ``CLASSPATH``: must contain the Hadoop jars. You can set these using: + + .. code-block:: shell + + export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` + # or on Windows + %HADOOP_HOME%/bin/hadoop classpath --glob > %CLASSPATH% + + In contrast to the legacy HDFS filesystem with ``pa.hdfs.connect``, setting + ``CLASSPATH`` is not optional (pyarrow will not attempt to infer it). + +.. _filesystem-fsspec: + +Using fsspec-compatible filesystems with Arrow +---------------------------------------------- + +The filesystems mentioned above are natively supported by Arrow C++ / PyArrow. +The Python ecosystem, however, also has several filesystem packages. Those +packages following the `fsspec`_ interface can be used in PyArrow as well. + +Functions accepting a filesystem object will also accept an fsspec subclass. +For example:: + + # creating an fsspec-based filesystem object for Google Cloud Storage + import gcsfs + fs = gcsfs.GCSFileSystem(project='my-google-project') + + # using this to read a partitioned dataset + import pyarrow.dataset as ds + ds.dataset("data/", filesystem=fs) + +Similarly for Azure Blob Storage:: + + import adlfs + # ... load your credentials and configure the filesystem + fs = adlfs.AzureBlobFileSystem(account_name=account_name, account_key=account_key) + + import pyarrow.dataset as ds + ds.dataset("mycontainer/data/", filesystem=fs) + +Under the hood, the fsspec filesystem object is wrapped into a python-based +PyArrow filesystem (:class:`PyFileSystem`) using :class:`FSSpecHandler`. +You can also manually do this to get an object with the PyArrow FileSystem +interface:: + + from pyarrow.fs import PyFileSystem, FSSpecHandler + pa_fs = PyFileSystem(FSSpecHandler(fs)) + +Then all the functionalities of :class:`FileSystem` are accessible:: + + # write data + with pa_fs.open_output_stream('mycontainer/pyarrowtest.dat') as stream: + stream.write(b'data') + + # read data + with pa_fs.open_input_stream('mycontainer/pyarrowtest.dat') as stream: + print(stream.readall()) + #b'data' + + # read a partitioned dataset + ds.dataset("data/", filesystem=pa_fs) + + +Using Arrow filesystems with fsspec +----------------------------------- + +The Arrow FileSystem interface has a limited, developer-oriented API surface. +This is sufficient for basic interactions and for using this with +Arrow's IO functionality. 
On the other hand, the `fsspec`_ interface provides +a very large API with many helper methods. If you want to use those, or if you +need to interact with a package that expects fsspec-compatible filesystem +objects, you can wrap an Arrow FileSystem object with fsspec. + +Starting with ``fsspec`` version 2021.09, the ``ArrowFSWrapper`` can be used +for this:: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + >>> from fsspec.implementations.arrow import ArrowFSWrapper + >>> local_fsspec = ArrowFSWrapper(local) + +The resulting object now has an fsspec-compatible interface, while being backed +by the Arrow FileSystem under the hood. +Example usage to create a directory and file, and list the content:: + + >>> local_fsspec.mkdir("./test") + >>> local_fsspec.touch("./test/file.txt") + >>> local_fsspec.ls("./test/") + ['./test/file.txt'] + +For more information, see the `fsspec`_ documentation. + + +.. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/ diff --git a/src/arrow/docs/source/python/filesystems_deprecated.rst b/src/arrow/docs/source/python/filesystems_deprecated.rst new file mode 100644 index 000000000..04887e977 --- /dev/null +++ b/src/arrow/docs/source/python/filesystems_deprecated.rst @@ -0,0 +1,95 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Filesystem Interface (legacy) +============================= + +.. warning:: + This section documents the deprecated filesystem layer. You should + use the :ref:`new filesystem layer <filesystem>` instead. + +.. _hdfs: + +Hadoop File System (HDFS) +------------------------- + +PyArrow comes with bindings to a C++-based interface to the Hadoop File +System. You connect like so: + +.. code-block:: python + + import pyarrow as pa + fs = pa.hdfs.connect(host, port, user=user, kerb_ticket=ticket_cache_path) + with fs.open(path, 'rb') as f: + # Do something with f + +By default, ``pyarrow.hdfs.HadoopFileSystem`` uses libhdfs, a JNI-based +interface to the Java Hadoop client. This library is loaded **at runtime** +(rather than at link / library load time, since the library may not be in your +LD_LIBRARY_PATH), and relies on some environment variables. + +* ``HADOOP_HOME``: the root of your installed Hadoop distribution. Often has + `lib/native/libhdfs.so`. + +* ``JAVA_HOME``: the location of your Java SDK installation. + +* ``ARROW_LIBHDFS_DIR`` (optional): explicit location of ``libhdfs.so`` if it is + installed somewhere other than ``$HADOOP_HOME/lib/native``. + +* ``CLASSPATH``: must contain the Hadoop jars. You can set these using: + +.. code-block:: shell + + export CLASSPATH=`$HADOOP_HOME/bin/hdfs classpath --glob` + +If ``CLASSPATH`` is not set, then it will be set automatically if the +``hadoop`` executable is in your system path, or if ``HADOOP_HOME`` is set. 
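If you prefer to configure these environment variables from Python itself, a
minimal sketch could look like the following (the paths, host and user below
are placeholders for your own installation, and this assumes the ``hadoop``
executable is on your path so that ``CLASSPATH`` is filled in automatically as
described above):

.. code-block:: python

    import os
    import pyarrow as pa

    # Placeholder locations; adjust to your Hadoop / Java installation
    os.environ['HADOOP_HOME'] = '/opt/hadoop'
    os.environ['JAVA_HOME'] = '/usr/lib/jvm/default-java'
    os.environ['ARROW_LIBHDFS_DIR'] = '/opt/hadoop/lib/native'

    # libhdfs is loaded when the first connection is made, so the
    # variables must be set before this call
    fs = pa.hdfs.connect('namenode-host', 8020, user='hadoop-user')
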
+ +You can also use libhdfs3, a thirdparty C++ library for HDFS from Pivotal Labs: + +.. code-block:: python + + fs = pa.hdfs.connect(host, port, user=user, kerb_ticket=ticket_cache_path, + driver='libhdfs3') + +HDFS API +~~~~~~~~ + +.. currentmodule:: pyarrow + +.. autosummary:: + :toctree: generated/ + + hdfs.connect + HadoopFileSystem.cat + HadoopFileSystem.chmod + HadoopFileSystem.chown + HadoopFileSystem.delete + HadoopFileSystem.df + HadoopFileSystem.disk_usage + HadoopFileSystem.download + HadoopFileSystem.exists + HadoopFileSystem.get_capacity + HadoopFileSystem.get_space_used + HadoopFileSystem.info + HadoopFileSystem.ls + HadoopFileSystem.mkdir + HadoopFileSystem.open + HadoopFileSystem.rename + HadoopFileSystem.rm + HadoopFileSystem.upload + HdfsFile diff --git a/src/arrow/docs/source/python/getstarted.rst b/src/arrow/docs/source/python/getstarted.rst new file mode 100644 index 000000000..36e4707ad --- /dev/null +++ b/src/arrow/docs/source/python/getstarted.rst @@ -0,0 +1,145 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _getstarted: + +Getting Started +=============== + +Arrow manages data in arrays (:class:`pyarrow.Array`), which can be +grouped in tables (:class:`pyarrow.Table`) to represent columns of data +in tabular data. + +Arrow also provides support for various formats to get those tabular +data in and out of disk and networks. Most commonly used formats are +Parquet (:ref:`parquet`) and the IPC format (:ref:`ipc`). + +Creating Arrays and Tables +-------------------------- + +Arrays in Arrow are collections of data of uniform type. That allows +Arrow to use the best performing implementation to store the data and +perform computations on it. So each array is meant to have data and +a type + +.. ipython:: python + + import pyarrow as pa + + days = pa.array([1, 12, 17, 23, 28], type=pa.int8()) + +Multiple arrays can be combined in tables to form the columns +in tabular data when attached to a column name + +.. ipython:: python + + months = pa.array([1, 3, 5, 7, 1], type=pa.int8()) + years = pa.array([1990, 2000, 1995, 2000, 1995], type=pa.int16()) + + birthdays_table = pa.table([days, months, years], + names=["days", "months", "years"]) + + birthdays_table + +See :ref:`data` for more details. + +Saving and Loading Tables +------------------------- + +Once you have tabular data, Arrow provides out of the box +the features to save and restore that data for common formats +like Parquet: + +.. ipython:: python + + import pyarrow.parquet as pq + + pq.write_table(birthdays_table, 'birthdays.parquet') + +Once you have your data on disk, loading it back is a single function call, +and Arrow is heavily optimized for memory and speed so loading +data will be as quick as possible + +.. 
ipython:: python + + reloaded_birthdays = pq.read_table('birthdays.parquet') + + reloaded_birthdays + +Saving and loading back data in arrow is usually done through +:ref:`Parquet <parquet>`, :ref:`IPC format <ipc>` (:ref:`feather`), +:ref:`CSV <csv>` or :ref:`Line-Delimited JSON <json>` formats. + +Performing Computations +----------------------- + +Arrow ships with a bunch of compute functions that can be applied +to its arrays and tables, so through the compute functions +it's possible to apply transformations to the data + +.. ipython:: python + + import pyarrow.compute as pc + + pc.value_counts(birthdays_table["years"]) + +See :ref:`compute` for a list of available compute functions and +how to use them. + +Working with large data +----------------------- + +Arrow also provides the :class:`pyarrow.dataset` API to work with +large data, which will handle for you partitioning of your data in +smaller chunks + +.. ipython:: python + + import pyarrow.dataset as ds + + ds.write_dataset(birthdays_table, "savedir", format="parquet", + partitioning=ds.partitioning( + pa.schema([birthdays_table.schema.field("years")]) + )) + +Loading back the partitioned dataset will detect the chunks + +.. ipython:: python + + birthdays_dataset = ds.dataset("savedir", format="parquet", partitioning=["years"]) + + birthdays_dataset.files + +and will lazily load chunks of data only when iterating over them + +.. ipython:: python + + import datetime + + current_year = datetime.datetime.utcnow().year + for table_chunk in birthdays_dataset.to_batches(): + print("AGES", pc.subtract(current_year, table_chunk["years"])) + +For further details on how to work with big datasets, how to filter them, +how to project them, etc., refer to :ref:`dataset` documentation. + +Continuining from here +---------------------- + +For digging further into Arrow, you might want to read the +:doc:`PyArrow Documentation <./index>` itself or the +`Arrow Python Cookbook <https://arrow.apache.org/cookbook/py/>`_ diff --git a/src/arrow/docs/source/python/getting_involved.rst b/src/arrow/docs/source/python/getting_involved.rst new file mode 100644 index 000000000..7159bdfb0 --- /dev/null +++ b/src/arrow/docs/source/python/getting_involved.rst @@ -0,0 +1,35 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Getting Involved +================ + +Right now the primary audience for Apache Arrow are the developers of data +systems; most people will use Apache Arrow indirectly through systems that use +it for internal data handling and interoperating with other Arrow-enabled +systems. 
+ +Even if you do not plan to contribute to Apache Arrow itself or Arrow +integrations in other projects, we'd be happy to have you involved: + + * Join the mailing list: send an email to + `dev-subscribe@arrow.apache.org <mailto:dev-subscribe@arrow.apache.org>`_. + Share your ideas and use cases for the project or read through the + `Archive <http://mail-archives.apache.org/mod_mbox/arrow-dev/>`_. + * Follow our activity on `JIRA <https://issues.apache.org/jira/browse/ARROW>`_ + * Learn the `Format / Specification + <https://github.com/apache/arrow/tree/master/format>`_ diff --git a/src/arrow/docs/source/python/index.rst b/src/arrow/docs/source/python/index.rst new file mode 100644 index 000000000..0ffa40545 --- /dev/null +++ b/src/arrow/docs/source/python/index.rst @@ -0,0 +1,62 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +PyArrow - Apache Arrow Python bindings +====================================== + +This is the documentation of the Python API of Apache Arrow. + +Apache Arrow is a development platform for in-memory analytics. +It contains a set of technologies that enable big data systems to store, process and move data fast. + +See the :doc:`parent documentation <../index>` for additional details on +the Arrow Project itself, on the Arrow format and the other language bindings. + +The Arrow Python bindings (also named "PyArrow") have first-class integration +with NumPy, pandas, and built-in Python objects. They are based on the C++ +implementation of Arrow. + +Here will we detail the usage of the Python API for Arrow and the leaf +libraries that add additional functionality such as reading Apache Parquet +files into Arrow structures. + +.. toctree:: + :maxdepth: 2 + + install + getstarted + data + compute + memory + ipc + filesystems + filesystems_deprecated + plasma + numpy + pandas + timestamps + csv + feather + json + parquet + dataset + cuda + extending_types + extending + api + getting_involved + benchmarks diff --git a/src/arrow/docs/source/python/install.rst b/src/arrow/docs/source/python/install.rst new file mode 100644 index 000000000..3c23d8a0f --- /dev/null +++ b/src/arrow/docs/source/python/install.rst @@ -0,0 +1,90 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. 
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Installing PyArrow +================== + +System Compatibility +-------------------- + +PyArrow is regularly built and tested on Windows, macOS and various Linux +distributions (including Ubuntu 16.04, Ubuntu 18.04). We strongly recommend +using a 64-bit system. + +Python Compatibility +-------------------- + +PyArrow is currently compatible with Python 3.6, 3.7, 3.8, and 3.9. + +Using Conda +----------- + +Install the latest version of PyArrow from +`conda-forge <https://conda-forge.org/>`_ using `Conda <https://conda.io>`_: + +.. code-block:: bash + + conda install -c conda-forge pyarrow + +Using Pip +--------- + +Install the latest version from `PyPI <https://pypi.org/>`_ (Windows, Linux, +and macOS): + +.. code-block:: bash + + pip install pyarrow + +If you encounter any importing issues of the pip wheels on Windows, you may +need to install the `Visual C++ Redistributable for Visual Studio 2015 +<https://www.microsoft.com/en-us/download/details.aspx?id=48145>`_. + +.. warning:: + On Linux, you will need pip >= 19.0 to detect the prebuilt binary packages. + +Installing from source +---------------------- + +See :ref:`python-development`. + +Installing Nightly Packages +--------------------------- + +.. warning:: + These packages are not official releases. Use them at your own risk. + +PyArrow has nightly wheels and conda packages for testing purposes. + +These may be suitable for downstream libraries in their continuous integration +setup to maintain compatibility with the upcoming PyArrow features, +deprecations and/or feature removals. + +Install the development version of PyArrow from `arrow-nightlies +<https://anaconda.org/arrow-nightlies/pyarrow>`_ conda channel: + +.. code-block:: bash + + conda install -c arrow-nightlies pyarrow + +Install the development version from an `alternative PyPI +<https://gemfury.com/arrow-nightlies>`_ index: + +.. code-block:: bash + + pip install --extra-index-url https://pypi.fury.io/arrow-nightlies/ \ + --prefer-binary --pre pyarrow diff --git a/src/arrow/docs/source/python/ipc.rst b/src/arrow/docs/source/python/ipc.rst new file mode 100644 index 000000000..0ba557b64 --- /dev/null +++ b/src/arrow/docs/source/python/ipc.rst @@ -0,0 +1,385 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow + +.. 
_ipc: + +Streaming, Serialization, and IPC +================================= + +Writing and Reading Streams +--------------------------- + +Arrow defines two types of binary formats for serializing record batches: + +* **Streaming format**: for sending an arbitrary length sequence of record + batches. The format must be processed from start to end, and does not support + random access + +* **File or Random Access format**: for serializing a fixed number of record + batches. Supports random access, and thus is very useful when used with + memory maps + +To follow this section, make sure to first read the section on :ref:`Memory and +IO <io>`. + +Using streams +~~~~~~~~~~~~~ + +First, let's create a small record batch: + +.. ipython:: python + + import pyarrow as pa + + data = [ + pa.array([1, 2, 3, 4]), + pa.array(['foo', 'bar', 'baz', None]), + pa.array([True, None, False, True]) + ] + + batch = pa.record_batch(data, names=['f0', 'f1', 'f2']) + batch.num_rows + batch.num_columns + +Now, we can begin writing a stream containing some number of these batches. For +this we use :class:`~pyarrow.RecordBatchStreamWriter`, which can write to a +writeable ``NativeFile`` object or a writeable Python object. For convenience, +this one can be created with :func:`~pyarrow.ipc.new_stream`: + +.. ipython:: python + + sink = pa.BufferOutputStream() + + with pa.ipc.new_stream(sink, batch.schema) as writer: + for i in range(5): + writer.write_batch(batch) + +Here we used an in-memory Arrow buffer stream (``sink``), +but this could have been a socket or some other IO sink. + +When creating the ``StreamWriter``, we pass the schema, since the schema +(column names and types) must be the same for all of the batches sent in this +particular stream. Now we can do: + +.. ipython:: python + + buf = sink.getvalue() + buf.size + +Now ``buf`` contains the complete stream as an in-memory byte buffer. We can +read such a stream with :class:`~pyarrow.RecordBatchStreamReader` or the +convenience function ``pyarrow.ipc.open_stream``: + +.. ipython:: python + + with pa.ipc.open_stream(buf) as reader: + schema = reader.schema + batches = [b for b in reader] + + schema + len(batches) + +We can check the returned batches are the same as the original input: + +.. ipython:: python + + batches[0].equals(batch) + +An important point is that if the input source supports zero-copy reads +(e.g. like a memory map, or ``pyarrow.BufferReader``), then the returned +batches are also zero-copy and do not allocate any new memory on read. + +Writing and Reading Random Access Files +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The :class:`~pyarrow.RecordBatchFileWriter` has the same API as +:class:`~pyarrow.RecordBatchStreamWriter`. You can create one with +:func:`~pyarrow.ipc.new_file`: + +.. ipython:: python + + sink = pa.BufferOutputStream() + + with pa.ipc.new_file(sink, batch.schema) as writer: + for i in range(10): + writer.write_batch(batch) + + buf = sink.getvalue() + buf.size + +The difference between :class:`~pyarrow.RecordBatchFileReader` and +:class:`~pyarrow.RecordBatchStreamReader` is that the input source must have a +``seek`` method for random access. The stream reader only requires read +operations. We can also use the :func:`~pyarrow.ipc.open_file` method to open a file: + +.. 
ipython:: python + + with pa.ipc.open_file(buf) as reader: + num_record_batches = reader.num_record_batches + b = reader.get_batch(3) + +Because we have access to the entire payload, we know the number of record +batches in the file, and can read any at random. + +.. ipython:: python + + num_record_batches + b.equals(batch) + +Reading from Stream and File Format for pandas +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The stream and file reader classes have a special ``read_pandas`` method to +simplify reading multiple record batches and converting them to a single +DataFrame output: + +.. ipython:: python + + with pa.ipc.open_file(buf) as reader: + df = reader.read_pandas() + + df[:5] + +Efficiently Writing and Reading Arrow Data +------------------------------------------ + +Being optimized for zero copy and memory mapped data, Arrow allows to easily +read and write arrays consuming the minimum amount of resident memory. + +When writing and reading raw Arrow data, we can use the Arrow File Format +or the Arrow Streaming Format. + +To dump an array to file, you can use the :meth:`~pyarrow.ipc.new_file` +which will provide a new :class:`~pyarrow.ipc.RecordBatchFileWriter` instance +that can be used to write batches of data to that file. + +For example to write an array of 10M integers, we could write it in 1000 chunks +of 10000 entries: + +.. ipython:: python + + BATCH_SIZE = 10000 + NUM_BATCHES = 1000 + + schema = pa.schema([pa.field('nums', pa.int32())]) + + with pa.OSFile('bigfile.arrow', 'wb') as sink: + with pa.ipc.new_file(sink, schema) as writer: + for row in range(NUM_BATCHES): + batch = pa.record_batch([pa.array(range(BATCH_SIZE), type=pa.int32())], schema) + writer.write(batch) + +record batches support multiple columns, so in practice we always write the +equivalent of a :class:`~pyarrow.Table`. + +Writing in batches is effective because we in theory need to keep in memory only +the current batch we are writing. But when reading back, we can be even more effective +by directly mapping the data from disk and avoid allocating any new memory on read. + +Under normal conditions, reading back our file will consume a few hundred megabytes +of memory: + +.. ipython:: python + + with pa.OSFile('bigfile.arrow', 'rb') as source: + loaded_array = pa.ipc.open_file(source).read_all() + + print("LEN:", len(loaded_array)) + print("RSS: {}MB".format(pa.total_allocated_bytes() >> 20)) + +To more efficiently read big data from disk, we can memory map the file, so that +Arrow can directly reference the data mapped from disk and avoid having to +allocate its own memory. +In such case the operating system will be able to page in the mapped memory +lazily and page it out without any write back cost when under pressure, +allowing to more easily read arrays bigger than the total memory. + +.. ipython:: python + + with pa.memory_map('bigfile.arrow', 'rb') as source: + loaded_array = pa.ipc.open_file(source).read_all() + print("LEN:", len(loaded_array)) + print("RSS: {}MB".format(pa.total_allocated_bytes() >> 20)) + +.. note:: + + Other high level APIs like :meth:`~pyarrow.parquet.read_table` also provide a + ``memory_map`` option. But in those cases, the memory mapping can't help with + reducing resident memory consumption. See :ref:`parquet_mmap` for details. + +Arbitrary Object Serialization +------------------------------ + +.. warning:: + + The custom serialization functionality is deprecated in pyarrow 2.0, and + will be removed in a future version. 
+ + While the serialization functions in this section utilize the Arrow stream + protocol internally, they do not produce data that is compatible with the + above ``ipc.open_file`` and ``ipc.open_stream`` functions. + + For arbitrary objects, you can use the standard library ``pickle`` + functionality instead. For pyarrow objects, you can use the IPC + serialization format through the ``pyarrow.ipc`` module, as explained + above. + + PyArrow serialization was originally meant to provide a higher-performance + alternative to ``pickle`` thanks to zero-copy semantics. However, + ``pickle`` protocol 5 gained support for zero-copy using out-of-band + buffers, and can be used instead for similar benefits. + +In ``pyarrow`` we are able to serialize and deserialize many kinds of Python +objects. As an example, consider a dictionary containing NumPy arrays: + +.. ipython:: python + + import numpy as np + + data = { + i: np.random.randn(500, 500) + for i in range(100) + } + +We use the ``pyarrow.serialize`` function to convert this data to a byte +buffer: + +.. ipython:: python + :okwarning: + + buf = pa.serialize(data).to_buffer() + type(buf) + buf.size + +``pyarrow.serialize`` creates an intermediate object which can be converted to +a buffer (the ``to_buffer`` method) or written directly to an output stream. + +``pyarrow.deserialize`` converts a buffer-like object back to the original +Python object: + +.. ipython:: python + :okwarning: + + restored_data = pa.deserialize(buf) + restored_data[0] + + +Serializing Custom Data Types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If an unrecognized data type is encountered when serializing an object, +``pyarrow`` will fall back on using ``pickle`` for converting that type to a +byte string. There may be a more efficient way, though. + +Consider a class with two members, one of which is a NumPy array: + +.. code-block:: python + + class MyData: + def __init__(self, name, data): + self.name = name + self.data = data + +We write functions to convert this to and from a dictionary with simpler types: + +.. code-block:: python + + def _serialize_MyData(val): + return {'name': val.name, 'data': val.data} + + def _deserialize_MyData(data): + return MyData(data['name'], data['data'] + +then, we must register these functions in a ``SerializationContext`` so that +``MyData`` can be recognized: + +.. code-block:: python + + context = pa.SerializationContext() + context.register_type(MyData, 'MyData', + custom_serializer=_serialize_MyData, + custom_deserializer=_deserialize_MyData) + +Lastly, we use this context as an additional argument to ``pyarrow.serialize``: + +.. code-block:: python + + buf = pa.serialize(val, context=context).to_buffer() + restored_val = pa.deserialize(buf, context=context) + +The ``SerializationContext`` also has convenience methods ``serialize`` and +``deserialize``, so these are equivalent statements: + +.. code-block:: python + + buf = context.serialize(val).to_buffer() + restored_val = context.deserialize(buf) + +Component-based Serialization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For serializing Python objects containing some number of NumPy arrays, Arrow +buffers, or other data types, it may be desirable to transport their serialized +representation without having to produce an intermediate copy using the +``to_buffer`` method. To motivate this, suppose we have a list of NumPy arrays: + +.. 
ipython:: python + + import numpy as np + data = [np.random.randn(10, 10) for i in range(5)] + +The call ``pa.serialize(data)`` does not copy the memory inside each of these +NumPy arrays. This serialized representation can be then decomposed into a +dictionary containing a sequence of ``pyarrow.Buffer`` objects containing +metadata for each array and references to the memory inside the arrays. To do +this, use the ``to_components`` method: + +.. ipython:: python + :okwarning: + + serialized = pa.serialize(data) + components = serialized.to_components() + +The particular details of the output of ``to_components`` are not too +important. The objects in the ``'data'`` field are ``pyarrow.Buffer`` objects, +which are zero-copy convertible to Python ``memoryview`` objects: + +.. ipython:: python + + memoryview(components['data'][0]) + +A memoryview can be converted back to a Arrow ``Buffer`` with +``pyarrow.py_buffer``: + +.. ipython:: python + + mv = memoryview(components['data'][0]) + buf = pa.py_buffer(mv) + +An object can be reconstructed from its component-based representation using +``deserialize_components``: + +.. ipython:: python + :okwarning: + + restored_data = pa.deserialize_components(components) + restored_data[0] + +``deserialize_components`` is also available as a method on +``SerializationContext`` objects. diff --git a/src/arrow/docs/source/python/json.rst b/src/arrow/docs/source/python/json.rst new file mode 100644 index 000000000..99ecbc19a --- /dev/null +++ b/src/arrow/docs/source/python/json.rst @@ -0,0 +1,117 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow.json +.. _json: + +Reading JSON files +================== + +Arrow supports reading columnar data from line-delimited JSON files. +In this context, a JSON file consists of multiple JSON objects, one per line, +representing individual data rows. For example, this file represents +two rows of data with four columns "a", "b", "c", "d": + +.. code-block:: json + + {"a": 1, "b": 2.0, "c": "foo", "d": false} + {"a": 4, "b": -5.5, "c": null, "d": true} + +The features currently offered are the following: + +* multi-threaded or single-threaded reading +* automatic decompression of input files (based on the filename extension, + such as ``my_data.json.gz``) +* sophisticated type inference (see below) + +.. note:: + Currently only the line-delimited JSON format is supported. + + +Usage +----- + +JSON reading functionality is available through the :mod:`pyarrow.json` module. 
+In many cases, you will simply call the :func:`read_json` function +with the file path you want to read from:: + + >>> from pyarrow import json + >>> fn = 'my_data.json' + >>> table = json.read_json(fn) + >>> table + pyarrow.Table + a: int64 + b: double + c: string + d: bool + >>> table.to_pandas() + a b c d + 0 1 2.0 foo False + 1 4 -5.5 None True + + +Automatic Type Inference +------------------------ + +Arrow :ref:`data types <data.types>` are inferred from the JSON types and +values of each column: + +* JSON null values convert to the ``null`` type, but can fall back to any + other type. +* JSON booleans convert to ``bool_``. +* JSON numbers convert to ``int64``, falling back to ``float64`` if a + non-integer is encountered. +* JSON strings of the kind "YYYY-MM-DD" and "YYYY-MM-DD hh:mm:ss" convert + to ``timestamp[s]``, falling back to ``utf8`` if a conversion error occurs. +* JSON arrays convert to a ``list`` type, and inference proceeds recursively + on the JSON arrays' values. +* Nested JSON objects convert to a ``struct`` type, and inference proceeds + recursively on the JSON objects' values. + +Thus, reading this JSON file: + +.. code-block:: json + + {"a": [1, 2], "b": {"c": true, "d": "1991-02-03"}} + {"a": [3, 4, 5], "b": {"c": false, "d": "2019-04-01"}} + +returns the following data:: + + >>> table = json.read_json("my_data.json") + >>> table + pyarrow.Table + a: list<item: int64> + child 0, item: int64 + b: struct<c: bool, d: timestamp[s]> + child 0, c: bool + child 1, d: timestamp[s] + >>> table.to_pandas() + a b + 0 [1, 2] {'c': True, 'd': 1991-02-03 00:00:00} + 1 [3, 4, 5] {'c': False, 'd': 2019-04-01 00:00:00} + + +Customized parsing +------------------ + +To alter the default parsing settings in case of reading JSON files with an +unusual structure, you should create a :class:`ParseOptions` instance +and pass it to :func:`read_json`. For example, you can pass an explicit +:ref:`schema <data.schema>` in order to bypass automatic type inference. + +Similarly, you can choose performance settings by passing a +:class:`ReadOptions` instance to :func:`read_json`. diff --git a/src/arrow/docs/source/python/memory.rst b/src/arrow/docs/source/python/memory.rst new file mode 100644 index 000000000..4febc668c --- /dev/null +++ b/src/arrow/docs/source/python/memory.rst @@ -0,0 +1,298 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow +.. highlight:: python + +.. 
_io: + +======================== +Memory and IO Interfaces +======================== + +This section will introduce you to the major concepts in PyArrow's memory +management and IO systems: + +* Buffers +* Memory pools +* File-like and stream-like objects + +Referencing and Allocating Memory +================================= + +pyarrow.Buffer +-------------- + +The :class:`Buffer` object wraps the C++ :cpp:class:`arrow::Buffer` type +which is the primary tool for memory management in Apache Arrow in C++. It permits +higher-level array classes to safely interact with memory which they may or may +not own. ``arrow::Buffer`` can be zero-copy sliced to permit Buffers to cheaply +reference other Buffers, while preserving memory lifetime and clean +parent-child relationships. + +There are many implementations of ``arrow::Buffer``, but they all provide a +standard interface: a data pointer and length. This is similar to Python's +built-in `buffer protocol` and ``memoryview`` objects. + +A :class:`Buffer` can be created from any Python object implementing +the buffer protocol by calling the :func:`py_buffer` function. Let's consider +a bytes object: + +.. ipython:: python + + import pyarrow as pa + + data = b'abcdefghijklmnopqrstuvwxyz' + buf = pa.py_buffer(data) + buf + buf.size + +Creating a Buffer in this way does not allocate any memory; it is a zero-copy +view on the memory exported from the ``data`` bytes object. + +External memory, under the form of a raw pointer and size, can also be +referenced using the :func:`foreign_buffer` function. + +Buffers can be used in circumstances where a Python buffer or memoryview is +required, and such conversions are zero-copy: + +.. ipython:: python + + memoryview(buf) + +The Buffer's :meth:`~Buffer.to_pybytes` method converts the Buffer's data to a +Python bytestring (thus making a copy of the data): + +.. ipython:: python + + buf.to_pybytes() + +Memory Pools +------------ + +All memory allocations and deallocations (like ``malloc`` and ``free`` in C) +are tracked in an instance of :class:`MemoryPool`. This means that we can +then precisely track amount of memory that has been allocated: + +.. ipython:: python + + pa.total_allocated_bytes() + +Let's allocate a resizable :class:`Buffer` from the default pool: + +.. ipython:: python + + buf = pa.allocate_buffer(1024, resizable=True) + pa.total_allocated_bytes() + buf.resize(2048) + pa.total_allocated_bytes() + +The default allocator requests memory in a minimum increment of 64 bytes. If +the buffer is garbaged-collected, all of the memory is freed: + +.. ipython:: python + + buf = None + pa.total_allocated_bytes() + +Besides the default built-in memory pool, there may be additional memory pools +to choose (such as `mimalloc <https://github.com/microsoft/mimalloc>`_) +from depending on how Arrow was built. One can get the backend +name for a memory pool:: + + >>> pa.default_memory_pool().backend_name + 'jemalloc' + +.. seealso:: + :ref:`API documentation for memory pools <api.memory_pool>`. + +.. seealso:: + On-GPU buffers using Arrow's optional :doc:`CUDA integration <cuda>`. + + +Input and Output +================ + +.. 
_io.native_file: + +The Arrow C++ libraries have several abstract interfaces for different kinds of +IO objects: + +* Read-only streams +* Read-only files supporting random access +* Write-only streams +* Write-only files supporting random access +* File supporting reads, writes, and random access + +In the interest of making these objects behave more like Python's built-in +``file`` objects, we have defined a :class:`~pyarrow.NativeFile` base class +which implements the same API as regular Python file objects. + +:class:`~pyarrow.NativeFile` has some important features which make it +preferable to using Python files with PyArrow where possible: + +* Other Arrow classes can access the internal C++ IO objects natively, and do + not need to acquire the Python GIL +* Native C++ IO may be able to do zero-copy IO, such as with memory maps + +There are several kinds of :class:`~pyarrow.NativeFile` options available: + +* :class:`~pyarrow.OSFile`, a native file that uses your operating system's + file descriptors +* :class:`~pyarrow.MemoryMappedFile`, for reading (zero-copy) and writing with + memory maps +* :class:`~pyarrow.BufferReader`, for reading :class:`~pyarrow.Buffer` objects + as a file +* :class:`~pyarrow.BufferOutputStream`, for writing data in-memory, producing a + Buffer at the end +* :class:`~pyarrow.FixedSizeBufferWriter`, for writing data into an already + allocated Buffer +* :class:`~pyarrow.HdfsFile`, for reading and writing data to the Hadoop Filesystem +* :class:`~pyarrow.PythonFile`, for interfacing with Python file objects in C++ +* :class:`~pyarrow.CompressedInputStream` and + :class:`~pyarrow.CompressedOutputStream`, for on-the-fly compression or + decompression to/from another stream + +There are also high-level APIs to make instantiating common kinds of streams +easier. + +High-Level API +-------------- + +Input Streams +~~~~~~~~~~~~~ + +The :func:`~pyarrow.input_stream` function allows creating a readable +:class:`~pyarrow.NativeFile` from various kinds of sources. + +* If passed a :class:`~pyarrow.Buffer` or a ``memoryview`` object, a + :class:`~pyarrow.BufferReader` will be returned: + + .. ipython:: python + + buf = memoryview(b"some data") + stream = pa.input_stream(buf) + stream.read(4) + +* If passed a string or file path, it will open the given file on disk + for reading, creating a :class:`~pyarrow.OSFile`. Optionally, the file + can be compressed: if its filename ends with a recognized extension + such as ``.gz``, its contents will automatically be decompressed on + reading. + + .. ipython:: python + + import gzip + with gzip.open('example.gz', 'wb') as f: + f.write(b'some data\n' * 3) + + stream = pa.input_stream('example.gz') + stream.read() + +* If passed a Python file object, it will wrapped in a :class:`PythonFile` + such that the Arrow C++ libraries can read data from it (at the expense + of a slight overhead). + +Output Streams +~~~~~~~~~~~~~~ + +:func:`~pyarrow.output_stream` is the equivalent function for output streams +and allows creating a writable :class:`~pyarrow.NativeFile`. It has the same +features as explained above for :func:`~pyarrow.input_stream`, such as being +able to write to buffers or do on-the-fly compression. + +.. 
ipython:: python + + with pa.output_stream('example1.dat') as stream: + stream.write(b'some data') + + f = open('example1.dat', 'rb') + f.read() + + +On-Disk and Memory Mapped Files +------------------------------- + +PyArrow includes two ways to interact with data on disk: standard operating +system-level file APIs, and memory-mapped files. In regular Python we can +write: + +.. ipython:: python + + with open('example2.dat', 'wb') as f: + f.write(b'some example data') + +Using pyarrow's :class:`~pyarrow.OSFile` class, you can write: + +.. ipython:: python + + with pa.OSFile('example3.dat', 'wb') as f: + f.write(b'some example data') + +For reading files, you can use :class:`~pyarrow.OSFile` or +:class:`~pyarrow.MemoryMappedFile`. The difference between these is that +:class:`~pyarrow.OSFile` allocates new memory on each read, like Python file +objects. In reads from memory maps, the library constructs a buffer referencing +the mapped memory without any memory allocation or copying: + +.. ipython:: python + + file_obj = pa.OSFile('example2.dat') + mmap = pa.memory_map('example3.dat') + file_obj.read(4) + mmap.read(4) + +The ``read`` method implements the standard Python file ``read`` API. To read +into Arrow Buffer objects, use ``read_buffer``: + +.. ipython:: python + + mmap.seek(0) + buf = mmap.read_buffer(4) + print(buf) + buf.to_pybytes() + +Many tools in PyArrow, particular the Apache Parquet interface and the file and +stream messaging tools, are more efficient when used with these ``NativeFile`` +types than with normal Python file objects. + +.. ipython:: python + :suppress: + + buf = mmap = file_obj = None + !rm example.dat + !rm example2.dat + +In-Memory Reading and Writing +----------------------------- + +To assist with serialization and deserialization of in-memory data, we have +file interfaces that can read and write to Arrow Buffers. + +.. ipython:: python + + writer = pa.BufferOutputStream() + writer.write(b'hello, friends') + + buf = writer.getvalue() + buf + buf.size + reader = pa.BufferReader(buf) + reader.seek(7) + reader.read(7) + +These have similar semantics to Python's built-in ``io.BytesIO``. diff --git a/src/arrow/docs/source/python/numpy.rst b/src/arrow/docs/source/python/numpy.rst new file mode 100644 index 000000000..870f9cb73 --- /dev/null +++ b/src/arrow/docs/source/python/numpy.rst @@ -0,0 +1,75 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _numpy_interop: + +NumPy Integration +================= + +PyArrow allows converting back and forth from +`NumPy <https://www.numpy.org/>`_ arrays to Arrow :ref:`Arrays <data.array>`. + +NumPy to Arrow +-------------- + +To convert a NumPy array to Arrow, one can simply call the :func:`pyarrow.array` +factory function. + +.. 
code-block:: pycon + + >>> import numpy as np + >>> import pyarrow as pa + >>> data = np.arange(10, dtype='int16') + >>> arr = pa.array(data) + >>> arr + <pyarrow.lib.Int16Array object at 0x7fb1d1e6ae58> + [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9 + ] + +Converting from NumPy supports a wide range of input dtypes, including +structured dtypes or strings. + +Arrow to NumPy +-------------- + +In the reverse direction, it is possible to produce a view of an Arrow Array +for use with NumPy using the :meth:`~pyarrow.Array.to_numpy` method. +This is limited to primitive types for which NumPy has the same physical +representation as Arrow, and assuming the Arrow data has no nulls. + +.. code-block:: pycon + + >>> import numpy as np + >>> import pyarrow as pa + >>> arr = pa.array([4, 5, 6], type=pa.int32()) + >>> view = arr.to_numpy() + >>> view + array([4, 5, 6], dtype=int32) + +For more complex data types, you have to use the :meth:`~pyarrow.Array.to_pandas` +method (which will construct a Numpy array with Pandas semantics for, e.g., +representation of null values). diff --git a/src/arrow/docs/source/python/pandas.rst b/src/arrow/docs/source/python/pandas.rst new file mode 100644 index 000000000..aa030cfff --- /dev/null +++ b/src/arrow/docs/source/python/pandas.rst @@ -0,0 +1,480 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _pandas_interop: + +Pandas Integration +================== + +To interface with `pandas <https://pandas.pydata.org/>`_, PyArrow provides +various conversion routines to consume pandas structures and convert back +to them. + +.. note:: + While pandas uses NumPy as a backend, it has enough peculiarities + (such as a different type system, and support for null values) that this + is a separate topic from :ref:`numpy_interop`. + +To follow examples in this document, make sure to run: + +.. ipython:: python + + import pandas as pd + import pyarrow as pa + +DataFrames +---------- + +The equivalent to a pandas DataFrame in Arrow is a :ref:`Table <data.table>`. +Both consist of a set of named columns of equal length. While pandas only +supports flat columns, the Table also provides nested columns, thus it can +represent more data than a DataFrame, so a full conversion is not always possible. + +Conversion from a Table to a DataFrame is done by calling +:meth:`pyarrow.Table.to_pandas`. The inverse is then achieved by using +:meth:`pyarrow.Table.from_pandas`. + +.. 
code-block:: python
+
+    import pyarrow as pa
+    import pandas as pd
+
+    df = pd.DataFrame({"a": [1, 2, 3]})
+    # Convert from pandas to Arrow
+    table = pa.Table.from_pandas(df)
+    # Convert back to pandas
+    df_new = table.to_pandas()
+
+    # Infer Arrow schema from pandas
+    schema = pa.Schema.from_pandas(df)
+
+By default ``pyarrow`` tries to preserve and restore the ``.index``
+data as accurately as possible. See the section below for more about
+this, and how to disable this logic.
+
+Series
+------
+
+In Arrow, the most similar structure to a pandas Series is an Array.
+It is a vector that contains data of a single type, stored in contiguous
+(linear) memory. You can convert a pandas Series to an Arrow Array using
+:meth:`pyarrow.Array.from_pandas`. As Arrow Arrays are always nullable, you can
+supply an optional mask using the ``mask`` parameter to mark all null entries.
+
+Handling pandas Indexes
+-----------------------
+
+Methods like :meth:`pyarrow.Table.from_pandas` have a
+``preserve_index`` option which defines whether the data in the ``index``
+member of the corresponding pandas object is preserved (stored) or not.
+This data is tracked using schema-level metadata in the internal
+``arrow::Schema`` object.
+
+The default of ``preserve_index`` is ``None``, which behaves as
+follows:
+
+* ``RangeIndex`` is stored as metadata-only, not requiring any extra
+  storage.
+* Other index types are stored as one or more physical data columns in
+  the resulting :class:`Table`.
+
+To not store the index at all, pass ``preserve_index=False``. Since
+storing a ``RangeIndex`` can cause issues in some limited scenarios
+(such as storing multiple DataFrame objects in a Parquet file), to
+force all index data to be serialized in the resulting table, pass
+``preserve_index=True``.
+
+Type differences
+----------------
+
+With the current design of pandas and Arrow, it is not possible to convert all
+column types unmodified. One of the main issues here is that pandas has no
+support for nullable columns of arbitrary type. In addition, ``datetime64`` is
+currently fixed to nanosecond resolution. On the other hand, Arrow might still
+be missing support for some types.
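+
+For example, the lack of nullable integer columns in default pandas shows up
+directly when round-tripping an integer Series containing a null: Arrow stores
+the null natively, but the conversion back to pandas upcasts the column to
+``float64``. A minimal sketch using the ``mask`` parameter mentioned above
+(the values are made up for illustration):
+
+.. code-block:: python
+
+    import numpy as np
+    import pandas as pd
+    import pyarrow as pa
+
+    s = pd.Series([1, 2, 3])
+    # Entries where the mask is True are treated as null in Arrow
+    arr = pa.Array.from_pandas(s, mask=np.array([False, True, False]))
+    arr.null_count    # 1; the array is an Int64Array of [1, null, 3]
+
+    # Converting back to pandas upcasts to float64, since the default
+    # pandas integer dtype cannot represent missing values
+    arr.to_pandas()   # values 1.0, NaN, 3.0 with dtype float64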
+ +pandas -> Arrow Conversion +~~~~~~~~~~~~~~~~~~~~~~~~~~ + ++------------------------+--------------------------+ +| Source Type (pandas) | Destination Type (Arrow) | ++========================+==========================+ +| ``bool`` | ``BOOL`` | ++------------------------+--------------------------+ +| ``(u)int{8,16,32,64}`` | ``(U)INT{8,16,32,64}`` | ++------------------------+--------------------------+ +| ``float32`` | ``FLOAT`` | ++------------------------+--------------------------+ +| ``float64`` | ``DOUBLE`` | ++------------------------+--------------------------+ +| ``str`` / ``unicode`` | ``STRING`` | ++------------------------+--------------------------+ +| ``pd.Categorical`` | ``DICTIONARY`` | ++------------------------+--------------------------+ +| ``pd.Timestamp`` | ``TIMESTAMP(unit=ns)`` | ++------------------------+--------------------------+ +| ``datetime.date`` | ``DATE`` | ++------------------------+--------------------------+ + +Arrow -> pandas Conversion +~~~~~~~~~~~~~~~~~~~~~~~~~~ + ++-------------------------------------+--------------------------------------------------------+ +| Source Type (Arrow) | Destination Type (pandas) | ++=====================================+========================================================+ +| ``BOOL`` | ``bool`` | ++-------------------------------------+--------------------------------------------------------+ +| ``BOOL`` *with nulls* | ``object`` (with values ``True``, ``False``, ``None``) | ++-------------------------------------+--------------------------------------------------------+ +| ``(U)INT{8,16,32,64}`` | ``(u)int{8,16,32,64}`` | ++-------------------------------------+--------------------------------------------------------+ +| ``(U)INT{8,16,32,64}`` *with nulls* | ``float64`` | ++-------------------------------------+--------------------------------------------------------+ +| ``FLOAT`` | ``float32`` | ++-------------------------------------+--------------------------------------------------------+ +| ``DOUBLE`` | ``float64`` | ++-------------------------------------+--------------------------------------------------------+ +| ``STRING`` | ``str`` | ++-------------------------------------+--------------------------------------------------------+ +| ``DICTIONARY`` | ``pd.Categorical`` | ++-------------------------------------+--------------------------------------------------------+ +| ``TIMESTAMP(unit=*)`` | ``pd.Timestamp`` (``np.datetime64[ns]``) | ++-------------------------------------+--------------------------------------------------------+ +| ``DATE`` | ``object``(with ``datetime.date`` objects) | ++-------------------------------------+--------------------------------------------------------+ + +Categorical types +~~~~~~~~~~~~~~~~~ + +`Pandas categorical <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`_ +columns are converted to :ref:`Arrow dictionary arrays <data.dictionary>`, +a special array type optimized to handle repeated and limited +number of possible values. + +.. ipython:: python + + df = pd.DataFrame({"cat": pd.Categorical(["a", "b", "c", "a", "b", "c"])}) + df.cat.dtype.categories + df + + table = pa.Table.from_pandas(df) + table + +We can inspect the :class:`~.ChunkedArray` of the created table and see the +same categories of the Pandas DataFrame. + +.. 
ipython:: python + + column = table[0] + chunk = column.chunk(0) + chunk.dictionary + chunk.indices + +Datetime (Timestamp) types +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +`Pandas Timestamps <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html>`_ +use the ``datetime64[ns]`` type in Pandas and are converted to an Arrow +:class:`~.TimestampArray`. + +.. ipython:: python + + df = pd.DataFrame({"datetime": pd.date_range("2020-01-01T00:00:00Z", freq="H", periods=3)}) + df.dtypes + df + + table = pa.Table.from_pandas(df) + table + +In this example the Pandas Timestamp is time zone aware +(``UTC`` on this case), and this information is used to create the Arrow +:class:`~.TimestampArray`. + +Date types +~~~~~~~~~~ + +While dates can be handled using the ``datetime64[ns]`` type in +pandas, some systems work with object arrays of Python's built-in +``datetime.date`` object: + +.. ipython:: python + + from datetime import date + s = pd.Series([date(2018, 12, 31), None, date(2000, 1, 1)]) + s + +When converting to an Arrow array, the ``date32`` type will be used by +default: + +.. ipython:: python + + arr = pa.array(s) + arr.type + arr[0] + +To use the 64-bit ``date64``, specify this explicitly: + +.. ipython:: python + + arr = pa.array(s, type='date64') + arr.type + +When converting back with ``to_pandas``, object arrays of +``datetime.date`` objects are returned: + +.. ipython:: python + + arr.to_pandas() + +If you want to use NumPy's ``datetime64`` dtype instead, pass +``date_as_object=False``: + +.. ipython:: python + + s2 = pd.Series(arr.to_pandas(date_as_object=False)) + s2.dtype + +.. warning:: + + As of Arrow ``0.13`` the parameter ``date_as_object`` is ``True`` + by default. Older versions must pass ``date_as_object=True`` to + obtain this behavior + +Time types +~~~~~~~~~~ + +The builtin ``datetime.time`` objects inside Pandas data structures will be +converted to an Arrow ``time64`` and :class:`~.Time64Array` respectively. + +.. ipython:: python + + from datetime import time + s = pd.Series([time(1, 1, 1), time(2, 2, 2)]) + s + + arr = pa.array(s) + arr.type + arr + +When converting to pandas, arrays of ``datetime.time`` objects are returned: + +.. ipython:: python + + arr.to_pandas() + +Nullable types +-------------- + +In Arrow all data types are nullable, meaning they support storing missing +values. In pandas, however, not all data types have support for missing data. +Most notably, the default integer data types do not, and will get casted +to float when missing values are introduced. Therefore, when an Arrow array +or table gets converted to pandas, integer columns will become float when +missing values are present: + +.. code-block:: python + + >>> arr = pa.array([1, 2, None]) + >>> arr + <pyarrow.lib.Int64Array object at 0x7f07d467c640> + [ + 1, + 2, + null + ] + >>> arr.to_pandas() + 0 1.0 + 1 2.0 + 2 NaN + dtype: float64 + +Pandas has experimental nullable data types +(https://pandas.pydata.org/docs/user_guide/integer_na.html). Arrows supports +round trip conversion for those: + +.. code-block:: python + + >>> df = pd.DataFrame({'a': pd.Series([1, 2, None], dtype="Int64")}) + >>> df + a + 0 1 + 1 2 + 2 <NA> + + >>> table = pa.table(df) + >>> table + Out[32]: + pyarrow.Table + a: int64 + ---- + a: [[1,2,null]] + + >>> table.to_pandas() + a + 0 1 + 1 2 + 2 <NA> + + >>> table.to_pandas().dtypes + a Int64 + dtype: object + +This roundtrip conversion works because metadata about the original pandas +DataFrame gets stored in the Arrow table. 
However, if you have Arrow data (or +e.g. a Parquet file) not originating from a pandas DataFrame with nullable +data types, the default conversion to pandas will not use those nullable +dtypes. + +The :meth:`pyarrow.Table.to_pandas` method has a ``types_mapper`` keyword +that can be used to override the default data type used for the resulting +pandas DataFrame. This way, you can instruct Arrow to create a pandas +DataFrame using nullable dtypes. + +.. code-block:: python + + >>> table = pa.table({"a": [1, 2, None]}) + >>> table.to_pandas() + a + 0 1.0 + 1 2.0 + 2 NaN + >>> table.to_pandas(types_mapper={pa.int64(): pd.Int64Dtype()}.get) + a + 0 1 + 1 2 + 2 <NA> + +The ``types_mapper`` keyword expects a function that will return the pandas +data type to use given a pyarrow data type. By using the ``dict.get`` method, +we can create such a function using a dictionary. + +If you want to use all currently supported nullable dtypes by pandas, this +dictionary becomes: + +.. code-block:: python + + dtype_mapping = { + pa.int8(): pd.Int8Dtype(), + pa.int16(): pd.Int16Dtype(), + pa.int32(): pd.Int32Dtype(), + pa.int64(): pd.Int64Dtype(), + pa.uint8(): pd.UInt8Dtype(), + pa.uint16(): pd.UInt16Dtype(), + pa.uint32(): pd.UInt32Dtype(), + pa.uint64(): pd.UInt64Dtype(), + pa.bool_(): pd.BooleanDtype(), + pa.float32(): pd.Float32Dtype(), + pa.float64(): pd.Float64Dtype(), + pa.string(): pd.StringDtype(), + } + + df = table.to_pandas(types_mapper=dtype_mapping.get) + + +When using the pandas API for reading Parquet files (``pd.read_parquet(..)``), +this can also be achieved by passing ``use_nullable_dtypes``: + +.. code-block:: python + + df = pd.read_parquet(path, use_nullable_dtypes=True) + + +Memory Usage and Zero Copy +-------------------------- + +When converting from Arrow data structures to pandas objects using various +``to_pandas`` methods, one must occasionally be mindful of issues related to +performance and memory usage. + +Since pandas's internal data representation is generally different from the +Arrow columnar format, zero copy conversions (where no memory allocation or +computation is required) are only possible in certain limited cases. + +In the worst case scenario, calling ``to_pandas`` will result in two versions +of the data in memory, one for Arrow and one for pandas, yielding approximately +twice the memory footprint. We have implement some mitigations for this case, +particularly when creating large ``DataFrame`` objects, that we describe below. + +Zero Copy Series Conversions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Zero copy conversions from ``Array`` or ``ChunkedArray`` to NumPy arrays or +pandas Series are possible in certain narrow cases: + +* The Arrow data is stored in an integer (signed or unsigned ``int8`` through + ``int64``) or floating point type (``float16`` through ``float64``). This + includes many numeric types as well as timestamps. +* The Arrow data has no null values (since these are represented using bitmaps + which are not supported by pandas). +* For ``ChunkedArray``, the data consists of a single chunk, + i.e. ``arr.num_chunks == 1``. Multiple chunks will always require a copy + because of pandas's contiguousness requirement. + +In these scenarios, ``to_pandas`` or ``to_numpy`` will be zero copy. In all +other scenarios, a copy will be required. 
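+
+If you want to be certain that no hidden copy takes place, the
+``zero_copy_only`` flag of ``to_numpy`` turns a silent copy into an error.
+A small sketch (the data here is made up; ``ArrowInvalid`` is the error
+raised when a copy would be needed):
+
+.. code-block:: python
+
+    import pyarrow as pa
+
+    # Primitive type, no nulls, single chunk: a zero copy view is possible
+    arr = pa.array([1.0, 2.0, 3.0])
+    view = arr.to_numpy(zero_copy_only=True)   # no data is copied
+
+    # With a null present, a copy would be required, so this raises
+    arr_with_null = pa.array([1.0, None, 3.0])
+    try:
+        arr_with_null.to_numpy(zero_copy_only=True)
+    except pa.ArrowInvalid:
+        # explicitly allow the copy instead
+        copied = arr_with_null.to_numpy(zero_copy_only=False)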
+ +Reducing Memory Use in ``Table.to_pandas`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +As of this writing, pandas applies a data management strategy called +"consolidation" to collect like-typed DataFrame columns in two-dimensional +NumPy arrays, referred to internally as "blocks". We have gone to great effort +to construct the precise "consolidated" blocks so that pandas will not perform +any further allocation or copies after we hand off the data to +``pandas.DataFrame``. The obvious downside of this consolidation strategy is +that it forces a "memory doubling". + +To try to limit the potential effects of "memory doubling" during +``Table.to_pandas``, we provide a couple of options: + +* ``split_blocks=True``, when enabled ``Table.to_pandas`` produces one internal + DataFrame "block" for each column, skipping the "consolidation" step. Note + that many pandas operations will trigger consolidation anyway, but the peak + memory use may be less than the worst case scenario of a full memory + doubling. As a result of this option, we are able to do zero copy conversions + of columns in the same cases where we can do zero copy with ``Array`` and + ``ChunkedArray``. +* ``self_destruct=True``, this destroys the internal Arrow memory buffers in + each column ``Table`` object as they are converted to the pandas-compatible + representation, potentially releasing memory to the operating system as soon + as a column is converted. Note that this renders the calling ``Table`` object + unsafe for further use, and any further methods called will cause your Python + process to crash. + +Used together, the call + +.. code-block:: python + + df = table.to_pandas(split_blocks=True, self_destruct=True) + del table # not necessary, but a good practice + +will yield significantly lower memory usage in some scenarios. Without these +options, ``to_pandas`` will always double memory. + +Note that ``self_destruct=True`` is not guaranteed to save memory. Since the +conversion happens column by column, memory is also freed column by column. But +if multiple columns share an underlying buffer, then no memory will be freed +until all of those columns are converted. In particular, due to implementation +details, data that comes from IPC or Flight is prone to this, as memory will be +laid out as follows:: + + Record Batch 0: Allocation 0: array 0 chunk 0, array 1 chunk 0, ... + Record Batch 1: Allocation 1: array 0 chunk 1, array 1 chunk 1, ... + ... + +In this case, no memory can be freed until the entire table is converted, even +with ``self_destruct=True``. diff --git a/src/arrow/docs/source/python/parquet.rst b/src/arrow/docs/source/python/parquet.rst new file mode 100644 index 000000000..82461ec5d --- /dev/null +++ b/src/arrow/docs/source/python/parquet.rst @@ -0,0 +1,597 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. 
specific language governing permissions and limitations +.. under the License. + +.. currentmodule:: pyarrow +.. _parquet: + +Reading and Writing the Apache Parquet Format +============================================= + +The `Apache Parquet <http://parquet.apache.org/>`_ project provides a +standardized open-source columnar storage format for use in data analysis +systems. It was created originally for use in `Apache Hadoop +<http://hadoop.apache.org/>`_ with systems like `Apache Drill +<http://drill.apache.org>`_, `Apache Hive <http://hive.apache.org>`_, `Apache +Impala (incubating) <http://impala.apache.org>`_, and `Apache Spark +<http://spark.apache.org>`_ adopting it as a shared standard for high +performance data IO. + +Apache Arrow is an ideal in-memory transport layer for data that is being read +or written with Parquet files. We have been concurrently developing the `C++ +implementation of Apache Parquet <http://github.com/apache/parquet-cpp>`_, +which includes a native, multithreaded C++ adapter to and from in-memory Arrow +data. PyArrow includes Python bindings to this code, which thus enables reading +and writing Parquet files with pandas as well. + +Obtaining pyarrow with Parquet Support +-------------------------------------- + +If you installed ``pyarrow`` with pip or conda, it should be built with Parquet +support bundled: + +.. ipython:: python + + import pyarrow.parquet as pq + +If you are building ``pyarrow`` from source, you must use +``-DARROW_PARQUET=ON`` when compiling the C++ libraries and enable the Parquet +extensions when building ``pyarrow``. See the :ref:`Python Development +<python-development>` page for more details. + +Reading and Writing Single Files +-------------------------------- + +The functions :func:`~.parquet.read_table` and :func:`~.parquet.write_table` +read and write the :ref:`pyarrow.Table <data.table>` object, respectively. + +Let's look at a simple table: + +.. ipython:: python + + import numpy as np + import pandas as pd + import pyarrow as pa + + df = pd.DataFrame({'one': [-1, np.nan, 2.5], + 'two': ['foo', 'bar', 'baz'], + 'three': [True, False, True]}, + index=list('abc')) + table = pa.Table.from_pandas(df) + +We write this to Parquet format with ``write_table``: + +.. ipython:: python + + import pyarrow.parquet as pq + pq.write_table(table, 'example.parquet') + +This creates a single Parquet file. In practice, a Parquet dataset may consist +of many files in many directories. We can read a single file back with +``read_table``: + +.. ipython:: python + + table2 = pq.read_table('example.parquet') + table2.to_pandas() + +You can pass a subset of columns to read, which can be much faster than reading +the whole file (due to the columnar layout): + +.. ipython:: python + + pq.read_table('example.parquet', columns=['one', 'three']) + +When reading a subset of columns from a file that used a Pandas dataframe as the +source, we use ``read_pandas`` to maintain any additional index column data: + +.. ipython:: python + + pq.read_pandas('example.parquet', columns=['two']).to_pandas() + +We need not use a string to specify the origin of the file. It can be any of: + +* A file path as a string +* A :ref:`NativeFile <io.native_file>` from PyArrow +* A Python file object + +In general, a Python file object will have the worst read performance, while a +string file path or an instance of :class:`~.NativeFile` (especially memory +maps) will perform the best. + +.. 
_parquet_mmap: + +Reading Parquet and Memory Mapping +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Because Parquet data needs to be decoded from the Parquet format +and compression, it can't be directly mapped from disk. +Thus the ``memory_map`` option might perform better on some systems +but won't help much with resident memory consumption. + +.. code-block:: python + + >>> pq_array = pa.parquet.read_table("area1.parquet", memory_map=True) + >>> print("RSS: {}MB".format(pa.total_allocated_bytes() >> 20)) + RSS: 4299MB + + >>> pq_array = pa.parquet.read_table("area1.parquet", memory_map=False) + >>> print("RSS: {}MB".format(pa.total_allocated_bytes() >> 20)) + RSS: 4299MB + +If you need to deal with Parquet data bigger than memory, +the :ref:`dataset` and partitioning is probably what you are looking for. + +Parquet file writing options +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:func:`~pyarrow.parquet.write_table()` has a number of options to +control various settings when writing a Parquet file. + +* ``version``, the Parquet format version to use. ``'1.0'`` ensures + compatibility with older readers, while ``'2.4'`` and greater values + enable more Parquet types and encodings. +* ``data_page_size``, to control the approximate size of encoded data + pages within a column chunk. This currently defaults to 1MB. +* ``flavor``, to set compatibility options particular to a Parquet + consumer like ``'spark'`` for Apache Spark. + +See the :func:`~pyarrow.parquet.write_table()` docstring for more details. + +There are some additional data type handling-specific options +described below. + +Omitting the DataFrame index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When using ``pa.Table.from_pandas`` to convert to an Arrow table, by default +one or more special columns are added to keep track of the index (row +labels). Storing the index takes extra space, so if your index is not valuable, +you may choose to omit it by passing ``preserve_index=False`` + +.. ipython:: python + + df = pd.DataFrame({'one': [-1, np.nan, 2.5], + 'two': ['foo', 'bar', 'baz'], + 'three': [True, False, True]}, + index=list('abc')) + df + table = pa.Table.from_pandas(df, preserve_index=False) + +Then we have: + +.. ipython:: python + + pq.write_table(table, 'example_noindex.parquet') + t = pq.read_table('example_noindex.parquet') + t.to_pandas() + +Here you see the index did not survive the round trip. + +Finer-grained Reading and Writing +--------------------------------- + +``read_table`` uses the :class:`~.ParquetFile` class, which has other features: + +.. ipython:: python + + parquet_file = pq.ParquetFile('example.parquet') + parquet_file.metadata + parquet_file.schema + +As you can learn more in the `Apache Parquet format +<https://github.com/apache/parquet-format>`_, a Parquet file consists of +multiple row groups. ``read_table`` will read all of the row groups and +concatenate them into a single table. You can read individual row groups with +``read_row_group``: + +.. ipython:: python + + parquet_file.num_row_groups + parquet_file.read_row_group(0) + +We can similarly write a Parquet file with multiple row groups by using +``ParquetWriter``: + +.. ipython:: python + + with pq.ParquetWriter('example2.parquet', table.schema) as writer: + for i in range(3): + writer.write_table(table) + + pf2 = pq.ParquetFile('example2.parquet') + pf2.num_row_groups + +Inspecting the Parquet File Metadata +------------------------------------ + +The ``FileMetaData`` of a Parquet file can be accessed through +:class:`~.ParquetFile` as shown above: + +.. 
ipython:: python + + parquet_file = pq.ParquetFile('example.parquet') + metadata = parquet_file.metadata + +or can also be read directly using :func:`~parquet.read_metadata`: + +.. ipython:: python + + metadata = pq.read_metadata('example.parquet') + metadata + +The returned ``FileMetaData`` object allows to inspect the +`Parquet file metadata <https://github.com/apache/parquet-format#metadata>`__, +such as the row groups and column chunk metadata and statistics: + +.. ipython:: python + + metadata.row_group(0) + metadata.row_group(0).column(0) + +.. ipython:: python + :suppress: + + !rm example.parquet + !rm example_noindex.parquet + !rm example2.parquet + !rm example3.parquet + +Data Type Handling +------------------ + +Reading types as DictionaryArray +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``read_dictionary`` option in ``read_table`` and ``ParquetDataset`` will +cause columns to be read as ``DictionaryArray``, which will become +``pandas.Categorical`` when converted to pandas. This option is only valid for +string and binary column types, and it can yield significantly lower memory use +and improved performance for columns with many repeated string values. + +.. code-block:: python + + pq.read_table(table, where, read_dictionary=['binary_c0', 'stringb_c2']) + +Storing timestamps +~~~~~~~~~~~~~~~~~~ + +Some Parquet readers may only support timestamps stored in millisecond +(``'ms'``) or microsecond (``'us'``) resolution. Since pandas uses nanoseconds +to represent timestamps, this can occasionally be a nuisance. By default +(when writing version 1.0 Parquet files), the nanoseconds will be cast to +microseconds ('us'). + +In addition, We provide the ``coerce_timestamps`` option to allow you to select +the desired resolution: + +.. code-block:: python + + pq.write_table(table, where, coerce_timestamps='ms') + +If a cast to a lower resolution value may result in a loss of data, by default +an exception will be raised. This can be suppressed by passing +``allow_truncated_timestamps=True``: + +.. code-block:: python + + pq.write_table(table, where, coerce_timestamps='ms', + allow_truncated_timestamps=True) + +Timestamps with nanoseconds can be stored without casting when using the +more recent Parquet format version 2.6: + +.. code-block:: python + + pq.write_table(table, where, version='2.6') + +However, many Parquet readers do not yet support this newer format version, and +therefore the default is to write version 1.0 files. When compatibility across +different processing frameworks is required, it is recommended to use the +default version 1.0. + +Older Parquet implementations use ``INT96`` based storage of +timestamps, but this is now deprecated. This includes some older +versions of Apache Impala and Apache Spark. To write timestamps in +this format, set the ``use_deprecated_int96_timestamps`` option to +``True`` in ``write_table``. + +.. code-block:: python + + pq.write_table(table, where, use_deprecated_int96_timestamps=True) + +Compression, Encoding, and File Compatibility +--------------------------------------------- + +The most commonly used Parquet implementations use dictionary encoding when +writing files; if the dictionaries grow too large, then they "fall back" to +plain encoding. Whether dictionary encoding is used can be toggled using the +``use_dictionary`` option: + +.. code-block:: python + + pq.write_table(table, where, use_dictionary=False) + +The data pages within a column in a row group can be compressed after the +encoding passes (dictionary, RLE encoding). 
In PyArrow we use Snappy
+compression by default, but Brotli, Gzip, and uncompressed are also supported:
+
+.. code-block:: python
+
+   pq.write_table(table, where, compression='snappy')
+   pq.write_table(table, where, compression='gzip')
+   pq.write_table(table, where, compression='brotli')
+   pq.write_table(table, where, compression='none')
+
+Snappy generally results in better performance, while Gzip may yield smaller
+files.
+
+These settings can also be set on a per-column basis:
+
+.. code-block:: python
+
+   pq.write_table(table, where, compression={'foo': 'snappy', 'bar': 'gzip'},
+                  use_dictionary=['foo', 'bar'])
+
+Partitioned Datasets (Multiple Files)
+-------------------------------------
+
+Multiple Parquet files constitute a Parquet *dataset*. These may be presented
+in a number of ways:
+
+* A list of absolute Parquet file paths
+* A directory name containing nested directories defining a partitioned dataset
+
+A dataset partitioned by year and month may look like this on disk:
+
+.. code-block:: text
+
+   dataset_name/
+     year=2007/
+       month=01/
+          0.parq
+          1.parq
+          ...
+       month=02/
+          0.parq
+          1.parq
+          ...
+       month=03/
+       ...
+     year=2008/
+       month=01/
+       ...
+     ...
+
+Writing to Partitioned Datasets
+-------------------------------
+
+You can write a partitioned dataset for any ``pyarrow`` file system that is a
+file-store (e.g. local, HDFS, S3). The default behaviour when no filesystem is
+given is to use the local filesystem.
+
+.. code-block:: python
+
+   # Local dataset write
+   pq.write_to_dataset(table, root_path='dataset_name',
+                       partition_cols=['one', 'two'])
+
+The root path in this case specifies the parent directory to which data will be
+saved. The partition columns are the column names by which to partition the
+dataset. Columns are partitioned in the order they are given. The partition
+splits are determined by the unique values in the partition columns.
+
+To use another filesystem you only need to add the ``filesystem`` parameter;
+the individual table writes are already wrapped using ``with`` statements, so
+the ``pq.write_to_dataset`` call does not need to be.
+
+.. code-block:: python
+
+   # Remote file-system example
+   from pyarrow.fs import HadoopFileSystem
+   fs = HadoopFileSystem(host, port, user=user, kerb_ticket=ticket_cache_path)
+   pq.write_to_dataset(table, root_path='dataset_name',
+                       partition_cols=['one', 'two'], filesystem=fs)
+
+Compatibility Note: if using ``pq.write_to_dataset`` to create a table that
+will then be used by Hive, partition column values must be compatible with
+the allowed character set of the Hive version you are running.
+
+Writing ``_metadata`` and ``_common_metadata`` files
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some processing frameworks such as Spark or Dask (optionally) use ``_metadata``
+and ``_common_metadata`` files with partitioned datasets.
+
+Those files include information about the schema of the full dataset (for
+``_common_metadata``) and potentially all row group metadata of all files in the
+partitioned dataset as well (for ``_metadata``). The actual files are
+metadata-only Parquet files. Note this is not a Parquet standard, but a
+convention set in practice by those frameworks.
+
+Using those files can give a more efficient creation of a Parquet dataset,
+since it can use the stored schema and file paths of all row groups,
+instead of inferring the schema and crawling the directories for all Parquet
+files (this is especially the case for filesystems where accessing files
+is expensive).
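+
+Once a ``_metadata`` file has been written (the following paragraphs show how
+to do that), it is an ordinary metadata-only Parquet footer and can be
+inspected with ``pq.read_metadata``. A small sketch, assuming a hypothetical
+``dataset_root/`` directory that already contains such a file:
+
+.. code-block:: python
+
+   import pyarrow.parquet as pq
+
+   # One FileMetaData object describing the row groups of all data files
+   # in the partitioned dataset
+   dataset_meta = pq.read_metadata('dataset_root/_metadata')
+   dataset_meta.num_row_groups            # row groups across all files
+   dataset_meta.row_group(0).column(0)    # per-column statistics and file path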
+ +The :func:`~pyarrow.parquet.write_to_dataset` function does not automatically +write such metadata files, but you can use it to gather the metadata and +combine and write them manually: + +.. code-block:: python + + # Write a dataset and collect metadata information of all written files + metadata_collector = [] + pq.write_to_dataset(table, root_path, metadata_collector=metadata_collector) + + # Write the ``_common_metadata`` parquet file without row groups statistics + pq.write_metadata(table.schema, root_path / '_common_metadata') + + # Write the ``_metadata`` parquet file with row groups statistics of all files + pq.write_metadata( + table.schema, root_path / '_metadata', + metadata_collector=metadata_collector + ) + +When not using the :func:`~pyarrow.parquet.write_to_dataset` function, but +writing the individual files of the partitioned dataset using +:func:`~pyarrow.parquet.write_table` or :class:`~pyarrow.parquet.ParquetWriter`, +the ``metadata_collector`` keyword can also be used to collect the FileMetaData +of the written files. In this case, you need to ensure to set the file path +contained in the row group metadata yourself before combining the metadata, and +the schemas of all different files and collected FileMetaData objects should be +the same: + +.. code-block:: python + + metadata_collector = [] + pq.write_table( + table1, root_path / "year=2017/data1.parquet", + metadata_collector=metadata_collector + ) + + # set the file path relative to the root of the partitioned dataset + metadata_collector[-1].set_file_path("year=2017/data1.parquet") + + # combine and write the metadata + metadata = metadata_collector[0] + for _meta in metadata_collector[1:]: + metadata.append_row_groups(_meta) + metadata.write_metadata_file(root_path / "_metadata") + + # or use pq.write_metadata to combine and write in a single step + pq.write_metadata( + table1.schema, root_path / "_metadata", + metadata_collector=metadata_collector + ) + +Reading from Partitioned Datasets +------------------------------------------------ + +The :class:`~.ParquetDataset` class accepts either a directory name or a list +of file paths, and can discover and infer some common partition structures, +such as those produced by Hive: + +.. code-block:: python + + dataset = pq.ParquetDataset('dataset_name/') + table = dataset.read() + +You can also use the convenience function ``read_table`` exposed by +``pyarrow.parquet`` that avoids the need for an additional Dataset object +creation step. + +.. code-block:: python + + table = pq.read_table('dataset_name') + +Note: the partition columns in the original table will have their types +converted to Arrow dictionary types (pandas categorical) on load. Ordering of +partition columns is not preserved through the save/load process. If reading +from a remote filesystem into a pandas dataframe you may need to run +``sort_index`` to maintain row ordering (as long as the ``preserve_index`` +option was enabled on write). + +.. note:: + + The ParquetDataset is being reimplemented based on the new generic Dataset + API (see the :ref:`dataset` docs for an overview). This is not yet the + default, but can already be enabled by passing the ``use_legacy_dataset=False`` + keyword to :class:`ParquetDataset` or :func:`read_table`:: + + pq.ParquetDataset('dataset_name/', use_legacy_dataset=False) + + Enabling this gives the following new features: + + - Filtering on all columns (using row group statistics) instead of only on + the partition keys. 
+ - More fine-grained partitioning: support for a directory partitioning scheme + in addition to the Hive-like partitioning (e.g. "/2019/11/15/" instead of + "/year=2019/month=11/day=15/"), and the ability to specify a schema for + the partition keys. + - General performance improvement and bug fixes. + + It also has the following changes in behaviour: + + - The partition keys need to be explicitly included in the ``columns`` + keyword when you want to include them in the result while reading a + subset of the columns + + This new implementation is already enabled in ``read_table``, and in the + future, this will be turned on by default for ``ParquetDataset``. The new + implementation does not yet cover all existing ParquetDataset features (e.g. + specifying the ``metadata``, or the ``pieces`` property API). Feedback is + very welcome. + + +Using with Spark +---------------- + +Spark places some constraints on the types of Parquet files it will read. The +option ``flavor='spark'`` will set these options automatically and also +sanitize field characters unsupported by Spark SQL. + +Multithreaded Reads +------------------- + +Each of the reading functions by default use multi-threading for reading +columns in parallel. Depending on the speed of IO +and how expensive it is to decode the columns in a particular file +(particularly with GZIP compression), this can yield significantly higher data +throughput. + +This can be disabled by specifying ``use_threads=False``. + +.. note:: + The number of threads to use concurrently is automatically inferred by Arrow + and can be inspected using the :func:`~pyarrow.cpu_count()` function. + +Reading from cloud storage +-------------------------- + +In addition to local files, pyarrow supports other filesystems, such as cloud +filesystems, through the ``filesystem`` keyword: + +.. code-block:: python + + from pyarrow import fs + + s3 = fs.S3FileSystem(region="us-east-2") + table = pq.read_table("bucket/object/key/prefix", filesystem=s3) + +Currently, :class:`HDFS <pyarrow.fs.HadoopFileSystem>` and +:class:`Amazon S3-compatible storage <pyarrow.fs.S3FileSystem>` are +supported. See the :ref:`filesystem` docs for more details. For those +built-in filesystems, the filesystem can also be inferred from the file path, +if specified as a URI: + +.. code-block:: python + + table = pq.read_table("s3://bucket/object/key/prefix") + +Other filesystems can still be supported if there is an +`fsspec <https://filesystem-spec.readthedocs.io/en/latest/>`__-compatible +implementation available. See :ref:`filesystem-fsspec` for more details. +One example is Azure Blob storage, which can be interfaced through the +`adlfs <https://github.com/dask/adlfs>`__ package. + +.. code-block:: python + + from adlfs import AzureBlobFileSystem + + abfs = AzureBlobFileSystem(account_name="XXXX", account_key="XXXX", container_name="XXXX") + table = pq.read_table("file.parquet", filesystem=abfs) diff --git a/src/arrow/docs/source/python/plasma.rst b/src/arrow/docs/source/python/plasma.rst new file mode 100644 index 000000000..51c7b6eaf --- /dev/null +++ b/src/arrow/docs/source/python/plasma.rst @@ -0,0 +1,462 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. 
You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow
+.. _plasma:
+
+The Plasma In-Memory Object Store
+=================================
+
+.. note::
+
+   At present, Plasma is only supported for use on Linux and macOS.
+
+The Plasma API
+--------------
+
+Starting the Plasma store
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can start the Plasma store by issuing a terminal command similar to the
+following:
+
+.. code-block:: bash
+
+  plasma_store -m 1000000000 -s /tmp/plasma
+
+The ``-m`` flag specifies the size of the store in bytes, and the ``-s`` flag
+specifies the socket that the store will listen on. Thus, the above command
+allows the Plasma store to use up to 1GB of memory, and sets the socket to
+``/tmp/plasma``.
+
+Leave the current terminal window open for as long as the Plasma store should
+keep running. Messages, such as notifications about disconnecting clients, may
+occasionally be printed to the screen. To stop the Plasma store, press
+``Ctrl-C`` in the terminal.
+
+Creating a Plasma client
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+To start a Plasma client from Python, call ``plasma.connect`` using the same
+socket name:
+
+.. code-block:: python
+
+  import pyarrow.plasma as plasma
+  client = plasma.connect("/tmp/plasma")
+
+If running the above Python code produces the following error, then either the
+given socket path is incorrect or the ``plasma_store`` process is not currently
+running. Check to see if the Plasma store is still running.
+
+.. code-block:: shell
+
+  >>> client = plasma.connect("/tmp/plasma")
+  Connection to socket failed for pathname /tmp/plasma
+  Could not connect to socket /tmp/plasma
+
+
+Object IDs
+^^^^^^^^^^
+
+Each object in the Plasma store should be associated with a unique ID. The
+Object ID then serves as a key that any client can use to retrieve that object
+from the Plasma store. You can form an ``ObjectID`` object from a byte string of
+length 20.
+
+.. code-block:: shell
+
+  # Create an ObjectID.
+  >>> id = plasma.ObjectID(20 * b"a")
+
+  # The character "a" is encoded as 61 in hex.
+  >>> id
+  ObjectID(6161616161616161616161616161616161616161)
+
+The random generation of Object IDs is often good enough to ensure unique IDs.
+You can easily create a helper function that randomly generates object IDs as
+follows:
+
+.. code-block:: python
+
+  import numpy as np
+
+  def random_object_id():
+      return plasma.ObjectID(np.random.bytes(20))
+
+Putting and Getting Python Objects
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Plasma supports two APIs for creating and accessing objects: a high-level
+API that allows storing and retrieving Python objects, and a low-level
+API that allows creating, writing and sealing buffers and operating on
+the binary data directly. In this section we describe the high-level API.
+
+This is how you can put and get a Python object:
+
+.. code-block:: python
+
+  # Create a python object.
+  object_id = client.put("hello, world")
+
+  # Get the object.
+  client.get(object_id)
+
+This works with all Python objects supported by the Arrow Python object
+serialization.
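+
+Because an ``ObjectID`` is just 20 bytes, it can be passed to other processes;
+any client connected to the same store can then fetch the object. A small
+sketch, assuming the store at ``/tmp/plasma`` from above is still running and
+``object_id`` is the ID returned by ``client.put``:
+
+.. code-block:: python
+
+  # A second client, e.g. in another Python process, connected to the
+  # same store socket can fetch the object by its ID
+  client2 = plasma.connect("/tmp/plasma")
+  client2.get(object_id)   # -> "hello, world"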
+ +You can also get multiple objects at the same time (which can be more +efficient since it avoids IPC round trips): + +.. code-block:: python + + # Create multiple python objects. + object_id1 = client.put(1) + object_id2 = client.put(2) + object_id3 = client.put(3) + + # Get the objects. + client.get([object_id1, object_id2, object_id3]) + +Furthermore, it is possible to provide a timeout for the get call. If the +object is not available within the timeout, the special object +`pyarrow.ObjectNotAvailable` will be returned. + +Creating an Object Buffer +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Objects are created in Plasma in two stages. First, they are **created**, which +allocates a buffer for the object. At this point, the client can write to the +buffer and construct the object within the allocated buffer. + +To create an object for Plasma, you need to create an object ID, as well as +give the object's maximum size in bytes. + +.. code-block:: python + + # Create an object buffer. + object_id = plasma.ObjectID(20 * b"a") + object_size = 1000 + buffer = memoryview(client.create(object_id, object_size)) + + # Write to the buffer. + for i in range(1000): + buffer[i] = i % 128 + +When the client is done, the client **seals** the buffer, making the object +immutable, and making it available to other Plasma clients. + +.. code-block:: python + + # Seal the object. This makes the object immutable and available to other clients. + client.seal(object_id) + + +Getting an Object Buffer +^^^^^^^^^^^^^^^^^^^^^^^^ + +After an object has been sealed, any client who knows the object ID can get +the object buffer. + +.. code-block:: python + + # Create a different client. Note that this second client could be + # created in the same or in a separate, concurrent Python session. + client2 = plasma.connect("/tmp/plasma") + + # Get the object in the second client. This blocks until the object has been sealed. + object_id2 = plasma.ObjectID(20 * b"a") + [buffer2] = client2.get_buffers([object_id]) + +If the object has not been sealed yet, then the call to client.get_buffers will +block until the object has been sealed by the client constructing the object. +Using the ``timeout_ms`` argument to get, you can specify a timeout for this (in +milliseconds). After the timeout, the interpreter will yield control back. + +.. code-block:: shell + + >>> buffer + <memory at 0x7fdbdc96e708> + >>> buffer[1] + 1 + >>> buffer2 + <plasma.plasma.PlasmaBuffer object at 0x7fdbf2770e88> + >>> view2 = memoryview(buffer2) + >>> view2[1] + 1 + >>> view2[129] + 1 + >>> bytes(buffer[1:4]) + b'\x01\x02\x03' + >>> bytes(view2[1:4]) + b'\x01\x02\x03' + + +Listing objects in the store +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The objects in the store can be listed in the following way (note that +this functionality is currently experimental and the concrete representation +of the object info might change in the future): + +.. 
code-block:: python + + import pyarrow.plasma as plasma + import time + + client = plasma.connect("/tmp/plasma") + + client.put("hello, world") + # Sleep a little so we get different creation times + time.sleep(2) + client.put("another object") + # Create an object that is not sealed yet + object_id = plasma.ObjectID.from_random() + client.create(object_id, 100) + print(client.list()) + + >>> {ObjectID(4cba8f80c54c6d265b46c2cdfcee6e32348b12be): {'construct_duration': 0, + >>> 'create_time': 1535223642, + >>> 'data_size': 460, + >>> 'metadata_size': 0, + >>> 'ref_count': 0, + >>> 'state': 'sealed'}, + >>> ObjectID(a7598230b0c26464c9d9c99ae14773ee81485428): {'construct_duration': 0, + >>> 'create_time': 1535223644, + >>> 'data_size': 460, + >>> 'metadata_size': 0, + >>> 'ref_count': 0, + >>> 'state': 'sealed'}, + >>> ObjectID(e603ab0c92098ebf08f90bfcea33ff98f6476870): {'construct_duration': -1, + >>> 'create_time': 1535223644, + >>> 'data_size': 100, + >>> 'metadata_size': 0, + >>> 'ref_count': 1, + >>> 'state': 'created'}} + + +Using Arrow and Pandas with Plasma +---------------------------------- + +Storing Arrow Objects in Plasma +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To store an Arrow object in Plasma, we must first **create** the object and then +**seal** it. However, Arrow objects such as ``Tensors`` may be more complicated +to write than simple binary data. + +To create the object in Plasma, you still need an ``ObjectID`` and a size to +pass in. To find out the size of your Arrow object, you can use pyarrow +API such as ``pyarrow.ipc.get_tensor_size``. + +.. code-block:: python + + import numpy as np + import pyarrow as pa + + # Create a pyarrow.Tensor object from a numpy random 2-dimensional array + data = np.random.randn(10, 4) + tensor = pa.Tensor.from_numpy(data) + + # Create the object in Plasma + object_id = plasma.ObjectID(np.random.bytes(20)) + data_size = pa.ipc.get_tensor_size(tensor) + buf = client.create(object_id, data_size) + +To write the Arrow ``Tensor`` object into the buffer, you can use Plasma to +convert the ``memoryview`` buffer into a ``pyarrow.FixedSizeBufferWriter`` +object. A ``pyarrow.FixedSizeBufferWriter`` is a format suitable for Arrow's +``pyarrow.ipc.write_tensor``: + +.. code-block:: python + + # Write the tensor into the Plasma-allocated buffer + stream = pa.FixedSizeBufferWriter(buf) + pa.ipc.write_tensor(tensor, stream) # Writes tensor's 552 bytes to Plasma stream + +To finish storing the Arrow object in Plasma, call ``seal``: + +.. code-block:: python + + # Seal the Plasma object + client.seal(object_id) + +Getting Arrow Objects from Plasma +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To read the object, first retrieve it as a ``PlasmaBuffer`` using its object ID. + +.. code-block:: python + + # Get the arrow object by ObjectID. + [buf2] = client.get_buffers([object_id]) + +To convert the ``PlasmaBuffer`` back into an Arrow ``Tensor``, first create a +pyarrow ``BufferReader`` object from it. You can then pass the ``BufferReader`` +into ``pyarrow.ipc.read_tensor`` to reconstruct the Arrow ``Tensor`` object: + +.. code-block:: python + + # Reconstruct the Arrow tensor object. + reader = pa.BufferReader(buf2) + tensor2 = pa.ipc.read_tensor(reader) + +Finally, you can use ``pyarrow.ipc.read_tensor`` to convert the Arrow object +back into numpy data: + +.. 
code-block:: python + + # Convert back to numpy + array = tensor2.to_numpy() + +Storing Pandas DataFrames in Plasma +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Storing a Pandas ``DataFrame`` still follows the **create** then **seal** +process of storing an object in the Plasma store, however one cannot directly +write the ``DataFrame`` to Plasma with Pandas alone. Plasma also needs to know +the size of the ``DataFrame`` to allocate a buffer for. + +See :ref:`pandas_interop` for more information on using Arrow with Pandas. + +You can create the pyarrow equivalent of a Pandas ``DataFrame`` by using +``pyarrow.from_pandas`` to convert it to a ``RecordBatch``. + +.. code-block:: python + + import pyarrow as pa + import pandas as pd + + # Create a Pandas DataFrame + d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']), + 'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])} + df = pd.DataFrame(d) + + # Convert the Pandas DataFrame into a PyArrow RecordBatch + record_batch = pa.RecordBatch.from_pandas(df) + +Creating the Plasma object requires an ``ObjectID`` and the size of the +data. Now that we have converted the Pandas ``DataFrame`` into a PyArrow +``RecordBatch``, use the ``MockOutputStream`` to determine the +size of the Plasma object. + +.. code-block:: python + + # Create the Plasma object from the PyArrow RecordBatch. Most of the work here + # is done to determine the size of buffer to request from the object store. + object_id = plasma.ObjectID(np.random.bytes(20)) + mock_sink = pa.MockOutputStream() + with pa.RecordBatchStreamWriter(mock_sink, record_batch.schema) as stream_writer: + stream_writer.write_batch(record_batch) + data_size = mock_sink.size() + buf = client.create(object_id, data_size) + +The DataFrame can now be written to the buffer as follows. + +.. code-block:: python + + # Write the PyArrow RecordBatch to Plasma + stream = pa.FixedSizeBufferWriter(buf) + with pa.RecordBatchStreamWriter(stream, record_batch.schema) as stream_writer: + stream_writer.write_batch(record_batch) + +Finally, seal the finished object for use by all clients: + +.. code-block:: python + + # Seal the Plasma object + client.seal(object_id) + +Getting Pandas DataFrames from Plasma +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Since we store the Pandas DataFrame as a PyArrow ``RecordBatch`` object, +to get the object back from the Plasma store, we follow similar steps +to those specified in `Getting Arrow Objects from Plasma`_. + +We first have to convert the ``PlasmaBuffer`` returned from +``client.get_buffers`` into an Arrow ``BufferReader`` object. + +.. code-block:: python + + # Fetch the Plasma object + [data] = client.get_buffers([object_id]) # Get PlasmaBuffer from ObjectID + buffer = pa.BufferReader(data) + +From the ``BufferReader``, we can create a specific ``RecordBatchStreamReader`` +in Arrow to reconstruct the stored PyArrow ``RecordBatch`` object. + +.. code-block:: python + + # Convert object back into an Arrow RecordBatch + reader = pa.RecordBatchStreamReader(buffer) + record_batch = reader.read_next_batch() + +The last step is to convert the PyArrow ``RecordBatch`` object back into +the original Pandas ``DataFrame`` structure. + +.. code-block:: python + + # Convert back into Pandas + result = record_batch.to_pandas() + +Using Plasma with Huge Pages +---------------------------- + +On Linux it is possible to use the Plasma store with huge pages for increased +throughput. You first need to create a file system and activate huge pages with + +.. 
code-block:: shell + + sudo mkdir -p /mnt/hugepages + gid=`id -g` + uid=`id -u` + sudo mount -t hugetlbfs -o uid=$uid -o gid=$gid none /mnt/hugepages + sudo bash -c "echo $gid > /proc/sys/vm/hugetlb_shm_group" + sudo bash -c "echo 20000 > /proc/sys/vm/nr_hugepages" + +Note that you only need root access to create the file system, not for +running the object store. You can then start the Plasma store with the ``-d`` +flag for the mount point of the huge page file system and the ``-h`` flag +which indicates that huge pages are activated: + +.. code-block:: shell + + plasma_store -s /tmp/plasma -m 10000000000 -d /mnt/hugepages -h + +You can test this with the following script: + +.. code-block:: python + + import numpy as np + import pyarrow as pa + import pyarrow.plasma as plasma + import time + + client = plasma.connect("/tmp/plasma") + + data = np.random.randn(100000000) + tensor = pa.Tensor.from_numpy(data) + + object_id = plasma.ObjectID(np.random.bytes(20)) + buf = client.create(object_id, pa.ipc.get_tensor_size(tensor)) + + stream = pa.FixedSizeBufferWriter(buf) + stream.set_memcopy_threads(4) + a = time.time() + pa.ipc.write_tensor(tensor, stream) + print("Writing took ", time.time() - a) diff --git a/src/arrow/docs/source/python/timestamps.rst b/src/arrow/docs/source/python/timestamps.rst new file mode 100644 index 000000000..fb4da5cc0 --- /dev/null +++ b/src/arrow/docs/source/python/timestamps.rst @@ -0,0 +1,198 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +********** +Timestamps +********** + +Arrow/Pandas Timestamps +======================= + +Arrow timestamps are stored as a 64-bit integer with column metadata to +associate a time unit (e.g. milliseconds, microseconds, or nanoseconds), and an +optional time zone. Pandas (`Timestamp`) uses a 64-bit integer representing +nanoseconds and an optional time zone. +Python/Pandas timestamp types without a associated time zone are referred to as +"Time Zone Naive". Python/Pandas timestamp types with an associated time zone are +referred to as "Time Zone Aware". + + +Timestamp Conversions +===================== + +Pandas/Arrow ⇄ Spark +-------------------- + +Spark stores timestamps as 64-bit integers representing microseconds since +the UNIX epoch. It does not store any metadata about time zones with its +timestamps. + +Spark interprets timestamps with the *session local time zone*, (i.e. +``spark.sql.session.timeZone``). If that time zone is undefined, Spark turns to +the default system time zone. For simplicity's sake below, the session +local time zone is always defined. + +This implies a few things when round-tripping timestamps: + +#. Timezone information is lost (all timestamps that result from + converting from spark to arrow/pandas are "time zone naive"). +#. 
+Pandas to Spark (through Apache Arrow)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following cases assume the Spark configuration
+``spark.sql.execution.arrow.enabled`` is set to ``"true"``.
+
+::
+
+   >>> pdf = pd.DataFrame({'naive': [datetime(2019, 1, 1, 0)],
+   ...                     'aware': [Timestamp(year=2019, month=1, day=1,
+   ...                               nanosecond=500, tz=timezone(timedelta(hours=-8)))]})
+   >>> pdf
+          naive                               aware
+   0 2019-01-01 2019-01-01 00:00:00.000000500-08:00
+
+   >>> spark.conf.set("spark.sql.session.timeZone", "UTC")
+   >>> utc_df = sqlContext.createDataFrame(pdf)
+   >>> utc_df.show()
+   +-------------------+-------------------+
+   |              naive|              aware|
+   +-------------------+-------------------+
+   |2019-01-01 00:00:00|2019-01-01 08:00:00|
+   +-------------------+-------------------+
+
+Note that the aware timestamp is shifted to reflect the time assuming UTC
+(it still represents the same instant in time). Naive timestamps are treated by
+Spark as being in the system local time zone and are converted to UTC. Recall
+that, internally, the schema of a Spark DataFrame does not store any time zone
+information with its timestamps.
+
+Now, if the session time zone is set to US Pacific Time (PST), we don't
+see any shift in the display of the aware timestamp (it
+still represents the same instant in time):
+
+::
+
+   >>> spark.conf.set("spark.sql.session.timeZone", "US/Pacific")
+   >>> pst_df = sqlContext.createDataFrame(pdf)
+   >>> pst_df.show()
+   +-------------------+-------------------+
+   |              naive|              aware|
+   +-------------------+-------------------+
+   |2019-01-01 00:00:00|2019-01-01 00:00:00|
+   +-------------------+-------------------+
+
+Looking again at ``utc_df.show()``, we see one of the impacts of the session
+time zone. Because the naive timestamp was initially converted assuming UTC,
+the instant it reflects is actually earlier than the naive timestamp from the
+PST-converted data frame:
+
+::
+
+   >>> utc_df.show()
+   +-------------------+-------------------+
+   |              naive|              aware|
+   +-------------------+-------------------+
+   |2018-12-31 16:00:00|2019-01-01 00:00:00|
+   +-------------------+-------------------+
+
+Spark to Pandas
+~~~~~~~~~~~~~~~
+
+We can observe what happens when converting back to Arrow/Pandas. Assuming the
+session time zone is still PST:
+
+::
+
+   >>> pst_df.show()
+   +-------------------+-------------------+
+   |              naive|              aware|
+   +-------------------+-------------------+
+   |2019-01-01 00:00:00|2019-01-01 00:00:00|
+   +-------------------+-------------------+
+
+   >>> pst_df.toPandas()
+          naive      aware
+   0 2019-01-01 2019-01-01
+   >>> pst_df.toPandas().info()
+   <class 'pandas.core.frame.DataFrame'>
+   RangeIndex: 1 entries, 0 to 0
+   Data columns (total 2 columns):
+   naive    1 non-null datetime64[ns]
+   aware    1 non-null datetime64[ns]
+   dtypes: datetime64[ns](2)
+   memory usage: 96.0 bytes
+
+Notice that, in addition to being "time zone naive", the 'aware' value now
+yields a different epoch offset. Spark does the conversion by first converting
+to the session time zone (or the system local time zone if the session time
+zone isn't set) and then localizing to remove the time zone information. This
+results in the timestamp being 8 hours before the original time:
+
+::
+
+   >>> pst_df.toPandas()['aware'][0]
+   Timestamp('2019-01-01 00:00:00')
+   >>> pdf['aware'][0]
+   Timestamp('2019-01-01 00:00:00.000000500-0800', tz='UTC-08:00')
+   >>> (pst_df.toPandas()['aware'][0].timestamp()-pdf['aware'][0].timestamp())/3600
+   -8.0
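+If the session time zone is known, this shift can be undone after the fact by
+re-localizing the naive result. The following is a minimal pandas-only sketch, with
+``'US/Pacific'`` hard-coded as a stand-in for the session time zone; note that the
+nanoseconds already lost to truncation cannot be recovered this way.
+
+.. code-block:: python
+
+   import pandas as pd
+
+   # Stand-in for the naive value returned by pst_df.toPandas()['aware'][0]
+   naive = pd.Timestamp('2019-01-01 00:00:00')
+
+   # Re-attach the session time zone that Spark localized away.
+   restored = naive.tz_localize('US/Pacific')
+
+   print(restored)              # 2019-01-01 00:00:00-08:00
+   print(restored.timestamp())  # 1546329600.0, matching the original aware instant
+                                # (up to the truncated 500 nanoseconds)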
+The same type of conversion happens with the data frame converted while
+the session time zone was UTC. In this case, both the naive and aware values
+differ from the original Pandas values (the difference in the naive value comes
+from the session time zone having changed between when ``utc_df`` was created
+and when it was converted back):
+
+::
+
+   >>> utc_df.show()
+   +-------------------+-------------------+
+   |              naive|              aware|
+   +-------------------+-------------------+
+   |2018-12-31 16:00:00|2019-01-01 00:00:00|
+   +-------------------+-------------------+
+
+   >>> utc_df.toPandas()
+                   naive      aware
+   0 2018-12-31 16:00:00 2019-01-01
+
+Note that the surprising shift for the aware value doesn't happen
+when the session time zone is UTC (but the timestamps
+still become "time zone naive"):
+
+::
+
+   >>> spark.conf.set("spark.sql.session.timeZone", "UTC")
+   >>> pst_df.show()
+   +-------------------+-------------------+
+   |              naive|              aware|
+   +-------------------+-------------------+
+   |2019-01-01 08:00:00|2019-01-01 08:00:00|
+   +-------------------+-------------------+
+
+   >>> pst_df.toPandas()['aware'][0]
+   Timestamp('2019-01-01 08:00:00')
+   >>> pdf['aware'][0]
+   Timestamp('2019-01-01 00:00:00.000000500-0800', tz='UTC-08:00')
+   >>> (pst_df.toPandas()['aware'][0].timestamp()-pdf['aware'][0].timestamp())/3600
+   0.0
diff --git a/src/arrow/docs/source/r/index.rst b/src/arrow/docs/source/r/index.rst new file mode 100644 index 000000000..b799544bb --- /dev/null +++ b/src/arrow/docs/source/r/index.rst @@ -0,0 +1,21 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+R docs
+======
+
+Stub page for the R docs; the actual source is located in the r/ sub-directory.
diff --git a/src/arrow/docs/source/status.rst b/src/arrow/docs/source/status.rst new file mode 100644 index 000000000..8e3e998df --- /dev/null +++ b/src/arrow/docs/source/status.rst @@ -0,0 +1,239 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. 
KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +===================== +Implementation Status +===================== + +The following tables summarize the features available in the various official +Arrow libraries. Unless otherwise stated, the Python, R, Ruby and C/GLib +libraries follow the C++ Arrow library. + +Data Types +========== + ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Data type | C++ | Java | Go | JavaScript | C# | Rust | Julia | +| (primitive) | | | | | | | | ++===================+=======+=======+=======+============+=======+=======+=======+ +| Null | ✓ | ✓ | ✓ | | | ✓ | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Boolean | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Int8/16/32/64 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| UInt8/16/32/64 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Float16 | | | ✓ | | | | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Float32/64 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Decimal128 | ✓ | ✓ | ✓ | | ✓ | ✓ | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Decimal256 | ✓ | ✓ | | | ✓ | | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Date32/64 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Time32/64 | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Timestamp | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Duration | ✓ | ✓ | ✓ | | | ✓ | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Interval | ✓ | ✓ | ✓ | | | ✓ | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Fixed Size Binary | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Binary | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Large Binary | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Utf8 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Large Utf8 | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ + ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Data type | C++ | Java | Go | JavaScript | C# | Rust | Julia | +| (nested) | | | | | | | | ++===================+=======+=======+=======+============+=======+=======+=======+ +| Fixed Size List | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| List | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Large List | ✓ | ✓ | | | | ✓ | ✓ | 
++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Struct | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Map | ✓ | ✓ | ✓ | ✓ | | | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Dense Union | ✓ | ✓ | | | | | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Sparse Union | ✓ | ✓ | | | | | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ + ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Data type | C++ | Java | Go | JavaScript | C# | Rust | Julia | +| (special) | | | | | | | | ++===================+=======+=======+=======+============+=======+=======+=======+ +| Dictionary | ✓ | ✓ (1) | | ✓ (1) | | ✓ (1) | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ +| Extension | ✓ | ✓ | ✓ | | | | ✓ | ++-------------------+-------+-------+-------+------------+-------+-------+-------+ + +Notes: + +* \(1) Nested dictionaries not supported + +.. seealso:: + The :ref:`format_columnar` specification. + + +IPC Format +========== + ++-----------------------------+-------+-------+-------+------------+-------+-------+-------+ +| IPC Feature | C++ | Java | Go | JavaScript | C# | Rust | Julia | +| | | | | | | | | ++=============================+=======+=======+=======+============+=======+=======+=======+ +| Arrow stream format | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ++-----------------------------+-------+-------+-------+------------+-------+-------+-------+ +| Arrow file format | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ++-----------------------------+-------+-------+-------+------------+-------+-------+-------+ +| Record batches | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ++-----------------------------+-------+-------+-------+------------+-------+-------+-------+ +| Dictionaries | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | ++-----------------------------+-------+-------+-------+------------+-------+-------+-------+ +| Replacement dictionaries | ✓ | ✓ | | | | | ✓ | ++-----------------------------+-------+-------+-------+------------+-------+-------+-------+ +| Delta dictionaries | ✓ (1) | | | | | | ✓ | ++-----------------------------+-------+-------+-------+------------+-------+-------+-------+ +| Tensors | ✓ | | | | | | | ++-----------------------------+-------+-------+-------+------------+-------+-------+-------+ +| Sparse tensors | ✓ | | | | | | | ++-----------------------------+-------+-------+-------+------------+-------+-------+-------+ +| Buffer compression | ✓ | ✓ (3) | ✓ | | | | ✓ | ++-----------------------------+-------+-------+-------+------------+-------+-------+-------+ +| Endianness conversion | ✓ (2) | | | | | | | ++-----------------------------+-------+-------+-------+------------+-------+-------+-------+ +| Custom schema metadata | ✓ | ✓ | ✓ | | | ✓ | ✓ | ++-----------------------------+-------+-------+-------+------------+-------+-------+-------+ + +Notes: + +* \(1) Delta dictionaries not supported on nested dictionaries + +* \(2) Data with non-native endianness can be byte-swapped automatically when reading. + +* \(3) LZ4 Codec currently is quite inefficient. ARROW-11901 tracks improving performance. + +.. seealso:: + The :ref:`format-ipc` specification. 
+ + +Flight RPC +========== + ++-----------------------------+-------+-------+-------+------------+-------+-------+-------+ +| Flight RPC Feature | C++ | Java | Go | JavaScript | C# | Rust | Julia | +| | | | | | | | | ++=============================+=======+=======+=======+============+=======+=======+=======+ +| gRPC transport | ✓ | ✓ | ✓ | | ✓ (1) | | | ++-----------------------------+-------+-------+-------+------------+-------+-------+-------+ +| gRPC + TLS transport | ✓ | ✓ | ✓ | | ✓ | | | ++-----------------------------+-------+-------+-------+------------+-------+-------+-------+ +| RPC error codes | ✓ | ✓ | ✓ | | ✓ | | | ++-----------------------------+-------+-------+-------+------------+-------+-------+-------+ +| Authentication handlers | ✓ | ✓ | ✓ | | ✓ (2) | | | ++-----------------------------+-------+-------+-------+------------+-------+-------+-------+ +| Custom client middleware | ✓ | ✓ | ✓ | | | | | ++-----------------------------+-------+-------+-------+------------+-------+-------+-------+ +| Custom server middleware | ✓ | ✓ | ✓ | | | | | ++-----------------------------+-------+-------+-------+------------+-------+-------+-------+ + +Notes: + +* \(1) No support for handshake or DoExchange. +* \(2) Support using AspNetCore authentication handlers. + +.. seealso:: + The :ref:`flight-rpc` specification. + + +C Data Interface +================ + ++-----------------------------+-----+--------+---+------+----+------+--------+------+ +| Feature | C++ | Python | R | Rust | Go | Java | C/GLib | Ruby | +| | | | | | | | | | ++=============================+=====+========+===+======+====+======+========+======+ +| Schema export | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ++-----------------------------+-----+--------+---+------+----+------+--------+------+ +| Array export | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ++-----------------------------+-----+--------+---+------+----+------+--------+------+ +| Schema import | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ++-----------------------------+-----+--------+---+------+----+------+--------+------+ +| Array import | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ++-----------------------------+-----+--------+---+------+----+------+--------+------+ + +.. seealso:: + The :ref:`C Data Interface <c-data-interface>` specification. + + +C Stream Interface (experimental) +================================= + ++-----------------------------+-----+--------+----+--------+------+ +| Feature | C++ | Python | Go | C/GLib | Ruby | +| | | | | | | ++=============================+=====+========+====+========+======+ +| Stream export | ✓ | ✓ | | ✓ | ✓ | ++-----------------------------+-----+--------+----+--------+------+ +| Stream import | ✓ | ✓ | ✓ | ✓ | ✓ | ++-----------------------------+-----+--------+----+--------+------+ + +.. seealso:: + The :ref:`C Stream Interface <c-stream-interface>` specification. 
+ + +Third-Party Data Formats +======================== + ++-----------------------------+---------+---------+-------+------------+-------+---------+-------+ +| Format | C++ | Java | Go | JavaScript | C# | Rust | Julia | +| | | | | | | | | ++=============================+=========+=========+=======+============+=======+=========+=======+ +| Avro | | R | | | | | | ++-----------------------------+---------+---------+-------+------------+-------+---------+-------+ +| CSV | R | | R/W | | | R/W | R/W | ++-----------------------------+---------+---------+-------+------------+-------+---------+-------+ +| ORC | R/W | R (2) | | | | | | ++-----------------------------+---------+---------+-------+------------+-------+---------+-------+ +| Parquet | R/W | R (3) | | | | R/W (1) | | ++-----------------------------+---------+---------+-------+------------+-------+---------+-------+ + +Notes: + +* *R* = Read supported + +* *W* = Write supported + +* \(1) Nested read/write not supported. + +* \(2) Through JNI bindings. (Provided by ``org.apache.arrow.orc:arrow-orc``) + +* \(3) Through JNI bindings to Arrow C++ Datasets. (Provided by ``org.apache.arrow:arrow-dataset``) |