summaryrefslogtreecommitdiffstats
path: root/src/arrow/python/pyarrow/__init__.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/arrow/python/pyarrow/__init__.py')
-rw-r--r--src/arrow/python/pyarrow/__init__.py511
1 files changed, 511 insertions, 0 deletions
diff --git a/src/arrow/python/pyarrow/__init__.py b/src/arrow/python/pyarrow/__init__.py
new file mode 100644
index 000000000..1ec229d53
--- /dev/null
+++ b/src/arrow/python/pyarrow/__init__.py
@@ -0,0 +1,511 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# flake8: noqa
+
+"""
+PyArrow is the python implementation of Apache Arrow.
+
+Apache Arrow is a cross-language development platform for in-memory data.
+It specifies a standardized language-independent columnar memory format for
+flat and hierarchical data, organized for efficient analytic operations on
+modern hardware. It also provides computational libraries and zero-copy
+streaming messaging and interprocess communication.
+
+For more information see the official page at https://arrow.apache.org
+"""
+
+import gc as _gc
+import os as _os
+import sys as _sys
+import warnings as _warnings
+
# Prefer the version file generated at build/install time; fall back to
# deriving the version from the git tag when running from a source checkout.
try:
    from ._generated_version import version as __version__
except ImportError:
    # Package is not installed, parse git tag at runtime
    try:
        import setuptools_scm
        # Code duplicated from setup.py to avoid a dependency on each other

        def parse_git(root, **kwargs):
            """
            Parse function for setuptools_scm that ignores tags for non-C++
            subprojects, e.g. apache-arrow-js-XXX tags.
            """
            from setuptools_scm.git import parse
            kwargs['describe_command'] = \
                "git describe --dirty --tags --long --match 'apache-arrow-[0-9].*'"
            return parse(root, **kwargs)
        # '../' points at the repository root relative to this package
        # directory (presumably where the git metadata lives — see setup.py).
        __version__ = setuptools_scm.get_version('../',
                                                 parse=parse_git)
    except ImportError:
        # Neither an installed package nor setuptools_scm available; the
        # version cannot be determined.
        __version__ = None
+
+# ARROW-8684: Disable GC while initializing Cython extension module,
+# to workaround Cython bug in https://github.com/cython/cython/issues/3603
+_gc_enabled = _gc.isenabled()
+_gc.disable()
+import pyarrow.lib as _lib
+if _gc_enabled:
+ _gc.enable()
+
+from pyarrow.lib import (BuildInfo, RuntimeInfo, MonthDayNano,
+ VersionInfo, cpp_build_info, cpp_version,
+ cpp_version_info, runtime_info, cpu_count,
+ set_cpu_count, enable_signal_handlers,
+ io_thread_count, set_io_thread_count)
+
+
def show_versions():
    """
    Print various version information, to help with error reporting.
    """
    # TODO: CPU information and flags
    build = cpp_build_info
    # An empty package_kind means the build did not indicate one.
    package_kind = build.package_kind if len(build.package_kind) > 0 \
        else "not indicated"
    print("pyarrow version info\n--------------------")
    print("Package kind: {}".format(package_kind))
    print("Arrow C++ library version: {0}".format(build.version))
    print("Arrow C++ compiler: {0} {1}".format(build.compiler_id,
                                               build.compiler_version))
    print("Arrow C++ compiler flags: {0}".format(build.compiler_flags))
    print("Arrow C++ git revision: {0}".format(build.git_id))
    print("Arrow C++ git description: {0}".format(build.git_description))
+
+
+from pyarrow.lib import (null, bool_,
+ int8, int16, int32, int64,
+ uint8, uint16, uint32, uint64,
+ time32, time64, timestamp, date32, date64, duration,
+ month_day_nano_interval,
+ float16, float32, float64,
+ binary, string, utf8,
+ large_binary, large_string, large_utf8,
+ decimal128, decimal256,
+ list_, large_list, map_, struct,
+ union, sparse_union, dense_union,
+ dictionary,
+ field,
+ type_for_alias,
+ DataType, DictionaryType, StructType,
+ ListType, LargeListType, MapType, FixedSizeListType,
+ UnionType, SparseUnionType, DenseUnionType,
+ TimestampType, Time32Type, Time64Type, DurationType,
+ FixedSizeBinaryType, Decimal128Type, Decimal256Type,
+ BaseExtensionType, ExtensionType,
+ PyExtensionType, UnknownExtensionType,
+ register_extension_type, unregister_extension_type,
+ DictionaryMemo,
+ KeyValueMetadata,
+ Field,
+ Schema,
+ schema,
+ unify_schemas,
+ Array, Tensor,
+ array, chunked_array, record_batch, nulls, repeat,
+ SparseCOOTensor, SparseCSRMatrix, SparseCSCMatrix,
+ SparseCSFTensor,
+ infer_type, from_numpy_dtype,
+ NullArray,
+ NumericArray, IntegerArray, FloatingPointArray,
+ BooleanArray,
+ Int8Array, UInt8Array,
+ Int16Array, UInt16Array,
+ Int32Array, UInt32Array,
+ Int64Array, UInt64Array,
+ ListArray, LargeListArray, MapArray,
+ FixedSizeListArray, UnionArray,
+ BinaryArray, StringArray,
+ LargeBinaryArray, LargeStringArray,
+ FixedSizeBinaryArray,
+ DictionaryArray,
+ Date32Array, Date64Array, TimestampArray,
+ Time32Array, Time64Array, DurationArray,
+ MonthDayNanoIntervalArray,
+ Decimal128Array, Decimal256Array, StructArray, ExtensionArray,
+ scalar, NA, _NULL as NULL, Scalar,
+ NullScalar, BooleanScalar,
+ Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
+ UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar,
+ HalfFloatScalar, FloatScalar, DoubleScalar,
+ Decimal128Scalar, Decimal256Scalar,
+ ListScalar, LargeListScalar, FixedSizeListScalar,
+ Date32Scalar, Date64Scalar,
+ Time32Scalar, Time64Scalar,
+ TimestampScalar, DurationScalar,
+ MonthDayNanoIntervalScalar,
+ BinaryScalar, LargeBinaryScalar,
+ StringScalar, LargeStringScalar,
+ FixedSizeBinaryScalar, DictionaryScalar,
+ MapScalar, StructScalar, UnionScalar,
+ ExtensionScalar)
+
+# Buffers, allocation
+from pyarrow.lib import (Buffer, ResizableBuffer, foreign_buffer, py_buffer,
+ Codec, compress, decompress, allocate_buffer)
+
+from pyarrow.lib import (MemoryPool, LoggingMemoryPool, ProxyMemoryPool,
+ total_allocated_bytes, set_memory_pool,
+ default_memory_pool, system_memory_pool,
+ jemalloc_memory_pool, mimalloc_memory_pool,
+ logging_memory_pool, proxy_memory_pool,
+ log_memory_allocations, jemalloc_set_decay_ms)
+
+# I/O
+from pyarrow.lib import (NativeFile, PythonFile,
+ BufferedInputStream, BufferedOutputStream,
+ CompressedInputStream, CompressedOutputStream,
+ TransformInputStream, transcoding_input_stream,
+ FixedSizeBufferWriter,
+ BufferReader, BufferOutputStream,
+ OSFile, MemoryMappedFile, memory_map,
+ create_memory_map, MockOutputStream,
+ input_stream, output_stream)
+
+from pyarrow._hdfsio import HdfsFile, have_libhdfs
+
+from pyarrow.lib import (ChunkedArray, RecordBatch, Table, table,
+ concat_arrays, concat_tables)
+
+# Exceptions
+from pyarrow.lib import (ArrowCancelled,
+ ArrowCapacityError,
+ ArrowException,
+ ArrowKeyError,
+ ArrowIndexError,
+ ArrowInvalid,
+ ArrowIOError,
+ ArrowMemoryError,
+ ArrowNotImplementedError,
+ ArrowTypeError,
+ ArrowSerializationError)
+
+# Serialization
+from pyarrow.lib import (deserialize_from, deserialize,
+ deserialize_components,
+ serialize, serialize_to, read_serialized,
+ SerializationCallbackError,
+ DeserializationCallbackError)
+
+import pyarrow.hdfs as hdfs
+
+from pyarrow.ipc import serialize_pandas, deserialize_pandas
+import pyarrow.ipc as ipc
+
+from pyarrow.serialization import (default_serialization_context,
+ register_default_serialization_handlers,
+ register_torch_serialization_handlers)
+
+import pyarrow.types as types
+
+
+# deprecated top-level access
+
+
+from pyarrow.filesystem import FileSystem as _FileSystem
+from pyarrow.filesystem import LocalFileSystem as _LocalFileSystem
+from pyarrow.hdfs import HadoopFileSystem as _HadoopFileSystem
+
+from pyarrow.lib import SerializationContext as _SerializationContext
+from pyarrow.lib import SerializedPyObject as _SerializedPyObject
+
+
+_localfs = _LocalFileSystem._get_instance()
+
+
_msg = (
    "pyarrow.{0} is deprecated as of 2.0.0, please use pyarrow.fs.{1} instead."
)

_serialization_msg = (
    "'pyarrow.{0}' is deprecated and will be removed in a future version. "
    "Use pickle or the pyarrow IPC functionality instead."
)

# Deprecated top-level filesystem names mapped to
# (object to return, replacement name in pyarrow.fs).
_deprecated = {
    "localfs": (_localfs, "LocalFileSystem"),
    "FileSystem": (_FileSystem, "FileSystem"),
    "LocalFileSystem": (_LocalFileSystem, "LocalFileSystem"),
    "HadoopFileSystem": (_HadoopFileSystem, "HadoopFileSystem"),
}

# Deprecated serialization classes still reachable as module attributes.
# (Renamed from the misspelled "_serialization_deprecatd".)
_serialization_deprecated = {
    "SerializationContext": _SerializationContext,
    "SerializedPyObject": _SerializedPyObject,
}

# Backwards-compatibility alias for the old misspelled name, in case any
# external code poked at this private mapping.
_serialization_deprecatd = _serialization_deprecated

if _sys.version_info >= (3, 7):
    # PEP 562 module-level __getattr__: emit a FutureWarning on access to a
    # deprecated name, then return the underlying object.
    def __getattr__(name):
        if name in _deprecated:
            obj, new_name = _deprecated[name]
            _warnings.warn(_msg.format(name, new_name),
                           FutureWarning, stacklevel=2)
            return obj
        elif name in _serialization_deprecated:
            _warnings.warn(_serialization_msg.format(name),
                           FutureWarning, stacklevel=2)
            return _serialization_deprecated[name]

        raise AttributeError(
            "module 'pyarrow' has no attribute '{0}'".format(name)
        )
else:
    # Python < 3.7 has no module __getattr__ (PEP 562), so expose the
    # deprecated names directly (without a deprecation warning).
    localfs = _localfs
    FileSystem = _FileSystem
    LocalFileSystem = _LocalFileSystem
    HadoopFileSystem = _HadoopFileSystem
    SerializationContext = _SerializationContext
    SerializedPyObject = _SerializedPyObject
+
+
+# Entry point for starting the plasma store
+
+
def _plasma_store_entry_point():
    """Entry point for starting the plasma store.

    This can be used by invoking e.g.
    ``plasma_store -s /tmp/plasma -m 1000000000``
    from the command line and will start the plasma_store executable with the
    given arguments.
    """
    import pyarrow
    # The bundled server binary lives inside the installed pyarrow package.
    executable_path = _os.path.join(
        pyarrow.__path__[0], "plasma-store-server")
    # Replace the current process image, forwarding all CLI arguments.
    _os.execv(executable_path, _sys.argv)
+
+
+# ----------------------------------------------------------------------
+# Deprecations
+
+from pyarrow.util import _deprecate_api, _deprecate_class
+
+read_message = _deprecate_api("read_message", "ipc.read_message",
+ ipc.read_message, "0.17.0")
+
+read_record_batch = _deprecate_api("read_record_batch",
+ "ipc.read_record_batch",
+ ipc.read_record_batch, "0.17.0")
+
+read_schema = _deprecate_api("read_schema", "ipc.read_schema",
+ ipc.read_schema, "0.17.0")
+
+read_tensor = _deprecate_api("read_tensor", "ipc.read_tensor",
+ ipc.read_tensor, "0.17.0")
+
+write_tensor = _deprecate_api("write_tensor", "ipc.write_tensor",
+ ipc.write_tensor, "0.17.0")
+
+get_record_batch_size = _deprecate_api("get_record_batch_size",
+ "ipc.get_record_batch_size",
+ ipc.get_record_batch_size, "0.17.0")
+
+get_tensor_size = _deprecate_api("get_tensor_size",
+ "ipc.get_tensor_size",
+ ipc.get_tensor_size, "0.17.0")
+
+open_stream = _deprecate_api("open_stream", "ipc.open_stream",
+ ipc.open_stream, "0.17.0")
+
+open_file = _deprecate_api("open_file", "ipc.open_file", ipc.open_file,
+ "0.17.0")
+
+
def _deprecate_scalar(ty, symbol):
    # Build a deprecated "<Type>Value" alias class for the given Scalar class.
    deprecated_name = "{}Value".format(ty)
    return _deprecate_class(deprecated_name, symbol, "1.0.0")
+
+
+ArrayValue = _deprecate_class("ArrayValue", Scalar, "1.0.0")
+NullType = _deprecate_class("NullType", NullScalar, "1.0.0")
+
+BooleanValue = _deprecate_scalar("Boolean", BooleanScalar)
+Int8Value = _deprecate_scalar("Int8", Int8Scalar)
+Int16Value = _deprecate_scalar("Int16", Int16Scalar)
+Int32Value = _deprecate_scalar("Int32", Int32Scalar)
+Int64Value = _deprecate_scalar("Int64", Int64Scalar)
+UInt8Value = _deprecate_scalar("UInt8", UInt8Scalar)
+UInt16Value = _deprecate_scalar("UInt16", UInt16Scalar)
+UInt32Value = _deprecate_scalar("UInt32", UInt32Scalar)
+UInt64Value = _deprecate_scalar("UInt64", UInt64Scalar)
+HalfFloatValue = _deprecate_scalar("HalfFloat", HalfFloatScalar)
+FloatValue = _deprecate_scalar("Float", FloatScalar)
+DoubleValue = _deprecate_scalar("Double", DoubleScalar)
+ListValue = _deprecate_scalar("List", ListScalar)
+LargeListValue = _deprecate_scalar("LargeList", LargeListScalar)
+MapValue = _deprecate_scalar("Map", MapScalar)
+FixedSizeListValue = _deprecate_scalar("FixedSizeList", FixedSizeListScalar)
+BinaryValue = _deprecate_scalar("Binary", BinaryScalar)
+StringValue = _deprecate_scalar("String", StringScalar)
+LargeBinaryValue = _deprecate_scalar("LargeBinary", LargeBinaryScalar)
+LargeStringValue = _deprecate_scalar("LargeString", LargeStringScalar)
+FixedSizeBinaryValue = _deprecate_scalar("FixedSizeBinary",
+ FixedSizeBinaryScalar)
+Decimal128Value = _deprecate_scalar("Decimal128", Decimal128Scalar)
+Decimal256Value = _deprecate_scalar("Decimal256", Decimal256Scalar)
+UnionValue = _deprecate_scalar("Union", UnionScalar)
+StructValue = _deprecate_scalar("Struct", StructScalar)
+DictionaryValue = _deprecate_scalar("Dictionary", DictionaryScalar)
+Date32Value = _deprecate_scalar("Date32", Date32Scalar)
+Date64Value = _deprecate_scalar("Date64", Date64Scalar)
+Time32Value = _deprecate_scalar("Time32", Time32Scalar)
+Time64Value = _deprecate_scalar("Time64", Time64Scalar)
+TimestampValue = _deprecate_scalar("Timestamp", TimestampScalar)
+DurationValue = _deprecate_scalar("Duration", DurationScalar)
+
+
+# TODO: Deprecate these somehow in the pyarrow namespace
+from pyarrow.ipc import (Message, MessageReader, MetadataVersion,
+ RecordBatchFileReader, RecordBatchFileWriter,
+ RecordBatchStreamReader, RecordBatchStreamWriter)
+
+# ----------------------------------------------------------------------
+# Returning absolute path to the pyarrow include directory (if bundled, e.g. in
+# wheels)
+
+
def get_include():
    """
    Return absolute path to directory containing Arrow C++ include
    headers. Similar to numpy.get_include
    """
    # The headers are shipped in an "include" directory inside the package.
    package_dir = _os.path.dirname(__file__)
    return _os.path.join(package_dir, 'include')
+
+
def _get_pkg_config_executable():
    # Honor the PKG_CONFIG environment variable, falling back to the
    # conventional executable name.
    executable = _os.environ.get('PKG_CONFIG')
    return executable if executable is not None else 'pkg-config'
+
+
def _has_pkg_config(pkgname):
    """Return True if pkg-config reports *pkgname* as installed."""
    import subprocess
    cmd = [_get_pkg_config_executable(), '--exists', pkgname]
    try:
        returncode = subprocess.call(cmd)
    except FileNotFoundError:
        # pkg-config itself is not installed
        return False
    return returncode == 0
+
+
def _read_pkg_config_variable(pkgname, cli_args):
    """Return the stripped stdout of ``pkg-config <pkgname> <cli_args>``.

    Raises RuntimeError (with pkg-config's stderr) on a non-zero exit.
    """
    import subprocess
    cmd = [_get_pkg_config_executable(), pkgname]
    cmd.extend(cli_args)
    completed = subprocess.run(cmd, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    if completed.returncode != 0:
        raise RuntimeError("pkg-config failed: " +
                           completed.stderr.decode('utf8'))
    return completed.stdout.rstrip().decode('utf8')
+
+
def get_libraries():
    """
    Return list of library names to include in the `libraries` argument for C
    or Cython extensions using pyarrow
    """
    # A fresh list is returned on each call so callers may mutate it freely.
    libraries = ['arrow', 'arrow_python']
    return libraries
+
+
def create_library_symlinks():
    """
    With Linux and macOS wheels, the bundled shared libraries have an embedded
    ABI version like libarrow.so.17 or libarrow.17.dylib and so linking to them
    with -larrow won't work unless we create symlinks at locations like
    site-packages/pyarrow/libarrow.so. This unfortunate workaround addresses
    prior problems we had with shipping two copies of the shared libraries to
    permit third party projects like turbodbc to build their C++ extensions
    against the pyarrow wheels.

    This function must only be invoked once and only when the shared libraries
    are bundled with the Python package, which should only apply to wheel-based
    installs. It requires write access to the site-packages/pyarrow directory
    and so depending on your system may need to be run with root.
    """
    import glob
    if _sys.platform == 'win32':
        # Windows links against import libraries (arrow.lib); no symlinks
        # are needed there.
        return
    package_cwd = _os.path.dirname(__file__)

    if _sys.platform == 'linux':
        bundled_libs = glob.glob(_os.path.join(package_cwd, '*.so.*'))

        def get_symlink_path(hard_path):
            # e.g. libarrow.so.17 -> libarrow.so
            return hard_path.rsplit('.', 1)[0]
    else:
        bundled_libs = glob.glob(_os.path.join(package_cwd, '*.*.dylib'))

        def get_symlink_path(hard_path):
            # e.g. libarrow.17.dylib -> libarrow.dylib
            return '.'.join((hard_path.rsplit('.', 2)[0], 'dylib'))

    for lib_hard_path in bundled_libs:
        symlink_path = get_symlink_path(lib_hard_path)
        if _os.path.exists(symlink_path):
            continue
        try:
            _os.symlink(lib_hard_path, symlink_path)
        except PermissionError:
            # Bug fix: the "{}" placeholder was previously never substituted
            # (the .format() call was missing), so a literal "{}" was printed.
            print("Tried creating symlink {}. If you need to link to "
                  "bundled shared libraries, run "
                  "pyarrow.create_library_symlinks() as root"
                  .format(symlink_path))
+
+
def get_library_dirs():
    """
    Return lists of directories likely to contain Arrow C++ libraries for
    linking C or Cython extensions using pyarrow
    """
    package_cwd = _os.path.dirname(__file__)
    library_dirs = [package_cwd]

    def append_library_dir(library_dir):
        # Preserve insertion order while avoiding duplicate entries.
        if library_dir not in library_dirs:
            library_dirs.append(library_dir)

    # Search library paths via pkg-config. This is necessary if the user
    # installed libarrow and the other shared libraries manually and they
    # are not shipped inside the pyarrow package (see also ARROW-2976).
    # NOTE: the pkg-config executable lookup is handled inside
    # _has_pkg_config/_read_pkg_config_variable via
    # _get_pkg_config_executable(); a duplicate, never-used lookup of the
    # PKG_CONFIG environment variable was removed here.
    for pkgname in ["arrow", "arrow_python"]:
        if _has_pkg_config(pkgname):
            library_dir = _read_pkg_config_variable(pkgname,
                                                    ["--libs-only-L"])
            # pkg-config output could be empty if Arrow is installed
            # as a system package.
            if library_dir:
                if not library_dir.startswith("-L"):
                    raise ValueError(
                        "pkg-config --libs-only-L returned unexpected "
                        "value {!r}".format(library_dir))
                append_library_dir(library_dir[2:])

    if _sys.platform == 'win32':
        # TODO(wesm): Is this necessary, or does setuptools within a conda
        # installation add Library\lib to the linker path for MSVC?
        python_base_install = _os.path.dirname(_sys.executable)
        library_dir = _os.path.join(python_base_install, 'Library', 'lib')

        if _os.path.exists(_os.path.join(library_dir, 'arrow.lib')):
            append_library_dir(library_dir)

    # ARROW-4074: Allow for ARROW_HOME to be set to some other directory
    if _os.environ.get('ARROW_HOME'):
        append_library_dir(_os.path.join(_os.environ['ARROW_HOME'], 'lib'))
    else:
        # Python wheels bundle the Arrow libraries in the pyarrow directory.
        append_library_dir(_os.path.dirname(_os.path.abspath(__file__)))

    return library_dirs