summary refs log tree commit diff stats
path: root/src/arrow/python/pyarrow/pandas-shim.pxi
diff options
context:
space:
mode:
Diffstat (limited to 'src/arrow/python/pyarrow/pandas-shim.pxi')
-rw-r--r-- src/arrow/python/pyarrow/pandas-shim.pxi 254
1 files changed, 254 insertions, 0 deletions
diff --git a/src/arrow/python/pyarrow/pandas-shim.pxi b/src/arrow/python/pyarrow/pandas-shim.pxi
new file mode 100644
index 000000000..0e7cfe937
--- /dev/null
+++ b/src/arrow/python/pyarrow/pandas-shim.pxi
@@ -0,0 +1,254 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# pandas lazy-loading API shim that reduces API call and import overhead
+
+import warnings
+
+
+cdef class _PandasAPIShim(object):
+    """
+    Lazy pandas importer that isolates usages of pandas APIs and avoids
+    importing pandas until it's actually needed.
+
+    pandas is imported the first time one of the accessors below is used.
+    Accessors that call ``_check_import()`` with its default
+    ``raise_=True`` raise ImportError when pandas cannot be imported or is
+    older than 0.23.0. The ``have_pandas`` property and the predicates
+    built on ``_have_pandas_internal()`` report False instead of raising.
+    """
+    cdef:
+        # Set once the first import attempt has been made
+        bint _tried_importing_pandas
+        # Set only after a supported pandas version was imported
+        bint _have_pandas
+
+    cdef readonly:
+        object _loose_version, _version
+        object _pd, _types_api, _compat_module
+        object _data_frame, _index, _series, _categorical_type
+        object _datetimetz_type, _extension_array, _extension_dtype
+        object _array_like_types, _is_extension_array_dtype
+        bint has_sparse
+        bint _pd024
+
+    def __init__(self):
+        self._tried_importing_pandas = False
+        self._have_pandas = False
+
+    cdef _import_pandas(self, bint raise_):
+        """
+        Import pandas and cache the classes and helpers used by this shim.
+
+        Parameters
+        ----------
+        raise_ : bool
+            If True, re-raise the ImportError when pandas or
+            pyarrow.pandas_compat cannot be imported, and raise ImportError
+            when the installed pandas is older than 0.23.0. If False,
+            return silently on a failed import and emit a warning for a
+            too-old pandas.
+        """
+        try:
+            import pandas as pd
+            import pyarrow.pandas_compat as pdcompat
+        except ImportError:
+            self._have_pandas = False
+            if raise_:
+                raise
+            return
+
+        from pyarrow.vendored.version import Version
+
+        # These three are recorded even when the version check below
+        # rejects the installed pandas.
+        self._pd = pd
+        self._version = pd.__version__
+        self._loose_version = Version(pd.__version__)
+
+        if self._loose_version < Version('0.23.0'):
+            self._have_pandas = False
+            if raise_:
+                raise ImportError(
+                    "pyarrow requires pandas 0.23.0 or above, pandas {} is "
+                    "installed".format(self._version)
+                )
+            warnings.warn(
+                "pyarrow requires pandas 0.23.0 or above, pandas {} is "
+                "installed. Therefore, pandas-specific integration is not "
+                "used.".format(self._version), stacklevel=2)
+            return
+
+        self._compat_module = pdcompat
+        self._data_frame = pd.DataFrame
+        self._index = pd.Index
+        self._categorical_type = pd.Categorical
+        self._series = pd.Series
+        self._extension_array = pd.api.extensions.ExtensionArray
+        # Tuple form so it can be handed directly to isinstance()
+        self._array_like_types = (
+            self._series, self._index, self._categorical_type,
+            self._extension_array)
+        self._extension_dtype = pd.api.extensions.ExtensionDtype
+        if self._loose_version >= Version('0.24.0'):
+            self._is_extension_array_dtype = \
+                pd.api.types.is_extension_array_dtype
+        else:
+            # is_extension_array_dtype() below returns False in this case
+            self._is_extension_array_dtype = None
+
+        self._types_api = pd.api.types
+        self._datetimetz_type = pd.api.types.DatetimeTZDtype
+        self._have_pandas = True
+
+        # Strict comparison: only versions that sort after '0.25' clear
+        # the flag.
+        if self._loose_version > Version('0.25'):
+            self.has_sparse = False
+        else:
+            self.has_sparse = True
+
+        self._pd024 = self._loose_version >= Version('0.24')
+
+    cdef inline _check_import(self, bint raise_=True):
+        """
+        Make sure an import of pandas has been attempted.
+
+        After an earlier failed attempt the import is retried only when
+        ``raise_`` is True, so that the ImportError is raised again for
+        the caller.
+        """
+        if self._tried_importing_pandas:
+            if not self._have_pandas and raise_:
+                self._import_pandas(raise_)
+            return
+
+        self._tried_importing_pandas = True
+        self._import_pandas(raise_)
+
+    def series(self, *args, **kwargs):
+        """Construct a pandas.Series from the given arguments."""
+        self._check_import()
+        return self._series(*args, **kwargs)
+
+    def data_frame(self, *args, **kwargs):
+        """Construct a pandas.DataFrame from the given arguments."""
+        self._check_import()
+        return self._data_frame(*args, **kwargs)
+
+    cdef inline bint _have_pandas_internal(self):
+        # Non-raising availability check: attempts the import at most once
+        if not self._tried_importing_pandas:
+            self._check_import(raise_=False)
+        return self._have_pandas
+
+    @property
+    def have_pandas(self):
+        """Whether a supported pandas could be imported (never raises)."""
+        return self._have_pandas_internal()
+
+    @property
+    def compat(self):
+        """The pyarrow.pandas_compat module."""
+        self._check_import()
+        return self._compat_module
+
+    @property
+    def pd(self):
+        """The imported pandas module."""
+        self._check_import()
+        return self._pd
+
+    cpdef infer_dtype(self, obj):
+        """
+        Infer the dtype of obj with pandas.api.types.infer_dtype
+        (skipna=False), falling back to pandas.lib.infer_dtype if that
+        raises AttributeError.
+        """
+        self._check_import()
+        try:
+            return self._types_api.infer_dtype(obj, skipna=False)
+        except AttributeError:
+            return self._pd.lib.infer_dtype(obj)
+
+    cpdef pandas_dtype(self, dtype):
+        """
+        Convert dtype with pandas.api.types.pandas_dtype; return None if
+        that raises AttributeError.
+        """
+        self._check_import()
+        try:
+            return self._types_api.pandas_dtype(dtype)
+        except AttributeError:
+            return None
+
+    @property
+    def loose_version(self):
+        """The installed pandas version parsed into a Version object."""
+        self._check_import()
+        return self._loose_version
+
+    @property
+    def version(self):
+        """The installed pandas version string (pandas.__version__)."""
+        self._check_import()
+        return self._version
+
+    @property
+    def categorical_type(self):
+        """The pandas.Categorical class."""
+        self._check_import()
+        return self._categorical_type
+
+    @property
+    def datetimetz_type(self):
+        """The pandas.api.types.DatetimeTZDtype class."""
+        self._check_import()
+        return self._datetimetz_type
+
+    @property
+    def extension_dtype(self):
+        """The pandas.api.extensions.ExtensionDtype class."""
+        self._check_import()
+        return self._extension_dtype
+
+    cpdef is_array_like(self, obj):
+        """
+        Whether obj is a pandas Series, Index, Categorical or
+        ExtensionArray. Unlike most other is_* checks, this goes through
+        the raising import check.
+        """
+        self._check_import()
+        return isinstance(obj, self._array_like_types)
+
+    cpdef is_categorical(self, obj):
+        """Whether obj is a pandas.Categorical (False without pandas)."""
+        if self._have_pandas_internal():
+            return isinstance(obj, self._categorical_type)
+        else:
+            return False
+
+    cpdef is_datetimetz(self, obj):
+        """Whether obj is a DatetimeTZDtype (False without pandas)."""
+        if self._have_pandas_internal():
+            return isinstance(obj, self._datetimetz_type)
+        else:
+            return False
+
+    cpdef is_extension_array_dtype(self, obj):
+        """
+        Result of pandas.api.types.is_extension_array_dtype(obj), or False
+        when pandas is older than 0.24.0. Goes through the raising import
+        check.
+        """
+        self._check_import()
+        if self._is_extension_array_dtype:
+            return self._is_extension_array_dtype(obj)
+        else:
+            return False
+
+    cpdef is_sparse(self, obj):
+        """Result of pandas.api.types.is_sparse(obj) (False without
+        pandas)."""
+        if self._have_pandas_internal():
+            return self._types_api.is_sparse(obj)
+        else:
+            return False
+
+    cpdef is_data_frame(self, obj):
+        """Whether obj is a pandas.DataFrame (False without pandas)."""
+        if self._have_pandas_internal():
+            return isinstance(obj, self._data_frame)
+        else:
+            return False
+
+    cpdef is_series(self, obj):
+        """Whether obj is a pandas.Series (False without pandas)."""
+        if self._have_pandas_internal():
+            return isinstance(obj, self._series)
+        else:
+            return False
+
+    cpdef is_index(self, obj):
+        """Whether obj is a pandas.Index (False without pandas)."""
+        if self._have_pandas_internal():
+            return isinstance(obj, self._index)
+        else:
+            return False
+
+    cpdef get_values(self, obj):
+        """
+        Get the underlying array values of a pandas Series or Index in the
+        format (np.ndarray or pandas ExtensionArray) as we need them.
+
+        Assumes obj is a pandas Series or Index.
+        """
+        self._check_import()
+        # _types_api is the cached pd.api.types module; using it avoids a
+        # second import check through the ``pd`` property.
+        if isinstance(obj.dtype, (self._types_api.IntervalDtype,
+                                  self._types_api.PeriodDtype)):
+            if self._pd024:
+                # only since pandas 0.24, interval and period are stored as
+                # such in Series
+                return obj.array
+        return obj.values
+
+    def assert_frame_equal(self, *args, **kwargs):
+        """
+        Return pandas' ``assert_frame_equal`` function.
+
+        The function object itself is returned; ``*args`` and ``**kwargs``
+        are accepted but not forwarded, so the caller has to invoke the
+        returned function.
+        """
+        self._check_import()
+        # pandas.testing is the public location. pandas.util.testing was
+        # deprecated in pandas 1.0 and removed in 2.0, so it is only kept
+        # as a fallback.
+        try:
+            return self._pd.testing.assert_frame_equal
+        except AttributeError:
+            return self._pd.util.testing.assert_frame_equal
+
+    def get_rangeindex_attribute(self, level, name):
+        """
+        Return attribute ``name`` of ``level``, falling back to the
+        underscore-prefixed attribute when the public one is missing.
+        """
+        # public start/stop/step attributes added in pandas 0.25.0
+        self._check_import()
+        if hasattr(level, name):
+            return getattr(level, name)
+        return getattr(level, '_' + name)
+
+
+# Single module-level shim instance, declared with its C-level type.
+cdef _PandasAPIShim pandas_api = _PandasAPIShim()
+# Ordinary Python name bound to the same instance (presumably so that
+# Python-level code can reach it -- confirm against importers).
+_pandas_api = pandas_api