diff options
Diffstat (limited to 'src/arrow/python/pyarrow/pandas-shim.pxi')
-rw-r--r-- | src/arrow/python/pyarrow/pandas-shim.pxi | 254 |
1 files changed, 254 insertions, 0 deletions
diff --git a/src/arrow/python/pyarrow/pandas-shim.pxi b/src/arrow/python/pyarrow/pandas-shim.pxi new file mode 100644 index 000000000..0e7cfe937 --- /dev/null +++ b/src/arrow/python/pyarrow/pandas-shim.pxi @@ -0,0 +1,254 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pandas lazy-loading API shim that reduces API call and import overhead + +import warnings + + +cdef class _PandasAPIShim(object): + """ + Lazy pandas importer that isolates usages of pandas APIs and avoids + importing pandas until it's actually needed + """ + cdef: + bint _tried_importing_pandas + bint _have_pandas + + cdef readonly: + object _loose_version, _version + object _pd, _types_api, _compat_module + object _data_frame, _index, _series, _categorical_type + object _datetimetz_type, _extension_array, _extension_dtype + object _array_like_types, _is_extension_array_dtype + bint has_sparse + bint _pd024 + + def __init__(self): + self._tried_importing_pandas = False + self._have_pandas = 0 + + cdef _import_pandas(self, bint raise_): + try: + import pandas as pd + import pyarrow.pandas_compat as pdcompat + except ImportError: + self._have_pandas = False + if raise_: + raise + else: + return + + from pyarrow.vendored.version import Version + + self._pd = pd + self._version = pd.__version__ + self._loose_version = Version(pd.__version__) + + if self._loose_version < Version('0.23.0'): + self._have_pandas = False + if raise_: + raise ImportError( + "pyarrow requires pandas 0.23.0 or above, pandas {} is " + "installed".format(self._version) + ) + else: + warnings.warn( + "pyarrow requires pandas 0.23.0 or above, pandas {} is " + "installed. Therefore, pandas-specific integration is not " + "used.".format(self._version), stacklevel=2) + return + + self._compat_module = pdcompat + self._data_frame = pd.DataFrame + self._index = pd.Index + self._categorical_type = pd.Categorical + self._series = pd.Series + self._extension_array = pd.api.extensions.ExtensionArray + self._array_like_types = ( + self._series, self._index, self._categorical_type, + self._extension_array) + self._extension_dtype = pd.api.extensions.ExtensionDtype + if self._loose_version >= Version('0.24.0'): + self._is_extension_array_dtype = \ + pd.api.types.is_extension_array_dtype + else: + self._is_extension_array_dtype = None + + self._types_api = pd.api.types + self._datetimetz_type = pd.api.types.DatetimeTZDtype + self._have_pandas = True + + if self._loose_version > Version('0.25'): + self.has_sparse = False + else: + self.has_sparse = True + + self._pd024 = self._loose_version >= Version('0.24') + + cdef inline _check_import(self, bint raise_=True): + if self._tried_importing_pandas: + if not self._have_pandas and raise_: + self._import_pandas(raise_) + return + + self._tried_importing_pandas = True + self._import_pandas(raise_) + + def series(self, *args, **kwargs): + self._check_import() + return self._series(*args, **kwargs) + + def data_frame(self, *args, **kwargs): + self._check_import() + return self._data_frame(*args, **kwargs) + + cdef inline bint _have_pandas_internal(self): + if not self._tried_importing_pandas: + self._check_import(raise_=False) + return self._have_pandas + + @property + def have_pandas(self): + return self._have_pandas_internal() + + @property + def compat(self): + self._check_import() + return self._compat_module + + @property + def pd(self): + self._check_import() + return self._pd + + cpdef infer_dtype(self, obj): + self._check_import() + try: + return self._types_api.infer_dtype(obj, skipna=False) + except AttributeError: + return self._pd.lib.infer_dtype(obj) + + cpdef pandas_dtype(self, dtype): + self._check_import() + try: + return self._types_api.pandas_dtype(dtype) + except AttributeError: + return None + + @property + def loose_version(self): + self._check_import() + return self._loose_version + + @property + def version(self): + self._check_import() + return self._version + + @property + def categorical_type(self): + self._check_import() + return self._categorical_type + + @property + def datetimetz_type(self): + self._check_import() + return self._datetimetz_type + + @property + def extension_dtype(self): + self._check_import() + return self._extension_dtype + + cpdef is_array_like(self, obj): + self._check_import() + return isinstance(obj, self._array_like_types) + + cpdef is_categorical(self, obj): + if self._have_pandas_internal(): + return isinstance(obj, self._categorical_type) + else: + return False + + cpdef is_datetimetz(self, obj): + if self._have_pandas_internal(): + return isinstance(obj, self._datetimetz_type) + else: + return False + + cpdef is_extension_array_dtype(self, obj): + self._check_import() + if self._is_extension_array_dtype: + return self._is_extension_array_dtype(obj) + else: + return False + + cpdef is_sparse(self, obj): + if self._have_pandas_internal(): + return self._types_api.is_sparse(obj) + else: + return False + + cpdef is_data_frame(self, obj): + if self._have_pandas_internal(): + return isinstance(obj, self._data_frame) + else: + return False + + cpdef is_series(self, obj): + if self._have_pandas_internal(): + return isinstance(obj, self._series) + else: + return False + + cpdef is_index(self, obj): + if self._have_pandas_internal(): + return isinstance(obj, self._index) + else: + return False + + cpdef get_values(self, obj): + """ + Get the underlying array values of a pandas Series or Index in the + format (np.ndarray or pandas ExtensionArray) as we need them. + + Assumes obj is a pandas Series or Index. + """ + self._check_import() + if isinstance(obj.dtype, (self.pd.api.types.IntervalDtype, + self.pd.api.types.PeriodDtype)): + if self._pd024: + # only since pandas 0.24, interval and period are stored as + # such in Series + return obj.array + return obj.values + + def assert_frame_equal(self, *args, **kwargs): + self._check_import() + return self._pd.util.testing.assert_frame_equal + + def get_rangeindex_attribute(self, level, name): + # public start/stop/step attributes added in pandas 0.25.0 + self._check_import() + if hasattr(level, name): + return getattr(level, name) + return getattr(level, '_' + name) + + +cdef _PandasAPIShim pandas_api = _PandasAPIShim() +_pandas_api = pandas_api |