diff --git a/src/arrow/python/pyarrow/compute.py b/src/arrow/python/pyarrow/compute.py
new file mode 100644
index 000000000..6e3bd7fca
--- /dev/null
+++ b/src/arrow/python/pyarrow/compute.py
@@ -0,0 +1,759 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from pyarrow._compute import ( # noqa
+ Function,
+ FunctionOptions,
+ FunctionRegistry,
+ HashAggregateFunction,
+ HashAggregateKernel,
+ Kernel,
+ ScalarAggregateFunction,
+ ScalarAggregateKernel,
+ ScalarFunction,
+ ScalarKernel,
+ VectorFunction,
+ VectorKernel,
+ # Option classes
+ ArraySortOptions,
+ AssumeTimezoneOptions,
+ CastOptions,
+ CountOptions,
+ DayOfWeekOptions,
+ DictionaryEncodeOptions,
+ ElementWiseAggregateOptions,
+ ExtractRegexOptions,
+ FilterOptions,
+ IndexOptions,
+ JoinOptions,
+ MakeStructOptions,
+ MatchSubstringOptions,
+ ModeOptions,
+ NullOptions,
+ PadOptions,
+ PartitionNthOptions,
+ QuantileOptions,
+ ReplaceSliceOptions,
+ ReplaceSubstringOptions,
+ RoundOptions,
+ RoundToMultipleOptions,
+ ScalarAggregateOptions,
+ SelectKOptions,
+ SetLookupOptions,
+ SliceOptions,
+ SortOptions,
+ SplitOptions,
+ SplitPatternOptions,
+ StrftimeOptions,
+ StrptimeOptions,
+ TakeOptions,
+ TDigestOptions,
+ TrimOptions,
+ VarianceOptions,
+ WeekOptions,
+ # Functions
+ call_function,
+ function_registry,
+ get_function,
+ list_functions,
+)
+
+import inspect
+from textwrap import dedent
+import warnings
+
+import pyarrow as pa
+
+
+def _get_arg_names(func):
+ return func._doc.arg_names
+
+
+def _decorate_compute_function(wrapper, exposed_name, func, option_class):
+ # Decorate the given compute function wrapper with useful metadata
+ # and documentation.
+ wrapper.__arrow_compute_function__ = dict(name=func.name,
+ arity=func.arity)
+ wrapper.__name__ = exposed_name
+ wrapper.__qualname__ = exposed_name
+
+ doc_pieces = []
+
+ cpp_doc = func._doc
+ summary = cpp_doc.summary
+ if not summary:
+ arg_str = "arguments" if func.arity > 1 else "argument"
+ summary = ("Call compute function {!r} with the given {}"
+ .format(func.name, arg_str))
+
+ description = cpp_doc.description
+ arg_names = _get_arg_names(func)
+
+ doc_pieces.append("""\
+ {}.
+
+ """.format(summary))
+
+ if description:
+ doc_pieces.append("{}\n\n".format(description))
+
+ doc_pieces.append("""\
+ Parameters
+ ----------
+ """)
+
+ for arg_name in arg_names:
+ if func.kind in ('vector', 'scalar_aggregate'):
+ arg_type = 'Array-like'
+ else:
+ arg_type = 'Array-like or scalar-like'
+ doc_pieces.append("""\
+ {} : {}
+ Argument to compute function
+ """.format(arg_name, arg_type))
+
+ doc_pieces.append("""\
+ memory_pool : pyarrow.MemoryPool, optional
+ If not passed, will allocate memory from the default memory pool.
+ """)
+ if option_class is not None:
+ doc_pieces.append("""\
+ options : pyarrow.compute.{0}, optional
+ Parameters altering compute function semantics.
+ """.format(option_class.__name__))
+ options_sig = inspect.signature(option_class)
+ for p in options_sig.parameters.values():
+ doc_pieces.append("""\
+ {0} : optional
+ Parameter for {1} constructor. Either `options`
+ or `{0}` can be passed, but not both at the same time.
+ """.format(p.name, option_class.__name__))
+
+ wrapper.__doc__ = "".join(dedent(s) for s in doc_pieces)
+ return wrapper
+
+
+def _get_options_class(func):
+ class_name = func._doc.options_class
+ if not class_name:
+ return None
+ try:
+ return globals()[class_name]
+ except KeyError:
+ warnings.warn("Python binding for {} not exposed"
+ .format(class_name), RuntimeWarning)
+ return None
+
+
+def _handle_options(name, option_class, options, kwargs):
+ if kwargs:
+ if options is None:
+ return option_class(**kwargs)
+ raise TypeError(
+ "Function {!r} called with both an 'options' argument "
+ "and additional named arguments"
+ .format(name))
+
+ if options is not None:
+ if isinstance(options, dict):
+ return option_class(**options)
+ elif isinstance(options, option_class):
+ return options
+ raise TypeError(
+ "Function {!r} expected a {} parameter, got {}"
+ .format(name, option_class, type(options)))
+
+ return options
+
+
+def _make_generic_wrapper(func_name, func, option_class):
+ if option_class is None:
+ def wrapper(*args, memory_pool=None):
+ return func.call(args, None, memory_pool)
+ else:
+ def wrapper(*args, memory_pool=None, options=None, **kwargs):
+ options = _handle_options(func_name, option_class, options,
+ kwargs)
+ return func.call(args, options, memory_pool)
+ return wrapper
+
+
+def _make_signature(arg_names, var_arg_names, option_class):
+ from inspect import Parameter
+ params = []
+ for name in arg_names:
+ params.append(Parameter(name, Parameter.POSITIONAL_OR_KEYWORD))
+ for name in var_arg_names:
+ params.append(Parameter(name, Parameter.VAR_POSITIONAL))
+ params.append(Parameter("memory_pool", Parameter.KEYWORD_ONLY,
+ default=None))
+ if option_class is not None:
+ params.append(Parameter("options", Parameter.KEYWORD_ONLY,
+ default=None))
+ options_sig = inspect.signature(option_class)
+ for p in options_sig.parameters.values():
+ # XXX for now, our generic wrappers don't allow positional
+ # option arguments
+ params.append(p.replace(kind=Parameter.KEYWORD_ONLY))
+ return inspect.Signature(params)
+
+
+def _wrap_function(name, func):
+ option_class = _get_options_class(func)
+ arg_names = _get_arg_names(func)
+ has_vararg = arg_names and arg_names[-1].startswith('*')
+ if has_vararg:
+ var_arg_names = [arg_names.pop().lstrip('*')]
+ else:
+ var_arg_names = []
+
+ wrapper = _make_generic_wrapper(name, func, option_class)
+ wrapper.__signature__ = _make_signature(arg_names, var_arg_names,
+ option_class)
+ return _decorate_compute_function(wrapper, name, func, option_class)
+
+
+def _make_global_functions():
+ """
+ Make global functions wrapping each compute function.
+
+    Note that some of the automatically-generated wrappers may be overridden
+    by custom versions below.
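+
+    Examples
+    --------
+    Wrappers for functions that take options accept either an explicit
+    ``options`` object or the equivalent keyword arguments, for example:
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> arr = pa.array([1, 2, 3])
+    >>> pc.sum(arr, options=pc.ScalarAggregateOptions(min_count=3)).as_py()
+    6
+    >>> pc.sum(arr, min_count=3).as_py()
+    6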
+ """
+ g = globals()
+ reg = function_registry()
+
+ # Avoid clashes with Python keywords
+ rewrites = {'and': 'and_',
+ 'or': 'or_'}
+
+ for cpp_name in reg.list_functions():
+ name = rewrites.get(cpp_name, cpp_name)
+ func = reg.get_function(cpp_name)
+ assert name not in g, name
+ g[cpp_name] = g[name] = _wrap_function(name, func)
+
+
+_make_global_functions()
+
+
+def cast(arr, target_type, safe=True):
+ """
+ Cast array values to another data type. Can also be invoked as an array
+ instance method.
+
+ Parameters
+ ----------
+ arr : Array or ChunkedArray
+ target_type : DataType or type string alias
+ Type to cast to
+ safe : bool, default True
+ Check for overflows or other unsafe conversions
+
+ Examples
+ --------
+ >>> from datetime import datetime
+ >>> import pyarrow as pa
+ >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
+ >>> arr.type
+ TimestampType(timestamp[us])
+
+ You can use ``pyarrow.DataType`` objects to specify the target type:
+
+ >>> cast(arr, pa.timestamp('ms'))
+ <pyarrow.lib.TimestampArray object at 0x7fe93c0f6910>
+ [
+ 2010-01-01 00:00:00.000,
+ 2015-01-01 00:00:00.000
+ ]
+
+ >>> cast(arr, pa.timestamp('ms')).type
+ TimestampType(timestamp[ms])
+
+    Alternatively, you can use the string aliases for these types:
+
+ >>> arr.cast('timestamp[ms]')
+ <pyarrow.lib.TimestampArray object at 0x10420eb88>
+ [
+ 1262304000000,
+ 1420070400000
+ ]
+ >>> arr.cast('timestamp[ms]').type
+ TimestampType(timestamp[ms])
+
+ Returns
+ -------
+ casted : Array
+ """
+ if target_type is None:
+ raise ValueError("Cast target type must not be None")
+ if safe:
+ options = CastOptions.safe(target_type)
+ else:
+ options = CastOptions.unsafe(target_type)
+ return call_function("cast", [arr], options)
+
+
+def count_substring(array, pattern, *, ignore_case=False):
+ """
+ Count the occurrences of substring *pattern* in each value of a
+ string array.
+
+ Parameters
+ ----------
+ array : pyarrow.Array or pyarrow.ChunkedArray
+ pattern : str
+ pattern to search for exact matches
+ ignore_case : bool, default False
+ Ignore case while searching.
+
+ Returns
+ -------
+ result : pyarrow.Array or pyarrow.ChunkedArray
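+
+    Examples
+    --------
+    Count occurrences of ``"ab"`` in each value (results shown as Python
+    lists via ``to_pylist`` for brevity):
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> arr = pa.array(["abcab", "ab", None])
+    >>> pc.count_substring(arr, "ab").to_pylist()
+    [2, 1, None]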
+ """
+ return call_function("count_substring", [array],
+ MatchSubstringOptions(pattern,
+ ignore_case=ignore_case))
+
+
+def count_substring_regex(array, pattern, *, ignore_case=False):
+ """
+ Count the non-overlapping matches of regex *pattern* in each value
+ of a string array.
+
+ Parameters
+ ----------
+ array : pyarrow.Array or pyarrow.ChunkedArray
+ pattern : str
+        regex pattern to search for
+ ignore_case : bool, default False
+ Ignore case while searching.
+
+ Returns
+ -------
+ result : pyarrow.Array or pyarrow.ChunkedArray
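+
+    Examples
+    --------
+    Count non-overlapping runs of digits in each value:
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> arr = pa.array(["a1b22c333", "abc", None])
+    >>> pc.count_substring_regex(arr, "[0-9]+").to_pylist()
+    [3, 0, None]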
+ """
+ return call_function("count_substring_regex", [array],
+ MatchSubstringOptions(pattern,
+ ignore_case=ignore_case))
+
+
+def find_substring(array, pattern, *, ignore_case=False):
+ """
+ Find the index of the first occurrence of substring *pattern* in each
+ value of a string array.
+
+ Parameters
+ ----------
+ array : pyarrow.Array or pyarrow.ChunkedArray
+ pattern : str
+ pattern to search for exact matches
+ ignore_case : bool, default False
+ Ignore case while searching.
+
+ Returns
+ -------
+ result : pyarrow.Array or pyarrow.ChunkedArray
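+
+    Examples
+    --------
+    Find the first occurrence of ``"l"`` (-1 when the substring is absent):
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> arr = pa.array(["hello", "ox", None])
+    >>> pc.find_substring(arr, "l").to_pylist()
+    [2, -1, None]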
+ """
+ return call_function("find_substring", [array],
+ MatchSubstringOptions(pattern,
+ ignore_case=ignore_case))
+
+
+def find_substring_regex(array, pattern, *, ignore_case=False):
+ """
+ Find the index of the first match of regex *pattern* in each
+ value of a string array.
+
+ Parameters
+ ----------
+ array : pyarrow.Array or pyarrow.ChunkedArray
+ pattern : str
+ regex pattern to search for
+ ignore_case : bool, default False
+ Ignore case while searching.
+
+ Returns
+ -------
+ result : pyarrow.Array or pyarrow.ChunkedArray
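+
+    Examples
+    --------
+    Find the first position of a digit (-1 when there is no match):
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> arr = pa.array(["ab1", "cde", None])
+    >>> pc.find_substring_regex(arr, "[0-9]").to_pylist()
+    [2, -1, None]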
+ """
+ return call_function("find_substring_regex", [array],
+ MatchSubstringOptions(pattern,
+ ignore_case=ignore_case))
+
+
+def match_like(array, pattern, *, ignore_case=False):
+ """
+ Test if the SQL-style LIKE pattern *pattern* matches a value of a
+ string array.
+
+ Parameters
+ ----------
+ array : pyarrow.Array or pyarrow.ChunkedArray
+ pattern : str
+ SQL-style LIKE pattern. '%' will match any number of
+ characters, '_' will match exactly one character, and all
+ other characters match themselves. To match a literal percent
+ sign or underscore, precede the character with a backslash.
+ ignore_case : bool, default False
+ Ignore case while searching.
+
+ Returns
+ -------
+ result : pyarrow.Array or pyarrow.ChunkedArray
+
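+    Examples
+    --------
+    Match values that start with ``"ar"``:
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> arr = pa.array(["art", "architecture", "engineering", None])
+    >>> pc.match_like(arr, "ar%").to_pylist()
+    [True, True, False, None]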
+ """
+ return call_function("match_like", [array],
+ MatchSubstringOptions(pattern,
+ ignore_case=ignore_case))
+
+
+def match_substring(array, pattern, *, ignore_case=False):
+ """
+ Test if substring *pattern* is contained within a value of a string array.
+
+ Parameters
+ ----------
+ array : pyarrow.Array or pyarrow.ChunkedArray
+ pattern : str
+ pattern to search for exact matches
+ ignore_case : bool, default False
+ Ignore case while searching.
+
+ Returns
+ -------
+ result : pyarrow.Array or pyarrow.ChunkedArray
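+
+    Examples
+    --------
+    Test whether each value contains ``"flame"``:
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> arr = pa.array(["inflame", "flaming", "flan", None])
+    >>> pc.match_substring(arr, "flame").to_pylist()
+    [True, False, False, None]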
+ """
+ return call_function("match_substring", [array],
+ MatchSubstringOptions(pattern,
+ ignore_case=ignore_case))
+
+
+def match_substring_regex(array, pattern, *, ignore_case=False):
+ """
+    Test if regex *pattern* matches at any position within each value of a
+    string array.
+
+ Parameters
+ ----------
+ array : pyarrow.Array or pyarrow.ChunkedArray
+ pattern : str
+        regex pattern to search for
+ ignore_case : bool, default False
+ Ignore case while searching.
+
+ Returns
+ -------
+ result : pyarrow.Array or pyarrow.ChunkedArray
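+
+    Examples
+    --------
+    Test whether each value starts with ``"c"`` or ``"d"``:
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> arr = pa.array(["cat", "dog", "flamingo", None])
+    >>> pc.match_substring_regex(arr, "^[cd]").to_pylist()
+    [True, True, False, None]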
+ """
+ return call_function("match_substring_regex", [array],
+ MatchSubstringOptions(pattern,
+ ignore_case=ignore_case))
+
+
+def mode(array, n=1, *, skip_nulls=True, min_count=0):
+ """
+    Return the top-n most common values and the number of times they occur
+    in a numerical (chunked) array, in descending order of occurrence. If
+    there are multiple values with the same count, the smaller one is
+    returned first.
+
+ Parameters
+ ----------
+ array : pyarrow.Array or pyarrow.ChunkedArray
+ n : int, default 1
+ Specify the top-n values.
+ skip_nulls : bool, default True
+ If True, ignore nulls in the input. Else return an empty array
+ if any input is null.
+ min_count : int, default 0
+ If there are fewer than this many values in the input, return
+ an empty array.
+
+ Returns
+ -------
+    An array of structs with fields "mode" (same type as the input) and
+    "count" (int64)
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> import pyarrow.compute as pc
+ >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2])
+ >>> modes = pc.mode(arr, 2)
+ >>> modes[0]
+ <pyarrow.StructScalar: {'mode': 2, 'count': 5}>
+ >>> modes[1]
+ <pyarrow.StructScalar: {'mode': 1, 'count': 2}>
+ """
+ options = ModeOptions(n, skip_nulls=skip_nulls, min_count=min_count)
+ return call_function("mode", [array], options)
+
+
+def filter(data, mask, null_selection_behavior='drop'):
+ """
+    Select values (or records) from array- or table-like data given a
+    boolean filter, where true values are selected.
+
+ Parameters
+ ----------
+ data : Array, ChunkedArray, RecordBatch, or Table
+ mask : Array, ChunkedArray
+ Must be of boolean type
+ null_selection_behavior : str, default 'drop'
+ Configure the behavior on encountering a null slot in the mask.
+ Allowed values are 'drop' and 'emit_null'.
+
+ - 'drop': nulls will be treated as equivalent to False.
+ - 'emit_null': nulls will result in a null in the output.
+
+ Returns
+ -------
+ result : depends on inputs
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> arr = pa.array(["a", "b", "c", None, "e"])
+ >>> mask = pa.array([True, False, None, False, True])
+ >>> arr.filter(mask)
+ <pyarrow.lib.StringArray object at 0x7fa826df9200>
+ [
+ "a",
+ "e"
+ ]
+ >>> arr.filter(mask, null_selection_behavior='emit_null')
+ <pyarrow.lib.StringArray object at 0x7fa826df9200>
+ [
+ "a",
+ null,
+ "e"
+ ]
+ """
+ options = FilterOptions(null_selection_behavior)
+ return call_function('filter', [data, mask], options)
+
+
+def index(data, value, start=None, end=None, *, memory_pool=None):
+ """
+ Find the index of the first occurrence of a given value.
+
+ Parameters
+ ----------
+ data : Array or ChunkedArray
+ value : Scalar-like object
+ start : int, optional
+ end : int, optional
+ memory_pool : MemoryPool, optional
+ If not passed, will allocate memory from the default memory pool.
+
+ Returns
+ -------
+    index : Int64Scalar
+        The index of the first occurrence of `value` in `data`, or -1 if
+        not found.
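+
+    Examples
+    --------
+    For example:
+
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> arr = pa.array(["a", "b", "c", "b"])
+    >>> pc.index(arr, "b").as_py()
+    1
+    >>> pc.index(arr, "b", start=2).as_py()
+    3
+    >>> pc.index(arr, "z").as_py()
+    -1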
+ """
+ if start is not None:
+ if end is not None:
+ data = data.slice(start, end - start)
+ else:
+ data = data.slice(start)
+ elif end is not None:
+ data = data.slice(0, end)
+
+ if not isinstance(value, pa.Scalar):
+ value = pa.scalar(value, type=data.type)
+ elif data.type != value.type:
+ value = pa.scalar(value.as_py(), type=data.type)
+ options = IndexOptions(value=value)
+ result = call_function('index', [data], options, memory_pool)
+ if start is not None and result.as_py() >= 0:
+ result = pa.scalar(result.as_py() + start, type=pa.int64())
+ return result
+
+
+def take(data, indices, *, boundscheck=True, memory_pool=None):
+ """
+ Select values (or records) from array- or table-like data given integer
+ selection indices.
+
+ The result will be of the same type(s) as the input, with elements taken
+ from the input array (or record batch / table fields) at the given
+ indices. If an index is null then the corresponding value in the output
+ will be null.
+
+ Parameters
+ ----------
+ data : Array, ChunkedArray, RecordBatch, or Table
+ indices : Array, ChunkedArray
+ Must be of integer type
+    boundscheck : bool, default True
+ Whether to boundscheck the indices. If False and there is an out of
+ bounds index, will likely cause the process to crash.
+ memory_pool : MemoryPool, optional
+ If not passed, will allocate memory from the default memory pool.
+
+ Returns
+ -------
+ result : depends on inputs
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
+ >>> indices = pa.array([0, None, 4, 3])
+ >>> arr.take(indices)
+ <pyarrow.lib.StringArray object at 0x7ffa4fc7d368>
+ [
+ "a",
+ null,
+ "e",
+ null
+ ]
+ """
+ options = TakeOptions(boundscheck=boundscheck)
+ return call_function('take', [data, indices], options, memory_pool)
+
+
+def fill_null(values, fill_value):
+ """
+    Replace each null element in values with fill_value. The fill_value must
+    be the same type as values or able to be implicitly cast to the array's
+    type.
+
+ This is an alias for :func:`coalesce`.
+
+ Parameters
+ ----------
+ values : Array, ChunkedArray, or Scalar-like object
+ Each null element is replaced with the corresponding value
+ from fill_value.
+    fill_value : Array, ChunkedArray, or Scalar-like object
+        If not the same type as `values`, will attempt to cast.
+
+ Returns
+ -------
+ result : depends on inputs
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> arr = pa.array([1, 2, None, 3], type=pa.int8())
+ >>> fill_value = pa.scalar(5, type=pa.int8())
+ >>> arr.fill_null(fill_value)
+    <pyarrow.lib.Int8Array object at 0x7f95437f01a0>
+ [
+ 1,
+ 2,
+ 5,
+ 3
+ ]
+ """
+ if not isinstance(fill_value, (pa.Array, pa.ChunkedArray, pa.Scalar)):
+ fill_value = pa.scalar(fill_value, type=values.type)
+ elif values.type != fill_value.type:
+ fill_value = pa.scalar(fill_value.as_py(), type=values.type)
+
+ return call_function("coalesce", [values, fill_value])
+
+
+def top_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
+ """
+ Select the indices of the top-k ordered elements from array- or table-like
+ data.
+
+    This is a specialization of :func:`select_k_unstable`. Output is not
+    guaranteed to be stable.
+
+ Parameters
+ ----------
+ values : Array, ChunkedArray, RecordBatch, or Table
+ Data to sort and get top indices from.
+    k : int
+        The number of elements to keep (the `k` in top-k).
+ sort_keys : List-like
+ Column key names to order by when input is table-like data.
+ memory_pool : MemoryPool, optional
+ If not passed, will allocate memory from the default memory pool.
+
+ Returns
+ -------
+ result : Array of indices
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> import pyarrow.compute as pc
+ >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
+ >>> pc.top_k_unstable(arr, k=3)
+ <pyarrow.lib.UInt64Array object at 0x7fdcb19d7f30>
+ [
+ 5,
+ 4,
+ 2
+ ]
+ """
+ if sort_keys is None:
+ sort_keys = []
+ if isinstance(values, (pa.Array, pa.ChunkedArray)):
+ sort_keys.append(("dummy", "descending"))
+ else:
+ sort_keys = map(lambda key_name: (key_name, "descending"), sort_keys)
+ options = SelectKOptions(k, sort_keys)
+ return call_function("select_k_unstable", [values], options, memory_pool)
+
+
+def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
+ """
+ Select the indices of the bottom-k ordered elements from
+ array- or table-like data.
+
+    This is a specialization of :func:`select_k_unstable`. Output is not
+    guaranteed to be stable.
+
+ Parameters
+ ----------
+ values : Array, ChunkedArray, RecordBatch, or Table
+ Data to sort and get bottom indices from.
+    k : int
+        The number of elements to keep (the `k` in bottom-k).
+ sort_keys : List-like
+ Column key names to order by when input is table-like data.
+ memory_pool : MemoryPool, optional
+ If not passed, will allocate memory from the default memory pool.
+
+ Returns
+ -------
+ result : Array of indices
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> import pyarrow.compute as pc
+ >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
+ >>> pc.bottom_k_unstable(arr, k=3)
+ <pyarrow.lib.UInt64Array object at 0x7fdcb19d7fa0>
+ [
+ 0,
+ 1,
+ 2
+ ]
+ """
+ if sort_keys is None:
+ sort_keys = []
+ if isinstance(values, (pa.Array, pa.ChunkedArray)):
+ sort_keys.append(("dummy", "ascending"))
+ else:
+ sort_keys = map(lambda key_name: (key_name, "ascending"), sort_keys)
+ options = SelectKOptions(k, sort_keys)
+ return call_function("select_k_unstable", [values], options, memory_pool)