diff options
Diffstat (limited to 'src/arrow/python/pyarrow/compute.py')
-rw-r--r-- | src/arrow/python/pyarrow/compute.py | 759 |
1 files changed, 759 insertions, 0 deletions
diff --git a/src/arrow/python/pyarrow/compute.py b/src/arrow/python/pyarrow/compute.py new file mode 100644 index 000000000..6e3bd7fca --- /dev/null +++ b/src/arrow/python/pyarrow/compute.py @@ -0,0 +1,759 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 

from pyarrow._compute import (  # noqa
    Function,
    FunctionOptions,
    FunctionRegistry,
    HashAggregateFunction,
    HashAggregateKernel,
    Kernel,
    ScalarAggregateFunction,
    ScalarAggregateKernel,
    ScalarFunction,
    ScalarKernel,
    VectorFunction,
    VectorKernel,
    # Option classes
    ArraySortOptions,
    AssumeTimezoneOptions,
    CastOptions,
    CountOptions,
    DayOfWeekOptions,
    DictionaryEncodeOptions,
    ElementWiseAggregateOptions,
    ExtractRegexOptions,
    FilterOptions,
    IndexOptions,
    JoinOptions,
    MakeStructOptions,
    MatchSubstringOptions,
    ModeOptions,
    NullOptions,
    PadOptions,
    PartitionNthOptions,
    QuantileOptions,
    ReplaceSliceOptions,
    ReplaceSubstringOptions,
    RoundOptions,
    RoundToMultipleOptions,
    ScalarAggregateOptions,
    SelectKOptions,
    SetLookupOptions,
    SliceOptions,
    SortOptions,
    SplitOptions,
    SplitPatternOptions,
    StrftimeOptions,
    StrptimeOptions,
    TakeOptions,
    TDigestOptions,
    TrimOptions,
    VarianceOptions,
    WeekOptions,
    # Functions
    call_function,
    function_registry,
    get_function,
    list_functions,
)

import inspect
from textwrap import dedent
import warnings

import pyarrow as pa


def _get_arg_names(func):
    """
    Return the argument names recorded in the documentation of *func*.
    """
    return func._doc.arg_names


def _decorate_compute_function(wrapper, exposed_name, func, option_class):
    """
    Decorate the given compute function wrapper with useful metadata
    and documentation.

    Parameters
    ----------
    wrapper : callable
        The Python wrapper to decorate (modified in place).
    exposed_name : str
        Name assigned to ``__name__`` and ``__qualname__`` of the wrapper.
    func : Function
        The compute function whose ``_doc``, ``name``, ``arity`` and
        ``kind`` are used to build the metadata and docstring.
    option_class : type or None
        Options class of the function, if any; its constructor
        parameters are listed in the generated docstring.

    Returns
    -------
    wrapper : callable
        The same *wrapper* object that was passed in.
    """
    wrapper.__arrow_compute_function__ = dict(name=func.name,
                                              arity=func.arity)
    wrapper.__name__ = exposed_name
    wrapper.__qualname__ = exposed_name

    doc_pieces = []

    cpp_doc = func._doc
    summary = cpp_doc.summary
    if not summary:
        # No summary available: fall back to a generic one
        arg_str = "arguments" if func.arity > 1 else "argument"
        summary = ("Call compute function {!r} with the given {}"
                   .format(func.name, arg_str))

    description = cpp_doc.description
    arg_names = _get_arg_names(func)

    # NOTE: each piece is dedented individually below, so the indentation
    # inside one piece only needs to be self-consistent.
    doc_pieces.append("""\
        {}.

        """.format(summary))

    if description:
        doc_pieces.append("{}\n\n".format(description))

    doc_pieces.append("""\
        Parameters
        ----------
        """)

    for arg_name in arg_names:
        if func.kind in ('vector', 'scalar_aggregate'):
            arg_type = 'Array-like'
        else:
            arg_type = 'Array-like or scalar-like'
        doc_pieces.append("""\
            {} : {}
                Argument to compute function
            """.format(arg_name, arg_type))

    doc_pieces.append("""\
        memory_pool : pyarrow.MemoryPool, optional
            If not passed, will allocate memory from the default memory pool.
        """)
    if option_class is not None:
        doc_pieces.append("""\
            options : pyarrow.compute.{0}, optional
                Parameters altering compute function semantics.
            """.format(option_class.__name__))
        options_sig = inspect.signature(option_class)
        for p in options_sig.parameters.values():
            doc_pieces.append("""\
            {0} : optional
                Parameter for {1} constructor. Either `options`
                or `{0}` can be passed, but not both at the same time.
            """.format(p.name, option_class.__name__))

    wrapper.__doc__ = "".join(dedent(s) for s in doc_pieces)
    return wrapper


def _get_options_class(func):
    """
    Look up the options class named in the documentation of *func*.

    Returns None if the function declares no options class, or (after
    emitting a RuntimeWarning) if the named class is not available in this
    module's globals.
    """
    class_name = func._doc.options_class
    if not class_name:
        return None
    try:
        return globals()[class_name]
    except KeyError:
        warnings.warn("Python binding for {} not exposed"
                      .format(class_name), RuntimeWarning)
        return None


def _handle_options(name, option_class, options, kwargs):
    """
    Normalize the options passed to a generic compute function wrapper.

    Parameters
    ----------
    name : str
        Function name, used in error messages only.
    option_class : type
        The options class expected by the function.
    options : option_class, dict or None
        The explicit ``options`` argument given by the caller.
    kwargs : dict
        Additional named arguments given by the caller.

    Returns
    -------
    options : option_class or None
        An instance built from *kwargs* or from a dict *options*,
        the *options* instance itself, or None if nothing was passed.

    Raises
    ------
    TypeError
        If both *options* and *kwargs* are given, or if *options* is
        neither a dict nor an instance of *option_class*.
    """
    if kwargs:
        if options is None:
            return option_class(**kwargs)
        raise TypeError(
            "Function {!r} called with both an 'options' argument "
            "and additional named arguments"
            .format(name))

    if options is not None:
        if isinstance(options, dict):
            return option_class(**options)
        elif isinstance(options, option_class):
            return options
        raise TypeError(
            "Function {!r} expected a {} parameter, got {}"
            .format(name, option_class, type(options)))

    return options


def _make_generic_wrapper(func_name, func, option_class):
    """
    Create a Python callable forwarding its arguments to ``func.call``.

    When *option_class* is None the wrapper accepts only positional
    arguments and ``memory_pool``; otherwise it also accepts ``options``
    and named arguments, which are resolved by `_handle_options`.
    """
    if option_class is None:
        def wrapper(*args, memory_pool=None):
            return func.call(args, None, memory_pool)
    else:
        def wrapper(*args, memory_pool=None, options=None, **kwargs):
            options = _handle_options(func_name, option_class, options,
                                      kwargs)
            return func.call(args, options, memory_pool)
    return wrapper


def _make_signature(arg_names, var_arg_names, option_class):
    """
    Build the `inspect.Signature` advertised by a generic wrapper.

    Parameters
    ----------
    arg_names : list of str
        Names exposed as positional-or-keyword parameters.
    var_arg_names : list of str
        Names exposed as var-positional (``*name``) parameters.
    option_class : type or None
        If given, an ``options`` keyword-only parameter is added, followed
        by the constructor parameters of the class as keyword-only ones.
    """
    Parameter = inspect.Parameter
    params = []
    for name in arg_names:
        params.append(Parameter(name, Parameter.POSITIONAL_OR_KEYWORD))
    for name in var_arg_names:
        params.append(Parameter(name, Parameter.VAR_POSITIONAL))
    params.append(Parameter("memory_pool", Parameter.KEYWORD_ONLY,
                            default=None))
    if option_class is not None:
        params.append(Parameter("options", Parameter.KEYWORD_ONLY,
                                default=None))
        options_sig = inspect.signature(option_class)
        for p in options_sig.parameters.values():
            # XXX for now, our generic wrappers don't allow positional
            # option arguments
            params.append(p.replace(kind=Parameter.KEYWORD_ONLY))
    return inspect.Signature(params)


def _wrap_function(name, func):
    """
    Create a documented Python wrapper called *name* for *func*.
    """
    option_class = _get_options_class(func)
    arg_names = _get_arg_names(func)
    # A trailing argument name starting with '*' denotes a varargs function
    has_vararg = arg_names and arg_names[-1].startswith('*')
    if has_vararg:
        var_arg_names = [arg_names.pop().lstrip('*')]
    else:
        var_arg_names = []

    wrapper = _make_generic_wrapper(name, func, option_class)
    wrapper.__signature__ = _make_signature(arg_names, var_arg_names,
                                            option_class)
    return _decorate_compute_function(wrapper, name, func, option_class)


def _make_global_functions():
    """
    Make global functions wrapping each compute function.

    Note that some of the automatically-generated wrappers may be overridden
    by custom versions below.
    """
    g = globals()
    reg = function_registry()

    # Avoid clashes with Python keywords
    rewrites = {'and': 'and_',
                'or': 'or_'}

    for cpp_name in reg.list_functions():
        name = rewrites.get(cpp_name, cpp_name)
        func = reg.get_function(cpp_name)
        assert name not in g, name
        # The wrapper is registered under both the original and the
        # rewritten name.
        g[cpp_name] = g[name] = _wrap_function(name, func)


_make_global_functions()


def cast(arr, target_type, safe=True):
    """
    Cast array values to another data type. Can also be invoked as an array
    instance method.

    Parameters
    ----------
    arr : Array or ChunkedArray
    target_type : DataType or type string alias
        Type to cast to
    safe : bool, default True
        Check for overflows or other unsafe conversions

    Returns
    -------
    casted : Array

    Raises
    ------
    ValueError
        If *target_type* is None.

    Examples
    --------
    >>> from datetime import datetime
    >>> import pyarrow as pa
    >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
    >>> arr.type
    TimestampType(timestamp[us])

    You can use ``pyarrow.DataType`` objects to specify the target type:

    >>> cast(arr, pa.timestamp('ms'))
    <pyarrow.lib.TimestampArray object at 0x7fe93c0f6910>
    [
      2010-01-01 00:00:00.000,
      2015-01-01 00:00:00.000
    ]

    >>> cast(arr, pa.timestamp('ms')).type
    TimestampType(timestamp[ms])

    Alternatively, it is also supported to use the string aliases for these
    types:

    >>> arr.cast('timestamp[ms]')
    <pyarrow.lib.TimestampArray object at 0x10420eb88>
    [
      1262304000000,
      1420070400000
    ]
    >>> arr.cast('timestamp[ms]').type
    TimestampType(timestamp[ms])
    """
    if target_type is None:
        raise ValueError("Cast target type must not be None")
    if safe:
        options = CastOptions.safe(target_type)
    else:
        options = CastOptions.unsafe(target_type)
    return call_function("cast", [arr], options)


def count_substring(array, pattern, *, ignore_case=False):
    """
    Count the occurrences of substring *pattern* in each value of a
    string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        pattern to search for exact matches
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    return call_function("count_substring", [array],
                         MatchSubstringOptions(pattern,
                                               ignore_case=ignore_case))


def count_substring_regex(array, pattern, *, ignore_case=False):
    """
    Count the non-overlapping matches of regex *pattern* in each value
    of a string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        regex pattern to search for
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    return call_function("count_substring_regex", [array],
                         MatchSubstringOptions(pattern,
                                               ignore_case=ignore_case))


def find_substring(array, pattern, *, ignore_case=False):
    """
    Find the index of the first occurrence of substring *pattern* in each
    value of a string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        pattern to search for exact matches
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    return call_function("find_substring", [array],
                         MatchSubstringOptions(pattern,
                                               ignore_case=ignore_case))


def find_substring_regex(array, pattern, *, ignore_case=False):
    """
    Find the index of the first match of regex *pattern* in each
    value of a string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        regex pattern to search for
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    return call_function("find_substring_regex", [array],
                         MatchSubstringOptions(pattern,
                                               ignore_case=ignore_case))


def match_like(array, pattern, *, ignore_case=False):
    """
    Test if the SQL-style LIKE pattern *pattern* matches a value of a
    string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        SQL-style LIKE pattern. '%' will match any number of
        characters, '_' will match exactly one character, and all
        other characters match themselves. To match a literal percent
        sign or underscore, precede the character with a backslash.
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray

    """
    return call_function("match_like", [array],
                         MatchSubstringOptions(pattern,
                                               ignore_case=ignore_case))


def match_substring(array, pattern, *, ignore_case=False):
    """
    Test if substring *pattern* is contained within a value of a string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        pattern to search for exact matches
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    return call_function("match_substring", [array],
                         MatchSubstringOptions(pattern,
                                               ignore_case=ignore_case))


def match_substring_regex(array, pattern, *, ignore_case=False):
    """
    Test if regex *pattern* matches at any position a value of a string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        regex pattern to search
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    return call_function("match_substring_regex", [array],
                         MatchSubstringOptions(pattern,
                                               ignore_case=ignore_case))


def mode(array, n=1, *, skip_nulls=True, min_count=0):
    """
    Return top-n most common values and number of times they occur in a passed
    numerical (chunked) array, in descending order of occurrence. If there are
    multiple values with same count, the smaller one is returned first.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    n : int, default 1
        Specify the top-n values.
    skip_nulls : bool, default True
        If True, ignore nulls in the input. Else return an empty array
        if any input is null.
    min_count : int, default 0
        If there are fewer than this many values in the input, return
        an empty array.

    Returns
    -------
    An array of <input type "Mode", int64_t "Count"> structs

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2])
    >>> modes = pc.mode(arr, 2)
    >>> modes[0]
    <pyarrow.StructScalar: {'mode': 2, 'count': 5}>
    >>> modes[1]
    <pyarrow.StructScalar: {'mode': 1, 'count': 2}>
    """
    options = ModeOptions(n, skip_nulls=skip_nulls, min_count=min_count)
    return call_function("mode", [array], options)


def filter(data, mask, null_selection_behavior='drop'):
    """
    Select values (or records) from array- or table-like data given boolean
    filter, where true values are selected.

    Parameters
    ----------
    data : Array, ChunkedArray, RecordBatch, or Table
    mask : Array, ChunkedArray
        Must be of boolean type
    null_selection_behavior : str, default 'drop'
        Configure the behavior on encountering a null slot in the mask.
        Allowed values are 'drop' and 'emit_null'.

        - 'drop': nulls will be treated as equivalent to False.
        - 'emit_null': nulls will result in a null in the output.

    Returns
    -------
    result : depends on inputs

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array(["a", "b", "c", None, "e"])
    >>> mask = pa.array([True, False, None, False, True])
    >>> arr.filter(mask)
    <pyarrow.lib.StringArray object at 0x7fa826df9200>
    [
      "a",
      "e"
    ]
    >>> arr.filter(mask, null_selection_behavior='emit_null')
    <pyarrow.lib.StringArray object at 0x7fa826df9200>
    [
      "a",
      null,
      "e"
    ]
    """
    options = FilterOptions(null_selection_behavior)
    return call_function('filter', [data, mask], options)


def index(data, value, start=None, end=None, *, memory_pool=None):
    """
    Find the index of the first occurrence of a given value.

    Parameters
    ----------
    data : Array or ChunkedArray
    value : Scalar-like object
        If not a pyarrow.Scalar of the same type as *data*, it is
        converted to one.
    start : int, optional
        If given, the search is restricted to ``data[start:]``.
    end : int, optional
        If given, the search is restricted to ``data[:end]``.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    index : the index, or -1 if not found
    """
    if start is not None:
        if end is not None:
            data = data.slice(start, end - start)
        else:
            data = data.slice(start)
    elif end is not None:
        data = data.slice(0, end)

    if not isinstance(value, pa.Scalar):
        value = pa.scalar(value, type=data.type)
    elif data.type != value.type:
        value = pa.scalar(value.as_py(), type=data.type)
    options = IndexOptions(value=value)
    result = call_function('index', [data], options, memory_pool)
    if start is not None and result.as_py() >= 0:
        # The search ran on a slice: shift the result back so that it is
        # relative to the original data.
        result = pa.scalar(result.as_py() + start, type=pa.int64())
    return result


def take(data, indices, *, boundscheck=True, memory_pool=None):
    """
    Select values (or records) from array- or table-like data given integer
    selection indices.

    The result will be of the same type(s) as the input, with elements taken
    from the input array (or record batch / table fields) at the given
    indices. If an index is null then the corresponding value in the output
    will be null.

    Parameters
    ----------
    data : Array, ChunkedArray, RecordBatch, or Table
    indices : Array, ChunkedArray
        Must be of integer type
    boundscheck : boolean, default True
        Whether to boundscheck the indices. If False and there is an out of
        bounds index, will likely cause the process to crash.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : depends on inputs

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> indices = pa.array([0, None, 4, 3])
    >>> arr.take(indices)
    <pyarrow.lib.StringArray object at 0x7ffa4fc7d368>
    [
      "a",
      null,
      "e",
      null
    ]
    """
    options = TakeOptions(boundscheck=boundscheck)
    return call_function('take', [data, indices], options, memory_pool)


def fill_null(values, fill_value):
    """
    Replace each null element in values with fill_value. The fill_value must be
    the same type as values or able to be implicitly casted to the array's
    type.

    This is an alias for :func:`coalesce`.

    Parameters
    ----------
    values : Array, ChunkedArray, or Scalar-like object
        Each null element is replaced with the corresponding value
        from fill_value.
    fill_value : Array, ChunkedArray, or Scalar-like object
        If not same type as data will attempt to cast.

    Returns
    -------
    result : depends on inputs

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array([1, 2, None, 3], type=pa.int8())
    >>> fill_value = pa.scalar(5, type=pa.int8())
    >>> arr.fill_null(fill_value)
    <pyarrow.lib.Int8Array object at 0x7f95437f01a0>
    [
      1,
      2,
      5,
      3
    ]
    """
    if not isinstance(fill_value, (pa.Array, pa.ChunkedArray, pa.Scalar)):
        fill_value = pa.scalar(fill_value, type=values.type)
    elif values.type != fill_value.type:
        if isinstance(fill_value, pa.Scalar):
            fill_value = pa.scalar(fill_value.as_py(), type=values.type)
        else:
            # Arrays and chunked arrays have no as_py() method, so they
            # are converted with cast() instead.
            fill_value = fill_value.cast(values.type)

    return call_function("coalesce", [values, fill_value])


def _make_select_k_sort_keys(values, sort_keys, order):
    """
    Build the list of (name, order) pairs given to SelectKOptions.

    A new list is always returned; the *sort_keys* argument is never
    modified.
    """
    keys = [] if sort_keys is None else list(sort_keys)
    if isinstance(values, (pa.Array, pa.ChunkedArray)):
        # Array-like input: a placeholder key name is used
        keys.append(("dummy", order))
        return keys
    return [(key_name, order) for key_name in keys]


def top_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
    """
    Select the indices of the top-k ordered elements from array- or table-like
    data.

    This is a specialization for :func:`select_k_unstable`. Output is not
    guaranteed to be stable.

    Parameters
    ----------
    values : Array, ChunkedArray, RecordBatch, or Table
        Data to sort and get top indices from.
    k : int
        The number of `k` elements to keep.
    sort_keys : List-like, optional
        Column key names to order by when input is table-like data.
        The passed object is not modified.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : Array of indices

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> pc.top_k_unstable(arr, k=3)
    <pyarrow.lib.UInt64Array object at 0x7fdcb19d7f30>
    [
      5,
      4,
      2
    ]
    """
    sort_keys = _make_select_k_sort_keys(values, sort_keys, "descending")
    options = SelectKOptions(k, sort_keys)
    return call_function("select_k_unstable", [values], options, memory_pool)


def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
    """
    Select the indices of the bottom-k ordered elements from
    array- or table-like data.

    This is a specialization for :func:`select_k_unstable`. Output is not
    guaranteed to be stable.

    Parameters
    ----------
    values : Array, ChunkedArray, RecordBatch, or Table
        Data to sort and get bottom indices from.
    k : int
        The number of `k` elements to keep.
    sort_keys : List-like, optional
        Column key names to order by when input is table-like data.
        The passed object is not modified.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : Array of indices

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> pc.bottom_k_unstable(arr, k=3)
    <pyarrow.lib.UInt64Array object at 0x7fdcb19d7fa0>
    [
      0,
      1,
      2
    ]
    """
    sort_keys = _make_select_k_sort_keys(values, sort_keys, "ascending")
    options = SelectKOptions(k, sort_keys)
    return call_function("select_k_unstable", [values], options, memory_pool)