summary refs log tree commit diff stats
path: root/src/arrow/python/benchmarks/common.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/arrow/python/benchmarks/common.py')
-rw-r--r-- src/arrow/python/benchmarks/common.py 349
1 files changed, 349 insertions, 0 deletions
diff --git a/src/arrow/python/benchmarks/common.py b/src/arrow/python/benchmarks/common.py
new file mode 100644
index 000000000..48526a405
--- /dev/null
+++ b/src/arrow/python/benchmarks/common.py
@@ -0,0 +1,349 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import codecs
+import decimal
+from functools import partial
+import itertools
+import sys
+import unicodedata
+
+import numpy as np
+
+import pyarrow as pa
+
+
# Size units: 1024 and 1024 * 1024.
KILOBYTE = 2 ** 10
MEGABYTE = KILOBYTE ** 2

# Default value of the *none_prob* argument of the generator methods,
# i.e. the likelihood that a generated entry is replaced with None.
DEFAULT_NONE_PROB = 0.3
+
+
def _multiplicate_sequence(base, target_size):
    """
    Split *target_size* items into whole copies of *base* plus a remainder.

    Return a list holding as many references to *base* as fit entirely
    in *target_size*, followed by one final slice of *base* with the
    leftover items (an empty slice when *target_size* is an exact
    multiple of ``len(base)``).  For a non-negative *target_size*, the
    lengths of the pieces add up to *target_size*.
    """
    full_copies, leftover = divmod(target_size, len(base))
    pieces = [base for _ in range(full_copies)]
    pieces.append(base[:leftover])
    return pieces
+
+
def get_random_bytes(n, seed=42):
    """
    Return a pseudo-random bytes object of length *n*.

    The output is fully determined by *n* and *seed*.  Requests of
    ``base_size`` bytes or more are served by repeating a single random
    block, so the result might be compressible.
    """
    # Drawing a huge random bytestring can be costly, so at most one block
    # of about 100KB is drawn and then repeated to reach the wanted length.
    base_size = 100003
    generator = np.random.RandomState(seed)
    whole_blocks = n // base_size
    if whole_blocks:
        block = generator.bytes(base_size)
        payload = b''.join(_multiplicate_sequence(block, n))
    else:
        # Less than one block requested: draw exactly what is needed
        payload = generator.bytes(n % base_size)
    assert len(payload) == n
    return payload
+
+
def get_random_ascii(n, seed=42):
    """
    Return a pseudo-random ``str`` of length *n* holding only ASCII
    characters.
    """
    raw = get_random_bytes(n, seed=seed)
    # Clearing the top bit of every byte keeps each value within 0..127,
    # which is what the ASCII decoder accepts.
    seven_bit = np.frombuffer(raw, dtype=np.int8) & 0x7f
    text = codecs.ascii_decode(seven_bit)[0]
    assert isinstance(text, str)
    assert len(text) == n
    return text
+
+
def _random_unicode_letters(n, seed=42):
    """
    Return a list of *n* pseudo-random unicode letters (slow).

    Each item is a one-character string whose unicode category starts
    with 'L'.  The result is fully determined by *n* and *seed*.
    """
    if n <= 0:
        return []

    generator = np.random.RandomState(seed)

    def _letters():
        # Endless stream of letters: code points are drawn in batches of
        # *n*, each batch is consumed starting from its end, and only the
        # code points that are letters are passed on.
        while True:
            batch = generator.randint(0, sys.maxunicode, size=n).tolist()
            for code_point in reversed(batch):
                char = chr(code_point)
                # XXX Do we actually care that the code points are valid?
                if unicodedata.category(char).startswith('L'):
                    yield char

    return list(itertools.islice(_letters(), n))
+
+
# Table of letters indexed by get_random_unicode(); it is computed once at
# import time because generating random unicode letters is slow.
_1024_random_unicode_letters = _random_unicode_letters(1024, seed=42)
+
+
def get_random_unicode(n, seed=42):
    """
    Return a pseudo-random unicode string of *n* characters.

    The characters are picked from the module-level table of 1024 random
    unicode letters.
    """
    # Two random bytes per character give one 16-bit value; masking it
    # with 1023 keeps the low 10 bits, i.e. an index in range(1024).
    raw = get_random_bytes(n * 2, seed=seed)
    positions = np.frombuffer(raw, dtype=np.int16) & 1023
    letters = np.array(_1024_random_unicode_letters)
    picked = letters[positions]

    result = ''.join(picked.tolist())
    assert len(result) == n, (len(result), len(picked))
    return result
+
+
class BuiltinsGenerator(object):
    """
    Generator of lists of random Python builtin objects.

    The random choices made by the methods come from a private
    ``np.random.RandomState`` created from the *seed* given at
    construction time.
    """

    def __init__(self, seed=42):
        """
        Create a generator whose random state is seeded with *seed*.
        """
        self.rnd = np.random.RandomState(seed)

    def sprinkle(self, lst, prob, value):
        """
        Sprinkle *value* entries in list *lst* with likelihood *prob*.

        The list is modified in place: one random sample is drawn per
        position, and the position is overwritten with *value* when the
        sample is below *prob*.
        """
        for i, p in enumerate(self.rnd.random_sample(size=len(lst))):
            if p < prob:
                lst[i] = value

    def sprinkle_nones(self, lst, prob):
        """
        Sprinkle None entries in list *lst* with likelihood *prob*.
        """
        self.sprinkle(lst, prob, None)

    def generate_int_list(self, n, none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of Python ints with *none_prob* probability of
        an entry being None.

        The non-None entries are the values of ``range(n)`` at their
        own position.
        """
        data = list(range(n))
        self.sprinkle_nones(data, none_prob)
        return data

    def generate_float_list(self, n, none_prob=DEFAULT_NONE_PROB,
                            use_nan=False):
        """
        Generate a list of Python floats with *none_prob* probability of
        an entry being None (or NaN if *use_nan* is true).
        """
        # Make sure we get Python floats, not np.float64
        data = list(map(float, self.rnd.uniform(0.0, 1.0, n)))
        assert len(data) == n
        self.sprinkle(data, none_prob, value=float('nan') if use_nan else None)
        return data

    def generate_bool_list(self, n, none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of Python bools with *none_prob* probability of
        an entry being None.
        """
        # Make sure we get Python bools, not np.bool_
        data = [bool(x >= 0.5) for x in self.rnd.uniform(0.0, 1.0, n)]
        assert len(data) == n
        self.sprinkle_nones(data, none_prob)
        return data

    def generate_decimal_list(self, n, none_prob=DEFAULT_NONE_PROB,
                              use_nan=False):
        """
        Generate a list of Python Decimals with *none_prob* probability of
        an entry being None (or NaN if *use_nan* is true).

        Each Decimal is built from a random float formatted with 9
        fractional digits.
        """
        data = [decimal.Decimal('%.9f' % f)
                for f in self.rnd.uniform(0.0, 1.0, n)]
        assert len(data) == n
        self.sprinkle(data, none_prob,
                      value=decimal.Decimal('nan') if use_nan else None)
        return data

    def generate_object_list(self, n, none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of generic Python objects with *none_prob*
        probability of an entry being None.
        """
        data = [object() for _ in range(n)]
        self.sprinkle_nones(data, none_prob)
        return data

    def _generate_varying_sequences(self, random_factory, n, min_size,
                                    max_size, none_prob):
        """
        Generate a list of *n* sequences of varying size between *min_size*
        and *max_size*, with *none_prob* probability of an entry being None.
        The base material for each sequence is obtained by calling
        `random_factory(<some size>)`
        """
        base_size = 10000
        # The factory is called only once; every sequence is a slice of
        # its result.  Asking for *max_size* extra items leaves room for
        # a slice of up to *max_size* items at any offset below
        # *base_size*.
        base = random_factory(base_size + max_size)
        data = []
        for _ in range(n):
            off = self.rnd.randint(base_size)
            if min_size == max_size:
                size = min_size
            else:
                # randint() excludes its upper bound, hence the + 1
                size = self.rnd.randint(min_size, max_size + 1)
            data.append(base[off:off + size])
        self.sprinkle_nones(data, none_prob)
        assert len(data) == n
        return data

    def generate_fixed_binary_list(self, n, size, none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of bytestrings with a fixed *size*.
        """
        return self._generate_varying_sequences(get_random_bytes, n,
                                                size, size, none_prob)

    def generate_varying_binary_list(self, n, min_size, max_size,
                                     none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of bytestrings with a random size between
        *min_size* and *max_size*.
        """
        return self._generate_varying_sequences(get_random_bytes, n,
                                                min_size, max_size, none_prob)

    def generate_ascii_string_list(self, n, min_size, max_size,
                                   none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of ASCII strings with a random size between
        *min_size* and *max_size*.
        """
        return self._generate_varying_sequences(get_random_ascii, n,
                                                min_size, max_size, none_prob)

    def generate_unicode_string_list(self, n, min_size, max_size,
                                     none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of unicode strings with a random size between
        *min_size* and *max_size*.
        """
        return self._generate_varying_sequences(get_random_unicode, n,
                                                min_size, max_size, none_prob)

    def generate_int_list_list(self, n, min_size, max_size,
                               none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of lists of Python ints with a random size between
        *min_size* and *max_size*.

        *none_prob* applies both to the ints inside the lists and to
        the lists themselves.
        """
        return self._generate_varying_sequences(
            partial(self.generate_int_list, none_prob=none_prob),
            n, min_size, max_size, none_prob)

    def generate_tuple_list(self, n, none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of tuples with random values.
        Each tuple has the form `(int value, float value, bool value)`

        The tuples are derived from `generate_dict_list()`: a None dict
        gives a None entry, and a key that is missing from a dict gives
        None at the matching tuple position.
        """
        dicts = self.generate_dict_list(n, none_prob=none_prob)
        tuples = [(d.get('u'), d.get('v'), d.get('w'))
                  if d is not None else None
                  for d in dicts]
        assert len(tuples) == n
        return tuples

    def generate_dict_list(self, n, none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of dicts with random values.
        Each dict has the form

            `{'u': int value, 'v': float value, 'w': bool value}`

        A value may be None, a key may be missing, and a whole entry
        of the list may be None.
        """
        ints = self.generate_int_list(n, none_prob=none_prob)
        floats = self.generate_float_list(n, none_prob=none_prob)
        bools = self.generate_bool_list(n, none_prob=none_prob)
        dicts = []
        # Keep half the Nones, omit the other half
        keep_nones = itertools.cycle([True, False])
        for u, v, w in zip(ints, floats, bools):
            d = {}
            if u is not None or next(keep_nones):
                d['u'] = u
            if v is not None or next(keep_nones):
                d['v'] = v
            if w is not None or next(keep_nones):
                d['w'] = w
            dicts.append(d)
        self.sprinkle_nones(dicts, none_prob)
        assert len(dicts) == n
        return dicts

    def get_type_and_builtins(self, n, type_name):
        """
        Return a `(arrow type, list)` tuple where the arrow type
        corresponds to the given logical *type_name*, and the list
        is a list of *n* random-generated Python objects compatible
        with the arrow type.

        Recognized values of *type_name*:

        - 'bool', 'decimal', 'ascii', 'unicode', 'int64 list'
        - a name starting with 'int', 'uint' or 'float'; the arrow type
          is obtained by calling the `pyarrow` attribute of that name
        - 'struct from tuples': struct type, data given as tuples
        - any other name starting with 'struct': struct type, data given
          as dicts
        - 'binary': variable-size binary
        - 'binary<N>' (e.g. 'binary10'): fixed-size binary of N bytes

        Raise ValueError if *type_name* is not recognized.
        """
        size = None

        # 'struct from tuples' has to be matched here, before the generic
        # 'struct' prefix test below, otherwise it would be handled as a
        # plain 'struct' and its tuple factory would never be selected.
        if type_name in ('bool', 'decimal', 'ascii', 'unicode', 'int64 list',
                         'struct from tuples'):
            kind = type_name
        elif type_name.startswith(('int', 'uint')):
            kind = 'int'
        elif type_name.startswith('float'):
            kind = 'float'
        elif type_name.startswith('struct'):
            kind = 'struct'
        elif type_name == 'binary':
            kind = 'varying binary'
        elif type_name.startswith('binary'):
            kind = 'fixed binary'
            # The byte size follows the 6-character 'binary' prefix
            size = int(type_name[6:])
            assert size > 0
        else:
            raise ValueError("unrecognized type %r" % (type_name,))

        if kind in ('int', 'float'):
            ty = getattr(pa, type_name)()
        elif kind == 'bool':
            ty = pa.bool_()
        elif kind == 'decimal':
            ty = pa.decimal128(9, 9)
        elif kind == 'fixed binary':
            ty = pa.binary(size)
        elif kind == 'varying binary':
            ty = pa.binary()
        elif kind in ('ascii', 'unicode'):
            ty = pa.string()
        elif kind == 'int64 list':
            ty = pa.list_(pa.int64())
        elif kind in ('struct', 'struct from tuples'):
            # Same arrow type for both; only the Python objects differ
            ty = pa.struct([pa.field('u', pa.int64()),
                            pa.field('v', pa.float64()),
                            pa.field('w', pa.bool_())])

        factories = {
            'int': self.generate_int_list,
            'float': self.generate_float_list,
            'bool': self.generate_bool_list,
            'decimal': self.generate_decimal_list,
            'fixed binary': partial(self.generate_fixed_binary_list,
                                    size=size),
            'varying binary': partial(self.generate_varying_binary_list,
                                      min_size=3, max_size=40),
            'ascii': partial(self.generate_ascii_string_list,
                             min_size=3, max_size=40),
            'unicode': partial(self.generate_unicode_string_list,
                               min_size=3, max_size=40),
            'int64 list': partial(self.generate_int_list_list,
                                  min_size=0, max_size=20),
            'struct': self.generate_dict_list,
            'struct from tuples': self.generate_tuple_list,
        }
        data = factories[kind](n)
        return ty, data