diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/python/pyarrow/tests/test_array.py | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/python/pyarrow/tests/test_array.py')
-rw-r--r-- | src/arrow/python/pyarrow/tests/test_array.py | 3064 |
1 files changed, 3064 insertions, 0 deletions
diff --git a/src/arrow/python/pyarrow/tests/test_array.py b/src/arrow/python/pyarrow/tests/test_array.py new file mode 100644 index 000000000..9a1f41efe --- /dev/null +++ b/src/arrow/python/pyarrow/tests/test_array.py @@ -0,0 +1,3064 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Iterable +import datetime +import decimal +import hypothesis as h +import hypothesis.strategies as st +import itertools +import pickle +import pytest +import struct +import sys +import weakref + +import numpy as np +try: + import pickle5 +except ImportError: + pickle5 = None +import pytz + +import pyarrow as pa +import pyarrow.tests.strategies as past + + +def test_total_bytes_allocated(): + assert pa.total_allocated_bytes() == 0 + + +def test_weakref(): + arr = pa.array([1, 2, 3]) + wr = weakref.ref(arr) + assert wr() is not None + del arr + assert wr() is None + + +def test_getitem_NULL(): + arr = pa.array([1, None, 2]) + assert arr[1].as_py() is None + assert arr[1].is_valid is False + assert isinstance(arr[1], pa.Int64Scalar) + + +def test_constructor_raises(): + # This could happen by wrong capitalization. + # ARROW-2638: prevent calling extension class constructors directly + with pytest.raises(TypeError): + pa.Array([1, 2]) + + +def test_list_format(): + arr = pa.array([[1], None, [2, 3, None]]) + result = arr.to_string() + expected = """\ +[ + [ + 1 + ], + null, + [ + 2, + 3, + null + ] +]""" + assert result == expected + + +def test_string_format(): + arr = pa.array(['', None, 'foo']) + result = arr.to_string() + expected = """\ +[ + "", + null, + "foo" +]""" + assert result == expected + + +def test_long_array_format(): + arr = pa.array(range(100)) + result = arr.to_string(window=2) + expected = """\ +[ + 0, + 1, + ... + 98, + 99 +]""" + assert result == expected + + +def test_binary_format(): + arr = pa.array([b'\x00', b'', None, b'\x01foo', b'\x80\xff']) + result = arr.to_string() + expected = """\ +[ + 00, + , + null, + 01666F6F, + 80FF +]""" + assert result == expected + + +def test_binary_total_values_length(): + arr = pa.array([b'0000', None, b'11111', b'222222', b'3333333'], + type='binary') + large_arr = pa.array([b'0000', None, b'11111', b'222222', b'3333333'], + type='large_binary') + + assert arr.total_values_length == 22 + assert arr.slice(1, 3).total_values_length == 11 + assert large_arr.total_values_length == 22 + assert large_arr.slice(1, 3).total_values_length == 11 + + +def test_to_numpy_zero_copy(): + arr = pa.array(range(10)) + + np_arr = arr.to_numpy() + + # check for zero copy (both arrays using same memory) + arrow_buf = arr.buffers()[1] + assert arrow_buf.address == np_arr.ctypes.data + + arr = None + import gc + gc.collect() + + # Ensure base is still valid + assert np_arr.base is not None + expected = np.arange(10) + np.testing.assert_array_equal(np_arr, expected) + + +def test_to_numpy_unsupported_types(): + # ARROW-2871: Some primitive types are not yet supported in to_numpy + bool_arr = pa.array([True, False, True]) + + with pytest.raises(ValueError): + bool_arr.to_numpy() + + result = bool_arr.to_numpy(zero_copy_only=False) + expected = np.array([True, False, True]) + np.testing.assert_array_equal(result, expected) + + null_arr = pa.array([None, None, None]) + + with pytest.raises(ValueError): + null_arr.to_numpy() + + result = null_arr.to_numpy(zero_copy_only=False) + expected = np.array([None, None, None], dtype=object) + np.testing.assert_array_equal(result, expected) + + arr = pa.array([1, 2, None]) + + with pytest.raises(ValueError, match="with 1 nulls"): + arr.to_numpy() + + +def test_to_numpy_writable(): + arr = pa.array(range(10)) + np_arr = arr.to_numpy() + + # by default not writable for zero-copy conversion + with pytest.raises(ValueError): + np_arr[0] = 10 + + np_arr2 = arr.to_numpy(zero_copy_only=False, writable=True) + np_arr2[0] = 10 + assert arr[0].as_py() == 0 + + # when asking for writable, cannot do zero-copy + with pytest.raises(ValueError): + arr.to_numpy(zero_copy_only=True, writable=True) + + +@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns']) +def test_to_numpy_datetime64(unit): + arr = pa.array([1, 2, 3], pa.timestamp(unit)) + expected = np.array([1, 2, 3], dtype="datetime64[{}]".format(unit)) + np_arr = arr.to_numpy() + np.testing.assert_array_equal(np_arr, expected) + + +@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns']) +def test_to_numpy_timedelta64(unit): + arr = pa.array([1, 2, 3], pa.duration(unit)) + expected = np.array([1, 2, 3], dtype="timedelta64[{}]".format(unit)) + np_arr = arr.to_numpy() + np.testing.assert_array_equal(np_arr, expected) + + +def test_to_numpy_dictionary(): + # ARROW-7591 + arr = pa.array(["a", "b", "a"]).dictionary_encode() + expected = np.array(["a", "b", "a"], dtype=object) + np_arr = arr.to_numpy(zero_copy_only=False) + np.testing.assert_array_equal(np_arr, expected) + + +@pytest.mark.pandas +def test_to_pandas_zero_copy(): + import gc + + arr = pa.array(range(10)) + + for i in range(10): + series = arr.to_pandas() + assert sys.getrefcount(series) == 2 + series = None # noqa + + assert sys.getrefcount(arr) == 2 + + for i in range(10): + arr = pa.array(range(10)) + series = arr.to_pandas() + arr = None + gc.collect() + + # Ensure base is still valid + + # Because of py.test's assert inspection magic, if you put getrefcount + # on the line being examined, it will be 1 higher than you expect + base_refcount = sys.getrefcount(series.values.base) + assert base_refcount == 2 + series.sum() + + +@pytest.mark.nopandas +@pytest.mark.pandas +def test_asarray(): + # ensure this is tested both when pandas is present or not (ARROW-6564) + + arr = pa.array(range(4)) + + # The iterator interface gives back an array of Int64Value's + np_arr = np.asarray([_ for _ in arr]) + assert np_arr.tolist() == [0, 1, 2, 3] + assert np_arr.dtype == np.dtype('O') + assert type(np_arr[0]) == pa.lib.Int64Value + + # Calling with the arrow array gives back an array with 'int64' dtype + np_arr = np.asarray(arr) + assert np_arr.tolist() == [0, 1, 2, 3] + assert np_arr.dtype == np.dtype('int64') + + # An optional type can be specified when calling np.asarray + np_arr = np.asarray(arr, dtype='str') + assert np_arr.tolist() == ['0', '1', '2', '3'] + + # If PyArrow array has null values, numpy type will be changed as needed + # to support nulls. + arr = pa.array([0, 1, 2, None]) + assert arr.type == pa.int64() + np_arr = np.asarray(arr) + elements = np_arr.tolist() + assert elements[:3] == [0., 1., 2.] + assert np.isnan(elements[3]) + assert np_arr.dtype == np.dtype('float64') + + # DictionaryType data will be converted to dense numpy array + arr = pa.DictionaryArray.from_arrays( + pa.array([0, 1, 2, 0, 1]), pa.array(['a', 'b', 'c'])) + np_arr = np.asarray(arr) + assert np_arr.dtype == np.dtype('object') + assert np_arr.tolist() == ['a', 'b', 'c', 'a', 'b'] + + +@pytest.mark.parametrize('ty', [ + None, + pa.null(), + pa.int8(), + pa.string() +]) +def test_nulls(ty): + arr = pa.nulls(3, type=ty) + expected = pa.array([None, None, None], type=ty) + + assert len(arr) == 3 + assert arr.equals(expected) + + if ty is None: + assert arr.type == pa.null() + else: + assert arr.type == ty + + +def test_array_from_scalar(): + today = datetime.date.today() + now = datetime.datetime.now() + now_utc = now.replace(tzinfo=pytz.utc) + now_with_tz = now_utc.astimezone(pytz.timezone('US/Eastern')) + oneday = datetime.timedelta(days=1) + + cases = [ + (None, 1, pa.array([None])), + (None, 10, pa.nulls(10)), + (-1, 3, pa.array([-1, -1, -1], type=pa.int64())), + (2.71, 2, pa.array([2.71, 2.71], type=pa.float64())), + ("string", 4, pa.array(["string"] * 4)), + ( + pa.scalar(8, type=pa.uint8()), + 17, + pa.array([8] * 17, type=pa.uint8()) + ), + (pa.scalar(None), 3, pa.array([None, None, None])), + (pa.scalar(True), 11, pa.array([True] * 11)), + (today, 2, pa.array([today] * 2)), + (now, 10, pa.array([now] * 10)), + ( + now_with_tz, + 2, + pa.array( + [now_utc] * 2, + type=pa.timestamp('us', tz=pytz.timezone('US/Eastern')) + ) + ), + (now.time(), 9, pa.array([now.time()] * 9)), + (oneday, 4, pa.array([oneday] * 4)), + (False, 9, pa.array([False] * 9)), + ([1, 2], 2, pa.array([[1, 2], [1, 2]])), + ( + pa.scalar([-1, 3], type=pa.large_list(pa.int8())), + 5, + pa.array([[-1, 3]] * 5, type=pa.large_list(pa.int8())) + ), + ({'a': 1, 'b': 2}, 3, pa.array([{'a': 1, 'b': 2}] * 3)) + ] + + for value, size, expected in cases: + arr = pa.repeat(value, size) + assert len(arr) == size + assert arr.type.equals(expected.type) + assert arr.equals(expected) + if expected.type == pa.null(): + assert arr.null_count == size + else: + assert arr.null_count == 0 + + +def test_array_from_dictionary_scalar(): + dictionary = ['foo', 'bar', 'baz'] + arr = pa.DictionaryArray.from_arrays([2, 1, 2, 0], dictionary=dictionary) + + result = pa.repeat(arr[0], 5) + expected = pa.DictionaryArray.from_arrays([2] * 5, dictionary=dictionary) + assert result.equals(expected) + + result = pa.repeat(arr[3], 5) + expected = pa.DictionaryArray.from_arrays([0] * 5, dictionary=dictionary) + assert result.equals(expected) + + +def test_array_getitem(): + arr = pa.array(range(10, 15)) + lst = arr.to_pylist() + + for idx in range(-len(arr), len(arr)): + assert arr[idx].as_py() == lst[idx] + for idx in range(-2 * len(arr), -len(arr)): + with pytest.raises(IndexError): + arr[idx] + for idx in range(len(arr), 2 * len(arr)): + with pytest.raises(IndexError): + arr[idx] + + # check that numpy scalars are supported + for idx in range(-len(arr), len(arr)): + assert arr[np.int32(idx)].as_py() == lst[idx] + + +def test_array_slice(): + arr = pa.array(range(10)) + + sliced = arr.slice(2) + expected = pa.array(range(2, 10)) + assert sliced.equals(expected) + + sliced2 = arr.slice(2, 4) + expected2 = pa.array(range(2, 6)) + assert sliced2.equals(expected2) + + # 0 offset + assert arr.slice(0).equals(arr) + + # Slice past end of array + assert len(arr.slice(len(arr))) == 0 + assert len(arr.slice(len(arr) + 2)) == 0 + assert len(arr.slice(len(arr) + 2, 100)) == 0 + + with pytest.raises(IndexError): + arr.slice(-1) + + with pytest.raises(ValueError): + arr.slice(2, -1) + + # Test slice notation + assert arr[2:].equals(arr.slice(2)) + assert arr[2:5].equals(arr.slice(2, 3)) + assert arr[-5:].equals(arr.slice(len(arr) - 5)) + + n = len(arr) + for start in range(-n * 2, n * 2): + for stop in range(-n * 2, n * 2): + res = arr[start:stop] + res.validate() + expected = arr.to_pylist()[start:stop] + assert res.to_pylist() == expected + assert res.to_numpy().tolist() == expected + + +def test_array_slice_negative_step(): + # ARROW-2714 + np_arr = np.arange(20) + arr = pa.array(np_arr) + chunked_arr = pa.chunked_array([arr]) + + cases = [ + slice(None, None, -1), + slice(None, 6, -2), + slice(10, 6, -2), + slice(8, None, -2), + slice(2, 10, -2), + slice(10, 2, -2), + slice(None, None, 2), + slice(0, 10, 2), + ] + + for case in cases: + result = arr[case] + expected = pa.array(np_arr[case]) + assert result.equals(expected) + + result = pa.record_batch([arr], names=['f0'])[case] + expected = pa.record_batch([expected], names=['f0']) + assert result.equals(expected) + + result = chunked_arr[case] + expected = pa.chunked_array([np_arr[case]]) + assert result.equals(expected) + + +def test_array_diff(): + # ARROW-6252 + arr1 = pa.array(['foo'], type=pa.utf8()) + arr2 = pa.array(['foo', 'bar', None], type=pa.utf8()) + arr3 = pa.array([1, 2, 3]) + arr4 = pa.array([[], [1], None], type=pa.list_(pa.int64())) + + assert arr1.diff(arr1) == '' + assert arr1.diff(arr2) == ''' +@@ -1, +1 @@ ++"bar" ++null +''' + assert arr1.diff(arr3).strip() == '# Array types differed: string vs int64' + assert arr1.diff(arr3).strip() == '# Array types differed: string vs int64' + assert arr1.diff(arr4).strip() == ('# Array types differed: string vs ' + 'list<item: int64>') + + +def test_array_iter(): + arr = pa.array(range(10)) + + for i, j in zip(range(10), arr): + assert i == j.as_py() + + assert isinstance(arr, Iterable) + + +def test_struct_array_slice(): + # ARROW-2311: slicing nested arrays needs special care + ty = pa.struct([pa.field('a', pa.int8()), + pa.field('b', pa.float32())]) + arr = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty) + assert arr[1:].to_pylist() == [{'a': 3, 'b': 4.5}, + {'a': 5, 'b': 6.5}] + + +def test_array_factory_invalid_type(): + + class MyObject: + pass + + arr = np.array([MyObject()]) + with pytest.raises(ValueError): + pa.array(arr) + + +def test_array_ref_to_ndarray_base(): + arr = np.array([1, 2, 3]) + + refcount = sys.getrefcount(arr) + arr2 = pa.array(arr) # noqa + assert sys.getrefcount(arr) == (refcount + 1) + + +def test_array_eq(): + # ARROW-2150 / ARROW-9445: we define the __eq__ behavior to be + # data equality (not element-wise equality) + arr1 = pa.array([1, 2, 3], type=pa.int32()) + arr2 = pa.array([1, 2, 3], type=pa.int32()) + arr3 = pa.array([1, 2, 3], type=pa.int64()) + + assert (arr1 == arr2) is True + assert (arr1 != arr2) is False + assert (arr1 == arr3) is False + assert (arr1 != arr3) is True + + assert (arr1 == 1) is False + assert (arr1 == None) is False # noqa: E711 + + +def test_array_from_buffers(): + values_buf = pa.py_buffer(np.int16([4, 5, 6, 7])) + nulls_buf = pa.py_buffer(np.uint8([0b00001101])) + arr = pa.Array.from_buffers(pa.int16(), 4, [nulls_buf, values_buf]) + assert arr.type == pa.int16() + assert arr.to_pylist() == [4, None, 6, 7] + + arr = pa.Array.from_buffers(pa.int16(), 4, [None, values_buf]) + assert arr.type == pa.int16() + assert arr.to_pylist() == [4, 5, 6, 7] + + arr = pa.Array.from_buffers(pa.int16(), 3, [nulls_buf, values_buf], + offset=1) + assert arr.type == pa.int16() + assert arr.to_pylist() == [None, 6, 7] + + with pytest.raises(TypeError): + pa.Array.from_buffers(pa.int16(), 3, ['', ''], offset=1) + + +def test_string_binary_from_buffers(): + array = pa.array(["a", None, "b", "c"]) + + buffers = array.buffers() + copied = pa.StringArray.from_buffers( + len(array), buffers[1], buffers[2], buffers[0], array.null_count, + array.offset) + assert copied.to_pylist() == ["a", None, "b", "c"] + + binary_copy = pa.Array.from_buffers(pa.binary(), len(array), + array.buffers(), array.null_count, + array.offset) + assert binary_copy.to_pylist() == [b"a", None, b"b", b"c"] + + copied = pa.StringArray.from_buffers( + len(array), buffers[1], buffers[2], buffers[0]) + assert copied.to_pylist() == ["a", None, "b", "c"] + + sliced = array[1:] + buffers = sliced.buffers() + copied = pa.StringArray.from_buffers( + len(sliced), buffers[1], buffers[2], buffers[0], -1, sliced.offset) + assert copied.to_pylist() == [None, "b", "c"] + assert copied.null_count == 1 + + # Slice but exclude all null entries so that we don't need to pass + # the null bitmap. + sliced = array[2:] + buffers = sliced.buffers() + copied = pa.StringArray.from_buffers( + len(sliced), buffers[1], buffers[2], None, -1, sliced.offset) + assert copied.to_pylist() == ["b", "c"] + assert copied.null_count == 0 + + +@pytest.mark.parametrize('list_type_factory', [pa.list_, pa.large_list]) +def test_list_from_buffers(list_type_factory): + ty = list_type_factory(pa.int16()) + array = pa.array([[0, 1, 2], None, [], [3, 4, 5]], type=ty) + assert array.type == ty + + buffers = array.buffers() + + with pytest.raises(ValueError): + # No children + pa.Array.from_buffers(ty, 4, [None, buffers[1]]) + + child = pa.Array.from_buffers(pa.int16(), 6, buffers[2:]) + copied = pa.Array.from_buffers(ty, 4, buffers[:2], children=[child]) + assert copied.equals(array) + + with pytest.raises(ValueError): + # too many children + pa.Array.from_buffers(ty, 4, [None, buffers[1]], + children=[child, child]) + + +def test_struct_from_buffers(): + ty = pa.struct([pa.field('a', pa.int16()), pa.field('b', pa.utf8())]) + array = pa.array([{'a': 0, 'b': 'foo'}, None, {'a': 5, 'b': ''}], + type=ty) + buffers = array.buffers() + + with pytest.raises(ValueError): + # No children + pa.Array.from_buffers(ty, 3, [None, buffers[1]]) + + children = [pa.Array.from_buffers(pa.int16(), 3, buffers[1:3]), + pa.Array.from_buffers(pa.utf8(), 3, buffers[3:])] + copied = pa.Array.from_buffers(ty, 3, buffers[:1], children=children) + assert copied.equals(array) + + with pytest.raises(ValueError): + # not enough many children + pa.Array.from_buffers(ty, 3, [buffers[0]], + children=children[:1]) + + +def test_struct_from_arrays(): + a = pa.array([4, 5, 6], type=pa.int64()) + b = pa.array(["bar", None, ""]) + c = pa.array([[1, 2], None, [3, None]]) + expected_list = [ + {'a': 4, 'b': 'bar', 'c': [1, 2]}, + {'a': 5, 'b': None, 'c': None}, + {'a': 6, 'b': '', 'c': [3, None]}, + ] + + # From field names + arr = pa.StructArray.from_arrays([a, b, c], ["a", "b", "c"]) + assert arr.type == pa.struct( + [("a", a.type), ("b", b.type), ("c", c.type)]) + assert arr.to_pylist() == expected_list + + with pytest.raises(ValueError): + pa.StructArray.from_arrays([a, b, c], ["a", "b"]) + + arr = pa.StructArray.from_arrays([], []) + assert arr.type == pa.struct([]) + assert arr.to_pylist() == [] + + # From fields + fa = pa.field("a", a.type, nullable=False) + fb = pa.field("b", b.type) + fc = pa.field("c", c.type) + arr = pa.StructArray.from_arrays([a, b, c], fields=[fa, fb, fc]) + assert arr.type == pa.struct([fa, fb, fc]) + assert not arr.type[0].nullable + assert arr.to_pylist() == expected_list + + with pytest.raises(ValueError): + pa.StructArray.from_arrays([a, b, c], fields=[fa, fb]) + + arr = pa.StructArray.from_arrays([], fields=[]) + assert arr.type == pa.struct([]) + assert arr.to_pylist() == [] + + # Inconsistent fields + fa2 = pa.field("a", pa.int32()) + with pytest.raises(ValueError, match="int64 vs int32"): + pa.StructArray.from_arrays([a, b, c], fields=[fa2, fb, fc]) + + arrays = [a, b, c] + fields = [fa, fb, fc] + # With mask + mask = pa.array([True, False, False]) + arr = pa.StructArray.from_arrays(arrays, fields=fields, mask=mask) + assert arr.to_pylist() == [None] + expected_list[1:] + + arr = pa.StructArray.from_arrays(arrays, names=['a', 'b', 'c'], mask=mask) + assert arr.to_pylist() == [None] + expected_list[1:] + + # Bad masks + with pytest.raises(ValueError, match='Mask must be'): + pa.StructArray.from_arrays(arrays, fields, mask=[True, False, False]) + + with pytest.raises(ValueError, match='not contain nulls'): + pa.StructArray.from_arrays( + arrays, fields, mask=pa.array([True, False, None])) + + with pytest.raises(ValueError, match='Mask must be'): + pa.StructArray.from_arrays( + arrays, fields, mask=pa.chunked_array([mask])) + + +def test_struct_array_from_chunked(): + # ARROW-11780 + # Check that we don't segfault when trying to build + # a StructArray from a chunked array. + chunked_arr = pa.chunked_array([[1, 2, 3], [4, 5, 6]]) + + with pytest.raises(TypeError, match="Expected Array"): + pa.StructArray.from_arrays([chunked_arr], ["foo"]) + + +def test_dictionary_from_numpy(): + indices = np.repeat([0, 1, 2], 2) + dictionary = np.array(['foo', 'bar', 'baz'], dtype=object) + mask = np.array([False, False, True, False, False, False]) + + d1 = pa.DictionaryArray.from_arrays(indices, dictionary) + d2 = pa.DictionaryArray.from_arrays(indices, dictionary, mask=mask) + + assert d1.indices.to_pylist() == indices.tolist() + assert d1.indices.to_pylist() == indices.tolist() + assert d1.dictionary.to_pylist() == dictionary.tolist() + assert d2.dictionary.to_pylist() == dictionary.tolist() + + for i in range(len(indices)): + assert d1[i].as_py() == dictionary[indices[i]] + + if mask[i]: + assert d2[i].as_py() is None + else: + assert d2[i].as_py() == dictionary[indices[i]] + + +def test_dictionary_to_numpy(): + expected = pa.array( + ["foo", "bar", None, "foo"] + ).to_numpy(zero_copy_only=False) + a = pa.DictionaryArray.from_arrays( + pa.array([0, 1, None, 0]), + pa.array(['foo', 'bar']) + ) + np.testing.assert_array_equal(a.to_numpy(zero_copy_only=False), + expected) + + with pytest.raises(pa.ArrowInvalid): + # If this would be changed to no longer raise in the future, + # ensure to test the actual result because, currently, to_numpy takes + # for granted that when zero_copy_only=True there will be no nulls + # (it's the decoding of the DictionaryArray that handles the nulls and + # this is only activated with zero_copy_only=False) + a.to_numpy(zero_copy_only=True) + + anonulls = pa.DictionaryArray.from_arrays( + pa.array([0, 1, 1, 0]), + pa.array(['foo', 'bar']) + ) + expected = pa.array( + ["foo", "bar", "bar", "foo"] + ).to_numpy(zero_copy_only=False) + np.testing.assert_array_equal(anonulls.to_numpy(zero_copy_only=False), + expected) + + with pytest.raises(pa.ArrowInvalid): + anonulls.to_numpy(zero_copy_only=True) + + afloat = pa.DictionaryArray.from_arrays( + pa.array([0, 1, 1, 0]), + pa.array([13.7, 11.0]) + ) + expected = pa.array([13.7, 11.0, 11.0, 13.7]).to_numpy() + np.testing.assert_array_equal(afloat.to_numpy(zero_copy_only=True), + expected) + np.testing.assert_array_equal(afloat.to_numpy(zero_copy_only=False), + expected) + + afloat2 = pa.DictionaryArray.from_arrays( + pa.array([0, 1, None, 0]), + pa.array([13.7, 11.0]) + ) + expected = pa.array( + [13.7, 11.0, None, 13.7] + ).to_numpy(zero_copy_only=False) + np.testing.assert_allclose( + afloat2.to_numpy(zero_copy_only=False), + expected, + equal_nan=True + ) + + # Testing for integers can reveal problems related to dealing + # with None values, as a numpy array of int dtype + # can't contain NaN nor None. + aints = pa.DictionaryArray.from_arrays( + pa.array([0, 1, None, 0]), + pa.array([7, 11]) + ) + expected = pa.array([7, 11, None, 7]).to_numpy(zero_copy_only=False) + np.testing.assert_allclose( + aints.to_numpy(zero_copy_only=False), + expected, + equal_nan=True + ) + + +def test_dictionary_from_boxed_arrays(): + indices = np.repeat([0, 1, 2], 2) + dictionary = np.array(['foo', 'bar', 'baz'], dtype=object) + + iarr = pa.array(indices) + darr = pa.array(dictionary) + + d1 = pa.DictionaryArray.from_arrays(iarr, darr) + + assert d1.indices.to_pylist() == indices.tolist() + assert d1.dictionary.to_pylist() == dictionary.tolist() + + for i in range(len(indices)): + assert d1[i].as_py() == dictionary[indices[i]] + + +def test_dictionary_from_arrays_boundscheck(): + indices1 = pa.array([0, 1, 2, 0, 1, 2]) + indices2 = pa.array([0, -1, 2]) + indices3 = pa.array([0, 1, 2, 3]) + + dictionary = pa.array(['foo', 'bar', 'baz']) + + # Works fine + pa.DictionaryArray.from_arrays(indices1, dictionary) + + with pytest.raises(pa.ArrowException): + pa.DictionaryArray.from_arrays(indices2, dictionary) + + with pytest.raises(pa.ArrowException): + pa.DictionaryArray.from_arrays(indices3, dictionary) + + # If we are confident that the indices are "safe" we can pass safe=False to + # disable the boundschecking + pa.DictionaryArray.from_arrays(indices2, dictionary, safe=False) + + +def test_dictionary_indices(): + # https://issues.apache.org/jira/browse/ARROW-6882 + indices = pa.array([0, 1, 2, 0, 1, 2]) + dictionary = pa.array(['foo', 'bar', 'baz']) + arr = pa.DictionaryArray.from_arrays(indices, dictionary) + arr.indices.validate(full=True) + + +@pytest.mark.parametrize(('list_array_type', 'list_type_factory'), + [(pa.ListArray, pa.list_), + (pa.LargeListArray, pa.large_list)]) +def test_list_from_arrays(list_array_type, list_type_factory): + offsets_arr = np.array([0, 2, 5, 8], dtype='i4') + offsets = pa.array(offsets_arr, type='int32') + pyvalues = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h'] + values = pa.array(pyvalues, type='binary') + + result = list_array_type.from_arrays(offsets, values) + expected = pa.array([pyvalues[:2], pyvalues[2:5], pyvalues[5:8]], + type=list_type_factory(pa.binary())) + + assert result.equals(expected) + + # With nulls + offsets = [0, None, 2, 6] + values = [b'a', b'b', b'c', b'd', b'e', b'f'] + + result = list_array_type.from_arrays(offsets, values) + expected = pa.array([values[:2], None, values[2:]], + type=list_type_factory(pa.binary())) + + assert result.equals(expected) + + # Another edge case + offsets2 = [0, 2, None, 6] + result = list_array_type.from_arrays(offsets2, values) + expected = pa.array([values[:2], values[2:], None], + type=list_type_factory(pa.binary())) + assert result.equals(expected) + + # raise on invalid array + offsets = [1, 3, 10] + values = np.arange(5) + with pytest.raises(ValueError): + list_array_type.from_arrays(offsets, values) + + # Non-monotonic offsets + offsets = [0, 3, 2, 6] + values = list(range(6)) + result = list_array_type.from_arrays(offsets, values) + with pytest.raises(ValueError): + result.validate(full=True) + + +def test_map_from_arrays(): + offsets_arr = np.array([0, 2, 5, 8], dtype='i4') + offsets = pa.array(offsets_arr, type='int32') + pykeys = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h'] + pyitems = list(range(len(pykeys))) + pypairs = list(zip(pykeys, pyitems)) + pyentries = [pypairs[:2], pypairs[2:5], pypairs[5:8]] + keys = pa.array(pykeys, type='binary') + items = pa.array(pyitems, type='i4') + + result = pa.MapArray.from_arrays(offsets, keys, items) + expected = pa.array(pyentries, type=pa.map_(pa.binary(), pa.int32())) + + assert result.equals(expected) + + # With nulls + offsets = [0, None, 2, 6] + pykeys = [b'a', b'b', b'c', b'd', b'e', b'f'] + pyitems = [1, 2, 3, None, 4, 5] + pypairs = list(zip(pykeys, pyitems)) + pyentries = [pypairs[:2], None, pypairs[2:]] + keys = pa.array(pykeys, type='binary') + items = pa.array(pyitems, type='i4') + + result = pa.MapArray.from_arrays(offsets, keys, items) + expected = pa.array(pyentries, type=pa.map_(pa.binary(), pa.int32())) + + assert result.equals(expected) + + # check invalid usage + + offsets = [0, 1, 3, 5] + keys = np.arange(5) + items = np.arange(5) + _ = pa.MapArray.from_arrays(offsets, keys, items) + + # raise on invalid offsets + with pytest.raises(ValueError): + pa.MapArray.from_arrays(offsets + [6], keys, items) + + # raise on length of keys != items + with pytest.raises(ValueError): + pa.MapArray.from_arrays(offsets, keys, np.concatenate([items, items])) + + # raise on keys with null + keys_with_null = list(keys)[:-1] + [None] + assert len(keys_with_null) == len(items) + with pytest.raises(ValueError): + pa.MapArray.from_arrays(offsets, keys_with_null, items) + + +def test_fixed_size_list_from_arrays(): + values = pa.array(range(12), pa.int64()) + result = pa.FixedSizeListArray.from_arrays(values, 4) + assert result.to_pylist() == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]] + assert result.type.equals(pa.list_(pa.int64(), 4)) + + # raise on invalid values / list_size + with pytest.raises(ValueError): + pa.FixedSizeListArray.from_arrays(values, -4) + + with pytest.raises(ValueError): + # array with list size 0 cannot be constructed with from_arrays + pa.FixedSizeListArray.from_arrays(pa.array([], pa.int64()), 0) + + with pytest.raises(ValueError): + # length of values not multiple of 5 + pa.FixedSizeListArray.from_arrays(values, 5) + + +def test_variable_list_from_arrays(): + values = pa.array([1, 2, 3, 4], pa.int64()) + offsets = pa.array([0, 2, 4]) + result = pa.ListArray.from_arrays(offsets, values) + assert result.to_pylist() == [[1, 2], [3, 4]] + assert result.type.equals(pa.list_(pa.int64())) + + offsets = pa.array([0, None, 2, 4]) + result = pa.ListArray.from_arrays(offsets, values) + assert result.to_pylist() == [[1, 2], None, [3, 4]] + + # raise if offset out of bounds + with pytest.raises(ValueError): + pa.ListArray.from_arrays(pa.array([-1, 2, 4]), values) + + with pytest.raises(ValueError): + pa.ListArray.from_arrays(pa.array([0, 2, 5]), values) + + +def test_union_from_dense(): + binary = pa.array([b'a', b'b', b'c', b'd'], type='binary') + int64 = pa.array([1, 2, 3], type='int64') + types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8') + logical_types = pa.array([11, 13, 11, 11, 13, 13, 11], type='int8') + value_offsets = pa.array([0, 0, 1, 2, 1, 2, 3], type='int32') + py_value = [b'a', 1, b'b', b'c', 2, 3, b'd'] + + def check_result(result, expected_field_names, expected_type_codes, + expected_type_code_values): + result.validate(full=True) + actual_field_names = [result.type[i].name + for i in range(result.type.num_fields)] + assert actual_field_names == expected_field_names + assert result.type.mode == "dense" + assert result.type.type_codes == expected_type_codes + assert result.to_pylist() == py_value + assert expected_type_code_values.equals(result.type_codes) + assert value_offsets.equals(result.offsets) + assert result.field(0).equals(binary) + assert result.field(1).equals(int64) + with pytest.raises(KeyError): + result.field(-1) + with pytest.raises(KeyError): + result.field(2) + + # without field names and type codes + check_result(pa.UnionArray.from_dense(types, value_offsets, + [binary, int64]), + expected_field_names=['0', '1'], + expected_type_codes=[0, 1], + expected_type_code_values=types) + + # with field names + check_result(pa.UnionArray.from_dense(types, value_offsets, + [binary, int64], + ['bin', 'int']), + expected_field_names=['bin', 'int'], + expected_type_codes=[0, 1], + expected_type_code_values=types) + + # with type codes + check_result(pa.UnionArray.from_dense(logical_types, value_offsets, + [binary, int64], + type_codes=[11, 13]), + expected_field_names=['0', '1'], + expected_type_codes=[11, 13], + expected_type_code_values=logical_types) + + # with field names and type codes + check_result(pa.UnionArray.from_dense(logical_types, value_offsets, + [binary, int64], + ['bin', 'int'], [11, 13]), + expected_field_names=['bin', 'int'], + expected_type_codes=[11, 13], + expected_type_code_values=logical_types) + + # Bad type ids + arr = pa.UnionArray.from_dense(logical_types, value_offsets, + [binary, int64]) + with pytest.raises(pa.ArrowInvalid): + arr.validate(full=True) + arr = pa.UnionArray.from_dense(types, value_offsets, [binary, int64], + type_codes=[11, 13]) + with pytest.raises(pa.ArrowInvalid): + arr.validate(full=True) + + # Offset larger than child size + bad_offsets = pa.array([0, 0, 1, 2, 1, 2, 4], type='int32') + arr = pa.UnionArray.from_dense(types, bad_offsets, [binary, int64]) + with pytest.raises(pa.ArrowInvalid): + arr.validate(full=True) + + +def test_union_from_sparse(): + binary = pa.array([b'a', b' ', b'b', b'c', b' ', b' ', b'd'], + type='binary') + int64 = pa.array([0, 1, 0, 0, 2, 3, 0], type='int64') + types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8') + logical_types = pa.array([11, 13, 11, 11, 13, 13, 11], type='int8') + py_value = [b'a', 1, b'b', b'c', 2, 3, b'd'] + + def check_result(result, expected_field_names, expected_type_codes, + expected_type_code_values): + result.validate(full=True) + assert result.to_pylist() == py_value + actual_field_names = [result.type[i].name + for i in range(result.type.num_fields)] + assert actual_field_names == expected_field_names + assert result.type.mode == "sparse" + assert result.type.type_codes == expected_type_codes + assert expected_type_code_values.equals(result.type_codes) + assert result.field(0).equals(binary) + assert result.field(1).equals(int64) + with pytest.raises(pa.ArrowTypeError): + result.offsets + with pytest.raises(KeyError): + result.field(-1) + with pytest.raises(KeyError): + result.field(2) + + # without field names and type codes + check_result(pa.UnionArray.from_sparse(types, [binary, int64]), + expected_field_names=['0', '1'], + expected_type_codes=[0, 1], + expected_type_code_values=types) + + # with field names + check_result(pa.UnionArray.from_sparse(types, [binary, int64], + ['bin', 'int']), + expected_field_names=['bin', 'int'], + expected_type_codes=[0, 1], + expected_type_code_values=types) + + # with type codes + check_result(pa.UnionArray.from_sparse(logical_types, [binary, int64], + type_codes=[11, 13]), + expected_field_names=['0', '1'], + expected_type_codes=[11, 13], + expected_type_code_values=logical_types) + + # with field names and type codes + check_result(pa.UnionArray.from_sparse(logical_types, [binary, int64], + ['bin', 'int'], + [11, 13]), + expected_field_names=['bin', 'int'], + expected_type_codes=[11, 13], + expected_type_code_values=logical_types) + + # Bad type ids + arr = pa.UnionArray.from_sparse(logical_types, [binary, int64]) + with pytest.raises(pa.ArrowInvalid): + arr.validate(full=True) + arr = pa.UnionArray.from_sparse(types, [binary, int64], + type_codes=[11, 13]) + with pytest.raises(pa.ArrowInvalid): + arr.validate(full=True) + + # Invalid child length + with pytest.raises(pa.ArrowInvalid): + arr = pa.UnionArray.from_sparse(logical_types, [binary, int64[1:]]) + + +def test_union_array_to_pylist_with_nulls(): + # ARROW-9556 + arr = pa.UnionArray.from_sparse( + pa.array([0, 1, 0, 0, 1], type=pa.int8()), + [ + pa.array([0.0, 1.1, None, 3.3, 4.4]), + pa.array([True, None, False, True, False]), + ] + ) + assert arr.to_pylist() == [0.0, None, None, 3.3, False] + + arr = pa.UnionArray.from_dense( + pa.array([0, 1, 0, 0, 0, 1, 1], type=pa.int8()), + pa.array([0, 0, 1, 2, 3, 1, 2], type=pa.int32()), + [ + pa.array([0.0, 1.1, None, 3.3]), + pa.array([True, None, False]) + ] + ) + assert arr.to_pylist() == [0.0, True, 1.1, None, 3.3, None, False] + + +def test_union_array_slice(): + # ARROW-2314 + arr = pa.UnionArray.from_sparse(pa.array([0, 0, 1, 1], type=pa.int8()), + [pa.array(["a", "b", "c", "d"]), + pa.array([1, 2, 3, 4])]) + assert arr[1:].to_pylist() == ["b", 3, 4] + + binary = pa.array([b'a', b'b', b'c', b'd'], type='binary') + int64 = pa.array([1, 2, 3], type='int64') + types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8') + value_offsets = pa.array([0, 0, 2, 1, 1, 2, 3], type='int32') + + arr = pa.UnionArray.from_dense(types, value_offsets, [binary, int64]) + lst = arr.to_pylist() + for i in range(len(arr)): + for j in range(i, len(arr)): + assert arr[i:j].to_pylist() == lst[i:j] + + +def _check_cast_case(case, *, safe=True, check_array_construction=True): + in_data, in_type, out_data, out_type = case + if isinstance(out_data, pa.Array): + assert out_data.type == out_type + expected = out_data + else: + expected = pa.array(out_data, type=out_type) + + # check casting an already created array + if isinstance(in_data, pa.Array): + assert in_data.type == in_type + in_arr = in_data + else: + in_arr = pa.array(in_data, type=in_type) + casted = in_arr.cast(out_type, safe=safe) + casted.validate(full=True) + assert casted.equals(expected) + + # constructing an array with out type which optionally involves casting + # for more see ARROW-1949 + if check_array_construction: + in_arr = pa.array(in_data, type=out_type, safe=safe) + assert in_arr.equals(expected) + + +def test_cast_integers_safe(): + safe_cases = [ + (np.array([0, 1, 2, 3], dtype='i1'), 'int8', + np.array([0, 1, 2, 3], dtype='i4'), pa.int32()), + (np.array([0, 1, 2, 3], dtype='i1'), 'int8', + np.array([0, 1, 2, 3], dtype='u4'), pa.uint16()), + (np.array([0, 1, 2, 3], dtype='i1'), 'int8', + np.array([0, 1, 2, 3], dtype='u1'), pa.uint8()), + (np.array([0, 1, 2, 3], dtype='i1'), 'int8', + np.array([0, 1, 2, 3], dtype='f8'), pa.float64()) + ] + + for case in safe_cases: + _check_cast_case(case) + + unsafe_cases = [ + (np.array([50000], dtype='i4'), 'int32', 'int16'), + (np.array([70000], dtype='i4'), 'int32', 'uint16'), + (np.array([-1], dtype='i4'), 'int32', 'uint16'), + (np.array([50000], dtype='u2'), 'uint16', 'int16') + ] + for in_data, in_type, out_type in unsafe_cases: + in_arr = pa.array(in_data, type=in_type) + + with pytest.raises(pa.ArrowInvalid): + in_arr.cast(out_type) + + +def test_cast_none(): + # ARROW-3735: Ensure that calling cast(None) doesn't segfault. + arr = pa.array([1, 2, 3]) + + with pytest.raises(ValueError): + arr.cast(None) + + +def test_cast_list_to_primitive(): + # ARROW-8070: cast segfaults on unsupported cast from list<binary> to utf8 + arr = pa.array([[1, 2], [3, 4]]) + with pytest.raises(NotImplementedError): + arr.cast(pa.int8()) + + arr = pa.array([[b"a", b"b"], [b"c"]], pa.list_(pa.binary())) + with pytest.raises(NotImplementedError): + arr.cast(pa.binary()) + + +def test_slice_chunked_array_zero_chunks(): + # ARROW-8911 + arr = pa.chunked_array([], type='int8') + assert arr.num_chunks == 0 + + result = arr[:] + assert result.equals(arr) + + # Do not crash + arr[:5] + + +def test_cast_chunked_array(): + arrays = [pa.array([1, 2, 3]), pa.array([4, 5, 6])] + carr = pa.chunked_array(arrays) + + target = pa.float64() + casted = carr.cast(target) + expected = pa.chunked_array([x.cast(target) for x in arrays]) + assert casted.equals(expected) + + +def test_cast_chunked_array_empty(): + # ARROW-8142 + for typ1, typ2 in [(pa.dictionary(pa.int8(), pa.string()), pa.string()), + (pa.int64(), pa.int32())]: + + arr = pa.chunked_array([], type=typ1) + result = arr.cast(typ2) + expected = pa.chunked_array([], type=typ2) + assert result.equals(expected) + + +def test_chunked_array_data_warns(): + with pytest.warns(FutureWarning): + res = pa.chunked_array([[]]).data + assert isinstance(res, pa.ChunkedArray) + + +def test_cast_integers_unsafe(): + # We let NumPy do the unsafe casting + unsafe_cases = [ + (np.array([50000], dtype='i4'), 'int32', + np.array([50000], dtype='i2'), pa.int16()), + (np.array([70000], dtype='i4'), 'int32', + np.array([70000], dtype='u2'), pa.uint16()), + (np.array([-1], dtype='i4'), 'int32', + np.array([-1], dtype='u2'), pa.uint16()), + (np.array([50000], dtype='u2'), pa.uint16(), + np.array([50000], dtype='i2'), pa.int16()) + ] + + for case in unsafe_cases: + _check_cast_case(case, safe=False) + + +def test_floating_point_truncate_safe(): + safe_cases = [ + (np.array([1.0, 2.0, 3.0], dtype='float32'), 'float32', + np.array([1, 2, 3], dtype='i4'), pa.int32()), + (np.array([1.0, 2.0, 3.0], dtype='float64'), 'float64', + np.array([1, 2, 3], dtype='i4'), pa.int32()), + (np.array([-10.0, 20.0, -30.0], dtype='float64'), 'float64', + np.array([-10, 20, -30], dtype='i4'), pa.int32()), + ] + for case in safe_cases: + _check_cast_case(case, safe=True) + + +def test_floating_point_truncate_unsafe(): + unsafe_cases = [ + (np.array([1.1, 2.2, 3.3], dtype='float32'), 'float32', + np.array([1, 2, 3], dtype='i4'), pa.int32()), + (np.array([1.1, 2.2, 3.3], dtype='float64'), 'float64', + np.array([1, 2, 3], dtype='i4'), pa.int32()), + (np.array([-10.1, 20.2, -30.3], dtype='float64'), 'float64', + np.array([-10, 20, -30], dtype='i4'), pa.int32()), + ] + for case in unsafe_cases: + # test safe casting raises + with pytest.raises(pa.ArrowInvalid, match='truncated'): + _check_cast_case(case, safe=True) + + # test unsafe casting truncates + _check_cast_case(case, safe=False) + + +def test_decimal_to_int_safe(): + safe_cases = [ + ( + [decimal.Decimal("123456"), None, decimal.Decimal("-912345")], + pa.decimal128(32, 5), + [123456, None, -912345], + pa.int32() + ), + ( + [decimal.Decimal("1234"), None, decimal.Decimal("-9123")], + pa.decimal128(19, 10), + [1234, None, -9123], + pa.int16() + ), + ( + [decimal.Decimal("123"), None, decimal.Decimal("-91")], + pa.decimal128(19, 10), + [123, None, -91], + pa.int8() + ), + ] + for case in safe_cases: + _check_cast_case(case) + _check_cast_case(case, safe=True) + + +def test_decimal_to_int_value_out_of_bounds(): + out_of_bounds_cases = [ + ( + np.array([ + decimal.Decimal("1234567890123"), + None, + decimal.Decimal("-912345678901234") + ]), + pa.decimal128(32, 5), + [1912276171, None, -135950322], + pa.int32() + ), + ( + [decimal.Decimal("123456"), None, decimal.Decimal("-912345678")], + pa.decimal128(32, 5), + [-7616, None, -19022], + pa.int16() + ), + ( + [decimal.Decimal("1234"), None, decimal.Decimal("-9123")], + pa.decimal128(32, 5), + [-46, None, 93], + pa.int8() + ), + ] + + for case in out_of_bounds_cases: + # test safe casting raises + with pytest.raises(pa.ArrowInvalid, + match='Integer value out of bounds'): + _check_cast_case(case) + + # XXX `safe=False` can be ignored when constructing an array + # from a sequence of Python objects (ARROW-8567) + _check_cast_case(case, safe=False, check_array_construction=False) + + +def test_decimal_to_int_non_integer(): + non_integer_cases = [ + ( + [ + decimal.Decimal("123456.21"), + None, + decimal.Decimal("-912345.13") + ], + pa.decimal128(32, 5), + [123456, None, -912345], + pa.int32() + ), + ( + [decimal.Decimal("1234.134"), None, decimal.Decimal("-9123.1")], + pa.decimal128(19, 10), + [1234, None, -9123], + pa.int16() + ), + ( + [decimal.Decimal("123.1451"), None, decimal.Decimal("-91.21")], + pa.decimal128(19, 10), + [123, None, -91], + pa.int8() + ), + ] + + for case in non_integer_cases: + # test safe casting raises + msg_regexp = 'Rescaling Decimal128 value would cause data loss' + with pytest.raises(pa.ArrowInvalid, match=msg_regexp): + _check_cast_case(case) + + _check_cast_case(case, safe=False) + + +def test_decimal_to_decimal(): + arr = pa.array( + [decimal.Decimal("1234.12"), None], + type=pa.decimal128(19, 10) + ) + result = arr.cast(pa.decimal128(15, 6)) + expected = pa.array( + [decimal.Decimal("1234.12"), None], + type=pa.decimal128(15, 6) + ) + assert result.equals(expected) + + msg_regexp = 'Rescaling Decimal128 value would cause data loss' + with pytest.raises(pa.ArrowInvalid, match=msg_regexp): + result = arr.cast(pa.decimal128(9, 1)) + + result = arr.cast(pa.decimal128(9, 1), safe=False) + expected = pa.array( + [decimal.Decimal("1234.1"), None], + type=pa.decimal128(9, 1) + ) + assert result.equals(expected) + + with pytest.raises(pa.ArrowInvalid, + match='Decimal value does not fit in precision'): + result = arr.cast(pa.decimal128(5, 2)) + + +def test_safe_cast_nan_to_int_raises(): + arr = pa.array([np.nan, 1.]) + + with pytest.raises(pa.ArrowInvalid, match='truncated'): + arr.cast(pa.int64(), safe=True) + + +def test_cast_signed_to_unsigned(): + safe_cases = [ + (np.array([0, 1, 2, 3], dtype='i1'), pa.uint8(), + np.array([0, 1, 2, 3], dtype='u1'), pa.uint8()), + (np.array([0, 1, 2, 3], dtype='i2'), pa.uint16(), + np.array([0, 1, 2, 3], dtype='u2'), pa.uint16()) + ] + + for case in safe_cases: + _check_cast_case(case) + + +def test_cast_from_null(): + in_data = [None] * 3 + in_type = pa.null() + out_types = [ + pa.null(), + pa.uint8(), + pa.float16(), + pa.utf8(), + pa.binary(), + pa.binary(10), + pa.list_(pa.int16()), + pa.list_(pa.int32(), 4), + pa.large_list(pa.uint8()), + pa.decimal128(19, 4), + pa.timestamp('us'), + pa.timestamp('us', tz='UTC'), + pa.timestamp('us', tz='Europe/Paris'), + pa.duration('us'), + pa.month_day_nano_interval(), + pa.struct([pa.field('a', pa.int32()), + pa.field('b', pa.list_(pa.int8())), + pa.field('c', pa.string())]), + pa.dictionary(pa.int32(), pa.string()), + ] + for out_type in out_types: + _check_cast_case((in_data, in_type, in_data, out_type)) + + out_types = [ + + pa.union([pa.field('a', pa.binary(10)), + pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE), + pa.union([pa.field('a', pa.binary(10)), + pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE), + ] + in_arr = pa.array(in_data, type=pa.null()) + for out_type in out_types: + with pytest.raises(NotImplementedError): + in_arr.cast(out_type) + + +def test_cast_string_to_number_roundtrip(): + cases = [ + (pa.array(["1", "127", "-128"]), + pa.array([1, 127, -128], type=pa.int8())), + (pa.array([None, "18446744073709551615"]), + pa.array([None, 18446744073709551615], type=pa.uint64())), + ] + for in_arr, expected in cases: + casted = in_arr.cast(expected.type, safe=True) + casted.validate(full=True) + assert casted.equals(expected) + casted_back = casted.cast(in_arr.type, safe=True) + casted_back.validate(full=True) + assert casted_back.equals(in_arr) + + +def test_cast_dictionary(): + # cast to the value type + arr = pa.array( + ["foo", "bar", None], + type=pa.dictionary(pa.int64(), pa.string()) + ) + expected = pa.array(["foo", "bar", None]) + assert arr.type == pa.dictionary(pa.int64(), pa.string()) + assert arr.cast(pa.string()) == expected + + # cast to a different key type + for key_type in [pa.int8(), pa.int16(), pa.int32()]: + typ = pa.dictionary(key_type, pa.string()) + expected = pa.array( + ["foo", "bar", None], + type=pa.dictionary(key_type, pa.string()) + ) + assert arr.cast(typ) == expected + + # shouldn't crash (ARROW-7077) + with pytest.raises(pa.ArrowInvalid): + arr.cast(pa.int32()) + + +def test_view(): + # ARROW-5992 + arr = pa.array(['foo', 'bar', 'baz'], type=pa.utf8()) + expected = pa.array(['foo', 'bar', 'baz'], type=pa.binary()) + + assert arr.view(pa.binary()).equals(expected) + assert arr.view('binary').equals(expected) + + +def test_unique_simple(): + cases = [ + (pa.array([1, 2, 3, 1, 2, 3]), pa.array([1, 2, 3])), + (pa.array(['foo', None, 'bar', 'foo']), + pa.array(['foo', None, 'bar'])), + (pa.array(['foo', None, 'bar', 'foo'], pa.large_binary()), + pa.array(['foo', None, 'bar'], pa.large_binary())), + ] + for arr, expected in cases: + result = arr.unique() + assert result.equals(expected) + result = pa.chunked_array([arr]).unique() + assert result.equals(expected) + + +def test_value_counts_simple(): + cases = [ + (pa.array([1, 2, 3, 1, 2, 3]), + pa.array([1, 2, 3]), + pa.array([2, 2, 2], type=pa.int64())), + (pa.array(['foo', None, 'bar', 'foo']), + pa.array(['foo', None, 'bar']), + pa.array([2, 1, 1], type=pa.int64())), + (pa.array(['foo', None, 'bar', 'foo'], pa.large_binary()), + pa.array(['foo', None, 'bar'], pa.large_binary()), + pa.array([2, 1, 1], type=pa.int64())), + ] + for arr, expected_values, expected_counts in cases: + for arr_in in (arr, pa.chunked_array([arr])): + result = arr_in.value_counts() + assert result.type.equals( + pa.struct([pa.field("values", arr.type), + pa.field("counts", pa.int64())])) + assert result.field("values").equals(expected_values) + assert result.field("counts").equals(expected_counts) + + +def test_unique_value_counts_dictionary_type(): + indices = pa.array([3, 0, 0, 0, 1, 1, 3, 0, 1, 3, 0, 1]) + dictionary = pa.array(['foo', 'bar', 'baz', 'qux']) + + arr = pa.DictionaryArray.from_arrays(indices, dictionary) + + unique_result = arr.unique() + expected = pa.DictionaryArray.from_arrays(indices.unique(), dictionary) + assert unique_result.equals(expected) + + result = arr.value_counts() + assert result.field('values').equals(unique_result) + assert result.field('counts').equals(pa.array([3, 5, 4], type='int64')) + + arr = pa.DictionaryArray.from_arrays( + pa.array([], type='int64'), dictionary) + unique_result = arr.unique() + expected = pa.DictionaryArray.from_arrays(pa.array([], type='int64'), + pa.array([], type='utf8')) + assert unique_result.equals(expected) + + result = arr.value_counts() + assert result.field('values').equals(unique_result) + assert result.field('counts').equals(pa.array([], type='int64')) + + +def test_dictionary_encode_simple(): + cases = [ + (pa.array([1, 2, 3, None, 1, 2, 3]), + pa.DictionaryArray.from_arrays( + pa.array([0, 1, 2, None, 0, 1, 2], type='int32'), + [1, 2, 3])), + (pa.array(['foo', None, 'bar', 'foo']), + pa.DictionaryArray.from_arrays( + pa.array([0, None, 1, 0], type='int32'), + ['foo', 'bar'])), + (pa.array(['foo', None, 'bar', 'foo'], type=pa.large_binary()), + pa.DictionaryArray.from_arrays( + pa.array([0, None, 1, 0], type='int32'), + pa.array(['foo', 'bar'], type=pa.large_binary()))), + ] + for arr, expected in cases: + result = arr.dictionary_encode() + assert result.equals(expected) + result = pa.chunked_array([arr]).dictionary_encode() + assert result.num_chunks == 1 + assert result.chunk(0).equals(expected) + result = pa.chunked_array([], type=arr.type).dictionary_encode() + assert result.num_chunks == 0 + assert result.type == expected.type + + +def test_dictionary_encode_sliced(): + cases = [ + (pa.array([1, 2, 3, None, 1, 2, 3])[1:-1], + pa.DictionaryArray.from_arrays( + pa.array([0, 1, None, 2, 0], type='int32'), + [2, 3, 1])), + (pa.array([None, 'foo', 'bar', 'foo', 'xyzzy'])[1:-1], + pa.DictionaryArray.from_arrays( + pa.array([0, 1, 0], type='int32'), + ['foo', 'bar'])), + (pa.array([None, 'foo', 'bar', 'foo', 'xyzzy'], + type=pa.large_string())[1:-1], + pa.DictionaryArray.from_arrays( + pa.array([0, 1, 0], type='int32'), + pa.array(['foo', 'bar'], type=pa.large_string()))), + ] + for arr, expected in cases: + result = arr.dictionary_encode() + assert result.equals(expected) + result = pa.chunked_array([arr]).dictionary_encode() + assert result.num_chunks == 1 + assert result.type == expected.type + assert result.chunk(0).equals(expected) + result = pa.chunked_array([], type=arr.type).dictionary_encode() + assert result.num_chunks == 0 + assert result.type == expected.type + + # ARROW-9143 dictionary_encode after slice was segfaulting + array = pa.array(['foo', 'bar', 'baz']) + array.slice(1).dictionary_encode() + + +def test_dictionary_encode_zero_length(): + # User-facing experience of ARROW-7008 + arr = pa.array([], type=pa.string()) + encoded = arr.dictionary_encode() + assert len(encoded.dictionary) == 0 + encoded.validate(full=True) + + +def test_dictionary_decode(): + cases = [ + (pa.array([1, 2, 3, None, 1, 2, 3]), + pa.DictionaryArray.from_arrays( + pa.array([0, 1, 2, None, 0, 1, 2], type='int32'), + [1, 2, 3])), + (pa.array(['foo', None, 'bar', 'foo']), + pa.DictionaryArray.from_arrays( + pa.array([0, None, 1, 0], type='int32'), + ['foo', 'bar'])), + (pa.array(['foo', None, 'bar', 'foo'], type=pa.large_binary()), + pa.DictionaryArray.from_arrays( + pa.array([0, None, 1, 0], type='int32'), + pa.array(['foo', 'bar'], type=pa.large_binary()))), + ] + for expected, arr in cases: + result = arr.dictionary_decode() + assert result.equals(expected) + + +def test_cast_time32_to_int(): + arr = pa.array(np.array([0, 1, 2], dtype='int32'), + type=pa.time32('s')) + expected = pa.array([0, 1, 2], type='i4') + + result = arr.cast('i4') + assert result.equals(expected) + + +def test_cast_time64_to_int(): + arr = pa.array(np.array([0, 1, 2], dtype='int64'), + type=pa.time64('us')) + expected = pa.array([0, 1, 2], type='i8') + + result = arr.cast('i8') + assert result.equals(expected) + + +def test_cast_timestamp_to_int(): + arr = pa.array(np.array([0, 1, 2], dtype='int64'), + type=pa.timestamp('us')) + expected = pa.array([0, 1, 2], type='i8') + + result = arr.cast('i8') + assert result.equals(expected) + + +def test_cast_date32_to_int(): + arr = pa.array([0, 1, 2], type='i4') + + result1 = arr.cast('date32') + result2 = result1.cast('i4') + + expected1 = pa.array([ + datetime.date(1970, 1, 1), + datetime.date(1970, 1, 2), + datetime.date(1970, 1, 3) + ]).cast('date32') + + assert result1.equals(expected1) + assert result2.equals(arr) + + +def test_cast_duration_to_int(): + arr = pa.array(np.array([0, 1, 2], dtype='int64'), + type=pa.duration('us')) + expected = pa.array([0, 1, 2], type='i8') + + result = arr.cast('i8') + assert result.equals(expected) + + +def test_cast_binary_to_utf8(): + binary_arr = pa.array([b'foo', b'bar', b'baz'], type=pa.binary()) + utf8_arr = binary_arr.cast(pa.utf8()) + expected = pa.array(['foo', 'bar', 'baz'], type=pa.utf8()) + + assert utf8_arr.equals(expected) + + non_utf8_values = [('maƱana').encode('utf-16-le')] + non_utf8_binary = pa.array(non_utf8_values) + assert non_utf8_binary.type == pa.binary() + with pytest.raises(ValueError): + non_utf8_binary.cast(pa.string()) + + non_utf8_all_null = pa.array(non_utf8_values, mask=np.array([True]), + type=pa.binary()) + # No error + casted = non_utf8_all_null.cast(pa.string()) + assert casted.null_count == 1 + + +def test_cast_date64_to_int(): + arr = pa.array(np.array([0, 1, 2], dtype='int64'), + type=pa.date64()) + expected = pa.array([0, 1, 2], type='i8') + + result = arr.cast('i8') + + assert result.equals(expected) + + +def test_date64_from_builtin_datetime(): + val1 = datetime.datetime(2000, 1, 1, 12, 34, 56, 123456) + val2 = datetime.datetime(2000, 1, 1) + result = pa.array([val1, val2], type='date64') + result2 = pa.array([val1.date(), val2.date()], type='date64') + + assert result.equals(result2) + + as_i8 = result.view('int64') + assert as_i8[0].as_py() == as_i8[1].as_py() + + +@pytest.mark.parametrize(('ty', 'values'), [ + ('bool', [True, False, True]), + ('uint8', range(0, 255)), + ('int8', range(0, 128)), + ('uint16', range(0, 10)), + ('int16', range(0, 10)), + ('uint32', range(0, 10)), + ('int32', range(0, 10)), + ('uint64', range(0, 10)), + ('int64', range(0, 10)), + ('float', [0.0, 0.1, 0.2]), + ('double', [0.0, 0.1, 0.2]), + ('string', ['a', 'b', 'c']), + ('binary', [b'a', b'b', b'c']), + (pa.binary(3), [b'abc', b'bcd', b'cde']) +]) +def test_cast_identities(ty, values): + arr = pa.array(values, type=ty) + assert arr.cast(ty).equals(arr) + + +pickle_test_parametrize = pytest.mark.parametrize( + ('data', 'typ'), + [ + ([True, False, True, True], pa.bool_()), + ([1, 2, 4, 6], pa.int64()), + ([1.0, 2.5, None], pa.float64()), + (['a', None, 'b'], pa.string()), + ([], None), + ([[1, 2], [3]], pa.list_(pa.int64())), + ([[4, 5], [6]], pa.large_list(pa.int16())), + ([['a'], None, ['b', 'c']], pa.list_(pa.string())), + ([(1, 'a'), (2, 'c'), None], + pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())])) + ] +) + + +@pickle_test_parametrize +def test_array_pickle(data, typ): + # Allocate here so that we don't have any Arrow data allocated. + # This is needed to ensure that allocator tests can be reliable. + array = pa.array(data, type=typ) + for proto in range(0, pickle.HIGHEST_PROTOCOL + 1): + result = pickle.loads(pickle.dumps(array, proto)) + assert array.equals(result) + + +def test_array_pickle_dictionary(): + # not included in the above as dictionary array cannot be created with + # the pa.array function + array = pa.DictionaryArray.from_arrays([0, 1, 2, 0, 1], ['a', 'b', 'c']) + for proto in range(0, pickle.HIGHEST_PROTOCOL + 1): + result = pickle.loads(pickle.dumps(array, proto)) + assert array.equals(result) + + +@h.given( + past.arrays( + past.all_types, + size=st.integers(min_value=0, max_value=10) + ) +) +def test_pickling(arr): + data = pickle.dumps(arr) + restored = pickle.loads(data) + assert arr.equals(restored) + + +@pickle_test_parametrize +def test_array_pickle5(data, typ): + # Test zero-copy pickling with protocol 5 (PEP 574) + picklemod = pickle5 or pickle + if pickle5 is None and picklemod.HIGHEST_PROTOCOL < 5: + pytest.skip("need pickle5 package or Python 3.8+") + + array = pa.array(data, type=typ) + addresses = [buf.address if buf is not None else 0 + for buf in array.buffers()] + + for proto in range(5, pickle.HIGHEST_PROTOCOL + 1): + buffers = [] + pickled = picklemod.dumps(array, proto, buffer_callback=buffers.append) + result = picklemod.loads(pickled, buffers=buffers) + assert array.equals(result) + + result_addresses = [buf.address if buf is not None else 0 + for buf in result.buffers()] + assert result_addresses == addresses + + +@pytest.mark.parametrize( + 'narr', + [ + np.arange(10, dtype=np.int64), + np.arange(10, dtype=np.int32), + np.arange(10, dtype=np.int16), + np.arange(10, dtype=np.int8), + np.arange(10, dtype=np.uint64), + np.arange(10, dtype=np.uint32), + np.arange(10, dtype=np.uint16), + np.arange(10, dtype=np.uint8), + np.arange(10, dtype=np.float64), + np.arange(10, dtype=np.float32), + np.arange(10, dtype=np.float16), + ] +) +def test_to_numpy_roundtrip(narr): + arr = pa.array(narr) + assert narr.dtype == arr.to_numpy().dtype + np.testing.assert_array_equal(narr, arr.to_numpy()) + np.testing.assert_array_equal(narr[:6], arr[:6].to_numpy()) + np.testing.assert_array_equal(narr[2:], arr[2:].to_numpy()) + np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy()) + + +def test_array_uint64_from_py_over_range(): + arr = pa.array([2 ** 63], type=pa.uint64()) + expected = pa.array(np.array([2 ** 63], dtype='u8')) + assert arr.equals(expected) + + +def test_array_conversions_no_sentinel_values(): + arr = np.array([1, 2, 3, 4], dtype='int8') + refcount = sys.getrefcount(arr) + arr2 = pa.array(arr) # noqa + assert sys.getrefcount(arr) == (refcount + 1) + + assert arr2.type == 'int8' + + arr3 = pa.array(np.array([1, np.nan, 2, 3, np.nan, 4], dtype='float32'), + type='float32') + assert arr3.type == 'float32' + assert arr3.null_count == 0 + + +def test_time32_time64_from_integer(): + # ARROW-4111 + result = pa.array([1, 2, None], type=pa.time32('s')) + expected = pa.array([datetime.time(second=1), + datetime.time(second=2), None], + type=pa.time32('s')) + assert result.equals(expected) + + result = pa.array([1, 2, None], type=pa.time32('ms')) + expected = pa.array([datetime.time(microsecond=1000), + datetime.time(microsecond=2000), None], + type=pa.time32('ms')) + assert result.equals(expected) + + result = pa.array([1, 2, None], type=pa.time64('us')) + expected = pa.array([datetime.time(microsecond=1), + datetime.time(microsecond=2), None], + type=pa.time64('us')) + assert result.equals(expected) + + result = pa.array([1000, 2000, None], type=pa.time64('ns')) + expected = pa.array([datetime.time(microsecond=1), + datetime.time(microsecond=2), None], + type=pa.time64('ns')) + assert result.equals(expected) + + +def test_binary_string_pandas_null_sentinels(): + # ARROW-6227 + def _check_case(ty): + arr = pa.array(['string', np.nan], type=ty, from_pandas=True) + expected = pa.array(['string', None], type=ty) + assert arr.equals(expected) + _check_case('binary') + _check_case('utf8') + + +def test_pandas_null_sentinels_raise_error(): + # ARROW-6227 + cases = [ + ([None, np.nan], 'null'), + (['string', np.nan], 'binary'), + (['string', np.nan], 'utf8'), + (['string', np.nan], 'large_binary'), + (['string', np.nan], 'large_utf8'), + ([b'string', np.nan], pa.binary(6)), + ([True, np.nan], pa.bool_()), + ([decimal.Decimal('0'), np.nan], pa.decimal128(12, 2)), + ([0, np.nan], pa.date32()), + ([0, np.nan], pa.date32()), + ([0, np.nan], pa.date64()), + ([0, np.nan], pa.time32('s')), + ([0, np.nan], pa.time64('us')), + ([0, np.nan], pa.timestamp('us')), + ([0, np.nan], pa.duration('us')), + ] + for case, ty in cases: + # Both types of exceptions are raised. May want to clean that up + with pytest.raises((ValueError, TypeError)): + pa.array(case, type=ty) + + # from_pandas option suppresses failure + result = pa.array(case, type=ty, from_pandas=True) + assert result.null_count == (1 if ty != 'null' else 2) + + +@pytest.mark.pandas +def test_pandas_null_sentinels_index(): + # ARROW-7023 - ensure that when passing a pandas Index, "from_pandas" + # semantics are used + import pandas as pd + idx = pd.Index([1, 2, np.nan], dtype=object) + result = pa.array(idx) + expected = pa.array([1, 2, np.nan], from_pandas=True) + assert result.equals(expected) + + +def test_array_from_numpy_datetimeD(): + arr = np.array([None, datetime.date(2017, 4, 4)], dtype='datetime64[D]') + + result = pa.array(arr) + expected = pa.array([None, datetime.date(2017, 4, 4)], type=pa.date32()) + assert result.equals(expected) + + +def test_array_from_naive_datetimes(): + arr = pa.array([ + None, + datetime.datetime(2017, 4, 4, 12, 11, 10), + datetime.datetime(2018, 1, 1, 0, 2, 0) + ]) + assert arr.type == pa.timestamp('us', tz=None) + + +@pytest.mark.parametrize(('dtype', 'type'), [ + ('datetime64[s]', pa.timestamp('s')), + ('datetime64[ms]', pa.timestamp('ms')), + ('datetime64[us]', pa.timestamp('us')), + ('datetime64[ns]', pa.timestamp('ns')) +]) +def test_array_from_numpy_datetime(dtype, type): + data = [ + None, + datetime.datetime(2017, 4, 4, 12, 11, 10), + datetime.datetime(2018, 1, 1, 0, 2, 0) + ] + + # from numpy array + arr = pa.array(np.array(data, dtype=dtype)) + expected = pa.array(data, type=type) + assert arr.equals(expected) + + # from list of numpy scalars + arr = pa.array(list(np.array(data, dtype=dtype))) + assert arr.equals(expected) + + +def test_array_from_different_numpy_datetime_units_raises(): + data = [ + None, + datetime.datetime(2017, 4, 4, 12, 11, 10), + datetime.datetime(2018, 1, 1, 0, 2, 0) + ] + s = np.array(data, dtype='datetime64[s]') + ms = np.array(data, dtype='datetime64[ms]') + data = list(s[:2]) + list(ms[2:]) + + with pytest.raises(pa.ArrowNotImplementedError): + pa.array(data) + + +@pytest.mark.parametrize('unit', ['ns', 'us', 'ms', 's']) +def test_array_from_list_of_timestamps(unit): + n = np.datetime64('NaT', unit) + x = np.datetime64('2017-01-01 01:01:01.111111111', unit) + y = np.datetime64('2018-11-22 12:24:48.111111111', unit) + + a1 = pa.array([n, x, y]) + a2 = pa.array([n, x, y], type=pa.timestamp(unit)) + + assert a1.type == a2.type + assert a1.type.unit == unit + assert a1[0] == a2[0] + + +def test_array_from_timestamp_with_generic_unit(): + n = np.datetime64('NaT') + x = np.datetime64('2017-01-01 01:01:01.111111111') + y = np.datetime64('2018-11-22 12:24:48.111111111') + + with pytest.raises(pa.ArrowNotImplementedError, + match='Unbound or generic datetime64 time unit'): + pa.array([n, x, y]) + + +@pytest.mark.parametrize(('dtype', 'type'), [ + ('timedelta64[s]', pa.duration('s')), + ('timedelta64[ms]', pa.duration('ms')), + ('timedelta64[us]', pa.duration('us')), + ('timedelta64[ns]', pa.duration('ns')) +]) +def test_array_from_numpy_timedelta(dtype, type): + data = [ + None, + datetime.timedelta(1), + datetime.timedelta(0, 1) + ] + + # from numpy array + np_arr = np.array(data, dtype=dtype) + arr = pa.array(np_arr) + assert isinstance(arr, pa.DurationArray) + assert arr.type == type + expected = pa.array(data, type=type) + assert arr.equals(expected) + assert arr.to_pylist() == data + + # from list of numpy scalars + arr = pa.array(list(np.array(data, dtype=dtype))) + assert arr.equals(expected) + assert arr.to_pylist() == data + + +def test_array_from_numpy_timedelta_incorrect_unit(): + # generic (no unit) + td = np.timedelta64(1) + + for data in [[td], np.array([td])]: + with pytest.raises(NotImplementedError): + pa.array(data) + + # unsupported unit + td = np.timedelta64(1, 'M') + for data in [[td], np.array([td])]: + with pytest.raises(NotImplementedError): + pa.array(data) + + +def test_array_from_numpy_ascii(): + arr = np.array(['abcde', 'abc', ''], dtype='|S5') + + arrow_arr = pa.array(arr) + assert arrow_arr.type == 'binary' + expected = pa.array(['abcde', 'abc', ''], type='binary') + assert arrow_arr.equals(expected) + + mask = np.array([False, True, False]) + arrow_arr = pa.array(arr, mask=mask) + expected = pa.array(['abcde', None, ''], type='binary') + assert arrow_arr.equals(expected) + + # Strided variant + arr = np.array(['abcde', 'abc', ''] * 5, dtype='|S5')[::2] + mask = np.array([False, True, False] * 5)[::2] + arrow_arr = pa.array(arr, mask=mask) + + expected = pa.array(['abcde', '', None, 'abcde', '', None, 'abcde', ''], + type='binary') + assert arrow_arr.equals(expected) + + # 0 itemsize + arr = np.array(['', '', ''], dtype='|S0') + arrow_arr = pa.array(arr) + expected = pa.array(['', '', ''], type='binary') + assert arrow_arr.equals(expected) + + +def test_interval_array_from_timedelta(): + data = [ + None, + datetime.timedelta(days=1, seconds=1, microseconds=1, + milliseconds=1, minutes=1, hours=1, weeks=1)] + + # From timedelta (explicit type required) + arr = pa.array(data, pa.month_day_nano_interval()) + assert isinstance(arr, pa.MonthDayNanoIntervalArray) + assert arr.type == pa.month_day_nano_interval() + expected_list = [ + None, + pa.MonthDayNano([0, 8, + (datetime.timedelta(seconds=1, microseconds=1, + milliseconds=1, minutes=1, + hours=1) // + datetime.timedelta(microseconds=1)) * 1000])] + expected = pa.array(expected_list) + assert arr.equals(expected) + assert arr.to_pylist() == expected_list + + +@pytest.mark.pandas +def test_interval_array_from_relativedelta(): + # dateutil is dependency of pandas + from dateutil.relativedelta import relativedelta + from pandas import DateOffset + data = [ + None, + relativedelta(years=1, months=1, + days=1, seconds=1, microseconds=1, + minutes=1, hours=1, weeks=1, leapdays=1)] + # Note leapdays are ignored. + + # From relativedelta + arr = pa.array(data) + assert isinstance(arr, pa.MonthDayNanoIntervalArray) + assert arr.type == pa.month_day_nano_interval() + expected_list = [ + None, + pa.MonthDayNano([13, 8, + (datetime.timedelta(seconds=1, microseconds=1, + minutes=1, hours=1) // + datetime.timedelta(microseconds=1)) * 1000])] + expected = pa.array(expected_list) + assert arr.equals(expected) + assert arr.to_pandas().tolist() == [ + None, DateOffset(months=13, days=8, + microseconds=( + datetime.timedelta(seconds=1, microseconds=1, + minutes=1, hours=1) // + datetime.timedelta(microseconds=1)), + nanoseconds=0)] + with pytest.raises(ValueError): + pa.array([DateOffset(years=((1 << 32) // 12), months=100)]) + with pytest.raises(ValueError): + pa.array([DateOffset(weeks=((1 << 32) // 7), days=100)]) + with pytest.raises(ValueError): + pa.array([DateOffset(seconds=((1 << 64) // 1000000000), + nanoseconds=1)]) + with pytest.raises(ValueError): + pa.array([DateOffset(microseconds=((1 << 64) // 100))]) + + +@pytest.mark.pandas +def test_interval_array_from_dateoffset(): + from pandas.tseries.offsets import DateOffset + data = [ + None, + DateOffset(years=1, months=1, + days=1, seconds=1, microseconds=1, + minutes=1, hours=1, weeks=1, nanoseconds=1), + DateOffset()] + + arr = pa.array(data) + assert isinstance(arr, pa.MonthDayNanoIntervalArray) + assert arr.type == pa.month_day_nano_interval() + expected_list = [ + None, + pa.MonthDayNano([13, 8, 3661000001001]), + pa.MonthDayNano([0, 0, 0])] + expected = pa.array(expected_list) + assert arr.equals(expected) + assert arr.to_pandas().tolist() == [ + None, DateOffset(months=13, days=8, + microseconds=( + datetime.timedelta(seconds=1, microseconds=1, + minutes=1, hours=1) // + datetime.timedelta(microseconds=1)), + nanoseconds=1), + DateOffset(months=0, days=0, microseconds=0, nanoseconds=0)] + + +def test_array_from_numpy_unicode(): + dtypes = ['<U5', '>U5'] + + for dtype in dtypes: + arr = np.array(['abcde', 'abc', ''], dtype=dtype) + + arrow_arr = pa.array(arr) + assert arrow_arr.type == 'utf8' + expected = pa.array(['abcde', 'abc', ''], type='utf8') + assert arrow_arr.equals(expected) + + mask = np.array([False, True, False]) + arrow_arr = pa.array(arr, mask=mask) + expected = pa.array(['abcde', None, ''], type='utf8') + assert arrow_arr.equals(expected) + + # Strided variant + arr = np.array(['abcde', 'abc', ''] * 5, dtype=dtype)[::2] + mask = np.array([False, True, False] * 5)[::2] + arrow_arr = pa.array(arr, mask=mask) + + expected = pa.array(['abcde', '', None, 'abcde', '', None, + 'abcde', ''], type='utf8') + assert arrow_arr.equals(expected) + + # 0 itemsize + arr = np.array(['', '', ''], dtype='<U0') + arrow_arr = pa.array(arr) + expected = pa.array(['', '', ''], type='utf8') + assert arrow_arr.equals(expected) + + +def test_array_string_from_non_string(): + # ARROW-5682 - when converting to string raise on non string-like dtype + with pytest.raises(TypeError): + pa.array(np.array([1, 2, 3]), type=pa.string()) + + +def test_array_string_from_all_null(): + # ARROW-5682 + vals = np.array([None, None], dtype=object) + arr = pa.array(vals, type=pa.string()) + assert arr.null_count == 2 + + vals = np.array([np.nan, np.nan], dtype='float64') + # by default raises, but accept as all-null when from_pandas=True + with pytest.raises(TypeError): + pa.array(vals, type=pa.string()) + arr = pa.array(vals, type=pa.string(), from_pandas=True) + assert arr.null_count == 2 + + +def test_array_from_masked(): + ma = np.ma.array([1, 2, 3, 4], dtype='int64', + mask=[False, False, True, False]) + result = pa.array(ma) + expected = pa.array([1, 2, None, 4], type='int64') + assert expected.equals(result) + + with pytest.raises(ValueError, match="Cannot pass a numpy masked array"): + pa.array(ma, mask=np.array([True, False, False, False])) + + +def test_array_from_shrunken_masked(): + ma = np.ma.array([0], dtype='int64') + result = pa.array(ma) + expected = pa.array([0], type='int64') + assert expected.equals(result) + + +def test_array_from_invalid_dim_raises(): + msg = "only handle 1-dimensional arrays" + arr2d = np.array([[1, 2, 3], [4, 5, 6]]) + with pytest.raises(ValueError, match=msg): + pa.array(arr2d) + + arr0d = np.array(0) + with pytest.raises(ValueError, match=msg): + pa.array(arr0d) + + +def test_array_from_strided_bool(): + # ARROW-6325 + arr = np.ones((3, 2), dtype=bool) + result = pa.array(arr[:, 0]) + expected = pa.array([True, True, True]) + assert result.equals(expected) + result = pa.array(arr[0, :]) + expected = pa.array([True, True]) + assert result.equals(expected) + + +def test_array_from_strided(): + pydata = [ + ([b"ab", b"cd", b"ef"], (pa.binary(), pa.binary(2))), + ([1, 2, 3], (pa.int8(), pa.int16(), pa.int32(), pa.int64())), + ([1.0, 2.0, 3.0], (pa.float32(), pa.float64())), + (["ab", "cd", "ef"], (pa.utf8(), )) + ] + + for values, dtypes in pydata: + nparray = np.array(values) + for patype in dtypes: + for mask in (None, np.array([False, False])): + arrow_array = pa.array(nparray[::2], patype, + mask=mask) + assert values[::2] == arrow_array.to_pylist() + + +def test_boolean_true_count_false_count(): + # ARROW-9145 + arr = pa.array([True, True, None, False, None, True] * 1000) + assert arr.true_count == 3000 + assert arr.false_count == 1000 + + +def test_buffers_primitive(): + a = pa.array([1, 2, None, 4], type=pa.int16()) + buffers = a.buffers() + assert len(buffers) == 2 + null_bitmap = buffers[0].to_pybytes() + assert 1 <= len(null_bitmap) <= 64 # XXX this is varying + assert bytearray(null_bitmap)[0] == 0b00001011 + + # Slicing does not affect the buffers but the offset + a_sliced = a[1:] + buffers = a_sliced.buffers() + a_sliced.offset == 1 + assert len(buffers) == 2 + null_bitmap = buffers[0].to_pybytes() + assert 1 <= len(null_bitmap) <= 64 # XXX this is varying + assert bytearray(null_bitmap)[0] == 0b00001011 + + assert struct.unpack('hhxxh', buffers[1].to_pybytes()) == (1, 2, 4) + + a = pa.array(np.int8([4, 5, 6])) + buffers = a.buffers() + assert len(buffers) == 2 + # No null bitmap from Numpy int array + assert buffers[0] is None + assert struct.unpack('3b', buffers[1].to_pybytes()) == (4, 5, 6) + + a = pa.array([b'foo!', None, b'bar!!']) + buffers = a.buffers() + assert len(buffers) == 3 + null_bitmap = buffers[0].to_pybytes() + assert bytearray(null_bitmap)[0] == 0b00000101 + offsets = buffers[1].to_pybytes() + assert struct.unpack('4i', offsets) == (0, 4, 4, 9) + values = buffers[2].to_pybytes() + assert values == b'foo!bar!!' + + +def test_buffers_nested(): + a = pa.array([[1, 2], None, [3, None, 4, 5]], type=pa.list_(pa.int64())) + buffers = a.buffers() + assert len(buffers) == 4 + # The parent buffers + null_bitmap = buffers[0].to_pybytes() + assert bytearray(null_bitmap)[0] == 0b00000101 + offsets = buffers[1].to_pybytes() + assert struct.unpack('4i', offsets) == (0, 2, 2, 6) + # The child buffers + null_bitmap = buffers[2].to_pybytes() + assert bytearray(null_bitmap)[0] == 0b00110111 + values = buffers[3].to_pybytes() + assert struct.unpack('qqq8xqq', values) == (1, 2, 3, 4, 5) + + a = pa.array([(42, None), None, (None, 43)], + type=pa.struct([pa.field('a', pa.int8()), + pa.field('b', pa.int16())])) + buffers = a.buffers() + assert len(buffers) == 5 + # The parent buffer + null_bitmap = buffers[0].to_pybytes() + assert bytearray(null_bitmap)[0] == 0b00000101 + # The child buffers: 'a' + null_bitmap = buffers[1].to_pybytes() + assert bytearray(null_bitmap)[0] == 0b00000011 + values = buffers[2].to_pybytes() + assert struct.unpack('bxx', values) == (42,) + # The child buffers: 'b' + null_bitmap = buffers[3].to_pybytes() + assert bytearray(null_bitmap)[0] == 0b00000110 + values = buffers[4].to_pybytes() + assert struct.unpack('4xh', values) == (43,) + + +def test_nbytes_sizeof(): + a = pa.array(np.array([4, 5, 6], dtype='int64')) + assert a.nbytes == 8 * 3 + assert sys.getsizeof(a) >= object.__sizeof__(a) + a.nbytes + a = pa.array([1, None, 3], type='int64') + assert a.nbytes == 8*3 + 1 + assert sys.getsizeof(a) >= object.__sizeof__(a) + a.nbytes + a = pa.array([[1, 2], None, [3, None, 4, 5]], type=pa.list_(pa.int64())) + assert a.nbytes == 1 + 4 * 4 + 1 + 6 * 8 + assert sys.getsizeof(a) >= object.__sizeof__(a) + a.nbytes + + +def test_invalid_tensor_constructor_repr(): + # ARROW-2638: prevent calling extension class constructors directly + with pytest.raises(TypeError): + repr(pa.Tensor([1])) + + +def test_invalid_tensor_construction(): + with pytest.raises(TypeError): + pa.Tensor() + + +@pytest.mark.parametrize(('offset_type', 'list_type_factory'), + [(pa.int32(), pa.list_), (pa.int64(), pa.large_list)]) +def test_list_array_flatten(offset_type, list_type_factory): + typ2 = list_type_factory( + list_type_factory( + pa.int64() + ) + ) + arr2 = pa.array([ + None, + [ + [1, None, 2], + None, + [3, 4] + ], + [], + [ + [], + [5, 6], + None + ], + [ + [7, 8] + ] + ], type=typ2) + offsets2 = pa.array([0, 0, 3, 3, 6, 7], type=offset_type) + + typ1 = list_type_factory(pa.int64()) + arr1 = pa.array([ + [1, None, 2], + None, + [3, 4], + [], + [5, 6], + None, + [7, 8] + ], type=typ1) + offsets1 = pa.array([0, 3, 3, 5, 5, 7, 7, 9], type=offset_type) + + arr0 = pa.array([ + 1, None, 2, + 3, 4, + 5, 6, + 7, 8 + ], type=pa.int64()) + + assert arr2.flatten().equals(arr1) + assert arr2.offsets.equals(offsets2) + assert arr2.values.equals(arr1) + assert arr1.flatten().equals(arr0) + assert arr1.offsets.equals(offsets1) + assert arr1.values.equals(arr0) + assert arr2.flatten().flatten().equals(arr0) + assert arr2.values.values.equals(arr0) + + +@pytest.mark.parametrize(('offset_type', 'list_type_factory'), + [(pa.int32(), pa.list_), (pa.int64(), pa.large_list)]) +def test_list_value_parent_indices(offset_type, list_type_factory): + arr = pa.array( + [ + [0, 1, 2], + None, + [], + [3, 4] + ], type=list_type_factory(pa.int32())) + expected = pa.array([0, 0, 0, 3, 3], type=offset_type) + assert arr.value_parent_indices().equals(expected) + + +@pytest.mark.parametrize(('offset_type', 'list_type_factory'), + [(pa.int32(), pa.list_), (pa.int64(), pa.large_list)]) +def test_list_value_lengths(offset_type, list_type_factory): + arr = pa.array( + [ + [0, 1, 2], + None, + [], + [3, 4] + ], type=list_type_factory(pa.int32())) + expected = pa.array([3, None, 0, 2], type=offset_type) + assert arr.value_lengths().equals(expected) + + +@pytest.mark.parametrize('list_type_factory', [pa.list_, pa.large_list]) +def test_list_array_flatten_non_canonical(list_type_factory): + # Non-canonical list array (null elements backed by non-empty sublists) + typ = list_type_factory(pa.int64()) + arr = pa.array([[1], [2, 3], [4, 5, 6]], type=typ) + buffers = arr.buffers()[:2] + buffers[0] = pa.py_buffer(b"\x05") # validity bitmap + arr = arr.from_buffers(arr.type, len(arr), buffers, children=[arr.values]) + assert arr.to_pylist() == [[1], None, [4, 5, 6]] + assert arr.offsets.to_pylist() == [0, 1, 3, 6] + + flattened = arr.flatten() + flattened.validate(full=True) + assert flattened.type == typ.value_type + assert flattened.to_pylist() == [1, 4, 5, 6] + + # .values is the physical values array (including masked elements) + assert arr.values.to_pylist() == [1, 2, 3, 4, 5, 6] + + +@pytest.mark.parametrize('klass', [pa.ListArray, pa.LargeListArray]) +def test_list_array_values_offsets_sliced(klass): + # ARROW-7301 + arr = klass.from_arrays(offsets=[0, 3, 4, 6], values=[1, 2, 3, 4, 5, 6]) + assert arr.values.to_pylist() == [1, 2, 3, 4, 5, 6] + assert arr.offsets.to_pylist() == [0, 3, 4, 6] + + # sliced -> values keeps referring to full values buffer, but offsets is + # sliced as well so the offsets correctly point into the full values array + # sliced -> flatten() will return the sliced value array. + arr2 = arr[1:] + assert arr2.values.to_pylist() == [1, 2, 3, 4, 5, 6] + assert arr2.offsets.to_pylist() == [3, 4, 6] + assert arr2.flatten().to_pylist() == [4, 5, 6] + i = arr2.offsets[0].as_py() + j = arr2.offsets[1].as_py() + assert arr2[0].as_py() == arr2.values[i:j].to_pylist() == [4] + + +def test_fixed_size_list_array_flatten(): + typ2 = pa.list_(pa.list_(pa.int64(), 2), 3) + arr2 = pa.array([ + [ + [1, 2], + [3, 4], + [5, 6], + ], + None, + [ + [7, None], + None, + [8, 9] + ], + ], type=typ2) + assert arr2.type.equals(typ2) + + typ1 = pa.list_(pa.int64(), 2) + arr1 = pa.array([ + [1, 2], [3, 4], [5, 6], + None, None, None, + [7, None], None, [8, 9] + ], type=typ1) + assert arr1.type.equals(typ1) + assert arr2.flatten().equals(arr1) + + typ0 = pa.int64() + arr0 = pa.array([ + 1, 2, 3, 4, 5, 6, + None, None, None, None, None, None, + 7, None, None, None, 8, 9, + ], type=typ0) + assert arr0.type.equals(typ0) + assert arr1.flatten().equals(arr0) + assert arr2.flatten().flatten().equals(arr0) + + +def test_struct_array_flatten(): + ty = pa.struct([pa.field('x', pa.int16()), + pa.field('y', pa.float32())]) + a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty) + xs, ys = a.flatten() + assert xs.type == pa.int16() + assert ys.type == pa.float32() + assert xs.to_pylist() == [1, 3, 5] + assert ys.to_pylist() == [2.5, 4.5, 6.5] + xs, ys = a[1:].flatten() + assert xs.to_pylist() == [3, 5] + assert ys.to_pylist() == [4.5, 6.5] + + a = pa.array([(1, 2.5), None, (3, 4.5)], type=ty) + xs, ys = a.flatten() + assert xs.to_pylist() == [1, None, 3] + assert ys.to_pylist() == [2.5, None, 4.5] + xs, ys = a[1:].flatten() + assert xs.to_pylist() == [None, 3] + assert ys.to_pylist() == [None, 4.5] + + a = pa.array([(1, None), (2, 3.5), (None, 4.5)], type=ty) + xs, ys = a.flatten() + assert xs.to_pylist() == [1, 2, None] + assert ys.to_pylist() == [None, 3.5, 4.5] + xs, ys = a[1:].flatten() + assert xs.to_pylist() == [2, None] + assert ys.to_pylist() == [3.5, 4.5] + + a = pa.array([(1, None), None, (None, 2.5)], type=ty) + xs, ys = a.flatten() + assert xs.to_pylist() == [1, None, None] + assert ys.to_pylist() == [None, None, 2.5] + xs, ys = a[1:].flatten() + assert xs.to_pylist() == [None, None] + assert ys.to_pylist() == [None, 2.5] + + +def test_struct_array_field(): + ty = pa.struct([pa.field('x', pa.int16()), + pa.field('y', pa.float32())]) + a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty) + + x0 = a.field(0) + y0 = a.field(1) + x1 = a.field(-2) + y1 = a.field(-1) + x2 = a.field('x') + y2 = a.field('y') + + assert isinstance(x0, pa.lib.Int16Array) + assert isinstance(y1, pa.lib.FloatArray) + assert x0.equals(pa.array([1, 3, 5], type=pa.int16())) + assert y0.equals(pa.array([2.5, 4.5, 6.5], type=pa.float32())) + assert x0.equals(x1) + assert x0.equals(x2) + assert y0.equals(y1) + assert y0.equals(y2) + + for invalid_index in [None, pa.int16()]: + with pytest.raises(TypeError): + a.field(invalid_index) + + for invalid_index in [3, -3]: + with pytest.raises(IndexError): + a.field(invalid_index) + + for invalid_name in ['z', '']: + with pytest.raises(KeyError): + a.field(invalid_name) + + +def test_empty_cast(): + types = [ + pa.null(), + pa.bool_(), + pa.int8(), + pa.int16(), + pa.int32(), + pa.int64(), + pa.uint8(), + pa.uint16(), + pa.uint32(), + pa.uint64(), + pa.float16(), + pa.float32(), + pa.float64(), + pa.date32(), + pa.date64(), + pa.binary(), + pa.binary(length=4), + pa.string(), + ] + + for (t1, t2) in itertools.product(types, types): + try: + # ARROW-4766: Ensure that supported types conversion don't segfault + # on empty arrays of common types + pa.array([], type=t1).cast(t2) + except (pa.lib.ArrowNotImplementedError, pa.ArrowInvalid): + continue + + +def test_nested_dictionary_array(): + dict_arr = pa.DictionaryArray.from_arrays([0, 1, 0], ['a', 'b']) + list_arr = pa.ListArray.from_arrays([0, 2, 3], dict_arr) + assert list_arr.to_pylist() == [['a', 'b'], ['a']] + + dict_arr = pa.DictionaryArray.from_arrays([0, 1, 0], ['a', 'b']) + dict_arr2 = pa.DictionaryArray.from_arrays([0, 1, 2, 1, 0], dict_arr) + assert dict_arr2.to_pylist() == ['a', 'b', 'a', 'b', 'a'] + + +def test_array_from_numpy_str_utf8(): + # ARROW-3890 -- in Python 3, NPY_UNICODE arrays are produced, but in Python + # 2 they are NPY_STRING (binary), so we must do UTF-8 validation + vec = np.array(["toto", "tata"]) + vec2 = np.array(["toto", "tata"], dtype=object) + + arr = pa.array(vec, pa.string()) + arr2 = pa.array(vec2, pa.string()) + expected = pa.array(["toto", "tata"]) + assert arr.equals(expected) + assert arr2.equals(expected) + + # with mask, separate code path + mask = np.array([False, False], dtype=bool) + arr = pa.array(vec, pa.string(), mask=mask) + assert arr.equals(expected) + + # UTF8 validation failures + vec = np.array([('maƱana').encode('utf-16-le')]) + with pytest.raises(ValueError): + pa.array(vec, pa.string()) + + with pytest.raises(ValueError): + pa.array(vec, pa.string(), mask=np.array([False])) + + +@pytest.mark.slow +@pytest.mark.large_memory +def test_numpy_binary_overflow_to_chunked(): + # ARROW-3762, ARROW-5966 + + # 2^31 + 1 bytes + values = [b'x'] + unicode_values = ['x'] + + # Make 10 unique 1MB strings then repeat then 2048 times + unique_strings = { + i: b'x' * ((1 << 20) - 1) + str(i % 10).encode('utf8') + for i in range(10) + } + unicode_unique_strings = {i: x.decode('utf8') + for i, x in unique_strings.items()} + values += [unique_strings[i % 10] for i in range(1 << 11)] + unicode_values += [unicode_unique_strings[i % 10] + for i in range(1 << 11)] + + for case, ex_type in [(values, pa.binary()), + (unicode_values, pa.utf8())]: + arr = np.array(case) + arrow_arr = pa.array(arr) + arr = None + + assert isinstance(arrow_arr, pa.ChunkedArray) + assert arrow_arr.type == ex_type + + # Split up into 16MB chunks. 128 * 16 = 2048, so 129 + assert arrow_arr.num_chunks == 129 + + value_index = 0 + for i in range(arrow_arr.num_chunks): + chunk = arrow_arr.chunk(i) + for val in chunk: + assert val.as_py() == case[value_index] + value_index += 1 + + +@pytest.mark.large_memory +def test_list_child_overflow_to_chunked(): + kilobyte_string = 'x' * 1024 + two_mega = 2**21 + + vals = [[kilobyte_string]] * (two_mega - 1) + arr = pa.array(vals) + assert isinstance(arr, pa.Array) + assert len(arr) == two_mega - 1 + + vals = [[kilobyte_string]] * two_mega + arr = pa.array(vals) + assert isinstance(arr, pa.ChunkedArray) + assert len(arr) == two_mega + assert len(arr.chunk(0)) == two_mega - 1 + assert len(arr.chunk(1)) == 1 + + +def test_infer_type_masked(): + # ARROW-5208 + ty = pa.infer_type(['foo', 'bar', None, 2], + mask=[False, False, False, True]) + assert ty == pa.utf8() + + # all masked + ty = pa.infer_type(['foo', 'bar', None, 2], + mask=np.array([True, True, True, True])) + assert ty == pa.null() + + # length 0 + assert pa.infer_type([], mask=[]) == pa.null() + + +def test_array_masked(): + # ARROW-5208 + arr = pa.array([4, None, 4, 3.], + mask=np.array([False, True, False, True])) + assert arr.type == pa.int64() + + # ndarray dtype=object argument + arr = pa.array(np.array([4, None, 4, 3.], dtype="O"), + mask=np.array([False, True, False, True])) + assert arr.type == pa.int64() + + +def test_array_supported_masks(): + # ARROW-13883 + arr = pa.array([4, None, 4, 3.], + mask=np.array([False, True, False, True])) + assert arr.to_pylist() == [4, None, 4, None] + + arr = pa.array([4, None, 4, 3], + mask=pa.array([False, True, False, True])) + assert arr.to_pylist() == [4, None, 4, None] + + arr = pa.array([4, None, 4, 3], + mask=[False, True, False, True]) + assert arr.to_pylist() == [4, None, 4, None] + + arr = pa.array([4, 3, None, 3], + mask=[False, True, False, True]) + assert arr.to_pylist() == [4, None, None, None] + + # Non boolean values + with pytest.raises(pa.ArrowTypeError): + arr = pa.array([4, None, 4, 3], + mask=pa.array([1.0, 2.0, 3.0, 4.0])) + + with pytest.raises(pa.ArrowTypeError): + arr = pa.array([4, None, 4, 3], + mask=[1.0, 2.0, 3.0, 4.0]) + + with pytest.raises(pa.ArrowTypeError): + arr = pa.array([4, None, 4, 3], + mask=np.array([1.0, 2.0, 3.0, 4.0])) + + with pytest.raises(pa.ArrowTypeError): + arr = pa.array([4, None, 4, 3], + mask=pa.array([False, True, False, True], + mask=pa.array([True, True, True, True]))) + + with pytest.raises(pa.ArrowTypeError): + arr = pa.array([4, None, 4, 3], + mask=pa.array([False, None, False, True])) + + # Numpy arrays only accepts numpy masks + with pytest.raises(TypeError): + arr = pa.array(np.array([4, None, 4, 3.]), + mask=[True, False, True, False]) + + with pytest.raises(TypeError): + arr = pa.array(np.array([4, None, 4, 3.]), + mask=pa.array([True, False, True, False])) + + +def test_binary_array_masked(): + # ARROW-12431 + masked_basic = pa.array([b'\x05'], type=pa.binary(1), + mask=np.array([False])) + assert [b'\x05'] == masked_basic.to_pylist() + + # Fixed Length Binary + masked = pa.array(np.array([b'\x05']), type=pa.binary(1), + mask=np.array([False])) + assert [b'\x05'] == masked.to_pylist() + + masked_nulls = pa.array(np.array([b'\x05']), type=pa.binary(1), + mask=np.array([True])) + assert [None] == masked_nulls.to_pylist() + + # Variable Length Binary + masked = pa.array(np.array([b'\x05']), type=pa.binary(), + mask=np.array([False])) + assert [b'\x05'] == masked.to_pylist() + + masked_nulls = pa.array(np.array([b'\x05']), type=pa.binary(), + mask=np.array([True])) + assert [None] == masked_nulls.to_pylist() + + # Fixed Length Binary, copy + npa = np.array([b'aaa', b'bbb', b'ccc']*10) + arrow_array = pa.array(npa, type=pa.binary(3), + mask=np.array([False, False, False]*10)) + npa[npa == b"bbb"] = b"XXX" + assert ([b'aaa', b'bbb', b'ccc']*10) == arrow_array.to_pylist() + + +def test_binary_array_strided(): + # Masked + nparray = np.array([b"ab", b"cd", b"ef"]) + arrow_array = pa.array(nparray[::2], pa.binary(2), + mask=np.array([False, False])) + assert [b"ab", b"ef"] == arrow_array.to_pylist() + + # Unmasked + nparray = np.array([b"ab", b"cd", b"ef"]) + arrow_array = pa.array(nparray[::2], pa.binary(2)) + assert [b"ab", b"ef"] == arrow_array.to_pylist() + + +def test_array_invalid_mask_raises(): + # ARROW-10742 + cases = [ + ([1, 2], np.array([False, False], dtype="O"), + TypeError, "must be boolean dtype"), + + ([1, 2], np.array([[False], [False]]), + pa.ArrowInvalid, "must be 1D array"), + + ([1, 2, 3], np.array([False, False]), + pa.ArrowInvalid, "different length"), + + (np.array([1, 2]), np.array([False, False], dtype="O"), + TypeError, "must be boolean dtype"), + + (np.array([1, 2]), np.array([[False], [False]]), + ValueError, "must be 1D array"), + + (np.array([1, 2, 3]), np.array([False, False]), + ValueError, "different length"), + ] + for obj, mask, ex, msg in cases: + with pytest.raises(ex, match=msg): + pa.array(obj, mask=mask) + + +def test_array_from_large_pyints(): + # ARROW-5430 + with pytest.raises(OverflowError): + # too large for int64 so dtype must be explicitly provided + pa.array([int(2 ** 63)]) + + +def test_array_protocol(): + + class MyArray: + def __init__(self, data): + self.data = data + + def __arrow_array__(self, type=None): + return pa.array(self.data, type=type) + + arr = MyArray(np.array([1, 2, 3], dtype='int64')) + result = pa.array(arr) + expected = pa.array([1, 2, 3], type=pa.int64()) + assert result.equals(expected) + result = pa.array(arr, type=pa.int64()) + expected = pa.array([1, 2, 3], type=pa.int64()) + assert result.equals(expected) + result = pa.array(arr, type=pa.float64()) + expected = pa.array([1, 2, 3], type=pa.float64()) + assert result.equals(expected) + + # raise error when passing size or mask keywords + with pytest.raises(ValueError): + pa.array(arr, mask=np.array([True, False, True])) + with pytest.raises(ValueError): + pa.array(arr, size=3) + + # ensure the return value is an Array + class MyArrayInvalid: + def __init__(self, data): + self.data = data + + def __arrow_array__(self, type=None): + return np.array(self.data) + + arr = MyArrayInvalid(np.array([1, 2, 3], dtype='int64')) + with pytest.raises(TypeError): + pa.array(arr) + + # ARROW-7066 - allow ChunkedArray output + class MyArray2: + def __init__(self, data): + self.data = data + + def __arrow_array__(self, type=None): + return pa.chunked_array([self.data], type=type) + + arr = MyArray2(np.array([1, 2, 3], dtype='int64')) + result = pa.array(arr) + expected = pa.chunked_array([[1, 2, 3]], type=pa.int64()) + assert result.equals(expected) + + +def test_concat_array(): + concatenated = pa.concat_arrays( + [pa.array([1, 2]), pa.array([3, 4])]) + assert concatenated.equals(pa.array([1, 2, 3, 4])) + + +def test_concat_array_different_types(): + with pytest.raises(pa.ArrowInvalid): + pa.concat_arrays([pa.array([1]), pa.array([2.])]) + + +def test_concat_array_invalid_type(): + # ARROW-9920 - do not segfault on non-array input + + with pytest.raises(TypeError, match="should contain Array objects"): + pa.concat_arrays([None]) + + arr = pa.chunked_array([[0, 1], [3, 4]]) + with pytest.raises(TypeError, match="should contain Array objects"): + pa.concat_arrays(arr) + + +@pytest.mark.pandas +def test_to_pandas_timezone(): + # https://issues.apache.org/jira/browse/ARROW-6652 + arr = pa.array([1, 2, 3], type=pa.timestamp('s', tz='Europe/Brussels')) + s = arr.to_pandas() + assert s.dt.tz is not None + arr = pa.chunked_array([arr]) + s = arr.to_pandas() + assert s.dt.tz is not None |