diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/python/pyarrow/tests/pandas_examples.py | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2. (tag: upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/python/pyarrow/tests/pandas_examples.py')
-rw-r--r-- | src/arrow/python/pyarrow/tests/pandas_examples.py | 172 |
1 file changed, 172 insertions, 0 deletions
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from collections import OrderedDict
from datetime import date, time

import numpy as np
import pandas as pd
import pyarrow as pa


def dataframe_with_arrays(include_index=False):
    """
    Dataframe with numpy arrays columns of every possible primitive type.

    Parameters
    ----------
    include_index : bool, default False
        If True, also append an ``__index_level_0__`` int64 field to the
        returned schema (the field pandas emits for its default index).

    Returns
    -------
    df : pandas.DataFrame
    schema : pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    # Every fixed-width primitive dtype, as (numpy dtype string, arrow type).
    dtypes = [('i1', pa.int8()), ('i2', pa.int16()),
              ('i4', pa.int32()), ('i8', pa.int64()),
              ('u1', pa.uint8()), ('u2', pa.uint16()),
              ('u4', pa.uint32()), ('u8', pa.uint64()),
              ('f4', pa.float32()), ('f8', pa.float64())]

    arrays = OrderedDict()
    fields = []
    for dtype, arrow_dtype in dtypes:
        fields.append(pa.field(dtype, pa.list_(arrow_dtype)))
        # Mix of different lengths plus a None entry (null list).
        arrays[dtype] = [
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ]

    fields.append(pa.field('str', pa.list_(pa.string())))
    # "ä" exercises non-ASCII string handling.
    arrays['str'] = [
        np.array(["1", "ä"], dtype="object"),
        None,
        np.array(["1"], dtype="object"),
        np.array(["1", "2", "3"], dtype="object")
    ]

    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    # Nanosecond-precision input strings truncated by the ms-resolution dtype;
    # includes None both as a list element and as a whole null list.
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))
    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema


def dataframe_with_lists(include_index=False, parquet_compatible=False):
    """
    Dataframe with list columns of every possible primitive type.

    Parameters
    ----------
    include_index : bool, default False
        If True, also append an ``__index_level_0__`` int64 field to the
        returned schema (the field pandas emits for its default index).
    parquet_compatible : bool, default False
        Exclude types not supported by parquet (``time64[ns]``).

    Returns
    -------
    df : pandas.DataFrame
    schema : pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    arrays = OrderedDict()
    fields = []

    fields.append(pa.field('int64', pa.list_(pa.int64())))
    # Plain lists, None (null list), empty list, and a strided numpy view.
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [],
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2,
                 dtype=np.int64)[::2]
    ]
    fields.append(pa.field('double', pa.list_(pa.float64())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [],
        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
    ]
    fields.append(pa.field('bytes_list', pa.list_(pa.binary())))
    arrays['bytes_list'] = [
        [b"1", b"f"],
        None,
        [b"1"],
        [b"1", b"2", b"3"],
        [],
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        ["1", "ä"],
        None,
        ["1"],
        ["1", "2", "3"],
        [],
    ]

    # Shared data reused for every date / time column below.
    date_data = [
        [],
        [date(2018, 1, 1), date(2032, 12, 30)],
        [date(2000, 6, 7)],
        None,
        [date(1969, 6, 9), date(1972, 7, 3)]
    ]
    time_data = [
        [time(23, 11, 11), time(1, 2, 3), time(23, 59, 59)],
        [],
        [time(22, 5, 59)],
        None,
        [time(0, 0, 0), time(18, 0, 2), time(12, 7, 3)]
    ]

    temporal_pairs = [
        (pa.date32(), date_data),
        (pa.date64(), date_data),
        (pa.time32('s'), time_data),
        (pa.time32('ms'), time_data),
        (pa.time64('us'), time_data)
    ]
    if not parquet_compatible:
        # time64[ns] has no parquet logical-type mapping, so it is skipped
        # when the fixture must round-trip through parquet.
        temporal_pairs += [
            (pa.time64('ns'), time_data),
        ]

    for value_type, data in temporal_pairs:
        # Column name derived from the arrow type, e.g. 'date32[day]_list'.
        field_name = '{}_list'.format(value_type)
        field_type = pa.list_(value_type)
        field = pa.field(field_name, field_type)
        fields.append(field)
        arrays[field_name] = data

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema