diff options
Diffstat (limited to 'misc/arrow-parquet/gen-parquet-test-files.py')
-rwxr-xr-x | misc/arrow-parquet/gen-parquet-test-files.py | 92 |
1 files changed, 92 insertions, 0 deletions
#!/usr/bin/env python3
########################################################################
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
########################################################################
"""Generate the Parquet test files stored under ``test/parquet/basic``.

Two data sets are written with pandas using the pyarrow engine:

* ``float-with-non.parquet`` -- a single float64 column containing NaNs.
* ``basic-<codec>.parquet`` -- a ten-row table with one column per basic
  type, written once uncompressed (``nocomp``) and once for each of the
  gzip, snappy and zstd codecs.
"""

from pathlib import Path

import pandas as pd


def gen_str(pos):
    """Return the fixed test string stored at index *pos*.

    The strings live in the ``gen_str.values`` tuple.  Normal tuple
    indexing applies: a negative *pos* counts from the end, and a *pos*
    outside the tuple raises ``IndexError``.
    """
    return gen_str.values[pos]


# Lookup table for gen_str(), attached to the function as an attribute.
gen_str.values = (
    "ubergeek",
    "thwarter",
    "ironfist",
    "turkoman",
    "mesozoan",
    "seatsale",
    "hardtack",
    "phyllary",
    "hydriads",
    "stranger",
    "cistuses",
    "capelets",
    "headband",
    "dudesses",
    "aminases",
    "eggwhite",
    "boxscore",
    "upsurges",
    "blowlamp",
    "dionysia",
    "rejecter",
    "keratome",
    "diasters",
    "juddocks",
    "gownsman",
    "sweepsaw",
    "chuckeys",
    "partyers",
    "weredogs",
    "exabytes",
)


def _build_nan_frame():
    """Return a one-column float64 frame whose ``None`` entries become NaN."""
    df = pd.DataFrame(data={"float64 with nan": [1.2, 3.4, None, None, 5.6]})
    return df.astype({"float64 with nan": "float64"})


def _build_basic_frame(row_size):
    """Return a *row_size*-row frame with one column per basic type."""
    rows = range(row_size)
    df = pd.DataFrame(
        data={
            "int32": list(rows),
            "int64": [v * 10 + v for v in rows],
            # The float columns are built from ints and cast afterwards,
            # so row 0 is +0.0 rather than -0.0.
            "float32": [-v for v in rows],
            "float64": [-v - 21 for v in rows],
            "boolean": [(v & 0x01) != 0 for v in rows],
            "string": [gen_str(pos) for pos in rows],
        }
    )
    return df.astype(
        {
            "int32": "int32",
            "int64": "int64",
            "float32": "float32",
            "float64": "float64",
        }
    )


def _show(df):
    """Print *df* followed by its column dtypes."""
    print(df)
    print(df.dtypes)


def main():
    """Write every Parquet test file into ``<repo>/test/parquet/basic``."""
    # Anchor the output directory to this script (misc/arrow-parquet/)
    # rather than to the current working directory, so the script writes
    # to the same place no matter where it is launched from.
    repo_root = Path(__file__).resolve().parents[2]
    outdir = repo_root / "test" / "parquet" / "basic"
    outdir.mkdir(parents=True, exist_ok=True)

    df = _build_nan_frame()
    _show(df)
    # NOTE(review): "non" in the file name looks like a typo for "nan";
    # it is kept unchanged because other files may refer to it.
    df.to_parquet(
        outdir / "float-with-non.parquet", engine="pyarrow", compression=None
    )

    df = _build_basic_frame(row_size=10)
    _show(df)
    df.to_parquet(
        outdir / "basic-nocomp.parquet", engine="pyarrow", compression=None
    )
    for comp in ("gzip", "snappy", "zstd"):
        df.to_parquet(
            outdir / f"basic-{comp}.parquet", engine="pyarrow", compression=comp
        )


if __name__ == "__main__":
    main()