summary refs log tree commit diff stats
path: root/misc/arrow-parquet/gen-parquet-test-files.py
diff options
context:
space:
mode:
Diffstat (limited to 'misc/arrow-parquet/gen-parquet-test-files.py')
-rwxr-xr-x misc/arrow-parquet/gen-parquet-test-files.py 92
1 files changed, 92 insertions, 0 deletions
diff --git a/misc/arrow-parquet/gen-parquet-test-files.py b/misc/arrow-parquet/gen-parquet-test-files.py
new file mode 100755
index 0000000..a066536
--- /dev/null
+++ b/misc/arrow-parquet/gen-parquet-test-files.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+########################################################################
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+########################################################################
+
+import pandas as pd
+from pathlib import Path
+
+
+def gen_str(pos):
+    """Return the sample word stored at index ``pos`` of ``gen_str.values``.
+
+    ``pos`` is used as a plain tuple index, so negative values count from
+    the end and an out-of-range value raises ``IndexError``.
+    """
+    words = gen_str.values
+    return words[pos]
+
+
+# Pool of thirty eight-letter words, stored as an attribute on the function
+# itself so the data travels with ``gen_str``.
+gen_str.values = tuple(
+    (
+        "ubergeek thwarter ironfist turkoman mesozoan "
+        "seatsale hardtack phyllary hydriads stranger "
+        "cistuses capelets headband dudesses aminases "
+        "eggwhite boxscore upsurges blowlamp dionysia "
+        "rejecter keratome diasters juddocks gownsman "
+        "sweepsaw chuckeys partyers weredogs exabytes"
+    ).split()
+)
+
+
+def main():
+    """Generate the parquet test files under ``test/parquet/basic``.
+
+    Two data sets are written with the pyarrow engine:
+
+    * ``float-with-non.parquet`` - a single float64 column containing
+      missing values, stored uncompressed.
+    * ``basic-nocomp.parquet`` plus ``basic-{gzip,snappy,zstd}.parquet`` -
+      a ten-row table with int32, int64, float32, float64, boolean and
+      string columns, stored uncompressed and once per codec.
+
+    Each data frame and its dtypes are printed before it is written.
+    """
+    # Anchor the output directory to this script's location instead of the
+    # current working directory, so the files land in the same place no
+    # matter where the script is launched from.  The script lives two
+    # levels below the repository root.
+    script_dir = Path(__file__).resolve().parent
+    outdir = script_dir.parent.parent / "test" / "parquet" / "basic"
+    outdir.mkdir(parents=True, exist_ok=True)
+
+    # --- float64 column with missing values -----------------------------
+    data = {
+        "float64 with nan": [1.2, 3.4, None, None, 5.6],
+    }
+    df = pd.DataFrame(data=data)
+    # Pin the dtype explicitly rather than relying on inference.
+    df["float64 with nan"] = df["float64 with nan"].astype("float64")
+
+    print(df)
+    print(df.dtypes)
+
+    # NOTE: the "non" spelling is kept as-is; the file name is part of the
+    # generated test data layout.
+    df.to_parquet(
+        outdir / "float-with-non.parquet", engine="pyarrow", compression=None
+    )
+
+    # --- one column per basic type --------------------------------------
+    row_size = 10
+    rows = range(row_size)
+    data = {
+        "int32": list(rows),
+        "int64": [v * 10 + v for v in rows],
+        "float32": [-v for v in rows],
+        "float64": [-v - 21 for v in rows],
+        "boolean": [(v & 0x01) != 0 for v in rows],
+        "string": [gen_str(pos) for pos in rows],
+    }
+    df = pd.DataFrame(data=data)
+    # The numeric columns are named after their target dtype, so the column
+    # name doubles as the dtype to cast to.
+    for dtype in ("int32", "int64", "float32", "float64"):
+        df[dtype] = df[dtype].astype(dtype)
+
+    print(df)
+    print(df.dtypes)
+
+    # Same table written once without compression and once per codec.
+    df.to_parquet(
+        outdir / "basic-nocomp.parquet", engine="pyarrow", compression=None
+    )
+    for comp in ("gzip", "snappy", "zstd"):
+        df.to_parquet(
+            outdir / f"basic-{comp}.parquet", engine="pyarrow", compression=comp
+        )
+
+
+# Run the generator only when this file is executed as a script, not when
+# it is imported as a module.
+if __name__ == "__main__":
+ main()
+