diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/testing/data/avro | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2. upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/testing/data/avro')
18 files changed, 37 insertions, 0 deletions
diff --git a/src/arrow/testing/data/avro/README.md b/src/arrow/testing/data/avro/README.md
new file mode 100644
index 000000000..2707e1297
--- /dev/null
+++ b/src/arrow/testing/data/avro/README.md
@@ -0,0 +1,37 @@
+This directory contains AVRO files corresponding to the parquet testing files at https://github.com/apache/parquet-testing/blob/master/data/
+
+These files were created by using spark using the commands from https://gist.github.com/Igosuki/324b011f40185269d3fc552350d21744
+
+Roughly:
+```scala
+import com.github.mrpowers.spark.daria.sql.DariaWriters
+import org.apache.hadoop.fs.FileSystem
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.conf.Configuration
+import org.apache.commons.io.FilenameUtils
+
+val fileGlobs = sc.getConf.get("spark.driver.globs")
+val dest = sc.getConf.get("spark.driver.out")
+
+val fs = FileSystem.get(new Configuration(true));
+val status = fs.globStatus(new Path(fileGlobs))
+for (fileStatus <- status) {
+  val path = fileStatus.getPath().toString()
+  try {
+    val dfin = spark.read.format("parquet").load(path)
+    val fileName = fileStatus.getPath().getName();
+    val fileNameWithOutExt = FilenameUtils.removeExtension(fileName);
+    val destination = s"${dest}/${fileNameWithOutExt}.avro"
+    println(s"Converting $path to avro at $destination")
+    DariaWriters.writeSingleFile(
+      df = dfin,
+      format = "avro",
+      sc = spark.sparkContext,
+      tmpFolder = s"/tmp/dw/${fileName}",
+      filename = destination
+    )
+  } catch {
+    case e: Throwable => println(s"failed to convert $path : ${e.getMessage}")
+  }
+}
+```
diff --git a/src/arrow/testing/data/avro/alltypes_dictionary.avro b/src/arrow/testing/data/avro/alltypes_dictionary.avro
Binary files differ
new file mode 100644
index 000000000..1fdd79e8a
--- /dev/null
+++ b/src/arrow/testing/data/avro/alltypes_dictionary.avro
diff --git a/src/arrow/testing/data/avro/alltypes_plain.avro b/src/arrow/testing/data/avro/alltypes_plain.avro
Binary files differ
new file mode 100644
index 000000000..d60c62822
--- /dev/null
+++ b/src/arrow/testing/data/avro/alltypes_plain.avro
diff --git a/src/arrow/testing/data/avro/alltypes_plain.snappy.avro b/src/arrow/testing/data/avro/alltypes_plain.snappy.avro
Binary files differ
new file mode 100644
index 000000000..d818ab554
--- /dev/null
+++ b/src/arrow/testing/data/avro/alltypes_plain.snappy.avro
diff --git a/src/arrow/testing/data/avro/binary.avro b/src/arrow/testing/data/avro/binary.avro
Binary files differ
new file mode 100644
index 000000000..48081f93b
--- /dev/null
+++ b/src/arrow/testing/data/avro/binary.avro
diff --git a/src/arrow/testing/data/avro/datapage_v2.snappy.avro b/src/arrow/testing/data/avro/datapage_v2.snappy.avro
Binary files differ
new file mode 100644
index 000000000..ccd590076
--- /dev/null
+++ b/src/arrow/testing/data/avro/datapage_v2.snappy.avro
diff --git a/src/arrow/testing/data/avro/dict-page-offset-zero.avro b/src/arrow/testing/data/avro/dict-page-offset-zero.avro
Binary files differ
new file mode 100644
index 000000000..388541dfc
--- /dev/null
+++ b/src/arrow/testing/data/avro/dict-page-offset-zero.avro
diff --git a/src/arrow/testing/data/avro/fixed_length_decimal.avro b/src/arrow/testing/data/avro/fixed_length_decimal.avro
Binary files differ
new file mode 100644
index 000000000..1c9f195ab
--- /dev/null
+++ b/src/arrow/testing/data/avro/fixed_length_decimal.avro
diff --git a/src/arrow/testing/data/avro/fixed_length_decimal_legacy.avro b/src/arrow/testing/data/avro/fixed_length_decimal_legacy.avro
Binary files differ
new file mode 100644
index 000000000..e840d486a
--- /dev/null
+++ b/src/arrow/testing/data/avro/fixed_length_decimal_legacy.avro
diff --git a/src/arrow/testing/data/avro/int32_decimal.avro b/src/arrow/testing/data/avro/int32_decimal.avro
Binary files differ
new file mode 100644
index 000000000..0623a288a
--- /dev/null
+++ b/src/arrow/testing/data/avro/int32_decimal.avro
diff --git a/src/arrow/testing/data/avro/int64_decimal.avro b/src/arrow/testing/data/avro/int64_decimal.avro
Binary files differ
new file mode 100644
index 000000000..182d9b7b1
--- /dev/null
+++ b/src/arrow/testing/data/avro/int64_decimal.avro
diff --git a/src/arrow/testing/data/avro/list_columns.avro b/src/arrow/testing/data/avro/list_columns.avro
Binary files differ
new file mode 100644
index 000000000..0d2dd2354
--- /dev/null
+++ b/src/arrow/testing/data/avro/list_columns.avro
diff --git a/src/arrow/testing/data/avro/nested_lists.snappy.avro b/src/arrow/testing/data/avro/nested_lists.snappy.avro
Binary files differ
new file mode 100644
index 000000000..6cbff8961
--- /dev/null
+++ b/src/arrow/testing/data/avro/nested_lists.snappy.avro
diff --git a/src/arrow/testing/data/avro/nonnullable.impala.avro b/src/arrow/testing/data/avro/nonnullable.impala.avro
Binary files differ
new file mode 100644
index 000000000..7ff8f3b7a
--- /dev/null
+++ b/src/arrow/testing/data/avro/nonnullable.impala.avro
diff --git a/src/arrow/testing/data/avro/nullable.impala.avro b/src/arrow/testing/data/avro/nullable.impala.avro
Binary files differ
new file mode 100644
index 000000000..28f118b6b
--- /dev/null
+++ b/src/arrow/testing/data/avro/nullable.impala.avro
diff --git a/src/arrow/testing/data/avro/nulls.snappy.avro b/src/arrow/testing/data/avro/nulls.snappy.avro
Binary files differ
new file mode 100644
index 000000000..8be5bec85
--- /dev/null
+++ b/src/arrow/testing/data/avro/nulls.snappy.avro
diff --git a/src/arrow/testing/data/avro/repeated_no_annotation.avro b/src/arrow/testing/data/avro/repeated_no_annotation.avro
Binary files differ
new file mode 100644
index 000000000..44edb8e10
--- /dev/null
+++ b/src/arrow/testing/data/avro/repeated_no_annotation.avro
diff --git a/src/arrow/testing/data/avro/single_nan.avro b/src/arrow/testing/data/avro/single_nan.avro
Binary files differ
new file mode 100644
index 000000000..ccf93e54d
--- /dev/null
+++ b/src/arrow/testing/data/avro/single_nan.avro |