diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/c_glib/test/dataset | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2. (ref: upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/c_glib/test/dataset')
8 files changed, 464 insertions, 0 deletions
diff --git a/src/arrow/c_glib/test/dataset/test-file-format.rb b/src/arrow/c_glib/test/dataset/test-file-format.rb new file mode 100644 index 000000000..76ffede94 --- /dev/null +++ b/src/arrow/c_glib/test/dataset/test-file-format.rb @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDatasetFileFormat < Test::Unit::TestCase + def setup + omit("Arrow Dataset is required") unless defined?(ArrowDataset) + end + + def test_csv + assert_equal("csv", ArrowDataset::CSVFileFormat.new.type_name) + end + + def test_ipc + assert_equal("ipc", ArrowDataset::IPCFileFormat.new.type_name) + end + + def test_parquet + assert_equal("parquet", ArrowDataset::ParquetFileFormat.new.type_name) + end +end diff --git a/src/arrow/c_glib/test/dataset/test-file-system-dataset-factory.rb b/src/arrow/c_glib/test/dataset/test-file-system-dataset-factory.rb new file mode 100644 index 000000000..bca9e7241 --- /dev/null +++ b/src/arrow/c_glib/test/dataset/test-file-system-dataset-factory.rb @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDatasetFileSystemDatasetFactory < Test::Unit::TestCase + include Helper::Buildable + include Helper::Writable + + def setup + omit("Arrow Dataset is required") unless defined?(ArrowDataset) + Dir.mktmpdir do |tmpdir| + @dir = tmpdir + @format = ArrowDataset::IPCFileFormat.new + @path1 = File.join(@dir, "table1.arrow") + @table1 = build_table(visible: [ + build_boolean_array([true, false, true]), + build_boolean_array([false, true, false, true]), + ], + point: [ + build_int32_array([1, 2, 3]), + build_int32_array([-1, -2, -3, -4]), + ]) + write_table(@table1, @path1) + @path2 = File.join(@dir, "table2.arrow") + @table2 = build_table(visible: [ + build_boolean_array([false, true]), + build_boolean_array([true]), + ], + point: [ + build_int32_array([10]), + build_int32_array([-10, -20]), + ]) + write_table(@table2, @path2) + yield + end + end + + def test_file_system + factory = ArrowDataset::FileSystemDatasetFactory.new(@format) + factory.file_system = Arrow::LocalFileSystem.new + factory.add_path(File.expand_path(@path1)) + dataset = factory.finish + assert_equal(@table1, dataset.to_table) + end + + def test_file_system_uri + factory = ArrowDataset::FileSystemDatasetFactory.new(@format) + factory.file_system_uri = build_file_uri(@path1) + dataset = factory.finish + assert_equal(@table1, dataset.to_table) + end + + def test_directory + factory = 
ArrowDataset::FileSystemDatasetFactory.new(@format) + factory.file_system_uri = build_file_uri(@dir) + dataset = factory.finish + assert_equal(@table1.concatenate([@table2]), + dataset.to_table) + end +end diff --git a/src/arrow/c_glib/test/dataset/test-file-system-dataset.rb b/src/arrow/c_glib/test/dataset/test-file-system-dataset.rb new file mode 100644 index 000000000..1aef38fcc --- /dev/null +++ b/src/arrow/c_glib/test/dataset/test-file-system-dataset.rb @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestDatasetFileSystemDataset < Test::Unit::TestCase + include Helper::Buildable + include Helper::Readable + + def setup + omit("Arrow Dataset is required") unless defined?(ArrowDataset) + Dir.mktmpdir do |tmpdir| + @dir = tmpdir + @format = ArrowDataset::IPCFileFormat.new + @factory = ArrowDataset::FileSystemDatasetFactory.new(@format) + @file_system = Arrow::LocalFileSystem.new + @factory.file_system = @file_system + partitioning_schema = build_schema(label: Arrow::StringDataType.new) + @partitioning = + ArrowDataset::DirectoryPartitioning.new(partitioning_schema) + @factory.partitioning = @partitioning + yield + end + end + + def test_type_name + dataset = @factory.finish + assert_equal("filesystem", dataset.type_name) + end + + def test_format + dataset = @factory.finish + assert_equal(@format, dataset.format) + end + + def test_file_system + dataset = @factory.finish + assert_equal(@file_system, dataset.file_system) + end + + def test_partitioning + dataset = @factory.finish + assert_equal(@partitioning, dataset.partitioning) + end + + def test_read_write + table = build_table(label: build_string_array(["a", "a", "b", "c"]), + count: build_int32_array([1, 10, 2, 3])) + table_reader = Arrow::TableBatchReader.new(table) + scanner_builder = ArrowDataset::ScannerBuilder.new(table_reader) + scanner_builder.use_async = true + scanner = scanner_builder.finish + options = ArrowDataset::FileSystemDatasetWriteOptions.new + options.file_write_options = @format.default_write_options + options.file_system = Arrow::LocalFileSystem.new + options.base_dir = @dir + options.base_name_template = "{i}.arrow" + options.partitioning = @partitioning + ArrowDataset::FileSystemDataset.write_scanner(scanner, options) + Find.find(@dir) do |path| + @factory.add_path(path) if File.file?(path) + end + @factory.partition_base_dir = @dir + dataset = @factory.finish + assert_equal(build_table(count: [ + build_int32_array([1, 10]), + build_int32_array([2]), + build_int32_array([3]), + 
], + label: [ + build_string_array(["a", "a"]), + build_string_array(["b"]), + build_string_array(["c"]), + ]), + dataset.to_table) + end +end diff --git a/src/arrow/c_glib/test/dataset/test-file-writer.rb b/src/arrow/c_glib/test/dataset/test-file-writer.rb new file mode 100644 index 000000000..5b25d6044 --- /dev/null +++ b/src/arrow/c_glib/test/dataset/test-file-writer.rb @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestDatasetFileWriter < Test::Unit::TestCase + include Helper::Buildable + include Helper::Readable + + def setup + omit("Arrow Dataset is required") unless defined?(ArrowDataset) + Dir.mktmpdir do |tmpdir| + @dir = tmpdir + @format = ArrowDataset::IPCFileFormat.new + @file_system = Arrow::LocalFileSystem.new + @path = File.join(@dir, "data.arrow") + @output = @file_system.open_output_stream(@path) + @schema = build_schema(visible: Arrow::BooleanDataType.new, + point: Arrow::UInt8DataType.new) + @writer = @format.open_writer(@output, + @file_system, + @path, + @schema, + @format.default_write_options) + yield + end + end + + def test_write_record_batch + record_batch = build_record_batch( + visible: build_boolean_array([true, false, true]), + point: build_uint8_array([1, 2, 3])) + @writer.write_record_batch(record_batch) + @writer.finish + @output.close + read_table(@path) do |written_table| + assert_equal(Arrow::Table.new(record_batch.schema, + [record_batch]), + written_table) + end + end + + def test_write_record_batch_reader + table = build_table(visible: build_boolean_array([true, false, true]), + point: build_uint8_array([1, 2, 3])) + @writer.write_record_batch_reader(Arrow::TableBatchReader.new(table)) + @writer.finish + @output.close + read_table(@path) do |written_table| + assert_equal(table, written_table) + end + end +end diff --git a/src/arrow/c_glib/test/dataset/test-partitioning-options.rb b/src/arrow/c_glib/test/dataset/test-partitioning-options.rb new file mode 100644 index 000000000..9ff585aa7 --- /dev/null +++ b/src/arrow/c_glib/test/dataset/test-partitioning-options.rb @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDatasetPartitioningOptions < Test::Unit::TestCase + include Helper::Buildable + + def setup + omit("Arrow Dataset is required") unless defined?(ArrowDataset) + @options = ArrowDataset::PartitioningOptions.new + end + + def test_infer_dictionary + assert_false(@options.infer_dictionary?) + @options.infer_dictionary = true + assert_true(@options.infer_dictionary?) + end + + def test_schema + assert_nil(@options.schema) + schema = build_schema(year: Arrow::UInt16DataType.new) + @options.schema = schema + assert_equal(schema, @options.schema) + end + + def test_segment_encoding + assert_equal(ArrowDataset::SegmentEncoding::NONE, + @options.segment_encoding) + @options.segment_encoding = :uri + assert_equal(ArrowDataset::SegmentEncoding::URI, + @options.segment_encoding) + end +end diff --git a/src/arrow/c_glib/test/dataset/test-partitioning.rb b/src/arrow/c_glib/test/dataset/test-partitioning.rb new file mode 100644 index 000000000..2b33b1eaa --- /dev/null +++ b/src/arrow/c_glib/test/dataset/test-partitioning.rb @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDatasetPartitioning < Test::Unit::TestCase + include Helper::Buildable + + def setup + omit("Arrow Dataset is required") unless defined?(ArrowDataset) + end + + def test_default + assert_equal("default", ArrowDataset::Partitioning.new.type_name) + end + + def test_directory + schema = build_schema(year: Arrow::UInt16DataType.new) + partitioning = ArrowDataset::DirectoryPartitioning.new(schema) + assert_equal("directory", partitioning.type_name) + end +end diff --git a/src/arrow/c_glib/test/dataset/test-scanner-builder.rb b/src/arrow/c_glib/test/dataset/test-scanner-builder.rb new file mode 100644 index 000000000..5674db4c3 --- /dev/null +++ b/src/arrow/c_glib/test/dataset/test-scanner-builder.rb @@ -0,0 +1,75 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDatasetScannerBuilder < Test::Unit::TestCase + include Helper::Buildable + include Helper::Writable + + def setup + omit("Arrow Dataset is required") unless defined?(ArrowDataset) + Dir.mktmpdir do |tmpdir| + path = File.join(tmpdir, "table.arrow") + @table = build_table(visible: [ + build_boolean_array([true, false, true]), + build_boolean_array([false, true, false, true]), + ], + point: [ + build_int32_array([1, 2, 3]), + build_int32_array([-1, -2, -3, -4]), + ]) + @format = ArrowDataset::IPCFileFormat.new + write_table(@table, path) + factory = ArrowDataset::FileSystemDatasetFactory.new(@format) + factory.file_system_uri = build_file_uri(path) + @dataset = factory.finish + @builder = @dataset.begin_scan + yield + end + end + + def test_new_record_batch_reader + reader = Arrow::TableBatchReader.new(@table) + builder = ArrowDataset::ScannerBuilder.new(reader) + scanner = builder.finish + assert_equal(@table, scanner.to_table) + end + + def test_filter + visible = Arrow::FieldExpression.new("visible") + true_scalar = Arrow::BooleanScalar.new(true) + true_datum = Arrow::ScalarDatum.new(true_scalar) + true_literal = Arrow::LiteralExpression.new(true_datum) + filter = Arrow::CallExpression.new("equal", [visible, true_literal]) + @builder.filter = filter + scanner = @builder.finish + assert_equal(build_table(visible: [ + build_boolean_array([true, true]), + build_boolean_array([true, true]), + ], + point: [ + build_int32_array([1, 3]), + build_int32_array([-2, -4]), + ]), + scanner.to_table) + end + + def test_use_async + @builder.use_async = true 
+ scanner = @builder.finish + assert_equal(@table, scanner.to_table) + end +end diff --git a/src/arrow/c_glib/test/dataset/test-scanner.rb b/src/arrow/c_glib/test/dataset/test-scanner.rb new file mode 100644 index 000000000..f7702d490 --- /dev/null +++ b/src/arrow/c_glib/test/dataset/test-scanner.rb @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDatasetScanner < Test::Unit::TestCase + include Helper::Buildable + include Helper::Writable + + def setup + omit("Arrow Dataset is required") unless defined?(ArrowDataset) + Dir.mktmpdir do |tmpdir| + path = File.join(tmpdir, "table.arrow") + @table = build_table(visible: [ + build_boolean_array([true, false, true]), + build_boolean_array([false, true, false, true]), + ], + point: [ + build_int32_array([1, 2, 3]), + build_int32_array([-1, -2, -3, -4]), + ]) + @format = ArrowDataset::IPCFileFormat.new + write_table(@table, path) + factory = ArrowDataset::FileSystemDatasetFactory.new(@format) + factory.file_system_uri = build_file_uri(path) + @dataset = factory.finish + builder = @dataset.begin_scan + @scanner = builder.finish + yield + end + end + + def test_to_table + assert_equal(@table, @scanner.to_table) + end +end |