summaryrefslogtreecommitdiffstats
path: root/src/arrow/c_glib/test/dataset
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-21 11:54:28 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-21 11:54:28 +0000
commit e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree 64f88b554b444a49f656b6c656111a145cbbaa28 /src/arrow/c_glib/test/dataset
parent Initial commit. (diff)
download ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2. (tag: upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/arrow/c_glib/test/dataset')
-rw-r--r-- src/arrow/c_glib/test/dataset/test-file-format.rb 34
-rw-r--r-- src/arrow/c_glib/test/dataset/test-file-system-dataset-factory.rb 73
-rw-r--r-- src/arrow/c_glib/test/dataset/test-file-system-dataset.rb 89
-rw-r--r-- src/arrow/c_glib/test/dataset/test-file-writer.rb 65
-rw-r--r-- src/arrow/c_glib/test/dataset/test-partitioning-options.rb 46
-rw-r--r-- src/arrow/c_glib/test/dataset/test-partitioning.rb 34
-rw-r--r-- src/arrow/c_glib/test/dataset/test-scanner-builder.rb 75
-rw-r--r-- src/arrow/c_glib/test/dataset/test-scanner.rb 48
8 files changed, 464 insertions, 0 deletions
diff --git a/src/arrow/c_glib/test/dataset/test-file-format.rb b/src/arrow/c_glib/test/dataset/test-file-format.rb
new file mode 100644
index 000000000..76ffede94
--- /dev/null
+++ b/src/arrow/c_glib/test/dataset/test-file-format.rb
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestDatasetFileFormat < Test::Unit::TestCase # Checks the type_name reported by each ArrowDataset file format class.
+ def setup # Omits every test in this class when the ArrowDataset constant is not defined.
+ omit("Arrow Dataset is required") unless defined?(ArrowDataset)
+ end
+
+ def test_csv # A freshly constructed CSVFileFormat must report "csv".
+ assert_equal("csv", ArrowDataset::CSVFileFormat.new.type_name)
+ end
+
+ def test_ipc # A freshly constructed IPCFileFormat must report "ipc".
+ assert_equal("ipc", ArrowDataset::IPCFileFormat.new.type_name)
+ end
+
+ def test_parquet # A freshly constructed ParquetFileFormat must report "parquet".
+ assert_equal("parquet", ArrowDataset::ParquetFileFormat.new.type_name)
+ end
+end
diff --git a/src/arrow/c_glib/test/dataset/test-file-system-dataset-factory.rb b/src/arrow/c_glib/test/dataset/test-file-system-dataset-factory.rb
new file mode 100644
index 000000000..bca9e7241
--- /dev/null
+++ b/src/arrow/c_glib/test/dataset/test-file-system-dataset-factory.rb
@@ -0,0 +1,73 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestDatasetFileSystemDatasetFactory < Test::Unit::TestCase # Builds datasets from IPC files through FileSystemDatasetFactory and compares them with the source tables.
+ include Helper::Buildable
+ include Helper::Writable
+
+ def setup # Writes two two-chunk tables (table1.arrow, table2.arrow) into a temporary directory, then yields.
+ omit("Arrow Dataset is required") unless defined?(ArrowDataset)
+ Dir.mktmpdir do |tmpdir| # Block form: the temporary directory is removed when the block returns.
+ @dir = tmpdir
+ @format = ArrowDataset::IPCFileFormat.new
+ @path1 = File.join(@dir, "table1.arrow")
+ @table1 = build_table(visible: [
+ build_boolean_array([true, false, true]),
+ build_boolean_array([false, true, false, true]),
+ ],
+ point: [
+ build_int32_array([1, 2, 3]),
+ build_int32_array([-1, -2, -3, -4]),
+ ])
+ write_table(@table1, @path1)
+ @path2 = File.join(@dir, "table2.arrow")
+ @table2 = build_table(visible: [
+ build_boolean_array([false, true]),
+ build_boolean_array([true]),
+ ],
+ point: [
+ build_int32_array([10]),
+ build_int32_array([-10, -20]),
+ ])
+ write_table(@table2, @path2)
+ yield # NOTE(review): presumably test-unit runs the test body here, while the directory still exists -- confirm.
+ end
+ end
+
+ def test_file_system # Explicit LocalFileSystem plus one added path must yield exactly @table1.
+ factory = ArrowDataset::FileSystemDatasetFactory.new(@format)
+ factory.file_system = Arrow::LocalFileSystem.new
+ factory.add_path(File.expand_path(@path1))
+ dataset = factory.finish
+ assert_equal(@table1, dataset.to_table)
+ end
+
+ def test_file_system_uri # A URI built from a single file path must yield exactly @table1.
+ factory = ArrowDataset::FileSystemDatasetFactory.new(@format)
+ factory.file_system_uri = build_file_uri(@path1)
+ dataset = factory.finish
+ assert_equal(@table1, dataset.to_table)
+ end
+
+ def test_directory # A URI built from the directory must yield @table1 followed by @table2.
+ factory = ArrowDataset::FileSystemDatasetFactory.new(@format)
+ factory.file_system_uri = build_file_uri(@dir)
+ dataset = factory.finish
+ assert_equal(@table1.concatenate([@table2]),
+ dataset.to_table)
+ end
+end
diff --git a/src/arrow/c_glib/test/dataset/test-file-system-dataset.rb b/src/arrow/c_glib/test/dataset/test-file-system-dataset.rb
new file mode 100644
index 000000000..1aef38fcc
--- /dev/null
+++ b/src/arrow/c_glib/test/dataset/test-file-system-dataset.rb
@@ -0,0 +1,89 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestDatasetFileSystemDataset < Test::Unit::TestCase # Checks FileSystemDataset accessors and a partitioned write/read round trip.
+ include Helper::Buildable
+ include Helper::Readable
+
+ def setup # Prepares an IPC-format factory with a local file system and directory partitioning on a "label" string field, then yields.
+ omit("Arrow Dataset is required") unless defined?(ArrowDataset)
+ Dir.mktmpdir do |tmpdir| # Block form: the temporary directory is removed when the block returns.
+ @dir = tmpdir
+ @format = ArrowDataset::IPCFileFormat.new
+ @factory = ArrowDataset::FileSystemDatasetFactory.new(@format)
+ @file_system = Arrow::LocalFileSystem.new
+ @factory.file_system = @file_system
+ partitioning_schema = build_schema(label: Arrow::StringDataType.new)
+ @partitioning =
+ ArrowDataset::DirectoryPartitioning.new(partitioning_schema)
+ @factory.partitioning = @partitioning
+ yield # NOTE(review): presumably test-unit runs the test body here, while the directory still exists -- confirm.
+ end
+ end
+
+ def test_type_name # The finished dataset must report "filesystem".
+ dataset = @factory.finish
+ assert_equal("filesystem", dataset.type_name)
+ end
+
+ def test_format # The dataset must expose the format given to the factory.
+ dataset = @factory.finish
+ assert_equal(@format, dataset.format)
+ end
+
+ def test_file_system # The dataset must expose the file system assigned to the factory.
+ dataset = @factory.finish
+ assert_equal(@file_system, dataset.file_system)
+ end
+
+ def test_partitioning # The dataset must expose the partitioning assigned to the factory.
+ dataset = @factory.finish
+ assert_equal(@partitioning, dataset.partitioning)
+ end
+
+ def test_read_write # Writes a table through write_scanner into @dir, then reads the written files back via the factory.
+ table = build_table(label: build_string_array(["a", "a", "b", "c"]),
+ count: build_int32_array([1, 10, 2, 3]))
+ table_reader = Arrow::TableBatchReader.new(table)
+ scanner_builder = ArrowDataset::ScannerBuilder.new(table_reader)
+ scanner_builder.use_async = true
+ scanner = scanner_builder.finish
+ options = ArrowDataset::FileSystemDatasetWriteOptions.new
+ options.file_write_options = @format.default_write_options
+ options.file_system = Arrow::LocalFileSystem.new
+ options.base_dir = @dir
+ options.base_name_template = "{i}.arrow"
+ options.partitioning = @partitioning
+ ArrowDataset::FileSystemDataset.write_scanner(scanner, options)
+ Find.find(@dir) do |path| # Walk everything under @dir; only regular files are registered with the factory.
+ @factory.add_path(path) if File.file?(path)
+ end
+ @factory.partition_base_dir = @dir
+ dataset = @factory.finish
+ assert_equal(build_table(count: [ # Expected result lists count before label, one chunk per label value (a, b, c).
+ build_int32_array([1, 10]),
+ build_int32_array([2]),
+ build_int32_array([3]),
+ ],
+ label: [
+ build_string_array(["a", "a"]),
+ build_string_array(["b"]),
+ build_string_array(["c"]),
+ ]),
+ dataset.to_table)
+ end
+end
diff --git a/src/arrow/c_glib/test/dataset/test-file-writer.rb b/src/arrow/c_glib/test/dataset/test-file-writer.rb
new file mode 100644
index 000000000..5b25d6044
--- /dev/null
+++ b/src/arrow/c_glib/test/dataset/test-file-writer.rb
@@ -0,0 +1,65 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestDatasetFileWriter < Test::Unit::TestCase # Writes data through a file-format writer and verifies it by reading the file back.
+ include Helper::Buildable
+ include Helper::Readable
+
+ def setup # Opens an output stream for data.arrow in a temporary directory and a writer for a visible/point schema, then yields.
+ omit("Arrow Dataset is required") unless defined?(ArrowDataset)
+ Dir.mktmpdir do |tmpdir| # Block form: the temporary directory is removed when the block returns.
+ @dir = tmpdir
+ @format = ArrowDataset::IPCFileFormat.new
+ @file_system = Arrow::LocalFileSystem.new
+ @path = File.join(@dir, "data.arrow")
+ @output = @file_system.open_output_stream(@path)
+ @schema = build_schema(visible: Arrow::BooleanDataType.new,
+ point: Arrow::UInt8DataType.new)
+ @writer = @format.open_writer(@output,
+ @file_system,
+ @path,
+ @schema,
+ @format.default_write_options)
+ yield # NOTE(review): presumably test-unit runs the test body here, while the directory still exists -- confirm.
+ end
+ end
+
+ def test_write_record_batch # One written record batch must read back as a table holding just that batch.
+ record_batch = build_record_batch(
+ visible: build_boolean_array([true, false, true]),
+ point: build_uint8_array([1, 2, 3]))
+ @writer.write_record_batch(record_batch)
+ @writer.finish
+ @output.close # The writer is finished and the stream closed before the file is read back.
+ read_table(@path) do |written_table|
+ assert_equal(Arrow::Table.new(record_batch.schema,
+ [record_batch]),
+ written_table)
+ end
+ end
+
+ def test_write_record_batch_reader # A table written via a TableBatchReader must read back equal to the original table.
+ table = build_table(visible: build_boolean_array([true, false, true]),
+ point: build_uint8_array([1, 2, 3]))
+ @writer.write_record_batch_reader(Arrow::TableBatchReader.new(table))
+ @writer.finish
+ @output.close # The writer is finished and the stream closed before the file is read back.
+ read_table(@path) do |written_table|
+ assert_equal(table, written_table)
+ end
+ end
+end
diff --git a/src/arrow/c_glib/test/dataset/test-partitioning-options.rb b/src/arrow/c_glib/test/dataset/test-partitioning-options.rb
new file mode 100644
index 000000000..9ff585aa7
--- /dev/null
+++ b/src/arrow/c_glib/test/dataset/test-partitioning-options.rb
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestDatasetPartitioningOptions < Test::Unit::TestCase # Checks the initial values and setters of ArrowDataset::PartitioningOptions.
+ include Helper::Buildable
+
+ def setup # Omits the tests without ArrowDataset; otherwise creates a fresh options object per test.
+ omit("Arrow Dataset is required") unless defined?(ArrowDataset)
+ @options = ArrowDataset::PartitioningOptions.new
+ end
+
+ def test_infer_dictionary # Starts false and reads back true after being set.
+ assert_false(@options.infer_dictionary?)
+ @options.infer_dictionary = true
+ assert_true(@options.infer_dictionary?)
+ end
+
+ def test_schema # Starts nil and reads back the assigned schema.
+ assert_nil(@options.schema)
+ schema = build_schema(year: Arrow::UInt16DataType.new)
+ @options.schema = schema
+ assert_equal(schema, @options.schema)
+ end
+
+ def test_segment_encoding # Starts as SegmentEncoding::NONE; assigning :uri reads back as SegmentEncoding::URI.
+ assert_equal(ArrowDataset::SegmentEncoding::NONE,
+ @options.segment_encoding)
+ @options.segment_encoding = :uri
+ assert_equal(ArrowDataset::SegmentEncoding::URI,
+ @options.segment_encoding)
+ end
+end
diff --git a/src/arrow/c_glib/test/dataset/test-partitioning.rb b/src/arrow/c_glib/test/dataset/test-partitioning.rb
new file mode 100644
index 000000000..2b33b1eaa
--- /dev/null
+++ b/src/arrow/c_glib/test/dataset/test-partitioning.rb
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestDatasetPartitioning < Test::Unit::TestCase # Checks the type_name reported by the partitioning classes.
+ include Helper::Buildable
+
+ def setup # Omits every test in this class when the ArrowDataset constant is not defined.
+ omit("Arrow Dataset is required") unless defined?(ArrowDataset)
+ end
+
+ def test_default # A Partitioning created without arguments must report "default".
+ assert_equal("default", ArrowDataset::Partitioning.new.type_name)
+ end
+
+ def test_directory # A DirectoryPartitioning built from a one-field schema must report "directory".
+ schema = build_schema(year: Arrow::UInt16DataType.new)
+ partitioning = ArrowDataset::DirectoryPartitioning.new(schema)
+ assert_equal("directory", partitioning.type_name)
+ end
+end
diff --git a/src/arrow/c_glib/test/dataset/test-scanner-builder.rb b/src/arrow/c_glib/test/dataset/test-scanner-builder.rb
new file mode 100644
index 000000000..5674db4c3
--- /dev/null
+++ b/src/arrow/c_glib/test/dataset/test-scanner-builder.rb
@@ -0,0 +1,75 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestDatasetScannerBuilder < Test::Unit::TestCase # Exercises ScannerBuilder construction, filtering and the use_async setter.
+ include Helper::Buildable
+ include Helper::Writable
+
+ def setup # Writes a two-chunk table to table.arrow, builds a dataset from it and stores its scan builder in @builder, then yields.
+ omit("Arrow Dataset is required") unless defined?(ArrowDataset)
+ Dir.mktmpdir do |tmpdir| # Block form: the temporary directory is removed when the block returns.
+ path = File.join(tmpdir, "table.arrow")
+ @table = build_table(visible: [
+ build_boolean_array([true, false, true]),
+ build_boolean_array([false, true, false, true]),
+ ],
+ point: [
+ build_int32_array([1, 2, 3]),
+ build_int32_array([-1, -2, -3, -4]),
+ ])
+ @format = ArrowDataset::IPCFileFormat.new
+ write_table(@table, path)
+ factory = ArrowDataset::FileSystemDatasetFactory.new(@format)
+ factory.file_system_uri = build_file_uri(path)
+ @dataset = factory.finish
+ @builder = @dataset.begin_scan
+ yield # NOTE(review): presumably test-unit runs the test body here, while the directory still exists -- confirm.
+ end
+ end
+
+ def test_new_record_batch_reader # A builder created directly from a record batch reader must scan back the same table.
+ reader = Arrow::TableBatchReader.new(@table)
+ builder = ArrowDataset::ScannerBuilder.new(reader)
+ scanner = builder.finish
+ assert_equal(@table, scanner.to_table)
+ end
+
+ def test_filter # Filters with the call expression equal(visible, true) and expects only the matching rows.
+ visible = Arrow::FieldExpression.new("visible")
+ true_scalar = Arrow::BooleanScalar.new(true)
+ true_datum = Arrow::ScalarDatum.new(true_scalar)
+ true_literal = Arrow::LiteralExpression.new(true_datum)
+ filter = Arrow::CallExpression.new("equal", [visible, true_literal])
+ @builder.filter = filter
+ scanner = @builder.finish
+ assert_equal(build_table(visible: [ # Rows of @table whose visible value is true, chunk by chunk.
+ build_boolean_array([true, true]),
+ build_boolean_array([true, true]),
+ ],
+ point: [
+ build_int32_array([1, 3]),
+ build_int32_array([-2, -4]),
+ ]),
+ scanner.to_table)
+ end
+
+ def test_use_async # With use_async set to true the scan must still return the full table.
+ @builder.use_async = true
+ scanner = @builder.finish
+ assert_equal(@table, scanner.to_table)
+ end
+end
diff --git a/src/arrow/c_glib/test/dataset/test-scanner.rb b/src/arrow/c_glib/test/dataset/test-scanner.rb
new file mode 100644
index 000000000..f7702d490
--- /dev/null
+++ b/src/arrow/c_glib/test/dataset/test-scanner.rb
@@ -0,0 +1,48 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestDatasetScanner < Test::Unit::TestCase # Checks that a scanner built from a file-backed dataset returns the written table.
+ include Helper::Buildable
+ include Helper::Writable
+
+ def setup # Writes a two-chunk table to table.arrow, builds a dataset from it and stores a finished scanner in @scanner, then yields.
+ omit("Arrow Dataset is required") unless defined?(ArrowDataset)
+ Dir.mktmpdir do |tmpdir| # Block form: the temporary directory is removed when the block returns.
+ path = File.join(tmpdir, "table.arrow")
+ @table = build_table(visible: [
+ build_boolean_array([true, false, true]),
+ build_boolean_array([false, true, false, true]),
+ ],
+ point: [
+ build_int32_array([1, 2, 3]),
+ build_int32_array([-1, -2, -3, -4]),
+ ])
+ @format = ArrowDataset::IPCFileFormat.new
+ write_table(@table, path)
+ factory = ArrowDataset::FileSystemDatasetFactory.new(@format)
+ factory.file_system_uri = build_file_uri(path)
+ @dataset = factory.finish
+ builder = @dataset.begin_scan
+ @scanner = builder.finish
+ yield # NOTE(review): presumably test-unit runs the test body here, while the directory still exists -- confirm.
+ end
+ end
+
+ def test_to_table # The scanner's to_table result must equal the table written in setup.
+ assert_equal(@table, @scanner.to_table)
+ end
+end