summaryrefslogtreecommitdiffstats
path: root/src/arrow/c_glib/test/dataset/test-file-system-dataset.rb
blob: 1aef38fcca55f0dc61c180a95e69d0ea945b3e4f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

class TestDatasetFileSystemDataset < Test::Unit::TestCase
  include Helper::Buildable
  include Helper::Readable

  def setup
    omit("Arrow Dataset is required") unless defined?(ArrowDataset)
    Dir.mktmpdir do |tmpdir|
      @dir = tmpdir
      @format = ArrowDataset::IPCFileFormat.new
      @factory = ArrowDataset::FileSystemDatasetFactory.new(@format)
      @file_system = Arrow::LocalFileSystem.new
      @factory.file_system = @file_system
      partitioning_schema = build_schema(label: Arrow::StringDataType.new)
      @partitioning =
        ArrowDataset::DirectoryPartitioning.new(partitioning_schema)
      @factory.partitioning = @partitioning
      yield
    end
  end

  def test_type_name
    dataset = @factory.finish
    assert_equal("filesystem", dataset.type_name)
  end

  def test_format
    dataset = @factory.finish
    assert_equal(@format, dataset.format)
  end

  def test_file_system
    dataset = @factory.finish
    assert_equal(@file_system, dataset.file_system)
  end

  def test_partitioning
    dataset = @factory.finish
    assert_equal(@partitioning, dataset.partitioning)
  end

  def test_read_write
    table = build_table(label: build_string_array(["a", "a", "b", "c"]),
                        count: build_int32_array([1, 10, 2, 3]))
    table_reader = Arrow::TableBatchReader.new(table)
    scanner_builder = ArrowDataset::ScannerBuilder.new(table_reader)
    scanner_builder.use_async = true
    scanner = scanner_builder.finish
    options = ArrowDataset::FileSystemDatasetWriteOptions.new
    options.file_write_options = @format.default_write_options
    options.file_system = Arrow::LocalFileSystem.new
    options.base_dir = @dir
    options.base_name_template = "{i}.arrow"
    options.partitioning = @partitioning
    ArrowDataset::FileSystemDataset.write_scanner(scanner, options)
    Find.find(@dir) do |path|
      @factory.add_path(path) if File.file?(path)
    end
    @factory.partition_base_dir = @dir
    dataset = @factory.finish
    assert_equal(build_table(count: [
                               build_int32_array([1, 10]),
                               build_int32_array([2]),
                               build_int32_array([3]),
                             ],
                             label: [
                               build_string_array(["a", "a"]),
                               build_string_array(["b"]),
                               build_string_array(["c"]),
                             ]),
                 dataset.to_table)
  end
end