diff options
Diffstat (limited to 'src/arrow/ruby/red-arrow-dataset/lib')
8 files changed, 351 insertions, 0 deletions
require "arrow"

require "arrow-dataset/version"

require "arrow-dataset/loader"

# Top-level namespace of this library. Requiring this file loads "arrow",
# the version constants and the loader, then runs the loader once.
module ArrowDataset
  # Error class declared for this namespace.
  class Error < StandardError; end

  # Executed at require time, after Loader has been required above.
  Loader.load
end
module ArrowDataset
  # Loading helpers mixed into Arrow::TableLoader (see the end of this
  # block). They read the including object's @input and @options.
  module ArrowTableLoadable
    private

    # Turns a file system path into a file:// URI.
    #
    # The path is expanded to an absolute one first. When the expanded path
    # does not begin with "/" (presumably a drive-letter path such as
    # "C:/...") one more slash is put between the scheme and the path.
    def path_to_uri(path)
      expanded = ::File.expand_path(path)
      prefix = expanded.start_with?("/") ? "file://" : "file:///"
      URI("#{prefix}#{expanded}")
    end

    # Loads a table from the directory path stored in @input.
    def load_from_directory
      internal_load_from_uri(path_to_uri(@input))
    end

    # Loads a table from @input, which is passed on unchanged as the URI.
    def load_from_uri
      internal_load_from_uri(@input)
    end

    # Builds a file system dataset for +uri+, scans it and returns the
    # result of the scanner's #to_table.
    #
    # Every entry of @options other than :format whose value is not nil is
    # forwarded to the scanner builder, but only when the builder responds
    # to the matching "<key>=" writer; other entries are ignored.
    def internal_load_from_uri(uri)
      file_format = FileFormat.resolve(@options[:format])
      dataset = FileSystemDataset.build(file_format) do |factory|
        factory.file_system_uri = uri
      end
      builder = dataset.begin_scan
      @options.each do |name, value|
        next if name == :format || value.nil?
        writer = "#{name}="
        builder.public_send(writer, value) if builder.respond_to?(writer)
      end
      builder.finish.to_table
    end
  end
end

module Arrow
  class TableLoader
    include ArrowDataset::ArrowTableLoadable
  end
end
module ArrowDataset
  # Saving helper mixed into Arrow::TableSaver (see the end of this block).
  # It reads the including object's @table, @output and @options.
  module ArrowTableSavable
    private

    # Writes @table to the location given by the URI in @output, using the
    # file format named by @options[:format].
    #
    # Without a scheme the local file system is used; otherwise the file
    # system is created from the whole URI. When @options[:partitioning]
    # is set the table is written as a partitioned dataset through a
    # scanner; otherwise it is written as a single file, creating the
    # parent directory first when it does not exist locally.
    def save_to_uri
      format = FileFormat.resolve(@options[:format])
      write_options = FileSystemDatasetWriteOptions.new
      write_options.file_write_options = format.default_write_options
      path = @output.path
      if @output.scheme
        write_options.file_system = Arrow::FileSystem.create(@output.to_s)
        # /C:/... -> C:/...
        rooted_at_slash = File.expand_path(".").start_with?("/")
        path = path.gsub(/\A\//, "") unless rooted_at_slash
      else
        write_options.file_system = Arrow::LocalFileSystem.new
      end

      if @options[:partitioning]
        # TODO
        write_options.base_dir = File.dirname(path)
        write_options.base_name_template = File.basename(path)
        write_options.partitioning =
          Partitioning.resolve(@options[:partitioning])
        builder = ScannerBuilder.new(@table)
        builder.use_async(true)
        FileSystemDataset.write_scanner(builder.finish, write_options)
      else
        parent = File.dirname(path)
        unless File.exist?(parent)
          # The second argument is true: presumably "recursive".
          write_options.file_system.create_dir(parent, true)
        end
        write_options.file_system.open_output_stream(path) do |stream|
          format.open_writer(stream,
                             write_options.file_system,
                             path,
                             @table.schema,
                             format.default_write_options) do |writer|
            batches = Arrow::TableBatchReader.new(@table)
            writer.write_record_batch_reader(batches)
          end
        end
      end
    end
  end
end

module Arrow
  class TableSaver
    include ArrowDataset::ArrowTableSavable
  end
end
module ArrowDataset
  class Dataset
    # Builds a dataset through the factory class that belongs to the
    # receiver.
    #
    # The factory class is looked up in ArrowDataset under the receiver's
    # class name with "Factory" appended. It is instantiated with +args+,
    # yielded to the caller's block for configuration, and then asked to
    # #finish; the result of #finish is returned.
    def self.build(*args)
      builder = ArrowDataset.const_get("#{name}Factory").new(*args)
      yield(builder)
      builder.finish
    end
  end
end
module ArrowDataset
  class FileFormat
    class << self
      # Maps a format name to a new file format object.
      #
      # @param format [Symbol] one of :arrow, :arrow_file, :arrow_streaming,
      #   :parquet or :csv
      # @return a new IPCFileFormat, ParquetFileFormat or CSVFileFormat
      # @raise [ArgumentError] when +format+ is none of the names above
      def resolve(format)
        case format
        when :arrow, :arrow_file, :arrow_streaming
          IPCFileFormat.new
        when :parquet
          ParquetFileFormat.new
        when :csv
          CSVFileFormat.new
        else
          available_formats = [
            :arrow,
            :arrow_file,
            :arrow_streaming,
            :parquet,
            :csv,
          ]
          message = "Arrow::Table load format must be one of ["
          message << available_formats.join(", ")
          # Report the rejected argument itself. This is a class-level
          # method, so there is no @options here: reading it gives nil and
          # indexing nil would raise NoMethodError instead of the
          # ArgumentError below.
          message << "]: #{format.inspect}"
          raise ArgumentError, message
        end
      end
    end

    # Keep the original open_writer reachable under a *_raw name so the
    # wrapper below can delegate to it.
    alias_method :open_writer_raw, :open_writer

    # Opens a writer with the original open_writer.
    #
    # With a block, the writer is yielded and #finish is always called on
    # it afterwards, even when the block raises; the block's value is
    # returned. Without a block, the writer itself is returned and the
    # caller is responsible for finishing it.
    def open_writer(destination, file_system, path, schema, options)
      writer = open_writer_raw(destination, file_system, path, schema, options)
      return writer unless block_given?

      begin
        yield(writer)
      ensure
        writer.finish
      end
    end
  end
end
module ArrowDataset
  class FileSystemDatasetFactory
    # Keep the original setter reachable under a *_raw name so the wrapper
    # below can delegate to it.
    alias_method :set_file_system_uri_raw, :set_file_system_uri

    # Sets the file system URI, accepting a URI object as well as whatever
    # the original setter accepts.
    #
    # A URI object is converted to a string before delegation. When it has
    # no scheme, a copy is turned into a "file" URI whose path is the
    # expanded absolute path, with a leading "/" added when the expanded
    # path does not already start with one. The caller's URI object is not
    # modified. Any other argument is passed through unchanged.
    def set_file_system_uri(uri)
      return set_file_system_uri_raw(uri) unless uri.is_a?(URI)

      if uri.scheme.nil?
        uri = uri.dup
        expanded = File.expand_path(uri.path)
        uri.path = expanded.start_with?("/") ? expanded : "/#{expanded}"
        uri.scheme = "file"
      end
      set_file_system_uri_raw(uri.to_s)
    end
    alias_method :file_system_uri=, :set_file_system_uri
  end
end
# arrow-dataset/loader.rb
module ArrowDataset
  class Loader < GObjectIntrospection::Loader
    # Calls the inherited load with the "ArrowDataset" name and the
    # ArrowDataset module.
    def self.load
      super("ArrowDataset", ArrowDataset)
    end

    private

    # Hook that requires the Ruby-side extensions; both arguments are
    # unused here.
    def post_load(repository, namespace)
      require_libraries
    end

    # Requires every extension file of this library, in this order.
    def require_libraries
      [
        "arrow-dataset/arrow-table-loadable",
        "arrow-dataset/arrow-table-savable",
        "arrow-dataset/dataset",
        "arrow-dataset/file-format",
        "arrow-dataset/file-system-dataset-factory",
      ].each do |library|
        require library
      end
    end
  end
end

# arrow-dataset/version.rb
module ArrowDataset
  VERSION = "6.0.1"

  # VERSION split into its components. TAG is whatever follows the first
  # "-" in VERSION, or nil when there is none.
  module Version
    STRING = VERSION
    release, TAG = STRING.split("-")
    MAJOR, MINOR, MICRO = release.split(".").map(&:to_i)
  end
end