diff options
Diffstat (limited to '')
20 files changed, 977 insertions, 0 deletions
diff --git a/src/arrow/ruby/red-arrow-dataset/.gitignore b/src/arrow/ruby/red-arrow-dataset/.gitignore new file mode 100644 index 000000000..afd93a168 --- /dev/null +++ b/src/arrow/ruby/red-arrow-dataset/.gitignore @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +/Gemfile.lock +/pkg/ diff --git a/src/arrow/ruby/red-arrow-dataset/Gemfile b/src/arrow/ruby/red-arrow-dataset/Gemfile new file mode 100644 index 000000000..7c4cefcf3 --- /dev/null +++ b/src/arrow/ruby/red-arrow-dataset/Gemfile @@ -0,0 +1,24 @@ +# -*- ruby -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +source "https://rubygems.org/" + +gemspec + +gem "red-arrow", path: "../red-arrow" diff --git a/src/arrow/ruby/red-arrow-dataset/LICENSE.txt b/src/arrow/ruby/red-arrow-dataset/LICENSE.txt new file mode 100644 index 000000000..d64569567 --- /dev/null +++ b/src/arrow/ruby/red-arrow-dataset/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/src/arrow/ruby/red-arrow-dataset/NOTICE.txt b/src/arrow/ruby/red-arrow-dataset/NOTICE.txt new file mode 100644 index 000000000..e08aeda8a --- /dev/null +++ b/src/arrow/ruby/red-arrow-dataset/NOTICE.txt @@ -0,0 +1,2 @@ +Apache Arrow +Copyright 2016 The Apache Software Foundation diff --git a/src/arrow/ruby/red-arrow-dataset/README.md b/src/arrow/ruby/red-arrow-dataset/README.md new file mode 100644 index 000000000..b48ef0b6c --- /dev/null +++ b/src/arrow/ruby/red-arrow-dataset/README.md @@ -0,0 +1,50 @@ +<!--- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+--> + +# Red Arrow Dataset - Apache Arrow Dataset Ruby + +Red Arrow Dataset is the Ruby bindings of Apache Arrow Dataset. Red Arrow Dataset is based on GObject Introspection. + +[Apache Arrow Dataset](https://arrow.apache.org/) is one of the Apache Arrow components to read and write semantic datasets stored in different locations and formats. + +[GObject Introspection](https://wiki.gnome.org/action/show/Projects/GObjectIntrospection) is middleware for language bindings of C libraries. GObject Introspection can generate language bindings automatically at runtime. + +Red Arrow Dataset uses [Apache Arrow Dataset GLib](https://github.com/apache/arrow/tree/master/c_glib) and [gobject-introspection gem](https://rubygems.org/gems/gobject-introspection) to generate Ruby bindings of Apache Arrow Dataset. + +Apache Arrow Dataset GLib is a C wrapper for [Apache Arrow Dataset C++](https://github.com/apache/arrow/tree/master/cpp). GObject Introspection can't use Apache Arrow Dataset C++ directly. Apache Arrow Dataset GLib is a bridge between Apache Arrow Dataset C++ and GObject Introspection. + +The gobject-introspection gem is the Ruby bindings of GObject Introspection. Red Arrow Dataset uses GObject Introspection via the gobject-introspection gem. + +## Install + +Install Apache Arrow Dataset GLib before installing Red Arrow Dataset. Install Apache Arrow GLib before installing Red Arrow. See [Apache Arrow install document](https://arrow.apache.org/install/) for details. 
+ +Install Red Arrow Dataset after you install Apache Arrow Dataset GLib: + +```console +$ gem install red-arrow-dataset +``` + +## Usage + +```ruby +require "arrow-dataset" + +# TODO +``` diff --git a/src/arrow/ruby/red-arrow-dataset/Rakefile b/src/arrow/ruby/red-arrow-dataset/Rakefile new file mode 100644 index 000000000..2bbe6e761 --- /dev/null +++ b/src/arrow/ruby/red-arrow-dataset/Rakefile @@ -0,0 +1,41 @@ +# -*- ruby -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +require "rubygems" +require "bundler/gem_helper" + +base_dir = File.join(File.dirname(__FILE__)) + +helper = Bundler::GemHelper.new(base_dir) +helper.install + +release_task = Rake::Task["release"] +release_task.prerequisites.replace(["build", "release:rubygem_push"]) + +desc "Run tests" +task :test do + cd(base_dir) do + cd("dependency-check") do + ruby("-S", "rake") + end + ruby("test/run-test.rb") + end +end + +task default: :test diff --git a/src/arrow/ruby/red-arrow-dataset/dependency-check/Rakefile b/src/arrow/ruby/red-arrow-dataset/dependency-check/Rakefile new file mode 100644 index 000000000..df2e24905 --- /dev/null +++ b/src/arrow/ruby/red-arrow-dataset/dependency-check/Rakefile @@ -0,0 +1,47 @@ +# -*- ruby -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +require "pkg-config" +require "native-package-installer" +require_relative "../lib/arrow-dataset/version" + +case RUBY_PLATFORM +when /mingw|mswin/ + task :default => "nothing" +else + task :default => "dependency:check" +end + +task :nothing do +end + +namespace :dependency do + desc "Check dependency" + task :check do + unless PKGConfig.check_version?("arrow-dataset-glib", + ArrowDataset::Version::MAJOR, + ArrowDataset::Version::MINOR, + ArrowDataset::Version::MICRO) + unless NativePackageInstaller.install(:debian => "libarrow-dataset-glib-dev", + :redhat => "arrow-dataset-glib-devel") + exit(false) + end + end + end +end diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset.rb new file mode 100644 index 000000000..fe4f2d518 --- /dev/null +++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset.rb @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +require "arrow" + +require "arrow-dataset/version" + +require "arrow-dataset/loader" + +module ArrowDataset + class Error < StandardError + end + + Loader.load +end diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-loadable.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-loadable.rb new file mode 100644 index 000000000..14c8dce6f --- /dev/null +++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-loadable.rb @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module ArrowDataset + module ArrowTableLoadable + private + def path_to_uri(path) + absolute_path = ::File.expand_path(path) + if absolute_path.start_with?("/") + URI("file://#{absolute_path}") + else + URI("file:///#{absolute_path}") + end + end + + def load_from_directory + internal_load_from_uri(path_to_uri(@input)) + end + + def load_from_uri + internal_load_from_uri(@input) + end + + def internal_load_from_uri(uri) + format = FileFormat.resolve(@options[:format]) + dataset = FileSystemDataset.build(format) do |factory| + factory.file_system_uri = uri + end + scanner_builder = dataset.begin_scan + @options.each do |key, value| + next if key == :format + next if value.nil? 
+ setter = "#{key}=" + next unless scanner_builder.respond_to?(setter) + scanner_builder.public_send(setter, value) + end + scanner = scanner_builder.finish + scanner.to_table + end + end +end + +module Arrow + class TableLoader + include ArrowDataset::ArrowTableLoadable + end +end diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-savable.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-savable.rb new file mode 100644 index 000000000..30ad6c292 --- /dev/null +++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-savable.rb @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module ArrowDataset + module ArrowTableSavable + private + def save_to_uri + format = FileFormat.resolve(@options[:format]) + options = FileSystemDatasetWriteOptions.new + options.file_write_options = format.default_write_options + path = @output.path + if @output.scheme.nil? + options.file_system = Arrow::LocalFileSystem.new + else + options.file_system = Arrow::FileSystem.create(@output.to_s) + # /C:/... -> C:/... 
+ unless File.expand_path(".").start_with?("/") + path = path.gsub(/\A\//, "") + end + end + partitioning = @options[:partitioning] + if partitioning + # TODO + options.base_dir = File.dirname(path) + options.base_name_template = File.basename(path) + options.partitioning = Partitioning.resolve(@options[:partitioning]) + scanner_builder = ScannerBuilder.new(@table) + scanner_builder.use_async(true) + scanner = scanner_builder.finish + FileSystemDataset.write_scanner(scanner, options) + else + dir = File.dirname(path) + unless File.exist?(dir) + options.file_system.create_dir(dir, true) + end + options.file_system.open_output_stream(path) do |output_stream| + format.open_writer(output_stream, + options.file_system, + path, + @table.schema, + format.default_write_options) do |writer| + reader = Arrow::TableBatchReader.new(@table) + writer.write_record_batch_reader(reader) + end + end + end + end + end +end + +module Arrow + class TableSaver + include ArrowDataset::ArrowTableSavable + end +end diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/dataset.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/dataset.rb new file mode 100644 index 000000000..a658fc3f2 --- /dev/null +++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/dataset.rb @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +module ArrowDataset + class Dataset + class << self + def build(*args) + factory_class = ArrowDataset.const_get("#{name}Factory") + factory = factory_class.new(*args) + yield(factory) + factory.finish + end + end + end +end diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-format.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-format.rb new file mode 100644 index 000000000..83e61c4b2 --- /dev/null +++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-format.rb @@ -0,0 +1,59 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +module ArrowDataset + class FileFormat + class << self + def resolve(format) + case format + when :arrow, :arrow_file, :arrow_streaming + IPCFileFormat.new + when :parquet + ParquetFileFormat.new + when :csv + CSVFileFormat.new + else + available_formats = [ + :arrow, + :arrow_file, + :arrow_streaming, + :parquet, + :csv, + ] + message = "Arrow::Table load format must be one of [" + message << available_formats.join(", ") + message << "]: #{@options[:format].inspect}" + raise ArgumentError, message + end + end + end + + alias_method :open_writer_raw, :open_writer + def open_writer(destination, file_system, path, schema, options) + writer = open_writer_raw(destination, file_system, path, schema, options) + if block_given? + begin + yield(writer) + ensure + writer.finish + end + else + writer + end + end + end +end diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-system-dataset-factory.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-system-dataset-factory.rb new file mode 100644 index 000000000..111a29a3c --- /dev/null +++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-system-dataset-factory.rb @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +module ArrowDataset + class FileSystemDatasetFactory + alias_method :set_file_system_uri_raw, :set_file_system_uri + def set_file_system_uri(uri) + if uri.is_a?(URI) + if uri.scheme.nil? + uri = uri.dup + absolute_path = File.expand_path(uri.path) + if absolute_path.start_with?("/") + uri.path = absolute_path + else + uri.path = "/#{absolute_path}" + end + uri.scheme = "file" + end + uri = uri.to_s + end + set_file_system_uri_raw(uri) + end + alias_method :file_system_uri=, :set_file_system_uri + end +end diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb new file mode 100644 index 000000000..b1be000f7 --- /dev/null +++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +module ArrowDataset + class Loader < GObjectIntrospection::Loader + class << self + def load + super("ArrowDataset", ArrowDataset) + end + end + + private + def post_load(repository, namespace) + require_libraries + end + + def require_libraries + require "arrow-dataset/arrow-table-loadable" + require "arrow-dataset/arrow-table-savable" + require "arrow-dataset/dataset" + require "arrow-dataset/file-format" + require "arrow-dataset/file-system-dataset-factory" + end + end +end diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb new file mode 100644 index 000000000..1a37139d1 --- /dev/null +++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+module ArrowDataset
+  VERSION = "6.0.1"
+  # Pieces of VERSION: integer MAJOR/MINOR/MICRO, and TAG (text after "-", else nil).
+  module Version
+    STRING = VERSION
+    release, TAG = STRING.split("-")
+    MAJOR, MINOR, MICRO = release.split(".").map(&:to_i)
+  end
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/red-arrow-dataset.gemspec b/src/arrow/ruby/red-arrow-dataset/red-arrow-dataset.gemspec
new file mode 100644
index 000000000..0a60925e4
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/red-arrow-dataset.gemspec
@@ -0,0 +1,51 @@
+# -*- ruby -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require_relative "lib/arrow-dataset/version"
+
+# Gem specification for red-arrow-dataset.
+Gem::Specification.new do |gem|
+  gem.name = "red-arrow-dataset"
+  # Version string: MAJOR.MINOR.MICRO, with ".TAG" appended only when TAG is non-nil.
+  version = ArrowDataset::Version
+  gem.version = [version::MAJOR, version::MINOR, version::MICRO, version::TAG].compact.join(".")
+  gem.homepage = "https://arrow.apache.org/"
+  gem.authors = ["Apache Arrow Developers"]
+  gem.email = ["dev@arrow.apache.org"]
+
+  gem.summary = "Red Arrow Dataset is the Ruby bindings of Apache Arrow Dataset"
+  gem.description =
+    "Apache Arrow Dataset is one of Apache Arrow components to read and write " \
+    "semantic datasets stored in different locations and formats."
+  gem.license = "Apache-2.0"
+
+  # Packaged files: top-level project files, LICENSE/NOTICE, and every lib/**/*.rb.
+  gem.files = %w[README.md Rakefile Gemfile] + ["#{gem.name}.gemspec"]
+  gem.files += %w[LICENSE.txt NOTICE.txt]
+  gem.files += Dir.glob("lib/**/*.rb")
+  gem.test_files += Dir.glob("test/**/*")
+  gem.extensions = ["dependency-check/Rakefile"]
+
+  # red-arrow is required at exactly the same version as this gem.
+  gem.add_runtime_dependency("red-arrow", "= #{gem.version}")
+
+  %w[bundler rake test-unit].each do |tool|
+    gem.add_development_dependency(tool)
+  end
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/test/helper.rb b/src/arrow/ruby/red-arrow-dataset/test/helper.rb
new file mode 100644
index 000000000..7231eb1cb
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/test/helper.rb
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +require "arrow-dataset" + +require "tmpdir" + +require "test-unit" diff --git a/src/arrow/ruby/red-arrow-dataset/test/run-test.rb b/src/arrow/ruby/red-arrow-dataset/test/run-test.rb new file mode 100755 index 000000000..48d2c49e1 --- /dev/null +++ b/src/arrow/ruby/red-arrow-dataset/test/run-test.rb @@ -0,0 +1,50 @@ +#!/usr/bin/env ruby +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+# Test entry point: sets up $LOAD_PATH, loads the helper, and hands off to test-unit.
+# Verbose mode is switched on first, so Ruby warnings are reported during the run.
+$VERBOSE = true
+
+require "pathname"
+
+# Register each entry of ARROW_DLL_PATH (when the variable is set) as a DLL directory.
+ENV.fetch("ARROW_DLL_PATH", "").split(File::PATH_SEPARATOR).each do |dll_dir|
+  RubyInstaller::Runtime.add_dll_directory(dll_dir)
+end
+
+# This gem's root (the parent of test/) and the red-arrow directory next to it.
+base_dir = Pathname.new(__dir__).parent.expand_path
+arrow_base_dir = base_dir.parent.join("red-arrow")
+
+lib_dir = base_dir.join("lib")
+test_dir = base_dir.join("test")
+arrow_lib_dir = arrow_base_dir.join("lib")
+arrow_ext_dir = arrow_base_dir.join("ext", "arrow")
+
+# BUILD_DIR, when set, points at the red-arrow build directory; otherwise ext/arrow is used.
+build_dir = ENV["BUILD_DIR"]
+arrow_build_dir = build_dir ? Pathname.new(build_dir).join("red-arrow") : arrow_ext_dir
+
+# Resulting search order: this gem's lib, red-arrow's lib, then the red-arrow build directory.
+$LOAD_PATH.unshift(lib_dir.to_s, arrow_lib_dir.to_s, arrow_build_dir.to_s)
+
+require_relative "helper"
+
+# Pass test/ to test-unit's AutoRunner and use its result as the exit status.
+exit(Test::Unit::AutoRunner.run(true, test_dir.to_s))
diff --git a/src/arrow/ruby/red-arrow-dataset/test/test-arrow-table.rb b/src/arrow/ruby/red-arrow-dataset/test/test-arrow-table.rb
new file mode 100644
index 000000000..191306374
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/test/test-arrow-table.rb
@@ -0,0 +1,80 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestArrowTable < Test::Unit::TestCase
+  # Prepares two tables and two paths under <tmpdir>/data, then yields inside mktmpdir.
+  def setup
+    Dir.mktmpdir do |tmpdir|
+      @dir = tmpdir
+      @path1 = File.join(@dir, "data", "table1.arrow")
+      @path2 = File.join(@dir, "data", "table2.arrow")
+      @table1 = Arrow::Table.new(visible: [true, false, true], point: [1, 2, 3])
+      @table2 = Arrow::Table.new(visible: [true], point: [10])
+      yield
+    end
+  end
+
+  # Expands +path+ and wraps it in a "file" URI whose path part starts with "/".
+  def build_file_uri(path)
+    full_path = File.expand_path(path)
+    URI(full_path.start_with?("/") ? "file://#{full_path}" : "file:///#{full_path}")
+  end
+
+  # Saves @table1 and then @table2 to the file URIs of @path1 and @path2.
+  def save_tables
+    @table1.save(build_file_uri(@path1))
+    @table2.save(build_file_uri(@path2))
+  end
+
+  sub_test_case("load") do
+    # Scheme-less relative URI, saved and loaded with @dir as the working directory.
+    def test_no_scheme
+      Dir.chdir(@dir) do
+        relative_uri = URI(File.basename(@path1))
+        @table1.save(relative_uri)
+        assert_equal(@table1, Arrow::Table.load(relative_uri))
+      end
+    end
+
+    def test_file
+      file_uri = build_file_uri(@path1)
+      @table1.save(file_uri)
+      assert_equal(@table1, Arrow::Table.load(file_uri))
+    end
+
+    # Loading the directory through a file URI gives table1 followed by table2.
+    def test_directory_uri
+      save_tables
+      expected = @table1.concatenate([@table2])
+      assert_equal(expected, Arrow::Table.load(build_file_uri(@dir)))
+    end
+
+    # Same expectation when the directory is passed as a plain path string.
+    def test_directory_path
+      save_tables
+      expected = @table1.concatenate([@table2])
+      assert_equal(expected, Arrow::Table.load(@dir))
+    end
+
+    # Only the rows whose visible column is true are expected back.
+    def test_filter
+      save_tables
+      expected = Arrow::Table.new(visible: [true, true, true], point: [1, 3, 10])
+      assert_equal(expected, Arrow::Table.load(@dir, filter: ["equal", :visible, true]))
+    end
+  end
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/test/test-file-system-dataset.rb b/src/arrow/ruby/red-arrow-dataset/test/test-file-system-dataset.rb
new file mode 100644
index 000000000..17cbcb88d
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/test/test-file-system-dataset.rb
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.
See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Round trip: setup saves a table to <tmpdir>/table.arrow; .build must read it back.
+class TestFileSystemDataset < Test::Unit::TestCase
+  def setup
+    Dir.mktmpdir do |tmpdir|
+      @dir = tmpdir
+      @path = File.join(@dir, "table.arrow")
+      @table = Arrow::Table.new(visible: [true, false, true], point: [1, 2, 3])
+      @table.save(@path)
+      @format = ArrowDataset::IPCFileFormat.new
+      yield
+    end
+  end
+
+  test(".build") do
+    dataset = ArrowDataset::FileSystemDataset.build(@format) do |builder|
+      builder.file_system = Arrow::LocalFileSystem.new
+      builder.add_path(File.expand_path(@path))
+    end
+    assert_equal(@table, dataset.to_table)
+  end
+end
|