summary | refs | log | tree | commit | diff | stats
path: root/src/arrow/ruby/red-arrow-dataset/lib
diff options
context:
space:
mode:
Diffstat (limited to 'src/arrow/ruby/red-arrow-dataset/lib')
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset.rb 29
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-loadable.rb 61
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-savable.rb 69
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/dataset.rb 29
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-format.rb 59
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-system-dataset-factory.rb 39
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb 39
-rw-r--r-- src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb 26
8 files changed, 351 insertions, 0 deletions
diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset.rb
new file mode 100644
index 000000000..fe4f2d518
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset.rb
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+require "arrow"
+
+require "arrow-dataset/version"
+
+require "arrow-dataset/loader"
+
+module ArrowDataset
+ class Error < StandardError
+ end
+
+ Loader.load
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-loadable.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-loadable.rb
new file mode 100644
index 000000000..14c8dce6f
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-loadable.rb
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowDataset
+ module ArrowTableLoadable
+ private
+ def path_to_uri(path)
+ absolute_path = ::File.expand_path(path)
+ if absolute_path.start_with?("/")
+ URI("file://#{absolute_path}")
+ else
+ URI("file:///#{absolute_path}")
+ end
+ end
+
+ def load_from_directory
+ internal_load_from_uri(path_to_uri(@input))
+ end
+
+ def load_from_uri
+ internal_load_from_uri(@input)
+ end
+
+ def internal_load_from_uri(uri)
+ format = FileFormat.resolve(@options[:format])
+ dataset = FileSystemDataset.build(format) do |factory|
+ factory.file_system_uri = uri
+ end
+ scanner_builder = dataset.begin_scan
+ @options.each do |key, value|
+ next if key == :format
+ next if value.nil?
+ setter = "#{key}="
+ next unless scanner_builder.respond_to?(setter)
+ scanner_builder.public_send(setter, value)
+ end
+ scanner = scanner_builder.finish
+ scanner.to_table
+ end
+ end
+end
+
+# Mix the dataset-backed private loading helpers into Arrow::TableLoader.
+module Arrow
+  class TableLoader
+    include ArrowDataset::ArrowTableLoadable
+  end
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-savable.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-savable.rb
new file mode 100644
index 000000000..30ad6c292
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-savable.rb
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowDataset
+ module ArrowTableSavable
+ private
+ def save_to_uri
+ format = FileFormat.resolve(@options[:format])
+ options = FileSystemDatasetWriteOptions.new
+ options.file_write_options = format.default_write_options
+ path = @output.path
+ if @output.scheme.nil?
+ options.file_system = Arrow::LocalFileSystem.new
+ else
+ options.file_system = Arrow::FileSystem.create(@output.to_s)
+ # /C:/... -> C:/...
+ unless File.expand_path(".").start_with?("/")
+ path = path.gsub(/\A\//, "")
+ end
+ end
+ partitioning = @options[:partitioning]
+ if partitioning
+ # TODO
+ options.base_dir = File.dirname(path)
+ options.base_name_template = File.basename(path)
+ options.partitioning = Partitioning.resolve(@options[:partitioning])
+ scanner_builder = ScannerBuilder.new(@table)
+ scanner_builder.use_async(true)
+ scanner = scanner_builder.finish
+ FileSystemDataset.write_scanner(scanner, options)
+ else
+ dir = File.dirname(path)
+ unless File.exist?(dir)
+ options.file_system.create_dir(dir, true)
+ end
+ options.file_system.open_output_stream(path) do |output_stream|
+ format.open_writer(output_stream,
+ options.file_system,
+ path,
+ @table.schema,
+ format.default_write_options) do |writer|
+ reader = Arrow::TableBatchReader.new(@table)
+ writer.write_record_batch_reader(reader)
+ end
+ end
+ end
+ end
+ end
+end
+
+# Mix the dataset-backed private saving helper into Arrow::TableSaver.
+module Arrow
+  class TableSaver
+    include ArrowDataset::ArrowTableSavable
+  end
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/dataset.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/dataset.rb
new file mode 100644
index 000000000..a658fc3f2
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/dataset.rb
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowDataset
+ class Dataset
+ class << self
+ def build(*args)
+ factory_class = ArrowDataset.const_get("#{name}Factory")
+ factory = factory_class.new(*args)
+ yield(factory)
+ factory.finish
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-format.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-format.rb
new file mode 100644
index 000000000..83e61c4b2
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-format.rb
@@ -0,0 +1,59 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowDataset
+ class FileFormat
+ class << self
+ def resolve(format)
+ case format
+ when :arrow, :arrow_file, :arrow_streaming
+ IPCFileFormat.new
+ when :parquet
+ ParquetFileFormat.new
+ when :csv
+ CSVFileFormat.new
+ else
+ available_formats = [
+ :arrow,
+ :arrow_file,
+ :arrow_streaming,
+ :parquet,
+ :csv,
+ ]
+ message = "Arrow::Table load format must be one of ["
+ message << available_formats.join(", ")
+ message << "]: #{@options[:format].inspect}"
+ raise ArgumentError, message
+ end
+ end
+ end
+
+ alias_method :open_writer_raw, :open_writer
+ def open_writer(destination, file_system, path, schema, options)
+ writer = open_writer_raw(destination, file_system, path, schema, options)
+ if block_given?
+ begin
+ yield(writer)
+ ensure
+ writer.finish
+ end
+ else
+ writer
+ end
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-system-dataset-factory.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-system-dataset-factory.rb
new file mode 100644
index 000000000..111a29a3c
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/file-system-dataset-factory.rb
@@ -0,0 +1,39 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowDataset
+ class FileSystemDatasetFactory
+ alias_method :set_file_system_uri_raw, :set_file_system_uri
+ def set_file_system_uri(uri)
+ if uri.is_a?(URI)
+ if uri.scheme.nil?
+ uri = uri.dup
+ absolute_path = File.expand_path(uri.path)
+ if absolute_path.start_with?("/")
+ uri.path = absolute_path
+ else
+ uri.path = "/#{absolute_path}"
+ end
+ uri.scheme = "file"
+ end
+ uri = uri.to_s
+ end
+ set_file_system_uri_raw(uri)
+ end
+ alias_method :file_system_uri=, :set_file_system_uri
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb
new file mode 100644
index 000000000..b1be000f7
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb
@@ -0,0 +1,39 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowDataset
+ class Loader < GObjectIntrospection::Loader
+ class << self
+ def load
+ super("ArrowDataset", ArrowDataset)
+ end
+ end
+
+ private
+ def post_load(repository, namespace)
+ require_libraries
+ end
+
+ def require_libraries
+ require "arrow-dataset/arrow-table-loadable"
+ require "arrow-dataset/arrow-table-savable"
+ require "arrow-dataset/dataset"
+ require "arrow-dataset/file-format"
+ require "arrow-dataset/file-system-dataset-factory"
+ end
+ end
+end
diff --git a/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb
new file mode 100644
index 000000000..1a37139d1
--- /dev/null
+++ b/src/arrow/ruby/red-arrow-dataset/lib/arrow-dataset/version.rb
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowDataset
+ VERSION = "6.0.1"
+
+ module Version
+ numbers, TAG = VERSION.split("-")
+ MAJOR, MINOR, MICRO = numbers.split(".").collect(&:to_i)
+ STRING = VERSION
+ end
+end