diff options
Diffstat (limited to 'src/arrow/ruby/red-arrow/benchmark/raw-records/dictionary.yml')
-rw-r--r-- | src/arrow/ruby/red-arrow/benchmark/raw-records/dictionary.yml | 75 |
1 files changed, 75 insertions, 0 deletions
diff --git a/src/arrow/ruby/red-arrow/benchmark/raw-records/dictionary.yml b/src/arrow/ruby/red-arrow/benchmark/raw-records/dictionary.yml new file mode 100644 index 000000000..151bb412f --- /dev/null +++ b/src/arrow/ruby/red-arrow/benchmark/raw-records/dictionary.yml @@ -0,0 +1,75 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +contexts: + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("ext/arrow")) + $LOAD_PATH.unshift(File.expand_path("lib")) +prelude: |- + require "arrow" + require "faker" + + state = ENV.fetch("FAKER_RANDOM_SEED", 17).to_i + Faker::Config.random = Random.new(state) + + n_rows = 1000 + n_columns = 10 + type = Arrow::DictionaryDataType.new(:int8, :string, true) + + fields = n_columns.times.map {|i| ["column_#{i}".to_sym, type] }.to_h + schema = Arrow::Schema.new(**fields) + dictionary = Arrow::StringArray.new( + 100.times.map { Faker::Book.genre }.uniq.sort + ) + indices = Arrow::Int8Array.new( + n_rows.times.map { + Faker::Number.within(range: 0 ... dictionary.length) + } + ) + arrays = n_columns.times.map do + Arrow::DictionaryArray.new( + type, + indices, + dictionary, + ) + end + record_batch = Arrow::RecordBatch.new(schema, n_rows, arrays) + + def pure_ruby_raw_records(record_batch) + n_rows = record_batch.n_rows + n_columns = record_batch.n_columns + columns = record_batch.columns + records = [] + i = 0 + while i < n_rows + record = [] + j = 0 + while j < n_columns + record << columns[j].data.indices[i] + j += 1 + end + records << record + i += 1 + end + records + end +benchmark: + pure_ruby: |- + pure_ruby_raw_records(record_batch) + raw_records: |- + record_batch.raw_records |