diff options
Diffstat (limited to 'src/arrow/js/perf')
-rw-r--r-- | src/arrow/js/perf/config.ts | 76 | ||||
-rw-r--r-- | src/arrow/js/perf/index.ts | 234 |
2 files changed, 310 insertions, 0 deletions
// diff --git a/src/arrow/js/perf/config.ts b/src/arrow/js/perf/config.ts
// new file mode 100644
// index 000000000..08ea9ecc1
// --- /dev/null
// +++ b/src/arrow/js/perf/config.ts
// @@ -0,0 +1,76 @@

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

import * as Arrow from '../src/Arrow.dom';

// from https://stackoverflow.com/a/19303725/214950
let seed = 1;

/**
 * Seeded pseudo-random generator: returns the fractional part of
 * `sin(seed) * 10000` and advances the module-level `seed`, so every run of
 * this module produces the same sequence (and therefore the same dataset).
 */
function random() {
    const x = Math.sin(seed++) * 10000;
    return x - Math.floor(x);
}

console.time('Prepare Data');

/** Number of rows generated per record batch. */
const LENGTH = 100000;
/** Number of record batches in the generated dataset. */
const NUM_BATCHES = 10;

/** Dictionary values shared by the `origin` and `destination` columns. */
const values = Arrow.Utf8Vector.from(['Charlottesville', 'New York', 'San Francisco', 'Seattle', 'Terre Haute', 'Washington, DC']);

// Dictionary indices are drawn from [0, values.length); deriving the bound from
// `values` keeps the generator correct if the list of cities ever changes.
const numValues = values.length;

// NOTE: the order of the random() calls below (lat, lng, origin, destination,
// batch by batch) determines the generated data; keep it stable.
const batches = Array.from({ length: NUM_BATCHES }, () => {
    const lat = Float32Array.from(
        { length: LENGTH },
        () => ((random() - 0.5) * 2 * 90));
    // NOTE(review): longitude uses the same +/-90 range as latitude; confirm
    // whether +/-180 was intended. Left unchanged to keep the data stable.
    const lng = Float32Array.from(
        { length: LENGTH },
        () => ((random() - 0.5) * 2 * 90));

    // Storing into a Uint8Array truncates the fractional part, yielding an
    // integer dictionary index.
    const origin = Uint8Array.from(
        { length: LENGTH },
        () => (random() * numValues));
    const destination = Uint8Array.from(
        { length: LENGTH },
        () => (random() * numValues));

    // NOTE(review): the index type is declared as Int8 while the index buffers
    // are Uint8Array; the small indices generated here fit either - confirm intent.
    const originType = new Arrow.Dictionary(values.type, new Arrow.Int8, 0, false);
    const destinationType = new Arrow.Dictionary(values.type, new Arrow.Int8, 0, false);

    return Arrow.RecordBatch.new({
        'lat': Arrow.Float32Vector.from(lat),
        'lng': Arrow.Float32Vector.from(lng),
        'origin': Arrow.Vector.new(Arrow.Data.Dictionary(originType, 0, origin.length, 0, null, origin, values)),
        'destination': Arrow.Vector.new(Arrow.Data.Dictionary(destinationType, 0, destination.length, 0, null, destination, values)),
    });
});

const tracks = new Arrow.DataFrame(batches[0].schema, batches);

console.timeEnd('Prepare Data');

/**
 * Benchmark datasets. Each entry provides the DataFrame (`df`), its serialized
 * form (`ipc`), the columns to run `countBy` on, and the filter cases
 * (`counts`) as column / test / value triples.
 */
export default [
    {
        name: 'tracks',
        df: tracks,
        ipc: tracks.serialize(),
        countBys: ['origin', 'destination'],
        counts: [
            {column: 'lat',    test: 'gt' as 'gt' | 'eq', value: 0        },
            {column: 'lng',    test: 'gt' as 'gt' | 'eq', value: 0        },
            {column: 'origin', test: 'eq' as 'gt' | 'eq', value: 'Seattle'},
        ],
    }
];

// diff --git a/src/arrow/js/perf/index.ts b/src/arrow/js/perf/index.ts
// new file mode 100644
// index 000000000..9f6cb8f79
// --- /dev/null
// +++ b/src/arrow/js/perf/index.ts
// @@ -0,0 +1,234 @@

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

// Alternatively, use bundles for performance tests
// import * as Arrow from '../targets/es5/umd';
// import * as Arrow from '../targets/es5/cjs';
// import * as Arrow from '../targets/es2015/umd';
// import * as Arrow from '../targets/es2015/cjs';

import * as Arrow from '../src/Arrow';

import config from './config';
import b from 'benny';
import { CaseResult, Summary } from 'benny/lib/internal/common-types';
import kleur from 'kleur';

const { predicate, Table, RecordBatchReader } = Arrow;
const { col } = predicate;


const args = process.argv.slice(2);
// When the first CLI argument is `--json`, results are collected and dumped
// as JSON once the last suite completes.
const json = args[0] === '--json';

/** One dataset entry of the benchmark configuration. */
type Dataset = (typeof config)[number];
/** Comparison kinds supported by the filter benchmarks. */
type CountTest = 'gt' | 'eq';
/** A single filter case: compare `column` against `value` using `test`. */
type CountSpec = { column: string; test: CountTest; value: number | string };

const formatter = new Intl.NumberFormat();

/**
 * Formats a number for display using the default `Intl.NumberFormat`.
 *
 * @param number    The value to format.
 * @param precision Number of significant digits to keep for small values.
 *                  Values greater than `precision * 10` are rounded to an
 *                  integer instead. With `precision < 1` (the default) the
 *                  value is always rounded to an integer.
 * @returns The formatted string.
 */
function formatNumber(number: number, precision = 0) {
    // Number.prototype.toPrecision() throws a RangeError for precision < 1, so
    // the default precision must never reach it (previously formatNumber(0)
    // and any non-positive input threw).
    const rounded = precision < 1 || number > precision * 10
        ? Math.round(number)
        : parseFloat((number).toPrecision(precision));
    return formatter.format(rounded);
}

/** Case results accumulated for the `--json` report. */
const results: CaseResult[] = [];

/**
 * benny `cycle` handler: logs one line per finished case and, in `--json`
 * mode, tags the result with its suite name and records it.
 */
function cycle(result: CaseResult, summary: Summary) {
    // `details.median` is multiplied by 1000 and reported as milliseconds.
    const duration = result.details.median * 1000;
    if (json) {
        result.suite = summary.name;
        results.push(result);
    }
    console.log(
        `${kleur.cyan(result.name)} ${formatNumber(result.ops, 3)} ops/s ±${result.margin.toPrecision(2)}%, ${formatNumber(duration, 2)} ms, ${kleur.gray(result.samples + ' samples')}`,
    );
}

/**
 * Looks up the type of the field called `column` in the DataFrame's schema.
 *
 * @throws Error if the schema has no field with that name.
 */
function columnType(df: Dataset['df'], column: string) {
    const field = df.schema.fields.find((f) => f.name === column);
    if (!field) {
        throw new Error(`Unknown column "${column}"`);
    }
    return field.type;
}

/** Builds the common `dataset / column / length / type` part of a case name. */
function columnLabel(name: string, df: Dataset['df'], column: string): string {
    return `dataset: ${name}, column: ${column}, length: ${formatNumber(df.length)}, type: ${columnType(df, column)}`;
}

/** Builds the full case name for a filter case (column label plus test and value). */
function filterLabel(name: string, df: Dataset['df'], { column, test, value }: CountSpec): string {
    return `${columnLabel(name, df, column)}, test: ${test}, value: ${value}`;
}

/**
 * Creates the filtered DataFrame for a filter case.
 *
 * @throws Error if `test` is not one of the supported comparisons.
 */
function buildFilteredDf(df: Dataset['df'], { column, test, value }: CountSpec): Arrow.FilteredDataFrame {
    switch (test) {
        case 'gt': return df.filter(col(column).gt(value));
        case 'eq': return df.filter(col(column).eq(value));
        default: throw new Error(`Unrecognized test "${test}"`);
    }
}

// Parsing / serialization suites and per-column vector suites.
for (const { name, ipc, df } of config) {
    b.suite(
        `Parse`,

        b.add(`dataset: ${name}, function: Table.from`, () => {
            Table.from(ipc);
        }),

        b.add(`dataset: ${name}, function: readBatches`, () => {
            for (const _recordBatch of RecordBatchReader.from(ipc)) {}
        }),

        b.add(`dataset: ${name}, function: serialize`, () => {
            df.serialize();
        }),

        b.cycle(cycle)
    );

    const schema = df.schema;

    // Each entry becomes one suite that runs `fn` against every column.
    const suites = [{
        suite_name: `Get values by index`,
        fn(vector: Arrow.Column<any>) {
            for (let i = -1, n = vector.length; ++i < n;) {
                vector.get(i);
            }
        }
    }, {
        suite_name: `Iterate vectors`,
        fn(vector: Arrow.Column<any>) { for (const _value of vector) {} }
    }, {
        suite_name: `Slice toArray vectors`,
        fn(vector: Arrow.Column<any>) { vector.slice().toArray(); }
    }, {
        suite_name: `Slice vectors`,
        fn(vector: Arrow.Column<any>) { vector.slice(); }
    }];

    for (const {suite_name, fn} of suites) {
        b.suite(
            suite_name,

            ...schema.fields.map((f, i) => {
                const vector = df.getColumnAt(i)!;
                return b.add(`dataset: ${name}, column: ${f.name}, length: ${formatNumber(vector.length)}, type: ${vector.type}`, () => {
                    fn(vector);
                });
            }),

            b.cycle(cycle)
        );
    }
}


// DataFrame-level suites: iteration, count-by, and the filter cases.
for (const { name, df, countBys, counts } of config) {
    b.suite(
        `DataFrame Iterate`,

        b.add(`dataset: ${name}, length: ${formatNumber(df.length)}`, () => {
            for (const _value of df) {}
        }),

        b.cycle(cycle)
    );

    b.suite(
        `DataFrame Count By`,

        ...countBys.map((column: string) => b.add(
            columnLabel(name, df, column),
            () => df.countBy(column)
        )),

        b.cycle(cycle)
    );

    b.suite(
        `DataFrame Filter-Scan Count`,

        ...counts.map((spec: CountSpec) => b.add(
            filterLabel(name, df, spec),
            () => {
                // The outer function builds the filter; the returned function
                // is the code under test.
                const filteredDf = buildFilteredDf(df, spec);
                return () => filteredDf.count();
            }
        )),

        b.cycle(cycle)
    );

    b.suite(
        `DataFrame Filter-Iterate`,

        ...counts.map((spec: CountSpec) => b.add(
            filterLabel(name, df, spec),
            () => {
                const filteredDf = buildFilteredDf(df, spec);
                return () => {
                    for (const _value of filteredDf) {}
                };
            }
        )),

        b.cycle(cycle)
    );

    b.suite(
        `DataFrame Direct Count`,

        ...counts.map((spec: CountSpec) => b.add(
            filterLabel(name, df, spec),
            () => {
                const { column, test, value } = spec;
                const colidx = df.schema.fields.findIndex((c) => c.name === column);

                // The two loops are intentionally kept separate with the
                // comparison inline, so the measured hot loop contains no
                // extra indirection.
                if (test === 'gt') {
                    return () => {
                        let sum = 0;
                        const batches = df.chunks;
                        const numBatches = batches.length;
                        for (let batchIndex = -1; ++batchIndex < numBatches;) {
                            // load batches
                            const batch = batches[batchIndex];
                            const vector = batch.getChildAt(colidx)!;
                            // yield all indices
                            for (let index = -1, length = batch.length; ++index < length;) {
                                // Strict comparison, matching col(column).gt(value)
                                // used by the filter suites.
                                sum += (vector.get(index) > value) ? 1 : 0;
                            }
                        }
                        return sum;
                    };
                } else if (test === 'eq') {
                    return () => {
                        let sum = 0;
                        const batches = df.chunks;
                        const numBatches = batches.length;
                        for (let batchIndex = -1; ++batchIndex < numBatches;) {
                            // load batches
                            const batch = batches[batchIndex];
                            const vector = batch.getChildAt(colidx)!;
                            // yield all indices
                            for (let index = -1, length = batch.length; ++index < length;) {
                                sum += (vector.get(index) === value) ? 1 : 0;
                            }
                        }
                        return sum;
                    };
                } else {
                    throw new Error(`Unrecognized test "${test}"`);
                }
            }
        )),

        b.cycle(cycle),

        b.complete(() => {
            // last benchmark finished
            json && process.stderr.write(JSON.stringify(results, null, 2));
        })
    );
}